/***************************************************************************
 * Copyright (C) 2014 by PIR (University of Oviedo) and
 * INCO2 (Polytechnic University of Valencia) groups.
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the
 * Free Software Foundation, Inc.,
 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 ***************************************************************************/
 * \brief File with functions to compute NNMF using the MLSA method on GPUs
 * \author Information Retrieval and Parallel Computing Group (IRPCG)
 * \author University of Oviedo, Spain
 * \author Interdisciplinary Computation and Communication Group (INCO2)
 * \author Universitat Politecnica de Valencia, Spain.
 * \author Contact: nnmfpack@gmail.com
#include "mlsa_cuda.h"
 * \fn int dmlsa_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
 * \brief dmlsa_cuda performs NNMF using the beta-divergence with beta=2, in double precision.
 * The algorithm is<BR>
 * repeat nIter times<BR>
 *    STEP 1<BR>
 *       L=W*H<BR>
 *       B=W'*A<BR>
 *       C=W'*L<BR>
 *       H=H(.*)B(./)C<BR>
 *    STEP 2<BR>
 *       L=W*H<BR>
 *       D=A*H'<BR>
 *       E=L*H'<BR>
 *       W=W(.*)D(./)E<BR>
 * end repeat<BR>
 * To save some FLOPs and RAM, a modified Lee and Seung scheme is used: Step 1 becomes L=W'*W,
 * C=L*H and then B=W'*A; Step 2 becomes L=H*H', E=W*L and D=A*H'. W'*W (H*H') and L*H (W*L) can be
 * computed in parallel with W'*A (A*H'). Here we use CUDA streams for that.
 * In practice B and C are (k*n) matrices used in STEP 1, and D and E are (m*k)
 * matrices used in STEP 2. B/C and D/E are never needed at the same time, so only two matrices,
 * B and C, of size max(m,n)*k are declared to save space.
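 *
 * With the buffers B and C just described, each iteration of the code therefore runs<BR>
 * STEP 1<BR>
 *    stream1: L=W'*W, C=L*H      stream2: B=W'*A<BR>
 *    H=H(.*)B(./)C<BR>
 * STEP 2<BR>
 *    stream1: L=H*H', C=W*L      stream2: B=A*H'<BR>
 *    W=W(.*)B(./)C<BR>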
 * \param m: (input) Number of rows of matrix A and matrix W
 * \param n: (input) Number of columns of matrix A and matrix H
 * \param k: (input) Number of columns of matrix W and rows of H
 * \param A: (input) Double precision input matrix of (m * n) elements stored in 1D column-major order
 * \param W: (inout) Double precision input/output matrix of (m * k) elements stored in 1D column-major order
 * \param H: (inout) Double precision input/output matrix of (k * n) elements stored in 1D column-major order
 * \param uType: (input) It can be UpdateAll (both W and H are updated), UpdateW (only W is updated) or UpdateH (only H is updated)
 * \param nIter: (input) Number of iterations
 * It returns 0 if all is OK.
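 *
 * A minimal calling sketch (for illustration only; it assumes A, W and H are device
 * pointers already filled with nonnegative data, since they are passed directly to
 * CUBLAS, and that UpdateAll is the update type declared in the library headers):<BR>
 * <BR>
 *    double *dA, *dW, *dH;<BR>
 *    cudaMalloc((void **)&dA, m*n*sizeof(double));  // likewise dW (m*k) and dH (k*n)<BR>
 *    // ... initialize dA, dW and dH on the device ...<BR>
 *    int err = dmlsa_cuda(m, n, k, dA, dW, dH, UpdateAll, 100);  // 100 iterations<BR>
 *    // err == 0 means all went OK<BR>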
int dmlsa_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
    CUDAERR(cudaGetDevice(&devID));
    CUDAERR(cudaMalloc((void **)&B, max(m,n)*k*sizeof(double)));
    CUDAERR(cudaMalloc((void **)&C, max(m,n)*k*sizeof(double)));
    CUDAERR(cudaMalloc((void **)&L, k*k*sizeof(double)));
    CUDAERR(cudaStreamCreate(&stream1));
    CUDAERR(cudaStreamCreate(&stream2));
    CUBLASERR(cublasCreate(&handle1));
    CUBLASERR(cublasCreate(&handle2));
    CUDAERR(cudaEventCreateWithFlags(&event1, cudaEventDisableTiming));
    CUDAERR(cudaEventCreateWithFlags(&event2, cudaEventDisableTiming));
    CUBLASERR(cublasSetStream(handle1, stream1));
    CUBLASERR(cublasSetStream(handle2, stream2));
    /* Why streams? Perhaps they gain little today, but future CUBLAS      */
    /* releases may be able to overlap these operations better.            */
    /* Why this split of the work between the streams?                     */
    /* Let m=n and k=n/2. Then B=W'*A takes n^3 flops, while W'*W takes    */
    /* (n^3)/2 and L*H the same. That is, stream 2 runs n^3 flops and      */
    /* stream 1 runs the same ((n^3)/2 + (n^3)/2 = n^3). When k is smaller */
    /* than n/2, stream 2 runs more flops (it is the slower one); when k   */
    /* is bigger, stream 1 does. In real problems k should be much smaller */
    /* than min(m,n), so this split is (probably) the best choice.         */
    /* Stream1 recording event "event2" for stream2 (for the 1st time) */
    CUDAERR(cudaEventRecord(event2, stream1));
    /* ************************** Phase 1 **************************** */
    /* Stream2 waiting "event2" set by stream1. 1st time is always ok */
    CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));
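    /* stream2: B = W' * A   (k x n) */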
    CUBLASERR(cublasDgemm(handle2, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, A, m, &beta, B, k));
    /* Stream2 recording event "event1" for stream1 */
    CUDAERR(cudaEventRecord(event1, stream2));
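    /* stream1: L = W' * W   (k x k) */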
    CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, k, m, &alpha, W, m, W, m, &beta, L, k));
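    /* stream1: C = L * H = (W'*W)*H   (k x n) */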
    CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, k, n, k, &alpha, L, k, H, k, &beta, C, k));
    /* Stream1 waiting "event1" set by stream2 */
    CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));
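    /* stream1: H = H (.*) B (./) C, the multiplicative update of H */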
    ddotdiv_cuda(k*n, B, C, H, stream1);
    /* Stream1 recording event "event2" for stream2 */
    CUDAERR(cudaEventRecord(event2, stream1));
    /* ************************** Phase 2 **************************** */
    /* Stream2 waiting "event2" set by stream1 */
    CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));
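    /* stream2: B = A * H'   (m x k) */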
    CUBLASERR(cublasDgemm(handle2, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, A, m, H, k, &beta, B, m));
    /* Stream2 recording event "event1" for stream1 */
    CUDAERR(cudaEventRecord(event1, stream2));
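    /* stream1: L = H * H'   (k x k) */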
    CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, k, k, n, &alpha, H, k, H, k, &beta, L, k));
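    /* stream1: C = W * L = W*(H*H')   (m x k) */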
    CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, k, k, &alpha, W, m, L, k, &beta, C, m));
    /* Stream1 waiting "event1" set by stream 2 */
    CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));
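    /* stream1: W = W (.*) B (./) C, the multiplicative update of W */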
    ddotdiv_cuda(m*k, B, C, W, stream1);
    /* Stream1 recording event "event2" for stream2 */
    CUDAERR(cudaEventRecord(event2, stream1));
    /* probably not needed, but kept for completeness ... */
    CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));
    CUBLASERR(cublasDestroy(handle1));
    CUBLASERR(cublasDestroy(handle2));
    CUDAERR(cudaStreamDestroy(stream1));
    CUDAERR(cudaStreamDestroy(stream2));
    CUDAERR(cudaEventDestroy(event1));
    CUDAERR(cudaEventDestroy(event2));
    CUDAERR(cudaFree(B));
    CUDAERR(cudaFree(C));
    CUDAERR(cudaFree(L));
    CUDAERR(cudaMalloc((void **)&B, m*k*sizeof(double)));
    CUDAERR(cudaMalloc((void **)&C, m*k*sizeof(double)));
    CUDAERR(cudaMalloc((void **)&L, k*k*sizeof(double)));
    CUDAERR(cudaStreamCreate(&stream1));
    CUDAERR(cudaStreamCreate(&stream2));
    CUBLASERR(cublasCreate(&handle1));
    CUBLASERR(cublasCreate(&handle2));
    CUDAERR(cudaEventCreateWithFlags(&event1, cudaEventDisableTiming));
    CUDAERR(cudaEventCreateWithFlags(&event2, cudaEventDisableTiming));
    CUBLASERR(cublasSetStream(handle1, stream1));
    CUBLASERR(cublasSetStream(handle2, stream2));
    /* Why streams? Perhaps they gain little today, but future CUBLAS      */
    /* releases may be able to overlap these operations better.            */
    /* Why this split of the work between the streams?                     */
    /* Let m=n and k=n/2. Then B=W'*A takes n^3 flops, while W'*W takes    */
    /* (n^3)/2 and L*H the same. That is, stream 2 runs n^3 flops and      */
    /* stream 1 runs the same ((n^3)/2 + (n^3)/2 = n^3). When k is smaller */
    /* than n/2, stream 2 runs more flops (it is the slower one); when k   */
    /* is bigger, stream 1 does. In real problems k should be much smaller */
    /* than min(m,n), so this split is (probably) the best choice.         */
    /* Stream1 recording event "event2" for stream2 (for the 1st time) */
    CUDAERR(cudaEventRecord(event2, stream1));
    /* ************************** Phase 2 **************************** */
    /* Stream2 waiting "event2" set by stream1 */
    CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));
    CUBLASERR(cublasDgemm(handle2, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, A, m, H, k, &beta, B, m));
    /* Stream2 recording event "event1" for stream1 */
    CUDAERR(cudaEventRecord(event1, stream2));
    CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, k, k, n, &alpha, H, k, H, k, &beta, L, k));
    CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, k, k, &alpha, W, m, L, k, &beta, C, m));
    /* Stream1 waiting "event1" set by stream 2 */
    CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));
    ddotdiv_cuda(m*k, B, C, W, stream1);
    /* Stream1 recording event "event2" for stream2 */
    CUDAERR(cudaEventRecord(event2, stream1));
    /* probably not needed, but kept for completeness ... */
    CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));
    CUBLASERR(cublasDestroy(handle1));
    CUBLASERR(cublasDestroy(handle2));
    CUDAERR(cudaStreamDestroy(stream1));
    CUDAERR(cudaStreamDestroy(stream2));
    CUDAERR(cudaEventDestroy(event1));
    CUDAERR(cudaEventDestroy(event2));
    CUDAERR(cudaFree(B));
    CUDAERR(cudaFree(C));
    CUDAERR(cudaFree(L));
    CUDAERR(cudaMalloc((void **)&B, n*k*sizeof(double)));
    CUDAERR(cudaMalloc((void **)&C, n*k*sizeof(double)));
    CUDAERR(cudaMalloc((void **)&L, k*k*sizeof(double)));
    CUDAERR(cudaStreamCreate(&stream1));
    CUDAERR(cudaStreamCreate(&stream2));
    CUBLASERR(cublasCreate(&handle1));
    CUBLASERR(cublasCreate(&handle2));
    CUDAERR(cudaEventCreateWithFlags(&event1, cudaEventDisableTiming));
    CUDAERR(cudaEventCreateWithFlags(&event2, cudaEventDisableTiming));
    CUBLASERR(cublasSetStream(handle1, stream1));
    CUBLASERR(cublasSetStream(handle2, stream2));
    /* Stream1 recording event "event2" for stream2 (for the 1st time) */
    CUDAERR(cudaEventRecord(event2, stream1));
    /* ************************** Phase 1 **************************** */
    /* Stream2 waiting "event2" set by stream1. 1st time is always ok */
    CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));
    CUBLASERR(cublasDgemm(handle2, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, A, m, &beta, B, k));
    /* Stream2 recording event "event1" for stream1 */
    CUDAERR(cudaEventRecord(event1, stream2));
    CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, k, m, &alpha, W, m, W, m, &beta, L, k));
    CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, k, n, k, &alpha, L, k, H, k, &beta, C, k));
    /* Stream1 waiting "event1" set by stream2 */
    CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));
    ddotdiv_cuda(k*n, B, C, H, stream1);
    /* Stream1 recording event "event2" for stream2 */
    CUDAERR(cudaEventRecord(event2, stream1));
    /* probably not needed, but kept for completeness ... */
    CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));
    CUBLASERR(cublasDestroy(handle1));
    CUBLASERR(cublasDestroy(handle2));
    CUDAERR(cudaStreamDestroy(stream1));
    CUDAERR(cudaStreamDestroy(stream2));
    CUDAERR(cudaEventDestroy(event1));
    CUDAERR(cudaEventDestroy(event2));
    CUDAERR(cudaFree(B));
    CUDAERR(cudaFree(C));
    CUDAERR(cudaFree(L));
 * \fn int smlsa_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
 * \brief smlsa_cuda performs NNMF using the beta-divergence with beta=2, in single precision.
 * See the description of dmlsa_cuda for more information
 * \param m: (input) Number of rows of matrix A and matrix W
 * \param n: (input) Number of columns of matrix A and matrix H
 * \param k: (input) Number of columns of matrix W and rows of H
 * \param A: (input) Single precision input matrix of (m * n) elements stored in 1D column-major order
 * \param W: (inout) Single precision input/output matrix of (m * k) elements stored in 1D column-major order
 * \param H: (inout) Single precision input/output matrix of (k * n) elements stored in 1D column-major order
 * \param uType: (input) It can be UpdateAll (both W and H are updated), UpdateW (only W is updated) or UpdateH (only H is updated)
 * \param nIter: (input) Number of iterations
 * It returns 0 if all is OK.
int smlsa_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
    CUDAERR(cudaGetDevice(&devID));
    CUDAERR(cudaMalloc((void **)&B, max(m,n)*k*sizeof(float)));
    CUDAERR(cudaMalloc((void **)&C, max(m,n)*k*sizeof(float)));
    CUDAERR(cudaMalloc((void **)&L, k*k*sizeof(float)));
    CUDAERR(cudaStreamCreate(&stream1));
    CUDAERR(cudaStreamCreate(&stream2));
    CUBLASERR(cublasCreate(&handle1));
    CUBLASERR(cublasCreate(&handle2));
    CUDAERR(cudaEventCreateWithFlags(&event1, cudaEventDisableTiming));
    CUDAERR(cudaEventCreateWithFlags(&event2, cudaEventDisableTiming));
    CUBLASERR(cublasSetStream(handle1, stream1));
    CUBLASERR(cublasSetStream(handle2, stream2));
    /* Why streams? Why this approach of streams? See dmlsa_cuda */
    /* Stream1 recording event "event2" for stream2 (for the 1st time) */
    CUDAERR(cudaEventRecord(event2, stream1));
    /* ************************** Phase 1 **************************** */
    /* Stream2 waiting "event2" set by stream1. 1st time is always ok */
    CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));
    CUBLASERR(cublasSgemm(handle2, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, A, m, &beta, B, k));
    /* Stream2 recording event "event1" for stream1 */
    CUDAERR(cudaEventRecord(event1, stream2));
    CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, k, m, &alpha, W, m, W, m, &beta, L, k));
    CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, k, n, k, &alpha, L, k, H, k, &beta, C, k));
    /* Stream1 waiting "event1" set by stream2 */
    CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));
    sdotdiv_cuda(k*n, B, C, H, stream1);
    /* Stream1 recording event "event2" for stream2 */
    CUDAERR(cudaEventRecord(event2, stream1));
    /* ************************** Phase 2 **************************** */
    /* Stream2 waiting "event2" set by stream1 */
    CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));
    CUBLASERR(cublasSgemm(handle2, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, A, m, H, k, &beta, B, m));
    /* Stream2 recording event "event1" for stream1 */
    CUDAERR(cudaEventRecord(event1, stream2));
    CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, k, k, n, &alpha, H, k, H, k, &beta, L, k));
    CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, k, k, &alpha, W, m, L, k, &beta, C, m));
    /* Stream1 waiting "event1" set by stream 2 */
    CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));
    sdotdiv_cuda(m*k, B, C, W, stream1);
    /* Stream1 recording event "event2" for stream2 */
    CUDAERR(cudaEventRecord(event2, stream1));
    /* probably not needed, but kept for completeness ... */
    CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));
    CUBLASERR(cublasDestroy(handle1));
    CUBLASERR(cublasDestroy(handle2));
    CUDAERR(cudaStreamDestroy(stream1));
    CUDAERR(cudaStreamDestroy(stream2));
    CUDAERR(cudaEventDestroy(event1));
    CUDAERR(cudaEventDestroy(event2));
    CUDAERR(cudaFree(B));
    CUDAERR(cudaFree(C));
    CUDAERR(cudaFree(L));
    CUDAERR(cudaMalloc((void **)&B, m*k*sizeof(float)));
    CUDAERR(cudaMalloc((void **)&C, m*k*sizeof(float)));
    CUDAERR(cudaMalloc((void **)&L, k*k*sizeof(float)));
    CUDAERR(cudaStreamCreate(&stream1));
    CUDAERR(cudaStreamCreate(&stream2));
    CUBLASERR(cublasCreate(&handle1));
    CUBLASERR(cublasCreate(&handle2));
    CUDAERR(cudaEventCreateWithFlags(&event1, cudaEventDisableTiming));
    CUDAERR(cudaEventCreateWithFlags(&event2, cudaEventDisableTiming));
    CUBLASERR(cublasSetStream(handle1, stream1));
    CUBLASERR(cublasSetStream(handle2, stream2));
    /* Why streams? Why this approach of streams? See dmlsa_cuda */
    /* Stream1 recording event "event2" for stream2 (for the 1st time) */
    CUDAERR(cudaEventRecord(event2, stream1));
    /* ************************** Phase 2 **************************** */
    /* Stream2 waiting "event2" set by stream1 */
    CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));
    CUBLASERR(cublasSgemm(handle2, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, A, m, H, k, &beta, B, m));
    /* Stream2 recording event "event1" for stream1 */
    CUDAERR(cudaEventRecord(event1, stream2));
    CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, k, k, n, &alpha, H, k, H, k, &beta, L, k));
    CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, k, k, &alpha, W, m, L, k, &beta, C, m));
    /* Stream1 waiting "event1" set by stream 2 */
    CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));
    sdotdiv_cuda(m*k, B, C, W, stream1);
    /* Stream1 recording event "event2" for stream2 */
    CUDAERR(cudaEventRecord(event2, stream1));
    /* probably not needed, but kept for completeness ... */
    CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));
    CUBLASERR(cublasDestroy(handle1));
    CUBLASERR(cublasDestroy(handle2));
    CUDAERR(cudaStreamDestroy(stream1));
    CUDAERR(cudaStreamDestroy(stream2));
    CUDAERR(cudaEventDestroy(event1));
    CUDAERR(cudaEventDestroy(event2));
    CUDAERR(cudaFree(B));
    CUDAERR(cudaFree(C));
    CUDAERR(cudaFree(L));
    CUDAERR(cudaMalloc((void **)&B, n*k*sizeof(float)));
    CUDAERR(cudaMalloc((void **)&C, n*k*sizeof(float)));
    CUDAERR(cudaMalloc((void **)&L, k*k*sizeof(float)));
    CUDAERR(cudaStreamCreate(&stream1));
    CUDAERR(cudaStreamCreate(&stream2));
    CUBLASERR(cublasCreate(&handle1));
    CUBLASERR(cublasCreate(&handle2));
    CUDAERR(cudaEventCreateWithFlags(&event1, cudaEventDisableTiming));
    CUDAERR(cudaEventCreateWithFlags(&event2, cudaEventDisableTiming));
    CUBLASERR(cublasSetStream(handle1, stream1));
    CUBLASERR(cublasSetStream(handle2, stream2));
    /* Why streams? Why this approach of streams? See dmlsa_cuda */
    /* Stream1 recording event "event2" for stream2 (for the 1st time) */
    CUDAERR(cudaEventRecord(event2, stream1));
    /* ************************** Phase 1 **************************** */
    /* Stream2 waiting "event2" set by stream1. 1st time is always ok */
    CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));
    CUBLASERR(cublasSgemm(handle2, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, A, m, &beta, B, k));
    /* Stream2 recording event "event1" for stream1 */
    CUDAERR(cudaEventRecord(event1, stream2));
    CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, k, m, &alpha, W, m, W, m, &beta, L, k));
    CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, k, n, k, &alpha, L, k, H, k, &beta, C, k));
    /* Stream1 waiting "event1" set by stream2 */
    CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));
    sdotdiv_cuda(k*n, B, C, H, stream1);
    /* Stream1 recording event "event2" for stream2 */
    CUDAERR(cudaEventRecord(event2, stream1));
    /* probably not needed, but kept for completeness ... */
    CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));
    CUBLASERR(cublasDestroy(handle1));
    CUBLASERR(cublasDestroy(handle2));
    CUDAERR(cudaStreamDestroy(stream1));
    CUDAERR(cudaStreamDestroy(stream2));
    CUDAERR(cudaEventDestroy(event1));
    CUDAERR(cudaEventDestroy(event2));
    CUDAERR(cudaFree(B));
    CUDAERR(cudaFree(C));
    CUDAERR(cudaFree(L));
 * \fn void ddotdiv_cuda(const int n, const double *x, const double *y, double *z, cudaStream_t stream)
 * \brief It launches the kernel vddotdiv_cuda, which performs z[i]=z[i]*x[i]/y[i]
 * \param n: (input) Number of elements of x, y and z
 * \param x: (input) Double precision input vector/matrix (1D column-major)
 * \param y: (input) Double precision input vector/matrix (1D column-major)
 * \param z: (inout) Double precision input/output vector/matrix (1D column-major)
 * \param stream: (input) CUDA stream on which the kernel is launched
void ddotdiv_cuda(const int n, const double *x, const double *y, double *z, cudaStream_t stream)
    dim3 dimGrid, dimBlock;
#if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
    dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
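    /* one thread per element: ceil(n/dimBlock.x) blocks, launched on the caller's stream */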
    vddotdiv_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, x, y, z);
    cudaDeviceSynchronize();
 * \fn void sdotdiv_cuda(const int n, const float *x, const float *y, float *z, cudaStream_t stream)
 * \brief It launches the kernel vsdotdiv_cuda, which performs z[i]=z[i]*x[i]/y[i]
 * \param n: (input) Number of elements of x, y and z
 * \param x: (input) Single precision input vector/matrix (1D column-major)
 * \param y: (input) Single precision input vector/matrix (1D column-major)
 * \param z: (inout) Single precision input/output vector/matrix (1D column-major)
 * \param stream: (input) CUDA stream on which the kernel is launched
void sdotdiv_cuda(const int n, const float *x, const float *y, float *z, cudaStream_t stream)
    dim3 dimGrid, dimBlock;
#if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
    dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
    vsdotdiv_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, x, y, z);
    cudaDeviceSynchronize();
 * \fn __global__ void vddotdiv_cuda(const int n, const double* __restrict__ x, const double* __restrict__ y, double *z)
 * \brief This kernel computes double precision z[i]=z[i]*x[i]/y[i]
 * \param n: (input) Number of elements of x, y and z
 * \param x: (input) Double precision input vector/matrix (1D column-major)
 * \param y: (input) Double precision input vector/matrix (1D column-major)
 * \param z: (inout) Double precision input/output vector/matrix (1D column-major)
__global__ void vddotdiv_cuda(const int n, const double* __restrict__ x, const double* __restrict__ y, double *z)
    unsigned int pos = blockDim.x * blockIdx.x + threadIdx.x;
    z[pos] = z[pos] * x[pos] / y[pos];
    assert(!fpe(z[pos]));
    z[pos] = z[pos] * x[pos] / y[pos];
 * \fn __global__ void vsdotdiv_cuda(const int n, const float* __restrict__ x, const float* __restrict__ y, float *z)
 * \brief This kernel computes single precision z[i]=z[i]*x[i]/y[i]
 * \param n: (input) Number of elements of x, y and z
 * \param x: (input) Single precision input vector/matrix (1D column-major)
 * \param y: (input) Single precision input vector/matrix (1D column-major)
 * \param z: (inout) Single precision input/output vector/matrix (1D column-major)
__global__ void vsdotdiv_cuda(const int n, const float* __restrict__ x, const float* __restrict__ y, float *z)
    unsigned int pos = blockDim.x * blockIdx.x + threadIdx.x;
    /* NaN or Inf can appear here if y[pos] is 0 (NaN when x[pos] or z[pos] is also 0) */
    z[pos] = z[pos] * x[pos] / y[pos];
    assert(!fpe(z[pos]));
    z[pos] = z[pos] * x[pos] / y[pos];