1 /***************************************************************************
2 * Copyright (C) 2014 by PIR (University of Oviedo) and *
3 * INCO2 (Polytechnic University of Valencia) groups. *
6 * This program is free software; you can redistribute it and/or modify *
7 * it under the terms of the GNU General Public License as published by *
8 * the Free Software Foundation; either version 2 of the License, or *
9 * (at your option) any later version. *
11 * This program is distributed in the hope that it will be useful, *
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14 * GNU General Public License for more details. *
16 * You should have received a copy of the GNU General Public License *
17 * along with this program; if not, write to the *
18 * Free Software Foundation, Inc., *
19 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
20 ***************************************************************************
24 * \brief File with functions to compute NNMF using beta-divergence methods for GPUs
25 * \author Information Retrieval and Parallel Computing Group (IRPCG)
26 * \author University of Oviedo, Spain
27 * \author Interdisciplinary Computation and Communication Group (INCO2)
28 * \author Universitat Politecnica de Valencia, Spain.
29 * \author Contact: nnmfpack@gmail.com
32 #include "bdiv_cuda.h"
36 * \fn int dbdivg_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const double expo, const int uType, const int nIter)
37 * \brief dbdivg_cuda performs the NNMF using beta-divergence when beta is != 1 and !=2, using double precision.
39 * The algorithm is<BR>
40 * repeat nIter times<BR>
41 * STEP 1<BR>
42 * L=W*H<BR>
43 * L1=L(.^)(beta-2)<BR>
44 * L2=L1(.*)A<BR>
45 * L1=L1(.*)L<BR>
46 * B=W'*L2<BR>
47 * C=W'*L1<BR>
48 * H=H(.*)B(./)C<BR>
50 * STEP 2<BR>
51 * L=W*H<BR>
52 * L1=L(.^)(beta-2)<BR>
53 * L2=L1(.*)A<BR>
54 * L1=L1(.*)L<BR>
55 * D=L2*H'<BR>
56 * E=L1*H'<BR>
57 * W=W(.*)D(./)E<BR>
58 * end repeat<BR>
61 * In real life L1 and L2 are (m*n) matrices used in STEPs 1 and 2. For performance
62 * reasons only one 1D column-major buffer, named R in the code below, of size 2*m*n is used.
63 * In STEP 1, the first part of R (m*n positions) is L2 and the second part is L1.
64 * In STEP 2, the first column of R (2*m positions) is composed of the first column of L2
65 * followed by the first column of L1. The second column of R (2*m positions) is composed of the
66 * second column of L2 followed by the second column of L1. The 3rd column of R ... and so on.
68 * In real life B and C are (k*n) matrices used in STEP 1, and D and E are (m*k)
69 * matrices used in STEP 2. B/C and D/E are independent. However we do not have L1
70 * and L2, we have R, and we can do B=W'*L2 and C=W'*L1 (or D=L2*H' and E=L1*H') at
71 * the same time. For this reason only one matrix is declared to save space. This is
72 * matrix M with size 2*max(m,n)*k
74 * \param m: (input) Number of rows of matrix A and matrix W
75 * \param n: (input) Number of columns of matrix A and matrix H
76 * \param k: (input) Number of columns of matrix W and rows of H
77 * \param A: (input) Double precision input matrix of (m * n) number of elements stored using 1D column-major
78 * \param W: (inout) Double precision input/output matrix of (m * k) number of elements stored using 1D column-major
79 * \param H: (inout) Double precision input/output matrix of (k * n) number of elements stored using 1D column-major
80 * \param expo: (input) Double precision value. The exponent beta of betadivergence method
81 * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only H is updated) or UpdateH (only W is updated)
82 * \param nIter: (input) Number of iterations
84 * It returns 0 if all is OK.
86 int dbdivg_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const double expo, const int uType, const int nIter)
/* NOTE(review): this listing has interior lines elided (opening brace,
   local declarations, the uType selection and the nIter loop headers are
   not visible). Only comments were added below; every visible statement
   is byte-identical to the original. */
101 CUDAERR(cudaGetDevice(&devID));
/* --- Case: both H (Phase 1) and W (Phase 2) are updated each pass. --- */
/* Workspaces: M holds the packed pair B|C (or D|E), L holds W*H, and R
   holds the packed pair L2|L1 (layouts described in the header comment) */
106 CUDAERR(cudaMalloc((void **)&M, 2*max(m, n) * k * sizeof(double)));
107 CUDAERR(cudaMalloc((void **)&L, m * n * sizeof(double)));
108 CUDAERR(cudaMalloc((void **)&R, 2*m * n * sizeof(double)));
110 CUBLASERR(cublasCreate(&handle));
114 /* ************************ Phase 1 *************************** */
/* L=W*H (cuBLAS is column-major, as are all matrices here) */
116 CUBLASERR(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
118 /* L1=L(.^)(expo-2) */
/* dkernelH_cuda writes the packed pair |L2 | L1| into R (see header) */
121 dkernelH_cuda(m, n, L, A, R, expo-2.0, 0);
125 /* above is equal to R=W'*|L2 | L1| */
126 CUBLASERR(cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, k, 2*n, m, &alpha, W, m, R, m, &beta, M, k));
128 /* H=H(.*)B(./)C. Note that matrices B and C are stored together in matrix M*/
129 dupdate1H_cuda(k*n, M, H, 0);
132 /* ************************ Phase 2 *************************** */
134 CUBLASERR(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
136 /* L1=L(.^)(expo-2) */
/* dkernelW_cuda writes R with columns of L2 stacked over columns of L1 */
139 dkernelW_cuda(m, n, L, A, R, expo-2.0, 0);
144 /* above is equal to R=| | * H' */
146 CUBLASERR(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, 2*m, k, n, &alpha, R, 2*m, H, k, &beta, M, 2*m));
/* W=W(.*)D(./)E. Matrices D and E are stored together in M (see header) */
149 dupdate1W_cuda(m, k, M, W, 0);
151 CUBLASERR(cublasDestroy(handle));
153 CUDAERR(cudaFree(M));
154 CUDAERR(cudaFree(L));
155 CUDAERR(cudaFree(R));
/* --- Case: only W is updated (Phase 2 only); M only needs 2*m*k --- */
159 CUDAERR(cudaMalloc((void **)&M, 2*m * k * sizeof(double)));
160 CUDAERR(cudaMalloc((void **)&L, m * n * sizeof(double)));
161 CUDAERR(cudaMalloc((void **)&R, 2*m * n * sizeof(double)));
163 CUBLASERR(cublasCreate(&handle));
167 /* ************************ Phase 2 *************************** */
169 CUBLASERR(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
171 /* L1=L(.^)(expo-2) */
174 dkernelW_cuda(m, n, L, A, R, expo-2.0, 0);
179 /* above is equal to R=| | * H' */
181 CUBLASERR(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, 2*m, k, n, &alpha, R, 2*m, H, k, &beta, M, 2*m));
/* W=W(.*)D(./)E. Matrices D and E are stored together in M (see header) */
184 dupdate1W_cuda(m, k, M, W, 0);
186 CUBLASERR(cublasDestroy(handle));
188 CUDAERR(cudaFree(M));
189 CUDAERR(cudaFree(L));
190 CUDAERR(cudaFree(R));
/* --- Case: only H is updated (Phase 1 only); M only needs 2*n*k --- */
194 CUDAERR(cudaMalloc((void **)&M, 2*n * k * sizeof(double)));
195 CUDAERR(cudaMalloc((void **)&L, m * n * sizeof(double)));
196 CUDAERR(cudaMalloc((void **)&R, 2*m * n * sizeof(double)));
198 CUBLASERR(cublasCreate(&handle));
202 /* ************************ Phase 1 *************************** */
204 CUBLASERR(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
206 /* L1=L(.^)(expo-2) */
209 dkernelH_cuda(m, n, L, A, R, expo-2.0, 0);
213 /* above is equal to R=W'*|L2 | L1| */
214 CUBLASERR(cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, k, 2*n, m, &alpha, W, m, R, m, &beta, M, k));
216 /* H=H(.*)B(./)C. Note that matrices B and C are stored together in matrix M*/
217 dupdate1H_cuda(k*n, M, H, 0);
219 CUBLASERR(cublasDestroy(handle));
221 CUDAERR(cudaFree(M));
222 CUDAERR(cudaFree(L));
223 CUDAERR(cudaFree(R));
233 * \fn int sbdivg_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const float expo, const int uType, const int nIter)
234 * \brief sbdivg_cuda performs NNMF using beta-divergence when beta is != 1 and !=2, using simple precision
235 * See description of dbdivg_cuda for more info
236 * \param m: (input) Number of rows of matrix A and matrix W
237 * \param n: (input) Number of columns of matrix A and matrix H
238 * \param k: (input) Number of columns of matrix W and rows of H
239 * \param A: (input) Simple precision input matrix of (m * n) number of elements stored using 1D column-major
240 * \param W: (inout) Simple precision input/output matrix of (m * k) number of elements stored using 1D column-major
241 * \param H: (inout) Simple precision input/output matrix of (k * n) number of elements stored using 1D column-major
242 * \param expo: (input) Simple precision value. The exponent beta of betadivergence method
243 * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only H is updated) or UpdateH (only W is updated)
244 * \param nIter: (input) Number of iterations
246 * It returns 0 if all is OK.
248 int sbdivg_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const float expo, const int uType, const int nIter)
/* NOTE(review): single-precision twin of dbdivg_cuda. Interior lines are
   elided in this listing (brace, declarations, uType/nIter control flow);
   only comments were added, all visible statements are unchanged. */
263 CUDAERR(cudaGetDevice(&devID));
/* --- Case: both H (Phase 1) and W (Phase 2) are updated each pass --- */
268 CUDAERR(cudaMalloc((void **)&M, 2*max(m, n) * k * sizeof(float)));
269 CUDAERR(cudaMalloc((void **)&L, m * n * sizeof(float)));
270 CUDAERR(cudaMalloc((void **)&R, 2*m * n * sizeof(float)));
272 CUBLASERR(cublasCreate(&handle));
276 /* ************************ Phase 1 *************************** */
/* L=W*H (cuBLAS is column-major, as are all matrices here) */
278 CUBLASERR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
280 /* L1=L(.^)(expo-2) */
/* skernelH_cuda writes the packed pair |L2 | L1| into R (see dbdivg_cuda) */
283 skernelH_cuda(m, n, L, A, R, expo-2.0f, 0);
287 /* above is equal to R=W'*|L2 | L1| */
288 CUBLASERR(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, k, 2*n, m, &alpha, W, m, R, m, &beta, M, k));
/* H=H(.*)B(./)C. Matrices B and C are stored together in matrix M */
291 supdate1H_cuda(k*n, M, H, 0);
294 /* ************************ Phase 2 *************************** */
296 CUBLASERR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
298 /* L1=L(.^)(expo-2) */
301 skernelW_cuda(m, n, L, A, R, expo-2.0f, 0);
306 /* above is equal to R=| | * H' */
308 CUBLASERR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, 2*m, k, n, &alpha, R, 2*m, H, k, &beta, M, 2*m));
/* W=W(.*)D(./)E. Matrices D and E are stored together in M */
311 supdate1W_cuda(m, k, M, W, 0);
313 CUBLASERR(cublasDestroy(handle));
315 CUDAERR(cudaFree(M));
316 CUDAERR(cudaFree(L));
317 CUDAERR(cudaFree(R));
/* --- Case: only W is updated (Phase 2 only); M only needs 2*m*k --- */
321 CUDAERR(cudaMalloc((void **)&M, 2*m * k * sizeof(float)));
322 CUDAERR(cudaMalloc((void **)&L, m * n * sizeof(float)));
323 CUDAERR(cudaMalloc((void **)&R, 2*m * n * sizeof(float)));
325 CUBLASERR(cublasCreate(&handle));
329 /* ************************ Phase 2 *************************** */
331 CUBLASERR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
333 /* L1=L(.^)(expo-2) */
336 skernelW_cuda(m, n, L, A, R, expo-2.0f, 0);
341 /* above is equal to R=| | * H' */
343 CUBLASERR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, 2*m, k, n, &alpha, R, 2*m, H, k, &beta, M, 2*m));
/* W=W(.*)D(./)E. Matrices D and E are stored together in M */
346 supdate1W_cuda(m, k, M, W, 0);
348 CUBLASERR(cublasDestroy(handle));
350 CUDAERR(cudaFree(M));
351 CUDAERR(cudaFree(L));
352 CUDAERR(cudaFree(R));
/* --- Case: only H is updated (Phase 1 only); M only needs 2*n*k --- */
356 CUDAERR(cudaMalloc((void **)&M, 2*n * k * sizeof(float)));
357 CUDAERR(cudaMalloc((void **)&L, m * n * sizeof(float)));
358 CUDAERR(cudaMalloc((void **)&R, 2*m * n * sizeof(float)));
360 CUBLASERR(cublasCreate(&handle));
364 /* ************************ Phase 1 *************************** */
366 CUBLASERR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
368 /* L1=L(.^)(expo-2) */
371 skernelH_cuda(m, n, L, A, R, expo-2.0f, 0);
375 /* above is equal to R=W'*|L2 | L1| */
376 CUBLASERR(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, k, 2*n, m, &alpha, W, m, R, m, &beta, M, k));
/* H=H(.*)B(./)C. Matrices B and C are stored together in matrix M */
379 supdate1H_cuda(k*n, M, H, 0);
381 CUBLASERR(cublasDestroy(handle));
383 CUDAERR(cudaFree(M));
384 CUDAERR(cudaFree(L));
385 CUDAERR(cudaFree(R));
396 * \fn int dbdivone_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
397 * \brief dbdivone_cuda performs NNMF using beta-divergence when beta=1, using double precision.
399 * The algorithm is<BR>
400 * repeat nIter times<BR>
401 * STEP 1<BR>
402 * y(i)=sum(W(j,i) for all j in range) for all i in range<BR>
403 * L=W*H<BR>
404 * L=A(./)L<BR>
405 * B=W'*L<BR>
406 * B(i,j)=B(i,j) / y(i) for all B elements<BR>
407 * H=H(.*)B<BR>
409 * STEP 2<BR>
410 * y(i)=sum(H(i,j) for all j in range) for all i in range<BR>
411 * L=W*H<BR>
412 * L=A(./)L <BR>
413 * D=L*H'<BR>
414 * D(i,j)=D(i,j) / y(j) for all D elements<BR>
415 * W=W(.*)D<BR>
416 * end repeat<BR>
419 * In real life B is a (k*n) matrix used in STEP 1, and D is a (m*k)
420 * matrix used in STEP 2. B and D are independent. For this reason only 1 matrix
421 * of size max(m,n)*k is declared/used
423 * \param m: (input) Number of rows of matrix A and matrix W
424 * \param n: (input) Number of columns of matrix A and matrix H
425 * \param k: (input) Number of columns of matrix W and rows of H
426 * \param A: (input) Double precision input matrix of (m * n) number of elements stored using 1D column-major
427 * \param W: (inout) Double precision input/output matrix of (m * k) number of elements stored using 1D column-major
428 * \param H: (inout) Double precision input/output matrix of (k * n) number of elements stored using 1D column-major
429 * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only H is updated) or UpdateH (only W is updated)
430 * \param nIter: (input) Number of iterations
432 * It returns 0 if all is OK.
434 int dbdivone_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
/* NOTE(review): this listing has interior lines elided (opening brace,
   local declarations, the uType selection and the nIter loop headers are
   not visible). Only comments were added/typo-fixed below; every visible
   statement is byte-identical to the original. Two streams overlap the
   cublasDgemv column/row sums (stream2) with the gemm/update chain
   (stream1), ordered via "event". */
458 CUDAERR(cudaGetDevice(&devID));
/* --- Case: W and H are both updated (Phase 1 then Phase 2 per pass) --- */
463 CUDAERR(cudaMalloc((void **)&B, k*max(m,n)*sizeof(double)));
464 CUDAERR(cudaMalloc((void **)&L, m*n*sizeof(double)));
466 /* These two vectors (x and y) are used both in Phase 1 and Phase 2. With */
467 /* the strategy used with matrices B and D the size of x is max(m,n) */
468 CUDAERR(cudaMalloc((void **)&x, max(m,n)*sizeof(double)));
469 CUDAERR(cudaMalloc((void **)&y, k*sizeof(double)));
471 CUDAERR(cudaStreamCreate(&stream1));
472 CUDAERR(cudaStreamCreate(&stream2));
474 CUBLASERR(cublasCreate(&handle1));
475 CUBLASERR(cublasCreate(&handle2));
477 CUDAERR(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
479 CUBLASERR(cublasSetStream(handle1, stream1));
480 CUBLASERR(cublasSetStream(handle2, stream2));
482 /* Why streams? Because we use them before */
483 /* For small sizes without streams is fast (n=m <= 500) */
484 /* For medium sizes with streams is fast (500 <= n=m <= 1000) */
485 /* For other sizes streams is less (aprox. =) than without streams */
486 /* Why this approach? */
487 /* Maybe this is the best way to use less calls to cudaStreamWait */
488 /* due to one cublasDgemv is faster than two cublasDgemm plus one */
489 /* kernel. Call latter cublasDgemv perhaps it is ok but... */
491 /* Stream 1 starts filling x with ones (x[i]=1.0 for all i) */
492 dmemset_cuda(max(m,n), x, 1.0, stream1);
494 /* Stream1 recording event "event" for stream2 (for the 1st time) */
495 CUDAERR(cudaEventRecord(event, stream1));
499 /* ************************ Phase 1 *************************** */
500 /* Stream2 waiting "event" set by stream1. 1st time is always ok */
501 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
503 /* Calculate the sums of all W columns via dgemv(W, x) */
504 CUBLASERR(cublasDgemv(handle2, CUBLAS_OP_T, m, k, &alpha, W, m, x, 1, &beta, y, 1));
506 /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
507 /* CUDAERR(cudaEventRecord(event1, stream2)); */
/* L=W*H on stream1, concurrently with the dgemv above on stream2 */
510 CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
/* L=A(./)L (element-wise) */
513 ddiv_cuda(m*n, A, L, stream1);
/* B=W'*L */
516 CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, L, m, &beta, B, k));
518 /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
519 /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
521 /* B(i,j)=B(i,j) / y(i) for all B elements */
/* ... followed by H=H(.*)B (see header algorithm) */
523 dupdate2H_cuda(k, n, y, B, H, stream1);
525 /* Stream1 recording event "event" for stream2 */
526 CUDAERR(cudaEventRecord(event, stream1));
529 /* ************************** Phase 2 **************************** */
530 /* Stream2 waiting "event" set by stream1 */
531 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
533 /* Calculate the sums of all H rows via dgemv(H, x) is performed */
534 CUBLASERR(cublasDgemv(handle2, CUBLAS_OP_N, k, n, &alpha, H, k, x, 1, &beta, y, 1));
536 /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
537 /* CUDAERR(cudaEventRecord(event1, stream2)); */
540 CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
543 ddiv_cuda(m*n, A, L, stream1);
/* D=L*H' (stored in buffer B, which is sized for both phases) */
546 CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, L, m, H, k, &beta, B, m));
548 /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
549 /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
551 /* B(i,j)=B(i,j)/y(j) for all B elements */
553 dupdate2W_cuda(m, k, y, B, W, stream1);
555 /* Stream1 recording event "event2" for stream2 */
556 CUDAERR(cudaEventRecord(event, stream1));
558 /* maybe not needed but by completeness ... */
559 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
561 CUBLASERR(cublasDestroy(handle1));
562 CUBLASERR(cublasDestroy(handle2));
564 CUDAERR(cudaStreamDestroy(stream1));
565 CUDAERR(cudaStreamDestroy(stream2));
567 CUDAERR(cudaEventDestroy(event));
569 CUDAERR(cudaFree(B));
570 CUDAERR(cudaFree(L));
571 CUDAERR(cudaFree(x));
572 CUDAERR(cudaFree(y));
/* --- Case: only W is updated (Phase 2 only); x only needs n entries --- */
576 CUDAERR(cudaMalloc((void **)&B, m*k*sizeof(double)));
577 CUDAERR(cudaMalloc((void **)&L, m*n*sizeof(double)));
578 CUDAERR(cudaMalloc((void **)&x, n*sizeof(double)));
579 CUDAERR(cudaMalloc((void **)&y, k*sizeof(double)));
581 CUDAERR(cudaStreamCreate(&stream1));
582 CUDAERR(cudaStreamCreate(&stream2));
584 CUBLASERR(cublasCreate(&handle1));
585 CUBLASERR(cublasCreate(&handle2));
587 CUDAERR(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
589 CUBLASERR(cublasSetStream(handle1, stream1));
590 CUBLASERR(cublasSetStream(handle2, stream2));
592 /* Stream 1 starts filling x with ones (x[i]=1.0 for all i) */
593 dmemset_cuda(max(m,n), x, 1.0, stream1);
595 /* Stream1 recording event "event" for stream2 (for the 1st time) */
596 CUDAERR(cudaEventRecord(event, stream1));
600 /* ************************** Phase 2 **************************** */
601 /* Stream2 waiting "event" set by stream1 */
602 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
604 /* Calculate the sums of all H rows via dgemv(H, x) is performed */
605 CUBLASERR(cublasDgemv(handle2, CUBLAS_OP_N, k, n, &alpha, H, k, x, 1, &beta, y, 1));
607 /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
608 /* CUDAERR(cudaEventRecord(event1, stream2)); */
611 CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
614 ddiv_cuda(m*n, A, L, stream1);
617 CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, L, m, H, k, &beta, B, m));
619 /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
620 /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
622 /* B(i,j)=B(i,j)/y(j) for all B elements */
624 dupdate2W_cuda(m, k, y, B, W, stream1);
626 /* Stream1 recording event "event2" for stream2 */
627 CUDAERR(cudaEventRecord(event, stream1));
629 /* maybe not needed but by completeness ... */
630 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
632 CUBLASERR(cublasDestroy(handle1));
633 CUBLASERR(cublasDestroy(handle2));
635 CUDAERR(cudaStreamDestroy(stream1));
636 CUDAERR(cudaStreamDestroy(stream2));
638 CUDAERR(cudaEventDestroy(event));
640 CUDAERR(cudaFree(B));
641 CUDAERR(cudaFree(L));
642 CUDAERR(cudaFree(x));
643 CUDAERR(cudaFree(y));
/* --- Case: only H is updated (Phase 1 only); x only needs m entries --- */
647 CUDAERR(cudaMalloc((void **)&B, n*k*sizeof(double)));
648 CUDAERR(cudaMalloc((void **)&L, m*n*sizeof(double)));
649 CUDAERR(cudaMalloc((void **)&x, m*sizeof(double)));
650 CUDAERR(cudaMalloc((void **)&y, k*sizeof(double)));
652 CUDAERR(cudaStreamCreate(&stream1));
653 CUDAERR(cudaStreamCreate(&stream2));
655 CUBLASERR(cublasCreate(&handle1));
656 CUBLASERR(cublasCreate(&handle2));
658 CUDAERR(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
660 CUBLASERR(cublasSetStream(handle1, stream1));
661 CUBLASERR(cublasSetStream(handle2, stream2));
663 /* Stream 1 starts filling x with ones (x[i]=1.0 for all i) */
664 dmemset_cuda(max(m,n), x, 1.0, stream1);
666 /* Stream1 recording event "event" for stream2 (for the 1st time) */
667 CUDAERR(cudaEventRecord(event, stream1));
671 /* ************************ Phase 1 *************************** */
672 /* Stream2 waiting "event" set by stream1. 1st time is always ok */
673 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
675 /* Calculate the sums of all W columns via dgemv(W, x) */
676 CUBLASERR(cublasDgemv(handle2, CUBLAS_OP_T, m, k, &alpha, W, m, x, 1, &beta, y, 1));
678 /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
679 /* CUDAERR(cudaEventRecord(event1, stream2)); */
682 CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
685 ddiv_cuda(m*n, A, L, stream1);
688 CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, L, m, &beta, B, k));
690 /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
691 /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
693 /* B(i,j)=B(i,j) / y(i) for all B elements */
695 dupdate2H_cuda(k, n, y, B, H, stream1);
697 /* Stream1 recording event "event" for stream2 */
698 CUDAERR(cudaEventRecord(event, stream1));
700 /* maybe not needed but by completeness ... */
701 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
703 CUBLASERR(cublasDestroy(handle1));
704 CUBLASERR(cublasDestroy(handle2));
706 CUDAERR(cudaStreamDestroy(stream1));
707 CUDAERR(cudaStreamDestroy(stream2));
709 CUDAERR(cudaEventDestroy(event));
711 CUDAERR(cudaFree(B));
712 CUDAERR(cudaFree(L));
713 CUDAERR(cudaFree(x));
714 CUDAERR(cudaFree(y));
725 * \fn int sbdivone_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
726 * \brief sbdivone_cuda performs NNMF using betadivergence when beta=1 using simple precision
727 * See description of dbdivone_cuda for more info
728 * \param m: (input) Number of rows of matrix A and matrix W
729 * \param n: (input) Number of columns of matrix A and matrix H
730 * \param k: (input) Number of columns of matrix W and rows of H
731 * \param A: (input) Simple precision input matrix of (m * n) number of elements stored using 1D column-major
732 * \param W: (inout) Simple precision input/output matrix of (m * k) number of elements stored using 1D column-major
733 * \param H: (inout) Simple precision input/output matrix of (k * n) number of elements stored using 1D column-major
734 * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only H is updated) or UpdateH (only W is updated)
735 * \param nIter: (input) Number of iterations
737 * It returns 0 if all is OK.
739 int sbdivone_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
/* NOTE(review): single-precision twin of dbdivone_cuda; same elisions
   apply to this listing (brace, declarations, uType/nIter control flow).
   Only comments were added/typo-fixed; all visible statements are
   unchanged. See dbdivone_cuda for the stream/event rationale. */
763 CUDAERR(cudaGetDevice(&devID));
/* --- Case: W and H are both updated (Phase 1 then Phase 2 per pass) --- */
768 CUDAERR(cudaMalloc((void **)&B, k*max(m,n)*sizeof(float)));
769 CUDAERR(cudaMalloc((void **)&L, m*n*sizeof(float)));
771 /* These two vectors (x and y) are used both in Step 1 and Step 2. With */
772 /* the strategy used with matrices B and D the size of x is max(m,n) */
773 CUDAERR(cudaMalloc((void **)&x, max(m,n)*sizeof(float)));
774 CUDAERR(cudaMalloc((void **)&y, k*sizeof(float)));
776 CUDAERR(cudaStreamCreate(&stream1));
777 CUDAERR(cudaStreamCreate(&stream2));
779 CUBLASERR(cublasCreate(&handle1));
780 CUBLASERR(cublasCreate(&handle2));
782 CUDAERR(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
784 CUBLASERR(cublasSetStream(handle1, stream1));
785 CUBLASERR(cublasSetStream(handle2, stream2));
787 /* Stream 1 starts filling x with ones (x[i]=1.0 for all i) */
788 smemset_cuda(max(m,n), x, 1.0f, stream1);
790 /* Stream1 recording event "event" for stream2 (for the 1st time) */
791 CUDAERR(cudaEventRecord(event, stream1));
795 /* ************************ Phase 1 *************************** */
796 /* Stream2 waiting "event" set by stream1. 1st time is always ok */
797 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
799 /* Calculate the sums of all W columns via dgemv(W, x) */
800 CUBLASERR(cublasSgemv(handle2, CUBLAS_OP_T, m, k, &alpha, W, m, x, 1, &beta, y, 1));
802 /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
803 /* CUDAERR(cudaEventRecord(event1, stream2)); */
/* L=W*H on stream1, concurrently with the sgemv above on stream2 */
806 CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
/* L=A(./)L (element-wise) */
809 sdiv_cuda(m*n, A, L, stream1);
/* B=W'*L */
812 CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, L, m, &beta, B, k));
814 /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
815 /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
817 /* B(i,j)=B(i,j)/y(i) for all B elements */
819 supdate2H_cuda(k, n, y, B, H, stream1);
821 /* Stream1 recording event "event" for stream2 */
822 CUDAERR(cudaEventRecord(event, stream1));
825 /* ************************** Phase 2 **************************** */
826 /* Stream2 waiting "event" set by stream1 */
827 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
829 /* Calculate the sums of all H rows via dgemv(H, x) is performed */
830 CUBLASERR(cublasSgemv(handle2, CUBLAS_OP_N, k, n, &alpha, H, k, x, 1, &beta, y, 1));
832 /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
833 /* CUDAERR(cudaEventRecord(event1, stream2)); */
836 CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
839 sdiv_cuda(m*n, A, L, stream1);
/* D=L*H' (stored in buffer B, which is sized for both phases) */
842 CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, L, m, H, k, &beta, B, m));
844 /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
845 /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
847 /* B(i,j)=B(i,j)/y(j) for all B elements */
849 supdate2W_cuda(m, k, y, B, W, stream1);
851 /* Stream1 recording event "event2" for stream2 */
852 CUDAERR(cudaEventRecord(event, stream1));
854 /* maybe not needed but by completeness ... */
855 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
857 CUBLASERR(cublasDestroy(handle1));
858 CUBLASERR(cublasDestroy(handle2));
860 CUDAERR(cudaStreamDestroy(stream1));
861 CUDAERR(cudaStreamDestroy(stream2));
863 CUDAERR(cudaEventDestroy(event));
865 CUDAERR(cudaFree(B));
866 CUDAERR(cudaFree(L));
867 CUDAERR(cudaFree(x));
868 CUDAERR(cudaFree(y));
/* --- Case: only W is updated (Phase 2 only); x only needs n entries --- */
872 CUDAERR(cudaMalloc((void **)&B, m*k*sizeof(float)));
873 CUDAERR(cudaMalloc((void **)&L, m*n*sizeof(float)));
874 CUDAERR(cudaMalloc((void **)&x, n*sizeof(float)));
875 CUDAERR(cudaMalloc((void **)&y, k*sizeof(float)));
877 CUDAERR(cudaStreamCreate(&stream1));
878 CUDAERR(cudaStreamCreate(&stream2));
880 CUBLASERR(cublasCreate(&handle1));
881 CUBLASERR(cublasCreate(&handle2));
883 CUDAERR(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
885 CUBLASERR(cublasSetStream(handle1, stream1));
886 CUBLASERR(cublasSetStream(handle2, stream2));
888 /* Stream 1 starts filling x with ones (x[i]=1.0 for all i) */
889 smemset_cuda(max(m,n), x, 1.0f, stream1);
891 /* Stream1 recording event "event" for stream2 (for the 1st time) */
892 CUDAERR(cudaEventRecord(event, stream1));
896 /* ************************** Phase 2 **************************** */
897 /* Stream2 waiting "event" set by stream1 */
898 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
900 /* Calculate the sums of all H rows via dgemv(H, x) is performed */
901 CUBLASERR(cublasSgemv(handle2, CUBLAS_OP_N, k, n, &alpha, H, k, x, 1, &beta, y, 1));
903 /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
904 /* CUDAERR(cudaEventRecord(event1, stream2)); */
907 CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
910 sdiv_cuda(m*n, A, L, stream1);
913 CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, L, m, H, k, &beta, B, m));
915 /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
916 /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
918 /* B(i,j)=B(i,j)/y(j) for all B elements */
920 supdate2W_cuda(m, k, y, B, W, stream1);
922 /* Stream1 recording event "event2" for stream2 */
923 CUDAERR(cudaEventRecord(event, stream1));
925 /* maybe not needed but by completeness ... */
926 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
928 CUBLASERR(cublasDestroy(handle1));
929 CUBLASERR(cublasDestroy(handle2));
931 CUDAERR(cudaStreamDestroy(stream1));
932 CUDAERR(cudaStreamDestroy(stream2));
934 CUDAERR(cudaEventDestroy(event));
936 CUDAERR(cudaFree(B));
937 CUDAERR(cudaFree(L));
938 CUDAERR(cudaFree(x));
939 CUDAERR(cudaFree(y));
/* --- Case: only H is updated (Phase 1 only); x only needs m entries --- */
943 CUDAERR(cudaMalloc((void **)&B, n*k*sizeof(float)));
944 CUDAERR(cudaMalloc((void **)&L, m*n*sizeof(float)));
945 CUDAERR(cudaMalloc((void **)&x, m*sizeof(float)));
946 CUDAERR(cudaMalloc((void **)&y, k*sizeof(float)));
948 CUDAERR(cudaStreamCreate(&stream1));
949 CUDAERR(cudaStreamCreate(&stream2));
951 CUBLASERR(cublasCreate(&handle1));
952 CUBLASERR(cublasCreate(&handle2));
954 CUDAERR(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
956 CUBLASERR(cublasSetStream(handle1, stream1));
957 CUBLASERR(cublasSetStream(handle2, stream2));
959 /* Stream 1 starts filling x with ones (x[i]=1.0 for all i) */
960 smemset_cuda(max(m,n), x, 1.0f, stream1);
962 /* Stream1 recording event "event" for stream2 (for the 1st time) */
963 CUDAERR(cudaEventRecord(event, stream1));
967 /* ************************ Phase 1 *************************** */
968 /* Stream2 waiting "event" set by stream1. 1st time is always ok */
969 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
971 /* Calculate the sums of all W columns via dgemv(W, x) */
972 CUBLASERR(cublasSgemv(handle2, CUBLAS_OP_T, m, k, &alpha, W, m, x, 1, &beta, y, 1));
974 /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
975 /* CUDAERR(cudaEventRecord(event1, stream2)); */
978 CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
981 sdiv_cuda(m*n, A, L, stream1);
984 CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, L, m, &beta, B, k));
986 /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
987 /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
989 /* B(i,j)=B(i,j)/y(i) for all B elements */
991 supdate2H_cuda(k, n, y, B, H, stream1);
993 /* Stream1 recording event "event" for stream2 */
994 CUDAERR(cudaEventRecord(event, stream1));
996 /* maybe not needed but by completeness ... */
997 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
999 CUBLASERR(cublasDestroy(handle1));
1000 CUBLASERR(cublasDestroy(handle2));
1002 CUDAERR(cudaStreamDestroy(stream1));
1003 CUDAERR(cudaStreamDestroy(stream2));
1005 CUDAERR(cudaEventDestroy(event));
1007 CUDAERR(cudaFree(B));
1008 CUDAERR(cudaFree(L));
1009 CUDAERR(cudaFree(x));
1010 CUDAERR(cudaFree(y));
1021 * \fn int dbdiv_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const double beta, const int uType, const int nIter)
1022 * \brief dbdiv_cuda is a wrapper that calls the adequate function to performs NNMF using betadivergence using double precision with GPUs
1023 * \param m: (input) Number of rows of matrix A and matrix W
1024 * \param n: (input) Number of columns of matrix A and matrix H
1025 * \param k: (input) Number of columns of matrix W and rows of H
1026 * \param A: (input) Double precision input matrix of (m * n) number of elements stored using 1D column-major
1027 * \param W: (inout) Double precision input/output matrix of (m * k) number of elements stored using 1D column-major
1028 * \param H: (inout) Double precision input/output matrix of (k * n) number of elements stored using 1D column-major
1029 * \param beta: (input) Double precision value. The parameter beta of betadivergence method
1030 * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only H is updated) or UpdateH (only W is updated)
1031 * \param nIter: (input) Number of iterations
1033 int dbdiv_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const double beta, const int uType, const int nIter)
/* NOTE(review): the opening brace, the error-return body of the
   validation test, and the closing brace are elided in this listing;
   the visible statements are unchanged. */
/* Reject invalid arguments: negative beta or a non-positive iteration count */
1035 if ((beta < 0.0) || (nIter <= 0))
/* (beta>=2.0 && beta<=2.0) is an exact-equality test on beta, presumably
   written this way to avoid float-equality compiler warnings — confirm */
1038 if (beta>=2.0 && beta<=2.0)
/* beta == 2: delegate to the specialized dmlsa_cuda update */
1039 return dmlsa_cuda(m, n, k, A, W, H, uType, nIter);
1042 if (beta>=1.0 && beta<=1.0)
/* beta == 1: delegate to the specialized dbdivone_cuda update */
1043 return dbdivone_cuda(m, n, k, A, W, H, uType, nIter);
/* any other beta: general beta-divergence update */
1045 return dbdivg_cuda(m, n, k, A, W, H, beta, uType, nIter);
1051 * \fn int sbdiv_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const float beta, const int uType, const int nIter)
1052 * \brief sbdiv_cuda is a wrapper that calls the appropriate function to perform NNMF using beta-divergence
1053 * using simple precision with GPUs
1054 * \param m: (input) Number of rows of matrix A and matrix W
1055 * \param n: (input) Number of columns of matrix A and matrix H
1056 * \param k: (input) Number of columns of matrix W and rows of H
1057 * \param A: (input) Simple precision input matrix of (m * n) number of elements stored using 1D column-major
1058 * \param W: (inout) Simple precision input/output matrix of (m * k) number of elements stored using 1D column-major
1059 * \param H: (inout) Simple precision input/output matrix of (k * n) number of elements stored using 1D column-major
1060 * \param beta: (input) Simple precision value. The parameter beta of betadivergence method
1061 * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only H is updated) or UpdateH (only W is updated)
1062 * \param nIter: (input) Number of iterations
1064 int sbdiv_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const float beta, const int uType, const int nIter)
/* Argument validation: beta must be non-negative and at least one iteration is required */
1066 if ((beta < 0.0f) || (nIter <= 0))
/* Equality tests below are written as (>= && <=), presumably to avoid float-equality compiler warnings */
/* beta == 2: dispatch to the specialized beta=2 routine (see smlsa_cuda) */
1069 if (beta>=2.0f && beta<=2.0f)
1070 return smlsa_cuda(m, n, k, A, W, H, uType, nIter);
/* beta == 1: dispatch to the specialized beta=1 routine (see sbdivone_cuda) */
1073 if (beta>=1.0f && beta<=1.0f)
1074 return sbdivone_cuda(m, n, k, A, W, H, uType, nIter);
/* Any other beta: general beta-divergence algorithm */
1076 return sbdivg_cuda(m, n, k, A, W, H, beta, uType, nIter);
1082 * \fn void dupdate1H_cuda(const int n, const double *X, double *H, cudaStream_t stream)
1083 * \brief It calls cuda kernel vdupdate1H_cuda that performs H=H(.*)B(./)C where matrices
1084 * B and C are stored in buffer X by columns. All B 1st and after all C
1085 * \param n: (input) Number of elements of H
1086 * \param X: (input) Double precision input buffer (1D column-major)
1087 * \param H: (inout) Double precision input/output matrix (1D column-major)
1088 * \param stream: (input) ID of the stream to use
1090 void dupdate1H_cuda(const int n, const double *X, double *H, cudaStream_t stream)
1092 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1094 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all n elements of H */
1100 dimGrid.x = (n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1101 vdupdate1H_cuda<<<dimGrid, dimBlock, 0 , stream>>>(n, X, H);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams, defeating stream overlap; presumably part of an error-checking build -- confirm it is intended in production */
1103 cudaDeviceSynchronize();
1109 * \fn void supdate1H_cuda(const int n, const float *X, float *H, cudaStream_t stream)
1110 * \brief It calls cuda kernel vsupdate1H_cuda that performs H=H(.*)B(./)C where matrices
1111 * B and C are stored in buffer X by columns. All B 1st and after all C
1112 * \param n: (input) Number of elements of H
1113 * \param X: (input) Simple precision input buffer (1D column-major)
1114 * \param H: (inout) Simple precision input/output matrix (1D column-major)
1115 * \param stream: (input) ID of the stream to use
1117 void supdate1H_cuda(const int n, const float *X, float *H, cudaStream_t stream)
1119 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1121 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all n elements of H */
1127 dimGrid.x = (n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1128 vsupdate1H_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, X, H);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams, defeating stream overlap; presumably part of an error-checking build -- confirm it is intended in production */
1130 cudaDeviceSynchronize();
1136 * \fn void dupdate1W_cuda(const int m, const int n, const double *X, double *W, cudaStream_t stream)
1137 * \brief It calls kernel vdupdate1W_cuda that performs W=W(.*)D(./)E where matrices
1138 * D and E are stored in buffer X according beta-divergence general case (see dbdivg_cuda(...))
1139 * \param m: (input) Number of rows of W
1140 * \param n: (input) Number of colums of W
1141 * \param X: (input) Double precision input buffer (1D column-major)
1142 * \param W: (inout) Double precision input/output matrix (1D column-major)
1143 * \param stream: (input) ID of the stream to use
1145 void dupdate1W_cuda(const int m, const int n, const double *X, double *W, cudaStream_t stream)
1147 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1149 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements of W */
1155 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1156 vdupdate1W_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, X, W);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1158 cudaDeviceSynchronize();
1164 * \fn void supdate1W_cuda(const int m, const int n, const float *X, float *W, cudaStream_t stream)
1165 * \brief It calls kernel vsupdate1W_cuda that performs W=W(.*)D(./)E where matrices
1166 * D and E are stored in buffer X according beta-divergence general case (see sbdivg_cuda(...))
1167 * \param m: (input) Number of rows of W
1168 * \param n: (input) Number of colums of W
1169 * \param X: (input) Simple precision input buffer (1D column-major)
1170 * \param W: (inout) Simple precision input/output matrix (1D column-major)
1171 * \param stream: (input) ID of the stream to use
1173 void supdate1W_cuda(const int m, const int n, const float *X, float *W, cudaStream_t stream)
1175 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1177 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements of W */
1183 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1184 vsupdate1W_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, X, W);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1186 cudaDeviceSynchronize();
1192 * \fn void dupdate2H_cuda(const int m, const int n, const double *x, const double *B, double *H, cudaStream_t stream)
1193 * \brief It calls kernel vdupdate2H_cuda that performs H(i)=H(i)*(B(i)/x(j))
1194 * \param m: (input) Number of rows of B and H, and number of elements of vector x
1195 * \param n: (input) Number of columns of B and A
1196 * \param x: (input) Double precision vector with the sum of W columns
1197 * \param B: (input) Double precision input matrix (1D column-major)
1198 * \param H: (inout) Double precision input/output matrix (1D column-major)
1199 * \param stream: (input) ID of the stream to use
1201 void dupdate2H_cuda(const int m, const int n, const double *x, const double *B, double *H, cudaStream_t stream)
1203 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1205 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements of H */
1211 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1212 vdupdate2H_cuda<<<dimGrid, dimBlock, 0 , stream>>>(m, n, x, B, H);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1214 cudaDeviceSynchronize();
1220 * \fn void supdate2H_cuda(const int m, const int n, const float *x, const float *B, float *H, cudaStream_t stream)
1221 * \brief It calls kernel vsupdate2H_cuda that performs H(i)=H(i)*(B(i)/x(j))
1222 * \param m: (input) Number of rows of B and H, and number of elements of vector x
1223 * \param n: (input) Number of columns of B and A
1224 * \param x: (input) Simple precision vector with the sum of W columns
1225 * \param B: (input) Simple precision input matrix (1D column-major)
1226 * \param H: (inout) Simple precision input/output matrix (1D column-major)
1227 * \param stream: (input) ID of the stream to use
1229 void supdate2H_cuda(const int m, const int n, const float *x, const float *B, float *H, cudaStream_t stream)
1231 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1233 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements of H */
1239 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1240 vsupdate2H_cuda<<<dimGrid, dimBlock, 0 , stream>>>(m, n, x, B, H);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1242 cudaDeviceSynchronize();
1248 * \fn void dupdate2W_cuda(const int m, const int n, const double *x, const double *B, double *W, cudaStream_t stream)
1249 * \brief It calls kernel vdupdate2W_cuda that performs W(i)=W(i)*(B(i)/x(j))
1250 * \param m: (input) Number of rows of W and B,
1251 * \param n: (input) Number of columns of W and B, and number of elements of vector x
1252 * \param x: (input) Double precision vector with the sum of H rows
1253 * \param B: (input) Double precision input matrix (1D column-major)
1254 * \param W: (inout) Double precision input/output matrix (1D column-major)
1255 * \param stream: (input) ID of the stream to use
1257 void dupdate2W_cuda(const int m, const int n, const double *x, const double *B, double *W, cudaStream_t stream)
1259 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1261 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements of W */
1267 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1268 vdupdate2W_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, x, B, W);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1270 cudaDeviceSynchronize();
1276 * \fn void supdate2W_cuda(const int m, const int n, const float *x, const float *B, float *W, cudaStream_t stream)
1277 * \brief It calls kernel vsupdate2W_cuda that performs W(i)=W(i)*(B(i)/x(j))
1278 * \param m: (input) Number of rows of W and B,
1279 * \param n: (input) Number of columns of W and B, and number of elements of vector x
1280 * \param x: (input) Simple precision vector with the sum of H rows
1281 * \param B: (input) Simple precision input matrix (1D column-major)
1282 * \param W: (inout) Simple precision input/output matrix (1D column-major)
1283 * \param stream: (input) ID of the stream to use
1285 void supdate2W_cuda(const int m, const int n, const float *x, const float *B, float *W, cudaStream_t stream)
1287 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1289 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements of W */
1295 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1296 vsupdate2W_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, x, B, W);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1298 cudaDeviceSynchronize();
1304 * \fn void dkernelH_cuda(const int m, const int n, const double *L, const double *A, double *R, const double expo, cudaStream_t stream)
1305 * \brief It calls kernel vdkernelH_cuda that performs R(i)=((L(i)^expo)*A(i))*L(i)
1306 * Note expo is a real number expo < 0 or expo > 0
1307 * \param m: (input) Number of rows of L, A and R matrices
1308 * \param n: (input) Number of columns of L, A and R matrices
1309 * \param L: (input) Double precision input matrix (1D column-major)
1310 * \param A: (input) Double precision input matrix (1D column-major)
1311 * \param R: (output) Double precision output matrix (1D column-major)
1312 * \param expo: (input) the "power of" for function pow(). It is a double precision value
1313 * \param stream: (input) ID of the stream to use
1315 void dkernelH_cuda(const int m, const int n, const double *L, const double *A, double *R, const double expo, cudaStream_t stream)
1317 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1319 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements */
1325 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1326 vdkernelH_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, L, A, R, expo);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1328 cudaDeviceSynchronize();
1334 * \fn void skernelH_cuda(const int m, const int n, const float *L, const float *A, float *R, const float expo, cudaStream_t stream)
1335 * \brief It calls kernel vskernelH_cuda that performs R(i)=((L(i)^expo)*A(i))*L(i)
1336 * Note expo is a real number expo < 0 or expo > 0
1337 * \param m: (input) Number of rows of L, A and R matrices
1338 * \param n: (input) Number of columns of L, A and R matrices
1339 * \param L: (input) Simple precision input matrix (1D column-major)
1340 * \param A: (input) Simple precision input matrix (1D column-major)
1341 * \param R: (output) Simple precision output matrix (1D column-major)
1342 * \param expo: (input) the "power of" for function pow(). It is a simple precision value
1343 * \param stream: (input) ID of the stream to use
1345 void skernelH_cuda(const int m, const int n, const float *L, const float *A, float *R, const float expo, cudaStream_t stream)
1347 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1349 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements */
1355 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1356 vskernelH_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, L, A, R, expo);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1358 cudaDeviceSynchronize();
1364 * \fn void dkernelW_cuda(const int m, const int n, const double *L, const double *A, double *R, const double expo, cudaStream_t stream)
1365 * \brief It calls kernel vdkernelW_cuda that performs R(i)=((L(i)^expo)*A(i))*L(i)
1366 * \param m: (input) Number of rows of L, A and R matrices
1367 * \param n: (input) Number of columns of L, A and R matrices
1368 * \param L: (input) Double precision input matrix (1D column-major)
1369 * \param A: (input) Double precision input matrix (1D column-major)
1370 * \param R: (output) Double precision output matrix (1D column-major)
1371 * \param expo: (input) the "power of" for function pow(). It is a double precision value
1372 * \param stream: (input) ID of the stream to use
1374 void dkernelW_cuda(const int m, const int n, const double *L, const double *A, double *R, const double expo, cudaStream_t stream)
1376 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1378 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements */
1384 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1385 vdkernelW_cuda<<<dimGrid, dimBlock, 0 ,stream>>>(m, n, L, A, R, expo);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1387 cudaDeviceSynchronize();
1393 * \fn void skernelW_cuda(const int m, const int n, const float *L, const float *A, float *R, const float expo, cudaStream_t stream)
1394 * \brief It calls kernel vskernelW_cuda that performs R(i)=((L(i)^expo)*A(i))*L(i)
1395 * \param m: (input) Number of rows of L, A and R matrices
1396 * \param n: (input) Number of columns of L, A and R matrices
1397 * \param L: (input) Simple precision input matrix (1D column-major)
1398 * \param A: (input) Simple precision input matrix (1D column-major)
1399 * \param R: (output) Simple precision output matrix (1D column-major)
1400 * \param expo: (input) the "power of" for function pow(). It is a simple precision value
1401 * \param stream: (input) ID of the stream to use
1403 void skernelW_cuda(const int m, const int n, const float *L, const float *A, float *R, const float expo, cudaStream_t stream)
1405 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1407 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements */
1413 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1414 vskernelW_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, L, A, R, expo);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1416 cudaDeviceSynchronize();
1423 * \fn __global__ void vdkernelH_cuda(const int m, const int n, const double* __restrict__ L, const double* __restrict__ A, double *R, const double expo)
1424 * \brief This kernel performs double precision R(i)=(L(i)^expo)*A[i] and R(i+m*n)=L[i]*(L(i)^expo)
1425 * Note expo is a real number expo < 0 or expo > 0
1426 * \param m: (input) Number of rows of L, A and R matrices
1427 * \param n: (input) Number of columns of L, A and R matrices
1428 * \param L: (input) Double precision input matrix (1D column-major)
1429 * \param A: (input) Double precision input matrix (1D column-major)
1430 * \param R: (output) Double precision output matrix (1D column-major)
1431 * \param expo: (input) the "power of" for function pow(). It is a double precision value
1433 __global__ void vdkernelH_cuda(const int m, const int n, const double* __restrict__ L, const double* __restrict__ A, double *R, const double expo)
/* Flat 1-D global thread index: one thread per element of L and A */
1436 pos =blockDim.x * blockIdx.x + threadIdx.x,
/* L(pos) == 0: write zeros to both halves of R, avoiding pow(0, expo) -> Inf when expo < 0 */
1447 /* if (dtmp1>=0.0 && dtmp1<=0.0) */
1448 R[pos] = R[pos+size] = 0.0;
/* dtmp2 = L(pos)^expo */
1451 dtmp2 = pow(dtmp1, expo);
1453 /* special-casing A[i]==0.0 gave no measurable improvement, so we do not test for it */
1454 R[pos] =dtmp2*A[pos];
/* Second half of R holds L(pos)^(expo+1) (i.e. L * L^expo) */
1455 R[pos+size]=dtmp1*dtmp2;
1461 * \fn __global__ void vskernelH_cuda(const int m, const int n, const float* __restrict__ L, const float* __restrict__ A, float *R, const float expo)
1462 * \brief This kernel computes simple precision R(i)=(L(i)^expo)*A[i] and R(i+m*n)=L[i]*(L(i)^expo)
1463 * Note expo is a real number expo < 0 or expo > 0
1464 * \param m: (input) Number of rows of L, A and R matrices
1465 * \param n: (input) Number of columns of L, A and R matrices
1466 * \param L: (input) Simple precision input matrix (1D column-major)
1467 * \param A: (input) Simple precision input matrix (1D column-major)
1468 * \param R: (output) Simple precision output matrix (1D column-major)
1469 * \param expo: (input) the "power of" for function pow(). It is a simple precision value
1471 __global__ void vskernelH_cuda(const int m, const int n, const float* __restrict__ L, const float* __restrict__ A, float *R, const float expo)
/* Flat 1-D global thread index: one thread per element of L and A */
1474 pos =blockDim.x * blockIdx.x + threadIdx.x,
/* L(pos) == 0: write zeros to both halves of R, avoiding powf(0, expo) -> Inf when expo < 0 */
1485 /* if (ftmp1>=0.0f && ftmp1<=0.0f) */
1486 R[pos] = R[pos+size] = 0.0f;
/* ftmp2 = L(pos)^expo */
1489 ftmp2 = powf(ftmp1, expo);
1491 /* special-casing A[i]==0.0 gave no measurable improvement, so we do not test for it */
1492 R[pos] =ftmp2*A[pos];
/* Second half of R holds L(pos)^(expo+1) (i.e. L * L^expo) */
1493 R[pos+size]=ftmp1*ftmp2;
1500 * \fn __global__ void vdkernelW_cuda(const int m, const int n, const double* __restrict__ L, const double* __restrict__ A, double *R, const double expo)
1501 * \brief This kernel computes double precision R(pos)=(L(i)^expo)*A(i) and R(pos+m)=L(i)*(L(i)^expo)
1502 * Note expo is a real number expo < 0 or expo > 0
1503 * \param m: (input) Number of rows of L, A and R matrices
1504 * \param n: (input) Number of columns of L, A and R matrices
1505 * \param L: (input) Double precision input matrix (1D column-major)
1506 * \param A: (input) Double precision input matrix (1D column-major)
1507 * \param R: (output) Double precision output matrix (1D column-major)
1508 * \param expo: (input) the "power of" for function pow(). It is a double precision value
1510 __global__ void vdkernelW_cuda(const int m, const int n, const double* __restrict__ L, const double* __restrict__ A, double *R, const double expo)
/* Flat 1-D global thread index: one thread per element of L and A */
1513 pos=blockDim.x * blockIdx.x + threadIdx.x,
/* Interleaved layout: column j of the input maps to column 2j of R, so the two m-row results for each column sit side by side */
1521 newpos = 2*m*(pos/m)+(pos%m);
1525 /* if (dtmp1>=0.0 && dtmp1<=0.0) */
/* NOTE(review): this zero branch writes R[pos]/R[pos+m] while the nonzero branch below writes R[newpos]/R[newpos+m]; for any column j > 0 these address different elements -- looks like it should be R[newpos] = R[newpos+m] = 0.0. Confirm against vdkernelH_cuda. */
1526 R[pos] = R[pos+m] = 0.0;
/* dtmp2 = L(pos)^expo; zero case handled above to avoid pow(0, expo) -> Inf when expo < 0 */
1529 dtmp2 = pow(dtmp1, expo);
1531 R[newpos] = dtmp2*A[pos];
/* Second result: L(pos)^(expo+1) (i.e. L * L^expo) */
1532 R[newpos+m]= dtmp1*dtmp2;
1539 * \fn __global__ void vskernelW_cuda(const int m, const int n, const float* __restrict__ L, const float* __restrict__ A, float *R, const float expo)
1540 * \brief This kernel computes simple precision R(pos)=(L(i)^expo)*A(i) and R(pos+m)=L(i)*(L(i)^expo)
1541 * Note expo is a real number expo < 0 or expo > 0
1542 * \param m: (input) Number of rows of L, A and R matrices
1543 * \param n: (input) Number of columns of L, A and R matrices
1544 * \param L: (input) Simple precision input matrix (1D column-major)
1545 * \param A: (input) Simple precision input matrix (1D column-major)
1546 * \param R: (output) Simple precision output matrix (1D column-major)
1547 * \param expo: (input) the "power of" for function pow(). It is a simple precision value
1549 __global__ void vskernelW_cuda(const int m, const int n, const float* __restrict__ L, const float* __restrict__ A, float *R, const float expo)
/* Flat 1-D global thread index: one thread per element of L and A */
1552 pos=blockDim.x * blockIdx.x + threadIdx.x,
/* Interleaved layout: column j of the input maps to column 2j of R, so the two m-row results for each column sit side by side */
1560 newpos = 2*m*(pos/m)+(pos%m);
1564 /* if (ftmp1>=0.0f && ftmp1<=0.0f) */
/* NOTE(review): this zero branch writes R[pos]/R[pos+m] while the nonzero branch below writes R[newpos]/R[newpos+m]; for any column j > 0 these address different elements -- looks like it should be R[newpos] = R[newpos+m] = 0.0f. Confirm against vskernelH_cuda. */
1565 R[pos] = R[pos+m] = 0.0f;
/* ftmp2 = L(pos)^expo; zero case handled above to avoid powf(0, expo) -> Inf when expo < 0 */
1568 ftmp2 = powf(ftmp1, expo);
1570 R[newpos] = ftmp2*A[pos];
/* Second result: L(pos)^(expo+1) (i.e. L * L^expo) */
1571 R[newpos+m]= ftmp1*ftmp2;
1578 * \fn __global__ void vdupdate1H_cuda(const int n, const double* __restrict__ X, double *H)
1579 * \brief This kernel computes double precision H(i)=H(i)*B(i)/C(i) where matrices
1580 * B and C are stored in the same buffer (called X). All B 1st and after C
1581 * \param n: (input) Number of elements of H
1582 * \param X: (input) Double precision input matrix (1D column-major)
1583 * \param H: (inout) Double precision input/output matrix (1D column-major)
1585 __global__ void vdupdate1H_cuda(const int n, const double* __restrict__ X, double *H)
1586 /* Using "const double* __restrict__ A" and then "...=A[i]" is */
1587 /* equivalent to "double *A" plus "...=__ldg(&A[i])", but the __ldg() */
1588 /* intrinsic is only available on compute capability 3.5 (or newer). */
1589 /* Either way the loads go through the read-only cache. */
/* Flat 1-D global thread index: one thread per element of H */
1591 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
1595 /* Here we can have NaN and Inf if X(pos+n) and/or H(pos) and/or X(pos)=0 */
/* H = H .* B ./ C, with B in X[0..n) and C in X[n..2n) */
1596 H[pos] = H[pos] * X[pos] / X[pos+n];
/* Checked build: trap NaN/Inf right after the update */
1597 assert(!fpe(H[pos]));
/* Same update without the check (the two copies are presumably selected by conditional compilation -- confirm) */
1599 H[pos] = H[pos] * X[pos] / X[pos+n];
1605 * \fn __global__ void vsupdate1H_cuda(const int n, const float* __restrict__ X, float *H)
1606 * \brief This kernel computes simple precision H(i)=H(i)*B(i)/C(i) where matrices
1607 * B and C are stored in the same buffer (called X). All B 1st and after C
1608 * \param n: (input) Number of elements of H
1609 * \param X: (input) Simple precision input matrix (1D column-major)
1610 * \param H: (inout) Simple precision input/output matrix (1D column-major)
1612 __global__ void vsupdate1H_cuda(const int n, const float* __restrict__ X, float *H)
/* Flat 1-D global thread index: one thread per element of H */
1614 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
1618 /* Here we can have NaN and Inf if X(pos+n) and/or H(pos) and/or X(pos)=0 */
/* H = H .* B ./ C, with B in X[0..n) and C in X[n..2n) */
1619 H[pos] = H[pos] * X[pos] / X[pos+n];
/* Checked build: trap NaN/Inf right after the update */
1620 assert(!fpe(H[pos]));
/* Same update without the check (the two copies are presumably selected by conditional compilation -- confirm) */
1622 H[pos] = H[pos] * X[pos] / X[pos+n];
1628 * \fn __global__ void vdupdate1W_cuda(const int m, const int n, const double* __restrict__ X, double *W)
1629 * \brief This kernel computes double precision W[i]=W[i]*D[i]/E[i] where matrices D and E are
1630 * stored in the same buffer (called X) according beta-divergence general case (see dbdivg_cuda(...))
1631 * \param m: (input) Number of rows of W
1632 * \param n: (input) Number of columns of W
1633 * \param X: (input) Double precision input matrix (1D column-major)
1634 * \param W: (inout) Double precision input/output matrix (1D column-major)
1636 __global__ void vdupdate1W_cuda(const int m, const int n, const double* __restrict__ X, double *W)
/* Flat 1-D global thread index: one thread per element of W */
1639 pos=blockDim.x * blockIdx.x + threadIdx.x,
/* Map W's column j to column 2j of the interleaved buffer X (D and E columns alternate) */
1644 newpos = pos+(pos/m)*m;
/* W = W .* D ./ E; NaN/Inf possible if X(newpos+m), W(pos) and/or X(newpos) = 0 */
1646 W[pos] = W[pos] * X[newpos] / X[newpos+m];
/* Checked build: trap NaN/Inf right after the update */
1647 assert(!fpe(W[pos]));
/* Same update without the check (the two copies are presumably selected by conditional compilation -- confirm) */
1649 W[pos] = W[pos] * X[newpos] / X[newpos+m];
1656 * \fn __global__ void vsupdate1W_cuda(const int m, const int n, const float* __restrict__ X, float *W)
1657 * \brief This kernel computes simple precision W[i]=W[i]*D[i]/E[i] where matrices D and E are
1658 * stored in the same buffer (called X) according beta-divergence general case (see dbdivg_cuda(...))
1659 * \param m: (input) Number of rows of W
1660 * \param n: (input) Number of columns of W
1661 * \param X: (input) Simple precision input matrix (1D column-major)
1662 * \param W: (inout) Simple precision input/output matrix (1D column-major)
1664 __global__ void vsupdate1W_cuda(const int m, const int n, const float* __restrict__ X, float *W)
/* Flat 1-D global thread index: one thread per element of W */
1667 pos=blockDim.x * blockIdx.x + threadIdx.x,
/* Map W's column j to column 2j of the interleaved buffer X (D and E columns alternate) */
1672 newpos = pos+(pos/m)*m;
1674 /* Here we can have NaN and Inf if X(newpos+m) and/or W(pos) and/or X(newpos)=0 */
1675 W[pos] = W[pos] * X[newpos] / X[newpos+m];
/* Checked build: trap NaN/Inf right after the update */
1676 assert(!fpe(W[pos]));
/* Same update without the check (the two copies are presumably selected by conditional compilation -- confirm) */
1678 W[pos] = W[pos] * X[newpos] / X[newpos+m];
1685 * \fn __global__ void vdupdate2H_cuda(const int m, const int n, const double* __restrict__ y, const double* __restrict__ B, double *H)
1686 * \brief This kernel computes double precision H(i)=H(i)*(B(i)/y(j))
1687 * \param m: (input) Number of rows of B and H, and number of elements of vector y
1688 * \param n: (input) Number of columns of B and A
1689 * \param y: (input) Double precision vector with the sum of W columns
1690 * \param B: (input) Double precision input matrix (1D column-major)
1691 * \param H: (inout) Double precision input/output matrix (1D column-major)
1693 __global__ void vdupdate2H_cuda(const int m, const int n, const double* __restrict__ y, const double* __restrict__ B, double *H)
/* Flat 1-D global thread index: one thread per element of H */
1695 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
1699 /* Here we can have NaN and Inf if y(pos%m) and/or H(pos) and/or B(pos)=0 */
/* pos%m is the row index: every element of row i is divided by y(i) */
1700 H[pos] = H[pos] * (B[pos] / y[pos%m]);
/* Checked build: trap NaN/Inf right after the update */
1701 assert(!fpe(H[pos]));
/* Same update without the check (the two copies are presumably selected by conditional compilation -- confirm) */
1703 H[pos] = H[pos] * (B[pos] / y[pos%m]);
1709 * \fn __global__ void vsupdate2H_cuda(const int m, const int n, const float* __restrict__ y, const float* __restrict__ B, float *H)
1710 * \brief This kernel performs the simple H(i)=H(i)*(B(i)/y(j))
1711 * \param m: (input) Number of rows of B and H, and number of elements of vector y
1712 * \param n: (input) Number of columns of B and A
1713 * \param y: (input) Simple precision vector with the sum of W columns
1714 * \param B: (input) Simple precision input matrix (1D column-major)
1715 * \param H: (inout) Simple precision input/output matrix (1D column-major)
1717 __global__ void vsupdate2H_cuda(const int m, const int n, const float* __restrict__ y, const float* __restrict__ B, float *H)
/* Flat 1-D global thread index: one thread per element of H */
1719 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
1723 /* Here we can have NaN and Inf if y(pos%m) and/or H(pos) and/or B(pos)=0 */
/* pos%m is the row index: every element of row i is divided by y(i) */
1724 H[pos] = H[pos] * (B[pos] / y[pos%m]);
/* Checked build: trap NaN/Inf right after the update */
1725 assert(!fpe(H[pos]));
/* Same update without the check (the two copies are presumably selected by conditional compilation -- confirm) */
1727 H[pos] = H[pos] * (B[pos] / y[pos%m]);
1733 * \fn __global__ void vdupdate2W_cuda(const int m, const int n, const double* __restrict__ y, const double* __restrict__ B, double *W)
1734 * \brief This kernel performs double precision W(i)=W(i)*(B(i)/y(j))
1735 * \param m: (input) Number of rows of W and B,
1736 * \param n: (input) Number of columns of W and B, and number of elements of vector y
1737 * \param y: (input) Double precision vector with the sum of H rows
1738 * \param B: (input) Double precision input matrix (1D column-major)
1739 * \param W: (inout) Double precision input/output matrix (1D column-major)
1741 __global__ void vdupdate2W_cuda(const int m, const int n, const double* __restrict__ y, const double* __restrict__ B, double *W)
/* Flat 1-D global thread index: one thread per element of W */
1743 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
1747 /* Here we can have NaN and Inf if y(pos/m) and/or W(pos) and/or B(pos)=0 */
/* pos/m is the column index: every element of column j is divided by y(j) */
1748 W[pos] = W[pos] * (B[pos] / y[pos/m]);
/* Checked build: trap NaN/Inf right after the update */
1749 assert(!fpe(W[pos]));
/* Same update without the check (the two copies are presumably selected by conditional compilation -- confirm) */
1751 W[pos] = W[pos] * (B[pos] / y[pos/m]);
1757 * \fn __global__ void vsupdate2W_cuda(const int m, const int n, const float* __restrict__ y, const float* __restrict__ B, float *W)
1758 * \brief This kernel computes simple precision W(i)=W(i)*(B(i)/y(j))
1759 * \param m: (input) Number of rows of W and B,
1760 * \param n: (input) Number of columns of W and B, and number of elements of vector y
1761 * \param y: (input) Simple precision vector with the sum of H rows
1762 * \param B: (input) Simple precision input matrix (1D column-major)
1763 * \param W: (inout) Simple precision input/output matrix (1D column-major)
1765 __global__ void vsupdate2W_cuda(const int m, const int n, const float* __restrict__ y, const float* __restrict__ B, float *W)
1767 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
1771 /* Here we can have NaN and Inf if y(pos/m) and/or W(pos) and/or B(pos)=0 */
1772 W[pos] = W[pos] * (B[pos] / y[pos/m]);
1773 assert(!fpe(W[pos]));
1775 W[pos] = W[pos] * (B[pos] / y[pos/m]);