NnmfPack  2.1
bdiv_cuda.cu
Go to the documentation of this file.
1 /***************************************************************************
2  * Copyright (C) 2014 by PIR (University of Oviedo) and *
3  * INCO2 (Polytechnic University of Valencia) groups. *
4  * nnmfpack@gmail.com *
5  * *
6  * This program is free software; you can redistribute it and/or modify *
7  * it under the terms of the GNU General Public License as published by *
8  * the Free Software Foundation; either version 2 of the License, or *
9  * (at your option) any later version. *
10  * *
11  * This program is distributed in the hope that it will be useful, *
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14  * GNU General Public License for more details. *
15  * *
16  * You should have received a copy of the GNU General Public License *
17  * along with this program; if not, write to the *
18  * Free Software Foundation, Inc., *
19  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
20  ***************************************************************************
21 */
22 /**
23  * \file bdiv_cuda.cu
24  * \brief File with functions to calculate NNMF using beta-divergence methods for GPUs
25  * \author Information Retrieval and Parallel Computing Group (IRPCG)
26  * \author University of Oviedo, Spain
27  * \author Interdisciplinary Computation and Communication Group (INCO2)
28  * \author Universitat Politecnica de Valencia, Spain.
29  * \author Contact: nnmfpack@gmail.com
30  * \date 04/11/14
31 */
32 #include "bdiv_cuda.h"
33 
34 
35 /**
36  * \fn int dbdivg_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const double expo, const int uType, const int nIter)
37  * \brief dbdivg_cuda performs the NNMF using beta-divergence when beta is != 1 and !=2, using double precision.
38  *
39  * The algorithm is<BR>
40  * &nbsp;&nbsp;repeat nIter times<BR>
41  * &nbsp;&nbsp;&nbsp;&nbsp;STEP 1<BR>
42  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;L=W*H<BR>
43  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;L1=L(.^)(beta-2)<BR>
44  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;L2=L1(.*)A<BR>
45  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;L1=L1(.*)L<BR>
46  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;B=W'*L2<BR>
47  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;C=W'*L1<BR>
48  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;H=H(.*)B(./)C<BR>
49  *
50  * &nbsp;&nbsp;&nbsp;&nbsp;STEP 2<BR>
51  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;L=W*H<BR>
52  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;L1=L(.^)(beta-2)<BR>
53  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;L2=L1(.*)A<BR>
54  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;L1=L1(.*)L<BR>
55  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;D=L2*H'<BR>
56  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;E=L1*H'<BR>
57  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;W=W(.*)D(./)E<BR>
58  * &nbsp;&nbsp;end repeat<BR>
59  * End algorithm<BR>
60  *
61  * In real life L1 and L2 are (m*n) matrices used in STEP 1 and 2. For performance
62  * reasons only one 1D column-major buffer, named R in the code below, of size 2*m*n is used.
63  * In STEP 1, the first part of R (m*n positions) is L2 and the second part is L1.
64  * In STEP 2, the first column of R (2*m positions) is composed of the first column of L2
65  * followed by the first column of L1. The second column of R (2*m positions) is composed of
66  * the second column of L2 followed by the second column of L1. The 3rd column of R ... and so on
67  *
68  * In real live B and C are (k*n) matrices used in STEP 1, and D and E are (m*k)
69  * matrices used in STEP 2. B/C and D/E are independent. However we do not have L1
70  * and L2, we have R, and we can do B=W'*L2 and C=W'*L1 (or D=L2*H' and E=L1*H') at
71  * the same time. For this reason only one matrix is declared to save space. This is
72  * matrix M with size 2*max(m,n)*k
73  *
74  * \param m: (input) Number of rows of matrix A and matrix W
75  * \param n: (input) Number of columns of matrix A and matrix H
76  * \param k: (input) Number of columns of matrix W and rows of H
77  * \param A: (input) Double precision input matrix of (m * n) number of elements stored using 1D column-major
78  * \param W: (inout) Double precision input/output matrix of (m * k) number of elements stored using 1D column-major
79  * \param H: (inout) Double precision input/output matrix of (k * n) number of elements stored using 1D column-major
80  * \param expo: (input) Double precision value. The exponent beta of betadivergence method
81  * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only W is updated) or UpdateH (only H is updated)
82  * \param nIter: (input) Number of iterations
83  *
84  * It returns 0 if all is OK.
85 */
int dbdivg_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const double expo, const int uType, const int nIter)
{
   double
      *M=NULL,   /* workspace holding B|C (phase 1) or D|E (phase 2)              */
      *L=NULL,   /* L = W*H                                                       */
      *R=NULL,   /* packed L2/L1 buffer (layout described in the comment above)   */
      alpha=1.0,
      beta =0.0;

   int
      i, devID, dimM;

   cublasHandle_t
      handle;

   /* Validate uType and choose the workspace size BEFORE allocating device     */
   /* memory, so an unknown update type cannot leak allocations.                */
   /* M is (k x 2n) in phase 1 and (2m x k) in phase 2, hence 2*dimM*k doubles. */
   switch (uType)
   {
      case UpdateAll: dimM=max(m, n); break; /* both phases run each iteration  */
      case UpdateW:   dimM=m;         break; /* only phase 2 runs (W updated)   */
      case UpdateH:   dimM=n;         break; /* only phase 1 runs (H updated)   */
      default:        return -1;
   }

   /* Device-presence check; devID itself is not used afterwards */
   CUDAERR(cudaGetDevice(&devID));

   CUDAERR(cudaMalloc((void **)&M, 2*dimM * k * sizeof(double)));
   CUDAERR(cudaMalloc((void **)&L, m * n * sizeof(double)));
   CUDAERR(cudaMalloc((void **)&R, 2*m * n * sizeof(double)));

   CUBLASERR(cublasCreate(&handle));

   for(i=0;i<nIter;i++)
   {
      /* ************************ Phase 1: update H ********************** */
      if (uType != UpdateW)
      {
         /* L=W*H */
         CUBLASERR(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));

         /* L1=L(.^)(expo-2); L2=L1(.*)A; L1=L1(.*)L; packed as R=|L2 | L1| */
         dkernelH_cuda(m, n, L, A, R, expo-2.0, 0);

         /* B=W'*L2 and C=W'*L1 computed as one product: M=W'*|L2 | L1| */
         CUBLASERR(cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, k, 2*n, m, &alpha, W, m, R, m, &beta, M, k));

         /* H=H(.*)B(./)C. Note that matrices B and C are stored together in M */
         dupdate1H_cuda(k*n, M, H, 0);
      }

      /* ************************ Phase 2: update W ********************** */
      if (uType != UpdateH)
      {
         /* L=W*H */
         CUBLASERR(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));

         /* L1=L(.^)(expo-2); L2=L1(.*)A; L1=L1(.*)L; packed as R=|L2 over L1| per column */
         dkernelW_cuda(m, n, L, A, R, expo-2.0, 0);

         /* D=L2*H' and E=L1*H' computed as one product: M=|L2 over L1|*H' */
         CUBLASERR(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, 2*m, k, n, &alpha, R, 2*m, H, k, &beta, M, 2*m));

         /* W=W(.*)D(./)E. Note that matrices D and E are stored together in M */
         dupdate1W_cuda(m, k, M, W, 0);
      }
   }

   CUBLASERR(cublasDestroy(handle));

   CUDAERR(cudaFree(M));
   CUDAERR(cudaFree(L));
   CUDAERR(cudaFree(R));

   return 0;
}
231 
232 /**
233  * \fn int sbdivg_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const float expo, const int uType, const int nIter)
234  * \brief sbdivg_cuda performs NNMF using beta-divergence when beta is != 1 and !=2, using simple precision
235  * See description of dbdivg_cuda for more info
236  * \param m: (input) Number of rows of matrix A and matrix W
237  * \param n: (input) Number of columns of matrix A and matrix H
238  * \param k: (input) Number of columns of matrix W and rows of H
239  * \param A: (input) Simple precision input matrix of (m * n) number of elements stored using 1D column-major
240  * \param W: (inout) Simple precision input/output matrix of (m * k) number of elements stored using 1D column-major
241  * \param H: (inout) Simple precision input/output matrix of (k * n) number of elements stored using 1D column-major
242  * \param expo: (input) Simple precision value. The exponent beta of betadivergence method
243  * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only W is updated) or UpdateH (only H is updated)
244  * \param nIter: (input) Number of iterations
245  *
246  * It returns 0 if all is OK.
247 */
int sbdivg_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const float expo, const int uType, const int nIter)
{
   float
      *M=NULL,   /* workspace holding B|C (phase 1) or D|E (phase 2)              */
      *L=NULL,   /* L = W*H                                                       */
      *R=NULL,   /* packed L2/L1 buffer (see dbdivg_cuda for the layout)          */
      alpha=1.0f,
      beta =0.0f;

   int
      i, devID, dimM;

   cublasHandle_t
      handle;

   /* Validate uType and choose the workspace size BEFORE allocating device    */
   /* memory, so an unknown update type cannot leak allocations.               */
   /* M is (k x 2n) in phase 1 and (2m x k) in phase 2, hence 2*dimM*k floats. */
   switch (uType)
   {
      case UpdateAll: dimM=max(m, n); break; /* both phases run each iteration  */
      case UpdateW:   dimM=m;         break; /* only phase 2 runs (W updated)   */
      case UpdateH:   dimM=n;         break; /* only phase 1 runs (H updated)   */
      default:        return -1;
   }

   /* Device-presence check; devID itself is not used afterwards */
   CUDAERR(cudaGetDevice(&devID));

   CUDAERR(cudaMalloc((void **)&M, 2*dimM * k * sizeof(float)));
   CUDAERR(cudaMalloc((void **)&L, m * n * sizeof(float)));
   CUDAERR(cudaMalloc((void **)&R, 2*m * n * sizeof(float)));

   CUBLASERR(cublasCreate(&handle));

   for(i=0;i<nIter;i++)
   {
      /* ************************ Phase 1: update H ********************** */
      if (uType != UpdateW)
      {
         /* L=W*H */
         CUBLASERR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));

         /* L1=L(.^)(expo-2); L2=L1(.*)A; L1=L1(.*)L; packed as R=|L2 | L1| */
         skernelH_cuda(m, n, L, A, R, expo-2.0f, 0);

         /* B=W'*L2 and C=W'*L1 computed as one product: M=W'*|L2 | L1| */
         CUBLASERR(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, k, 2*n, m, &alpha, W, m, R, m, &beta, M, k));

         /* H=H(.*)B(./)C. Note that matrices B and C are stored together in M */
         supdate1H_cuda(k*n, M, H, 0);
      }

      /* ************************ Phase 2: update W ********************** */
      if (uType != UpdateH)
      {
         /* L=W*H */
         CUBLASERR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));

         /* L1=L(.^)(expo-2); L2=L1(.*)A; L1=L1(.*)L; packed as R=|L2 over L1| per column */
         skernelW_cuda(m, n, L, A, R, expo-2.0f, 0);

         /* D=L2*H' and E=L1*H' computed as one product: M=|L2 over L1|*H' */
         CUBLASERR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, 2*m, k, n, &alpha, R, 2*m, H, k, &beta, M, 2*m));

         /* W=W(.*)D(./)E. Note that matrices D and E are stored together in M */
         supdate1W_cuda(m, k, M, W, 0);
      }
   }

   CUBLASERR(cublasDestroy(handle));

   CUDAERR(cudaFree(M));
   CUDAERR(cudaFree(L));
   CUDAERR(cudaFree(R));

   return 0;
}
393 
394 
395 /**
396  * \fn int dbdivone_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
397  * \brief dbdivone_cuda performs NNMF using beta-divergence when beta=1, using double precision.
398  *
399  * The algorithm is<BR>
400  * &nbsp;&nbsp;repeat nIter times<BR>
401  * &nbsp;&nbsp;&nbsp;&nbsp;STEP 1<BR>
402  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;y(i)=sum(W(j,i) for all j in range) for all i in range<BR>
403  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;L=W*H<BR>
404  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;L=A(./)L<BR>
405  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;B=W'*L<BR>
406  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;B(i,j)=B(i,j) / y(i) for all B elements<BR>
407  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;H=H(.*)B<BR>
408  *
409  * &nbsp;&nbsp;&nbsp;&nbsp;STEP 2<BR>
410  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;y(i)=sum(H(i,j) for all j in range) for all i in range<BR>
411  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;L=W*H<BR>
412  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;L=A(./)L <BR>
413  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;D=L*H'<BR>
414  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;D(i,j)=D(i,j) / y(j) for all D elements<BR>
415  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;W=W(.*)D<BR>
416  * &nbsp;&nbsp;&nbsp;&nbsp;end repeat<BR>
417  * End algorithm<BR>
418  *
419  * In real life B is a (k*n) matrix used in STEP 1, and D is a (m*k)
420  * matrix used in STEP 2. B and D are independent. For this reason only 1 matrix
421  * of size max(m,n)*k is declared/used
422  *
423  * \param m: (input) Number of rows of matrix A and matrix W
424  * \param n: (input) Number of columns of matrix A and matrix H
425  * \param k: (input) Number of columns of matrix W and rows of H
426  * \param A: (input) Double precision input matrix of (m * n) number of elements stored using 1D column-major
427  * \param W: (inout) Double precision input/output matrix of (m * k) number of elements stored using 1D column-major
428  * \param H: (inout) Double precision input/output matrix of (k * n) number of elements stored using 1D column-major
429  * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only W is updated) or UpdateH (only H is updated)
430  * \param nIter: (input) Number of iterations
431  *
432  * It returns 0 if all is OK.
433 */
int dbdivone_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
{
   double
      *B=NULL,   /* workspace: B (k x n) in phase 1, D (m x k) in phase 2        */
      *L=NULL,   /* L = W*H, then overwritten with A(./)L                        */
      *x=NULL,   /* vector of ones used to compute row/column sums via gemv      */
      *y=NULL,   /* column sums of W (phase 1) or row sums of H (phase 2)        */
      alpha=1.0,
      beta =0.0;

   int
      i, devID, dimB, dimX;

   cublasHandle_t
      handle1,
      handle2;

   cudaStream_t
      stream1,
      stream2;

   cudaEvent_t
      event;

   /* Validate uType and choose buffer sizes BEFORE allocating device memory,  */
   /* so an unknown update type cannot leak allocations.                       */
   /* Phase 1 needs x of length m (gemv with W'), phase 2 needs length n       */
   /* (gemv with H); UpdateAll needs both, so max(m,n).                        */
   switch (uType)
   {
      case UpdateAll: dimB=max(m,n); dimX=max(m,n); break;
      case UpdateW:   dimB=m;        dimX=n;        break; /* only phase 2 */
      case UpdateH:   dimB=n;        dimX=m;        break; /* only phase 1 */
      default:        return -1;
   }

   /* Device-presence check; devID itself is not used afterwards */
   CUDAERR(cudaGetDevice(&devID));

   CUDAERR(cudaMalloc((void **)&B, dimB*k*sizeof(double)));
   CUDAERR(cudaMalloc((void **)&L, m*n*sizeof(double)));
   CUDAERR(cudaMalloc((void **)&x, dimX*sizeof(double)));
   CUDAERR(cudaMalloc((void **)&y, k*sizeof(double)));

   CUDAERR(cudaStreamCreate(&stream1));
   CUDAERR(cudaStreamCreate(&stream2));

   CUBLASERR(cublasCreate(&handle1));
   CUBLASERR(cublasCreate(&handle2));

   CUDAERR(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));

   CUBLASERR(cublasSetStream(handle1, stream1));
   CUBLASERR(cublasSetStream(handle2, stream2));

   /* Why streams? For medium sizes (500 <= n=m <= 1000) overlapping the gemv */
   /* on stream2 with the gemm pipeline on stream1 is faster; for small sizes */
   /* running without streams is faster, and for large sizes it is roughly    */
   /* equal. This layout minimizes the number of cudaStreamWaitEvent calls.   */

   /* Stream1 starts filling x with ones (x[i]=1.0 for all i).                */
   /* BUG FIX: fill only the dimX elements actually allocated. The previous   */
   /* code filled max(m,n) elements even in the UpdateW/UpdateH cases where x */
   /* holds only n (resp. m) elements, overflowing the buffer when the other  */
   /* dimension was larger.                                                   */
   dmemset_cuda(dimX, x, 1.0, stream1);

   /* Stream1 records "event" so stream2's first wait succeeds */
   CUDAERR(cudaEventRecord(event, stream1));

   for(i=0;i<nIter;i++)
   {
      /* ************************ Phase 1: update H ********************** */
      if (uType != UpdateW)
      {
         /* Stream2 waits for "event" set by stream1 (1st time is always ok) */
         CUDAERR(cudaStreamWaitEvent(stream2, event, 0));

         /* y = column sums of W, computed on stream2 as y = W'*x */
         CUBLASERR(cublasDgemv(handle2, CUBLAS_OP_T, m, k, &alpha, W, m, x, 1, &beta, y, 1));

         /* Stream2 should record an event for stream1 here, but in practice  */
         /* the single gemv on stream2 always finishes before stream1 needs y */

         /* L=W*H */
         CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));

         /* L=A(./)L */
         ddiv_cuda(m*n, A, L, stream1);

         /* B=W'*L */
         CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, L, m, &beta, B, k));

         /* B(i,j)=B(i,j)/y(i) for all B elements, then H=H(.*)B */
         dupdate2H_cuda(k, n, y, B, H, stream1);

         /* Stream1 records "event" for stream2 */
         CUDAERR(cudaEventRecord(event, stream1));
      }

      /* ************************ Phase 2: update W ********************** */
      if (uType != UpdateH)
      {
         /* Stream2 waits for "event" set by stream1 */
         CUDAERR(cudaStreamWaitEvent(stream2, event, 0));

         /* y = row sums of H, computed on stream2 as y = H*x */
         CUBLASERR(cublasDgemv(handle2, CUBLAS_OP_N, k, n, &alpha, H, k, x, 1, &beta, y, 1));

         /* L=W*H */
         CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));

         /* L=A(./)L */
         ddiv_cuda(m*n, A, L, stream1);

         /* D=L*H' (stored in workspace B) */
         CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, L, m, H, k, &beta, B, m));

         /* D(i,j)=D(i,j)/y(j) for all D elements, then W=W(.*)D */
         dupdate2W_cuda(m, k, y, B, W, stream1);

         /* Stream1 records "event" for stream2 */
         CUDAERR(cudaEventRecord(event, stream1));
      }
   }
   /* Maybe not needed, but for completeness make stream2 observe the end */
   CUDAERR(cudaStreamWaitEvent(stream2, event, 0));

   CUBLASERR(cublasDestroy(handle1));
   CUBLASERR(cublasDestroy(handle2));

   CUDAERR(cudaStreamDestroy(stream1));
   CUDAERR(cudaStreamDestroy(stream2));

   CUDAERR(cudaEventDestroy(event));

   CUDAERR(cudaFree(B));
   CUDAERR(cudaFree(L));
   CUDAERR(cudaFree(x));
   CUDAERR(cudaFree(y));

   return 0;
}
722 
723 
724 /**
725  * \fn int sbdivone_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
726  * \brief sbdivone_cuda performs NNMF using beta-divergence when beta=1, using simple precision
727  * See description of dbdivone_cuda for more info
728  * \param m: (input) Number of rows of matrix A and matrix W
729  * \param n: (input) Number of columns of matrix A and matrix H
730  * \param k: (input) Number of columns of matrix W and rows of H
731  * \param A: (input) Simple precision input matrix of (m * n) number of elements stored using 1D column-major
732  * \param W: (inout) Simple precision input/output matrix of (m * k) number of elements stored using 1D column-major
733  * \param H: (inout) Simple precision input/output matrix of (k * n) number of elements stored using 1D column-major
734  * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only W is updated) or UpdateH (only H is updated)
735  * \param nIter: (input) Number of iterations
736  *
737  * It returns 0 if all is OK.
738 */
739 int sbdivone_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
740 {
741  float
742  *B=NULL,
743  *L=NULL,
744  *x=NULL,
745  *y=NULL,
746  alpha=1.0f,
747  beta =0.0f;
748 
749  int
750  i, devID;
751 
752  cublasHandle_t
753  handle1,
754  handle2;
755 
756  cudaStream_t
757  stream1,
758  stream2;
759 
760  cudaEvent_t
761  event;
762 
763  CUDAERR(cudaGetDevice(&devID));
764 
765  switch (uType)
766  {
767  case UpdateAll:
768  CUDAERR(cudaMalloc((void **)&B, k*max(m,n)*sizeof(float)));
769  CUDAERR(cudaMalloc((void **)&L, m*n*sizeof(float)));
770 
771  /* This two vectors (x and y) are used both in Step 1 and Step 2. With */
772  /* the strategy used with matrices B and D the size of x is max(m,n) */
773  CUDAERR(cudaMalloc((void **)&x, max(m,n)*sizeof(float)));
774  CUDAERR(cudaMalloc((void **)&y, k*sizeof(float)));
775 
776  CUDAERR(cudaStreamCreate(&stream1));
777  CUDAERR(cudaStreamCreate(&stream2));
778 
779  CUBLASERR(cublasCreate(&handle1));
780  CUBLASERR(cublasCreate(&handle2));
781 
782  CUDAERR(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
783 
784  CUBLASERR(cublasSetStream(handle1, stream1));
785  CUBLASERR(cublasSetStream(handle2, stream2));
786 
787  /* Stream 1 stars filling x with ones (x[i]=1.0 for all i) */
788  smemset_cuda(max(m,n), x, 1.0f, stream1);
789 
790  /* Stream1 recording event "event" for stream2 (for the 1st time) */
791  CUDAERR(cudaEventRecord(event, stream1));
792 
793  for(i=0;i<nIter;i++)
794  {
795  /* ************************ Phase 1 *************************** */
796  /* Stream2 waiting "event" set by stream1. 1st time is always ok */
797  CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
798 
799  /* Calculate the sums of all W columns via dgemv(W, x) */
800  CUBLASERR(cublasSgemv(handle2, CUBLAS_OP_T, m, k, &alpha, W, m, x, 1, &beta, y, 1));
801 
802  /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
803  /* CUDAERR(cudaEventRecord(event1, stream2)); */
804 
805  /* L=W*H */
806  CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
807 
808  /* L=A(./)L */
809  sdiv_cuda(m*n, A, L, stream1);
810 
811  /* B=W'*L */
812  CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, L, m, &beta, B, k));
813 
814  /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
815  /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
816 
817  /* B(i,j)=B(i,j)/y(i) for all B elements */
818  /* H=H(.*)B */
819  supdate2H_cuda(k, n, y, B, H, stream1);
820 
821  /* Stream1 recording event "event" for stream2 */
822  CUDAERR(cudaEventRecord(event, stream1));
823 
824 
825  /* ************************** Phase 2 **************************** */
826  /* Stream2 waiting "event" set by stream1 */
827  CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
828 
829  /* Calculate the sums of all H rows via dgemv(H, x) is performed */
830  CUBLASERR(cublasSgemv(handle2, CUBLAS_OP_N, k, n, &alpha, H, k, x, 1, &beta, y, 1));
831 
832  /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
833  /* CUDAERR(cudaEventRecord(event1, stream2)); */
834 
835  /* L=W*H */
836  CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
837 
838  /* L=A(./)L */
839  sdiv_cuda(m*n, A, L, stream1);
840 
841  /* B=L*H' */
842  CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, L, m, H, k, &beta, B, m));
843 
844  /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
845  /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
846 
847  /* B(i,j)=B(i,j)/y(j) for all B elements */
848  /* W=W(.*)B */
849  supdate2W_cuda(m, k, y, B, W, stream1);
850 
851  /* Stream1 recording event "event2" for stream2 */
852  CUDAERR(cudaEventRecord(event, stream1));
853  }
854  /* maybe not needed but by completeness ... */
855  CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
856 
857  CUBLASERR(cublasDestroy(handle1));
858  CUBLASERR(cublasDestroy(handle2));
859 
860  CUDAERR(cudaStreamDestroy(stream1));
861  CUDAERR(cudaStreamDestroy(stream2));
862 
863  CUDAERR(cudaEventDestroy(event));
864 
865  CUDAERR(cudaFree(B));
866  CUDAERR(cudaFree(L));
867  CUDAERR(cudaFree(x));
868  CUDAERR(cudaFree(y));
869  break;
870 
871  case UpdateW:
872  CUDAERR(cudaMalloc((void **)&B, m*k*sizeof(float)));
873  CUDAERR(cudaMalloc((void **)&L, m*n*sizeof(float)));
874  CUDAERR(cudaMalloc((void **)&x, n*sizeof(float)));
875  CUDAERR(cudaMalloc((void **)&y, k*sizeof(float)));
876 
877  CUDAERR(cudaStreamCreate(&stream1));
878  CUDAERR(cudaStreamCreate(&stream2));
879 
880  CUBLASERR(cublasCreate(&handle1));
881  CUBLASERR(cublasCreate(&handle2));
882 
883  CUDAERR(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
884 
885  CUBLASERR(cublasSetStream(handle1, stream1));
886  CUBLASERR(cublasSetStream(handle2, stream2));
887 
 /* Stream 1 starts filling x with ones (x[i]=1.0 for all i). NOTE(review): x was allocated with only n floats a few lines above, but max(m,n) elements are written here — out-of-bounds write when m > n; confirm intended allocation size */
889  smemset_cuda(max(m,n), x, 1.0f, stream1);
890 
891  /* Stream1 recording event "event" for stream2 (for the 1st time) */
892  CUDAERR(cudaEventRecord(event, stream1));
893 
894  for(i=0;i<nIter;i++)
895  {
896  /* ************************** Phase 2 **************************** */
897  /* Stream2 waiting "event" set by stream1 */
898  CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
899 
900  /* Calculate the sums of all H rows via dgemv(H, x) is performed */
901  CUBLASERR(cublasSgemv(handle2, CUBLAS_OP_N, k, n, &alpha, H, k, x, 1, &beta, y, 1));
902 
903  /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
904  /* CUDAERR(cudaEventRecord(event1, stream2)); */
905 
906  /* L=W*H */
907  CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
908 
909  /* L=A(./)L */
910  sdiv_cuda(m*n, A, L, stream1);
911 
912  /* B=L*H' */
913  CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, L, m, H, k, &beta, B, m));
914 
915  /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
916  /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
917 
918  /* B(i,j)=B(i,j)/y(j) for all B elements */
919  /* W=W(.*)B */
920  supdate2W_cuda(m, k, y, B, W, stream1);
921 
922  /* Stream1 recording event "event2" for stream2 */
923  CUDAERR(cudaEventRecord(event, stream1));
924  }
925  /* maybe not needed but by completeness ... */
926  CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
927 
928  CUBLASERR(cublasDestroy(handle1));
929  CUBLASERR(cublasDestroy(handle2));
930 
931  CUDAERR(cudaStreamDestroy(stream1));
932  CUDAERR(cudaStreamDestroy(stream2));
933 
934  CUDAERR(cudaEventDestroy(event));
935 
936  CUDAERR(cudaFree(B));
937  CUDAERR(cudaFree(L));
938  CUDAERR(cudaFree(x));
939  CUDAERR(cudaFree(y));
940  break;
941 
942  case UpdateH:
943  CUDAERR(cudaMalloc((void **)&B, n*k*sizeof(float)));
944  CUDAERR(cudaMalloc((void **)&L, m*n*sizeof(float)));
945  CUDAERR(cudaMalloc((void **)&x, m*sizeof(float)));
946  CUDAERR(cudaMalloc((void **)&y, k*sizeof(float)));
947 
948  CUDAERR(cudaStreamCreate(&stream1));
949  CUDAERR(cudaStreamCreate(&stream2));
950 
951  CUBLASERR(cublasCreate(&handle1));
952  CUBLASERR(cublasCreate(&handle2));
953 
954  CUDAERR(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
955 
956  CUBLASERR(cublasSetStream(handle1, stream1));
957  CUBLASERR(cublasSetStream(handle2, stream2));
958 
959  /* Stream 1 stars filling x with ones (x[i]=1.0 for all i) */
960  smemset_cuda(max(m,n), x, 1.0f, stream1);
961 
962  /* Stream1 recording event "event" for stream2 (for the 1st time) */
963  CUDAERR(cudaEventRecord(event, stream1));
964 
965  for(i=0;i<nIter;i++)
966  {
967  /* ************************ Phase 1 *************************** */
968  /* Stream2 waiting "event" set by stream1. 1st time is always ok */
969  CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
970 
971  /* Calculate the sums of all W columns via dgemv(W, x) */
972  CUBLASERR(cublasSgemv(handle2, CUBLAS_OP_T, m, k, &alpha, W, m, x, 1, &beta, y, 1));
973 
974  /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
975  /* CUDAERR(cudaEventRecord(event1, stream2)); */
976 
977  /* L=W*H */
978  CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
979 
980  /* L=A(./)L */
981  sdiv_cuda(m*n, A, L, stream1);
982 
983  /* B=W'*L */
984  CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, L, m, &beta, B, k));
985 
986  /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
987  /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
988 
989  /* B(i,j)=B(i,j)/y(i) for all B elements */
990  /* H=H(.*)B */
991  supdate2H_cuda(k, n, y, B, H, stream1);
992 
993  /* Stream1 recording event "event" for stream2 */
994  CUDAERR(cudaEventRecord(event, stream1));
995  }
996  /* maybe not needed but by completeness ... */
997  CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
998 
999  CUBLASERR(cublasDestroy(handle1));
1000  CUBLASERR(cublasDestroy(handle2));
1001 
1002  CUDAERR(cudaStreamDestroy(stream1));
1003  CUDAERR(cudaStreamDestroy(stream2));
1004 
1005  CUDAERR(cudaEventDestroy(event));
1006 
1007  CUDAERR(cudaFree(B));
1008  CUDAERR(cudaFree(L));
1009  CUDAERR(cudaFree(x));
1010  CUDAERR(cudaFree(y));
1011  break;
1012 
1013  default:
1014  return -1;
1015  }
1016  return 0;
1017 }
1018 
1019 
1020 /**
1021  * \fn int dbdiv_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const double beta, const int uType, const int nIter)
1022  * \brief dbdiv_cuda is a wrapper that calls the adequate function to performs NNMF using betadivergence using double precision with GPUs
1023  * \param m: (input) Number of rows of matrix A and matrix W
1024  * \param n: (input) Number of columns of matrix A and matrix H
1025  * \param k: (input) Number of columns of matrix W and rows of H
1026  * \param A: (input) Double precision input matrix of (m * n) number of elements stored using 1D column-major
1027  * \param W: (inout) Double precision input/output matrix of (m * k) number of elements stored using 1D column-major
1028  * \param H: (inout) Double precision input/output matrix of (k * n) number of elements stored using 1D column-major
1029  * \param beta: (input) Double precision value. The parameter beta of betadivergence method
 * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only W is updated) or UpdateH (only H is updated)
1031  * \param nIter: (input) Number of iterations
1032 */
int dbdiv_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const double beta, const int uType, const int nIter)
{
    /* Reject a negative beta or a non-positive iteration count */
    if ((beta < 0.0) || (nIter <= 0))
        return -1;

    /* Dispatch on beta. The (>= && <=) form expresses exact equality with  */
    /* the literal while avoiding float-equality compiler warnings.         */
    if (beta >= 2.0 && beta <= 2.0)
        return dmlsa_cuda(m, n, k, A, W, H, uType, nIter);      /* beta == 2 case */

    if (beta >= 1.0 && beta <= 1.0)
        return dbdivone_cuda(m, n, k, A, W, H, uType, nIter);   /* beta == 1 case */

    return dbdivg_cuda(m, n, k, A, W, H, beta, uType, nIter);   /* general beta */
}
1048 
1049 
1050 /**
 * \fn int sbdiv_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const float beta, const int uType, const int nIter)
1052  * \brief sbdiv_cuda is a wrapper that calls the adequate function to performs NNMF using betadivergence
1053  * using simple precision with GPUs
1054  * \param m: (input) Number of rows of matrix A and matrix W
1055  * \param n: (input) Number of columns of matrix A and matrix H
1056  * \param k: (input) Number of columns of matrix W and rows of H
1057  * \param A: (input) Simple precision input matrix of (m * n) number of elements stored using 1D column-major
1058  * \param W: (inout) Simple precision input/output matrix of (m * k) number of elements stored using 1D column-major
1059  * \param H: (inout) Simple precision input/output matrix of (k * n) number of elements stored using 1D column-major
1060  * \param beta: (input) Simple precision value. The parameter beta of betadivergence method
 * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only W is updated) or UpdateH (only H is updated)
1062  * \param nIter: (input) Number of iterations
1063 */
int sbdiv_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const float beta, const int uType, const int nIter)
{
    /* Reject a negative beta or a non-positive iteration count */
    if ((beta < 0.0f) || (nIter <= 0))
        return -1;

    /* Dispatch on beta. The (>= && <=) form expresses exact equality with  */
    /* the literal while avoiding float-equality compiler warnings.         */
    if (beta >= 2.0f && beta <= 2.0f)
        return smlsa_cuda(m, n, k, A, W, H, uType, nIter);      /* beta == 2 case */

    if (beta >= 1.0f && beta <= 1.0f)
        return sbdivone_cuda(m, n, k, A, W, H, uType, nIter);   /* beta == 1 case */

    return sbdivg_cuda(m, n, k, A, W, H, beta, uType, nIter);   /* general beta */
}
1079 
1080 
1081 /**
1082  * \fn void dupdate1H_cuda(const int n, const double *X, double *H, cudaStream_t stream)
1083  * \brief It calls cuda kernel vdupdate1H_cuda that performs H=H(.*)B(./)C where matrices
1084  * B and C are stored in buffer X by columns. All B 1st and after all C
1085  * \param n: (input) Number of elements of H
1086  * \param X: (input) Double precision input buffer (1D column-major)
1087  * \param H: (inout) Double precision input/output matrix (1D column-major)
1088  * \param stream: (input) ID of the stream to use
1089 */
void dupdate1H_cuda(const int n, const double *X, double *H, cudaStream_t stream)
{
    /* 192 threads per block on compute capability 2.0, 256 elsewhere */
    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        const unsigned int threads = 192;
    #else
        const unsigned int threads = 256;
    #endif

    dim3 dimBlock(threads);
    dim3 dimGrid((n + threads - 1) / threads);   /* ceil(n / threads) */

    vdupdate1H_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, X, H);
    #ifdef With_Check
    /* debug builds synchronize so kernel errors surface here */
    cudaDeviceSynchronize();
    #endif
}
1106 
1107 
1108 /**
1109  * \fn void supdate1H_cuda(const int n, const float *X, float *H, cudaStream_t stream)
1110  * \brief It calls cuda kernel vsupdate1H_cuda that performs H=H(.*)B(./)C where matrices
1111  * B and C are stored in buffer X by columns. All B 1st and after all C
1112  * \param n: (input) Number of elements of H
1113  * \param X: (input) Simple precision input buffer (1D column-major)
1114  * \param H: (inout) Simple precision input/output matrix (1D column-major)
1115  * \param stream: (input) ID of the stream to use
1116 */
void supdate1H_cuda(const int n, const float *X, float *H, cudaStream_t stream)
{
    /* 192 threads per block on compute capability 2.0, 256 elsewhere */
    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        const unsigned int threads = 192;
    #else
        const unsigned int threads = 256;
    #endif

    dim3 dimBlock(threads);
    dim3 dimGrid((n + threads - 1) / threads);   /* ceil(n / threads) */

    vsupdate1H_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, X, H);
    #ifdef With_Check
    /* debug builds synchronize so kernel errors surface here */
    cudaDeviceSynchronize();
    #endif
}
1133 
1134 
1135 /**
1136  * \fn void dupdate1W_cuda(const int m, const int n, const double *X, double *W, cudaStream_t stream)
1137  * \brief It calls kernel vdupdate1W_cuda that performs W=W(.*)D(./)E where matrices
1138  * D and E are stored in buffer X according beta-divergence general case (see dbdivg_cuda(...))
1139  * \param m: (input) Number of rows of W
1140  * \param n: (input) Number of colums of W
1141  * \param X: (input) Double precision input buffer (1D column-major)
1142  * \param W: (inout) Double precision input/output matrix (1D column-major)
1143  * \param stream: (input) ID of the stream to use
1144 */
void dupdate1W_cuda(const int m, const int n, const double *X, double *W, cudaStream_t stream)
{
    /* 192 threads per block on compute capability 2.0, 256 elsewhere */
    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        const unsigned int threads = 192;
    #else
        const unsigned int threads = 256;
    #endif

    dim3 dimBlock(threads);
    dim3 dimGrid((m*n + threads - 1) / threads);   /* one thread per W element */

    vdupdate1W_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, X, W);
    #ifdef With_Check
    /* debug builds synchronize so kernel errors surface here */
    cudaDeviceSynchronize();
    #endif
}
1161 
1162 
1163 /**
1164  * \fn void supdate1W_cuda(const int m, const int n, const float *X, float *W, cudaStream_t stream)
1165  * \brief It calls kernel vsupdate1W_cuda that performs W=W(.*)D(./)E where matrices
1166  * D and E are stored in buffer X according beta-divergence general case (see sbdivg_cuda(...))
1167  * \param m: (input) Number of rows of W
1168  * \param n: (input) Number of colums of W
1169  * \param X: (input) Simple precision input buffer (1D column-major)
1170  * \param W: (inout) Simple precision input/output matrix (1D column-major)
1171  * \param stream: (input) ID of the stream to use
1172 */
void supdate1W_cuda(const int m, const int n, const float *X, float *W, cudaStream_t stream)
{
    /* 192 threads per block on compute capability 2.0, 256 elsewhere */
    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        const unsigned int threads = 192;
    #else
        const unsigned int threads = 256;
    #endif

    dim3 dimBlock(threads);
    dim3 dimGrid((m*n + threads - 1) / threads);   /* one thread per W element */

    vsupdate1W_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, X, W);
    #ifdef With_Check
    /* debug builds synchronize so kernel errors surface here */
    cudaDeviceSynchronize();
    #endif
}
1189 
1190 
1191 /**
1192  * \fn void dupdate2H_cuda(const int m, const int n, const double *x, const double *B, double *H, cudaStream_t stream)
1193  * \brief It calls kernel vdupdate2H_cuda that performs H(i)=H(i)*(B(i)/x(j))
1194  * \param m: (input) Number of rows of B and H, and number of elements of vector x
1195  * \param n: (input) Number of columns of B and A
1196  * \param x: (input) Double precision vector with the sum of W columns
1197  * \param B: (input) Double precision input matrix (1D column-major)
1198  * \param H: (inout) Double precision input/output matrix (1D column-major)
1199  * \param stream: (input) ID of the stream to use
1200 */
void dupdate2H_cuda(const int m, const int n, const double *x, const double *B, double *H, cudaStream_t stream)
{
    /* 192 threads per block on compute capability 2.0, 256 elsewhere */
    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        const unsigned int threads = 192;
    #else
        const unsigned int threads = 256;
    #endif

    dim3 dimBlock(threads);
    dim3 dimGrid((m*n + threads - 1) / threads);   /* one thread per H element */

    vdupdate2H_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, x, B, H);
    #ifdef With_Check
    /* debug builds synchronize so kernel errors surface here */
    cudaDeviceSynchronize();
    #endif
}
1217 
1218 
1219 /**
1220  * \fn void supdate2H_cuda(const int m, const int n, const float *x, const float *B, float *H, cudaStream_t stream)
1221  * \brief It calls kernel vsupdate2H_cuda that performs H(i)=H(i)*(B(i)/x(j))
1222  * \param m: (input) Number of rows of B and H, and number of elements of vector x
1223  * \param n: (input) Number of columns of B and A
1224  * \param x: (input) Simple precision vector with the sum of W columns
1225  * \param B: (input) Simple precision input matrix (1D column-major)
1226  * \param H: (inout) Simple precision input/output matrix (1D column-major)
1227  * \param stream: (input) ID of the stream to use
1228 */
void supdate2H_cuda(const int m, const int n, const float *x, const float *B, float *H, cudaStream_t stream)
{
    /* 192 threads per block on compute capability 2.0, 256 elsewhere */
    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        const unsigned int threads = 192;
    #else
        const unsigned int threads = 256;
    #endif

    dim3 dimBlock(threads);
    dim3 dimGrid((m*n + threads - 1) / threads);   /* one thread per H element */

    vsupdate2H_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, x, B, H);
    #ifdef With_Check
    /* debug builds synchronize so kernel errors surface here */
    cudaDeviceSynchronize();
    #endif
}
1245 
1246 
1247 /**
1248  * \fn void dupdate2W_cuda(const int m, const int n, const double *x, const double *B, double *W, cudaStream_t stream)
1249  * \brief It calls kernel vdupdate2W_cuda that performs W(i)=W(i)*(B(i)/x(j))
1250  * \param m: (input) Number of rows of W and B,
1251  * \param n: (input) Number of columns of W and B, and number of elements of vector x
1252  * \param x: (input) Double precision vector with the sum of W fills
1253  * \param B: (input) Double precision input matrix (1D column-major)
1254  * \param W: (inout) Double precision input/output matrix (1D column-major)
1255  * \param stream: (input) ID of the stream to use
1256 */
void dupdate2W_cuda(const int m, const int n, const double *x, const double *B, double *W, cudaStream_t stream)
{
    /* 192 threads per block on compute capability 2.0, 256 elsewhere */
    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        const unsigned int threads = 192;
    #else
        const unsigned int threads = 256;
    #endif

    dim3 dimBlock(threads);
    dim3 dimGrid((m*n + threads - 1) / threads);   /* one thread per W element */

    vdupdate2W_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, x, B, W);
    #ifdef With_Check
    /* debug builds synchronize so kernel errors surface here */
    cudaDeviceSynchronize();
    #endif
}
1273 
1274 
1275 /**
1276  * \fn void supdate2W_cuda(const int m, const int n, const float *x, const float *B, float *W, cudaStream_t stream)
1277  * \brief It calls kernel vsupdate2W_cuda that performs W(i)=W(i)*(B(i)/x(j))
1278  * \param m: (input) Number of rows of W and B,
1279  * \param n: (input) Number of columns of W and B, and number of elements of vector x
1280  * \param x: (input) Simple precision vector with the sum of W fills
1281  * \param B: (input) Simple precision input matrix (1D column-major)
1282  * \param W: (inout) Simple precision input/output matrix (1D column-major)
1283  * \param stream: (input) ID of the stream to use
1284 */
void supdate2W_cuda(const int m, const int n, const float *x, const float *B, float *W, cudaStream_t stream)
{
    /* 192 threads per block on compute capability 2.0, 256 elsewhere */
    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        const unsigned int threads = 192;
    #else
        const unsigned int threads = 256;
    #endif

    dim3 dimBlock(threads);
    dim3 dimGrid((m*n + threads - 1) / threads);   /* one thread per W element */

    vsupdate2W_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, x, B, W);
    #ifdef With_Check
    /* debug builds synchronize so kernel errors surface here */
    cudaDeviceSynchronize();
    #endif
}
1301 
1302 
1303 /**
1304  * \fn void dkernelH_cuda(const int m, const int n, const double *L, const double *A, double *R, const double expo, cudaStream_t stream)
1305  * \brief It calls kernel vdkernelH_cuda that performs R(i)=((L(i)^expo)*A(i))*L(i)
1306  * Note expo is a real number expo < 0 or expo > 0
1307  * \param m: (input) Number of rows of L, A and R matrices
1308  * \param n: (input) Number of columns of L, A and R matrices
1309  * \param L: (input) Double precision input matrix (1D column-major)
1310  * \param A: (input) Double precision input matrix (1D column-major)
1311  * \param R: (output) Double precision output matrix (1D column-major)
1312  * \param expo: (input) the "power of" for function pow(). It is a double precision value
1313  * \param stream: (input) ID of the stream to use
1314 */
void dkernelH_cuda(const int m, const int n, const double *L, const double *A, double *R, const double expo, cudaStream_t stream)
{
    /* NOTE: unlike the other wrappers in this file, both branches of the    */
    /* original CUDA_ARCH selection assigned 256 here, so the block size is  */
    /* unconditional (behaviour unchanged).                                  */
    const unsigned int threads = 256;

    dim3 dimBlock(threads);
    dim3 dimGrid((m*n + threads - 1) / threads);   /* one thread per element */

    vdkernelH_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, L, A, R, expo);
    #ifdef With_Check
    /* debug builds synchronize so kernel errors surface here */
    cudaDeviceSynchronize();
    #endif
}
1331 
1332 
1333 /**
 * \fn void skernelH_cuda(const int m, const int n, const float *L, const float *A, float *R, const float expo, cudaStream_t stream)
1335  * \brief It calls kernel vskernelH_cuda that performs R(i)=((L(i)^expo)*A(i))*L(i)
1336  * Note expo is a real number expo < 0 or expo > 0
1337  * \param m: (input) Number of rows of L, A and R matrices
1338  * \param n: (input) Number of columns of L, A and R matrices
1339  * \param L: (input) Simple precision input matrix (1D column-major)
1340  * \param A: (input) Simple precision input matrix (1D column-major)
1341  * \param R: (output) Simple precision output matrix (1D column-major)
1342  * \param expo: (input) the "power of" for function pow(). It is a simple precision value
1343  * \param stream: (input) ID of the stream to use
1344 */
void skernelH_cuda(const int m, const int n, const float *L, const float *A, float *R, const float expo, cudaStream_t stream)
{
    /* 192 threads per block on compute capability 2.0, 256 elsewhere */
    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        const unsigned int threads = 192;
    #else
        const unsigned int threads = 256;
    #endif

    dim3 dimBlock(threads);
    dim3 dimGrid((m*n + threads - 1) / threads);   /* one thread per element */

    vskernelH_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, L, A, R, expo);
    #ifdef With_Check
    /* debug builds synchronize so kernel errors surface here */
    cudaDeviceSynchronize();
    #endif
}
1361 
1362 
1363 /**
1364  * \fn void dkernelW_cuda(const int m, const int n, const double *L, const double *A, double *R, const double expo, cudaStream_t stream)
1365  * \brief It calls kernel vdkernelW_cuda that performs R(i)=((L(i)^expo)*A(i))*L(i)
1366  * \param m: (input) Number of rows of L, A and R matrices
1367  * \param n: (input) Number of columns of L, A and R matrices
1368  * \param L: (input) Double precision input matrix (1D column-major)
1369  * \param A: (input) Double precision input matrix (1D column-major)
1370  * \param R: (output) Double precision output matrix (1D column-major)
1371  * \param expo: (input) the "power of" for function pow(). It is a double precision value
1372  * \param stream: (input) ID of the stream to use
1373 */
void dkernelW_cuda(const int m, const int n, const double *L, const double *A, double *R, const double expo, cudaStream_t stream)
{
    /* NOTE: unlike the other wrappers in this file, both branches of the    */
    /* original CUDA_ARCH selection assigned 256 here, so the block size is  */
    /* unconditional (behaviour unchanged).                                  */
    const unsigned int threads = 256;

    dim3 dimBlock(threads);
    dim3 dimGrid((m*n + threads - 1) / threads);   /* one thread per element */

    vdkernelW_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, L, A, R, expo);
    #ifdef With_Check
    /* debug builds synchronize so kernel errors surface here */
    cudaDeviceSynchronize();
    #endif
}
1390 
1391 
1392 /**
1393  * \fn void skernelW_cuda(const int m, const int n, const float *L, const float *A, float *R, const float expo, cudaStream_t stream)
1394  * \brief It calls kernel vskernelW_cuda that performs R(i)=((L(i)^expo)*A(i))*L(i)
1395  * \param m: (input) Number of rows of L, A and R matrices
1396  * \param n: (input) Number of columns of L, A and R matrices
1397  * \param L: (input) Simple precision input matrix (1D column-major)
1398  * \param A: (input) Simple precision input matrix (1D column-major)
1399  * \param R: (output) Simple precision output matrix (1D column-major)
1400  * \param expo: (input) the "power of" for function pow(). It is a simple precision value
1401  * \param stream: (input) ID of the stream to use
1402 */
void skernelW_cuda(const int m, const int n, const float *L, const float *A, float *R, const float expo, cudaStream_t stream)
{
    /* 192 threads per block on compute capability 2.0, 256 elsewhere */
    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        const unsigned int threads = 192;
    #else
        const unsigned int threads = 256;
    #endif

    dim3 dimBlock(threads);
    dim3 dimGrid((m*n + threads - 1) / threads);   /* one thread per element */

    vskernelW_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, L, A, R, expo);
    #ifdef With_Check
    /* debug builds synchronize so kernel errors surface here */
    cudaDeviceSynchronize();
    #endif
}
1419 
1420 
1421 
1422 /**
1423  * \fn __global__ void vdkernelH_cuda(const int m, const int n, const double* __restrict__ L, const double* __restrict__ A, double *R, const double expo)
1424  * \brief This kernel performs double precision R(i)=(L(i)^expo)*A[i] and R(i+m*n)=L[i]*(L(i)^expo)
1425  * Note expo is a real number expo < 0 or expo > 0
1426  * \param m: (input) Number of rows of L, A and R matrices
1427  * \param n: (input) Number of columns of L, A and R matrices
1428  * \param L: (input) Double precision input matrix (1D column-major)
1429  * \param A: (input) Double precision input matrix (1D column-major)
1430  * \param R: (output) Double precision output matrix (1D column-major)
1431  * \param expo: (input) the "power of" for function pow(). It is a double precision value
1432 */
__global__ void vdkernelH_cuda(const int m, const int n, const double* __restrict__ L, const double* __restrict__ A, double *R, const double expo)
{
    /* One thread per element: R[i] = L[i]^expo * A[i] and R[i+m*n] = L[i]^(expo+1) */
    const unsigned int i    = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int size = m * n;

    if (i >= size)
        return;   /* grid tail: nothing to do */

    const double lij = L[i];

    if (lij == 0.0)
    {
        /* special-case zero so pow(0, expo) (Inf/NaN for expo <= 0) is never evaluated */
        R[i]      = 0.0;
        R[i+size] = 0.0;
    }
    else
    {
        const double lpow = pow(lij, expo);

        /* testing A[i] == 0.0 gave no measurable gain, so it is not done */
        R[i]      = lpow * A[i];
        R[i+size] = lij  * lpow;
    }
}
1459 
1460 /**
 * \fn __global__ void vskernelH_cuda(const int m, const int n, const float* __restrict__ L, const float* __restrict__ A, float *R, const float expo)
1462  * \brief This kernel computes simple precision R(i)=(L(i)^expo)*A[i] and R(i+m*n)=L[i]*(L(i)^expo)
1463  * Note expo is a real number expo < 0 or expo > 0
1464  * \param m: (input) Number of rows of L, A and R matrices
1465  * \param n: (input) Number of columns of L, A and R matrices
 * \param L: (input) Simple precision input matrix (1D column-major)
 * \param A: (input) Simple precision input matrix (1D column-major)
 * \param R: (output) Simple precision output matrix (1D column-major)
1469  * \param expo: (input) the "power of" for function pow(). It is a simple precision value
1470 */
__global__ void vskernelH_cuda(const int m, const int n, const float* __restrict__ L, const float* __restrict__ A, float *R, const float expo)
{
    /* One thread per element: R[i] = L[i]^expo * A[i] and R[i+m*n] = L[i]^(expo+1) */
    const unsigned int i    = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int size = m * n;

    if (i >= size)
        return;   /* grid tail: nothing to do */

    const float lij = L[i];

    if (lij == 0.0f)
    {
        /* special-case zero so powf(0, expo) (Inf/NaN for expo <= 0) is never evaluated */
        R[i]      = 0.0f;
        R[i+size] = 0.0f;
    }
    else
    {
        const float lpow = powf(lij, expo);

        /* testing A[i] == 0.0f gave no measurable gain, so it is not done */
        R[i]      = lpow * A[i];
        R[i+size] = lij  * lpow;
    }
}
1497 
1498 
1499 /**
1500  * \fn __global__ void vdkernelW_cuda(const int m, const int n, const double* __restrict__ L, const double* __restrict__ A, double *R, const double expo)
1501  * \brief This kernel computes double precision R(pos)=L(i)^expo)*A(i) and R(pos+m)=L(i)*(L(i)^expo)
1502  * Note expo is a real number expo < 0 or expo > 0
1503  * \param m: (input) Number of rows of L, A and R matrices
1504  * \param n: (input) Number of columns of L, A and R matrices
1505  * \param L: (input) Double precision input matrix (1D column-major)
1506  * \param A: (input) Double precision input matrix (1D column-major)
1507  * \param R: (output) Double precision output matrix (1D column-major)
1508  * \param expo: (input) the "power of" for function pow(). It is a double precision value
1509 */
__global__ void vdkernelW_cuda(const int m, const int n, const double* __restrict__ L, const double* __restrict__ A, double *R, const double expo)
{
    /* One thread per element of L/A. R packs two m-row blocks per column:  */
    /* column j of the "D" part at offset 2*m*j and column j of the "E"     */
    /* part at offset 2*m*j + m; newpos maps pos into that layout.          */
    unsigned int
        pos=blockDim.x * blockIdx.x + threadIdx.x,
        newpos;

    double
        dtmp1, dtmp2;

    if (pos < m*n)
    {
        newpos = 2*m*(pos/m)+(pos%m);
        dtmp1  = L[pos];

        if (dtmp1 == 0.0)
            /* BUG FIX: this branch previously wrote R[pos] and R[pos+m],    */
            /* which are the wrong slots for every column but the first.     */
            /* Use the same remapped positions as the general case below.    */
            R[newpos] = R[newpos+m] = 0.0;
        else
        {
            /* zero is excluded above, so pow() cannot produce Inf for expo < 0 */
            dtmp2 = pow(dtmp1, expo);

            R[newpos]  = dtmp2*A[pos];
            R[newpos+m]= dtmp1*dtmp2;
        }
    }
}
1536 
1537 
1538 /**
 * \fn __global__ void vskernelW_cuda(const int m, const int n, const float* __restrict__ L, const float* __restrict__ A, float *R, const float expo)
1540  * \brief This kernel computes simple precision R(pos)=L(i)^expo)*A(i) and R(pos+m)=L(i)*(L(i)^expo)
1541  * Note expo is a real number expo < 0 or expo > 0
1542  * \param m: (input) Number of rows of L, A and R matrices
1543  * \param n: (input) Number of columns of L, A and R matrices
1544  * \param L: (input) Simple precision input matrix (1D column-major)
1545  * \param A: (input) Simple precision input matrix (1D column-major)
1546  * \param R: (output) Simple precision output matrix (1D column-major)
1547  * \param expo: (input) the "power of" for function pow(). It is a simple precision value
1548 */
__global__ void vskernelW_cuda(const int m, const int n, const float* __restrict__ L, const float* __restrict__ A, float *R, const float expo)
{
    /* One thread per element of L/A. R packs two m-row blocks per column:  */
    /* column j of the "D" part at offset 2*m*j and column j of the "E"     */
    /* part at offset 2*m*j + m; newpos maps pos into that layout.          */
    unsigned int
        pos=blockDim.x * blockIdx.x + threadIdx.x,
        newpos;

    float
        ftmp1, ftmp2;

    if (pos < m*n)
    {
        newpos = 2*m*(pos/m)+(pos%m);
        ftmp1  = L[pos];

        if (ftmp1 == 0.0f)
            /* BUG FIX: this branch previously wrote R[pos] and R[pos+m],    */
            /* which are the wrong slots for every column but the first.     */
            /* Use the same remapped positions as the general case below.    */
            R[newpos] = R[newpos+m] = 0.0f;
        else
        {
            /* zero is excluded above, so powf() cannot produce Inf for expo < 0 */
            ftmp2 = powf(ftmp1, expo);

            R[newpos]  = ftmp2*A[pos];
            R[newpos+m]= ftmp1*ftmp2;
        }
    }
}
1575 
1576 
1577 /**
1578  * \fn __global__ void vdupdate1H_cuda(const int n, const double* __restrict__ X, double *H)
1579  * \brief This kernel computes double precision H(i)=H(i)*B(i)/C(i) where matrices
1580  * B and C are stored in the same buffer (called X). All B 1st and after C
1581  * \param n: (input) Number of elements of H
1582  * \param X: (input) Double precision input matrix (1D column-major)
1583  * \param H: (inout) Double precision input/output matrix (1D column-major)
1584 */
__global__ void vdupdate1H_cuda(const int n, const double* __restrict__ X, double *H)
   /* The use of "const double* __restrict__ A" and after "...=A[i]" is  */
   /* the same that "double *A" and after "...=__ldg(&A[i])" but __ldg() */
   /* intrinsic is only available on compute capability 3.5 (or newer)   */
   /* With both go through the read-only cache                           */
{
   /* H[i] = H[i] * B[i] / C[i] with B at X[0..n-1] and C at X[n..2n-1] */
   unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

   if (pos < n)
   {
      /* Here we can have NaN and Inf if X(pos+n) and/or H(pos) and/or X(pos)=0 */
      H[pos] = H[pos] * X[pos] / X[pos+n];
      #ifdef With_Check
      /* BUG FIX: the assert previously sat outside the un-braced if, so    */
      /* threads with pos >= n read H[pos] out of bounds in checked builds. */
      assert(!fpe(H[pos]));
      #endif
   }
}
1602 
1603 
1604 /**
1605  * \fn __global__ void vsupdate1H_cuda(const int n, const float* __restrict__ X, float *H)
1606  * \brief This kernel computes simple precision H(i)=H(i)*B(i)/C(i) where matrices
1607  * B and C are stored in the same buffer (called X). All B 1st and after C
1608  * \param n: (input) Number of elements of H
1609  * \param X: (input) Simple precision input matrix (1D column-major)
1610  * \param H: (inout) Simple precision input/output matrix (1D column-major)
1611 */
__global__ void vsupdate1H_cuda(const int n, const float* __restrict__ X, float *H)
{
   /* H[i] = H[i] * B[i] / C[i] with B at X[0..n-1] and C at X[n..2n-1] */
   unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

   if (pos < n)
   {
      /* Here we can have NaN and Inf if X(pos+n) and/or H(pos) and/or X(pos)=0 */
      H[pos] = H[pos] * X[pos] / X[pos+n];
      #ifdef With_Check
      /* BUG FIX: the assert previously sat outside the un-braced if, so    */
      /* threads with pos >= n read H[pos] out of bounds in checked builds. */
      assert(!fpe(H[pos]));
      #endif
   }
}
1625 
1626 
1627 /**
1628  * \fn __global__ void vdupdate1W_cuda(const int m, const int n, const double* __restrict__ X, double *W)
1629  * \brief This kernel computes double precision W[i]=W[i]*D[i]/E[i] where matrices D and E are
1630  * stored in the same buffer (called X) according beta-divergence general case (see dbdivg_cuda(...))
1631  * \param m: (input) Number of rows of W
1632  * \param n: (input) Number of columns of W
1633  * \param X: (input) Double precision input matrix (1D column-major)
1634  * \param W: (inout) Double precision input/output matrix (1D column-major)
1635 */
__global__ void vdupdate1W_cuda(const int m, const int n, const double* __restrict__ X, double *W)
{
   /* W[i] = W[i] * D[i] / E[i]; X interleaves one m-row block of D and   */
   /* one of E per column, hence the remapped source index below.         */
   const unsigned int pos = blockDim.x * blockIdx.x + threadIdx.x;

   if (pos < m*n)
   {
      /* skip one extra m-sized block per column of W */
      const unsigned int src = pos + (pos/m)*m;

      /* NaN/Inf possible when X(src+m), W(pos) or X(src) is 0 */
      W[pos] = W[pos] * X[src] / X[src+m];
      #ifdef With_Check
      assert(!fpe(W[pos]));
      #endif
   }
}
1653 
1654 
/**
 * \fn __global__ void vsupdate1W_cuda(const int m, const int n, const float* __restrict__ X, float *W)
 * \brief This kernel computes simple precision W[i]=W[i]*D[i]/E[i] where matrices D and E are
 * stored in the same buffer (called X) according beta-divergence general case (see dbdivg_cuda(...))
 * \param m: (input) Number of rows of W
 * \param n: (input) Number of columns of W
 * \param X: (input) Simple precision input matrix (1D column-major)
 * \param W: (inout) Simple precision input/output matrix (1D column-major)
*/
__global__ void vsupdate1W_cuda(const int m, const int n, const float* __restrict__ X, float *W)
{
   const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;

   if (idx < m*n)
   {
      /* X interleaves the columns of D and E: element idx of W (column
         j = idx/m) finds its D entry shifted j extra columns into X,
         and the matching E entry one column (m elements) further on */
      const unsigned int src = idx + (idx/m)*m;
      /* Division can yield NaN/Inf if X(src+m), W(idx) and/or X(src) are 0 */
      W[idx] = W[idx] * X[src] / X[src+m];
      #ifdef With_Check
      assert(!fpe(W[idx]));
      #endif
   }
}
1682 
1683 
/**
 * \fn __global__ void vdupdate2H_cuda(const int m, const int n, const double* __restrict__ y, const double* __restrict__ B, double *H)
 * \brief This kernel computes double precision H(i)=H(i)*(B(i)/y(j))
 * \param m: (input) Number of rows of B and H, and number of elements of vector y
 * \param n: (input) Number of columns of B and A
 * \param y: (input) Double precision vector with the sum of W columns
 * \param B: (input) Double precision input matrix (1D column-major)
 * \param H: (inout) Double precision input/output matrix (1D column-major)
*/
__global__ void vdupdate2H_cuda(const int m, const int n, const double* __restrict__ y, const double* __restrict__ B, double *H)
{
   unsigned int pos = blockDim.x * blockIdx.x + threadIdx.x;

   if (pos < m*n)
   {
      /* y is indexed by the row (pos%m); NaN/Inf possible when y(pos%m),
         H(pos) and/or B(pos) are 0 */
      H[pos] = H[pos] * (B[pos] / y[pos%m]);
      #ifdef With_Check
      /* BUGFIX: the assert must sit inside the bounds guard. The original
         braceless "if" covered only the assignment, so with With_Check
         defined every tail thread (pos >= m*n) read H[pos] out of bounds */
      assert(!fpe(H[pos]));
      #endif
   }
}
1706 
1707 
/**
 * \fn __global__ void vsupdate2H_cuda(const int m, const int n, const float* __restrict__ y, const float* __restrict__ B, float *H)
 * \brief This kernel performs the simple precision H(i)=H(i)*(B(i)/y(j))
 * \param m: (input) Number of rows of B and H, and number of elements of vector y
 * \param n: (input) Number of columns of B and A
 * \param y: (input) Simple precision vector with the sum of W columns
 * \param B: (input) Simple precision input matrix (1D column-major)
 * \param H: (inout) Simple precision input/output matrix (1D column-major)
*/
__global__ void vsupdate2H_cuda(const int m, const int n, const float* __restrict__ y, const float* __restrict__ B, float *H)
{
   unsigned int pos = blockDim.x * blockIdx.x + threadIdx.x;

   if (pos < m*n)
   {
      /* y is indexed by the row (pos%m); NaN/Inf possible when y(pos%m),
         H(pos) and/or B(pos) are 0 */
      H[pos] = H[pos] * (B[pos] / y[pos%m]);
      #ifdef With_Check
      /* BUGFIX: the assert must sit inside the bounds guard. The original
         braceless "if" covered only the assignment, so with With_Check
         defined every tail thread (pos >= m*n) read H[pos] out of bounds */
      assert(!fpe(H[pos]));
      #endif
   }
}
1730 
1731 
/**
 * \fn __global__ void vdupdate2W_cuda(const int m, const int n, const double* __restrict__ y, const double* __restrict__ B, double *W)
 * \brief This kernel performs double precision W(i)=W(i)*(B(i)/y(j))
 * \param m: (input) Number of rows of W and B,
 * \param n: (input) Number of columns of W and B, and number of elements of vector y
 * \param y: (input) Double precision vector with the sum of H rows
 * \param B: (input) Double precision input matrix (1D column-major)
 * \param W: (inout) Double precision input/output matrix (1D column-major)
*/
__global__ void vdupdate2W_cuda(const int m, const int n, const double* __restrict__ y, const double* __restrict__ B, double *W)
{
   unsigned int pos = blockDim.x * blockIdx.x + threadIdx.x;

   if (pos < m*n)
   {
      /* y is indexed by the column (pos/m); NaN/Inf possible when y(pos/m),
         W(pos) and/or B(pos) are 0 */
      W[pos] = W[pos] * (B[pos] / y[pos/m]);
      #ifdef With_Check
      /* BUGFIX: the assert must sit inside the bounds guard. The original
         braceless "if" covered only the assignment, so with With_Check
         defined every tail thread (pos >= m*n) read W[pos] out of bounds */
      assert(!fpe(W[pos]));
      #endif
   }
}
1754 
1755 
/**
 * \fn __global__ void vsupdate2W_cuda(const int m, const int n, const float* __restrict__ y, const float* __restrict__ B, float *W)
 * \brief This kernel computes simple precision W(i)=W(i)*(B(i)/y(j))
 * \param m: (input) Number of rows of W and B,
 * \param n: (input) Number of columns of W and B, and number of elements of vector y
 * \param y: (input) Simple precision vector with the sum of H rows
 * \param B: (input) Simple precision input matrix (1D column-major)
 * \param W: (inout) Simple precision input/output matrix (1D column-major)
*/
__global__ void vsupdate2W_cuda(const int m, const int n, const float* __restrict__ y, const float* __restrict__ B, float *W)
{
   unsigned int pos = blockDim.x * blockIdx.x + threadIdx.x;

   if (pos < m*n)
   {
      /* y is indexed by the column (pos/m); NaN/Inf possible when y(pos/m),
         W(pos) and/or B(pos) are 0 */
      W[pos] = W[pos] * (B[pos] / y[pos/m]);
      #ifdef With_Check
      /* BUGFIX: the assert must sit inside the bounds guard. The original
         braceless "if" covered only the assignment, so with With_Check
         defined every tail thread (pos >= m*n) read W[pos] out of bounds */
      assert(!fpe(W[pos]));
      #endif
   }
}