NnmfPack  2.1
mlsa_cuda.cu
/***************************************************************************
 *  Copyright (C) 2014 by PIR (University of Oviedo) and
 *  INCO2 (Polytechnic University of Valencia) groups.
 *  nnmfpack@gmail.com
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the
 *  Free Software Foundation, Inc.,
 *  59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 ***************************************************************************
*/
/**
 * \file mlsa_cuda.cu
 * \brief File with functions to calculate NNMF using the MLSA method on GPUs
 * \author Information Retrieval and Parallel Computing Group (IRPCG)
 * \author University of Oviedo, Spain
 * \author Interdisciplinary Computation and Communication Group (INCO2)
 * \author Universitat Politecnica de Valencia, Spain.
 * \author Contact: nnmfpack@gmail.com
 * \date 04/11/14
*/
#include "mlsa_cuda.h"


/**
 * \fn int dmlsa_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
 * \brief This function performs NNMF using the beta-divergence with beta=2, in double precision.
 *
 * The algorithm is<BR>
 * &nbsp;&nbsp;repeat nIter times<BR>
 * &nbsp;&nbsp;&nbsp;&nbsp;STEP 1<BR>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;L=W*H<BR>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;B=W'*A<BR>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;C=W'*L<BR>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;H=H(.*)B(./)C<BR>
 *
 * &nbsp;&nbsp;&nbsp;&nbsp;STEP 2<BR>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;L=W*H<BR>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;D=A*H'<BR>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;E=L*H'<BR>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;W=W(.*)D(./)E<BR>
 * &nbsp;&nbsp;end repeat<BR>
 * End algorithm<BR>
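 *
 * Written in matrix form, with \f$\odot\f$ the element-wise product and the
 * fraction bars read as element-wise divisions, one iteration applies the
 * multiplicative updates
 * \f[
 *    H \leftarrow H \odot \frac{W^{T}A}{W^{T}W\,H}, \qquad
 *    W \leftarrow W \odot \frac{A\,H^{T}}{W\,H\,H^{T}}.
 * \f]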
 *
 * To save some FLOPs and memory, a modified Lee and Seung formulation is used:
 * Step 1 becomes L=W'*W, C=L*H and B=W'*A; Step 2 becomes L=H*H', E=W*L and
 * D=A*H'. W'*W (respectively H*H') and L*H (W*L) can be computed in parallel
 * with W'*A (A*H'), which is why streams are used here.
 *
 * In practice B and C are (k*n) matrices used in STEP 1, and D and E are (m*k)
 * matrices used in STEP 2. B/C and D/E are never needed at the same time, so
 * only two matrices are declared to save space: B and C, each of size
 * max(m,n)*k.
 *
 * \param m: (input) Number of rows of matrix A and matrix W
 * \param n: (input) Number of columns of matrix A and matrix H
 * \param k: (input) Number of columns of matrix W and rows of H
 * \param A: (input) Double precision input matrix of (m * n) elements stored in 1D column-major order
 * \param W: (inout) Double precision input/output matrix of (m * k) elements stored in 1D column-major order
 * \param H: (inout) Double precision input/output matrix of (k * n) elements stored in 1D column-major order
 * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only W is updated) or UpdateH (only H is updated)
 * \param nIter: (input) Number of iterations
 *
 * It returns 0 on success, or -1 if uType is not a valid option.
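 *
 * A minimal, illustrative host-side sketch (not part of the library): the
 * device buffers dA, dW and dH are assumed to be already created with
 * cudaMalloc, sized m*n, m*k and k*n doubles, stored in column-major order
 * and filled with nonnegative values (W and H with the initial guesses).
 * \code
 *   // 100 multiplicative iterations updating both factors
 *   if (dmlsa_cuda(m, n, k, dA, dW, dH, UpdateAll, 100) != 0)
 *      fprintf(stderr, "dmlsa_cuda failed\n");
 *   // dW and dH now hold the updated factors of dA
 * \endcode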
*/
int dmlsa_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
{
   double
      *B=NULL,
      *C=NULL,
      *L=NULL,
      alpha=1.0,
      beta =0.0;

   int
      i, devID;

   cublasHandle_t
      handle1,
      handle2;

   cudaStream_t
      stream1,
      stream2;

   cudaEvent_t
      event1,
      event2;

   CUDAERR(cudaGetDevice(&devID));

   switch (uType)
   {
      case UpdateAll:
         CUDAERR(cudaMalloc((void **)&B, max(m,n)*k*sizeof(double)));
         CUDAERR(cudaMalloc((void **)&C, max(m,n)*k*sizeof(double)));
         CUDAERR(cudaMalloc((void **)&L, k*k*sizeof(double)));

         CUDAERR(cudaStreamCreate(&stream1));
         CUDAERR(cudaStreamCreate(&stream2));

         CUBLASERR(cublasCreate(&handle1));
         CUBLASERR(cublasCreate(&handle2));

         CUDAERR(cudaEventCreateWithFlags(&event1, cudaEventDisableTiming));
         CUDAERR(cudaEventCreateWithFlags(&event2, cudaEventDisableTiming));

         CUBLASERR(cublasSetStream(handle1, stream1));
         CUBLASERR(cublasSetStream(handle2, stream2));

         /* Why streams? Today the overlap may not pay off, but future */
         /* CUBLAS releases may take better advantage of it. */
         /* Why this arrangement of the streams? */
         /* Let m=n and k=n/2. Then B=W'*A takes n^3 flops, while W'*W and */
         /* L*H take (n^3)/2 flops each. That is, stream 2 runs n^3 flops */
         /* and so does stream 1 ((n^3)/2 + (n^3)/2 = n^3). When k is */
         /* smaller than n/2, stream 2 runs more flops (it is the slower */
         /* one); when k is bigger the balance shifts. In real problems k */
         /* should be much smaller than min(m,n), so this arrangement is */
         /* (probably) the best. */

         /* Stream1 recording event "event2" for stream2 (for the 1st time) */
         CUDAERR(cudaEventRecord(event2, stream1));

         for(i=0;i<nIter;i++)
         {
            /* ************************** Phase 1 **************************** */
            /* Stream2 waiting "event2" set by stream1. 1st time is always ok */
            CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

            /* B=W'*A */
            CUBLASERR(cublasDgemm(handle2, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, A, m, &beta, B, k));

            /* Stream2 recording event "event1" for stream1 */
            CUDAERR(cudaEventRecord(event1, stream2));

            /* L=W'*W */
            CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, k, m, &alpha, W, m, W, m, &beta, L, k));

            /* C=L*H */
            CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, k, n, k, &alpha, L, k, H, k, &beta, C, k));

            /* Stream1 waiting "event1" set by stream2 */
            CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));

            /* H=H(.*)B(./)C */
            ddotdiv_cuda(k*n, B, C, H, stream1);

            /* Stream1 recording event "event2" for stream2 */
            CUDAERR(cudaEventRecord(event2, stream1));


            /* ************************** Phase 2 **************************** */
            /* Stream2 waiting "event2" set by stream1 */
            CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

            /* B=A*H' */
            CUBLASERR(cublasDgemm(handle2, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, A, m, H, k, &beta, B, m));

            /* Stream2 recording event "event1" for stream1 */
            CUDAERR(cudaEventRecord(event1, stream2));

            /* L=H*H' */
            CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, k, k, n, &alpha, H, k, H, k, &beta, L, k));

            /* C=W*L */
            CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, k, k, &alpha, W, m, L, k, &beta, C, m));

            /* Stream1 waiting "event1" set by stream 2 */
            CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));

            /* W=W(.*)B(./)C */
            ddotdiv_cuda(m*k, B, C, W, stream1);

            /* Stream1 recording event "event2" for stream2 */
            CUDAERR(cudaEventRecord(event2, stream1));
         }
         /* maybe not needed, but kept for completeness ... */
         CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

         CUBLASERR(cublasDestroy(handle1));
         CUBLASERR(cublasDestroy(handle2));

         CUDAERR(cudaStreamDestroy(stream1));
         CUDAERR(cudaStreamDestroy(stream2));

         CUDAERR(cudaEventDestroy(event1));
         CUDAERR(cudaEventDestroy(event2));

         CUDAERR(cudaFree(B));
         CUDAERR(cudaFree(C));
         CUDAERR(cudaFree(L));
         break;

      case UpdateW:
         CUDAERR(cudaMalloc((void **)&B, m*k*sizeof(double)));
         CUDAERR(cudaMalloc((void **)&C, m*k*sizeof(double)));
         CUDAERR(cudaMalloc((void **)&L, k*k*sizeof(double)));

         CUDAERR(cudaStreamCreate(&stream1));
         CUDAERR(cudaStreamCreate(&stream2));

         CUBLASERR(cublasCreate(&handle1));
         CUBLASERR(cublasCreate(&handle2));

         CUDAERR(cudaEventCreateWithFlags(&event1, cudaEventDisableTiming));
         CUDAERR(cudaEventCreateWithFlags(&event2, cudaEventDisableTiming));

         CUBLASERR(cublasSetStream(handle1, stream1));
         CUBLASERR(cublasSetStream(handle2, stream2));

         /* Why streams? Why this arrangement of the streams? See case UpdateAll above */

         /* Stream1 recording event "event2" for stream2 (for the 1st time) */
         CUDAERR(cudaEventRecord(event2, stream1));

         for(i=0;i<nIter;i++)
         {
            /* ************************** Phase 2 **************************** */
            /* Stream2 waiting "event2" set by stream1 */
            CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

            /* B=A*H' */
            CUBLASERR(cublasDgemm(handle2, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, A, m, H, k, &beta, B, m));

            /* Stream2 recording event "event1" for stream1 */
            CUDAERR(cudaEventRecord(event1, stream2));

            /* L=H*H' */
            CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, k, k, n, &alpha, H, k, H, k, &beta, L, k));

            /* C=W*L */
            CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, k, k, &alpha, W, m, L, k, &beta, C, m));

            /* Stream1 waiting "event1" set by stream 2 */
            CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));

            /* W=W(.*)B(./)C */
            ddotdiv_cuda(m*k, B, C, W, stream1);

            /* Stream1 recording event "event2" for stream2 */
            CUDAERR(cudaEventRecord(event2, stream1));
         }
         /* maybe not needed, but kept for completeness ... */
         CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

         CUBLASERR(cublasDestroy(handle1));
         CUBLASERR(cublasDestroy(handle2));

         CUDAERR(cudaStreamDestroy(stream1));
         CUDAERR(cudaStreamDestroy(stream2));

         CUDAERR(cudaEventDestroy(event1));
         CUDAERR(cudaEventDestroy(event2));

         CUDAERR(cudaFree(B));
         CUDAERR(cudaFree(C));
         CUDAERR(cudaFree(L));
         break;

      case UpdateH:
         CUDAERR(cudaMalloc((void **)&B, n*k*sizeof(double)));
         CUDAERR(cudaMalloc((void **)&C, n*k*sizeof(double)));
         CUDAERR(cudaMalloc((void **)&L, k*k*sizeof(double)));

         CUDAERR(cudaStreamCreate(&stream1));
         CUDAERR(cudaStreamCreate(&stream2));

         CUBLASERR(cublasCreate(&handle1));
         CUBLASERR(cublasCreate(&handle2));

         CUDAERR(cudaEventCreateWithFlags(&event1, cudaEventDisableTiming));
         CUDAERR(cudaEventCreateWithFlags(&event2, cudaEventDisableTiming));

         CUBLASERR(cublasSetStream(handle1, stream1));
         CUBLASERR(cublasSetStream(handle2, stream2));

         /* Stream1 recording event "event2" for stream2 (for the 1st time) */
         CUDAERR(cudaEventRecord(event2, stream1));

         for(i=0;i<nIter;i++)
         {
            /* ************************** Phase 1 **************************** */
            /* Stream2 waiting "event2" set by stream1. 1st time is always ok */
            CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

            /* B=W'*A */
            CUBLASERR(cublasDgemm(handle2, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, A, m, &beta, B, k));

            /* Stream2 recording event "event1" for stream1 */
            CUDAERR(cudaEventRecord(event1, stream2));

            /* L=W'*W */
            CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, k, m, &alpha, W, m, W, m, &beta, L, k));

            /* C=L*H */
            CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, k, n, k, &alpha, L, k, H, k, &beta, C, k));

            /* Stream1 waiting "event1" set by stream2 */
            CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));

            /* H=H(.*)B(./)C */
            ddotdiv_cuda(k*n, B, C, H, stream1);

            /* Stream1 recording event "event2" for stream2 */
            CUDAERR(cudaEventRecord(event2, stream1));
         }
         /* maybe not needed, but kept for completeness ... */
         CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

         CUBLASERR(cublasDestroy(handle1));
         CUBLASERR(cublasDestroy(handle2));

         CUDAERR(cudaStreamDestroy(stream1));
         CUDAERR(cudaStreamDestroy(stream2));

         CUDAERR(cudaEventDestroy(event1));
         CUDAERR(cudaEventDestroy(event2));

         CUDAERR(cudaFree(B));
         CUDAERR(cudaFree(C));
         CUDAERR(cudaFree(L));
         break;

      default:
         return -1;
   }
   return 0;
}


/**
 * \fn int smlsa_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
 * \brief smlsa_cuda performs NNMF using the beta-divergence with beta=2, in single precision.
 * See the description of dmlsa_cuda for more information.
 * \param m: (input) Number of rows of matrix A and matrix W
 * \param n: (input) Number of columns of matrix A and matrix H
 * \param k: (input) Number of columns of matrix W and rows of H
 * \param A: (input) Single precision input matrix of (m * n) elements stored in 1D column-major order
 * \param W: (inout) Single precision input/output matrix of (m * k) elements stored in 1D column-major order
 * \param H: (inout) Single precision input/output matrix of (k * n) elements stored in 1D column-major order
 * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only W is updated) or UpdateH (only H is updated)
 * \param nIter: (input) Number of iterations
 *
 * It returns 0 on success, or -1 if uType is not a valid option.
*/
int smlsa_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
{
   float
      *B=NULL,
      *C=NULL,
      *L=NULL,
      alpha=1.0f,
      beta =0.0f;

   int
      i, devID;

   cublasHandle_t
      handle1,
      handle2;

   cudaStream_t
      stream1,
      stream2;

   cudaEvent_t
      event1,
      event2;

   CUDAERR(cudaGetDevice(&devID));

   switch (uType)
   {
      case UpdateAll:
         CUDAERR(cudaMalloc((void **)&B, max(m,n)*k*sizeof(float)));
         CUDAERR(cudaMalloc((void **)&C, max(m,n)*k*sizeof(float)));
         CUDAERR(cudaMalloc((void **)&L, k*k*sizeof(float)));

         CUDAERR(cudaStreamCreate(&stream1));
         CUDAERR(cudaStreamCreate(&stream2));

         CUBLASERR(cublasCreate(&handle1));
         CUBLASERR(cublasCreate(&handle2));

         CUDAERR(cudaEventCreateWithFlags(&event1, cudaEventDisableTiming));
         CUDAERR(cudaEventCreateWithFlags(&event2, cudaEventDisableTiming));

         CUBLASERR(cublasSetStream(handle1, stream1));
         CUBLASERR(cublasSetStream(handle2, stream2));

         /* Why streams? Why this approach of streams? See dmlsa_cuda */

         /* Stream1 recording event "event2" for stream2 (for the 1st time) */
         CUDAERR(cudaEventRecord(event2, stream1));

         for(i=0;i<nIter;i++)
         {
            /* ************************** Phase 1 **************************** */
            /* Stream2 waiting "event2" set by stream1. 1st time is always ok */
            CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

            /* B=W'*A */
            CUBLASERR(cublasSgemm(handle2, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, A, m, &beta, B, k));

            /* Stream2 recording event "event1" for stream1 */
            CUDAERR(cudaEventRecord(event1, stream2));

            /* L=W'*W */
            CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, k, m, &alpha, W, m, W, m, &beta, L, k));

            /* C=L*H */
            CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, k, n, k, &alpha, L, k, H, k, &beta, C, k));

            /* Stream1 waiting "event1" set by stream2 */
            CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));

            /* H=H(.*)B(./)C */
            sdotdiv_cuda(k*n, B, C, H, stream1);

            /* Stream1 recording event "event2" for stream2 */
            CUDAERR(cudaEventRecord(event2, stream1));


            /* ************************** Phase 2 **************************** */
            /* Stream2 waiting "event2" set by stream1 */
            CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

            /* B=A*H' */
            CUBLASERR(cublasSgemm(handle2, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, A, m, H, k, &beta, B, m));

            /* Stream2 recording event "event1" for stream1 */
            CUDAERR(cudaEventRecord(event1, stream2));

            /* L=H*H' */
            CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, k, k, n, &alpha, H, k, H, k, &beta, L, k));

            /* C=W*L */
            CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, k, k, &alpha, W, m, L, k, &beta, C, m));

            /* Stream1 waiting "event1" set by stream 2 */
            CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));

            /* W=W(.*)B(./)C */
            sdotdiv_cuda(m*k, B, C, W, stream1);

            /* Stream1 recording event "event2" for stream2 */
            CUDAERR(cudaEventRecord(event2, stream1));
         }
         /* maybe not needed, but kept for completeness ... */
         CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

         CUBLASERR(cublasDestroy(handle1));
         CUBLASERR(cublasDestroy(handle2));

         CUDAERR(cudaStreamDestroy(stream1));
         CUDAERR(cudaStreamDestroy(stream2));

         CUDAERR(cudaEventDestroy(event1));
         CUDAERR(cudaEventDestroy(event2));

         CUDAERR(cudaFree(B));
         CUDAERR(cudaFree(C));
         CUDAERR(cudaFree(L));
         break;

      case UpdateW:
         CUDAERR(cudaMalloc((void **)&B, m*k*sizeof(float)));
         CUDAERR(cudaMalloc((void **)&C, m*k*sizeof(float)));
         CUDAERR(cudaMalloc((void **)&L, k*k*sizeof(float)));

         CUDAERR(cudaStreamCreate(&stream1));
         CUDAERR(cudaStreamCreate(&stream2));

         CUBLASERR(cublasCreate(&handle1));
         CUBLASERR(cublasCreate(&handle2));

         CUDAERR(cudaEventCreateWithFlags(&event1, cudaEventDisableTiming));
         CUDAERR(cudaEventCreateWithFlags(&event2, cudaEventDisableTiming));

         CUBLASERR(cublasSetStream(handle1, stream1));
         CUBLASERR(cublasSetStream(handle2, stream2));

         /* Why streams? Why this approach of streams? See dmlsa_cuda */

         /* Stream1 recording event "event2" for stream2 (for the 1st time) */
         CUDAERR(cudaEventRecord(event2, stream1));

         for(i=0;i<nIter;i++)
         {
            /* ************************** Phase 2 **************************** */
            /* Stream2 waiting "event2" set by stream1 */
            CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

            /* B=A*H' */
            CUBLASERR(cublasSgemm(handle2, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, A, m, H, k, &beta, B, m));

            /* Stream2 recording event "event1" for stream1 */
            CUDAERR(cudaEventRecord(event1, stream2));

            /* L=H*H' */
            CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, k, k, n, &alpha, H, k, H, k, &beta, L, k));

            /* C=W*L */
            CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, k, k, &alpha, W, m, L, k, &beta, C, m));

            /* Stream1 waiting "event1" set by stream 2 */
            CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));

            /* W=W(.*)B(./)C */
            sdotdiv_cuda(m*k, B, C, W, stream1);

            /* Stream1 recording event "event2" for stream2 */
            CUDAERR(cudaEventRecord(event2, stream1));
         }
         /* maybe not needed, but kept for completeness ... */
         CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

         CUBLASERR(cublasDestroy(handle1));
         CUBLASERR(cublasDestroy(handle2));

         CUDAERR(cudaStreamDestroy(stream1));
         CUDAERR(cudaStreamDestroy(stream2));

         CUDAERR(cudaEventDestroy(event1));
         CUDAERR(cudaEventDestroy(event2));

         CUDAERR(cudaFree(B));
         CUDAERR(cudaFree(C));
         CUDAERR(cudaFree(L));
         break;

      case UpdateH:
         CUDAERR(cudaMalloc((void **)&B, n*k*sizeof(float)));
         CUDAERR(cudaMalloc((void **)&C, n*k*sizeof(float)));
         CUDAERR(cudaMalloc((void **)&L, k*k*sizeof(float)));

         CUDAERR(cudaStreamCreate(&stream1));
         CUDAERR(cudaStreamCreate(&stream2));

         CUBLASERR(cublasCreate(&handle1));
         CUBLASERR(cublasCreate(&handle2));

         CUDAERR(cudaEventCreateWithFlags(&event1, cudaEventDisableTiming));
         CUDAERR(cudaEventCreateWithFlags(&event2, cudaEventDisableTiming));

         CUBLASERR(cublasSetStream(handle1, stream1));
         CUBLASERR(cublasSetStream(handle2, stream2));

         /* Why streams? Why this approach of streams? See dmlsa_cuda */

         /* Stream1 recording event "event2" for stream2 (for the 1st time) */
         CUDAERR(cudaEventRecord(event2, stream1));

         for(i=0;i<nIter;i++)
         {
            /* ************************** Phase 1 **************************** */
            /* Stream2 waiting "event2" set by stream1. 1st time is always ok */
            CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

            /* B=W'*A */
            CUBLASERR(cublasSgemm(handle2, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, A, m, &beta, B, k));

            /* Stream2 recording event "event1" for stream1 */
            CUDAERR(cudaEventRecord(event1, stream2));

            /* L=W'*W */
            CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, k, m, &alpha, W, m, W, m, &beta, L, k));

            /* C=L*H */
            CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, k, n, k, &alpha, L, k, H, k, &beta, C, k));

            /* Stream1 waiting "event1" set by stream2 */
            CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));

            /* H=H(.*)B(./)C */
            sdotdiv_cuda(k*n, B, C, H, stream1);

            /* Stream1 recording event "event2" for stream2 */
            CUDAERR(cudaEventRecord(event2, stream1));
         }
         /* maybe not needed, but kept for completeness ... */
         CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

         CUBLASERR(cublasDestroy(handle1));
         CUBLASERR(cublasDestroy(handle2));

         CUDAERR(cudaStreamDestroy(stream1));
         CUDAERR(cudaStreamDestroy(stream2));

         CUDAERR(cudaEventDestroy(event1));
         CUDAERR(cudaEventDestroy(event2));

         CUDAERR(cudaFree(B));
         CUDAERR(cudaFree(C));
         CUDAERR(cudaFree(L));
         break;

      default:
         return -1;
   }
   return 0;
}


/**
 * \fn void ddotdiv_cuda(const int n, const double *x, const double *y, double *z, cudaStream_t stream)
 * \brief It calls the kernel vddotdiv_cuda, which performs z[i]=z[i]*x[i]/y[i]
 * \param n: (input) Number of elements of x, y and z
 * \param x: (input) Double precision input vector/matrix (1D column-major)
 * \param y: (input) Double precision input vector/matrix (1D column-major)
 * \param z: (inout) Double precision input/output vector/matrix (1D column-major)
 * \param stream: (input) ID of the stream to use
*/
void ddotdiv_cuda(const int n, const double *x, const double *y, double *z, cudaStream_t stream)
{
   dim3 dimGrid, dimBlock;

   #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
      dimBlock.x = 192;
   #else
      dimBlock.x = 256;
   #endif

   dimGrid.x = (n + dimBlock.x -1) / dimBlock.x;
   vddotdiv_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, x, y, z);
   #ifdef With_Check
      cudaDeviceSynchronize();
   #endif
}


/**
 * \fn void sdotdiv_cuda(const int n, const float *x, const float *y, float *z, cudaStream_t stream)
 * \brief It calls the kernel vsdotdiv_cuda, which performs z[i]=z[i]*x[i]/y[i]
 * \param n: (input) Number of elements of x, y and z
 * \param x: (input) Single precision input vector/matrix (1D column-major)
 * \param y: (input) Single precision input vector/matrix (1D column-major)
 * \param z: (inout) Single precision input/output vector/matrix (1D column-major)
 * \param stream: (input) ID of the stream to use
*/
void sdotdiv_cuda(const int n, const float *x, const float *y, float *z, cudaStream_t stream)
{
   dim3 dimGrid, dimBlock;

   #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
      dimBlock.x = 192;
   #else
      dimBlock.x = 256;
   #endif

   dimGrid.x = (n + dimBlock.x -1) / dimBlock.x;
   vsdotdiv_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, x, y, z);
   #ifdef With_Check
      cudaDeviceSynchronize();
   #endif
}


/**
 * \fn __global__ void vddotdiv_cuda(const int n, const double* __restrict__ x, const double* __restrict__ y, double *z)
 * \brief This kernel computes double precision z[i]=z[i]*x[i]/y[i]
 * \param n: (input) Number of elements of x, y and z
 * \param x: (input) Double precision input vector/matrix (1D column-major)
 * \param y: (input) Double precision input vector/matrix (1D column-major)
 * \param z: (inout) Double precision input/output vector/matrix (1D column-major)
*/
__global__ void vddotdiv_cuda(const int n, const double* __restrict__ x, const double* __restrict__ y, double *z)
{
   unsigned int pos = blockDim.x * blockIdx.x + threadIdx.x;

   if (pos < n)
   {
      #ifdef With_Check
         /* Here we can have NaN and Inf if y[pos] and/or x[pos] and/or z[pos]=0 */
         z[pos] = z[pos] * x[pos] / y[pos];
         assert(!fpe(z[pos]));
      #else
         z[pos] = z[pos] * x[pos] / y[pos];
      #endif
   }
}


/**
 * \fn __global__ void vsdotdiv_cuda(const int n, const float* __restrict__ x, const float* __restrict__ y, float *z)
 * \brief This kernel computes single precision z[i]=z[i]*x[i]/y[i]
 * \param n: (input) Number of elements of x, y and z
 * \param x: (input) Single precision input vector/matrix (1D column-major)
 * \param y: (input) Single precision input vector/matrix (1D column-major)
 * \param z: (inout) Single precision input/output vector/matrix (1D column-major)
*/
__global__ void vsdotdiv_cuda(const int n, const float* __restrict__ x, const float* __restrict__ y, float *z)
{
   unsigned int pos = blockDim.x * blockIdx.x + threadIdx.x;

   if (pos < n)
   {
      #ifdef With_Check
         /* Here we can have NaN and Inf if y[pos] and/or x[pos] and/or z[pos]=0 */
         z[pos] = z[pos] * x[pos] / y[pos];
         assert(!fpe(z[pos]));
      #else
         z[pos] = z[pos] * x[pos] / y[pos];
      #endif
   }
}
