NnmfPack 2.1
utils_cuda.cu
/***************************************************************************
 *   Copyright (C) 2014 by PIR (University of Oviedo) and                 *
 *   INCO2 (Polytechnic University of Valencia) groups.                   *
 *   nnmfpack@gmail.com                                                   *
 *                                                                        *
 *   This program is free software; you can redistribute it and/or modify *
 *   it under the terms of the GNU General Public License as published by *
 *   the Free Software Foundation; either version 2 of the License, or    *
 *   (at your option) any later version.                                  *
 *                                                                        *
 *   This program is distributed in the hope that it will be useful,      *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of       *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the         *
 *   GNU General Public License for more details.                         *
 *                                                                        *
 *   You should have received a copy of the GNU General Public License    *
 *   along with this program; if not, write to the                        *
 *   Free Software Foundation, Inc.,                                      *
 *   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.             *
 ***************************************************************************
*/
/**
 * \file utils_cuda.cu
 * \brief Some auxiliary functions, in double and single precision, for
 *        GPUs using CUDA.
 * \author Information Retrieval and Parallel Computing Group (IRPCG)
 * \author University of Oviedo, Spain
 * \author Interdisciplinary Computation and Communication Group (INCO2)
 * \author Universitat Politecnica de Valencia, Spain.
 * \author Contact: nnmfpack@gmail.com
 * \date 04/11/14
*/
#include "utils_cuda.h"


/* ************************************************************************************ */
/* *************************************** kernels ************************************ */
/* ************************************************************************************ */
/**
 * \fn __global__ void vdmemset_cuda(const int n, double *x, const double val)
 * \brief This kernel fills all positions of x with the double precision value "val"
 * \param n:   (input) Number of elements of x
 * \param x:   (output) Double precision output matrix (1D column-major) or vector
 * \param val: (input) Double precision value
*/
__global__ void vdmemset_cuda(const int n, double *x, const double val)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    /* To avoid the divergent branch below, a grid-stride loop is an alternative: */
    /*    unsigned int pos    = blockDim.x * blockIdx.x + threadIdx.x; */
    /*    unsigned int stride = blockDim.x * gridDim.x;                */
    /*    for(; pos < n; pos += stride)                                */
    /*       x[pos] = val;                                             */

    /* Although the following check is technically a branch divergence (not all */
    /* threads within a warp evaluate the condition identically), it is         */
    /* completely harmless, so we use it instead of the alternative above.      */
    if (pos < n)
        x[pos] = val;
}


/**
 * \fn __global__ void vsmemset_cuda(const int n, float *x, const float val)
 * \brief This kernel fills all positions of x with the single precision value "val"
 * \param n:   (input) Number of elements of x
 * \param x:   (output) Single precision output matrix (1D column-major) or vector
 * \param val: (input) Single precision value
*/
__global__ void vsmemset_cuda(const int n, float *x, const float val)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    if (pos < n)
        x[pos] = val;
}


/**
 * \fn __global__ void vddiv_cuda(const int n, const double* __restrict__ x, double *y)
 * \brief This kernel computes double precision y[i]=x[i]/y[i]
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Double precision input vector/matrix (1D column-major)
 * \param y: (inout) Double precision input/output vector/matrix (1D column-major)
*/
__global__ void vddiv_cuda(const int n, const double* __restrict__ x, double *y)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    /* braces are needed: without them, when With_Check is defined the assert */
    /* would run unguarded for out-of-range threads                           */
    if (pos < n)
    {
        #ifdef With_Check
            /* Here we can have NaN and Inf if y[pos] and/or x[pos] = 0 */
            y[pos] = x[pos] / y[pos];
            assert(!fpe(y[pos]));
        #else
            y[pos] = x[pos] / y[pos];
        #endif
    }
}


/**
 * \fn __global__ void vsdiv_cuda(const int n, const float* __restrict__ x, float *y)
 * \brief This kernel performs single precision y[i]=x[i]/y[i]
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Single precision input vector/matrix (1D column-major)
 * \param y: (inout) Single precision input/output vector/matrix (1D column-major)
*/
__global__ void vsdiv_cuda(const int n, const float* __restrict__ x, float *y)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    if (pos < n)
    {
        #ifdef With_Check
            /* Here we can have NaN and Inf if y[pos] and/or x[pos] = 0 */
            y[pos] = x[pos] / y[pos];
            assert(!fpe(y[pos]));
        #else
            y[pos] = x[pos] / y[pos];
        #endif
    }
}

/**
 * \fn __global__ void vdsub_cuda(const int n, const double* __restrict__ x, double *y)
 * \brief This kernel performs double precision y[i]=x[i]-y[i]
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Double precision input vector/matrix
 * \param y: (inout) Double precision input/output vector/matrix
*/
__global__ void vdsub_cuda(const int n, const double* __restrict__ x, double *y)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    if (pos < n)
        /* Testing whether x[pos] or y[pos] equals 0.0 gives no improvement, so we do not do it */
        y[pos] = x[pos] - y[pos];
}


/**
 * \fn __global__ void vssub_cuda(const int n, const float* __restrict__ x, float *y)
 * \brief This kernel performs single precision y[i]=x[i]-y[i]
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Single precision input vector/matrix
 * \param y: (inout) Single precision input/output vector/matrix
*/
__global__ void vssub_cuda(const int n, const float* __restrict__ x, float *y)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    if (pos < n)
        /* Testing whether x[pos] or y[pos] equals 0.0 gives no improvement, so we do not do it */
        y[pos] = x[pos] - y[pos];
}


/**
 * \fn __global__ void vderrorbd0_cuda(const int n, const double* __restrict__ x, double *y)
 * \brief This kernel performs auxiliary double precision operations when the error is
 *        computed using the beta-divergence error formula with beta = 0
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Double precision input vector/matrix
 * \param y: (inout) Double precision input/output vector/matrix
*/
__global__ void vderrorbd0_cuda(const int n, const double* __restrict__ x, double *y)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    double dtmp1, dtmp2;

    if (pos < n)
    {
        #ifdef With_Check
            /* Here we can have NaN and Inf if y[pos] and/or x[pos] = 0 */
            dtmp1=x[pos] / y[pos];
            assert(!fpe(dtmp1));

            dtmp2=log(dtmp1);
            assert(!fpe(dtmp2));

            y[pos]=dtmp1 - dtmp2 - 1.0;
        #else
            dtmp1=x[pos] / y[pos];
            dtmp2=log(dtmp1);

            y[pos]=dtmp1 - dtmp2 - 1.0;
        #endif
    }
}


/**
 * \fn __global__ void vserrorbd0_cuda(const int n, const float* __restrict__ x, float *y)
 * \brief This kernel performs auxiliary single precision operations when the error is
 *        computed using the beta-divergence error formula with beta = 0
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Single precision input vector/matrix
 * \param y: (inout) Single precision input/output vector/matrix
*/
__global__ void vserrorbd0_cuda(const int n, const float* __restrict__ x, float *y)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    float ftmp1, ftmp2;

    if (pos < n)
    {
        #ifdef With_Check
            /* Here we can have NaN and Inf if y[pos] and/or x[pos] = 0 */
            ftmp1=x[pos] / y[pos];
            assert(!fpe(ftmp1));

            ftmp2=logf(ftmp1);
            assert(!fpe(ftmp2));

            y[pos]=ftmp1 - ftmp2 - 1.0f;
        #else
            ftmp1=x[pos] / y[pos];
            ftmp2=logf(ftmp1);

            y[pos]=ftmp1 - ftmp2 - 1.0f;
        #endif
    }
}
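

/* For reference: with beta = 0 the beta-divergence reduces to the Itakura-Saito
   divergence, so the two kernels above compute, element-wise,

       d_IS(x, y) = x/y - log(x/y) - 1,

   where y holds the current approximation W*H. The wrappers derrorbd_cuda and
   serrorbd_cuda below accumulate these values with cublasDasum/cublasSasum. */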


/**
 * \fn __global__ void vderrorbd1_cuda(const int n, const double* __restrict__ x, double *y)
 * \brief This kernel performs auxiliary double precision operations when the error is
 *        computed using the beta-divergence error formula with beta = 1
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Double precision input vector/matrix
 * \param y: (inout) Double precision input/output vector/matrix
*/
__global__ void vderrorbd1_cuda(const int n, const double* __restrict__ x, double *y)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    double dtmp1, dtmp2, dtmp3;

    if (pos < n)
    {
        #ifdef With_Check
            /* Here we can have NaN and Inf if y[pos] and/or x[pos] = 0 */
            dtmp1=x[pos];
            dtmp2=y[pos];
            dtmp3=log(dtmp1 / dtmp2);
            assert(!fpe(dtmp3));

            y[pos]=dtmp1 * dtmp3 + dtmp2 - dtmp1;
        #else
            dtmp1=x[pos];
            dtmp2=y[pos];
            dtmp3=log(dtmp1 / dtmp2);

            y[pos]=dtmp1 * dtmp3 + dtmp2 - dtmp1;
        #endif
    }
}


/**
 * \fn __global__ void vserrorbd1_cuda(const int n, const float* __restrict__ x, float *y)
 * \brief This kernel performs auxiliary single precision operations when the error is
 *        computed using the beta-divergence error formula with beta = 1
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Single precision input vector/matrix
 * \param y: (inout) Single precision input/output vector/matrix
*/
__global__ void vserrorbd1_cuda(const int n, const float* __restrict__ x, float *y)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    float ftmp1, ftmp2, ftmp3;

    if (pos < n)
    {
        #ifdef With_Check
            /* Here we can have NaN and Inf if y[pos] and/or x[pos] = 0 */
            ftmp1=x[pos];
            ftmp2=y[pos];
            ftmp3=logf(ftmp1 / ftmp2);
            assert(!fpe(ftmp3));

            y[pos]=ftmp1 * ftmp3 + ftmp2 - ftmp1;
        #else
            ftmp1=x[pos];
            ftmp2=y[pos];
            ftmp3=logf(ftmp1 / ftmp2);

            y[pos]=ftmp1 * ftmp3 + ftmp2 - ftmp1;
        #endif
    }
}
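

/* For reference: with beta = 1 the beta-divergence reduces to the (generalized)
   Kullback-Leibler divergence, so the two kernels above compute, element-wise,

       d_KL(x, y) = x*log(x/y) + y - x. */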


/**
 * \fn __global__ void vderrorbdg_cuda(const int n, const double* __restrict__ x, double *y, const double beta)
 * \brief This kernel performs auxiliary double precision operations when the error is
 *        computed using the beta-divergence error formula with (beta != 0) and (beta != 1)
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Double precision input vector/matrix
 * \param y: (inout) Double precision input/output vector/matrix
 * \param beta: (input) Double precision value of beta
*/
__global__ void vderrorbdg_cuda(const int n, const double* __restrict__ x, double *y, const double beta)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    double dbeta, dtmp1, dtmp2, dtmp3;

    if (pos < n)
    {
        dbeta=beta-1.0;
        dtmp1=x[pos];
        dtmp2=y[pos];
        dtmp3=beta*dtmp1*pow(dtmp2, dbeta);

        #ifdef With_Check
            y[pos]=(pow(dtmp1, beta) + dbeta*pow(dtmp2, beta) - dtmp3) / (beta * dbeta);
            assert(!fpe(y[pos]));
        #else
            y[pos]=(pow(dtmp1, beta) + dbeta*pow(dtmp2, beta) - dtmp3) / (beta * dbeta);
        #endif
    }
}


/**
 * \fn __global__ void vserrorbdg_cuda(const int n, const float* __restrict__ x, float *y, const float beta)
 * \brief This kernel performs auxiliary single precision operations when the error is
 *        computed using the beta-divergence error formula with (beta != 0) and (beta != 1)
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Single precision input vector/matrix
 * \param y: (inout) Single precision input/output vector/matrix
 * \param beta: (input) Single precision value of beta
*/
__global__ void vserrorbdg_cuda(const int n, const float* __restrict__ x, float *y, const float beta)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    float fbeta, ftmp1, ftmp2, ftmp3;

    if (pos < n)
    {
        fbeta=beta-1.0f;
        ftmp1=x[pos];
        ftmp2=y[pos];
        ftmp3=beta*ftmp1*powf(ftmp2, fbeta);

        #ifdef With_Check
            y[pos]=(powf(ftmp1, beta) + fbeta*powf(ftmp2, beta) - ftmp3) / (beta * fbeta);
            assert(!fpe(y[pos]));
        #else
            y[pos]=(powf(ftmp1, beta) + fbeta*powf(ftmp2, beta) - ftmp3) / (beta * fbeta);
        #endif
    }
}
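

/* For reference: for beta != 0 and beta != 1 the two kernels above compute the
   general beta-divergence, element-wise,

       d_beta(x, y) = ( x^beta + (beta-1)*y^beta - beta*x*y^(beta-1) ) / ( beta*(beta-1) ),

   which tends to d_KL(x, y) as beta -> 1 and to d_IS(x, y) as beta -> 0. */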


/* ************************************************************************************ */
/* *********************** wrappers for the kernels and others ************************ */
/* ************************************************************************************ */

/**
 * \fn void dmemset_cuda(const int n, double *x, const double val, cudaStream_t stream)
 * \brief It calls the kernel vdmemset_cuda, which fills all positions of the vector/matrix x
 *        with the double precision value "val"
 * \param n:      (input) Number of elements of x
 * \param x:      (output) Double precision output vector/matrix (1D column-major)
 * \param val:    (input) Double precision value
 * \param stream: (input) ID of the stream to use
*/
void dmemset_cuda(const int n, double *x, const double val, cudaStream_t stream)
{
    dim3 dimGrid, dimBlock;

    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        dimBlock.x = 192;
    #else
        dimBlock.x = 256;
    #endif

    dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
    vdmemset_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, x, val);
    #ifdef With_Check
        cudaDeviceSynchronize();
    #endif
}
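

/* A minimal host-side usage sketch for dmemset_cuda (illustrative only; the
   buffer d_x, its size and the use of the default stream are assumptions of
   this example, not part of the library):

       double *d_x = NULL;
       int     n   = 1024;

       cudaMalloc((void **)&d_x, n * sizeof(double));
       dmemset_cuda(n, d_x, 0.0, 0);   // fill d_x with zeros on the default stream
       cudaDeviceSynchronize();        // the kernel launch is asynchronous
       cudaFree(d_x);
*/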


/**
 * \fn void smemset_cuda(const int n, float *x, const float val, cudaStream_t stream)
 * \brief It calls the kernel vsmemset_cuda, which fills all positions of the vector/matrix x
 *        with the single precision value "val"
 * \param n:      (input) Number of elements of x
 * \param x:      (output) Single precision output vector/matrix (1D column-major)
 * \param val:    (input) Single precision value
 * \param stream: (input) ID of the stream to use
*/
void smemset_cuda(const int n, float *x, const float val, cudaStream_t stream)
{
    dim3 dimGrid, dimBlock;

    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        dimBlock.x = 192;
    #else
        dimBlock.x = 256;
    #endif

    dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
    vsmemset_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, x, val);
    #ifdef With_Check
        cudaDeviceSynchronize();
    #endif
}


/**
 * \fn void ddiv_cuda(const int n, const double *x, double *y, cudaStream_t stream)
 * \brief It calls the kernel vddiv_cuda, which computes y[i]=x[i]/y[i]
 * \param n:      (input) Number of elements of x and y
 * \param x:      (input) Double precision input vector/matrix (1D column-major)
 * \param y:      (inout) Double precision input/output vector/matrix (1D column-major)
 * \param stream: (input) ID of the stream to use
*/
void ddiv_cuda(const int n, const double *x, double *y, cudaStream_t stream)
{
    dim3 dimGrid, dimBlock;

    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        dimBlock.x = 192;
    #else
        dimBlock.x = 256;
    #endif

    dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
    vddiv_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, x, y);
    #ifdef With_Check
        cudaDeviceSynchronize();
    #endif
}


/**
 * \fn void sdiv_cuda(const int n, const float *x, float *y, cudaStream_t stream)
 * \brief It calls the kernel vsdiv_cuda, which computes y[i]=x[i]/y[i]
 * \param n:      (input) Number of elements of x and y
 * \param x:      (input) Single precision input vector/matrix (1D column-major)
 * \param y:      (inout) Single precision input/output vector/matrix (1D column-major)
 * \param stream: (input) ID of the stream to use
*/
void sdiv_cuda(const int n, const float *x, float *y, cudaStream_t stream)
{
    dim3 dimGrid, dimBlock;

    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        dimBlock.x = 192;
    #else
        dimBlock.x = 256;
    #endif

    dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
    vsdiv_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, x, y);
    #ifdef With_Check
        cudaDeviceSynchronize();
    #endif
}


/**
 * \fn void dsub_cuda(const int n, const double *x, double *y)
 * \brief This wrapper calls the kernel vdsub_cuda, which computes y[i]=x[i]-y[i]
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Double precision input vector/matrix
 * \param y: (inout) Double precision input/output vector/matrix
*/
void dsub_cuda(const int n, const double *x, double *y)
{
    dim3 dimGrid, dimBlock;

    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        dimBlock.x = 192;
    #else
        dimBlock.x = 256;
    #endif

    dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
    vdsub_cuda<<<dimGrid, dimBlock>>>(n, x, y);
    #ifdef With_Check
        cudaDeviceSynchronize();
    #endif
}


/**
 * \fn void ssub_cuda(const int n, const float *x, float *y)
 * \brief This wrapper calls the kernel vssub_cuda, which computes y[i]=x[i]-y[i]
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Single precision input vector/matrix
 * \param y: (inout) Single precision input/output vector/matrix
*/
void ssub_cuda(const int n, const float *x, float *y)
{
    dim3 dimGrid, dimBlock;

    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        dimBlock.x = 192;
    #else
        dimBlock.x = 256;
    #endif

    dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
    vssub_cuda<<<dimGrid, dimBlock>>>(n, x, y);
    #ifdef With_Check
        cudaDeviceSynchronize();
    #endif
}


/**
 * \fn double derror_cuda(const int m, const int n, const int k, const double *A, const double *W, const double *H)
 * \brief derror_cuda returns the double precision error "2-norm(A - W*H) / sqrt(m x n)"
 * \param m: (input) Number of rows of matrix A and number of rows of W
 * \param n: (input) Number of columns of matrix A and number of columns of H
 * \param k: (input) Number of columns of matrix W and number of rows of H
 * \param A: (input) Double precision matrix, dimension (m x n), 1D layout column major
 * \param W: (input) Double precision matrix, dimension (m x k), 1D layout column major
 * \param H: (input) Double precision matrix, dimension (k x n), 1D layout column major
*/
double derror_cuda(const int m, const int n, const int k, const double *A, const double *W, const double *H)
{
    double
        error=0.0,
        alpha=1.0,
        beta =0.0,
        *tmp =NULL;

    int
        devID;

    cublasHandle_t
        handle;

    cudaGetDevice(&devID);
    cublasCreate(&handle);

    cudaMalloc((void **)&tmp, m * n * sizeof(double));

    /* tmp = W*H */
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, tmp, m);

    /* tmp = A - tmp */
    dsub_cuda(m*n, A, tmp);

    cublasDnrm2(handle, m*n, tmp, 1, &error);

    cublasDestroy(handle);

    cudaFree(tmp);

    return error/sqrt(m*n);
}
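

/* Note: cublasDnrm2 over the m*n entries of the column-major buffer tmp is the
   Euclidean norm of vec(A - W*H), i.e. the Frobenius norm ||A - W*H||_F, so the
   value returned above is ||A - W*H||_F / sqrt(m*n). */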


/**
 * \fn float serror_cuda(const int m, const int n, const int k, const float *A, const float *W, const float *H)
 * \brief serror_cuda returns the single precision error "2-norm(A - W*H) / sqrt(m x n)"
 * \param m: (input) Number of rows of matrix A and number of rows of W
 * \param n: (input) Number of columns of matrix A and number of columns of H
 * \param k: (input) Number of columns of matrix W and number of rows of H
 * \param A: (input) Single precision matrix, dimension (m x n), 1D layout column major
 * \param W: (input) Single precision matrix, dimension (m x k), 1D layout column major
 * \param H: (input) Single precision matrix, dimension (k x n), 1D layout column major
*/
float serror_cuda(const int m, const int n, const int k, const float *A, const float *W, const float *H)
{
    float
        error=0.0,
        alpha=1.0,
        beta =0.0,
        *tmp =NULL;

    int
        devID;

    cublasHandle_t
        handle;

    cudaGetDevice(&devID);
    cublasCreate(&handle);

    cudaMalloc((void **)&tmp, m * n * sizeof(float));

    /* tmp = W*H */
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, tmp, m);

    /* tmp = A - tmp */
    ssub_cuda(m*n, A, tmp);

    cublasSnrm2(handle, m*n, tmp, 1, &error);

    cublasDestroy(handle);

    cudaFree(tmp);

    return error/sqrtf(m*n);
}


/**
 * \fn double derrorbd_cuda(const int m, const int n, const int k, const double *A, const double *W, const double *H, const double betadiv)
 * \brief This function returns the double precision error when the error is computed using
 *        the beta-divergence error formula
 * \param m: (input) Number of rows of A and W
 * \param n: (input) Number of columns of A and H
 * \param k: (input) Number of columns/rows of W/H
 * \param A: (input) Double precision input matrix A
 * \param W: (input) Double precision input matrix W
 * \param H: (input) Double precision input matrix H
 * \param betadiv: (input) beta value
*/
double derrorbd_cuda(const int m, const int n, const int k, const double *A, const double *W, const double *H, const double betadiv)
{
    dim3
        dimGrid,
        dimBlock;

    double
        error=0.0,
        alpha=1.0,
        beta =0.0,
        *dtmp=NULL;

    int
        devID;

    cublasHandle_t
        handle;

    dimBlock.x = 256;

    cudaGetDevice(&devID);
    cublasCreate(&handle);

    cudaMalloc((void **)&dtmp, m*n*sizeof(double));

    /* dtmp = W*H */
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, dtmp, m);

    /* case betadiv == 0.0 */
    if (betadiv>=0.0 && betadiv<=0.0)
    {
        dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
        vderrorbd0_cuda<<<dimGrid, dimBlock, 0, 0>>>(m*n, A, dtmp);
    }
    else
    {
        #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
            dimBlock.x = 224;
        #endif
        dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;

        /* case betadiv == 1.0 */
        if (betadiv>=1.0 && betadiv<=1.0)
            vderrorbd1_cuda<<<dimGrid, dimBlock, 0, 0>>>(m*n, A, dtmp);
        else
            vderrorbdg_cuda<<<dimGrid, dimBlock, 0, 0>>>(m*n, A, dtmp, betadiv);
    }
    #ifdef With_Check
        cudaDeviceSynchronize();
    #endif

    /* all dtmp elements are >= 0, so cublasDasum yields their sum */
    cublasDasum(handle, m*n, dtmp, 1, &error);

    error=sqrt((2.0*error)/((double)m*n));

    cublasDestroy(handle);

    cudaFree(dtmp);

    return error;
}
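

/* Note: the value returned above is sqrt( (2/(m*n)) * sum_ij d_beta(A_ij, (W*H)_ij) ),
   with d_beta as defined for the kernels in the previous section. */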


/**
 * \fn float serrorbd_cuda(const int m, const int n, const int k, const float *A, const float *W, const float *H, const float betadiv)
 * \brief This function returns the single precision error when the error is computed using
 *        the beta-divergence error formula
 * \param m: (input) Number of rows of A and W
 * \param n: (input) Number of columns of A and H
 * \param k: (input) Number of columns/rows of W/H
 * \param A: (input) Single precision input matrix A
 * \param W: (input) Single precision input matrix W
 * \param H: (input) Single precision input matrix H
 * \param betadiv: (input) beta value
*/
float serrorbd_cuda(const int m, const int n, const int k, const float *A, const float *W, const float *H, const float betadiv)
{
    dim3
        dimGrid,
        dimBlock;

    float
        alpha=1.0f,
        beta =0.0f,
        error=0.0f,
        *ftmp=NULL;

    int
        devID;

    cublasHandle_t
        handle;

    dimBlock.x = 256;
    dimGrid.x  = (m*n + dimBlock.x -1) / dimBlock.x;

    cudaGetDevice(&devID);
    cublasCreate(&handle);

    cudaMalloc((void **)&ftmp, m*n*sizeof(float));

    /* ftmp = W*H */
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, ftmp, m);

    /* case betadiv == 0.0 */
    if (betadiv>=0.0 && betadiv<=0.0)
        vserrorbd0_cuda<<<dimGrid, dimBlock, 0, 0>>>(m*n, A, ftmp);
    else
    {
        /* case betadiv == 1.0 */
        if (betadiv>=1.0 && betadiv<=1.0)
            vserrorbd1_cuda<<<dimGrid, dimBlock, 0, 0>>>(m*n, A, ftmp);
        else
            vserrorbdg_cuda<<<dimGrid, dimBlock, 0, 0>>>(m*n, A, ftmp, betadiv);
    }
    #ifdef With_Check
        cudaDeviceSynchronize();
    #endif

    /* all ftmp elements are >= 0, so cublasSasum yields their sum */
    cublasSasum(handle, m*n, ftmp, 1, &error);

    error=sqrtf((2.0f*error)/((float)m*n));

    cublasDestroy(handle);

    cudaFree(ftmp);

    return error;
}


/**
 * \fn void dlarngenn_cuda(const int m, const int n, const int seed, double *M)
 * \brief dlarngenn_cuda returns an (m x n) random double precision matrix.
 *        A uniform (0, 1) distribution is used to generate the values
 * \param m: (input) Number of rows of matrix M
 * \param n: (input) Number of columns of matrix M
 * \param seed: (input) Initial seed for the random numbers
 * \param M: (output) Double precision matrix, dimension (m x n), 1D column major
*/
void dlarngenn_cuda(const int m, const int n, const int seed, double *M)
{
    curandGenerator_t gen;

    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(gen, seed);
    curandGenerateUniformDouble(gen, M, m*n);
    curandDestroyGenerator(gen);
}


/**
 * \fn void slarngenn_cuda(const int m, const int n, const int seed, float *M)
 * \brief slarngenn_cuda returns an (m x n) random single precision matrix.
 *        A uniform (0, 1) distribution is used to generate the values
 * \param m: (input) Number of rows of matrix M
 * \param n: (input) Number of columns of matrix M
 * \param seed: (input) Initial seed for the random numbers
 * \param M: (output) Single precision matrix, dimension (m x n), 1D column major
*/
void slarngenn_cuda(const int m, const int n, const int seed, float *M)
{
    curandGenerator_t gen;

    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(gen, seed);
    curandGenerateUniform(gen, M, m * n);
    curandDestroyGenerator(gen);
}
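

/* A minimal end-to-end usage sketch (illustrative only; the sizes, the seeds and
   the buffer names d_A, d_W and d_H are assumptions of this example):

       int m = 1000, n = 800, k = 32;
       double *d_A, *d_W, *d_H;

       cudaMalloc((void **)&d_A, m * n * sizeof(double));
       cudaMalloc((void **)&d_W, m * k * sizeof(double));
       cudaMalloc((void **)&d_H, k * n * sizeof(double));

       dlarngenn_cuda(m, n, 1, d_A);   // random uniform (0, 1) "data" matrix
       dlarngenn_cuda(m, k, 2, d_W);   // random nonnegative initial factors
       dlarngenn_cuda(k, n, 3, d_H);

       double err = derror_cuda(m, n, k, d_A, d_W, d_H);   // ||A - W*H||_F / sqrt(m*n)

       cudaFree(d_A); cudaFree(d_W); cudaFree(d_H);
*/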