1 /***************************************************************************
2 * Copyright (C) 2014 by PIR (University of Oviedo) and *
3 * INCO2 (Polytechnic University of Valencia) groups. *
6 * This program is free software; you can redistribute it and/or modify *
7 * it under the terms of the GNU General Public License as published by *
8 * the Free Software Foundation; either version 2 of the License, or *
9 * (at your option) any later version. *
11 * This program is distributed in the hope that it will be useful, *
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14 * GNU General Public License for more details. *
16 * You should have received a copy of the GNU General Public License *
17 * along with this program; if not, write to the *
18 * Free Software Foundation, Inc., *
19 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
20 ***************************************************************************
24 * \brief Some auxiliar functions. Double and simple precision for
26 * \author Information Retrieval and Parallel Computing Group (IRPCG)
27 * \author University of Oviedo, Spain
28 * \author Interdisciplinary Computation and Communication Group (INCO2)
29 * \author Universitat Politecnica de Valencia, Spain.
30 * \author Contact: nnmfpack@gmail.com
33 #include "utils_cuda.h"
36 /* ************************************************************************************ */
37 /* *************************************** kernels ************************************ */
38 /* ************************************************************************************ */
40 * \fn __global__ void vdmemset_cuda(const int n, double *x, const double val)
41 * \brief This kernel fills all positions of x with double precision "val"
42 * \param n: (input) Number of elements of x
43 * \param x: (output) Double precision output matrix (1D column-major) or vector
44 * \param val: (input) Double precision value
46 __global__ void vdmemset_cuda(const int n, double *x, const double val)
48 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
50 /* Avoid divergence branch? This is an alternative. */
51 /* unsigned int pos = blockDim.x * blockIdx.x + threadIdx.x; */
52 /* unsigned int stride = blockDim.x * gridDim.x; */
53 /* unsigned int stride = blockDim.x * gridDim.x; */
54 /* for(; pos < n; pos += stride) */
57 /* While next is technically called a "divergence", not all */
58 /* threads within a warp evaluate the condition identically, */
59 /* it is completely harmless. We use this instead the other */
67 * \fn __global__ void vsmemset_cuda(const int n, float *x, const float val)
68 * \brief This kernel fills all positions of x with simple precision "val"
69 * \param n: (input) Number of elements of x
70 * \param x: (output) Simple precision output matrix (1D column-major) or vector
71 * \param val: (input) Simple precision value
73 __global__ void vsmemset_cuda(const int n, float *x, const float val)
75 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
84 * \fn __global__ void vddiv_cuda(const int n, const double* __restrict__ x, double *y)
85 * \brief This kernel computes double precision y[i]=x[i]/y[i]
86 * \param n: (input) Number of elements of x and y
87 * \param x: (input) Double precision input vector/matrix (1D column-major)
88 * \param y: (inout) Double precision input/output vector/matrix (1D column-major)
90 __global__ void vddiv_cuda(const int n, const double* __restrict__ x, double *y)
92 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
96 y[pos] = x[pos] / y[pos];
99 y[pos] = x[pos] / y[pos];
105 * \fn __global__ void vsdiv_cuda(const int n, const float* __restrict__ x, float *y)
106 * \brief This kernel performs simple precision y[i]=x[i]/y[i]
107 * \param n: (input) Number of elements of x and y
108 * \param x: (input) Simple precision input vector/matrix (1D column-major)
109 * \param y: (inout) Simple precision input/output vector/matrix (1D column-major)
111 __global__ void vsdiv_cuda(const int n, const float* __restrict__ x, float *y)
113 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
117 /* Here we can have NaN and Inf if y(pos) and/or x(pos)=0 */
118 y[pos] = x[pos] / y[pos];
119 assert(!fpe(y[pos]));
121 y[pos] = x[pos] / y[pos];
126 * \fn __global__ void vdsub_cuda(const int n, const double* __restrict__ x, double *y)
127 * \brief This kernel performs double precision y[i]=x[i]-y[i]
128 * \param n: (input) Number of elements of x and y
129 * \param x: (input) Double precision input vector/matrix
130 * \param y: (inout) Double precision input/output vector/matrix
132 __global__ void vdsub_cuda(const int n, const double* __restrict__ x, double *y)
134 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
137 /* ask for x[pos] or y[pos] = 0.0 don't give improvements. We don't do it */
138 y[pos] = x[pos] - y[pos];
143 * \fn __global__ void vssub_cuda(const int n, const float* __restrict__ x, float *y)
144 * \brief This kernel performs simple precision y[i]=x[i]-y[i]
145 * \param n: (input) Number of elements of x and y
146 * \param x: (input) Simple precision input vector/matrix
147 * \param y: (inout) Simple precision input/output vectir/matrix
149 __global__ void vssub_cuda(const int n, const float* __restrict__ x, float *y)
151 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
154 /* ask for x[pos] or y[pos] = 0.0 don't give improvements. We don't do it */
155 y[pos] = x[pos] - y[pos];
160 * \fn __global__ void vderrorbd0_cuda(const int n, const double* __restrict__ x, double *y)
161 * \brief This kernel performs auxiliar double precision operations when error is computed
162 using betadivergence error formula and beta = 0
163 * \param n: (input) Number of elements of x and y
164 * \param x: (input) Double precision input vector/matrix
165 * \param y: (inout) Double precision input/output vector/matrix
167 __global__ void vderrorbd0_cuda(const int n, const double* __restrict__ x, double *y)
169 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
176 /* Here we can have NaN and Inf if y(pos) and/or x(pos)=0 */
177 dtmp1=x[pos] / y[pos];
183 y[pos]=dtmp1 - dtmp2 - 1.0;
185 dtmp1=x[pos] / y[pos];
188 y[pos]=dtmp1 - dtmp2 - 1.0;
195 * \fn __global__ void vserrorbd0_cuda(const int n, const float* __restrict__ x, float *y)
196 * \brief This kernel performs auxiliar simple precision operations when error is computed
197 using betadivergence error formula and beta = 0
198 * \param n: (input) Number of elements of x and y
199 * \param x: (input) Simple precision input vector/matrix
200 * \param y: (inout) Simple precision input/output vector/matrix
202 __global__ void vserrorbd0_cuda(const int n, const float* __restrict__ x, float *y)
204 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
211 /* Here we can have NaN and Inf if y(pos) and/or x(pos)=0 */
212 ftmp1=x[pos] / y[pos];
218 y[pos]=ftmp1 - ftmp2 - 1.0f;
220 ftmp1=x[pos] / y[pos];
223 y[pos]=ftmp1 - ftmp2 - 1.0f;
230 * \fn __global__ void vderrorbd1_cuda(const int n, const double* __restrict__ x, double *y)
231 * \brief This kernel performs auxiliar double precision operations when error is computed
232 using betadivergence error formula and beta = 1
233 * \param n: (input) Number of elements of x and y
234 * \param x: (input) Double precision input vector/matrix
235 * \param y: (inout) Double precision input/output vector/matrix
237 __global__ void vderrorbd1_cuda(const int n, const double* __restrict__ x, double *y)
239 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
241 double dtmp1, dtmp2, dtmp3;
246 /* Here we can have NaN and Inf if y(pos) and/or x(pos)=0 */
249 dtmp3=log(dtmp1 / dtmp2);
252 y[pos]=dtmp1 * dtmp3 + dtmp2 - dtmp1;
256 dtmp3=log(dtmp1 / dtmp2);
258 y[pos]=dtmp1 * dtmp3 + dtmp2 - dtmp1;
265 * \fn __global__ void vserrorbd1_cuda(const int n, const float* __restrict__ x, float *y)
266 * \brief This kernel performs auxiliar simple precision operations when error is computed
267 using betadivergence error formula and beta = 1
268 * \param n: (input) Number of elements of x and y
269 * \param x: (input) Simple precision input vector/matrix
270 * \param y: (inout) Simple precision input/output vector/matrix
272 __global__ void vserrorbd1_cuda(const int n, const float* __restrict__ x, float *y)
274 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
276 float ftmp1, ftmp2, ftmp3;
281 /* Here we can have NaN and Inf if y(pos) and/or x(pos)=0 */
284 ftmp3=logf(ftmp1 / ftmp2);
287 y[pos]=ftmp1 * ftmp3 + ftmp2 - ftmp1;
291 ftmp3=logf(ftmp1 / ftmp2);
293 y[pos]=ftmp1 * ftmp3 + ftmp2 - ftmp1;
300 * \fn __global__ void vderrorbdg_cuda(const int n, const double* __restrict__ x, double *y, const double beta)
301 * \brief This kernel performs auxiliar double precision operations when error is computed
302 using betadivergence error formula with (beta != 0) and (beta != 1)
303 * \param n: (input) Number of elements of x and y
304 * \param x: (input) Double precision input vector/matrix
305 * \param y: (inout) Double precision input/output vector/matrix
306 * \param beta: (input) Double precision value of beta
308 __global__ void vderrorbdg_cuda(const int n, const double* __restrict__ x, double *y, const double beta)
310 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
312 double dbeta, dtmp1, dtmp2, dtmp3;
319 dtmp3=beta*dtmp1*pow(dtmp2, dbeta);
322 y[pos]=(pow(dtmp1, beta) + dbeta*pow(dtmp2, beta) - dtmp3) / (beta * dbeta);
323 assert(!fpe(y[pos]));
325 y[pos]=(pow(dtmp1, beta) + dbeta*pow(dtmp2, beta) - dtmp3) / (beta * dbeta);
332 * \fn __global__ void vserrorbdg_cuda(const int n, const float* __restrict__ x, float *y, const float beta)
333 * \brief This kernel performs auxiliar simple precision operations when error is computed
334 using betadivergence error formula with (beta != 0) and (beta != 1)
335 * \param n: (input) Number of elements of x and y
336 * \param x: (input) Simple precision input vector/matrix
337 * \param y: (inout) Simple precision input/output vector/matrix
338 * \param beta: (input) Simple precision value of beta
340 __global__ void vserrorbdg_cuda(const int n, const float* __restrict__ x, float *y, const float beta)
342 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
344 float fbeta, ftmp1, ftmp2, ftmp3;
351 ftmp3=beta*ftmp1*powf(ftmp2, fbeta);
354 y[pos]=(powf(ftmp1, beta) + fbeta*powf(ftmp2, beta) - ftmp3) / (beta * fbeta);
355 assert(!fpe(y[pos]));
357 y[pos]=(powf(ftmp1, beta) + fbeta*powf(ftmp2, beta) - ftmp3) / (beta * fbeta);
364 /* ************************************************************************************ */
365 /* *********************** wrappers for the kernels and others ************************ */
366 /* ************************************************************************************ */
369 * \fn void dmemset_cuda(const int n, double *x, const double val, cudaStream_t stream)
370 * \brief It calls the kernel vdmemset_cuda that fills all positions of vector/matrix x
371 * with the double precision value "val"
372 * \param n: (input) Number of elements of x
373 * \param x: (output) Double precision output vector/matrix (1D column-major)
374 * \param val: (input) Double precision value
375 * \param stream: (input) ID of the stream to use
377 void dmemset_cuda(const int n, double *x, const double val, cudaStream_t stream)
379 dim3 dimGrid, dimBlock;
381 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
387 dimGrid.x = (n + dimBlock.x -1) / dimBlock.x;
388 vdmemset_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, x, val);
390 cudaDeviceSynchronize();
396 * \fn void smemset_cuda(const int n, float *x, const float val, cudaStream_t stream)
397 * \brief It calls the kernel vsmemset_cuda that fills all positions of vector/matrix x
398 * with the simple precision value "val"
399 * \param n: (input) Number of elements of x
400 * \param x: (output) Simple precision output vector/matrix (1D column-major)
401 * \param val: (input) Simple precision value
402 * \param stream: (input) ID of the stream to use
404 void smemset_cuda(const int n, float *x, const float val, cudaStream_t stream)
406 dim3 dimGrid, dimBlock;
408 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
414 dimGrid.x = (n + dimBlock.x -1) / dimBlock.x;
415 vsmemset_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, x, val);
417 cudaDeviceSynchronize();
423 * \fn void ddiv_cuda(const int n, const double *x, double *y, cudaStream_t stream)
424 * \brief It calls kernel vddiv_cuda that y[i]=x[i]/y[i]
425 * \param n: (input) Number of elements of x and y
426 * \param x: (input) Double precision input vector/matrix (1D column-major)
427 * \param y: (inout) Double precision input/output vector/matrix (1D column-major)
428 * \param stream: (input) ID of the stream to use
430 void ddiv_cuda(const int n, const double *x, double *y, cudaStream_t stream)
432 dim3 dimGrid, dimBlock;
434 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
440 dimGrid.x = (n + dimBlock.x -1) / dimBlock.x;
441 vddiv_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, x, y);
443 cudaDeviceSynchronize();
449 * \fn void sdiv_cuda(const int n, const float *x, float *y, cudaStream_t stream)
450 * \brief It calls kernel vsdiv_cuda that y[i]=x[i]/y[i]
451 * \param n: (input) Number of elements of x and y
452 * \param x: (input) Simple precision input vector/matrix (1D column-major)
453 * \param y: (inout) Simple precision input/output vector/matrix (1D column-major)
454 * \param stream: (input) ID of the stream to use
456 void sdiv_cuda(const int n, const float *x, float *y, cudaStream_t stream)
458 dim3 dimGrid, dimBlock;
460 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
466 dimGrid.x = (n + dimBlock.x -1) / dimBlock.x;
467 vsdiv_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, x, y);
469 cudaDeviceSynchronize();
475 * \fn void dsub_cuda(const int n, const double *x, const double *y, double *z)
476 * \brief This wrapper calls kernel vdsub_cuda
477 * \param n: (input) Number of elements of x and y
478 * \param x: (input) Double precision input vector/matrix
479 * \param y: (inout) Double precision input/output vector/matrix
481 void dsub_cuda(const int n, const double *x, double *y)
483 dim3 dimGrid, dimBlock;
485 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
491 dimGrid.x = (n + dimBlock.x -1) / dimBlock.x;
492 vdsub_cuda<<<dimGrid, dimBlock>>>(n, x, y);
494 cudaDeviceSynchronize();
500 * \fn void ssub_cuda(const int n, const float *x, float *y)
501 * \brief This wrapper calls kernel vssub_cuda
502 * \param n: (input) Number of elements of x and y
503 * \param x: (input) Simple precision input vector/matrix
504 * \param y: (inout) Simple precision input/output vector/matrix
506 void ssub_cuda(const int n, const float *x, float *y)
508 dim3 dimGrid, dimBlock;
510 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
516 dimGrid.x = (n + dimBlock.x -1) / dimBlock.x;
517 vssub_cuda<<<dimGrid, dimBlock>>>(n, x, y);
519 cudaDeviceSynchronize();
525 * \fn double derror_cuda(const int m, const int n, const int k, const double *A, const double *W, const double *H)
526 * \brief derror_cuda returns double precision "2norm(A - WH) / sqrt(m x n)"
527 * \param m: (input) Number of rows of matrix A and number of rows of W
528 * \param n: (input) Number of columns of matrix A and number of columns of H
529 * \param k: (input) Number of columns of matrix W and number of rows of H
530 * \param A: (input) Double precision matrix, dimension (m x n), 1D layout column major
531 * \param W: (input) Double precision matrix, dimension (m x k), 1D layout column major
532 * \param H: (input) Double precision matrix, dimension (k x n), 1D layout column major
534 double derror_cuda(const int m, const int n, const int k, const double *A, const double *W, const double *H)
548 cudaGetDevice(&devID);
549 cublasCreate(&handle);
551 cudaMalloc((void **)&tmp, m * n * sizeof(double));
553 cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, tmp, m);
555 dsub_cuda(m*n, A, tmp);
557 cublasDnrm2(handle, m*n, tmp, 1, &error);
559 cublasDestroy(handle);
563 return error/sqrt(m*n);
568 * \fn float serror_cuda(const int m, const int n, const int k, const float *A, const float *W, const float *H)
569 * \brief serror_cuda returns simple precision "2norm(A - WH) / sqrt(m x n)"
570 * \param m: (input) Number of rows of matrix A and number of rows of W
571 * \param n: (input) Number of columns of matrix A and number of columns of H
572 * \param k: (input) Number of columns of matrix W and number of rows of H
573 * \param A: (input) Simple precision matrix, dimension (m x n), 1D layout column major
574 * \param W: (input) Simple precision matrix, dimension (m x k), 1D layout column major
575 * \param H: (input) Simple precision matrix, dimension (k x n), 1D layout column major
577 float serror_cuda(const int m, const int n, const int k, const float *A, const float *W, const float *H)
592 cudaGetDevice(&devID);
593 cublasCreate(&handle);
595 cudaMalloc((void **)&tmp, m * n * sizeof(float));
597 cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, tmp, m);
599 ssub_cuda(m*n, A, tmp);
601 cublasSnrm2(handle, m*n, tmp, 1, &error);
603 cublasDestroy(handle);
607 return error/sqrtf(m*n);
612 * \fn double derrorbd_cuda(const int m, const int n, const int k, const double *A, const double *W, const double *H, const double betadiv)
613 * \brief This function returns double precision error when error is computed using betadivergence error formula
614 * \param m: (input) Number of rows of A and W
615 * \param n: (input) Number of columns of A and H
616 * \param k: (input) Number of columns/rows of W/H
617 * \param A: (input) Double precision input matrix A
618 * \param W: (input) Double precision input matrix W
619 * \param H: (input) Double precision input matrix H
620 * \param betadiv: (input) beta value
622 double derrorbd_cuda(const int m, const int n, const int k, const double *A, const double *W, const double *H, const double betadiv)
642 cudaGetDevice(&devID);
643 cublasCreate(&handle);
645 cudaMalloc((void **)&dtmp, m*n*sizeof(double));
647 cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, dtmp, m);
649 if (betadiv>=0.0 && betadiv<=0.0)
651 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
652 vderrorbd0_cuda<<<dimGrid, dimBlock, 0, 0>>>(m*n, A, dtmp);
656 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
659 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
661 if (betadiv>=1.0 && betadiv<=1.0)
662 vderrorbd1_cuda<<<dimGrid, dimBlock, 0, 0>>>(m*n, A, dtmp);
664 vderrorbdg_cuda<<<dimGrid, dimBlock, 0, 0>>>(m*n, A, dtmp, betadiv);
667 cudaDeviceSynchronize();
670 /* all dtmp elements are >=0 so we use Cublas */
671 cublasDasum(handle, m*n, dtmp, 1, &error);
673 error=sqrt((2.0*error)/((double)m*n));
675 cublasDestroy(handle);
684 * \fn float serrorbd_cuda(const int m, const int n, const int k, const float *A, const float *W, const float *H, const float betadiv)
685 * \brief This function returns simple precision error when error is computed using betadivergence error formula
686 * \param m: (input) Number of rows of A and W
687 * \param n: (input) Number of columns of A and H
688 * \param k: (input) Number of columns/rows of W/H
689 * \param A: (input) Simple precision input matrix A
690 * \param W: (input) Simple precision input matrix W
691 * \param H: (input) Simple precision input matrix H
692 * \param betadiv: (input) beta value
694 float serrorbd_cuda(const int m, const int n, const int k, const float *A, const float *W, const float *H, const float betadiv)
713 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
715 cudaGetDevice(&devID);
716 cublasCreate(&handle);
718 cudaMalloc((void **)&ftmp, m*n*sizeof(float));
720 cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, ftmp, m);
722 if (betadiv>=0.0 && betadiv<=0.0)
723 vserrorbd0_cuda<<<dimGrid, dimBlock, 0, 0>>>(m*n, A, ftmp);
726 if (betadiv>=1.0 && betadiv<=1.0)
727 vserrorbd1_cuda<<<dimGrid, dimBlock, 0, 0>>>(m*n, A, ftmp);
729 vserrorbdg_cuda<<<dimGrid, dimBlock, 0, 0>>>(m*n, A, ftmp, betadiv);
732 cudaDeviceSynchronize();
735 /* all ftmp elements are >=0 so we use Cublas */
736 cublasSasum(handle, m*n, ftmp, 1, &error);
738 error=sqrtf((2.0f*error)/((float)m*n));
740 cublasDestroy(handle);
749 * \fn void dlarngenn_cuda(const int m, const int n, const int seed, double *M)
750 * \brief dlarngenn_cuda returns a (m x n) random double precision matrix.
751 * An uniform (0, 1) distribution is used to generate the values
752 * \param m: (input) Number of rows of matrix M
753 * \param n: (input) Number of columns of matrix M
754 * \param seed: (input) Initial seed for the random numbers
755 * \param M: (output) Double precision matrix, dimension (m x n), 1D column major
757 void dlarngenn_cuda(const int m, const int n, const int seed, double *M)
759 curandGenerator_t gen;
761 curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
762 curandSetPseudoRandomGeneratorSeed(gen, seed);
763 curandGenerateUniformDouble(gen, M, m*n);
764 curandDestroyGenerator(gen);
769 * \fn void slarngenn_cuda(const int m, const int n, const int seed, float *M)
770 * \brief slarngenn_cuda returns a (m x n) random simple precision matrix.
771 * An uniform (0, 1) distribution is used to generate the values
772 * \param m: (input) Number of rows of matrix M
773 * \param n: (input) Number of columns of matrix M
774 * \param seed: (input) Initial seed for the random numbers
775 * \param M: (output) Simple precision matrix, dimension (m x n), 1D column major
777 void slarngenn_cuda(const int m, const int n, const int seed, float *M)
779 curandGenerator_t gen;
781 curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
782 curandSetPseudoRandomGeneratorSeed(gen, seed);
783 curandGenerateUniform(gen, M, m * n);
784 curandDestroyGenerator(gen);