NnmfPack 2.1
utils_cuda.cu
/***************************************************************************
 *   Copyright (C) 2014 by PIR (University of Oviedo) and                 *
 *   INCO2 (Polytechnic University of Valencia) groups.                   *
 *   nnmfpack@gmail.com                                                   *
 *                                                                        *
 *   This program is free software; you can redistribute it and/or modify *
 *   it under the terms of the GNU General Public License as published by *
 *   the Free Software Foundation; either version 2 of the License, or    *
 *   (at your option) any later version.                                  *
 *                                                                        *
 *   This program is distributed in the hope that it will be useful,      *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of       *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the         *
 *   GNU General Public License for more details.                         *
 *                                                                        *
 *   You should have received a copy of the GNU General Public License    *
 *   along with this program; if not, write to the                        *
 *   Free Software Foundation, Inc.,                                      *
 *   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.             *
 ***************************************************************************
*/
/**
 * \file utils_cuda.cu
 * \brief Some auxiliary functions, in double and single precision, for
 *        GPUs using CUDA.
 * \author Information Retrieval and Parallel Computing Group (IRPCG)
 * \author University of Oviedo, Spain
 * \author Interdisciplinary Computation and Communication Group (INCO2)
 * \author Universitat Politecnica de Valencia, Spain.
 * \author Contact: nnmfpack@gmail.com
 * \date 04/11/14
*/
#include "utils_cuda.h"


/* ************************************************************************************ */
/* *************************************** kernels ************************************ */
/* ************************************************************************************ */
/**
 * \fn __global__ void vdmemset_cuda(const int n, double *x, const double val)
 * \brief This kernel fills all positions of x with the double precision value "val"
 * \param n:   (input) Number of elements of x
 * \param x:   (output) Double precision output matrix (1D column-major) or vector
 * \param val: (input) Double precision value
*/
__global__ void vdmemset_cuda(const int n, double *x, const double val)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    /* To avoid the divergent branch below, a grid-stride loop is an alternative: */
    /*    unsigned int pos    = blockDim.x * blockIdx.x + threadIdx.x; */
    /*    unsigned int stride = blockDim.x * gridDim.x;                */
    /*    for(; pos < n; pos += stride)                                */
    /*       x[pos] = val;                                             */

    /* Although the following check is technically a branch divergence (not all */
    /* threads within a warp evaluate the condition identically), it is         */
    /* completely harmless, so we use it instead of the alternative above.      */
    if (pos < n)
        x[pos] = val;
}


/**
 * \fn __global__ void vsmemset_cuda(const int n, float *x, const float val)
 * \brief This kernel fills all positions of x with the single precision value "val"
 * \param n:   (input) Number of elements of x
 * \param x:   (output) Single precision output matrix (1D column-major) or vector
 * \param val: (input) Single precision value
*/
__global__ void vsmemset_cuda(const int n, float *x, const float val)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    if (pos < n)
        x[pos] = val;
}


/**
 * \fn __global__ void vddiv_cuda(const int n, const double* __restrict__ x, double *y)
 * \brief This kernel computes double precision y[i]=x[i]/y[i]
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Double precision input vector/matrix (1D column-major)
 * \param y: (inout) Double precision input/output vector/matrix (1D column-major)
*/
__global__ void vddiv_cuda(const int n, const double* __restrict__ x, double *y)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    /* braces are needed: without them, when With_Check is defined the assert */
    /* would run unguarded for out-of-range threads                           */
    if (pos < n)
    {
        #ifdef With_Check
            /* Here we can have NaN and Inf if y[pos] and/or x[pos] = 0 */
            y[pos] = x[pos] / y[pos];
            assert(!fpe(y[pos]));
        #else
            y[pos] = x[pos] / y[pos];
        #endif
    }
}


/**
 * \fn __global__ void vsdiv_cuda(const int n, const float* __restrict__ x, float *y)
 * \brief This kernel performs single precision y[i]=x[i]/y[i]
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Single precision input vector/matrix (1D column-major)
 * \param y: (inout) Single precision input/output vector/matrix (1D column-major)
*/
__global__ void vsdiv_cuda(const int n, const float* __restrict__ x, float *y)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    if (pos < n)
    {
        #ifdef With_Check
            /* Here we can have NaN and Inf if y[pos] and/or x[pos] = 0 */
            y[pos] = x[pos] / y[pos];
            assert(!fpe(y[pos]));
        #else
            y[pos] = x[pos] / y[pos];
        #endif
    }
}

/**
 * \fn __global__ void vdsub_cuda(const int n, const double* __restrict__ x, double *y)
 * \brief This kernel performs double precision y[i]=x[i]-y[i]
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Double precision input vector/matrix
 * \param y: (inout) Double precision input/output vector/matrix
*/
__global__ void vdsub_cuda(const int n, const double* __restrict__ x, double *y)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    if (pos < n)
        /* Testing whether x[pos] or y[pos] equals 0.0 gives no improvement, so we do not do it */
        y[pos] = x[pos] - y[pos];
}


/**
 * \fn __global__ void vssub_cuda(const int n, const float* __restrict__ x, float *y)
 * \brief This kernel performs single precision y[i]=x[i]-y[i]
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Single precision input vector/matrix
 * \param y: (inout) Single precision input/output vector/matrix
*/
__global__ void vssub_cuda(const int n, const float* __restrict__ x, float *y)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    if (pos < n)
        /* Testing whether x[pos] or y[pos] equals 0.0 gives no improvement, so we do not do it */
        y[pos] = x[pos] - y[pos];
}


/**
 * \fn __global__ void vderrorbd0_cuda(const int n, const double* __restrict__ x, double *y)
 * \brief This kernel performs auxiliary double precision operations when the error is
 *        computed using the beta-divergence error formula with beta = 0
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Double precision input vector/matrix
 * \param y: (inout) Double precision input/output vector/matrix
*/
__global__ void vderrorbd0_cuda(const int n, const double* __restrict__ x, double *y)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    double dtmp1, dtmp2;

    if (pos < n)
    {
        #ifdef With_Check
            /* Here we can have NaN and Inf if y[pos] and/or x[pos] = 0 */
            dtmp1=x[pos] / y[pos];
            assert(!fpe(dtmp1));

            dtmp2=log(dtmp1);
            assert(!fpe(dtmp2));

            y[pos]=dtmp1 - dtmp2 - 1.0;
        #else
            dtmp1=x[pos] / y[pos];
            dtmp2=log(dtmp1);

            y[pos]=dtmp1 - dtmp2 - 1.0;
        #endif
    }
}


/**
 * \fn __global__ void vserrorbd0_cuda(const int n, const float* __restrict__ x, float *y)
 * \brief This kernel performs auxiliary single precision operations when the error is
 *        computed using the beta-divergence error formula with beta = 0
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Single precision input vector/matrix
 * \param y: (inout) Single precision input/output vector/matrix
*/
__global__ void vserrorbd0_cuda(const int n, const float* __restrict__ x, float *y)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    float ftmp1, ftmp2;

    if (pos < n)
    {
        #ifdef With_Check
            /* Here we can have NaN and Inf if y[pos] and/or x[pos] = 0 */
            ftmp1=x[pos] / y[pos];
            assert(!fpe(ftmp1));

            ftmp2=logf(ftmp1);
            assert(!fpe(ftmp2));

            y[pos]=ftmp1 - ftmp2 - 1.0f;
        #else
            ftmp1=x[pos] / y[pos];
            ftmp2=logf(ftmp1);

            y[pos]=ftmp1 - ftmp2 - 1.0f;
        #endif
    }
}
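

/* For reference: with beta = 0 the beta-divergence reduces to the Itakura-Saito
   divergence, so the two kernels above compute, element-wise,

       d_IS(x, y) = x/y - log(x/y) - 1,

   where y holds the current approximation W*H. The wrappers derrorbd_cuda and
   serrorbd_cuda below accumulate these values with cublasDasum/cublasSasum. */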


/**
 * \fn __global__ void vderrorbd1_cuda(const int n, const double* __restrict__ x, double *y)
 * \brief This kernel performs auxiliary double precision operations when the error is
 *        computed using the beta-divergence error formula with beta = 1
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Double precision input vector/matrix
 * \param y: (inout) Double precision input/output vector/matrix
*/
__global__ void vderrorbd1_cuda(const int n, const double* __restrict__ x, double *y)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    double dtmp1, dtmp2, dtmp3;

    if (pos < n)
    {
        #ifdef With_Check
            /* Here we can have NaN and Inf if y[pos] and/or x[pos] = 0 */
            dtmp1=x[pos];
            dtmp2=y[pos];
            dtmp3=log(dtmp1 / dtmp2);
            assert(!fpe(dtmp3));

            y[pos]=dtmp1 * dtmp3 + dtmp2 - dtmp1;
        #else
            dtmp1=x[pos];
            dtmp2=y[pos];
            dtmp3=log(dtmp1 / dtmp2);

            y[pos]=dtmp1 * dtmp3 + dtmp2 - dtmp1;
        #endif
    }
}


/**
 * \fn __global__ void vserrorbd1_cuda(const int n, const float* __restrict__ x, float *y)
 * \brief This kernel performs auxiliary single precision operations when the error is
 *        computed using the beta-divergence error formula with beta = 1
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Single precision input vector/matrix
 * \param y: (inout) Single precision input/output vector/matrix
*/
__global__ void vserrorbd1_cuda(const int n, const float* __restrict__ x, float *y)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    float ftmp1, ftmp2, ftmp3;

    if (pos < n)
    {
        #ifdef With_Check
            /* Here we can have NaN and Inf if y[pos] and/or x[pos] = 0 */
            ftmp1=x[pos];
            ftmp2=y[pos];
            ftmp3=logf(ftmp1 / ftmp2);
            assert(!fpe(ftmp3));

            y[pos]=ftmp1 * ftmp3 + ftmp2 - ftmp1;
        #else
            ftmp1=x[pos];
            ftmp2=y[pos];
            ftmp3=logf(ftmp1 / ftmp2);

            y[pos]=ftmp1 * ftmp3 + ftmp2 - ftmp1;
        #endif
    }
}
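

/* For reference: with beta = 1 the beta-divergence reduces to the (generalized)
   Kullback-Leibler divergence, so the two kernels above compute, element-wise,

       d_KL(x, y) = x*log(x/y) + y - x. */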


/**
 * \fn __global__ void vderrorbdg_cuda(const int n, const double* __restrict__ x, double *y, const double beta)
 * \brief This kernel performs auxiliary double precision operations when the error is
 *        computed using the beta-divergence error formula with (beta != 0) and (beta != 1)
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Double precision input vector/matrix
 * \param y: (inout) Double precision input/output vector/matrix
 * \param beta: (input) Double precision value of beta
*/
__global__ void vderrorbdg_cuda(const int n, const double* __restrict__ x, double *y, const double beta)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    double dbeta, dtmp1, dtmp2, dtmp3;

    if (pos < n)
    {
        dbeta=beta-1.0;
        dtmp1=x[pos];
        dtmp2=y[pos];
        dtmp3=beta*dtmp1*pow(dtmp2, dbeta);

        #ifdef With_Check
            y[pos]=(pow(dtmp1, beta) + dbeta*pow(dtmp2, beta) - dtmp3) / (beta * dbeta);
            assert(!fpe(y[pos]));
        #else
            y[pos]=(pow(dtmp1, beta) + dbeta*pow(dtmp2, beta) - dtmp3) / (beta * dbeta);
        #endif
    }
}


/**
 * \fn __global__ void vserrorbdg_cuda(const int n, const float* __restrict__ x, float *y, const float beta)
 * \brief This kernel performs auxiliary single precision operations when the error is
 *        computed using the beta-divergence error formula with (beta != 0) and (beta != 1)
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Single precision input vector/matrix
 * \param y: (inout) Single precision input/output vector/matrix
 * \param beta: (input) Single precision value of beta
*/
__global__ void vserrorbdg_cuda(const int n, const float* __restrict__ x, float *y, const float beta)
{
    unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;

    float fbeta, ftmp1, ftmp2, ftmp3;

    if (pos < n)
    {
        fbeta=beta-1.0f;
        ftmp1=x[pos];
        ftmp2=y[pos];
        ftmp3=beta*ftmp1*powf(ftmp2, fbeta);

        #ifdef With_Check
            y[pos]=(powf(ftmp1, beta) + fbeta*powf(ftmp2, beta) - ftmp3) / (beta * fbeta);
            assert(!fpe(y[pos]));
        #else
            y[pos]=(powf(ftmp1, beta) + fbeta*powf(ftmp2, beta) - ftmp3) / (beta * fbeta);
        #endif
    }
}
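

/* For reference: for beta != 0 and beta != 1 the two kernels above compute the
   general beta-divergence, element-wise,

       d_beta(x, y) = ( x^beta + (beta-1)*y^beta - beta*x*y^(beta-1) ) / ( beta*(beta-1) ),

   which tends to d_KL(x, y) as beta -> 1 and to d_IS(x, y) as beta -> 0. */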


/* ************************************************************************************ */
/* *********************** wrappers for the kernels and others ************************ */
/* ************************************************************************************ */

/**
 * \fn void dmemset_cuda(const int n, double *x, const double val, cudaStream_t stream)
 * \brief It calls the kernel vdmemset_cuda, which fills all positions of the vector/matrix x
 *        with the double precision value "val"
 * \param n:      (input) Number of elements of x
 * \param x:      (output) Double precision output vector/matrix (1D column-major)
 * \param val:    (input) Double precision value
 * \param stream: (input) ID of the stream to use
*/
void dmemset_cuda(const int n, double *x, const double val, cudaStream_t stream)
{
    dim3 dimGrid, dimBlock;

    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        dimBlock.x = 192;
    #else
        dimBlock.x = 256;
    #endif

    dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
    vdmemset_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, x, val);
    #ifdef With_Check
        cudaDeviceSynchronize();
    #endif
}
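

/* A minimal host-side usage sketch for dmemset_cuda (illustrative only; the
   buffer d_x, its size and the use of the default stream are assumptions of
   this example, not part of the library):

       double *d_x = NULL;
       int     n   = 1024;

       cudaMalloc((void **)&d_x, n * sizeof(double));
       dmemset_cuda(n, d_x, 0.0, 0);   // fill d_x with zeros on the default stream
       cudaDeviceSynchronize();        // the kernel launch is asynchronous
       cudaFree(d_x);
*/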


/**
 * \fn void smemset_cuda(const int n, float *x, const float val, cudaStream_t stream)
 * \brief It calls the kernel vsmemset_cuda, which fills all positions of the vector/matrix x
 *        with the single precision value "val"
 * \param n:      (input) Number of elements of x
 * \param x:      (output) Single precision output vector/matrix (1D column-major)
 * \param val:    (input) Single precision value
 * \param stream: (input) ID of the stream to use
*/
void smemset_cuda(const int n, float *x, const float val, cudaStream_t stream)
{
    dim3 dimGrid, dimBlock;

    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        dimBlock.x = 192;
    #else
        dimBlock.x = 256;
    #endif

    dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
    vsmemset_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, x, val);
    #ifdef With_Check
        cudaDeviceSynchronize();
    #endif
}


/**
 * \fn void ddiv_cuda(const int n, const double *x, double *y, cudaStream_t stream)
 * \brief It calls the kernel vddiv_cuda, which computes y[i]=x[i]/y[i]
 * \param n:      (input) Number of elements of x and y
 * \param x:      (input) Double precision input vector/matrix (1D column-major)
 * \param y:      (inout) Double precision input/output vector/matrix (1D column-major)
 * \param stream: (input) ID of the stream to use
*/
void ddiv_cuda(const int n, const double *x, double *y, cudaStream_t stream)
{
    dim3 dimGrid, dimBlock;

    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        dimBlock.x = 192;
    #else
        dimBlock.x = 256;
    #endif

    dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
    vddiv_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, x, y);
    #ifdef With_Check
        cudaDeviceSynchronize();
    #endif
}


/**
 * \fn void sdiv_cuda(const int n, const float *x, float *y, cudaStream_t stream)
 * \brief It calls the kernel vsdiv_cuda, which computes y[i]=x[i]/y[i]
 * \param n:      (input) Number of elements of x and y
 * \param x:      (input) Single precision input vector/matrix (1D column-major)
 * \param y:      (inout) Single precision input/output vector/matrix (1D column-major)
 * \param stream: (input) ID of the stream to use
*/
void sdiv_cuda(const int n, const float *x, float *y, cudaStream_t stream)
{
    dim3 dimGrid, dimBlock;

    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        dimBlock.x = 192;
    #else
        dimBlock.x = 256;
    #endif

    dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
    vsdiv_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, x, y);
    #ifdef With_Check
        cudaDeviceSynchronize();
    #endif
}


/**
 * \fn void dsub_cuda(const int n, const double *x, double *y)
 * \brief This wrapper calls the kernel vdsub_cuda, which computes y[i]=x[i]-y[i]
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Double precision input vector/matrix
 * \param y: (inout) Double precision input/output vector/matrix
*/
void dsub_cuda(const int n, const double *x, double *y)
{
    dim3 dimGrid, dimBlock;

    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        dimBlock.x = 192;
    #else
        dimBlock.x = 256;
    #endif

    dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
    vdsub_cuda<<<dimGrid, dimBlock>>>(n, x, y);
    #ifdef With_Check
        cudaDeviceSynchronize();
    #endif
}


/**
 * \fn void ssub_cuda(const int n, const float *x, float *y)
 * \brief This wrapper calls the kernel vssub_cuda, which computes y[i]=x[i]-y[i]
 * \param n: (input) Number of elements of x and y
 * \param x: (input) Single precision input vector/matrix
 * \param y: (inout) Single precision input/output vector/matrix
*/
void ssub_cuda(const int n, const float *x, float *y)
{
    dim3 dimGrid, dimBlock;

    #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
        dimBlock.x = 192;
    #else
        dimBlock.x = 256;
    #endif

    dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
    vssub_cuda<<<dimGrid, dimBlock>>>(n, x, y);
    #ifdef With_Check
        cudaDeviceSynchronize();
    #endif
}


/**
 * \fn double derror_cuda(const int m, const int n, const int k, const double *A, const double *W, const double *H)
 * \brief derror_cuda returns the double precision error "2-norm(A - W*H) / sqrt(m x n)"
 * \param m: (input) Number of rows of matrix A and number of rows of W
 * \param n: (input) Number of columns of matrix A and number of columns of H
 * \param k: (input) Number of columns of matrix W and number of rows of H
 * \param A: (input) Double precision matrix, dimension (m x n), 1D layout column major
 * \param W: (input) Double precision matrix, dimension (m x k), 1D layout column major
 * \param H: (input) Double precision matrix, dimension (k x n), 1D layout column major
*/
double derror_cuda(const int m, const int n, const int k, const double *A, const double *W, const double *H)
{
    double
        error=0.0,
        alpha=1.0,
        beta =0.0,
        *tmp =NULL;

    int
        devID;

    cublasHandle_t
        handle;

    cudaGetDevice(&devID);
    cublasCreate(&handle);

    cudaMalloc((void **)&tmp, m * n * sizeof(double));

    /* tmp = W*H */
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, tmp, m);

    /* tmp = A - tmp */
    dsub_cuda(m*n, A, tmp);

    cublasDnrm2(handle, m*n, tmp, 1, &error);

    cublasDestroy(handle);

    cudaFree(tmp);

    return error/sqrt(m*n);
}
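

/* Note: cublasDnrm2 over the m*n entries of the column-major buffer tmp is the
   Euclidean norm of vec(A - W*H), i.e. the Frobenius norm ||A - W*H||_F, so the
   value returned above is ||A - W*H||_F / sqrt(m*n). */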


/**
 * \fn float serror_cuda(const int m, const int n, const int k, const float *A, const float *W, const float *H)
 * \brief serror_cuda returns the single precision error "2-norm(A - W*H) / sqrt(m x n)"
 * \param m: (input) Number of rows of matrix A and number of rows of W
 * \param n: (input) Number of columns of matrix A and number of columns of H
 * \param k: (input) Number of columns of matrix W and number of rows of H
 * \param A: (input) Single precision matrix, dimension (m x n), 1D layout column major
 * \param W: (input) Single precision matrix, dimension (m x k), 1D layout column major
 * \param H: (input) Single precision matrix, dimension (k x n), 1D layout column major
*/
float serror_cuda(const int m, const int n, const int k, const float *A, const float *W, const float *H)
{
    float
        error=0.0,
        alpha=1.0,
        beta =0.0,
        *tmp =NULL;

    int
        devID;

    cublasHandle_t
        handle;

    cudaGetDevice(&devID);
    cublasCreate(&handle);

    cudaMalloc((void **)&tmp, m * n * sizeof(float));

    /* tmp = W*H */
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, tmp, m);

    /* tmp = A - tmp */
    ssub_cuda(m*n, A, tmp);

    cublasSnrm2(handle, m*n, tmp, 1, &error);

    cublasDestroy(handle);

    cudaFree(tmp);

    return error/sqrtf(m*n);
}


/**
 * \fn double derrorbd_cuda(const int m, const int n, const int k, const double *A, const double *W, const double *H, const double betadiv)
 * \brief This function returns the double precision error when the error is computed using
 *        the beta-divergence error formula
 * \param m: (input) Number of rows of A and W
 * \param n: (input) Number of columns of A and H
 * \param k: (input) Number of columns/rows of W/H
 * \param A: (input) Double precision input matrix A
 * \param W: (input) Double precision input matrix W
 * \param H: (input) Double precision input matrix H
 * \param betadiv: (input) beta value
*/
double derrorbd_cuda(const int m, const int n, const int k, const double *A, const double *W, const double *H, const double betadiv)
{
    dim3
        dimGrid,
        dimBlock;

    double
        error=0.0,
        alpha=1.0,
        beta =0.0,
        *dtmp=NULL;

    int
        devID;

    cublasHandle_t
        handle;

    dimBlock.x = 256;

    cudaGetDevice(&devID);
    cublasCreate(&handle);

    cudaMalloc((void **)&dtmp, m*n*sizeof(double));

    /* dtmp = W*H */
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, dtmp, m);

    /* case betadiv == 0.0 */
    if (betadiv>=0.0 && betadiv<=0.0)
    {
        dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
        vderrorbd0_cuda<<<dimGrid, dimBlock, 0, 0>>>(m*n, A, dtmp);
    }
    else
    {
        #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
            dimBlock.x = 224;
        #endif
        dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;

        /* case betadiv == 1.0 */
        if (betadiv>=1.0 && betadiv<=1.0)
            vderrorbd1_cuda<<<dimGrid, dimBlock, 0, 0>>>(m*n, A, dtmp);
        else
            vderrorbdg_cuda<<<dimGrid, dimBlock, 0, 0>>>(m*n, A, dtmp, betadiv);
    }
    #ifdef With_Check
        cudaDeviceSynchronize();
    #endif

    /* all dtmp elements are >= 0, so cublasDasum yields their sum */
    cublasDasum(handle, m*n, dtmp, 1, &error);

    error=sqrt((2.0*error)/((double)m*n));

    cublasDestroy(handle);

    cudaFree(dtmp);

    return error;
}
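

/* Note: the value returned above is sqrt( (2/(m*n)) * sum_ij d_beta(A_ij, (W*H)_ij) ),
   with d_beta as defined for the kernels in the previous section. */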


/**
 * \fn float serrorbd_cuda(const int m, const int n, const int k, const float *A, const float *W, const float *H, const float betadiv)
 * \brief This function returns the single precision error when the error is computed using
 *        the beta-divergence error formula
 * \param m: (input) Number of rows of A and W
 * \param n: (input) Number of columns of A and H
 * \param k: (input) Number of columns/rows of W/H
 * \param A: (input) Single precision input matrix A
 * \param W: (input) Single precision input matrix W
 * \param H: (input) Single precision input matrix H
 * \param betadiv: (input) beta value
*/
float serrorbd_cuda(const int m, const int n, const int k, const float *A, const float *W, const float *H, const float betadiv)
{
    dim3
        dimGrid,
        dimBlock;

    float
        alpha=1.0f,
        beta =0.0f,
        error=0.0f,
        *ftmp=NULL;

    int
        devID;

    cublasHandle_t
        handle;

    dimBlock.x = 256;
    dimGrid.x  = (m*n + dimBlock.x -1) / dimBlock.x;

    cudaGetDevice(&devID);
    cublasCreate(&handle);

    cudaMalloc((void **)&ftmp, m*n*sizeof(float));

    /* ftmp = W*H */
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, ftmp, m);

    /* case betadiv == 0.0 */
    if (betadiv>=0.0 && betadiv<=0.0)
        vserrorbd0_cuda<<<dimGrid, dimBlock, 0, 0>>>(m*n, A, ftmp);
    else
    {
        /* case betadiv == 1.0 */
        if (betadiv>=1.0 && betadiv<=1.0)
            vserrorbd1_cuda<<<dimGrid, dimBlock, 0, 0>>>(m*n, A, ftmp);
        else
            vserrorbdg_cuda<<<dimGrid, dimBlock, 0, 0>>>(m*n, A, ftmp, betadiv);
    }
    #ifdef With_Check
        cudaDeviceSynchronize();
    #endif

    /* all ftmp elements are >= 0, so cublasSasum yields their sum */
    cublasSasum(handle, m*n, ftmp, 1, &error);

    error=sqrtf((2.0f*error)/((float)m*n));

    cublasDestroy(handle);

    cudaFree(ftmp);

    return error;
}


/**
 * \fn void dlarngenn_cuda(const int m, const int n, const int seed, double *M)
 * \brief dlarngenn_cuda returns an (m x n) random double precision matrix.
 *        A uniform (0, 1) distribution is used to generate the values
 * \param m: (input) Number of rows of matrix M
 * \param n: (input) Number of columns of matrix M
 * \param seed: (input) Initial seed for the random numbers
 * \param M: (output) Double precision matrix, dimension (m x n), 1D column major
*/
void dlarngenn_cuda(const int m, const int n, const int seed, double *M)
{
    curandGenerator_t gen;

    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(gen, seed);
    curandGenerateUniformDouble(gen, M, m*n);
    curandDestroyGenerator(gen);
}


/**
 * \fn void slarngenn_cuda(const int m, const int n, const int seed, float *M)
 * \brief slarngenn_cuda returns an (m x n) random single precision matrix.
 *        A uniform (0, 1) distribution is used to generate the values
 * \param m: (input) Number of rows of matrix M
 * \param n: (input) Number of columns of matrix M
 * \param seed: (input) Initial seed for the random numbers
 * \param M: (output) Single precision matrix, dimension (m x n), 1D column major
*/
void slarngenn_cuda(const int m, const int n, const int seed, float *M)
{
    curandGenerator_t gen;

    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(gen, seed);
    curandGenerateUniform(gen, M, m * n);
    curandDestroyGenerator(gen);
}
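

/* A minimal end-to-end usage sketch (illustrative only; the sizes, the seeds and
   the buffer names d_A, d_W and d_H are assumptions of this example):

       int m = 1000, n = 800, k = 32;
       double *d_A, *d_W, *d_H;

       cudaMalloc((void **)&d_A, m * n * sizeof(double));
       cudaMalloc((void **)&d_W, m * k * sizeof(double));
       cudaMalloc((void **)&d_H, k * n * sizeof(double));

       dlarngenn_cuda(m, n, 1, d_A);   // random uniform (0, 1) "data" matrix
       dlarngenn_cuda(m, k, 2, d_W);   // random nonnegative initial factors
       dlarngenn_cuda(k, n, 3, d_H);

       double err = derror_cuda(m, n, k, d_A, d_W, d_H);   // ||A - W*H||_F / sqrt(m*n)

       cudaFree(d_A); cudaFree(d_W); cudaFree(d_H);
*/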