1 /***************************************************************************
2 * Copyright (C) 2014 by PIR (University of Oviedo) and *
3 * INCO2 (Polytechnic University of Valencia) groups. *
6 * This program is free software; you can redistribute it and/or modify *
7 * it under the terms of the GNU General Public License as published by *
8 * the Free Software Foundation; either version 2 of the License, or *
9 * (at your option) any later version. *
11 * This program is distributed in the hope that it will be useful, *
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14 * GNU General Public License for more details. *
16 * You should have received a copy of the GNU General Public License *
17 * along with this program; if not, write to the *
18 * Free Software Foundation, Inc., *
19 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
20 ***************************************************************************
24 * \brief File with functions to compute NNMF using beta-divergence methods for GPUs
25 * \author Information Retrieval and Parallel Computing Group (IRPCG)
26 * \author University of Oviedo, Spain
27 * \author Interdisciplinary Computation and Communication Group (INCO2)
28 * \author Universitat Politecnica de Valencia, Spain.
29 * \author Contact: nnmfpack@gmail.com
32 #include "bdiv_cuda.h"
36 * \fn int dbdivg_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const double expo, const int uType, const int nIter)
37 * \brief dbdivg_cuda performs the NNMF using beta-divergence when beta is != 1 and !=2, using double precision.
39 * The algorithm is<BR>
40 * repeat nIter times<BR>
41 * STEP 1<BR>
42 * L=W*H<BR>
43 * L1=L(.^)(beta-2)<BR>
44 * L2=L1(.*)A<BR>
45 * L1=L1(.*)L<BR>
46 * B=W'*L2<BR>
47 * C=W'*L1<BR>
48 * H=H(.*)B(./)C<BR>
50 * STEP 2<BR>
51 * L=W*H<BR>
52 * L1=L(.^)(beta-2)<BR>
53 * L2=L1(.*)A<BR>
54 * L1=L1(.*)L<BR>
55 * D=L2*H'<BR>
56 * E=L1*H'<BR>
57 * W=W(.*)D(./)E<BR>
58 * end repeat<BR>
61 * In real life L1 and L2 are (m*n) matrices used in STEPs 1 and 2. For performance
62 * reasons only one 1D column-major buffer, named R in the code below, of size 2*m*n is used.
63 * In STEP 1, the first part of R (m*n positions) is L2 and the second part is L1.
64 * In STEP 2, the first column of R (2*m positions) is composed of the first column of L2
65 * followed by the first column of L1. The second column of R (2*m positions) is composed of the
66 * second column of L2 followed by the second column of L1. The 3rd column of R ... and so on.
68 * In real life B and C are (k*n) matrices used in STEP 1, and D and E are (m*k)
69 * matrices used in STEP 2. B/C and D/E are independent. However we do not have L1
70 * and L2, we have R, and we can do B=W'*L2 and C=W'*L1 (or D=L2*H' and E=L1*H') at
71 * the same time. For this reason only one matrix is declared to save space. This is
72 * matrix M with size 2*max(m,n)*k
74 * \param m: (input) Number of rows of matrix A and matrix W
75 * \param n: (input) Number of columns of matrix A and matrix H
76 * \param k: (input) Number of columns of matrix W and rows of H
77 * \param A: (input) Double precision input matrix of (m * n) number of elements stored using 1D column-major
78 * \param W: (inout) Double precision input/output matrix of (m * k) number of elements stored using 1D column-major
79 * \param H: (inout) Double precision input/output matrix of (k * n) number of elements stored using 1D column-major
80 * \param expo: (input) Double precision value. The exponent beta of betadivergence method
81 * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only H is updated) or UpdateH (only W is updated)
82 * \param nIter: (input) Number of iterations
84 * It returns 0 if all is OK.
86 int dbdivg_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const double expo, const int uType, const int nIter)
/* NOTE(review): this listing has interior lines elided (opening brace,
   local declarations, the uType selection and the nIter loop headers are
   not visible). Only comments were added below; every visible statement
   is byte-identical to the original. */
101 CUDAERR(cudaGetDevice(&devID));
/* --- Case: both H (Phase 1) and W (Phase 2) are updated each pass. --- */
/* Workspaces: M holds the packed pair B|C (or D|E), L holds W*H, and R
   holds the packed pair L2|L1 (layouts described in the header comment) */
106 CUDAERR(cudaMalloc((void **)&M, 2*max(m, n) * k * sizeof(double)));
107 CUDAERR(cudaMalloc((void **)&L, m * n * sizeof(double)));
108 CUDAERR(cudaMalloc((void **)&R, 2*m * n * sizeof(double)));
110 CUBLASERR(cublasCreate(&handle));
114 /* ************************ Phase 1 *************************** */
/* L=W*H (cuBLAS is column-major, as are all matrices here) */
116 CUBLASERR(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
118 /* L1=L(.^)(expo-2) */
/* dkernelH_cuda writes the packed pair |L2 | L1| into R (see header) */
121 dkernelH_cuda(m, n, L, A, R, expo-2.0, 0);
125 /* above is equal to R=W'*|L2 | L1| */
126 CUBLASERR(cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, k, 2*n, m, &alpha, W, m, R, m, &beta, M, k));
128 /* H=H(.*)B(./)C. Note that matrices B and C are stored together in matrix M*/
129 dupdate1H_cuda(k*n, M, H, 0);
132 /* ************************ Phase 2 *************************** */
134 CUBLASERR(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
136 /* L1=L(.^)(expo-2) */
/* dkernelW_cuda writes R with columns of L2 stacked over columns of L1 */
139 dkernelW_cuda(m, n, L, A, R, expo-2.0, 0);
144 /* above is equal to R=| | * H' */
146 CUBLASERR(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, 2*m, k, n, &alpha, R, 2*m, H, k, &beta, M, 2*m));
/* W=W(.*)D(./)E. Matrices D and E are stored together in M (see header) */
149 dupdate1W_cuda(m, k, M, W, 0);
151 CUBLASERR(cublasDestroy(handle));
153 CUDAERR(cudaFree(M));
154 CUDAERR(cudaFree(L));
155 CUDAERR(cudaFree(R));
/* --- Case: only W is updated (Phase 2 only); M only needs 2*m*k --- */
159 CUDAERR(cudaMalloc((void **)&M, 2*m * k * sizeof(double)));
160 CUDAERR(cudaMalloc((void **)&L, m * n * sizeof(double)));
161 CUDAERR(cudaMalloc((void **)&R, 2*m * n * sizeof(double)));
163 CUBLASERR(cublasCreate(&handle));
167 /* ************************ Phase 2 *************************** */
169 CUBLASERR(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
171 /* L1=L(.^)(expo-2) */
174 dkernelW_cuda(m, n, L, A, R, expo-2.0, 0);
179 /* above is equal to R=| | * H' */
181 CUBLASERR(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, 2*m, k, n, &alpha, R, 2*m, H, k, &beta, M, 2*m));
/* W=W(.*)D(./)E. Matrices D and E are stored together in M (see header) */
184 dupdate1W_cuda(m, k, M, W, 0);
186 CUBLASERR(cublasDestroy(handle));
188 CUDAERR(cudaFree(M));
189 CUDAERR(cudaFree(L));
190 CUDAERR(cudaFree(R));
/* --- Case: only H is updated (Phase 1 only); M only needs 2*n*k --- */
194 CUDAERR(cudaMalloc((void **)&M, 2*n * k * sizeof(double)));
195 CUDAERR(cudaMalloc((void **)&L, m * n * sizeof(double)));
196 CUDAERR(cudaMalloc((void **)&R, 2*m * n * sizeof(double)));
198 CUBLASERR(cublasCreate(&handle));
202 /* ************************ Phase 1 *************************** */
204 CUBLASERR(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
206 /* L1=L(.^)(expo-2) */
209 dkernelH_cuda(m, n, L, A, R, expo-2.0, 0);
213 /* above is equal to R=W'*|L2 | L1| */
214 CUBLASERR(cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, k, 2*n, m, &alpha, W, m, R, m, &beta, M, k));
216 /* H=H(.*)B(./)C. Note that matrices B and C are stored together in matrix M*/
217 dupdate1H_cuda(k*n, M, H, 0);
219 CUBLASERR(cublasDestroy(handle));
221 CUDAERR(cudaFree(M));
222 CUDAERR(cudaFree(L));
223 CUDAERR(cudaFree(R));
233 * \fn int sbdivg_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const float expo, const int uType, const int nIter)
234 * \brief sbdivg_cuda performs NNMF using beta-divergence when beta is != 1 and !=2, using simple precision
235 * See description of dbdivg_cuda for more info
236 * \param m: (input) Number of rows of matrix A and matrix W
237 * \param n: (input) Number of columns of matrix A and matrix H
238 * \param k: (input) Number of columns of matrix W and rows of H
239 * \param A: (input) Simple precision input matrix of (m * n) number of elements stored using 1D column-major
240 * \param W: (inout) Simple precision input/output matrix of (m * k) number of elements stored using 1D column-major
241 * \param H: (inout) Simple precision input/output matrix of (k * n) number of elements stored using 1D column-major
242 * \param expo: (input) Simple precision value. The exponent beta of betadivergence method
243 * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only H is updated) or UpdateH (only W is updated)
244 * \param nIter: (input) Number of iterations
246 * It returns 0 if all is OK.
248 int sbdivg_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const float expo, const int uType, const int nIter)
/* NOTE(review): single-precision twin of dbdivg_cuda. Interior lines are
   elided in this listing (brace, declarations, uType/nIter control flow);
   only comments were added, all visible statements are unchanged. */
263 CUDAERR(cudaGetDevice(&devID));
/* --- Case: both H (Phase 1) and W (Phase 2) are updated each pass --- */
268 CUDAERR(cudaMalloc((void **)&M, 2*max(m, n) * k * sizeof(float)));
269 CUDAERR(cudaMalloc((void **)&L, m * n * sizeof(float)));
270 CUDAERR(cudaMalloc((void **)&R, 2*m * n * sizeof(float)));
272 CUBLASERR(cublasCreate(&handle));
276 /* ************************ Phase 1 *************************** */
/* L=W*H (cuBLAS is column-major, as are all matrices here) */
278 CUBLASERR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
280 /* L1=L(.^)(expo-2) */
/* skernelH_cuda writes the packed pair |L2 | L1| into R (see dbdivg_cuda) */
283 skernelH_cuda(m, n, L, A, R, expo-2.0f, 0);
287 /* above is equal to R=W'*|L2 | L1| */
288 CUBLASERR(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, k, 2*n, m, &alpha, W, m, R, m, &beta, M, k));
/* H=H(.*)B(./)C. Matrices B and C are stored together in matrix M */
291 supdate1H_cuda(k*n, M, H, 0);
294 /* ************************ Phase 2 *************************** */
296 CUBLASERR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
298 /* L1=L(.^)(expo-2) */
301 skernelW_cuda(m, n, L, A, R, expo-2.0f, 0);
306 /* above is equal to R=| | * H' */
308 CUBLASERR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, 2*m, k, n, &alpha, R, 2*m, H, k, &beta, M, 2*m));
/* W=W(.*)D(./)E. Matrices D and E are stored together in M */
311 supdate1W_cuda(m, k, M, W, 0);
313 CUBLASERR(cublasDestroy(handle));
315 CUDAERR(cudaFree(M));
316 CUDAERR(cudaFree(L));
317 CUDAERR(cudaFree(R));
/* --- Case: only W is updated (Phase 2 only); M only needs 2*m*k --- */
321 CUDAERR(cudaMalloc((void **)&M, 2*m * k * sizeof(float)));
322 CUDAERR(cudaMalloc((void **)&L, m * n * sizeof(float)));
323 CUDAERR(cudaMalloc((void **)&R, 2*m * n * sizeof(float)));
325 CUBLASERR(cublasCreate(&handle));
329 /* ************************ Phase 2 *************************** */
331 CUBLASERR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
333 /* L1=L(.^)(expo-2) */
336 skernelW_cuda(m, n, L, A, R, expo-2.0f, 0);
341 /* above is equal to R=| | * H' */
343 CUBLASERR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, 2*m, k, n, &alpha, R, 2*m, H, k, &beta, M, 2*m));
/* W=W(.*)D(./)E. Matrices D and E are stored together in M */
346 supdate1W_cuda(m, k, M, W, 0);
348 CUBLASERR(cublasDestroy(handle));
350 CUDAERR(cudaFree(M));
351 CUDAERR(cudaFree(L));
352 CUDAERR(cudaFree(R));
/* --- Case: only H is updated (Phase 1 only); M only needs 2*n*k --- */
356 CUDAERR(cudaMalloc((void **)&M, 2*n * k * sizeof(float)));
357 CUDAERR(cudaMalloc((void **)&L, m * n * sizeof(float)));
358 CUDAERR(cudaMalloc((void **)&R, 2*m * n * sizeof(float)));
360 CUBLASERR(cublasCreate(&handle));
364 /* ************************ Phase 1 *************************** */
366 CUBLASERR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
368 /* L1=L(.^)(expo-2) */
371 skernelH_cuda(m, n, L, A, R, expo-2.0f, 0);
375 /* above is equal to R=W'*|L2 | L1| */
376 CUBLASERR(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, k, 2*n, m, &alpha, W, m, R, m, &beta, M, k));
/* H=H(.*)B(./)C. Matrices B and C are stored together in matrix M */
379 supdate1H_cuda(k*n, M, H, 0);
381 CUBLASERR(cublasDestroy(handle));
383 CUDAERR(cudaFree(M));
384 CUDAERR(cudaFree(L));
385 CUDAERR(cudaFree(R));
396 * \fn int dbdivone_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
397 * \brief dbdivone_cuda performs NNMF using beta-divergence when beta=1, using double precision.
399 * The algorithm is<BR>
400 * repeat nIter times<BR>
401 * STEP 1<BR>
402 * y(i)=sum(W(j,i) for all j in range) for all i in range<BR>
403 * L=W*H<BR>
404 * L=A(./)L<BR>
405 * B=W'*L<BR>
406 * B(i,j)=B(i,j) / y(i) for all B elements<BR>
407 * H=H(.*)B<BR>
409 * STEP 2<BR>
410 * y(i)=sum(H(i,j) for all j in range) for all i in range<BR>
411 * L=W*H<BR>
412 * L=A(./)L <BR>
413 * D=L*H'<BR>
414 * D(i,j)=D(i,j) / y(j) for all D elements<BR>
415 * W=W(.*)D<BR>
416 * end repeat<BR>
419 * In real life B is a (k*n) matrix used in STEP 1, and D is a (m*k)
420 * matrix used in STEP 2. B and D are independent. For this reason only 1 matrix
421 * of size max(m,n)*k is declared/used
423 * \param m: (input) Number of rows of matrix A and matrix W
424 * \param n: (input) Number of columns of matrix A and matrix H
425 * \param k: (input) Number of columns of matrix W and rows of H
426 * \param A: (input) Double precision input matrix of (m * n) number of elements stored using 1D column-major
427 * \param W: (inout) Double precision input/output matrix of (m * k) number of elements stored using 1D column-major
428 * \param H: (inout) Double precision input/output matrix of (k * n) number of elements stored using 1D column-major
429 * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only H is updated) or UpdateH (only W is updated)
430 * \param nIter: (input) Number of iterations
432 * It returns 0 if all is OK.
434 int dbdivone_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
/* NOTE(review): this listing has interior lines elided (opening brace,
   local declarations, the uType selection and the nIter loop headers are
   not visible). Only comments were added/typo-fixed below; every visible
   statement is byte-identical to the original. Two streams overlap the
   cublasDgemv column/row sums (stream2) with the gemm/update chain
   (stream1), ordered via "event". */
458 CUDAERR(cudaGetDevice(&devID));
/* --- Case: W and H are both updated (Phase 1 then Phase 2 per pass) --- */
463 CUDAERR(cudaMalloc((void **)&B, k*max(m,n)*sizeof(double)));
464 CUDAERR(cudaMalloc((void **)&L, m*n*sizeof(double)));
466 /* These two vectors (x and y) are used both in Phase 1 and Phase 2. With */
467 /* the strategy used with matrices B and D the size of x is max(m,n) */
468 CUDAERR(cudaMalloc((void **)&x, max(m,n)*sizeof(double)));
469 CUDAERR(cudaMalloc((void **)&y, k*sizeof(double)));
471 CUDAERR(cudaStreamCreate(&stream1));
472 CUDAERR(cudaStreamCreate(&stream2));
474 CUBLASERR(cublasCreate(&handle1));
475 CUBLASERR(cublasCreate(&handle2));
477 CUDAERR(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
479 CUBLASERR(cublasSetStream(handle1, stream1));
480 CUBLASERR(cublasSetStream(handle2, stream2));
482 /* Why streams? Because we use them before */
483 /* For small sizes without streams is fast (n=m <= 500) */
484 /* For medium sizes with streams is fast (500 <= n=m <= 1000) */
485 /* For other sizes streams is less (aprox. =) than without streams */
486 /* Why this approach? */
487 /* Maybe this is the best way to use less calls to cudaStreamWait */
488 /* due to one cublasDgemv is faster than two cublasDgemm plus one */
489 /* kernel. Call latter cublasDgemv perhaps it is ok but... */
491 /* Stream 1 starts filling x with ones (x[i]=1.0 for all i) */
492 dmemset_cuda(max(m,n), x, 1.0, stream1);
494 /* Stream1 recording event "event" for stream2 (for the 1st time) */
495 CUDAERR(cudaEventRecord(event, stream1));
499 /* ************************ Phase 1 *************************** */
500 /* Stream2 waiting "event" set by stream1. 1st time is always ok */
501 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
503 /* Calculate the sums of all W columns via dgemv(W, x) */
504 CUBLASERR(cublasDgemv(handle2, CUBLAS_OP_T, m, k, &alpha, W, m, x, 1, &beta, y, 1));
506 /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
507 /* CUDAERR(cudaEventRecord(event1, stream2)); */
/* L=W*H on stream1, concurrently with the dgemv above on stream2 */
510 CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
/* L=A(./)L (element-wise) */
513 ddiv_cuda(m*n, A, L, stream1);
/* B=W'*L */
516 CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, L, m, &beta, B, k));
518 /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
519 /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
521 /* B(i,j)=B(i,j) / y(i) for all B elements */
/* ... followed by H=H(.*)B (see header algorithm) */
523 dupdate2H_cuda(k, n, y, B, H, stream1);
525 /* Stream1 recording event "event" for stream2 */
526 CUDAERR(cudaEventRecord(event, stream1));
529 /* ************************** Phase 2 **************************** */
530 /* Stream2 waiting "event" set by stream1 */
531 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
533 /* Calculate the sums of all H rows via dgemv(H, x) is performed */
534 CUBLASERR(cublasDgemv(handle2, CUBLAS_OP_N, k, n, &alpha, H, k, x, 1, &beta, y, 1));
536 /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
537 /* CUDAERR(cudaEventRecord(event1, stream2)); */
540 CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
543 ddiv_cuda(m*n, A, L, stream1);
/* D=L*H' (stored in buffer B, which is sized for both phases) */
546 CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, L, m, H, k, &beta, B, m));
548 /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
549 /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
551 /* B(i,j)=B(i,j)/y(j) for all B elements */
553 dupdate2W_cuda(m, k, y, B, W, stream1);
555 /* Stream1 recording event "event2" for stream2 */
556 CUDAERR(cudaEventRecord(event, stream1));
558 /* maybe not needed but by completeness ... */
559 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
561 CUBLASERR(cublasDestroy(handle1));
562 CUBLASERR(cublasDestroy(handle2));
564 CUDAERR(cudaStreamDestroy(stream1));
565 CUDAERR(cudaStreamDestroy(stream2));
567 CUDAERR(cudaEventDestroy(event));
569 CUDAERR(cudaFree(B));
570 CUDAERR(cudaFree(L));
571 CUDAERR(cudaFree(x));
572 CUDAERR(cudaFree(y));
/* --- Case: only W is updated (Phase 2 only); x only needs n entries --- */
576 CUDAERR(cudaMalloc((void **)&B, m*k*sizeof(double)));
577 CUDAERR(cudaMalloc((void **)&L, m*n*sizeof(double)));
578 CUDAERR(cudaMalloc((void **)&x, n*sizeof(double)));
579 CUDAERR(cudaMalloc((void **)&y, k*sizeof(double)));
581 CUDAERR(cudaStreamCreate(&stream1));
582 CUDAERR(cudaStreamCreate(&stream2));
584 CUBLASERR(cublasCreate(&handle1));
585 CUBLASERR(cublasCreate(&handle2));
587 CUDAERR(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
589 CUBLASERR(cublasSetStream(handle1, stream1));
590 CUBLASERR(cublasSetStream(handle2, stream2));
592 /* Stream 1 starts filling x with ones (x[i]=1.0 for all i) */
593 dmemset_cuda(max(m,n), x, 1.0, stream1);
595 /* Stream1 recording event "event" for stream2 (for the 1st time) */
596 CUDAERR(cudaEventRecord(event, stream1));
600 /* ************************** Phase 2 **************************** */
601 /* Stream2 waiting "event" set by stream1 */
602 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
604 /* Calculate the sums of all H rows via dgemv(H, x) is performed */
605 CUBLASERR(cublasDgemv(handle2, CUBLAS_OP_N, k, n, &alpha, H, k, x, 1, &beta, y, 1));
607 /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
608 /* CUDAERR(cudaEventRecord(event1, stream2)); */
611 CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
614 ddiv_cuda(m*n, A, L, stream1);
617 CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, L, m, H, k, &beta, B, m));
619 /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
620 /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
622 /* B(i,j)=B(i,j)/y(j) for all B elements */
624 dupdate2W_cuda(m, k, y, B, W, stream1);
626 /* Stream1 recording event "event2" for stream2 */
627 CUDAERR(cudaEventRecord(event, stream1));
629 /* maybe not needed but by completeness ... */
630 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
632 CUBLASERR(cublasDestroy(handle1));
633 CUBLASERR(cublasDestroy(handle2));
635 CUDAERR(cudaStreamDestroy(stream1));
636 CUDAERR(cudaStreamDestroy(stream2));
638 CUDAERR(cudaEventDestroy(event));
640 CUDAERR(cudaFree(B));
641 CUDAERR(cudaFree(L));
642 CUDAERR(cudaFree(x));
643 CUDAERR(cudaFree(y));
/* --- Case: only H is updated (Phase 1 only); x only needs m entries --- */
647 CUDAERR(cudaMalloc((void **)&B, n*k*sizeof(double)));
648 CUDAERR(cudaMalloc((void **)&L, m*n*sizeof(double)));
649 CUDAERR(cudaMalloc((void **)&x, m*sizeof(double)));
650 CUDAERR(cudaMalloc((void **)&y, k*sizeof(double)));
652 CUDAERR(cudaStreamCreate(&stream1));
653 CUDAERR(cudaStreamCreate(&stream2));
655 CUBLASERR(cublasCreate(&handle1));
656 CUBLASERR(cublasCreate(&handle2));
658 CUDAERR(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
660 CUBLASERR(cublasSetStream(handle1, stream1));
661 CUBLASERR(cublasSetStream(handle2, stream2));
663 /* Stream 1 starts filling x with ones (x[i]=1.0 for all i) */
664 dmemset_cuda(max(m,n), x, 1.0, stream1);
666 /* Stream1 recording event "event" for stream2 (for the 1st time) */
667 CUDAERR(cudaEventRecord(event, stream1));
671 /* ************************ Phase 1 *************************** */
672 /* Stream2 waiting "event" set by stream1. 1st time is always ok */
673 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
675 /* Calculate the sums of all W columns via dgemv(W, x) */
676 CUBLASERR(cublasDgemv(handle2, CUBLAS_OP_T, m, k, &alpha, W, m, x, 1, &beta, y, 1));
678 /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
679 /* CUDAERR(cudaEventRecord(event1, stream2)); */
682 CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
685 ddiv_cuda(m*n, A, L, stream1);
688 CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, L, m, &beta, B, k));
690 /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
691 /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
693 /* B(i,j)=B(i,j) / y(i) for all B elements */
695 dupdate2H_cuda(k, n, y, B, H, stream1);
697 /* Stream1 recording event "event" for stream2 */
698 CUDAERR(cudaEventRecord(event, stream1));
700 /* maybe not needed but by completeness ... */
701 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
703 CUBLASERR(cublasDestroy(handle1));
704 CUBLASERR(cublasDestroy(handle2));
706 CUDAERR(cudaStreamDestroy(stream1));
707 CUDAERR(cudaStreamDestroy(stream2));
709 CUDAERR(cudaEventDestroy(event));
711 CUDAERR(cudaFree(B));
712 CUDAERR(cudaFree(L));
713 CUDAERR(cudaFree(x));
714 CUDAERR(cudaFree(y));
725 * \fn int sbdivone_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
726 * \brief sbdivone_cuda performs NNMF using betadivergence when beta=1 using simple precision
727 * See description of dbdivone_cuda for more info
728 * \param m: (input) Number of rows of matrix A and matrix W
729 * \param n: (input) Number of columns of matrix A and matrix H
730 * \param k: (input) Number of columns of matrix W and rows of H
731 * \param A: (input) Simple precision input matrix of (m * n) number of elements stored using 1D column-major
732 * \param W: (inout) Simple precision input/output matrix of (m * k) number of elements stored using 1D column-major
733 * \param H: (inout) Simple precision input/output matrix of (k * n) number of elements stored using 1D column-major
734 * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only H is updated) or UpdateH (only W is updated)
735 * \param nIter: (input) Number of iterations
737 * It returns 0 if all is OK.
739 int sbdivone_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
/* NOTE(review): single-precision twin of dbdivone_cuda; same elisions
   apply to this listing (brace, declarations, uType/nIter control flow).
   Only comments were added/typo-fixed; all visible statements are
   unchanged. See dbdivone_cuda for the stream/event rationale. */
763 CUDAERR(cudaGetDevice(&devID));
/* --- Case: W and H are both updated (Phase 1 then Phase 2 per pass) --- */
768 CUDAERR(cudaMalloc((void **)&B, k*max(m,n)*sizeof(float)));
769 CUDAERR(cudaMalloc((void **)&L, m*n*sizeof(float)));
771 /* These two vectors (x and y) are used both in Step 1 and Step 2. With */
772 /* the strategy used with matrices B and D the size of x is max(m,n) */
773 CUDAERR(cudaMalloc((void **)&x, max(m,n)*sizeof(float)));
774 CUDAERR(cudaMalloc((void **)&y, k*sizeof(float)));
776 CUDAERR(cudaStreamCreate(&stream1));
777 CUDAERR(cudaStreamCreate(&stream2));
779 CUBLASERR(cublasCreate(&handle1));
780 CUBLASERR(cublasCreate(&handle2));
782 CUDAERR(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
784 CUBLASERR(cublasSetStream(handle1, stream1));
785 CUBLASERR(cublasSetStream(handle2, stream2));
787 /* Stream 1 starts filling x with ones (x[i]=1.0 for all i) */
788 smemset_cuda(max(m,n), x, 1.0f, stream1);
790 /* Stream1 recording event "event" for stream2 (for the 1st time) */
791 CUDAERR(cudaEventRecord(event, stream1));
795 /* ************************ Phase 1 *************************** */
796 /* Stream2 waiting "event" set by stream1. 1st time is always ok */
797 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
799 /* Calculate the sums of all W columns via dgemv(W, x) */
800 CUBLASERR(cublasSgemv(handle2, CUBLAS_OP_T, m, k, &alpha, W, m, x, 1, &beta, y, 1));
802 /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
803 /* CUDAERR(cudaEventRecord(event1, stream2)); */
/* L=W*H on stream1, concurrently with the sgemv above on stream2 */
806 CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
/* L=A(./)L (element-wise) */
809 sdiv_cuda(m*n, A, L, stream1);
/* B=W'*L */
812 CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, L, m, &beta, B, k));
814 /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
815 /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
817 /* B(i,j)=B(i,j)/y(i) for all B elements */
819 supdate2H_cuda(k, n, y, B, H, stream1);
821 /* Stream1 recording event "event" for stream2 */
822 CUDAERR(cudaEventRecord(event, stream1));
825 /* ************************** Phase 2 **************************** */
826 /* Stream2 waiting "event" set by stream1 */
827 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
829 /* Calculate the sums of all H rows via dgemv(H, x) is performed */
830 CUBLASERR(cublasSgemv(handle2, CUBLAS_OP_N, k, n, &alpha, H, k, x, 1, &beta, y, 1));
832 /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
833 /* CUDAERR(cudaEventRecord(event1, stream2)); */
836 CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
839 sdiv_cuda(m*n, A, L, stream1);
/* D=L*H' (stored in buffer B, which is sized for both phases) */
842 CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, L, m, H, k, &beta, B, m));
844 /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
845 /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
847 /* B(i,j)=B(i,j)/y(j) for all B elements */
849 supdate2W_cuda(m, k, y, B, W, stream1);
851 /* Stream1 recording event "event2" for stream2 */
852 CUDAERR(cudaEventRecord(event, stream1));
854 /* maybe not needed but by completeness ... */
855 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
857 CUBLASERR(cublasDestroy(handle1));
858 CUBLASERR(cublasDestroy(handle2));
860 CUDAERR(cudaStreamDestroy(stream1));
861 CUDAERR(cudaStreamDestroy(stream2));
863 CUDAERR(cudaEventDestroy(event));
865 CUDAERR(cudaFree(B));
866 CUDAERR(cudaFree(L));
867 CUDAERR(cudaFree(x));
868 CUDAERR(cudaFree(y));
/* --- Case: only W is updated (Phase 2 only); x only needs n entries --- */
872 CUDAERR(cudaMalloc((void **)&B, m*k*sizeof(float)));
873 CUDAERR(cudaMalloc((void **)&L, m*n*sizeof(float)));
874 CUDAERR(cudaMalloc((void **)&x, n*sizeof(float)));
875 CUDAERR(cudaMalloc((void **)&y, k*sizeof(float)));
877 CUDAERR(cudaStreamCreate(&stream1));
878 CUDAERR(cudaStreamCreate(&stream2));
880 CUBLASERR(cublasCreate(&handle1));
881 CUBLASERR(cublasCreate(&handle2));
883 CUDAERR(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
885 CUBLASERR(cublasSetStream(handle1, stream1));
886 CUBLASERR(cublasSetStream(handle2, stream2));
888 /* Stream 1 starts filling x with ones (x[i]=1.0 for all i) */
889 smemset_cuda(max(m,n), x, 1.0f, stream1);
891 /* Stream1 recording event "event" for stream2 (for the 1st time) */
892 CUDAERR(cudaEventRecord(event, stream1));
896 /* ************************** Phase 2 **************************** */
897 /* Stream2 waiting "event" set by stream1 */
898 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
900 /* Calculate the sums of all H rows via dgemv(H, x) is performed */
901 CUBLASERR(cublasSgemv(handle2, CUBLAS_OP_N, k, n, &alpha, H, k, x, 1, &beta, y, 1));
903 /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
904 /* CUDAERR(cudaEventRecord(event1, stream2)); */
907 CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
910 sdiv_cuda(m*n, A, L, stream1);
913 CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, L, m, H, k, &beta, B, m));
915 /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
916 /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
918 /* B(i,j)=B(i,j)/y(j) for all B elements */
920 supdate2W_cuda(m, k, y, B, W, stream1);
922 /* Stream1 recording event "event2" for stream2 */
923 CUDAERR(cudaEventRecord(event, stream1));
925 /* maybe not needed but by completeness ... */
926 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
928 CUBLASERR(cublasDestroy(handle1));
929 CUBLASERR(cublasDestroy(handle2));
931 CUDAERR(cudaStreamDestroy(stream1));
932 CUDAERR(cudaStreamDestroy(stream2));
934 CUDAERR(cudaEventDestroy(event));
936 CUDAERR(cudaFree(B));
937 CUDAERR(cudaFree(L));
938 CUDAERR(cudaFree(x));
939 CUDAERR(cudaFree(y));
/* --- Case: only H is updated (Phase 1 only); x only needs m entries --- */
943 CUDAERR(cudaMalloc((void **)&B, n*k*sizeof(float)));
944 CUDAERR(cudaMalloc((void **)&L, m*n*sizeof(float)));
945 CUDAERR(cudaMalloc((void **)&x, m*sizeof(float)));
946 CUDAERR(cudaMalloc((void **)&y, k*sizeof(float)));
948 CUDAERR(cudaStreamCreate(&stream1));
949 CUDAERR(cudaStreamCreate(&stream2));
951 CUBLASERR(cublasCreate(&handle1));
952 CUBLASERR(cublasCreate(&handle2));
954 CUDAERR(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
956 CUBLASERR(cublasSetStream(handle1, stream1));
957 CUBLASERR(cublasSetStream(handle2, stream2));
959 /* Stream 1 starts filling x with ones (x[i]=1.0 for all i) */
960 smemset_cuda(max(m,n), x, 1.0f, stream1);
962 /* Stream1 recording event "event" for stream2 (for the 1st time) */
963 CUDAERR(cudaEventRecord(event, stream1));
967 /* ************************ Phase 1 *************************** */
968 /* Stream2 waiting "event" set by stream1. 1st time is always ok */
969 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
971 /* Calculate the sums of all W columns via dgemv(W, x) */
972 CUBLASERR(cublasSgemv(handle2, CUBLAS_OP_T, m, k, &alpha, W, m, x, 1, &beta, y, 1));
974 /* Stream2 should be record event "event1" for stream2 but it is faster than stream1 */
975 /* CUDAERR(cudaEventRecord(event1, stream2)); */
978 CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, W, m, H, k, &beta, L, m));
981 sdiv_cuda(m*n, A, L, stream1);
984 CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, L, m, &beta, B, k));
986 /* Stream1 should be wait "event1" set by stream2 but stream2 is faster than stream1 */
987 /* CUDAERR(cudaStreamWaitEvent(stream2, even1, 0)); */
989 /* B(i,j)=B(i,j)/y(i) for all B elements */
991 supdate2H_cuda(k, n, y, B, H, stream1);
993 /* Stream1 recording event "event" for stream2 */
994 CUDAERR(cudaEventRecord(event, stream1));
996 /* maybe not needed but by completeness ... */
997 CUDAERR(cudaStreamWaitEvent(stream2, event, 0));
999 CUBLASERR(cublasDestroy(handle1));
1000 CUBLASERR(cublasDestroy(handle2));
1002 CUDAERR(cudaStreamDestroy(stream1));
1003 CUDAERR(cudaStreamDestroy(stream2));
1005 CUDAERR(cudaEventDestroy(event));
1007 CUDAERR(cudaFree(B));
1008 CUDAERR(cudaFree(L));
1009 CUDAERR(cudaFree(x));
1010 CUDAERR(cudaFree(y));
1021 * \fn int dbdiv_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const double beta, const int uType, const int nIter)
1022 * \brief dbdiv_cuda is a wrapper that calls the adequate function to performs NNMF using betadivergence using double precision with GPUs
1023 * \param m: (input) Number of rows of matrix A and matrix W
1024 * \param n: (input) Number of columns of matrix A and matrix H
1025 * \param k: (input) Number of columns of matrix W and rows of H
1026 * \param A: (input) Double precision input matrix of (m * n) number of elements stored using 1D column-major
1027 * \param W: (inout) Double precision input/output matrix of (m * k) number of elements stored using 1D column-major
1028 * \param H: (inout) Double precision input/output matrix of (k * n) number of elements stored using 1D column-major
1029 * \param beta: (input) Double precision value. The parameter beta of betadivergence method
1030 * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only H is updated) or UpdateH (only W is updated)
1031 * \param nIter: (input) Number of iterations
1033 int dbdiv_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const double beta, const int uType, const int nIter)
/* NOTE(review): the opening brace, the error-return body of the
   validation test, and the closing brace are elided in this listing;
   the visible statements are unchanged. */
/* Reject invalid arguments: negative beta or a non-positive iteration count */
1035 if ((beta < 0.0) || (nIter <= 0))
/* (beta>=2.0 && beta<=2.0) is an exact-equality test on beta, presumably
   written this way to avoid float-equality compiler warnings — confirm */
1038 if (beta>=2.0 && beta<=2.0)
/* beta == 2: delegate to the specialized dmlsa_cuda update */
1039 return dmlsa_cuda(m, n, k, A, W, H, uType, nIter);
1042 if (beta>=1.0 && beta<=1.0)
/* beta == 1: delegate to the specialized dbdivone_cuda update */
1043 return dbdivone_cuda(m, n, k, A, W, H, uType, nIter);
/* any other beta: general beta-divergence update */
1045 return dbdivg_cuda(m, n, k, A, W, H, beta, uType, nIter);
1051 * \fn int sbdiv_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const float beta, const int uType, const int nIter)
1052 * \brief sbdiv_cuda is a wrapper that calls the appropriate function to perform NNMF using beta-divergence
1053 * using simple precision with GPUs
1054 * \param m: (input) Number of rows of matrix A and matrix W
1055 * \param n: (input) Number of columns of matrix A and matrix H
1056 * \param k: (input) Number of columns of matrix W and rows of H
1057 * \param A: (input) Simple precision input matrix of (m * n) number of elements stored using 1D column-major
1058 * \param W: (inout) Simple precision input/output matrix of (m * k) number of elements stored using 1D column-major
1059 * \param H: (inout) Simple precision input/output matrix of (k * n) number of elements stored using 1D column-major
1060 * \param beta: (input) Simple precision value. The parameter beta of betadivergence method
1061 * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only H is updated) or UpdateH (only W is updated)
1062 * \param nIter: (input) Number of iterations
1064 int sbdiv_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const float beta, const int uType, const int nIter)
/* Argument validation: beta must be non-negative and at least one iteration is required */
1066 if ((beta < 0.0f) || (nIter <= 0))
/* Equality tests below are written as (>= && <=), presumably to avoid float-equality compiler warnings */
/* beta == 2: dispatch to the specialized beta=2 routine (see smlsa_cuda) */
1069 if (beta>=2.0f && beta<=2.0f)
1070 return smlsa_cuda(m, n, k, A, W, H, uType, nIter);
/* beta == 1: dispatch to the specialized beta=1 routine (see sbdivone_cuda) */
1073 if (beta>=1.0f && beta<=1.0f)
1074 return sbdivone_cuda(m, n, k, A, W, H, uType, nIter);
/* Any other beta: general beta-divergence algorithm */
1076 return sbdivg_cuda(m, n, k, A, W, H, beta, uType, nIter);
1082 * \fn void dupdate1H_cuda(const int n, const double *X, double *H, cudaStream_t stream)
1083 * \brief It calls cuda kernel vdupdate1H_cuda that performs H=H(.*)B(./)C where matrices
1084 * B and C are stored in buffer X by columns. All B 1st and after all C
1085 * \param n: (input) Number of elements of H
1086 * \param X: (input) Double precision input buffer (1D column-major)
1087 * \param H: (inout) Double precision input/output matrix (1D column-major)
1088 * \param stream: (input) ID of the stream to use
1090 void dupdate1H_cuda(const int n, const double *X, double *H, cudaStream_t stream)
1092 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1094 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all n elements of H */
1100 dimGrid.x = (n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1101 vdupdate1H_cuda<<<dimGrid, dimBlock, 0 , stream>>>(n, X, H);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams, defeating stream overlap; presumably part of an error-checking build -- confirm it is intended in production */
1103 cudaDeviceSynchronize();
1109 * \fn void supdate1H_cuda(const int n, const float *X, float *H, cudaStream_t stream)
1110 * \brief It calls cuda kernel vsupdate1H_cuda that performs H=H(.*)B(./)C where matrices
1111 * B and C are stored in buffer X by columns. All B 1st and after all C
1112 * \param n: (input) Number of elements of H
1113 * \param X: (input) Simple precision input buffer (1D column-major)
1114 * \param H: (inout) Simple precision input/output matrix (1D column-major)
1115 * \param stream: (input) ID of the stream to use
1117 void supdate1H_cuda(const int n, const float *X, float *H, cudaStream_t stream)
1119 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1121 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all n elements of H */
1127 dimGrid.x = (n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1128 vsupdate1H_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, X, H);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams, defeating stream overlap; presumably part of an error-checking build -- confirm it is intended in production */
1130 cudaDeviceSynchronize();
1136 * \fn void dupdate1W_cuda(const int m, const int n, const double *X, double *W, cudaStream_t stream)
1137 * \brief It calls kernel vdupdate1W_cuda that performs W=W(.*)D(./)E where matrices
1138 * D and E are stored in buffer X according beta-divergence general case (see dbdivg_cuda(...))
1139 * \param m: (input) Number of rows of W
1140 * \param n: (input) Number of colums of W
1141 * \param X: (input) Double precision input buffer (1D column-major)
1142 * \param W: (inout) Double precision input/output matrix (1D column-major)
1143 * \param stream: (input) ID of the stream to use
1145 void dupdate1W_cuda(const int m, const int n, const double *X, double *W, cudaStream_t stream)
1147 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1149 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements of W */
1155 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1156 vdupdate1W_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, X, W);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1158 cudaDeviceSynchronize();
1164 * \fn void supdate1W_cuda(const int m, const int n, const float *X, float *W, cudaStream_t stream)
1165 * \brief It calls kernel vsupdate1W_cuda that performs W=W(.*)D(./)E where matrices
1166 * D and E are stored in buffer X according beta-divergence general case (see sbdivg_cuda(...))
1167 * \param m: (input) Number of rows of W
1168 * \param n: (input) Number of colums of W
1169 * \param X: (input) Simple precision input buffer (1D column-major)
1170 * \param W: (inout) Simple precision input/output matrix (1D column-major)
1171 * \param stream: (input) ID of the stream to use
1173 void supdate1W_cuda(const int m, const int n, const float *X, float *W, cudaStream_t stream)
1175 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1177 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements of W */
1183 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1184 vsupdate1W_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, X, W);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1186 cudaDeviceSynchronize();
1192 * \fn void dupdate2H_cuda(const int m, const int n, const double *x, const double *B, double *H, cudaStream_t stream)
1193 * \brief It calls kernel vdupdate2H_cuda that performs H(i)=H(i)*(B(i)/x(j))
1194 * \param m: (input) Number of rows of B and H, and number of elements of vector x
1195 * \param n: (input) Number of columns of B and A
1196 * \param x: (input) Double precision vector with the sum of W columns
1197 * \param B: (input) Double precision input matrix (1D column-major)
1198 * \param H: (inout) Double precision input/output matrix (1D column-major)
1199 * \param stream: (input) ID of the stream to use
1201 void dupdate2H_cuda(const int m, const int n, const double *x, const double *B, double *H, cudaStream_t stream)
1203 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1205 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements of H */
1211 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1212 vdupdate2H_cuda<<<dimGrid, dimBlock, 0 , stream>>>(m, n, x, B, H);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1214 cudaDeviceSynchronize();
1220 * \fn void supdate2H_cuda(const int m, const int n, const float *x, const float *B, float *H, cudaStream_t stream)
1221 * \brief It calls kernel vsupdate2H_cuda that performs H(i)=H(i)*(B(i)/x(j))
1222 * \param m: (input) Number of rows of B and H, and number of elements of vector x
1223 * \param n: (input) Number of columns of B and A
1224 * \param x: (input) Simple precision vector with the sum of W columns
1225 * \param B: (input) Simple precision input matrix (1D column-major)
1226 * \param H: (inout) Simple precision input/output matrix (1D column-major)
1227 * \param stream: (input) ID of the stream to use
1229 void supdate2H_cuda(const int m, const int n, const float *x, const float *B, float *H, cudaStream_t stream)
1231 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1233 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements of H */
1239 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1240 vsupdate2H_cuda<<<dimGrid, dimBlock, 0 , stream>>>(m, n, x, B, H);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1242 cudaDeviceSynchronize();
1248 * \fn void dupdate2W_cuda(const int m, const int n, const double *x, const double *B, double *W, cudaStream_t stream)
1249 * \brief It calls kernel vdupdate2W_cuda that performs W(i)=W(i)*(B(i)/x(j))
1250 * \param m: (input) Number of rows of W and B,
1251 * \param n: (input) Number of columns of W and B, and number of elements of vector x
1252 * \param x: (input) Double precision vector with the sum of H rows
1253 * \param B: (input) Double precision input matrix (1D column-major)
1254 * \param W: (inout) Double precision input/output matrix (1D column-major)
1255 * \param stream: (input) ID of the stream to use
1257 void dupdate2W_cuda(const int m, const int n, const double *x, const double *B, double *W, cudaStream_t stream)
1259 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1261 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements of W */
1267 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1268 vdupdate2W_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, x, B, W);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1270 cudaDeviceSynchronize();
1276 * \fn void supdate2W_cuda(const int m, const int n, const float *x, const float *B, float *W, cudaStream_t stream)
1277 * \brief It calls kernel vsupdate2W_cuda that performs W(i)=W(i)*(B(i)/x(j))
1278 * \param m: (input) Number of rows of W and B,
1279 * \param n: (input) Number of columns of W and B, and number of elements of vector x
1280 * \param x: (input) Simple precision vector with the sum of H rows
1281 * \param B: (input) Simple precision input matrix (1D column-major)
1282 * \param W: (inout) Simple precision input/output matrix (1D column-major)
1283 * \param stream: (input) ID of the stream to use
1285 void supdate2W_cuda(const int m, const int n, const float *x, const float *B, float *W, cudaStream_t stream)
1287 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1289 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements of W */
1295 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1296 vsupdate2W_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, x, B, W);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1298 cudaDeviceSynchronize();
1304 * \fn void dkernelH_cuda(const int m, const int n, const double *L, const double *A, double *R, const double expo, cudaStream_t stream)
1305 * \brief It calls kernel vdkernelH_cuda that performs R(i)=((L(i)^expo)*A(i))*L(i)
1306 * Note expo is a real number expo < 0 or expo > 0
1307 * \param m: (input) Number of rows of L, A and R matrices
1308 * \param n: (input) Number of columns of L, A and R matrices
1309 * \param L: (input) Double precision input matrix (1D column-major)
1310 * \param A: (input) Double precision input matrix (1D column-major)
1311 * \param R: (output) Double precision output matrix (1D column-major)
1312 * \param expo: (input) the "power of" for function pow(). It is a double precision value
1313 * \param stream: (input) ID of the stream to use
1315 void dkernelH_cuda(const int m, const int n, const double *L, const double *A, double *R, const double expo, cudaStream_t stream)
1317 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1319 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements */
1325 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1326 vdkernelH_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, L, A, R, expo);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1328 cudaDeviceSynchronize();
1334 * \fn void skernelH_cuda(const int m, const int n, const float *L, const float *A, float *R, const float expo, cudaStream_t stream)
1335 * \brief It calls kernel vskernelH_cuda that performs R(i)=((L(i)^expo)*A(i))*L(i)
1336 * Note expo is a real number expo < 0 or expo > 0
1337 * \param m: (input) Number of rows of L, A and R matrices
1338 * \param n: (input) Number of columns of L, A and R matrices
1339 * \param L: (input) Simple precision input matrix (1D column-major)
1340 * \param A: (input) Simple precision input matrix (1D column-major)
1341 * \param R: (output) Simple precision output matrix (1D column-major)
1342 * \param expo: (input) the "power of" for function pow(). It is a simple precision value
1343 * \param stream: (input) ID of the stream to use
1345 void skernelH_cuda(const int m, const int n, const float *L, const float *A, float *R, const float expo, cudaStream_t stream)
1347 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1349 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements */
1355 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1356 vskernelH_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, L, A, R, expo);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1358 cudaDeviceSynchronize();
1364 * \fn void dkernelW_cuda(const int m, const int n, const double *L, const double *A, double *R, const double expo, cudaStream_t stream)
1365 * \brief It calls kernel vdkernelW_cuda that performs R(i)=((L(i)^expo)*A(i))*L(i)
1366 * \param m: (input) Number of rows of L, A and R matrices
1367 * \param n: (input) Number of columns of L, A and R matrices
1368 * \param L: (input) Double precision input matrix (1D column-major)
1369 * \param A: (input) Double precision input matrix (1D column-major)
1370 * \param R: (output) Double precision output matrix (1D column-major)
1371 * \param expo: (input) the "power of" for function pow(). It is a double precision value
1372 * \param stream: (input) ID of the stream to use
1374 void dkernelW_cuda(const int m, const int n, const double *L, const double *A, double *R, const double expo, cudaStream_t stream)
1376 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1378 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements */
1384 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1385 vdkernelW_cuda<<<dimGrid, dimBlock, 0 ,stream>>>(m, n, L, A, R, expo);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1387 cudaDeviceSynchronize();
1393 * \fn void skernelW_cuda(const int m, const int n, const float *L, const float *A, float *R, const float expo, cudaStream_t stream)
1394 * \brief It calls kernel vskernelW_cuda that performs R(i)=((L(i)^expo)*A(i))*L(i)
1395 * \param m: (input) Number of rows of L, A and R matrices
1396 * \param n: (input) Number of columns of L, A and R matrices
1397 * \param L: (input) Simple precision input matrix (1D column-major)
1398 * \param A: (input) Simple precision input matrix (1D column-major)
1399 * \param R: (output) Simple precision output matrix (1D column-major)
1400 * \param expo: (input) the "power of" for function pow(). It is a simple precision value
1401 * \param stream: (input) ID of the stream to use
1403 void skernelW_cuda(const int m, const int n, const float *L, const float *A, float *R, const float expo, cudaStream_t stream)
1405 dim3 dimGrid, dimBlock;
/* Block size is selected per compute capability (the alternative branch is set elsewhere) */
1407 #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
/* Ceil-division: enough 1-D blocks to cover all m*n elements */
1413 dimGrid.x = (m*n + dimBlock.x -1) / dimBlock.x;
/* Asynchronous launch on the caller-supplied stream */
1414 vskernelW_cuda<<<dimGrid, dimBlock, 0, stream>>>(m, n, L, A, R, expo);
/* NOTE(review): cudaDeviceSynchronize() blocks the host on ALL streams; presumably for error checking -- confirm */
1416 cudaDeviceSynchronize();
1423 * \fn __global__ void vdkernelH_cuda(const int m, const int n, const double* __restrict__ L, const double* __restrict__ A, double *R, const double expo)
1424 * \brief This kernel performs double precision R(i)=(L(i)^expo)*A[i] and R(i+m*n)=L[i]*(L(i)^expo)
1425 * Note expo is a real number expo < 0 or expo > 0
1426 * \param m: (input) Number of rows of L, A and R matrices
1427 * \param n: (input) Number of columns of L, A and R matrices
1428 * \param L: (input) Double precision input matrix (1D column-major)
1429 * \param A: (input) Double precision input matrix (1D column-major)
1430 * \param R: (output) Double precision output matrix (1D column-major)
1431 * \param expo: (input) the "power of" for function pow(). It is a double precision value
1433 __global__ void vdkernelH_cuda(const int m, const int n, const double* __restrict__ L, const double* __restrict__ A, double *R, const double expo)
/* Flat 1-D global thread index: one thread per element of L and A */
1436 pos =blockDim.x * blockIdx.x + threadIdx.x,
/* L(pos) == 0: write zeros to both halves of R, avoiding pow(0, expo) -> Inf when expo < 0 */
1447 /* if (dtmp1>=0.0 && dtmp1<=0.0) */
1448 R[pos] = R[pos+size] = 0.0;
/* dtmp2 = L(pos)^expo */
1451 dtmp2 = pow(dtmp1, expo);
1453 /* special-casing A[i]==0.0 gave no measurable improvement, so we do not test for it */
1454 R[pos] =dtmp2*A[pos];
/* Second half of R holds L(pos)^(expo+1) (i.e. L * L^expo) */
1455 R[pos+size]=dtmp1*dtmp2;
1461 * \fn __global__ void vskernelH_cuda(const int m, const int n, const float* __restrict__ L, const float* __restrict__ A, float *R, const float expo)
1462 * \brief This kernel computes simple precision R(i)=(L(i)^expo)*A[i] and R(i+m*n)=L[i]*(L(i)^expo)
1463 * Note expo is a real number expo < 0 or expo > 0
1464 * \param m: (input) Number of rows of L, A and R matrices
1465 * \param n: (input) Number of columns of L, A and R matrices
1466 * \param L: (input) Simple precision input matrix (1D column-major)
1467 * \param A: (input) Simple precision input matrix (1D column-major)
1468 * \param R: (output) Simple precision output matrix (1D column-major)
1469 * \param expo: (input) the "power of" for function pow(). It is a simple precision value
1471 __global__ void vskernelH_cuda(const int m, const int n, const float* __restrict__ L, const float* __restrict__ A, float *R, const float expo)
/* Flat 1-D global thread index: one thread per element of L and A */
1474 pos =blockDim.x * blockIdx.x + threadIdx.x,
/* L(pos) == 0: write zeros to both halves of R, avoiding powf(0, expo) -> Inf when expo < 0 */
1485 /* if (ftmp1>=0.0f && ftmp1<=0.0f) */
1486 R[pos] = R[pos+size] = 0.0f;
/* ftmp2 = L(pos)^expo */
1489 ftmp2 = powf(ftmp1, expo);
1491 /* special-casing A[i]==0.0 gave no measurable improvement, so we do not test for it */
1492 R[pos] =ftmp2*A[pos];
/* Second half of R holds L(pos)^(expo+1) (i.e. L * L^expo) */
1493 R[pos+size]=ftmp1*ftmp2;
1500 * \fn __global__ void vdkernelW_cuda(const int m, const int n, const double* __restrict__ L, const double* __restrict__ A, double *R, const double expo)
1501 * \brief This kernel computes double precision R(pos)=(L(i)^expo)*A(i) and R(pos+m)=L(i)*(L(i)^expo)
1502 * Note expo is a real number expo < 0 or expo > 0
1503 * \param m: (input) Number of rows of L, A and R matrices
1504 * \param n: (input) Number of columns of L, A and R matrices
1505 * \param L: (input) Double precision input matrix (1D column-major)
1506 * \param A: (input) Double precision input matrix (1D column-major)
1507 * \param R: (output) Double precision output matrix (1D column-major)
1508 * \param expo: (input) the "power of" for function pow(). It is a double precision value
1510 __global__ void vdkernelW_cuda(const int m, const int n, const double* __restrict__ L, const double* __restrict__ A, double *R, const double expo)
/* Flat 1-D global thread index: one thread per element of L and A */
1513 pos=blockDim.x * blockIdx.x + threadIdx.x,
/* Interleaved layout: column j of the input maps to column 2j of R, so the two m-row results for each column sit side by side */
1521 newpos = 2*m*(pos/m)+(pos%m);
1525 /* if (dtmp1>=0.0 && dtmp1<=0.0) */
/* NOTE(review): this zero branch writes R[pos]/R[pos+m] while the nonzero branch below writes R[newpos]/R[newpos+m]; for any column j > 0 these address different elements -- looks like it should be R[newpos] = R[newpos+m] = 0.0. Confirm against vdkernelH_cuda. */
1526 R[pos] = R[pos+m] = 0.0;
/* dtmp2 = L(pos)^expo; zero case handled above to avoid pow(0, expo) -> Inf when expo < 0 */
1529 dtmp2 = pow(dtmp1, expo);
1531 R[newpos] = dtmp2*A[pos];
/* Second result: L(pos)^(expo+1) (i.e. L * L^expo) */
1532 R[newpos+m]= dtmp1*dtmp2;
1539 * \fn __global__ void vskernelW_cuda(const int m, const int n, const float* __restrict__ L, const float* __restrict__ A, float *R, const float expo)
1540 * \brief This kernel computes simple precision R(pos)=(L(i)^expo)*A(i) and R(pos+m)=L(i)*(L(i)^expo)
1541 * Note expo is a real number expo < 0 or expo > 0
1542 * \param m: (input) Number of rows of L, A and R matrices
1543 * \param n: (input) Number of columns of L, A and R matrices
1544 * \param L: (input) Simple precision input matrix (1D column-major)
1545 * \param A: (input) Simple precision input matrix (1D column-major)
1546 * \param R: (output) Simple precision output matrix (1D column-major)
1547 * \param expo: (input) the "power of" for function pow(). It is a simple precision value
1549 __global__ void vskernelW_cuda(const int m, const int n, const float* __restrict__ L, const float* __restrict__ A, float *R, const float expo)
/* Flat 1-D global thread index: one thread per element of L and A */
1552 pos=blockDim.x * blockIdx.x + threadIdx.x,
/* Interleaved layout: column j of the input maps to column 2j of R, so the two m-row results for each column sit side by side */
1560 newpos = 2*m*(pos/m)+(pos%m);
1564 /* if (ftmp1>=0.0f && ftmp1<=0.0f) */
/* NOTE(review): this zero branch writes R[pos]/R[pos+m] while the nonzero branch below writes R[newpos]/R[newpos+m]; for any column j > 0 these address different elements -- looks like it should be R[newpos] = R[newpos+m] = 0.0f. Confirm against vskernelH_cuda. */
1565 R[pos] = R[pos+m] = 0.0f;
/* ftmp2 = L(pos)^expo; zero case handled above to avoid powf(0, expo) -> Inf when expo < 0 */
1568 ftmp2 = powf(ftmp1, expo);
1570 R[newpos] = ftmp2*A[pos];
/* Second result: L(pos)^(expo+1) (i.e. L * L^expo) */
1571 R[newpos+m]= ftmp1*ftmp2;
1578 * \fn __global__ void vdupdate1H_cuda(const int n, const double* __restrict__ X, double *H)
1579 * \brief This kernel computes double precision H(i)=H(i)*B(i)/C(i) where matrices
1580 * B and C are stored in the same buffer (called X). All B 1st and after C
1581 * \param n: (input) Number of elements of H
1582 * \param X: (input) Double precision input matrix (1D column-major)
1583 * \param H: (inout) Double precision input/output matrix (1D column-major)
1585 __global__ void vdupdate1H_cuda(const int n, const double* __restrict__ X, double *H)
1586 /* Using "const double* __restrict__ A" and then "...=A[i]" is */
1587 /* equivalent to "double *A" plus "...=__ldg(&A[i])", but the __ldg() */
1588 /* intrinsic is only available on compute capability 3.5 (or newer). */
1589 /* Either way the loads go through the read-only cache. */
/* Flat 1-D global thread index: one thread per element of H */
1591 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
1595 /* Here we can have NaN and Inf if X(pos+n) and/or H(pos) and/or X(pos)=0 */
/* H = H .* B ./ C, with B in X[0..n) and C in X[n..2n) */
1596 H[pos] = H[pos] * X[pos] / X[pos+n];
/* Checked build: trap NaN/Inf right after the update */
1597 assert(!fpe(H[pos]));
/* Same update without the check (the two copies are presumably selected by conditional compilation -- confirm) */
1599 H[pos] = H[pos] * X[pos] / X[pos+n];
1605 * \fn __global__ void vsupdate1H_cuda(const int n, const float* __restrict__ X, float *H)
1606 * \brief This kernel computes simple precision H(i)=H(i)*B(i)/C(i) where matrices
1607 * B and C are stored in the same buffer (called X). All B 1st and after C
1608 * \param n: (input) Number of elements of H
1609 * \param X: (input) Simple precision input matrix (1D column-major)
1610 * \param H: (inout) Simple precision input/output matrix (1D column-major)
1612 __global__ void vsupdate1H_cuda(const int n, const float* __restrict__ X, float *H)
/* Flat 1-D global thread index: one thread per element of H */
1614 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
1618 /* Here we can have NaN and Inf if X(pos+n) and/or H(pos) and/or X(pos)=0 */
/* H = H .* B ./ C, with B in X[0..n) and C in X[n..2n) */
1619 H[pos] = H[pos] * X[pos] / X[pos+n];
/* Checked build: trap NaN/Inf right after the update */
1620 assert(!fpe(H[pos]));
/* Same update without the check (the two copies are presumably selected by conditional compilation -- confirm) */
1622 H[pos] = H[pos] * X[pos] / X[pos+n];
1628 * \fn __global__ void vdupdate1W_cuda(const int m, const int n, const double* __restrict__ X, double *W)
1629 * \brief This kernel computes double precision W[i]=W[i]*D[i]/E[i] where matrices D and E are
1630 * stored in the same buffer (called X) according beta-divergence general case (see dbdivg_cuda(...))
1631 * \param m: (input) Number of rows of W
1632 * \param n: (input) Number of columns of W
1633 * \param X: (input) Double precision input matrix (1D column-major)
1634 * \param W: (inout) Double precision input/output matrix (1D column-major)
1636 __global__ void vdupdate1W_cuda(const int m, const int n, const double* __restrict__ X, double *W)
/* Flat 1-D global thread index: one thread per element of W */
1639 pos=blockDim.x * blockIdx.x + threadIdx.x,
/* Map W's column j to column 2j of the interleaved buffer X (D and E columns alternate) */
1644 newpos = pos+(pos/m)*m;
/* W = W .* D ./ E; NaN/Inf possible if X(newpos+m), W(pos) and/or X(newpos) = 0 */
1646 W[pos] = W[pos] * X[newpos] / X[newpos+m];
/* Checked build: trap NaN/Inf right after the update */
1647 assert(!fpe(W[pos]));
/* Same update without the check (the two copies are presumably selected by conditional compilation -- confirm) */
1649 W[pos] = W[pos] * X[newpos] / X[newpos+m];
1656 * \fn __global__ void vsupdate1W_cuda(const int m, const int n, const float* __restrict__ X, float *W)
1657 * \brief This kernel computes simple precision W[i]=W[i]*D[i]/E[i] where matrices D and E are
1658 * stored in the same buffer (called X) according beta-divergence general case (see dbdivg_cuda(...))
1659 * \param m: (input) Number of rows of W
1660 * \param n: (input) Number of columns of W
1661 * \param X: (input) Simple precision input matrix (1D column-major)
1662 * \param W: (inout) Simple precision input/output matrix (1D column-major)
1664 __global__ void vsupdate1W_cuda(const int m, const int n, const float* __restrict__ X, float *W)
/* Flat 1-D global thread index: one thread per element of W */
1667 pos=blockDim.x * blockIdx.x + threadIdx.x,
/* Map W's column j to column 2j of the interleaved buffer X (D and E columns alternate) */
1672 newpos = pos+(pos/m)*m;
1674 /* Here we can have NaN and Inf if X(newpos+m) and/or W(pos) and/or X(newpos)=0 */
1675 W[pos] = W[pos] * X[newpos] / X[newpos+m];
/* Checked build: trap NaN/Inf right after the update */
1676 assert(!fpe(W[pos]));
/* Same update without the check (the two copies are presumably selected by conditional compilation -- confirm) */
1678 W[pos] = W[pos] * X[newpos] / X[newpos+m];
1685 * \fn __global__ void vdupdate2H_cuda(const int m, const int n, const double* __restrict__ y, const double* __restrict__ B, double *H)
1686 * \brief This kernel computes double precision H(i)=H(i)*(B(i)/y(j))
1687 * \param m: (input) Number of rows of B and H, and number of elements of vector y
1688 * \param n: (input) Number of columns of B and A
1689 * \param y: (input) Double precision vector with the sum of W columns
1690 * \param B: (input) Double precision input matrix (1D column-major)
1691 * \param H: (inout) Double precision input/output matrix (1D column-major)
1693 __global__ void vdupdate2H_cuda(const int m, const int n, const double* __restrict__ y, const double* __restrict__ B, double *H)
/* Flat 1-D global thread index: one thread per element of H */
1695 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
1699 /* Here we can have NaN and Inf if y(pos%m) and/or H(pos) and/or B(pos)=0 */
/* pos%m is the row index: every element of row i is divided by y(i) */
1700 H[pos] = H[pos] * (B[pos] / y[pos%m]);
/* Checked build: trap NaN/Inf right after the update */
1701 assert(!fpe(H[pos]));
/* Same update without the check (the two copies are presumably selected by conditional compilation -- confirm) */
1703 H[pos] = H[pos] * (B[pos] / y[pos%m]);
1709 * \fn __global__ void vsupdate2H_cuda(const int m, const int n, const float* __restrict__ y, const float* __restrict__ B, float *H)
1710 * \brief This kernel performs the simple H(i)=H(i)*(B(i)/y(j))
1711 * \param m: (input) Number of rows of B and H, and number of elements of vector y
1712 * \param n: (input) Number of columns of B and A
1713 * \param y: (input) Simple precision vector with the sum of W columns
1714 * \param B: (input) Simple precision input matrix (1D column-major)
1715 * \param H: (inout) Simple precision input/output matrix (1D column-major)
1717 __global__ void vsupdate2H_cuda(const int m, const int n, const float* __restrict__ y, const float* __restrict__ B, float *H)
/* Flat 1-D global thread index: one thread per element of H */
1719 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
1723 /* Here we can have NaN and Inf if y(pos%m) and/or H(pos) and/or B(pos)=0 */
/* pos%m is the row index: every element of row i is divided by y(i) */
1724 H[pos] = H[pos] * (B[pos] / y[pos%m]);
/* Checked build: trap NaN/Inf right after the update */
1725 assert(!fpe(H[pos]));
/* Same update without the check (the two copies are presumably selected by conditional compilation -- confirm) */
1727 H[pos] = H[pos] * (B[pos] / y[pos%m]);
1733 * \fn __global__ void vdupdate2W_cuda(const int m, const int n, const double* __restrict__ y, const double* __restrict__ B, double *W)
1734 * \brief This kernel performs double precision W(i)=W(i)*(B(i)/y(j))
1735 * \param m: (input) Number of rows of W and B,
1736 * \param n: (input) Number of columns of W and B, and number of elements of vector y
1737 * \param y: (input) Double precision vector with the sum of H rows
1738 * \param B: (input) Double precision input matrix (1D column-major)
1739 * \param W: (inout) Double precision input/output matrix (1D column-major)
1741 __global__ void vdupdate2W_cuda(const int m, const int n, const double* __restrict__ y, const double* __restrict__ B, double *W)
/* Flat 1-D global thread index: one thread per element of W */
1743 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
1747 /* Here we can have NaN and Inf if y(pos/m) and/or W(pos) and/or B(pos)=0 */
/* pos/m is the column index: every element of column j is divided by y(j) */
1748 W[pos] = W[pos] * (B[pos] / y[pos/m]);
/* Checked build: trap NaN/Inf right after the update */
1749 assert(!fpe(W[pos]));
/* Same update without the check (the two copies are presumably selected by conditional compilation -- confirm) */
1751 W[pos] = W[pos] * (B[pos] / y[pos/m]);
1757 * \fn __global__ void vsupdate2W_cuda(const int m, const int n, const float* __restrict__ y, const float* __restrict__ B, float *W)
1758 * \brief This kernel computes simple precision W(i)=W(i)*(B(i)/y(j))
1759 * \param m: (input) Number of rows of W and B,
1760 * \param n: (input) Number of columns of W and B, and number of elements of vector y
1761 * \param y: (input) Simple precision vector with the sum of H rows
1762 * \param B: (input) Simple precision input matrix (1D column-major)
1763 * \param W: (inout) Simple precision input/output matrix (1D column-major)
1765 __global__ void vsupdate2W_cuda(const int m, const int n, const float* __restrict__ y, const float* __restrict__ B, float *W)
1767 unsigned int pos=blockDim.x * blockIdx.x + threadIdx.x;
1771 /* Here we can have NaN and Inf if y(pos/m) and/or W(pos) and/or B(pos)=0 */
1772 W[pos] = W[pos] * (B[pos] / y[pos/m]);
1773 assert(!fpe(W[pos]));
1775 W[pos] = W[pos] * (B[pos] / y[pos/m]);