NnmfPack  2.1
mlsa_cuda.cu
/***************************************************************************
 *  Copyright (C) 2014 by PIR (University of Oviedo) and
 *  INCO2 (Polytechnic University of Valencia) groups.
 *  nnmfpack@gmail.com
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the
 *  Free Software Foundation, Inc.,
 *  59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 ***************************************************************************
*/
/**
 * \file mlsa_cuda.cu
 * \brief File with functions to calculate NNMF using the MLSA method on GPUs
 * \author Information Retrieval and Parallel Computing Group (IRPCG)
 * \author University of Oviedo, Spain
 * \author Interdisciplinary Computation and Communication Group (INCO2)
 * \author Universitat Politecnica de Valencia, Spain.
 * \author Contact: nnmfpack@gmail.com
 * \date 04/11/14
*/
#include "mlsa_cuda.h"


/**
 * \fn int dmlsa_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
 * \brief This function performs NNMF using the beta-divergence with beta=2, in double precision.
 *
 * The algorithm is<BR>
 * &nbsp;&nbsp;repeat nIter times<BR>
 * &nbsp;&nbsp;&nbsp;&nbsp;STEP 1<BR>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;L=W*H<BR>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;B=W'*A<BR>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;C=W'*L<BR>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;H=H(.*)B(./)C<BR>
 *
 * &nbsp;&nbsp;&nbsp;&nbsp;STEP 2<BR>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;L=W*H<BR>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;D=A*H'<BR>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;E=L*H'<BR>
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;W=W(.*)D(./)E<BR>
 * &nbsp;&nbsp;end repeat<BR>
 * End algorithm<BR>
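 *
 * Written in matrix form, with \f$\odot\f$ the element-wise product and the
 * fraction bars read as element-wise divisions, one iteration applies the
 * multiplicative updates
 * \f[
 *    H \leftarrow H \odot \frac{W^{T}A}{W^{T}W\,H}, \qquad
 *    W \leftarrow W \odot \frac{A\,H^{T}}{W\,H\,H^{T}}.
 * \f]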
 *
 * To save some FLOPs and memory, a modified Lee and Seung formulation is used:
 * Step 1 becomes L=W'*W, C=L*H and B=W'*A; Step 2 becomes L=H*H', E=W*L and
 * D=A*H'. W'*W (respectively H*H') and L*H (W*L) can be computed in parallel
 * with W'*A (A*H'), which is why streams are used here.
 *
 * In practice B and C are (k*n) matrices used in STEP 1, and D and E are (m*k)
 * matrices used in STEP 2. B/C and D/E are never needed at the same time, so
 * only two matrices are declared to save space: B and C, each of size
 * max(m,n)*k.
 *
 * \param m: (input) Number of rows of matrix A and matrix W
 * \param n: (input) Number of columns of matrix A and matrix H
 * \param k: (input) Number of columns of matrix W and rows of H
 * \param A: (input) Double precision input matrix of (m * n) elements stored in 1D column-major order
 * \param W: (inout) Double precision input/output matrix of (m * k) elements stored in 1D column-major order
 * \param H: (inout) Double precision input/output matrix of (k * n) elements stored in 1D column-major order
 * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only W is updated) or UpdateH (only H is updated)
 * \param nIter: (input) Number of iterations
 *
 * It returns 0 on success, or -1 if uType is not a valid option.
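 *
 * A minimal, illustrative host-side sketch (not part of the library): the
 * device buffers dA, dW and dH are assumed to be already created with
 * cudaMalloc, sized m*n, m*k and k*n doubles, stored in column-major order
 * and filled with nonnegative values (W and H with the initial guesses).
 * \code
 *   // 100 multiplicative iterations updating both factors
 *   if (dmlsa_cuda(m, n, k, dA, dW, dH, UpdateAll, 100) != 0)
 *      fprintf(stderr, "dmlsa_cuda failed\n");
 *   // dW and dH now hold the updated factors of dA
 * \endcode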
*/
int dmlsa_cuda(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
{
   double
      *B=NULL,
      *C=NULL,
      *L=NULL,
      alpha=1.0,
      beta =0.0;

   int
      i, devID;

   cublasHandle_t
      handle1,
      handle2;

   cudaStream_t
      stream1,
      stream2;

   cudaEvent_t
      event1,
      event2;

   CUDAERR(cudaGetDevice(&devID));

   switch (uType)
   {
      case UpdateAll:
         CUDAERR(cudaMalloc((void **)&B, max(m,n)*k*sizeof(double)));
         CUDAERR(cudaMalloc((void **)&C, max(m,n)*k*sizeof(double)));
         CUDAERR(cudaMalloc((void **)&L, k*k*sizeof(double)));

         CUDAERR(cudaStreamCreate(&stream1));
         CUDAERR(cudaStreamCreate(&stream2));

         CUBLASERR(cublasCreate(&handle1));
         CUBLASERR(cublasCreate(&handle2));

         CUDAERR(cudaEventCreateWithFlags(&event1, cudaEventDisableTiming));
         CUDAERR(cudaEventCreateWithFlags(&event2, cudaEventDisableTiming));

         CUBLASERR(cublasSetStream(handle1, stream1));
         CUBLASERR(cublasSetStream(handle2, stream2));

         /* Why streams? Today the overlap may not pay off, but future */
         /* CUBLAS releases may take better advantage of it. */
         /* Why this arrangement of the streams? */
         /* Let m=n and k=n/2. Then B=W'*A takes n^3 flops, while W'*W and */
         /* L*H take (n^3)/2 flops each. That is, stream 2 runs n^3 flops */
         /* and so does stream 1 ((n^3)/2 + (n^3)/2 = n^3). When k is */
         /* smaller than n/2, stream 2 runs more flops (it is the slower */
         /* one); when k is bigger the balance shifts. In real problems k */
         /* should be much smaller than min(m,n), so this arrangement is */
         /* (probably) the best. */

         /* Stream1 recording event "event2" for stream2 (for the 1st time) */
         CUDAERR(cudaEventRecord(event2, stream1));

         for(i=0;i<nIter;i++)
         {
            /* ************************** Phase 1 **************************** */
            /* Stream2 waiting "event2" set by stream1. 1st time is always ok */
            CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

            /* B=W'*A */
            CUBLASERR(cublasDgemm(handle2, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, A, m, &beta, B, k));

            /* Stream2 recording event "event1" for stream1 */
            CUDAERR(cudaEventRecord(event1, stream2));

            /* L=W'*W */
            CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, k, m, &alpha, W, m, W, m, &beta, L, k));

            /* C=L*H */
            CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, k, n, k, &alpha, L, k, H, k, &beta, C, k));

            /* Stream1 waiting "event1" set by stream2 */
            CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));

            /* H=H(.*)B(./)C */
            ddotdiv_cuda(k*n, B, C, H, stream1);

            /* Stream1 recording event "event2" for stream2 */
            CUDAERR(cudaEventRecord(event2, stream1));


            /* ************************** Phase 2 **************************** */
            /* Stream2 waiting "event2" set by stream1 */
            CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

            /* B=A*H' */
            CUBLASERR(cublasDgemm(handle2, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, A, m, H, k, &beta, B, m));

            /* Stream2 recording event "event1" for stream1 */
            CUDAERR(cudaEventRecord(event1, stream2));

            /* L=H*H' */
            CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, k, k, n, &alpha, H, k, H, k, &beta, L, k));

            /* C=W*L */
            CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, k, k, &alpha, W, m, L, k, &beta, C, m));

            /* Stream1 waiting "event1" set by stream 2 */
            CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));

            /* W=W(.*)B(./)C */
            ddotdiv_cuda(m*k, B, C, W, stream1);

            /* Stream1 recording event "event2" for stream2 */
            CUDAERR(cudaEventRecord(event2, stream1));
         }
         /* maybe not needed, but kept for completeness ... */
         CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

         CUBLASERR(cublasDestroy(handle1));
         CUBLASERR(cublasDestroy(handle2));

         CUDAERR(cudaStreamDestroy(stream1));
         CUDAERR(cudaStreamDestroy(stream2));

         CUDAERR(cudaEventDestroy(event1));
         CUDAERR(cudaEventDestroy(event2));

         CUDAERR(cudaFree(B));
         CUDAERR(cudaFree(C));
         CUDAERR(cudaFree(L));
         break;

      case UpdateW:
         CUDAERR(cudaMalloc((void **)&B, m*k*sizeof(double)));
         CUDAERR(cudaMalloc((void **)&C, m*k*sizeof(double)));
         CUDAERR(cudaMalloc((void **)&L, k*k*sizeof(double)));

         CUDAERR(cudaStreamCreate(&stream1));
         CUDAERR(cudaStreamCreate(&stream2));

         CUBLASERR(cublasCreate(&handle1));
         CUBLASERR(cublasCreate(&handle2));

         CUDAERR(cudaEventCreateWithFlags(&event1, cudaEventDisableTiming));
         CUDAERR(cudaEventCreateWithFlags(&event2, cudaEventDisableTiming));

         CUBLASERR(cublasSetStream(handle1, stream1));
         CUBLASERR(cublasSetStream(handle2, stream2));

         /* Why streams? Why this arrangement of the streams? See case UpdateAll above */

         /* Stream1 recording event "event2" for stream2 (for the 1st time) */
         CUDAERR(cudaEventRecord(event2, stream1));

         for(i=0;i<nIter;i++)
         {
            /* ************************** Phase 2 **************************** */
            /* Stream2 waiting "event2" set by stream1 */
            CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

            /* B=A*H' */
            CUBLASERR(cublasDgemm(handle2, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, A, m, H, k, &beta, B, m));

            /* Stream2 recording event "event1" for stream1 */
            CUDAERR(cudaEventRecord(event1, stream2));

            /* L=H*H' */
            CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, k, k, n, &alpha, H, k, H, k, &beta, L, k));

            /* C=W*L */
            CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, k, k, &alpha, W, m, L, k, &beta, C, m));

            /* Stream1 waiting "event1" set by stream 2 */
            CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));

            /* W=W(.*)B(./)C */
            ddotdiv_cuda(m*k, B, C, W, stream1);

            /* Stream1 recording event "event2" for stream2 */
            CUDAERR(cudaEventRecord(event2, stream1));
         }
         /* maybe not needed, but kept for completeness ... */
         CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

         CUBLASERR(cublasDestroy(handle1));
         CUBLASERR(cublasDestroy(handle2));

         CUDAERR(cudaStreamDestroy(stream1));
         CUDAERR(cudaStreamDestroy(stream2));

         CUDAERR(cudaEventDestroy(event1));
         CUDAERR(cudaEventDestroy(event2));

         CUDAERR(cudaFree(B));
         CUDAERR(cudaFree(C));
         CUDAERR(cudaFree(L));
         break;

      case UpdateH:
         CUDAERR(cudaMalloc((void **)&B, n*k*sizeof(double)));
         CUDAERR(cudaMalloc((void **)&C, n*k*sizeof(double)));
         CUDAERR(cudaMalloc((void **)&L, k*k*sizeof(double)));

         CUDAERR(cudaStreamCreate(&stream1));
         CUDAERR(cudaStreamCreate(&stream2));

         CUBLASERR(cublasCreate(&handle1));
         CUBLASERR(cublasCreate(&handle2));

         CUDAERR(cudaEventCreateWithFlags(&event1, cudaEventDisableTiming));
         CUDAERR(cudaEventCreateWithFlags(&event2, cudaEventDisableTiming));

         CUBLASERR(cublasSetStream(handle1, stream1));
         CUBLASERR(cublasSetStream(handle2, stream2));

         /* Stream1 recording event "event2" for stream2 (for the 1st time) */
         CUDAERR(cudaEventRecord(event2, stream1));

         for(i=0;i<nIter;i++)
         {
            /* ************************** Phase 1 **************************** */
            /* Stream2 waiting "event2" set by stream1. 1st time is always ok */
            CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

            /* B=W'*A */
            CUBLASERR(cublasDgemm(handle2, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, A, m, &beta, B, k));

            /* Stream2 recording event "event1" for stream1 */
            CUDAERR(cudaEventRecord(event1, stream2));

            /* L=W'*W */
            CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, k, m, &alpha, W, m, W, m, &beta, L, k));

            /* C=L*H */
            CUBLASERR(cublasDgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, k, n, k, &alpha, L, k, H, k, &beta, C, k));

            /* Stream1 waiting "event1" set by stream2 */
            CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));

            /* H=H(.*)B(./)C */
            ddotdiv_cuda(k*n, B, C, H, stream1);

            /* Stream1 recording event "event2" for stream2 */
            CUDAERR(cudaEventRecord(event2, stream1));
         }
         /* maybe not needed, but kept for completeness ... */
         CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

         CUBLASERR(cublasDestroy(handle1));
         CUBLASERR(cublasDestroy(handle2));

         CUDAERR(cudaStreamDestroy(stream1));
         CUDAERR(cudaStreamDestroy(stream2));

         CUDAERR(cudaEventDestroy(event1));
         CUDAERR(cudaEventDestroy(event2));

         CUDAERR(cudaFree(B));
         CUDAERR(cudaFree(C));
         CUDAERR(cudaFree(L));
         break;

      default:
         return -1;
   }
   return 0;
}


/**
 * \fn int smlsa_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
 * \brief smlsa_cuda performs NNMF using the beta-divergence with beta=2, in single precision.
 * See the description of dmlsa_cuda for more information.
 * \param m: (input) Number of rows of matrix A and matrix W
 * \param n: (input) Number of columns of matrix A and matrix H
 * \param k: (input) Number of columns of matrix W and rows of H
 * \param A: (input) Single precision input matrix of (m * n) elements stored in 1D column-major order
 * \param W: (inout) Single precision input/output matrix of (m * k) elements stored in 1D column-major order
 * \param H: (inout) Single precision input/output matrix of (k * n) elements stored in 1D column-major order
 * \param uType: (input) It can be UpdateAll (W and H are updated), UpdateW (only W is updated) or UpdateH (only H is updated)
 * \param nIter: (input) Number of iterations
 *
 * It returns 0 on success, or -1 if uType is not a valid option.
*/
int smlsa_cuda(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
{
   float
      *B=NULL,
      *C=NULL,
      *L=NULL,
      alpha=1.0f,
      beta =0.0f;

   int
      i, devID;

   cublasHandle_t
      handle1,
      handle2;

   cudaStream_t
      stream1,
      stream2;

   cudaEvent_t
      event1,
      event2;

   CUDAERR(cudaGetDevice(&devID));

   switch (uType)
   {
      case UpdateAll:
         CUDAERR(cudaMalloc((void **)&B, max(m,n)*k*sizeof(float)));
         CUDAERR(cudaMalloc((void **)&C, max(m,n)*k*sizeof(float)));
         CUDAERR(cudaMalloc((void **)&L, k*k*sizeof(float)));

         CUDAERR(cudaStreamCreate(&stream1));
         CUDAERR(cudaStreamCreate(&stream2));

         CUBLASERR(cublasCreate(&handle1));
         CUBLASERR(cublasCreate(&handle2));

         CUDAERR(cudaEventCreateWithFlags(&event1, cudaEventDisableTiming));
         CUDAERR(cudaEventCreateWithFlags(&event2, cudaEventDisableTiming));

         CUBLASERR(cublasSetStream(handle1, stream1));
         CUBLASERR(cublasSetStream(handle2, stream2));

         /* Why streams? Why this approach of streams? See dmlsa_cuda */

         /* Stream1 recording event "event2" for stream2 (for the 1st time) */
         CUDAERR(cudaEventRecord(event2, stream1));

         for(i=0;i<nIter;i++)
         {
            /* ************************** Phase 1 **************************** */
            /* Stream2 waiting "event2" set by stream1. 1st time is always ok */
            CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

            /* B=W'*A */
            CUBLASERR(cublasSgemm(handle2, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, A, m, &beta, B, k));

            /* Stream2 recording event "event1" for stream1 */
            CUDAERR(cudaEventRecord(event1, stream2));

            /* L=W'*W */
            CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, k, m, &alpha, W, m, W, m, &beta, L, k));

            /* C=L*H */
            CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, k, n, k, &alpha, L, k, H, k, &beta, C, k));

            /* Stream1 waiting "event1" set by stream2 */
            CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));

            /* H=H(.*)B(./)C */
            sdotdiv_cuda(k*n, B, C, H, stream1);

            /* Stream1 recording event "event2" for stream2 */
            CUDAERR(cudaEventRecord(event2, stream1));


            /* ************************** Phase 2 **************************** */
            /* Stream2 waiting "event2" set by stream1 */
            CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

            /* B=A*H' */
            CUBLASERR(cublasSgemm(handle2, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, A, m, H, k, &beta, B, m));

            /* Stream2 recording event "event1" for stream1 */
            CUDAERR(cudaEventRecord(event1, stream2));

            /* L=H*H' */
            CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, k, k, n, &alpha, H, k, H, k, &beta, L, k));

            /* C=W*L */
            CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, k, k, &alpha, W, m, L, k, &beta, C, m));

            /* Stream1 waiting "event1" set by stream 2 */
            CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));

            /* W=W(.*)B(./)C */
            sdotdiv_cuda(m*k, B, C, W, stream1);

            /* Stream1 recording event "event2" for stream2 */
            CUDAERR(cudaEventRecord(event2, stream1));
         }
         /* maybe not needed, but kept for completeness ... */
         CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

         CUBLASERR(cublasDestroy(handle1));
         CUBLASERR(cublasDestroy(handle2));

         CUDAERR(cudaStreamDestroy(stream1));
         CUDAERR(cudaStreamDestroy(stream2));

         CUDAERR(cudaEventDestroy(event1));
         CUDAERR(cudaEventDestroy(event2));

         CUDAERR(cudaFree(B));
         CUDAERR(cudaFree(C));
         CUDAERR(cudaFree(L));
         break;

      case UpdateW:
         CUDAERR(cudaMalloc((void **)&B, m*k*sizeof(float)));
         CUDAERR(cudaMalloc((void **)&C, m*k*sizeof(float)));
         CUDAERR(cudaMalloc((void **)&L, k*k*sizeof(float)));

         CUDAERR(cudaStreamCreate(&stream1));
         CUDAERR(cudaStreamCreate(&stream2));

         CUBLASERR(cublasCreate(&handle1));
         CUBLASERR(cublasCreate(&handle2));

         CUDAERR(cudaEventCreateWithFlags(&event1, cudaEventDisableTiming));
         CUDAERR(cudaEventCreateWithFlags(&event2, cudaEventDisableTiming));

         CUBLASERR(cublasSetStream(handle1, stream1));
         CUBLASERR(cublasSetStream(handle2, stream2));

         /* Why streams? Why this approach of streams? See dmlsa_cuda */

         /* Stream1 recording event "event2" for stream2 (for the 1st time) */
         CUDAERR(cudaEventRecord(event2, stream1));

         for(i=0;i<nIter;i++)
         {
            /* ************************** Phase 2 **************************** */
            /* Stream2 waiting "event2" set by stream1 */
            CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

            /* B=A*H' */
            CUBLASERR(cublasSgemm(handle2, CUBLAS_OP_N, CUBLAS_OP_T, m, k, n, &alpha, A, m, H, k, &beta, B, m));

            /* Stream2 recording event "event1" for stream1 */
            CUDAERR(cudaEventRecord(event1, stream2));

            /* L=H*H' */
            CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_T, k, k, n, &alpha, H, k, H, k, &beta, L, k));

            /* C=W*L */
            CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, m, k, k, &alpha, W, m, L, k, &beta, C, m));

            /* Stream1 waiting "event1" set by stream 2 */
            CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));

            /* W=W(.*)B(./)C */
            sdotdiv_cuda(m*k, B, C, W, stream1);

            /* Stream1 recording event "event2" for stream2 */
            CUDAERR(cudaEventRecord(event2, stream1));
         }
         /* maybe not needed, but kept for completeness ... */
         CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

         CUBLASERR(cublasDestroy(handle1));
         CUBLASERR(cublasDestroy(handle2));

         CUDAERR(cudaStreamDestroy(stream1));
         CUDAERR(cudaStreamDestroy(stream2));

         CUDAERR(cudaEventDestroy(event1));
         CUDAERR(cudaEventDestroy(event2));

         CUDAERR(cudaFree(B));
         CUDAERR(cudaFree(C));
         CUDAERR(cudaFree(L));
         break;

      case UpdateH:
         CUDAERR(cudaMalloc((void **)&B, n*k*sizeof(float)));
         CUDAERR(cudaMalloc((void **)&C, n*k*sizeof(float)));
         CUDAERR(cudaMalloc((void **)&L, k*k*sizeof(float)));

         CUDAERR(cudaStreamCreate(&stream1));
         CUDAERR(cudaStreamCreate(&stream2));

         CUBLASERR(cublasCreate(&handle1));
         CUBLASERR(cublasCreate(&handle2));

         CUDAERR(cudaEventCreateWithFlags(&event1, cudaEventDisableTiming));
         CUDAERR(cudaEventCreateWithFlags(&event2, cudaEventDisableTiming));

         CUBLASERR(cublasSetStream(handle1, stream1));
         CUBLASERR(cublasSetStream(handle2, stream2));

         /* Why streams? Why this approach of streams? See dmlsa_cuda */

         /* Stream1 recording event "event2" for stream2 (for the 1st time) */
         CUDAERR(cudaEventRecord(event2, stream1));

         for(i=0;i<nIter;i++)
         {
            /* ************************** Phase 1 **************************** */
            /* Stream2 waiting "event2" set by stream1. 1st time is always ok */
            CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

            /* B=W'*A */
            CUBLASERR(cublasSgemm(handle2, CUBLAS_OP_T, CUBLAS_OP_N, k, n, m, &alpha, W, m, A, m, &beta, B, k));

            /* Stream2 recording event "event1" for stream1 */
            CUDAERR(cudaEventRecord(event1, stream2));

            /* L=W'*W */
            CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_T, CUBLAS_OP_N, k, k, m, &alpha, W, m, W, m, &beta, L, k));

            /* C=L*H */
            CUBLASERR(cublasSgemm(handle1, CUBLAS_OP_N, CUBLAS_OP_N, k, n, k, &alpha, L, k, H, k, &beta, C, k));

            /* Stream1 waiting "event1" set by stream2 */
            CUDAERR(cudaStreamWaitEvent(stream1, event1, 0));

            /* H=H(.*)B(./)C */
            sdotdiv_cuda(k*n, B, C, H, stream1);

            /* Stream1 recording event "event2" for stream2 */
            CUDAERR(cudaEventRecord(event2, stream1));
         }
         /* maybe not needed, but kept for completeness ... */
         CUDAERR(cudaStreamWaitEvent(stream2, event2, 0));

         CUBLASERR(cublasDestroy(handle1));
         CUBLASERR(cublasDestroy(handle2));

         CUDAERR(cudaStreamDestroy(stream1));
         CUDAERR(cudaStreamDestroy(stream2));

         CUDAERR(cudaEventDestroy(event1));
         CUDAERR(cudaEventDestroy(event2));

         CUDAERR(cudaFree(B));
         CUDAERR(cudaFree(C));
         CUDAERR(cudaFree(L));
         break;

      default:
         return -1;
   }
   return 0;
}


/**
 * \fn void ddotdiv_cuda(const int n, const double *x, const double *y, double *z, cudaStream_t stream)
 * \brief It calls the kernel vddotdiv_cuda, which performs z[i]=z[i]*x[i]/y[i]
 * \param n: (input) Number of elements of x, y and z
 * \param x: (input) Double precision input vector/matrix (1D column-major)
 * \param y: (input) Double precision input vector/matrix (1D column-major)
 * \param z: (inout) Double precision input/output vector/matrix (1D column-major)
 * \param stream: (input) ID of the stream to use
*/
void ddotdiv_cuda(const int n, const double *x, const double *y, double *z, cudaStream_t stream)
{
   dim3 dimGrid, dimBlock;

   #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
      dimBlock.x = 192;
   #else
      dimBlock.x = 256;
   #endif

   dimGrid.x = (n + dimBlock.x -1) / dimBlock.x;
   vddotdiv_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, x, y, z);
   #ifdef With_Check
      cudaDeviceSynchronize();
   #endif
}


/**
 * \fn void sdotdiv_cuda(const int n, const float *x, const float *y, float *z, cudaStream_t stream)
 * \brief It calls the kernel vsdotdiv_cuda, which performs z[i]=z[i]*x[i]/y[i]
 * \param n: (input) Number of elements of x, y and z
 * \param x: (input) Single precision input vector/matrix (1D column-major)
 * \param y: (input) Single precision input vector/matrix (1D column-major)
 * \param z: (inout) Single precision input/output vector/matrix (1D column-major)
 * \param stream: (input) ID of the stream to use
*/
void sdotdiv_cuda(const int n, const float *x, const float *y, float *z, cudaStream_t stream)
{
   dim3 dimGrid, dimBlock;

   #if defined(CUDA_ARCH) && (CUDA_ARCH == 200)
      dimBlock.x = 192;
   #else
      dimBlock.x = 256;
   #endif

   dimGrid.x = (n + dimBlock.x -1) / dimBlock.x;
   vsdotdiv_cuda<<<dimGrid, dimBlock, 0, stream>>>(n, x, y, z);
   #ifdef With_Check
      cudaDeviceSynchronize();
   #endif
}


/**
 * \fn __global__ void vddotdiv_cuda(const int n, const double* __restrict__ x, const double* __restrict__ y, double *z)
 * \brief This kernel computes double precision z[i]=z[i]*x[i]/y[i]
 * \param n: (input) Number of elements of x, y and z
 * \param x: (input) Double precision input vector/matrix (1D column-major)
 * \param y: (input) Double precision input vector/matrix (1D column-major)
 * \param z: (inout) Double precision input/output vector/matrix (1D column-major)
*/
__global__ void vddotdiv_cuda(const int n, const double* __restrict__ x, const double* __restrict__ y, double *z)
{
   unsigned int pos = blockDim.x * blockIdx.x + threadIdx.x;

   if (pos < n)
   {
      #ifdef With_Check
         /* Here we can have NaN and Inf if y[pos] and/or x[pos] and/or z[pos]=0 */
         z[pos] = z[pos] * x[pos] / y[pos];
         assert(!fpe(z[pos]));
      #else
         z[pos] = z[pos] * x[pos] / y[pos];
      #endif
   }
}


/**
 * \fn __global__ void vsdotdiv_cuda(const int n, const float* __restrict__ x, const float* __restrict__ y, float *z)
 * \brief This kernel computes single precision z[i]=z[i]*x[i]/y[i]
 * \param n: (input) Number of elements of x, y and z
 * \param x: (input) Single precision input vector/matrix (1D column-major)
 * \param y: (input) Single precision input vector/matrix (1D column-major)
 * \param z: (inout) Single precision input/output vector/matrix (1D column-major)
*/
__global__ void vsdotdiv_cuda(const int n, const float* __restrict__ x, const float* __restrict__ y, float *z)
{
   unsigned int pos = blockDim.x * blockIdx.x + threadIdx.x;

   if (pos < n)
   {
      #ifdef With_Check
         /* Here we can have NaN and Inf if y[pos] and/or x[pos] and/or z[pos]=0 */
         z[pos] = z[pos] * x[pos] / y[pos];
         assert(!fpe(z[pos]));
      #else
         z[pos] = z[pos] * x[pos] / y[pos];
      #endif
   }
}
