NnmfPack  2.1
bdiv_cpu.c
Go to the documentation of this file.
1 /***************************************************************************
2  * Copyright (C) 2014 by PIR (University of Oviedo) and *
3  * INCO2 (Polytechnic University of Valencia) groups. *
4  * nnmfpack@gmail.com *
5  * *
6  * This program is free software; you can redistribute it and/or modify *
7  * it under the terms of the GNU General Public License as published by *
8  * the Free Software Foundation; either version 2 of the License, or *
9  * (at your option) any later version. *
10  * *
11  * This program is distributed in the hope that it will be useful, *
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14  * GNU General Public License for more details. *
15  * *
16  * You should have received a copy of the GNU General Public License *
17  * along with this program; if not, write to the *
18  * Free Software Foundation, Inc., *
19  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
20  ***************************************************************************
21 */
33 #include "bdiv_cpu.h"
34 
35 
87 int dbdivg_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const double expo, const int uType, int nIter)
88 {
89  double
90  *L=NULL,
91  *M=NULL,
92  *R=NULL;
93 
94  int i;
95 
96  switch (uType)
97  {
98  case UpdateAll:
99  #ifdef With_MKL
100  M = (double *)mkl_malloc(2*max(m,n)* k * sizeof(double), WRDLEN);
101  L = (double *)mkl_malloc( m * n * sizeof(double), WRDLEN);
102  R = (double *)mkl_malloc( 2*m * n * sizeof(double), WRDLEN);
103  #else
104  #ifdef With_ARM
105  M = (double *)malloc(2*max(m,n)* k * sizeof(double));
106  L = (double *)malloc( m * n * sizeof(double));
107  R = (double *)malloc( 2*m * n * sizeof(double));
108  #else
109  M = (double *)_mm_malloc(2*max(m,n)* k * sizeof(double), WRDLEN);
110  L = (double *)_mm_malloc( m * n * sizeof(double), WRDLEN);
111  R = (double *)_mm_malloc( 2*m * n * sizeof(double), WRDLEN);
112  #endif
113  #endif
114 
115  if (L == NULL || M == NULL || R == NULL)
116  return -1;
117 
118  for(i=0;i<nIter;i++)
119  {
120  /* ************************ Phase 1 *************************** */
121  /* L=W*H */
122  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
123 
124  /* L1=L(.^)(expo-2) */
125  /* L2=L1(.*)A */
126  /* L1=L1(.*)L */
127  /* R is L1 and L2 */
128  dkernelH_x86(m, n, L, A, R, expo-2.0);
129 
130  /* B=W'*L2 */
131  /* C=W'*L1 */
132  /* above is equal to R=W'*|L2 | L1| */
133  cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, 2*n, m, 1.0, W, m, R, m, 0.0, M, k);
134 
135  /* H=H(.*)B(./)C. Note that matrices B and C are stored together in matrix M*/
136  dupdate1H_x86(k*n, M, H);
137 
138 
139  /* ************************ Phase 2 *************************** */
140  /* L=W*H */
141  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
142 
143  /* L1=L(.^)(expo-2) */
144  /* L2=L1(.*)A */
145  /* L1=L1(.*)L */
146  /* R is L1 and L2 */
147  dkernelW_x86(m, n , L, A, R, expo-2.0);
148 
149  /* D=L2*H' */
150  /* E=L1*H' */
151  /* |L2| */
152  /* above is equal to R=| | * H' */
153  /* |L1| */
154  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, 2*m, k, n, 1.0, R, 2*m, H, k, 0.0, M, 2*m);
155 
156  /* W=W(.*)D(./)E */
157  dupdate1W_x86(m, k, M, W);
158  }
159  #ifdef With_MKL
160  mkl_free(L);
161  mkl_free(R);
162  mkl_free(M);
163  #else
164  #ifdef With_ARM
165  free(L);
166  free(M);
167  free(R);
168  #else
169  _mm_free(L);
170  _mm_free(M);
171  _mm_free(R);
172  #endif
173  #endif
174  break;
175 
176  case UpdateW:
177  #ifdef With_MKL
178  M = (double *)mkl_malloc(2*m * k * sizeof(double), WRDLEN);
179  L = (double *)mkl_malloc( m * n * sizeof(double), WRDLEN);
180  R = (double *)mkl_malloc(2*m * n * sizeof(double), WRDLEN);
181  #else
182  #ifdef With_ARM
183  M = (double *)malloc(2*m * k * sizeof(double));
184  L = (double *)malloc( m * n * sizeof(double));
185  R = (double *)malloc(2*m * n * sizeof(double));
186  #else
187  M = (double *)_mm_malloc(2*m * k * sizeof(double), WRDLEN);
188  L = (double *)_mm_malloc( m * n * sizeof(double), WRDLEN);
189  R = (double *)_mm_malloc(2*m * n * sizeof(double), WRDLEN);
190  #endif
191  #endif
192 
193  if (L == NULL || M == NULL || R == NULL )
194  return -1;
195 
196  for(i=0;i<nIter;i++)
197  {
198  /* ************************ Phase 2 *************************** */
199  /* L=W*H */
200  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
201 
202  /* L1=L(.^)(expo-2) */
203  /* L2=L1(.*)A */
204  /* L1=L1(.*)L */
205  /* R is L1 and L2 */
206  dkernelW_x86(m, n , L, A, R, expo-2.0);
207 
208  /* D=L2*H' */
209  /* E=L1*H' */
210  /* |L2| */
211  /* above is equal to R=| | * H' */
212  /* |L1| */
213  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, 2*m, k, n, 1.0, R, 2*m, H, k, 0.0, M, 2*m);
214 
215  /* W=W(.*)D(./)E */
216  dupdate1W_x86(m, k, M, W);
217  }
218  #ifdef With_MKL
219  mkl_free(L);
220  mkl_free(R);
221  mkl_free(M);
222  #else
223  #ifdef With_ARM
224  free(L);
225  free(M);
226  free(R);
227  #else
228  _mm_free(L);
229  _mm_free(M);
230  _mm_free(R);
231  #endif
232  #endif
233  break;
234 
235  case UpdateH:
236  #ifdef With_MKL
237  M = (double *)mkl_malloc(2*n * k * sizeof(double), WRDLEN);
238  L = (double *)mkl_malloc( m * n * sizeof(double), WRDLEN);
239  R = (double *)mkl_malloc(2*m * n * sizeof(double), WRDLEN);
240  #else
241  #ifdef With_ARM
242  M = (double *)malloc(2*n * k * sizeof(double));
243  L = (double *)malloc( m * n * sizeof(double));
244  R = (double *)malloc(2*m * n * sizeof(double));
245  #else
246  M = (double *)_mm_malloc(2*n * k * sizeof(double), WRDLEN);
247  L = (double *)_mm_malloc( m * n * sizeof(double), WRDLEN);
248  R = (double *)_mm_malloc(2*m * n * sizeof(double), WRDLEN);
249  #endif
250  #endif
251 
252  if (L == NULL || M == NULL || R == NULL )
253  return -1;
254 
255  for(i=0;i<nIter;i++)
256  {
257  /* ************************ Phase 1 *************************** */
258  /* L=W*H */
259  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
260 
261  /* L1=L(.^)(expo-2) */
262  /* L2=L1(.*)A */
263  /* L1=L1(.*)L */
264  /* R is L1 and L2 */
265  dkernelH_x86(m, n, L, A, R, expo-2.0);
266 
267  /* B=W'*L2 */
268  /* C=W'*L1 */
269  /* above is equal to R=W'*|L2 | L1| */
270  cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, 2*n, m, 1.0, W, m, R, m, 0.0, M, k);
271 
272  /* H=H(.*)B(./)C. Note that matrices B and C are stored together in matrix M*/
273  dupdate1H_x86(k*n, M, H);
274  }
275  #ifdef With_MKL
276  mkl_free(L);
277  mkl_free(R);
278  mkl_free(M);
279  #else
280  #ifdef With_ARM
281  free(L);
282  free(M);
283  free(R);
284  #else
285  _mm_free(L);
286  _mm_free(M);
287  _mm_free(R);
288  #endif
289  #endif
290  break;
291 
292  default:
293  return -1;
294  }
295  return 0;
296 }
297 
298 
313 int sbdivg_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const float expo, const int uType, int nIter)
314 {
315  float
316  *L=NULL,
317  *M=NULL,
318  *R=NULL;
319 
320  int i;
321 
322  switch (uType)
323  {
324  case UpdateAll:
325  #ifdef With_MKL
326  M = (float *)mkl_malloc(2*max(m,n) * k * sizeof(float), WRDLEN);
327  L = (float *)mkl_malloc( m * n * sizeof(float), WRDLEN);
328  R = (float *)mkl_malloc( 2*m * n * sizeof(float), WRDLEN);
329  #else
330  #ifdef With_ARM
331  M = (float *)malloc(2*max(m,n) * k * sizeof(float));
332  L = (float *)malloc( m * n * sizeof(float));
333  R = (float *)malloc( 2*m * n * sizeof(float));
334  #else
335  M = (float *)_mm_malloc(2*max(m,n) * k * sizeof(float), WRDLEN);
336  L = (float *)_mm_malloc( m * n * sizeof(float), WRDLEN);
337  R = (float *)_mm_malloc( 2*m * n * sizeof(float), WRDLEN);
338  #endif
339  #endif
340 
341  if (L == NULL || M == NULL || R == NULL )
342  return -1;
343 
344  for(i=0;i<nIter;i++)
345  {
346  /* ************************ Phase 1 *************************** */
347  /* L=W*H */
348  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
349 
350  /* L1=L(.^)(expo-2) */
351  /* L2=L1(.*)A */
352  /* L1=L1(.*)L */
353  /* R is L1 and L2 */
354  skernelH_x86(m, n, L, A, R, expo-2.0f);
355 
356  /* B=W'*L2 */
357  /* C=W'*L1 */
358  /* above is equal to R=W'*|L2 | L1| */
359  cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, 2*n, m, 1.0, W, m, R, m, 0.0, M, k);
360 
361  /* H=H(.*)B(./)C. Note that matrices B and C are stored together in matrix M*/
362  supdate1H_x86(k*n, M, H);
363 
364 
365  /* ************************ Phase 2 *************************** */
366  /* L=W*H */
367  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
368 
369  /* L1=L(.^)(expo-2) */
370  /* L2=L1(.*)A */
371  /* L1=L1(.*)L */
372  /* R is L1 and L2 */
373  skernelW_x86(m, n, L, A, R, expo-2.0f);
374 
375  /* D=L2*H' */
376  /* E=L1*H' */
377  /* |L2| */
378  /* above is equal to R=| | * H' */
379  /* |L1| */
380  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, 2*m, k, n, 1.0, R, 2*m, H, k, 0.0, M, 2*m);
381 
382  /* W=W(.*)D(./)E */
383  supdate1W_x86(m, k, M, W);
384  }
385  #ifdef With_MKL
386  mkl_free(L);
387  mkl_free(R);
388  mkl_free(M);
389  #else
390  #ifdef With_ARM
391  free(L);
392  free(M);
393  free(R);
394  #else
395  _mm_free(L);
396  _mm_free(M);
397  _mm_free(R);
398  #endif
399  #endif
400  break;
401 
402  case UpdateW:
403  #ifdef With_MKL
404  M = (float *)mkl_malloc(2*m * k * sizeof(float), WRDLEN);
405  L = (float *)mkl_malloc( m * n * sizeof(float), WRDLEN);
406  R = (float *)mkl_malloc(2*m * n * sizeof(float), WRDLEN);
407  #else
408  #ifdef With_ARM
409  M = (float *)malloc(2*m * k * sizeof(float));
410  L = (float *)malloc( m * n * sizeof(float));
411  R = (float *)malloc(2*m * n * sizeof(float));
412  #else
413  M = (float *)_mm_malloc(2*m * k * sizeof(float), WRDLEN);
414  L = (float *)_mm_malloc( m * n * sizeof(float), WRDLEN);
415  R = (float *)_mm_malloc(2*m * n * sizeof(float), WRDLEN);
416  #endif
417  #endif
418 
419  if (L == NULL || M == NULL || R == NULL )
420  return -1;
421 
422  for(i=0;i<nIter;i++)
423  {
424  /* ************************ Phase 2 *************************** */
425  /* L=W*H */
426  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
427 
428  /* L1=L(.^)(expo-2) */
429  /* L2=L1(.*)A */
430  /* L1=L1(.*)L */
431  /* R is L1 and L2 */
432  skernelW_x86(m, n, L, A, R, expo-2.0f);
433 
434  /* D=L2*H' */
435  /* E=L1*H' */
436  /* |L2| */
437  /* above is equal to R=| | * H' */
438  /* |L1| */
439  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, 2*m, k, n, 1.0, R, 2*m, H, k, 0.0, M, 2*m);
440 
441  /* W=W(.*)D(./)E */
442  supdate1W_x86(m, k, M, W);
443  }
444  #ifdef With_MKL
445  mkl_free(L);
446  mkl_free(R);
447  mkl_free(M);
448  #else
449  #ifdef With_ARM
450  free(L);
451  free(M);
452  free(R);
453  #else
454  _mm_free(L);
455  _mm_free(M);
456  _mm_free(R);
457  #endif
458  #endif
459  break;
460 
461  case UpdateH:
462  #ifdef With_MKL
463  M = (float *)mkl_malloc(2*n * k * sizeof(float), WRDLEN);
464  L = (float *)mkl_malloc( m * n * sizeof(float), WRDLEN);
465  R = (float *)mkl_malloc(2*m * n * sizeof(float), WRDLEN);
466  #else
467  #ifdef With_ARM
468  M = (float *)malloc(2*n * k * sizeof(float));
469  L = (float *)malloc( m * n * sizeof(float));
470  R = (float *)malloc(2*m * n * sizeof(float));
471  #else
472  M = (float *)_mm_malloc(2*n * k * sizeof(float), WRDLEN);
473  L = (float *)_mm_malloc( m * n * sizeof(float), WRDLEN);
474  R = (float *)_mm_malloc(2*m * n * sizeof(float), WRDLEN);
475  #endif
476  #endif
477 
478  if (L == NULL || M == NULL || R == NULL )
479  return -1;
480 
481  for(i=0;i<nIter;i++)
482  {
483  /* ************************ Phase 1 *************************** */
484  /* L=W*H */
485  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
486 
487  /* L1=L(.^)(expo-2) */
488  /* L2=L1(.*)A */
489  /* L1=L1(.*)L */
490  /* R is L1 and L2 */
491  skernelH_x86(m, n, L, A, R, expo-2.0f);
492 
493  /* B=W'*L2 */
494  /* C=W'*L1 */
495  /* above is equal to R=W'*|L2 | L1| */
496  cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, 2*n, m, 1.0, W, m, R, m, 0.0, M, k);
497 
498  /* H=H(.*)B(./)C. Note that matrices B and C are stored together in matrix M*/
499  supdate1H_x86(k*n, M, H);
500  }
501  #ifdef With_MKL
502  mkl_free(L);
503  mkl_free(R);
504  mkl_free(M);
505  #else
506  #ifdef With_ARM
507  free(L);
508  free(M);
509  free(R);
510  #else
511  _mm_free(L);
512  _mm_free(M);
513  _mm_free(R);
514  #endif
515  #endif
516  break;
517 
518  default:
519  return -1;
520  }
521  return 0;
522 }
523 
524 
525 
526 
566 int dbdivone_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
567 {
568  double
569  *B=NULL,
570  *L=NULL,
571  *x=NULL,
572  *y=NULL;
573 
574  int i;
575 
576  switch (uType)
577  {
578  case UpdateAll:
579  /* Vectors "x" and "y" are used both in Phase 1 and Phase 2. With */
580  /* the strategy used for matrices B and D the sise of x is max(m,n) */
581  #ifdef With_MKL
582  B = (double *)mkl_malloc(max(m,n) * k * sizeof(double), WRDLEN);
583  L = (double *)mkl_malloc( m * n * sizeof(double), WRDLEN);
584  x = (double *)mkl_malloc( max(m,n) * sizeof(double), WRDLEN);
585  y = (double *)mkl_malloc( k * sizeof(double), WRDLEN);
586  #else
587  #ifdef With_ARM
588  B = (double *)malloc(max(m,n) * k * sizeof(double));
589  L = (double *)malloc( m * n * sizeof(double));
590  x = (double *)malloc( max(m,n) * sizeof(double));
591  y = (double *)malloc( k * sizeof(double));
592  #else
593  B = (double *)_mm_malloc(max(m,n) * k * sizeof(double), WRDLEN);
594  L = (double *)_mm_malloc( m * n * sizeof(double), WRDLEN);
595  x = (double *)_mm_malloc( max(m,n) * sizeof(double), WRDLEN);
596  y = (double *)_mm_malloc( k * sizeof(double), WRDLEN);
597  #endif
598  #endif
599 
600  if (B == NULL || L == NULL || x == NULL || y ==NULL)
601  return -1;
602 
603  /* x[i]=1.0 for all i */
604  dmemset_x86(max(m,n), x, 1.0);
605 
606  for(i=0;i<nIter;i++)
607  {
608  /* ************************ Phase 1 *************************** */
609  /* Calculate the sums of all W columns via dgemv(W, x) */
610  cblas_dgemv(CblasColMajor, CblasTrans, m, k, 1.0, W, m, x, 1, 0.0, y, 1);
611 
612  /* L=W*H */
613  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
614 
615  /* L=A(./)L*/
616  ddiv_x86(m*n, A, L);
617 
618  /* B=W'*L */
619  cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, n, m, 1.0, W, m, L, m, 0.0, B, k);
620 
621  /* B(i,j)=B(i,j)/y(i) for all B elements */
622  /* H=H(.*)B */
623  dupdate2H_x86(k, n, y, B, H);
624 
625 
626  /* ************************ Phase 2 *************************** */
627  /* Calculate the sums of all H rows via dgemv(H, x) */
628  cblas_dgemv(CblasColMajor, CblasNoTrans, k, n, 1.0, H, k, x, 1, 0.0, y, 1);
629 
630  /* L=W*H */
631  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
632 
633  /* L=A(./)L*/
634  ddiv_x86(m*n, A, L);
635 
636  /* B=L*H' */
637  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, k, n, 1.0, L, m, H, k, 0.0, B, m);
638 
639  /* B(i,j)=B(i,j)/y(j) for all B elements */
640  /* W=W(.*)B */
641  dupdate2W_x86(m, k, y, B, W);
642  }
643  #ifdef With_MKL
644  mkl_free(B);
645  mkl_free(L);
646  mkl_free(x);
647  mkl_free(y);
648  #else
649  #ifdef With_ARM
650  free(B);
651  free(L);
652  free(x);
653  free(y);
654  #else
655  _mm_free(B);
656  _mm_free(L);
657  _mm_free(x);
658  _mm_free(y);
659  #endif
660  #endif
661  break;
662 
663  case UpdateW:
664  #ifdef With_MKL
665  B = (double *)mkl_malloc(m * k * sizeof(double), WRDLEN);
666  L = (double *)mkl_malloc(m * n * sizeof(double), WRDLEN);
667  x = (double *)mkl_malloc( n * sizeof(double), WRDLEN);
668  y = (double *)mkl_malloc( k * sizeof(double), WRDLEN);
669  #else
670  #ifdef With_ARM
671  B = (double *)malloc(m * k * sizeof(double));
672  L = (double *)malloc(m * n * sizeof(double));
673  x = (double *)malloc( n * sizeof(double));
674  y = (double *)malloc( k * sizeof(double));
675  #else
676  B = (double *)_mm_malloc(m * k * sizeof(double), WRDLEN);
677  L = (double *)_mm_malloc(m * n * sizeof(double), WRDLEN);
678  x = (double *)_mm_malloc( n * sizeof(double), WRDLEN);
679  y = (double *)_mm_malloc( k * sizeof(double), WRDLEN);
680  #endif
681  #endif
682 
683  if (B == NULL || L == NULL || x == NULL || y ==NULL)
684  return -1;
685 
686  /* x[i]=1.0 for all i */
687  dmemset_x86(n, x, 1.0);
688 
689  for(i=0;i<nIter;i++)
690  {
691  /* ************************ Phase 2 *************************** */
692  /* Calculate the sums of all H rows via dgemv(H, x) */
693  cblas_dgemv(CblasColMajor, CblasNoTrans, k, n, 1.0, H, k, x, 1, 0.0, y, 1);
694 
695  /* L=W*H */
696  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
697 
698  /* L=A(./)L*/
699  ddiv_x86(m*n, A, L);
700 
701  /* B=L*H' */
702  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, k, n, 1.0, L, m, H, k, 0.0, B, m);
703 
704  /* B(i,j)=B(i,j)/y(j) for all B elements */
705  /* W=W(.*)B */
706  dupdate2W_x86(m, k, y, B, W);
707  }
708  #ifdef With_MKL
709  mkl_free(B);
710  mkl_free(L);
711  mkl_free(x);
712  mkl_free(y);
713  #else
714  #ifdef With_ARM
715  free(B);
716  free(L);
717  free(x);
718  free(y);
719  #else
720  _mm_free(B);
721  _mm_free(L);
722  _mm_free(x);
723  _mm_free(y);
724  #endif
725  #endif
726  break;
727 
728  case UpdateH:
729  #ifdef With_MKL
730  B = (double *)mkl_malloc(n * k * sizeof(double), WRDLEN);
731  L = (double *)mkl_malloc(m * n * sizeof(double), WRDLEN);
732  x = (double *)mkl_malloc( m * sizeof(double), WRDLEN);
733  y = (double *)mkl_malloc( k * sizeof(double), WRDLEN);
734  #else
735  #ifdef With_ARM
736  B = (double *)malloc(n * k * sizeof(double));
737  L = (double *)malloc(m * n * sizeof(double));
738  x = (double *)malloc( m * sizeof(double));
739  y = (double *)malloc( k * sizeof(double));
740  #else
741  B = (double *)_mm_malloc(n * k * sizeof(double), WRDLEN);
742  L = (double *)_mm_malloc(m * n * sizeof(double), WRDLEN);
743  x = (double *)_mm_malloc( m * sizeof(double), WRDLEN);
744  y = (double *)_mm_malloc( k * sizeof(double), WRDLEN);
745  #endif
746  #endif
747 
748  if (B == NULL || L == NULL || x == NULL || y ==NULL)
749  return -1;
750 
751  /* x[i]=1.0 for all i */
752  dmemset_x86(m, x, 1.0);
753 
754  for(i=0;i<nIter;i++)
755  {
756  /* ************************ Phase 1 *************************** */
757  /* Calculate the sums of all W columns via dgemv(W, x) */
758  cblas_dgemv(CblasColMajor, CblasTrans, m, k, 1.0, W, m, x, 1, 0.0, y, 1);
759 
760  /* L=W*H */
761  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
762 
763  /* L=A(./)L*/
764  ddiv_x86(m*n, A, L);
765 
766  /* B=W'*L */
767  cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, n, m, 1.0, W, m, L, m, 0.0, B, k);
768 
769  /* B(i,j)=B(i,j)/y(i) for all B elements */
770  /* H=H(.*)B */
771  dupdate2H_x86(k, n, y, B, H);
772  }
773  #ifdef With_MKL
774  mkl_free(B);
775  mkl_free(L);
776  mkl_free(x);
777  mkl_free(y);
778  #else
779  #ifdef With_ARM
780  free(B);
781  free(L);
782  free(x);
783  free(y);
784  #else
785  _mm_free(B);
786  _mm_free(L);
787  _mm_free(x);
788  _mm_free(y);
789  #endif
790  #endif
791  break;
792 
793  default:
794  return -1;
795  }
796  return 0;
797 }
798 
799 
813 int sbdivone_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
814 {
815  float
816  *B=NULL,
817  *L=NULL,
818  *x=NULL,
819  *y=NULL;
820 
821  int i;
822 
823  switch (uType)
824  {
825  case UpdateAll:
826  #ifdef With_MKL
827  B = (float *)mkl_malloc(max(m,n) * k * sizeof(float), WRDLEN);
828  L = (float *)mkl_malloc( m * n * sizeof(float), WRDLEN);
829  x = (float *)mkl_malloc( max(m,n) * sizeof(float), WRDLEN);
830  y = (float *)mkl_malloc( k * sizeof(float), WRDLEN);
831  #else
832  #ifdef With_ARM
833  B = (float *)malloc(max(m,n) * k * sizeof(float));
834  L = (float *)malloc( m * n * sizeof(float));
835  x = (float *)malloc( max(m,n) * sizeof(float));
836  y = (float *)malloc( k * sizeof(float));
837  #else
838  B = (float *)_mm_malloc(max(m,n) * k * sizeof(float), WRDLEN);
839  L = (float *)_mm_malloc( m * n * sizeof(float), WRDLEN);
840  x = (float *)_mm_malloc( max(m,n) * sizeof(float), WRDLEN);
841  y = (float *)_mm_malloc( k * sizeof(float), WRDLEN);
842  #endif
843  #endif
844 
845  if (B == NULL || L == NULL || x == NULL || y ==NULL)
846  return -1;
847 
848  /* x[i]=1.0 for all i */
849  smemset_x86(max(m,n), x, 1.0);
850 
851  for(i=0;i<nIter;i++)
852  {
853  /* ************************ Phase 1 *************************** */
854  /* Calculate the sums of all W columns via sgemv(W, x) */
855  cblas_sgemv(CblasColMajor, CblasTrans, m, k, 1.0, W, m, x, 1, 0.0, y, 1);
856 
857  /* L=W*H */
858  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
859 
860  /* L=A(./)L*/
861  sdiv_x86(m*n, A, L);
862 
863  /* B=W'*L */
864  cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, n, m, 1.0, W, m, L, m, 0.0, B, k);
865 
866  /* B(i,j)=B(i,j)/y(i) for all B elements */
867  /* H=H(.*)B */
868  supdate2H_x86(k, n, y, B, H);
869 
870 
871  /* ************************ Phase 2 *************************** */
872  /* Calculate the sums of all H rows via sgemv(H, x) */
873  cblas_sgemv(CblasColMajor, CblasNoTrans, k, n, 1.0, H, k, x, 1, 0.0, y, 1);
874 
875  /* L=W*H */
876  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
877 
878  /* L=A(./)L*/
879  sdiv_x86(m*n, A, L);
880 
881  /* B=L*H' */
882  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, k, n, 1.0, L, m, H, k, 0.0, B, m);
883 
884  /* B(i,j)=B(i,j)/y(j) for all B elements */
885  /* W=W(.*)B */
886  supdate2W_x86(m, k, y, B, W);
887  }
888  #ifdef With_MKL
889  mkl_free(B);
890  mkl_free(L);
891  mkl_free(x);
892  mkl_free(y);
893  #else
894  #ifdef With_ARM
895  free(B);
896  free(L);
897  free(x);
898  free(y);
899  #else
900  _mm_free(B);
901  _mm_free(L);
902  _mm_free(x);
903  _mm_free(y);
904  #endif
905  #endif
906  break;
907 
908  case UpdateW:
909  #ifdef With_MKL
910  B = (float *)mkl_malloc(m * k * sizeof(float), WRDLEN);
911  L = (float *)mkl_malloc(m * n * sizeof(float), WRDLEN);
912  x = (float *)mkl_malloc( n * sizeof(float), WRDLEN);
913  y = (float *)mkl_malloc( k * sizeof(float), WRDLEN);
914  #else
915  #ifdef With_ARM
916  B = (float *)malloc(m * k * sizeof(float));
917  L = (float *)malloc(m * n * sizeof(float));
918  x = (float *)malloc( n * sizeof(float));
919  y = (float *)malloc( k * sizeof(float));
920  #else
921  B = (float *)_mm_malloc(m * k * sizeof(float), WRDLEN);
922  L = (float *)_mm_malloc(m * n * sizeof(float), WRDLEN);
923  x = (float *)_mm_malloc( n * sizeof(float), WRDLEN);
924  y = (float *)_mm_malloc( k * sizeof(float), WRDLEN);
925  #endif
926  #endif
927 
928  if (B == NULL || L == NULL || x == NULL || y ==NULL)
929  return -1;
930 
931  /* x[i]=1.0 for all i */
932  smemset_x86(n, x, 1.0);
933 
934  for(i=0;i<nIter;i++)
935  {
936  /* ************************ Phase 2 *************************** */
937  /* Calculate the sums of all H rows via sgemv(H, x) */
938  cblas_sgemv(CblasColMajor, CblasNoTrans, k, n, 1.0, H, k, x, 1, 0.0, y, 1);
939 
940  /* L=W*H */
941  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
942 
943  /* L=A(./)L*/
944  sdiv_x86(m*n, A, L);
945 
946  /* B=L*H' */
947  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, k, n, 1.0, L, m, H, k, 0.0, B, m);
948 
949  /* B(i,j)=B(i,j)/y(j) for all B elements */
950  /* W=W(.*)B */
951  supdate2W_x86(m, k, y, B, W);
952  }
953  #ifdef With_MKL
954  mkl_free(B);
955  mkl_free(L);
956  mkl_free(x);
957  mkl_free(y);
958  #else
959  #ifdef With_ARM
960  free(B);
961  free(L);
962  free(x);
963  free(y);
964  #else
965  _mm_free(B);
966  _mm_free(L);
967  _mm_free(x);
968  _mm_free(y);
969  #endif
970  #endif
971  break;
972 
973  case UpdateH:
974  #ifdef With_MKL
975  B = (float *)mkl_malloc(n * k * sizeof(float), WRDLEN);
976  L = (float *)mkl_malloc(m * n * sizeof(float), WRDLEN);
977  x = (float *)mkl_malloc( m * sizeof(float), WRDLEN);
978  y = (float *)mkl_malloc( k * sizeof(float), WRDLEN);
979  #else
980  #ifdef With_ARM
981  B = (float *)malloc(n * k * sizeof(float));
982  L = (float *)malloc(m * n * sizeof(float));
983  x = (float *)malloc( m * sizeof(float));
984  y = (float *)malloc( k * sizeof(float));
985  #else
986  B = (float *)_mm_malloc(n * k * sizeof(float), WRDLEN);
987  L = (float *)_mm_malloc(m * n * sizeof(float), WRDLEN);
988  x = (float *)_mm_malloc( m * sizeof(float), WRDLEN);
989  y = (float *)_mm_malloc( k * sizeof(float), WRDLEN);
990  #endif
991  #endif
992 
993  if (B == NULL || L == NULL || x == NULL || y ==NULL)
994  return -1;
995 
996  /* x[i]=1.0 for all i */
997  smemset_x86(m, x, 1.0);
998 
999  for(i=0;i<nIter;i++)
1000  {
1001  /* ************************ Phase 1 *************************** */
1002  /* Calculate the sums of all W columns via sgemv(W, x) */
1003  cblas_sgemv(CblasColMajor, CblasTrans, m, k, 1.0, W, m, x, 1, 0.0, y, 1);
1004 
1005  /* L=W*H */
1006  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
1007 
1008  /* L=A(./)L*/
1009  sdiv_x86(m*n, A, L);
1010 
1011  /* B=W'*L */
1012  cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, n, m, 1.0, W, m, L, m, 0.0, B, k);
1013 
1014  /* B(i,j)=B(i,j)/y(i) for all B elements */
1015  /* H=H(.*)B */
1016  supdate2H_x86(k, n, y, B, H);
1017  }
1018  #ifdef With_MKL
1019  mkl_free(B);
1020  mkl_free(L);
1021  mkl_free(x);
1022  mkl_free(y);
1023  #else
1024  #ifdef With_ARM
1025  free(B);
1026  free(L);
1027  free(x);
1028  free(y);
1029  #else
1030  _mm_free(B);
1031  _mm_free(L);
1032  _mm_free(x);
1033  _mm_free(y);
1034  #endif
1035  #endif
1036  break;
1037 
1038  default:
1039  return -1;
1040  }
1041  return 0;
1042 }
1043 
1044 
/**
 * dbdiv_cpu: front-end dispatcher for the double-precision
 * beta-divergence NNMF drivers.
 *
 * Rejects negative beta or non-positive iteration counts, then routes
 * to the specialised solver: beta==2 (least squares), beta==1 (KL) or
 * the generic beta-divergence implementation.
 *
 * Returns the callee's status, or -1 on invalid arguments.
 */
int dbdiv_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const double beta, const int uType, const int nIter)
{
   if (beta < 0.0 || nIter <= 0)
      return -1;

   if (beta == 2.0)
      return dmlsa_cpu(m, n, k, A, W, H, uType, nIter);

   if (beta == 1.0)
      return dbdivone_cpu(m, n, k, A, W, H, uType, nIter);

   return dbdivg_cpu(m, n, k, A, W, H, beta, uType, nIter);
}
1074 
1075 
/**
 * sbdiv_cpu: front-end dispatcher for the single-precision
 * beta-divergence NNMF drivers.
 *
 * Rejects negative beta or non-positive iteration counts, then routes
 * to the specialised solver: beta==2 (least squares), beta==1 (KL) or
 * the generic beta-divergence implementation.
 *
 * Returns the callee's status, or -1 on invalid arguments.
 */
int sbdiv_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const float beta, const int uType, const int nIter)
{
   if (beta < 0.0 || nIter <= 0)
      return -1;

   if (beta == 2.0)
      return smlsa_cpu(m, n, k, A, W, H, uType, nIter);

   if (beta == 1.0)
      return sbdivone_cpu(m, n, k, A, W, H, uType, nIter);

   return sbdivg_cpu(m, n, k, A, W, H, beta, uType, nIter);
}
1104 
1116 void dkernelH_x86(const int m, const int n, const double *L, const double *A, double *__restrict__ R, const double expo)
1117 {
1118  int i, size;
1119 
1120  size=m*n;
1121 
1122  #ifdef With_ICC
1123  #pragma loop_count min=16
1124  #pragma simd
1125  #else
1126  #ifdef With_OMP
1127  #pragma omp parallel for
1128  #endif
1129  #endif
1130  for(i=0; i<size; i++)
1131  {
1132  double dtmp1, dtmp2;
1133 
1134  if (L[i]>=0.0 && L[i]<=0.0)
1135  R[i] = R[i+size] = 0.0;
1136  else
1137  {
1138  dtmp1=L[i];
1139  dtmp2=pow(dtmp1, expo);
1140 
1141  /* ask for A[i]=0.0 don't give improvements. We don't do it */
1142  R[i] =dtmp2 * A[i];
1143  R[i+size]=dtmp1 * dtmp2;
1144  }
1145  }
1146 }
1147 
1148 
1160 void skernelH_x86(const int m, const int n, const float *L, const float *A, float *__restrict__ R, const float expo)
1161 {
1162  int i, size;
1163 
1164  size=m*n;
1165 
1166  #ifdef With_ICC
1167  #pragma loop_count min=16
1168  #pragma simd
1169  #else
1170  #ifdef With_OMP
1171  #pragma omp parallel for
1172  #endif
1173  #endif
1174  for(i=0; i<size; i++)
1175  {
1176  float ftmp1, ftmp2;
1177 
1178  if (L[i]>=0.0 && L[i]<=0.0)
1179  R[i] = R[i+size] = 0.0;
1180  else
1181  {
1182  ftmp1=L[i];
1183  ftmp2=powf(ftmp1, expo);
1184 
1185  /* ask for A[i]=0.0 don't give improvements. We don't do it */
1186  R[i] =ftmp2 * A[i];
1187  R[i+size]=ftmp1 * ftmp2;
1188  }
1189  }
1190 }
1191 
1192 
1204 void dkernelW_x86(const int m, const int n, const double *L, const double *A, double *__restrict__ R, const double expo)
1205 {
1206  int i;
1207 
1208  #ifdef With_ICC
1209  #pragma parallel
1210  #pragma loop_count min=16
1211  #pragma simd
1212  #else
1213  #ifdef With_OMP
1214  #pragma omp parallel for
1215  #endif
1216  #endif
1217  for(i=0; i<m*n; i++)
1218  {
1219  int pos;
1220  double dtmp1, dtmp2;
1221 
1222  pos=2*m*(i/m)+(i%m);
1223 
1224  if (L[i]>=0.0 && L[i]<=0.0)
1225  R[pos] = R[pos+m] = 0.0;
1226  else
1227  {
1228  dtmp1=L[i];
1229  dtmp2=pow(dtmp1, expo);
1230 
1231  /* ask for A[i]=0.0 don't give improvements. We don't do it */
1232  R[pos] =dtmp2 * A[i];
1233  R[pos+m]=dtmp1 * dtmp2;
1234  }
1235  }
1236 }
1237 
1238 
1250 void skernelW_x86(const int m, const int n, const float *L, const float *A, float *__restrict__ R, const float expo)
1251 {
1252  int i;
1253 
1254  #ifdef With_ICC
1255  #pragma parallel
1256  #pragma loop_count min=16
1257  #pragma simd
1258  #else
1259  #ifdef With_OMP
1260  #pragma omp parallel for
1261  #endif
1262  #endif
1263  for(i=0; i<m*n; i++)
1264  {
1265  int pos;
1266  float ftmp1, ftmp2;
1267 
1268  pos=2*m*(i/m)+(i%m);
1269 
1270  if (L[i]>=0.0 && L[i]<=0.0)
1271  R[pos] = R[pos+m] = 0.0;
1272  else
1273  {
1274  ftmp1=L[i];
1275  ftmp2=powf(ftmp1, expo);
1276 
1277  /* ask for A[i]=0.0 don't give improvements. We don't do it */
1278  R[pos] =ftmp2 * A[i];
1279  R[pos+m]=ftmp1 * ftmp2;
1280  }
1281  }
1282 }
1283 
1284 
/**
 * dupdate1H_x86: apply the multiplicative H update (double).
 *
 * X packs numerator and denominator back to back (length 2n):
 * H[i] *= X[i] / X[i+n] for i in [0, n).
 * NaN/Inf can appear if X[i+n], H[i] or X[i] is zero; with With_Check
 * defined, an assert catches non-finite results.
 */
void dupdate1H_x86(const int n, const double *X, double *__restrict__ H)
{
   int j;

   #ifdef With_ICC
   #pragma loop_count min=32
   #pragma simd
   #else
   #ifdef With_OMP
   #pragma omp parallel for
   #endif
   #endif
   for (j = 0; j < n; j++)
   {
      H[j] = H[j] * (X[j] / X[j+n]);
      #ifdef With_Check
      assert(isfinite(H[j]));
      #endif
   }
}
1316 
/**
 * supdate1H_x86: apply the multiplicative H update (float).
 *
 * X packs numerator and denominator back to back (length 2n):
 * H[i] *= X[i] / X[i+n] for i in [0, n).
 * NaN/Inf can appear if X[i+n], H[i] or X[i] is zero; with With_Check
 * defined, an assert catches non-finite results.
 */
void supdate1H_x86(const int n, const float *X, float *__restrict__ H)
{
   int j;

   #ifdef With_ICC
   #pragma loop_count min=32
   #pragma simd
   #else
   #ifdef With_OMP
   #pragma omp parallel for
   #endif
   #endif
   for (j = 0; j < n; j++)
   {
      H[j] = H[j] * (X[j] / X[j+n]);
      #ifdef With_Check
      assert(isfinite(H[j]));
      #endif
   }
}
1348 
1349 
/**
 * dupdate1W_x86: apply the multiplicative W update (double).
 *
 * W is m x n; X is 2m x n with numerator and denominator stacked per
 * column: W[i] *= X[pos] / X[pos+m] where pos maps entry i of W onto
 * the top half of its column in X.
 * With With_Check defined, an assert catches non-finite results.
 */
void dupdate1W_x86(const int m, const int n, const double *X, double *__restrict__ W)
{
   int idx;

   #ifdef With_ICC
   #pragma loop_count min=16
   #pragma simd
   #else
   #ifdef With_OMP
   #pragma omp parallel for
   #endif
   #endif
   for (idx = 0; idx < m*n; idx++)
   {
      /* skip one extra m-block per column of the 2m x n matrix X */
      const int pos = idx + (idx/m)*m;

      W[idx] = W[idx] * (X[pos] / X[pos+m]);
      #ifdef With_Check
      assert(isfinite(W[idx]));
      #endif
   }
}
1386 
1387 
void supdate1W_x86(const int m, const int n, const float *X, float *__restrict__ W)
{
   /* Multiplicative update of the m-by-n matrix W, single precision:
      W(i) = W(i) * X(pos)/X(pos+m), where pos skips a column of X for
      every column of W (X interleaves numerator and denominator columns). */
   int j;

   #ifdef With_ICC
      #pragma loop_count min=16
      #pragma simd
   #else
      #ifdef With_OMP
         #pragma omp parallel for
      #endif
   #endif
   for(j=0; j<m*n; j++)
   {
      /* Column of W this element belongs to */
      int col = j / m;
      int pos = j + col*m;

      /* Can yield NaN/Inf when X(pos+m), W(j) and/or X(pos) is 0 */
      W[j] *= X[pos] / X[pos+m];
      #ifdef With_Check
         assert(isfinite(W[j]));
      #endif
   }
}
1424 
1425 
void dupdate2H_x86(const int m, const int n, const double *y, const double *B, double *__restrict__ H)
{
   /* Double precision update H(i) = H(i) * B(i)/y(i mod m): every element
      of a row of H is scaled by the same entry of the length-m vector y. */
   int j;

   #ifdef With_ICC
      #pragma loop_count min=32
      #pragma simd
   #else
      #ifdef With_OMP
         #pragma omp parallel for
      #endif
   #endif
   for(j=0; j<m*n; j++)
   {
      /* Can yield NaN/Inf when y(j%m), H(j) and/or B(j) is 0 */
      H[j] *= B[j] / y[j%m];
      #ifdef With_Check
         assert(isfinite(H[j]));
      #endif
   }
}
1458 
1459 
void supdate2H_x86(const int m, const int n, const float *y, const float *B, float *__restrict__ H)
{
   /* Single precision update H(i) = H(i) * B(i)/y(i mod m): every element
      of a row of H is scaled by the same entry of the length-m vector y. */
   int j;

   #ifdef With_ICC
      #pragma loop_count min=32
      #pragma simd
   #else
      #ifdef With_OMP
         #pragma omp parallel for
      #endif
   #endif
   for(j=0; j<m*n; j++)
   {
      /* Can yield NaN/Inf when y(j%m), H(j) and/or B(j) is 0 */
      H[j] *= B[j] / y[j%m];
      #ifdef With_Check
         assert(isfinite(H[j]));
      #endif
   }
}
1492 
1493 
void dupdate2W_x86(const int m, const int n, const double *y, const double *B, double *__restrict__ W)
{
   /* Double precision update W(i) = W(i) * B(i)/y(i div m): every element
      of a column of W is scaled by the same entry of the vector y. */
   int j;

   #ifdef With_ICC
      #pragma loop_count min=32
      #pragma simd
   #else
      #ifdef With_OMP
         #pragma omp parallel for
      #endif
   #endif
   for(j=0; j<m*n; j++)
   {
      /* Can yield NaN/Inf when y(j/m), W(j) and/or B(j) is 0 */
      W[j] *= B[j] / y[j/m];
      #ifdef With_Check
         assert(isfinite(W[j]));
      #endif
   }
}
1526 
1527 
void supdate2W_x86(const int m, const int n, const float *y, const float *B, float *__restrict__ W)
{
   /* Single precision update W(i) = W(i) * B(i)/y(i div m): every element
      of a column of W is scaled by the same entry of the vector y. */
   int j;

   #ifdef With_ICC
      #pragma loop_count min=32
      #pragma simd
   #else
      #ifdef With_OMP
         #pragma omp parallel for
      #endif
   #endif
   for(j=0; j<m*n; j++)
   {
      /* Can yield NaN/Inf when y(j/m), W(j) and/or B(j) is 0 */
      W[j] *= B[j] / y[j/m];
      #ifdef With_Check
         assert(isfinite(W[j]));
      #endif
   }
}
1560 
1569 void derrorbd0_x86(const int n, const double *x, double *__restrict__ y)
1570 {
1571  int i;
1572 
1573  #ifdef With_ICC
1574  #pragma loop_count min=16
1575  #pragma simd
1576  #else
1577  #ifdef With_OMP
1578  #pragma omp parallel for
1579  #endif
1580  #endif
1581  for (i=0; i<n; i++)
1582  {
1583  #ifdef With_Check
1584  /* Here we can have NaN and Inf if y(i) and/or x(i)=0 */
1585  y[i]=(x[i]/y[i]) - log(x[i]/y[i]) - 1.0;
1586  assert(isfinite(y[i]));
1587  #else
1588  y[i]=(x[i]/y[i]) - log(x[i]/y[i]) - 1.0;
1589  #endif
1590  }
1591 }
1592 
1593 
1602 void serrorbd0_x86(const int n, const float *x, float *__restrict__ y)
1603 {
1604  int i;
1605 
1606  #ifdef With_ICC
1607  #pragma loop_count min=16
1608  #pragma simd
1609  #else
1610  #ifdef With_OMP
1611  #pragma omp parallel for
1612  #endif
1613  #endif
1614  for (i=0; i<n; i++)
1615  {
1616  #ifdef With_Check
1617  /* Here we can have NaN and Inf if y(i) and/or x(i)=0 */
1618  y[i]=(x[i]/y[i]) - logf(x[i]/y[i]) - 1.0f;
1619  assert(isfinite(y[i]));
1620  #else
1621  y[i]=(x[i]/y[i]) - logf(x[i]/y[i]) - 1.0f;
1622  #endif
1623  }
1624 }
1625 
1626 
1635 void derrorbd1_x86(const int n, const double *x, double *__restrict__ y)
1636 {
1637  int i;
1638 
1639  #ifdef With_ICC
1640  #pragma loop_count min=16
1641  #pragma simd
1642  #else
1643  #ifdef With_OMP
1644  #pragma omp parallel for
1645  #endif
1646  #endif
1647  for (i=0; i<n; i++)
1648  {
1649  #ifdef With_Check
1650  /* Here we can have NaN and Inf if y(i) and/or x(i)=0 */
1651  y[i]=(x[i]*log(x[i]/y[i])) + y[i] - x[i];
1652  assert(isfinite(y[i]));
1653  #else
1654  y[i]=(x[i]*log(x[i]/y[i])) + y[i] - x[i];
1655  #endif
1656  }
1657 }
1658 
1659 
1668 void serrorbd1_x86(const int n, const float *x, float *__restrict__ y)
1669 {
1670  int i;
1671 
1672  #ifdef With_ICC
1673  #pragma loop_count min=16
1674  #pragma simd
1675  #else
1676  #ifdef With_OMP
1677  #pragma omp parallel for
1678  #endif
1679  #endif
1680  for (i=0; i<n; i++)
1681  {
1682  #ifdef With_Check
1683  /* Here we can have NaN and Inf if y(i) and/or x(i)=0 */
1684  y[i]=(x[i]*logf(x[i]/y[i])) + y[i] - x[i];
1685  assert(isfinite(y[i]));
1686  #else
1687  y[i]=(x[i]*logf(x[i]/y[i])) + y[i] - x[i];
1688  #endif
1689  }
1690 }
1691 
1692 
1702 void derrorbdg_x86(const int n, const double *x, double *__restrict__ y, const double beta)
1703 {
1704  int i;
1705 
1706  double
1707  dbeta, dtmp;
1708 
1709  dbeta=beta-1.0;
1710  dtmp =beta*dbeta;
1711 
1712  #ifdef With_ICC
1713  #pragma loop_count min=16
1714  #pragma simd
1715  #else
1716  #ifdef With_OMP
1717  #pragma omp parallel for
1718  #endif
1719  #endif
1720  for (i=0; i<n; i++)
1721  {
1722  #ifdef With_Check
1723  y[i]=(pow(x[i],beta) + (dbeta*pow(y[i],beta)) - (beta*x[i]*pow(y[i],dbeta))) / dtmp;
1724  assert(isfinite(y[i]));
1725  #else
1726  y[i]=(pow(x[i],beta) + (dbeta*pow(y[i],beta)) - (beta*x[i]*pow(y[i],dbeta))) / dtmp;
1727  #endif
1728  }
1729 }
1730 
1731 
1741 void serrorbdg_x86(const int n, const float *x, float *__restrict__ y, const float beta)
1742 {
1743  int i;
1744 
1745  float
1746  fbeta, ftmp;
1747 
1748  fbeta=beta-1.0f;
1749  ftmp =beta*fbeta;
1750 
1751  #ifdef With_ICC
1752  #pragma loop_count min=16
1753  #pragma simd
1754  #else
1755  #ifdef With_OMP
1756  #pragma omp parallel for
1757  #endif
1758  #endif
1759  for (i=0; i<n; i++)
1760  {
1761  #ifdef With_Check
1762  y[i]=(powf(x[i],beta) + (fbeta*powf(y[i],beta)) - (beta*x[i]*powf(y[i],fbeta))) / ftmp;
1763  assert(isfinite(y[i]));
1764  #else
1765  y[i]=(powf(x[i],beta) + (fbeta*powf(y[i],beta)) - (beta*x[i]*powf(y[i],fbeta))) / ftmp;
1766  #endif
1767  }
1768 }
1769 
1770 
1782 double derrorbd_x86(const int m, const int n, const int k, const double *A, const double *W, const double *H, const double beta)
1783 {
1784  double
1785  error=0.0,
1786  *tmp=NULL;
1787 
1788  #ifdef With_MKL
1789  tmp = (double *)mkl_malloc(m*n*sizeof(double), WRDLEN);
1790  #else
1791  #ifdef With_ARM
1792  tmp = (double *)(m*n*sizeof(double));
1793  #else
1794  tmp = (double *)_mm_malloc(m*n*sizeof(double), WRDLEN);
1795  #endif
1796  #endif
1797 
1798  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, tmp, m);
1799 
1800  if (beta>=0.0 && beta<=0.0)
1801  derrorbd0_x86(m*n, A, tmp);
1802  else
1803  {
1804  if (beta>=1.0 && beta<=1.0)
1805  derrorbd1_x86(m*n, A, tmp);
1806  else
1807  derrorbdg_x86(m*n, A, tmp, beta);
1808  }
1809 
1810  /* Old */
1811  /* #ifdef With_ICC */
1812  /* #pragma loop_count min=512 */
1813  /* #pragma simd reduction(+ : error) */
1814  /* #else */
1815  /* #pragma omp parallel for reduction(+ : error) */
1816  /* #endif */
1817  /* for (i=0; i<m*n; i++) */
1818  /* error += tmp[i]; */
1819  /* all tmp elements are >=0 so we use MKL/BLAS */
1820  error=cblas_dasum(m*n, tmp, 1);
1821 
1822  error=sqrt((2.0*error)/((double)m*n));
1823 
1824  #ifdef With_MKL
1825  mkl_free(tmp);
1826  #else
1827  #ifdef With_ARM
1828  free(tmp);
1829  #else
1830  _mm_free(tmp);
1831  #endif
1832  #endif
1833 
1834  return error;
1835 }
1836 
1837 
1849 float serrorbd_x86(const int m, const int n, const int k, const float *A, const float *W, const float *H, const float beta)
1850 {
1851  float
1852  error=0.0,
1853  *tmp=NULL;
1854 
1855  #ifdef With_MKL
1856  tmp = (float *)mkl_malloc(m*n*sizeof(float), WRDLEN);
1857  #else
1858  #ifdef With_ARM
1859  tmp = (float *)malloc(m*n*sizeof(float));
1860  #else
1861  tmp = (float *)_mm_malloc(m*n*sizeof(float), WRDLEN);
1862  #endif
1863  #endif
1864 
1865  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, tmp, m);
1866 
1867  if (beta>=0.0 && beta<=0.0)
1868  serrorbd0_x86(m*n, A, tmp);
1869  else
1870  {
1871  if (beta>=1.0 && beta<=1.0)
1872  serrorbd1_x86(m*n, A, tmp);
1873  else
1874  serrorbdg_x86(m*n, A, tmp, beta);
1875  }
1876 
1877  /* Old */
1878  /* #ifdef With_ICC */
1879  /* #pragma loop_count min=512 */
1880  /* #pragma simd reduction(+ : error) */
1881  /* #else */
1882  /* #pragma omp parallel for reduction(+ : error) */
1883  /* #endif */
1884  /* for (i=0; i<m*n; i++) */
1885  /* error += tmp[i]; */
1886  /* all tmp elements are >=0 so we use MKL/BLAS */
1887  error=cblas_sasum(m*n, tmp, 1);
1888 
1889  error=sqrtf((2.0f*error)/((float)m*n));
1890 
1891  #ifdef With_MKL
1892  mkl_free(tmp);
1893  #else
1894  #ifdef With_ARM
1895  free(tmp);
1896  #else
1897  _mm_free(tmp);
1898  #endif
1899  #endif
1900 
1901  return error;
1902 }
1903 
void serrorbdg_x86(const int n, const float *x, float *__restrict__ y, const float beta)
This function performs auxiliar simple precision operations when error is computed using betadivergen...
Definition: bdiv_cpu.c:1741
void dupdate1W_x86(const int m, const int n, const double *X, double *__restrict__ W)
Definition: bdiv_cpu.c:1359
void dupdate2W_x86(const int m, const int n, const double *y, const double *B, double *__restrict__ W)
This function performs double precision W(i)=W(i)*(B(i)/y(j))
Definition: bdiv_cpu.c:1503
void supdate2W_x86(const int m, const int n, const float *y, const float *B, float *__restrict__ W)
This function computes simple precision W(i)=W(i)*(B(i)/y(j))
Definition: bdiv_cpu.c:1537
void skernelW_x86(const int m, const int n, const float *L, const float *A, float *__restrict__ R, const float expo)
This function computes simple precision R(pos)=L(i)^expo)*A(i) and R(pos+m)=L(i)*(L(i)^expo) Note exp...
Definition: bdiv_cpu.c:1250
void dkernelH_x86(const int m, const int n, const double *L, const double *A, double *__restrict__ R, const double expo)
Definition: bdiv_cpu.c:1116
int dmlsa_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
dmlsa_cpu performs NNMF using betadivergence when beta=2 using double precision
Definition: mlsa_cpu.c:74
int dbdiv_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const double beta, const int uType, const int nIter)
dbdiv_cpu is a wrapper that calls the adequate function to performs NNMF using betadivergence using d...
Definition: bdiv_cpu.c:1059
void derrorbd0_x86(const int n, const double *x, double *__restrict__ y)
This function performs auxiliar double precision operations when error is computed using betadivergen...
Definition: bdiv_cpu.c:1569
void derrorbd1_x86(const int n, const double *x, double *__restrict__ y)
This function performs auxiliar double precision operations when error is computed using betadivergen...
Definition: bdiv_cpu.c:1635
void derrorbdg_x86(const int n, const double *x, double *__restrict__ y, const double beta)
This function performs auxiliar double precision operations when error is computed using betadivergen...
Definition: bdiv_cpu.c:1702
int sbdiv_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const float beta, const int uType, const int nIter)
Definition: bdiv_cpu.c:1089
int smlsa_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
smlsa_cpu performs NNMF using betadivergence when beta=2 using simple precision
Definition: mlsa_cpu.c:275
void supdate1H_x86(const int n, const float *X, float *__restrict__ H)
This function computes simple precision H(i)=H(i)*B(i)/C(i) where matrices B and C are stored in the ...
Definition: bdiv_cpu.c:1325
int dbdivone_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
dbdivone_cpu performs NNMF using beta-divergence when beta=1, using double precision ...
Definition: bdiv_cpu.c:566
void supdate1W_x86(const int m, const int n, const float *X, float *__restrict__ W)
This function computes simple precision W[i]=W[i]*D[i]/E[i] where matrices D and E are stored in the ...
Definition: bdiv_cpu.c:1397
double derrorbd_x86(const int m, const int n, const int k, const double *A, const double *W, const double *H, const double beta)
This function returns double precision error when error is computed using betadivergence error formul...
Definition: bdiv_cpu.c:1782
void ddiv_x86(const int n, const double *x, double *__restrict__ y)
This function calls the appropriate functions to perform double precision element-wise y[i]=x[i]/y[i] ...
Definition: utils_x86.c:91
int sbdivg_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const float expo, const int uType, int nIter)
sbdivg_cpu performs NNMF using betadivergence for general case (beta <> 1 and 2) using simple precisi...
Definition: bdiv_cpu.c:313
int dbdivg_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const double expo, const int uType, int nIter)
dbdivg_cpu performs the NNMF using beta-divergence when beta is != 1 and !=2, using double precision...
Definition: bdiv_cpu.c:87
int sbdivone_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
sbdivone_cpu performs NNMF using betadivergence when beta=1 using simple precision ...
Definition: bdiv_cpu.c:813
void serrorbd0_x86(const int n, const float *x, float *__restrict__ y)
This function performs auxiliar simple precision operations when error is computed using betadivergen...
Definition: bdiv_cpu.c:1602
void sdiv_x86(const int n, const float *x, float *__restrict__ y)
This function calls the appropriate functions to perform simple precision element-wise x[i]=x[i]/y[i] ...
Definition: utils_x86.c:127
void serrorbd1_x86(const int n, const float *x, float *__restrict__ y)
This function performs auxiliar simple precision operations when error is computed using betadivergen...
Definition: bdiv_cpu.c:1668
void skernelH_x86(const int m, const int n, const float *L, const float *A, float *__restrict__ R, const float expo)
This function computes simple precision R(i)=(L(i)^expo)*A[i] and R(i+m*n)=L[i]*(L(i)^expo) Note "exp...
Definition: bdiv_cpu.c:1160
float serrorbd_x86(const int m, const int n, const int k, const float *A, const float *W, const float *H, const float beta)
This function returns simple precision error when error is computed using betadivergence error formul...
Definition: bdiv_cpu.c:1849
void dupdate2H_x86(const int m, const int n, const double *y, const double *B, double *__restrict__ H)
This function computes double precision H(i)=H(i)*(B(i)/y(j))
Definition: bdiv_cpu.c:1435
void smemset_x86(const int n, float *__restrict__ x, const float val)
This function fills all positions of x with val.
Definition: utils_x86.c:66
void supdate2H_x86(const int m, const int n, const float *y, const float *B, float *__restrict__ H)
This function performs the simple H(i)=H(i)*(B(i)/y(j))
Definition: bdiv_cpu.c:1469
Header file for using the betadivergence cuda functions with CPU.
void dkernelW_x86(const int m, const int n, const double *L, const double *A, double *__restrict__ R, const double expo)
This function computes double precision R(pos)=L(i)^expo)*A(i) and R(pos+m)=L(i)*(L(i)^expo) Note exp...
Definition: bdiv_cpu.c:1204
void dupdate1H_x86(const int n, const double *X, double *__restrict__ H)
This function computes double precision H(i)=H(i)*B(i)/C(i) where matrices B and C are stored in the ...
Definition: bdiv_cpu.c:1293
void dmemset_x86(const int n, double *__restrict__ x, const double val)
This function fills all positions of x with val.
Definition: utils_x86.c:42