87 int dbdivg_cpu(
const int m,
const int n,
const int k,
const double *A,
double *W,
double *H,
const double expo,
const int uType,
int nIter)
100 M = (
double *)mkl_malloc(2*max(m,n)* k *
sizeof(double), WRDLEN);
101 L = (
double *)mkl_malloc( m * n *
sizeof(
double), WRDLEN);
102 R = (
double *)mkl_malloc( 2*m * n *
sizeof(
double), WRDLEN);
105 M = (
double *)malloc(2*max(m,n)* k *
sizeof(double));
106 L = (
double *)malloc( m * n *
sizeof(
double));
107 R = (
double *)malloc( 2*m * n *
sizeof(
double));
109 M = (
double *)_mm_malloc(2*max(m,n)* k *
sizeof(double), WRDLEN);
110 L = (
double *)_mm_malloc( m * n *
sizeof(
double), WRDLEN);
111 R = (
double *)_mm_malloc( 2*m * n *
sizeof(
double), WRDLEN);
115 if (L == NULL || M == NULL || R == NULL)
122 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
133 cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, 2*n, m, 1.0, W, m, R, m, 0.0, M, k);
141 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
154 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, 2*m, k, n, 1.0, R, 2*m, H, k, 0.0, M, 2*m);
178 M = (
double *)mkl_malloc(2*m * k *
sizeof(
double), WRDLEN);
179 L = (
double *)mkl_malloc( m * n *
sizeof(
double), WRDLEN);
180 R = (
double *)mkl_malloc(2*m * n *
sizeof(
double), WRDLEN);
183 M = (
double *)malloc(2*m * k *
sizeof(
double));
184 L = (
double *)malloc( m * n *
sizeof(
double));
185 R = (
double *)malloc(2*m * n *
sizeof(
double));
187 M = (
double *)_mm_malloc(2*m * k *
sizeof(
double), WRDLEN);
188 L = (
double *)_mm_malloc( m * n *
sizeof(
double), WRDLEN);
189 R = (
double *)_mm_malloc(2*m * n *
sizeof(
double), WRDLEN);
193 if (L == NULL || M == NULL || R == NULL )
200 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
213 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, 2*m, k, n, 1.0, R, 2*m, H, k, 0.0, M, 2*m);
237 M = (
double *)mkl_malloc(2*n * k *
sizeof(
double), WRDLEN);
238 L = (
double *)mkl_malloc( m * n *
sizeof(
double), WRDLEN);
239 R = (
double *)mkl_malloc(2*m * n *
sizeof(
double), WRDLEN);
242 M = (
double *)malloc(2*n * k *
sizeof(
double));
243 L = (
double *)malloc( m * n *
sizeof(
double));
244 R = (
double *)malloc(2*m * n *
sizeof(
double));
246 M = (
double *)_mm_malloc(2*n * k *
sizeof(
double), WRDLEN);
247 L = (
double *)_mm_malloc( m * n *
sizeof(
double), WRDLEN);
248 R = (
double *)_mm_malloc(2*m * n *
sizeof(
double), WRDLEN);
252 if (L == NULL || M == NULL || R == NULL )
259 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
270 cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, 2*n, m, 1.0, W, m, R, m, 0.0, M, k);
313 int sbdivg_cpu(
const int m,
const int n,
const int k,
const float *A,
float *W,
float *H,
const float expo,
const int uType,
int nIter)
326 M = (
float *)mkl_malloc(2*max(m,n) * k *
sizeof(float), WRDLEN);
327 L = (
float *)mkl_malloc( m * n *
sizeof(
float), WRDLEN);
328 R = (
float *)mkl_malloc( 2*m * n *
sizeof(
float), WRDLEN);
331 M = (
float *)malloc(2*max(m,n) * k *
sizeof(float));
332 L = (
float *)malloc( m * n *
sizeof(
float));
333 R = (
float *)malloc( 2*m * n *
sizeof(
float));
335 M = (
float *)_mm_malloc(2*max(m,n) * k *
sizeof(float), WRDLEN);
336 L = (
float *)_mm_malloc( m * n *
sizeof(
float), WRDLEN);
337 R = (
float *)_mm_malloc( 2*m * n *
sizeof(
float), WRDLEN);
341 if (L == NULL || M == NULL || R == NULL )
348 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
359 cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, 2*n, m, 1.0, W, m, R, m, 0.0, M, k);
367 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
380 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, 2*m, k, n, 1.0, R, 2*m, H, k, 0.0, M, 2*m);
404 M = (
float *)mkl_malloc(2*m * k *
sizeof(
float), WRDLEN);
405 L = (
float *)mkl_malloc( m * n *
sizeof(
float), WRDLEN);
406 R = (
float *)mkl_malloc(2*m * n *
sizeof(
float), WRDLEN);
409 M = (
float *)malloc(2*m * k *
sizeof(
float));
410 L = (
float *)malloc( m * n *
sizeof(
float));
411 R = (
float *)malloc(2*m * n *
sizeof(
float));
413 M = (
float *)_mm_malloc(2*m * k *
sizeof(
float), WRDLEN);
414 L = (
float *)_mm_malloc( m * n *
sizeof(
float), WRDLEN);
415 R = (
float *)_mm_malloc(2*m * n *
sizeof(
float), WRDLEN);
419 if (L == NULL || M == NULL || R == NULL )
426 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
439 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, 2*m, k, n, 1.0, R, 2*m, H, k, 0.0, M, 2*m);
463 M = (
float *)mkl_malloc(2*n * k *
sizeof(
float), WRDLEN);
464 L = (
float *)mkl_malloc( m * n *
sizeof(
float), WRDLEN);
465 R = (
float *)mkl_malloc(2*m * n *
sizeof(
float), WRDLEN);
468 M = (
float *)malloc(2*n * k *
sizeof(
float));
469 L = (
float *)malloc( m * n *
sizeof(
float));
470 R = (
float *)malloc(2*m * n *
sizeof(
float));
472 M = (
float *)_mm_malloc(2*n * k *
sizeof(
float), WRDLEN);
473 L = (
float *)_mm_malloc( m * n *
sizeof(
float), WRDLEN);
474 R = (
float *)_mm_malloc(2*m * n *
sizeof(
float), WRDLEN);
478 if (L == NULL || M == NULL || R == NULL )
485 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
496 cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, 2*n, m, 1.0, W, m, R, m, 0.0, M, k);
566 int dbdivone_cpu(
const int m,
const int n,
const int k,
const double *A,
double *W,
double *H,
const int uType,
const int nIter)
582 B = (
double *)mkl_malloc(max(m,n) * k *
sizeof(double), WRDLEN);
583 L = (
double *)mkl_malloc( m * n *
sizeof(
double), WRDLEN);
584 x = (
double *)mkl_malloc( max(m,n) *
sizeof(double), WRDLEN);
585 y = (
double *)mkl_malloc( k *
sizeof(
double), WRDLEN);
588 B = (
double *)malloc(max(m,n) * k *
sizeof(double));
589 L = (
double *)malloc( m * n *
sizeof(
double));
590 x = (
double *)malloc( max(m,n) *
sizeof(double));
591 y = (
double *)malloc( k *
sizeof(
double));
593 B = (
double *)_mm_malloc(max(m,n) * k *
sizeof(double), WRDLEN);
594 L = (
double *)_mm_malloc( m * n *
sizeof(
double), WRDLEN);
595 x = (
double *)_mm_malloc( max(m,n) *
sizeof(double), WRDLEN);
596 y = (
double *)_mm_malloc( k *
sizeof(
double), WRDLEN);
600 if (B == NULL || L == NULL || x == NULL || y ==NULL)
610 cblas_dgemv(CblasColMajor, CblasTrans, m, k, 1.0, W, m, x, 1, 0.0, y, 1);
613 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
619 cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, n, m, 1.0, W, m, L, m, 0.0, B, k);
628 cblas_dgemv(CblasColMajor, CblasNoTrans, k, n, 1.0, H, k, x, 1, 0.0, y, 1);
631 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
637 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, k, n, 1.0, L, m, H, k, 0.0, B, m);
665 B = (
double *)mkl_malloc(m * k *
sizeof(
double), WRDLEN);
666 L = (
double *)mkl_malloc(m * n *
sizeof(
double), WRDLEN);
667 x = (
double *)mkl_malloc( n *
sizeof(
double), WRDLEN);
668 y = (
double *)mkl_malloc( k *
sizeof(
double), WRDLEN);
671 B = (
double *)malloc(m * k *
sizeof(
double));
672 L = (
double *)malloc(m * n *
sizeof(
double));
673 x = (
double *)malloc( n *
sizeof(
double));
674 y = (
double *)malloc( k *
sizeof(
double));
676 B = (
double *)_mm_malloc(m * k *
sizeof(
double), WRDLEN);
677 L = (
double *)_mm_malloc(m * n *
sizeof(
double), WRDLEN);
678 x = (
double *)_mm_malloc( n *
sizeof(
double), WRDLEN);
679 y = (
double *)_mm_malloc( k *
sizeof(
double), WRDLEN);
683 if (B == NULL || L == NULL || x == NULL || y ==NULL)
693 cblas_dgemv(CblasColMajor, CblasNoTrans, k, n, 1.0, H, k, x, 1, 0.0, y, 1);
696 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
702 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, k, n, 1.0, L, m, H, k, 0.0, B, m);
730 B = (
double *)mkl_malloc(n * k *
sizeof(
double), WRDLEN);
731 L = (
double *)mkl_malloc(m * n *
sizeof(
double), WRDLEN);
732 x = (
double *)mkl_malloc( m *
sizeof(
double), WRDLEN);
733 y = (
double *)mkl_malloc( k *
sizeof(
double), WRDLEN);
736 B = (
double *)malloc(n * k *
sizeof(
double));
737 L = (
double *)malloc(m * n *
sizeof(
double));
738 x = (
double *)malloc( m *
sizeof(
double));
739 y = (
double *)malloc( k *
sizeof(
double));
741 B = (
double *)_mm_malloc(n * k *
sizeof(
double), WRDLEN);
742 L = (
double *)_mm_malloc(m * n *
sizeof(
double), WRDLEN);
743 x = (
double *)_mm_malloc( m *
sizeof(
double), WRDLEN);
744 y = (
double *)_mm_malloc( k *
sizeof(
double), WRDLEN);
748 if (B == NULL || L == NULL || x == NULL || y ==NULL)
758 cblas_dgemv(CblasColMajor, CblasTrans, m, k, 1.0, W, m, x, 1, 0.0, y, 1);
761 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
767 cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, n, m, 1.0, W, m, L, m, 0.0, B, k);
813 int sbdivone_cpu(
const int m,
const int n,
const int k,
const float *A,
float *W,
float *H,
const int uType,
const int nIter)
827 B = (
float *)mkl_malloc(max(m,n) * k *
sizeof(float), WRDLEN);
828 L = (
float *)mkl_malloc( m * n *
sizeof(
float), WRDLEN);
829 x = (
float *)mkl_malloc( max(m,n) *
sizeof(float), WRDLEN);
830 y = (
float *)mkl_malloc( k *
sizeof(
float), WRDLEN);
833 B = (
float *)malloc(max(m,n) * k *
sizeof(float));
834 L = (
float *)malloc( m * n *
sizeof(
float));
835 x = (
float *)malloc( max(m,n) *
sizeof(float));
836 y = (
float *)malloc( k *
sizeof(
float));
838 B = (
float *)_mm_malloc(max(m,n) * k *
sizeof(float), WRDLEN);
839 L = (
float *)_mm_malloc( m * n *
sizeof(
float), WRDLEN);
840 x = (
float *)_mm_malloc( max(m,n) *
sizeof(float), WRDLEN);
841 y = (
float *)_mm_malloc( k *
sizeof(
float), WRDLEN);
845 if (B == NULL || L == NULL || x == NULL || y ==NULL)
855 cblas_sgemv(CblasColMajor, CblasTrans, m, k, 1.0, W, m, x, 1, 0.0, y, 1);
858 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
864 cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, n, m, 1.0, W, m, L, m, 0.0, B, k);
873 cblas_sgemv(CblasColMajor, CblasNoTrans, k, n, 1.0, H, k, x, 1, 0.0, y, 1);
876 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
882 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, k, n, 1.0, L, m, H, k, 0.0, B, m);
910 B = (
float *)mkl_malloc(m * k *
sizeof(
float), WRDLEN);
911 L = (
float *)mkl_malloc(m * n *
sizeof(
float), WRDLEN);
912 x = (
float *)mkl_malloc( n *
sizeof(
float), WRDLEN);
913 y = (
float *)mkl_malloc( k *
sizeof(
float), WRDLEN);
916 B = (
float *)malloc(m * k *
sizeof(
float));
917 L = (
float *)malloc(m * n *
sizeof(
float));
918 x = (
float *)malloc( n *
sizeof(
float));
919 y = (
float *)malloc( k *
sizeof(
float));
921 B = (
float *)_mm_malloc(m * k *
sizeof(
float), WRDLEN);
922 L = (
float *)_mm_malloc(m * n *
sizeof(
float), WRDLEN);
923 x = (
float *)_mm_malloc( n *
sizeof(
float), WRDLEN);
924 y = (
float *)_mm_malloc( k *
sizeof(
float), WRDLEN);
928 if (B == NULL || L == NULL || x == NULL || y ==NULL)
938 cblas_sgemv(CblasColMajor, CblasNoTrans, k, n, 1.0, H, k, x, 1, 0.0, y, 1);
941 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
947 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, k, n, 1.0, L, m, H, k, 0.0, B, m);
975 B = (
float *)mkl_malloc(n * k *
sizeof(
float), WRDLEN);
976 L = (
float *)mkl_malloc(m * n *
sizeof(
float), WRDLEN);
977 x = (
float *)mkl_malloc( m *
sizeof(
float), WRDLEN);
978 y = (
float *)mkl_malloc( k *
sizeof(
float), WRDLEN);
981 B = (
float *)malloc(n * k *
sizeof(
float));
982 L = (
float *)malloc(m * n *
sizeof(
float));
983 x = (
float *)malloc( m *
sizeof(
float));
984 y = (
float *)malloc( k *
sizeof(
float));
986 B = (
float *)_mm_malloc(n * k *
sizeof(
float), WRDLEN);
987 L = (
float *)_mm_malloc(m * n *
sizeof(
float), WRDLEN);
988 x = (
float *)_mm_malloc( m *
sizeof(
float), WRDLEN);
989 y = (
float *)_mm_malloc( k *
sizeof(
float), WRDLEN);
993 if (B == NULL || L == NULL || x == NULL || y ==NULL)
1003 cblas_sgemv(CblasColMajor, CblasTrans, m, k, 1.0, W, m, x, 1, 0.0, y, 1);
1006 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
1012 cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, n, m, 1.0, W, m, L, m, 0.0, B, k);
1059 int dbdiv_cpu(
const int m,
const int n,
const int k,
const double *A,
double *W,
double *H,
const double beta,
const int uType,
const int nIter)
1061 if ((beta < 0.0) || (nIter <= 0))
1064 if(beta>=2.0 && beta<=2.0)
1065 return dmlsa_cpu(m, n, k, A, W, H, uType, nIter);
1068 if(beta>=1.0 && beta<=1.0)
1071 return dbdivg_cpu(m, n, k, A, W, H, beta, uType, nIter);
1089 int sbdiv_cpu(
const int m,
const int n,
const int k,
const float *A,
float *W,
float *H,
const float beta,
const int uType,
const int nIter)
1091 if ((beta < 0.0) || (nIter <= 0))
1094 if(beta>=2.0 && beta<=2.0)
1095 return smlsa_cpu(m, n, k, A, W, H, uType, nIter);
1098 if(beta>=1.0 && beta<=1.0)
1101 return sbdivg_cpu(m, n, k, A, W, H, beta, uType, nIter);
1116 void dkernelH_x86(
const int m,
const int n,
const double *L,
const double *A,
double *__restrict__ R,
const double expo)
1123 #pragma loop_count min=16 1127 #pragma omp parallel for 1130 for(i=0; i<size; i++)
1132 double dtmp1, dtmp2;
1134 if (L[i]>=0.0 && L[i]<=0.0)
1135 R[i] = R[i+size] = 0.0;
1139 dtmp2=pow(dtmp1, expo);
1143 R[i+size]=dtmp1 * dtmp2;
1160 void skernelH_x86(
const int m,
const int n,
const float *L,
const float *A,
float *__restrict__ R,
const float expo)
1167 #pragma loop_count min=16 1171 #pragma omp parallel for 1174 for(i=0; i<size; i++)
1178 if (L[i]>=0.0 && L[i]<=0.0)
1179 R[i] = R[i+size] = 0.0;
1183 ftmp2=powf(ftmp1, expo);
1187 R[i+size]=ftmp1 * ftmp2;
1204 void dkernelW_x86(
const int m,
const int n,
const double *L,
const double *A,
double *__restrict__ R,
const double expo)
1210 #pragma loop_count min=16 1214 #pragma omp parallel for 1217 for(i=0; i<m*n; i++)
1220 double dtmp1, dtmp2;
1222 pos=2*m*(i/m)+(i%m);
1224 if (L[i]>=0.0 && L[i]<=0.0)
1225 R[pos] = R[pos+m] = 0.0;
1229 dtmp2=pow(dtmp1, expo);
1232 R[pos] =dtmp2 * A[i];
1233 R[pos+m]=dtmp1 * dtmp2;
1250 void skernelW_x86(
const int m,
const int n,
const float *L,
const float *A,
float *__restrict__ R,
const float expo)
1256 #pragma loop_count min=16 1260 #pragma omp parallel for 1263 for(i=0; i<m*n; i++)
1268 pos=2*m*(i/m)+(i%m);
1270 if (L[i]>=0.0 && L[i]<=0.0)
1271 R[pos] = R[pos+m] = 0.0;
1275 ftmp2=powf(ftmp1, expo);
1278 R[pos] =ftmp2 * A[i];
1279 R[pos+m]=ftmp1 * ftmp2;
1298 #pragma loop_count min=32 1302 #pragma omp parallel for 1309 H[i]=H[i] * (X[i] / X[i+n]);
1310 assert(isfinite(H[i]));
1312 H[i]=H[i] * (X[i] / X[i+n]);
1330 #pragma loop_count min=32 1334 #pragma omp parallel for 1341 H[i]=H[i] * (X[i] / X[i+n]);
1342 assert(isfinite(H[i]));
1344 H[i]=H[i] * (X[i] / X[i+n]);
1359 void dupdate1W_x86(
const int m,
const int n,
const double *X,
double *__restrict__ W)
1364 #pragma loop_count min=16 1368 #pragma omp parallel for 1371 for(i=0; i<m*n; i++)
1379 W[i]=W[i] * (X[pos] / X[pos+m]);
1380 assert(isfinite(W[i]));
1382 W[i]=W[i] * (X[pos] / X[pos+m]);
1397 void supdate1W_x86(
const int m,
const int n,
const float *X,
float *__restrict__ W)
1402 #pragma loop_count min=16 1406 #pragma omp parallel for 1409 for(i=0; i<m*n; i++)
1417 W[i]=W[i] * (X[pos] / X[pos+m]);
1418 assert(isfinite(W[i]));
1420 W[i]=W[i] * (X[pos] / X[pos+m]);
1435 void dupdate2H_x86(
const int m,
const int n,
const double *y,
const double *B,
double *__restrict__ H)
1440 #pragma loop_count min=32 1444 #pragma omp parallel for 1447 for(i=0; i<m*n; i++)
1451 H[i]=H[i] * (B[i] / y[i%m]);
1452 assert(isfinite(H[i]));
1454 H[i]=H[i] * (B[i] / y[i%m]);
1469 void supdate2H_x86(
const int m,
const int n,
const float *y,
const float *B,
float *__restrict__ H)
1474 #pragma loop_count min=32 1478 #pragma omp parallel for 1481 for(i=0; i<m*n; i++)
1485 H[i]=H[i] * (B[i] / y[i%m]);
1486 assert(isfinite(H[i]));
1488 H[i]=H[i] * (B[i] / y[i%m]);
1503 void dupdate2W_x86(
const int m,
const int n,
const double *y,
const double *B,
double *__restrict__ W)
1508 #pragma loop_count min=32 1512 #pragma omp parallel for 1515 for(i=0; i<m*n; i++)
1519 W[i]=W[i] * (B[i] / y[i/m]);
1520 assert(isfinite(W[i]));
1522 W[i]=W[i] * (B[i] / y[i/m]);
1537 void supdate2W_x86(
const int m,
const int n,
const float *y,
const float *B,
float *__restrict__ W)
1542 #pragma loop_count min=32 1546 #pragma omp parallel for 1549 for(i=0; i<m*n; i++)
1553 W[i]=W[i] * (B[i] / y[i/m]);
1554 assert(isfinite(W[i]));
1556 W[i]=W[i] * (B[i] / y[i/m]);
1574 #pragma loop_count min=16 1578 #pragma omp parallel for 1585 y[i]=(x[i]/y[i]) - log(x[i]/y[i]) - 1.0;
1586 assert(isfinite(y[i]));
1588 y[i]=(x[i]/y[i]) - log(x[i]/y[i]) - 1.0;
1607 #pragma loop_count min=16 1611 #pragma omp parallel for 1618 y[i]=(x[i]/y[i]) - logf(x[i]/y[i]) - 1.0f;
1619 assert(isfinite(y[i]));
1621 y[i]=(x[i]/y[i]) - logf(x[i]/y[i]) - 1.0f;
1640 #pragma loop_count min=16 1644 #pragma omp parallel for 1651 y[i]=(x[i]*log(x[i]/y[i])) + y[i] - x[i];
1652 assert(isfinite(y[i]));
1654 y[i]=(x[i]*log(x[i]/y[i])) + y[i] - x[i];
1673 #pragma loop_count min=16 1677 #pragma omp parallel for 1684 y[i]=(x[i]*logf(x[i]/y[i])) + y[i] - x[i];
1685 assert(isfinite(y[i]));
1687 y[i]=(x[i]*logf(x[i]/y[i])) + y[i] - x[i];
1702 void derrorbdg_x86(
const int n,
const double *x,
double *__restrict__ y,
const double beta)
1713 #pragma loop_count min=16 1717 #pragma omp parallel for 1723 y[i]=(pow(x[i],beta) + (dbeta*pow(y[i],beta)) - (beta*x[i]*pow(y[i],dbeta))) / dtmp;
1724 assert(isfinite(y[i]));
1726 y[i]=(pow(x[i],beta) + (dbeta*pow(y[i],beta)) - (beta*x[i]*pow(y[i],dbeta))) / dtmp;
1741 void serrorbdg_x86(
const int n,
const float *x,
float *__restrict__ y,
const float beta)
1752 #pragma loop_count min=16 1756 #pragma omp parallel for 1762 y[i]=(powf(x[i],beta) + (fbeta*powf(y[i],beta)) - (beta*x[i]*powf(y[i],fbeta))) / ftmp;
1763 assert(isfinite(y[i]));
1765 y[i]=(powf(x[i],beta) + (fbeta*powf(y[i],beta)) - (beta*x[i]*powf(y[i],fbeta))) / ftmp;
1782 double derrorbd_x86(
const int m,
const int n,
const int k,
const double *A,
const double *W,
const double *H,
const double beta)
1789 tmp = (
double *)mkl_malloc(m*n*
sizeof(
double), WRDLEN);
1792 tmp = (
double *)(m*n*
sizeof(
double));
1794 tmp = (
double *)_mm_malloc(m*n*
sizeof(
double), WRDLEN);
1798 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, tmp, m);
1800 if (beta>=0.0 && beta<=0.0)
1804 if (beta>=1.0 && beta<=1.0)
1820 error=cblas_dasum(m*n, tmp, 1);
1822 error=sqrt((2.0*error)/((
double)m*n));
1849 float serrorbd_x86(
const int m,
const int n,
const int k,
const float *A,
const float *W,
const float *H,
const float beta)
1856 tmp = (
float *)mkl_malloc(m*n*
sizeof(
float), WRDLEN);
1859 tmp = (
float *)malloc(m*n*
sizeof(
float));
1861 tmp = (
float *)_mm_malloc(m*n*
sizeof(
float), WRDLEN);
1865 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, tmp, m);
1867 if (beta>=0.0 && beta<=0.0)
1871 if (beta>=1.0 && beta<=1.0)
1887 error=cblas_sasum(m*n, tmp, 1);
1889 error=sqrtf((2.0f*error)/((
float)m*n));
void serrorbdg_x86(const int n, const float *x, float *__restrict__ y, const float beta)
This function performs auxiliary simple precision operations when error is computed using betadivergen...
void dupdate1W_x86(const int m, const int n, const double *X, double *__restrict__ W)
void dupdate2W_x86(const int m, const int n, const double *y, const double *B, double *__restrict__ W)
This function performs double precision W(i)=W(i)*(B(i)/y(j))
void supdate2W_x86(const int m, const int n, const float *y, const float *B, float *__restrict__ W)
This function computes simple precision W(i)=W(i)*(B(i)/y(j))
void skernelW_x86(const int m, const int n, const float *L, const float *A, float *__restrict__ R, const float expo)
This function computes simple precision R(pos)=L(i)^expo)*A(i) and R(pos+m)=L(i)*(L(i)^expo) Note exp...
void dkernelH_x86(const int m, const int n, const double *L, const double *A, double *__restrict__ R, const double expo)
int dmlsa_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
dmlsa_cpu performs NNMF using betadivergence when beta=2 using double precision
int dbdiv_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const double beta, const int uType, const int nIter)
dbdiv_cpu is a wrapper that calls the adequate function to performs NNMF using betadivergence using d...
void derrorbd0_x86(const int n, const double *x, double *__restrict__ y)
This function performs auxiliary double precision operations when error is computed using betadivergen...
void derrorbd1_x86(const int n, const double *x, double *__restrict__ y)
This function performs auxiliary double precision operations when error is computed using betadivergen...
void derrorbdg_x86(const int n, const double *x, double *__restrict__ y, const double beta)
This function performs auxiliary double precision operations when error is computed using betadivergen...
int sbdiv_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const float beta, const int uType, const int nIter)
int smlsa_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
smlsa_cpu performs NNMF using betadivergence when beta=2 using simple precision
void supdate1H_x86(const int n, const float *X, float *__restrict__ H)
This function computes simple precision H(i)=H(i)*B(i)/C(i) where matrices B and C are stored in the ...
int dbdivone_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
dbdivone_cpu performs NNMF using beta-divergence when beta=1, using double precision ...
void supdate1W_x86(const int m, const int n, const float *X, float *__restrict__ W)
This function computes simple precision W[i]=W[i]*D[i]/E[i] where matrices D and E are stored in the ...
double derrorbd_x86(const int m, const int n, const int k, const double *A, const double *W, const double *H, const double beta)
This function returns double precision error when error is computed using betadivergence error formul...
void ddiv_x86(const int n, const double *x, double *__restrict__ y)
This function calls the appropriate functions to perform double precision element-wise y[i]=x[i]/y[i] ...
int sbdivg_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const float expo, const int uType, int nIter)
sbdivg_cpu performs NNMF using betadivergence for general case (beta <> 1 and 2) using simple precisi...
int dbdivg_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const double expo, const int uType, int nIter)
dbdivg_cpu performs the NNMF using beta-divergence when beta is != 1 and !=2, using double precision...
int sbdivone_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
sbdivone_cpu performs NNMF using betadivergence when beta=1 using simple precision ...
void serrorbd0_x86(const int n, const float *x, float *__restrict__ y)
This function performs auxiliary simple precision operations when error is computed using betadivergen...
void sdiv_x86(const int n, const float *x, float *__restrict__ y)
This function calls the appropriate functions to perform simple precision element-wise x[i]=x[i]/y[i] ...
void serrorbd1_x86(const int n, const float *x, float *__restrict__ y)
This function performs auxiliary simple precision operations when error is computed using betadivergen...
void skernelH_x86(const int m, const int n, const float *L, const float *A, float *__restrict__ R, const float expo)
This function computes simple precision R(i)=(L(i)^expo)*A[i] and R(i+m*n)=L[i]*(L(i)^expo) Note "exp...
float serrorbd_x86(const int m, const int n, const int k, const float *A, const float *W, const float *H, const float beta)
This function returns simple precision error when error is computed using betadivergence error formul...
void dupdate2H_x86(const int m, const int n, const double *y, const double *B, double *__restrict__ H)
This function computes double precision H(i)=H(i)*(B(i)/y(j))
void smemset_x86(const int n, float *__restrict__ x, const float val)
This function fills all positions of x with val.
void supdate2H_x86(const int m, const int n, const float *y, const float *B, float *__restrict__ H)
This function computes simple precision H(i)=H(i)*(B(i)/y(j))
Header file for using the betadivergence cuda functions with CPU.
void dkernelW_x86(const int m, const int n, const double *L, const double *A, double *__restrict__ R, const double expo)
This function computes double precision R(pos)=L(i)^expo)*A(i) and R(pos+m)=L(i)*(L(i)^expo) Note exp...
void dupdate1H_x86(const int n, const double *X, double *__restrict__ H)
This function computes double precision H(i)=H(i)*B(i)/C(i) where matrices B and C are stored in the ...
void dmemset_x86(const int n, double *__restrict__ x, const double val)
This function fills all positions of x with val.