NnmfPack  2.1
bdiv_cpu.c
Go to the documentation of this file.
1 /***************************************************************************
2  * Copyright (C) 2014 by PIR (University of Oviedo) and *
3  * INCO2 (Polytechnic University of Valencia) groups. *
4  * nnmfpack@gmail.com *
5  * *
6  * This program is free software; you can redistribute it and/or modify *
7  * it under the terms of the GNU General Public License as published by *
8  * the Free Software Foundation; either version 2 of the License, or *
9  * (at your option) any later version. *
10  * *
11  * This program is distributed in the hope that it will be useful, *
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14  * GNU General Public License for more details. *
15  * *
16  * You should have received a copy of the GNU General Public License *
17  * along with this program; if not, write to the *
18  * Free Software Foundation, Inc., *
19  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
20  ***************************************************************************
21 */
33 #include "bdiv_cpu.h"
34 
35 
87 int dbdivg_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const double expo, const int uType, int nIter)
88 {
89  double
90  *L=NULL,
91  *M=NULL,
92  *R=NULL;
93 
94  int i;
95 
96  switch (uType)
97  {
98  case UpdateAll:
99  #ifdef With_MKL
100  M = (double *)mkl_malloc(2*max(m,n)* k * sizeof(double), WRDLEN);
101  L = (double *)mkl_malloc( m * n * sizeof(double), WRDLEN);
102  R = (double *)mkl_malloc( 2*m * n * sizeof(double), WRDLEN);
103  #else
104  #ifdef With_ARM
105  M = (double *)malloc(2*max(m,n)* k * sizeof(double));
106  L = (double *)malloc( m * n * sizeof(double));
107  R = (double *)malloc( 2*m * n * sizeof(double));
108  #else
109  M = (double *)_mm_malloc(2*max(m,n)* k * sizeof(double), WRDLEN);
110  L = (double *)_mm_malloc( m * n * sizeof(double), WRDLEN);
111  R = (double *)_mm_malloc( 2*m * n * sizeof(double), WRDLEN);
112  #endif
113  #endif
114 
115  if (L == NULL || M == NULL || R == NULL)
116  return -1;
117 
118  for(i=0;i<nIter;i++)
119  {
120  /* ************************ Phase 1 *************************** */
121  /* L=W*H */
122  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
123 
124  /* L1=L(.^)(expo-2) */
125  /* L2=L1(.*)A */
126  /* L1=L1(.*)L */
127  /* R is L1 and L2 */
128  dkernelH_x86(m, n, L, A, R, expo-2.0);
129 
130  /* B=W'*L2 */
131  /* C=W'*L1 */
132  /* above is equal to R=W'*|L2 | L1| */
133  cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, 2*n, m, 1.0, W, m, R, m, 0.0, M, k);
134 
135  /* H=H(.*)B(./)C. Note that matrices B and C are stored together in matrix M*/
136  dupdate1H_x86(k*n, M, H);
137 
138 
139  /* ************************ Phase 2 *************************** */
140  /* L=W*H */
141  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
142 
143  /* L1=L(.^)(expo-2) */
144  /* L2=L1(.*)A */
145  /* L1=L1(.*)L */
146  /* R is L1 and L2 */
147  dkernelW_x86(m, n , L, A, R, expo-2.0);
148 
149  /* D=L2*H' */
150  /* E=L1*H' */
151  /* |L2| */
152  /* above is equal to R=| | * H' */
153  /* |L1| */
154  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, 2*m, k, n, 1.0, R, 2*m, H, k, 0.0, M, 2*m);
155 
156  /* W=W(.*)D(./)E */
157  dupdate1W_x86(m, k, M, W);
158  }
159  #ifdef With_MKL
160  mkl_free(L);
161  mkl_free(R);
162  mkl_free(M);
163  #else
164  #ifdef With_ARM
165  free(L);
166  free(M);
167  free(R);
168  #else
169  _mm_free(L);
170  _mm_free(M);
171  _mm_free(R);
172  #endif
173  #endif
174  break;
175 
176  case UpdateW:
177  #ifdef With_MKL
178  M = (double *)mkl_malloc(2*m * k * sizeof(double), WRDLEN);
179  L = (double *)mkl_malloc( m * n * sizeof(double), WRDLEN);
180  R = (double *)mkl_malloc(2*m * n * sizeof(double), WRDLEN);
181  #else
182  #ifdef With_ARM
183  M = (double *)malloc(2*m * k * sizeof(double));
184  L = (double *)malloc( m * n * sizeof(double));
185  R = (double *)malloc(2*m * n * sizeof(double));
186  #else
187  M = (double *)_mm_malloc(2*m * k * sizeof(double), WRDLEN);
188  L = (double *)_mm_malloc( m * n * sizeof(double), WRDLEN);
189  R = (double *)_mm_malloc(2*m * n * sizeof(double), WRDLEN);
190  #endif
191  #endif
192 
193  if (L == NULL || M == NULL || R == NULL )
194  return -1;
195 
196  for(i=0;i<nIter;i++)
197  {
198  /* ************************ Phase 2 *************************** */
199  /* L=W*H */
200  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
201 
202  /* L1=L(.^)(expo-2) */
203  /* L2=L1(.*)A */
204  /* L1=L1(.*)L */
205  /* R is L1 and L2 */
206  dkernelW_x86(m, n , L, A, R, expo-2.0);
207 
208  /* D=L2*H' */
209  /* E=L1*H' */
210  /* |L2| */
211  /* above is equal to R=| | * H' */
212  /* |L1| */
213  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, 2*m, k, n, 1.0, R, 2*m, H, k, 0.0, M, 2*m);
214 
215  /* W=W(.*)D(./)E */
216  dupdate1W_x86(m, k, M, W);
217  }
218  #ifdef With_MKL
219  mkl_free(L);
220  mkl_free(R);
221  mkl_free(M);
222  #else
223  #ifdef With_ARM
224  free(L);
225  free(M);
226  free(R);
227  #else
228  _mm_free(L);
229  _mm_free(M);
230  _mm_free(R);
231  #endif
232  #endif
233  break;
234 
235  case UpdateH:
236  #ifdef With_MKL
237  M = (double *)mkl_malloc(2*n * k * sizeof(double), WRDLEN);
238  L = (double *)mkl_malloc( m * n * sizeof(double), WRDLEN);
239  R = (double *)mkl_malloc(2*m * n * sizeof(double), WRDLEN);
240  #else
241  #ifdef With_ARM
242  M = (double *)malloc(2*n * k * sizeof(double));
243  L = (double *)malloc( m * n * sizeof(double));
244  R = (double *)malloc(2*m * n * sizeof(double));
245  #else
246  M = (double *)_mm_malloc(2*n * k * sizeof(double), WRDLEN);
247  L = (double *)_mm_malloc( m * n * sizeof(double), WRDLEN);
248  R = (double *)_mm_malloc(2*m * n * sizeof(double), WRDLEN);
249  #endif
250  #endif
251 
252  if (L == NULL || M == NULL || R == NULL )
253  return -1;
254 
255  for(i=0;i<nIter;i++)
256  {
257  /* ************************ Phase 1 *************************** */
258  /* L=W*H */
259  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
260 
261  /* L1=L(.^)(expo-2) */
262  /* L2=L1(.*)A */
263  /* L1=L1(.*)L */
264  /* R is L1 and L2 */
265  dkernelH_x86(m, n, L, A, R, expo-2.0);
266 
267  /* B=W'*L2 */
268  /* C=W'*L1 */
269  /* above is equal to R=W'*|L2 | L1| */
270  cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, 2*n, m, 1.0, W, m, R, m, 0.0, M, k);
271 
272  /* H=H(.*)B(./)C. Note that matrices B and C are stored together in matrix M*/
273  dupdate1H_x86(k*n, M, H);
274  }
275  #ifdef With_MKL
276  mkl_free(L);
277  mkl_free(R);
278  mkl_free(M);
279  #else
280  #ifdef With_ARM
281  free(L);
282  free(M);
283  free(R);
284  #else
285  _mm_free(L);
286  _mm_free(M);
287  _mm_free(R);
288  #endif
289  #endif
290  break;
291 
292  default:
293  return -1;
294  }
295  return 0;
296 }
297 
298 
313 int sbdivg_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const float expo, const int uType, int nIter)
314 {
315  float
316  *L=NULL,
317  *M=NULL,
318  *R=NULL;
319 
320  int i;
321 
322  switch (uType)
323  {
324  case UpdateAll:
325  #ifdef With_MKL
326  M = (float *)mkl_malloc(2*max(m,n) * k * sizeof(float), WRDLEN);
327  L = (float *)mkl_malloc( m * n * sizeof(float), WRDLEN);
328  R = (float *)mkl_malloc( 2*m * n * sizeof(float), WRDLEN);
329  #else
330  #ifdef With_ARM
331  M = (float *)malloc(2*max(m,n) * k * sizeof(float));
332  L = (float *)malloc( m * n * sizeof(float));
333  R = (float *)malloc( 2*m * n * sizeof(float));
334  #else
335  M = (float *)_mm_malloc(2*max(m,n) * k * sizeof(float), WRDLEN);
336  L = (float *)_mm_malloc( m * n * sizeof(float), WRDLEN);
337  R = (float *)_mm_malloc( 2*m * n * sizeof(float), WRDLEN);
338  #endif
339  #endif
340 
341  if (L == NULL || M == NULL || R == NULL )
342  return -1;
343 
344  for(i=0;i<nIter;i++)
345  {
346  /* ************************ Phase 1 *************************** */
347  /* L=W*H */
348  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
349 
350  /* L1=L(.^)(expo-2) */
351  /* L2=L1(.*)A */
352  /* L1=L1(.*)L */
353  /* R is L1 and L2 */
354  skernelH_x86(m, n, L, A, R, expo-2.0f);
355 
356  /* B=W'*L2 */
357  /* C=W'*L1 */
358  /* above is equal to R=W'*|L2 | L1| */
359  cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, 2*n, m, 1.0, W, m, R, m, 0.0, M, k);
360 
361  /* H=H(.*)B(./)C. Note that matrices B and C are stored together in matrix M*/
362  supdate1H_x86(k*n, M, H);
363 
364 
365  /* ************************ Phase 2 *************************** */
366  /* L=W*H */
367  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
368 
369  /* L1=L(.^)(expo-2) */
370  /* L2=L1(.*)A */
371  /* L1=L1(.*)L */
372  /* R is L1 and L2 */
373  skernelW_x86(m, n, L, A, R, expo-2.0f);
374 
375  /* D=L2*H' */
376  /* E=L1*H' */
377  /* |L2| */
378  /* above is equal to R=| | * H' */
379  /* |L1| */
380  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, 2*m, k, n, 1.0, R, 2*m, H, k, 0.0, M, 2*m);
381 
382  /* W=W(.*)D(./)E */
383  supdate1W_x86(m, k, M, W);
384  }
385  #ifdef With_MKL
386  mkl_free(L);
387  mkl_free(R);
388  mkl_free(M);
389  #else
390  #ifdef With_ARM
391  free(L);
392  free(M);
393  free(R);
394  #else
395  _mm_free(L);
396  _mm_free(M);
397  _mm_free(R);
398  #endif
399  #endif
400  break;
401 
402  case UpdateW:
403  #ifdef With_MKL
404  M = (float *)mkl_malloc(2*m * k * sizeof(float), WRDLEN);
405  L = (float *)mkl_malloc( m * n * sizeof(float), WRDLEN);
406  R = (float *)mkl_malloc(2*m * n * sizeof(float), WRDLEN);
407  #else
408  #ifdef With_ARM
409  M = (float *)malloc(2*m * k * sizeof(float));
410  L = (float *)malloc( m * n * sizeof(float));
411  R = (float *)malloc(2*m * n * sizeof(float));
412  #else
413  M = (float *)_mm_malloc(2*m * k * sizeof(float), WRDLEN);
414  L = (float *)_mm_malloc( m * n * sizeof(float), WRDLEN);
415  R = (float *)_mm_malloc(2*m * n * sizeof(float), WRDLEN);
416  #endif
417  #endif
418 
419  if (L == NULL || M == NULL || R == NULL )
420  return -1;
421 
422  for(i=0;i<nIter;i++)
423  {
424  /* ************************ Phase 2 *************************** */
425  /* L=W*H */
426  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
427 
428  /* L1=L(.^)(expo-2) */
429  /* L2=L1(.*)A */
430  /* L1=L1(.*)L */
431  /* R is L1 and L2 */
432  skernelW_x86(m, n, L, A, R, expo-2.0f);
433 
434  /* D=L2*H' */
435  /* E=L1*H' */
436  /* |L2| */
437  /* above is equal to R=| | * H' */
438  /* |L1| */
439  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, 2*m, k, n, 1.0, R, 2*m, H, k, 0.0, M, 2*m);
440 
441  /* W=W(.*)D(./)E */
442  supdate1W_x86(m, k, M, W);
443  }
444  #ifdef With_MKL
445  mkl_free(L);
446  mkl_free(R);
447  mkl_free(M);
448  #else
449  #ifdef With_ARM
450  free(L);
451  free(M);
452  free(R);
453  #else
454  _mm_free(L);
455  _mm_free(M);
456  _mm_free(R);
457  #endif
458  #endif
459  break;
460 
461  case UpdateH:
462  #ifdef With_MKL
463  M = (float *)mkl_malloc(2*n * k * sizeof(float), WRDLEN);
464  L = (float *)mkl_malloc( m * n * sizeof(float), WRDLEN);
465  R = (float *)mkl_malloc(2*m * n * sizeof(float), WRDLEN);
466  #else
467  #ifdef With_ARM
468  M = (float *)malloc(2*n * k * sizeof(float));
469  L = (float *)malloc( m * n * sizeof(float));
470  R = (float *)malloc(2*m * n * sizeof(float));
471  #else
472  M = (float *)_mm_malloc(2*n * k * sizeof(float), WRDLEN);
473  L = (float *)_mm_malloc( m * n * sizeof(float), WRDLEN);
474  R = (float *)_mm_malloc(2*m * n * sizeof(float), WRDLEN);
475  #endif
476  #endif
477 
478  if (L == NULL || M == NULL || R == NULL )
479  return -1;
480 
481  for(i=0;i<nIter;i++)
482  {
483  /* ************************ Phase 1 *************************** */
484  /* L=W*H */
485  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
486 
487  /* L1=L(.^)(expo-2) */
488  /* L2=L1(.*)A */
489  /* L1=L1(.*)L */
490  /* R is L1 and L2 */
491  skernelH_x86(m, n, L, A, R, expo-2.0f);
492 
493  /* B=W'*L2 */
494  /* C=W'*L1 */
495  /* above is equal to R=W'*|L2 | L1| */
496  cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, 2*n, m, 1.0, W, m, R, m, 0.0, M, k);
497 
498  /* H=H(.*)B(./)C. Note that matrices B and C are stored together in matrix M*/
499  supdate1H_x86(k*n, M, H);
500  }
501  #ifdef With_MKL
502  mkl_free(L);
503  mkl_free(R);
504  mkl_free(M);
505  #else
506  #ifdef With_ARM
507  free(L);
508  free(M);
509  free(R);
510  #else
511  _mm_free(L);
512  _mm_free(M);
513  _mm_free(R);
514  #endif
515  #endif
516  break;
517 
518  default:
519  return -1;
520  }
521  return 0;
522 }
523 
524 
525 
526 
566 int dbdivone_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
567 {
568  double
569  *B=NULL,
570  *L=NULL,
571  *x=NULL,
572  *y=NULL;
573 
574  int i;
575 
576  switch (uType)
577  {
578  case UpdateAll:
579  /* Vectors "x" and "y" are used both in Phase 1 and Phase 2. With */
580  /* the strategy used for matrices B and D the sise of x is max(m,n) */
581  #ifdef With_MKL
582  B = (double *)mkl_malloc(max(m,n) * k * sizeof(double), WRDLEN);
583  L = (double *)mkl_malloc( m * n * sizeof(double), WRDLEN);
584  x = (double *)mkl_malloc( max(m,n) * sizeof(double), WRDLEN);
585  y = (double *)mkl_malloc( k * sizeof(double), WRDLEN);
586  #else
587  #ifdef With_ARM
588  B = (double *)malloc(max(m,n) * k * sizeof(double));
589  L = (double *)malloc( m * n * sizeof(double));
590  x = (double *)malloc( max(m,n) * sizeof(double));
591  y = (double *)malloc( k * sizeof(double));
592  #else
593  B = (double *)_mm_malloc(max(m,n) * k * sizeof(double), WRDLEN);
594  L = (double *)_mm_malloc( m * n * sizeof(double), WRDLEN);
595  x = (double *)_mm_malloc( max(m,n) * sizeof(double), WRDLEN);
596  y = (double *)_mm_malloc( k * sizeof(double), WRDLEN);
597  #endif
598  #endif
599 
600  if (B == NULL || L == NULL || x == NULL || y ==NULL)
601  return -1;
602 
603  /* x[i]=1.0 for all i */
604  dmemset_x86(max(m,n), x, 1.0);
605 
606  for(i=0;i<nIter;i++)
607  {
608  /* ************************ Phase 1 *************************** */
609  /* Calculate the sums of all W columns via dgemv(W, x) */
610  cblas_dgemv(CblasColMajor, CblasTrans, m, k, 1.0, W, m, x, 1, 0.0, y, 1);
611 
612  /* L=W*H */
613  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
614 
615  /* L=A(./)L*/
616  ddiv_x86(m*n, A, L);
617 
618  /* B=W'*L */
619  cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, n, m, 1.0, W, m, L, m, 0.0, B, k);
620 
621  /* B(i,j)=B(i,j)/y(i) for all B elements */
622  /* H=H(.*)B */
623  dupdate2H_x86(k, n, y, B, H);
624 
625 
626  /* ************************ Phase 2 *************************** */
627  /* Calculate the sums of all H rows via dgemv(H, x) */
628  cblas_dgemv(CblasColMajor, CblasNoTrans, k, n, 1.0, H, k, x, 1, 0.0, y, 1);
629 
630  /* L=W*H */
631  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
632 
633  /* L=A(./)L*/
634  ddiv_x86(m*n, A, L);
635 
636  /* B=L*H' */
637  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, k, n, 1.0, L, m, H, k, 0.0, B, m);
638 
639  /* B(i,j)=B(i,j)/y(j) for all B elements */
640  /* W=W(.*)B */
641  dupdate2W_x86(m, k, y, B, W);
642  }
643  #ifdef With_MKL
644  mkl_free(B);
645  mkl_free(L);
646  mkl_free(x);
647  mkl_free(y);
648  #else
649  #ifdef With_ARM
650  free(B);
651  free(L);
652  free(x);
653  free(y);
654  #else
655  _mm_free(B);
656  _mm_free(L);
657  _mm_free(x);
658  _mm_free(y);
659  #endif
660  #endif
661  break;
662 
663  case UpdateW:
664  #ifdef With_MKL
665  B = (double *)mkl_malloc(m * k * sizeof(double), WRDLEN);
666  L = (double *)mkl_malloc(m * n * sizeof(double), WRDLEN);
667  x = (double *)mkl_malloc( n * sizeof(double), WRDLEN);
668  y = (double *)mkl_malloc( k * sizeof(double), WRDLEN);
669  #else
670  #ifdef With_ARM
671  B = (double *)malloc(m * k * sizeof(double));
672  L = (double *)malloc(m * n * sizeof(double));
673  x = (double *)malloc( n * sizeof(double));
674  y = (double *)malloc( k * sizeof(double));
675  #else
676  B = (double *)_mm_malloc(m * k * sizeof(double), WRDLEN);
677  L = (double *)_mm_malloc(m * n * sizeof(double), WRDLEN);
678  x = (double *)_mm_malloc( n * sizeof(double), WRDLEN);
679  y = (double *)_mm_malloc( k * sizeof(double), WRDLEN);
680  #endif
681  #endif
682 
683  if (B == NULL || L == NULL || x == NULL || y ==NULL)
684  return -1;
685 
686  /* x[i]=1.0 for all i */
687  dmemset_x86(n, x, 1.0);
688 
689  for(i=0;i<nIter;i++)
690  {
691  /* ************************ Phase 2 *************************** */
692  /* Calculate the sums of all H rows via dgemv(H, x) */
693  cblas_dgemv(CblasColMajor, CblasNoTrans, k, n, 1.0, H, k, x, 1, 0.0, y, 1);
694 
695  /* L=W*H */
696  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
697 
698  /* L=A(./)L*/
699  ddiv_x86(m*n, A, L);
700 
701  /* B=L*H' */
702  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, k, n, 1.0, L, m, H, k, 0.0, B, m);
703 
704  /* B(i,j)=B(i,j)/y(j) for all B elements */
705  /* W=W(.*)B */
706  dupdate2W_x86(m, k, y, B, W);
707  }
708  #ifdef With_MKL
709  mkl_free(B);
710  mkl_free(L);
711  mkl_free(x);
712  mkl_free(y);
713  #else
714  #ifdef With_ARM
715  free(B);
716  free(L);
717  free(x);
718  free(y);
719  #else
720  _mm_free(B);
721  _mm_free(L);
722  _mm_free(x);
723  _mm_free(y);
724  #endif
725  #endif
726  break;
727 
728  case UpdateH:
729  #ifdef With_MKL
730  B = (double *)mkl_malloc(n * k * sizeof(double), WRDLEN);
731  L = (double *)mkl_malloc(m * n * sizeof(double), WRDLEN);
732  x = (double *)mkl_malloc( m * sizeof(double), WRDLEN);
733  y = (double *)mkl_malloc( k * sizeof(double), WRDLEN);
734  #else
735  #ifdef With_ARM
736  B = (double *)malloc(n * k * sizeof(double));
737  L = (double *)malloc(m * n * sizeof(double));
738  x = (double *)malloc( m * sizeof(double));
739  y = (double *)malloc( k * sizeof(double));
740  #else
741  B = (double *)_mm_malloc(n * k * sizeof(double), WRDLEN);
742  L = (double *)_mm_malloc(m * n * sizeof(double), WRDLEN);
743  x = (double *)_mm_malloc( m * sizeof(double), WRDLEN);
744  y = (double *)_mm_malloc( k * sizeof(double), WRDLEN);
745  #endif
746  #endif
747 
748  if (B == NULL || L == NULL || x == NULL || y ==NULL)
749  return -1;
750 
751  /* x[i]=1.0 for all i */
752  dmemset_x86(m, x, 1.0);
753 
754  for(i=0;i<nIter;i++)
755  {
756  /* ************************ Phase 1 *************************** */
757  /* Calculate the sums of all W columns via dgemv(W, x) */
758  cblas_dgemv(CblasColMajor, CblasTrans, m, k, 1.0, W, m, x, 1, 0.0, y, 1);
759 
760  /* L=W*H */
761  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
762 
763  /* L=A(./)L*/
764  ddiv_x86(m*n, A, L);
765 
766  /* B=W'*L */
767  cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, n, m, 1.0, W, m, L, m, 0.0, B, k);
768 
769  /* B(i,j)=B(i,j)/y(i) for all B elements */
770  /* H=H(.*)B */
771  dupdate2H_x86(k, n, y, B, H);
772  }
773  #ifdef With_MKL
774  mkl_free(B);
775  mkl_free(L);
776  mkl_free(x);
777  mkl_free(y);
778  #else
779  #ifdef With_ARM
780  free(B);
781  free(L);
782  free(x);
783  free(y);
784  #else
785  _mm_free(B);
786  _mm_free(L);
787  _mm_free(x);
788  _mm_free(y);
789  #endif
790  #endif
791  break;
792 
793  default:
794  return -1;
795  }
796  return 0;
797 }
798 
799 
813 int sbdivone_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
814 {
815  float
816  *B=NULL,
817  *L=NULL,
818  *x=NULL,
819  *y=NULL;
820 
821  int i;
822 
823  switch (uType)
824  {
825  case UpdateAll:
826  #ifdef With_MKL
827  B = (float *)mkl_malloc(max(m,n) * k * sizeof(float), WRDLEN);
828  L = (float *)mkl_malloc( m * n * sizeof(float), WRDLEN);
829  x = (float *)mkl_malloc( max(m,n) * sizeof(float), WRDLEN);
830  y = (float *)mkl_malloc( k * sizeof(float), WRDLEN);
831  #else
832  #ifdef With_ARM
833  B = (float *)malloc(max(m,n) * k * sizeof(float));
834  L = (float *)malloc( m * n * sizeof(float));
835  x = (float *)malloc( max(m,n) * sizeof(float));
836  y = (float *)malloc( k * sizeof(float));
837  #else
838  B = (float *)_mm_malloc(max(m,n) * k * sizeof(float), WRDLEN);
839  L = (float *)_mm_malloc( m * n * sizeof(float), WRDLEN);
840  x = (float *)_mm_malloc( max(m,n) * sizeof(float), WRDLEN);
841  y = (float *)_mm_malloc( k * sizeof(float), WRDLEN);
842  #endif
843  #endif
844 
845  if (B == NULL || L == NULL || x == NULL || y ==NULL)
846  return -1;
847 
848  /* x[i]=1.0 for all i */
849  smemset_x86(max(m,n), x, 1.0);
850 
851  for(i=0;i<nIter;i++)
852  {
853  /* ************************ Phase 1 *************************** */
854  /* Calculate the sums of all W columns via sgemv(W, x) */
855  cblas_sgemv(CblasColMajor, CblasTrans, m, k, 1.0, W, m, x, 1, 0.0, y, 1);
856 
857  /* L=W*H */
858  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
859 
860  /* L=A(./)L*/
861  sdiv_x86(m*n, A, L);
862 
863  /* B=W'*L */
864  cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, n, m, 1.0, W, m, L, m, 0.0, B, k);
865 
866  /* B(i,j)=B(i,j)/y(i) for all B elements */
867  /* H=H(.*)B */
868  supdate2H_x86(k, n, y, B, H);
869 
870 
871  /* ************************ Phase 2 *************************** */
872  /* Calculate the sums of all H rows via sgemv(H, x) */
873  cblas_sgemv(CblasColMajor, CblasNoTrans, k, n, 1.0, H, k, x, 1, 0.0, y, 1);
874 
875  /* L=W*H */
876  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
877 
878  /* L=A(./)L*/
879  sdiv_x86(m*n, A, L);
880 
881  /* B=L*H' */
882  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, k, n, 1.0, L, m, H, k, 0.0, B, m);
883 
884  /* B(i,j)=B(i,j)/y(j) for all B elements */
885  /* W=W(.*)B */
886  supdate2W_x86(m, k, y, B, W);
887  }
888  #ifdef With_MKL
889  mkl_free(B);
890  mkl_free(L);
891  mkl_free(x);
892  mkl_free(y);
893  #else
894  #ifdef With_ARM
895  free(B);
896  free(L);
897  free(x);
898  free(y);
899  #else
900  _mm_free(B);
901  _mm_free(L);
902  _mm_free(x);
903  _mm_free(y);
904  #endif
905  #endif
906  break;
907 
908  case UpdateW:
909  #ifdef With_MKL
910  B = (float *)mkl_malloc(m * k * sizeof(float), WRDLEN);
911  L = (float *)mkl_malloc(m * n * sizeof(float), WRDLEN);
912  x = (float *)mkl_malloc( n * sizeof(float), WRDLEN);
913  y = (float *)mkl_malloc( k * sizeof(float), WRDLEN);
914  #else
915  #ifdef With_ARM
916  B = (float *)malloc(m * k * sizeof(float));
917  L = (float *)malloc(m * n * sizeof(float));
918  x = (float *)malloc( n * sizeof(float));
919  y = (float *)malloc( k * sizeof(float));
920  #else
921  B = (float *)_mm_malloc(m * k * sizeof(float), WRDLEN);
922  L = (float *)_mm_malloc(m * n * sizeof(float), WRDLEN);
923  x = (float *)_mm_malloc( n * sizeof(float), WRDLEN);
924  y = (float *)_mm_malloc( k * sizeof(float), WRDLEN);
925  #endif
926  #endif
927 
928  if (B == NULL || L == NULL || x == NULL || y ==NULL)
929  return -1;
930 
931  /* x[i]=1.0 for all i */
932  smemset_x86(n, x, 1.0);
933 
934  for(i=0;i<nIter;i++)
935  {
936  /* ************************ Phase 2 *************************** */
937  /* Calculate the sums of all H rows via sgemv(H, x) */
938  cblas_sgemv(CblasColMajor, CblasNoTrans, k, n, 1.0, H, k, x, 1, 0.0, y, 1);
939 
940  /* L=W*H */
941  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
942 
943  /* L=A(./)L*/
944  sdiv_x86(m*n, A, L);
945 
946  /* B=L*H' */
947  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, k, n, 1.0, L, m, H, k, 0.0, B, m);
948 
949  /* B(i,j)=B(i,j)/y(j) for all B elements */
950  /* W=W(.*)B */
951  supdate2W_x86(m, k, y, B, W);
952  }
953  #ifdef With_MKL
954  mkl_free(B);
955  mkl_free(L);
956  mkl_free(x);
957  mkl_free(y);
958  #else
959  #ifdef With_ARM
960  free(B);
961  free(L);
962  free(x);
963  free(y);
964  #else
965  _mm_free(B);
966  _mm_free(L);
967  _mm_free(x);
968  _mm_free(y);
969  #endif
970  #endif
971  break;
972 
973  case UpdateH:
974  #ifdef With_MKL
975  B = (float *)mkl_malloc(n * k * sizeof(float), WRDLEN);
976  L = (float *)mkl_malloc(m * n * sizeof(float), WRDLEN);
977  x = (float *)mkl_malloc( m * sizeof(float), WRDLEN);
978  y = (float *)mkl_malloc( k * sizeof(float), WRDLEN);
979  #else
980  #ifdef With_ARM
981  B = (float *)malloc(n * k * sizeof(float));
982  L = (float *)malloc(m * n * sizeof(float));
983  x = (float *)malloc( m * sizeof(float));
984  y = (float *)malloc( k * sizeof(float));
985  #else
986  B = (float *)_mm_malloc(n * k * sizeof(float), WRDLEN);
987  L = (float *)_mm_malloc(m * n * sizeof(float), WRDLEN);
988  x = (float *)_mm_malloc( m * sizeof(float), WRDLEN);
989  y = (float *)_mm_malloc( k * sizeof(float), WRDLEN);
990  #endif
991  #endif
992 
993  if (B == NULL || L == NULL || x == NULL || y ==NULL)
994  return -1;
995 
996  /* x[i]=1.0 for all i */
997  smemset_x86(m, x, 1.0);
998 
999  for(i=0;i<nIter;i++)
1000  {
1001  /* ************************ Phase 1 *************************** */
1002  /* Calculate the sums of all W columns via sgemv(W, x) */
1003  cblas_sgemv(CblasColMajor, CblasTrans, m, k, 1.0, W, m, x, 1, 0.0, y, 1);
1004 
1005  /* L=W*H */
1006  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, L, m);
1007 
1008  /* L=A(./)L*/
1009  sdiv_x86(m*n, A, L);
1010 
1011  /* B=W'*L */
1012  cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, n, m, 1.0, W, m, L, m, 0.0, B, k);
1013 
1014  /* B(i,j)=B(i,j)/y(i) for all B elements */
1015  /* H=H(.*)B */
1016  supdate2H_x86(k, n, y, B, H);
1017  }
1018  #ifdef With_MKL
1019  mkl_free(B);
1020  mkl_free(L);
1021  mkl_free(x);
1022  mkl_free(y);
1023  #else
1024  #ifdef With_ARM
1025  free(B);
1026  free(L);
1027  free(x);
1028  free(y);
1029  #else
1030  _mm_free(B);
1031  _mm_free(L);
1032  _mm_free(x);
1033  _mm_free(y);
1034  #endif
1035  #endif
1036  break;
1037 
1038  default:
1039  return -1;
1040  }
1041  return 0;
1042 }
1043 
1044 
/**
 * dbdiv_cpu: front-end dispatcher for the double-precision
 * beta-divergence NNMF drivers.
 *
 * Rejects negative beta or non-positive iteration counts, then routes
 * to the specialised solver: beta==2 (least squares), beta==1 (KL) or
 * the generic beta-divergence implementation.
 *
 * Returns the callee's status, or -1 on invalid arguments.
 */
int dbdiv_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const double beta, const int uType, const int nIter)
{
   if (beta < 0.0 || nIter <= 0)
      return -1;

   if (beta == 2.0)
      return dmlsa_cpu(m, n, k, A, W, H, uType, nIter);

   if (beta == 1.0)
      return dbdivone_cpu(m, n, k, A, W, H, uType, nIter);

   return dbdivg_cpu(m, n, k, A, W, H, beta, uType, nIter);
}
1074 
1075 
/**
 * sbdiv_cpu: front-end dispatcher for the single-precision
 * beta-divergence NNMF drivers.
 *
 * Rejects negative beta or non-positive iteration counts, then routes
 * to the specialised solver: beta==2 (least squares), beta==1 (KL) or
 * the generic beta-divergence implementation.
 *
 * Returns the callee's status, or -1 on invalid arguments.
 */
int sbdiv_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const float beta, const int uType, const int nIter)
{
   if (beta < 0.0 || nIter <= 0)
      return -1;

   if (beta == 2.0)
      return smlsa_cpu(m, n, k, A, W, H, uType, nIter);

   if (beta == 1.0)
      return sbdivone_cpu(m, n, k, A, W, H, uType, nIter);

   return sbdivg_cpu(m, n, k, A, W, H, beta, uType, nIter);
}
1104 
1116 void dkernelH_x86(const int m, const int n, const double *L, const double *A, double *__restrict__ R, const double expo)
1117 {
1118  int i, size;
1119 
1120  size=m*n;
1121 
1122  #ifdef With_ICC
1123  #pragma loop_count min=16
1124  #pragma simd
1125  #else
1126  #ifdef With_OMP
1127  #pragma omp parallel for
1128  #endif
1129  #endif
1130  for(i=0; i<size; i++)
1131  {
1132  double dtmp1, dtmp2;
1133 
1134  if (L[i]>=0.0 && L[i]<=0.0)
1135  R[i] = R[i+size] = 0.0;
1136  else
1137  {
1138  dtmp1=L[i];
1139  dtmp2=pow(dtmp1, expo);
1140 
1141  /* ask for A[i]=0.0 don't give improvements. We don't do it */
1142  R[i] =dtmp2 * A[i];
1143  R[i+size]=dtmp1 * dtmp2;
1144  }
1145  }
1146 }
1147 
1148 
1160 void skernelH_x86(const int m, const int n, const float *L, const float *A, float *__restrict__ R, const float expo)
1161 {
1162  int i, size;
1163 
1164  size=m*n;
1165 
1166  #ifdef With_ICC
1167  #pragma loop_count min=16
1168  #pragma simd
1169  #else
1170  #ifdef With_OMP
1171  #pragma omp parallel for
1172  #endif
1173  #endif
1174  for(i=0; i<size; i++)
1175  {
1176  float ftmp1, ftmp2;
1177 
1178  if (L[i]>=0.0 && L[i]<=0.0)
1179  R[i] = R[i+size] = 0.0;
1180  else
1181  {
1182  ftmp1=L[i];
1183  ftmp2=powf(ftmp1, expo);
1184 
1185  /* ask for A[i]=0.0 don't give improvements. We don't do it */
1186  R[i] =ftmp2 * A[i];
1187  R[i+size]=ftmp1 * ftmp2;
1188  }
1189  }
1190 }
1191 
1192 
1204 void dkernelW_x86(const int m, const int n, const double *L, const double *A, double *__restrict__ R, const double expo)
1205 {
1206  int i;
1207 
1208  #ifdef With_ICC
1209  #pragma parallel
1210  #pragma loop_count min=16
1211  #pragma simd
1212  #else
1213  #ifdef With_OMP
1214  #pragma omp parallel for
1215  #endif
1216  #endif
1217  for(i=0; i<m*n; i++)
1218  {
1219  int pos;
1220  double dtmp1, dtmp2;
1221 
1222  pos=2*m*(i/m)+(i%m);
1223 
1224  if (L[i]>=0.0 && L[i]<=0.0)
1225  R[pos] = R[pos+m] = 0.0;
1226  else
1227  {
1228  dtmp1=L[i];
1229  dtmp2=pow(dtmp1, expo);
1230 
1231  /* ask for A[i]=0.0 don't give improvements. We don't do it */
1232  R[pos] =dtmp2 * A[i];
1233  R[pos+m]=dtmp1 * dtmp2;
1234  }
1235  }
1236 }
1237 
1238 
1250 void skernelW_x86(const int m, const int n, const float *L, const float *A, float *__restrict__ R, const float expo)
1251 {
1252  int i;
1253 
1254  #ifdef With_ICC
1255  #pragma parallel
1256  #pragma loop_count min=16
1257  #pragma simd
1258  #else
1259  #ifdef With_OMP
1260  #pragma omp parallel for
1261  #endif
1262  #endif
1263  for(i=0; i<m*n; i++)
1264  {
1265  int pos;
1266  float ftmp1, ftmp2;
1267 
1268  pos=2*m*(i/m)+(i%m);
1269 
1270  if (L[i]>=0.0 && L[i]<=0.0)
1271  R[pos] = R[pos+m] = 0.0;
1272  else
1273  {
1274  ftmp1=L[i];
1275  ftmp2=powf(ftmp1, expo);
1276 
1277  /* ask for A[i]=0.0 don't give improvements. We don't do it */
1278  R[pos] =ftmp2 * A[i];
1279  R[pos+m]=ftmp1 * ftmp2;
1280  }
1281  }
1282 }
1283 
1284 
/**
 * dupdate1H_x86: apply the multiplicative H update (double).
 *
 * X packs numerator and denominator back to back (length 2n):
 * H[i] *= X[i] / X[i+n] for i in [0, n).
 * NaN/Inf can appear if X[i+n], H[i] or X[i] is zero; with With_Check
 * defined, an assert catches non-finite results.
 */
void dupdate1H_x86(const int n, const double *X, double *__restrict__ H)
{
   int j;

   #ifdef With_ICC
   #pragma loop_count min=32
   #pragma simd
   #else
   #ifdef With_OMP
   #pragma omp parallel for
   #endif
   #endif
   for (j = 0; j < n; j++)
   {
      H[j] = H[j] * (X[j] / X[j+n]);
      #ifdef With_Check
      assert(isfinite(H[j]));
      #endif
   }
}
1316 
/**
 * supdate1H_x86: apply the multiplicative H update (float).
 *
 * X packs numerator and denominator back to back (length 2n):
 * H[i] *= X[i] / X[i+n] for i in [0, n).
 * NaN/Inf can appear if X[i+n], H[i] or X[i] is zero; with With_Check
 * defined, an assert catches non-finite results.
 */
void supdate1H_x86(const int n, const float *X, float *__restrict__ H)
{
   int j;

   #ifdef With_ICC
   #pragma loop_count min=32
   #pragma simd
   #else
   #ifdef With_OMP
   #pragma omp parallel for
   #endif
   #endif
   for (j = 0; j < n; j++)
   {
      H[j] = H[j] * (X[j] / X[j+n]);
      #ifdef With_Check
      assert(isfinite(H[j]));
      #endif
   }
}
1348 
1349 
/**
 * dupdate1W_x86: apply the multiplicative W update (double).
 *
 * W is m x n; X is 2m x n with numerator and denominator stacked per
 * column: W[i] *= X[pos] / X[pos+m] where pos maps entry i of W onto
 * the top half of its column in X.
 * With With_Check defined, an assert catches non-finite results.
 */
void dupdate1W_x86(const int m, const int n, const double *X, double *__restrict__ W)
{
   int idx;

   #ifdef With_ICC
   #pragma loop_count min=16
   #pragma simd
   #else
   #ifdef With_OMP
   #pragma omp parallel for
   #endif
   #endif
   for (idx = 0; idx < m*n; idx++)
   {
      /* skip one extra m-block per column of the 2m x n matrix X */
      const int pos = idx + (idx/m)*m;

      W[idx] = W[idx] * (X[pos] / X[pos+m]);
      #ifdef With_Check
      assert(isfinite(W[idx]));
      #endif
   }
}
1386 
1387 
void supdate1W_x86(const int m, const int n, const float *X, float *__restrict__ W)
{
   /* Multiplicative update of the m-by-n matrix W, single precision:
      W(i) = W(i) * X(pos)/X(pos+m), where pos skips a column of X for
      every column of W (X interleaves numerator and denominator columns). */
   int j;

   #ifdef With_ICC
      #pragma loop_count min=16
      #pragma simd
   #else
      #ifdef With_OMP
         #pragma omp parallel for
      #endif
   #endif
   for(j=0; j<m*n; j++)
   {
      /* Column of W this element belongs to */
      int col = j / m;
      int pos = j + col*m;

      /* Can yield NaN/Inf when X(pos+m), W(j) and/or X(pos) is 0 */
      W[j] *= X[pos] / X[pos+m];
      #ifdef With_Check
         assert(isfinite(W[j]));
      #endif
   }
}
1424 
1425 
void dupdate2H_x86(const int m, const int n, const double *y, const double *B, double *__restrict__ H)
{
   /* Double precision update H(i) = H(i) * B(i)/y(i mod m): every element
      of a row of H is scaled by the same entry of the length-m vector y. */
   int j;

   #ifdef With_ICC
      #pragma loop_count min=32
      #pragma simd
   #else
      #ifdef With_OMP
         #pragma omp parallel for
      #endif
   #endif
   for(j=0; j<m*n; j++)
   {
      /* Can yield NaN/Inf when y(j%m), H(j) and/or B(j) is 0 */
      H[j] *= B[j] / y[j%m];
      #ifdef With_Check
         assert(isfinite(H[j]));
      #endif
   }
}
1458 
1459 
void supdate2H_x86(const int m, const int n, const float *y, const float *B, float *__restrict__ H)
{
   /* Single precision update H(i) = H(i) * B(i)/y(i mod m): every element
      of a row of H is scaled by the same entry of the length-m vector y. */
   int j;

   #ifdef With_ICC
      #pragma loop_count min=32
      #pragma simd
   #else
      #ifdef With_OMP
         #pragma omp parallel for
      #endif
   #endif
   for(j=0; j<m*n; j++)
   {
      /* Can yield NaN/Inf when y(j%m), H(j) and/or B(j) is 0 */
      H[j] *= B[j] / y[j%m];
      #ifdef With_Check
         assert(isfinite(H[j]));
      #endif
   }
}
1492 
1493 
void dupdate2W_x86(const int m, const int n, const double *y, const double *B, double *__restrict__ W)
{
   /* Double precision update W(i) = W(i) * B(i)/y(i div m): every element
      of a column of W is scaled by the same entry of the vector y. */
   int j;

   #ifdef With_ICC
      #pragma loop_count min=32
      #pragma simd
   #else
      #ifdef With_OMP
         #pragma omp parallel for
      #endif
   #endif
   for(j=0; j<m*n; j++)
   {
      /* Can yield NaN/Inf when y(j/m), W(j) and/or B(j) is 0 */
      W[j] *= B[j] / y[j/m];
      #ifdef With_Check
         assert(isfinite(W[j]));
      #endif
   }
}
1526 
1527 
void supdate2W_x86(const int m, const int n, const float *y, const float *B, float *__restrict__ W)
{
   /* Single precision update W(i) = W(i) * B(i)/y(i div m): every element
      of a column of W is scaled by the same entry of the vector y. */
   int j;

   #ifdef With_ICC
      #pragma loop_count min=32
      #pragma simd
   #else
      #ifdef With_OMP
         #pragma omp parallel for
      #endif
   #endif
   for(j=0; j<m*n; j++)
   {
      /* Can yield NaN/Inf when y(j/m), W(j) and/or B(j) is 0 */
      W[j] *= B[j] / y[j/m];
      #ifdef With_Check
         assert(isfinite(W[j]));
      #endif
   }
}
1560 
1569 void derrorbd0_x86(const int n, const double *x, double *__restrict__ y)
1570 {
1571  int i;
1572 
1573  #ifdef With_ICC
1574  #pragma loop_count min=16
1575  #pragma simd
1576  #else
1577  #ifdef With_OMP
1578  #pragma omp parallel for
1579  #endif
1580  #endif
1581  for (i=0; i<n; i++)
1582  {
1583  #ifdef With_Check
1584  /* Here we can have NaN and Inf if y(i) and/or x(i)=0 */
1585  y[i]=(x[i]/y[i]) - log(x[i]/y[i]) - 1.0;
1586  assert(isfinite(y[i]));
1587  #else
1588  y[i]=(x[i]/y[i]) - log(x[i]/y[i]) - 1.0;
1589  #endif
1590  }
1591 }
1592 
1593 
1602 void serrorbd0_x86(const int n, const float *x, float *__restrict__ y)
1603 {
1604  int i;
1605 
1606  #ifdef With_ICC
1607  #pragma loop_count min=16
1608  #pragma simd
1609  #else
1610  #ifdef With_OMP
1611  #pragma omp parallel for
1612  #endif
1613  #endif
1614  for (i=0; i<n; i++)
1615  {
1616  #ifdef With_Check
1617  /* Here we can have NaN and Inf if y(i) and/or x(i)=0 */
1618  y[i]=(x[i]/y[i]) - logf(x[i]/y[i]) - 1.0f;
1619  assert(isfinite(y[i]));
1620  #else
1621  y[i]=(x[i]/y[i]) - logf(x[i]/y[i]) - 1.0f;
1622  #endif
1623  }
1624 }
1625 
1626 
1635 void derrorbd1_x86(const int n, const double *x, double *__restrict__ y)
1636 {
1637  int i;
1638 
1639  #ifdef With_ICC
1640  #pragma loop_count min=16
1641  #pragma simd
1642  #else
1643  #ifdef With_OMP
1644  #pragma omp parallel for
1645  #endif
1646  #endif
1647  for (i=0; i<n; i++)
1648  {
1649  #ifdef With_Check
1650  /* Here we can have NaN and Inf if y(i) and/or x(i)=0 */
1651  y[i]=(x[i]*log(x[i]/y[i])) + y[i] - x[i];
1652  assert(isfinite(y[i]));
1653  #else
1654  y[i]=(x[i]*log(x[i]/y[i])) + y[i] - x[i];
1655  #endif
1656  }
1657 }
1658 
1659 
1668 void serrorbd1_x86(const int n, const float *x, float *__restrict__ y)
1669 {
1670  int i;
1671 
1672  #ifdef With_ICC
1673  #pragma loop_count min=16
1674  #pragma simd
1675  #else
1676  #ifdef With_OMP
1677  #pragma omp parallel for
1678  #endif
1679  #endif
1680  for (i=0; i<n; i++)
1681  {
1682  #ifdef With_Check
1683  /* Here we can have NaN and Inf if y(i) and/or x(i)=0 */
1684  y[i]=(x[i]*logf(x[i]/y[i])) + y[i] - x[i];
1685  assert(isfinite(y[i]));
1686  #else
1687  y[i]=(x[i]*logf(x[i]/y[i])) + y[i] - x[i];
1688  #endif
1689  }
1690 }
1691 
1692 
1702 void derrorbdg_x86(const int n, const double *x, double *__restrict__ y, const double beta)
1703 {
1704  int i;
1705 
1706  double
1707  dbeta, dtmp;
1708 
1709  dbeta=beta-1.0;
1710  dtmp =beta*dbeta;
1711 
1712  #ifdef With_ICC
1713  #pragma loop_count min=16
1714  #pragma simd
1715  #else
1716  #ifdef With_OMP
1717  #pragma omp parallel for
1718  #endif
1719  #endif
1720  for (i=0; i<n; i++)
1721  {
1722  #ifdef With_Check
1723  y[i]=(pow(x[i],beta) + (dbeta*pow(y[i],beta)) - (beta*x[i]*pow(y[i],dbeta))) / dtmp;
1724  assert(isfinite(y[i]));
1725  #else
1726  y[i]=(pow(x[i],beta) + (dbeta*pow(y[i],beta)) - (beta*x[i]*pow(y[i],dbeta))) / dtmp;
1727  #endif
1728  }
1729 }
1730 
1731 
1741 void serrorbdg_x86(const int n, const float *x, float *__restrict__ y, const float beta)
1742 {
1743  int i;
1744 
1745  float
1746  fbeta, ftmp;
1747 
1748  fbeta=beta-1.0f;
1749  ftmp =beta*fbeta;
1750 
1751  #ifdef With_ICC
1752  #pragma loop_count min=16
1753  #pragma simd
1754  #else
1755  #ifdef With_OMP
1756  #pragma omp parallel for
1757  #endif
1758  #endif
1759  for (i=0; i<n; i++)
1760  {
1761  #ifdef With_Check
1762  y[i]=(powf(x[i],beta) + (fbeta*powf(y[i],beta)) - (beta*x[i]*powf(y[i],fbeta))) / ftmp;
1763  assert(isfinite(y[i]));
1764  #else
1765  y[i]=(powf(x[i],beta) + (fbeta*powf(y[i],beta)) - (beta*x[i]*powf(y[i],fbeta))) / ftmp;
1766  #endif
1767  }
1768 }
1769 
1770 
1782 double derrorbd_x86(const int m, const int n, const int k, const double *A, const double *W, const double *H, const double beta)
1783 {
1784  double
1785  error=0.0,
1786  *tmp=NULL;
1787 
1788  #ifdef With_MKL
1789  tmp = (double *)mkl_malloc(m*n*sizeof(double), WRDLEN);
1790  #else
1791  #ifdef With_ARM
1792  tmp = (double *)(m*n*sizeof(double));
1793  #else
1794  tmp = (double *)_mm_malloc(m*n*sizeof(double), WRDLEN);
1795  #endif
1796  #endif
1797 
1798  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, tmp, m);
1799 
1800  if (beta>=0.0 && beta<=0.0)
1801  derrorbd0_x86(m*n, A, tmp);
1802  else
1803  {
1804  if (beta>=1.0 && beta<=1.0)
1805  derrorbd1_x86(m*n, A, tmp);
1806  else
1807  derrorbdg_x86(m*n, A, tmp, beta);
1808  }
1809 
1810  /* Old */
1811  /* #ifdef With_ICC */
1812  /* #pragma loop_count min=512 */
1813  /* #pragma simd reduction(+ : error) */
1814  /* #else */
1815  /* #pragma omp parallel for reduction(+ : error) */
1816  /* #endif */
1817  /* for (i=0; i<m*n; i++) */
1818  /* error += tmp[i]; */
1819  /* all tmp elements are >=0 so we use MKL/BLAS */
1820  error=cblas_dasum(m*n, tmp, 1);
1821 
1822  error=sqrt((2.0*error)/((double)m*n));
1823 
1824  #ifdef With_MKL
1825  mkl_free(tmp);
1826  #else
1827  #ifdef With_ARM
1828  free(tmp);
1829  #else
1830  _mm_free(tmp);
1831  #endif
1832  #endif
1833 
1834  return error;
1835 }
1836 
1837 
1849 float serrorbd_x86(const int m, const int n, const int k, const float *A, const float *W, const float *H, const float beta)
1850 {
1851  float
1852  error=0.0,
1853  *tmp=NULL;
1854 
1855  #ifdef With_MKL
1856  tmp = (float *)mkl_malloc(m*n*sizeof(float), WRDLEN);
1857  #else
1858  #ifdef With_ARM
1859  tmp = (float *)malloc(m*n*sizeof(float));
1860  #else
1861  tmp = (float *)_mm_malloc(m*n*sizeof(float), WRDLEN);
1862  #endif
1863  #endif
1864 
1865  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, W, m, H, k, 0.0, tmp, m);
1866 
1867  if (beta>=0.0 && beta<=0.0)
1868  serrorbd0_x86(m*n, A, tmp);
1869  else
1870  {
1871  if (beta>=1.0 && beta<=1.0)
1872  serrorbd1_x86(m*n, A, tmp);
1873  else
1874  serrorbdg_x86(m*n, A, tmp, beta);
1875  }
1876 
1877  /* Old */
1878  /* #ifdef With_ICC */
1879  /* #pragma loop_count min=512 */
1880  /* #pragma simd reduction(+ : error) */
1881  /* #else */
1882  /* #pragma omp parallel for reduction(+ : error) */
1883  /* #endif */
1884  /* for (i=0; i<m*n; i++) */
1885  /* error += tmp[i]; */
1886  /* all tmp elements are >=0 so we use MKL/BLAS */
1887  error=cblas_sasum(m*n, tmp, 1);
1888 
1889  error=sqrtf((2.0f*error)/((float)m*n));
1890 
1891  #ifdef With_MKL
1892  mkl_free(tmp);
1893  #else
1894  #ifdef With_ARM
1895  free(tmp);
1896  #else
1897  _mm_free(tmp);
1898  #endif
1899  #endif
1900 
1901  return error;
1902 }
1903 
void serrorbdg_x86(const int n, const float *x, float *__restrict__ y, const float beta)
This function performs auxiliar simple precision operations when error is computed using betadivergen...
Definition: bdiv_cpu.c:1741
void dupdate1W_x86(const int m, const int n, const double *X, double *__restrict__ W)
Definition: bdiv_cpu.c:1359
void dupdate2W_x86(const int m, const int n, const double *y, const double *B, double *__restrict__ W)
This function performs double precision W(i)=W(i)*(B(i)/y(j))
Definition: bdiv_cpu.c:1503
void supdate2W_x86(const int m, const int n, const float *y, const float *B, float *__restrict__ W)
This function computes simple precision W(i)=W(i)*(B(i)/y(j))
Definition: bdiv_cpu.c:1537
void skernelW_x86(const int m, const int n, const float *L, const float *A, float *__restrict__ R, const float expo)
This function computes simple precision R(pos)=L(i)^expo)*A(i) and R(pos+m)=L(i)*(L(i)^expo) Note exp...
Definition: bdiv_cpu.c:1250
void dkernelH_x86(const int m, const int n, const double *L, const double *A, double *__restrict__ R, const double expo)
Definition: bdiv_cpu.c:1116
int dmlsa_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
dmlsa_cpu performs NNMF using betadivergence when beta=2 using double precision
Definition: mlsa_cpu.c:74
int dbdiv_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const double beta, const int uType, const int nIter)
dbdiv_cpu is a wrapper that calls the adequate function to performs NNMF using betadivergence using d...
Definition: bdiv_cpu.c:1059
void derrorbd0_x86(const int n, const double *x, double *__restrict__ y)
This function performs auxiliar double precision operations when error is computed using betadivergen...
Definition: bdiv_cpu.c:1569
void derrorbd1_x86(const int n, const double *x, double *__restrict__ y)
This function performs auxiliar double precision operations when error is computed using betadivergen...
Definition: bdiv_cpu.c:1635
void derrorbdg_x86(const int n, const double *x, double *__restrict__ y, const double beta)
This function performs auxiliar double precision operations when error is computed using betadivergen...
Definition: bdiv_cpu.c:1702
int sbdiv_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const float beta, const int uType, const int nIter)
Definition: bdiv_cpu.c:1089
int smlsa_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
smlsa_cpu performs NNMF using betadivergence when beta=2 using simple precision
Definition: mlsa_cpu.c:275
void supdate1H_x86(const int n, const float *X, float *__restrict__ H)
This function computes simple precision H(i)=H(i)*B(i)/C(i) where matrices B and C are stored in the ...
Definition: bdiv_cpu.c:1325
int dbdivone_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
dbdivone_cpu performs NNMF using beta-divergence when beta=1, using double precision ...
Definition: bdiv_cpu.c:566
void supdate1W_x86(const int m, const int n, const float *X, float *__restrict__ W)
This function computes simple precision W[i]=W[i]*D[i]/E[i] where matrices D and E are stored in the ...
Definition: bdiv_cpu.c:1397
double derrorbd_x86(const int m, const int n, const int k, const double *A, const double *W, const double *H, const double beta)
This function returns double precision error when error is computed using betadivergence error formul...
Definition: bdiv_cpu.c:1782
void ddiv_x86(const int n, const double *x, double *__restrict__ y)
This function calls the appropriate functions to perform double precision element-wise y[i]=x[i]/y[i] ...
Definition: utils_x86.c:91
int sbdivg_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const float expo, const int uType, int nIter)
sbdivg_cpu performs NNMF using betadivergence for general case (beta <> 1 and 2) using simple precisi...
Definition: bdiv_cpu.c:313
int dbdivg_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const double expo, const int uType, int nIter)
dbdivg_cpu performs the NNMF using beta-divergence when beta is != 1 and !=2, using double precision...
Definition: bdiv_cpu.c:87
int sbdivone_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
sbdivone_cpu performs NNMF using betadivergence when beta=1 using simple precision ...
Definition: bdiv_cpu.c:813
void serrorbd0_x86(const int n, const float *x, float *__restrict__ y)
This function performs auxiliar simple precision operations when error is computed using betadivergen...
Definition: bdiv_cpu.c:1602
void sdiv_x86(const int n, const float *x, float *__restrict__ y)
This function calls the appropriate functions to perform simple precision element-wise x[i]=x[i]/y[i] ...
Definition: utils_x86.c:127
void serrorbd1_x86(const int n, const float *x, float *__restrict__ y)
This function performs auxiliar simple precision operations when error is computed using betadivergen...
Definition: bdiv_cpu.c:1668
void skernelH_x86(const int m, const int n, const float *L, const float *A, float *__restrict__ R, const float expo)
This function computes simple precision R(i)=(L(i)^expo)*A[i] and R(i+m*n)=L[i]*(L(i)^expo) Note "exp...
Definition: bdiv_cpu.c:1160
float serrorbd_x86(const int m, const int n, const int k, const float *A, const float *W, const float *H, const float beta)
This function returns simple precision error when error is computed using betadivergence error formul...
Definition: bdiv_cpu.c:1849
void dupdate2H_x86(const int m, const int n, const double *y, const double *B, double *__restrict__ H)
This function computes double precision H(i)=H(i)*(B(i)/y(j))
Definition: bdiv_cpu.c:1435
void smemset_x86(const int n, float *__restrict__ x, const float val)
This function fills all positions of x with val.
Definition: utils_x86.c:66
void supdate2H_x86(const int m, const int n, const float *y, const float *B, float *__restrict__ H)
This function performs the simple H(i)=H(i)*(B(i)/y(j))
Definition: bdiv_cpu.c:1469
Header file for using the betadivergence cuda functions with CPU.
void dkernelW_x86(const int m, const int n, const double *L, const double *A, double *__restrict__ R, const double expo)
This function computes double precision R(pos)=L(i)^expo)*A(i) and R(pos+m)=L(i)*(L(i)^expo) Note exp...
Definition: bdiv_cpu.c:1204
void dupdate1H_x86(const int n, const double *X, double *__restrict__ H)
This function computes double precision H(i)=H(i)*B(i)/C(i) where matrices B and C are stored in the ...
Definition: bdiv_cpu.c:1293
void dmemset_x86(const int n, double *__restrict__ x, const double val)
This function fills all positions of x with val.
Definition: utils_x86.c:42