NnmfPack  2.1
mlsa_cpu.c
Go to the documentation of this file.
1 /***************************************************************************
2  * Copyright (C) 2014 by PIR (University of Oviedo) and *
3  * INCO2 (Polytechnic University of Valencia) groups. *
4  * nnmfpack@gmail.com *
5  * *
6  * This program is free software; you can redistribute it and/or modify *
7  * it under the terms of the GNU General Public License as published by *
8  * the Free Software Foundation; either version 2 of the License, or *
9  * (at your option) any later version. *
10  * *
11  * This program is distributed in the hope that it will be useful, *
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14  * GNU General Public License for more details. *
15  * *
16  * You should have received a copy of the GNU General Public License *
17  * along with this program; if not, write to the *
18  * Free Software Foundation, Inc., *
19  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
20  ***************************************************************************
21 */
33 #include "mlsa_cpu.h"
34 
35 
74 int dmlsa_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
75 {
76  double
77  *B=NULL,
78  *C=NULL,
79  *L=NULL;
80 
81  int i;
82 
83  switch (uType)
84  {
85  case UpdateAll:
86  #ifdef With_MKL
87  B = (double *)mkl_malloc(max(m,n) * k * sizeof(double), WRDLEN);
88  C = (double *)mkl_malloc(max(m,n) * k * sizeof(double), WRDLEN);
89  L = (double *)mkl_malloc( k * k * sizeof(double), WRDLEN);
90  #else
91  #ifdef With_ARM
92  B = (double *)malloc(max(m,n) * k * sizeof(double));
93  C = (double *)malloc(max(m,n) * k * sizeof(double));
94  L = (double *)malloc( k * k * sizeof(double));
95  #else
96  B = (double *)_mm_malloc(max(m,n) * k * sizeof(double), WRDLEN);
97  C = (double *)_mm_malloc(max(m,n) * k * sizeof(double), WRDLEN);
98  L = (double *)_mm_malloc( k * k * sizeof(double), WRDLEN);
99  #endif
100  #endif
101 
102  if (B == NULL || C == NULL || L == NULL)
103  return -1;
104 
105  for(i=0;i<nIter;i++)
106  {
107  /* ************************ Phase 1 *************************** */
108  /* B=W'*A */
109  cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, n, m, 1.0, W, m, A, m, 0.0, B, k);
110 
111  /* L=W'*W */
112  cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, k, m, 1.0, W, m, W, m, 0.0, L, k);
113 
114  /* C=L*H */
115  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, k, n, k, 1.0, L, k, H, k, 0.0, C, k);
116 
117  /* H=H(*)B(/)C */
118  ddotdiv_x86(k*n, B, C, H);
119 
120 
121  /* ************************ Phase 2 *************************** */
122  /* B=A*H' */
123  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, k, n, 1.0, A, m, H, k, 0.0, B, m);
124 
125  /* L=H*H' */
126  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, k, k, n, 1.0, H, k, H, k, 0.0, L, k);
127 
128  /* C=W*L */
129  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, k, k, 1.0, W, m, L, k, 0.0, C, m);
130 
131  /* W=W(*)B(/)C */
132  ddotdiv_x86(m*k, B, C, W);
133  }
134  #ifdef With_MKL
135  mkl_free(B);
136  mkl_free(C);
137  mkl_free(L);
138  #else
139  #ifdef With_ARM
140  free(B);
141  free(C);
142  free(L);
143  #else
144  _mm_free(B);
145  _mm_free(C);
146  _mm_free(L);
147  #endif
148  #endif
149  break;
150 
151  case UpdateW:
152  #ifdef With_MKL
153  B = (double *)mkl_malloc(m * k * sizeof(double), WRDLEN);
154  C = (double *)mkl_malloc(m * k * sizeof(double), WRDLEN);
155  L = (double *)mkl_malloc(k * k * sizeof(double), WRDLEN);
156  #else
157  #ifdef With_ARM
158  B = (double *)malloc(m * k * sizeof(double));
159  C = (double *)malloc(m * k * sizeof(double));
160  L = (double *)malloc(k * k * sizeof(double));
161  #else
162  B = (double *)_mm_malloc(m * k * sizeof(double), WRDLEN);
163  C = (double *)_mm_malloc(m * k * sizeof(double), WRDLEN);
164  L = (double *)_mm_malloc(k * k * sizeof(double), WRDLEN);
165  #endif
166  #endif
167 
168  if (B == NULL || C == NULL || L == NULL)
169  return -1;
170 
171  for(i=0;i<nIter;i++)
172  {
173  /* ************************ Phase 2 *************************** */
174  /* B=A*H' */
175  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, k, n, 1.0, A, m, H, k, 0.0, B, m);
176 
177  /* L=H*H' */
178  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, k, k, n, 1.0, H, k, H, k, 0.0, L, k);
179 
180  /* C=W*L */
181  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, k, k, 1.0, W, m, L, k, 0.0, C, m);
182 
183  /* W=W(*)B(/)C */
184  ddotdiv_x86(m*k, B, C, W);
185  }
186  #ifdef With_MKL
187  mkl_free(B);
188  mkl_free(C);
189  mkl_free(L);
190  #else
191  #ifdef With_ARM
192  free(B);
193  free(C);
194  free(L);
195  #else
196  _mm_free(B);
197  _mm_free(C);
198  _mm_free(L);
199  #endif
200  #endif
201  break;
202 
203  case UpdateH:
204  #ifdef With_MKL
205  B = (double *)mkl_malloc(n * k * sizeof(double), WRDLEN);
206  C = (double *)mkl_malloc(n * k * sizeof(double), WRDLEN);
207  L = (double *)mkl_malloc(k * k * sizeof(double), WRDLEN);
208  #else
209  #ifdef With_ARM
210  B = (double *)malloc(n * k * sizeof(double));
211  C = (double *)malloc(n * k * sizeof(double));
212  L = (double *)malloc(k * k * sizeof(double));
213  #else
214  B = (double *)_mm_malloc(n * k * sizeof(double), WRDLEN);
215  C = (double *)_mm_malloc(n * k * sizeof(double), WRDLEN);
216  L = (double *)_mm_malloc(k * k * sizeof(double), WRDLEN);
217  #endif
218  #endif
219 
220  if (B == NULL || C == NULL || L == NULL)
221  return -1;
222 
223  for(i=0;i<nIter;i++)
224  {
225  /* ************************ Phase 1 *************************** */
226  /* B=W'*A */
227  cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, n, m, 1.0, W, m, A, m, 0.0, B, k);
228 
229  /* L=W'*W */
230  cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, k, m, 1.0, W, m, W, m, 0.0, L, k);
231 
232  /* C=L*H */
233  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, k, n, k, 1.0, L, k, H, k, 0.0, C, k);
234 
235  /* H=H(*)B(/)C */
236  ddotdiv_x86(k*n, B, C, H);
237  }
238  #ifdef With_MKL
239  mkl_free(B);
240  mkl_free(C);
241  mkl_free(L);
242  #else
243  #ifdef With_ARM
244  free(B);
245  free(C);
246  free(L);
247  #else
248  _mm_free(B);
249  _mm_free(C);
250  _mm_free(L);
251  #endif
252  #endif
253  break;
254 
255  default:
256  return -1;
257  }
258  return 0;
259 }
260 
261 
275 int smlsa_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
276 {
277  float
278  *B=NULL,
279  *C=NULL,
280  *L=NULL;
281 
282  int i;
283 
284  switch (uType)
285  {
286  case UpdateAll:
287  #ifdef With_MKL
288  B = (float *)mkl_malloc(max(m,n) * k * sizeof(float), WRDLEN);
289  C = (float *)mkl_malloc(max(m,n) * k * sizeof(float), WRDLEN);
290  L = (float *)mkl_malloc( m * n * sizeof(float), WRDLEN);
291  #else
292  #ifdef With_ARM
293  B = (float *)malloc(max(m,n) * k * sizeof(float));
294  C = (float *)malloc(max(m,n) * k * sizeof(float));
295  L = (float *)malloc( m * n * sizeof(float));
296  #else
297  B = (float *)_mm_malloc(max(m,n) * k * sizeof(float), WRDLEN);
298  C = (float *)_mm_malloc(max(m,n) * k * sizeof(float), WRDLEN);
299  L = (float *)_mm_malloc( m * n * sizeof(float), WRDLEN);
300  #endif
301  #endif
302 
303  if (B == NULL || C == NULL || L == NULL)
304  return -1;
305 
306  for(i=0;i<nIter;i++)
307  {
308  /* ************************ Phase 1 *************************** */
309  /* B=W'*A */
310  cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, n, m, 1.0, W, m, A, m, 0.0, B, k);
311 
312  /* L=W'*W */
313  cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, k, m, 1.0, W, m, W, m, 0.0, L, k);
314 
315  /* C=L*H */
316  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, k, n, k, 1.0, L, k, H, k, 0.0, C, k);
317 
318  /* H=H(*)B(/)C */
319  sdotdiv_x86(k*n, B, C, H);
320 
321  /* ************************ Phase 2 *************************** */
322  /* B=A*H' */
323  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, k, n, 1.0, A, m, H, k, 0.0, B, m);
324 
325  /* L=H*H' */
326  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, k, k, n, 1.0, H, k, H, k, 0.0, L, k);
327 
328  /* C=W*L */
329  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, k, k, 1.0, W, m, L, k, 0.0, C, m);
330 
331  /* W=W(*)B(/)C */
332  sdotdiv_x86(m*k, B, C, W);
333  }
334  #ifdef With_MKL
335  mkl_free(B);
336  mkl_free(C);
337  mkl_free(L);
338  #else
339  #ifdef With_ARM
340  free(B);
341  free(C);
342  free(L);
343  #else
344  _mm_free(B);
345  _mm_free(C);
346  _mm_free(L);
347  #endif
348  #endif
349  break;
350 
351  case UpdateW:
352  #ifdef With_MKL
353  B = (float *)mkl_malloc(m * k * sizeof(float), WRDLEN);
354  C = (float *)mkl_malloc(m * k * sizeof(float), WRDLEN);
355  L = (float *)mkl_malloc(m * n * sizeof(float), WRDLEN);
356  #else
357  #ifdef With_ARM
358  B = (float *)malloc(m * k * sizeof(float));
359  C = (float *)malloc(m * k * sizeof(float));
360  L = (float *)malloc(m * n * sizeof(float));
361  #else
362  B = (float *)_mm_malloc(m * k * sizeof(float), WRDLEN);
363  C = (float *)_mm_malloc(m * k * sizeof(float), WRDLEN);
364  L = (float *)_mm_malloc(m * n * sizeof(float), WRDLEN);
365  #endif
366  #endif
367 
368  if (B == NULL || C == NULL || L == NULL)
369  return -1;
370 
371  for(i=0;i<nIter;i++)
372  {
373  /* ************************ Phase 2 *************************** */
374  /* B=A*H' */
375  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, k, n, 1.0, A, m, H, k, 0.0, B, m);
376 
377  /* L=H*H' */
378  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, k, k, n, 1.0, H, k, H, k, 0.0, L, k);
379 
380  /* C=W*L */
381  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, k, k, 1.0, W, m, L, k, 0.0, C, m);
382 
383  /* W=W(*)B(/)C */
384  sdotdiv_x86(m*k, B, C, W);
385  }
386  #ifdef With_MKL
387  mkl_free(B);
388  mkl_free(C);
389  mkl_free(L);
390  #else
391  #ifdef With_ARM
392  free(B);
393  free(C);
394  free(L);
395  #else
396  _mm_free(B);
397  _mm_free(C);
398  _mm_free(L);
399  #endif
400  #endif
401  break;
402 
403  case UpdateH:
404  #ifdef With_MKL
405  B = (float *)mkl_malloc(n * k * sizeof(float), WRDLEN);
406  C = (float *)mkl_malloc(n * k * sizeof(float), WRDLEN);
407  L = (float *)mkl_malloc(m * n * sizeof(float), WRDLEN);
408  #else
409  #ifdef With_ARM
410  B = (float *)malloc(n * k * sizeof(float));
411  C = (float *)malloc(n * k * sizeof(float));
412  L = (float *)malloc(m * n * sizeof(float));
413  #else
414  B = (float *)_mm_malloc(n * k * sizeof(float), WRDLEN);
415  C = (float *)_mm_malloc(n * k * sizeof(float), WRDLEN);
416  L = (float *)_mm_malloc(m * n * sizeof(float), WRDLEN);
417  #endif
418  #endif
419 
420  if (B == NULL || C == NULL || L == NULL)
421  return -1;
422 
423  for(i=0;i<nIter;i++)
424  {
425  /* ************************ Phase 1 *************************** */
426  /* B=W'*A */
427  cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, n, m, 1.0, W, m, A, m, 0.0, B, k);
428 
429  /* L=W'*W */
430  cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, k, k, m, 1.0, W, m, W, m, 0.0, L, k);
431 
432  /* C=L*H */
433  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, k, n, k, 1.0, L, k, H, k, 0.0, C, k);
434 
435  /* H=H(*)B(/)C */
436  sdotdiv_x86(k*n, B, C, H);
437  }
438  #ifdef With_MKL
439  mkl_free(B);
440  mkl_free(C);
441  mkl_free(L);
442  #else
443  #ifdef With_ARM
444  free(B);
445  free(C);
446  free(L);
447  #else
448  _mm_free(B);
449  _mm_free(C);
450  _mm_free(L);
451  #endif
452  #endif
453  break;
454 
455  default:
456  return -1;
457  }
458  return 0;
459 }
460 
461 
471 void ddotdiv_x86(const int n, const double *x, const double *y, double *__restrict__ z)
472 {
473  #ifdef With_MKL
474  vdMul(n, z, x, z);
475  vdDiv(n, z, y, z);
476  #else
477  int i;
478  #ifdef With_ICC
479  #pragma loop_count min=32
480  #pragma simd
481  #else
482  #ifdef With_OMP
483  #pragma omp parallel for
484  #endif
485  #endif
486  for (i=0; i<n; i++)
487  {
488  #ifdef With_Check
489  /* Here we can have NaN and Inf if y(i) and/or x(i) and/or z[i]=0 */
490  z[i]=z[i] * x[i] / y[i];
491  assert(isfinite(z[i]));
492  #else
493  z[i]=z[i] * x[i] / y[i];
494  #endif
495  }
496  #endif
497 }
498 
499 
509 void sdotdiv_x86(const int n, const float *x, const float *y, float *__restrict__ z)
510 {
511  #ifdef With_MKL
512  vsMul(n, z, x, z);
513  vsDiv(n, z, y, z);
514  #else
515  int i;
516  #ifdef With_ICC
517  #pragma loop_count min=32
518  #pragma simd
519  #else
520  #ifdef With_OMP
521  #pragma omp parallel for
522  #endif
523  #endif
524  for (i=0; i<n; i++)
525  {
526  #ifdef With_Check
527  /* Here we can have NaN and Inf if y(i) and/or x(i) and/or z[i]=0 */
528  z[i]=z[i] * x[i] / y[i];
529  assert(isfinite(z[i]));
530  #else
531  z[i]=z[i] * x[i] / y[i];
532  #endif
533  }
534  #endif
535 }
int dmlsa_cpu(const int m, const int n, const int k, const double *A, double *W, double *H, const int uType, const int nIter)
dmlsa_cpu performs NNMF using betadivergence when beta=2 using double precision
Definition: mlsa_cpu.c:74
void ddotdiv_x86(const int n, const double *x, const double *y, double *__restrict__ z)
This function calls the appropiate funtions to performs double precision element-wise z[i]=z[i]*x[i]/...
Definition: mlsa_cpu.c:471
File with functions to calcule NNMF using the mlsa algorithm for CPUs.
int smlsa_cpu(const int m, const int n, const int k, const float *A, float *W, float *H, const int uType, const int nIter)
smlsa_cpu performs NNMF using betadivergence when beta=2 using simple precision
Definition: mlsa_cpu.c:275
void sdotdiv_x86(const int n, const float *x, const float *y, float *__restrict__ z)
This function calls the appropiate funtions to performs simple precision element-wise z[i]=z[i]*x[i]/...
Definition: mlsa_cpu.c:509