ReMAS  1.5
Real-time Musical Accompaniment System
GPUFunctions.cu
Go to the documentation of this file.
1 /**************************************************************************
2  * Copyright (C) 2017 by "Information Retrieval and Parallel Computing" *
3  * group (University of Oviedo, Spain), "Interdisciplinary Computation *
4  * and Communication" group (Polytechnic University of Valencia, Spain) *
5  * and "Signal Processing and Telecommunication Systems Research" group *
6  * (University of Jaen, Spain) *
7  * Contact: remaspack@gmail.com *
8  * *
9  * This program is free software; you can redistribute it and/or modify *
10  * it under the terms of the GNU General Public License as published by *
11  * the Free Software Foundation; either version 2 of the License, or *
12  * (at your option) any later version. *
13  * *
14  * This program is distributed in the hope that it will be useful, *
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
17  * GNU General Public License for more details. *
18  * *
19  * You should have received a copy of the GNU General Public License *
20  * along with this program; if not, write to the *
21  * Free Software Foundation, Inc., *
22  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
23  **************************************************************************
24 */
34 #include "GPUFunctions.h"
35 
36 extern "C" {
37 #include "../common/FileFunctions.h"
38 }
39 
40 
/**
 * NextPow2 returns the smallest power of two that is >= x.
 *
 * Smearing the high bit into every lower position of (x-1) and adding one
 * yields the next power of two; an exact power of two maps to itself.
 *
 * @param x input value (host-side helper).
 * @return the next power of two >= x.
 */
unsigned int NextPow2(unsigned int x)
{
   unsigned int v = x - 1U;

   /* Propagate the highest set bit into all lower bit positions. */
   for (unsigned int shift = 1U; shift <= 16U; shift <<= 1)
      v |= v >> shift;

   return v + 1U;
}
58 
59 
66 inline bool IsPow2(unsigned int x) { return ((x & (x-1)) == 0); }
67 
68 
/**
 * HaveCompatibleGPU checks whether the system has a GPU suitable for ReMAS.
 *
 * Only device 0 is inspected. Requirements enforced here: CUDA compute
 * capability >= 3, CUDA driver version >= 6.0, and unified-addressing
 * support (needed because the allocators below use cudaMallocManaged).
 *
 * @param maxGrid [out] maximum 1-D grid size of device 0 (maxGridSize[0]).
 * @return OK on success, ErrGpuWrong if a requirement is not met; CUDAERR
 *         propagates CUDA API errors.
 */
int HaveCompatibleGPU(int &maxGrid)
{
   int deviceCount, driverVersion;

   cudaDeviceProp deviceProp;

   /* Fails (via CUDAERR) when no CUDA device is present. */
   CUDAERR(cudaGetDeviceCount(&deviceCount));

   CUDAERR(cudaGetDeviceProperties(&deviceProp, 0));
   if (deviceProp.major < 3) {
      printf("Sorry, we need CUDA Capability >=3\n");
      return ErrGpuWrong;
   }
   maxGrid=deviceProp.maxGridSize[0];

   /* cudaDriverGetVersion reports e.g. 6050 for 6.5, hence the /1000. */
   CUDAERR(cudaDriverGetVersion(&driverVersion));
   if ((driverVersion/1000) < 6) {
      printf("Sorry, we need CUDA Version >=6\n");
      return ErrGpuWrong;
   }

   if (!deviceProp.unifiedAddressing) {
      printf("Your system does not support Unified Memory\n");
      return ErrGpuWrong;
   }

   return OK;
}
103 
104 
/**
 * AllocS_fkGPU allocates unified memory for the S_fk spectral-pattern matrix,
 * reads its data from file, and allocates the auxiliary buffers needed by the
 * general beta divergence.
 *
 * The BETA test below is a float-equality check written as a pair of range
 * comparisons (avoids -Wfloat-equal warnings): the auxiliary vectors tauxi
 * and ts_fk are only needed when BETA is neither exactly 0 nor exactly 1,
 * i.e. for the general beta-divergence case.
 *
 * @param s_fk   [out] nmidi*nbases pattern matrix (unified memory).
 * @param tauxi  [out] nmidi auxiliary vector (only allocated for general BETA).
 * @param ts_fk  [out] nmidi*nbases auxiliary matrix (only for general BETA).
 * @param BETA   beta-divergence parameter.
 * @param nmidi  number of MIDI rows.
 * @param nbases number of score bases (columns).
 * @param NameFiles input file names (file_partitura holds the S_fk data).
 * @return OK on success; CUDAERR/CHECKERR propagate errors.
 */
int AllocS_fkGPU(MyType **s_fk, MyType **tauxi, MyType **ts_fk, const MyType BETA, const int nmidi,
                 const int nbases, DTWfiles NameFiles)
{
   CUDAERR(cudaMallocManaged((void **)s_fk, sizeof(MyType)*nmidi*nbases, cudaMemAttachGlobal));
   CHECKERR(ReadS_fk((*s_fk), nbases, NameFiles.file_partitura));

   /* True when BETA != 0.0 and BETA != 1.0 (general beta divergence). */
   if (!(BETA>=(MyType)0.0 && BETA<=(MyType)0.0) && !(BETA>=(MyType)1.0 && BETA<=(MyType)1.0))
   {
      CUDAERR(cudaMallocManaged((void **)tauxi, sizeof(MyType)*nmidi, cudaMemAttachGlobal));
      CUDAERR(cudaMallocManaged((void **)ts_fk, sizeof(MyType)*nmidi*nbases, cudaMemAttachGlobal));
   }

   return OK;
}
131 
132 
/**
 * AllocDataGPU allocates host and unified memory for the score/state
 * structures, reads them from file, and expands the per-state sequence into
 * the I_SxD lookup vector used by the DTW stage.
 *
 * @param v_hanning     [out] Hanning window, tamtrama elements (unified mem).
 * @param states_time_i [out] per-state start times (host, calloc).
 * @param states_time_e [out] per-state end times (host, calloc).
 * @param states_seq    [out] per-state sequence indices (host, calloc).
 * @param states_corr   [out] per-state correction values (host, calloc).
 * @param I_SxD         [out] state index for every DTW time step (unified mem).
 * @param DTWSize       [out] DTW length = last end time + 1.
 * @param tamtrama      frame size in samples (Hanning window length).
 * @param nstates       number of score states.
 * @param NameFiles     input file names.
 * @return OK on success; CHECKNULL/CHECKERR/CUDAERR propagate errors.
 */
int AllocDataGPU(MyType **v_hanning, int **states_time_i, int **states_time_e, int **states_seq, int **states_corr,
                 int **I_SxD, int *DTWSize, const int tamtrama, const int nstates, DTWfiles NameFiles)
{
   int i, j, pos;

   CHECKNULL((*states_time_i)=(int *)calloc(nstates, sizeof(int)));
   CHECKNULL((*states_time_e)=(int *)calloc(nstates, sizeof(int)));
   CHECKNULL((*states_seq)   =(int *)calloc(nstates, sizeof(int)));
   CHECKNULL((*states_corr)  =(int *)calloc(nstates, sizeof(int)));

   /* Files store 64-bit integers; ReadVectorInt64 narrows them into int. */
   CHECKERR(ReadVectorInt64((*states_seq), nstates, NameFiles.fileStates_seq));
   CHECKERR(ReadVectorInt64((*states_time_i), nstates, NameFiles.fileStates_Time_i));
   CHECKERR(ReadVectorInt64((*states_time_e), nstates, NameFiles.fileStates_Time_e));
   CHECKERR(ReadVectorInt64((*states_corr), nstates, NameFiles.fileStates_corr));

   /* DTW length is fixed by the end time of the last state. */
   (*DTWSize)=(*states_time_e)[nstates - 1] + 1;

   CUDAERR(cudaMallocManaged((void **)I_SxD, sizeof(int)*(*DTWSize), cudaMemAttachGlobal));

   /* Expand states: every time step in [time_i[i], time_e[i]] maps to
      states_seq[i]. NOTE(review): assumes the state intervals are contiguous
      and cover exactly [0, DTWSize) so pos never exceeds DTWSize — verify
      against the score-file generator. */
   pos=0;
   for (i=0; i<nstates; i++)
   {
      for (j=(*states_time_i)[i]; j<=(*states_time_e)[i]; j++)
      {
         (*I_SxD)[pos]=(*states_seq)[i];
         pos++;
      }
   }

   CUDAERR(cudaMallocManaged((void **)v_hanning, sizeof(MyType)*tamtrama, cudaMemAttachGlobal));
   CHECKERR(ReadVector((*v_hanning), tamtrama, NameFiles.file_hanning));

   return OK;
}
182 
183 
/**
 * AllocFFTGPU allocates unified memory for the FFT working vectors, creates
 * the 1-D real-to-complex cuFFT plan, and reads the kmin/kmax FFT-bin tables
 * from file.
 *
 * Precision follows the SIMPLE compile flag: single (R2C) or double (D2Z).
 *
 * @param plan     [out] cuFFT plan handle, created here.
 * @param X_fft    [out] real FFT input, (2*nfft+1) elements (unified memory).
 * @param Out_fft  [out] complex FFT output, nfft complex elements.
 * @param Mod_fft  [out] spectrum modulus vector, nfft elements.
 * @param kmin_fft [out] lower FFT-bin index per MIDI note (N_MIDI entries).
 * @param kmax_fft [out] upper FFT-bin index per MIDI note (N_MIDI entries).
 * @param nfft     FFT size.
 * @param NameFiles input file names.
 * @return OK on success, ErrFFTSched if plan is NULL; CUDAERR/CUFFTERR/
 *         CHECKERR propagate errors.
 */
int AllocFFTGPU(MyFFTGPUType *plan, MyType **X_fft, MyType **Out_fft, MyType **Mod_fft, int *kmin_fft,
                int *kmax_fft, const int nfft, DTWfiles NameFiles)
{
   /* Guard BEFORE cufftPlan1d dereferences the pointer (the old code checked
      after use, when a NULL plan would already have crashed). */
   if (plan==NULL) return ErrFFTSched;

   /* BUGFIX: the original size expression was sizeof(MyType)*2*nfft+1, which
      adds one BYTE, not one element. Parenthesized so that (2*nfft+1)
      elements are allocated, as the name X_fft[2*nfft+1] suggests. */
   CUDAERR(cudaMallocManaged((void **)X_fft, sizeof(MyType)*(2*nfft+1), cudaMemAttachGlobal));
   CUDAERR(cudaMallocManaged((void **)Mod_fft, sizeof(MyType)*nfft, cudaMemAttachGlobal));
   /* NOTE(review): for a real input only nfft/2+1 output bins are distinct;
      Mod_fft could possibly be shrunk to nfft/2+1 — verify against users. */

   #ifdef SIMPLE
      CUDAERR(cudaMallocManaged((void **)Out_fft, sizeof(cufftComplex)*nfft, cudaMemAttachGlobal));
      CUFFTERR(cufftPlan1d(plan, nfft, CUFFT_R2C, 1));
   #else
      CUDAERR(cudaMallocManaged((void **)Out_fft, sizeof(cufftDoubleComplex)*nfft, cudaMemAttachGlobal));
      CUFFTERR(cufftPlan1d(plan, nfft, CUFFT_D2Z, 1));
   #endif

   CHECKERR(ReadVectorInt64(kmax_fft, N_MIDI, NameFiles.file_kmax));
   CHECKERR(ReadVectorInt64(kmin_fft, N_MIDI, NameFiles.file_kmin));

   return OK;
}
219 
220 
/**
 * AllocDTWGPU allocates unified memory for the DTW cost vectors and the
 * reduction scratch buffer.
 *
 * BlocksAndThreads is called only to learn how many blocks the reduction
 * kernels will launch for DTWSize elements, which fixes the size of sdata
 * (one partial result per block).
 *
 * @param pV             [out] DTW cost vector, DTWSizePlusPad elements.
 * @param v_SxD          [out] distance vector, DTWSize elements.
 * @param sdata          [out] per-block reduction scratch, numBlocks elements.
 * @param maxGrid        maximum 1-D grid size of the device.
 * @param DTWSize        number of DTW time steps.
 * @param DTWSizePlusPad DTW size including padding.
 * @return OK on success; CUDAERR propagates CUDA errors.
 */
int AllocDTWGPU(MyType **pV, MyType **v_SxD, MyType **sdata, const int maxGrid, const int DTWSize, const int DTWSizePlusPad)
{
   int numThreads, numBlocks, sharedSize;

   BlocksAndThreads(&numBlocks, &numThreads, &sharedSize, maxGrid, DTWSize);

   CUDAERR(cudaMallocManaged((void **)pV, sizeof(MyType)*DTWSizePlusPad, cudaMemAttachGlobal));
   CUDAERR(cudaMallocManaged((void **)v_SxD, sizeof(MyType)*DTWSize, cudaMemAttachGlobal));
   CUDAERR(cudaMallocManaged((void **)sdata, sizeof(MyType)*numBlocks, cudaMemAttachGlobal));

   return OK;
}
244 
245 
/**
 * AllocAuxiGPU reserves memory for the norms, audio-frame, v_cfreq and
 * v_dxState vectors.
 *
 * The audio frame is the one buffer NOT in unified memory: GPUframe is plain
 * device memory and CPUframe is pinned, write-combined host memory, a layout
 * chosen for fast repeated host-to-device streaming of incoming audio
 * (write-combined memory is fast for the CPU to write and for DMA to read,
 * but slow for the CPU to read back).
 *
 * @param norms     [out] per-base norms, nbases elements (unified memory).
 * @param GPUframe  [out] device-side audio frame, tamframe shorts.
 * @param CPUframe  [out] pinned host-side audio frame, tamframe shorts.
 * @param v_cfreq   [out] nmidi elements (unified memory).
 * @param v_dxState [out] nbases elements (unified memory).
 * @param nbases    number of score bases.
 * @param tamframe  audio frame size in samples.
 * @param nmidi     number of MIDI rows.
 * @return OK on success; CUDAERR propagates CUDA errors.
 */
int AllocAuxiGPU(MyType **norms, short **GPUframe, short **CPUframe, MyType **v_cfreq, MyType **v_dxState, const int nbases,
                 const int tamframe, const int nmidi)
{
   CUDAERR(cudaMallocManaged((void **)norms, sizeof(MyType)*nbases, cudaMemAttachGlobal));
   CUDAERR(cudaMallocManaged((void **)v_dxState, sizeof(MyType)*nbases, cudaMemAttachGlobal));
   CUDAERR(cudaMallocManaged((void **)v_cfreq, sizeof(MyType)*nmidi, cudaMemAttachGlobal));

   CUDAERR(cudaMalloc    ((void **)GPUframe, sizeof(short)*tamframe));
   CUDAERR(cudaHostAlloc((void **)CPUframe, sizeof(short)*tamframe, cudaHostAllocWriteCombined));

   return OK;
}
271 
272 
/**
 * BlocksAndThreads computes a suitable launch configuration (blocks, threads
 * per block) and the dynamic shared-memory size for the reduction kernels,
 * given the element count and the device's maximum 1-D grid size.
 *
 * Each thread reduces two elements, hence the factor of 2 throughout. When
 * a block has no more threads than one warp, the shared buffer is doubled so
 * the warp-level tail of the reduction can read without bounds checks.
 *
 * @param blocks     [out] number of blocks to launch.
 * @param threads    [out] threads per block (a power of two).
 * @param sharedsize [out] dynamic shared memory bytes per block.
 * @param maxGrid    device limit on gridDim.x.
 * @param size       number of elements to reduce.
 */
void BlocksAndThreads(int *blocks, int *threads, int *sharedsize, const int maxGrid, const int size)
{
   int nthreads = (size < maxThreads*2) ? NextPow2((size + 1) / 2) : maxThreads;
   int nblocks  = (size + nthreads*2 - 1) / (nthreads*2);

   /* If the grid would be too large, halve it and double the block size. */
   if (nblocks > maxGrid)
   {
      nblocks  /= 2;
      nthreads *= 2;
   }

   if (nblocks > maxBlocks) nblocks = maxBlocks;

   (*threads)    = nthreads;
   (*blocks)     = nblocks;
   (*sharedsize) = (nthreads <= sizeWarp) ? 2*nthreads*sizeof(MyType) : nthreads*sizeof(MyType);
}
297 
298 
/**
 * FFTGPU executes the (already planned) real-to-complex FFT on the GPU.
 *
 * The precision matches AllocFFTGPU: single (R2C) when SIMPLE is defined,
 * double (D2Z) otherwise. The call is asynchronous with respect to the host.
 *
 * @param X_fft   real input vector (device-accessible).
 * @param Out_fft complex output vector (device-accessible).
 * @param plan    cuFFT plan created by AllocFFTGPU.
 * @return OK on success; CUFFTERR propagates cuFFT errors.
 */
int FFTGPU(MyType *X_fft, MyType *Out_fft, MyFFTGPUType *plan)
{
   #ifdef SIMPLE
      CUFFTERR(cufftExecR2C(*plan, (cufftReal *)X_fft, (cufftComplex *)Out_fft));
   #else
      CUFFTERR(cufftExecD2Z(*plan, (cufftDoubleReal *)X_fft, (cufftDoubleComplex *)Out_fft));
   #endif

   return OK;
}
317 
318 
/**
 * InitSxD launches the CUDA kernels that set up the v_SxD vector and reduce
 * it, leaving the reduction result in odata for the final normalization.
 *
 * Structure: one kernel_InitSxD pass produces one partial result per block in
 * odata; the while-loop then repeatedly reduces those partials in place with
 * kernel_Sum until a single value remains; kernel_Vnorm (a 1x1 launch)
 * finishes the normalization on the device. All launches are asynchronous on
 * the default stream, so they execute in order without host synchronization.
 *
 * @param odata     [out] reduction workspace/result (device-accessible).
 * @param v_SxD     [out] SxD vector filled by kernel_InitSxD.
 * @param v_dxState per-state distances (read-only on device).
 * @param I_SxD     time-step -> state lookup (read-only on device).
 * @param maxGrid   device limit on gridDim.x.
 * @param size      number of elements of v_SxD.
 */
void InitSxD(MyType *odata, MyType *v_SxD, const MyType* __restrict__ v_dxState, const int* __restrict__ I_SxD,
             const int maxGrid, const int size)
{
   int numBlocks=0, numThreads=0, sharedSize=0, s;

   BlocksAndThreads(&numBlocks, &numThreads, &sharedSize, maxGrid, size);

   kernel_InitSxD<<<numBlocks, numThreads, sharedSize>>>(odata, v_SxD, v_dxState, I_SxD, numThreads, IsPow2(size), size);

   /* Reduce the per-block partials until one value remains. */
   s = numBlocks;
   while (s > 1)
   {
      BlocksAndThreads(&numBlocks, &numThreads, &sharedSize, maxGrid, s);

      kernel_Sum<<<numBlocks, numThreads, sharedSize>>>(odata, odata, numThreads, IsPow2(s), s);
      s = (s + (numThreads*2-1)) / (numThreads*2);

   }
   kernel_Vnorm<<<1, 1>>>(odata);
}
350 
351 
/**
 * OneImin launches CUDA kernels to find one minimum of idata and its
 * position, using a multi-pass block reduction.
 *
 * The shared-memory request is doubled (2*sharedSize) because these kernels
 * track value/position pairs instead of single values. After the final pass,
 * cudaDeviceSynchronize makes the device results visible to the host before
 * opos[0] is read — this requires odata/opos to be host-accessible
 * (unified/managed memory, consistent with this file's allocators).
 *
 * @param odata   [out] per-pass minimum values workspace (device-accessible).
 * @param opos    [out] per-pass minimum positions; opos[0] is the answer.
 * @param idata   input vector to minimize.
 * @param maxGrid device limit on gridDim.x.
 * @param size    number of elements of idata.
 * @return the position of the minimum (opos[0]).
 */
int OneImin(MyType *odata, int *opos, MyType *idata, const int maxGrid, const int size)
{
   int numBlocks=0, numThreads=0, sharedSize=0, s;

   BlocksAndThreads(&numBlocks, &numThreads, &sharedSize, maxGrid, size);

   kernel_OneImin<<<numBlocks, numThreads, 2*sharedSize>>>(odata, opos, idata, numThreads, IsPow2(size), size);

   /* Reduce per-block (value, position) partials until one pair remains. */
   s = numBlocks;
   while (s > 1)
   {
      BlocksAndThreads(&numBlocks, &numThreads, &sharedSize, maxGrid, s);

      kernel_OneIminLast<<<numBlocks, numThreads, 2*sharedSize>>>(odata, opos, odata, opos, numThreads, IsPow2(s), s);
      s = (s + (numThreads*2-1)) / (numThreads*2);
   }
   cudaDeviceSynchronize();

   return opos[0];
}
382 
383 
/**
 * FirstImin launches CUDA kernels to find the first minimum (the smallest
 * position among equal minima) of idata and its position.
 *
 * Same multi-pass structure as OneImin: a first kernel produces per-block
 * (value, position) partials, then kernel_FirstIminLast repeatedly reduces
 * them in place until one pair remains. Shared memory is doubled because
 * pairs, not single values, are reduced. cudaDeviceSynchronize makes the
 * result visible to the host before opos[0] is read (opos must be
 * host-accessible, i.e. unified/managed memory).
 *
 * @param odata   [out] per-pass minimum values workspace (device-accessible).
 * @param opos    [out] per-pass minimum positions; opos[0] is the answer.
 * @param idata   input vector to minimize.
 * @param maxGrid device limit on gridDim.x.
 * @param size    number of elements of idata.
 * @return the position of the first minimum (opos[0]).
 */
int FirstImin(MyType *odata, int *opos, MyType *idata, const int maxGrid, const int size)
{
   int numBlocks=0, numThreads=0, sharedSize=0, s;

   BlocksAndThreads(&numBlocks, &numThreads, &sharedSize, maxGrid, size);

   kernel_FirstImin<<<numBlocks, numThreads, 2*sharedSize>>>(odata, opos, idata, numThreads, IsPow2(size), size);

   s = numBlocks;
   while (s > 1)
   {
      BlocksAndThreads(&numBlocks, &numThreads, &sharedSize, maxGrid, s);

      kernel_FirstIminLast<<<numBlocks, numThreads, 2*sharedSize>>>(odata, opos, odata, opos, numThreads, IsPow2(s), s);
      s = (s + (numThreads*2-1)) / (numThreads*2);
   }
   cudaDeviceSynchronize();

   return opos[0];
}
414 
415 
/**
 * LastImin launches CUDA kernels to find the last minimum (the largest
 * position among equal minima) of idata and its position.
 *
 * Same multi-pass (value, position) reduction as OneImin/FirstImin, with the
 * doubled shared-memory request for the pairs. cudaDeviceSynchronize makes
 * the result visible to the host before opos[0] is read (opos must be
 * host-accessible, i.e. unified/managed memory).
 *
 * @param odata   [out] per-pass minimum values workspace (device-accessible).
 * @param opos    [out] per-pass minimum positions; opos[0] is the answer.
 * @param idata   input vector to minimize.
 * @param maxGrid device limit on gridDim.x.
 * @param size    number of elements of idata.
 * @return the position of the last minimum (opos[0]).
 */
int LastImin(MyType *odata, int *opos, MyType *idata, const int maxGrid, const int size)
{
   int numBlocks=0, numThreads=0, sharedSize=0, s;

   BlocksAndThreads(&numBlocks, &numThreads, &sharedSize, maxGrid, size);

   kernel_LastImin<<<numBlocks, numThreads, 2*sharedSize>>>(odata, opos, idata, numThreads, IsPow2(size), size);

   s = numBlocks;
   while (s > 1)
   {
      BlocksAndThreads(&numBlocks, &numThreads, &sharedSize, maxGrid, s);

      kernel_LastIminLast<<<numBlocks, numThreads, 2*sharedSize>>>(odata, opos, odata, opos, numThreads, IsPow2(s), s);
      s = (s + (numThreads*2-1)) / (numThreads*2);
   }
   cudaDeviceSynchronize();

   return opos[0];
}
446 
447 
/**
 * ReadWavGPU1st reads the first audio frame from a WAV file when an NVIDIA
 * GPU is used.
 *
 * The first read fills only the tail of the frame: TTminusTM samples are
 * stored starting at offset TAMMUESTRA of both the host and device buffers.
 * Subsequent frames are handled by ReadWavGPU, which shifts and appends
 * TAMMUESTRA new samples per call.
 *
 * @param GPUframe device-side frame buffer.
 * @param CPUframe host-side (pinned) frame buffer.
 * @param fp       open WAV file positioned at the sample data.
 * @return OK on success, ErrReadFile on short read; CUDAERR propagates CUDA
 *         errors.
 */
int ReadWavGPU1st(short *GPUframe, short *CPUframe, FILE *fp)
{
   if (fread(&CPUframe[TAMMUESTRA], sizeof(short), TTminusTM, fp) != TTminusTM) return ErrReadFile;

   CUDAERR(cudaMemcpy(&GPUframe[TAMMUESTRA], &CPUframe[TAMMUESTRA], sizeof(short)*TTminusTM, cudaMemcpyHostToDevice));

   return OK;
}
464 
/**
 * ReadWavGPU reads the current audio frame from a WAV file when an NVIDIA
 * GPU is used.
 *
 * kernel_Shift discards the oldest TAMMUESTRA samples of the device frame
 * (sliding window of TAMTRAMA samples); the fread overlaps with that kernel,
 * and the new samples are then appended at the tail of the device frame.
 *
 * @param GPUframe device-side frame buffer (TAMTRAMA samples).
 * @param CPUframe host-side (pinned) staging buffer.
 * @param fp       open WAV file positioned at the next samples.
 * @return OK on success, ErrReadFile on short read; CUDAERR propagates CUDA
 *         errors.
 */
int ReadWavGPU(short *GPUframe, short *CPUframe, FILE *fp)
{
   kernel_Shift<<<1, TAMMUESTRA>>>(GPUframe, TAMTRAMA, TAMMUESTRA);

   if (fread(CPUframe, sizeof(short), TAMMUESTRA, fp) != TAMMUESTRA) return ErrReadFile;

   /* No explicit cudaDeviceSynchronize is needed here: the cudaMemcpy below
      is issued on the same (legacy default) stream as kernel_Shift, so it is
      ordered after the kernel completes. */

   CUDAERR(cudaMemcpy(&GPUframe[TTminusTM], CPUframe, sizeof(short)*TAMMUESTRA, cudaMemcpyHostToDevice));

   return OK;
}
485 
486 
487 #ifdef ALSA
488 
 /**
  * ReadAlsaGPU1st reads the first audio frame from an ALSA sound device when
  * an NVIDIA GPU is used.
  *
  * Mirrors ReadWavGPU1st: the first capture fills only the tail of the frame
  * (TTminusTM samples at offset TAMMUESTRA of both buffers). When DUMP is
  * defined the captured samples are also written to fpdump for verification.
  *
  * @param GPUframe device-side frame buffer.
  * @param CPUframe host-side (pinned) frame buffer.
  * @param DeviceID open ALSA capture handle.
  * @param fpdump   dump file (used only when DUMP is defined).
  * @return OK on success, ErrReadDevice on short capture, ErrWriteFile on
  *         short dump write; CUDAERR propagates CUDA errors.
  */
 int ReadAlsaGPU1st(short *GPUframe, short *CPUframe, snd_pcm_t *DeviceID, FILE *fpdump)
 {
   if (snd_pcm_readi(DeviceID, &CPUframe[TAMMUESTRA], TTminusTM) != TTminusTM) return ErrReadDevice;

   CUDAERR(cudaMemcpy(&GPUframe[TAMMUESTRA], &CPUframe[TAMMUESTRA], sizeof(short)*TTminusTM, cudaMemcpyHostToDevice));

   #ifdef DUMP
      if (fwrite(&CPUframe[TAMMUESTRA], sizeof(short), TTminusTM, fpdump) != TTminusTM) return ErrWriteFile;
   #endif

   return OK;
 }
510 
521  int ReadAlsaGPU(short *GPUframe, short *CPUframe, snd_pcm_t *DeviceID, FILE *fpdump)
522  {
523  kernel_Shift<<<1, TAMMUESTRA>>>(GPUframe, TAMTRAMA, TAMMUESTRA);
524 
525  if (snd_pcm_readi(DeviceID, CPUframe, TAMMUESTRA) != TAMMUESTRA) return ErrReadDevice;
526 
527  // ¿¿ cudaDeviceSynchronize(); ??
528 
529  CUDAERR(cudaMemcpy(&GPUframe[TTminusTM], CPUframe, sizeof(short)*TAMMUESTRA, cudaMemcpyHostToDevice));
530 
531  #ifdef DUMP
532  if (fwrite(&CPUframe[TTminusTM], sizeof(short), TAMMUESTRA, fpdump) != TAMMUESTRA) return ErrWriteFile;
533  #endif
534 
535  return OK;
536  }
537 #endif
char * fileStates_Time_i
Definition: defines.h:237
Struct for store the name of input/verificaton files. Each composition needs a file with values for ...
Definition: defines.h:228
int ReadWavGPU1st(short *GPUframe, short *CPUframe, FILE *fp)
ReadWavGPU1st reads first audio (frame) from WAV file when NVIDIA GPU is used.
int ReadVectorInt64(int *vector, const int size, const char *filename)
ReadVectorInt64 fills a int vector with the int64 info stores in a file.
int FirstImin(MyType *odata, int *opos, MyType *idata, const int maxGrid, const int size)
This function launches CUDA kernels to find the first minimum and its position.
int OneImin(MyType *odata, int *opos, MyType *idata, const int maxGrid, const int size)
This function launches CUDA kernels to find one minimum and its position.
int ReadVector(MyType *vector, const int size, const char *filename)
ReadVector fills a MyType vector with the MyType info stores in a file.
int FFTGPU(MyType *X_fft, MyType *Out_fft, MyFFTGPUType *plan)
FFTGPU computes FFT.
Header file for using ReMAS with Nvidia GPUs.
int AllocS_fkGPU(MyType **s_fk, MyType **tauxi, MyType **ts_fk, const MyType BETA, const int nmidi, const int nbases, DTWfiles NameFiles)
AllocS_fkGPU Allocates memory for S_fk vector, read its data from file and initializes other auxiliar...
int AllocFFTGPU(MyFFTGPUType *plan, MyType **X_fft, MyType **Out_fft, MyType **Mod_fft, int *kmin_fft, int *kmax_fft, const int nfft, DTWfiles NameFiles)
AllocFFTGPU Allocates "Unified" GPU memory for FFT vector and reads some fft information from files...
char * file_kmax
Definition: defines.h:234
int AllocAuxiGPU(MyType **norms, short **GPUframe, short **CPUframe, MyType **v_cfreq, MyType **v_dxState, const int nbases, const int tamframe, const int nmidi)
AllocAuxiGPU memory reservation for norms, frame, v_cfreq and v_dxState vectors.
char * fileStates_corr
Definition: defines.h:239
int ReadWavGPU(short *GPUframe, short *CPUframe, FILE *fp)
ReadWavGPU reads the current audio (frame) from a WAV file when an NVIDIA GPU is used.
int AllocDataGPU(MyType **v_hanning, int **states_time_i, int **states_time_e, int **states_seq, int **states_corr, int **I_SxD, int *DTWSize, const int tamtrama, const int nstates, DTWfiles NameFiles)
AllocDataGPU Allocates memory and initializes some structures reading info from files.
int ReadS_fk(MyType *s_fk, const int BASES, const char *filename)
ReadS_fk fills the vector s_fk with the info stores in a file.
int HaveCompatibleGPU(int &maxGrid)
HaveCompatibleGPU checks if the system has an appropriate GPU for ReMAS.
Definition: GPUFunctions.cu:75
void InitSxD(MyType *odata, MyType *v_SxD, const MyType *__restrict__ v_dxState, const int *__restrict__ I_SxD, const int maxGrid, const int size)
InitSxD launches the cuda kernel that sets up the vector SxD when "Unified" GPU memory is used...
int LastImin(MyType *odata, int *opos, MyType *idata, const int maxGrid, const int size)
This function launches CUDA kernels to find the last minimum and its position.
int AllocDTWGPU(MyType **pV, MyType **v_SxD, MyType **sdata, const int maxGrid, const int DTWSize, const int DTWSizePlusPad)
AllocDTWGPU Allocates memory for DTW vectors and auxiliar structures.
char * fileStates_Time_e
Definition: defines.h:236
bool IsPow2(unsigned int x)
IsPow2 decides if a number is power of 2.
Definition: GPUFunctions.cu:66
char * fileStates_seq
Definition: defines.h:238
char * file_kmin
Definition: defines.h:235
char * file_partitura
Definition: defines.h:233
void BlocksAndThreads(int *blocks, int *threads, int *sharedsize, const int maxGrid, const int size)
BlocksAndThreads calculates the suitable number of blocks and threads, and the needed shared memory...
unsigned int NextPow2(unsigned int x)
NextPow2 returns the next power of 2 of a given number.
Definition: GPUFunctions.cu:47
char * file_hanning
Definition: defines.h:231