#include "../common/FileFunctions.h"

/**
 * IsPow2 decides whether a number is an exact power of two.
 *
 * @param x  value to test.
 * @return true iff x is a power of two. Note: the classic idiom
 *         ((x & (x-1)) == 0) wrongly reports true for x == 0; the extra
 *         x != 0 guard fixes that, since 0 is not a power of two.
 */
inline bool IsPow2(unsigned int x) {
    return (x != 0u) && ((x & (x - 1u)) == 0u);
}
// Fragment of HaveCompatibleGPU(int &maxGrid): checks that the system has a
// GPU usable by ReMAS. NOTE(review): the function header, closing braces and
// return statements are elided in this extract; the stray leading numbers are
// the original file's line numbers left over from extraction.
77 int deviceCount, driverVersion;
79 cudaDeviceProp deviceProp;
// Count the CUDA devices (CUDAERR presumably wraps cudaError_t checking).
81 CUDAERR(cudaGetDeviceCount(&deviceCount));
// Only device 0 is queried; multi-GPU selection is not done here.
83 CUDAERR(cudaGetDeviceProperties(&deviceProp, 0));
// Require compute capability >= 3 (Kepler or newer).
84 if (deviceProp.major < 3) {
85 printf(
"Sorry, we need CUDA Capability >=3\n");
// Report the device's maximum 1-D grid size back to the caller.
88 maxGrid=deviceProp.maxGridSize[0];
// Require CUDA driver version >= 6 (driverVersion is e.g. 6050 for 6.5).
90 CUDAERR(cudaDriverGetVersion(&driverVersion));
91 if ((driverVersion/1000) < 6) {
92 printf(
"Sorry, we need CUDA Version >=6\n");
// Unified addressing is mandatory: the library allocates with
// cudaMallocManaged throughout.
96 if (!deviceProp.unifiedAddressing) {
97 printf(
"Your system does not support Unified Memory\n");
// AllocS_fkGPU: allocates unified (managed) memory for the S_fk vector and,
// when BETA is neither 0 nor 1, for the auxiliary tauxi / ts_fk buffers.
// NOTE(review): the file-reading calls using NameFiles, braces and the return
// are elided in this extract.
117 int AllocS_fkGPU(MyType **s_fk, MyType **tauxi, MyType **ts_fk,
const MyType BETA,
const int nmidi,
118 const int nbases,
DTWfiles NameFiles)
// nmidi*nbases elements of MyType, accessible from host and device.
120 CUDAERR(cudaMallocManaged((
void **)s_fk,
sizeof(MyType)*nmidi*nbases, cudaMemAttachGlobal));
// Means "BETA != 0 && BETA != 1" without a direct floating-point ==
// comparison (the >= && <= pairs bracket the exact value).
123 if (!(BETA>=(MyType)0.0 && BETA<=(MyType)0.0) && !(BETA>=(MyType)1.0 && BETA<=(MyType)1.0))
125 CUDAERR(cudaMallocManaged((
void **)tauxi,
sizeof(MyType)*nmidi, cudaMemAttachGlobal));
126 CUDAERR(cudaMallocManaged((
void **)ts_fk,
sizeof(MyType)*nmidi*nbases, cudaMemAttachGlobal));
// AllocDataGPU: host-allocates the per-state bookkeeping arrays, derives
// DTWSize from the last state's end time, builds the I_SxD index map in
// managed memory and allocates the Hanning window. NOTE(review): the reads
// from NameFiles that fill the state arrays, loop braces, pos/i/j
// declarations and returns are elided in this extract.
148 int AllocDataGPU(MyType **v_hanning,
int **states_time_i,
int **states_time_e,
int **states_seq,
int **states_corr,
149 int **I_SxD,
int *DTWSize,
const int tamtrama,
const int nstates,
DTWfiles NameFiles)
// CHECKNULL presumably bails out when a host allocation fails.
153 CHECKNULL((*states_time_i)=(
int *)calloc(nstates,
sizeof(
int)));
154 CHECKNULL((*states_time_e)=(
int *)calloc(nstates,
sizeof(
int)));
155 CHECKNULL((*states_seq) =(
int *)calloc(nstates,
sizeof(
int)));
156 CHECKNULL((*states_corr) =(
int *)calloc(nstates,
sizeof(
int)));
// DTW problem size = inclusive end time of the last state + 1.
163 (*DTWSize)=(*states_time_e)[nstates - 1] + 1;
165 CUDAERR(cudaMallocManaged((
void **)I_SxD,
sizeof(
int)*(*DTWSize), cudaMemAttachGlobal));
// Flatten the per-state [time_i, time_e] intervals into I_SxD so each DTW
// column maps to its state's sequence value; pos is presumably incremented
// in elided code — TODO confirm against the original file.
168 for (i=0; i<nstates; i++)
170 for (j=(*states_time_i)[i]; j<=(*states_time_e)[i]; j++)
172 (*I_SxD)[pos]=(*states_seq)[i];
// Hanning window of tamtrama samples, in managed memory.
177 CUDAERR(cudaMallocManaged((
void **)v_hanning,
sizeof(MyType)*tamtrama, cudaMemAttachGlobal));
// AllocFFTGPU: managed allocations for the FFT input (X_fft), output
// (Out_fft) and modulus (Mod_fft) vectors, plus cuFFT plan creation.
// NOTE(review): the two Out_fft/plan variants (cufftComplex+CUFFT_R2C vs
// cufftDoubleComplex+CUFFT_D2Z) were almost certainly guarded by an elided
// #ifdef selecting MyType's precision; kmin_fft/kmax_fft reads, braces and
// returns are also elided.
197 int AllocFFTGPU(MyFFTGPUType *plan, MyType **X_fft, MyType **Out_fft, MyType **Mod_fft,
int *kmin_fft,
198 int *kmax_fft,
const int nfft,
DTWfiles NameFiles)
// NOTE(review): "sizeof(MyType)*2*nfft+1" adds 1 BYTE, not one element —
// looks like it was meant to be sizeof(MyType)*(2*nfft+1); confirm against
// the original file before changing.
200 CUDAERR(cudaMallocManaged((
void **)X_fft,
sizeof(MyType)*2*nfft+1, cudaMemAttachGlobal));
201 CUDAERR(cudaMallocManaged((
void **)Mod_fft,
sizeof(MyType)*nfft, cudaMemAttachGlobal));
// Single-precision path: real-to-complex 1-D plan of length nfft, batch 1.
205 CUDAERR(cudaMallocManaged((
void **)Out_fft,
sizeof(cufftComplex)*nfft, cudaMemAttachGlobal));
206 CUFFTERR(cufftPlan1d(plan, nfft, CUFFT_R2C, 1));
// Double-precision path: double-to-double-complex plan.
208 CUDAERR(cudaMallocManaged((
void **)Out_fft,
sizeof(cufftDoubleComplex)*nfft, cudaMemAttachGlobal));
209 CUFFTERR(cufftPlan1d(plan, nfft, CUFFT_D2Z, 1));
// NOTE(review): this tests the plan POINTER, not the handle it points to;
// cufftPlan1d failures are already caught by CUFFTERR — verify intent.
212 if (plan==NULL)
return ErrFFTSched;
// AllocDTWGPU: managed allocations for the DTW cost vector pV, the
// similarity vector v_SxD and the per-block reduction scratch buffer sdata.
// NOTE(review): the elided lines presumably call BlocksAndThreads(...) to set
// numBlocks before sdata is sized — as shown, numBlocks would be used
// uninitialized; confirm against the original file.
232 int AllocDTWGPU(MyType **pV, MyType **v_SxD, MyType **sdata,
const int maxGrid,
const int DTWSize,
const int DTWSizePlusPad)
234 int numThreads, numBlocks, sharedSize;
// DTW matrix storage, padded (DTWSizePlusPad elements).
238 CUDAERR(cudaMallocManaged((
void **)pV,
sizeof(MyType)*DTWSizePlusPad, cudaMemAttachGlobal));
239 CUDAERR(cudaMallocManaged((
void **)v_SxD,
sizeof(MyType)*DTWSize, cudaMemAttachGlobal));
// One partial result per reduction block.
240 CUDAERR(cudaMallocManaged((
void **)sdata,
sizeof(MyType)*numBlocks, cudaMemAttachGlobal));
// AllocAuxiGPU: managed memory for norms / v_dxState / v_cfreq, plain device
// memory for the GPU-side audio frame, and pinned write-combined host memory
// for the CPU-side frame (fast host->device transfers; slow to read back on
// the host). NOTE(review): braces and the return are elided in this extract.
259 int AllocAuxiGPU(MyType **norms,
short **GPUframe,
short **CPUframe, MyType **v_cfreq, MyType **v_dxState,
const int nbases,
260 const int tamframe,
const int nmidi)
262 CUDAERR(cudaMallocManaged((
void **)norms,
sizeof(MyType)*nbases, cudaMemAttachGlobal));
263 CUDAERR(cudaMallocManaged((
void **)v_dxState,
sizeof(MyType)*nbases, cudaMemAttachGlobal));
264 CUDAERR(cudaMallocManaged((
void **)v_cfreq,
sizeof(MyType)*nmidi, cudaMemAttachGlobal));
// Device-only frame buffer: filled via cudaMemcpy from CPUframe.
266 CUDAERR(cudaMalloc ((
void **)GPUframe,
sizeof(
short)*tamframe));
// Write-combined pinned buffer: the host only writes it (fread/ALSA reads
// land here) and streams it to the device.
267 CUDAERR(cudaHostAlloc((
void **)CPUframe,
sizeof(
short)*tamframe, cudaHostAllocWriteCombined));
// BlocksAndThreads: computes the launch configuration for the reduction
// kernels using the classic CUDA-SDK scheme where each thread processes two
// elements. NOTE(review): the lines handling (*blocks) > maxGrid (original
// 289-293) are elided in this extract.
283 void BlocksAndThreads(
int *blocks,
int *threads,
int *sharedsize,
const int maxGrid,
const int size)
// threads = next power of two of ceil(size/2), capped at maxThreads.
285 (*threads) = (size < maxThreads*2) ?
NextPow2((size + 1)/ 2) : maxThreads;
// blocks = ceil(size / (threads*2)) since each thread handles two elements.
286 (*blocks) = (size + ((*threads) * 2 - 1)) / ((*threads) * 2);
288 if ((*blocks) > maxGrid)
294 (*blocks) = min(maxBlocks, (*blocks));
// Small blocks (<= one warp) get double the shared memory — presumably so
// the unrolled warp reduction can read past (*threads) elements safely;
// TODO confirm against the reduction kernels.
295 (*sharedsize) = ((*threads) <= sizeWarp) ? 2*(*threads)*
sizeof(MyType) : (*threads)*
sizeof(MyType);
// FFTGPU: executes the 1-D real-to-complex FFT with the prebuilt cuFFT plan.
// NOTE(review): an elided #ifdef selects between the float (ExecR2C) and
// double (ExecD2Z) calls according to MyType's precision — both appear
// back-to-back here only because the guard lines are missing.
307 int FFTGPU(MyType *X_fft, MyType *Out_fft, MyFFTGPUType *plan)
310 CUFFTERR(cufftExecR2C(*plan, (cufftReal *)X_fft, (cufftComplex *)Out_fft));
312 CUFFTERR(cufftExecD2Z(*plan, (cufftDoubleReal *)X_fft, (cufftDoubleComplex *)Out_fft));
// InitSxD: launches kernel_InitSxD to build v_SxD (distances gathered through
// the I_SxD index map) while reducing partial sums into odata, then repeats
// kernel_Sum until one value remains and normalizes with kernel_Vnorm.
// NOTE(review): the BlocksAndThreads call, the loop structure around
// kernel_Sum and the initialization of s are elided in this extract.
330 void InitSxD(MyType *odata, MyType *v_SxD,
const MyType* __restrict__ v_dxState,
const int* __restrict__ I_SxD,
331 const int maxGrid,
const int size)
333 int numBlocks=0, numThreads=0, sharedSize=0, s;
// IsPow2(size) lets the kernel take the faster power-of-two reduction path.
337 kernel_InitSxD<<<numBlocks, numThreads, sharedSize>>>(odata, v_SxD, v_dxState, I_SxD, numThreads,
IsPow2(size), size);
// Each pass folds s partials into ceil(s / (numThreads*2)) partials.
344 kernel_Sum<<<numBlocks, numThreads, sharedSize>>>(odata, odata, numThreads,
IsPow2(s), s);
345 s = (s + (numThreads*2-1)) / (numThreads*2);
// Final single-thread normalization of the reduced value.
348 kernel_Vnorm<<<1, 1>>>(odata);
// OneImin: launches the reduction kernels that find one minimum of idata and
// its position. Shared memory is doubled (2*sharedSize) because these kernels
// track value+position pairs. NOTE(review): BlocksAndThreads call, loop
// structure, s initialization, braces and return are elided in this extract.
362 int OneImin(MyType *odata,
int *opos, MyType *idata,
const int maxGrid,
const int size)
364 int numBlocks=0, numThreads=0, sharedSize=0, s;
368 kernel_OneImin<<<numBlocks, numThreads, 2*sharedSize>>>(odata, opos, idata, numThreads,
IsPow2(size), size);
// Successive passes fold the per-block (value, position) partials.
375 kernel_OneIminLast<<<numBlocks, numThreads, 2*sharedSize>>>(odata, opos, odata, opos, numThreads,
IsPow2(s), s);
376 s = (s + (numThreads*2-1)) / (numThreads*2);
// Block until kernels finish so the host can safely read the managed result.
378 cudaDeviceSynchronize();
// FirstImin: like OneImin, but finds the FIRST (lowest-index) minimum of
// idata and its position. 2*sharedSize because the kernels keep value+position
// pairs in shared memory. NOTE(review): BlocksAndThreads call, loop structure,
// s initialization, braces and return are elided in this extract.
394 int FirstImin(MyType *odata,
int *opos, MyType *idata,
const int maxGrid,
const int size)
396 int numBlocks=0, numThreads=0, sharedSize=0, s;
400 kernel_FirstImin<<<numBlocks, numThreads, 2*sharedSize>>>(odata, opos, idata, numThreads,
IsPow2(size), size);
// Fold per-block partials until a single (value, position) remains.
407 kernel_FirstIminLast<<<numBlocks, numThreads, 2*sharedSize>>>(odata, opos, odata, opos, numThreads,
IsPow2(s), s);
408 s = (s + (numThreads*2-1)) / (numThreads*2);
// Block until kernels finish so the host can safely read the managed result.
410 cudaDeviceSynchronize();
// LastImin: like OneImin, but finds the LAST (highest-index) minimum of idata
// and its position. 2*sharedSize because the kernels keep value+position
// pairs in shared memory. NOTE(review): BlocksAndThreads call, loop structure,
// s initialization, braces and return are elided in this extract.
426 int LastImin(MyType *odata,
int *opos, MyType *idata,
const int maxGrid,
const int size)
428 int numBlocks=0, numThreads=0, sharedSize=0, s;
432 kernel_LastImin<<<numBlocks, numThreads, 2*sharedSize>>>(odata, opos, idata, numThreads,
IsPow2(size), size);
// Fold per-block partials until a single (value, position) remains.
439 kernel_LastIminLast<<<numBlocks, numThreads, 2*sharedSize>>>(odata, opos, odata, opos, numThreads,
IsPow2(s), s);
440 s = (s + (numThreads*2-1)) / (numThreads*2);
// Block until kernels finish so the host can safely read the managed result.
442 cudaDeviceSynchronize();
// Body fragment of ReadWavGPU1st(short *GPUframe, short *CPUframe, FILE *fp)
// (header elided): first read of TTminusTM samples from the WAV file into the
// pinned CPU buffer at offset TAMMUESTRA, then a synchronous copy of the same
// region to the device frame.
458 if (fread(&CPUframe[TAMMUESTRA],
sizeof(
short), TTminusTM, fp) != TTminusTM)
return ErrReadFile;
460 CUDAERR(cudaMemcpy(&GPUframe[TAMMUESTRA], &CPUframe[TAMMUESTRA],
sizeof(
short)*TTminusTM, cudaMemcpyHostToDevice));
// Body fragment of ReadWavGPU(short *GPUframe, short *CPUframe, FILE *fp)
// (header elided): slides the device frame left by TAMMUESTRA samples
// (kernel_Shift, one block of TAMMUESTRA threads — presumably TAMMUESTRA
// <= max threads per block), reads the next TAMMUESTRA samples from the WAV
// file and appends them at the tail of the device frame.
475 kernel_Shift<<<1, TAMMUESTRA>>>(GPUframe, TAMTRAMA, TAMMUESTRA);
477 if (fread(CPUframe,
sizeof(
short), TAMMUESTRA, fp) != TAMMUESTRA)
return ErrReadFile;
481 CUDAERR(cudaMemcpy(&GPUframe[TTminusTM], CPUframe,
sizeof(
short)*TAMMUESTRA, cudaMemcpyHostToDevice));
// ReadAlsaGPU1st: first capture of audio from the ALSA device — reads
// TTminusTM frames into the pinned CPU buffer at offset TAMMUESTRA, copies
// them to the device frame and dumps the same region to fpdump.
// NOTE(review): braces, the final return and any fpdump==NULL guard are
// elided in this extract.
498 int ReadAlsaGPU1st(
short *GPUframe,
short *CPUframe, snd_pcm_t *DeviceID, FILE *fpdump)
// snd_pcm_readi returns the number of frames actually read.
500 if (snd_pcm_readi(DeviceID, &CPUframe[TAMMUESTRA], TTminusTM) != TTminusTM)
return ErrReadDevice;
502 CUDAERR(cudaMemcpy(&GPUframe[TAMMUESTRA], &CPUframe[TAMMUESTRA],
sizeof(
short)*TTminusTM, cudaMemcpyHostToDevice));
// Mirror the captured audio into the dump file for later verification.
505 if (fwrite(&CPUframe[TAMMUESTRA],
sizeof(
short), TTminusTM, fpdump) != TTminusTM)
return ErrWriteFile;
// ReadAlsaGPU: steady-state capture — slides the device frame left by
// TAMMUESTRA samples, reads the next TAMMUESTRA frames from ALSA into the
// start of the pinned buffer and appends them at the device frame's tail.
// NOTE(review): braces and the final return are elided in this extract.
521 int ReadAlsaGPU(
short *GPUframe,
short *CPUframe, snd_pcm_t *DeviceID, FILE *fpdump)
523 kernel_Shift<<<1, TAMMUESTRA>>>(GPUframe, TAMTRAMA, TAMMUESTRA);
// New samples land at CPUframe[0..TAMMUESTRA).
525 if (snd_pcm_readi(DeviceID, CPUframe, TAMMUESTRA) != TAMMUESTRA)
return ErrReadDevice;
529 CUDAERR(cudaMemcpy(&GPUframe[TTminusTM], CPUframe,
sizeof(
short)*TAMMUESTRA, cudaMemcpyHostToDevice));
// NOTE(review): the dump writes from &CPUframe[TTminusTM], but this call's
// snd_pcm_readi filled CPUframe starting at offset 0 — offsets look
// inconsistent; confirm against the original file.
532 if (fwrite(&CPUframe[TTminusTM],
sizeof(
short), TAMMUESTRA, fpdump) != TAMMUESTRA)
return ErrWriteFile;
Struct for storing the names of the input/verification files. Each composition needs a file with values for ...
int ReadWavGPU1st(short *GPUframe, short *CPUframe, FILE *fp)
ReadWavGPU1st reads first audio (frame) from WAV file when NVIDIA GPU is used.
int ReadVectorInt64(int *vector, const int size, const char *filename)
ReadVectorInt64 fills an int vector with the int64 info stored in a file.
int FirstImin(MyType *odata, int *opos, MyType *idata, const int maxGrid, const int size)
This function launches CUDA kernels to find the first minimum and its position.
int OneImin(MyType *odata, int *opos, MyType *idata, const int maxGrid, const int size)
This function launches CUDA kernels to find one minimum and its position.
int ReadVector(MyType *vector, const int size, const char *filename)
ReadVector fills a MyType vector with the MyType info stored in a file.
int FFTGPU(MyType *X_fft, MyType *Out_fft, MyFFTGPUType *plan)
FFTGPU computes FFT.
Header file for using ReMAS with Nvidia GPUs.
int AllocS_fkGPU(MyType **s_fk, MyType **tauxi, MyType **ts_fk, const MyType BETA, const int nmidi, const int nbases, DTWfiles NameFiles)
AllocS_fkGPU allocates memory for the S_fk vector, reads its data from file and initializes other auxiliary...
int AllocFFTGPU(MyFFTGPUType *plan, MyType **X_fft, MyType **Out_fft, MyType **Mod_fft, int *kmin_fft, int *kmax_fft, const int nfft, DTWfiles NameFiles)
AllocFFTGPU Allocates "Unified" GPU memory for FFT vector and reads some fft information from files...
int AllocAuxiGPU(MyType **norms, short **GPUframe, short **CPUframe, MyType **v_cfreq, MyType **v_dxState, const int nbases, const int tamframe, const int nmidi)
AllocAuxiGPU memory reservation for norms, frame, v_cfreq and v_dxState vectors.
int ReadWavGPU(short *GPUframe, short *CPUframe, FILE *fp)
ReadWavGPU reads the current audio (frame) from a WAV file when an NVIDIA GPU is used.
int AllocDataGPU(MyType **v_hanning, int **states_time_i, int **states_time_e, int **states_seq, int **states_corr, int **I_SxD, int *DTWSize, const int tamtrama, const int nstates, DTWfiles NameFiles)
AllocDataGPU Allocates memory and initializes some structures reading info from files.
int ReadS_fk(MyType *s_fk, const int BASES, const char *filename)
ReadS_fk fills the vector s_fk with the info stored in a file.
int HaveCompatibleGPU(int &maxGrid)
HaveCompatibleGPU checks if the system has an appropriate GPU for ReMAS.
void InitSxD(MyType *odata, MyType *v_SxD, const MyType *__restrict__ v_dxState, const int *__restrict__ I_SxD, const int maxGrid, const int size)
InitSxD launches the cuda kernel that sets up the vector SxD when "Unified" GPU memory is used...
int LastImin(MyType *odata, int *opos, MyType *idata, const int maxGrid, const int size)
This function launches CUDA kernels to find the last minimum and its position.
int AllocDTWGPU(MyType **pV, MyType **v_SxD, MyType **sdata, const int maxGrid, const int DTWSize, const int DTWSizePlusPad)
AllocDTWGPU allocates memory for the DTW vectors and auxiliary structures.
bool IsPow2(unsigned int x)
IsPow2 decides if a number is power of 2.
void BlocksAndThreads(int *blocks, int *threads, int *sharedsize, const int maxGrid, const int size)
BlocksAndThreads calculates the suitable number of blocks and threads, and the needed shared memory...
unsigned int NextPow2(unsigned int x)
NextPow2 returns the next power of 2 of a given number.