ReMAS  1.5
Real-time Musical Accompaniment System
GPUFunctions.cu
Go to the documentation of this file.
1 /**************************************************************************
2  * Copyright (C) 2017 by "Information Retrieval and Parallel Computing" *
3  * group (University of Oviedo, Spain), "Interdisciplinary Computation *
4  * and Communication" group (Polytechnic University of Valencia, Spain) *
5  * and "Signal Processing and Telecommunication Systems Research" group *
6  * (University of Jaen, Spain) *
7  * Contact: remaspack@gmail.com *
8  * *
9  * This program is free software; you can redistribute it and/or modify *
10  * it under the terms of the GNU General Public License as published by *
11  * the Free Software Foundation; either version 2 of the License, or *
12  * (at your option) any later version. *
13  * *
14  * This program is distributed in the hope that it will be useful, *
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
17  * GNU General Public License for more details. *
18  * *
19  * You should have received a copy of the GNU General Public License *
20  * along with this program; if not, write to the *
21  * Free Software Foundation, Inc., *
22  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
23  **************************************************************************
24 */
34 #include "GPUFunctions.h"
35 
36 extern "C" {
37 #include "../common/FileFunctions.h"
38 }
39 
40 
/**
 * NextPow2 returns the smallest power of two that is >= x.
 *
 * Smearing the high bit into every lower position of (x-1) and adding one
 * yields the next power of two; an exact power of two maps to itself.
 *
 * @param x input value (host-side helper).
 * @return the next power of two >= x.
 */
unsigned int NextPow2(unsigned int x)
{
   unsigned int v = x - 1U;

   /* Propagate the highest set bit into all lower bit positions. */
   for (unsigned int shift = 1U; shift <= 16U; shift <<= 1)
      v |= v >> shift;

   return v + 1U;
}
58 
59 
66 inline bool IsPow2(unsigned int x) { return ((x & (x-1)) == 0); }
67 
68 
/**
 * HaveCompatibleGPU checks whether the system has a GPU suitable for ReMAS.
 *
 * Only device 0 is inspected. Requirements enforced here: CUDA compute
 * capability >= 3, CUDA driver version >= 6.0, and unified-addressing
 * support (needed because the allocators below use cudaMallocManaged).
 *
 * @param maxGrid [out] maximum 1-D grid size of device 0 (maxGridSize[0]).
 * @return OK on success, ErrGpuWrong if a requirement is not met; CUDAERR
 *         propagates CUDA API errors.
 */
int HaveCompatibleGPU(int &maxGrid)
{
   int deviceCount, driverVersion;

   cudaDeviceProp deviceProp;

   /* Fails (via CUDAERR) when no CUDA device is present. */
   CUDAERR(cudaGetDeviceCount(&deviceCount));

   CUDAERR(cudaGetDeviceProperties(&deviceProp, 0));
   if (deviceProp.major < 3) {
      printf("Sorry, we need CUDA Capability >=3\n");
      return ErrGpuWrong;
   }
   maxGrid=deviceProp.maxGridSize[0];

   /* cudaDriverGetVersion reports e.g. 6050 for 6.5, hence the /1000. */
   CUDAERR(cudaDriverGetVersion(&driverVersion));
   if ((driverVersion/1000) < 6) {
      printf("Sorry, we need CUDA Version >=6\n");
      return ErrGpuWrong;
   }

   if (!deviceProp.unifiedAddressing) {
      printf("Your system does not support Unified Memory\n");
      return ErrGpuWrong;
   }

   return OK;
}
103 
104 
/**
 * AllocS_fkGPU allocates unified memory for the S_fk spectral-pattern matrix,
 * reads its data from file, and allocates the auxiliary buffers needed by the
 * general beta divergence.
 *
 * The BETA test below is a float-equality check written as a pair of range
 * comparisons (avoids -Wfloat-equal warnings): the auxiliary vectors tauxi
 * and ts_fk are only needed when BETA is neither exactly 0 nor exactly 1,
 * i.e. for the general beta-divergence case.
 *
 * @param s_fk   [out] nmidi*nbases pattern matrix (unified memory).
 * @param tauxi  [out] nmidi auxiliary vector (only allocated for general BETA).
 * @param ts_fk  [out] nmidi*nbases auxiliary matrix (only for general BETA).
 * @param BETA   beta-divergence parameter.
 * @param nmidi  number of MIDI rows.
 * @param nbases number of score bases (columns).
 * @param NameFiles input file names (file_partitura holds the S_fk data).
 * @return OK on success; CUDAERR/CHECKERR propagate errors.
 */
int AllocS_fkGPU(MyType **s_fk, MyType **tauxi, MyType **ts_fk, const MyType BETA, const int nmidi,
                 const int nbases, DTWfiles NameFiles)
{
   CUDAERR(cudaMallocManaged((void **)s_fk, sizeof(MyType)*nmidi*nbases, cudaMemAttachGlobal));
   CHECKERR(ReadS_fk((*s_fk), nbases, NameFiles.file_partitura));

   /* True when BETA != 0.0 and BETA != 1.0 (general beta divergence). */
   if (!(BETA>=(MyType)0.0 && BETA<=(MyType)0.0) && !(BETA>=(MyType)1.0 && BETA<=(MyType)1.0))
   {
      CUDAERR(cudaMallocManaged((void **)tauxi, sizeof(MyType)*nmidi, cudaMemAttachGlobal));
      CUDAERR(cudaMallocManaged((void **)ts_fk, sizeof(MyType)*nmidi*nbases, cudaMemAttachGlobal));
   }

   return OK;
}
131 
132 
/**
 * AllocDataGPU allocates host and unified memory for the score/state
 * structures, reads them from file, and expands the per-state sequence into
 * the I_SxD lookup vector used by the DTW stage.
 *
 * @param v_hanning     [out] Hanning window, tamtrama elements (unified mem).
 * @param states_time_i [out] per-state start times (host, calloc).
 * @param states_time_e [out] per-state end times (host, calloc).
 * @param states_seq    [out] per-state sequence indices (host, calloc).
 * @param states_corr   [out] per-state correction values (host, calloc).
 * @param I_SxD         [out] state index for every DTW time step (unified mem).
 * @param DTWSize       [out] DTW length = last end time + 1.
 * @param tamtrama      frame size in samples (Hanning window length).
 * @param nstates       number of score states.
 * @param NameFiles     input file names.
 * @return OK on success; CHECKNULL/CHECKERR/CUDAERR propagate errors.
 */
int AllocDataGPU(MyType **v_hanning, int **states_time_i, int **states_time_e, int **states_seq, int **states_corr,
                 int **I_SxD, int *DTWSize, const int tamtrama, const int nstates, DTWfiles NameFiles)
{
   int i, j, pos;

   CHECKNULL((*states_time_i)=(int *)calloc(nstates, sizeof(int)));
   CHECKNULL((*states_time_e)=(int *)calloc(nstates, sizeof(int)));
   CHECKNULL((*states_seq)   =(int *)calloc(nstates, sizeof(int)));
   CHECKNULL((*states_corr)  =(int *)calloc(nstates, sizeof(int)));

   /* Files store 64-bit integers; ReadVectorInt64 narrows them into int. */
   CHECKERR(ReadVectorInt64((*states_seq), nstates, NameFiles.fileStates_seq));
   CHECKERR(ReadVectorInt64((*states_time_i), nstates, NameFiles.fileStates_Time_i));
   CHECKERR(ReadVectorInt64((*states_time_e), nstates, NameFiles.fileStates_Time_e));
   CHECKERR(ReadVectorInt64((*states_corr), nstates, NameFiles.fileStates_corr));

   /* DTW length is fixed by the end time of the last state. */
   (*DTWSize)=(*states_time_e)[nstates - 1] + 1;

   CUDAERR(cudaMallocManaged((void **)I_SxD, sizeof(int)*(*DTWSize), cudaMemAttachGlobal));

   /* Expand states: every time step in [time_i[i], time_e[i]] maps to
      states_seq[i]. NOTE(review): assumes the state intervals are contiguous
      and cover exactly [0, DTWSize) so pos never exceeds DTWSize — verify
      against the score-file generator. */
   pos=0;
   for (i=0; i<nstates; i++)
   {
      for (j=(*states_time_i)[i]; j<=(*states_time_e)[i]; j++)
      {
         (*I_SxD)[pos]=(*states_seq)[i];
         pos++;
      }
   }

   CUDAERR(cudaMallocManaged((void **)v_hanning, sizeof(MyType)*tamtrama, cudaMemAttachGlobal));
   CHECKERR(ReadVector((*v_hanning), tamtrama, NameFiles.file_hanning));

   return OK;
}
182 
183 
/**
 * AllocFFTGPU allocates unified memory for the FFT working vectors, creates
 * the 1-D real-to-complex cuFFT plan, and reads the kmin/kmax FFT-bin tables
 * from file.
 *
 * Precision follows the SIMPLE compile flag: single (R2C) or double (D2Z).
 *
 * @param plan     [out] cuFFT plan handle, created here.
 * @param X_fft    [out] real FFT input, (2*nfft+1) elements (unified memory).
 * @param Out_fft  [out] complex FFT output, nfft complex elements.
 * @param Mod_fft  [out] spectrum modulus vector, nfft elements.
 * @param kmin_fft [out] lower FFT-bin index per MIDI note (N_MIDI entries).
 * @param kmax_fft [out] upper FFT-bin index per MIDI note (N_MIDI entries).
 * @param nfft     FFT size.
 * @param NameFiles input file names.
 * @return OK on success, ErrFFTSched if plan is NULL; CUDAERR/CUFFTERR/
 *         CHECKERR propagate errors.
 */
int AllocFFTGPU(MyFFTGPUType *plan, MyType **X_fft, MyType **Out_fft, MyType **Mod_fft, int *kmin_fft,
                int *kmax_fft, const int nfft, DTWfiles NameFiles)
{
   /* Guard BEFORE cufftPlan1d dereferences the pointer (the old code checked
      after use, when a NULL plan would already have crashed). */
   if (plan==NULL) return ErrFFTSched;

   /* BUGFIX: the original size expression was sizeof(MyType)*2*nfft+1, which
      adds one BYTE, not one element. Parenthesized so that (2*nfft+1)
      elements are allocated, as the name X_fft[2*nfft+1] suggests. */
   CUDAERR(cudaMallocManaged((void **)X_fft, sizeof(MyType)*(2*nfft+1), cudaMemAttachGlobal));
   CUDAERR(cudaMallocManaged((void **)Mod_fft, sizeof(MyType)*nfft, cudaMemAttachGlobal));
   /* NOTE(review): for a real input only nfft/2+1 output bins are distinct;
      Mod_fft could possibly be shrunk to nfft/2+1 — verify against users. */

   #ifdef SIMPLE
      CUDAERR(cudaMallocManaged((void **)Out_fft, sizeof(cufftComplex)*nfft, cudaMemAttachGlobal));
      CUFFTERR(cufftPlan1d(plan, nfft, CUFFT_R2C, 1));
   #else
      CUDAERR(cudaMallocManaged((void **)Out_fft, sizeof(cufftDoubleComplex)*nfft, cudaMemAttachGlobal));
      CUFFTERR(cufftPlan1d(plan, nfft, CUFFT_D2Z, 1));
   #endif

   CHECKERR(ReadVectorInt64(kmax_fft, N_MIDI, NameFiles.file_kmax));
   CHECKERR(ReadVectorInt64(kmin_fft, N_MIDI, NameFiles.file_kmin));

   return OK;
}
219 
220 
/**
 * AllocDTWGPU allocates unified memory for the DTW cost vectors and the
 * reduction scratch buffer.
 *
 * BlocksAndThreads is called only to learn how many blocks the reduction
 * kernels will launch for DTWSize elements, which fixes the size of sdata
 * (one partial result per block).
 *
 * @param pV             [out] DTW cost vector, DTWSizePlusPad elements.
 * @param v_SxD          [out] distance vector, DTWSize elements.
 * @param sdata          [out] per-block reduction scratch, numBlocks elements.
 * @param maxGrid        maximum 1-D grid size of the device.
 * @param DTWSize        number of DTW time steps.
 * @param DTWSizePlusPad DTW size including padding.
 * @return OK on success; CUDAERR propagates CUDA errors.
 */
int AllocDTWGPU(MyType **pV, MyType **v_SxD, MyType **sdata, const int maxGrid, const int DTWSize, const int DTWSizePlusPad)
{
   int numThreads, numBlocks, sharedSize;

   BlocksAndThreads(&numBlocks, &numThreads, &sharedSize, maxGrid, DTWSize);

   CUDAERR(cudaMallocManaged((void **)pV, sizeof(MyType)*DTWSizePlusPad, cudaMemAttachGlobal));
   CUDAERR(cudaMallocManaged((void **)v_SxD, sizeof(MyType)*DTWSize, cudaMemAttachGlobal));
   CUDAERR(cudaMallocManaged((void **)sdata, sizeof(MyType)*numBlocks, cudaMemAttachGlobal));

   return OK;
}
244 
245 
/**
 * AllocAuxiGPU reserves memory for the norms, audio-frame, v_cfreq and
 * v_dxState vectors.
 *
 * The audio frame is the one buffer NOT in unified memory: GPUframe is plain
 * device memory and CPUframe is pinned, write-combined host memory, a layout
 * chosen for fast repeated host-to-device streaming of incoming audio
 * (write-combined memory is fast for the CPU to write and for DMA to read,
 * but slow for the CPU to read back).
 *
 * @param norms     [out] per-base norms, nbases elements (unified memory).
 * @param GPUframe  [out] device-side audio frame, tamframe shorts.
 * @param CPUframe  [out] pinned host-side audio frame, tamframe shorts.
 * @param v_cfreq   [out] nmidi elements (unified memory).
 * @param v_dxState [out] nbases elements (unified memory).
 * @param nbases    number of score bases.
 * @param tamframe  audio frame size in samples.
 * @param nmidi     number of MIDI rows.
 * @return OK on success; CUDAERR propagates CUDA errors.
 */
int AllocAuxiGPU(MyType **norms, short **GPUframe, short **CPUframe, MyType **v_cfreq, MyType **v_dxState, const int nbases,
                 const int tamframe, const int nmidi)
{
   CUDAERR(cudaMallocManaged((void **)norms, sizeof(MyType)*nbases, cudaMemAttachGlobal));
   CUDAERR(cudaMallocManaged((void **)v_dxState, sizeof(MyType)*nbases, cudaMemAttachGlobal));
   CUDAERR(cudaMallocManaged((void **)v_cfreq, sizeof(MyType)*nmidi, cudaMemAttachGlobal));

   CUDAERR(cudaMalloc    ((void **)GPUframe, sizeof(short)*tamframe));
   CUDAERR(cudaHostAlloc((void **)CPUframe, sizeof(short)*tamframe, cudaHostAllocWriteCombined));

   return OK;
}
271 
272 
/**
 * BlocksAndThreads computes a suitable launch configuration (blocks, threads
 * per block) and the dynamic shared-memory size for the reduction kernels,
 * given the element count and the device's maximum 1-D grid size.
 *
 * Each thread reduces two elements, hence the factor of 2 throughout. When
 * a block has no more threads than one warp, the shared buffer is doubled so
 * the warp-level tail of the reduction can read without bounds checks.
 *
 * @param blocks     [out] number of blocks to launch.
 * @param threads    [out] threads per block (a power of two).
 * @param sharedsize [out] dynamic shared memory bytes per block.
 * @param maxGrid    device limit on gridDim.x.
 * @param size       number of elements to reduce.
 */
void BlocksAndThreads(int *blocks, int *threads, int *sharedsize, const int maxGrid, const int size)
{
   int nthreads = (size < maxThreads*2) ? NextPow2((size + 1) / 2) : maxThreads;
   int nblocks  = (size + nthreads*2 - 1) / (nthreads*2);

   /* If the grid would be too large, halve it and double the block size. */
   if (nblocks > maxGrid)
   {
      nblocks  /= 2;
      nthreads *= 2;
   }

   if (nblocks > maxBlocks) nblocks = maxBlocks;

   (*threads)    = nthreads;
   (*blocks)     = nblocks;
   (*sharedsize) = (nthreads <= sizeWarp) ? 2*nthreads*sizeof(MyType) : nthreads*sizeof(MyType);
}
297 
298 
/**
 * FFTGPU executes the (already planned) real-to-complex FFT on the GPU.
 *
 * The precision matches AllocFFTGPU: single (R2C) when SIMPLE is defined,
 * double (D2Z) otherwise. The call is asynchronous with respect to the host.
 *
 * @param X_fft   real input vector (device-accessible).
 * @param Out_fft complex output vector (device-accessible).
 * @param plan    cuFFT plan created by AllocFFTGPU.
 * @return OK on success; CUFFTERR propagates cuFFT errors.
 */
int FFTGPU(MyType *X_fft, MyType *Out_fft, MyFFTGPUType *plan)
{
   #ifdef SIMPLE
      CUFFTERR(cufftExecR2C(*plan, (cufftReal *)X_fft, (cufftComplex *)Out_fft));
   #else
      CUFFTERR(cufftExecD2Z(*plan, (cufftDoubleReal *)X_fft, (cufftDoubleComplex *)Out_fft));
   #endif

   return OK;
}
317 
318 
/**
 * InitSxD launches the CUDA kernels that set up the v_SxD vector and reduce
 * it, leaving the reduction result in odata for the final normalization.
 *
 * Structure: one kernel_InitSxD pass produces one partial result per block in
 * odata; the while-loop then repeatedly reduces those partials in place with
 * kernel_Sum until a single value remains; kernel_Vnorm (a 1x1 launch)
 * finishes the normalization on the device. All launches are asynchronous on
 * the default stream, so they execute in order without host synchronization.
 *
 * @param odata     [out] reduction workspace/result (device-accessible).
 * @param v_SxD     [out] SxD vector filled by kernel_InitSxD.
 * @param v_dxState per-state distances (read-only on device).
 * @param I_SxD     time-step -> state lookup (read-only on device).
 * @param maxGrid   device limit on gridDim.x.
 * @param size      number of elements of v_SxD.
 */
void InitSxD(MyType *odata, MyType *v_SxD, const MyType* __restrict__ v_dxState, const int* __restrict__ I_SxD,
             const int maxGrid, const int size)
{
   int numBlocks=0, numThreads=0, sharedSize=0, s;

   BlocksAndThreads(&numBlocks, &numThreads, &sharedSize, maxGrid, size);

   kernel_InitSxD<<<numBlocks, numThreads, sharedSize>>>(odata, v_SxD, v_dxState, I_SxD, numThreads, IsPow2(size), size);

   /* Reduce the per-block partials until one value remains. */
   s = numBlocks;
   while (s > 1)
   {
      BlocksAndThreads(&numBlocks, &numThreads, &sharedSize, maxGrid, s);

      kernel_Sum<<<numBlocks, numThreads, sharedSize>>>(odata, odata, numThreads, IsPow2(s), s);
      s = (s + (numThreads*2-1)) / (numThreads*2);

   }
   kernel_Vnorm<<<1, 1>>>(odata);
}
350 
351 
/**
 * OneImin launches CUDA kernels to find one minimum of idata and its
 * position, using a multi-pass block reduction.
 *
 * The shared-memory request is doubled (2*sharedSize) because these kernels
 * track value/position pairs instead of single values. After the final pass,
 * cudaDeviceSynchronize makes the device results visible to the host before
 * opos[0] is read — this requires odata/opos to be host-accessible
 * (unified/managed memory, consistent with this file's allocators).
 *
 * @param odata   [out] per-pass minimum values workspace (device-accessible).
 * @param opos    [out] per-pass minimum positions; opos[0] is the answer.
 * @param idata   input vector to minimize.
 * @param maxGrid device limit on gridDim.x.
 * @param size    number of elements of idata.
 * @return the position of the minimum (opos[0]).
 */
int OneImin(MyType *odata, int *opos, MyType *idata, const int maxGrid, const int size)
{
   int numBlocks=0, numThreads=0, sharedSize=0, s;

   BlocksAndThreads(&numBlocks, &numThreads, &sharedSize, maxGrid, size);

   kernel_OneImin<<<numBlocks, numThreads, 2*sharedSize>>>(odata, opos, idata, numThreads, IsPow2(size), size);

   /* Reduce per-block (value, position) partials until one pair remains. */
   s = numBlocks;
   while (s > 1)
   {
      BlocksAndThreads(&numBlocks, &numThreads, &sharedSize, maxGrid, s);

      kernel_OneIminLast<<<numBlocks, numThreads, 2*sharedSize>>>(odata, opos, odata, opos, numThreads, IsPow2(s), s);
      s = (s + (numThreads*2-1)) / (numThreads*2);
   }
   cudaDeviceSynchronize();

   return opos[0];
}
382 
383 
/**
 * FirstImin launches CUDA kernels to find the first minimum (the smallest
 * position among equal minima) of idata and its position.
 *
 * Same multi-pass structure as OneImin: a first kernel produces per-block
 * (value, position) partials, then kernel_FirstIminLast repeatedly reduces
 * them in place until one pair remains. Shared memory is doubled because
 * pairs, not single values, are reduced. cudaDeviceSynchronize makes the
 * result visible to the host before opos[0] is read (opos must be
 * host-accessible, i.e. unified/managed memory).
 *
 * @param odata   [out] per-pass minimum values workspace (device-accessible).
 * @param opos    [out] per-pass minimum positions; opos[0] is the answer.
 * @param idata   input vector to minimize.
 * @param maxGrid device limit on gridDim.x.
 * @param size    number of elements of idata.
 * @return the position of the first minimum (opos[0]).
 */
int FirstImin(MyType *odata, int *opos, MyType *idata, const int maxGrid, const int size)
{
   int numBlocks=0, numThreads=0, sharedSize=0, s;

   BlocksAndThreads(&numBlocks, &numThreads, &sharedSize, maxGrid, size);

   kernel_FirstImin<<<numBlocks, numThreads, 2*sharedSize>>>(odata, opos, idata, numThreads, IsPow2(size), size);

   s = numBlocks;
   while (s > 1)
   {
      BlocksAndThreads(&numBlocks, &numThreads, &sharedSize, maxGrid, s);

      kernel_FirstIminLast<<<numBlocks, numThreads, 2*sharedSize>>>(odata, opos, odata, opos, numThreads, IsPow2(s), s);
      s = (s + (numThreads*2-1)) / (numThreads*2);
   }
   cudaDeviceSynchronize();

   return opos[0];
}
414 
415 
/**
 * LastImin launches CUDA kernels to find the last minimum (the largest
 * position among equal minima) of idata and its position.
 *
 * Same multi-pass (value, position) reduction as OneImin/FirstImin, with the
 * doubled shared-memory request for the pairs. cudaDeviceSynchronize makes
 * the result visible to the host before opos[0] is read (opos must be
 * host-accessible, i.e. unified/managed memory).
 *
 * @param odata   [out] per-pass minimum values workspace (device-accessible).
 * @param opos    [out] per-pass minimum positions; opos[0] is the answer.
 * @param idata   input vector to minimize.
 * @param maxGrid device limit on gridDim.x.
 * @param size    number of elements of idata.
 * @return the position of the last minimum (opos[0]).
 */
int LastImin(MyType *odata, int *opos, MyType *idata, const int maxGrid, const int size)
{
   int numBlocks=0, numThreads=0, sharedSize=0, s;

   BlocksAndThreads(&numBlocks, &numThreads, &sharedSize, maxGrid, size);

   kernel_LastImin<<<numBlocks, numThreads, 2*sharedSize>>>(odata, opos, idata, numThreads, IsPow2(size), size);

   s = numBlocks;
   while (s > 1)
   {
      BlocksAndThreads(&numBlocks, &numThreads, &sharedSize, maxGrid, s);

      kernel_LastIminLast<<<numBlocks, numThreads, 2*sharedSize>>>(odata, opos, odata, opos, numThreads, IsPow2(s), s);
      s = (s + (numThreads*2-1)) / (numThreads*2);
   }
   cudaDeviceSynchronize();

   return opos[0];
}
446 
447 
/**
 * ReadWavGPU1st reads the first audio frame from a WAV file when an NVIDIA
 * GPU is used.
 *
 * The first read fills only the tail of the frame: TTminusTM samples are
 * stored starting at offset TAMMUESTRA of both the host and device buffers.
 * Subsequent frames are handled by ReadWavGPU, which shifts and appends
 * TAMMUESTRA new samples per call.
 *
 * @param GPUframe device-side frame buffer.
 * @param CPUframe host-side (pinned) frame buffer.
 * @param fp       open WAV file positioned at the sample data.
 * @return OK on success, ErrReadFile on short read; CUDAERR propagates CUDA
 *         errors.
 */
int ReadWavGPU1st(short *GPUframe, short *CPUframe, FILE *fp)
{
   if (fread(&CPUframe[TAMMUESTRA], sizeof(short), TTminusTM, fp) != TTminusTM) return ErrReadFile;

   CUDAERR(cudaMemcpy(&GPUframe[TAMMUESTRA], &CPUframe[TAMMUESTRA], sizeof(short)*TTminusTM, cudaMemcpyHostToDevice));

   return OK;
}
464 
/**
 * ReadWavGPU reads the current audio frame from a WAV file when an NVIDIA
 * GPU is used.
 *
 * kernel_Shift discards the oldest TAMMUESTRA samples of the device frame
 * (sliding window of TAMTRAMA samples); the fread overlaps with that kernel,
 * and the new samples are then appended at the tail of the device frame.
 *
 * @param GPUframe device-side frame buffer (TAMTRAMA samples).
 * @param CPUframe host-side (pinned) staging buffer.
 * @param fp       open WAV file positioned at the next samples.
 * @return OK on success, ErrReadFile on short read; CUDAERR propagates CUDA
 *         errors.
 */
int ReadWavGPU(short *GPUframe, short *CPUframe, FILE *fp)
{
   kernel_Shift<<<1, TAMMUESTRA>>>(GPUframe, TAMTRAMA, TAMMUESTRA);

   if (fread(CPUframe, sizeof(short), TAMMUESTRA, fp) != TAMMUESTRA) return ErrReadFile;

   /* No explicit cudaDeviceSynchronize is needed here: the cudaMemcpy below
      is issued on the same (legacy default) stream as kernel_Shift, so it is
      ordered after the kernel completes. */

   CUDAERR(cudaMemcpy(&GPUframe[TTminusTM], CPUframe, sizeof(short)*TAMMUESTRA, cudaMemcpyHostToDevice));

   return OK;
}
485 
486 
487 #ifdef ALSA
488 
 /**
  * ReadAlsaGPU1st reads the first audio frame from an ALSA sound device when
  * an NVIDIA GPU is used.
  *
  * Mirrors ReadWavGPU1st: the first capture fills only the tail of the frame
  * (TTminusTM samples at offset TAMMUESTRA of both buffers). When DUMP is
  * defined the captured samples are also written to fpdump for verification.
  *
  * @param GPUframe device-side frame buffer.
  * @param CPUframe host-side (pinned) frame buffer.
  * @param DeviceID open ALSA capture handle.
  * @param fpdump   dump file (used only when DUMP is defined).
  * @return OK on success, ErrReadDevice on short capture, ErrWriteFile on
  *         short dump write; CUDAERR propagates CUDA errors.
  */
 int ReadAlsaGPU1st(short *GPUframe, short *CPUframe, snd_pcm_t *DeviceID, FILE *fpdump)
 {
   if (snd_pcm_readi(DeviceID, &CPUframe[TAMMUESTRA], TTminusTM) != TTminusTM) return ErrReadDevice;

   CUDAERR(cudaMemcpy(&GPUframe[TAMMUESTRA], &CPUframe[TAMMUESTRA], sizeof(short)*TTminusTM, cudaMemcpyHostToDevice));

   #ifdef DUMP
      if (fwrite(&CPUframe[TAMMUESTRA], sizeof(short), TTminusTM, fpdump) != TTminusTM) return ErrWriteFile;
   #endif

   return OK;
 }
510 
521  int ReadAlsaGPU(short *GPUframe, short *CPUframe, snd_pcm_t *DeviceID, FILE *fpdump)
522  {
523  kernel_Shift<<<1, TAMMUESTRA>>>(GPUframe, TAMTRAMA, TAMMUESTRA);
524 
525  if (snd_pcm_readi(DeviceID, CPUframe, TAMMUESTRA) != TAMMUESTRA) return ErrReadDevice;
526 
527  // ¿¿ cudaDeviceSynchronize(); ??
528 
529  CUDAERR(cudaMemcpy(&GPUframe[TTminusTM], CPUframe, sizeof(short)*TAMMUESTRA, cudaMemcpyHostToDevice));
530 
531  #ifdef DUMP
532  if (fwrite(&CPUframe[TTminusTM], sizeof(short), TAMMUESTRA, fpdump) != TAMMUESTRA) return ErrWriteFile;
533  #endif
534 
535  return OK;
536  }
537 #endif
char * fileStates_Time_i
Definition: defines.h:237
Struct for store the name of input/verificaton files. Each composition needs a file with values for ...
Definition: defines.h:228
int ReadWavGPU1st(short *GPUframe, short *CPUframe, FILE *fp)
ReadWavGPU1st reads first audio (frame) from WAV file when NVIDIA GPU is used.
int ReadVectorInt64(int *vector, const int size, const char *filename)
ReadVectorInt64 fills a int vector with the int64 info stores in a file.
int FirstImin(MyType *odata, int *opos, MyType *idata, const int maxGrid, const int size)
This function launches CUDA kernels to find the first minimum and its position.
int OneImin(MyType *odata, int *opos, MyType *idata, const int maxGrid, const int size)
This function launches CUDA kernels to find one minimum and its position.
int ReadVector(MyType *vector, const int size, const char *filename)
ReadVector fills a MyType vector with the MyType info stores in a file.
int FFTGPU(MyType *X_fft, MyType *Out_fft, MyFFTGPUType *plan)
FFTGPU computes FFT.
Header file for using ReMAS with Nvidia GPUs.
int AllocS_fkGPU(MyType **s_fk, MyType **tauxi, MyType **ts_fk, const MyType BETA, const int nmidi, const int nbases, DTWfiles NameFiles)
AllocS_fkGPU Allocates memory for S_fk vector, read its data from file and initializes other auxiliar...
int AllocFFTGPU(MyFFTGPUType *plan, MyType **X_fft, MyType **Out_fft, MyType **Mod_fft, int *kmin_fft, int *kmax_fft, const int nfft, DTWfiles NameFiles)
AllocFFTGPU Allocates "Unified" GPU memory for FFT vector and reads some fft information from files...
char * file_kmax
Definition: defines.h:234
int AllocAuxiGPU(MyType **norms, short **GPUframe, short **CPUframe, MyType **v_cfreq, MyType **v_dxState, const int nbases, const int tamframe, const int nmidi)
AllocAuxiGPU memory reservation for norms, frame, v_cfreq and v_dxState vectors.
char * fileStates_corr
Definition: defines.h:239
int ReadWavGPU(short *GPUframe, short *CPUframe, FILE *fp)
ReadWavGPU reads the current audio (frame) from a WAV file when an NVIDIA GPU is used.
int AllocDataGPU(MyType **v_hanning, int **states_time_i, int **states_time_e, int **states_seq, int **states_corr, int **I_SxD, int *DTWSize, const int tamtrama, const int nstates, DTWfiles NameFiles)
AllocDataGPU Allocates memory and initializes some structures reading info from files.
int ReadS_fk(MyType *s_fk, const int BASES, const char *filename)
ReadS_fk fills the vector s_fk with the info stores in a file.
int HaveCompatibleGPU(int &maxGrid)
HaveCompatibleGPU checks if the system has an appropriate GPU for ReMAS.
Definition: GPUFunctions.cu:75
void InitSxD(MyType *odata, MyType *v_SxD, const MyType *__restrict__ v_dxState, const int *__restrict__ I_SxD, const int maxGrid, const int size)
InitSxD launches the cuda kernel that sets up the vector SxD when "Unified" GPU memory is used...
int LastImin(MyType *odata, int *opos, MyType *idata, const int maxGrid, const int size)
This function launches CUDA kernels to find the last minimum and its position.
int AllocDTWGPU(MyType **pV, MyType **v_SxD, MyType **sdata, const int maxGrid, const int DTWSize, const int DTWSizePlusPad)
AllocDTWGPU Allocates memory for DTW vectors and auxiliar structures.
char * fileStates_Time_e
Definition: defines.h:236
bool IsPow2(unsigned int x)
IsPow2 decides if a number is power of 2.
Definition: GPUFunctions.cu:66
char * fileStates_seq
Definition: defines.h:238
char * file_kmin
Definition: defines.h:235
char * file_partitura
Definition: defines.h:233
void BlocksAndThreads(int *blocks, int *threads, int *sharedsize, const int maxGrid, const int size)
BlocksAndThreads calculates the suitable number of blocks and threads, and the needed shared memory...
unsigned int NextPow2(unsigned int x)
NextPow2 returns the next power of 2 of a given number.
Definition: GPUFunctions.cu:47
char * file_hanning
Definition: defines.h:231