ReMAS 1.5
Real-time Musical Accompaniment System
kernels.cuh
1 /**************************************************************************
2  * Copyright (C) 2017 by "Information Retrieval and Parallel Computing" *
3  * group (University of Oviedo, Spain), "Interdisciplinary Computation *
4  * and Communication" group (Polytechnic University of Valencia, Spain) *
5  * and "Signal Processing and Telecommunication Systems Research" group *
6  * (University of Jaen, Spain) *
7  * Contact: remaspack@gmail.com *
8  * *
9  * This program is free software; you can redistribute it and/or modify *
10  * it under the terms of the GNU General Public License as published by *
11  * the Free Software Foundation; either version 2 of the License, or *
12  * (at your option) any later version. *
13  * *
14  * This program is distributed in the hope that it will be useful, *
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
17  * GNU General Public License for more details. *
18  * *
19  * You should have received a copy of the GNU General Public License *
20  * along with this program; if not, write to the *
21  * Free Software Foundation, Inc., *
22  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
23  **************************************************************************
24 */
43 __device__ inline double __shfl_downD(double var, unsigned int srcLane, int width=sizeWarp) {
44  /* Ranilla: 12-4-2019 */
45  /* This is CUDA, not C: we can define xx(type a, type b=value) and call it as */
46  /* xx(val1), xx(val1, val2), etc. Parameter "b" takes "value" when the call is xx(val1). */
47  int2 a = *reinterpret_cast<int2*>(&var);
48 
49  #ifdef CUDA9
50  a.x = __shfl_down_sync(0xffffffff, a.x, srcLane, width);
51  a.y = __shfl_down_sync(0xffffffff, a.y, srcLane, width);
52  #else
53  a.x = __shfl_down(a.x, srcLane, width);
54  a.y = __shfl_down(a.y, srcLane, width);
55  #endif
56 
57  return *reinterpret_cast<double*>(&a);
58 }
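/* Usage sketch (illustrative): thanks to the default "width" argument, the two
 * calls below are equivalent for a full warp:
 *
 *   double a = __shfl_downD(val, 4);            // width defaults to sizeWarp
 *   double b = __shfl_downD(val, 4, sizeWarp);  // explicit width
 *
 * The int2 reinterpretation moves the double as two 32-bit halves, since this
 * implementation shuffles 32-bit values.
 */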
59 
60 
67 __inline__ __device__ double warpReduceSumD(double val)
68 {
69  /* Ranilla: 12-4-2019 */
70  /* warpSize is apparently a compile-time constant */
71  /* in PTX, but formally it is not, and an unknown */
72  /* constant prevents code optimization. So we     */
73  /* define sizeWarp in ../common/defines.h and use */
74  /* sizeWarp instead of the built-in warpSize.     */
75  for (int offset = sizeWarp/2; offset > 0; offset /= 2)
76  val += __shfl_downD(val, offset);
77  return val;
78 }
79 
80 
87 __inline__ __device__ float warpReduceSumS(float val)
88 {
89  for (int offset = sizeWarp/2; offset > 0; offset /= 2)
90  #ifdef CUDA9
91  val += __shfl_down_sync(0xffffffff, val, offset, sizeWarp);
92  #else
93  val += __shfl_down(val, offset, sizeWarp);
94  #endif
95 
96  return val;
97 }
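/* Usage sketch (illustrative; warpSumExample is a hypothetical kernel): each
 * lane contributes one value and, after log2(sizeWarp) down-shuffle steps,
 * lane 0 of the warp holds the total:
 *
 *   __global__ void warpSumExample(const float* __restrict__ in, float* __restrict__ out)
 *   {
 *       float v = in[threadIdx.x];         // one element per lane; launched as <<<1, sizeWarp>>>
 *       v = warpReduceSumS(v);             // tree reduction via __shfl_down
 *       if (threadIdx.x == 0) out[0] = v;  // only lane 0 has the full sum
 *   }
 */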
98 
99 
108 __global__ void kernel_InitDTW(MyType* __restrict__ pV, const int pos, const int size)
109 {
110  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
111 
112  if (tid < size)
113  {
114  if (tid==pos)
115  pV[tid]=0.0;
116  else
117  #ifdef SIMPLE
118  pV[tid]=FLT_MAX;
119  #else
120  pV[tid]=DBL_MAX;
121  #endif
122  }
123 }
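/* Launch sketch (illustrative; the block size is an assumption):
 *
 *   int threads = 256;
 *   int blocks  = (size + threads - 1) / threads;
 *   kernel_InitDTW<<<blocks, threads>>>(d_pV, pos, size);
 *
 * Every entry is set to FLT_MAX/DBL_MAX except d_pV[pos], the DTW starting
 * point, which is set to 0.
 */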
124 
135 __global__ void kernel_DTW(const MyType* __restrict__ Sequence, MyType* __restrict__ pD, const int NSeq,
136  const int Where, const int NST)
137 {
138  unsigned int j=threadIdx.x + blockIdx.x * blockDim.x;
139  unsigned int NSTplusNC, k, Pos;
140 
141  MyType d, d2;
142 
143  #ifdef SIMPLE
144  d=FLT_MAX;
145  #else
146  d=DBL_MAX;
147  #endif
148 
149  if (j<NST)
150  {
151  NSTplusNC = N_COSTS + NST;
152  Pos =((NSeq + N_COSTS) % TBLOCK) * NSTplusNC + N_COSTS + j - 1;
153  for(k=0; k<N_COSTS; k++)
154  {
155  d2 = Sequence[j]*CCosts[k]+pD[Pos-k];
156  if (d2 < d) d=d2;
157  }
158 
159  for (k=N_COSTS; k<T_COSTS; k++)
160  {
161  Pos=((NSeq + (T_COSTS-k)) % TBLOCK) * NSTplusNC + N_COSTS + j - 1;
162 
163  d2 = Sequence[j]*CCosts[k]+pD[Pos];
164 
165  if (d2 < d) d=d2;
166  }
167 
168  pD[Where+j] = d;
169  }
170 }
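/* Indexing note (inferred from the code): pD acts as a circular buffer of
 * TBLOCK rows, each with N_COSTS + NST entries whose first N_COSTS cells form
 * a left border, hence the "+ N_COSTS + j - 1" column offset. Rows are
 * addressed modulo TBLOCK, so only the last TBLOCK rows of the DTW matrix
 * stay resident on the GPU.
 */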
171 
172 
185 __global__ void kernel_InitSxD(MyType* __restrict__ odata, MyType* __restrict__ v_SxD, const MyType* __restrict__ v_dxState,
186  const int* __restrict__ I_SxD, const int blockSize, const bool SizeIsPow2, const int size)
187 {
188  extern __shared__ MyType sdata[];
189 
190  unsigned int tid = threadIdx.x;
191  unsigned int i = blockIdx.x*blockSize*2 + threadIdx.x;
192  unsigned int gridSize = blockSize*2*gridDim.x;
193 
194  MyType mySum=0.0, myData;
195 
196  while (i < size)
197  {
198  myData = v_SxD[i] = v_dxState[I_SxD[i]];
199  mySum += myData*myData;
200 
201  if (SizeIsPow2 || i + blockSize < size)
202  {
203  myData = v_SxD[i+blockSize] = v_dxState[I_SxD[i+blockSize]];
204  mySum += myData*myData;
205  }
206 
207  i += gridSize;
208  }
209  sdata[tid] = mySum;
210  __syncthreads();
211 
212 
213  /* Ranilla: 12-4-2019: New approach */
214  for (unsigned int j=maxThreads; j>=4*sizeWarp; j>>=1)
215  {
216  if ((blockSize >= j) && (tid < (j>>1)))
217  sdata[tid] = mySum = mySum + sdata[tid + (j>>1)];
218  __syncthreads();
219  }
220 
221  /* Ranilla: 12-4-2019: Old approach */
222  /*if ((blockSize >= 512) && (tid < 256))
223  sdata[tid] = mySum = mySum + sdata[tid + 256];
224  __syncthreads();
225 
226  if ((blockSize >= 256) &&(tid < 128))
227  sdata[tid] = mySum = mySum + sdata[tid + 128];
228  __syncthreads();
229 
230  if ((blockSize >= 128) && (tid < 64))
231  sdata[tid] = mySum = mySum + sdata[tid + 64];
232  __syncthreads();*/
233 
234  if (tid < sizeWarp)
235  {
236  if (blockSize >= 2*sizeWarp)
237  mySum += sdata[tid + sizeWarp];
238 
239  for (int offset = sizeWarp/2; offset > 0; offset /= 2)
240  #ifdef CUDA9
241  mySum += __shfl_down_sync(0xffffffff, mySum, offset);
242  #else
243  mySum += __shfl_down(mySum, offset);
244  #endif
245  }
246  if (tid == 0) odata[blockIdx.x] = mySum;
247 }
248 
249 
260 __global__ void kernel_Sum(MyType* __restrict__ odata, const MyType* __restrict__ idata,
261  const int blockSize, const bool SizeIsPow2, const int size)
262 {
263  extern __shared__ MyType sdata[];
264 
265  unsigned int tid = threadIdx.x;
266  unsigned int i = blockIdx.x*blockSize*2 + threadIdx.x;
267  unsigned int gridSize = blockSize*2*gridDim.x;
268 
269  MyType mySum=0.0;
270 
271  while (i < size)
272  {
273  mySum += idata[i];
274 
275  if (SizeIsPow2 || i + blockSize < size)
276  mySum += idata[i+blockSize];
277 
278  i += gridSize;
279  }
280  sdata[tid] = mySum;
281  __syncthreads();
282 
283  /* Ranilla: 12-4-2019: New approach */
284  for (unsigned int j=maxThreads; j>=4*sizeWarp; j>>=1)
285  {
286  if ((blockSize >= j) && (tid < (j>>1)))
287  sdata[tid] = mySum = mySum + sdata[tid + (j>>1)];
288  __syncthreads();
289  }
290 
291  /* Ranilla: 12-4-2019: Old approach */
292  /*if ((blockSize >= 512) && (tid < 256))
293  sdata[tid] = mySum = mySum + sdata[tid + 256];
294  __syncthreads();
295 
296  if ((blockSize >= 256) &&(tid < 128))
297  sdata[tid] = mySum = mySum + sdata[tid + 128];
298  __syncthreads();
299 
300  if ((blockSize >= 128) && (tid < 64))
301  sdata[tid] = mySum = mySum + sdata[tid + 64];
302  __syncthreads();*/
303 
304  if (tid < sizeWarp)
305  {
306  if (blockSize >= sizeWarp*2)
307  mySum += sdata[tid + sizeWarp];
308 
309  for (int offset = sizeWarp/2; offset > 0; offset /= 2)
310  #ifdef CUDA9
311  mySum += __shfl_down_sync(0xffffffff, mySum, offset);
312  #else
313  mySum += __shfl_down(mySum, offset);
314  #endif
315  }
316  if (tid == 0) odata[blockIdx.x] = mySum;
317 }
318 
319 
326 __global__ void kernel_Vnorm(MyType* __restrict__ odata)
327 {
328  #ifdef SIMPLE
329  odata[0] = 1.0f / (sqrtf(odata[0]) + FLT_EPSILON);
330  #else
331  odata[0] = 1.0 / ( sqrt(odata[0]) + DBL_EPSILON);
332  #endif
333 }
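/* Pipeline sketch (illustrative; d_partial, d_norm, smem, ... are placeholder
 * names): the inverse norm consumed by kernel_UpdateSxD is produced in three
 * steps:
 *
 *   size_t smem = threads * sizeof(MyType);   // extern __shared__ MyType sdata[]
 *   kernel_InitSxD<<<blocks, threads, smem>>>(d_partial, d_SxD, d_dxState,
 *                                             d_ISxD, threads, isPow2, size);
 *   kernel_Sum    <<<1, threads, smem>>>(d_norm, d_partial, threads,
 *                                        isPow2b, blocks);
 *   kernel_Vnorm  <<<1, 1>>>(d_norm);  // d_norm[0] = 1/(sqrt(sum) + epsilon)
 */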
334 
345 __global__ void kernel_ApplyWindow(MyType* __restrict__ X_fft, const short* __restrict__ frame,
346  const MyType* __restrict__ v_hanning, const int TTRA, const int NFFT)
347 {
348  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
349 
350  if (tid < NFFT)
351  X_fft[tid] = (tid < TTRA) ? (MyType)frame[tid] * Scaling * v_hanning[tid] : 0.0;
352 }
353 
354 
364 __global__ void kernel_UpdateSxD(MyType* __restrict__ dest, const MyType ALPHA, const MyType* __restrict__ norm,
365  const int size)
366 {
367  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
368 
369  if (tid < size)
370  #ifdef SIMPLE
371  dest[tid] = 1.0f - expf(ALPHA*fabsf(dest[tid]*norm[0]));
372  #else
373  dest[tid] = 1.0 - exp(ALPHA* fabs(dest[tid]*norm[0]));
374  #endif
375 }
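/* Note: norm[0] holds 1/(||SxD|| + epsilon), as left by kernel_Vnorm, so each
 * entry becomes dest[i] = 1 - exp(ALPHA * |dest[i]| / ||SxD||), which lies in
 * [0, 1) whenever ALPHA < 0.
 */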
376 
377 
386 __global__ void kernel_CompNorB0(MyType* __restrict__ norms, const MyType value, const int size)
387 {
388  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
389 
390  if (tid < size)
391  norms[tid]=value;
392 }
393 
394 
404 __global__ void kernel_CompNorB1(MyType* __restrict__ norms, const MyType* __restrict__ s_fk,
405  const int NMIDI, const int size)
406 {
407  unsigned int i = blockIdx.x * blockDim.y + threadIdx.y;
408  unsigned int j;
409  unsigned int stride = i*N_MIDI_PAD;
410  MyType a;
411 
412  if (i<size)
413  {
414  a=0.0;
415  for(j=threadIdx.x; j<NMIDI; j+=sizeWarp)
416  a += s_fk[stride+j];
417 
418  #ifdef SIMPLE
419  a = warpReduceSumS(a);
420  #else
421  a = warpReduceSumD(a);
422  #endif
423 
424  if (threadIdx.x==0) norms[i]=a;
425  }
426 }
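/* Launch-geometry sketch (illustrative; 8 rows per block is an assumption):
 * one warp processes one row of s_fk, so blockDim.x must be sizeWarp and
 * blockDim.y sets the number of rows handled per block:
 *
 *   dim3 block(sizeWarp, 8);
 *   dim3 grid((size + block.y - 1) / block.y);
 *   kernel_CompNorB1<<<grid, block>>>(d_norms, d_sfk, NMIDI, size);
 */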
427 
428 
440 __global__ void kernel_CompNorBG(MyType* __restrict__ norms, MyType* __restrict__ ts_fk,
441  const MyType* __restrict__ s_fk, const int NMIDI, const MyType BETA, const int size)
442 {
443  unsigned int i = blockIdx.x * blockDim.y + threadIdx.y;
444  unsigned int j;
445  unsigned int stride = i*N_MIDI_PAD;
446  MyType a,b;
447 
448  if (i<size)
449  {
450  #ifdef SIMPLE
451  a=0.0f;
452  for(j=threadIdx.x; j<NMIDI; j+=sizeWarp)
453  {
454  ts_fk[stride+j] = b = powf(s_fk[stride+j], BETA - 1.0f);
455  a += b*s_fk[stride+j];
456  }
457  a=warpReduceSumS(a);
458  #else
459  a=0.0;
460  for(j=threadIdx.x; j<NMIDI; j+=sizeWarp)
461  {
462  ts_fk[stride+j] = b = pow(s_fk[stride+j], BETA - 1.0);
463  a += b*s_fk[stride+j];
464  }
465  a=warpReduceSumD(a);
466  #endif
467 
468  if (threadIdx.x==0) norms[i]=a;
469  }
470 }
471 
481 __global__ void kernel_PowToReal(MyType* __restrict__ dest, const MyType* __restrict__ src, const MyType ex, const int size)
482 {
483  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
484  if (tid < size)
485  {
486  #ifdef SIMPLE
487  dest[tid]=powf(src[tid], ex);
488  #else
489  dest[tid]= pow(src[tid], ex);
490  #endif
491  }
492 }
493 
494 
503 __global__ void kernel_Modul(MyType* __restrict__ dest, const MyType* __restrict__ src, const int size)
504 {
505  unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
506  unsigned int stride = tid * 2;
507 
508  MyType tmp1, tmp2;
509 
510  if (tid <= size)
511  {
512  tmp1 = src[stride];
513  tmp2 = src[stride + 1];
514 
515  dest[tid]=tmp1*tmp1 + tmp2*tmp2;
516  }
517 }
518 
519 
527 __global__ void kernel_Cfreq(MyType* __restrict__ dest, const MyType* __restrict__ src)
528 {
529  unsigned int i = blockIdx.x;
530  unsigned int j = threadIdx.x;
531 
532  MyType tmp = 0.0;
533  for (unsigned int k=Ckmin_fft[i]+j; k<=Ckmax_fft[i]; k+=sizeWarp) {
534  tmp += src[k];
535  }
536 
537  #ifdef SIMPLE
538  tmp = warpReduceSumS(tmp);
539  #else
540  tmp = warpReduceSumD(tmp);
541  #endif
542 
543  if (j==0) {
544  #ifdef SIMPLE
545  dest[i] = sqrtf(tmp);
546  #else
547  dest[i] = sqrt(tmp);
548  #endif
549  }
550 }
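/* Usage note: one warp per frequency band; block i reduces src over the band
 * limits kept in the constant-memory arrays Ckmin_fft[i] and Ckmax_fft[i].
 * Illustrative launch, with nBands a placeholder for the number of bands:
 *
 *   kernel_Cfreq<<<nBands, sizeWarp>>>(d_vcfreq, d_modul);
 */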
551 
552 
560 __global__ void kernel_Reduction(MyType* __restrict__ dest, const int size)
561 {
562  unsigned int tid = threadIdx.x;
563  unsigned int j;
564 
565  MyType a=0.0;
566 
567  for(j=tid; j<size; j+=sizeWarp) a += dest[j];
568 
569  #ifdef SIMPLE
570  a = warpReduceSumS(a);
571  #else
572  a = warpReduceSumD(a);
573  #endif
574 
575  if (tid==0) dest[size]=a;
576 }
577 
586 __global__ void kernel_ReductionPowBeta(MyType* __restrict__ dest, const MyType BETA, const int size)
587 {
588  unsigned int tid = threadIdx.x;
589  unsigned int j;
590 
591  MyType a=0.0;
592 
593  for(j=tid; j<size; j+=sizeWarp)
594  #ifdef SIMPLE
595  a += powf(dest[j], BETA);
596  #else
597  a += pow (dest[j], BETA);
598  #endif
599 
600  #ifdef SIMPLE
601  a = warpReduceSumS(a);
602  if (tid==0) dest[size]=powf(a, 1.0/BETA);
603  #else
604  a = warpReduceSumD(a);
605  if (tid==0) dest[size]=pow(a, 1.0/BETA);
606  #endif
607 }
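/* Note: this leaves the beta-norm dest[size] = (sum_j dest[j]^BETA)^(1/BETA)
 * in the extra cell, which kernel_BetaNorm later uses as the divisor for the
 * first "size" entries.
 */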
608 
609 
621 /* maxThreads is defined in ../common/defines.h as 512 */
622 __global__ void __launch_bounds__(maxThreads, 4)
623 kernel_CompDisB0(MyType* __restrict__ dest, const MyType* __restrict__ v_cfreq, const MyType* __restrict__ norms,
624  const MyType* __restrict__ s_fk, const int NMIDI, const int size)
625 {
626  unsigned int i = blockIdx.x * blockDim.y + threadIdx.y;
627  unsigned int j;
628  unsigned int stride = i * N_MIDI_PAD;
629  unsigned int th_row = threadIdx.y;
630  unsigned int th_col = threadIdx.x;
631  unsigned int row = i + threadIdx.x; /* This is useful only for the first row */
632  bool guard = th_row == 0 && row < size && th_col < blockDim.y;
633  MyType a, b, tmp1;
634 
635  __shared__ MyType sh[sizeWarp];
636 
637  if (i < size)
638  {
639  a=0.0;
640  for(j=th_col; j<NMIDI; j+=sizeWarp) {
641  a += v_cfreq[j] / s_fk[stride+j];
642  }
643 
644  #ifdef SIMPLE
645  a = warpReduceSumS(a);
646  #else
647  a = warpReduceSumD(a);
648  #endif
649 
650  if(guard) {
651  sh[th_col] = norms[row];
652  }
653  __syncthreads();
654 
655  if (th_col == 0)
656  b=a/sh[th_row];
657 
658  #ifdef CUDA9
659  b = __shfl_sync(0xffffffff, b, 0);
660  #else
661  b = __shfl(b, 0);
662  #endif
663 
664  a=0.0;
665  for(j=th_col; j<NMIDI; j+=sizeWarp)
666  {
667  tmp1 = v_cfreq[j] / (s_fk[stride + j] * b);
668  #ifdef SIMPLE
669  a += tmp1 - logf(tmp1) - 1.0f;
670  #else
671  a += tmp1 - log(tmp1) - 1.0;
672  #endif
673  }
674 
675  #ifdef SIMPLE
676  a = warpReduceSumS(a);
677  #else
678  a = warpReduceSumD(a);
679  #endif
680 
681  if(th_col == 0) {
682  sh[th_row] = a;
683  }
684  __syncthreads();
685 
686  if(guard) {
687  dest[row] = sh[th_col];
688  }
689  }
690 }
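/* Note: the accumulated term tmp1 - log(tmp1) - 1, with tmp1 the ratio between
 * the observed spectrum and the scaled pattern, is the Itakura-Saito
 * divergence, i.e. the beta-divergence in the limit BETA -> 0.
 */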
691 
692 
704 /* maxThreads is defined in ../common/defines.h as 512 */
705 __global__ void __launch_bounds__(maxThreads, 4)
706 kernel_CompDisB1(MyType* __restrict__ dest, const MyType* __restrict__ v_cfreq, const MyType* __restrict__ norms,
707  const MyType* __restrict__ s_fk, const int NMIDI, const int size)
708 {
709  unsigned int i = blockIdx.x * blockDim.y + threadIdx.y;
710  unsigned int j;
711  unsigned int stride = i * N_MIDI_PAD;
712  unsigned int th_row = threadIdx.y;
713  unsigned int th_col = threadIdx.x;
714  unsigned int row = i + threadIdx.x; /* This is useful only for the first row */
715  bool guard = th_row == 0 && row < size && th_col < blockDim.y;
716  MyType a, tmp1, tmp2, tmp3;
717 
718  __shared__ MyType sh[sizeWarp];
719 
720  if (i < size)
721  {
722  if(guard) {
723  sh[th_col] = v_cfreq[NMIDI] / norms[row];
724  }
725  __syncthreads();
726 
727  tmp1=sh[th_row];
728 
729  a=0.0;
730  for(j=th_col; j<NMIDI; j+=sizeWarp) {
731  tmp2 = s_fk[stride+j] * tmp1;
732  tmp3 = v_cfreq[j];
733  #ifdef SIMPLE
734  a += tmp3*logf(tmp3/tmp2) + tmp2 - tmp3;
735  #else
736  a += tmp3* log(tmp3/tmp2) + tmp2 - tmp3;
737  #endif
738  }
739 
740  #ifdef SIMPLE
741  a = warpReduceSumS(a);
742  #else
743  a = warpReduceSumD(a);
744  #endif
745 
746  if(th_col == 0) {
747  sh[th_row] = a;
748  }
749  __syncthreads();
750 
751  if(guard) {
752  dest[row] = sh[th_col];
753  }
754  }
755 }
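/* Note: the accumulated term x*log(x/y) + y - x, with x = v_cfreq[j] and
 * y = s_fk[stride+j]*tmp1, is the generalized Kullback-Leibler divergence,
 * i.e. the beta-divergence for BETA = 1.
 */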
756 
757 
772 /* maxThreads is defined in ../common/defines.h as 512 */
773 __global__ void __launch_bounds__(maxThreads, 4)
774 kernel_CompDisBG(MyType* __restrict__ dest, const MyType* __restrict__ v_cfreq,
775  const MyType* __restrict__ norms, const MyType* __restrict__ s_fk,
776  const MyType* __restrict__ ts_fk, const MyType* __restrict__ tauxi,
777  const MyType BETA, const int NMIDI, const int size)
778 {
779  unsigned int i = blockIdx.x * blockDim.y + threadIdx.y;
780  unsigned int j, k;
781  unsigned int stride = i * N_MIDI_PAD;
782  unsigned int th_row = threadIdx.y;
783  unsigned int th_col = threadIdx.x;
784  unsigned int row = i + threadIdx.x; /* This is useful only for the first row */
785 
786  bool guard = th_row == 0 && row < size && th_col < blockDim.y;
787  MyType a, b, tmp1, tmp2;
788  MyType beta1 = BETA-1.0;
789  MyType tmp3 = (1.0 / (BETA*(BETA-1.0)));
790 
791  __shared__ MyType sh_a[sizeWarp/2], sh_b[sizeWarp/2];
792 
793  if (i < size)
794  {
795  a=0.0;
796  for(j=th_col, k=stride+th_col; j<NMIDI; j+=sizeWarp, k+=sizeWarp) {
797  a += v_cfreq[j] * ts_fk[k];
798  }
799 
800  #ifdef SIMPLE
801  a = warpReduceSumS(a);
802  #else
803  a = warpReduceSumD(a);
804  #endif
805 
806  if (th_col == 0) {
807  sh_a[th_row] = a;
808  }
809  __syncthreads();
810 
811  if(guard) {
812  a = sh_a[th_col] / norms[row];
813  #ifdef SIMPLE
814  b = powf(a, beta1);
815  #else
816  b = pow(a, beta1);
817  #endif
818  sh_b[th_col] = BETA * b;
819  sh_a[th_col] = b * a * beta1;
820  }
821  __syncthreads();
822 
823  tmp1 = sh_b[th_row];
824  tmp2 = sh_a[th_row];
825 
826  /* Ranilla: 12-4-2019: New approach */
827  j = th_col;
828  k = stride+th_col;
829  a = 0.0;
830  for (unsigned int s=sizeWarp; s<N_MIDI_PAD; s+=sizeWarp,j+=sizeWarp,k+=sizeWarp) {
831  a += ((tauxi[j] + ts_fk[k] * (s_fk[k] * tmp2 - v_cfreq[j] * tmp1)) * tmp3);
832  }
833  /* Here j is from 96 to 127, but NMIDI is only 114 and N_MIDI_PAD       */
834  /* is 128. Thereby, only threads 0 to 17 can execute the next statement. */
835  if (th_col < (sizeWarp - (N_MIDI_PAD - NMIDI)))
836  a += ((tauxi[j] + ts_fk[k] * (s_fk[k] * tmp2 - v_cfreq[j] * tmp1)) * tmp3);
837 
838  /* Ranilla: 12-4-2019: Old approach */
839  /*j = th_col;
840  k = stride+th_col;
841  a = ((tauxi[j] + ts_fk[k] * (s_fk[k] * tmp2 - v_cfreq[j] * tmp1)) * tmp3);
842  j += sizeWarp;
843  k += sizeWarp;
844  a += ((tauxi[j] + ts_fk[k] * (s_fk[k] * tmp2 - v_cfreq[j] * tmp1)) * tmp3);
845  j += sizeWarp;
846  k += sizeWarp;
847  a += ((tauxi[j] + ts_fk[k] * (s_fk[k] * tmp2 - v_cfreq[j] * tmp1)) * tmp3);
848  j += sizeWarp;
849  k += sizeWarp;
850  if(th_col<18) {
851  a += ((tauxi[j] + ts_fk[k] * (s_fk[k] * tmp2 - v_cfreq[j] * tmp1)) * tmp3);
852  }*/
853 
854  #ifdef SIMPLE
855  a = warpReduceSumS(a);
856  #else
857  a = warpReduceSumD(a);
858  #endif
859 
860  if(th_col == 0) {
861  sh_a[th_row] = a;
862  }
863  __syncthreads();
864 
865  if(guard) {
866  dest[row] = sh_a[th_col];
867  }
868  }
869 }
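/* Note (assuming tauxi[j] holds v_cfreq[j]^BETA, cf. kernel_PowToReal): each
 * accumulated term expands to
 *   (x^BETA + (BETA-1)*y^BETA - BETA*x*y^(BETA-1)) / (BETA*(BETA-1)),
 * with x = v_cfreq[j] and y the scaled pattern, i.e. the general
 * beta-divergence for BETA != 0, 1.
 */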
870 
871 
880 __global__ void kernel_Shift(short* __restrict__ frame, const int TTRAMA, const int TMUEST)
881 {
882  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
883  unsigned int i, tmp;
884 
885  for (i=0; i<(TTRAMA/TMUEST - 1); i++)
886  {
887  tmp=tid+i*TMUEST;
888  frame[tmp]=frame[tmp+TMUEST];
889  __syncthreads();
890  }
891 }
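/* Behaviour sketch: with TTRAMA = 4*TMUEST and one thread per position tid in
 * [0, TMUEST), the loop moves each TMUEST-sized segment one slot to the left:
 *
 *   before: | s0 | s1 | s2 | s3 |
 *   after : | s1 | s2 | s3 | s3 |   (the tail is later refilled with new samples)
 *
 * Each thread only touches indices congruent to tid modulo TMUEST, so there
 * are no cross-thread dependencies between iterations.
 */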
892 
893 
901 __global__ void kernel_BetaNorm(MyType* __restrict__ vector, const int size)
902 {
903  unsigned int tid = threadIdx.x;
904 
905  /* The previous call to kernel_Reduction / kernel_ReductionPowBeta puts in vector[size] the reduction value */
906  MyType value=vector[size];
907 
908  vector[tid] = vector[tid] / value;
909 }
910 
911 
923 __global__ void kernel_OneImin(MyType* __restrict__ odata, int* __restrict__ opos, const MyType* __restrict__ idata,
924  const int blockSize, const bool SizeIsPow2, const int size)
925 {
926  extern __shared__ MyType ss[];
927 
928  MyType *sdata=ss;
929  int *pdata=(int *)&sdata[blockSize];
930 
931  int tid = threadIdx.x;
932  int i = blockIdx.x*blockSize*2 + threadIdx.x;
933  int gSize = blockSize*2*gridDim.x;
934  int myPos, tmpPos;
935 
936  #ifdef SIMPLE
937  MyType myMin=FLT_MAX, tmpMin=FLT_MAX;
938  #else
939  MyType myMin=DBL_MAX, tmpMin=DBL_MAX;
940  #endif
941 
942  while (i < size)
943  {
944  myMin=idata[i];
945  myPos=i;
946 
947  if (SizeIsPow2 || i + blockSize < size)
948  if (idata[i+blockSize] < myMin) {
949  myMin=idata[i+blockSize];
950  myPos=i+blockSize;
951  }
952  i += gSize;
953  }
954  sdata[tid]=myMin;
955  pdata[tid]=myPos;
956  __syncthreads();
957 
958  for (unsigned int s=maxThreads/2; s>=2*sizeWarp; s>>=1) // s/=2 is equal to s>>=1
959  {
960  if ((blockSize >= 2*s) && (tid < s))
961  if (sdata[tid + s] < myMin) {
962  sdata[tid]=myMin=sdata[tid+s];
963  pdata[tid]=myPos=pdata[tid+s];
964  }
965  __syncthreads();
966  }
967 
968  if (tid < sizeWarp)
969  {
970  if ((blockSize >= 2*sizeWarp) && (sdata[tid + sizeWarp] < myMin)) {
971  myMin=sdata[tid+sizeWarp];
972  myPos=pdata[tid+sizeWarp];
973  }
974 
975  for (int offset = sizeWarp/2; offset > 0; offset>>=1) // offset/=2 is equal to offset>>=1
976  {
977  #ifdef CUDA9
978  #ifdef SIMPLE
979  tmpMin = __shfl_down_sync(0xffffffff, myMin, offset, sizeWarp);
980  #else
981  tmpMin = __shfl_downD(myMin, offset);
982  #endif
983  tmpPos = __shfl_down_sync(0xffffffff, myPos, offset, sizeWarp);
984  #else
985  #ifdef SIMPLE
986  tmpMin = __shfl_down(myMin, offset, sizeWarp);
987  #else
988  tmpMin = __shfl_downD(myMin, offset);
989  #endif
990  tmpPos = __shfl_down(myPos, offset, sizeWarp);
991  #endif
992 
993  if (tmpMin < myMin) { myMin=tmpMin; myPos=tmpPos; }
994  }
995  }
996  if (tid == 0) { odata[blockIdx.x]=myMin; opos[blockIdx.x]=myPos; }
997 }
998 
999 
1012 __global__ void kernel_OneIminLast(MyType* __restrict__ odata, int* __restrict__ opos, const MyType* __restrict__ idata,
1013  const int* __restrict__ ipos, const int blockSize, const bool SizeIsPow2, const int size)
1014 {
1015  extern __shared__ MyType ss[];
1016  MyType *sdata=ss;
1017  int *pdata=(int *)&sdata[blockSize];
1018 
1019  int tid = threadIdx.x;
1020  int i = blockIdx.x*blockSize*2 + threadIdx.x;
1021  int gSize = blockSize*2*gridDim.x;
1022  int myPos, tmpPos;
1023 
1024  #ifdef SIMPLE
1025  MyType myMin=FLT_MAX, tmpMin=FLT_MAX;
1026  #else
1027  MyType myMin=DBL_MAX, tmpMin=DBL_MAX;
1028  #endif
1029 
1030  while (i < size)
1031  {
1032  myMin=idata[i];
1033  myPos=ipos[i];
1034 
1035  if (SizeIsPow2 || i + blockSize < size)
1036  if (idata[i+blockSize] < myMin) {
1037  myMin=idata[i+blockSize];
1038  myPos=ipos[i+blockSize];
1039  }
1040  i += gSize;
1041  }
1042  sdata[tid]=myMin;
1043  pdata[tid]=myPos;
1044  __syncthreads();
1045 
1046  for (unsigned int s=maxThreads/2; s>=2*sizeWarp; s>>=1) // s/=2 is equal to s>>=1
1047  {
1048  if ((blockSize >= 2*s) && (tid < s))
1049  if (sdata[tid + s] < myMin) {
1050  sdata[tid]=myMin=sdata[tid+s];
1051  pdata[tid]=myPos=pdata[tid+s];
1052  }
1053  __syncthreads();
1054  }
1055 
1056  if (tid < sizeWarp)
1057  {
1058  if ((blockSize >= 2*sizeWarp) && (sdata[tid + sizeWarp] < myMin)) {
1059  myMin=sdata[tid+sizeWarp];
1060  myPos=pdata[tid+sizeWarp];
1061  }
1062 
1063  for (int offset = blockSize/2; offset > 0; offset>>=1) // offset/=2 is equal to offset>>=1
1064  {
1065  #ifdef CUDA9
1066  #ifdef SIMPLE
1067  tmpMin = __shfl_down_sync(0xffffffff, myMin, offset, sizeWarp);
1068  #else
1069  tmpMin = __shfl_downD(myMin, offset);
1070  #endif
1071  tmpPos = __shfl_down_sync(0xffffffff, myPos, offset, sizeWarp);
1072  #else
1073  #ifdef SIMPLE
1074  tmpMin = __shfl_down(myMin, offset, sizeWarp);
1075  #else
1076  tmpMin = __shfl_downD(myMin, offset);
1077  #endif
1078  tmpPos = __shfl_down(myPos, offset, sizeWarp);
1079  #endif
1080 
1081  if (tmpMin < myMin) { myMin=tmpMin; myPos=tmpPos; }
1082  }
1083  }
1084  if (tid == 0) { odata[blockIdx.x]=myMin; opos[blockIdx.x]=myPos; }
1085 }
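/* Two-pass sketch (illustrative; buffer names are placeholders): kernel_OneImin
 * emits one (min, pos) candidate per block and kernel_OneIminLast reduces the
 * candidates with a single block:
 *
 *   size_t smem = threads * (sizeof(MyType) + sizeof(int));
 *   kernel_OneImin    <<<blocks, threads, smem>>>(d_mins, d_pos, d_in,
 *                                                 threads, isPow2, size);
 *   kernel_OneIminLast<<<1, threads, smem>>>(d_min, d_minPos, d_mins, d_pos,
 *                                            threads, isPow2b, blocks);
 *
 * kernel_FirstImin/kernel_LastImin follow the same pattern with tie-breaking
 * towards the lowest/highest position.
 */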
1086 
1087 
1099 __global__ void kernel_FirstImin(MyType* __restrict__ odata, int* __restrict__ opos, const MyType* __restrict__ idata,
1100  const int blockSize, const bool SizeIsPow2, const int size)
1101 {
1102  extern __shared__ MyType ss[];
1103 
1104  MyType *sdata=ss;
1105  int *pdata=(int *)&sdata[blockSize];
1106 
1107  int tid = threadIdx.x;
1108  int i = blockIdx.x*blockSize*2 + threadIdx.x;
1109  int gSize = blockSize*2*gridDim.x;
1110  int myPos, tmpPos;
1111 
1112  #ifdef SIMPLE
1113  MyType myMin=FLT_MAX, tmpMin=FLT_MAX;
1114  #else
1115  MyType myMin=DBL_MAX, tmpMin=DBL_MAX;
1116  #endif
1117 
1118  while (i < size)
1119  {
1120  myMin=idata[i];
1121  myPos=i;
1122 
1123  if (SizeIsPow2 || i + blockSize < size)
1124  if (idata[i+blockSize] < myMin) {
1125  myMin=idata[i+blockSize];
1126  myPos=i+blockSize;
1127  }
1128  i += gSize;
1129  }
1130  sdata[tid]=myMin;
1131  pdata[tid]=myPos;
1132  __syncthreads();
1133 
1134  for (unsigned int s=maxThreads/2; s>=2*sizeWarp; s>>=1) // s/=2 is equal to s>>=1
1135  {
1136  if ((blockSize >= 2*s) && (tid < s))
1137  if ((sdata[tid + s] < myMin) || ((sdata[tid + s] == myMin) && (pdata[tid + s] < myPos))) {
1138  sdata[tid]=myMin=sdata[tid+s];
1139  pdata[tid]=myPos=pdata[tid+s];
1140  }
1141  __syncthreads();
1142  }
1143 
1144  if (tid < sizeWarp)
1145  {
1146  if ((blockSize >= 2*sizeWarp) &&
1147  ((sdata[tid + sizeWarp] < myMin) || ((sdata[tid + sizeWarp] == myMin) && (pdata[tid + sizeWarp] < myPos)))) {
1148  myMin=sdata[tid+sizeWarp];
1149  myPos=pdata[tid+sizeWarp];
1150  }
1151 
1152  for (int offset = sizeWarp/2; offset > 0; offset>>=1) // offset/=2 is equal to offset>>=1
1153  {
1154  #ifdef CUDA9
1155  #ifdef SIMPLE
1156  tmpMin = __shfl_down_sync(0xffffffff, myMin, offset, sizeWarp);
1157  #else
1158  tmpMin = __shfl_downD(myMin, offset);
1159  #endif
1160  tmpPos = __shfl_down_sync(0xffffffff, myPos, offset, sizeWarp);
1161  #else
1162  #ifdef SIMPLE
1163  tmpMin = __shfl_down(myMin, offset, sizeWarp);
1164  #else
1165  tmpMin = __shfl_downD(myMin, offset);
1166  #endif
1167  tmpPos = __shfl_down(myPos, offset, sizeWarp);
1168  #endif
1169 
1170  if ((tmpMin < myMin) || ((tmpMin == myMin) && (tmpPos < myPos))) {
1171  myMin=tmpMin;
1172  myPos=tmpPos;
1173  }
1174  }
1175  }
1176  if (tid == 0) { odata[blockIdx.x]=myMin; opos[blockIdx.x]=myPos; }
1177 }
1178 
1179 
1192 __global__ void kernel_FirstIminLast(MyType* __restrict__ odata, int* __restrict__ opos, const MyType* __restrict__ idata,
1193  const int* __restrict__ ipos, const int blockSize, const bool SizeIsPow2, const int size)
1194 {
1195  extern __shared__ MyType ss[];
1196  MyType *sdata=ss;
1197  int *pdata=(int *)&sdata[blockSize];
1198 
1199  int tid = threadIdx.x;
1200  int i = blockIdx.x*blockSize*2 + threadIdx.x;
1201  int gSize = blockSize*2*gridDim.x;
1202  int myPos, tmpPos;
1203 
1204  #ifdef SIMPLE
1205  MyType myMin=FLT_MAX, tmpMin=FLT_MAX;
1206  #else
1207  MyType myMin=DBL_MAX, tmpMin=DBL_MAX;
1208  #endif
1209 
1210  while (i < size)
1211  {
1212  myMin=idata[i];
1213  myPos=ipos[i];
1214 
1215  if (SizeIsPow2 || i + blockSize < size)
1216  if (idata[i+blockSize] < myMin) {
1217  myMin=idata[i+blockSize];
1218  myPos=ipos[i+blockSize];
1219  }
1220  i += gSize;
1221  }
1222  sdata[tid]=myMin;
1223  pdata[tid]=myPos;
1224  __syncthreads();
1225 
1226  for (unsigned int s=maxThreads/2; s>=2*sizeWarp; s>>=1) // s/=2 is equal to s>>=1
1227  {
1228  if ((blockSize >= 2*s) && (tid < s))
1229  if ((sdata[tid + s] < myMin) || ((sdata[tid + s] == myMin) && (pdata[tid + s] < myPos))) {
1230  sdata[tid]=myMin=sdata[tid+s];
1231  pdata[tid]=myPos=pdata[tid+s];
1232  }
1233  __syncthreads();
1234  }
1235 
1236  if (tid < sizeWarp)
1237  {
1238  if ((blockSize >= 2*sizeWarp) &&
1239  ((sdata[tid + sizeWarp] < myMin) || ((sdata[tid + sizeWarp] == myMin) && (pdata[tid + sizeWarp] < myPos)))) {
1240  myMin=sdata[tid+sizeWarp];
1241  myPos=pdata[tid+sizeWarp];
1242  }
1243 
1244  for (int offset = blockSize/2; offset > 0; offset>>=1) // offset/=2 is equal to offset>>=1
1245  {
1246  #ifdef CUDA9
1247  #ifdef SIMPLE
1248  tmpMin = __shfl_down_sync(0xffffffff, myMin, offset, sizeWarp);
1249  #else
1250  tmpMin = __shfl_downD(myMin, offset);
1251  #endif
1252  tmpPos = __shfl_down_sync(0xffffffff, myPos, offset, sizeWarp);
1253  #else
1254  #ifdef SIMPLE
1255  tmpMin = __shfl_down(myMin, offset, sizeWarp);
1256  #else
1257  tmpMin = __shfl_downD(myMin, offset);
1258  #endif
1259  tmpPos = __shfl_down(myPos, offset, sizeWarp);
1260  #endif
1261 
1262  if ((tmpMin < myMin) || ((tmpMin == myMin) && (tmpPos < myPos))) {
1263  myMin=tmpMin;
1264  myPos=tmpPos;
1265  }
1266  }
1267  }
1268  if (tid == 0) { odata[blockIdx.x]=myMin; opos[blockIdx.x]=myPos; }
1269 }
1270 
1271 
1283 __global__ void kernel_LastImin(MyType* __restrict__ odata, int* __restrict__ opos, const MyType* __restrict__ idata,
1284  const int blockSize, const bool SizeIsPow2, const int size)
1285 {
1286  extern __shared__ MyType ss[];
1287 
1288  MyType *sdata=ss;
1289  int *pdata=(int *)&sdata[blockSize];
1290 
1291  int tid = threadIdx.x;
1292  int i = blockIdx.x*blockSize*2 + threadIdx.x;
1293  int gSize = blockSize*2*gridDim.x;
1294  int myPos, tmpPos;
1295 
1296  #ifdef SIMPLE
1297  MyType myMin=FLT_MAX, tmpMin=FLT_MAX;
1298  #else
1299  MyType myMin=DBL_MAX, tmpMin=DBL_MAX;
1300  #endif
1301 
1302  while (i < size)
1303  {
1304  myMin=idata[i];
1305  myPos=i;
1306 
1307  if (SizeIsPow2 || i + blockSize < size)
1308  if (idata[i+blockSize] <= myMin) {
1309  myMin=idata[i+blockSize];
1310  myPos=i+blockSize;
1311  }
1312  i += gSize;
1313  }
1314  sdata[tid]=myMin;
1315  pdata[tid]=myPos;
1316  __syncthreads();
1317 
1318  for (unsigned int s=maxThreads/2; s>=2*sizeWarp; s>>=1) // s/=2 is equal to s>>=1
1319  {
1320  if ((blockSize >= 2*s) && (tid < s))
1321  if ((sdata[tid + s] < myMin) || ((sdata[tid + s] == myMin) && (pdata[tid + s] > myPos))) {
1322  sdata[tid]=myMin=sdata[tid+s];
1323  pdata[tid]=myPos=pdata[tid+s];
1324  }
1325  __syncthreads();
1326  }
1327 
1328  if (tid < sizeWarp)
1329  {
1330  if ((blockSize >= 2*sizeWarp) &&
1331  ((sdata[tid + sizeWarp] < myMin) || ((sdata[tid + sizeWarp] == myMin) && (pdata[tid + sizeWarp] > myPos)))) {
1332  myMin=sdata[tid+sizeWarp];
1333  myPos=pdata[tid+sizeWarp];
1334  }
1335 
1336  for (int offset = sizeWarp/2; offset > 0; offset>>=1) // offset/=2 is equal to offset>>=1
1337  {
1338  #ifdef CUDA9
1339  #ifdef SIMPLE
1340  tmpMin = __shfl_down_sync(0xffffffff, myMin, offset, sizeWarp);
1341  #else
1342  tmpMin = __shfl_downD(myMin, offset);
1343  #endif
1344  tmpPos = __shfl_down_sync(0xffffffff, myPos, offset, sizeWarp);
1345  #else
1346  #ifdef SIMPLE
1347  tmpMin = __shfl_down(myMin, offset, sizeWarp);
1348  #else
1349  tmpMin = __shfl_downD(myMin, offset);
1350  #endif
1351  tmpPos = __shfl_down(myPos, offset, sizeWarp);
1352  #endif
1353 
1354  if ((tmpMin < myMin) || ((tmpMin == myMin) && (tmpPos > myPos))) {
1355  myMin=tmpMin;
1356  myPos=tmpPos;
1357  }
1358  }
1359  }
1360  if (tid == 0) { odata[blockIdx.x]=myMin; opos[blockIdx.x]=myPos; }
1361 }
1362 
1363 
1376 __global__ void kernel_LastIminLast(MyType* __restrict__ odata, int* __restrict__ opos, const MyType* __restrict__ idata,
1377  const int* __restrict__ ipos, const int blockSize, const bool SizeIsPow2, const int size)
1378 {
1379  extern __shared__ MyType ss[];
1380  MyType *sdata=ss;
1381  int *pdata=(int *)&sdata[blockSize];
1382 
1383  int tid = threadIdx.x;
1384  int i = blockIdx.x*blockSize*2 + threadIdx.x;
1385  int gSize = blockSize*2*gridDim.x;
1386  int myPos, tmpPos;
1387 
1388  #ifdef SIMPLE
1389  MyType myMin=FLT_MAX, tmpMin=FLT_MAX;
1390  #else
1391  MyType myMin=DBL_MAX, tmpMin=DBL_MAX;
1392  #endif
1393 
1394  while (i < size)
1395  {
1396  myMin=idata[i];
1397  myPos=ipos[i];
1398 
1399  if (SizeIsPow2 || i + blockSize < size)
1400  if (idata[i+blockSize] <= myMin) {
1401  myMin=idata[i+blockSize];
1402  myPos=ipos[i+blockSize];
1403  }
1404  i += gSize;
1405  }
1406  sdata[tid]=myMin;
1407  pdata[tid]=myPos;
1408  __syncthreads();
1409 
1410  for (unsigned int s=maxThreads/2; s>=2*sizeWarp; s>>=1) // s/=2 is equal to s>>=1
1411  {
1412  if ((blockSize >= 2*s) && (tid < s))
1413  if ((sdata[tid + s] < myMin) || ((sdata[tid + s] == myMin) && (pdata[tid + s] > myPos))) {
1414  sdata[tid]=myMin=sdata[tid+s];
1415  pdata[tid]=myPos=pdata[tid+s];
1416  }
1417  __syncthreads();
1418  }
1419 
1420  if (tid < sizeWarp)
1421  {
1422  if ((blockSize >= 2*sizeWarp) &&
1423  ((sdata[tid + sizeWarp] < myMin) || ((sdata[tid + sizeWarp] == myMin) && (pdata[tid + sizeWarp] > myPos)))) {
1424  myMin=sdata[tid+sizeWarp];
1425  myPos=pdata[tid+sizeWarp];
1426  }
1427 
1428  for (int offset = blockSize/2; offset > 0; offset>>=1) // offset/=2 is equal to offset>>=1
1429  {
1430  #ifdef CUDA9
1431  #ifdef SIMPLE
1432  tmpMin = __shfl_down_sync(0xffffffff, myMin, offset, sizeWarp);
1433  #else
1434  tmpMin = __shfl_downD(myMin, offset);
1435  #endif
1436  tmpPos = __shfl_down_sync(0xffffffff, myPos, offset, sizeWarp);
1437  #else
1438  #ifdef SIMPLE
1439  tmpMin = __shfl_down(myMin, offset, sizeWarp);
1440  #else
1441  tmpMin = __shfl_downD(myMin, offset);
1442  #endif
1443  tmpPos = __shfl_down(myPos, offset, sizeWarp);
1444  #endif
1445 
1446  if ((tmpMin < myMin) || ((tmpMin == myMin) && (tmpPos > myPos))) {
1447  myMin=tmpMin;
1448  myPos=tmpPos;
1449  }
1450  }
1451  }
1452  if (tid == 0) { odata[blockIdx.x]=myMin; opos[blockIdx.x]=myPos; }
1453 }
1454 
__global__ void kernel_PowToReal(MyType *__restrict__ dest, const MyType *__restrict__ src, const MyType ex, const int size)
kernel_PowToReal This CUDA kernel raises the elements of a vector to a real power and stores them in another vector.
Definition: kernels.cuh:481
__global__ void kernel_Sum(MyType *__restrict__ odata, const MyType *__restrict__ idata, const int blockSize, const bool SizeIsPow2, const int size)
kernel_Sum This CUDA kernel adds the elements of a vector.
Definition: kernels.cuh:260
__inline__ __device__ double warpReduceSumD(double val)
warpReduceSumD does double sum reduction within a warp.
Definition: kernels.cuh:67
__global__ void kernel_LastIminLast(MyType *__restrict__ odata, int *__restrict__ opos, const MyType *__restrict__ idata, const int *__restrict__ ipos, const int blockSize, const bool SizeIsPow2, const int size)
kernel_LastIminLast used with kernel_LastImin to calculate the position of the last minimum in a vector.
Definition: kernels.cuh:1376
__global__ void kernel_ApplyWindow(MyType *__restrict__ X_fft, const short *__restrict__ frame, const MyType *__restrict__ v_hanning, const int TTRA, const int NFFT)
kernel_ApplyWindow scales and sets the elements of the audio vector X_fft.
Definition: kernels.cuh:345
__global__ void kernel_BetaNorm(MyType *__restrict__ vector, const int size)
kernel_BetaNorm normalizes the vector.
Definition: kernels.cuh:901
__global__ void kernel_FirstIminLast(MyType *__restrict__ odata, int *__restrict__ opos, const MyType *__restrict__ idata, const int *__restrict__ ipos, const int blockSize, const bool SizeIsPow2, const int size)
kernel_FirstIminLast used with kernel_FirstImin to calculate the position of the first minimum in a vector.
Definition: kernels.cuh:1192
__global__ void kernel_Shift(short *__restrict__ frame, const int TTRAMA, const int TMUEST)
kernel_Shift shifts the vector elements TMUEST positions to the left.
Definition: kernels.cuh:880
__global__ void kernel_Vnorm(MyType *__restrict__ odata)
kernel_Vnorm This CUDA kernel sets position 0 of a vector to 1/(sqrt(odata[0]) + epsilon).
Definition: kernels.cuh:326
__global__ void kernel_LastImin(MyType *__restrict__ odata, int *__restrict__ opos, const MyType *__restrict__ idata, const int blockSize, const bool SizeIsPow2, const int size)
kernel_LastImin calculates the position of the last minimum in a vector.
Definition: kernels.cuh:1283
__global__ void kernel_CompNorB1(MyType *__restrict__ norms, const MyType *__restrict__ s_fk, const int NMIDI, const int size)
kernel_CompNorB1 This CUDA kernel computes the norm of a vector when BETA=1.
Definition: kernels.cuh:404
__global__ void kernel_CompNorBG(MyType *__restrict__ norms, MyType *__restrict__ ts_fk, const MyType *__restrict__ s_fk, const int NMIDI, const MyType BETA, const int size)
kernel_CompNorBG This CUDA kernel computes the norm of a vector when BETA != 0 and BETA != 1.
Definition: kernels.cuh:440
__global__ void kernel_ReductionPowBeta(MyType *__restrict__ dest, const MyType BETA, const int size)
kernel_ReductionPowBeta This CUDA kernel performs a sum-reduction of the BETA powers of a vector.
Definition: kernels.cuh:586
__global__ void kernel_InitSxD(MyType *__restrict__ odata, MyType *__restrict__ v_SxD, const MyType *__restrict__ v_dxState, const int *__restrict__ I_SxD, const int blockSize, const bool SizeIsPow2, const int size)
kernel_InitSxD This CUDA kernel sets up the vector SxD and computes per-block partial sums of its squared elements.
Definition: kernels.cuh:185
__global__ void kernel_OneIminLast(MyType *__restrict__ odata, int *__restrict__ opos, const MyType *__restrict__ idata, const int *__restrict__ ipos, const int blockSize, const bool SizeIsPow2, const int size)
kernel_OneIminLast used with kernel_OneImin to calculate the position of one minimum in a vector.
Definition: kernels.cuh:1012
__global__ void kernel_OneImin(MyType *__restrict__ odata, int *__restrict__ opos, const MyType *__restrict__ idata, const int blockSize, const bool SizeIsPow2, const int size)
kernel_OneImin calculates the position of one minimum in a vector.
Definition: kernels.cuh:923
__inline__ __device__ float warpReduceSumS(float val)
warpReduceSumS does float sum reduction within a warp.
Definition: kernels.cuh:87
__global__ void kernel_Cfreq(MyType *__restrict__ dest, const MyType *__restrict__ src)
kernel_Cfreq This CUDA kernel computes sqrt(sum of the elements of band i of a vector) and stores it in dest[i].
Definition: kernels.cuh:527
__device__ double __shfl_downD(double var, unsigned int srcLane, int width=sizeWarp)
__shfl_downD performs __shfl_down of a double number.
Definition: kernels.cuh:43
__global__ void kernel_FirstImin(MyType *__restrict__ odata, int *__restrict__ opos, const MyType *__restrict__ idata, const int blockSize, const bool SizeIsPow2, const int size)
kernel_FirstImin calculates the position of the first minimum in a vector.
Definition: kernels.cuh:1099
__global__ void kernel_Modul(MyType *__restrict__ dest, const MyType *__restrict__ src, const int size)
kernel_Modul This CUDA kernel computes the modulus of the elements of a vector and stores them in another vector.
Definition: kernels.cuh:503
__global__ void kernel_CompNorB0(MyType *__restrict__ norms, const MyType value, const int size)
kernel_CompNorB0 This CUDA kernel computes the norm of a vector when BETA=0.
Definition: kernels.cuh:386
__global__ void __launch_bounds__(maxThreads, 4) kernel_CompDisB0(MyType *__restrict__ dest, const MyType *__restrict__ v_cfreq, const MyType *__restrict__ norms, const MyType *__restrict__ s_fk, const int NMIDI, const int size)
kernel_CompDisB0 This CUDA kernel computes the distortion of a vector when BETA=0.
Definition: kernels.cuh:622
__global__ void kernel_UpdateSxD(MyType *__restrict__ dest, const MyType ALPHA, const MyType *__restrict__ norm, const int size)
kernel_UpdateSxD This CUDA kernel updates the elements of the SxD vector.
Definition: kernels.cuh:364
__global__ void kernel_InitDTW(MyType *__restrict__ pV, const int pos, const int size)
kernel_InitDTW This CUDA kernel initializes the DTW vector.
Definition: kernels.cuh:108
__global__ void kernel_DTW(const MyType *__restrict__ Sequence, MyType *__restrict__ pD, const int NSeq, const int Where, const int NST)
kernel_DTW This CUDA kernel performs the Online-DTW process for the current frame.
Definition: kernels.cuh:135
__global__ void kernel_Reduction(MyType *__restrict__ dest, const int size)
kernel_Reduction This CUDA kernel performs a typical sum-reduction of a vector.
Definition: kernels.cuh:560