ReMAS 1.5
Real-time Musical Accompaniment System
kernels.cuh
1 /**************************************************************************
2  * Copyright (C) 2017 by "Information Retrieval and Parallel Computing" *
3  * group (University of Oviedo, Spain), "Interdisciplinary Computation *
4  * and Communication" group (Polytechnic University of Valencia, Spain) *
5  * and "Signal Processing and Telecommunication Systems Research" group *
6  * (University of Jaen, Spain) *
7  * Contact: remaspack@gmail.com *
8  * *
9  * This program is free software; you can redistribute it and/or modify *
10  * it under the terms of the GNU General Public License as published by *
11  * the Free Software Foundation; either version 2 of the License, or *
12  * (at your option) any later version. *
13  * *
14  * This program is distributed in the hope that it will be useful, *
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
17  * GNU General Public License for more details. *
18  * *
19  * You should have received a copy of the GNU General Public License *
20  * along with this program; if not, write to the *
21  * Free Software Foundation, Inc., *
22  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
23  **************************************************************************
24 */
43 __device__ inline double __shfl_downD(double var, unsigned int srcLane, int width=sizeWarp) {
44  /* Ranilla: 12-4-2019 */
45  /* This is CUDA, not C: we can define xx(type a, type b=value) and call it as */
46  /* xx(val1), xx(val1, val2), etc. Parameter "b" takes "value" when the call is xx(val1). */
47  int2 a = *reinterpret_cast<int2*>(&var);
48 
49  #ifdef CUDA9
50  a.x = __shfl_down_sync(0xffffffff, a.x, srcLane, width);
51  a.y = __shfl_down_sync(0xffffffff, a.y, srcLane, width);
52  #else
53  a.x = __shfl_down(a.x, srcLane, width);
54  a.y = __shfl_down(a.y, srcLane, width);
55  #endif
56 
57  return *reinterpret_cast<double*>(&a);
58 }
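/* Usage sketch (illustrative): thanks to the default "width" argument, the two
 * calls below are equivalent for a full warp:
 *
 *   double a = __shfl_downD(val, 4);            // width defaults to sizeWarp
 *   double b = __shfl_downD(val, 4, sizeWarp);  // explicit width
 *
 * The int2 reinterpretation moves the double as two 32-bit halves, since this
 * implementation shuffles 32-bit values.
 */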
59 
60 
67 __inline__ __device__ double warpReduceSumD(double val)
68 {
69  /* Ranilla: 12-4-2019 */
70  /* warpSize is apparently a compile-time constant */
71  /* in PTX, but formally it is not, and an unknown */
72  /* constant prevents code optimization. So we     */
73  /* define sizeWarp in ../common/defines.h and use */
74  /* sizeWarp instead of the built-in warpSize.     */
75  for (int offset = sizeWarp/2; offset > 0; offset /= 2)
76  val += __shfl_downD(val, offset);
77  return val;
78 }
79 
80 
87 __inline__ __device__ float warpReduceSumS(float val)
88 {
89  for (int offset = sizeWarp/2; offset > 0; offset /= 2)
90  #ifdef CUDA9
91  val += __shfl_down_sync(0xffffffff, val, offset, sizeWarp);
92  #else
93  val += __shfl_down(val, offset, sizeWarp);
94  #endif
95 
96  return val;
97 }
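/* Usage sketch (illustrative; warpSumExample is a hypothetical kernel): each
 * lane contributes one value and, after log2(sizeWarp) down-shuffle steps,
 * lane 0 of the warp holds the total:
 *
 *   __global__ void warpSumExample(const float* __restrict__ in, float* __restrict__ out)
 *   {
 *       float v = in[threadIdx.x];         // one element per lane; launched as <<<1, sizeWarp>>>
 *       v = warpReduceSumS(v);             // tree reduction via __shfl_down
 *       if (threadIdx.x == 0) out[0] = v;  // only lane 0 has the full sum
 *   }
 */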
98 
99 
108 __global__ void kernel_InitDTW(MyType* __restrict__ pV, const int pos, const int size)
109 {
110  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
111 
112  if (tid < size)
113  {
114  if (tid==pos)
115  pV[tid]=0.0;
116  else
117  #ifdef SIMPLE
118  pV[tid]=FLT_MAX;
119  #else
120  pV[tid]=DBL_MAX;
121  #endif
122  }
123 }
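/* Launch sketch (illustrative; the block size is an assumption):
 *
 *   int threads = 256;
 *   int blocks  = (size + threads - 1) / threads;
 *   kernel_InitDTW<<<blocks, threads>>>(d_pV, pos, size);
 *
 * Every entry is set to FLT_MAX/DBL_MAX except d_pV[pos], the DTW starting
 * point, which is set to 0.
 */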
124 
135 __global__ void kernel_DTW(const MyType* __restrict__ Sequence, MyType* __restrict__ pD, const int NSeq,
136  const int Where, const int NST)
137 {
138  unsigned int j=threadIdx.x + blockIdx.x * blockDim.x;
139  unsigned int NSTplusNC, k, Pos;
140 
141  MyType d, d2;
142 
143  #ifdef SIMPLE
144  d=FLT_MAX;
145  #else
146  d=DBL_MAX;
147  #endif
148 
149  if (j<NST)
150  {
151  NSTplusNC = N_COSTS + NST;
152  Pos =((NSeq + N_COSTS) % TBLOCK) * NSTplusNC + N_COSTS + j - 1;
153  for(k=0; k<N_COSTS; k++)
154  {
155  d2 = Sequence[j]*CCosts[k]+pD[Pos-k];
156  if (d2 < d) d=d2;
157  }
158 
159  for (k=N_COSTS; k<T_COSTS; k++)
160  {
161  Pos=((NSeq + (T_COSTS-k)) % TBLOCK) * NSTplusNC + N_COSTS + j - 1;
162 
163  d2 = Sequence[j]*CCosts[k]+pD[Pos];
164 
165  if (d2 < d) d=d2;
166  }
167 
168  pD[Where+j] = d;
169  }
170 }
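/* Indexing note (inferred from the code): pD acts as a circular buffer of
 * TBLOCK rows, each with N_COSTS + NST entries whose first N_COSTS cells form
 * a left border, hence the "+ N_COSTS + j - 1" column offset. Rows are
 * addressed modulo TBLOCK, so only the last TBLOCK rows of the DTW matrix
 * stay resident on the GPU.
 */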
171 
172 
185 __global__ void kernel_InitSxD(MyType* __restrict__ odata, MyType* __restrict__ v_SxD, const MyType* __restrict__ v_dxState,
186  const int* __restrict__ I_SxD, const int blockSize, const bool SizeIsPow2, const int size)
187 {
188  extern __shared__ MyType sdata[];
189 
190  unsigned int tid = threadIdx.x;
191  unsigned int i = blockIdx.x*blockSize*2 + threadIdx.x;
192  unsigned int gridSize = blockSize*2*gridDim.x;
193 
194  MyType mySum=0.0, myData;
195 
196  while (i < size)
197  {
198  myData = v_SxD[i] = v_dxState[I_SxD[i]];
199  mySum += myData*myData;
200 
201  if (SizeIsPow2 || i + blockSize < size)
202  {
203  myData = v_SxD[i+blockSize] = v_dxState[I_SxD[i+blockSize]];
204  mySum += myData*myData;
205  }
206 
207  i += gridSize;
208  }
209  sdata[tid] = mySum;
210  __syncthreads();
211 
212 
213  /* Ranilla: 12-4-2019: New approach */
214  for (unsigned int j=maxThreads; j>=4*sizeWarp; j>>=1)
215  {
216  if ((blockSize >= j) && (tid < (j>>1)))
217  sdata[tid] = mySum = mySum + sdata[tid + (j>>1)];
218  __syncthreads();
219  }
220 
221  /* Ranilla: 12-4-2019: Old approach */
222  /*if ((blockSize >= 512) && (tid < 256))
223  sdata[tid] = mySum = mySum + sdata[tid + 256];
224  __syncthreads();
225 
226  if ((blockSize >= 256) &&(tid < 128))
227  sdata[tid] = mySum = mySum + sdata[tid + 128];
228  __syncthreads();
229 
230  if ((blockSize >= 128) && (tid < 64))
231  sdata[tid] = mySum = mySum + sdata[tid + 64];
232  __syncthreads();*/
233 
234  if (tid < sizeWarp)
235  {
236  if (blockSize >= 2*sizeWarp)
237  mySum += sdata[tid + sizeWarp];
238 
239  for (int offset = sizeWarp/2; offset > 0; offset /= 2)
240  #ifdef CUDA9
241  mySum += __shfl_down_sync(0xffffffff, mySum, offset);
242  #else
243  mySum += __shfl_down(mySum, offset);
244  #endif
245  }
246  if (tid == 0) odata[blockIdx.x] = mySum;
247 }
248 
249 
260 __global__ void kernel_Sum(MyType* __restrict__ odata, const MyType* __restrict__ idata,
261  const int blockSize, const bool SizeIsPow2, const int size)
262 {
263  extern __shared__ MyType sdata[];
264 
265  unsigned int tid = threadIdx.x;
266  unsigned int i = blockIdx.x*blockSize*2 + threadIdx.x;
267  unsigned int gridSize = blockSize*2*gridDim.x;
268 
269  MyType mySum=0.0;
270 
271  while (i < size)
272  {
273  mySum += idata[i];
274 
275  if (SizeIsPow2 || i + blockSize < size)
276  mySum += idata[i+blockSize];
277 
278  i += gridSize;
279  }
280  sdata[tid] = mySum;
281  __syncthreads();
282 
283  /* Ranilla: 12-4-2019: New approach */
284  for (unsigned int j=maxThreads; j>=4*sizeWarp; j>>=1)
285  {
286  if ((blockSize >= j) && (tid < (j>>1)))
287  sdata[tid] = mySum = mySum + sdata[tid + (j>>1)];
288  __syncthreads();
289  }
290 
291  /* Ranilla: 12-4-2019: Old approach */
292  /*if ((blockSize >= 512) && (tid < 256))
293  sdata[tid] = mySum = mySum + sdata[tid + 256];
294  __syncthreads();
295 
296  if ((blockSize >= 256) &&(tid < 128))
297  sdata[tid] = mySum = mySum + sdata[tid + 128];
298  __syncthreads();
299 
300  if ((blockSize >= 128) && (tid < 64))
301  sdata[tid] = mySum = mySum + sdata[tid + 64];
302  __syncthreads();*/
303 
304  if (tid < sizeWarp)
305  {
306  if (blockSize >= sizeWarp*2)
307  mySum += sdata[tid + sizeWarp];
308 
309  for (int offset = sizeWarp/2; offset > 0; offset /= 2)
310  #ifdef CUDA9
311  mySum += __shfl_down_sync(0xffffffff, mySum, offset);
312  #else
313  mySum += __shfl_down(mySum, offset);
314  #endif
315  }
316  if (tid == 0) odata[blockIdx.x] = mySum;
317 }
318 
319 
326 __global__ void kernel_Vnorm(MyType* __restrict__ odata)
327 {
328  #ifdef SIMPLE
329  odata[0] = 1.0f / (sqrtf(odata[0]) + FLT_EPSILON);
330  #else
331  odata[0] = 1.0 / ( sqrt(odata[0]) + DBL_EPSILON);
332  #endif
333 }
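/* Pipeline sketch (illustrative; d_partial, d_norm, smem, ... are placeholder
 * names): the inverse norm consumed by kernel_UpdateSxD is produced in three
 * steps:
 *
 *   size_t smem = threads * sizeof(MyType);   // extern __shared__ MyType sdata[]
 *   kernel_InitSxD<<<blocks, threads, smem>>>(d_partial, d_SxD, d_dxState,
 *                                             d_ISxD, threads, isPow2, size);
 *   kernel_Sum    <<<1, threads, smem>>>(d_norm, d_partial, threads,
 *                                        isPow2b, blocks);
 *   kernel_Vnorm  <<<1, 1>>>(d_norm);  // d_norm[0] = 1/(sqrt(sum) + epsilon)
 */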
334 
345 __global__ void kernel_ApplyWindow(MyType* __restrict__ X_fft, const short* __restrict__ frame,
346  const MyType* __restrict__ v_hanning, const int TTRA, const int NFFT)
347 {
348  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
349 
350  if (tid < NFFT)
351  X_fft[tid] = (tid < TTRA) ? (MyType)frame[tid] * Scaling * v_hanning[tid] : 0.0;
352 }
353 
354 
364 __global__ void kernel_UpdateSxD(MyType* __restrict__ dest, const MyType ALPHA, const MyType* __restrict__ norm,
365  const int size)
366 {
367  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
368 
369  if (tid < size)
370  #ifdef SIMPLE
371  dest[tid] = 1.0f - expf(ALPHA*fabsf(dest[tid]*norm[0]));
372  #else
373  dest[tid] = 1.0 - exp(ALPHA* fabs(dest[tid]*norm[0]));
374  #endif
375 }
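/* Note: norm[0] holds 1/(||SxD|| + epsilon), as left by kernel_Vnorm, so each
 * entry becomes dest[i] = 1 - exp(ALPHA * |dest[i]| / ||SxD||), which lies in
 * [0, 1) whenever ALPHA < 0.
 */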
376 
377 
386 __global__ void kernel_CompNorB0(MyType* __restrict__ norms, const MyType value, const int size)
387 {
388  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
389 
390  if (tid < size)
391  norms[tid]=value;
392 }
393 
394 
404 __global__ void kernel_CompNorB1(MyType* __restrict__ norms, const MyType* __restrict__ s_fk,
405  const int NMIDI, const int size)
406 {
407  unsigned int i = blockIdx.x * blockDim.y + threadIdx.y;
408  unsigned int j;
409  unsigned int stride = i*N_MIDI_PAD;
410  MyType a;
411 
412  if (i<size)
413  {
414  a=0.0;
415  for(j=threadIdx.x; j<NMIDI; j+=sizeWarp)
416  a += s_fk[stride+j];
417 
418  #ifdef SIMPLE
419  a = warpReduceSumS(a);
420  #else
421  a = warpReduceSumD(a);
422  #endif
423 
424  if (threadIdx.x==0) norms[i]=a;
425  }
426 }
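/* Launch-geometry sketch (illustrative; 8 rows per block is an assumption):
 * one warp processes one row of s_fk, so blockDim.x must be sizeWarp and
 * blockDim.y sets the number of rows handled per block:
 *
 *   dim3 block(sizeWarp, 8);
 *   dim3 grid((size + block.y - 1) / block.y);
 *   kernel_CompNorB1<<<grid, block>>>(d_norms, d_sfk, NMIDI, size);
 */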
427 
428 
440 __global__ void kernel_CompNorBG(MyType* __restrict__ norms, MyType* __restrict__ ts_fk,
441  const MyType* __restrict__ s_fk, const int NMIDI, const MyType BETA, const int size)
442 {
443  unsigned int i = blockIdx.x * blockDim.y + threadIdx.y;
444  unsigned int j;
445  unsigned int stride = i*N_MIDI_PAD;
446  MyType a,b;
447 
448  if (i<size)
449  {
450  #ifdef SIMPLE
451  a=0.0f;
452  for(j=threadIdx.x; j<NMIDI; j+=sizeWarp)
453  {
454  ts_fk[stride+j] = b = powf(s_fk[stride+j], BETA - 1.0f);
455  a += b*s_fk[stride+j];
456  }
457  a=warpReduceSumS(a);
458  #else
459  a=0.0;
460  for(j=threadIdx.x; j<NMIDI; j+=sizeWarp)
461  {
462  ts_fk[stride+j] = b = pow(s_fk[stride+j], BETA - 1.0);
463  a += b*s_fk[stride+j];
464  }
465  a=warpReduceSumD(a);
466  #endif
467 
468  if (threadIdx.x==0) norms[i]=a;
469  }
470 }
471 
481 __global__ void kernel_PowToReal(MyType* __restrict__ dest, const MyType* __restrict__ src, const MyType ex, const int size)
482 {
483  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
484  if (tid < size)
485  {
486  #ifdef SIMPLE
487  dest[tid]=powf(src[tid], ex);
488  #else
489  dest[tid]= pow(src[tid], ex);
490  #endif
491  }
492 }
493 
494 
503 __global__ void kernel_Modul(MyType* __restrict__ dest, const MyType* __restrict__ src, const int size)
504 {
505  unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
506  unsigned int stride = tid * 2;
507 
508  MyType tmp1, tmp2;
509 
510  if (tid <= size)
511  {
512  tmp1 = src[stride];
513  tmp2 = src[stride + 1];
514 
515  dest[tid]=tmp1*tmp1 + tmp2*tmp2;
516  }
517 }
518 
519 
527 __global__ void kernel_Cfreq(MyType* __restrict__ dest, const MyType* __restrict__ src)
528 {
529  unsigned int i = blockIdx.x;
530  unsigned int j = threadIdx.x;
531 
532  MyType tmp = 0.0;
533  for (unsigned int k=Ckmin_fft[i]+j; k<=Ckmax_fft[i]; k+=sizeWarp) {
534  tmp += src[k];
535  }
536 
537  #ifdef SIMPLE
538  tmp = warpReduceSumS(tmp);
539  #else
540  tmp = warpReduceSumD(tmp);
541  #endif
542 
543  if (j==0) {
544  #ifdef SIMPLE
545  dest[i] = sqrtf(tmp);
546  #else
547  dest[i] = sqrt(tmp);
548  #endif
549  }
550 }
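/* Usage note: one warp per frequency band; block i reduces src over the band
 * limits kept in the constant-memory arrays Ckmin_fft[i] and Ckmax_fft[i].
 * Illustrative launch, with nBands a placeholder for the number of bands:
 *
 *   kernel_Cfreq<<<nBands, sizeWarp>>>(d_vcfreq, d_modul);
 */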
551 
552 
560 __global__ void kernel_Reduction(MyType* __restrict__ dest, const int size)
561 {
562  unsigned int tid = threadIdx.x;
563  unsigned int j;
564 
565  MyType a=0.0;
566 
567  for(j=tid; j<size; j+=sizeWarp) a += dest[j];
568 
569  #ifdef SIMPLE
570  a = warpReduceSumS(a);
571  #else
572  a = warpReduceSumD(a);
573  #endif
574 
575  if (tid==0) dest[size]=a;
576 }
577 
586 __global__ void kernel_ReductionPowBeta(MyType* __restrict__ dest, const MyType BETA, const int size)
587 {
588  unsigned int tid = threadIdx.x;
589  unsigned int j;
590 
591  MyType a=0.0;
592 
593  for(j=tid; j<size; j+=sizeWarp)
594  #ifdef SIMPLE
595  a += powf(dest[j], BETA);
596  #else
597  a += pow (dest[j], BETA);
598  #endif
599 
600  #ifdef SIMPLE
601  a = warpReduceSumS(a);
602  if (tid==0) dest[size]=powf(a, 1.0/BETA);
603  #else
604  a = warpReduceSumD(a);
605  if (tid==0) dest[size]=pow(a, 1.0/BETA);
606  #endif
607 }
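/* Note: this leaves the beta-norm dest[size] = (sum_j dest[j]^BETA)^(1/BETA)
 * in the extra cell, which kernel_BetaNorm later uses as the divisor for the
 * first "size" entries.
 */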
608 
609 
621 /* maxThreads is defined in ../common/defines.h as 512 */
622 __global__ void __launch_bounds__(maxThreads, 4)
623 kernel_CompDisB0(MyType* __restrict__ dest, const MyType* __restrict__ v_cfreq, const MyType* __restrict__ norms,
624  const MyType* __restrict__ s_fk, const int NMIDI, const int size)
625 {
626  unsigned int i = blockIdx.x * blockDim.y + threadIdx.y;
627  unsigned int j;
628  unsigned int stride = i * N_MIDI_PAD;
629  unsigned int th_row = threadIdx.y;
630  unsigned int th_col = threadIdx.x;
631  unsigned int row = i + threadIdx.x; /* This is useful only for the first row */
632  bool guard = th_row == 0 && row < size && th_col < blockDim.y;
633  MyType a, b, tmp1;
634 
635  __shared__ MyType sh[sizeWarp];
636 
637  if (i < size)
638  {
639  a=0.0;
640  for(j=th_col; j<NMIDI; j+=sizeWarp) {
641  a += v_cfreq[j] / s_fk[stride+j];
642  }
643 
644  #ifdef SIMPLE
645  a = warpReduceSumS(a);
646  #else
647  a = warpReduceSumD(a);
648  #endif
649 
650  if(guard) {
651  sh[th_col] = norms[row];
652  }
653  __syncthreads();
654 
655  if (th_col == 0)
656  b=a/sh[th_row];
657 
658  #ifdef CUDA9
659  b = __shfl_sync(0xffffffff, b, 0);
660  #else
661  b = __shfl(b, 0);
662  #endif
663 
664  a=0.0;
665  for(j=th_col; j<NMIDI; j+=sizeWarp)
666  {
667  tmp1 = v_cfreq[j] / (s_fk[stride + j] * b);
668  #ifdef SIMPLE
669  a += tmp1 - logf(tmp1) - 1.0f;
670  #else
671  a += tmp1 - log(tmp1) - 1.0;
672  #endif
673  }
674 
675  #ifdef SIMPLE
676  a = warpReduceSumS(a);
677  #else
678  a = warpReduceSumD(a);
679  #endif
680 
681  if(th_col == 0) {
682  sh[th_row] = a;
683  }
684  __syncthreads();
685 
686  if(guard) {
687  dest[row] = sh[th_col];
688  }
689  }
690 }
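/* Note: the accumulated term tmp1 - log(tmp1) - 1, with tmp1 the ratio between
 * the observed spectrum and the scaled pattern, is the Itakura-Saito
 * divergence, i.e. the beta-divergence in the limit BETA -> 0.
 */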
691 
692 
704 /* maxThreads is defined in ../common/defines.h as 512 */
705 __global__ void __launch_bounds__(maxThreads, 4)
706 kernel_CompDisB1(MyType* __restrict__ dest, const MyType* __restrict__ v_cfreq, const MyType* __restrict__ norms,
707  const MyType* __restrict__ s_fk, const int NMIDI, const int size)
708 {
709  unsigned int i = blockIdx.x * blockDim.y + threadIdx.y;
710  unsigned int j;
711  unsigned int stride = i * N_MIDI_PAD;
712  unsigned int th_row = threadIdx.y;
713  unsigned int th_col = threadIdx.x;
714  unsigned int row = i + threadIdx.x; /* This is useful only for the first row */
715  bool guard = th_row == 0 && row < size && th_col < blockDim.y;
716  MyType a, tmp1, tmp2, tmp3;
717 
718  __shared__ MyType sh[sizeWarp];
719 
720  if (i < size)
721  {
722  if(guard) {
723  sh[th_col] = v_cfreq[NMIDI] / norms[row];
724  }
725  __syncthreads();
726 
727  tmp1=sh[th_row];
728 
729  a=0.0;
730  for(j=th_col; j<NMIDI; j+=sizeWarp) {
731  tmp2 = s_fk[stride+j] * tmp1;
732  tmp3 = v_cfreq[j];
733  #ifdef SIMPLE
734  a += tmp3*logf(tmp3/tmp2) + tmp2 - tmp3;
735  #else
736  a += tmp3* log(tmp3/tmp2) + tmp2 - tmp3;
737  #endif
738  }
739 
740  #ifdef SIMPLE
741  a = warpReduceSumS(a);
742  #else
743  a = warpReduceSumD(a);
744  #endif
745 
746  if(th_col == 0) {
747  sh[th_row] = a;
748  }
749  __syncthreads();
750 
751  if(guard) {
752  dest[row] = sh[th_col];
753  }
754  }
755 }
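/* Note: the accumulated term x*log(x/y) + y - x, with x = v_cfreq[j] and
 * y = s_fk[stride+j]*tmp1, is the generalized Kullback-Leibler divergence,
 * i.e. the beta-divergence for BETA = 1.
 */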
756 
757 
772 /* maxThreads is defined in ../common/defines.h as 512 */
773 __global__ void __launch_bounds__(maxThreads, 4)
774 kernel_CompDisBG(MyType* __restrict__ dest, const MyType* __restrict__ v_cfreq,
775  const MyType* __restrict__ norms, const MyType* __restrict__ s_fk,
776  const MyType* __restrict__ ts_fk, const MyType* __restrict__ tauxi,
777  const MyType BETA, const int NMIDI, const int size)
778 {
779  unsigned int i = blockIdx.x * blockDim.y + threadIdx.y;
780  unsigned int j, k;
781  unsigned int stride = i * N_MIDI_PAD;
782  unsigned int th_row = threadIdx.y;
783  unsigned int th_col = threadIdx.x;
784  unsigned int row = i + threadIdx.x; /* This is useful only for the first row */
785 
786  bool guard = th_row == 0 && row < size && th_col < blockDim.y;
787  MyType a, b, tmp1, tmp2;
788  MyType beta1 = BETA-1.0;
789  MyType tmp3 = (1.0 / (BETA*(BETA-1.0)));
790 
791  __shared__ MyType sh_a[sizeWarp/2], sh_b[sizeWarp/2];
792 
793  if (i < size)
794  {
795  a=0.0;
796  for(j=th_col, k=stride+th_col; j<NMIDI; j+=sizeWarp, k+=sizeWarp) {
797  a += v_cfreq[j] * ts_fk[k];
798  }
799 
800  #ifdef SIMPLE
801  a = warpReduceSumS(a);
802  #else
803  a = warpReduceSumD(a);
804  #endif
805 
806  if (th_col == 0) {
807  sh_a[th_row] = a;
808  }
809  __syncthreads();
810 
811  if(guard) {
812  a = sh_a[th_col] / norms[row];
813  #ifdef SIMPLE
814  b = powf(a, beta1);
815  #else
816  b = pow(a, beta1);
817  #endif
818  sh_b[th_col] = BETA * b;
819  sh_a[th_col] = b * a * beta1;
820  }
821  __syncthreads();
822 
823  tmp1 = sh_b[th_row];
824  tmp2 = sh_a[th_row];
825 
826  /* Ranilla: 12-4-2019: New approach */
827  j = th_col;
828  k = stride+th_col;
829  a = 0.0;
830  for (unsigned int s=sizeWarp; s<N_MIDI_PAD; s+=sizeWarp,j+=sizeWarp,k+=sizeWarp) {
831  a += ((tauxi[j] + ts_fk[k] * (s_fk[k] * tmp2 - v_cfreq[j] * tmp1)) * tmp3);
832  }
833  /* Here j is from 96 to 127, but NMIDI is only 114 and N_MIDI_PAD       */
834  /* is 128. Thereby, only threads 0 to 17 can execute the next statement. */
835  if (th_col < (sizeWarp - (N_MIDI_PAD - NMIDI)))
836  a += ((tauxi[j] + ts_fk[k] * (s_fk[k] * tmp2 - v_cfreq[j] * tmp1)) * tmp3);
837 
838  /* Ranilla: 12-4-2019: Old approach */
839  /*j = th_col;
840  k = stride+th_col;
841  a = ((tauxi[j] + ts_fk[k] * (s_fk[k] * tmp2 - v_cfreq[j] * tmp1)) * tmp3);
842  j += sizeWarp;
843  k += sizeWarp;
844  a += ((tauxi[j] + ts_fk[k] * (s_fk[k] * tmp2 - v_cfreq[j] * tmp1)) * tmp3);
845  j += sizeWarp;
846  k += sizeWarp;
847  a += ((tauxi[j] + ts_fk[k] * (s_fk[k] * tmp2 - v_cfreq[j] * tmp1)) * tmp3);
848  j += sizeWarp;
849  k += sizeWarp;
850  if(th_col<18) {
851  a += ((tauxi[j] + ts_fk[k] * (s_fk[k] * tmp2 - v_cfreq[j] * tmp1)) * tmp3);
852  }*/
853 
854  #ifdef SIMPLE
855  a = warpReduceSumS(a);
856  #else
857  a = warpReduceSumD(a);
858  #endif
859 
860  if(th_col == 0) {
861  sh_a[th_row] = a;
862  }
863  __syncthreads();
864 
865  if(guard) {
866  dest[row] = sh_a[th_col];
867  }
868  }
869 }
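/* Note (assuming tauxi[j] holds v_cfreq[j]^BETA, cf. kernel_PowToReal): each
 * accumulated term expands to
 *   (x^BETA + (BETA-1)*y^BETA - BETA*x*y^(BETA-1)) / (BETA*(BETA-1)),
 * with x = v_cfreq[j] and y the scaled pattern, i.e. the general
 * beta-divergence for BETA != 0, 1.
 */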
870 
871 
880 __global__ void kernel_Shift(short* __restrict__ frame, const int TTRAMA, const int TMUEST)
881 {
882  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
883  unsigned int i, tmp;
884 
885  for (i=0; i<(TTRAMA/TMUEST - 1); i++)
886  {
887  tmp=tid+i*TMUEST;
888  frame[tmp]=frame[tmp+TMUEST];
889  __syncthreads();
890  }
891 }
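/* Behaviour sketch: with TTRAMA = 4*TMUEST and one thread per position tid in
 * [0, TMUEST), the loop moves each TMUEST-sized segment one slot to the left:
 *
 *   before: | s0 | s1 | s2 | s3 |
 *   after : | s1 | s2 | s3 | s3 |   (the tail is later refilled with new samples)
 *
 * Each thread only touches indices congruent to tid modulo TMUEST, so there
 * are no cross-thread dependencies between iterations.
 */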
892 
893 
901 __global__ void kernel_BetaNorm(MyType* __restrict__ vector, const int size)
902 {
903  unsigned int tid = threadIdx.x;
904 
905  /* The previous call to kernel_Reduction / kernel_ReductionPowBeta puts in vector[size] the reduction value */
906  MyType value=vector[size];
907 
908  vector[tid] = vector[tid] / value;
909 }
910 
911 
923 __global__ void kernel_OneImin(MyType* __restrict__ odata, int* __restrict__ opos, const MyType* __restrict__ idata,
924  const int blockSize, const bool SizeIsPow2, const int size)
925 {
926  extern __shared__ MyType ss[];
927 
928  MyType *sdata=ss;
929  int *pdata=(int *)&sdata[blockSize];
930 
931  int tid = threadIdx.x;
932  int i = blockIdx.x*blockSize*2 + threadIdx.x;
933  int gSize = blockSize*2*gridDim.x;
934  int myPos, tmpPos;
935 
936  #ifdef SIMPLE
937  MyType myMin=FLT_MAX, tmpMin=FLT_MAX;
938  #else
939  MyType myMin=DBL_MAX, tmpMin=DBL_MAX;
940  #endif
941 
942  while (i < size)
943  {
944  myMin=idata[i];
945  myPos=i;
946 
947  if (SizeIsPow2 || i + blockSize < size)
948  if (idata[i+blockSize] < myMin) {
949  myMin=idata[i+blockSize];
950  myPos=i+blockSize;
951  }
952  i += gSize;
953  }
954  sdata[tid]=myMin;
955  pdata[tid]=myPos;
956  __syncthreads();
957 
958  for (unsigned int s=maxThreads/2; s>=2*sizeWarp; s>>=1) // s/=2 is equal to s>>=1
959  {
960  if ((blockSize >= 2*s) && (tid < s))
961  if (sdata[tid + s] < myMin) {
962  sdata[tid]=myMin=sdata[tid+s];
963  pdata[tid]=myPos=pdata[tid+s];
964  }
965  __syncthreads();
966  }
967 
968  if (tid < sizeWarp)
969  {
970  if ((blockSize >= 2*sizeWarp) && (sdata[tid + sizeWarp] < myMin)) {
971  myMin=sdata[tid+sizeWarp];
972  myPos=pdata[tid+sizeWarp];
973  }
974 
975  for (int offset = sizeWarp/2; offset > 0; offset>>=1) // offset/=2 is equal to offset>>=1
976  {
977  #ifdef CUDA9
978  #ifdef SIMPLE
979  tmpMin = __shfl_down_sync(0xffffffff, myMin, offset, sizeWarp);
980  #else
981  tmpMin = __shfl_downD(myMin, offset);
982  #endif
983  tmpPos = __shfl_down_sync(0xffffffff, myPos, offset, sizeWarp);
984  #else
985  #ifdef SIMPLE
986  tmpMin = __shfl_down(myMin, offset, sizeWarp);
987  #else
988  tmpMin = __shfl_downD(myMin, offset);
989  #endif
990  tmpPos = __shfl_down(myPos, offset, sizeWarp);
991  #endif
992 
993  if (tmpMin < myMin) { myMin=tmpMin; myPos=tmpPos; }
994  }
995  }
996  if (tid == 0) { odata[blockIdx.x]=myMin; opos[blockIdx.x]=myPos; }
997 }
998 
999 
1012 __global__ void kernel_OneIminLast(MyType* __restrict__ odata, int* __restrict__ opos, const MyType* __restrict__ idata,
1013  const int* __restrict__ ipos, const int blockSize, const bool SizeIsPow2, const int size)
1014 {
1015  extern __shared__ MyType ss[];
1016  MyType *sdata=ss;
1017  int *pdata=(int *)&sdata[blockSize];
1018 
1019  int tid = threadIdx.x;
1020  int i = blockIdx.x*blockSize*2 + threadIdx.x;
1021  int gSize = blockSize*2*gridDim.x;
1022  int myPos, tmpPos;
1023 
1024  #ifdef SIMPLE
1025  MyType myMin=FLT_MAX, tmpMin=FLT_MAX;
1026  #else
1027  MyType myMin=DBL_MAX, tmpMin=DBL_MAX;
1028  #endif
1029 
1030  while (i < size)
1031  {
1032  myMin=idata[i];
1033  myPos=ipos[i];
1034 
1035  if (SizeIsPow2 || i + blockSize < size)
1036  if (idata[i+blockSize] < myMin) {
1037  myMin=idata[i+blockSize];
1038  myPos=ipos[i+blockSize];
1039  }
1040  i += gSize;
1041  }
1042  sdata[tid]=myMin;
1043  pdata[tid]=myPos;
1044  __syncthreads();
1045 
1046  for (unsigned int s=maxThreads/2; s>=2*sizeWarp; s>>=1) // s/=2 is equal to s>>=1
1047  {
1048  if ((blockSize >= 2*s) && (tid < s))
1049  if (sdata[tid + s] < myMin) {
1050  sdata[tid]=myMin=sdata[tid+s];
1051  pdata[tid]=myPos=pdata[tid+s];
1052  }
1053  __syncthreads();
1054  }
1055 
1056  if (tid < sizeWarp)
1057  {
1058  if ((blockSize >= 2*sizeWarp) && (sdata[tid + sizeWarp] < myMin)) {
1059  myMin=sdata[tid+sizeWarp];
1060  myPos=pdata[tid+sizeWarp];
1061  }
1062 
1063  for (int offset = blockSize/2; offset > 0; offset>>=1) // offset/=2 is equal to offset>>=1
1064  {
1065  #ifdef CUDA9
1066  #ifdef SIMPLE
1067  tmpMin = __shfl_down_sync(0xffffffff, myMin, offset, sizeWarp);
1068  #else
1069  tmpMin = __shfl_downD(myMin, offset);
1070  #endif
1071  tmpPos = __shfl_down_sync(0xffffffff, myPos, offset, sizeWarp);
1072  #else
1073  #ifdef SIMPLE
1074  tmpMin = __shfl_down(myMin, offset, sizeWarp);
1075  #else
1076  tmpMin = __shfl_downD(myMin, offset);
1077  #endif
1078  tmpPos = __shfl_down(myPos, offset, sizeWarp);
1079  #endif
1080 
1081  if (tmpMin < myMin) { myMin=tmpMin; myPos=tmpPos; }
1082  }
1083  }
1084  if (tid == 0) { odata[blockIdx.x]=myMin; opos[blockIdx.x]=myPos; }
1085 }
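/* Two-pass sketch (illustrative; buffer names are placeholders): kernel_OneImin
 * emits one (min, pos) candidate per block and kernel_OneIminLast reduces the
 * candidates with a single block:
 *
 *   size_t smem = threads * (sizeof(MyType) + sizeof(int));
 *   kernel_OneImin    <<<blocks, threads, smem>>>(d_mins, d_pos, d_in,
 *                                                 threads, isPow2, size);
 *   kernel_OneIminLast<<<1, threads, smem>>>(d_min, d_minPos, d_mins, d_pos,
 *                                            threads, isPow2b, blocks);
 *
 * kernel_FirstImin/kernel_LastImin follow the same pattern with tie-breaking
 * towards the lowest/highest position.
 */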
1086 
1087 
1099 __global__ void kernel_FirstImin(MyType* __restrict__ odata, int* __restrict__ opos, const MyType* __restrict__ idata,
1100  const int blockSize, const bool SizeIsPow2, const int size)
1101 {
1102  extern __shared__ MyType ss[];
1103 
1104  MyType *sdata=ss;
1105  int *pdata=(int *)&sdata[blockSize];
1106 
1107  int tid = threadIdx.x;
1108  int i = blockIdx.x*blockSize*2 + threadIdx.x;
1109  int gSize = blockSize*2*gridDim.x;
1110  int myPos, tmpPos;
1111 
1112  #ifdef SIMPLE
1113  MyType myMin=FLT_MAX, tmpMin=FLT_MAX;
1114  #else
1115  MyType myMin=DBL_MAX, tmpMin=DBL_MAX;
1116  #endif
1117 
1118  while (i < size)
1119  {
1120  myMin=idata[i];
1121  myPos=i;
1122 
1123  if (SizeIsPow2 || i + blockSize < size)
1124  if (idata[i+blockSize] < myMin) {
1125  myMin=idata[i+blockSize];
1126  myPos=i+blockSize;
1127  }
1128  i += gSize;
1129  }
1130  sdata[tid]=myMin;
1131  pdata[tid]=myPos;
1132  __syncthreads();
1133 
1134  for (unsigned int s=maxThreads/2; s>=2*sizeWarp; s>>=1) // s/=2 is equal to s>>=1
1135  {
1136  if ((blockSize >= 2*s) && (tid < s))
1137  if ((sdata[tid + s] < myMin) || ((sdata[tid + s] == myMin) && (pdata[tid + s] < myPos))) {
1138  sdata[tid]=myMin=sdata[tid+s];
1139  pdata[tid]=myPos=pdata[tid+s];
1140  }
1141  __syncthreads();
1142  }
1143 
1144  if (tid < sizeWarp)
1145  {
1146  if ((blockSize >= 2*sizeWarp) &&
1147  ((sdata[tid + sizeWarp] < myMin) || ((sdata[tid + sizeWarp] == myMin) && (pdata[tid + sizeWarp] < myPos)))) {
1148  myMin=sdata[tid+sizeWarp];
1149  myPos=pdata[tid+sizeWarp];
1150  }
1151 
1152  for (int offset = sizeWarp/2; offset > 0; offset>>=1) // offset/=2 is equal to offset>>=1
1153  {
1154  #ifdef CUDA9
1155  #ifdef SIMPLE
1156  tmpMin = __shfl_down_sync(0xffffffff, myMin, offset, sizeWarp);
1157  #else
1158  tmpMin = __shfl_downD(myMin, offset);
1159  #endif
1160  tmpPos = __shfl_down_sync(0xffffffff, myPos, offset, sizeWarp);
1161  #else
1162  #ifdef SIMPLE
1163  tmpMin = __shfl_down(myMin, offset, sizeWarp);
1164  #else
1165  tmpMin = __shfl_downD(myMin, offset);
1166  #endif
1167  tmpPos = __shfl_down(myPos, offset, sizeWarp);
1168  #endif
1169 
1170  if ((tmpMin < myMin) || ((tmpMin == myMin) && (tmpPos < myPos))) {
1171  myMin=tmpMin;
1172  myPos=tmpPos;
1173  }
1174  }
1175  }
1176  if (tid == 0) { odata[blockIdx.x]=myMin; opos[blockIdx.x]=myPos; }
1177 }
1178 
1179 
1192 __global__ void kernel_FirstIminLast(MyType* __restrict__ odata, int* __restrict__ opos, const MyType* __restrict__ idata,
1193  const int* __restrict__ ipos, const int blockSize, const bool SizeIsPow2, const int size)
1194 {
1195  extern __shared__ MyType ss[];
1196  MyType *sdata=ss;
1197  int *pdata=(int *)&sdata[blockSize];
1198 
1199  int tid = threadIdx.x;
1200  int i = blockIdx.x*blockSize*2 + threadIdx.x;
1201  int gSize = blockSize*2*gridDim.x;
1202  int myPos, tmpPos;
1203 
1204  #ifdef SIMPLE
1205  MyType myMin=FLT_MAX, tmpMin=FLT_MAX;
1206  #else
1207  MyType myMin=DBL_MAX, tmpMin=DBL_MAX;
1208  #endif
1209 
1210  while (i < size)
1211  {
1212  myMin=idata[i];
1213  myPos=ipos[i];
1214 
1215  if (SizeIsPow2 || i + blockSize < size)
1216  if (idata[i+blockSize] < myMin) {
1217  myMin=idata[i+blockSize];
1218  myPos=ipos[i+blockSize];
1219  }
1220  i += gSize;
1221  }
1222  sdata[tid]=myMin;
1223  pdata[tid]=myPos;
1224  __syncthreads();
1225 
1226  for (unsigned int s=maxThreads/2; s>=2*sizeWarp; s>>=1) // s/=2 is equal to s>>=1
1227  {
1228  if ((blockSize >= 2*s) && (tid < s))
1229  if ((sdata[tid + s] < myMin) || ((sdata[tid + s] == myMin) && (pdata[tid + s] < myPos))) {
1230  sdata[tid]=myMin=sdata[tid+s];
1231  pdata[tid]=myPos=pdata[tid+s];
1232  }
1233  __syncthreads();
1234  }
1235 
1236  if (tid < sizeWarp)
1237  {
1238  if ((blockSize >= 2*sizeWarp) &&
1239  ((sdata[tid + sizeWarp] < myMin) || ((sdata[tid + sizeWarp] == myMin) && (pdata[tid + sizeWarp] < myPos)))) {
1240  myMin=sdata[tid+sizeWarp];
1241  myPos=pdata[tid+sizeWarp];
1242  }
1243 
1244  for (int offset = blockSize/2; offset > 0; offset>>=1) // offset/=2 is equal to offset>>=1
1245  {
1246  #ifdef CUDA9
1247  #ifdef SIMPLE
1248  tmpMin = __shfl_down_sync(0xffffffff, myMin, offset, sizeWarp);
1249  #else
1250  tmpMin = __shfl_downD(myMin, offset);
1251  #endif
1252  tmpPos = __shfl_down_sync(0xffffffff, myPos, offset, sizeWarp);
1253  #else
1254  #ifdef SIMPLE
1255  tmpMin = __shfl_down(myMin, offset, sizeWarp);
1256  #else
1257  tmpMin = __shfl_downD(myMin, offset);
1258  #endif
1259  tmpPos = __shfl_down(myPos, offset, sizeWarp);
1260  #endif
1261 
1262  if ((tmpMin < myMin) || ((tmpMin == myMin) && (tmpPos < myPos))) {
1263  myMin=tmpMin;
1264  myPos=tmpPos;
1265  }
1266  }
1267  }
1268  if (tid == 0) { odata[blockIdx.x]=myMin; opos[blockIdx.x]=myPos; }
1269 }
1270 
1271 
1283 __global__ void kernel_LastImin(MyType* __restrict__ odata, int* __restrict__ opos, const MyType* __restrict__ idata,
1284  const int blockSize, const bool SizeIsPow2, const int size)
1285 {
1286  extern __shared__ MyType ss[];
1287 
1288  MyType *sdata=ss;
1289  int *pdata=(int *)&sdata[blockSize];
1290 
1291  int tid = threadIdx.x;
1292  int i = blockIdx.x*blockSize*2 + threadIdx.x;
1293  int gSize = blockSize*2*gridDim.x;
1294  int myPos, tmpPos;
1295 
1296  #ifdef SIMPLE
1297  MyType myMin=FLT_MAX, tmpMin=FLT_MAX;
1298  #else
1299  MyType myMin=DBL_MAX, tmpMin=DBL_MAX;
1300  #endif
1301 
1302  while (i < size)
1303  {
1304  myMin=idata[i];
1305  myPos=i;
1306 
1307  if (SizeIsPow2 || i + blockSize < size)
1308  if (idata[i+blockSize] <= myMin) {
1309  myMin=idata[i+blockSize];
1310  myPos=i+blockSize;
1311  }
1312  i += gSize;
1313  }
1314  sdata[tid]=myMin;
1315  pdata[tid]=myPos;
1316  __syncthreads();
1317 
1318  for (unsigned int s=maxThreads/2; s>=2*sizeWarp; s>>=1) // s/=2 is equal to s>>=1
1319  {
1320  if ((blockSize >= 2*s) && (tid < s))
1321  if ((sdata[tid + s] < myMin) || ((sdata[tid + s] == myMin) && (pdata[tid + s] > myPos))) {
1322  sdata[tid]=myMin=sdata[tid+s];
1323  pdata[tid]=myPos=pdata[tid+s];
1324  }
1325  __syncthreads();
1326  }
1327 
1328  if (tid < sizeWarp)
1329  {
1330  if ((blockSize >= 2*sizeWarp) &&
1331  ((sdata[tid + sizeWarp] < myMin) || ((sdata[tid + sizeWarp] == myMin) && (pdata[tid + sizeWarp] > myPos)))) {
1332  myMin=sdata[tid+sizeWarp];
1333  myPos=pdata[tid+sizeWarp];
1334  }
1335 
1336  for (int offset = sizeWarp/2; offset > 0; offset>>=1) // offset/=2 is equal to offset>>=1
1337  {
1338  #ifdef CUDA9
1339  #ifdef SIMPLE
1340  tmpMin = __shfl_down_sync(0xffffffff, myMin, offset, sizeWarp);
1341  #else
1342  tmpMin = __shfl_downD(myMin, offset);
1343  #endif
1344  tmpPos = __shfl_down_sync(0xffffffff, myPos, offset, sizeWarp);
1345  #else
1346  #ifdef SIMPLE
1347  tmpMin = __shfl_down(myMin, offset, sizeWarp);
1348  #else
1349  tmpMin = __shfl_downD(myMin, offset);
1350  #endif
1351  tmpPos = __shfl_down(myPos, offset, sizeWarp);
1352  #endif
1353 
1354  if ((tmpMin < myMin) || ((tmpMin == myMin) && (tmpPos > myPos))) {
1355  myMin=tmpMin;
1356  myPos=tmpPos;
1357  }
1358  }
1359  }
1360  if (tid == 0) { odata[blockIdx.x]=myMin; opos[blockIdx.x]=myPos; }
1361 }
1362 
1363 
1376 __global__ void kernel_LastIminLast(MyType* __restrict__ odata, int* __restrict__ opos, const MyType* __restrict__ idata,
1377  const int* __restrict__ ipos, const int blockSize, const bool SizeIsPow2, const int size)
1378 {
1379  extern __shared__ MyType ss[];
1380  MyType *sdata=ss;
1381  int *pdata=(int *)&sdata[blockSize];
1382 
1383  int tid = threadIdx.x;
1384  int i = blockIdx.x*blockSize*2 + threadIdx.x;
1385  int gSize = blockSize*2*gridDim.x;
1386  int myPos, tmpPos;
1387 
1388  #ifdef SIMPLE
1389  MyType myMin=FLT_MAX, tmpMin=FLT_MAX;
1390  #else
1391  MyType myMin=DBL_MAX, tmpMin=DBL_MAX;
1392  #endif
1393 
1394  while (i < size)
1395  {
1396  myMin=idata[i];
1397  myPos=ipos[i];
1398 
1399  if (SizeIsPow2 || i + blockSize < size)
1400  if (idata[i+blockSize] <= myMin) {
1401  myMin=idata[i+blockSize];
1402  myPos=ipos[i+blockSize];
1403  }
1404  i += gSize;
1405  }
1406  sdata[tid]=myMin;
1407  pdata[tid]=myPos;
1408  __syncthreads();
1409 
1410  for (unsigned int s=maxThreads/2; s>=2*sizeWarp; s>>=1) // s/=2 is equal to s>>=1
1411  {
1412  if ((blockSize >= 2*s) && (tid < s))
1413  if ((sdata[tid + s] < myMin) || ((sdata[tid + s] == myMin) && (pdata[tid + s] > myPos))) {
1414  sdata[tid]=myMin=sdata[tid+s];
1415  pdata[tid]=myPos=pdata[tid+s];
1416  }
1417  __syncthreads();
1418  }
1419 
1420  if (tid < sizeWarp)
1421  {
1422  if ((blockSize >= 2*sizeWarp) &&
1423  ((sdata[tid + sizeWarp] < myMin) || ((sdata[tid + sizeWarp] == myMin) && (pdata[tid + sizeWarp] > myPos)))) {
1424  myMin=sdata[tid+sizeWarp];
1425  myPos=pdata[tid+sizeWarp];
1426  }
1427 
1428  for (int offset = blockSize/2; offset > 0; offset>>=1) // offset/=2 is equal to offset>>=1
1429  {
1430  #ifdef CUDA9
1431  #ifdef SIMPLE
1432  tmpMin = __shfl_down_sync(0xffffffff, myMin, offset, sizeWarp);
1433  #else
1434  tmpMin = __shfl_downD(myMin, offset);
1435  #endif
1436  tmpPos = __shfl_down_sync(0xffffffff, myPos, offset, sizeWarp);
1437  #else
1438  #ifdef SIMPLE
1439  tmpMin = __shfl_down(myMin, offset, sizeWarp);
1440  #else
1441  tmpMin = __shfl_downD(myMin, offset);
1442  #endif
1443  tmpPos = __shfl_down(myPos, offset, sizeWarp);
1444  #endif
1445 
1446  if ((tmpMin < myMin) || ((tmpMin == myMin) && (tmpPos > myPos))) {
1447  myMin=tmpMin;
1448  myPos=tmpPos;
1449  }
1450  }
1451  }
1452  if (tid == 0) { odata[blockIdx.x]=myMin; opos[blockIdx.x]=myPos; }
1453 }
1454 
__global__ void kernel_PowToReal(MyType *__restrict__ dest, const MyType *__restrict__ src, const MyType ex, const int size)
kernel_PowToReal This CUDA kernel raises the elements of a vector to a real power and stores them in another vector.
Definition: kernels.cuh:481
__global__ void kernel_Sum(MyType *__restrict__ odata, const MyType *__restrict__ idata, const int blockSize, const bool SizeIsPow2, const int size)
kernel_Sum This CUDA kernel adds the elements of a vector.
Definition: kernels.cuh:260
__inline__ __device__ double warpReduceSumD(double val)
warpReduceSumD does double sum reduction within a warp.
Definition: kernels.cuh:67
__global__ void kernel_LastIminLast(MyType *__restrict__ odata, int *__restrict__ opos, const MyType *__restrict__ idata, const int *__restrict__ ipos, const int blockSize, const bool SizeIsPow2, const int size)
kernel_LastIminLast used with kernel_LastImin to calculate the position of the last minimum in a vector.
Definition: kernels.cuh:1376
__global__ void kernel_ApplyWindow(MyType *__restrict__ X_fft, const short *__restrict__ frame, const MyType *__restrict__ v_hanning, const int TTRA, const int NFFT)
kernel_ApplyWindow scales and sets the elements of the audio vector X_fft.
Definition: kernels.cuh:345
__global__ void kernel_BetaNorm(MyType *__restrict__ vector, const int size)
kernel_BetaNorm normalizes the vector.
Definition: kernels.cuh:901
__global__ void kernel_FirstIminLast(MyType *__restrict__ odata, int *__restrict__ opos, const MyType *__restrict__ idata, const int *__restrict__ ipos, const int blockSize, const bool SizeIsPow2, const int size)
kernel_FirstIminLast used with kernel_FirstImin to calculate the position of the first minimum in a vector.
Definition: kernels.cuh:1192
__global__ void kernel_Shift(short *__restrict__ frame, const int TTRAMA, const int TMUEST)
kernel_Shift shifts the vector elements TMUEST positions to the left.
Definition: kernels.cuh:880
__global__ void kernel_Vnorm(MyType *__restrict__ odata)
kernel_Vnorm This CUDA kernel sets position 0 of a vector to 1/(sqrt(odata[0]) + epsilon).
Definition: kernels.cuh:326
__global__ void kernel_LastImin(MyType *__restrict__ odata, int *__restrict__ opos, const MyType *__restrict__ idata, const int blockSize, const bool SizeIsPow2, const int size)
kernel_LastImin calculates the position of the last minimum in a vector.
Definition: kernels.cuh:1283
__global__ void kernel_CompNorB1(MyType *__restrict__ norms, const MyType *__restrict__ s_fk, const int NMIDI, const int size)
kernel_CompNorB1 This CUDA kernel computes the norm of a vector when BETA=1.
Definition: kernels.cuh:404
__global__ void kernel_CompNorBG(MyType *__restrict__ norms, MyType *__restrict__ ts_fk, const MyType *__restrict__ s_fk, const int NMIDI, const MyType BETA, const int size)
kernel_CompNorBG This CUDA kernel computes the norm of a vector when BETA != 0 and BETA != 1.
Definition: kernels.cuh:440
__global__ void kernel_ReductionPowBeta(MyType *__restrict__ dest, const MyType BETA, const int size)
kernel_ReductionPowBeta This CUDA kernel performs a sum-reduction of the BETA powers of a vector.
Definition: kernels.cuh:586
__global__ void kernel_InitSxD(MyType *__restrict__ odata, MyType *__restrict__ v_SxD, const MyType *__restrict__ v_dxState, const int *__restrict__ I_SxD, const int blockSize, const bool SizeIsPow2, const int size)
kernel_InitSxD This CUDA kernel sets up the vector SxD and computes per-block partial sums of its squared elements.
Definition: kernels.cuh:185
__global__ void kernel_OneIminLast(MyType *__restrict__ odata, int *__restrict__ opos, const MyType *__restrict__ idata, const int *__restrict__ ipos, const int blockSize, const bool SizeIsPow2, const int size)
kernel_OneIminLast used with kernel_OneImin to calculate the position of one minimum in a vector.
Definition: kernels.cuh:1012
__global__ void kernel_OneImin(MyType *__restrict__ odata, int *__restrict__ opos, const MyType *__restrict__ idata, const int blockSize, const bool SizeIsPow2, const int size)
kernel_OneImin calculates the position of one minimum in a vector.
Definition: kernels.cuh:923
__inline__ __device__ float warpReduceSumS(float val)
warpReduceSumS does float sum reduction within a warp.
Definition: kernels.cuh:87
__global__ void kernel_Cfreq(MyType *__restrict__ dest, const MyType *__restrict__ src)
kernel_Cfreq This CUDA kernel computes sqrt(sum of the elements of band i of a vector) and stores it in dest[i].
Definition: kernels.cuh:527
__device__ double __shfl_downD(double var, unsigned int srcLane, int width=sizeWarp)
__shfl_downD performs __shfl_down of a double number.
Definition: kernels.cuh:43
__global__ void kernel_FirstImin(MyType *__restrict__ odata, int *__restrict__ opos, const MyType *__restrict__ idata, const int blockSize, const bool SizeIsPow2, const int size)
kernel_FirstImin calculates the position of the first minimum in a vector.
Definition: kernels.cuh:1099
__global__ void kernel_Modul(MyType *__restrict__ dest, const MyType *__restrict__ src, const int size)
kernel_Modul This CUDA kernel computes the modulus of the elements of a vector and stores them in another vector.
Definition: kernels.cuh:503
__global__ void kernel_CompNorB0(MyType *__restrict__ norms, const MyType value, const int size)
kernel_CompNorB0 This CUDA kernel computes the norm of a vector when BETA=0.
Definition: kernels.cuh:386
__global__ void __launch_bounds__(maxThreads, 4) kernel_CompDisB0(MyType *__restrict__ dest, const MyType *__restrict__ v_cfreq, const MyType *__restrict__ norms, const MyType *__restrict__ s_fk, const int NMIDI, const int size)
kernel_CompDisB0 This CUDA kernel computes the distortion of a vector when BETA=0.
Definition: kernels.cuh:622
__global__ void kernel_UpdateSxD(MyType *__restrict__ dest, const MyType ALPHA, const MyType *__restrict__ norm, const int size)
kernel_UpdateSxD This CUDA kernel updates the elements of the SxD vector.
Definition: kernels.cuh:364
__global__ void kernel_InitDTW(MyType *__restrict__ pV, const int pos, const int size)
kernel_InitDTW This CUDA kernel initializes the DTW vector.
Definition: kernels.cuh:108
__global__ void kernel_DTW(const MyType *__restrict__ Sequence, MyType *__restrict__ pD, const int NSeq, const int Where, const int NST)
kernel_DTW This CUDA kernel performs the Online-DTW process for the current frame.
Definition: kernels.cuh:135
__global__ void kernel_Reduction(MyType *__restrict__ dest, const int size)
kernel_Reduction This CUDA kernel performs a typical sum-reduction of a vector.
Definition: kernels.cuh:560