__device__ inline double __shfl_downD(double var, unsigned int srcLane, int width=sizeWarp)
{
   int2 a = *reinterpret_cast<int2*>(&var);

/* Guard reconstructed: the listing shows both shuffle variants; the _sync
   intrinsics exist from CUDA 9 onwards, the legacy ones before. */
#if CUDART_VERSION >= 9000
   a.x = __shfl_down_sync(0xffffffff, a.x, srcLane, width);
   a.y = __shfl_down_sync(0xffffffff, a.y, srcLane, width);
#else
   a.x = __shfl_down(a.x, srcLane, width);
   a.y = __shfl_down(a.y, srcLane, width);
#endif

   return *reinterpret_cast<double*>(&a);
}
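Note that on CUDA 9+ toolkits __shfl_down_sync is itself overloaded for double, so the int2 split above is only needed on the legacy path. A minimal sketch (hypothetical helper name):

__device__ inline double shflDownDouble(double var, unsigned int delta, int width = sizeWarp)
{
   // CUDA 9+ provides a double overload of __shfl_down_sync directly.
   return __shfl_down_sync(0xffffffff, var, delta, width);
}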
__inline__ __device__ double warpReduceSumD(double val)
{
   for (int offset = sizeWarp/2; offset > 0; offset /= 2)
      val += __shfl_downD(val, offset);   /* body reconstructed: __shfl_downD (above) is the double-capable shuffle */
   return val;
}
__inline__ __device__ float warpReduceSumS(float val)
{
   for (int offset = sizeWarp/2; offset > 0; offset /= 2)
#if CUDART_VERSION >= 9000
      val += __shfl_down_sync(0xffffffff, val, offset, sizeWarp);
#else
      val += __shfl_down(val, offset, sizeWarp);
#endif
   return val;
}
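These warp primitives compose into a block-wide sum in the usual two-stage way; a hedged sketch (hypothetical helper, assuming sizeWarp == 32 and blockDim.x <= 1024):

__inline__ __device__ float blockReduceSumS(float val)
{
   __shared__ float partial[32];                 // one slot per warp
   int lane = threadIdx.x % sizeWarp;
   int warp = threadIdx.x / sizeWarp;

   val = warpReduceSumS(val);                    // stage 1: reduce within each warp
   if (lane == 0) partial[warp] = val;           // warp leaders publish their sums
   __syncthreads();

   // stage 2: the first warp reduces the per-warp partials
   val = (threadIdx.x < (blockDim.x + sizeWarp - 1) / sizeWarp) ? partial[lane] : 0.0f;
   if (warp == 0) val = warpReduceSumS(val);
   return val;                                   // total is valid in thread 0
}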
__global__ void kernel_InitDTW(MyType* __restrict__ pV, const int pos, const int size)
{
   unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

   /* ... body elided in the listing ... */
}
__global__ void kernel_DTW(const MyType* __restrict__ Sequence, MyType* __restrict__ pD,
                           const int NSeq, const int Where, const int NST)
{
   unsigned int j = threadIdx.x + blockIdx.x * blockDim.x;
   unsigned int NSTplusNC, k, Pos;
   MyType d2;   /* declaration reconstructed; the surrounding min-cost logic is elided */

   /* ... */
   NSTplusNC = N_COSTS + NST;
   Pos = ((NSeq + N_COSTS) % TBLOCK) * NSTplusNC + N_COSTS + j - 1;
   for (k=0; k<N_COSTS; k++)
   {
      d2 = Sequence[j]*CCosts[k] + pD[Pos-k];
      /* ... */
   }
   for (k=N_COSTS; k<T_COSTS; k++)
   {
      Pos = ((NSeq + (T_COSTS-k)) % TBLOCK) * NSTplusNC + N_COSTS + j - 1;
      d2 = Sequence[j]*CCosts[k] + pD[Pos];
      /* ... */
   }
   /* ... */
}
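The (NSeq + ...) % TBLOCK arithmetic treats pD as a ring buffer of TBLOCK rows of width N_COSTS + NST, so only the most recent rows of the DTW cost matrix stay resident. A hedged host-side sketch of the same indexing (illustrative helper name):

// Flat offset of column j in the ring-buffer row that frame nseq occupies.
// rowWidth is N_COSTS + NST; the N_COSTS leading columns hold boundary costs.
inline int dtwRingOffset(int nseq, int j, int TBLOCK, int rowWidth, int nCosts)
{
   return (nseq % TBLOCK) * rowWidth + nCosts + j - 1;
}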
__global__ void kernel_InitSxD(MyType* __restrict__ odata, MyType* __restrict__ v_SxD,
                               const MyType* __restrict__ v_dxState, const int* __restrict__ I_SxD,
                               const int blockSize, const bool SizeIsPow2, const int size)
{
   extern __shared__ MyType sdata[];

   unsigned int tid      = threadIdx.x;
   unsigned int i        = blockIdx.x*blockSize*2 + threadIdx.x;
   unsigned int gridSize = blockSize*2*gridDim.x;

   MyType mySum = 0.0, myData;

   /* Grid-stride loop reconstructed from the standard paired-load reduction
      pattern implied by gridSize and the SizeIsPow2 guard. */
   while (i < size)
   {
      myData  = v_SxD[i] = v_dxState[I_SxD[i]];
      mySum  += myData*myData;

      if (SizeIsPow2 || i + blockSize < size)
      {
         myData  = v_SxD[i+blockSize] = v_dxState[I_SxD[i+blockSize]];
         mySum  += myData*myData;
      }
      i += gridSize;
   }
   sdata[tid] = mySum;
   __syncthreads();

   for (unsigned int j=maxThreads; j>=4*sizeWarp; j>>=1)
   {
      if ((blockSize >= j) && (tid < (j>>1)))
         sdata[tid] = mySum = mySum + sdata[tid + (j>>1)];
      __syncthreads();
   }

   if (tid < sizeWarp)
   {
      if (blockSize >= 2*sizeWarp)
         mySum += sdata[tid + sizeWarp];

      for (int offset = sizeWarp/2; offset > 0; offset /= 2)
#if CUDART_VERSION >= 9000
         mySum += __shfl_down_sync(0xffffffff, mySum, offset);
#else
         mySum += __shfl_down(mySum, offset);
#endif
   }

   if (tid == 0) odata[blockIdx.x] = mySum;
}
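A hedged launch sketch for this reduction family (illustrative values and names; each block emits one partial into odata, and the dynamic shared allocation must hold blockSize values):

void launchInitSxD(MyType* d_odata, MyType* d_SxD, const MyType* d_dxState,
                   const int* d_ISxD, int size)
{
   int    threads = 256;                                  // blockSize
   int    blocks  = (size + threads*2 - 1) / (threads*2); // each block covers 2*blockSize elements
   size_t smem    = threads * sizeof(MyType);             // one MyType slot per thread
   bool   pow2    = (size & (size - 1)) == 0;

   kernel_InitSxD<<<blocks, threads, smem>>>(d_odata, d_SxD, d_dxState, d_ISxD,
                                             threads, pow2, size);
}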
__global__ void kernel_Sum(MyType* __restrict__ odata, const MyType* __restrict__ idata,
                           const int blockSize, const bool SizeIsPow2, const int size)
{
   extern __shared__ MyType sdata[];

   unsigned int tid      = threadIdx.x;
   unsigned int i        = blockIdx.x*blockSize*2 + threadIdx.x;
   unsigned int gridSize = blockSize*2*gridDim.x;

   MyType mySum = 0.0;   /* accumulator and grid-stride loop reconstructed */

   while (i < size)
   {
      mySum += idata[i];
      if (SizeIsPow2 || i + blockSize < size)
         mySum += idata[i+blockSize];
      i += gridSize;
   }
   sdata[tid] = mySum;
   __syncthreads();

   for (unsigned int j=maxThreads; j>=4*sizeWarp; j>>=1)
   {
      if ((blockSize >= j) && (tid < (j>>1)))
         sdata[tid] = mySum = mySum + sdata[tid + (j>>1)];
      __syncthreads();
   }

   if (tid < sizeWarp)
   {
      if (blockSize >= sizeWarp*2)
         mySum += sdata[tid + sizeWarp];

      for (int offset = sizeWarp/2; offset > 0; offset /= 2)
#if CUDART_VERSION >= 9000
         mySum += __shfl_down_sync(0xffffffff, mySum, offset);
#else
         mySum += __shfl_down(mySum, offset);
#endif
   }

   if (tid == 0) odata[blockIdx.x] = mySum;
}
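Since each block writes only a partial sum, a full reduction chains two launches; a hedged sketch (illustrative names):

void sumAll(MyType* d_sum, MyType* d_partial, const MyType* d_in, int size)
{
   int    threads = 256;
   int    blocks  = (size + threads*2 - 1) / (threads*2);
   size_t smem    = threads * sizeof(MyType);
   bool   pow2    = (size & (size - 1)) == 0;

   // Pass 1: one partial per block. Pass 2: a single block folds the partials.
   kernel_Sum<<<blocks, threads, smem>>>(d_partial, d_in, threads, pow2, size);
   kernel_Sum<<<1, threads, smem>>>(d_sum, d_partial, threads,
                                    (blocks & (blocks - 1)) == 0, blocks);
}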
__global__ void kernel_Vnorm(MyType* __restrict__ odata)
{
/* Guard reconstructed: the listing interleaves the float and double bodies;
   SIMPLE is assumed to be the project's single-precision build flag. */
#ifdef SIMPLE
   odata[0] = 1.0f / (sqrtf(odata[0]) + FLT_EPSILON);
#else
   odata[0] = 1.0  / (sqrt (odata[0]) + DBL_EPSILON);
#endif
}
__global__ void kernel_ApplyWindow(MyType* __restrict__ X_fft, const short* __restrict__ frame,
                                   const MyType* __restrict__ v_hanning, const int TTRA,
                                   const int NFFT)
{
   unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

   if (tid < NFFT)   /* bound reconstructed: the kernel zero-pads up to NFFT */
      X_fft[tid] = (tid < TTRA) ? (MyType)frame[tid] * Scaling * v_hanning[tid] : 0.0;
}
__global__ void kernel_UpdateSxD(MyType* __restrict__ dest, const MyType ALPHA,
                                 const MyType* __restrict__ norm, const int size)
{
   unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

   if (tid < size)   /* guard reconstructed */
#ifdef SIMPLE
      dest[tid] = 1.0f - expf(ALPHA * fabsf(dest[tid]*norm[0]));
#else
      dest[tid] = 1.0  - exp (ALPHA * fabs (dest[tid]*norm[0]));
#endif
}
__global__ void kernel_CompNorB0(MyType* __restrict__ norms, const MyType value, const int size)
{
   unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

   /* ... body elided in the listing ... */
}
__global__ void kernel_CompNorB1(MyType* __restrict__ norms, const MyType* __restrict__ s_fk,
                                 const int NMIDI, const int size)
{
   unsigned int i = blockIdx.x * blockDim.y + threadIdx.y;

   if (i < size)   /* guard reconstructed */
   {
      unsigned int stride = i*N_MIDI_PAD;
      unsigned int j;
      MyType a = 0.0;   /* accumulator reconstructed */

      for (j=threadIdx.x; j<NMIDI; j+=sizeWarp)
         a += s_fk[stride+j];   /* body reconstructed: the BETA=1 norm is a plain row sum */

      /* ... warp-level reduction of a elided ... */
      if (threadIdx.x==0) norms[i]=a;
   }
}
__global__ void kernel_CompNorBG(MyType* __restrict__ norms, MyType* __restrict__ ts_fk,
                                 const MyType* __restrict__ s_fk, const int NMIDI,
                                 const MyType BETA, const int size)
{
   unsigned int i = blockIdx.x * blockDim.y + threadIdx.y;

   if (i < size)   /* guard reconstructed */
   {
      unsigned int stride = i*N_MIDI_PAD;
      unsigned int j;
      MyType a = 0.0, b;   /* declarations reconstructed */

#ifdef SIMPLE
      for (j=threadIdx.x; j<NMIDI; j+=sizeWarp)
      {
         ts_fk[stride+j] = b = powf(s_fk[stride+j], BETA - 1.0f);
         a += b*s_fk[stride+j];
      }
#else
      for (j=threadIdx.x; j<NMIDI; j+=sizeWarp)
      {
         ts_fk[stride+j] = b = pow(s_fk[stride+j], BETA - 1.0);
         a += b*s_fk[stride+j];
      }
#endif
      /* ... warp-level reduction of a elided ... */
      if (threadIdx.x==0) norms[i]=a;
   }
}
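For BETA outside {0,1} the kernel stores ts_fk = s_fk^(BETA-1) and accumulates norms[i] = sum over j of s_fk[i][j]^BETA per row. A hedged host-side reference for validation (illustrative name, double build, assuming <cmath>):

void compNorBG_ref(double* norms, double* ts_fk, const double* s_fk,
                   int NMIDI, int nMidiPad, double BETA, int size)
{
   for (int i = 0; i < size; ++i) {
      double a = 0.0;
      for (int j = 0; j < NMIDI; ++j) {
         double b = pow(s_fk[i*nMidiPad + j], BETA - 1.0);
         ts_fk[i*nMidiPad + j] = b;
         a += b * s_fk[i*nMidiPad + j];   // s^(BETA-1) * s = s^BETA
      }
      norms[i] = a;
   }
}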
__global__ void kernel_PowToReal(MyType* __restrict__ dest, const MyType* __restrict__ src,
                                 const MyType ex, const int size)
{
   unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

   if (tid < size)   /* guard reconstructed */
#ifdef SIMPLE
      dest[tid] = powf(src[tid], ex);
#else
      dest[tid] = pow (src[tid], ex);
#endif
}
__global__ void kernel_Modul(MyType* __restrict__ dest, const MyType* __restrict__ src, const int size)
{
   unsigned int tid    = threadIdx.x + blockIdx.x * blockDim.x;
   unsigned int stride = tid * 2;

   if (tid < size)   /* guard reconstructed */
   {
      MyType tmp1 = src[stride];       /* real part; declaration reconstructed */
      MyType tmp2 = src[stride + 1];   /* imaginary part */

      dest[tid] = tmp1*tmp1 + tmp2*tmp2;
   }
}
__global__ void kernel_Cfreq(MyType* __restrict__ dest, const MyType* __restrict__ src)
{
   unsigned int i = blockIdx.x;
   unsigned int j = threadIdx.x;
   MyType tmp = 0.0;   /* accumulator reconstructed */

   for (unsigned int k=Ckmin_fft[i]+j; k<=Ckmax_fft[i]; k+=sizeWarp)
   {
      /* ... per-bin accumulation into tmp elided ... */
   }
   /* ... warp-level reduction of tmp elided ... */

#ifdef SIMPLE
   dest[i] = sqrtf(tmp);
#else
   dest[i] = sqrt (tmp);   /* double branch reconstructed */
#endif
}
__global__ void kernel_Reduction(MyType* __restrict__ dest, const int size)
{
   unsigned int tid = threadIdx.x;
   unsigned int j;
   MyType a = 0.0;   /* declarations reconstructed */

   for (j=tid; j<size; j+=sizeWarp) a += dest[j];

   /* ... warp-level reduction of a elided ... */
   if (tid==0) dest[size]=a;
}
__global__ void kernel_ReductionPowBeta(MyType* __restrict__ dest, const MyType BETA, const int size)
{
   unsigned int tid = threadIdx.x;
   unsigned int j;
   MyType a = 0.0;   /* declarations reconstructed */

#ifdef SIMPLE
   for (j=tid; j<size; j+=sizeWarp)
      a += powf(dest[j], BETA);
#else
   for (j=tid; j<size; j+=sizeWarp)
      a += pow (dest[j], BETA);
#endif

   /* ... warp-level reduction of a elided ... */
#ifdef SIMPLE
   if (tid==0) dest[size]=powf(a, 1.0f/BETA);
#else
   if (tid==0) dest[size]=pow (a, 1.0/BETA);
#endif
}
__global__ void __launch_bounds__(maxThreads, 4)
kernel_CompDisB0(MyType* __restrict__ dest, const MyType* __restrict__ v_cfreq,
                 const MyType* __restrict__ norms, const MyType* __restrict__ s_fk,
                 const int NMIDI, const int size)
{
   unsigned int i      = blockIdx.x * blockDim.y + threadIdx.y;
   unsigned int stride = i * N_MIDI_PAD;
   unsigned int th_row = threadIdx.y;
   unsigned int th_col = threadIdx.x;
   unsigned int row    = i + threadIdx.x;
   bool guard = th_row == 0 && row < size && th_col < blockDim.y;
   unsigned int j;
   MyType a = 0.0, b, tmp1;   /* declarations reconstructed */

   __shared__ MyType sh[sizeWarp];

   for (j=th_col; j<NMIDI; j+=sizeWarp)
      a += v_cfreq[j] / s_fk[stride+j];
   /* ... warp-level reduction of a elided ... */

   if (guard) sh[th_col] = norms[row];   /* staging condition reconstructed */
   /* ... b picks up the per-row value from sh in elided lines ... */
   b = __shfl_sync(0xffffffff, b, 0);

   for (j=th_col; j<NMIDI; j+=sizeWarp)
   {
      tmp1 = v_cfreq[j] / (s_fk[stride + j] * b);
#ifdef SIMPLE
      a += tmp1 - logf(tmp1) - 1.0f;
#else
      a += tmp1 - log (tmp1) - 1.0;
#endif
   }
   /* ... second warp reduction, result staged through sh ... */

   if (guard) dest[row] = sh[th_col];
}
__global__ void kernel_CompDisB1(MyType* __restrict__ dest, const MyType* __restrict__ v_cfreq,
                                 const MyType* __restrict__ norms, const MyType* __restrict__ s_fk,
                                 const int NMIDI, const int size)
{
   unsigned int i      = blockIdx.x * blockDim.y + threadIdx.y;
   unsigned int stride = i * N_MIDI_PAD;
   unsigned int th_row = threadIdx.y;
   unsigned int th_col = threadIdx.x;
   unsigned int row    = i + threadIdx.x;
   bool guard = th_row == 0 && row < size && th_col < blockDim.y;
   unsigned int j;   /* reconstructed */
   MyType a, tmp1, tmp2, tmp3;

   __shared__ MyType sh[sizeWarp];

   if (guard) sh[th_col] = v_cfreq[NMIDI] / norms[row];   /* staging condition reconstructed */
   /* ... tmp1 (per-row scale) and the accumulator a are initialized in elided lines ... */

   for (j=th_col; j<NMIDI; j+=sizeWarp)
   {
      tmp2 = s_fk[stride+j] * tmp1;
      /* ... tmp3 set from v_cfreq in an elided line ... */
#ifdef SIMPLE
      a += tmp3*logf(tmp3/tmp2) + tmp2 - tmp3;
#else
      a += tmp3* log(tmp3/tmp2) + tmp2 - tmp3;
#endif
   }
   /* ... warp reduction, result staged through sh ... */

   if (guard) dest[row] = sh[th_col];
}
__global__ void kernel_CompDisBG(MyType* __restrict__ dest, const MyType* __restrict__ v_cfreq,
                                 const MyType* __restrict__ norms, const MyType* __restrict__ s_fk,
                                 const MyType* __restrict__ ts_fk, const MyType* __restrict__ tauxi,
                                 const MyType BETA, const int NMIDI, const int size)
{
   unsigned int i      = blockIdx.x * blockDim.y + threadIdx.y;
   unsigned int stride = i * N_MIDI_PAD;
   unsigned int th_row = threadIdx.y;
   unsigned int th_col = threadIdx.x;
   unsigned int row    = i + threadIdx.x;
   unsigned int j, k;   /* reconstructed */

   bool   guard = th_row == 0 && row < size && th_col < blockDim.y;
   MyType a, b, tmp1, tmp2;
   MyType beta1 = BETA - 1.0;
   MyType tmp3  = 1.0 / (BETA * beta1);

   __shared__ MyType sh_a[sizeWarp/2], sh_b[sizeWarp/2];

   /* ... a and b initialized in elided lines ... */
   for (j=th_col, k=stride+th_col; j<NMIDI; j+=sizeWarp, k+=sizeWarp)
   {
      a += v_cfreq[j] * ts_fk[stride+j];
      /* ... companion accumulation into b elided ... */
   }
   /* ... warp reductions; partial results staged through sh_a and sh_b ... */

   a = sh_a[th_col] / norms[row];
   /* ... */
   sh_b[th_col] = BETA * b;
   sh_a[th_col] = b * a * beta1;
   /* ... tmp1 and tmp2 broadcast from the staged values; j and k rewound in elided lines ... */

   for (unsigned int s=sizeWarp; s<N_MIDI_PAD; s+=sizeWarp, j+=sizeWarp, k+=sizeWarp)
      a += (tauxi[j] + ts_fk[k] * (s_fk[k] * tmp2 - v_cfreq[j] * tmp1)) * tmp3;

   if (th_col < (sizeWarp - (N_MIDI_PAD - NMIDI)))
      a += (tauxi[j] + ts_fk[k] * (s_fk[k] * tmp2 - v_cfreq[j] * tmp1)) * tmp3;

   /* ... final warp reduction, result staged through sh_a ... */
   if (guard) dest[row] = sh_a[th_col];
}
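For reference, the three distortion kernels evaluate the beta-divergence at its special points (a standard definition that matches the loop bodies above; the kernels additionally fold in the per-row normalization factors taken from norms):

   d_beta(x || y) = (x^beta + (beta-1)*y^beta - beta*x*y^(beta-1)) / (beta*(beta-1)),  beta != 0,1   (kernel_CompDisBG)
   d_1(x || y)    = x*log(x/y) - x + y                                                              (kernel_CompDisB1)
   d_0(x || y)    = x/y - log(x/y) - 1                                                              (kernel_CompDisB0)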
__global__ void kernel_Shift(short* __restrict__ frame, const int TTRAMA, const int TMUEST)
{
   unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
   unsigned int i, tmp;

   if (tid < TMUEST)   /* guard and index reconstructed */
      for (i=0; i<(TTRAMA/TMUEST - 1); i++)
      {
         tmp = i*TMUEST + tid;
         frame[tmp] = frame[tmp+TMUEST];
      }
}
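Functionally this is a host-side left shift by one hop of TMUEST samples, e.g. (illustrative, assuming <cstring>):

// Discard the oldest TMUEST samples; the freed tail is then refilled with new audio.
memmove(frame, frame + TMUEST, (TTRAMA - TMUEST) * sizeof(short));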
__global__ void kernel_BetaNorm(MyType* __restrict__ vector, const int size)
{
   unsigned int tid = threadIdx.x;

   MyType value = vector[size];   /* the precomputed norm is stored one past the data */

   if (tid < size)   /* guard reconstructed */
      vector[tid] = vector[tid] / value;
}
__global__ void kernel_OneImin(MyType* __restrict__ odata, int* __restrict__ opos,
                               const MyType* __restrict__ idata, const int blockSize,
                               const bool SizeIsPow2, const int size)
{
   extern __shared__ MyType ss[];

   MyType *sdata = ss;                 /* alias reconstructed; positions sit after the values */
   int    *pdata = (int *)&sdata[blockSize];

   int tid   = threadIdx.x;
   int i     = blockIdx.x*blockSize*2 + threadIdx.x;
   int gSize = blockSize*2*gridDim.x;
   int myPos = 0, tmpPos = 0;          /* declarations reconstructed */

#ifdef SIMPLE
   MyType myMin=FLT_MAX, tmpMin=FLT_MAX;
#else
   MyType myMin=DBL_MAX, tmpMin=DBL_MAX;
#endif

   /* Pairwise grid-stride scan reconstructed, mirroring kernel_Sum. */
   while (i < size)
   {
      if (idata[i] < myMin) { myMin=idata[i]; myPos=i; }

      if (SizeIsPow2 || i + blockSize < size)
         if (idata[i+blockSize] < myMin) {
            myMin=idata[i+blockSize];
            myPos=i+blockSize;
         }
      i += gSize;
   }
   sdata[tid]=myMin; pdata[tid]=myPos;
   __syncthreads();

   for (unsigned int s=maxThreads/2; s>=2*sizeWarp; s>>=1)
   {
      if ((blockSize >= 2*s) && (tid < s))
         if (sdata[tid + s] < myMin) {
            sdata[tid]=myMin=sdata[tid+s];
            pdata[tid]=myPos=pdata[tid+s];
         }
      __syncthreads();
   }

   if (tid < sizeWarp)
   {
      if ((blockSize >= 2*sizeWarp) && (sdata[tid + sizeWarp] < myMin)) {
         myMin=sdata[tid+sizeWarp];
         myPos=pdata[tid+sizeWarp];
      }

      for (int offset = sizeWarp/2; offset > 0; offset>>=1)
      {
#if CUDART_VERSION >= 9000
         tmpMin = __shfl_down_sync(0xffffffff, myMin, offset, sizeWarp);
         tmpPos = __shfl_down_sync(0xffffffff, myPos, offset, sizeWarp);
#else
         tmpMin = __shfl_down(myMin, offset, sizeWarp);
         tmpPos = __shfl_down(myPos, offset, sizeWarp);
#endif
         if (tmpMin < myMin) { myMin=tmpMin; myPos=tmpPos; }
      }
   }

   if (tid == 0) { odata[blockIdx.x]=myMin; opos[blockIdx.x]=myPos; }
}
__global__ void kernel_OneIminLast(MyType* __restrict__ odata, int* __restrict__ opos,
                                   const MyType* __restrict__ idata, const int* __restrict__ ipos,
                                   const int blockSize, const bool SizeIsPow2, const int size)
{
   extern __shared__ MyType ss[];

   MyType *sdata = ss;                 /* alias reconstructed */
   int    *pdata = (int *)&sdata[blockSize];

   int tid   = threadIdx.x;
   int i     = blockIdx.x*blockSize*2 + threadIdx.x;
   int gSize = blockSize*2*gridDim.x;
   int myPos = 0, tmpPos = 0;          /* declarations reconstructed */

#ifdef SIMPLE
   MyType myMin=FLT_MAX, tmpMin=FLT_MAX;
#else
   MyType myMin=DBL_MAX, tmpMin=DBL_MAX;
#endif

   /* Same scan as kernel_OneImin, except candidate positions come from ipos
      (the positions found in the first pass) rather than from i. */
   while (i < size)
   {
      if (idata[i] < myMin) { myMin=idata[i]; myPos=ipos[i]; }

      if (SizeIsPow2 || i + blockSize < size)
         if (idata[i+blockSize] < myMin) {
            myMin=idata[i+blockSize];
            myPos=ipos[i+blockSize];
         }
      i += gSize;
   }
   sdata[tid]=myMin; pdata[tid]=myPos;
   __syncthreads();

   for (unsigned int s=maxThreads/2; s>=2*sizeWarp; s>>=1)
   {
      if ((blockSize >= 2*s) && (tid < s))
         if (sdata[tid + s] < myMin) {
            sdata[tid]=myMin=sdata[tid+s];
            pdata[tid]=myPos=pdata[tid+s];
         }
      __syncthreads();
   }

   if (tid < sizeWarp)
   {
      if ((blockSize >= 2*sizeWarp) && (sdata[tid + sizeWarp] < myMin)) {
         myMin=sdata[tid+sizeWarp];
         myPos=pdata[tid+sizeWarp];
      }

      for (int offset = blockSize/2; offset > 0; offset>>=1)
      {
#if CUDART_VERSION >= 9000
         tmpMin = __shfl_down_sync(0xffffffff, myMin, offset, sizeWarp);
         tmpPos = __shfl_down_sync(0xffffffff, myPos, offset, sizeWarp);
#else
         tmpMin = __shfl_down(myMin, offset, sizeWarp);
         tmpPos = __shfl_down(myPos, offset, sizeWarp);
#endif
         if (tmpMin < myMin) { myMin=tmpMin; myPos=tmpPos; }
      }
   }

   if (tid == 0) { odata[blockIdx.x]=myMin; opos[blockIdx.x]=myPos; }
}
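The min kernels follow the same two-pass pattern as kernel_Sum: the first pass emits one (value, position) candidate per block, the second runs in a single block over those candidates. A hedged driver sketch (illustrative names; the shared allocation holds blockSize values plus blockSize positions):

void argminAll(MyType* d_minVal, int* d_minPos, MyType* d_vals, int* d_pos,
               const MyType* d_in, int size)
{
   int    threads = 256;
   int    blocks  = (size + threads*2 - 1) / (threads*2);
   size_t smem    = threads * (sizeof(MyType) + sizeof(int));
   bool   pow2    = (size & (size - 1)) == 0;

   kernel_OneImin<<<blocks, threads, smem>>>(d_vals, d_pos, d_in, threads, pow2, size);
   kernel_OneIminLast<<<1, threads, smem>>>(d_minVal, d_minPos, d_vals, d_pos,
                                            threads, (blocks & (blocks - 1)) == 0, blocks);
}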
__global__ void kernel_FirstImin(MyType* __restrict__ odata, int* __restrict__ opos,
                                 const MyType* __restrict__ idata, const int blockSize,
                                 const bool SizeIsPow2, const int size)
{
   extern __shared__ MyType ss[];

   MyType *sdata = ss;                 /* alias reconstructed */
   int    *pdata = (int *)&sdata[blockSize];

   int tid   = threadIdx.x;
   int i     = blockIdx.x*blockSize*2 + threadIdx.x;
   int gSize = blockSize*2*gridDim.x;
   int myPos = 0, tmpPos = 0;          /* declarations reconstructed */

#ifdef SIMPLE
   MyType myMin=FLT_MAX, tmpMin=FLT_MAX;
#else
   MyType myMin=DBL_MAX, tmpMin=DBL_MAX;
#endif

   /* Same scan as kernel_OneImin; ties are broken towards the smaller index. */
   while (i < size)
   {
      if (idata[i] < myMin) { myMin=idata[i]; myPos=i; }

      if (SizeIsPow2 || i + blockSize < size)
         if (idata[i+blockSize] < myMin) {
            myMin=idata[i+blockSize];
            myPos=i+blockSize;
         }
      i += gSize;
   }
   sdata[tid]=myMin; pdata[tid]=myPos;
   __syncthreads();

   for (unsigned int s=maxThreads/2; s>=2*sizeWarp; s>>=1)
   {
      if ((blockSize >= 2*s) && (tid < s))
         if ((sdata[tid + s] < myMin) || ((sdata[tid + s] == myMin) && (pdata[tid + s] < myPos))) {
            sdata[tid]=myMin=sdata[tid+s];
            pdata[tid]=myPos=pdata[tid+s];
         }
      __syncthreads();
   }

   if (tid < sizeWarp)
   {
      if ((blockSize >= 2*sizeWarp) &&
          ((sdata[tid + sizeWarp] < myMin) || ((sdata[tid + sizeWarp] == myMin) && (pdata[tid + sizeWarp] < myPos)))) {
         myMin=sdata[tid+sizeWarp];
         myPos=pdata[tid+sizeWarp];
      }

      for (int offset = sizeWarp/2; offset > 0; offset>>=1)
      {
#if CUDART_VERSION >= 9000
         tmpMin = __shfl_down_sync(0xffffffff, myMin, offset, sizeWarp);
         tmpPos = __shfl_down_sync(0xffffffff, myPos, offset, sizeWarp);
#else
         tmpMin = __shfl_down(myMin, offset, sizeWarp);
         tmpPos = __shfl_down(myPos, offset, sizeWarp);
#endif
         if ((tmpMin < myMin) || ((tmpMin == myMin) && (tmpPos < myPos))) {
            myMin=tmpMin; myPos=tmpPos;
         }
      }
   }

   if (tid == 0) { odata[blockIdx.x]=myMin; opos[blockIdx.x]=myPos; }
}
__global__ void kernel_FirstIminLast(MyType* __restrict__ odata, int* __restrict__ opos,
                                     const MyType* __restrict__ idata, const int* __restrict__ ipos,
                                     const int blockSize, const bool SizeIsPow2, const int size)
{
   extern __shared__ MyType ss[];

   MyType *sdata = ss;                 /* alias reconstructed */
   int    *pdata = (int *)&sdata[blockSize];

   int tid   = threadIdx.x;
   int i     = blockIdx.x*blockSize*2 + threadIdx.x;
   int gSize = blockSize*2*gridDim.x;
   int myPos = 0, tmpPos = 0;          /* declarations reconstructed */

#ifdef SIMPLE
   MyType myMin=FLT_MAX, tmpMin=FLT_MAX;
#else
   MyType myMin=DBL_MAX, tmpMin=DBL_MAX;
#endif

   /* Same scan as kernel_FirstImin, but candidate positions come from ipos. */
   while (i < size)
   {
      if (idata[i] < myMin) { myMin=idata[i]; myPos=ipos[i]; }

      if (SizeIsPow2 || i + blockSize < size)
         if (idata[i+blockSize] < myMin) {
            myMin=idata[i+blockSize];
            myPos=ipos[i+blockSize];
         }
      i += gSize;
   }
   sdata[tid]=myMin; pdata[tid]=myPos;
   __syncthreads();

   for (unsigned int s=maxThreads/2; s>=2*sizeWarp; s>>=1)
   {
      if ((blockSize >= 2*s) && (tid < s))
         if ((sdata[tid + s] < myMin) || ((sdata[tid + s] == myMin) && (pdata[tid + s] < myPos))) {
            sdata[tid]=myMin=sdata[tid+s];
            pdata[tid]=myPos=pdata[tid+s];
         }
      __syncthreads();
   }

   if (tid < sizeWarp)
   {
      if ((blockSize >= 2*sizeWarp) &&
          ((sdata[tid + sizeWarp] < myMin) || ((sdata[tid + sizeWarp] == myMin) && (pdata[tid + sizeWarp] < myPos)))) {
         myMin=sdata[tid+sizeWarp];
         myPos=pdata[tid+sizeWarp];
      }

      for (int offset = blockSize/2; offset > 0; offset>>=1)
      {
#if CUDART_VERSION >= 9000
         tmpMin = __shfl_down_sync(0xffffffff, myMin, offset, sizeWarp);
         tmpPos = __shfl_down_sync(0xffffffff, myPos, offset, sizeWarp);
#else
         tmpMin = __shfl_down(myMin, offset, sizeWarp);
         tmpPos = __shfl_down(myPos, offset, sizeWarp);
#endif
         if ((tmpMin < myMin) || ((tmpMin == myMin) && (tmpPos < myPos))) {
            myMin=tmpMin; myPos=tmpPos;
         }
      }
   }

   if (tid == 0) { odata[blockIdx.x]=myMin; opos[blockIdx.x]=myPos; }
}
__global__ void kernel_LastImin(MyType* __restrict__ odata, int* __restrict__ opos,
                                const MyType* __restrict__ idata, const int blockSize,
                                const bool SizeIsPow2, const int size)
{
   extern __shared__ MyType ss[];

   MyType *sdata = ss;                 /* alias reconstructed */
   int    *pdata = (int *)&sdata[blockSize];

   int tid   = threadIdx.x;
   int i     = blockIdx.x*blockSize*2 + threadIdx.x;
   int gSize = blockSize*2*gridDim.x;
   int myPos = 0, tmpPos = 0;          /* declarations reconstructed */

#ifdef SIMPLE
   MyType myMin=FLT_MAX, tmpMin=FLT_MAX;
#else
   MyType myMin=DBL_MAX, tmpMin=DBL_MAX;
#endif

   /* Same scan as kernel_OneImin; <= and the > tie-break keep the larger index. */
   while (i < size)
   {
      if (idata[i] <= myMin) { myMin=idata[i]; myPos=i; }

      if (SizeIsPow2 || i + blockSize < size)
         if (idata[i+blockSize] <= myMin) {
            myMin=idata[i+blockSize];
            myPos=i+blockSize;
         }
      i += gSize;
   }
   sdata[tid]=myMin; pdata[tid]=myPos;
   __syncthreads();

   for (unsigned int s=maxThreads/2; s>=2*sizeWarp; s>>=1)
   {
      if ((blockSize >= 2*s) && (tid < s))
         if ((sdata[tid + s] < myMin) || ((sdata[tid + s] == myMin) && (pdata[tid + s] > myPos))) {
            sdata[tid]=myMin=sdata[tid+s];
            pdata[tid]=myPos=pdata[tid+s];
         }
      __syncthreads();
   }

   if (tid < sizeWarp)
   {
      if ((blockSize >= 2*sizeWarp) &&
          ((sdata[tid + sizeWarp] < myMin) || ((sdata[tid + sizeWarp] == myMin) && (pdata[tid + sizeWarp] > myPos)))) {
         myMin=sdata[tid+sizeWarp];
         myPos=pdata[tid+sizeWarp];
      }

      for (int offset = sizeWarp/2; offset > 0; offset>>=1)
      {
#if CUDART_VERSION >= 9000
         tmpMin = __shfl_down_sync(0xffffffff, myMin, offset, sizeWarp);
         tmpPos = __shfl_down_sync(0xffffffff, myPos, offset, sizeWarp);
#else
         tmpMin = __shfl_down(myMin, offset, sizeWarp);
         tmpPos = __shfl_down(myPos, offset, sizeWarp);
#endif
         if ((tmpMin < myMin) || ((tmpMin == myMin) && (tmpPos > myPos))) {
            myMin=tmpMin; myPos=tmpPos;
         }
      }
   }

   if (tid == 0) { odata[blockIdx.x]=myMin; opos[blockIdx.x]=myPos; }
}
__global__ void kernel_LastIminLast(MyType* __restrict__ odata, int* __restrict__ opos,
                                    const MyType* __restrict__ idata, const int* __restrict__ ipos,
                                    const int blockSize, const bool SizeIsPow2, const int size)
{
   extern __shared__ MyType ss[];

   MyType *sdata = ss;                 /* alias reconstructed */
   int    *pdata = (int *)&sdata[blockSize];

   int tid   = threadIdx.x;
   int i     = blockIdx.x*blockSize*2 + threadIdx.x;
   int gSize = blockSize*2*gridDim.x;
   int myPos = 0, tmpPos = 0;          /* declarations reconstructed */

#ifdef SIMPLE
   MyType myMin=FLT_MAX, tmpMin=FLT_MAX;
#else
   MyType myMin=DBL_MAX, tmpMin=DBL_MAX;
#endif

   /* Same scan as kernel_LastImin, but candidate positions come from ipos. */
   while (i < size)
   {
      if (idata[i] <= myMin) { myMin=idata[i]; myPos=ipos[i]; }

      if (SizeIsPow2 || i + blockSize < size)
         if (idata[i+blockSize] <= myMin) {
            myMin=idata[i+blockSize];
            myPos=ipos[i+blockSize];
         }
      i += gSize;
   }
   sdata[tid]=myMin; pdata[tid]=myPos;
   __syncthreads();

   for (unsigned int s=maxThreads/2; s>=2*sizeWarp; s>>=1)
   {
      if ((blockSize >= 2*s) && (tid < s))
         if ((sdata[tid + s] < myMin) || ((sdata[tid + s] == myMin) && (pdata[tid + s] > myPos))) {
            sdata[tid]=myMin=sdata[tid+s];
            pdata[tid]=myPos=pdata[tid+s];
         }
      __syncthreads();
   }

   if (tid < sizeWarp)
   {
      if ((blockSize >= 2*sizeWarp) &&
          ((sdata[tid + sizeWarp] < myMin) || ((sdata[tid + sizeWarp] == myMin) && (pdata[tid + sizeWarp] > myPos)))) {
         myMin=sdata[tid+sizeWarp];
         myPos=pdata[tid+sizeWarp];
      }

      for (int offset = blockSize/2; offset > 0; offset>>=1)
      {
#if CUDART_VERSION >= 9000
         tmpMin = __shfl_down_sync(0xffffffff, myMin, offset, sizeWarp);
         tmpPos = __shfl_down_sync(0xffffffff, myPos, offset, sizeWarp);
#else
         tmpMin = __shfl_down(myMin, offset, sizeWarp);
         tmpPos = __shfl_down(myPos, offset, sizeWarp);
#endif
         if ((tmpMin < myMin) || ((tmpMin == myMin) && (tmpPos > myPos))) {
            myMin=tmpMin; myPos=tmpPos;
         }
      }
   }

   if (tid == 0) { odata[blockIdx.x]=myMin; opos[blockIdx.x]=myPos; }
}
__global__ void kernel_PowToReal(MyType *__restrict__ dest, const MyType *__restrict__ src, const MyType ex, const int size)
kernel_PowToReal This CUDA kernel raises the elements of a vector to a real power and stores the result in dest.
__global__ void kernel_Sum(MyType *__restrict__ odata, const MyType *__restrict__ idata, const int blockSize, const bool SizeIsPow2, const int size)
kernel_Sum This CUDA kernel sums the elements of a vector.
__inline__ __device__ double warpReduceSumD(double val)
warpReduceSumD performs a double-precision sum reduction within a warp.
__global__ void kernel_LastIminLast(MyType *__restrict__ odata, int *__restrict__ opos, const MyType *__restrict__ idata, const int *__restrict__ ipos, const int blockSize, const bool SizeIsPow2, const int size)
kernel_LastIminLast used with kernel_LastImin to compute the position of the last minimum in a vector.
__global__ void kernel_ApplyWindow(MyType *__restrict__ X_fft, const short *__restrict__ frame, const MyType *__restrict__ v_hanning, const int TTRA, const int NFFT)
kernel_ApplyWindow scales and sets the elements of the audio vector X_fft.
__global__ void kernel_BetaNorm(MyType *__restrict__ vector, const int size)
kernel_BetaNorm normalizes the vector.
__global__ void kernel_FirstIminLast(MyType *__restrict__ odata, int *__restrict__ opos, const MyType *__restrict__ idata, const int *__restrict__ ipos, const int blockSize, const bool SizeIsPow2, const int size)
kernel_FirstIminLast used with kernel_FirstImin to compute the position of the first minimum in a vector.
__global__ void kernel_Shift(short *__restrict__ frame, const int TTRAMA, const int TMUEST)
kernel_Shift shifts the vector elements TMUEST positions to the left.
__global__ void kernel_Vnorm(MyType *__restrict__ odata)
kernel_Vnorm This CUDA kernel initializes position 0 of a vector.
__global__ void kernel_LastImin(MyType *__restrict__ odata, int *__restrict__ opos, const MyType *__restrict__ idata, const int blockSize, const bool SizeIsPow2, const int size)
kernel_LastImin computes the position of the last minimum in a vector.
__global__ void kernel_CompNorB1(MyType *__restrict__ norms, const MyType *__restrict__ s_fk, const int NMIDI, const int size)
kernel_CompNorB1 This CUDA kernel computes the norm of a vector when BETA=1.
__global__ void kernel_CompNorBG(MyType *__restrict__ norms, MyType *__restrict__ ts_fk, const MyType *__restrict__ s_fk, const int NMIDI, const MyType BETA, const int size)
kernel_CompNorBG This CUDA kernel computes the norm of a vector when BETA != 0 and BETA != 1.
__global__ void kernel_ReductionPowBeta(MyType *__restrict__ dest, const MyType BETA, const int size)
kernel_ReductionPowBeta This CUDA kernel performs a sum-reduction of the BETA-th powers of a vector.
__global__ void kernel_InitSxD(MyType *__restrict__ odata, MyType *__restrict__ v_SxD, const MyType *__restrict__ v_dxState, const int *__restrict__ I_SxD, const int blockSize, const bool SizeIsPow2, const int size)
kernel_InitSxD This CUDA kernel sets up the vector SxD.
__global__ void kernel_OneIminLast(MyType *__restrict__ odata, int *__restrict__ opos, const MyType *__restrict__ idata, const int *__restrict__ ipos, const int blockSize, const bool SizeIsPow2, const int size)
kernel_OneIminLast used with kernel_OneImin to compute the position of one minimum in a vector.
__global__ void kernel_OneImin(MyType *__restrict__ odata, int *__restrict__ opos, const MyType *__restrict__ idata, const int blockSize, const bool SizeIsPow2, const int size)
kernel_OneImin computes the position of one minimum in a vector.
__inline__ __device__ float warpReduceSumS(float val)
warpReduceSumS performs a single-precision sum reduction within a warp.
__global__ void kernel_Cfreq(MyType *__restrict__ dest, const MyType *__restrict__ src)
kernel_Cfreq This CUDA kernel computes sqrt(sum of the elements of a vector) and stores it in dest[0].
__device__ double __shfl_downD(double var, unsigned int srcLane, int width=sizeWarp)
__shfl_downD performs a __shfl_down on a double value.
__global__ void kernel_FirstImin(MyType *__restrict__ odata, int *__restrict__ opos, const MyType *__restrict__ idata, const int blockSize, const bool SizeIsPow2, const int size)
kernel_FirstImin computes the position of the first minimum in a vector.
__global__ void kernel_Modul(MyType *__restrict__ dest, const MyType *__restrict__ src, const int size)
kernel_Modul This CUDA kernel computes the modulus of the elements of a vector and stores them in another vector.
__global__ void kernel_CompNorB0(MyType *__restrict__ norms, const MyType value, const int size)
kernel_CompNorB0 This CUDA kernel computes the norm of a vector when BETA=0.
__global__ void __launch_bounds__(maxThreads, 4) kernel_CompDisB0(MyType *__restrict__ dest, const MyType *__restrict__ v_cfreq, const MyType *__restrict__ norms, const MyType *__restrict__ s_fk, const int NMIDI, const int size)
kernel_CompDisB0 This CUDA kernel computes the distortion of a vector when BETA=0.
__global__ void kernel_UpdateSxD(MyType *__restrict__ dest, const MyType ALPHA, const MyType *__restrict__ norm, const int size)
kernel_UpdateSxD This CUDA kernel updates the elements of the SxD vector.
__global__ void kernel_InitDTW(MyType *__restrict__ pV, const int pos, const int size)
kernel_InitDTW This CUDA kernel initializes the DTW vector.
__global__ void kernel_DTW(const MyType *__restrict__ Sequence, MyType *__restrict__ pD, const int NSeq, const int Where, const int NST)
kernel_DTW This CUDA kernel performs the Online-DTW process for the current frame.
__global__ void kernel_Reduction(MyType *__restrict__ dest, const int size)
kernel_Reduction This CUDA kernel performs a typical sum-reduction of a vector.