//
// Created by Alberto on 2020-05-06.
//

#include "ComputeFunctions.h"
#include <complex.h>
#include <cblas.h>

/* Wall-clock timestamp in seconds, used for timing measurements. */
double Ctimer(void) {
    struct timeval now;
    double seconds, micros;

    gettimeofday(&now, NULL);

    /* Whole seconds plus the microsecond remainder scaled to seconds. */
    seconds = now.tv_sec;
    micros  = now.tv_usec;

    return seconds + micros/1.0E6;
}



/*
 * Rounds x up to the next power of two (a power of two maps to itself).
 * Works by copying the highest set bit of x-1 into every lower bit
 * position and then adding one.
 */
int nextPow2(int x)
{
    int shift;
    int v = x - 1;

    /* Shift widths 1, 2, 4, 8, 16 cover a 32-bit value. */
    for (shift = 1; shift <= 16; shift *= 2)
        v |= v >> shift;

    return v + 1;
}

/*
 * qsort() comparator that orders ValueAndPos records by DESCENDING `data`.
 *
 * Returns a negative value when a must come before b (a->data > b->data),
 * a positive value when it must come after, and 0 when both are equal.
 *
 * The result is built from comparisons rather than from the difference
 * B->data - A->data: converting that difference to int truncates it, so for
 * a floating-point `data` field (it is filled from MyType values elsewhere
 * in this file) any two values less than 1 apart would compare as equal,
 * and large differences would not fit in an int.
 */
int comparadesc(const void *a, const void *b)
{
    const ValueAndPos *A=(const ValueAndPos *)a;
    const ValueAndPos *B=(const ValueAndPos *)b;

    return (B->data > A->data) - (B->data < A->data);
}


/*
 * qsort() comparator for ints in ASCENDING order.
 *
 * Returns -1, 0 or 1 when *a is less than, equal to or greater than *b.
 */
int compara(const void *a, const void *b)
{
    const int lhs=*(const int *)a;
    const int rhs=*(const int *)b;

    /* Compare instead of subtracting: lhs-rhs overflows (undefined
       behaviour) when the operands are far apart, e.g. 2e9 and -2e9. */
    return (lhs > rhs) - (lhs < rhs);
}


/* Returns the FIRST position of V (among its n entries) whose value equals
   val, or -1 when there is none. */
int buscarFirstIgual(const int n, const MyType *V, const MyType val)
{
    int k;

    for (k=0; k<n; k++)
    {
        if (V[k]==val) return k;
    }

    return -1;
}


/* Returns the FIRST position of V (among its n entries) whose value is
   strictly greater than val, or -1 when there is none. */
int buscarFirstMayor(const int n, const MyType *V, const MyType val)
{
    int k;

    for (k=0; k<n; k++)
    {
        if (V[k]>val) return k;
    }

    return -1;
}


/* Returns the FIRST position of V (among its n entries) whose value is
   greater than or equal to val, or -1 when there is none. */
int buscarFirstMayorIgual(const int n, const MyType *V, const MyType val)
{
    int k;

    for (k=0; k<n; k++)
    {
        if (V[k]>=val) return k;
    }

    return -1;
}


/*
 * Scans V from the start while its entries are strictly less than val and
 * returns the index of the last entry visited that way, i.e. (index of the
 * first entry that is >= val) - 1.
 *
 *   - returns -1 when n <= 0 or V[0] >= val
 *   - returns n-1 when every entry is less than val
 *
 * NOTE(review): the name says "less than or equal", but the scan stops at
 * an entry EQUAL to val; that behaviour is kept as callers may rely on it.
 *
 * The bound i<n is tested BEFORE V[i] is read; the previous order read
 * V[n] (one past the end) whenever all n entries were less than val.
 */
int buscarLastMenorIgual(const int n, const MyType *V, const MyType val)
{
    int i=0;

    while ((i<n) && (V[i]<val)) { i++; }

    return (i-1);
}



inline void memSetValue(const int, const MyType, MyType *);


/* Stores `value` into each of the first n entries of v. */
inline void memSetValue(const int n, const MyType value, MyType *v)
{
    int idx;

#pragma GCC ivdep
    for(idx=0; idx<n; idx++)
    {
        v[idx]=value;
    }
}


/* Stores the real number `value` (plus 0.0) into each of the first n
   entries of the complex vector v. */
void memSetValueRealComplex(const int n, const MyType value, MyFFTcompl *v)
{
    int idx;

#pragma GCC ivdep
    for(idx=0; idx<n; idx++)
    {
        v[idx]=value + 0.0;
    }
}


/* Element-wise accumulation: dest[k] = dest[k] + src[k] for k in [0, n). */
void suma(const int n, MyType *dest, const MyType *src)
{
    int k;

#pragma GCC ivdep
    for(k=0; k<n; k++)
    {
        dest[k]=dest[k]+src[k];
    }
}



/*
 * Raw autocorrelation of the m-sample vector x.
 *
 * For every lag in [0, m) the sum of x[t]*x[t+lag] is stored twice,
 * mirrored around index m-1: corr[m-1-lag] and corr[m-1+lag]. corr must
 * therefore hold at least 2*m-1 entries. The parameter n is not used.
 */
void xcorr(const int n, MyType *corr, const int m, const MyType *x)
{
    int lag;

#ifdef CBLAS
    /* One BLAS dot product per lag. */
    for(lag=0; lag<m; lag++)
#ifdef SIMPLE
        corr[m+lag-1]=corr[m-lag-1]=cblas_sdot(m-lag, x, 1, &x[lag], 1);
#else
        corr[m+lag-1]=corr[m-lag-1]=cblas_ddot(m-lag, x, 1, &x[lag], 1);
#endif
#else
    int    t;
    MyType acc;

    #ifdef OMP2
      #pragma omp parallel for private(acc,t)  // is the OpenMP overhead larger than the time saved?
    #endif
    for(lag=0; lag<m; lag++)
    {
      acc=0.0;
      for(t=0; t<(m-lag); t++)
        acc += x[t] * x[t+lag];

      /* Negative-lag slot first, then its positive-lag mirror. */
      corr[m-lag-1]=acc;
      corr[m+lag-1]=acc;
    }
#endif
}


/*
 * Simple least-squares linear regression of Y against the abscissas
 * x = 1, 2, ..., n (Y[i] is paired with x = i+1).
 *
 *   n      number of samples
 *   Y      the n ordinates
 *   b0     out: intercept
 *   slope  out: slope
 *
 * sum(x) and sum(x^2) come from the closed forms n(n+1)/2 and
 * n(n+1)(2n+1)/6, i.e. for x = 1..n. The cross term must use the same
 * abscissas, so it accumulates (i+1)*Y[i]; it previously accumulated
 * i*Y[i] (x = 0..n-1), which biased the slope by n*sum(Y)/denominator.
 *
 * For n <= 1 the denominator is zero and the outputs are not finite.
 */
void simpleRL(const int n, const MyType *Y, MyType *b0, MyType *slope)
{

    int i;
    MyType sumX, sumX2, sumY=0, sumXY=0;

    sumX =n*(1.0+n)/2.0;
    sumX2=sumX*(2.0*n+1.0)/3.0;

    /* Hand-made version */
#ifdef OMP2
#pragma omp parallel for reduction(+:sumY) reduction(+:sumXY) // is the OpenMP overhead larger than the time saved?
#endif
    for(i=0;i<n;i++) { sumY+=Y[i]; sumXY+=(i+1)*Y[i]; }



    *slope=(n*sumXY - sumX*sumY) / (n*sumX2 - sumX*sumX);
    *b0   =(sumY-(*slope)*sumX)  / n;
}


/*
 * Fills v[0..n-1] with the symmetric window
 *     v[k] = 0.5 * (1 - cos(DosPI * (k+1) / (n+1)))
 * Only the first half is evaluated; each value is mirrored to v[n-1-k].
 * Always returns OK.
 */
int hanning(const int n, MyType *v)
{
    int          k, half;
    const int    last=n-1;
    const MyType step=DosPI/(MyType)(n+1);

    /* Number of distinct values: n/2 for even n, (n+1)/2 for odd n. */
    half = ((n%2)==0) ? n/2 : (n+1)/2;

#ifdef OMP2
#pragma omp parallel for simd  // is the overhead larger than the time saved?
#else
#pragma GCC ivdep
#endif
    for(k=0; k<half; k++)
    {
#ifdef SIMPLE
        v[k]=0.5*(1.0 - cosf(step*(k+1)));
#else
        v[k]=0.5*(1.0 -  cos(step*(k+1)));
#endif
        v[last-k] = v[k];
    }

    return OK;
}


/*
 * Windowed short-time FFT that keeps both magnitude and complex spectra.
 *
 * For each of the nFrames frames, winSize samples of `audio` starting at
 * i*(winSize-NoOverlap) are multiplied by vHanning, zero-padded to fftSize
 * and transformed in place with a forward complex DFT. The first rowsNMF
 * bins of frame i are stored at offset i*rowsNMF: their modulus in rSNMF
 * and the complex values in cSNMF.
 *
 * One work buffer of fftSize complex values and one FFTW plan are created
 * per thread slot (maxThreads of them); planType is passed to FFTW as the
 * planner flags. Everything allocated here is released before returning.
 *
 * Returns OK.
 *
 * NOTE(review): the slot is selected with omp_get_thread_num(), so this
 * assumes the OpenMP team never has more than maxThreads threads --
 * confirm against the caller.
 */
int cHannFFT(const int winSize, const int nFrames, const int NoOverlap, const MyType *audio, const MyType *vHanning,
             const int rowsNMF, MyType *rSNMF, MyFFTcompl *cSNMF, const int fftSize, const int maxThreads, const int planType)
{
    int          k;
    MyFFTcompl   *xFFT=NULL;
    MyFFTCPUType *planFFT=NULL;


    /* Work buffers (one fftSize slice per thread slot) and the plan table. */
#ifdef SIMPLE
    CHECKNULL(xFFT   =(MyFFTcompl   *)fftwf_malloc(sizeof(MyFFTcompl)  *fftSize*maxThreads));
    CHECKNULL(planFFT=(MyFFTCPUType *)fftwf_malloc(sizeof(MyFFTCPUType)*maxThreads));
#else
    CHECKNULL(xFFT   =(MyFFTcompl   *)fftw_malloc(sizeof(MyFFTcompl)  *fftSize*maxThreads));
    CHECKNULL(planFFT=(MyFFTCPUType *)fftw_malloc(sizeof(MyFFTCPUType)*maxThreads));
#endif
    /* Plans are created serially, each one in place on its own slice. */
    for(k=0;k<maxThreads;k++)
    {
#ifdef SIMPLE
        planFFT[k]=fftwf_plan_dft_1d(fftSize, &xFFT[k*fftSize], &xFFT[k*fftSize], FFTW_FORWARD, planType);
#else
        planFFT[k]= fftw_plan_dft_1d(fftSize, &xFFT[k*fftSize], &xFFT[k*fftSize], FFTW_FORWARD, planType);
#endif
    }

#ifdef OMP
#pragma omp parallel
#endif
    {
        int myID, myIDpos, pos, i, j;

#ifdef OMP
        myID=omp_get_thread_num();
#else
        myID=0;
#endif
        /* Start of this thread's slice inside xFFT. */
        myIDpos=myID*fftSize;

#ifdef OMP
#pragma omp for
#endif
        for(i=0; i<nFrames; i++)
        {
            /* First audio sample of frame i (hop = winSize-NoOverlap). */
            pos=i*(winSize-NoOverlap);

#pragma GCC ivdep
            for(j=0; j<winSize; j++) { xFFT[myIDpos+j]=audio[pos+j] * vHanning[j]; }

            /* Zero-pad the rest of the FFT input. */
            if(winSize<fftSize)
                memset(&xFFT[myIDpos+winSize], 0, sizeof(MyFFTcompl)*(fftSize-winSize));

#ifdef SIMPLE
            fftwf_execute(planFFT[myID]);
#else
            fftw_execute(planFFT[myID]);
#endif

            /* From here on pos is the output offset of frame i. */
            pos=i*rowsNMF;

#pragma GCC ivdep
            for(j=0; j<rowsNMF; j++)
            {
#ifdef SIMPLE
                rSNMF[pos+j]=cabsf(xFFT[myIDpos+j]);
#else
                rSNMF[pos+j]= cabs(xFFT[myIDpos+j]);
#endif
                cSNMF[pos+j]=xFFT[myIDpos+j];
            }
        }
    }

    for(k=0;k<maxThreads;k++)
    {
#ifdef SIMPLE
        fftwf_destroy_plan(planFFT[k]);
#else
        fftw_destroy_plan(planFFT[k]);
#endif
    }

#ifdef SIMPLE
    fftwf_free(xFFT);
    fftwf_free(planFFT);
#else
    fftw_free(xFFT);
    fftw_free(planFFT);
#endif

    return OK;
}


/*
 * Windowed short-time FFT that keeps only the magnitude spectrum.
 *
 * For each of the nFrames frames, winSize samples of `audio` starting at
 * i*(winSize-NoOverlap) are multiplied by vHanning, zero-padded to fftSize
 * and transformed in place with a forward complex DFT. The modulus of the
 * first rowsNMF bins of frame i is stored in sNMF at offset i*rowsNMF.
 *
 * One work buffer of fftSize complex values and one FFTW plan are created
 * per thread slot (maxThreads of them); planType is passed to FFTW as the
 * planner flags. Everything allocated here is released before returning.
 *
 * Returns OK.
 *
 * NOTE(review): the slot is selected with omp_get_thread_num(), so this
 * assumes the OpenMP team never has more than maxThreads threads --
 * confirm against the caller.
 */
int rHannFFT(const int winSize, const int nFrames, const int NoOverlap, const MyType *audio, const MyType *vHanning,
             const int rowsNMF, MyType *sNMF, const int fftSize, const int maxThreads, const int planType)
{
    int          k;
    MyFFTcompl   *xFFT=NULL;
    MyFFTCPUType *planFFT=NULL;


    /* Work buffers (one fftSize slice per thread slot) and the plan table. */
#ifdef SIMPLE
    CHECKNULL(xFFT   =(MyFFTcompl   *)fftwf_malloc(sizeof(MyFFTcompl)  *fftSize*maxThreads));
    CHECKNULL(planFFT=(MyFFTCPUType *)fftwf_malloc(sizeof(MyFFTCPUType)*maxThreads));
#else
    CHECKNULL(xFFT   =(MyFFTcompl   *)fftw_malloc(sizeof(MyFFTcompl)  *fftSize*maxThreads));
    CHECKNULL(planFFT=(MyFFTCPUType *)fftw_malloc(sizeof(MyFFTCPUType)*maxThreads));
#endif
    /* Plans are created serially, each one in place on its own slice. */
    for(k=0;k<maxThreads;k++)
    {
#ifdef SIMPLE
        planFFT[k]=fftwf_plan_dft_1d(fftSize, &xFFT[k*fftSize], &xFFT[k*fftSize], FFTW_FORWARD, planType);
#else
        planFFT[k]= fftw_plan_dft_1d(fftSize, &xFFT[k*fftSize], &xFFT[k*fftSize], FFTW_FORWARD, planType);
#endif
    }

#ifdef OMP
#pragma omp parallel
#endif
    {
        int myID, myIDpos, pos, i, j;

#ifdef OMP
        myID=omp_get_thread_num();
#else
        myID=0;
#endif
        /* Start of this thread's slice inside xFFT. */
        myIDpos=myID*fftSize;

#ifdef OMP
#pragma omp for
#endif
        for(i=0; i<nFrames; i++)
        {
            /* First audio sample of frame i (hop = winSize-NoOverlap). */
            pos=i*(winSize-NoOverlap);

#pragma GCC ivdep
            for(j=0; j<winSize; j++) { xFFT[myIDpos+j]=audio[pos+j] * vHanning[j]; }

            /* Zero-pad the rest of the FFT input. */
            if(winSize<fftSize)
                memset(&xFFT[myIDpos+winSize], 0, sizeof(MyFFTcompl)*(fftSize-winSize));

#ifdef SIMPLE
            fftwf_execute(planFFT[myID]);
#else
            fftw_execute(planFFT[myID]);
#endif

            /* From here on pos is the output offset of frame i. */
            pos=i*rowsNMF;

#pragma GCC ivdep
            for(j=0; j<rowsNMF; j++)
            {
#ifdef SIMPLE
                sNMF[pos+j]=cabsf(xFFT[myIDpos+j]);
#else
                sNMF[pos+j]= cabs(xFFT[myIDpos+j]);
#endif
            }
        }
    }

    for(k=0;k<maxThreads;k++)
    {
#ifdef SIMPLE
        fftwf_destroy_plan(planFFT[k]);
#else
        fftw_destroy_plan(planFFT[k]);
#endif
    }

#ifdef SIMPLE
    fftwf_free(xFFT);
    fftwf_free(planFFT);
#else
    fftw_free(xFFT);
    fftw_free(planFFT);
#endif

    return OK;
}


/*
 * Fills v[0..n-1] with rand()/RAND_MAX plus the precision-specific epsilon
 * (sEPS in SIMPLE builds, dEPS otherwise). Uses the C library generator,
 * so the sequence depends on the current srand() state.
 */
void randNMF(const int n, MyType *v)
{
    int          k;
    const MyType randMax=(MyType)RAND_MAX;

#pragma GCC ivdep
    for(k=0; k<n; k++)
    {
#ifdef SIMPLE
        v[k]=((MyType)rand() / randMax) + sEPS;
#else
        v[k]=((MyType)rand() / randMax) + dEPS;
#endif
    }
}


/*
 * Fills v[0..n-1] with genrand_real3() + 1.0 after re-seeding the
 * generator with the fixed seed 13, so every call yields the same sequence.
 */
void randNMFmt19937ar(const int n, MyType *v)
{
    int k;

    init_genrand(13);

#pragma GCC ivdep
    for(k=0; k<n; k++)
    {
#ifdef SIMPLE
        v[k]=(float)genrand_real3() + 1.0;  // Dangerous (original author's warning; presumably the narrowing to float)
#else
        v[k]=genrand_real3() + 1.0;
#endif
    }
}

/*
 * Normalises the bc columns of W (F contiguous values each) to unit
 * Euclidean norm and compensates in H: for column col, W[col*F .. col*F+F-1]
 * is scaled by 1/norm and the T values H[col], H[col+bc], ... are scaled
 * by norm. A zero-norm column is not guarded against.
 */
void normNMF(const int F, const int bc, const int T, MyType *W, MyType *H)
{
    int    col;
    MyType norm;
    MyType *wCol;

    for(col=0; col<bc; col++)
    {
        wCol=&W[col*F];
#ifdef SIMPLE
        norm=cblas_snrm2(F, wCol, 1);

        cblas_sscal(F, 1.0/norm, wCol,    1);
        cblas_sscal(T,     norm, &H[col], bc);
#else
        norm=cblas_dnrm2(F, wCol, 1);

        cblas_dscal(F, 1.0/norm, wCol,    1);
        cblas_dscal(T,     norm, &H[col], bc);
#endif
    }
}

/* Adds the precision-specific epsilon (sEPS in SIMPLE builds, dEPS
   otherwise) to each of the first n entries of v. */
void epsNMF(const int n, MyType *v)
{
    int k;

#pragma GCC ivdep
    for(k=0; k<n; k++)
    {
#ifdef SIMPLE
        v[k]+=sEPS;
#else
        v[k]+=dEPS;
#endif
    }
}

/*
 * Cost used to monitor the NMF iteration.
 *
 * First term: square root of the sum, over the F*T entries, of the squared
 * value of  s*log(s/y) + y - s  (s from sNMF, y from yNMF).
 * Second term: lambda times a norm of hNMF (bc rows, T columns, leading
 * dimension bc): LAPACKE's '1' norm when LAPACKE is defined, otherwise the
 * largest plain column sum, floored at FLT_MIN/DBL_MIN.
 */
MyType distNMF(const int F, const int T, const int bc, MyType *sNMF, MyType *yNMF, MyType *hNMF)
{
    int idx;
    MyType term, acc=0.0;

#ifndef LAPACKE
    int row, base;
#ifdef SIMPLE
    float  colMax=FLT_MIN;
#else
    double colMax=DBL_MIN;
#endif
#endif

#ifdef OMP
#pragma omp parallel for reduction(+:acc) private(term)
#endif
    for(idx=0; idx<F*T; idx++)
    {
#ifdef SIMPLE
        term = sNMF[idx] * logf(sNMF[idx]/yNMF[idx]) + yNMF[idx] - sNMF[idx];
#else
        term = sNMF[idx] *  log(sNMF[idx]/yNMF[idx]) + yNMF[idx] - sNMF[idx];
#endif
        acc+=term*term;
    }

#ifdef LAPACKE
    #ifdef SIMPLE
      acc=sqrtf(acc) + lambda*LAPACKE_slange(CblasColMajor,'1', bc, T, hNMF, bc);
    #else
      acc=sqrt(acc)  + lambda*LAPACKE_dlange(CblasColMajor,'1', bc, T, hNMF, bc);
    #endif
    return acc;
#else
    /* Largest column sum of hNMF, computed by hand. */
#ifdef OMP
#pragma omp parallel for private(term, base, row) reduction(max: colMax)
#endif
    for(idx=0; idx<T; idx++)
    {
        term=0.0; base=idx*bc;
        for(row=0; row<bc; row++)
            term += hNMF[base+row];
        if (term > colMax) { colMax=term; }
    }
    return sqrt(acc)+lambda*colMax;
#endif
}


/*
 * Unsupervised NMF: factorises the F x T matrix sNMF as wNMF (F x bc)
 * times hNMF (bc x T) with multiplicative updates. All matrices are
 * column-major.
 *
 *   sNMF  in:  matrix to factorise (F x T)
 *   wNMF  out: bases (F x bc), randomly initialised here
 *   hNMF  out: activations (bc x T), randomly initialised here
 *   yNMF  work/out: current reconstruction wNMF*hNMF (F x T)
 *   rept  out: number of iterations executed
 *
 * Iterates at most maxIter times and stops early when the decrease of
 * distNMF() between two iterations is below eta times the current value.
 * With DEPURA defined, wNMF and hNMF are dumped to "wNMF.dat"/"hNMF.dat".
 * Returns OK.
 */
int uNMF(const int F, const int T, const int bc, MyType *sNMF, MyType *wNMF, MyType *hNMF, MyType *yNMF, int *rept)
{
#ifdef DEPURA
    FILE *fdepura;
#endif

    int
            i, j, k, idx, endNMF=0;

    /* xlambda: the global lambda scaled by F/bc; added to the H-update denominator. */
    MyType
            *vONES=NULL, *vDeno=NULL, *mTemp, xlambda=lambda*((MyType)F/(MyType)bc);

#ifdef SIMPLE
    float  dklpre=FLT_MAX, dklnow;
#else
    double dklpre=DBL_MAX, dklnow;
#endif


    /* vONES: vector of ones, vDeno: per-component denominators,
       mTemp: scratch big enough for an F x bc or a bc x T matrix. */
    CHECKNULL(vONES=(MyType *)malloc(max(F,T)   * sizeof(MyType)));
    CHECKNULL(vDeno=(MyType *)malloc(bc         * sizeof(MyType)));
    CHECKNULL(mTemp=(MyType *)malloc(bc*max(F,T)* sizeof(MyType)));

    int a;
    #pragma GCC ivdep
    for(a=0; a<max(F,T);a++){
        vONES[a]=1.0;
    }


    /* Random starting point. */
    randNMF(F*bc, wNMF);
    randNMF(bc*T, hNMF);


    normNMF(F, bc, T, wNMF, hNMF);


    /* Initial reconstruction: yNMF = wNMF * hNMF. */
#ifdef SIMPLE
    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, F, T, bc, 1.0, wNMF, F, hNMF, bc, 0.0, yNMF, F);
#else
    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, F, T, bc, 1.0, wNMF, F, hNMF, bc, 0.0, yNMF, F);
#endif


    i=0;
    while ((i<maxIter) && (!endNMF))
    {

        /* ---- Update of wNMF ---- */
        /* vDeno = hNMF * ones(T): the row sums of hNMF. */
#ifdef SIMPLE
        cblas_sgemv(CblasColMajor, CblasNoTrans, bc, T, 1.0, hNMF, bc, vONES, 1, 0.0, vDeno, 1);
#else
        cblas_dgemv(CblasColMajor, CblasNoTrans, bc, T, 1.0, hNMF, bc, vONES, 1, 0.0, vDeno, 1);
#endif

        /* yNMF is overwritten with the ratio sNMF ./ (yNMF + eps). */
#ifdef OMP
#pragma omp parallel for simd
#endif
        for(j=0; j<F*T; j++)
#ifdef SIMPLE
            yNMF[j]=sNMF[j] / (yNMF[j] +sEPS);
#else
                yNMF[j]=sNMF[j] / (yNMF[j] +dEPS);
#endif

        /* mTemp (F x bc) = ratio * hNMF^T. */
#ifdef SIMPLE
        cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, F, bc, T, 1.0, yNMF, F, hNMF, bc, 0.0, mTemp, F);
#else
        cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, F, bc, T, 1.0, yNMF, F, hNMF, bc, 0.0, mTemp, F);
#endif

        /* Column j of wNMF is multiplied by mTemp and divided by vDeno[j]+eps. */
#ifdef OMP
#pragma omp parallel for private(idx, k)
#endif
        for(j=0; j<bc; j++)
        {
            idx=j*F;
#pragma GCC ivdep
            for(k=0; k<F; k++)
            {
#ifdef SIMPLE
                wNMF[idx+k] = wNMF[idx+k] * mTemp[idx+k] / (vDeno[j]+sEPS);
#else
                wNMF[idx+k] = wNMF[idx+k] * mTemp[idx+k] / (vDeno[j]+dEPS);
#endif

            }
        }

        /* Re-normalise and refresh the reconstruction with the new wNMF. */
        normNMF(F, bc, T, wNMF, hNMF);
#ifdef SIMPLE
        cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, F, T, bc, 1.0, wNMF, F, hNMF, bc, 0.0, yNMF, F);
#else
        cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, F, T, bc, 1.0, wNMF, F, hNMF, bc, 0.0, yNMF, F);
#endif

        /* ---- Update of hNMF ---- */
        /* vDeno = wNMF^T * ones(F): the column sums of wNMF ... */
#ifdef SIMPLE
        cblas_sgemv(CblasColMajor, CblasTrans, F, bc, 1.0, wNMF, F, vONES, 1, 0.0, vDeno, 1);
#else
        cblas_dgemv(CblasColMajor, CblasTrans, F, bc, 1.0, wNMF, F, vONES, 1, 0.0, vDeno, 1);
#endif
        /* ... plus xlambda. */
#ifdef OMP
#pragma omp parallel for simd
#endif
        for(j=0; j<bc; j++) { vDeno[j] = vDeno[j]+xlambda; }

        /* yNMF is overwritten with the ratio sNMF ./ (yNMF + eps). */
#ifdef OMP
#pragma omp parallel for simd
#endif
        for(j=0; j<F*T; j++)
#ifdef SIMPLE
            yNMF[j]=sNMF[j] / (yNMF[j] +sEPS);
#else
                yNMF[j]=sNMF[j] / (yNMF[j] +dEPS);
#endif

        /* mTemp (bc x T) = wNMF^T * ratio. */
#ifdef SIMPLE
        cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, bc, T, F, 1.0, wNMF, F, yNMF, F, 0.0, mTemp, bc);
#else
        cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, bc, T, F, 1.0, wNMF, F, yNMF, F, 0.0, mTemp, bc);
#endif

        /* Row k of hNMF is multiplied by mTemp and divided by vDeno[k]. */
#ifdef OMP
#pragma omp parallel for private(idx, k)
#endif
        for(j=0; j<T; j++)
        {
            idx=j*bc;
#pragma GCC ivdep
            for(k=0; k<bc; k++)
            {
                hNMF[idx+k] = (hNMF[idx+k] * mTemp[idx+k]) / vDeno[k];

            }
        }

        /* Reconstruction with the updated factors. */
#ifdef SIMPLE
        cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, F, T, bc, 1.0, wNMF, F, hNMF, bc, 0.0, yNMF, F);
#else
        cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, F, T, bc, 1.0, wNMF, F, hNMF, bc, 0.0, yNMF, F);
#endif

        dklnow=distNMF(F, T, bc, sNMF, yNMF, hNMF);

        /* Stop once the improvement is below eta times the current cost. */
        if ((dklpre-dklnow) < (dklnow*eta)) { endNMF=1; } else { dklpre=dklnow; }

        i++;
    }
    *rept=i;

    /* Debug dump: dimensions followed by the raw matrix values. */
#ifdef DEPURA
    CHECKNULL(fdepura=fopen("wNMF.dat", "wb"));
    fwrite(&F,   sizeof(int),       1, fdepura);
    fwrite(&bc,  sizeof(int),       1, fdepura);
    fwrite(wNMF, sizeof(MyType), F*bc, fdepura);
    CHECKERR(fclose(fdepura));

    CHECKNULL(fdepura=fopen("hNMF.dat", "wb"));
    fwrite(&bc,  sizeof(int),       1, fdepura);
    fwrite(&T,   sizeof(int),       1, fdepura);
    fwrite(hNMF, sizeof(MyType), bc*T, fdepura);
    CHECKERR(fclose(fdepura));
#endif

    free(vONES);  free(vDeno);  free(mTemp);
    return OK;
}

/*
 * Onset detection on `audio` from its spectral flux, followed by the
 * marking of the matching frames of a second ("Mst") time grid.
 *
 *   nSamples    number of samples in audio
 *   winSMst     window size of the Mst grid (selects how many neighbouring
 *               Mst frames are marked around each onset)
 *   audio       input samples
 *   TsMst       time stamp of each Mst frame
 *   Times       out: newly calloc'ed vector; (*Times)[0] holds the number
 *               of onset times stored in (*Times)[1..]. Caller frees it.
 *   Patrones    out: entries covering a detected onset are set to 1
 *   maxThreads, planType  forwarded to rHannFFT()
 *
 * Returns OK.
 *
 * NOTE(review): Peaks[1], Peaks[2], Ts[idx1+1] and Ts[idx2+1] are read
 * without checking that AMbPD() found enough peaks or that the indices are
 * inside [0, nFrames); confirm the inputs guarantee it.
 */
int HSD(const int nSamples, const int winSMst, const MyType *audio, const MyType *TsMst, MyType **Times,
        int *Patrones, const int maxThreads, const int planType)
{
    int          winSize, fftSize, CxSize, NoOverlap, NoOverMst, nFrames, nFraMst, nFrecCP, *Peaks=NULL;
    int          i, j, pos1, pos2, idx1=0, idx2=0;
    MyType       *vHanning=NULL, *sNMF=NULL, *Ts=NULL, *R=NULL, *Cx=NULL;
    MyType       max1, max2, dtmp, perEsti;

    /* Analysis geometry derived from the global settings. */
    winSize  = freqHz*secondsHSD;
    fftSize  = nextPow2(winSize);
    nFrecCP  = ceil((freqCut*fftSize)/freqHz);
    NoOverlap= round(fftSize*overlap);
    NoOverMst= round(nextPow2(winSMst)*overlap);
    nFrames  = floor((nSamples-NoOverlap)/(winSize - NoOverlap));
    nFraMst  = floor((nSamples-NoOverMst)/(winSMst - NoOverMst));
    CxSize   = 2*nFrames-1;

    CHECKNULL(vHanning=(MyType *)calloc(winSize,         sizeof(MyType)));
    CHECKNULL(      Ts=(MyType *)calloc(nFrames,         sizeof(MyType)));
    CHECKNULL(       R=(MyType *)calloc(nFrames,         sizeof(MyType)));
    CHECKNULL(   Peaks=(int    *)calloc(nFrames,         sizeof(int)));
    CHECKNULL(      Cx=(MyType *)calloc(CxSize,          sizeof(MyType)));
    CHECKNULL(    sNMF=(MyType *)calloc(nFrecCP*nFrames, sizeof(MyType)));


    /* Magnitude spectrogram restricted to the first nFrecCP bins. */
    CHECKERR(hanning(winSize, vHanning));
    CHECKERR(rHannFFT(winSize, nFrames, NoOverlap, audio, vHanning, nFrecCP, sNMF, fftSize, maxThreads, planType));

    /* R[i]: half-wave rectified sum of the spectral difference between
       frame i and frame i-1. The frame offset is derived from i itself; it
       used to be a running counter made firstprivate, which under OMP2
       made every thread but the one owning the first chunk start from the
       wrong frame. */
#ifdef OMP2
#pragma omp parallel for private(pos1, pos2, dtmp, j) // is the overhead larger than the time saved?
#endif
    for(i=1;i<nFrames;i++)
    {
        pos1=i*nFrecCP;  pos2=pos1-nFrecCP;  dtmp=0.0;
        for(j=0; j<nFrecCP; j++)
            dtmp+=sNMF[pos1+j]-sNMF[pos2+j];


#ifdef SIMPLE
        R[i]=(dtmp+fabsf(dtmp))/2.0;
#else
        R[i]=(dtmp+ fabs(dtmp))/2.0;
#endif
    }

    /* Autocorrelation of R; only the part from &Cx[nFrames] on is used below. */
    xcorr(CxSize, Cx, nFrames, R);

    CHECKERR(AMbPD(nFrames, &Cx[nFrames], Peaks));

    /* Largest autocorrelation value from the first detected peak onwards. */
    max1=Cx[nFrames+Peaks[1]];
#ifdef OMP
#pragma omp parallel for reduction(max: max1)
#endif
    for(i=nFrames+Peaks[1]+1;i<CxSize;i++)
        if (Cx[i]>max1) max1=Cx[i];
    idx1=buscarFirstIgual(nFrames, &Cx[nFrames], max1);

    /* Largest value located after that maximum. */
    max2=Cx[nFrames+idx1+1];
#ifdef OMP
#pragma omp parallel for reduction(max: max2)
#endif
    for(i=nFrames+idx1+2;i<CxSize;i++)
        if (Cx[i]>max2) max2=Cx[i];
    idx2=buscarFirstIgual(nFrames, &Cx[nFrames], max2);

    /* Time stamp of every frame: frame index times the hop in seconds. */
    dtmp=(MyType)(winSize-NoOverlap)/(MyType)freqHz;
#ifdef OMP2
#pragma omp parallel for simd  // is the overhead larger than the time saved?
#else
#pragma GCC ivdep
#endif
    for(i=0;i<nFrames;i++)
        Ts[i]=i*dtmp;


    /* Estimated period: taken from the second maximum only when it lies
       later than the first, before 1.0 and more than 0.4 after it. */
    if ((Ts[idx2]>Ts[idx1]) && (Ts[idx2]<1.0) && ((Ts[idx2]-Ts[idx1])>0.4))
        perEsti=Ts[idx2+1];
    else
        perEsti=Ts[idx1+1];

    /* Onsets: peaks of R itself. */
    CHECKERR(AMbPD(nFrames, R, Peaks));

    CHECKNULL((*Times)=(MyType *)calloc(Peaks[0]+2, sizeof(MyType)));

    /* When a full period fits before the first (or, failing that, the
       second) onset, one extra onset is inserted at the front. */
    max1=Ts[Peaks[1]]-perEsti;
    if (max1 > 0)
    {
        (*Times)[1]=max1;
        for(i=1; i<=Peaks[0]; i++)
            (*Times)[i+1]=Ts[Peaks[i]];
        (*Times)[0]=Peaks[0]+1;
        idx1=buscarFirstMayorIgual(nFrames, Ts, max1);
        memmove(&Peaks[2], &Peaks[1], sizeof(int)*Peaks[0]);
        Peaks[0]++;
        Peaks[1]=idx1-1;
    }
    else
    {
        max1=Ts[Peaks[2]]-perEsti;
        if ((max1 > 0) && (max1<Ts[Peaks[1]+1]))
        {
            (*Times)[1]=max1;
            for(i=1; i<=Peaks[0]; i++)
                (*Times)[i+1]=Ts[Peaks[i]];
            (*Times)[0]=Peaks[0]+1;
            idx1=buscarFirstMayor(nFrames, Ts, max1);
            memmove(&Peaks[2], &Peaks[1], sizeof(int)*Peaks[0]);
            Peaks[0]++;
            Peaks[1]=idx1-1;
        }
        else
        {
            for(i=1; i<=Peaks[0]; i++)
                (*Times)[i]=Ts[Peaks[i]];
            (*Times)[0]=Peaks[0];
        }
    }

    /* Mark, on the Mst grid, the frames around each onset; the width of
       the marked range depends on winSMst. */
#ifdef OMP2
#pragma omp parallel for private(idx1, idx2, pos1)
#endif
    for(i=1;i<=Peaks[0];i++)
    {

        idx1=buscarLastMenorIgual(nFraMst, TsMst, Ts[Peaks[i]]);

        if (idx1<0) { idx1=2; idx2=0; } else { idx2=idx1+1; }

        switch (winSMst)
        {
            case  512:
                if(idx1>0) { idx1=idx1-1; idx2=idx2+2; } else { idx1=0; idx2=idx2+1; }
                break;
            case 1024:
                if(idx1>0) { idx1=idx1-1; idx2=idx2+1; } else { idx1=0; idx2=idx2+1; }
                break;
            case 2048:
                if(idx1>0) { idx1=idx1-1;              } else { idx1=0; }
                break;
            default:
                if(idx1>0) { idx1=idx1-1; idx2=idx2+2; } else { idx1=0; idx2=idx2+1; }
        }

        pos1=idx1;
        while(pos1<=idx2)
        {
            Patrones[pos1]=1;
            pos1++;
        }
    }
    free(vHanning); free(sNMF); free(Ts); free(R); free(Peaks); free(Cx);

    return OK;
}

int AMbPD(const int n, const MyType *R, int *Peaks)
{
    int    i, k, m, count;
    MyType *LSM=NULL, *vONES=NULL, *G=NULL;

    m=ceil(n/2.0)-1;

    CHECKNULL(  LSM=(MyType *)calloc(m*n, sizeof(MyType)));
    CHECKNULL(vONES=(MyType *)malloc(n  * sizeof(MyType)));
    CHECKNULL(    G=(MyType *)calloc(n  , sizeof(MyType)));

    int a;
    #pragma GCC ivdep
    for(a=0; a<n;a++){
        vONES[a]=1.0;
    }
    randNMFmt19937ar(m*n, LSM);

    for(k=0; k<m; k++)
        for(i=k+1; i<n-k-1; i++)
            if ((R[i]>R[i-1-k]) && (R[i]>R[i+1+k])) LSM[k+(i+1)*m]=0.0;

#ifdef SIMPLE
    cblas_sgemv(CblasColMajor, CblasNoTrans, m, n, 1.0, LSM, m, vONES, 1, 0.0, G, 1);
    k=max(cblas_isamin(m, G, 1)+1, 2);
#else
    cblas_dgemv(CblasColMajor, CblasNoTrans, m, n, 1.0, LSM, m, vONES, 1, 0.0, G, 1);
    k=max(cblas_idamin(m, G, 1)+1, 2);
#endif

    count=0;
#ifdef OMP3
#pragma omp parallel
#endif
    {
        int pos, j;
        MyType   dtmp, dtmp1, media;
#ifdef OMP3
#pragma omp for
#endif
        for(i=0;i<n;i++)
        {
            dtmp=0.0;
            pos =i*m;
            for(j=0;j<k;j++)
                dtmp+=LSM[pos+j];
            media=dtmp/k;

            dtmp =0.0;
            for(j=0;j<k;j++)
            {
                dtmp1=LSM[pos+j]-media;
                dtmp+=dtmp1*dtmp1;
            }

            if (sqrt(dtmp/(k-1.0))==0)
            {
#ifdef OMP3
#pragma omp critical (contador)
#endif
                {
                    count++;
                    Peaks[count]=i-1;
                }
            }
        }
    }
    Peaks[0]=count;
#ifdef OMP3
    qsort(&Peaks[1], count, sizeof(int), compara);
#endif


    free(LSM); free(vONES); free(G);

    return OK;
}


/*
 * Rebuilds a signal from the first component of an NMF model.
 *
 * For every frame the N-bin spectrum wNMF[0..N-1] * hNMF[frame*bc] is
 * stored in Mat (N values per frame) and also written, mirrored into bins
 * M-1 ... M-N+2, into xIFFT; planIFFT is then executed and the real part of
 * the result, divided by M, is written to dest at offset
 * frame*(winSize-NoOverlap). From the second frame on, the first winSize/2
 * output samples are added to what dest already holds and divided by
 * vHanning[j]+vHanning[stride+j]. The first frame is always processed.
 *
 * Returns 0.
 */
int invSpectroExtended(const int N, const int nFrames, const int bc, const int winSize, const int M,
                       const int NoOverlap, const MyType *vHanning, const MyType *wNMF, const MyType *hNMF,
                       MyType *Mat, MyFFTcompl *dest, MyFFTcompl *xIFFT, MyFFTCPUType planIFFT)
{
    int          frame, bin, t, matBase, actPos, hop, outPos;
    MyType       value;
    const MyType scale=(MyType)M;


    /* ---- first frame ---- */
    value=wNMF[0]*hNMF[0];
    xIFFT[0]=value;
    Mat[0]  =value;

    value=wNMF[N-1]*hNMF[0];
    xIFFT[N-1]=value;
    Mat[N-1]  =value;

#pragma GCC ivdep
    for(bin=1;bin<N-1;bin++)
    {
        value=wNMF[bin]*hNMF[0];
        xIFFT[M-bin]=value;
        xIFFT[bin]  =value;
        Mat[bin]    =value;
    }

#ifdef SIMPLE
    fftwf_execute(planIFFT);
#else
    fftw_execute(planIFFT);
#endif

#pragma GCC ivdep
    for(t=0; t<winSize;t++)
        dest[t]=creal(xIFFT[t])/scale;


    /* ---- remaining frames, overlapped onto the previous output ---- */
    hop=winSize-NoOverlap;
    for(frame=1, actPos=bc, matBase=N; frame<nFrames; frame++, actPos+=bc, matBase+=N)
    {
        value=wNMF[0]*hNMF[actPos];
        xIFFT[0]    =value;
        Mat[matBase]=value;

        value=wNMF[N-1]*hNMF[actPos];
        xIFFT[N-1]      =value;
        Mat[matBase+N-1]=value;

#pragma GCC ivdep
        for(bin=1;bin<N-1;bin++)
        {
            value=wNMF[bin]*hNMF[actPos];
            xIFFT[M-bin]    =value;
            xIFFT[bin]      =value;
            Mat[matBase+bin]=value;
        }

#ifdef SIMPLE
        fftwf_execute(planIFFT);
#else
        fftw_execute(planIFFT);
#endif

        outPos=frame*hop;

        /* First half: combine with the tail already written by the previous frame. */
#pragma GCC ivdep
        for(t=0; t<winSize/2;t++)
            dest[outPos+t]=(creal(dest[outPos+t])+creal(xIFFT[t])/scale) / (vHanning[t]+vHanning[hop+t]);

        /* Second half: plain copy. */
#pragma GCC ivdep
        for(t=winSize/2;t<winSize;t++)
            dest[outPos+t]=creal(xIFFT[t])/scale;
    }

    return 0;
}


/*
 * Rebuilds a signal from a complex spectrogram `src` (N bins per frame).
 *
 * For every frame the N bins are copied into xIFFT and their conjugates
 * are written to bins M-1 ... M-N+2; planIFFT is then executed and the real
 * part of the result, divided by M, is written to dest at offset
 * frame*(winSize-NoOverlap). From the second frame on, the first winSize/2
 * output samples are added to what dest already holds and divided by
 * vHanning[j]+vHanning[stride+j]. The first frame is always processed.
 *
 * Returns 0.
 */
int invSpectro(const int N, const int nFrames, const int winSize, const int M, const int NoOverlap,
               const MyType *vHanning, const MyFFTcompl *src, MyFFTcompl *dest, MyFFTcompl *xIFFT, MyFFTCPUType planIFFT)
{
    int          frame, bin, t, base, hop, outPos;
    const MyType scale=(MyType)M;

    /* ---- first frame ---- */
    xIFFT[0]  =src[0];
    xIFFT[N-1]=src[N-1];

#pragma GCC ivdep
    for(bin=1;bin<N-1;bin++)
    {
        xIFFT[bin]  =src[bin];
        xIFFT[M-bin]=conj(src[bin]);
    }

#ifdef SIMPLE
    fftwf_execute(planIFFT);
#else
    fftw_execute(planIFFT);
#endif

#pragma GCC ivdep
    for(t=0; t<winSize;t++)
        dest[t]=creal(xIFFT[t])/scale;


    /* ---- remaining frames, overlapped onto the previous output ---- */
    hop=winSize-NoOverlap;
    for(frame=1, base=N; frame<nFrames; frame++, base+=N)
    {
        xIFFT[0]  =src[base];
        xIFFT[N-1]=src[base+N-1];

#pragma GCC ivdep
        for(bin=1;bin<N-1;bin++)
        {
            xIFFT[bin]  =src[base+bin];
            xIFFT[M-bin]=conj(src[base+bin]);
        }

#ifdef SIMPLE
        fftwf_execute(planIFFT);
#else
        fftw_execute(planIFFT);
#endif

        outPos=frame*hop;

        /* First half: combine with the tail already written by the previous frame. */
#pragma GCC ivdep
        for(t=0; t<winSize/2;t++)
            dest[outPos+t]=(creal(dest[outPos+t])+creal(xIFFT[t])/scale) / (vHanning[t]+vHanning[hop+t]);

        /* Second half: plain copy. */
#pragma GCC ivdep
        for(t=winSize/2;t<winSize;t++)
            dest[outPos+t]=creal(xIFFT[t])/scale;
    }

    return 0;
}


/*
 * Correlation coefficient between a binarised activation row and the
 * integer pattern Y over nFrames frames.
 *
 * X receives hNMF[0], hNMF[bc], hNMF[2*bc], ... and is then overwritten
 * with 1 where the value exceeds its mean and 0 elsewhere. The returned
 * value is the sample covariance of X and Y divided by the product of
 * their sample standard deviations (both using nFrames-1); it is not
 * finite when either deviation is zero.
 */
MyType corrPatron(const int bc, const int nFrames, const MyType *hNMF, const int *Y, MyType *X)
{
    MyType cov, level, meanX, meanY, devX, devY;
    int    t;

    /* Copy the strided activations and compute their mean (the threshold). */
    level=0;
    for(t=0; t<nFrames; t++)
    {
        X[t]=hNMF[t*bc];
        level+=X[t];
    }
    level=level/(MyType)nFrames;

    /* Binarise X against the threshold while accumulating both means. */
    meanX=meanY=0;
    for(t=0; t<nFrames; t++)
    {
        meanY+=Y[t];
        if (X[t] > level)
        {
            X[t]=1;
            meanX++;
        }
        else
        {
            X[t]=0;
        }
    }

    meanX=meanX/(MyType)nFrames;
    meanY=meanY/(MyType)nFrames;

    devX=devY=cov=0;
    level=meanX*meanY;    /* reused: product of the means */
    for(t=0; t<nFrames; t++)
    {
        devX+=(X[t]-meanX)*(X[t]-meanX);
        devY+=(Y[t]-meanY)*(Y[t]-meanY);
        cov +=(MyType)(X[t]*Y[t]) - level;
    }

    cov/=(MyType)(nFrames-1);
#ifdef SIMPLE
    devX=sqrtf(devX/(MyType)(nFrames-1));
    devY=sqrtf(devY/(MyType)(nFrames-1));
#else
    devX=sqrt(devX/(MyType)(nFrames-1));
    devY=sqrt(devY/(MyType)(nFrames-1));
#endif

    return cov / (devX*devY);

}

/*
 * Spectral roll-off: the frequency below which the cumulative squared
 * magnitude of X first exceeds threshold times the energy of the first
 * n/2 bins.
 *
 *   n          FFT length (only bins 0 .. n/2-1 are inspected)
 *   X          complex spectrum
 *   threshold  fraction of the energy to reach
 *
 * Returns (bin count * freqHz) / n, or 0 when n <= 0.
 *
 * The accumulation is bounded by n/2. It was previously unbounded and ran
 * past the end of X when the energy was zero or threshold >= 1; for a
 * non-zero energy and threshold < 1 the result is unchanged.
 */
MyType RollOff(const int n, const MyFFTcompl *X, const MyType threshold)
{
    const int half=n/2;
    int    i;
    MyType Energy=0, dtmp1, dtmp2;

    if (n<=0) return 0;

    for(i=0;i<half;i++)
    {
        dtmp1  =cabs(X[i]);
        Energy+=dtmp1*dtmp1;
    }

    i=0;  dtmp1=0;
    while ((i<half) && (dtmp1<=(Energy*threshold)))
    {
        dtmp2 =cabs(X[i]);
        dtmp1+=dtmp2*dtmp2;
        i++;
    }

    return (((MyType)i*freqHz)/(MyType)n);
}


/*
 * Largest cosine similarity between the vector W (rowsNMF values) and the
 * colsBases columns of Bases (column-major, rowsNMF rows).
 *
 * SIM (colsBases values) receives the raw dot products W . Bases[:,col].
 * The running maximum starts at FLT_MIN/DBL_MIN (the smallest positive
 * normal value), so that is what is returned when no similarity exceeds it.
 */
MyType corrCos(const int rowsNMF, const int colsBases, const MyType *W, const MyType *Bases, MyType *SIM)
{
    int    col;
    MyType normW, cosine;

#ifdef SIMPLE
    float  best=FLT_MIN;
#else
    double best=DBL_MIN;
#endif

    /* SIM = W^T * Bases as a 1 x colsBases product, plus the norm of W. */
#ifdef SIMPLE
    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, 1, colsBases, rowsNMF, 1.0, W, 1, Bases, rowsNMF, 0.0, SIM, 1);
    normW=cblas_snrm2(rowsNMF, W, 1);
#else
    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, 1, colsBases, rowsNMF, 1.0, W, 1, Bases, rowsNMF, 0.0, SIM, 1);
    normW=cblas_dnrm2(rowsNMF, W, 1);
#endif

    for(col=0; col<colsBases; col++)
    {
#ifdef SIMPLE
        cosine=SIM[col] / (normW*cblas_snrm2(rowsNMF, &Bases[col*rowsNMF], 1));
#else
        cosine=SIM[col] / (normW*cblas_dnrm2(rowsNMF, &Bases[col*rowsNMF], 1));
#endif
        best=max(cosine, best);
    }

    return best;
}


/*
 * Splits the complex spectrogram cSNMF (n values) into two parts with
 * power-ratio masks built from the two magnitude estimates:
 *     Xc[i] = XcSpec[i]^2 / (XcSpec[i]^2 + XpSpec[i]^2) * cSNMF[i]
 *     Xp[i] = XpSpec[i]^2 / (XcSpec[i]^2 + XpSpec[i]^2) * cSNMF[i]
 * A zero denominator is not guarded against.
 */
void ReconCorazon(const int n, MyFFTcompl *Xc, MyFFTcompl *Xp, const MyFFTcompl *cSNMF, const MyType *XcSpec, const MyType *XpSpec)
{
    int idx;
    MyType powC, powP, total;

#ifdef OMP
#pragma omp parallel for private(powC, powP, total)
#endif
    for(idx=0; idx<n; idx++)
    {
        powC =XcSpec[idx]*XcSpec[idx];
        powP =XpSpec[idx]*XpSpec[idx];
        total=powC+powP;

        Xc[idx]=(powC/total)*cSNMF[idx];
        Xp[idx]=(powP/total)*cSNMF[idx];
    }
}


/*
 * Fills v[0..n-1] with n evenly spaced values from left to right, both
 * ends included. The interior values are built by repeatedly adding the
 * step to the previous one; the last one is set to `right` exactly.
 *
 *   n <= 0  nothing is written (previously wrote v[0] and v[n-1], i.e.
 *           outside the vector)
 *   n == 1  v[0] = right, the value the previous code ended up storing,
 *           but without the division by n-1 == 0
 */
void linspace(const int left, const int right, const int n, MyType *v)
{
    int i;
    MyType stride;

    if (n<=0) return;

    if (n==1) { v[0]=(MyType)right; return; }

    stride=(MyType)(right-left) / (MyType)(n-1);

    v[0]=(MyType)left; v[n-1]=(MyType)right;

    for(i=1; i<n-1; i++) v[i]=v[i-1]+stride;

}


/*
 * Piecewise-linear interpolation of the N samples (x[i], y[i]) at the M
 * query points Vx; results go to Vy.
 *
 * The binary search compares against x with >=, so it expects x in
 * ascending order. Queries outside [x[0], x[N-1]] (and NaN queries) leave
 * the matching Vy entry untouched.
 */
void interpl(const int N, const int M, const double *x, const double *y, const double *Vx, double *Vy)
{
    double frac;
    int lo, hi, mid, k;

#pragma omp parallel for schedule(guided) private(lo, hi, mid, frac)
    for (k=0; k<M; k++)
    {
        if (!((Vx[k] <= x[N-1]) && (x[0] <= Vx[k]))) continue;

        /* 1-based bracket search: on exit x[lo-1] <= Vx[k] and the
           segment used is [x[lo-1], x[lo]]. */
        lo = 1;
        hi = N;
        while (hi > lo + 1)
        {
            mid = (lo + hi) >> 1;
            if (Vx[k] >= x[mid - 1])
                lo = mid;
            else
                hi = mid;
        }

        /* Relative position of the query inside the segment. */
        frac = (Vx[k] - x[lo - 1]) / (x[lo] - x[lo - 1]);

        if (frac == 0.0)
            Vy[k] = y[lo - 1];
        else if (frac == 1.0)
            Vy[k] = y[lo];
        else if (y[lo - 1] == y[lo])
            Vy[k] = y[lo - 1];
        else
            Vy[k] = (1.0 - frac) * y[lo - 1] + frac * y[lo];
    }
}


/*
 * Raw autocorrelation of x together with its lag axis.
 * length(Cx)=length(La)=n, length(x)=m, with n=2m-1.
 *
 * For every lag in [0, m) the sum of x[t]*x[t+lag] is stored at
 * Cx[m-1-lag] and Cx[m-1+lag]; La[i] receives the lag value i-(m-1).
 */
void xcorr2(const int n, const int m, const MyType *x, MyType *Cx, MyType *La)
{
    int lag, t;
    MyType acc;

#ifdef OMP
#pragma omp parallel for private(acc,t)
#endif
    for(lag=0; lag<m; lag++)
    {
        acc=0.0;
        for(t=0; t<(m-lag); t++)
            acc += x[t] * x[t+lag];

        Cx[m-lag-1]=acc;
        Cx[m+lag-1]=acc;
    }

    for(lag=0; lag<n; lag++)
        La[lag]=(MyType)(lag-(m-1));
}

/*
 * Heart-rate estimate, option 1: counts the local maxima of the per-frame
 * energy of Xc that exceed 0.4 and converts the count to beats per minute.
 *
 *   rowsNMF  number of bins per frame in Xc
 *   nFrames  number of frames
 *   Xc       complex spectrogram, rowsNMF values per frame
 *   fftSize  FFT length, used with the globals overlap and freqHz to get
 *            the time stamp of the last frame
 *
 * Returns the estimate truncated to int, or 0 when nFrames <= 0.
 *
 * Changes with respect to the previous version: the work buffer is now
 * released (it leaked on every call), the time vector is no longer
 * allocated because only its last entry was read, the SIMPLE build no
 * longer references an undeclared array, and nFrames <= 0 is rejected
 * instead of indexing entry -1.
 *
 * NOTE(review): s/2 is an integer division, so an odd peak count is
 * rounded down before scaling; kept as it was -- confirm it is intended.
 */
int BMP1(const int rowsNMF, const int nFrames,const MyFFTcompl *Xc, const int fftSize){
    double *Xc2=NULL;
    double duration;
    MyType dtmp;
    int    i, j, pos1=0, s=0, bpm;

    if (nFrames<=0) return 0;

    CHECKNULL(      Xc2=(double    *)fftw_malloc(sizeof(double)   * nFrames));

    /* Time stamp of the last frame (same formula the time vector used). */
    duration=nFrames*fftSize*overlap/freqHz;

    /* Per-frame energy: sum over the bins of |Xc^2|. */
    for(i=0;i<nFrames;i++)
    {
        dtmp=0.0;
        for(j=0; j<rowsNMF; j++)
            dtmp+=cabs(Xc[pos1+j]*Xc[pos1+j]);

        Xc2[i]=dtmp;
        pos1+=rowsNMF;
    }

    /* Strict local maxima above the fixed level 0.4. */
    for(i=1;i<nFrames-1;i++){
        if(Xc2[i-1]<Xc2[i] && Xc2[i+1]<Xc2[i] && Xc2[i]>0.4){
            s=s+1;
        }
    }

    bpm=s/2/duration*60;

    fftw_free(Xc2);

    return bpm;
}

/*
 * Heart-rate estimate, option 2: averages the time between consecutive
 * local maxima of the per-frame energy of Xc that exceed 2.5 and converts
 * that period to beats per minute.
 *
 *   rowsNMF  number of bins per frame in Xc
 *   nFrames  number of frames
 *   Xc       complex spectrogram, rowsNMF values per frame
 *   fftSize  FFT length, used with the globals overlap and freqHz to get
 *            the time stamp of each frame
 *
 * Returns 60 / (mean interval) truncated to int, or 0 when fewer than two
 * peaks are found (previously 0/0 was converted to int, which is
 * undefined) or when nFrames <= 0.
 *
 * Changes with respect to the previous version: the work buffer is now
 * released (it leaked on every call), the time vector is no longer
 * allocated (each time stamp is computed when needed), and the SIMPLE
 * build no longer references an undeclared array.
 */
int BMP2(const int rowsNMF, const int nFrames,const MyFFTcompl *Xc, const int fftSize){
    double *Xc2=NULL;
    double lastPeak=0, stamp, cont=0, sum=0;
    MyType dtmp;
    int    i, j, pos1=0;

    if (nFrames<=0) return 0;

    CHECKNULL(      Xc2=(double    *)fftw_malloc(sizeof(double)   * nFrames));

    /* Per-frame energy: sum over the bins of |Xc^2|. */
    for(i=0;i<nFrames;i++)
    {
        dtmp=0.0;
        for(j=0; j<rowsNMF; j++)
            dtmp+=cabs(Xc[pos1+j]*Xc[pos1+j]);

        Xc2[i]=dtmp;
        pos1+=rowsNMF;
    }

    for(i=1;i<nFrames-1;i++) {
        /* Strict local maximum above the fixed level 2.5. */
        if (Xc2[i - 1] < Xc2[i] && Xc2[i + 1] < Xc2[i] && Xc2[i] > 2.5) {
            stamp = (i+1)*fftSize*overlap/freqHz;

            /* Every peak after the first adds one interval. */
            if (lastPeak != 0) {
                cont = cont + 1;
                sum = sum + (stamp - lastPeak);
            }
            lastPeak = stamp;
        }
    }

    fftw_free(Xc2);

    if (cont == 0 || sum == 0) return 0;

    return 60.0 / (sum / cont);
}

/*
 * Heart-rate estimate, option 3: autocorrelation of the frame-energy
 * envelope resampled to the audio rate.
 *
 *   rowsNMF   number of bins per frame in Xc
 *   nFrames   number of frames in Xc
 *   nSamples  number of samples the envelope is interpolated to
 *   Xc        complex spectrogram, rowsNMF values per frame
 *   Audio     not used
 *   fs        sampling rate used to convert a lag to seconds
 *
 * The envelope (sum of |Xc| per frame) is linearly interpolated to
 * nSamples points and autocorrelated; the local maxima of the kept half
 * are sorted by height and the highest one whose position lies strictly
 * between the lags for 190 and 40 beats per minute gives the period.
 *
 * Returns round(60 / period), or 0 when no maximum lies in that range.
 * Previously that case used an unfilled entry (position 0) and returned
 * 60/0.
 *
 * NOTE(review): interpl() takes double vectors while x, y, xx and yy are
 * MyType here; this only matches when MyType is double.
 */
MyType BMP3(const int rowsNMF, const int nFrames, const int nSamples, MyFFTcompl *Xc, MyType *Audio, MyType fs)
{
    int i, j, tmp, BPM_min=40, BPM_max=190, T0_min_samp, T0_max_samp, cuantos;

    MyType *x=NULL, *y=NULL, *xx=NULL, *yy=NULL, *Cx=NULL, dtmp;

    ValueAndPos *Peaks=NULL;

    bool Encontrado=false;

    /* Longest and shortest admissible period, in samples. */
    T0_max_samp=floor((60.0/BPM_min)*fs);
    T0_min_samp=floor((60.0/BPM_max)*fs);


    CHECKNULL(x  =(MyType *)calloc(nFrames,      sizeof(MyType)));
    CHECKNULL(y  =(MyType *)calloc(nFrames,      sizeof(MyType)));
    CHECKNULL(xx =(MyType *)calloc(nSamples,     sizeof(MyType)));
    CHECKNULL(yy =(MyType *)calloc(nSamples,     sizeof(MyType)));
    CHECKNULL(Cx =(MyType *)calloc(2*nSamples-1, sizeof(MyType)));

    CHECKNULL(Peaks=(ValueAndPos *)calloc(nSamples, sizeof(ValueAndPos)));

    /* Normalised abscissas for the frame grid and the sample grid. */
    linspace(0, 1, nFrames,  x);
    linspace(0, 1, nSamples, xx);

    /* Envelope: sum of the bin magnitudes of each frame. */
#pragma omp parallel for private(dtmp, j)
    for(i=0; i<nFrames; i++)
    {
        dtmp=0.0;
        for(j=0; j<rowsNMF; j++)
            dtmp += cabs(Xc[i*rowsNMF+j]);
        y[i]=dtmp;
    }


    interpl(nFrames, nSamples, x, y, xx, yy);

    xcorr(2*nSamples-1, Cx, nSamples, yy);

    /* Keep the upper part of the autocorrelation, moved to the front. */
    tmp=floor((2*nSamples-1)/2);
    for(i=0; i<=nSamples; i++)
        Cx[i]=Cx[i+tmp-1];

    /* Local maxima of the kept part. */
    cuantos=0;
    for(i=1; i<nSamples; i++)
        if ((Cx[i]>Cx[i-1]) && (Cx[i]>=Cx[i+1])) { Peaks[cuantos].data=Cx[i]; Peaks[cuantos].pos=i; cuantos++; }

    qsort(Peaks, cuantos, sizeof(ValueAndPos), comparadesc);

    /* Highest maximum whose lag is inside the admissible range. */
    i=0;
    while ((!Encontrado) && (i<cuantos))
    {
        if ((Peaks[i].pos > T0_min_samp) && (Peaks[i].pos < T0_max_samp))
            Encontrado=true;
        else
            i++;
    }

    if (Encontrado)
    {
#ifdef SIMPLE
        dtmp=roundf(60.0/(Peaks[i].pos/fs));
#else
        dtmp=round (60.0/(Peaks[i].pos/fs));
#endif
    }
    else
    {
        /* Nothing usable: do not read Peaks[cuantos]. */
        dtmp=0.0;
    }

    free(x); free(y); free(xx); free(yy); free(Cx); free(Peaks);

    return dtmp;
}