#include "FileFunctions.h"
#include "ComputeFunctions.h"

/*
 * Program entry point.
 *
 * Command line: <input audio file> <input bases file> <output audio HC file> <output audio HP file>
 *
 * Processing stages, in order (each one is a call into the project library):
 *   1. Read the input audio (WAV if the file starts with "RIFF", otherwise a raw
 *      file whose first 4 bytes hold the number of samples, little-endian).
 *   2. Read the bases matrix and check its row count against rowsNMF.
 *   3. cHannFFT  : Hanning window + FFT of every frame.
 *   4. uNMF      : unsupervised NMF of the resulting spectrogram.
 *   5. HSD       : heart sound detection.
 *   6. Clustering: every NMF component is accumulated into XcAUX or XpAUX
 *                  depending on three thresholds (similarity, roll-off, correlation).
 *   7. ReconCorazon + invSpectro: rebuild both signals and write them to
 *      argv[3] (from Xc) and argv[4] (from Xp).
 *
 * Returns OK on success, -1 on a wrong number of arguments and ErrReadFile when
 * an input file is truncated or inconsistent. CHECKNULL / CHECKERR are project
 * macros not visible here; presumably they abort the function on failure.
 */
int main(int argc , char *argv[])
{  
   #ifdef TALK2
     double Time, hannfftTime, nmfTime, hsdTime, clusTime, invTime;
     #ifndef OMP
       double dtime;
     #endif 
   #endif

   int  nFrames, fftSize, nSamples, vCLTSize, maxThreads, *Ncor=NULL, nItera, bc, i,
        winSize, rowsNMF, colBases, planType, NoOverlap,  *Npul=NULL, *Patrones=NULL,
        nSlots, wisdomOK;

   size_t nBasesElems;
    
   MyType *yNMF=NULL, *wNMF=NULL, *Times=NULL, *Audio=NULL, *corrX=NULL, *XpAUX=NULL, *vHanning=NULL,
          *hNMF=NULL, *SIM=NULL,  *rSNMF=NULL, *Bases=NULL, *xAUXI=NULL, *XcAUX=NULL, *Ts=NULL, dtmp;

   MyFFTcompl   *xIFFT=NULL,   *cSNMF=NULL, *vCLT=NULL, *Xp=NULL, *Xc=NULL;
   MyFFTCPUType *planFFT=NULL, *planIFFT=NULL;

   bool          esWAV=false;
   FILE          *fpin, *fpout;
   short         *wavAudio=NULL;
   WAVHeader     WavHeader;
   unsigned char RIFF[4];


   /* Parse cmd line */
   if (argc != 5)
   {
      printf("Usage: %s <input audio file> <input bases file> <output audio HC file> <output audio HP file>\n", argv[0]);
      printf("   Example: %s input.dat bases.dat outputHC.dat outputHP.dat\n\n", argv[0]);
      return -1;
   }


   /* Open the input audio file and find out whether it is a WAV file or not. */
   /* RIFF holds exactly 4 bytes and no terminating NUL, so the comparison    */
   /* must be length-bounded.                                                 */
   CHECKNULL(fpin=fopen(argv[1], "rb"));
   if (fread(RIFF, 1, 4, fpin) != 4)                  { fclose(fpin); return ErrReadFile; }
   if (strncasecmp((const char *)RIFF, "RIFF", 4)==0) { esWAV=true; }

   if (esWAV) /* WAV file, compatible with ReMAS, SoundPrism, etc. */
   {
     CHECKERR(ReadWavHeader(&WavHeader, fpin));
     nSamples=WavHeader.num_samples;

     CHECKNULL(wavAudio=(short *)calloc(nSamples, sizeof(short)));
     CHECKNULL(Audio=  (MyType *)calloc(nSamples, sizeof(MyType)));

     CHECKERR(ReadAudioWav(nSamples, wavAudio, Audio, fpin));
   }
   else
   {
     /* Raw file: the 4 bytes already read are the sample count, little-endian. */
     /* Assemble them as unsigned to avoid shifting into the sign bit of an int. */
     nSamples=(int)( (unsigned int)RIFF[0]      | ((unsigned int)RIFF[1]<<8) |
                    ((unsigned int)RIFF[2]<<16) | ((unsigned int)RIFF[3]<<24));
     
     CHECKNULL(Audio=(MyType *)calloc(nSamples, sizeof(MyType)));

     CHECKERR(ReadAudio(nSamples, Audio, fpin));
   }
   CHECKERR(fclose(fpin));

   
   /* winSize   is the equivalent of TAMTRAMA   in ReMAS, SoundPrism, etc.  */
   /* NoOverlap is the equivalent of TAMMUESTRA in ReMAS, SoundPrism, etc.  */ 
   /* "(nSamples - NoOverlap) / (winSize - NoOverlap)" is the equivalent of */
   /* "(nSamples-TTminusTM) / TAMMUESTRA" in ReMAS, SoundPrism, etc.        */
   winSize   = freqHz*seconds;	       // 8000 x 0.064 = 512
   fftSize   = nextPow2(winSize);      // 512
   NoOverlap = round(fftSize*overlap); // 256
   rowsNMF   = NoOverlap+1;	       // 257
   bc        = bcCorazon+bcPulmon;     // 55 (bcCorazon) + 64(bcPulmon) = 119
   /* Integer division already truncates, so no floor() is needed here. */
   nFrames   = (nSamples-NoOverlap)/(winSize-NoOverlap);

   /* Every buffer below is sized from nFrames: refuse audio too short for one frame. */
   if (nFrames < 1)
   {
      printf("Error: input audio too short (%d samples)\n", nSamples);
      free(wavAudio);
      free(Audio);
      return ErrReadFile;
   }


   /* Read the bases file and check that it is compatible:                   */
   /* header = rows (int), columns (int); then rows*columns MyType elements. */
   CHECKNULL(fpin=fopen(argv[2], "rb"));
   if (fread(&i,        sizeof(int), 1, fpin) != 1) { fclose(fpin); return ErrReadFile; }
   if (fread(&colBases, sizeof(int), 1, fpin) != 1) { fclose(fpin); return ErrReadFile; }
   if (i != rowsNMF || colBases < 1) { printf("Error Bases dimensions\n"); fclose(fpin); return ErrReadFile; }

   nBasesElems=(size_t)rowsNMF*(size_t)colBases;
   CHECKNULL(Bases=(MyType *)calloc(nBasesElems, sizeof(MyType)));
   if (fread(Bases, sizeof(MyType), nBasesElems, fpin) != nBasesElems) { fclose(fpin); return ErrReadFile; }   
   CHECKERR(fclose(fpin));
   

   /* Initialise the random number generator with a fixed seed */
   srand(13);


   /* FFTW thread support must be initialised before any other FFTW call */
   #ifdef PARFFTW
     #ifdef SIMPLE
       fftwf_init_threads();
     #else
       fftw_init_threads();
     #endif
   #endif

   /* Import the FFTW plans (wisdom) if the file exists. Wisdom is kept per  */
   /* precision, so the import has to match the precision of the plans that  */
   /* are created below. Without wisdom fall back to FFTW_ESTIMATE.          */
   #ifdef SIMPLE
     wisdomOK=fftwf_import_wisdom_from_filename("Planes.txt");
   #else
     wisdomOK=fftw_import_wisdom_from_filename("Planes.txt");
   #endif
   if (!wisdomOK)
     planType=FFTW_ESTIMATE;
   else
     planType=FFTW_WISDOM_ONLY;

   /* nSlots = number of per-thread FFT work buffers / plans. The OpenMP     */
   /* reconstruction phase always uses slots 0 and 1 (one per section), so   */
   /* at least two slots are needed even when only one thread is available.  */
   #ifdef OMP
     maxThreads=omp_get_max_threads();
     nSlots=(maxThreads < 2) ? 2 : maxThreads;
   #else
     maxThreads=1;
     nSlots=1;
   #endif


   #ifdef TALK2
     printf("(nSamples, nFrames, winSize, fftSize)=(%d, %d, %d, %d)\n", nSamples, nFrames, winSize, fftSize);
   #endif

   /* Prepare hanning vector */
   CHECKNULL(vHanning=(MyType *)calloc(winSize, sizeof(MyType)));  

   /* Prepare yNMF, wNMF and hNMF (for all audio) and other structures */
   CHECKNULL(   rSNMF=(MyType *)calloc(rowsNMF * nFrames, sizeof(MyType)));  
   CHECKNULL(    yNMF=(MyType *)calloc(rowsNMF * nFrames, sizeof(MyType)));  
   CHECKNULL(    wNMF=(MyType *)calloc(rowsNMF * bc,      sizeof(MyType)));
   CHECKNULL(    hNMF=(MyType *)calloc(bc      * nFrames, sizeof(MyType)));
   CHECKNULL(      Ts=(MyType *)calloc(nFrames,           sizeof(MyType)));
   CHECKNULL(Patrones=(int    *)calloc(nFrames,           sizeof(int)));
   
   #ifdef SIMPLE
     CHECKNULL(cSNMF=(MyFFTcompl *)fftwf_malloc(rowsNMF*nFrames*sizeof(MyFFTcompl)));
   #else
     CHECKNULL(cSNMF=(MyFFTcompl *) fftw_malloc(rowsNMF*nFrames*sizeof(MyFFTcompl)));
   #endif   


   CHECKERR(hanning(winSize, vHanning));
   /* Hanning + FFT + Ts = sg in Matlab */
   #ifdef TALK2
     Time=Ctimer();
   #endif
   CHECKERR(cHannFFT(winSize, nFrames, NoOverlap, Audio, vHanning, rowsNMF, rSNMF, cSNMF, fftSize, maxThreads, planType));
   #ifdef TALK2
     hannfftTime=Ctimer()-Time;
   #endif


   /* Unsupervised NMF */
   #ifdef TALK2
     nmfTime=Ctimer();
   #endif
   CHECKERR(uNMF(rowsNMF, nFrames, bc, rSNMF, wNMF, hNMF, yNMF, &nItera));
   #ifdef TALK2
     nmfTime=Ctimer()-nmfTime;
   #endif

   /* Ts[i] = start time, in seconds, of frame i (hop size divided by the sampling rate) */
   dtmp=(MyType)(winSize-NoOverlap)/(MyType)freqHz;
   #ifdef OMP2
     #pragma omp parallel for simd  // overhead > saved time?
   #else
     #pragma GCC ivdep
   #endif
   for(i=0;i<nFrames;i++) { Ts[i]=i*dtmp; }


   /* Heart Sound Detection */
   #ifdef TALK2
     hsdTime=Ctimer();
   #endif
   CHECKERR(HSD(nSamples, winSize, Audio, Ts, &Times, Patrones, maxThreads, planType));
   #ifdef TALK2
     hsdTime=Ctimer()-hsdTime;
   #endif


   /* Clustering: invspecgram + clustering. Its declarations are placed here for clarity.          */
   /* rowsCluster=nFrames*(winSize-NoOverlap)+NoOverlap                                            */ 
   /* The results of invSpectro are the vector vCLT[rowsCluster] and the matrix xAUXI, for every   */
   /* iteration of the main loop below, that is, i=0,...,bc. xAUXI=W(:,i)*H(i,:) could be computed */
   /* with cblas_dgemm before calling invSpectro and passed as an argument, or invSpectro can      */
   /* compute it implicitly as part of its own operations, which avoids cblas_dgemm. xAUXI is used */
   /* at the end of the loop body to build the 2 final matrices. The problem is that this phase is */
   /* slow, and coarse-grain parallelism (parallelising this loop) works well to speed it up. When */
   /* doing so we need:                                                                            */
   /*   a) maxThreads vectors for the IFFT, which is not much space (fftSize=512). The IFFT can be */
   /*      done in-place to save some space. Seems adequate.                                       */
   /*   b) To turn vCLT[rowsCluster] into the matrix vCLT[rowsCluster, maxThreads], which takes    */
   /*      some space. On small systems maxThreads=2/4 (or it can be limited), and on fat systems  */
   /*      it is not a problem because they have memory. Besides, the audios used are at most      */
   /*      (about?) 30 seconds long, so rowsCluster will be at most around 230Kb.                  */
   /*   c) maxThreads xAUXI matrices, each of size [rowsNMF,nFrames], which for our audios of at   */
   /*      most 30 seconds is approximately [257, 930].                                            */
   /*   d) maxThreads vectors for the FFT of the Roll-Off computation. Here the size depends on    */
   /*      the audio length (nFrames) and can be large (65536, 1048576 or 8388608 for 7, 70 or     */
   /*      660 sec. respectively) per vector. Space can be saved by working in-place directly on   */
   /*      matCluster; then matCluster must have the sizes given above, even if not fully used,    */
   /*      and be of complex type. This makes the code a bit less clean but saves space (and       */
   /*      time?). This is the option used.                                                        */
   /*                                                                                              */
   /* Note that nFrames*(winSize-NoOverlap)+NoOverlap+"remainder of the division" is equal to      */
   /* nSamples                                                                                     */
   vCLTSize=nextPow2(nFrames*(winSize-NoOverlap)+NoOverlap);


   /* Work buffers and plan arrays for the IFFT and for the FFT of the Roll-Off computation */
   #ifdef SIMPLE
     CHECKNULL(      Xp=(MyFFTcompl   *)fftwf_malloc(sizeof(MyFFTcompl)   * rowsNMF  * nFrames));
     CHECKNULL(      Xc=(MyFFTcompl   *)fftwf_malloc(sizeof(MyFFTcompl)   * rowsNMF  * nFrames));
     CHECKNULL(   xIFFT=(MyFFTcompl   *)fftwf_malloc(sizeof(MyFFTcompl)   * fftSize  * nSlots));
     CHECKNULL(    vCLT=(MyFFTcompl   *)fftwf_malloc(sizeof(MyFFTcompl)   * vCLTSize * nSlots));
     CHECKNULL(planIFFT=(MyFFTCPUType *)fftwf_malloc(sizeof(MyFFTCPUType) * nSlots));
     CHECKNULL( planFFT=(MyFFTCPUType *)fftwf_malloc(sizeof(MyFFTCPUType) * nSlots));
   #else 
     CHECKNULL(      Xp=(MyFFTcompl    *)fftw_malloc(sizeof(MyFFTcompl)   * rowsNMF  * nFrames));
     CHECKNULL(      Xc=(MyFFTcompl    *)fftw_malloc(sizeof(MyFFTcompl)   * rowsNMF  * nFrames));
     CHECKNULL(   xIFFT=(MyFFTcompl    *)fftw_malloc(sizeof(MyFFTcompl)   * fftSize  * nSlots));
     CHECKNULL(    vCLT=(MyFFTcompl    *)fftw_malloc(sizeof(MyFFTcompl)   * vCLTSize * nSlots));
     CHECKNULL(planIFFT=(MyFFTCPUType  *)fftw_malloc(sizeof(MyFFTCPUType) * nSlots));
     CHECKNULL( planFFT=(MyFFTCPUType  *)fftw_malloc(sizeof(MyFFTCPUType) * nSlots));
   #endif

   /* One in-place IFFT plan and one in-place FFT plan per slot. The planner */
   /* returns NULL on failure (e.g. FFTW_WISDOM_ONLY without wisdom for this */
   /* size), so the result is checked before it is ever executed.            */
   for(i=0;i<nSlots;i++)
   {
     #ifdef SIMPLE
       CHECKNULL(planIFFT[i]=fftwf_plan_dft_1d(fftSize,  &xIFFT[i*fftSize], &xIFFT[i*fftSize], FFTW_BACKWARD, planType));
       CHECKNULL(planFFT[i] =fftwf_plan_dft_1d(vCLTSize, &vCLT[i*vCLTSize], &vCLT[i*vCLTSize], FFTW_FORWARD,  planType));
     #else
       CHECKNULL(planIFFT[i]=fftw_plan_dft_1d (fftSize,  &xIFFT[i*fftSize], &xIFFT[i*fftSize], FFTW_BACKWARD, planType));
       CHECKNULL(planFFT[i] =fftw_plan_dft_1d (vCLTSize, &vCLT[i*vCLTSize], &vCLT[i*vCLTSize], FFTW_FORWARD,  planType));
     #endif
   }
   CHECKNULL(xAUXI=(MyType*)calloc(rowsNMF*nFrames*maxThreads, sizeof(MyType)));
   CHECKNULL(XcAUX=(MyType*)calloc(rowsNMF*nFrames*maxThreads, sizeof(MyType)));
   CHECKNULL(XpAUX=(MyType*)calloc(rowsNMF*nFrames*maxThreads, sizeof(MyType)));
   CHECKNULL(SIM  =(MyType*)calloc(colBases*maxThreads,        sizeof(MyType)));
   CHECKNULL(corrX=(MyType*)calloc(nFrames*maxThreads,         sizeof(MyType)));
   CHECKNULL(Ncor =(int   *)calloc(maxThreads,                 sizeof(int)));
   CHECKNULL(Npul =(int   *)calloc(maxThreads,                 sizeof(int)));


   #ifdef TALK2
     clusTime=Ctimer();
   #endif
   #ifdef OMP
     #pragma omp parallel num_threads(maxThreads)
   #endif
   {
      /* Every thread works on its own slice (index myID) of the shared work buffers */
      int    myID, k;
      MyType freqRollOff, maxSIM, coefCorr;

      #ifdef OMP
        myID=omp_get_thread_num();
      #else
        myID=0;
      #endif

      #ifdef OMP
        #pragma omp for
      #endif
      for(k=0;k<bc;k++)
      {
         /* Invspecgram, everything in a single function */
         memSetValueRealComplex(vCLTSize, 0, &vCLT[myID*vCLTSize]);
         invSpectroExtended(rowsNMF, nFrames, bc, winSize, fftSize, NoOverlap, vHanning, &wNMF[k*rowsNMF], &hNMF[k],
                            &xAUXI[myID*rowsNMF*nFrames], &vCLT[myID*vCLTSize], &xIFFT[myID*fftSize], planIFFT[myID]);


         /* Clustering. Split over several functions */
         /* 1) Roll-Off computation                  */
         #ifdef SIMPLE
           fftwf_execute(planFFT[myID]);
         #else
           fftw_execute (planFFT[myID]);
         #endif
         freqRollOff=RollOff(vCLTSize, &vCLT[myID*vCLTSize], energyThreshold);
         
         /* 2) Correlation using the cosine distance */
         maxSIM=corrCos(rowsNMF, colBases, &wNMF[k*rowsNMF], Bases, &SIM[myID*colBases]);

         /* 3) Correlation between the activation matrix (hNMF) and the pattern */
         coefCorr=corrPatron(bc, nFrames, &hNMF[k], Patrones, &corrX[myID*nFrames]);

         /* cblas is not used here because we are already in parallel, to avoid overheads */
         if((maxSIM > simThreshold) && (freqRollOff < fcThreshold) && (coefCorr > coefcoThreshold))
         {
           suma(rowsNMF*nFrames, &XcAUX[myID*rowsNMF*nFrames], &xAUXI[myID*rowsNMF*nFrames]);
           Ncor[myID]++;
         }
         else
         {
           suma(rowsNMF*nFrames, &XpAUX[myID*rowsNMF*nFrames], &xAUXI[myID*rowsNMF*nFrames]);
           Npul[myID]++;
         }
      }
   }

   /* Gather the per-thread Npul, Ncor, XcAUX and XpAUX into slice 0 */
   for(i=1;i<maxThreads;i++)
   {
     Ncor[0]+=Ncor[i]; Npul[0]+=Npul[i];
     #ifdef SIMPLE
       cblas_saxpy(rowsNMF*nFrames, 1.0, &XpAUX[i*rowsNMF*nFrames], 1, XpAUX, 1);
       cblas_saxpy(rowsNMF*nFrames, 1.0, &XcAUX[i*rowsNMF*nFrames], 1, XcAUX, 1);
     #else
       cblas_daxpy(rowsNMF*nFrames, 1.0, &XpAUX[i*rowsNMF*nFrames], 1, XpAUX, 1);
       cblas_daxpy(rowsNMF*nFrames, 1.0, &XcAUX[i*rowsNMF*nFrames], 1, XcAUX, 1);
     #endif
   }
   #ifdef TALK2
     clusTime=Ctimer()-clusTime;
   #endif


   #ifdef TALK2
     invTime=Ctimer();
   #endif
   /* Relative Wiener masks + reconstructed spectrogram of the heart sounds */
   ReconCorazon(rowsNMF*nFrames, Xc, Xp, cSNMF, XcAUX, XpAUX);
   #ifdef OMP
     /* Both inverse spectrograms run concurrently: Xc uses slot 0 and Xp uses slot 1 */
     #pragma omp parallel sections num_threads(2)
     {
       #pragma omp section
         invSpectro(rowsNMF, nFrames, winSize, fftSize, NoOverlap, vHanning, Xc, &vCLT[0], &xIFFT[0], planIFFT[0]);
       #pragma omp section
         invSpectro(rowsNMF, nFrames, winSize, fftSize, NoOverlap, vHanning, Xp, &vCLT[vCLTSize], &xIFFT[fftSize], planIFFT[1]);
     }
     #ifdef TALK2
       invTime=Ctimer()-invTime;
     #endif
     if (esWAV)
     {
       CHECKNULL(fpout=fopen(argv[3], "wb"));
       CHECKERR(WriteWavHeader(WavHeader.header, RIFF, fpout));
       CHECKERR(WriteAudioWav(nSamples, nFrames*(winSize-NoOverlap)+NoOverlap, wavAudio, &vCLT[0], fpout));
       CHECKERR(fclose(fpout));

       CHECKNULL(fpout=fopen(argv[4], "wb"));
       CHECKERR(WriteWavHeader(WavHeader.header, RIFF, fpout));
       CHECKERR(WriteAudioWav(nSamples, nFrames*(winSize-NoOverlap)+NoOverlap, wavAudio, &vCLT[vCLTSize], fpout));
       CHECKERR(fclose(fpout));
       free(wavAudio);
     }
     else
     {
       /* Raw output: sample count first, then the samples */
       CHECKNULL(fpout=fopen(argv[3], "wb"));
       if (fwrite(&nSamples, sizeof(int), 1, fpout) != 1) { fclose(fpout); return ErrReadFile; }
       CHECKERR(WriteAudio(nSamples, nFrames*(winSize-NoOverlap)+NoOverlap, Audio, &vCLT[0],        fpout));
       CHECKERR(fclose(fpout));

       CHECKNULL(fpout=fopen(argv[4], "wb"));
       if (fwrite(&nSamples, sizeof(int), 1, fpout) != 1) { fclose(fpout); return ErrReadFile; }
       CHECKERR(WriteAudio(nSamples, nFrames*(winSize-NoOverlap)+NoOverlap, Audio, &vCLT[vCLTSize], fpout));
       CHECKERR(fclose(fpout));
     }
   #else
     /* Sequential build: slot 0 is reused, first for Xc (argv[3]) and then for Xp (argv[4]) */
     invSpectro(rowsNMF, nFrames, winSize, fftSize, NoOverlap, vHanning, Xc, vCLT, xIFFT, planIFFT[0]);
     #ifdef TALK2
       invTime=Ctimer()-invTime;
     #endif
     if (esWAV)
     {
       CHECKNULL(fpout=fopen(argv[3], "wb"));
       CHECKERR(WriteWavHeader(WavHeader.header, RIFF, fpout));
       CHECKERR(WriteAudioWav(nSamples, nFrames*(winSize-NoOverlap)+NoOverlap, wavAudio, vCLT, fpout));
       CHECKERR(fclose(fpout));
     }
     else
     {
       CHECKNULL(fpout=fopen(argv[3], "wb"));
       if (fwrite(&nSamples, sizeof(int), 1, fpout) != 1) { fclose(fpout); return ErrReadFile; }
       CHECKERR(WriteAudio(nSamples, nFrames*(winSize-NoOverlap)+NoOverlap, Audio, vCLT, fpout));
       CHECKERR(fclose(fpout));
     }
     #ifdef TALK2
       dtime=Ctimer();
     #endif
     invSpectro(rowsNMF, nFrames, winSize, fftSize, NoOverlap, vHanning, Xp, vCLT, xIFFT, planIFFT[0]); 
     #ifdef TALK2
       invTime+=Ctimer()-dtime;
     #endif
     if (esWAV)
     {
       CHECKNULL(fpout=fopen(argv[4], "wb"));
       CHECKERR(WriteWavHeader(WavHeader.header, RIFF, fpout));
       CHECKERR(WriteAudioWav(nSamples, nFrames*(winSize-NoOverlap)+NoOverlap, wavAudio, vCLT, fpout));
       CHECKERR(fclose(fpout));
       free(wavAudio);
     }
     else
     {
       CHECKNULL(fpout=fopen(argv[4], "wb"));
       if (fwrite(&nSamples, sizeof(int), 1, fpout) != 1) { fclose(fpout); return ErrReadFile; }
       CHECKERR(WriteAudio(nSamples, nFrames*(winSize-NoOverlap)+NoOverlap, Audio, vCLT, fpout));
       CHECKERR(fclose(fpout));
     }
   #endif

   #ifdef TALK2
     Time=Ctimer()-Time;
     /* VSZ  is the number in kiB of the virtual memory size available for the process execution.             */
     /* SIZE is the number in kiB of the virtual memory size available for the process, only code+data+stack. */
     /* RSS  is the number in kiB of the resident size (non-swapped physical memory) used by the process.     */
     /* DRS  is the amount in kiB of physical memory devoted to other (data) than the executable code.        */
     /* TRS  is the amount in kiB of physical memory devoted to executable code.                              */
     #ifdef ARM
       system("/bin/ps -o fname,user,vsz,size,rss,drs,trs | /bin/grep Cardio");
     #else
       system("/usr/bin/ps -o fname,user,vsz,size,rss,drs,trs | /usr/bin/grep Cardio");
     #endif
     printf("Iteraciones NMF %d. Los tiempos en segundos\n", nItera);
     printf("Fase1 %1.5e Fase2 %1.5e Fase3 %1.5e Fase4 %1.5e Fase5 %1.5e Todo %1.5e\n",
            hannfftTime, nmfTime, hsdTime, clusTime, invTime, Time);
   #endif

   printf("BMPs estimados %f\n", BMP(rowsNMF, nFrames, nSamples, Xc, Audio, maxThreads));


   /* Memory free */
   free(Ncor); free(SIM);  free(rSNMF); free(wNMF); free(Times); free(Audio); free(XcAUX); free(Patrones); free(Ts);
   free(Npul); free(yNMF); free(corrX); free(hNMF); free(Bases); free(xAUXI); free(XpAUX); free(vHanning);

   /* Destroy exactly the plans that were created: one pair per slot */
   for(i=0;i<nSlots;i++)
   {
     #ifdef SIMPLE
       fftwf_destroy_plan(planIFFT[i]);
       fftwf_destroy_plan(planFFT[i]);
     #else
       fftw_destroy_plan(planIFFT[i]);
       fftw_destroy_plan(planFFT[i]);
     #endif
   }
   #ifdef SIMPLE
     fftwf_free(vCLT);
     fftwf_free(planIFFT);
     fftwf_free(planFFT);
     fftwf_free(xIFFT);
     fftwf_free(cSNMF);
     fftwf_free(Xp);
     fftwf_free(Xc);
     #ifdef PARFFTW
       fftwf_cleanup_threads();
     #endif
   #else
     fftw_free(vCLT);
     fftw_free(planIFFT);
     fftw_free(planFFT);
     fftw_free(xIFFT);
     fftw_free(cSNMF);
     fftw_free(Xp);
     fftw_free(Xc);
     #ifdef PARFFTW
       fftw_cleanup_threads();
     #endif
   #endif

   return OK;
}
