//-----------------------------------------------------------------------------
// Voice synthesis based on f0, spectrogram and aperiodicity.
// forward_real_fft, inverse_real_fft and minimum_phase are used to speed up.
//-----------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "world.h"

namespace stand
{
namespace math
{
namespace dsp
{

//-----------------------------------------------------------------------------
// GetGlottalPulse() mixes the periodic and aperiodic responses of one frame
// into a single glottal pulse and normalizes it by the FFT size.
// Input:
//   f0                   : f0 at a frame (0 means an unvoiced frame)
//   fft_size             : FFT size (number of samples processed)
//   periodic_response    : Periodic response (not read when f0 is 0)
//   aperiodic_response   : Aperiodic response
//   noise_size           : Length of noise used for synthesizing the
//                          aperiodic response
// Output:
//   y                    : Calculated glottal pulse
//-----------------------------------------------------------------------------
void GetGlottalPulse(double f0, int fft_size, double *periodic_response, 
  double *aperiodic_response, int noise_size, 
  double *y)
{
  const int is_voiced = (f0 != 0);
  const double normalizer = (double)fft_size;
  // The gain is only meaningful for voiced frames; for unvoiced frames the
  // periodic response is never touched.
  const double periodic_gain = is_voiced ? sqrt((double)noise_size) : 0.0;

  for(int n = 0;n < fft_size;n++)
  {
    double sample = aperiodic_response[n];
    if(is_voiced) sample = periodic_response[n]*periodic_gain + sample;
    y[n] = sample/normalizer;
  }
}

//-----------------------------------------------------------------------------
// GetOneFrameSegment() calculates a glottal pulse at a time.
// The frame nearest to current_time is selected. A noise-excited (aperiodic)
// response and, for voiced frames, a periodic response are synthesized through
// minimum phase spectra, and both are mixed by GetGlottalPulse().
// Input:
//   f0                   : f0 contour
//   spectrogram          : Spectrogram estimated by STAR
//   fft_size             : FFT size
//   aperiodicity         : Aperiodicity spectrogram based on TANDEM_AP
//   number_of_bands      : Number of frequency bands used for TANDEM_AP
//   target_f0            : Only a parameter in TANDEM_AP
//   frame_period         : Temporal period used for the analysis
//                          (divided by 1000 here, i.e. treated as msec)
//   current_time         : Temporal position to calculate a glottal pulse
//   fs                   : Sampling frequency
//   default_f0           : Parameter used for unvoiced segment
//   forward_real_fft     : Forward FFT work area (used for the noise)
//   inverse_real_fft     : Inverse FFT work area (used for both responses)
//   minimum_phase        : Minimum phase analysis work area
// Output:
//   y                    : Calculated glottal pulse
// Note:
//   The frame index derived from current_time is not range-checked here
//   because the contour length is not available; the caller must make sure it
//   is valid for f0, spectrogram and aperiodicity.
//   If the temporary buffers cannot be allocated, y is left untouched.
//-----------------------------------------------------------------------------
void GetOneFrameSegment(double *f0, double **spectrogram, int fft_size, 
  double **aperiodicity, int number_of_bands, double target_f0, 
  double frame_period, double current_time, int fs, double default_f0,
  ForwardRealFFT *forward_real_fft, InverseRealFFT *inverse_real_fft,
  MinimumPhaseAnalysis *minimum_phase, 
  double *y)
{
  double *aperiodic_ratio = (double *)malloc(sizeof(double)* fft_size);
  double *aperiodic_response = (double *)malloc(sizeof(double)* fft_size);
  double *periodic_response  = (double *)malloc(sizeof(double)* fft_size);
  if(aperiodic_ratio == NULL || aperiodic_response == NULL || 
    periodic_response == NULL)
  {
    // Out of memory: release whatever was obtained and give up on this pulse.
    free(periodic_response);
    free(aperiodic_response);
    free(aperiodic_ratio);
    return;
  }

  // Nearest analysis frame to current_time.
  int current_frame = (int)(current_time/(frame_period/1000.0) + 0.5);  
  // Number of samples between this pulse and the next one; unvoiced frames
  // (f0 == 0) use default_f0 to define the interval.
  int noise_size = (int)((current_time + 1.0/(f0[current_frame] == 
    0.0 ? default_f0 : f0[current_frame]))*(double)fs) - 
    (int)(current_time*(double)fs);
  // The noise is written into a buffer that is zero-padded up to fft_size
  // below, so never fill more than fft_size samples even when one pulse
  // interval is longer than the FFT. noise_size itself is kept as is because
  // it still defines the gain of the periodic response.
  int noise_fill = noise_size < fft_size ? noise_size : fft_size;

  // Calculation of the aperiodicity ratio.
  CalculateAperiodicity(aperiodicity[current_frame], number_of_bands, 
    fft_size, f0[current_frame], fs, target_f0, aperiodic_ratio);

  // Synthesis of the aperiodic response: spectrum of zero-padded noise.
  for(int i = 0;i < noise_fill;i++) forward_real_fft->waveform[i] = randn();
  for(int i = noise_fill;i < fft_size;i++) 
    forward_real_fft->waveform[i] = 0.0;
  fft_execute(forward_real_fft->forward_fft);

  // The small constant keeps the argument of log() positive when the ratio
  // is exactly 1.
  for(int i = 0;i <= minimum_phase->fft_size/2;i++)
    minimum_phase->log_spectrum[i] = 
    log(spectrogram[current_frame][i] * 
    ((1-aperiodic_ratio[i])+0.000000000000001))/2.0;
  GetMinimumPhaseSpectrum(minimum_phase);

  // Complex multiplication of the minimum phase spectrum and the noise
  // spectrum.
  for(int i = 0;i <= fft_size/2;i++)
  {
    inverse_real_fft->spectrum[i][0] = 
      minimum_phase->minimum_phase_spectrum[i][0] * 
      forward_real_fft->spectrum[i][0] - 
      minimum_phase->minimum_phase_spectrum[i][1] * 
      forward_real_fft->spectrum[i][1];
    inverse_real_fft->spectrum[i][1] = 
      minimum_phase->minimum_phase_spectrum[i][0] * 
      forward_real_fft->spectrum[i][1] + 
      minimum_phase->minimum_phase_spectrum[i][1] * 
      forward_real_fft->spectrum[i][0];
  }
  fft_execute(inverse_real_fft->inverse_fft);
  for(int i = 0;i < fft_size;i++) 
    aperiodic_response[i] = inverse_real_fft->waveform[i];

  // Synthesis of the periodic response.
  // If f0 is zero, we cannot synthesize it. periodic_response then stays
  // uninitialized, which is fine because GetGlottalPulse() does not read it
  // for f0 == 0.
  if( f0[current_frame] != 0)
  {
    for(int i = 0;i <= minimum_phase->fft_size/2;i++)
      minimum_phase->log_spectrum[i] = 
      log(spectrogram[current_frame][i] * aperiodic_ratio[i])/2.0;
    GetMinimumPhaseSpectrum(minimum_phase);

    for(int i = 0;i <= fft_size/2;i++)
    {
      inverse_real_fft->spectrum[i][0] = 
        minimum_phase->minimum_phase_spectrum[i][0];
      inverse_real_fft->spectrum[i][1] = 
        minimum_phase->minimum_phase_spectrum[i][1];
    }
    fft_execute(inverse_real_fft->inverse_fft);
    for(int i = 0;i < fft_size;i++)
      periodic_response[i] = inverse_real_fft->waveform[i];
  }

  GetGlottalPulse(f0[current_frame], fft_size, periodic_response, 
    aperiodic_response, noise_size, y);

  free(periodic_response);
  free(aperiodic_response);
  free(aperiodic_ratio);
}



//-----------------------------------------------------------------------------
// synthesis_ap() synthesizes the voice based on f0, spectrogram and 
// aperiodicity (not excitation signal).
// One glottal pulse is generated per pulse interval by GetOneFrameSegment()
// and its first fft_size/2 samples are added into y at the pulse position.
// Input:
//   f0                   : f0 contour
//   f0_length            : Length of f0
//   spectrogram          : Spectrogram estimated by STAR
//   fft_size             : FFT size
//   aperiodicity         : Aperiodicity spectrogram based on TANDEM_AP
//   number_of_bands      : Number of frequency bands used for TANDEM_AP
//   target_f0            : Only a parameter in TANDEM_AP
//   frame_period         : Temporal period used for the analysis
//   fs                   : Sampling frequency
//   y_length             : Length of the output signal (Memory of y has been
//                          allocated in advance)
// Output:
//   y                    : Synthesized signal (cleared to zero first)
// Note:
//   If f0_length or fft_size is not positive, or the work buffer cannot be
//   allocated, y is returned filled with zeros.
//-----------------------------------------------------------------------------
void synthesis_ap(double *f0, int f0_length, double **spectrogram, 
  int fft_size, double **aperiodicity, int number_of_bands, double target_f0, 
  double frame_period, int fs, int y_length, 
  double *y)
{
  for(int i = 0;i < y_length;i++) y[i] = 0.0;

  // The loop below always processes frame 0 before testing f0_length, so an
  // empty contour (or an unusable FFT size) must be rejected up front.
  if(f0_length <= 0 || fft_size <= 0) return;

  double *impulse_response = (double *)malloc(sizeof(double) * fft_size);
  if(impulse_response == NULL) return;

  MinimumPhaseAnalysis minimum_phase = {0};
  InitializeMinimumPhaseAnalysis(fft_size, &minimum_phase);
  InverseRealFFT inverse_real_fft = {0};
  InitializeInverseRealFFT(fft_size, &inverse_real_fft);
  ForwardRealFFT forward_real_fft = {0};
  InitializeForwardRealFFT(fft_size, &forward_real_fft);

  double current_time = 0.0;
  int current_position = 0;
  int current_frame = 0;
  for(;;)
  {
    for(int j = 0;j < fft_size;j++) impulse_response[j] = 0.0;

    GetOneFrameSegment(f0, spectrogram, fft_size, aperiodicity, 
      number_of_bands, target_f0, frame_period, current_time, fs, 
      DEFAULT_F0, &forward_real_fft, &inverse_real_fft, 
      &minimum_phase, impulse_response);

    current_position = (int)(current_time*(double)fs);

    // Overlap-add the first half of the response, clipped at the end of y.
    for(int j = 0;j < fft_size/2;j++)
    {
      if(j+current_position >= y_length) break;
      y[j+current_position] += impulse_response[j];
    }

    // Advance by one pulse interval; unvoiced frames (f0 == 0) advance by
    // the interval of DEFAULT_F0.
    current_time += 1.0/(f0[current_frame] == 
      0.0 ? DEFAULT_F0 : f0[current_frame]);
    current_frame = (int)(current_time/(frame_period/1000.0) + 0.5);
    current_position = (int)(current_time*(double)fs);
    // Stop when the next pulse would not fit into y or the contour is
    // exhausted.
    if(fft_size/2+current_position >= y_length || 
      current_frame >= f0_length) break;
  }

  DestroyMinimumPhaseAnalysis(&minimum_phase);
  DestroyInverseRealFFT(&inverse_real_fft);
  DestroyForwardRealFFT(&forward_real_fft);

  free(impulse_response);
  return;
}

}
}
}
