/*
 * enc_dtx.c  (from the wengophone source package)
 */

/*
 *===================================================================
 *  3GPP AMR Wideband Floating-point Speech Codec
 *===================================================================
 */
#include <stdlib.h>
#include <memory.h>
#include <math.h>
#include "typedef.h"
#include "enc_lpc.h"
#include "enc_util.h"


/* Last valid index of the circular DTX history (the read pointer wraps to it) */
#define DTX_HIST_SIZE_MIN_ONE       7
#define DTX_HANG_CONST              7     /* yields eight frames of SP HANGOVER  */
/* E_DTX_tx_handler compares elapsed + remaining hangover frames with this */
#define DTX_ELAPSED_FRAMES_THRESH   (24 + 7 -1)
/* Ratio max/min distance sum above which an ISF vector is replaced */
#define MED_THRESH                  2.25
/* Log-energy deviation sum above which dithering is requested */
#define GAIN_THR                    1.406
#define ORDER                       16    /* order of linear prediction filter   */
#define RANDOM_INITSEED             21845 /* own random init value               */
/* Mode value written to *usedMode by E_DTX_tx_handler */
#define MRDTX                       10

/* Codebook sizes passed to E_LPC_isf_sub_vq for the five noise ISF sub-vectors */
#define SIZE_BK_NOISE1  64
#define SIZE_BK_NOISE2  64
#define SIZE_BK_NOISE3  64
#define SIZE_BK_NOISE4  32
#define SIZE_BK_NOISE5  32

#define FRAME_LEN 256   /* Length (samples) of the input frame */
#define SCALE     128   /* (UNITY * UNITY) / 512               */
#define TONE_THR 0.65f  /* Threshold for tone detection        */

/* constants for speech level estimation */
#define SP_EST_COUNT       80
#define SP_ACTIVITY_COUNT  25
#define ALPHA_SP_UP     (1.0f - 0.85f)
#define ALPHA_SP_DOWN   (1.0f - 0.85f)

#define NOM_LEVEL          2050.0F              /* about -26 dBov                */
#define SPEECH_LEVEL_INIT  NOM_LEVEL
#define MIN_SPEECH_LEVEL1  (NOM_LEVEL * 0.063F) /* NOM_LEVEL -24 dB              */
#define MIN_SPEECH_LEVEL2  (NOM_LEVEL * 0.2F)   /* NOM_LEVEL -14 dB              */
#define MIN_SPEECH_SNR     0.125F               /* 0 dB, lowest SNR estimation   */

/* Constants for background spectrum update (smoothing factors) */
#define ALPHA_UP1   (1.0f - 0.95f)  /* Normal update, upwards:   */
#define ALPHA_DOWN1 (1.0f - 0.936f) /* Normal update, downwards  */
#define ALPHA_UP2   (1.0f - 0.985f) /* Forced update, upwards    */
#define ALPHA_DOWN2 (1.0f - 0.943f) /* Forced update, downwards  */
#define ALPHA3      (1.0f - 0.95f)  /* Update downwards          */
#define ALPHA4      (1.0f - 0.9f)   /* For stationary estimation */
#define ALPHA5      (1.0f - 0.5f)   /* For stationary estimation */

/*
 * Constants for VAD threshold.
 * The threshold is interpolated linearly between (NO_P1, THR_HIGH) and
 * (NO_P2, THR_LOW) on the ilog2 noise level, and adjusted by a speech-level
 * term interpolated between (SP_P1, SP_CH_MIN) and (SP_P2, SP_CH_MAX).
 */
#define THR_MIN   (1.6F * SCALE) /* Minimum threshold                            */
#define THR_HIGH  (6.0F * SCALE) /* Highest threshold                            */
#define THR_LOW   (1.7F * SCALE) /* Lowest threshold                             */
#define NO_P1     31744.0F       /* ilog2(1), Noise level for highest threshold  */
#define NO_P2     19786.0F       /* ilog2(0.1), Noise level for lowest threshold */
#define NO_SLOPE  ((Float32)(THR_LOW - THR_HIGH) / (Float32)(NO_P2 - NO_P1))
#define SP_CH_MIN (-0.75F * SCALE)
#define SP_CH_MAX (0.75F * SCALE)
#define SP_P1     22527.0F       /* ilog2(NOM_LEVEL / 4)                         */
#define SP_P2     17832.0F       /* ilog2(NOM_LEVEL * 4)                         */
#define SP_SLOPE  ((Float32)(SP_CH_MAX - SP_CH_MIN) / (Float32)(SP_P2 - SP_P1))

/* Constants for hangover length */
#define HANG_HIGH 12          /* longest hangover                 */
#define HANG_LOW  2           /* shortest hangover                */
#define HANG_P1   THR_LOW     /* threshold for longest hangover   */
#define HANG_P2   (4 * SCALE) /* threshold for shortest hangover  */
#define HANG_SLOPE ((Float32)(HANG_LOW - HANG_HIGH) / (Float32)(HANG_P2 - HANG_P1))

/* Constants for burst length */
#define BURST_HIGH   8        /* longest burst length          */
#define BURST_LOW    3        /* shortest burst length         */
#define BURST_P1     THR_HIGH /* threshold for longest burst   */
#define BURST_P2     THR_LOW  /* threshold for shortest burst  */
#define BURST_SLOPE  ((Float32)(BURST_LOW - BURST_HIGH) / (Float32)(BURST_P2 - BURST_P1))

/* Parameters for background spectrum recovery function */
#define STAT_COUNT      20    /* threshold of stationary detection counter         */
#define STAT_THR_LEVEL  184   /* Threshold level for stationarity detection        */
#define STAT_THR        1000  /* Threshold for stationarity detection              */

/* Limits for background noise estimate */
#define NOISE_MIN    40    /* minimum */
#define NOISE_MAX    20000 /* maximum */
#define NOISE_INIT   150   /* initial */

/* Thresholds for signal power (now calculated on 2 frames) */
#define VAD_POW_LOW        30000.0f   /* If input power is lower than this, VAD is set to 0  */
#define POW_PITCH_TONE_THR 686080.0f  /* If input power is lower, pitch detection is ignored */

/* Constants for the filter bank */
#define COEFF3   0.407806f /* coefficient for the 3rd order filter     */
#define COEFF5_1 0.670013f /* 1st coefficient for the 5th order filter */
#define COEFF5_2 0.195007f /* 2nd coefficient for the 5th order filter */

/* Constant tables defined in another translation unit */
extern const Float32 E_ROM_en_adjust[];       /* log-energy offset, indexed by codec mode    */
extern const Float32 E_ROM_mean_isf_noise[];  /* mean removed from the ISFs before the VQ    */
extern const Float32 E_ROM_dico1_isf_noise[]; /* codebook for ISF sub-vector 1 (2 values)    */
extern const Float32 E_ROM_dico2_isf_noise[]; /* codebook for ISF sub-vector 2 (3 values)    */
extern const Float32 E_ROM_dico3_isf_noise[]; /* codebook for ISF sub-vector 3 (3 values)    */
extern const Float32 E_ROM_dico4_isf_noise[]; /* codebook for ISF sub-vector 4 (4 values)    */
extern const Float32 E_ROM_dico5_isf_noise[]; /* codebook for ISF sub-vector 5 (4 values)    */
extern const Float32 E_ROM_isf[];             /* ISF vector used to initialise the history   */


/*
 * E_DTX_isf_history_aver
 *
 * Parameters:
 *    isf_old         I/O: ISF history, DTX_HIST_SIZE vectors of M values
 *                         stored back to back; modified temporarily and
 *                         restored before returning
 *    indices           I: indices[0], indices[1]: history slots to be
 *                         replaced during the summation (-1 = keep);
 *                         indices[2]: slot of the replacement vector
 *    isf_aver          O: per-coefficient SUM over the history
 *
 * Function:
 *    Accumulate the ISF history coefficient by coefficient, with up to
 *    two vectors substituted by the vector at indices[2]. The result is
 *    not divided by the history length; the caller does that.
 *
 * Returns:
 *    void
 */
static void E_DTX_isf_history_aver(Float32 isf_old[], Word16 indices[],
                                   Float32 isf_aver[])
{
   Float32 saved[2 * M];   /* backup of the (at most two) replaced vectors */
   Float32 acc;
   Float32 *victim;
   Float32 *backup;
   Word32 n, coef, frame;

   /* Back up each vector to be replaced, then overwrite it in place */
   for (n = 0; n < 2; n++)
   {
      if (indices[n] == -1)
      {
         continue;
      }

      victim = &isf_old[indices[n] * M];
      backup = &saved[n * M];

      for (coef = 0; coef < M; coef++)
      {
         backup[coef] = victim[coef];
         victim[coef] = isf_old[indices[2] * M + coef];
      }
   }

   /* Column-wise sum over all history vectors */
   for (coef = 0; coef < M; coef++)
   {
      acc = 0.0F;

      for (frame = 0; frame < DTX_HIST_SIZE; frame++)
      {
         acc += isf_old[frame * M + coef];
      }

      isf_aver[coef] = acc;
   }

   /* Put the original vectors back into the history */
   for (n = 0; n < 2; n++)
   {
      if (indices[n] == -1)
      {
         continue;
      }

      victim = &isf_old[indices[n] * M];
      backup = &saved[n * M];

      for (coef = 0; coef < M; coef++)
      {
         victim[coef] = backup[coef];
      }
   }
}

/*
 * E_DTX_dithering_control
 *
 * Parameters:
 *    st                I: state struct (mem_distance_sum and mem_log_en
 *                         are read, nothing is written)
 *
 * Function:
 *    Decide whether dithering is requested, based on how much the
 *    spectrum (sum of the ISF distance sums) and the energy (sum of
 *    absolute deviations of the log energies from their mean) vary.
 *
 * Returns:
 *    1 if either measure exceeds its threshold, 0 otherwise
 */
static Word16 E_DTX_dithering_control(E_DTX_State * st)
{
   Float32 spectral_spread = 0.0F;
   Float32 mean_log_en = 0.0f;
   Float32 energy_spread = 0.0f;
   Word32 n;
   Word16 dither;

   /* spectral stationarity: total of the first eight distance sums */
   for (n = 0; n < 8; n++)
   {
      spectral_spread += st->mem_distance_sum[n];
   }

   dither = (Word16)((spectral_spread > 5147609.0f) ? 1 : 0);

   /* energy stationarity: mean of the log-energy history ... */
   for (n = 0; n < DTX_HIST_SIZE; n++)
   {
      mean_log_en += st->mem_log_en[n] / (Float32)DTX_HIST_SIZE;
   }

   /* ... and the summed absolute deviation from that mean */
   for (n = 0; n < DTX_HIST_SIZE; n++)
   {
      energy_spread += (Float32)fabs(st->mem_log_en[n] - mean_log_en);
   }

   if (energy_spread > GAIN_THR)
   {
      dither = 1;
   }

   return dither;
}

/*
 * E_DTX_buffer
 *
 * Parameters:
 *    st           I/O: state struct (history pointer, ISF and log-energy
 *                      histories are updated)
 *    isf_new        I: isf vector (M values)
 *    enr            I: residual energy (for L_FRAME)
 *    codec_mode     I: speech coder mode, used as index into
 *                      E_ROM_en_adjust (not range-checked here)
 *
 * Function:
 *    Advance the circular history by one slot and store the new ISF
 *    vector and the mode-adjusted base-2 log of the per-sample energy.
 *
 * Returns:
 *    void
 */
void E_DTX_buffer(E_DTX_State *st, Float32 isf_new[], Float32 enr,
                  Word16 codec_mode)
{
   Float32 frame_log_en;

   /* step to the next slot of the circular buffer, wrapping at the end */
   st->mem_hist_ptr++;
   if (st->mem_hist_ptr == DTX_HIST_SIZE)
   {
      st->mem_hist_ptr = 0;
   }

   /* store the ISF vector in the slot just selected */
   memcpy(&st->mem_isf[st->mem_hist_ptr * M], isf_new, M * sizeof(Float32));

   /* small bias keeps the logarithm finite for a zero-energy frame */
   enr += 1e-10F;

   /* log2 of the mean energy per sample, computed via log10 */
   frame_log_en = (Float32)(log10(enr / ((Float64)L_FRAME)) / log10(2.0F));

   /* apply the mode-dependent offset (original note: subtract ~ 3 dB) */
   st->mem_log_en[st->mem_hist_ptr] = frame_log_en + E_ROM_en_adjust[codec_mode];
}

/*
 * E_DTX_frame_indices_find
 *
 * Parameters:
 *    st           I/O: state struct; mem_distance and mem_distance_sum are
 *                      updated, mem_isf and mem_hist_ptr are read
 *    indices        O: three circular-buffer slot numbers:
 *                      indices[0]: frame with the largest distance sum,
 *                                  or -1 if no replacement is needed
 *                      indices[1]: frame with the second largest distance
 *                                  sum, or -1 if no replacement is needed
 *                      indices[2]: frame with the smallest distance sum
 *
 * Function:
 *    Update the pairwise squared ISF distances for the newest history
 *    frame and find the frames with the maximum, second maximum and
 *    minimum distance sums.
 *
 * Returns:
 *    void
 */
static void E_DTX_frame_indices_find(E_DTX_State * st, Word16 indices[])
{
   Float32 L_tmp, tmp, summin, summax, summax2nd;
   Word32 i, j, k;
   Word16 ptr;

   /*
    * Remove the effect of the oldest frame from the column
    * sums mem_distance_sum[0..DTX_HIST_SIZE_MIN_ONE-1]. The last
    * column sum is not updated since it will be removed later.
    *
    * mem_distance is a triangular matrix stored row after row with row
    * lengths 7, 6, ..., 1; j visits 6, 12, 17, 21, 24, 26, 27, i.e. the
    * last entry of each row.
    */

   k = DTX_HIST_SIZE_MIN_ONE;
   j = -1;

   for (i = 0; i < DTX_HIST_SIZE_MIN_ONE; i++)
   {
      j = j + k;
      st->mem_distance_sum[i] = st->mem_distance_sum[i] - st->mem_distance[j];
      k--;
   }

   /*
    * Shift the column sums by one position. The last element,
    * corresponding to the oldest frame, is dropped. The sum of
    * the distances between the latest isf and the other isfs,
    * i.e. element 0, will be computed during this call.
    * Hence this element is initialized to zero.
    */

   for (i = DTX_HIST_SIZE_MIN_ONE; i > 0; i--)
   {
      st->mem_distance_sum[i] = st->mem_distance_sum[i - 1];
   }
   st->mem_distance_sum[0] = 0.0F;

   /*
    * Remove the oldest frame from the distance matrix.
    * Note that the distance matrix is replaced by a one-
    * dimensional array to save static memory.
    *
    * Working from the end of the array backwards, each row receives the
    * leading entries of the row before it. Entries 0..6 (the first row)
    * are not written here; they are recomputed below.
    */

   k = 0;

   for (i = 27; i >= 12; i = i - k)
   {
      k++;
      for (j = k; j > 0; j--)
      {
         st->mem_distance[i - j + 1] = st->mem_distance[i - j - k];
      }
   }

   /*
    * Compute the first row of the distance matrix: squared Euclidean
    * distances from the newest ISF vector (slot mem_hist_ptr) to each
    * of the other history vectors, walking backwards in time.
    */

   ptr = st->mem_hist_ptr;

   for (i = 1; i < DTX_HIST_SIZE; i++)
   {
      /* Compute the distance between the latest isf and the other isfs. */
      ptr--;

      /* wrap the read position around the circular history */
      if (ptr < 0)
      {
         ptr = DTX_HIST_SIZE_MIN_ONE;
      }
      L_tmp = 0;

      for (j = 0; j < M; j++)
      {
         tmp = st->mem_isf[st->mem_hist_ptr * M + j] - st->mem_isf[ptr * M + j];
         L_tmp += tmp * tmp;
      }

      st->mem_distance[i - 1] = L_tmp;

      /* Update also the column sums (of the newest frame and of frame i). */
      st->mem_distance_sum[0] += st->mem_distance[i - 1];
      st->mem_distance_sum[i] += st->mem_distance[i - 1];
   }

   /*
    * Find the minimum and maximum distance sums. At this point the
    * indices are ages: 0 is the newest frame, i is i frames older.
    */
   summax = st->mem_distance_sum[0];
   summin = st->mem_distance_sum[0];
   indices[0] = 0;
   indices[2] = 0;

   for (i = 1; i < DTX_HIST_SIZE; i++)
   {
      if (st->mem_distance_sum[i] > summax)
      {
         indices[0] = (Word16)i;
         summax = st->mem_distance_sum[i];
      }

      if (st->mem_distance_sum[i] < summin)
      {
         indices[2] = (Word16)i;
         summin = st->mem_distance_sum[i];
      }
   }

   /* Find the second largest distance sum (skipping the frame found as maximum) */
   summax2nd = -100000000.0;
   indices[1] = -1;
   for (i = 0; i < DTX_HIST_SIZE; i++)
   {
      if ((st->mem_distance_sum[i] > summax2nd) && (i != indices[0]))
      {
         indices[1] = (Word16)i;
         summax2nd = st->mem_distance_sum[i];
      }
   }

   /* Convert the ages into slot numbers of the circular history buffer */
   for (i = 0; i < 3; i++)
   {
      indices[i] = (Word16)(st->mem_hist_ptr - indices[i]);
      if (indices[i] < 0)
      {
         indices[i] += DTX_HIST_SIZE;
      }
   }

   /*
    * If maximum distance / MED_THRESH is smaller than or equal to the
    * minimum distance then the median ISF vector replacement is not
    * performed
    */
   L_tmp = (Float32)(summax / MED_THRESH);

   if (L_tmp <= summin)
   {
      indices[0] = -1;
   }

   /*
    * If second largest distance / MED_THRESH is smaller than or equal
    * to the minimum distance then the median ISF vector replacement is
    * not performed
    */
   L_tmp = (Float32)(summax2nd / MED_THRESH);

   if (L_tmp <= summin)
   {
      indices[1] = -1;
   }

   return;
}

/*
 * E_DTX_isf_q
 *
 * Parameters:
 *    isf          I/O: ISF vector (ORDER values); on return it holds the
 *                      mean-removed values as left by the sub-quantizers
 *    indice         O: *indice receives the five quantisation indices
 *                      (the pointer itself is not advanced here)
 *
 * Function:
 *    The mean-removed ISF vector is quantized with a split VQ using
 *    five sub-vectors of 2, 3, 3, 4 and 4 coefficients.
 *
 * Returns:
 *    void
 */
static void E_DTX_isf_q(Float32 *isf, Word16 **indice)
{
   Float32 dist;              /* distance reported by the VQ; not used */
   Word16 *out = *indice;
   Word32 n;

   /* remove the noise ISF mean */
   for (n = 0; n < ORDER; n++)
   {
      isf[n] -= E_ROM_mean_isf_noise[n];
   }

   /* quantize the five sub-vectors in ascending coefficient order */
   out[0] = E_LPC_isf_sub_vq(isf + 0, E_ROM_dico1_isf_noise, 2,
      SIZE_BK_NOISE1, &dist);
   out[1] = E_LPC_isf_sub_vq(isf + 2, E_ROM_dico2_isf_noise, 3,
      SIZE_BK_NOISE2, &dist);
   out[2] = E_LPC_isf_sub_vq(isf + 5, E_ROM_dico3_isf_noise, 3,
      SIZE_BK_NOISE3, &dist);
   out[3] = E_LPC_isf_sub_vq(isf + 8, E_ROM_dico4_isf_noise, 4,
      SIZE_BK_NOISE4, &dist);
   out[4] = E_LPC_isf_sub_vq(isf + 12, E_ROM_dico5_isf_noise, 4,
      SIZE_BK_NOISE5, &dist);
}

/*
 * E_DTX_exe
 *
 * Parameters:
 *    st           I/O: state struct
 *    exc2           O: CN excitation (L_FRAME samples)
 *    pt_prms      I/O: *pt_prms receives 7 parameters (5 ISF indices,
 *                      log-energy index, dithering flag) and is advanced
 *                      past them
 *
 * Function:
 *    Comfort noise parameters are encoded for the SID frame and a
 *    scaled random excitation is generated.
 *
 * Returns:
 *    void
 */
void E_DTX_exe(E_DTX_State *st, Float32 *exc2, Word16 **pt_prms)
{
   Float32 isf_mean[M];
   Float32 avg_log_en, target_level, scale, noise_energy;
   Word32 n;
   Word16 slots[3];
   Word16 dither_flag;

   /* VOX mode computation of SID parameters */

   avg_log_en = 0.0F;
   memset(isf_mean, 0, M * sizeof(Float32));

   /* mean of the log-energy history */
   for (n = 0; n < DTX_HIST_SIZE; n++)
   {
      avg_log_en += st->mem_log_en[n] / (Float32)DTX_HIST_SIZE;
   }

   /* mean of the ISF history, with outlier frames substituted */
   E_DTX_frame_indices_find(st, slots);
   E_DTX_isf_history_aver(st->mem_isf, slots, isf_mean);

   for (n = 0; n < M; n++)
   {
      isf_mean[n] = isf_mean[n] / (Float32)DTX_HIST_SIZE;   /* divide by 8 */
   }

   /* quantize the logarithmic energy and clamp the index to 0..63 (6 bits) */
   st->mem_log_en_index = (Word16)((avg_log_en + 2.0F) * 2.625F);

   if (st->mem_log_en_index > 63)
   {
      st->mem_log_en_index = 63;
   }
   else if (st->mem_log_en_index < 0)
   {
      st->mem_log_en_index = 0;
   }

   /* emit: 5 ISF indices, energy index, dithering flag */
   E_DTX_isf_q(isf_mean, pt_prms);
   *pt_prms += 5;

   *(*pt_prms)++ = st->mem_log_en_index;

   dither_flag = E_DTX_dithering_control(st);
   *(*pt_prms)++ = dither_flag;

   /* level corresponding to the quantized energy index */
   avg_log_en = (Float32)((Float32)st->mem_log_en_index / 2.625 - 2.0);
   target_level = (Float32)(pow( 2.0, avg_log_en ));

   /* generate the white noise vector and measure its energy */
   noise_energy = 0.01F;

   for (n = 0; n < L_FRAME; n++)
   {
      exc2[n] = (Float32)E_UTIL_random(&(st->mem_cng_seed));
      noise_energy += exc2[n] * exc2[n];
   }

   /* scale the noise to the target level */
   scale = (Float32)sqrt(target_level * L_FRAME / noise_energy);

   for (n = 0; n < L_FRAME; n++)
   {
      exc2[n] *= scale;
   }
}

/*
 * E_DTX_reset
 *
 * Parameters:
 *    st             O: state struct
 *
 * Function:
 *    Puts the DTX state into its initial condition: history pointer and
 *    energy index zeroed, ISF history filled with E_ROM_isf, energy and
 *    distance memories cleared, counters and random seed preset.
 *
 * Returns:
 *    -1 if st is NULL, zero for ok
 */
Word32 E_DTX_reset(E_DTX_State *st)
{
   Word32 slot;

   if (st == NULL)
   {
      return -1;
   }

   /* scalar state */
   st->mem_hist_ptr = 0;
   st->mem_log_en_index = 0;
   st->mem_cng_seed = RANDOM_INITSEED;
   st->mem_dtx_hangover_count = DTX_HANG_CONST;
   st->mem_dec_ana_elapsed_count = DTX_ELAPSED_FRAMES_THRESH;

   /* every ISF history slot starts from the same ROM vector */
   for (slot = 0; slot < DTX_HIST_SIZE; slot++)
   {
      memcpy(&st->mem_isf[slot * M], E_ROM_isf, M * sizeof(Float32));
   }

   /* energy history and distance memories */
   memset(st->mem_log_en, 0, DTX_HIST_SIZE * sizeof(Float32));
   memset(st->mem_distance, 0, 28 * sizeof(Float32));

   /*
    * NOTE(review): only DTX_HIST_SIZE - 1 entries are cleared, while
    * other code in this file indexes mem_distance_sum[DTX_HIST_SIZE - 1].
    * Confirm the declared array length before changing this.
    */
   memset(st->mem_distance_sum, 0, (DTX_HIST_SIZE - 1) * sizeof(Float32));

   return 0;
}

/*
 * E_DTX_init
 *
 * Parameters:
 *    st             O: receives the newly allocated state, or NULL when
 *                      the allocation fails
 *
 * Function:
 *    Allocates state memory and initializes it with E_DTX_reset.
 *    The state is released with E_DTX_exit.
 *
 * Returns:
 *    -1 if st is NULL or the allocation fails, zero for ok
 */
Word32 E_DTX_init (E_DTX_State **st)
{
   E_DTX_State *state;

   if (st == NULL)
   {
      return -1;
   }

   /* the caller sees NULL unless everything below succeeds */
   *st = NULL;

   state = malloc(sizeof *state);
   if (state == NULL)
   {
      return -1;
   }

   E_DTX_reset(state);
   *st = state;

   return 0;
}

/*
 * E_DTX_exit
 *
 * Parameters:
 *    st           I/O: address of the state pointer; *st is set to NULL
 *                      after the state has been freed
 *
 * Function:
 *    The memory used for state memory is freed. A NULL st or a NULL *st
 *    is accepted and ignored.
 *
 * Returns:
 *    void
 */
void E_DTX_exit (E_DTX_State **st)
{
   if (st != NULL && *st != NULL)
   {
      /* deallocate memory and clear the caller's pointer */
      free(*st);
      *st = NULL;
   }
}


/*
 * E_DTX_tx_handler
 *
 * Parameters:
 *    st           I/O: State struct
 *    vad_flag       I: vad decision
 *    usedMode     I/O: mode changed or not
 *
 * Function:
 *    Adds extra speech hangover to analyze speech on the decoding side.
 *
 * Returns:
 *    void
 */
void E_DTX_tx_handler(E_DTX_State *st, Word32 vad_flag, Word16 *usedMode)
{

   /* this state machine is in synch with the GSMEFR txDtx machine */
   st->mem_dec_ana_elapsed_count++;

   if (vad_flag != 0)
   {
      st->mem_dtx_hangover_count = DTX_HANG_CONST;
   }
   else
   {  /* non-speech */
      if (st->mem_dtx_hangover_count == 0)
      {  /* out of decoder analysis hangover  */
         st->mem_dec_ana_elapsed_count = 0;
         *usedMode = MRDTX;
      }
      else
      { /* in possible analysis hangover */
         st->mem_dtx_hangover_count--;

         /* decAnaElapsedCount + dtxHangoverCount < E_DTX_ELAPSED_FRAMES_THRESH */
         if ((st->mem_dec_ana_elapsed_count + st->mem_dtx_hangover_count)
            < DTX_ELAPSED_FRAMES_THRESH)
         {
            *usedMode = MRDTX;
            /* if Word16 time since decoder update, do not add extra HO */
         }
         /*
         else
         override VAD and stay in
         speech mode *usedMode
         and add extra hangover
         */
      }
   }

   return;
}

/*
 * E_DTX_filter5
 *
 * Parameters:
 *    in0         I/O: input value / output low-pass part
 *    in1         I/O: input value / output high-pass part
 *    data        I/O: filter memory (two values), updated
 *
 * Function:
 *    Fifth-order half-band lowpass/highpass filter pair with decimation.
 *    Each input goes through its own first-order section; the outputs
 *    are half the sum and half the difference of the two sections.
 *
 * Returns:
 *    void
 */
static void E_DTX_filter5(Float32 *in0, Float32 *in1,  Float32 data[])
{
   Float32 state, branch0, branch1;

   /* section on in0, coefficient COEFF5_1 */
   state = *in0 - COEFF5_1 * data[0];
   branch0 = data[0] + COEFF5_1 * state;

   /* memory values within +-1e-10 are stored as zero */
   if ((state > 1e-10) || (state < -1e-10))
   {
      data[0] = state;
   }
   else
   {
      data[0] = 0;
   }

   /* section on in1, coefficient COEFF5_2 */
   state = *in1 - COEFF5_2 * data[1];
   branch1 = data[1] + COEFF5_2 * state;

   if ((state > 1e-10) || (state < -1e-10))
   {
      data[1] = state;
   }
   else
   {
      data[1] = 0;
   }

   *in0 = (branch0 + branch1) * 0.5F;
   *in1 = (branch0 - branch1) * 0.5F;
}

/*
 * E_DTX_filter3
 *
 * Parameters:
 *    in0         I/O: input value / output low-pass part
 *    in1         I/O: input value / output high-pass part
 *    data        I/O: filter memory (one value), updated
 *
 * Function:
 *    Third-order half-band lowpass/highpass filter pair with decimation.
 *    Only in1 goes through a first-order section; the outputs are half
 *    the sum and half the difference of in0 and that section's output.
 *
 * Returns:
 *    void
 */
static void E_DTX_filter3(Float32 *in0, Float32 *in1, Float32 *data)
{
   Float32 state, filtered;
   Float32 direct = *in0;

   state = *in1 - COEFF3 * *data;
   filtered = *data + COEFF3 * state;

   /* memory values within +-1e-10 are stored as zero */
   if ((state > 1e-10) || (state < -1e-10))
   {
      *data = state;
   }
   else
   {
      *data = 0;
   }

   *in1 = (direct - filtered) * 0.5F;
   *in0 = (direct + filtered) * 0.5F;
}

/*
 * E_DTX_level_calculation
 *
 * Parameters:
 *    data          I: signal buffer
 *    sub_level   I/O: in:  contribution stored by the previous call
 *                     out: scale * 2 * sum of |data| over the samples
 *                          count1 .. count2-1 of this call
 *    count1        I: first sample index of the stored tail part
 *    count2        I: one past the last sample index used
 *    ind_m         I: step size for the index of the data buffer
 *    ind_a         I: starting index of the data buffer
 *    scale         I: scaling for the level calculation
 *
 * Function:
 *    Calculate signal level in a sub-band by summing absolute values of
 *    data[ind_m * i + ind_a]. The result is
 *       scale * 2 * sum(i = 0 .. count2-1) + previous *sub_level,
 *    and the part coming from i = count1 .. count2-1 is stored in
 *    *sub_level for the next call.
 *
 *    Per the original description, that tail part is the coder's
 *    lookahead, and the counter values account for the group delay and
 *    decimation of the filter bank.
 *
 * Returns:
 *    signal level
 */
static Float32 E_DTX_level_calculation(Float32 data[], Float32 *sub_level,
                                       Word16 count1, Word16 count2,
                                       Word16 ind_m, Word16 ind_a,
                                       Float32 scale)
{
  Float64 tail_sum = 0.0;
  Float64 frame_sum;
  Word32 n;

  /* tail part: samples count1 .. count2-1 */
  for (n = count1; n < count2; n++)
  {
     tail_sum += fabs(data[ind_m * n + ind_a]);
  }
  tail_sum *= 2.0;

  /* start from the tail plus what the previous call stored (unscaled) */
  frame_sum = tail_sum + *sub_level / scale;

  /* remember this call's tail for the next one */
  *sub_level = (Float32)(tail_sum * scale);

  /* head part: samples 0 .. count1-1 */
  for (n = 0; n < count1; n++)
  {
     frame_sum += 2.0f * fabs(data[ind_m * n + ind_a]);
  }

  return (Float32)(frame_sum * scale);
}

/*
 * E_DTX_filter_bank
 *
 * Parameters:
 *    st          I/O: State struct (filter memories and sub-band level
 *                     memories are updated)
 *    in            I: input frame (FRAME_LEN samples)
 *    level         O: signal levels at each of the 12 bands
 *
 * Function:
 *    Divide input signal into bands and calculate level of
 *    the signal in each band
 *
 * Returns:
 *    void
 */
static void E_DTX_filter_bank(E_DTX_Vad_State *st, Float32 in[],
                              Float32 level[])
{
   /*
    * Where each band sits in the filtered buffer. For band b the samples
    * are buf[(1 << shift) * i + start], i < (FRAME_LEN >> shift); the
    * last "tail" of them are carried over to the next frame.
    */
   static const struct
   {
      Word16 shift;
      Word16 tail;
      Word16 start;
      Float32 scale;
   } band[12] =
   {
      { 5,  6,  0, 2.0F  },   /*    0 -  200 Hz */
      { 5,  6, 16, 2.0F  },   /*  200 -  400 Hz */
      { 5,  6, 24, 2.0F  },   /*  400 -  600 Hz */
      { 5,  6,  8, 2.0F  },   /*  600 -  800 Hz */
      { 4, 12, 12, 1.0F  },   /*  800 - 1200 Hz */
      { 4, 12,  4, 1.0F  },   /* 1200 - 1600 Hz */
      { 4, 12,  6, 1.0F  },   /* 1600 - 2000 Hz */
      { 4, 12, 14, 1.0F  },   /* 2000 - 2400 Hz */
      { 3, 24,  2, 0.5F  },   /* 2400 - 3200 Hz */
      { 3, 24,  3, 0.5F  },   /* 3200 - 4000 Hz */
      { 3, 24,  7, 0.5F  },   /* 4000 - 4800 Hz */
      { 2, 48,  1, 0.25F }    /* 4800 - 6400 Hz */
   };

   Float32 buf[FRAME_LEN];
   Word32 n, b;
   Word16 total;

   /* halve the input for safe scaling */
   for (n = 0; n < FRAME_LEN; n++)
   {
      buf[n] = in[n] * 0.5F;
   }

   /* run the filter bank: every stage splits in place at a doubled stride */
   for (n = 0; n < FRAME_LEN; n += 2)
   {
      E_DTX_filter5(&buf[n], &buf[n + 1], st->mem_a_data5[0]);
   }

   for (n = 0; n < FRAME_LEN; n += 4)
   {
      E_DTX_filter5(&buf[n], &buf[n + 2], st->mem_a_data5[1]);
      E_DTX_filter5(&buf[n + 1], &buf[n + 3], st->mem_a_data5[2]);
   }

   for (n = 0; n < FRAME_LEN; n += 8)
   {
      E_DTX_filter5(&buf[n], &buf[n + 4], st->mem_a_data5[3]);
      E_DTX_filter5(&buf[n + 2], &buf[n + 6], st->mem_a_data5[4]);
      E_DTX_filter3(&buf[n + 3], &buf[n + 7], &st->mem_a_data3[0]);
   }

   for (n = 0; n < FRAME_LEN; n += 16)
   {
      E_DTX_filter3(&buf[n], &buf[n + 8], &st->mem_a_data3[1]);
      E_DTX_filter3(&buf[n + 4], &buf[n + 12], &st->mem_a_data3[2]);
      E_DTX_filter3(&buf[n + 6], &buf[n + 14], &st->mem_a_data3[3]);
   }

   for (n = 0; n < FRAME_LEN; n += 32)
   {
      E_DTX_filter3(&buf[n + 0], &buf[n + 16], &st->mem_a_data3[4]);
      E_DTX_filter3(&buf[n + 8], &buf[n + 24], &st->mem_a_data3[5]);
   }

   /* calculate levels in each frequency band, highest band first */
   for (b = 11; b >= 0; b--)
   {
      total = (Word16)(FRAME_LEN >> band[b].shift);

      level[b] = E_DTX_level_calculation(buf, &st->mem_sub_level[b],
         (Word16)(total - band[b].tail), total,
         (Word16)(1 << band[b].shift), band[b].start, band[b].scale);
   }
}

/*
 * E_DTX_update_cntrl
 *
 * Parameters:
 *    st          I/O: State struct (mem_stat_count and mem_ave_level are
 *                     updated)
 *    level         I: sub-band levels of the input frame
 *
 * Function:
 *    Control update of the background noise estimate: maintain the
 *    stationarity counter and the smoothed sub-band levels used by it.
 *
 * Returns:
 *    void
 */
static void E_DTX_update_cntrl(E_DTX_Vad_State *st, Float32 level[])
{
   Float32 ratio_sum;
   Float32 larger, smaller;
   Float32 step;
   Word32 b;

   if ((st->mem_pitch_tone & 0x7c00) == 0x7c00)
   {
      /* fullband pitch or tone detected for a while: restart the counter */
      st->mem_stat_count = STAT_COUNT;
   }
   else if ((st->mem_vadreg & 0x7f80) == 0)
   {
      /* the 8 last vad-decisions have been "0": restart the counter */
      st->mem_stat_count = STAT_COUNT;
   }
   else
   {
      /* sum over the bands of max/min of (current, averaged) level, times 64 */
      ratio_sum = 0;

      for (b = 0; b < COMPLEN; b++)
      {
         larger  = (level[b] > st->mem_ave_level[b]) ? level[b] : st->mem_ave_level[b];
         smaller = (level[b] > st->mem_ave_level[b]) ? st->mem_ave_level[b] : level[b];

         /* limit minimum value of both terms to STAT_THR_LEVEL */
         if (larger < STAT_THR_LEVEL)
         {
            larger = STAT_THR_LEVEL;
         }

         if (smaller < STAT_THR_LEVEL)
         {
            smaller = STAT_THR_LEVEL;
         }

         ratio_sum += larger / smaller * 64;
      }

      /* compare with the threshold and update stat_count */
      if (ratio_sum > STAT_THR)
      {
         st->mem_stat_count = STAT_COUNT;
      }
      else if (((st->mem_vadreg & 0x4000) != 0) && (st->mem_stat_count != 0))
      {
         st->mem_stat_count--;
      }
   }

   /* Update average amplitude estimate for stationarity estimation */
   if (st->mem_stat_count == STAT_COUNT)
   {
      step = 1.0;
   }
   else if ((st->mem_vadreg & 0x4000) == 0)
   {
      step = ALPHA5;
   }
   else
   {
      step = ALPHA4;
   }

   for (b = 0; b < COMPLEN; b++)
   {
      st->mem_ave_level[b] += step * (level[b] - st->mem_ave_level[b]);
   }
}

/*
 * E_DTX_hangover_addition
 *
 * Parameters:
 *    st              I/O: State struct (burst and hangover counters)
 *    low_power         I: flag power of the input frame
 *    hang_len          I: hangover length
 *    burst_len         I: minimum burst length for hangover addition
 *
 * Function:
 *    Add hangover after speech bursts.
 *
 * Returns:
 *    VAD_flag indicating final VAD decision (1 = speech, 0 = no speech)
 */
static Word16 E_DTX_hangover_addition(E_DTX_Vad_State *st, Word16 low_power,
                                      Word16 hang_len, Word16 burst_len)
{
   Word16 vad_flag = 0;

   if (low_power != 0)
   {
      /* "fast exit": input power too low, clear both counters */
      st->mem_burst_count = 0;
      st->mem_hang_count = 0;
   }
   else if ((st->mem_vadreg & 0x4000) != 0)
   {
      /* newest intermediate decision is speech: extend the burst */
      st->mem_burst_count++;

      /* a long enough burst (re)arms the hangover */
      if (st->mem_burst_count >= burst_len)
      {
         st->mem_hang_count = hang_len;
      }

      vad_flag = 1;
   }
   else
   {
      /* no speech: the burst ends, use up hangover frames if any are left */
      st->mem_burst_count = 0;

      if (st->mem_hang_count > 0)
      {
         st->mem_hang_count--;
         vad_flag = 1;
      }
   }

   return vad_flag;
}

/*
 * E_DTX_noise_estimate_update
 *
 * Parameters:
 *    st           I/O: State struct (mem_bckr_est and mem_level are
 *                      updated; E_DTX_update_cntrl updates further fields)
 *    level          I: sub-band levels of the input frame
 *
 * Function:
 *    Update of background noise estimate. The estimate is moved towards
 *    the levels stored in mem_level by the previous call; mem_level is
 *    then overwritten with the levels of this frame.
 *
 * Returns:
 *    void
 */
static void E_DTX_noise_estimate_update(E_DTX_Vad_State *st, Float32 level[])
{
   Float32 rate_up, rate_down, up_bias, diff;
   Word32 b;

   /* Control update of bckr_est[] */
   E_DTX_update_cntrl(st, level);

   /* Choose update speed */
   if ((0x7800 & st->mem_vadreg) == 0)
   {
      /* none of the decision bits under mask 0x7800 is set: normal update */
      rate_up = ALPHA_UP1;
      rate_down = ALPHA_DOWN1;
      up_bias = 2.0;
   }
   else if (st->mem_stat_count == 0)
   {
      /* stationarity counter has run down: forced update */
      rate_up = ALPHA_UP2;
      rate_down = ALPHA_DOWN2;
      up_bias = 2.0;
   }
   else
   {
      /* otherwise only downward updates have any effect */
      rate_up = 0.0;
      rate_down = ALPHA3;
      up_bias = 0.0;
   }

   /* Update noise estimate (bckr_est) */
   for (b = 0; b < COMPLEN; b++)
   {
      diff = st->mem_level[b] - st->mem_bckr_est[b];

      if (diff < 0.0)
      {
         /* update downwards, never below NOISE_MIN */
         st->mem_bckr_est[b] += -2 + (rate_down * diff);

         if (st->mem_bckr_est[b] < NOISE_MIN)
         {
            st->mem_bckr_est[b] = NOISE_MIN;
         }
      }
      else
      {
         /* update upwards, never above NOISE_MAX */
         st->mem_bckr_est[b] += up_bias + (rate_up * diff);

         if (st->mem_bckr_est[b] > NOISE_MAX)
         {
            st->mem_bckr_est[b] = NOISE_MAX;
         }
      }
   }

   /* Keep this frame's levels for the next call */
   memcpy(st->mem_level, level, COMPLEN * sizeof(Float32));
}

/*
 * E_DTX_decision
 *
 * Parameters:
 *    st           I/O: State struct
 *    level          I: sub-band levels of the input frame
 *    pow_sum        I: power of the input frame
 *
 * Function:
 *    Calculates VAD_flag.
 *
 *    The sum of squared level/noise ratios over all bands is compared with
 *    an adaptive threshold. The threshold is a linear function of the
 *    noise level (in the ilog2 domain) plus a speech-level dependent term
 *    clamped to [SP_CH_MIN, SP_CH_MAX], and is never below THR_MIN. The
 *    intermediate decision is shifted into mem_vadreg, the background noise
 *    estimate is updated, and the final flag is produced by
 *    E_DTX_hangover_addition().
 *
 * Returns:
 *    VAD_flag (value returned by E_DTX_hangover_addition)
 */
static Word16 E_DTX_decision(E_DTX_Vad_State *st, Float32 level[COMPLEN], Float64 pow_sum)
{
   Float64 snr_acc;
   Float32 threshold, work, bckr_mean;
   Float32 speech_ilog2, noise_ilog2;
   Float32 speech_term;
   Word32 band;
   Word16 is_low_power;
   Word16 hangover_frames, burst_frames;

   /*
    * Squared sum of the input levels (level) divided by the
    * background noise components (bckr_est).
    */
   snr_acc = 0.0;

   for (band = 0; band < COMPLEN; band++)
   {
      work = level[band] / st->mem_bckr_est[band];
      snr_acc += work * work;
   }

   /* Average level of estimated background noise, lowest band ignored */
   work = 0.0;

   for (band = 1; band < COMPLEN; band++)
   {
      work += st->mem_bckr_est[band];
   }

   bckr_mean = (Float32)(work * 0.0625);

   /*
    * Lowest speech level allowed for the current noise level. If the
    * estimated speech level is at or below it (SNR lower than
    * MIN_SPEECH_SNR), raise speech_level to this floor.
    */
   work = bckr_mean * MIN_SPEECH_SNR * 8;

   if (st->mem_speech_level <= work)
   {
      st->mem_speech_level = work;

      /*
       * avoid log10 error: keep (speech_level - work) away from zero.
       * NOTE(review): in single precision the 1E-8F subtraction may be
       * lost to rounding for larger values of work - confirm.
       */
      work -= 1E-8F;
   }

   /* ilog2(x) = -1024 * log2(x / 2^31) */
   noise_ilog2 = (Float32)(-1024.0F * log10(bckr_mean / 2147483648.0F) / log10(2.0F));

   /*
    * If SNR is very poor, speech_level is probably corrupted by noise level.
    * This is corrected by subtracting MIN_SPEECH_SNR * noise_level (held in
    * work) from the speech level.
    */
   speech_ilog2 = (Float32)(-1024.0F * log10((st->mem_speech_level - work) / 2147483648.0F) / log10(2.0F));

   /* noise dependent part of the threshold */
   work = NO_SLOPE * (noise_ilog2- NO_P1) + THR_HIGH;

   /* speech-level dependent part, limited to [SP_CH_MIN, SP_CH_MAX] */
   speech_term = SP_CH_MIN + SP_SLOPE * (speech_ilog2 - SP_P1);

   if (speech_term < SP_CH_MIN)
   {
      speech_term = SP_CH_MIN;
   }
   else if (speech_term > SP_CH_MAX)
   {
      speech_term = SP_CH_MAX;
   }

   threshold = work + speech_term;

   if (threshold < THR_MIN)
   {
      threshold = THR_MIN;
   }

   /* Shift VAD decision register; the newest decision lives in bit 0x4000 */
   st->mem_vadreg = (Word16)(st->mem_vadreg >> 1);

   /* Make intermediate (primary) VAD decision */
   if (snr_acc > (threshold * (Float32)COMPLEN / 128.0F))
   {
      st->mem_vadreg = (Word16)(st->mem_vadreg | 0x4000);
   }

   /* check if the input power (pow_sum) is lower than a threshold */
   is_low_power = (Word16)((pow_sum < VAD_POW_LOW) ? 1 : 0);

   /* Update speech subband background noise estimates */
   E_DTX_noise_estimate_update(st, level);

   /* hangover length derived from the threshold, not below HANG_LOW */
   hangover_frames = (Word16)((HANG_SLOPE * (threshold - HANG_P1) - 0.5) + HANG_HIGH);

   if (hangover_frames < HANG_LOW)
   {
      hangover_frames = HANG_LOW;
   }

   /* burst length derived from the threshold */
   burst_frames = (Word16)((BURST_SLOPE * (threshold - BURST_P1) - 0.5) + BURST_HIGH);

   return(E_DTX_hangover_addition(st, is_low_power, hangover_frames, burst_frames));
}

/*
 * E_DTX_speech_estimate
 *
 * Parameters:
 *    st           I/O: State struct
 *    in_level       I: level of the input frame
 *
 * Function:
 *    Estimate speech level
 *
 *    The maximum signal level is tracked in mem_sp_max. At least
 *    SP_ACTIVITY_COUNT speech-like frames must occur within SP_EST_COUNT
 *    frames before the estimate is updated; if that can no longer be
 *    reached the counters are reset. Thus, noisy frames having occasional
 *    VAD = "1" decisions will not affect the estimated speech_level.
 *
 * Returns:
 *    void
 */
static void E_DTX_speech_estimate(E_DTX_Vad_State *st, Float32 in_level)
{
   Float32 rate, avg_level;

   /* if the required activity count cannot be achieved, reset counters */
   if (SP_ACTIVITY_COUNT  > (SP_EST_COUNT - st->mem_sp_est_cnt + st->mem_sp_max_cnt))
   {
      st->mem_sp_max_cnt = 0;
      st->mem_sp_max = 0.0;
      st->mem_sp_est_cnt = 0;
   }

   st->mem_sp_est_cnt++;

   /* frames at or below the minimum speech level are never counted */
   if (!(in_level > MIN_SPEECH_LEVEL1))
   {
      return;
   }

   /*
    * a frame counts as speech when the newest intermediate VAD flag is set
    * or its level exceeds the current speech level estimate
    */
   if (!(st->mem_vadreg & 0x4000) && !(in_level > st->mem_speech_level))
   {
      return;
   }

   /* track the maximum level and the number of speech-like frames */
   if (in_level > st->mem_sp_max)
   {
      st->mem_sp_max = in_level;
   }

   st->mem_sp_max_cnt++;

   if (st->mem_sp_max_cnt < SP_ACTIVITY_COUNT)
   {
      return;
   }

   /* enough activity collected: scale maximum to get "average" speech level */
   avg_level = st->mem_sp_max / 2.0F;

   rate = (avg_level > st->mem_speech_level) ? ALPHA_SP_UP : ALPHA_SP_DOWN;

   /* only levels above MIN_SPEECH_LEVEL2 may move the estimate */
   if (avg_level > MIN_SPEECH_LEVEL2)
   {
      st->mem_speech_level += rate * (avg_level - st->mem_speech_level);
   }

   /* start a new estimation period */
   st->mem_sp_est_cnt = 0;
   st->mem_sp_max_cnt = 0;
   st->mem_sp_max = 0.0;
}

/*
 * E_DTX_vad_reset
 *
 * Parameters:
 *    state        I/O: State struct
 *
 * Function:
 *    Initialises state memory: flag registers and counters are cleared,
 *    the filter bank memories are zeroed, the per-band noise and level
 *    estimates are set to NOISE_INIT and the speech level estimate is set
 *    to SPEECH_LEVEL_INIT.
 *
 * Returns:
 *    non-zero with error (state is NULL), zero for ok
 */
Word32 E_DTX_vad_reset (E_DTX_Vad_State *state)
{
   Word32 i;

   if (state == (E_DTX_Vad_State *) NULL)
   {
      return -1;
   }

   /* Initialize flag registers and hangover/burst counters */
   state->mem_pitch_tone = 0;
   state->mem_vadreg = 0;
   state->mem_hang_count = 0;
   state->mem_burst_count = 0;

   /* initialize memory used by the filter bank */
   memset(state->mem_a_data5, 0, F_5TH_CNT * 2 * sizeof(Float32));
   memset(state->mem_a_data3, 0, F_3TH_CNT * sizeof(Float32));

   /* initialize the per-band estimates */
   for (i = 0; i < COMPLEN; i++)
   {
      state->mem_bckr_est[i] = NOISE_INIT;
      state->mem_level[i] = NOISE_INIT;
      state->mem_ave_level[i] = NOISE_INIT;
      state->mem_sub_level[i] = 0;
   }

   /* initialize speech level estimation and the remaining state */
   state->mem_sp_est_cnt = 0;
   state->mem_sp_max = 0;
   state->mem_sp_max_cnt = 0;
   state->mem_speech_level = SPEECH_LEVEL_INIT;
   state->mem_pow_sum = 0;
   state->mem_stat_count = 0;

   return 0;
}

/*
 * E_DTX_vad_init
 *
 * Parameters:
 *    state          O: receives the newly allocated state struct
 *                      (set to NULL when allocation fails)
 *
 * Function:
 *    Allocates state memory and initializes it with E_DTX_vad_reset().
 *    The memory is released with E_DTX_vad_exit().
 *
 * Returns:
 *    non-zero with error (NULL argument or allocation failure), zero for ok
 */
Word32 E_DTX_vad_init (E_DTX_Vad_State **state)
{
   E_DTX_Vad_State *fresh;

   if (state == NULL)
   {
      return -1;
   }

   /* make sure the caller never sees a stale pointer on failure */
   *state = NULL;

   /* allocate memory */
   fresh = malloc(sizeof *fresh);

   if (fresh == NULL)
   {
      return -1;
   }

   /* fresh is non-NULL here, so the reset cannot report an error */
   E_DTX_vad_reset(fresh);

   *state = fresh;

   return 0;
}

/*
 * E_DTX_vad_exit
 *
 * Parameters:
 *    state        I/O: address of the state struct pointer
 *
 * Function:
 *    The memory used for state memory is freed and the caller's pointer
 *    is set to NULL. Nothing is done when state or *state is NULL.
 *
 * Returns:
 *    void
 */
void E_DTX_vad_exit (E_DTX_Vad_State **state)
{
   if (state != NULL && *state != NULL)
   {
      /* deallocate memory and clear the caller's pointer */
      free(*state);
      *state = NULL;
   }
}

/*
 * E_DTX_pitch_tone_detection
 *
 * Parameters:
 *    st           I/O: State struct
 *    p_gain         I: pitch gain
 *
 * Function:
 *    Set tone flag if pitch gain is high. This is used to detect
 *    signaling tones and other signals with high pitch gain.
 *    mem_pitch_tone is shifted right by one on every call, and bit 0x4000
 *    is set when p_gain exceeds TONE_THR.
 *
 * Returns:
 *    void
 */
void E_DTX_pitch_tone_detection (E_DTX_Vad_State *st, Float32 p_gain)
{
   /* age the existing tone/pitch flags by one call */
   Word16 flags = (Word16)(st->mem_pitch_tone >> 1);

   /* if (pitch_gain > TONE_THR) set tone flag */
   if (p_gain > TONE_THR)
   {
      flags = (Word16)(flags | 0x4000);
   }

   st->mem_pitch_tone = flags;
}

/*
 * E_DTX_vad
 *
 * Parameters:
 *    st           I/O: State struct
 *    in_buf         I: samples of the input frame (FRAME_LEN samples are read)
 *
 * Function:
 *    Main program for Voice Activity Detection (VAD).
 *    Computes the frame power, runs the filter bank, makes the VAD
 *    decision and updates the speech level estimate.
 *
 * Returns:
 *    VAD Decision, 1 = speech, 0 = noise
 */
Word16 E_DTX_vad(E_DTX_Vad_State *st, Float32 in_buf[])
{
   Float64 frame_pow, two_frame_pow, band_total;
   Float32 level[COMPLEN];
   Word32 k;
   Word16 decision;

   /* Calculate power of the input frame (sum of squares, doubled). */
   frame_pow = 0.0;

   for (k = 0; k < FRAME_LEN; k++)
   {
      frame_pow += in_buf[k] * in_buf[k];
   }

   frame_pow *= 2.0;

   /* power of current frame and previous frame */
   two_frame_pow = frame_pow + st->mem_pow_sum;

   /* save power of current frame for next call */
   st->mem_pow_sum = frame_pow;

   /* If input power is very low, clear tone flag (bits above 0x1fff) */
   if (two_frame_pow < POW_PITCH_TONE_THR)
   {
      st->mem_pitch_tone = (Word16)(st->mem_pitch_tone & 0x1fff);
   }

   /* Run the filter bank and calculate signal levels at each band */
   E_DTX_filter_bank(st, in_buf, level);

   /* compute VAD decision */
   decision = E_DTX_decision(st, level, two_frame_pow);

   /* Calculate input level from all bands except the lowest one */
   band_total = 0.0;

   for (k = 1; k < COMPLEN; k++)
   {
      band_total += level[k];
   }

   /* Estimate speech level */
   E_DTX_speech_estimate(st, (Float32)(band_total / 16.0F));

   return(decision);
}

/* Generated by Doxygen 1.6.0 - Back to index */