mirror of
https://github.com/signalwire/freeswitch.git
synced 2025-08-14 01:49:05 +00:00
i've tested, now you can too
This commit is contained in:
@@ -4,8 +4,8 @@
|
||||
AUTHOR......: David Rowe
|
||||
DATE CREATED: 23/3/93
|
||||
|
||||
Non Linear Pitch (NLP) estimation functions.
|
||||
|
||||
Non Linear Pitch (NLP) estimation functions.
|
||||
|
||||
\*---------------------------------------------------------------------------*/
|
||||
|
||||
/*
|
||||
@@ -22,14 +22,13 @@
|
||||
License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
along with this program; if not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "defines.h"
|
||||
#include "nlp.h"
|
||||
#include "dump.h"
|
||||
#include "four1.h"
|
||||
#include "kiss_fft.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
@@ -60,7 +59,7 @@
|
||||
|
||||
/* 48 tap 600Hz low pass FIR filter coefficients */
|
||||
|
||||
float nlp_fir[] = {
|
||||
const float nlp_fir[] = {
|
||||
-1.0818124e-03,
|
||||
-1.1008344e-03,
|
||||
-9.2768838e-04,
|
||||
@@ -112,12 +111,14 @@ float nlp_fir[] = {
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
float sq[PMAX_M]; /* squared speech samples */
|
||||
float mem_x,mem_y; /* memory for notch filter */
|
||||
float mem_fir[NLP_NTAP]; /* decimation FIR filter memory */
|
||||
float sq[PMAX_M]; /* squared speech samples */
|
||||
float mem_x,mem_y; /* memory for notch filter */
|
||||
float mem_fir[NLP_NTAP]; /* decimation FIR filter memory */
|
||||
kiss_fft_cfg fft_cfg; /* kiss FFT config */
|
||||
} NLP;
|
||||
|
||||
float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax);
|
||||
float test_candidate_mbe(COMP Sw[], COMP W[], float f0);
|
||||
float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax, COMP Sw[], COMP W[], float *prev_Wo);
|
||||
float post_process_sub_multiples(COMP Fw[],
|
||||
int pmin, int pmax, float gmax, int gmax_bin,
|
||||
float *prev_Wo);
|
||||
@@ -146,20 +147,27 @@ void *nlp_create()
|
||||
for(i=0; i<NLP_NTAP; i++)
|
||||
nlp->mem_fir[i] = 0.0;
|
||||
|
||||
nlp->fft_cfg = kiss_fft_alloc (PE_FFT_SIZE, 0, NULL, NULL);
|
||||
assert(nlp->fft_cfg != NULL);
|
||||
|
||||
return (void*)nlp;
|
||||
}
|
||||
|
||||
/*---------------------------------------------------------------------------*\
|
||||
|
||||
nlp_destory()
|
||||
nlp_destroy()
|
||||
|
||||
Initialisation function for NLP pitch estimator.
|
||||
Shut down function for NLP pitch estimator.
|
||||
|
||||
\*---------------------------------------------------------------------------*/
|
||||
|
||||
void nlp_destroy(void *nlp_state)
|
||||
{
|
||||
NLP *nlp;
|
||||
assert(nlp_state != NULL);
|
||||
nlp = (NLP*)nlp_state;
|
||||
|
||||
KISS_FFT_FREE(nlp->fft_cfg);
|
||||
free(nlp_state);
|
||||
}
|
||||
|
||||
@@ -198,27 +206,30 @@ float nlp(
|
||||
float Sn[], /* input speech vector */
|
||||
int n, /* frames shift (no. new samples in Sn[]) */
|
||||
int m, /* analysis window size */
|
||||
int pmin, /* minimum pitch value */
|
||||
int pmin, /* minimum pitch value */
|
||||
int pmax, /* maximum pitch value */
|
||||
float *pitch, /* estimated pitch period in samples */
|
||||
COMP Sw[], /* Freq domain version of Sn[] */
|
||||
COMP W[], /* Freq domain window */
|
||||
float *prev_Wo
|
||||
)
|
||||
{
|
||||
NLP *nlp;
|
||||
float notch; /* current notch filter output */
|
||||
COMP Fw[PE_FFT_SIZE]; /* DFT of squared signal */
|
||||
float notch; /* current notch filter output */
|
||||
COMP fw[PE_FFT_SIZE]; /* DFT of squared signal (input) */
|
||||
COMP Fw[PE_FFT_SIZE]; /* DFT of squared signal (output) */
|
||||
float gmax;
|
||||
int gmax_bin;
|
||||
int i,j;
|
||||
float best_f0;
|
||||
|
||||
assert(nlp_state != NULL);
|
||||
assert(m <= PMAX_M);
|
||||
nlp = (NLP*)nlp_state;
|
||||
|
||||
/* Square, notch filter at DC, and LP filter vector */
|
||||
|
||||
for(i=m-n; i<M; i++) /* square latest speech samples */
|
||||
for(i=m-n; i<m; i++) /* square latest speech samples */
|
||||
nlp->sq[i] = Sn[i]*Sn[i];
|
||||
|
||||
for(i=m-n; i<m; i++) { /* notch filter at DC */
|
||||
@@ -226,7 +237,14 @@ float nlp(
|
||||
notch += COEFF*nlp->mem_y;
|
||||
nlp->mem_x = nlp->sq[i];
|
||||
nlp->mem_y = notch;
|
||||
nlp->sq[i] = notch;
|
||||
nlp->sq[i] = notch + 1.0; /* With 0 input vectors to codec,
|
||||
kiss_fft() would take a long
|
||||
time to execute when running in
|
||||
real time. Problem was traced
|
||||
to kiss_fft function call in
|
||||
this function. Adding this small
|
||||
constant fixed problem. Not
|
||||
exactly sure why. */
|
||||
}
|
||||
|
||||
for(i=m-n; i<m; i++) { /* FIR filter vector */
|
||||
@@ -243,19 +261,24 @@ float nlp(
|
||||
/* Decimate and DFT */
|
||||
|
||||
for(i=0; i<PE_FFT_SIZE; i++) {
|
||||
Fw[i].real = 0.0;
|
||||
Fw[i].imag = 0.0;
|
||||
fw[i].real = 0.0;
|
||||
fw[i].imag = 0.0;
|
||||
}
|
||||
for(i=0; i<m/DEC; i++) {
|
||||
Fw[i].real = nlp->sq[i*DEC]*(0.5 - 0.5*cos(2*PI*i/(m/DEC-1)));
|
||||
fw[i].real = nlp->sq[i*DEC]*(0.5 - 0.5*cos(2*PI*i/(m/DEC-1)));
|
||||
}
|
||||
#ifdef DUMP
|
||||
dump_dec(Fw);
|
||||
four1(&Fw[-1].imag,PE_FFT_SIZE,1);
|
||||
#endif
|
||||
|
||||
kiss_fft(nlp->fft_cfg, (kiss_fft_cpx *)fw, (kiss_fft_cpx *)Fw);
|
||||
for(i=0; i<PE_FFT_SIZE; i++)
|
||||
Fw[i].real = Fw[i].real*Fw[i].real + Fw[i].imag*Fw[i].imag;
|
||||
|
||||
#ifdef DUMP
|
||||
dump_sq(nlp->sq);
|
||||
dump_Fw(Fw);
|
||||
#endif
|
||||
|
||||
/* find global peak */
|
||||
|
||||
@@ -267,9 +290,13 @@ float nlp(
|
||||
gmax_bin = i;
|
||||
}
|
||||
}
|
||||
|
||||
best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin,
|
||||
prev_Wo);
|
||||
|
||||
//#define POST_PROCESS_MBE
|
||||
#ifdef POST_PROCESS_MBE
|
||||
best_f0 = post_process_mbe(Fw, pmin, pmax, gmax, Sw, W, prev_Wo);
|
||||
#else
|
||||
best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin, prev_Wo);
|
||||
#endif
|
||||
|
||||
/* Shift samples in buffer to make room for new samples */
|
||||
|
||||
@@ -286,7 +313,7 @@ float nlp(
|
||||
|
||||
post_process_sub_multiples()
|
||||
|
||||
Given the global maximma of Fw[] we search interger submultiples for
|
||||
Given the global maximma of Fw[] we search integer submultiples for
|
||||
local maxima. If local maxima exist and they are above an
|
||||
experimentally derived threshold (OK a magic number I pulled out of
|
||||
the air) we choose the submultiple as the F0 estimate.
|
||||
@@ -317,10 +344,10 @@ float post_process_sub_multiples(COMP Fw[],
|
||||
/* post process estimate by searching submultiples */
|
||||
|
||||
mult = 2;
|
||||
min_bin = PE_FFT_SIZE*DEC/pmax;
|
||||
min_bin = PE_FFT_SIZE*DEC/pmax;
|
||||
cmax_bin = gmax_bin;
|
||||
prev_f0_bin = *prev_Wo*(4000.0/PI)*(PE_FFT_SIZE*DEC)/SAMPLE_RATE;
|
||||
|
||||
|
||||
while(gmax_bin/mult >= min_bin) {
|
||||
|
||||
b = gmax_bin/mult; /* determine search interval */
|
||||
@@ -339,7 +366,7 @@ float post_process_sub_multiples(COMP Fw[],
|
||||
|
||||
lmax = 0;
|
||||
lmax_bin = bmin;
|
||||
for (b=bmin; b<=bmax; b++) /* look for maximum in interval */
|
||||
for (b=bmin; b<=bmax; b++) /* look for maximum in interval */
|
||||
if (Fw[b].real > lmax) {
|
||||
lmax = Fw[b].real;
|
||||
lmax_bin = b;
|
||||
@@ -359,3 +386,158 @@ float post_process_sub_multiples(COMP Fw[],
|
||||
return best_f0;
|
||||
}
|
||||
|
||||
/*---------------------------------------------------------------------------*\
|
||||
|
||||
post_process_mbe()
|
||||
|
||||
Use the MBE pitch estimation algorithm to evaluate pitch candidates. This
|
||||
works OK but the accuracy at low F0 is affected by NW, the analysis window
|
||||
size used for the DFT of the input speech Sw[]. Also favours high F0 in
|
||||
the presence of background noise which causes periodic artifacts in the
|
||||
synthesised speech.
|
||||
|
||||
\*---------------------------------------------------------------------------*/
|
||||
|
||||
float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax, COMP Sw[], COMP W[], float *prev_Wo)
|
||||
{
|
||||
float candidate_f0;
|
||||
float f0,best_f0; /* fundamental frequency */
|
||||
float e,e_min; /* MBE cost function */
|
||||
int i;
|
||||
float e_hz[F0_MAX];
|
||||
int bin;
|
||||
float f0_min, f0_max;
|
||||
float f0_start, f0_end;
|
||||
|
||||
f0_min = (float)SAMPLE_RATE/pmax;
|
||||
f0_max = (float)SAMPLE_RATE/pmin;
|
||||
|
||||
/* Now look for local maxima. Each local maxima is a candidate
|
||||
that we test using the MBE pitch estimation algotithm */
|
||||
|
||||
for(i=0; i<F0_MAX; i++)
|
||||
e_hz[i] = -1;
|
||||
e_min = 1E32;
|
||||
best_f0 = 50;
|
||||
for(i=PE_FFT_SIZE*DEC/pmax; i<=PE_FFT_SIZE*DEC/pmin; i++) {
|
||||
if ((Fw[i].real > Fw[i-1].real) && (Fw[i].real > Fw[i+1].real)) {
|
||||
|
||||
/* local maxima found, lets test if it's big enough */
|
||||
|
||||
if (Fw[i].real > T*gmax) {
|
||||
|
||||
/* OK, sample MBE cost function over +/- 10Hz range in 2.5Hz steps */
|
||||
|
||||
candidate_f0 = (float)i*SAMPLE_RATE/(PE_FFT_SIZE*DEC);
|
||||
f0_start = candidate_f0-20;
|
||||
f0_end = candidate_f0+20;
|
||||
if (f0_start < f0_min) f0_start = f0_min;
|
||||
if (f0_end > f0_max) f0_end = f0_max;
|
||||
|
||||
for(f0=f0_start; f0<=f0_end; f0+= 2.5) {
|
||||
e = test_candidate_mbe(Sw, W, f0);
|
||||
bin = floor(f0); assert((bin > 0) && (bin < F0_MAX));
|
||||
e_hz[bin] = e;
|
||||
if (e < e_min) {
|
||||
e_min = e;
|
||||
best_f0 = f0;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* finally sample MBE cost function around previous pitch estimate
|
||||
(form of pitch tracking) */
|
||||
|
||||
candidate_f0 = *prev_Wo * SAMPLE_RATE/TWO_PI;
|
||||
f0_start = candidate_f0-20;
|
||||
f0_end = candidate_f0+20;
|
||||
if (f0_start < f0_min) f0_start = f0_min;
|
||||
if (f0_end > f0_max) f0_end = f0_max;
|
||||
|
||||
for(f0=f0_start; f0<=f0_end; f0+= 2.5) {
|
||||
e = test_candidate_mbe(Sw, W, f0);
|
||||
bin = floor(f0); assert((bin > 0) && (bin < F0_MAX));
|
||||
e_hz[bin] = e;
|
||||
if (e < e_min) {
|
||||
e_min = e;
|
||||
best_f0 = f0;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef DUMP
|
||||
dump_e(e_hz);
|
||||
#endif
|
||||
|
||||
return best_f0;
|
||||
}
|
||||
|
||||
/*---------------------------------------------------------------------------*\
|
||||
|
||||
test_candidate_mbe()
|
||||
|
||||
Returns the error of the MBE cost function for the input f0.
|
||||
|
||||
Note: I think a lot of the operations below can be simplified as
|
||||
W[].imag = 0 and has been normalised such that den always equals 1.
|
||||
|
||||
\*---------------------------------------------------------------------------*/
|
||||
|
||||
float test_candidate_mbe(
|
||||
COMP Sw[],
|
||||
COMP W[],
|
||||
float f0
|
||||
)
|
||||
{
|
||||
COMP Sw_[FFT_ENC]; /* DFT of all voiced synthesised signal */
|
||||
int l,al,bl,m; /* loop variables */
|
||||
COMP Am; /* amplitude sample for this band */
|
||||
int offset; /* centers Hw[] about current harmonic */
|
||||
float den; /* denominator of Am expression */
|
||||
float error; /* accumulated error between originl and synthesised */
|
||||
float Wo; /* current "test" fundamental freq. */
|
||||
int L;
|
||||
|
||||
L = floor((SAMPLE_RATE/2.0)/f0);
|
||||
Wo = f0*(2*PI/SAMPLE_RATE);
|
||||
|
||||
error = 0.0;
|
||||
|
||||
/* Just test across the harmonics in the first 1000 Hz (L/4) */
|
||||
|
||||
for(l=1; l<L/4; l++) {
|
||||
Am.real = 0.0;
|
||||
Am.imag = 0.0;
|
||||
den = 0.0;
|
||||
al = ceil((l - 0.5)*Wo*FFT_ENC/TWO_PI);
|
||||
bl = ceil((l + 0.5)*Wo*FFT_ENC/TWO_PI);
|
||||
|
||||
/* Estimate amplitude of harmonic assuming harmonic is totally voiced */
|
||||
|
||||
for(m=al; m<bl; m++) {
|
||||
offset = FFT_ENC/2 + m - l*Wo*FFT_ENC/TWO_PI + 0.5;
|
||||
Am.real += Sw[m].real*W[offset].real + Sw[m].imag*W[offset].imag;
|
||||
Am.imag += Sw[m].imag*W[offset].real - Sw[m].real*W[offset].imag;
|
||||
den += W[offset].real*W[offset].real + W[offset].imag*W[offset].imag;
|
||||
}
|
||||
|
||||
Am.real = Am.real/den;
|
||||
Am.imag = Am.imag/den;
|
||||
|
||||
/* Determine error between estimated harmonic and original */
|
||||
|
||||
for(m=al; m<bl; m++) {
|
||||
offset = FFT_ENC/2 + m - l*Wo*FFT_ENC/TWO_PI + 0.5;
|
||||
Sw_[m].real = Am.real*W[offset].real - Am.imag*W[offset].imag;
|
||||
Sw_[m].imag = Am.real*W[offset].imag + Am.imag*W[offset].real;
|
||||
error += (Sw[m].real - Sw_[m].real)*(Sw[m].real - Sw_[m].real);
|
||||
error += (Sw[m].imag - Sw_[m].imag)*(Sw[m].imag - Sw_[m].imag);
|
||||
}
|
||||
}
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user