1/* 2 * This software has been licensed to the Centre of Speech Technology, KTH 3 * by Microsoft Corp. with the terms in the accompanying file BSD.txt, 4 * which is a BSD style license. 5 * 6 * "Copyright (c) 1990-1996 Entropic Research Laboratory, Inc. 7 * All rights reserved" 8 * 9 * Written by: Derek Lin 10 * Checked by: 11 * Revised by: David Talkin 12 * 13 * Brief description: Estimates F0 using normalized cross correlation and 14 * dynamic programming. 15 * 16 */ 17 18 19#include "snack.h" 20#include <math.h> 21#include <stdlib.h> 22#include <string.h> 23#include <stdio.h> 24#include <limits.h> 25#ifndef TRUE 26# define TRUE 1 27# define FALSE 0 28#endif 29#ifndef FLT_MAX 30# define FLT_MAX (3.40282347E+38f) 31#endif 32#ifndef M_PI 33# define M_PI (3.1415926536f) 34#endif 35#include "jkGetF0.h" 36 37int debug_level = 0; 38 39void free_dp_f0(); 40static int check_f0_params(); 41 42int 43Get_f0(Sound *sound, Tcl_Interp *interp, int objc, Tcl_Obj *CONST objv[]) 44{ 45 float *fdata; 46 int done; 47 long buff_size, actsize; 48 double sf, start_time; 49 F0_params *par, *read_f0_params(); 50 float *f0p, *vuvp, *rms_speech, *acpkp; 51 int i, vecsize; 52 int init_dp_f0(), dp_f0(); 53 static int framestep = -1; 54 long sdstep = 0, total_samps; 55 int ndone = 0; 56 Tcl_Obj *list; 57 double framestep2 = 0.0, wind_dur; 58 59 int arg, startpos = 0, endpos = -1, fmax, fmin; 60 static CONST84 char *subOptionStrings[] = { 61 "-start", "-end", "-maxpitch", "-minpitch", "-progress", 62 "-framelength", "-method", "-windowlength", NULL 63 }; 64 enum subOptions { 65 START, END, F0MAX, F0MIN, PROGRESS, FRAME, METHOD, WINLEN 66 }; 67 68 if (sound->cmdPtr != NULL) { 69 Tcl_DecrRefCount(sound->cmdPtr); 70 sound->cmdPtr = NULL; 71 } 72 73 par = (F0_params *) ckalloc(sizeof(F0_params)); 74 par->cand_thresh = 0.3f; 75 par->lag_weight = 0.3f; 76 par->freq_weight = 0.02f; 77 par->trans_cost = 0.005f; 78 par->trans_amp = 0.5f; 79 par->trans_spec = 0.5f; 80 par->voice_bias = 0.0f; 81 par->double_cost = 0.35f; 82 par->min_f0 = 50; 83 par->max_f0 = 550; 84 par->frame_step = 0.01f; 85 par->wind_dur = 0.0075f; 86 par->n_cands = 20; 87 par->mean_f0 = 200; /* unused */ 88 par->mean_f0_weight = 0.0f; /* unused */ 89 par->conditioning = 0; /*unused */ 90 91 for (arg = 2; arg < objc; arg += 2) { 92 int index; 93 94 if (Tcl_GetIndexFromObj(interp, objv[arg], subOptionStrings, 95 "option", 0, &index) != TCL_OK) { 96 return TCL_ERROR; 97 } 98 99 if (arg + 1 == objc) { 100 Tcl_AppendResult(interp, "No argument given for ", 101 subOptionStrings[index], " option", (char *) NULL); 102 return TCL_ERROR; 103 } 104 105 switch ((enum subOptions) index) { 106 case START: 107 { 108 if (Tcl_GetIntFromObj(interp, objv[arg+1], &startpos) != TCL_OK) 109 return TCL_ERROR; 110 break; 111 } 112 case END: 113 { 114 if (Tcl_GetIntFromObj(interp, objv[arg+1], &endpos) != TCL_OK) 115 return TCL_ERROR; 116 break; 117 } 118 case F0MAX: 119 { 120 if (Tcl_GetIntFromObj(interp, objv[arg+1], &fmax) != TCL_OK) 121 return TCL_ERROR; 122 par->max_f0 = (float) fmax; 123 break; 124 } 125 case F0MIN: 126 { 127 if (Tcl_GetIntFromObj(interp, objv[arg+1], &fmin) != TCL_OK) 128 return TCL_ERROR; 129 par->min_f0 = (float) fmin; 130 break; 131 } 132 case PROGRESS: 133 { 134 char *str = Tcl_GetStringFromObj(objv[arg+1], NULL); 135 136 if (strlen(str) > 0) { 137 Tcl_IncrRefCount(objv[arg+1]); 138 sound->cmdPtr = objv[arg+1]; 139 } 140 break; 141 } 142 case FRAME: 143 { 144 if (Tcl_GetDoubleFromObj(interp, objv[arg+1], &framestep2) 145 != TCL_OK) 146 return TCL_ERROR; 147 par->frame_step = (float) framestep2; 148 break; 149 } 150 case METHOD: 151 { 152 break; 153 } 154 case WINLEN: 155 { 156 if (Tcl_GetDoubleFromObj(interp, objv[arg+1], &wind_dur) 157 != TCL_OK) 158 return TCL_ERROR; 159 par->wind_dur = (float) wind_dur; 160 break; 161 } 162 } 163 } 164 if (startpos < 0) startpos = 0; 165 if (endpos >= (sound->length - 1) || endpos == -1) 166 endpos = sound->length - 1; 167 if (startpos > endpos) return TCL_OK; 168 169 sf = (double) sound->samprate; 170 171 if (framestep > 0) /* If a value was specified with -S, use it. */ 172 par->frame_step = (float) (framestep / sf); 173 start_time = 0.0f; 174 if(check_f0_params(interp, par, sf)){ 175 Tcl_AppendResult(interp, "invalid/inconsistent parameters -- exiting.", NULL); 176 return TCL_ERROR; 177 } 178 179 total_samps = endpos - startpos + 1; 180 if(total_samps < ((par->frame_step * 2.0) + par->wind_dur) * sf) { 181 Tcl_AppendResult(interp, "input range too small for analysis by get_f0.", NULL); 182 return TCL_ERROR; 183 } 184 /* Initialize variables in get_f0.c; allocate data structures; 185 * determine length and overlap of input frames to read. 186 */ 187 if (init_dp_f0(sf, par, &buff_size, &sdstep) 188 || buff_size > INT_MAX || sdstep > INT_MAX) 189 { 190 Tcl_AppendResult(interp, "problem in init_dp_f0().", NULL); 191 return TCL_ERROR; 192 } 193 194 if (debug_level) 195 Fprintf(stderr, "init_dp_f0 returned buff_size %ld, sdstep %ld.\n",buff_size, sdstep); 196 197 if (buff_size > total_samps) 198 buff_size = total_samps; 199 200 actsize = min(buff_size, sound->length); 201 fdata = (float *) ckalloc(sizeof(float) * max(buff_size, sdstep)); 202 list = Tcl_NewListObj(0, NULL); 203 Snack_ProgressCallback(sound->cmdPtr, interp, "Computing pitch", 0.0); 204 ndone = startpos; 205 206 while (TRUE) { 207 done = (actsize < buff_size) || (total_samps == buff_size); 208 Snack_GetSoundData(sound, ndone, fdata, actsize); 209 /*if (sound->debug > 0) Snack_WriteLog("dp_f0...\n");*/ 210 if (dp_f0(fdata, (int) actsize, (int) sdstep, sf, par, 211 &f0p, &vuvp, &rms_speech, &acpkp, &vecsize, done)) { 212 Tcl_AppendResult(interp, "problem in dp_f0().", NULL); 213 return TCL_ERROR; 214 } 215 /*if (sound->debug > 0) Snack_WriteLogInt("done dp_f0",vecsize);*/ 216 for (i = vecsize - 1; i >= 0; i--) { 217 Tcl_Obj *frameList; 218 frameList = Tcl_NewListObj(0, NULL); 219 Tcl_ListObjAppendElement(interp, list, frameList); 220 Tcl_ListObjAppendElement(interp, frameList, 221 Tcl_NewDoubleObj((double)f0p[i])); 222 Tcl_ListObjAppendElement(interp, frameList, 223 Tcl_NewDoubleObj((double)vuvp[i])); 224 Tcl_ListObjAppendElement(interp, frameList, 225 Tcl_NewDoubleObj((double)rms_speech[i])); 226 Tcl_ListObjAppendElement(interp, frameList, 227 Tcl_NewDoubleObj((double)acpkp[i])); 228 } 229 230 if (done) break; 231 232 ndone += sdstep; 233 actsize = min(buff_size, sound->length - ndone); 234 total_samps -= sdstep; 235 236 if (actsize > total_samps) 237 actsize = total_samps; 238 239 if (1) { 240 int res = Snack_ProgressCallback(sound->cmdPtr, interp, "Computing pitch", (double) ndone / sound->length); 241 if (res != TCL_OK) { 242 return TCL_ERROR; 243 } 244 } 245 } 246 247 Snack_ProgressCallback(sound->cmdPtr, interp, "Computing pitch", 1.0); 248 249 ckfree((void *)fdata); 250 251 ckfree((void *)par); 252 253 free_dp_f0(); 254 255 Tcl_SetObjResult(interp, list); 256 257 return TCL_OK; 258} 259 260 261/* 262 * Some consistency checks on parameter values. 263 * Return a positive integer if any errors detected, 0 if none. 264 */ 265 266static int 267check_f0_params(Tcl_Interp *interp, F0_params *par, double sample_freq) 268{ 269 int error = 0; 270 double dstep; 271 272 if((par->cand_thresh < 0.01) || (par->cand_thresh > 0.99)) { 273 Tcl_AppendResult(interp, 274 "ERROR: cand_thresh parameter must be between [0.01, 0.99].",NULL); 275 error++; 276 } 277 if((par->wind_dur > .1) || (par->wind_dur < .0001)) { 278 Tcl_AppendResult(interp,"ERROR: wind_dur parameter must be between [0.0001, 0.1].",NULL); 279 error++; 280 } 281 if((par->n_cands > 100) || (par->n_cands < 3)){ 282 Tcl_AppendResult(interp, 283 "ERROR: n_cands parameter must be between [3,100].",NULL); 284 error++; 285 } 286 if((par->max_f0 <= par->min_f0) || (par->max_f0 >= (sample_freq/2.0)) || 287 (par->min_f0 < (sample_freq/10000.0))){ 288 Tcl_AppendResult(interp, 289 "ERROR: min(max)_f0 parameter inconsistent with sampling frequency.",NULL); 290 error++; 291 } 292 dstep = ((double)((int)(0.5 + (sample_freq * par->frame_step))))/sample_freq; 293 if(dstep != par->frame_step) { 294 if(debug_level) 295 Tcl_AppendResult(interp, 296 "Frame step set to exactly match signal sample rate.",NULL); 297 par->frame_step = (float) dstep; 298 } 299 if((par->frame_step > 0.1) || (par->frame_step < (1.0/sample_freq))){ 300 Tcl_AppendResult(interp, 301 "ERROR: frame_step parameter must be between [1/sampling rate, 0.1].",NULL); 302 error++; 303 } 304 305 return(error); 306} 307 308static void get_cand(), peak(), do_ffir(); 309static int lc_lin_fir(), downsamp(); 310 311/* ----------------------------------------------------------------------- */ 312void get_fast_cands(fdata, fdsdata, ind, step, size, dec, start, nlags, engref, maxloc, maxval, cp, peaks, locs, ncand, par) 313 float *fdata, *fdsdata, *engref, *maxval, *peaks; 314 int size, start, nlags, *maxloc, *locs, *ncand, ind, step, dec; 315 Cross *cp; 316 F0_params *par; 317{ 318 int decind, decstart, decnlags, decsize, i, j, *lp; 319 float *corp, xp, yp, lag_wt; 320 register float *pe; 321 322 lag_wt = par->lag_weight/nlags; 323 decnlags = 1 + (nlags/dec); 324 if((decstart = start/dec) < 1) decstart = 1; 325 decind = (ind * step)/dec; 326 decsize = 1 + (size/dec); 327 corp = cp->correl; 328 329 crossf(fdsdata + decind, decsize, decstart, decnlags, engref, maxloc, 330 maxval, corp); 331 cp->maxloc = *maxloc; /* location of maximum in correlation */ 332 cp->maxval = *maxval; /* max. correlation value (found at maxloc) */ 333 cp->rms = (float) sqrt(*engref/size); /* rms in reference window */ 334 cp->firstlag = decstart; 335 336 get_cand(cp,peaks,locs,decnlags,ncand,par->cand_thresh); /* return high peaks in xcorr */ 337 338 /* Interpolate to estimate peak locations and values at high sample rate. */ 339 for(i = *ncand, lp = locs, pe = peaks; i--; pe++, lp++) { 340 j = *lp - decstart - 1; 341 peak(&corp[j],&xp,&yp); 342 *lp = (*lp * dec) + (int)(0.5+(xp*dec)); /* refined lag */ 343 *pe = yp*(1.0f - (lag_wt* *lp)); /* refined amplitude */ 344 } 345 346 if(*ncand >= par->n_cands) { /* need to prune candidates? */ 347 register int *loc, *locm, lt; 348 register float smaxval, *pem; 349 register int outer, inner, lim; 350 for(outer=0, lim = par->n_cands-1; outer < lim; outer++) 351 for(inner = *ncand - 1 - outer, 352 pe = peaks + (*ncand) -1, pem = pe-1, 353 loc = locs + (*ncand) - 1, locm = loc-1; 354 inner--; 355 pe--,pem--,loc--,locm--) 356 if((smaxval = *pe) > *pem) { 357 *pe = *pem; 358 *pem = smaxval; 359 lt = *loc; 360 *loc = *locm; 361 *locm = lt; 362 } 363 *ncand = par->n_cands-1; /* leave room for the unvoiced hypothesis */ 364 } 365 crossfi(fdata + (ind * step), size, start, nlags, 7, engref, maxloc, 366 maxval, corp, locs, *ncand); 367 368 cp->maxloc = *maxloc; /* location of maximum in correlation */ 369 cp->maxval = *maxval; /* max. correlation value (found at maxloc) */ 370 cp->rms = (float) sqrt(*engref/size); /* rms in reference window */ 371 cp->firstlag = start; 372 get_cand(cp,peaks,locs,nlags,ncand,par->cand_thresh); /* return high peaks in xcorr */ 373 if(*ncand >= par->n_cands) { /* need to prune candidates again? */ 374 register int *loc, *locm, lt; 375 register float smaxval, *pe, *pem; 376 register int outer, inner, lim; 377 for(outer=0, lim = par->n_cands-1; outer < lim; outer++) 378 for(inner = *ncand - 1 - outer, 379 pe = peaks + (*ncand) -1, pem = pe-1, 380 loc = locs + (*ncand) - 1, locm = loc-1; 381 inner--; 382 pe--,pem--,loc--,locm--) 383 if((smaxval = *pe) > *pem) { 384 *pe = *pem; 385 *pem = smaxval; 386 lt = *loc; 387 *loc = *locm; 388 *locm = lt; 389 } 390 *ncand = par->n_cands - 1; /* leave room for the unvoiced hypothesis */ 391 } 392} 393 394/* ----------------------------------------------------------------------- */ 395float *downsample(input,samsin,state_idx,freq,samsout,decimate, first_time, last_time) 396 double freq; 397 float *input; 398 int samsin, *samsout, decimate, state_idx, first_time, last_time; 399{ 400 static float b[2048]; 401 static float *foutput = NULL; 402 float beta = 0.0f; 403 static int ncoeff = 127, ncoefft = 0; 404 int init; 405 406 if(input && (samsin > 0) && (decimate > 0) && *samsout) { 407 if(decimate == 1) { 408 return(input); 409 } 410 411 if(first_time){ 412 int nbuff = (samsin/decimate) + (2*ncoeff); 413 414 ncoeff = ((int)(freq * .005)) | 1; 415 beta = .5f/decimate; 416 foutput = (float*)ckrealloc((void *)foutput, sizeof(float) * nbuff); 417 /* spsassert(foutput, "Can't allocate foutput in downsample");*/ 418 for( ; nbuff > 0 ;) 419 foutput[--nbuff] = 0.0; 420 421 if( !lc_lin_fir(beta,&ncoeff,b)) { 422 fprintf(stderr,"\nProblems computing interpolation filter\n"); 423 ckfree((void *)foutput); 424 return(NULL); 425 } 426 ncoefft = (ncoeff/2) + 1; 427 } /* endif new coefficients need to be computed */ 428 429 if(first_time) init = 1; 430 else if (last_time) init = 2; 431 else init = 0; 432 433 if(downsamp(input,foutput,samsin,samsout,state_idx,decimate,ncoefft,b,init)) { 434 return(foutput); 435 } else 436 Fprintf(stderr,"Problems in downsamp() in downsample()\n"); 437 } else 438 Fprintf(stderr,"Bad parameters passed to downsample()\n"); 439 440 return(NULL); 441} 442 443/* ----------------------------------------------------------------------- */ 444/* Get likely candidates for F0 peaks. */ 445static void get_cand(cross,peak,loc,nlags,ncand,cand_thresh) 446 Cross *cross; 447 float *peak, cand_thresh; 448 int *loc; 449 int *ncand, nlags; 450{ 451 register int i, lastl, *t; 452 register float o, p, q, *r, *s, clip; 453 int start, ncan, maxl; 454 455 clip = (float) (cand_thresh * cross->maxval); 456 maxl = cross->maxloc; 457 lastl = nlags - 2; 458 start = cross->firstlag; 459 460 r = cross->correl; 461 o= *r++; /* first point */ 462 q = *r++; /* middle point */ 463 p = *r++; 464 s = peak; 465 t = loc; 466 ncan=0; 467 for(i=1; i < lastl; i++, o=q, q=p, p= *r++){ 468 if((q > clip) && /* is this a high enough value? */ 469 (q >= p) && (q >= o)){ /* NOTE: this finds SHOLDERS and PLATEAUS 470 as well as peaks (is this a good idea?) */ 471 *s++ = q; /* record the peak value */ 472 *t++ = i + start; /* and its location */ 473 ncan++; /* count number of peaks found */ 474 } 475 } 476/* 477 o = q; 478 q = p; 479 if( (q > clip) && (q >=0)){ 480 *s++ = q; 481 *t++ = i+start; 482 ncan++; 483 } 484*/ 485 *ncand = ncan; 486} 487 488/* ----------------------------------------------------------------------- */ 489/* buffer-to-buffer downsample operation */ 490/* This is STRICTLY a decimator! (no upsample) */ 491static int downsamp(in, out, samples, outsamps, state_idx, decimate, ncoef, fc, init) 492 float *in, *out; 493 int samples, *outsamps, decimate, ncoef, state_idx; 494 float fc[]; 495 int init; 496{ 497 if(in && out) { 498 do_ffir(in, samples, out, outsamps, state_idx, ncoef, fc, 0, decimate, init); 499 return(TRUE); 500 } else 501 printf("Bad signal(s) passed to downsamp()\n"); 502 return(FALSE); 503} 504 505/* ---------------------------------------------------------- */ 506static void do_ffir(buf,in_samps,bufo,out_samps,idx, ncoef,fc,invert,skip,init) 507/* fc contains 1/2 the coefficients of a symmetric FIR filter with unity 508 passband gain. This filter is convolved with the signal in buf. 509 The output is placed in buf2. If(invert), the filter magnitude 510 response will be inverted. If(init&1), beginning of signal is in buf; 511 if(init&2), end of signal is in buf. out_samps is set to the number of 512 output points placed in bufo. */ 513register float *buf, *bufo; 514float *fc; 515register int in_samps, ncoef, invert, skip, init, *out_samps; 516int idx; 517{ 518 register float *dp1, *dp2, *dp3, sum, integral; 519 static float *co=NULL, *mem=NULL; 520 static float state[1000]; 521 static int fsize=0, resid=0; 522 register int i, j, k, l; 523 register float *sp; 524 register float *buf1; 525 526 buf1 = buf; 527 if(ncoef > fsize) {/*allocate memory for full coeff. array and filter memory */ fsize = 0; 528 i = (ncoef+1)*2; 529 if(!((co = (float *)ckrealloc((void *)co, sizeof(float)*i)) && 530 (mem = (float *)ckrealloc((void *)mem, sizeof(float)*i)))) { 531 fprintf(stderr,"allocation problems in do_fir()\n"); 532 return; 533 } 534 fsize = ncoef; 535 } 536 537 /* fill 2nd half with data */ 538 for(i=ncoef, dp1=mem+ncoef-1; i-- > 0; ) *dp1++ = *buf++; 539 540 if(init & 1) { /* Is the beginning of the signal in buf? */ 541 /* Copy the half-filter and its mirror image into the coefficient array. */ 542 for(i=ncoef-1, dp3=fc+ncoef-1, dp2=co, dp1 = co+((ncoef-1)*2), 543 integral = 0.0; i-- > 0; ) 544 if(!invert) *dp1-- = *dp2++ = *dp3--; 545 else { 546 integral += (sum = *dp3--); 547 *dp1-- = *dp2++ = -sum; 548 } 549 if(!invert) *dp1 = *dp3; /* point of symmetry */ 550 else { 551 integral *= 2; 552 integral += *dp3; 553 *dp1 = integral - *dp3; 554 } 555 556 for(i=ncoef-1, dp1=mem; i-- > 0; ) *dp1++ = 0; 557 } 558 else 559 for(i=ncoef-1, dp1=mem, sp=state; i-- > 0; ) *dp1++ = *sp++; 560 561 i = in_samps; 562 resid = 0; 563 564 k = (ncoef << 1) -1; /* inner-product loop limit */ 565 566 if(skip <= 1) { /* never used */ 567/* *out_samps = i; 568 for( ; i-- > 0; ) { 569 for(j=k, dp1=mem, dp2=co, dp3=mem+1, sum = 0.0; j-- > 0; 570 *dp1++ = *dp3++ ) 571 sum += *dp2++ * *dp1; 572 573 *--dp1 = *buf++; 574 *bufo++ = (sum < 0.0)? sum -0.5 : sum +0.5; 575 } 576 if(init & 2) { 577 for(i=ncoef; i-- > 0; ) { 578 for(j=k, dp1=mem, dp2=co, dp3=mem+1, sum = 0.0; j-- > 0; 579 *dp1++ = *dp3++ ) 580 sum += *dp2++ * *dp1; 581 *--dp1 = 0.0; 582 *bufo++ = (sum < 0)? sum -0.5 : sum +0.5; 583 } 584 *out_samps += ncoef; 585 } 586 return; 587*/ 588 } 589 else { /* skip points (e.g. for downsampling) */ 590 /* the buffer end is padded with (ncoef-1) data points */ 591 for( l=0 ; l < *out_samps; l++ ) { 592 for(j=k-skip, dp1=mem, dp2=co, dp3=mem+skip, sum=0.0; j-- >0; 593 *dp1++ = *dp3++) 594 sum += *dp2++ * *dp1; 595 for(j=skip; j-- >0; *dp1++ = *buf++) /* new data to memory */ 596 sum += *dp2++ * *dp1; 597 *bufo++ = (sum<0.0) ? sum -0.5f : sum +0.5f; 598 } 599 if(init & 2){ 600 resid = in_samps - *out_samps * skip; 601 for(l=resid/skip; l-- >0; ){ 602 for(j=k-skip, dp1=mem, dp2=co, dp3=mem+skip, sum=0.0; j-- >0; 603 *dp1++ = *dp3++) 604 sum += *dp2++ * *dp1; 605 for(j=skip; j-- >0; *dp1++ = 0.0) 606 sum += *dp2++ * *dp1; 607 *bufo++ = (sum<0.0) ? sum -0.5f : sum +0.5f; 608 (*out_samps)++; 609 } 610 } 611 else 612 for(dp3=buf1+idx-ncoef+1, l=ncoef-1, sp=state; l-- >0; ) *sp++ = *dp3++; 613 } 614} 615 616/* ---------------------------------------------------------- */ 617static int lc_lin_fir(fc,nf,coef) 618/* create the coefficients for a symmetric FIR lowpass filter using the 619 window technique with a Hanning window. */ 620register float fc; 621float *coef; 622int *nf; 623{ 624 register int i, n; 625 register double twopi, fn, c; 626 627 if(((*nf % 2) != 1)) 628 *nf = *nf + 1; 629 n = (*nf + 1)/2; 630 631 /* Compute part of the ideal impulse response (the sin(x)/x kernel). */ 632 twopi = M_PI * 2.0; 633 coef[0] = (float) (2.0 * fc); 634 c = M_PI; 635 fn = twopi * fc; 636 for(i=1;i < n; i++) coef[i] = (float)(sin(i * fn)/(c * i)); 637 638 /* Now apply a Hanning window to the (infinite) impulse response. */ 639 /* (Probably should use a better window, like Kaiser...) */ 640 fn = twopi/(double)(*nf); 641 for(i=0;i<n;i++) 642 coef[n-i-1] *= (float)((.5 - (.5 * cos(fn * ((double)i + 0.5))))); 643 644 return(TRUE); 645} 646 647 648/* ----------------------------------------------------------------------- */ 649/* Use parabolic interpolation over the three points defining the peak 650 * vicinity to estimate the "true" peak. */ 651static void peak(y, xp, yp) 652 float *y, /* vector of length 3 defining peak */ 653 *xp, *yp; /* x,y values of parabolic peak fitting the input points. */ 654{ 655 register float a, c; 656 657 a = (float)((y[2]-y[1])+(.5*(y[0]-y[2]))); 658 if(fabs(a) > .000001) { 659 *xp = c = (float)((y[0]-y[2])/(4.0*a)); 660 *yp = y[1] - (a*c*c); 661 } else { 662 *xp = 0.0; 663 *yp = y[1]; 664 } 665} 666 667/* A fundamental frequency estimation algorithm using the normalized 668 cross correlation function and dynamic programming. The algorithm 669 implemented here is similar to that presented by B. Secrest and 670 G. Doddington, "An integrated pitch tracking algorithm for speech 671 systems", Proc. ICASSP-83, pp.1352-1355. It is fully described 672 by D. Talkin, "A robust algorithm for ptich tracking (RAPT)", in 673 W. B. Kleijn & K. K. Paliwal (eds.) Speech Coding and Synthesis, 674 (New York: Elsevier, 1995). */ 675 676/* For each frame, up to par->n_cands cross correlation peaks are 677 considered as F0 intervals. Each is scored according to its within- 678 frame properties (relative amplitude, relative location), and 679 according to its connectivity with each of the candidates in the 680 previous frame. An unvoiced hypothesis is also generated at each 681 frame and is considered in the light of voicing state change cost, 682 the quality of the cross correlation peak, and frequency continuity. */ 683 684/* At each frame, each candidate has associated with it the following 685 items: 686 its peak value 687 its peak value modified by its within-frame properties 688 its location 689 the candidate # in the previous frame yielding the min. err. 690 (this is the optimum path pointer!) 691 its cumulative cost: (local cost + connectivity cost + 692 cumulative cost of its best-previous-frame-match). */ 693 694/* Dynamic programming is then used to pick the best F0 trajectory and voicing 695 state given the local and transition costs for the entire utterance. */ 696 697/* To avoid the necessity of computing the full crosscorrelation at 698 the input sample rate, the signal is downsampled; a full ccf is 699 computed at the lower frequency; interpolation is used to estimate the 700 location of the peaks at the higher sample rate; and the fine-grained 701 ccf is computed only in the vicinity of these estimated peak 702 locations. */ 703 704 705/* 706 * READ_SIZE: length of input data frame in sec to read 707 * DP_CIRCULAR: determines the initial size of DP circular buffer in sec 708 * DP_HIST: stored frame history in second before checking for common path 709 * DP_CIRCULAR > READ_SIZE, DP_CIRCULAR at least 2 times of DP_HIST 710 * DP_LIMIT: in case no convergence is found, DP frames of DP_LIMIT secs 711 * are kept before output is forced by simply picking the lowest cost 712 * path 713 */ 714 715#define READ_SIZE 0.2 716#define DP_CIRCULAR 1.5 717#define DP_HIST 0.5 718#define DP_LIMIT 1.0 719 720/* 721 * stationarity parameters - 722 * STAT_WSIZE: window size in sec used in measuring frame energy/stationarity 723 * STAT_AINT: analysis interval in sec in measuring frame energy/stationarity 724 */ 725#define STAT_WSIZE 0.030 726#define STAT_AINT 0.020 727 728/* 729 * headF points to current frame in the circular buffer, 730 * tailF points to the frame where tracks start 731 * cmpthF points to starting frame of converged path to backtrack 732 */ 733 734static Frame *headF = NULL, *tailF = NULL, *cmpthF = NULL; 735 736static int *pcands = NULL; /* array for backtracking in convergence check */ 737static int cir_buff_growth_count = 0; 738 739static int size_cir_buffer, /* # of frames in circular DP buffer */ 740 size_frame_hist, /* # of frames required before convergence test */ 741 size_frame_out, /* # of frames before forcing output */ 742 num_active_frames, /* # of frames from tailF to headF */ 743 output_buf_size; /* # of frames allocated to output buffers */ 744 745/* 746 * DP parameters 747 */ 748static float tcost, tfact_a, tfact_s, frame_int, vbias, fdouble, wdur, ln2, 749 freqwt, lagwt; 750static int step, size, nlags, start, stop, ncomp, *locs = NULL; 751static short maxpeaks; 752 753static int wReuse = 0; /* number of windows seen before resued */ 754static Windstat *windstat = NULL; 755 756static float *f0p = NULL, *vuvp = NULL, *rms_speech = NULL, 757 *acpkp = NULL, *peaks = NULL; 758static int first_time = 1, pad; 759 760 761/*--------------------------------------------------------------------*/ 762int 763get_Nframes(buffsize, pad, step) 764 long buffsize; 765 int pad, step; 766{ 767 if (buffsize < pad) 768 return (0); 769 else 770 return ((buffsize - pad)/step); 771} 772 773 774/*--------------------------------------------------------------------*/ 775int 776init_dp_f0(freq, par, buffsize, sdstep) 777 double freq; 778 F0_params *par; 779 long *buffsize, *sdstep; 780{ 781 int nframes; 782 int i; 783 int stat_wsize, agap, ind, downpatch; 784 785/* 786 * reassigning some constants 787 */ 788 789 tcost = par->trans_cost; 790 tfact_a = par->trans_amp; 791 tfact_s = par->trans_spec; 792 vbias = par->voice_bias; 793 fdouble = par->double_cost; 794 frame_int = par->frame_step; 795 796 step = eround(frame_int * freq); 797 size = eround(par->wind_dur * freq); 798 frame_int = (float)(((float)step)/freq); 799 wdur = (float)(((float)size)/freq); 800 start = eround(freq / par->max_f0); 801 stop = eround(freq / par->min_f0); 802 nlags = stop - start + 1; 803 ncomp = size + stop + 1; /* # of samples required by xcorr 804 comp. per fr. */ 805 maxpeaks = 2 + (nlags/2); /* maximum number of "peaks" findable in ccf */ 806 ln2 = (float)log(2.0); 807 size_frame_hist = (int) (DP_HIST / frame_int); 808 size_frame_out = (int) (DP_LIMIT / frame_int); 809 810/* 811 * SET UP THE D.P. WEIGHTING FACTORS: 812 * The intent is to make the effectiveness of the various fudge factors 813 * independent of frame rate or sampling frequency. 814 */ 815 816 /* Lag-dependent weighting factor to emphasize early peaks (higher freqs)*/ 817 lagwt = par->lag_weight/stop; 818 819 /* Penalty for a frequency skip in F0 per frame */ 820 freqwt = par->freq_weight/frame_int; 821 822 i = (int) (READ_SIZE *freq); 823 if(ncomp >= step) nframes = ((i-ncomp)/step ) + 1; 824 else nframes = i / step; 825 826 /* *buffsize is the number of samples needed to make F0 computation 827 of nframes DP frames possible. The last DP frame is patched with 828 enough points so that F0 computation on it can be carried. F0 829 computaion on each frame needs enough points to do 830 831 1) xcross or cross correlation measure: 832 enough points to do xcross - ncomp 833 834 2) stationarity measure: 835 enough to make 30 msec windowing possible - ind 836 837 3) downsampling: 838 enough to make filtering possible -- downpatch 839 840 So there are nframes whole DP frames, padded with pad points 841 to make the last frame F0 computation ok. 842 843 */ 844 845 /* last point in data frame needs points of 1/2 downsampler filter length 846 long, 0.005 is the filter length used in downsampler */ 847 downpatch = (((int) (freq * 0.005))+1) / 2; 848 849 stat_wsize = (int) (STAT_WSIZE * freq); 850 agap = (int) (STAT_AINT * freq); 851 ind = ( agap - stat_wsize ) / 2; 852 i = stat_wsize + ind; 853 pad = downpatch + ((i>ncomp) ? i:ncomp); 854 *buffsize = nframes * step + pad; 855 *sdstep = nframes * step; 856 857 /* Allocate space for the DP storage circularly linked data structure */ 858 859 size_cir_buffer = (int) (DP_CIRCULAR / frame_int); 860 861 /* creating circularly linked data structures */ 862 tailF = alloc_frame(nlags, par->n_cands); 863 headF = tailF; 864 865 /* link them up */ 866 for(i=1; i<size_cir_buffer; i++){ 867 headF->next = alloc_frame(nlags, par->n_cands); 868 headF->next->prev = headF; 869 headF = headF->next; 870 } 871 headF->next = tailF; 872 tailF->prev = headF; 873 874 headF = tailF; 875 876 /* Allocate sscratch array to use during backtrack convergence test. */ 877 if( ! pcands ) { 878 pcands = (int *) ckalloc( par->n_cands * sizeof(int)); 879 /* spsassert(pcands,"can't allocate pathcands");*/ 880 } 881 882 /* Allocate arrays to return F0 and related signals. */ 883 884 /* Note: remember to compare *vecsize with size_frame_out, because 885 size_cir_buffer is not constant */ 886 output_buf_size = size_cir_buffer; 887 rms_speech = (float*)ckalloc(sizeof(float) * output_buf_size); 888 /* spsassert(rms_speech,"rms_speech ckalloc failed");*/ 889 f0p = (float*)ckalloc(sizeof(float) * output_buf_size); 890 /* spsassert(f0p,"f0p ckalloc failed");*/ 891 vuvp = (float*)ckalloc(sizeof(float)* output_buf_size); 892 /* spsassert(vuvp,"vuvp ckalloc failed");*/ 893 acpkp = (float*)ckalloc(sizeof(float) * output_buf_size); 894 /* spsassert(acpkp,"acpkp ckalloc failed");*/ 895 896 /* Allocate space for peak location and amplitude scratch arrays. */ 897 peaks = (float*)ckalloc(sizeof(float) * maxpeaks); 898 /* spsassert(peaks,"peaks ckalloc failed");*/ 899 locs = (int*)ckalloc(sizeof(int) * maxpeaks); 900 /* spsassert(locs, "locs ckalloc failed");*/ 901 902 /* Initialise the retrieval/saving scheme of window statistic measures */ 903 wReuse = agap / step; 904 if (wReuse){ 905 windstat = (Windstat *) ckalloc( wReuse * sizeof(Windstat)); 906 /* spsassert(windstat, "windstat ckalloc failed");*/ 907 for(i=0; i<wReuse; i++){ 908 windstat[i].err = 0; 909 windstat[i].rms = 0; 910 } 911 } 912 913 if(debug_level){ 914 Fprintf(stderr, "done with initialization:\n"); 915 Fprintf(stderr, 916 " size_cir_buffer:%d xcorr frame size:%d start lag:%d nlags:%d\n", 917 size_cir_buffer, size, start, nlags); 918 } 919 920 num_active_frames = 0; 921 first_time = 1; 922 923 return(0); 924} 925 926static Stat *get_stationarity(); 927 928/*--------------------------------------------------------------------*/ 929int 930dp_f0(fdata, buff_size, sdstep, freq, 931 par, f0p_pt, vuvp_pt, rms_speech_pt, acpkp_pt, vecsize, last_time) 932 float *fdata; 933 int buff_size, sdstep; 934 double freq; 935 F0_params *par; /* analysis control parameters */ 936 float **f0p_pt, **vuvp_pt, **rms_speech_pt, **acpkp_pt; 937 int *vecsize, last_time; 938{ 939 float maxval, engref, *sta, *rms_ratio, *dsdata, *downsample(); 940 register float ttemp, ftemp, ft1, ferr, err, errmin; 941 register int i, j, k, loc1, loc2; 942 int nframes, maxloc, ncand, ncandp, minloc, 943 decimate, samsds; 944 945 Stat *stat = NULL; 946 947 nframes = get_Nframes((long) buff_size, pad, step); /* # of whole frames */ 948 949 if(debug_level) 950 Fprintf(stderr, 951 "******* Computing %d dp frames ******** from %d points\n", nframes, buff_size); 952 953 /* Now downsample the signal for coarse peak estimates. */ 954 955 decimate = (int)(freq/2000.0); /* downsample to about 2kHz */ 956 if (decimate <= 1) 957 dsdata = fdata; 958 else { 959 samsds = ((nframes-1) * step + ncomp) / decimate; 960 dsdata = downsample(fdata, buff_size, sdstep, freq, &samsds, decimate, 961 first_time, last_time); 962 if (!dsdata) { 963 Fprintf(stderr, "can't get downsampled data.\n"); 964 return 1; 965 } 966 } 967 968 /* Get a function of the "stationarity" of the speech signal. */ 969 970 stat = get_stationarity(fdata, freq, buff_size, nframes, step, first_time); 971 if (!stat) { 972 Fprintf(stderr, "can't get stationarity\n"); 973 return(1); 974 } 975 sta = stat->stat; 976 rms_ratio = stat->rms_ratio; 977 978 /***********************************************************************/ 979 /* MAIN FUNDAMENTAL FREQUENCY ESTIMATION LOOP */ 980 /***********************************************************************/ 981 if(!first_time && nframes > 0) headF = headF->next; 982 983 for(i = 0; i < nframes; i++) { 984 985 /* NOTE: This buffer growth provision is probably not necessary. 986 It was put in (with errors) by Derek Lin and apparently never 987 tested. My tests and analysis suggest it is completely 988 superfluous. DT 9/5/96 */ 989 /* Dynamically allocating more space for the circular buffer */ 990 if(headF == tailF->prev){ 991 Frame *frm; 992 993 if(cir_buff_growth_count > 5){ 994 Fprintf(stderr, 995 "too many requests (%d) for dynamically allocating space.\n There may be a problem in finding converged path.\n",cir_buff_growth_count); 996 return(1); 997 } 998 if(debug_level) 999 Fprintf(stderr, "allocating %d more frames for DP circ. buffer.\n", size_cir_buffer); 1000 frm = alloc_frame(nlags, par->n_cands); 1001 headF->next = frm; 1002 frm->prev = headF; 1003 for(k=1; k<size_cir_buffer; k++){ 1004 frm->next = alloc_frame(nlags, par->n_cands); 1005 frm->next->prev = frm; 1006 frm = frm->next; 1007 } 1008 frm->next = tailF; 1009 tailF->prev = frm; 1010 cir_buff_growth_count++; 1011 } 1012 1013 headF->rms = stat->rms[i]; 1014 get_fast_cands(fdata, dsdata, i, step, size, decimate, start, 1015 nlags, &engref, &maxloc, 1016 &maxval, headF->cp, peaks, locs, &ncand, par); 1017 1018 /* Move the peak value and location arrays into the dp structure */ 1019 { 1020 register float *ftp1, *ftp2; 1021 register short *sp1; 1022 register int *sp2; 1023 1024 for(ftp1 = headF->dp->pvals, ftp2 = peaks, 1025 sp1 = headF->dp->locs, sp2 = locs, j=ncand; j--; ) { 1026 *ftp1++ = *ftp2++; 1027 *sp1++ = *sp2++; 1028 } 1029 *sp1 = -1; /* distinguish the UNVOICED candidate */ 1030 *ftp1 = maxval; 1031 headF->dp->mpvals[ncand] = vbias+maxval; /* (high cost if cor. is high)*/ 1032 } 1033 1034 /* Apply a lag-dependent weight to the peaks to encourage the selection 1035 of the first major peak. Translate the modified peak values into 1036 costs (high peak ==> low cost). */ 1037 for(j=0; j < ncand; j++){ 1038 ftemp = 1.0f - ((float)locs[j] * lagwt); 1039 headF->dp->mpvals[j] = 1.0f - (peaks[j] * ftemp); 1040 } 1041 ncand++; /* include the unvoiced candidate */ 1042 headF->dp->ncands = ncand; 1043 1044 /*********************************************************************/ 1045 /* COMPUTE THE DISTANCE MEASURES AND ACCUMULATE THE COSTS. */ 1046 /*********************************************************************/ 1047 1048 ncandp = headF->prev->dp->ncands; 1049 for(k=0; k<ncand; k++){ /* for each of the current candidates... */ 1050 minloc = 0; 1051 errmin = FLT_MAX; 1052 if((loc2 = headF->dp->locs[k]) > 0) { /* current cand. is voiced */ 1053 for(j=0; j<ncandp; j++){ /* for each PREVIOUS candidate... */ 1054 /* Get cost due to inter-frame period change. */ 1055 loc1 = headF->prev->dp->locs[j]; 1056 if (loc1 > 0) { /* prev. was voiced */ 1057 ftemp = (float) log(((double) loc2) / loc1); 1058 ttemp = (float) fabs(ftemp); 1059 ft1 = (float) (fdouble + fabs(ftemp + ln2)); 1060 if (ttemp > ft1) 1061 ttemp = ft1; 1062 ft1 = (float) (fdouble + fabs(ftemp - ln2)); 1063 if (ttemp > ft1) 1064 ttemp = ft1; 1065 ferr = ttemp * freqwt; 1066 } else { /* prev. was unvoiced */ 1067 ferr = tcost + (tfact_s * sta[i]) + (tfact_a / rms_ratio[i]); 1068 } 1069 /* Add in cumulative cost associated with previous peak. */ 1070 err = ferr + headF->prev->dp->dpvals[j]; 1071 if(err < errmin){ /* find min. cost */ 1072 errmin = err; 1073 minloc = j; 1074 } 1075 } 1076 } else { /* this is the unvoiced candidate */ 1077 for(j=0; j<ncandp; j++){ /* for each PREVIOUS candidate... */ 1078 1079 /* Get voicing transition cost. */ 1080 if (headF->prev->dp->locs[j] > 0) { /* previous was voiced */ 1081 ferr = tcost + (tfact_s * sta[i]) + (tfact_a * rms_ratio[i]); 1082 } 1083 else 1084 ferr = 0.0; 1085 /* Add in cumulative cost associated with previous peak. */ 1086 err = ferr + headF->prev->dp->dpvals[j]; 1087 if(err < errmin){ /* find min. cost */ 1088 errmin = err; 1089 minloc = j; 1090 } 1091 } 1092 } 1093 /* Now have found the best path from this cand. to prev. frame */ 1094 if (first_time && i==0) { /* this is the first frame */ 1095 headF->dp->dpvals[k] = headF->dp->mpvals[k]; 1096 headF->dp->prept[k] = 0; 1097 } else { 1098 headF->dp->dpvals[k] = errmin + headF->dp->mpvals[k]; 1099 headF->dp->prept[k] = minloc; 1100 } 1101 } /* END OF THIS DP FRAME */ 1102 1103 if (i < nframes - 1) 1104 headF = headF->next; 1105 1106 if (debug_level >= 2) { 1107 Fprintf(stderr,"%d engref:%10.0f max:%7.5f loc:%4d\n", 1108 i,engref,maxval,maxloc); 1109 } 1110 1111 } /* end for (i ...) */ 1112 1113 /***************************************************************/ 1114 /* DONE WITH FILLING DP STRUCTURES FOR THE SET OF SAMPLED DATA */ 1115 /* NOW FIND A CONVERGED DP PATH */ 1116 /***************************************************************/ 1117 1118 *vecsize = 0; /* # of output frames returned */ 1119 1120 num_active_frames += nframes; 1121 1122 if( num_active_frames >= size_frame_hist || last_time ){ 1123 Frame *frm; 1124 int num_paths, best_cand, frmcnt, checkpath_done = 1; 1125 float patherrmin; 1126 1127 if(debug_level) 1128 Fprintf(stderr, "available frames for backtracking: %d\n", 1129num_active_frames); 1130 1131 patherrmin = FLT_MAX; 1132 best_cand = 0; 1133 num_paths = headF->dp->ncands; 1134 1135 /* Get the best candidate for the final frame and initialize the 1136 paths' backpointers. */ 1137 frm = headF; 1138 for(k=0; k < num_paths; k++) { 1139 if (patherrmin > headF->dp->dpvals[k]){ 1140 patherrmin = headF->dp->dpvals[k]; 1141 best_cand = k; /* index indicating the best candidate at a path */ 1142 } 1143 pcands[k] = frm->dp->prept[k]; 1144 } 1145 1146 if(last_time){ /* Input data was exhausted. force final outputs. */ 1147 cmpthF = headF; /* Use the current frame as starting point. */ 1148 } else { 1149 /* Starting from the most recent frame, trace back each candidate's 1150 best path until reaching a common candidate at some past frame. */ 1151 frmcnt = 0; 1152 while (1) { 1153 frm = frm->prev; 1154 frmcnt++; 1155 checkpath_done = 1; 1156 for(k=1; k < num_paths; k++){ /* Check for convergence. */ 1157 if(pcands[0] != pcands[k]) 1158 checkpath_done = 0; 1159 } 1160 if( ! checkpath_done) { /* Prepare for checking at prev. frame. */ 1161 for(k=0; k < num_paths; k++){ 1162 pcands[k] = frm->dp->prept[pcands[k]]; 1163 } 1164 } else { /* All paths have converged. */ 1165 cmpthF = frm; 1166 best_cand = pcands[0]; 1167 if(debug_level) 1168 Fprintf(stderr, 1169 "paths went back %d frames before converging\n",frmcnt); 1170 break; 1171 } 1172 if(frm == tailF){ /* Used all available data? */ 1173 if( num_active_frames < size_frame_out) { /* Delay some more? */ 1174 checkpath_done = 0; /* Yes, don't backtrack at this time. */ 1175 cmpthF = NULL; 1176 } else { /* No more delay! Force best-guess output. */ 1177 checkpath_done = 1; 1178 cmpthF = headF; 1179 /* Fprintf(stderr, 1180 "WARNING: no converging path found after going back %d frames, will use the lowest cost path\n",num_active_frames);*/ 1181 } 1182 break; 1183 } /* end if (frm ...) */ 1184 } /* end while (1) */ 1185 } /* end if (last_time) ... else */ 1186 1187 /*************************************************************/ 1188 /* BACKTRACKING FROM cmpthF (best_cand) ALL THE WAY TO tailF */ 1189 /*************************************************************/ 1190 i = 0; 1191 frm = cmpthF; /* Start where convergence was found (or faked). */ 1192 while( frm != tailF->prev && checkpath_done){ 1193 if( i == output_buf_size ){ /* Need more room for outputs? */ 1194 output_buf_size *= 2; 1195 if(debug_level) 1196 Fprintf(stderr, 1197 "reallocating space for output frames: %d\n", 1198 output_buf_size); 1199 rms_speech = (float *) ckrealloc((void *) rms_speech, 1200 sizeof(float) * output_buf_size); 1201 /* spsassert(rms_speech, "rms_speech realloc failed in dp_f0()");*/ 1202 f0p = (float *) ckrealloc((void *) f0p, 1203 sizeof(float) * output_buf_size); 1204 /* spsassert(f0p, "f0p realloc failed in dp_f0()");*/ 1205 vuvp = (float *) ckrealloc((void *) vuvp, sizeof(float) * output_buf_size); 1206 /* spsassert(vuvp, "vuvp realloc failed in dp_f0()");*/ 1207 acpkp = (float *) ckrealloc((void *) acpkp, sizeof(float) * output_buf_size); 1208 /* spsassert(acpkp, "acpkp realloc failed in dp_f0()");*/ 1209 } 1210 rms_speech[i] = frm->rms; 1211 acpkp[i] = frm->dp->pvals[best_cand]; 1212 loc1 = frm->dp->locs[best_cand]; 1213 vuvp[i] = 1.0; 1214 best_cand = frm->dp->prept[best_cand]; 1215 ftemp = (float) loc1; 1216 if(loc1 > 0) { /* Was f0 actually estimated for this frame? */ 1217 if (loc1 > start && loc1 < stop) { /* loc1 must be a local maximum. */ 1218 float cormax, cprev, cnext, den; 1219 1220 j = loc1 - start; 1221 cormax = frm->cp->correl[j]; 1222 cprev = frm->cp->correl[j+1]; 1223 cnext = frm->cp->correl[j-1]; 1224 den = (float) (2.0 * ( cprev + cnext - (2.0 * cormax) )); 1225 /* 1226 * Only parabolic interpolate if cormax is indeed a local 1227 * turning point. Find peak of curve that goes though the 3 points 1228 */ 1229 1230 if (fabs(den) > 0.000001) 1231 ftemp += 2.0f - ((((5.0f*cprev)+(3.0f*cnext)-(8.0f*cormax))/den)); 1232 } 1233 f0p[i] = (float) (freq/ftemp); 1234 } else { /* No valid estimate; just fake some arbitrary F0. */ 1235 f0p[i] = 0; 1236 vuvp[i] = 0.0; 1237 } 1238 frm = frm->prev; 1239 1240 if (debug_level >= 2) 1241 Fprintf(stderr," i:%4d%8.1f%8.1f\n",i,f0p[i],vuvp[i]); 1242 /* f0p[i] starts from the most recent one */ 1243 /* Need to reverse the order in the calling function */ 1244 i++; 1245 } /* end while() */ 1246 if (checkpath_done){ 1247 *vecsize = i; 1248 tailF = cmpthF->next; 1249 num_active_frames -= *vecsize; 1250 } 1251 } /* end if() */ 1252 1253 if (debug_level) 1254 Fprintf(stderr, "writing out %d frames.\n", *vecsize); 1255 1256 *f0p_pt = f0p; 1257 *vuvp_pt = vuvp; 1258 *acpkp_pt = acpkp; 1259 *rms_speech_pt = rms_speech; 1260 /* *acpkp_pt = acpkp;*/ 1261 1262 if(first_time) first_time = 0; 1263 return(0); 1264} 1265 1266 1267/*--------------------------------------------------------------------*/ 1268Frame * 1269alloc_frame(nlags, ncands) 1270 int nlags, ncands; 1271{ 1272 Frame *frm; 1273 int j; 1274 1275 frm = (Frame*)ckalloc(sizeof(Frame)); 1276 frm->dp = (Dprec *) ckalloc(sizeof(Dprec)); 1277 /* spsassert(frm->dp,"frm->dp ckalloc failed in alloc_frame");*/ 1278 frm->dp->ncands = 0; 1279 frm->cp = (Cross *) ckalloc(sizeof(Cross)); 1280 /* spsassert(frm->cp,"frm->cp ckalloc failed in alloc_frame");*/ 1281 frm->cp->correl = (float *) ckalloc(sizeof(float) * nlags); 1282 /* spsassert(frm->cp->correl, "frm->cp->correl ckalloc failed");*/ 1283 /* Allocate space for candidates and working arrays. */ 1284 frm->dp->locs = (short*)ckalloc(sizeof(short) * ncands); 1285 /* spsassert(frm->dp->locs,"frm->dp->locs ckalloc failed in alloc_frame()");*/ 1286 frm->dp->pvals = (float*)ckalloc(sizeof(float) * ncands); 1287/* spsassert(frm->dp->pvals,"frm->dp->pvals ckalloc failed in alloc_frame()");*/ 1288 frm->dp->mpvals = (float*)ckalloc(sizeof(float) * ncands); 1289 /* spsassert(frm->dp->mpvals,"frm->dp->mpvals ckalloc failed in alloc_frame()");*/ 1290 frm->dp->prept = (short*)ckalloc(sizeof(short) * ncands); 1291 /* spsassert(frm->dp->prept,"frm->dp->prept ckalloc failed in alloc_frame()");*/ 1292 frm->dp->dpvals = (float*)ckalloc(sizeof(float) * ncands); 1293 /* spsassert(frm->dp->dpvals,"frm->dp->dpvals ckalloc failed in alloc_frame()");*/ 1294 1295 /* Initialize the cumulative DP costs to zero */ 1296 for(j = ncands-1; j >= 0; j--) 1297 frm->dp->dpvals[j] = 0.0; 1298 1299 return(frm); 1300} 1301 1302 1303/*--------------------------------------------------------------------*/ 1304/* push window stat to stack, and pop the oldest one */ 1305 1306static int 1307save_windstat(rho, order, err, rms) 1308 float *rho; 1309 int order; 1310 float err; 1311 float rms; 1312{ 1313 int i,j; 1314 1315 if(wReuse > 1){ /* push down the stack */ 1316 for(j=1; j<wReuse; j++){ 1317 for(i=0;i<=order; i++) windstat[j-1].rho[i] = windstat[j].rho[i]; 1318 windstat[j-1].err = windstat[j].err; 1319 windstat[j-1].rms = windstat[j].rms; 1320 } 1321 for(i=0;i<=order; i++) windstat[wReuse-1].rho[i] = rho[i]; /*save*/ 1322 windstat[wReuse-1].err = (float) err; 1323 windstat[wReuse-1].rms = (float) rms; 1324 return 1; 1325 } else if (wReuse == 1) { 1326 for(i=0;i<=order; i++) windstat[0].rho[i] = rho[i]; /* save */ 1327 windstat[0].err = (float) err; 1328 windstat[0].rms = (float) rms; 1329 return 1; 1330 } else 1331 return 0; 1332} 1333 1334 1335/*--------------------------------------------------------------------*/ 1336static int 1337retrieve_windstat(rho, order, err, rms) 1338 float *rho; 1339 int order; 1340 float *err; 1341 float *rms; 1342{ 1343 Windstat wstat; 1344 int i; 1345 1346 if(wReuse){ 1347 wstat = windstat[0]; 1348 for(i=0; i<=order; i++) rho[i] = wstat.rho[i]; 1349 *err = wstat.err; 1350 *rms = wstat.rms; 1351 return 1; 1352 } 1353 else return 0; 1354} 1355 1356 1357/*--------------------------------------------------------------------*/ 1358static float 1359get_similarity(order, size, pdata, cdata, 1360 rmsa, rms_ratio, pre, stab, w_type, init) 1361 int order, size; 1362 float *pdata, *cdata; 1363 float *rmsa, *rms_ratio, pre, stab; 1364 int w_type, init; 1365{ 1366 float rho3[BIGSORD+1], err3, rms3, rmsd3, b0, t, a2[BIGSORD+1], 1367 rho1[BIGSORD+1], a1[BIGSORD+1], b[BIGSORD+1], err1, rms1, rmsd1; 1368 float xitakura(), wind_energy(); 1369 void xa_to_aca (); 1370 int xlpc(); 1371 1372/* (In the lpc() calls below, size-1 is used, since the windowing and 1373 preemphasis function assumes an extra point is available in the 1374 input data array. This condition is apparently no longer met after 1375 Derek's modifications.) */ 1376 1377 /* get current window stat */ 1378 xlpc(order, stab, size-1, cdata, 1379 a2, rho3, (float *) NULL, &err3, &rmsd3, pre, w_type); 1380 rms3 = wind_energy(cdata, size, w_type); 1381 1382 if(!init) { 1383 /* get previous window stat */ 1384 if( !retrieve_windstat(rho1, order, &err1, &rms1)){ 1385 xlpc(order, stab, size-1, pdata, 1386 a1, rho1, (float *) NULL, &err1, &rmsd1, pre, w_type); 1387 rms1 = wind_energy(pdata, size, w_type); 1388 } 1389 xa_to_aca(a2+1,b,&b0,order); 1390 t = xitakura(order,b,&b0,rho1+1,&err1) - .8f; 1391 if(rms1 > 0.0) 1392 *rms_ratio = (0.001f + rms3)/rms1; 1393 else 1394 if(rms3 > 0.0) 1395 *rms_ratio = 2.0; /* indicate some energy increase */ 1396 else 1397 *rms_ratio = 1.0; /* no change */ 1398 } else { 1399 *rms_ratio = 1.0; 1400 t = 10.0; 1401 } 1402 *rmsa = rms3; 1403 save_windstat( rho3, order, err3, rms3); 1404 return((float)(0.2/t)); 1405} 1406 1407 1408/* -------------------------------------------------------------------- */ 1409/* This is an ad hoc signal stationarity function based on Itakura 1410 * distance and relative amplitudes. 1411 */ 1412/* 1413 This illustrates the window locations when the very first frame is read. 1414 It shows an example where each frame step | . | is 10 msec. The 1415 frame step size is variable. The window size is always 30 msec. 1416 The window centers '*' is always 20 msec apart. 1417 The windows cross each other right at the center of the DP frame, or 1418 where the '.' is. 1419 1420 ---------*--------- current window 1421 1422 ---------*--------- previous window 1423 1424 | . | . | . | . | . | . | . | . | . | 1425 ^ ^ ^ 1426 ^ ^ ^ 1427 ^ ^ fdata 1428 ^ ^ 1429 ^ q 1430 p 1431 1432 --- 1433 ind 1434 1435 fdata, q, p, ind, are variables used below. 1436 1437*/ 1438 1439static Stat *stat = NULL; 1440static float *mem = NULL; 1441 1442static Stat* 1443get_stationarity(fdata, freq, buff_size, nframes, frame_step, first_time) 1444 float *fdata; 1445 double freq; 1446 int buff_size, nframes, frame_step, first_time; 1447{ 1448 static int nframes_old = 0, memsize; 1449 float preemp = 0.4f, stab = 30.0f; 1450 float *p, *q, *r, *datend; 1451 int ind, i, j, m, size, order, agap, w_type = 3; 1452 1453 agap = (int) (STAT_AINT *freq); 1454 size = (int) (STAT_WSIZE * freq); 1455 ind = (agap - size) / 2; 1456 1457 if( nframes_old < nframes || !stat || first_time){ 1458 /* move this to init_dp_f0() later */ 1459 nframes_old = nframes; 1460 if(stat){ 1461 ckfree((char *) stat->stat); 1462 ckfree((char *) stat->rms); 1463 ckfree((char *) stat->rms_ratio); 1464 ckfree((char *) stat); 1465 } 1466 if (mem) ckfree((void *)mem); 1467 stat = (Stat *) ckalloc(sizeof(Stat)); 1468 /* spsassert(stat,"stat ckalloc failed in get_stationarity");*/ 1469 stat->stat = (float*)ckalloc(sizeof(float)*nframes); 1470 /* spsassert(stat->stat,"stat->stat ckalloc failed in get_stationarity");*/ 1471 stat->rms = (float*)ckalloc(sizeof(float)*nframes); 1472 /* spsassert(stat->rms,"stat->rms ckalloc failed in get_stationarity");*/ 1473 stat->rms_ratio = (float*)ckalloc(sizeof(float)*nframes); 1474 /* spsassert(stat->rms_ratio,"stat->ratio ckalloc failed in get_stationarity");*/ 1475 memsize = (int) (STAT_WSIZE * freq) + (int) (STAT_AINT * freq); 1476 mem = (float *) ckalloc( sizeof(float) * memsize); 1477 /* spsassert(mem, "mem ckalloc failed in get_stationarity()");*/ 1478 for(j=0; j<memsize; j++) mem[j] = 0; 1479 } 1480 1481 if(nframes == 0) return(stat); 1482 1483 q = fdata + ind; 1484 datend = fdata + buff_size; 1485 1486 if((order = (int) (2.0 + (freq/1000.0))) > BIGSORD) { 1487 Fprintf(stderr, 1488 "Optimim order (%d) exceeds that allowable (%d); reduce Fs\n",order, BIGSORD); 1489 order = BIGSORD; 1490 } 1491 1492 /* prepare for the first frame */ 1493 for(j=memsize/2, i=0; j<memsize; j++, i++) mem[j] = fdata[i]; 1494 1495 /* never run over end of frame, should already taken care of when read */ 1496 1497 for(j=0, p = q - agap; j < nframes; j++, p += frame_step, q += frame_step){ 1498 if( (p >= fdata) && (q >= fdata) && ( q + size <= datend) ) 1499 stat->stat[j] = get_similarity(order,size, p, q, 1500 &(stat->rms[j]), 1501 &(stat->rms_ratio[j]),preemp, 1502 stab,w_type, 0); 1503 else { 1504 if(first_time) { 1505 if( (p < fdata) && (q >= fdata) && (q+size <=datend) ) 1506 stat->stat[j] = get_similarity(order,size, NULL, q, 1507 &(stat->rms[j]), 1508 &(stat->rms_ratio[j]), 1509 preemp,stab,w_type, 1); 1510 else{ 1511 stat->rms[j] = 0.0; 1512 stat->stat[j] = 0.01f * 0.2f; /* a big transition */ 1513 stat->rms_ratio[j] = 1.0; /* no amplitude change */ 1514 } 1515 } else { 1516 if( (p<fdata) && (q+size <=datend) ){ 1517 stat->stat[j] = get_similarity(order,size, mem, 1518 mem + (memsize/2) + ind, 1519 &(stat->rms[j]), 1520 &(stat->rms_ratio[j]), 1521 preemp, stab,w_type, 0); 1522 /* prepare for the next frame_step if needed */ 1523 if(p + frame_step < fdata ){ 1524 for( m=0; m<(memsize-frame_step); m++) 1525 mem[m] = mem[m+frame_step]; 1526 r = q + size; 1527 for( m=0; m<frame_step; m++) 1528 mem[memsize-frame_step+m] = *r++; 1529 } 1530 } 1531 } 1532 } 1533 } 1534 1535 /* last frame, prepare for next call */ 1536 for(j=(memsize/2)-1, p=fdata + (nframes * frame_step)-1; j>=0 && p>=fdata; j--) 1537 mem[j] = *p--; 1538 return(stat); 1539} 1540 1541 1542/* -------------------------------------------------------------------- */ 1543/* Round the argument to the nearest integer. */ 1544/* 1545int 1546eround(flnum) 1547 double flnum; 1548{ 1549 return((flnum >= 0.0) ? (int)(flnum + 0.5) : (int)(flnum - 0.5)); 1550} 1551 1552*/ 1553void free_dp_f0() 1554{ 1555 int i; 1556 Frame *frm, *next; 1557 1558 ckfree((void *)pcands); 1559 pcands = NULL; 1560 1561 ckfree((void *)rms_speech); 1562 rms_speech = NULL; 1563 1564 ckfree((void *)f0p); 1565 f0p = NULL; 1566 1567 ckfree((void *)vuvp); 1568 vuvp = NULL; 1569 1570 ckfree((void *)acpkp); 1571 acpkp = NULL; 1572 1573 ckfree((void *)peaks); 1574 peaks = NULL; 1575 1576 ckfree((void *)locs); 1577 locs = NULL; 1578 1579 if (wReuse) { 1580 ckfree((void *)windstat); 1581 windstat = NULL; 1582 } 1583 1584 frm = headF; 1585 1586 for(i = 0; i < size_cir_buffer; i++) { 1587 next = frm->next; 1588 ckfree((void *)frm->cp->correl); 1589 ckfree((void *)frm->dp->locs); 1590 ckfree((void *)frm->dp->pvals); 1591 ckfree((void *)frm->dp->mpvals); 1592 ckfree((void *)frm->dp->prept); 1593 ckfree((void *)frm->dp->dpvals); 1594 ckfree((void *)frm->cp); 1595 ckfree((void *)frm->dp); 1596 ckfree((void *)frm); 1597 frm = next; 1598 } 1599 headF = NULL; 1600 tailF = NULL; 1601 1602 ckfree((void *)stat->stat); 1603 ckfree((void *)stat->rms); 1604 ckfree((void *)stat->rms_ratio); 1605 1606 ckfree((void *)stat); 1607 stat = NULL; 1608 1609 ckfree((void *)mem); 1610 mem = NULL; 1611} 1612 1613int 1614cGet_f0(Sound *sound, Tcl_Interp *interp, float **outlist, int *length) 1615{ 1616 float *fdata; 1617 int done; 1618 long buff_size, actsize; 1619 double sf, start_time; 1620 F0_params *par, *read_f0_params(); 1621 float *f0p, *vuvp, *rms_speech, *acpkp; 1622 int i, vecsize; 1623 int init_dp_f0(), dp_f0(); 1624 static int framestep = -1; 1625 long sdstep = 0, total_samps; 1626 int ndone = 0; 1627 Tcl_Obj *list; 1628 float *tmp = (float *)ckalloc(sizeof(float) * (5 + sound->length / 80)); 1629 int count = 0; 1630 int startpos = 0, endpos = -1; 1631 1632 if (sound->cmdPtr != NULL) { 1633 Tcl_DecrRefCount(sound->cmdPtr); 1634 sound->cmdPtr = NULL; 1635 } 1636 1637 par = (F0_params *) ckalloc(sizeof(F0_params)); 1638 par->cand_thresh = 0.3f; 1639 par->lag_weight = 0.3f; 1640 par->freq_weight = 0.02f; 1641 par->trans_cost = 0.005f; 1642 par->trans_amp = 0.5f; 1643 par->trans_spec = 0.5f; 1644 par->voice_bias = 0.0f; 1645 par->double_cost = 0.35f; 1646 par->min_f0 = 50; 1647 par->max_f0 = 550; 1648 par->frame_step = 0.01f; 1649 par->wind_dur = 0.0075f; 1650 par->n_cands = 20; 1651 par->mean_f0 = 200; /* unused */ 1652 par->mean_f0_weight = 0.0f; /* unused */ 1653 par->conditioning = 0; /* unused */ 1654 1655 if (startpos < 0) startpos = 0; 1656 if (endpos >= (sound->length - 1) || endpos == -1) 1657 endpos = sound->length - 1; 1658 if (startpos > endpos) return TCL_OK; 1659 1660 sf = (double) sound->samprate; 1661 1662 if (framestep > 0) /* If a value was specified with -S, use it. */ 1663 par->frame_step = (float) (framestep / sf); 1664 start_time = 0.0f; 1665 if(check_f0_params(interp, par, sf)){ 1666 Tcl_AppendResult(interp, "invalid/inconsistent parameters -- exiting.", NULL); 1667 return TCL_ERROR; 1668 } 1669 1670 total_samps = endpos - startpos + 1; 1671 if(total_samps < ((par->frame_step * 2.0) + par->wind_dur) * sf) { 1672 Tcl_AppendResult(interp, "input range too small for analysis by get_f0.", NULL); 1673 return TCL_ERROR; 1674 } 1675 /* Initialize variables in get_f0.c; allocate data structures; 1676 * determine length and overlap of input frames to read. 1677 */ 1678 if (init_dp_f0(sf, par, &buff_size, &sdstep) 1679 || buff_size > INT_MAX || sdstep > INT_MAX) 1680 { 1681 Tcl_AppendResult(interp, "problem in init_dp_f0().", NULL); 1682 return TCL_ERROR; 1683 } 1684 1685 if (debug_level) 1686 Fprintf(stderr, "init_dp_f0 returned buff_size %ld, sdstep %ld.\n",buff_size, sdstep); 1687 1688 if (buff_size > total_samps) 1689 buff_size = total_samps; 1690 1691 actsize = min(buff_size, sound->length); 1692 fdata = (float *) ckalloc(sizeof(float) * max(buff_size, sdstep)); 1693 list = Tcl_NewListObj(0, NULL); 1694 /* Snack_ProgressCallback(sound->cmdPtr, interp, "Computing pitch", 0.0);*/ 1695 ndone = startpos; 1696 1697 while (TRUE) { 1698 done = (actsize < buff_size) || (total_samps == buff_size); 1699 Snack_GetSoundData(sound, ndone, fdata, actsize); 1700 /*if (sound->debug > 0) Snack_WriteLog("dp_f0...\n");*/ 1701 if (dp_f0(fdata, (int) actsize, (int) sdstep, sf, par, 1702 &f0p, &vuvp, &rms_speech, &acpkp, &vecsize, done)) { 1703 Tcl_AppendResult(interp, "problem in dp_f0().", NULL); 1704 return TCL_ERROR; 1705 } 1706 /*if (sound->debug > 0) Snack_WriteLogInt("done dp_f0",vecsize);*/ 1707 for (i = vecsize - 1; i >= 0; i--) { 1708 tmp[count] = f0p[i]; 1709 count++; 1710 } 1711 1712 if (done) break; 1713 1714 ndone += sdstep; 1715 actsize = min(buff_size, sound->length - ndone); 1716 total_samps -= sdstep; 1717 1718 if (actsize > total_samps) 1719 actsize = total_samps; 1720 1721 /* if (1) { 1722 int res = Snack_ProgressCallback(sound->cmdPtr, interp, "Computing pitch", (double) ndone / sound->length); 1723 if (res != TCL_OK) { 1724 return TCL_ERROR; 1725 } 1726 }*/ 1727 } 1728 1729 /*Snack_ProgressCallback(sound->cmdPtr, interp, "Computing pitch", 1.0);*/ 1730 1731 ckfree((void *)fdata); 1732 1733 ckfree((void *)par); 1734 1735 free_dp_f0(); 1736 1737 *outlist = tmp; 1738 *length = count; 1739 /*Tcl_SetObjResult(interp, list);*/ 1740 1741 return TCL_OK; 1742} 1743