SphinxBase
0.6
|
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 00002 /* ==================================================================== 00003 * Copyright (c) 1999-2001 Carnegie Mellon University. All rights 00004 * reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions 00008 * are met: 00009 * 00010 * 1. Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in 00015 * the documentation and/or other materials provided with the 00016 * distribution. 00017 * 00018 * This work was supported in part by funding from the Defense Advanced 00019 * Research Projects Agency and the National Science Foundation of the 00020 * United States of America, and the CMU Sphinx Speech Consortium. 00021 * 00022 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 00023 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 00024 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00025 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 00026 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00027 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00028 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00029 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00030 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00031 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00032 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00033 * 00034 * ==================================================================== 00035 * 00036 */ 00037 /* 00038 * cont_ad.c -- Continuous A/D listening and silence filtering module. 00039 * 00040 * HISTORY 00041 * 00042 * $Log: cont_ad_base.c,v $ 00043 * Revision 1.14 2005/07/02 03:51:32 rkm 00044 * Slowed down power histogram decay rate 00045 * 00046 * Revision 1.13 2005/06/30 00:27:17 rkm 00047 * Fixed silence handling in rawmode; added extra state variables 00048 * 00049 * 00050 * 28-Jun-2005 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University. 00051 * - Changed rawmode handling to simply copy data even for silence 00052 * segments. 00053 * - Moved definitions of CONT_AD_STATE_{SIL,SPEECH} from .c to .h. 00054 * 00055 * Revision 1.12 2005/06/29 23:48:04 egouvea 00056 * Revert changes: variables defined in cont_ad_base.c should not be accessible by the application 00057 * 00058 * Revision 1.10 2005/02/13 01:29:48 rkm 00059 * Fixed cont_ad_read to never cross sil/speech boundary, and rawmode 00060 * 00061 * Revision 1.9 2005/02/01 22:21:19 rkm 00062 * Added raw data logging, and raw data pass-through mode to cont_ad 00063 * 00064 * Revision 1.8 2004/07/23 23:36:34 egouvea 00065 * Ravi's merge, with the latest fixes in the FSG code, and making the log files generated by FSG, LM, and allphone have the same 'look and feel', with the backtrace information presented consistently 00066 * 00067 * 23-Jul-2004 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 00068 * Changed default adapt_rate from 0.5 to 0.2. 00069 * 00070 * Revision 1.7 2004/07/16 00:57:12 egouvea 00071 * Added Ravi's implementation of FSG support. 00072 * 00073 * Revision 1.2 2004/06/23 20:31:18 rkm 00074 * Added adapt_rate parameter; restructured frame processing to include threshold update 00075 * 00076 * 00077 * 23-Oct-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 00078 * Small change in the way the noiselevel is updated in find_thresh(). 00079 * 00080 * 26-Aug-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 00081 * Separated computation of "frame power" into a separate low-level 00082 * function. 00083 * 00084 * 13-Jul-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 00085 * Modified to allow frame size to depend on audio sampling rate. 00086 * 00087 * 01-Jul-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 00088 * Changed CONT_AD_DELTA_SPEECH back to 20. 00089 * 00090 * 30-Jun-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 00091 * Changed CONT_AD_DELTA_SPEECH from 10 to 15. 00092 * Added FILE* argument to cont_ad_powhist_dump(). 00093 * 00094 * 19-Jun-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 00095 * Changed CONT_AD_DELTA_SPEECH from 20 to 10, to increase sensitivity 00096 * to very short utterances. 00097 * 00098 * 16-Jan-98 Paul Placeway (pwp@cs.cmu.edu) at Carnegie Mellon University 00099 * Changed to use dB instead of the weird power measure. 00100 * Changed analysis window size, tuned default settings of most 00101 * parameters to make the system less sensitive to noise, changed 00102 * the histogram update frequency and decay to make the system 00103 * adapt more rapidly to changes in the environment. 00104 * Added cont_ad_set_params() and cont_ad_get_params(). 00105 * 00106 * 28-Jul-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 00107 * Added FRMPOW2SIGLVL, max_siglvl(), and cont_ad_t.siglvl. 00108 * Changed min signal energy/frame to CONT_AD_SPF. 00109 * 00110 * 27-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 00111 * Added the option for cont_ad_read to return -1 on EOF. 00112 * 00113 * 21-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 00114 * Added cont_ad_set_thresh(). 00115 * Bugfix: n_other is recomputed after updating thresholds. 00116 * 00117 * 20-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 00118 * Separated thresholds for speech and silence. 00119 * Fixed bug in moving analysis window upon transition to speech state. 00120 * 00121 * 17-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 00122 * Created, based loosely on Steve Reed's original implementation. 00123 */ 00124 00125 /* 00126 * This module is intended to be interposed as a filter between any raw A/D source and the 00127 * application to remove silence regions. It is initialized with a raw A/D source function 00128 * (during the cont_ad_init call). Filtered A/D data can be read by the application using 00129 * the cont_ad_read function. This module assumes that the A/D source function supplies an 00130 * endless stream of data. The application is responsible for setting up the A/D source, 00131 * turning recording on and off as it desires. It is also responsible for invoking the 00132 * cont_ad_read function frequently enough to avoid buffer overruns and dropping A/D data. 00133 * This continuous listening module has an internal buffer of about 4 sec. 00134 * 00135 * This module must be initialized and calibrated at first (cont_ad_init and cont_ad_calib 00136 * functions). Raw samples are grouped into frames, the signal power in each frame is 00137 * computed and accumulated in a histogram. The module is always in one of two states: 00138 * SILENCE or SPEECH. Transitions between the two states are detected by looking for a 00139 * contiguous window of several frames that is predominantly of the other type. The type 00140 * is determined by comparing frame power to either of two thresholds, thresh_sil and 00141 * thresh_speech, as appropriate for the current state. These thresholds are set from the 00142 * first peak in the low-end of the power histogram, and are updated every few seconds. 00143 * Separate thresholds are used to provide some hysteresis. 00144 * 00145 * The module maintains a linked list of speech (non-silence) segments not yet read by the 00146 * application. The cont_ad_read function returns speech data, if any available, by 00147 * following this list. It also updates an "absolute" timestamp at the end of the 00148 * cont_ad_read operation. The timestamp indicates the total #samples of A/D data read 00149 * until this point, including data discarded as silence frames. The application is 00150 * responsible for using this timestamp to make any policy decisions regarding utterance 00151 * boundaries or whatever. 00152 */ 00153 00154 #include <stdio.h> 00155 #include <stdlib.h> 00156 #include <string.h> 00157 #include <assert.h> 00158 #include <math.h> 00159 00160 #ifdef HAVE_CONFIG_H 00161 #include <config.h> 00162 #endif 00163 00164 #ifdef _MSC_VER 00165 #pragma warning (disable: 4305) 00166 #endif 00167 00168 #include "sphinxbase/prim_type.h" 00169 #include "sphinxbase/ad.h" 00170 #include "sphinxbase/cont_ad.h" 00171 #include "sphinxbase/err.h" 00172 00173 00174 #ifndef _ABS 00175 #define _ABS(x) ((x) >= 0 ? (x) : -(x)) 00176 #endif 00177 00178 00179 /* Various parameters, including defaults for many cont_ad_t member variables */ 00180 00181 #define CONT_AD_ADFRMSIZE 256 /* #Frames of internal A/D buffer maintained */ 00182 00183 #define CONT_AD_POWHISTSIZE 98 /* #Powhist bins: ~ FRMPOW(65536^2*CONT_AD_SPF) */ 00184 /* Maximum level is 96.3 dB full-scale; 97 for safety, plus 1 for zero-based */ 00185 00186 #define CONT_AD_CALIB_FRAMES (CONT_AD_POWHISTSIZE * 2) 00187 00188 #define CONT_AD_THRESH_UPDATE 100 /* Update thresholds approx every so many frames */ 00189 /* PWP: update was 200 frames, or 3.2 seconds. Now about every 1.6 sec. */ 00190 00191 #define CONT_AD_ADAPT_RATE 0.2 /* Interpolation of new and old noiselevel */ 00192 00193 #define CONT_AD_SPS 16000 00194 00195 #define CONT_AD_DEFAULT_NOISE 30 /* Default background noise power level */ 00196 #define CONT_AD_DELTA_SIL 10 /* Initial default for cont_ad_t.delta_sil */ 00197 #define CONT_AD_DELTA_SPEECH 17 /* Initial default for cont_ad_t.delta_speech */ 00198 #define CONT_AD_MIN_NOISE 2 /* Expected minimum background noise level */ 00199 #define CONT_AD_MAX_NOISE 70 /* Maximum background noise level */ 00200 00201 #define CONT_AD_HIST_INERTIA 3 /* Used in decaying the power histogram */ 00202 00203 #define CONT_AD_WINSIZE 21 /* Analysis window for state transitions */ 00204 /* rkm had 16 */ 00205 00206 #define CONT_AD_SPEECH_ONSET 9 /* Min #speech frames in analysis window for 00207 SILENCE -> SPEECH state transition */ 00208 /* 00209 * SReed had 100 ms == 6.25 fr contiguous; rkm had 9 (out of 16+10) with a 00210 * lower threshold. 00211 */ 00212 00213 #define CONT_AD_SIL_ONSET 18 /* Min #silence frames in analysis window for 00214 SPEECH -> SILENCE state transition 00215 MUST BE <= CONT_AD_WINSIZE */ 00216 /* 00217 * SReed had 400 ms == 25 fr contiguous; rkm had 14 out of 16 00218 */ 00219 00220 #define CONT_AD_LEADER 5 /* On transition to SPEECH state, so many frames 00221 BEFORE window included in speech data (>0) */ 00222 /* SReed had 200 ms == 12.5 fr; rkm had 5 */ 00223 00224 #define CONT_AD_TRAILER 10 /* On transition to SILENCE state, so many frames 00225 of silence included in speech data (>0). 00226 NOTE: Ensure (0 < TRAILER+LEADER <= WINSIZE) */ 00227 /* SReed had 100 ms == 6.25 fr; rkm had 10 */ 00228 00229 00230 void 00231 cont_ad_powhist_dump(FILE * fp, cont_ad_t * r) 00232 { 00233 int32 i, j; 00234 00235 fprintf(fp, "PowHist:\n"); 00236 for (i = 0, j = 0; i < CONT_AD_POWHISTSIZE; i++) { 00237 if (r->pow_hist[i] > 0) { 00238 fprintf(fp, "\t%3d %6d\n", i, r->pow_hist[i]); 00239 j = i; 00240 } 00241 } 00242 00243 fprintf(fp, "PH[%7.2f]:", 00244 (double) (r->tot_frm * r->spf) / (double) (r->sps)); 00245 for (i = 0; i <= j; i++) 00246 fprintf(fp, " %2d", r->pow_hist[i]); 00247 fprintf(fp, "\n"); 00248 00249 fflush(fp); 00250 } 00251 00252 00253 /* 00254 * Compute frame power. Interface deliberately kept low level to allow arbitrary 00255 * users to call this function with appropriate data. 00256 */ 00257 int32 00258 cont_ad_frame_pow(int16 * buf, int32 * prev, int32 spf) 00259 { 00260 double sumsq, v; 00261 int32 i; 00262 int32 p; 00263 00264 sumsq = 0.0; 00265 p = *prev; 00266 for (i = 0; i < spf; i++) { 00267 /* Note: pre-emphasis done to remove low-frequency noise. */ 00268 v = (double) (buf[i] - p); 00269 sumsq += v * v; 00270 p = buf[i]; 00271 } 00272 *prev = p; 00273 00274 if (sumsq < spf) /* Make sure FRMPOW(sumsq) >= 0 */ 00275 sumsq = spf; 00276 00277 /* 00278 * PWP: Units changed to dB 00279 * 00280 * Now the units of measurement of an input sample are volts (really!), 00281 * so the power in dB is p = 20*log10(samp). Further, we want the RMS 00282 * (root-mean-squared) average power across the frame. 00283 * 00284 * "sumsq" is the sum of the sum of the squares, so we want 00285 * 00286 * i = 20 * log10( sqrt ( sumsq / n_samps) ) 00287 * 00288 * (Stephen Reed's code actually had 00289 * i = 20 * log10( sqrt (sumsq) / n_samps ) 00290 * but this only produced an additive error.) 00291 * 00292 * i = 20 * log10( sqrt ( sumsq / n_samps) ) 00293 * = 20 * log10( ( sumsq / n_samps) ^ 0.5 ) 00294 * = 20 * log10( ( sumsq / n_samps) ) * 0.5 ) 00295 * = 10 * log10( ( sumsq / n_samps) ) 00296 * = 10 * ( log10( sumsq) - log10(n_samps) ) 00297 */ 00298 i = (int32) ((10.0 * (log10(sumsq) - log10((double) spf))) + 0.5); 00299 if (i < 0) 00300 i = 0; /* trim lower bound again to be safe. */ 00301 assert(i < 97); 00302 00303 return (i); 00304 } 00305 00306 00307 /* 00308 * Classify frame (id=frm, starting at sample position s) as sil/nonsil. Classification 00309 * done in isolation, independent of any other frame, based only on power histogram. 00310 */ 00311 static void 00312 compute_frame_pow(cont_ad_t * r, int32 frm) 00313 { 00314 int32 i; 00315 00316 i = cont_ad_frame_pow(r->adbuf + (frm * r->spf), &(r->prev_sample), 00317 r->spf); 00318 00319 r->frm_pow[frm] = (char) i; 00320 (r->pow_hist[i])++; 00321 r->thresh_update--; 00322 } 00323 00324 00325 /* PWP: $$$ check this */ 00326 /* 00327 * PWP: in SReed's code, decay was done by zeroing the histogram, 00328 * i.e. no history. 00329 */ 00330 static void 00331 decay_hist(cont_ad_t * r) 00332 { 00333 int32 i; 00334 00335 for (i = 0; i < CONT_AD_POWHISTSIZE; i++) 00336 r->pow_hist[i] -= (r->pow_hist[i] >> CONT_AD_HIST_INERTIA); 00337 } 00338 00339 00340 /* 00341 * Find silence threshold from power histogram. 00342 */ 00343 static int32 00344 find_thresh(cont_ad_t * r) 00345 { 00346 int32 i, j, max, th; 00347 int32 old_noise_level, old_thresh_sil, old_thresh_speech; 00348 00349 if (!r->auto_thresh) 00350 return 0; 00351 00352 /* 00353 * Find smallest non-zero histogram entry, but starting at some minimum power. 00354 * Power lower than CONT_AD_MIN_NOISE indicates bad A/D input (eg, mic off...). 00355 * Too high a minimum power is also bad. 00356 */ 00357 for (i = r->min_noise; 00358 (i < CONT_AD_POWHISTSIZE) && (r->pow_hist[i] == 0); i++); 00359 if (i > r->max_noise) /* Bad signal? */ 00360 return -1; 00361 00362 /* PWP: Hmmmmm.... SReed's code looks over the lower 20 dB */ 00363 /* PWP: 1/14/98 Made to work like Stephen Reed's code */ 00364 00365 /* This method of detecting the noise level is VERY unsatisfactory */ 00366 max = 0; 00367 for (j = i, th = i; (j < CONT_AD_POWHISTSIZE) && (j < i + 20); j++) { /* PWP: was i+6, which was 9 dB */ 00368 if (max < r->pow_hist[j]) { 00369 max = r->pow_hist[j]; 00370 th = j; 00371 } 00372 } 00373 00374 /* "Don't change the threshold too fast" */ 00375 old_noise_level = r->noise_level; 00376 old_thresh_sil = r->thresh_sil; 00377 old_thresh_speech = r->thresh_speech; 00378 /* r->noise_level = (int32) (th * r->adapt_rate + r->noise_level * (1.0 - r->adapt_rate)); */ 00379 r->noise_level = 00380 (int32) (r->noise_level + 00381 r->adapt_rate * (th - r->noise_level) + 0.5); 00382 00383 /* update thresholds */ 00384 r->thresh_sil = r->noise_level + r->delta_sil; 00385 r->thresh_speech = r->noise_level + r->delta_speech; 00386 00387 if (r->logfp) { 00388 fprintf(r->logfp, 00389 "%7.2fs %8df: NoisePeak: %d, Noiselevel: %d -> %d, Th-Sil: %d -> %d, Th-Sp: %d -> %d\n", 00390 (double) (r->tot_frm * r->spf) / (double) (r->sps), 00391 r->tot_frm, th, old_noise_level, r->noise_level, 00392 old_thresh_sil, r->thresh_sil, old_thresh_speech, 00393 r->thresh_speech); 00394 00395 cont_ad_powhist_dump(r->logfp, r); 00396 00397 fflush(r->logfp); 00398 } 00399 00400 /* 00401 * PWP: in SReed's original, he cleared the histogram here. 00402 * I can't fathom why. 00403 */ 00404 00405 return 0; 00406 } 00407 00408 00409 /* 00410 * Silence to speech transition 00411 */ 00412 static void 00413 sil2speech_transition(cont_ad_t *r, int frm) 00414 { 00415 spseg_t *seg; 00416 00417 /* Speech detected; create speech segment description */ 00418 seg = malloc(sizeof(*seg)); 00419 00420 seg->startfrm = r->win_startfrm - r->leader; 00421 if (seg->startfrm < 0) 00422 seg->startfrm += CONT_AD_ADFRMSIZE; 00423 seg->nfrm = r->leader + r->winsize; 00424 seg->next = NULL; 00425 00426 if (!r->spseg_head) 00427 r->spseg_head = seg; 00428 else 00429 r->spseg_tail->next = seg; 00430 r->spseg_tail = seg; 00431 00432 r->tail_state = CONT_AD_STATE_SPEECH; 00433 00434 if (r->logfp) { 00435 int32 n; 00436 00437 /* Where (in absolute time) this speech segment starts */ 00438 n = frm - seg->startfrm; 00439 if (n < 0) 00440 n += CONT_AD_ADFRMSIZE; 00441 n = r->tot_frm - n - 1; 00442 00443 fprintf(r->logfp, 00444 "%7.2fs %8d[%3d]f: Sil -> Sp detect; seg start: %7.2fs %8d\n", 00445 (double) (r->tot_frm * 00446 r->spf) / 00447 (double) (r->sps), 00448 r->tot_frm, frm, 00449 (double) (n * r->spf) / (double) (r->sps), n); 00450 } 00451 00452 /* Now in SPEECH state; want to look for silence from end of this window */ 00453 r->win_validfrm = 1; 00454 r->win_startfrm = frm; 00455 00456 /* Count #sil frames remaining in reduced window (of 1 frame) */ 00457 r->n_other = (r->frm_pow[frm] <= r->thresh_sil) ? 1 : 0; 00458 } 00459 00460 /* 00461 * Speech to silence transition 00462 */ 00463 static void 00464 speech2sil_transition(cont_ad_t *r, int frm) 00465 { 00466 int f; 00467 00468 /* End of speech detected; speech->sil transition */ 00469 r->spseg_tail->nfrm += r->trailer; 00470 00471 r->tail_state = CONT_AD_STATE_SIL; 00472 00473 if (r->logfp) { 00474 int32 n; 00475 00476 /* Where (in absolute time) this speech segment ends */ 00477 n = r->spseg_tail->startfrm + r->spseg_tail->nfrm - 1; 00478 if (n >= CONT_AD_ADFRMSIZE) 00479 n -= CONT_AD_ADFRMSIZE; 00480 n = frm - n; 00481 if (n < 0) 00482 n += CONT_AD_ADFRMSIZE; 00483 n = r->tot_frm - n; 00484 00485 fprintf(r->logfp, 00486 "%7.2fs %8d[%3d]f: Sp -> Sil detect; seg end: %7.2fs %8d\n", 00487 (double) (r->tot_frm * r->spf) / 00488 (double) (r->sps), r->tot_frm, frm, 00489 (double) (n * r->spf) / (double) (r->sps), n); 00490 } 00491 00492 /* Now in SILENCE state; start looking for speech trailer+leader frames later */ 00493 r->win_validfrm -= (r->trailer + r->leader - 1); 00494 r->win_startfrm += (r->trailer + r->leader - 1); 00495 if (r->win_startfrm >= CONT_AD_ADFRMSIZE) 00496 r->win_startfrm -= CONT_AD_ADFRMSIZE; 00497 00498 /* Count #speech frames remaining in reduced window */ 00499 r->n_other = 0; 00500 for (f = r->win_startfrm;;) { 00501 if (r->frm_pow[f] >= r->thresh_speech) 00502 r->n_other++; 00503 00504 if (f == frm) 00505 break; 00506 00507 f++; 00508 if (f >= CONT_AD_ADFRMSIZE) 00509 f = 0; 00510 } 00511 } 00512 00513 00514 /* 00515 * Main silence/speech region detection routine. If currently in 00516 * SILENCE state, switch to SPEECH state if a window (r->winsize) 00517 * of frames is mostly non-silence. If in SPEECH state, switch to 00518 * SILENCE state if the window is mostly silence. 00519 */ 00520 static void 00521 boundary_detect(cont_ad_t * r, int32 frm) 00522 { 00523 assert(r->n_other >= 0); 00524 00525 r->win_validfrm++; 00526 if (r->tail_state == CONT_AD_STATE_SIL) { 00527 if (r->frm_pow[frm] >= r->thresh_speech) 00528 r->n_other++; 00529 } 00530 else { 00531 if (r->frm_pow[frm] <= r->thresh_sil) 00532 r->n_other++; 00533 } 00534 00535 if (r->logfp) { 00536 fprintf(r->logfp, 00537 "%7.2fs %8d[%3d]f: P: %2d, N: %2d, T+: %2d, T-: %2d, #O: %2d, %s\n", 00538 (double) (r->tot_frm * r->spf) / (double) (r->sps), 00539 r->tot_frm, frm, r->frm_pow[frm], r->noise_level, 00540 r->thresh_speech, r->thresh_sil, r->n_other, 00541 (r->tail_state == CONT_AD_STATE_SIL) ? "--" : "Sp"); 00542 } 00543 00544 if (r->win_validfrm < r->winsize) /* Not reached full analysis window size */ 00545 return; 00546 assert(r->win_validfrm == r->winsize); 00547 00548 if (r->tail_state == CONT_AD_STATE_SIL) { /* Currently in SILENCE state */ 00549 if (r->n_frm >= r->winsize + r->leader 00550 && r->n_other >= r->speech_onset) { 00551 sil2speech_transition(r, frm); 00552 } 00553 } 00554 else { 00555 if (r->n_other >= r->sil_onset) { 00556 speech2sil_transition(r, frm); 00557 } 00558 else { 00559 /* In speech state, and staying there; add this frame to segment */ 00560 r->spseg_tail->nfrm++; 00561 } 00562 } 00563 00564 /* 00565 * Get rid of oldest frame in analysis window. Not quite correct; 00566 * thresholds could have changed over the window; should preserve 00567 * the original speech/silence label for the frame and undo it. Later.. 00568 */ 00569 if (r->tail_state == CONT_AD_STATE_SIL) { 00570 if (r->frm_pow[r->win_startfrm] >= r->thresh_speech) { 00571 if (r->n_other > 0) 00572 r->n_other--; 00573 } 00574 } 00575 else { 00576 if (r->frm_pow[r->win_startfrm] <= r->thresh_sil) { 00577 if (r->n_other > 0) 00578 r->n_other--; 00579 } 00580 } 00581 r->win_validfrm--; 00582 r->win_startfrm++; 00583 if (r->win_startfrm >= CONT_AD_ADFRMSIZE) 00584 r->win_startfrm = 0; 00585 00586 if (r->logfp) 00587 fflush(r->logfp); 00588 } 00589 00590 00591 static int32 00592 max_siglvl(cont_ad_t * r, int32 startfrm, int32 nfrm) 00593 { 00594 int32 siglvl, i, f; 00595 00596 siglvl = 0; 00597 if (nfrm > 0) { 00598 for (i = 0, f = startfrm; i < nfrm; i++, f++) { 00599 if (f >= CONT_AD_ADFRMSIZE) 00600 f -= CONT_AD_ADFRMSIZE; 00601 if (r->frm_pow[f] > siglvl) 00602 siglvl = r->frm_pow[f]; 00603 } 00604 } 00605 return siglvl; 00606 } 00607 00608 00609 #if 0 00610 /* 00611 * RKM(2005/01/31): Where did this come from? If needed, it should be called 00612 * cont_ad_get_audio_data. 00613 */ 00614 void 00615 get_audio_data(cont_ad_t * r, int16 * buf, int32 max) 00616 { 00617 } 00618 #endif 00619 00620 00621 static void 00622 cont_ad_read_log(cont_ad_t * r, int32 retval) 00623 { 00624 spseg_t *seg; 00625 00626 fprintf(r->logfp, "return from cont_ad_read() -> %d:\n", retval); 00627 fprintf(r->logfp, "\tstate: %d\n", r->state); 00628 fprintf(r->logfp, "\tread_ts: %d (%.2fs)\n", 00629 r->read_ts, (float32) r->read_ts / (float32) r->sps); 00630 fprintf(r->logfp, "\tseglen: %d (%.2fs)\n", 00631 r->seglen, (float32) r->seglen / (float32) r->sps); 00632 fprintf(r->logfp, "\tsiglvl: %d\n", r->siglvl); 00633 fprintf(r->logfp, "\theadfrm: %d\n", r->headfrm); 00634 fprintf(r->logfp, "\tn_frm: %d\n", r->n_frm); 00635 fprintf(r->logfp, "\tn_sample: %d\n", r->n_sample); 00636 fprintf(r->logfp, "\twin_startfrm: %d\n", r->win_startfrm); 00637 fprintf(r->logfp, "\twin_validfrm: %d\n", r->win_validfrm); 00638 fprintf(r->logfp, "\tnoise_level: %d\n", r->noise_level); 00639 fprintf(r->logfp, "\tthresh_sil: %d\n", r->thresh_sil); 00640 fprintf(r->logfp, "\tthresh_speech: %d\n", r->thresh_speech); 00641 fprintf(r->logfp, "\tn_other: %d\n", r->n_other); 00642 fprintf(r->logfp, "\ttail_state: %d\n", r->tail_state); 00643 fprintf(r->logfp, "\ttot_frm: %d\n", r->tot_frm); 00644 00645 fprintf(r->logfp, "\tspseg:"); 00646 for (seg = r->spseg_head; seg; seg = seg->next) 00647 fprintf(r->logfp, " %d[%d]", seg->startfrm, seg->nfrm); 00648 fprintf(r->logfp, "\n"); 00649 00650 fflush(r->logfp); 00651 } 00652 00653 00654 /* 00655 * Copy data from r->adbuf[sf], for nf frames, into buf. 00656 * All length checks must have been completed before this call; hence, this 00657 * function will copy exactly the specified number of frames. 00658 * 00659 * Return value: Index of frame just after the segment copied, possibly wrapped 00660 * around to 0. 00661 */ 00662 static int32 00663 buf_copy(cont_ad_t * r, int32 sf, int32 nf, int16 * buf) 00664 { 00665 int32 f, l; 00666 00667 assert((sf >= 0) && (sf < CONT_AD_ADFRMSIZE)); 00668 assert(nf >= 0); 00669 00670 if (sf + nf > CONT_AD_ADFRMSIZE) { 00671 /* Amount to be copied wraps around adbuf; copy in two stages */ 00672 f = CONT_AD_ADFRMSIZE - sf; 00673 l = (f * r->spf); 00674 memcpy(buf, r->adbuf + (sf * r->spf), l * sizeof(int16)); 00675 00676 if (r->logfp) { 00677 fprintf(r->logfp, 00678 "return %d speech frames [%d..%d]; %d samples\n", 00679 f, sf, sf + f - 1, l); 00680 } 00681 00682 buf += l; 00683 sf = 0; 00684 nf -= f; 00685 } 00686 00687 if (nf > 0) { 00688 l = (nf * r->spf); 00689 memcpy(buf, r->adbuf + (sf * r->spf), l * sizeof(int16)); 00690 00691 if (r->logfp) { 00692 fprintf(r->logfp, 00693 "return %d speech frames [%d..%d]; %d samples\n", 00694 nf, sf, sf + nf - 1, l); 00695 } 00696 } 00697 00698 if ((sf + nf) >= CONT_AD_ADFRMSIZE) { 00699 assert((sf + nf) == CONT_AD_ADFRMSIZE); 00700 return 0; 00701 } 00702 else 00703 return (sf + nf); 00704 } 00705 00706 int32 00707 cont_ad_buffer_space(cont_ad_t *r) 00708 { 00709 return r->adbufsize - r->n_sample; 00710 } 00711 00712 /* 00713 * Read as much data as possible from r->adfunc into r->adbuf. 00714 */ 00715 static int32 00716 cont_ad_read_internal(cont_ad_t *r, int16 *buf, int32 max) 00717 { 00718 int32 head, tail, len, l; 00719 00720 /* 00721 * First read as much of raw A/D as possible and available. adbuf is not 00722 * really a circular buffer, so may have to read in two steps for wrapping 00723 * around. 00724 */ 00725 head = r->headfrm * r->spf; 00726 tail = head + r->n_sample; 00727 len = r->n_sample - (r->n_frm * r->spf); /* #partial frame samples at the tail */ 00728 assert((len >= 0) && (len < r->spf)); 00729 00730 if ((tail < r->adbufsize) && (!r->eof)) { 00731 if (r->adfunc) { 00732 if ((l = 00733 (*(r->adfunc)) (r->ad, r->adbuf + tail, 00734 r->adbufsize - tail)) < 0) { 00735 r->eof = 1; 00736 l = 0; 00737 } 00738 } 00739 else { 00740 l = r->adbufsize - tail; 00741 if (l > max) { 00742 l = max; 00743 max = 0; 00744 } 00745 else { 00746 max -= l; 00747 } 00748 memcpy(r->adbuf + tail, buf, l * sizeof(int16)); 00749 buf += l; 00750 } 00751 if ((l > 0) && r->rawfp) { 00752 fwrite(r->adbuf + tail, sizeof(int16), l, r->rawfp); 00753 fflush(r->rawfp); 00754 } 00755 00756 tail += l; 00757 len += l; 00758 r->n_sample += l; 00759 } 00760 if ((tail >= r->adbufsize) && (!r->eof)) { 00761 tail -= r->adbufsize; 00762 if (tail < head) { 00763 if (r->adfunc) { 00764 if ((l = 00765 (*(r->adfunc)) (r->ad, 00766 r->adbuf + tail, head - tail)) < 0) { 00767 r->eof = 1; 00768 l = 0; 00769 } 00770 } 00771 else { 00772 l = head - tail; 00773 if (l > max) 00774 l = max; 00775 memcpy(r->adbuf + tail, buf, l * sizeof(int16)); 00776 } 00777 if ((l > 0) && r->rawfp) { 00778 fwrite(r->adbuf + tail, sizeof(int16), l, r->rawfp); 00779 fflush(r->rawfp); 00780 } 00781 00782 tail += l; 00783 len += l; 00784 r->n_sample += l; 00785 } 00786 } 00787 00788 return len; 00789 } 00790 00791 /* 00792 * Classify incoming frames as silence or speech. 00793 */ 00794 int32 00795 cont_ad_classify(cont_ad_t *r, int32 len) 00796 { 00797 int32 tailfrm; 00798 00799 tailfrm = (r->headfrm + r->n_frm); /* Next free frame slot to be filled */ 00800 if (tailfrm >= CONT_AD_ADFRMSIZE) 00801 tailfrm -= CONT_AD_ADFRMSIZE; 00802 00803 for (; len >= r->spf; len -= r->spf) { 00804 compute_frame_pow(r, tailfrm); 00805 r->n_frm++; 00806 r->tot_frm++; 00807 00808 /* 00809 * Find speech/sil state change, if any. Also, if staying in speech state 00810 * add this frame to current speech segment. 00811 */ 00812 boundary_detect(r, tailfrm); 00813 00814 if (++tailfrm >= CONT_AD_ADFRMSIZE) 00815 tailfrm = 0; 00816 00817 /* Update thresholds if time to do so */ 00818 if (r->thresh_update <= 0) { 00819 int32 i, f; 00820 find_thresh(r); 00821 decay_hist(r); 00822 r->thresh_update = CONT_AD_THRESH_UPDATE; 00823 00824 #if 1 00825 /* 00826 * Since threshold has been updated, recompute r->n_other. 00827 * (RKM: Is this really necessary? Comment out??) 00828 */ 00829 r->n_other = 0; 00830 if (r->tail_state == CONT_AD_STATE_SIL) { 00831 for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) { 00832 if (r->frm_pow[f] >= r->thresh_speech) 00833 r->n_other++; 00834 00835 f++; 00836 if (f >= CONT_AD_ADFRMSIZE) 00837 f = 0; 00838 } 00839 } 00840 else { 00841 for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) { 00842 if (r->frm_pow[f] <= r->thresh_sil) 00843 r->n_other++; 00844 00845 f++; 00846 if (f >= CONT_AD_ADFRMSIZE) 00847 f = 0; 00848 } 00849 } 00850 #endif 00851 } 00852 } 00853 00854 return r->tail_state; 00855 } 00856 00857 /* 00858 * Main function called by the application to filter out silence regions. 00859 * Maintains a linked list of speech segments pointing into r->adbuf and feeds 00860 * data to application from them. 00861 */ 00862 int32 00863 cont_ad_read(cont_ad_t * r, int16 * buf, int32 max) 00864 { 00865 int32 flen, len, retval, newstate; 00866 spseg_t *seg; 00867 00868 if ((r == NULL) || (buf == NULL)) 00869 return -1; 00870 00871 if (max < r->spf) { 00872 E_ERROR 00873 ("cont_ad_read requires buffer of at least %d samples\n", 00874 r->spf); 00875 return -1; 00876 } 00877 00878 if (r->logfp) { 00879 fprintf(r->logfp, "cont_ad_read(,, %d)\n", max); 00880 fflush(r->logfp); 00881 } 00882 00883 /* Read data from adfunc or from buf. */ 00884 len = cont_ad_read_internal(r, buf, max); 00885 00886 /* Compute frame power for unprocessed+new data and find speech/silence boundaries */ 00887 cont_ad_classify(r, len); 00888 00889 /* 00890 * If eof on input data source, cleanup the final segment. 00891 */ 00892 if (r->eof) { 00893 if (r->tail_state == CONT_AD_STATE_SPEECH) { 00894 /* 00895 * Still inside a speech segment when input data got over. Absort any 00896 * remaining frames into the final speech segment. 00897 */ 00898 assert(r->spseg_tail != NULL); 00899 00900 /* Absorb frames still in analysis window into final speech seg */ 00901 assert((r->win_validfrm >= 0) 00902 && (r->win_validfrm < r->winsize)); 00903 r->spseg_tail->nfrm += r->win_validfrm; 00904 00905 r->tail_state = CONT_AD_STATE_SIL; 00906 } 00907 00908 r->win_startfrm += r->win_validfrm; 00909 if (r->win_startfrm >= CONT_AD_ADFRMSIZE) 00910 r->win_startfrm -= CONT_AD_ADFRMSIZE; 00911 r->win_validfrm = 0; 00912 r->n_other = 0; 00913 } 00914 00915 /* 00916 * At last ready to copy speech data, if any, into caller's buffer. Raw 00917 * speech data is segmented into alternating speech and silence segments. 00918 * But any single call to cont_ad_read will never cross a speech/silence 00919 * boundary. 00920 */ 00921 seg = r->spseg_head; /* first speech segment available, if any */ 00922 00923 if ((seg == NULL) || (r->headfrm != seg->startfrm)) { 00924 /* 00925 * Either no speech data available, or inside a silence segment. Find 00926 * length of silence segment. 00927 */ 00928 if (seg == NULL) { 00929 assert(r->tail_state == CONT_AD_STATE_SIL); 00930 00931 flen = 00932 (r->eof) ? r->n_frm : r->n_frm - (r->winsize + 00933 r->leader - 1); 00934 if (flen < 0) 00935 flen = 0; 00936 } 00937 else { 00938 flen = seg->startfrm - r->headfrm; 00939 if (flen < 0) 00940 flen += CONT_AD_ADFRMSIZE; 00941 } 00942 00943 if (r->rawmode) { 00944 /* Restrict silence segment to user buffer size, integral #frames */ 00945 int32 f = max / r->spf; 00946 if (flen > f) 00947 flen = f; 00948 } 00949 00950 newstate = CONT_AD_STATE_SIL; 00951 } 00952 else { 00953 flen = max / r->spf; /* truncate read-size to integral #frames */ 00954 if (flen > seg->nfrm) 00955 flen = seg->nfrm; /* truncate further to this segment size */ 00956 00957 newstate = CONT_AD_STATE_SPEECH; 00958 } 00959 00960 len = flen * r->spf; /* #samples being consumed */ 00961 00962 r->siglvl = max_siglvl(r, r->headfrm, flen); 00963 00964 if ((newstate == CONT_AD_STATE_SIL) && (!r->rawmode)) { 00965 /* Skip silence data */ 00966 r->headfrm += flen; 00967 if (r->headfrm >= CONT_AD_ADFRMSIZE) 00968 r->headfrm -= CONT_AD_ADFRMSIZE; 00969 00970 retval = 0; /* #samples being copied/returned */ 00971 } 00972 else { 00973 /* Copy speech/silence(in rawmode) data */ 00974 r->headfrm = buf_copy(r, r->headfrm, flen, buf); 00975 00976 retval = len; /* #samples being copied/returned */ 00977 } 00978 00979 r->n_frm -= flen; 00980 r->n_sample -= len; 00981 assert((r->n_frm >= 0) && (r->n_sample >= 0)); 00982 assert(r->win_validfrm <= r->n_frm); 00983 00984 if (r->state == newstate) 00985 r->seglen += len; 00986 else 00987 r->seglen = len; 00988 r->state = newstate; 00989 00990 if (newstate == CONT_AD_STATE_SPEECH) { 00991 seg->startfrm = r->headfrm; 00992 assert(seg->startfrm >= 0); 00993 seg->nfrm -= flen; 00994 00995 /* Free seg if empty and not recording into it */ 00996 if ((seg->nfrm == 0) 00997 && (seg->next || (r->tail_state == CONT_AD_STATE_SIL))) { 00998 r->spseg_head = seg->next; 00999 if (seg->next == NULL) 01000 r->spseg_tail = NULL; 01001 free(seg); 01002 } 01003 } 01004 01005 /* Update timestamp. Total raw A/D read - those remaining to be consumed */ 01006 r->read_ts = (r->tot_frm - r->n_frm) * r->spf; 01007 01008 if (retval == 0) 01009 retval = (r->eof && (r->spseg_head == NULL)) ? -1 : 0; 01010 01011 if (r->logfp) 01012 cont_ad_read_log(r, retval); 01013 01014 return retval; 01015 } 01016 01017 01018 /* 01019 * Calibrate input channel for silence threshold. 01020 */ 01021 int32 01022 cont_ad_calib(cont_ad_t * r) 01023 { 01024 int32 i, s, k, len, tailfrm; 01025 01026 if (r == NULL) 01027 return -1; 01028 01029 /* clear histogram */ 01030 for (i = 0; i < CONT_AD_POWHISTSIZE; i++) 01031 r->pow_hist[i] = 0; 01032 tailfrm = r->headfrm + r->n_frm; 01033 if (tailfrm >= CONT_AD_ADFRMSIZE) 01034 tailfrm -= CONT_AD_ADFRMSIZE; 01035 s = (tailfrm * r->spf); 01036 01037 for (r->n_calib_frame = 0; 01038 r->n_calib_frame < CONT_AD_CALIB_FRAMES; 01039 ++r->n_calib_frame) { 01040 len = r->spf; 01041 while (len > 0) { 01042 /*Trouble */ 01043 if ((k = (*(r->adfunc)) (r->ad, r->adbuf + s, len)) < 0) 01044 return -1; 01045 len -= k; 01046 s += k; 01047 } 01048 s -= r->spf; 01049 01050 compute_frame_pow(r, tailfrm); 01051 } 01052 01053 r->thresh_update = CONT_AD_THRESH_UPDATE; 01054 return find_thresh(r); 01055 } 01056 01057 int32 01058 cont_ad_calib_size(cont_ad_t *r) 01059 { 01060 return r->spf * CONT_AD_CALIB_FRAMES; 01061 } 01062 01063 int32 01064 cont_ad_calib_loop(cont_ad_t * r, int16 * buf, int32 max) 01065 { 01066 int32 i, s, len, tailfrm; 01067 01068 if (r->n_calib_frame == CONT_AD_CALIB_FRAMES) { 01069 /* If calibration previously succeeded, then this is a 01070 * recalibration, so start again. */ 01071 r->n_calib_frame = 0; 01072 /* clear histogram */ 01073 for (i = 0; i < CONT_AD_POWHISTSIZE; i++) 01074 r->pow_hist[i] = 0; 01075 } 01076 01077 tailfrm = r->headfrm + r->n_frm; 01078 if (tailfrm >= CONT_AD_ADFRMSIZE) 01079 tailfrm -= CONT_AD_ADFRMSIZE; 01080 s = (tailfrm * r->spf); 01081 01082 len = r->spf; 01083 for (; r->n_calib_frame < CONT_AD_CALIB_FRAMES; 01084 ++r->n_calib_frame) { 01085 if (max < len) 01086 return 1; 01087 memcpy(r->adbuf + s, buf, len * sizeof(int16)); 01088 max -= len; 01089 buf += len; 01090 compute_frame_pow(r, tailfrm); 01091 } 01092 01093 r->thresh_update = CONT_AD_THRESH_UPDATE; 01094 return find_thresh(r); 01095 } 01096 01097 01098 /* PWP 1/14/98 -- modified for compatibility with old code */ 01099 int32 01100 cont_ad_set_thresh(cont_ad_t * r, int32 sil, int32 speech) 01101 { 01102 if (r == NULL) 01103 return -1; 01104 01105 if ((sil < 0) || (speech < 0)) { 01106 fprintf(stderr, 01107 "cont_ad_set_thresh: invalid threshold arguments: %d, %d\n", 01108 sil, speech); 01109 return -1; 01110 } 01111 r->delta_sil = (3 * sil) / 2; 01112 r->delta_speech = (3 * speech) / 2; 01113 01114 return 0; 01115 } 01116 01117 01118 /* 01119 * PWP 1/14/98 -- set the changable params. 01120 * 01121 * delta_sil, delta_speech, min_noise, and max_noise are in dB, 01122 * winsize, speech_onset, sil_onset, leader and trailer are in frames of 01123 * 16 ms length (256 samples @ 16kHz sampling). 01124 */ 01125 int32 01126 cont_ad_set_params(cont_ad_t * r, int32 delta_sil, 01127 int32 delta_speech, int32 min_noise, 01128 int32 max_noise, int32 winsize, 01129 int32 speech_onset, int32 sil_onset, int32 leader, 01130 int32 trailer, float32 adapt_rate) 01131 { 01132 if ((delta_sil < 0) || (delta_speech < 0) || (min_noise < 0) 01133 || (max_noise < 0)) { 01134 E_ERROR("threshold arguments: " 01135 "%d, %d, %d, %d must all be >=0\n", delta_sil, 01136 delta_speech, min_noise, max_noise); 01137 return -1; 01138 } 01139 01140 if ((speech_onset > winsize) || (speech_onset <= 0) 01141 || (winsize <= 0)) { 01142 E_ERROR 01143 ("speech_onset, %d, must be <= winsize, %d, and both >0\n", 01144 speech_onset, winsize); 01145 return -1; 01146 } 01147 01148 if ((sil_onset > winsize) || (sil_onset <= 0) || (winsize <= 0)) { 01149 E_ERROR 01150 ("sil_onset, %d, must be <= winsize, %d, and both >0\n", 01151 sil_onset, winsize); 01152 return -1; 01153 } 01154 01155 if (((leader + trailer) > winsize) || (leader <= 0) 01156 || (trailer <= 0)) { 01157 E_ERROR 01158 ("leader, %d, plus trailer, %d, must be <= winsize, %d, and both >0\n", 01159 leader, trailer, winsize); 01160 return -1; 01161 } 01162 01163 if ((adapt_rate < 0.0) || (adapt_rate > 1.0)) { 01164 E_ERROR("adapt_rate, %e; must be in range 0..1\n", adapt_rate); 01165 return -1; 01166 } 01167 01168 if (r == NULL) 01169 return -1; 01170 01171 r->delta_sil = delta_sil; 01172 r->delta_speech = delta_speech; 01173 r->min_noise = min_noise; 01174 r->max_noise = max_noise; 01175 01176 r->winsize = winsize; 01177 r->speech_onset = speech_onset; 01178 r->sil_onset = sil_onset; 01179 r->leader = leader; 01180 r->trailer = trailer; 01181 01182 r->adapt_rate = adapt_rate; 01183 01184 if (r->win_validfrm >= r->winsize) 01185 r->win_validfrm = r->winsize - 1; 01186 01187 return 0; 01188 } 01189 01190 01191 /* 01192 * PWP 1/14/98 -- get the changable params. 01193 * 01194 * delta_sil, delta_speech, min_noise, and max_noise are in dB, 01195 * winsize, speech_onset, sil_onset, leader and trailer are in frames of 01196 * 16 ms length (256 samples @ 16kHz sampling). 01197 */ 01198 int32 01199 cont_ad_get_params(cont_ad_t * r, int32 * delta_sil, 01200 int32 * delta_speech, int32 * min_noise, 01201 int32 * max_noise, int32 * winsize, 01202 int32 * speech_onset, int32 * sil_onset, 01203 int32 * leader, int32 * trailer, float32 * adapt_rate) 01204 { 01205 if (!delta_sil || !delta_speech || !min_noise || !max_noise 01206 || !winsize || !speech_onset || !sil_onset || !leader 01207 || !trailer || !adapt_rate) { 01208 fprintf(stderr, "cont_ad_get_params: some param slots are NULL\n"); 01209 return (-1); 01210 } 01211 01212 if (r == NULL) 01213 return -1; 01214 01215 *delta_sil = r->delta_sil; 01216 *delta_speech = r->delta_speech; 01217 *min_noise = r->min_noise; 01218 *max_noise = r->max_noise; 01219 01220 *winsize = r->winsize; 01221 *speech_onset = r->speech_onset; 01222 *sil_onset = r->sil_onset; 01223 *leader = r->leader; 01224 *trailer = r->trailer; 01225 01226 *adapt_rate = r->adapt_rate; 01227 01228 return 0; 01229 } 01230 01231 01232 /* 01233 * Reset, discarded any accumulated speech. 01234 */ 01235 int32 01236 cont_ad_reset(cont_ad_t * r) 01237 { 01238 spseg_t *seg; 01239 01240 if (r == NULL) 01241 return -1; 01242 01243 while (r->spseg_head) { 01244 seg = r->spseg_head; 01245 r->spseg_head = seg->next; 01246 free(seg); 01247 } 01248 r->spseg_tail = NULL; 01249 01250 r->headfrm = 0; 01251 r->n_frm = 0; 01252 r->n_sample = 0; 01253 r->win_startfrm = 0; 01254 r->win_validfrm = 0; 01255 r->n_other = 0; 01256 01257 r->tail_state = CONT_AD_STATE_SIL; 01258 01259 return 0; 01260 } 01261 01262 01263 int32 01264 cont_ad_close(cont_ad_t * cont) 01265 { 01266 if (cont == NULL) 01267 return -1; 01268 01269 cont_ad_reset(cont); /* Frees any remaining speech segments */ 01270 01271 free(cont->adbuf); 01272 free(cont->pow_hist); 01273 free(cont->frm_pow); 01274 free(cont); 01275 01276 return 0; 01277 } 01278 01279 01280 int32 01281 cont_ad_detach(cont_ad_t * c) 01282 { 01283 if (c == NULL) 01284 return -1; 01285 01286 c->ad = NULL; 01287 c->adfunc = NULL; 01288 return 0; 01289 } 01290 01291 01292 int32 01293 cont_ad_attach(cont_ad_t * c, ad_rec_t * a, 01294 int32(*func) (ad_rec_t *, int16 *, int32)) 01295 { 01296 if (c == NULL) 01297 return -1; 01298 01299 c->ad = a; 01300 c->adfunc = func; 01301 c->eof = 0; 01302 01303 return 0; 01304 } 01305 01306 01307 int32 01308 cont_set_thresh(cont_ad_t * r, int32 silence, int32 speech) 01309 { 01310 int32 i, f; 01311 01312 r->thresh_speech = speech; 01313 r->thresh_sil = silence; 01314 01315 /* Since threshold has been updated, recompute r->n_other */ 01316 r->n_other = 0; 01317 if (r->tail_state == CONT_AD_STATE_SIL) { 01318 for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) { 01319 if (r->frm_pow[f] >= r->thresh_speech) 01320 r->n_other++; 01321 01322 f++; 01323 if (f >= CONT_AD_ADFRMSIZE) 01324 f = 0; 01325 } 01326 } 01327 else if (r->tail_state == CONT_AD_STATE_SPEECH) { 01328 for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) { 01329 if (r->frm_pow[f] <= r->thresh_sil) 01330 r->n_other++; 01331 01332 f++; 01333 if (f >= CONT_AD_ADFRMSIZE) 01334 f = 0; 01335 } 01336 } 01337 01338 return 0; 01339 } 01340 01341 01342 /* 01343 * Set the file pointer for dumping the raw input audio stream. 01344 */ 01345 int32 01346 cont_ad_set_rawfp(cont_ad_t * r, FILE * fp) 01347 { 01348 if (r == NULL) 01349 return -1; 01350 01351 r->rawfp = fp; 01352 return 0; 01353 } 01354 01355 01356 /* 01357 * Set the file pointer for logging cont_ad progress. 01358 */ 01359 int32 01360 cont_ad_set_logfp(cont_ad_t * r, FILE * fp) 01361 { 01362 if (r == NULL) 01363 return -1; 01364 01365 r->logfp = fp; 01366 return 0; 01367 } 01368 01369 01370 /* 01371 * One-time initialization. 01372 */ 01373 cont_ad_t * 01374 cont_ad_init(ad_rec_t * a, int32(*func) (ad_rec_t *, int16 *, int32)) 01375 { 01376 cont_ad_t *r; 01377 01378 if ((r = malloc(sizeof(*r))) == NULL) { 01379 E_ERROR_SYSTEM("allocation of cont_ad_t failed"); 01380 return NULL; 01381 } 01382 01383 r->ad = a; 01384 r->adfunc = func; 01385 r->eof = 0; 01386 r->rawmode = 0; 01387 01388 if (a != NULL) 01389 r->sps = a->sps; 01390 else 01391 r->sps = CONT_AD_SPS; 01392 01393 /* Set samples/frame such that when sps=16000, spf=256 */ 01394 r->spf = (r->sps * 256) / CONT_AD_SPS; 01395 r->adbufsize = CONT_AD_ADFRMSIZE * r->spf; 01396 01397 if ((r->adbuf = malloc(r->adbufsize * sizeof(*r->adbuf))) == NULL) { 01398 E_ERROR_SYSTEM("allocation of audio buffer failed"); 01399 free(r); 01400 return NULL; 01401 } 01402 if ((r->pow_hist = 01403 calloc(CONT_AD_POWHISTSIZE, sizeof(*r->pow_hist))) == NULL) { 01404 E_ERROR_SYSTEM("allocation of power history buffer failed"); 01405 free(r->adbuf); 01406 free(r); 01407 return NULL; 01408 } 01409 if ((r->frm_pow = 01410 calloc(CONT_AD_ADFRMSIZE, sizeof(*r->frm_pow))) == NULL) { 01411 E_ERROR_SYSTEM("allocation of frame power buffer failed"); 01412 free(r->pow_hist); 01413 free(r->adbuf); 01414 free(r); 01415 return NULL; 01416 } 01417 01418 r->state = CONT_AD_STATE_SIL; 01419 r->read_ts = 0; 01420 r->seglen = 0; 01421 r->siglvl = 0; 01422 r->prev_sample = 0; 01423 r->tot_frm = 0; 01424 r->noise_level = CONT_AD_DEFAULT_NOISE; 01425 01426 r->auto_thresh = 1; 01427 r->delta_sil = CONT_AD_DELTA_SIL; 01428 r->delta_speech = CONT_AD_DELTA_SPEECH; 01429 r->min_noise = CONT_AD_MIN_NOISE; 01430 r->max_noise = CONT_AD_MAX_NOISE; 01431 r->winsize = CONT_AD_WINSIZE; 01432 r->speech_onset = CONT_AD_SPEECH_ONSET; 01433 r->sil_onset = CONT_AD_SIL_ONSET; 01434 r->leader = CONT_AD_LEADER; 01435 r->trailer = CONT_AD_TRAILER; 01436 01437 r->thresh_sil = r->noise_level + r->delta_sil; 01438 r->thresh_speech = r->noise_level + r->delta_speech; 01439 r->thresh_update = CONT_AD_THRESH_UPDATE; 01440 r->adapt_rate = CONT_AD_ADAPT_RATE; 01441 01442 r->tail_state = CONT_AD_STATE_SIL; 01443 01444 r->spseg_head = NULL; 01445 r->spseg_tail = NULL; 01446 01447 r->rawfp = NULL; 01448 r->logfp = NULL; 01449 01450 r->n_calib_frame = 0; 01451 01452 cont_ad_reset(r); 01453 01454 return r; 01455 } 01456 01457 01458 cont_ad_t * 01459 cont_ad_init_rawmode(ad_rec_t * a, 01460 int32(*func) (ad_rec_t *, int16 *, int32)) 01461 { 01462 cont_ad_t *r; 01463 01464 r = cont_ad_init(a, func); 01465 r->rawmode = 1; 01466 01467 return r; 01468 }