00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 #include <stdio.h>
00038 #include <string.h>
00039 #include <math.h>
00040 #include <stdlib.h>
00041 #include <assert.h>
00042 #ifdef _WIN32_WCE
00043 #include <windows.h>
00044 #else
00045 #include <time.h>
00046 #endif
00047
00048 #ifdef HAVE_CONFIG_H
00049 #include <config.h>
00050 #endif
00051
00052 #include "prim_type.h"
00053 #include "byteorder.h"
00054 #include "fixpoint.h"
00055 #include "fe_internal.h"
00056 #include "genrand.h"
00057 #include "err.h"
00058 #include "cmd_ln.h"
00059 #include "ckd_alloc.h"
00060 #include "fe_warp.h"
00061
/*
 * Argument definitions for the front end: whatever entries
 * waveform_to_cepstral_command_line_macro() expands to, closed by an
 * all-NULL/zero terminator entry.  Handed out to callers by fe_get_args().
 */
static const arg_t fe_args[] = {
    waveform_to_cepstral_command_line_macro(),
    { NULL, 0, NULL, NULL }
};
00066
00067 int
00068 fe_parse_general_params(cmd_ln_t *config, fe_t * fe)
00069 {
00070 int j;
00071
00072 fe->config = config;
00073 fe->sampling_rate = cmd_ln_float32_r(config, "-samprate");
00074 fe->frame_rate = (int16)cmd_ln_int32_r(config, "-frate");
00075 if (cmd_ln_boolean_r(config, "-dither")) {
00076 fe->dither = 1;
00077 fe->seed = cmd_ln_int32_r(config, "-seed");
00078 }
00079 #ifdef WORDS_BIGENDIAN
00080 fe->swap = strcmp("big", cmd_ln_str_r(config, "-input_endian")) == 0 ? 0 : 1;
00081 #else
00082 fe->swap = strcmp("little", cmd_ln_str_r(config, "-input_endian")) == 0 ? 0 : 1;
00083 #endif
00084 fe->window_length = cmd_ln_float32_r(config, "-wlen");
00085 fe->pre_emphasis_alpha = cmd_ln_float32_r(config, "-alpha");
00086
00087 fe->num_cepstra = (uint8)cmd_ln_int32_r(config, "-ncep");
00088 fe->fft_size = (int16)cmd_ln_int32_r(config, "-nfft");
00089
00090
00091 for (j = fe->fft_size, fe->fft_order = 0; j > 1; j >>= 1, fe->fft_order++) {
00092 if (((j % 2) != 0) || (fe->fft_size <= 0)) {
00093 E_ERROR("fft: number of points must be a power of 2 (is %d)\n",
00094 fe->fft_size);
00095 return -1;
00096 }
00097 }
00098
00099 if (fe->fft_size < (int)(fe->window_length * fe->sampling_rate)) {
00100 E_ERROR("FFT: Number of points must be greater or equal to frame size (%d samples)\n",
00101 (int)(fe->window_length * fe->sampling_rate));
00102 return -1;
00103 }
00104
00105 fe->remove_dc = cmd_ln_boolean_r(config, "-remove_dc");
00106
00107 if (0 == strcmp(cmd_ln_str_r(config, "-transform"), "dct"))
00108 fe->transform = DCT_II;
00109 else if (0 == strcmp(cmd_ln_str_r(config, "-transform"), "legacy"))
00110 fe->transform = LEGACY_DCT;
00111 else if (0 == strcmp(cmd_ln_str_r(config, "-transform"), "htk"))
00112 fe->transform = DCT_HTK;
00113 else {
00114 E_ERROR("Invalid transform type (values are 'dct', 'legacy', 'htk')\n");
00115 return -1;
00116 }
00117
00118 if (cmd_ln_boolean_r(config, "-logspec"))
00119 fe->log_spec = RAW_LOG_SPEC;
00120 if (cmd_ln_boolean_r(config, "-smoothspec"))
00121 fe->log_spec = SMOOTH_LOG_SPEC;
00122
00123 return 0;
00124 }
00125
00126 static int
00127 fe_parse_melfb_params(cmd_ln_t *config, fe_t *fe, melfb_t * mel)
00128 {
00129 mel->sampling_rate = fe->sampling_rate;
00130 mel->fft_size = fe->fft_size;
00131 mel->num_cepstra = fe->num_cepstra;
00132 mel->num_filters = cmd_ln_int32_r(config, "-nfilt");
00133
00134 if (fe->log_spec)
00135 fe->feature_dimension = mel->num_filters;
00136 else
00137 fe->feature_dimension = fe->num_cepstra;
00138
00139 mel->upper_filt_freq = cmd_ln_float32_r(config, "-upperf");
00140 mel->lower_filt_freq = cmd_ln_float32_r(config, "-lowerf");
00141
00142 mel->doublewide = cmd_ln_boolean_r(config, "-doublebw");
00143
00144 mel->warp_type = cmd_ln_str_r(config, "-warp_type");
00145 mel->warp_params = cmd_ln_str_r(config, "-warp_params");
00146 mel->lifter_val = cmd_ln_int32_r(config, "-lifter");
00147
00148 mel->unit_area = cmd_ln_boolean_r(config, "-unit_area");
00149 mel->round_filters = cmd_ln_boolean_r(config, "-round_filters");
00150
00151 if (fe_warp_set(mel, mel->warp_type) != FE_SUCCESS) {
00152 E_ERROR("Failed to initialize the warping function.\n");
00153 return -1;
00154 }
00155 fe_warp_set_parameters(mel, mel->warp_params, mel->sampling_rate);
00156 return 0;
00157 }
00158
00159 void
00160 fe_print_current(fe_t const *fe)
00161 {
00162 E_INFO("Current FE Parameters:\n");
00163 E_INFO("\tSampling Rate: %f\n", fe->sampling_rate);
00164 E_INFO("\tFrame Size: %d\n", fe->frame_size);
00165 E_INFO("\tFrame Shift: %d\n", fe->frame_shift);
00166 E_INFO("\tFFT Size: %d\n", fe->fft_size);
00167 E_INFO("\tLower Frequency: %g\n",
00168 fe->mel_fb->lower_filt_freq);
00169 E_INFO("\tUpper Frequency: %g\n",
00170 fe->mel_fb->upper_filt_freq);
00171 E_INFO("\tNumber of filters: %d\n", fe->mel_fb->num_filters);
00172 E_INFO("\tNumber of Overflow Samps: %d\n", fe->num_overflow_samps);
00173 E_INFO("\tStart Utt Status: %d\n", fe->start_flag);
00174 E_INFO("Will %sremove DC offset at frame level\n",
00175 fe->remove_dc ? "" : "not ");
00176 if (fe->dither) {
00177 E_INFO("Will add dither to audio\n");
00178 E_INFO("Dither seeded with %d\n", fe->seed);
00179 }
00180 else {
00181 E_INFO("Will not add dither to audio\n");
00182 }
00183 if (fe->mel_fb->lifter_val) {
00184 E_INFO("Will apply sine-curve liftering, period %d\n",
00185 fe->mel_fb->lifter_val);
00186 }
00187 E_INFO("Will %snormalize filters to unit area\n",
00188 fe->mel_fb->unit_area ? "" : "not ");
00189 E_INFO("Will %sround filter frequencies to DFT points\n",
00190 fe->mel_fb->round_filters ? "" : "not ");
00191 E_INFO("Will %suse double bandwidth in mel filter\n",
00192 fe->mel_fb->doublewide ? "" : "not ");
00193 }
00194
00195 fe_t *
00196 fe_init_auto()
00197 {
00198 return fe_init_auto_r(cmd_ln_retain(cmd_ln_get()));
00199 }
00200
00201 fe_t *
00202 fe_init_auto_r(cmd_ln_t *config)
00203 {
00204 fe_t *fe;
00205
00206 fe = ckd_calloc(1, sizeof(*fe));
00207 fe->refcount = 1;
00208
00209
00210 if (fe_parse_general_params(config, fe) < 0) {
00211 fe_free(fe);
00212 return NULL;
00213 }
00214
00215
00216
00217
00218
00219 fe->frame_shift = (int32) (fe->sampling_rate / fe->frame_rate + 0.5);
00220 fe->frame_size = (int32) (fe->window_length * fe->sampling_rate + 0.5);
00221 fe->prior = 0;
00222 fe->frame_counter = 0;
00223
00224 if (fe->frame_size > (fe->fft_size)) {
00225 E_WARN
00226 ("Number of FFT points has to be a power of 2 higher than %d\n",
00227 (fe->frame_size));
00228 return (NULL);
00229 }
00230
00231 if (fe->dither)
00232 fe_init_dither(fe->seed);
00233
00234
00235 fe->overflow_samps = ckd_calloc(fe->frame_size, sizeof(int16));
00236 fe->hamming_window = ckd_calloc(fe->frame_size/2, sizeof(window_t));
00237
00238
00239 fe_create_hamming(fe->hamming_window, fe->frame_size);
00240
00241
00242 fe->mel_fb = ckd_calloc(1, sizeof(*fe->mel_fb));
00243
00244
00245 fe_parse_melfb_params(config, fe, fe->mel_fb);
00246 fe_build_melfilters(fe->mel_fb);
00247 fe_compute_melcosine(fe->mel_fb);
00248
00249
00250
00251 fe->spch = ckd_calloc(fe->frame_size, sizeof(*fe->spch));
00252 fe->frame = ckd_calloc(fe->fft_size, sizeof(*fe->frame));
00253 fe->spec = ckd_calloc(fe->fft_size, sizeof(*fe->spec));
00254 fe->mfspec = ckd_calloc(fe->mel_fb->num_filters, sizeof(*fe->mfspec));
00255
00256
00257 fe->ccc = ckd_calloc(fe->fft_size / 4, sizeof(*fe->ccc));
00258 fe->sss = ckd_calloc(fe->fft_size / 4, sizeof(*fe->sss));
00259 fe_create_twiddle(fe);
00260
00261 if (cmd_ln_boolean_r(config, "-verbose")) {
00262 fe_print_current(fe);
00263 }
00264
00265
00266
00267 fe_start_utt(fe);
00268 return fe;
00269 }
00270
00271 arg_t const *
00272 fe_get_args(void)
00273 {
00274 return fe_args;
00275 }
00276
00277 cmd_ln_t *
00278 fe_get_config(fe_t *fe)
00279 {
00280 return fe->config;
00281 }
00282
00283 void
00284 fe_init_dither(int32 seed)
00285 {
00286 if (seed < 0) {
00287 E_INFO("You are using the internal mechanism to generate the seed.\n");
00288 #ifdef _WIN32_WCE
00289 s3_rand_seed(GetTickCount());
00290 #else
00291 s3_rand_seed((long) time(0));
00292 #endif
00293 }
00294 else {
00295 E_INFO("You are using %d as the seed.\n", seed);
00296 s3_rand_seed(seed);
00297 }
00298 }
00299
00300 int32
00301 fe_start_utt(fe_t * fe)
00302 {
00303 fe->num_overflow_samps = 0;
00304 memset(fe->overflow_samps, 0, fe->frame_size * sizeof(int16));
00305 fe->start_flag = 1;
00306 fe->prior = 0;
00307 return 0;
00308 }
00309
00310 int
00311 fe_get_output_size(fe_t *fe)
00312 {
00313 return (int)fe->feature_dimension;
00314 }
00315
00316 void
00317 fe_get_input_size(fe_t *fe, int *out_frame_shift,
00318 int *out_frame_size)
00319 {
00320 if (out_frame_shift)
00321 *out_frame_shift = fe->frame_shift;
00322 if (out_frame_size)
00323 *out_frame_size = fe->frame_size;
00324 }
00325
00326 int32
00327 fe_process_frame(fe_t * fe, int16 const *spch, int32 nsamps, mfcc_t * fr_cep)
00328 {
00329 fe_read_frame(fe, spch, nsamps);
00330 return fe_write_frame(fe, fr_cep);
00331 }
00332
/**
 * Process as many complete frames as the input and the output buffer allow.
 *
 * Samples that do not complete a frame are kept in fe->overflow_samps for
 * the next call.
 *
 * @param fe             front end to use
 * @param inout_spch     in: pointer to the input samples; out: advanced
 *                       past the samples that were consumed (not touched
 *                       when buf_cep is NULL)
 * @param inout_nsamps   in: number of input samples; out: number of
 *                       samples left unconsumed
 * @param buf_cep        array of output vectors, one per frame; when NULL
 *                       nothing is processed and only the number of frames
 *                       that could be produced is reported
 * @param inout_nframes  in: maximum number of frames to produce;
 *                       out: number of frames actually produced (or
 *                       producible, when buf_cep is NULL)
 * @return always 0
 */
int
fe_process_frames(fe_t *fe,
                  int16 const **inout_spch,
                  size_t *inout_nsamps,
                  mfcc_t **buf_cep,
                  int32 *inout_nframes)
{
    int32 frame_count;
    int i, n_overflow, orig_n_overflow;
    int16 const *orig_spch;

    /* Query mode: with no output buffer, only report how many frames the
     * buffered plus incoming samples would yield. */
    if (buf_cep == NULL) {
        if (*inout_nsamps + fe->num_overflow_samps < (size_t)fe->frame_size)
            *inout_nframes = 0;
        else
            *inout_nframes = 1
                + ((*inout_nsamps + fe->num_overflow_samps - fe->frame_size)
                   / fe->frame_shift);
        return 0;
    }

    /* Not enough data for even one frame: stash all input in the overflow
     * buffer and report that everything was consumed. */
    if (*inout_nsamps + fe->num_overflow_samps < (size_t)fe->frame_size) {
        if (*inout_nsamps > 0) {
            /* Append the new samples after the ones already buffered. */
            memcpy(fe->overflow_samps + fe->num_overflow_samps,
                   *inout_spch, *inout_nsamps * (sizeof(int16)));
            fe->num_overflow_samps += *inout_nsamps;
            /* Mark the input as fully consumed. */
            *inout_spch += *inout_nsamps;
            *inout_nsamps = 0;
        }
        /* No frames were produced. */
        *inout_nframes = 0;
        return 0;
    }

    /* The caller left no room for output: produce nothing, consume nothing. */
    if (*inout_nframes < 1) {
        *inout_nframes = 0;
        return 0;
    }

    /* Remember the starting state; the overflow bookkeeping at the end is
     * expressed relative to these. */
    orig_spch = *inout_spch;
    orig_n_overflow = fe->num_overflow_samps;
    /* Number of frames available from buffered plus new samples ... */
    frame_count = 1
        + ((*inout_nsamps + fe->num_overflow_samps - fe->frame_size)
           / fe->frame_shift);
    /* ... limited to what the caller asked for. */
    if (frame_count > *inout_nframes)
        frame_count = *inout_nframes;

    /* First frame: either completed from the overflow buffer or read
     * straight from the input. */
    if (fe->num_overflow_samps) {
        /* Number of new samples needed to fill the buffered partial frame. */
        int offset = fe->frame_size - fe->num_overflow_samps;

        /* Complete the frame inside the overflow buffer and process it. */
        memcpy(fe->overflow_samps + fe->num_overflow_samps,
               *inout_spch, offset * sizeof(**inout_spch));
        fe_read_frame(fe, fe->overflow_samps, fe->frame_size);
        fe_write_frame(fe, buf_cep[0]);
        /* Consume the samples that were appended. */
        *inout_spch += offset;
        *inout_nsamps -= offset;
        /* One frame shift worth of buffered samples is now used up; this
         * count may drop to zero or below. */
        fe->num_overflow_samps -= fe->frame_shift;
    }
    else {
        fe_read_frame(fe, *inout_spch, fe->frame_size);
        fe_write_frame(fe, buf_cep[0]);
        /* A whole frame was taken directly from the input. */
        *inout_spch += fe->frame_size;
        *inout_nsamps -= fe->frame_size;
    }
    /* Temporarily used as a countdown; set to the real result at the end. */
    --*inout_nframes;

    /* Remaining frames: each one shifts in frame_shift new samples. */
    for (i = 1; i < frame_count; ++i) {
        assert(*inout_nsamps >= (size_t)fe->frame_shift);
        assert(*inout_nframes > 0);

        fe_shift_frame(fe, *inout_spch, fe->frame_shift);
        fe_write_frame(fe, buf_cep[i]);
        /* Consume the shifted-in samples. */
        *inout_spch += fe->frame_shift;
        *inout_nsamps -= fe->frame_shift;
        /* Previously buffered samples keep sliding out of the window. */
        if (fe->num_overflow_samps > 0)
            fe->num_overflow_samps -= fe->frame_shift;
        --*inout_nframes;
    }

    /* Rebuild the overflow buffer for the next call. */
    if (fe->num_overflow_samps <= 0) {
        /* None of the previously buffered samples are still needed: keep
         * the tail of the input (frame_size - frame_shift already consumed
         * samples) plus at most frame_shift unconsumed ones. */
        n_overflow = *inout_nsamps;
        if (n_overflow > fe->frame_shift)
            n_overflow = fe->frame_shift;
        fe->num_overflow_samps = fe->frame_size - fe->frame_shift;
        /* Never count more history than was consumed in this call. */
        if (fe->num_overflow_samps > *inout_spch - orig_spch)
            fe->num_overflow_samps = *inout_spch - orig_spch;
        fe->num_overflow_samps += n_overflow;
        if (fe->num_overflow_samps > 0) {
            memcpy(fe->overflow_samps,
                   *inout_spch - (fe->frame_size - fe->frame_shift),
                   fe->num_overflow_samps * sizeof(**inout_spch));
            /* The unconsumed samples copied above now count as consumed. */
            *inout_spch += n_overflow;
            *inout_nsamps -= n_overflow;
        }
    }
    else {
        /* Some previously buffered samples are still needed: move them to
         * the front of the overflow buffer ... */
        memmove(fe->overflow_samps,
                fe->overflow_samps + orig_n_overflow - fe->num_overflow_samps,
                fe->num_overflow_samps * sizeof(*fe->overflow_samps));
        /* ... and append input samples, counted from the original start of
         * the input, up to the free space left in the buffer. */
        n_overflow = *inout_spch - orig_spch + *inout_nsamps;
        if (n_overflow > fe->frame_size - fe->num_overflow_samps)
            n_overflow = fe->frame_size - fe->num_overflow_samps;
        memcpy(fe->overflow_samps + fe->num_overflow_samps,
               orig_spch, n_overflow * sizeof(*orig_spch));
        fe->num_overflow_samps += n_overflow;
        /* Only samples beyond those already consumed advance the input. */
        if (n_overflow > *inout_spch - orig_spch) {
            n_overflow -= (*inout_spch - orig_spch);
            *inout_spch += n_overflow;
            *inout_nsamps -= n_overflow;
        }
    }

    /* Report the number of frames written to buf_cep. */
    *inout_nframes = frame_count;
    return 0;
}
00475
00476 int
00477 fe_process_utt(fe_t * fe, int16 const * spch, size_t nsamps,
00478 mfcc_t *** cep_block, int32 * nframes)
00479 {
00480 mfcc_t **cep;
00481 int rv;
00482
00483
00484 fe_process_frames(fe, NULL, &nsamps, NULL, nframes);
00485
00486 if (*nframes)
00487 cep = (mfcc_t **)ckd_calloc_2d(*nframes, fe->feature_dimension, sizeof(**cep));
00488 else
00489 cep = (mfcc_t **)ckd_calloc_2d(1, fe->feature_dimension, sizeof(**cep));
00490
00491 rv = fe_process_frames(fe, &spch, &nsamps, cep, nframes);
00492 *cep_block = cep;
00493
00494 return rv;
00495 }
00496
00497
00498 int32
00499 fe_end_utt(fe_t * fe, mfcc_t * cepvector, int32 * nframes)
00500 {
00501 int32 frame_count;
00502
00503
00504 if (fe->num_overflow_samps > 0) {
00505 fe_read_frame(fe, fe->overflow_samps, fe->num_overflow_samps);
00506 fe_write_frame(fe, cepvector);
00507 frame_count = 1;
00508 }
00509 else {
00510 frame_count = 0;
00511 }
00512
00513
00514 fe->num_overflow_samps = 0;
00515 fe->start_flag = 0;
00516
00517 *nframes = frame_count;
00518 return 0;
00519 }
00520
00521 fe_t *
00522 fe_retain(fe_t *fe)
00523 {
00524 ++fe->refcount;
00525 return fe;
00526 }
00527
00528 int
00529 fe_free(fe_t * fe)
00530 {
00531 if (fe == NULL)
00532 return 0;
00533 if (--fe->refcount > 0)
00534 return fe->refcount;
00535
00536
00537 fe_free_2d((void *) fe->mel_fb->mel_cosine);
00538 if (fe->mel_fb->lifter)
00539 free(fe->mel_fb->lifter);
00540 ckd_free(fe->mel_fb->spec_start);
00541 ckd_free(fe->mel_fb->filt_start);
00542 ckd_free(fe->mel_fb->filt_width);
00543 ckd_free(fe->mel_fb->filt_coeffs);
00544 ckd_free(fe->mel_fb);
00545 ckd_free(fe->spch);
00546 ckd_free(fe->frame);
00547 ckd_free(fe->ccc);
00548 ckd_free(fe->sss);
00549 ckd_free(fe->spec);
00550 ckd_free(fe->mfspec);
00551 ckd_free(fe->overflow_samps);
00552 ckd_free(fe->hamming_window);
00553 cmd_ln_free_r(fe->config);
00554 ckd_free(fe);
00555
00556 return 0;
00557 }
00558
00562 int32
00563 fe_mfcc_to_float(fe_t * fe,
00564 mfcc_t ** input, float32 ** output, int32 nframes)
00565 {
00566 int32 i;
00567
00568 #ifndef FIXED_POINT
00569 if ((void *) input == (void *) output)
00570 return nframes * fe->feature_dimension;
00571 #endif
00572 for (i = 0; i < nframes * fe->feature_dimension; ++i)
00573 output[0][i] = MFCC2FLOAT(input[0][i]);
00574
00575 return i;
00576 }
00577
00581 int32
00582 fe_float_to_mfcc(fe_t * fe,
00583 float32 ** input, mfcc_t ** output, int32 nframes)
00584 {
00585 int32 i;
00586
00587 #ifndef FIXED_POINT
00588 if ((void *) input == (void *) output)
00589 return nframes * fe->feature_dimension;
00590 #endif
00591 for (i = 0; i < nframes * fe->feature_dimension; ++i)
00592 output[0][i] = FLOAT2MFCC(input[0][i]);
00593
00594 return i;
00595 }
00596
00597 int32
00598 fe_logspec_to_mfcc(fe_t * fe, const mfcc_t * fr_spec, mfcc_t * fr_cep)
00599 {
00600 #ifdef FIXED_POINT
00601 fe_spec2cep(fe, fr_spec, fr_cep);
00602 #else
00603 powspec_t *powspec;
00604 int32 i;
00605
00606 powspec = ckd_malloc(fe->mel_fb->num_filters * sizeof(powspec_t));
00607 for (i = 0; i < fe->mel_fb->num_filters; ++i)
00608 powspec[i] = (powspec_t) fr_spec[i];
00609 fe_spec2cep(fe, powspec, fr_cep);
00610 ckd_free(powspec);
00611 #endif
00612 return 0;
00613 }
00614
00615 int32
00616 fe_logspec_dct2(fe_t * fe, const mfcc_t * fr_spec, mfcc_t * fr_cep)
00617 {
00618 #ifdef FIXED_POINT
00619 fe_dct2(fe, fr_spec, fr_cep, 0);
00620 #else
00621 powspec_t *powspec;
00622 int32 i;
00623
00624 powspec = ckd_malloc(fe->mel_fb->num_filters * sizeof(powspec_t));
00625 for (i = 0; i < fe->mel_fb->num_filters; ++i)
00626 powspec[i] = (powspec_t) fr_spec[i];
00627 fe_dct2(fe, powspec, fr_cep, 0);
00628 ckd_free(powspec);
00629 #endif
00630 return 0;
00631 }
00632
00633 int32
00634 fe_mfcc_dct3(fe_t * fe, const mfcc_t * fr_cep, mfcc_t * fr_spec)
00635 {
00636 #ifdef FIXED_POINT
00637 fe_dct3(fe, fr_cep, fr_spec);
00638 #else
00639 powspec_t *powspec;
00640 int32 i;
00641
00642 powspec = ckd_malloc(fe->mel_fb->num_filters * sizeof(powspec_t));
00643 fe_dct3(fe, fr_cep, powspec);
00644 for (i = 0; i < fe->mel_fb->num_filters; ++i)
00645 fr_spec[i] = (mfcc_t) powspec[i];
00646 ckd_free(powspec);
00647 #endif
00648 return 0;
00649 }