00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
/**
 * Compatibility shim: when WIN32 is defined and GNUWINCE is not, map
 * srand48()/lrand48() onto the standard C srand()/rand().
 *
 * These macros sit ahead of the include guard, so they are re-issued on
 * every inclusion; the replacement text is identical each time.
 *
 * NOTE(review): presumably needed because the drand48 family is missing on
 * that toolchain; rand() may have a narrower range than lrand48() — confirm
 * that callers do not rely on the full lrand48() range.
 */
00051 #if defined(WIN32) && !defined(GNUWINCE)
00052 #define srand48(x) srand(x)
00053 #define lrand48() rand()
00054 #endif
00055
00056 #ifndef _NEW_FE_H_
00057 #define _NEW_FE_H_
00058
00059
00060 #include <sphinxbase_export.h>
00061
00062 #include <sphinx_config.h>
00063 #include <cmd_ln.h>
00064 #include <fixpoint.h>
00065
/* When compiled as C++, open a C-linkage block around the declarations that
 * follow; it is closed by the matching __cplusplus section at the end of
 * this header. */
00066 #ifdef __cplusplus
00067 extern "C" {
00068 #endif
/* The brace inside "#if 0" is never compiled. NOTE(review): presumably it
 * only balances the brace above so editors do not indent the whole file. */
00069 #if 0
00070
00071 }
00072 #endif
00073
/**
 * Byte order of the build target as a string: "big" when WORDS_BIGENDIAN is
 * defined, "little" otherwise. Used as the default value of the
 * "-input_endian" option in waveform_to_cepstral_command_line_macro().
 *
 * NOTE(review): WORDS_BIGENDIAN is not defined in this file; presumably it
 * comes from sphinx_config.h — confirm.
 */
00074 #ifdef WORDS_BIGENDIAN
00075 #define NATIVE_ENDIAN "big"
00076 #else
00077 #define NATIVE_ENDIAN "little"
00078 #endif
00079
/*
 * Default front-end parameters.
 *
 * Most of these are passed through ARG_STRINGIFY() in
 * waveform_to_cepstral_command_line_macro() to build the textual default of
 * an option. NOTE(review): ARG_STRINGIFY is defined outside this file and
 * presumably turns the macro value into a string literal; if so, wrapping a
 * value in parentheses or adding a suffix would change the default string —
 * confirm before editing any value's spelling.
 */

/** Default for "-samprate" (sampling rate). NOTE(review): presumably Hz. */
00081 #define DEFAULT_SAMPLING_RATE 16000
00082
/** Default for "-frate" (frame rate). NOTE(review): presumably frames/second. */
00083 #define DEFAULT_FRAME_RATE 100
00084
/** Not referenced by the option table in this file. Equals
 *  DEFAULT_SAMPLING_RATE / DEFAULT_FRAME_RATE (16000 / 100).
 *  NOTE(review): presumably samples between successive frames — confirm. */
00086 #define DEFAULT_FRAME_SHIFT 160
00087
/** Default for "-wlen" (Hamming window length). NOTE(review): presumably seconds. */
00088 #define DEFAULT_WINDOW_LENGTH 0.025625
00089
/** Default for "-nfft" (size of FFT). */
00090 #define DEFAULT_FFT_SIZE 512
00091
/** Default for "-ncep" (number of cepstral coefficients). */
00092 #define DEFAULT_NUM_CEPSTRA 13
00093
/** Default for "-nfilt" (number of filter banks). */
00094 #define DEFAULT_NUM_FILTERS 40
00095
/** Default for "-lowerf" (lower edge of filters). NOTE(review): presumably Hz. */
00096 #define DEFAULT_LOWER_FILT_FREQ 133.33334
00097
/** Default for "-upperf" (upper edge of filters). NOTE(review): presumably Hz. */
00098 #define DEFAULT_UPPER_FILT_FREQ 6855.4976
00099
/** Default for "-alpha" (preemphasis parameter). */
00100 #define DEFAULT_PRE_EMPHASIS_ALPHA 0.97
00101
/** Default for "-warp_type" (warping function type); used directly as a
 *  string literal, not stringified. */
00102 #define DEFAULT_WARP_TYPE "inverse_linear"
00103
/** Default for "-seed". The option's help text says a value below zero
 *  means the seed is picked automatically. */
00104 #define SEED -1
00105
/**
 * Expands to a comma-separated list of brace-enclosed option descriptors for
 * the front end. Each entry has four fields, in order: option name, an ARG_*
 * type tag, the default value as a string (or NULL), and the help text.
 *
 * The final entry ("-verbose") has no trailing comma, so the expansion can be
 * followed by further entries or a terminator supplied by the user of the
 * macro. NOTE(review): presumably intended for use inside an arg_t array
 * initializer (arg_t and the ARG_* tags come from outside this file) —
 * confirm.
 *
 * Numeric defaults are produced with ARG_STRINGIFY() from the DEFAULT_* and
 * SEED macros; "-input_endian" defaults to NATIVE_ENDIAN, "-warp_type" to
 * DEFAULT_WARP_TYPE, and "-warp_params" has a NULL default.
 */
00106 #define waveform_to_cepstral_command_line_macro() \
00107 { "-logspec", \
00108 ARG_BOOLEAN, \
00109 "no", \
00110 "Write out logspectral files instead of cepstra" }, \
00111 \
00112 { "-smoothspec", \
00113 ARG_BOOLEAN, \
00114 "no", \
00115 "Write out cepstral-smoothed logspectral files" }, \
00116 \
00117 { "-transform", \
00118 ARG_STRING, \
00119 "legacy", \
00120 "Which type of transform to use to calculate cepstra (legacy, dct, or htk)" }, \
00121 \
00122 { "-alpha", \
00123 ARG_FLOAT32, \
00124 ARG_STRINGIFY(DEFAULT_PRE_EMPHASIS_ALPHA), \
00125 "Preemphasis parameter" }, \
00126 \
00127 { "-samprate", \
00128 ARG_FLOAT32, \
00129 ARG_STRINGIFY(DEFAULT_SAMPLING_RATE), \
00130 "Sampling rate" }, \
00131 \
00132 { "-frate", \
00133 ARG_INT32, \
00134 ARG_STRINGIFY(DEFAULT_FRAME_RATE), \
00135 "Frame rate" }, \
00136 \
00137 { "-wlen", \
00138 ARG_FLOAT32, \
00139 ARG_STRINGIFY(DEFAULT_WINDOW_LENGTH), \
00140 "Hamming window length" }, \
00141 \
00142 { "-nfft", \
00143 ARG_INT32, \
00144 ARG_STRINGIFY(DEFAULT_FFT_SIZE), \
00145 "Size of FFT" }, \
00146 \
00147 { "-nfilt", \
00148 ARG_INT32, \
00149 ARG_STRINGIFY(DEFAULT_NUM_FILTERS), \
00150 "Number of filter banks" }, \
00151 \
00152 { "-lowerf", \
00153 ARG_FLOAT32, \
00154 ARG_STRINGIFY(DEFAULT_LOWER_FILT_FREQ), \
00155 "Lower edge of filters" }, \
00156 \
00157 { "-upperf", \
00158 ARG_FLOAT32, \
00159 ARG_STRINGIFY(DEFAULT_UPPER_FILT_FREQ), \
00160 "Upper edge of filters" }, \
00161 \
00162 { "-unit_area", \
00163 ARG_BOOLEAN, \
00164 "yes", \
00165 "Normalize mel filters to unit area" }, \
00166 \
00167 { "-round_filters", \
00168 ARG_BOOLEAN, \
00169 "yes", \
00170 "Round mel filter frequencies to DFT points" }, \
00171 \
00172 { "-ncep", \
00173 ARG_INT32, \
00174 ARG_STRINGIFY(DEFAULT_NUM_CEPSTRA), \
00175 "Number of cep coefficients" }, \
00176 \
00177 { "-doublebw", \
00178 ARG_BOOLEAN, \
00179 "no", \
00180 "Use double bandwidth filters (same center freq)" }, \
00181 \
00182 { "-lifter", \
00183 ARG_INT32, \
00184 "0", \
00185 "Length of sin-curve for liftering, or 0 for no liftering." }, \
00186 \
00187 { "-input_endian", \
00188 ARG_STRING, \
00189 NATIVE_ENDIAN, \
00190 "Endianness of input data, big or little, ignored if NIST or MS Wav" }, \
00191 \
00192 { "-warp_type", \
00193 ARG_STRING, \
00194 DEFAULT_WARP_TYPE, \
00195 "Warping function type (or shape)" }, \
00196 \
00197 { "-warp_params", \
00198 ARG_STRING, \
00199 NULL, \
00200 "Parameters defining the warping function" }, \
00201 \
00202 { "-dither", \
00203 ARG_BOOLEAN, \
00204 "no", \
00205 "Add 1/2-bit noise" }, \
00206 \
00207 { "-seed", \
00208 ARG_INT32, \
00209 ARG_STRINGIFY(SEED), \
00210 "Seed for random number generator; if less than zero, pick our own" }, \
00211 \
00212 { "-remove_dc", \
00213 ARG_BOOLEAN, \
00214 "no", \
00215 "Remove DC offset from each frame" }, \
00216 \
00217 { "-verbose", \
00218 ARG_BOOLEAN, \
00219 "no", \
00220 "Show input filenames" } \
00221
00222
/*
 * Feature value type and its arithmetic helpers, selected at compile time by
 * FIXED_POINT. Both branches define the same five names (mfcc_t, FLOAT2MFCC,
 * MFCC2FLOAT, MFCCMUL, MFCCLN), so code written against them builds either way.
 */
00223 #ifdef FIXED_POINT
00224
/** Fixed-point build: feature values are fixed32 (declared outside this
 *  file; presumably in fixpoint.h). */
00225 typedef fixed32 mfcc_t;
00226
/** Convert a floating-point value to mfcc_t via FLOAT2FIX(). */
00228 #define FLOAT2MFCC(x) FLOAT2FIX(x)
00229
/** Convert an mfcc_t value to floating point via FIX2FLOAT(). */
00230 #define MFCC2FLOAT(x) FIX2FLOAT(x)
00231
/** Multiply two mfcc_t values via FIXMUL(). */
00232 #define MFCCMUL(a,b) FIXMUL(a,b)
/** Logarithm via FIXLN_ANY(x,in,out). NOTE(review): the meaning of in/out
 *  is defined by FIXLN_ANY, which is not visible here — presumably
 *  input/output radix; confirm. */
00233 #define MFCCLN(x,in,out) FIXLN_ANY(x,in,out)
00234 #else
00235
/** Floating-point build: feature values are float32. */
00237 typedef float32 mfcc_t;
/** Identity: a float is already an mfcc_t in this build. */
00239 #define FLOAT2MFCC(x) (x)
00240
/** Identity: an mfcc_t is already a float in this build. */
00241 #define MFCC2FLOAT(x) (x)
00242
/** Plain multiplication; both operands are parenthesised. */
00243 #define MFCCMUL(a,b) ((a)*(b))
/** Expands to log(x); the in and out arguments are ignored. No visible
 *  include in this header declares log(), so the including file must make
 *  <math.h> available. */
00244 #define MFCCLN(x,in,out) log(x)
00245 #endif
00246
/**
 * Front-end object handle. struct fe_s is only forward-declared here, so
 * within this header the type is opaque and is used through pointers by the
 * fe_*() functions declared below.
 */
00250 typedef struct fe_s fe_t;
00251
/**
 * Status codes for the front end. Success is 0 and every error value is
 * negative. FE_SUCCESS and FE_OUTPUT_FILE_SUCCESS deliberately share the
 * value 0, so they compare equal.
 *
 * NOTE(review): which functions return which code is not visible in this
 * header; the int-returning fe_*() functions presumably use these values —
 * confirm against the implementation.
 */
00255 enum fe_error_e {
00256 FE_SUCCESS = 0,
00257 FE_OUTPUT_FILE_SUCCESS = 0,
00258 FE_CONTROL_FILE_ERROR = -1,
00259 FE_START_ERROR = -2,
00260 FE_UNKNOWN_SINGLE_OR_BATCH = -3,
00261 FE_INPUT_FILE_OPEN_ERROR = -4,
00262 FE_INPUT_FILE_READ_ERROR = -5,
00263 FE_MEM_ALLOC_ERROR = -6,
00264 FE_OUTPUT_FILE_WRITE_ERROR = -7,
00265 FE_OUTPUT_FILE_OPEN_ERROR = -8,
00266 FE_ZERO_ENERGY_ERROR = -9,
00267 FE_INVALID_PARAM_ERROR = -10
00268 };
00269
/**
 * Obtain a front-end object without passing a configuration explicitly.
 *
 * NOTE(review): the source of the configuration is not visible in this
 * header — presumably global command-line state; confirm.
 *
 * @return Pointer to an fe_t. NOTE(review): the failure value (presumably
 *         NULL) and how the object is released are not shown here — confirm.
 */
00277 SPHINXBASE_EXPORT
00278 fe_t* fe_init_auto(void);
00279
/**
 * Return a pointer to read-only arg_t data.
 *
 * NOTE(review): presumably the option table built from
 * waveform_to_cepstral_command_line_macro(); confirm in the implementation.
 *
 * @return Pointer to const arg_t; the pointee must not be modified.
 */
00287 SPHINXBASE_EXPORT
00288 arg_t const *fe_get_args(void);
00289
/**
 * Obtain a front-end object from an explicit configuration object.
 *
 * @param config Configuration (cmd_ln_t, declared outside this file).
 *               NOTE(review): whether the front end takes ownership of
 *               config is not visible here — confirm before freeing it.
 * @return Pointer to an fe_t. NOTE(review): the failure value (presumably
 *         NULL) is not shown here — confirm.
 */
00300 SPHINXBASE_EXPORT
00301 fe_t *fe_init_auto_r(cmd_ln_t *config);
00302
/**
 * Return a configuration object associated with a front end.
 *
 * @param fe Front-end object.
 * @return Pointer to a cmd_ln_t. NOTE(review): presumably the object given
 *         to fe_init_auto_r() and still owned by fe — confirm before freeing.
 */
00310 SPHINXBASE_EXPORT
00311 cmd_ln_t *fe_get_config(fe_t *fe);
00312
/**
 * Begin an utterance on the given front end.
 *
 * @param fe Front-end object.
 * @return int status. NOTE(review): presumably 0 on success and a negative
 *         enum fe_error_e value on failure — confirm.
 */
00317 SPHINXBASE_EXPORT
00318 int fe_start_utt(fe_t *fe);
00319
/**
 * Query the output size of the front end.
 *
 * @param fe Front-end object.
 * @return int size. NOTE(review): presumably the number of values in one
 *         output frame (which may depend on options such as "-ncep" or
 *         "-logspec") — confirm.
 */
00332 SPHINXBASE_EXPORT
00333 int fe_get_output_size(fe_t *fe);
00334
/**
 * Query the input framing of the front end; results are returned through
 * the two output pointers.
 *
 * @param fe              Front-end object.
 * @param out_frame_shift Receives the frame shift. NOTE(review): presumably
 *                        in samples — confirm.
 * @param out_frame_size  Receives the frame size. NOTE(review): presumably
 *                        in samples — confirm.
 *
 * NOTE(review): whether either output pointer may be NULL is not visible
 * here — confirm.
 */
00347 SPHINXBASE_EXPORT
00348 void fe_get_input_size(fe_t *fe, int *out_frame_shift,
00349 int *out_frame_size);
00350
/**
 * Finish the current utterance.
 *
 * @param fe            Front-end object.
 * @param out_cepvector Caller-supplied buffer of mfcc_t. NOTE(review):
 *                      presumably receives a final frame, if any, and must
 *                      hold fe_get_output_size() values — confirm.
 * @param out_nframes   Receives a frame count. NOTE(review): presumably the
 *                      number of frames written to out_cepvector — confirm.
 * @return int status. NOTE(review): presumably 0 on success and a negative
 *         enum fe_error_e value on failure — confirm.
 */
00365 SPHINXBASE_EXPORT
00366 int fe_end_utt(fe_t *fe, mfcc_t *out_cepvector, int32 *out_nframes);
00367
/**
 * Retain a front-end object.
 *
 * NOTE(review): presumably increments a reference count that fe_free()
 * decrements — confirm in the implementation.
 *
 * @param fe Front-end object.
 * @return Pointer to an fe_t. NOTE(review): presumably the same pointer
 *         that was passed in — confirm.
 */
00373 SPHINXBASE_EXPORT
00374 fe_t *fe_retain(fe_t *fe);
00375
/**
 * Release a front-end object.
 *
 * @param fe Front-end object. NOTE(review): whether NULL is accepted is not
 *           visible here — confirm.
 * @return int. NOTE(review): the meaning is not visible here; presumably the
 *         remaining reference count (see fe_retain()) — confirm.
 */
00383 SPHINXBASE_EXPORT
00384 int fe_free(fe_t *fe);
00385
/**
 * Process one frame of 16-bit samples into one output vector.
 *
 * @param fe      Front-end object.
 * @param spch    Input samples; read-only (pointer to const int16).
 * @param nsamps  Number of samples available at spch.
 * @param out_cep Caller-supplied output buffer of mfcc_t. NOTE(review):
 *                presumably must hold fe_get_output_size() values — confirm.
 * @return int status. NOTE(review): presumably 0 on success and a negative
 *         enum fe_error_e value on failure — confirm.
 */
00394 SPHINXBASE_EXPORT
00395 int fe_process_frame(fe_t *fe, int16 const *spch,
00396 int32 nsamps, mfcc_t *out_cep);
00397
/**
 * Process a block of 16-bit samples into zero or more output frames.
 *
 * The inout_* parameters are passed by pointer. NOTE(review): the names
 * suggest each is read on entry and updated on return (input position,
 * samples remaining, frames produced) — confirm the exact contract.
 *
 * @param fe            Front-end object.
 * @param inout_spch    Pointer to the input sample pointer; the samples
 *                      themselves are const.
 * @param inout_nsamps  Pointer to the sample count.
 * @param buf_cep       Array of output frame buffers. NOTE(review):
 *                      presumably caller-allocated with room for
 *                      *inout_nframes frames — confirm.
 * @param inout_nframes Pointer to the frame count.
 * @return int status. NOTE(review): presumably 0 on success and a negative
 *         enum fe_error_e value on failure — confirm.
 */
00445 SPHINXBASE_EXPORT
00446 int fe_process_frames(fe_t *fe,
00447 int16 const **inout_spch,
00448 size_t *inout_nsamps,
00449 mfcc_t **buf_cep,
00450 int32 *inout_nframes);
00451
/**
 * Process a block of 16-bit samples and return the result as a block of
 * frames through cep_block.
 *
 * @param fe        Front-end object.
 * @param spch      Input samples; read-only (pointer to const int16).
 * @param nsamps    Number of samples at spch.
 * @param cep_block Receives a pointer to a 2-D mfcc_t array. NOTE(review):
 *                  presumably allocated by this function and released with
 *                  fe_free_2d() — confirm.
 * @param nframes   Receives a frame count. NOTE(review): presumably the
 *                  number of rows in *cep_block — confirm.
 * @return int status. NOTE(review): presumably 0 on success and a negative
 *         enum fe_error_e value on failure — confirm.
 */
00467 SPHINXBASE_EXPORT
00468 int fe_process_utt(fe_t *fe,
00469 int16 const *spch,
00470 size_t nsamps,
00471 mfcc_t ***cep_block,
00472 int32 *nframes
00473 );
00474
/**
 * Free a 2-D array.
 *
 * @param arr Array to release, passed as an untyped pointer. NOTE(review):
 *            presumably a block obtained from fe_process_utt(); whether
 *            NULL is accepted is not visible here — confirm.
 */
00478 SPHINXBASE_EXPORT
00479 void fe_free_2d(void *arr);
00480
/**
 * Convert frames of mfcc_t values to float32.
 *
 * @param fe      Front-end object.
 * @param input   Source frames (mfcc_t).
 * @param output  Destination frames (float32). NOTE(review): whether output
 *                may alias input is not visible here — confirm.
 * @param nframes Number of frames to convert.
 * @return int. NOTE(review): the meaning is not visible here; presumably a
 *         count of converted values or a status — confirm.
 */
00484 SPHINXBASE_EXPORT
00485 int fe_mfcc_to_float(fe_t *fe,
00486 mfcc_t **input,
00487 float32 **output,
00488 int32 nframes);
00489
/**
 * Convert frames of float32 values to mfcc_t.
 *
 * @param fe      Front-end object.
 * @param input   Source frames (float32).
 * @param output  Destination frames (mfcc_t). NOTE(review): whether output
 *                may alias input is not visible here — confirm.
 * @param nframes Number of frames to convert.
 * @return int. NOTE(review): the meaning is not visible here; presumably a
 *         count of converted values or a status — confirm.
 */
00493 SPHINXBASE_EXPORT
00494 int fe_float_to_mfcc(fe_t *fe,
00495 float32 **input,
00496 mfcc_t **output,
00497 int32 nframes);
00498
/**
 * Convert one frame of log-spectrum values to cepstral values.
 *
 * @param fe      Front-end object.
 * @param fr_spec Input frame; read-only. NOTE(review): presumably one value
 *                per mel filter ("-nfilt") — confirm.
 * @param fr_cep  Output frame. NOTE(review): presumably one value per
 *                cepstral coefficient ("-ncep") — confirm.
 * @return int status. NOTE(review): presumably 0 on success — confirm.
 *
 * NOTE(review): presumably uses the transform selected by "-transform",
 * unlike fe_logspec_dct2() — confirm.
 */
00522 SPHINXBASE_EXPORT
00523 int fe_logspec_to_mfcc(fe_t *fe,
00524 const mfcc_t *fr_spec,
00525 mfcc_t *fr_cep
00526 );
00527
/**
 * Convert one frame of log-spectrum values to cepstral values.
 *
 * NOTE(review): the name suggests a type-II DCT is always used here —
 * confirm in the implementation.
 *
 * @param fe      Front-end object.
 * @param fr_spec Input frame; read-only.
 * @param fr_cep  Output frame.
 * @return int status. NOTE(review): presumably 0 on success — confirm.
 */
00536 SPHINXBASE_EXPORT
00537 int fe_logspec_dct2(fe_t *fe,
00538 const mfcc_t *fr_spec,
00539 mfcc_t *fr_cep
00540 );
00541
/**
 * Convert one frame of cepstral values to log-spectrum values; the argument
 * order is the reverse of fe_logspec_dct2().
 *
 * NOTE(review): the name suggests a type-III DCT (the inverse of a type-II
 * DCT up to scaling) — confirm in the implementation.
 *
 * @param fe      Front-end object.
 * @param fr_cep  Input frame; read-only.
 * @param fr_spec Output frame.
 * @return int status. NOTE(review): presumably 0 on success — confirm.
 */
00550 SPHINXBASE_EXPORT
00551 int fe_mfcc_dct3(fe_t *fe,
00552 const mfcc_t *fr_cep,
00553 mfcc_t *fr_spec
00554 );
00555
/* Close the extern "C" block opened near the top of this header (C++ only). */
00556 #ifdef __cplusplus
00557 }
00558 #endif
00559
00560
00561 #endif