SphinxBase
0.6
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
 * Copyright (c) 2008 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * This work was supported in part by funding from the Defense Advanced
 * Research Projects Agency and the National Science Foundation of the
 * United States of America, and the CMU Sphinx Speech Consortium.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */

#include <sphinxbase/logmath.h>
#include <sphinxbase/ngram_model.h>
#include <sphinxbase/cmd_ln.h>
#include <sphinxbase/ckd_alloc.h>
#include <sphinxbase/err.h>
#include <sphinxbase/pio.h>
#include <sphinxbase/strfuncs.h>

#include <stdio.h>
#include <string.h>
#include <math.h>

static const arg_t defn[] = {
    { "-help",
      ARG_BOOLEAN,
      "no",
      "Shows the usage of the tool"},

    { "-logbase",
      ARG_FLOAT64,
      "1.0001",
      "Base in which all log-likelihoods are calculated" },

    { "-lm",
      ARG_STRING,
      NULL,
      "Language model file"},

    { "-probdef",
      ARG_STRING,
      NULL,
      "Probability definition file for classes in LM"},

    { "-lmctlfn",
      ARG_STRING,
      NULL,
      "Control file listing a set of language models"},

    { "-lmname",
      ARG_STRING,
      NULL,
      "Name of language model in -lmctlfn to use for all utterances" },

    { "-lsn",
      ARG_STRING,
      NULL,
      "Transcription file to evaluate"},

    { "-text",
      ARG_STRING,
      NULL,
      "Text string to evaluate"},

    { "-mmap",
      ARG_BOOLEAN,
      "no",
      "Use memory-mapped I/O for reading binary LM files"},

    { "-lw",
      ARG_FLOAT32,
      "1.0",
      "Language model weight" },

    { "-wip",
      ARG_FLOAT32,
      "1.0",
      "Word insertion probability" },

    { "-uw",
      ARG_FLOAT32,
      "1.0",
      "Unigram probability weight (interpolated with uniform distribution)"},

    { "-verbose",
      ARG_BOOLEAN,
      "no",
      "Print details of perplexity calculation" },

    /* FIXME: Support -lmstartsym, -lmendsym, -lmctlfn, -ctl_lm */
    { NULL, 0, NULL, NULL }
};

static int verbose;

static int
calc_entropy(ngram_model_t *lm, char **words, int32 n,
             int32 *out_n_ccs, int32 *out_n_oovs, int32 *out_lm_score)
{
    int32 *wids;
    int32 startwid;
    int32 i, ch, nccs, noovs, unk;

    if (n == 0)
        return 0;

    unk = ngram_unknown_wid(lm);

    /* Convert the words into an array of word IDs, in reverse order. */
    wids = ckd_calloc(n, sizeof(*wids));
    for (i = 0; i < n; ++i)
        wids[n-i-1] = ngram_wid(lm, words[i]);
    /* Find the ID of <s>, which is skipped below as a context cue
     * (HACK, this should be configurable). */
    startwid = ngram_wid(lm, "<s>");

    /* Now evaluate the list of words in reverse using the
     * remainder of the array as the history. */
    ch = noovs = nccs = 0;
    for (i = 0; i < n; ++i) {
        int32 n_used;
        int32 prob;

        /* Skip <s> as it's a context cue (HACK, this should be configurable). */
        if (wids[i] == startwid) {
            ++nccs;
            continue;
        }
        /* Skip and count OOVs. */
        if (wids[i] == NGRAM_INVALID_WID || wids[i] == unk) {
            ++noovs;
            continue;
        }
        /* Sum up information for each N-gram */
        prob = ngram_ng_score(lm,
                              wids[i], wids + i + 1,
                              n - i - 1, &n_used);
        if (verbose) {
            int m;
            printf("log P(%s|", ngram_word(lm, wids[i]));
            m = i + ngram_model_get_size(lm) - 1;
            if (m >= n)
                m = n - 1;
            while (m > i) {
                printf("%s ", ngram_word(lm, wids[m--]));
            }
            printf(") = %d\n", prob);
        }
        ch -= prob;
    }
    ckd_free(wids);

    if (out_n_ccs) *out_n_ccs = nccs;
    if (out_n_oovs) *out_n_oovs = noovs;

    /* Calculate cross-entropy CH = - 1/N sum log P(W|H) */
    n -= (nccs + noovs);
    if (n <= 0)
        return 0;
    if (out_lm_score)
        *out_lm_score = -ch;
    return ch / n;
}

static void
evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn)
{
    FILE *fh;
    lineiter_t *litor;
    int32 nccs, noovs, nwords, lscr;
    float64 ch, log_to_log2;

    if ((fh = fopen(lsnfn, "r")) == NULL)
        E_FATAL_SYSTEM("failed to open transcript file %s", lsnfn);

    /* We have to keep ch in floating-point to avoid overflows, so
     * we might as well use log2. */
    log_to_log2 = log(logmath_get_base(lmath)) / log(2);
    lscr = nccs = noovs = nwords = 0;
    ch = 0.0;
    for (litor = lineiter_start(fh); litor; litor = lineiter_next(litor)) {
        char **words;
        int32 n, tmp_ch, tmp_noovs, tmp_nccs, tmp_lscr;

        n = str2words(litor->buf, NULL, 0);
        if (n < 0)
            E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n);
        if (n == 0) /* Do nothing! */
            continue;
        words = ckd_calloc(n, sizeof(*words));
        str2words(litor->buf, words, n);

        /* Remove any utterance ID (FIXME: has to be a single "word") */
        if (words[n-1][0] == '('
            && words[n-1][strlen(words[n-1])-1] == ')')
            n = n - 1;

        tmp_ch = calc_entropy(lm, words, n, &tmp_nccs,
                              &tmp_noovs, &tmp_lscr);

        ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2;
        nccs += tmp_nccs;
        noovs += tmp_noovs;
        lscr += tmp_lscr;
        nwords += n;

        ckd_free(words);
    }
    fclose(fh);

    ch /= (nwords - nccs - noovs);
    printf("cross-entropy: %f bits\n", ch);

    /* Calculate perplexity pplx = exp CH */
    printf("perplexity: %f\n", pow(2.0, ch));
    printf("lm score: %d\n", lscr);

    /* Report OOVs and CCs */
    printf("%d words evaluated\n", nwords);
    printf("%d OOVs (%.2f%%), %d context cues removed\n",
           noovs, (double)noovs / nwords * 100, nccs);
}

static void
evaluate_string(ngram_model_t *lm, logmath_t *lmath, const char *text)
{
    char *textfoo;
    char **words;
    int32 n, ch, noovs, nccs, lscr;

    /* Split it into an array of strings. */
    textfoo = ckd_salloc(text);
    n = str2words(textfoo, NULL, 0);
    if (n < 0)
        E_FATAL("str2words(textfoo, NULL, 0) = %d, should not happen\n", n);
    if (n == 0) { /* Do nothing! */
        ckd_free(textfoo);
        return;
    }
    words = ckd_calloc(n, sizeof(*words));
    str2words(textfoo, words, n);

    ch = calc_entropy(lm, words, n, &nccs, &noovs, &lscr);

    printf("input: %s\n", text);
    printf("cross-entropy: %f bits\n",
           ch * log(logmath_get_base(lmath)) / log(2));

    /* Calculate perplexity pplx = exp CH */
    printf("perplexity: %f\n", logmath_exp(lmath, ch));
    printf("lm score: %d\n", lscr);

    /* Report OOVs and CCs */
    printf("%d words evaluated\n", n);
    printf("%d OOVs, %d context cues removed\n",
           noovs, nccs);

    ckd_free(textfoo);
    ckd_free(words);
}

int
main(int argc, char *argv[])
{
    cmd_ln_t *config;
    ngram_model_t *lm = NULL;
    logmath_t *lmath;
    const char *lmfn, *probdefn, *lsnfn, *text;

    if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
        return 1;

    verbose = cmd_ln_boolean_r(config, "-verbose");

    /* Create log math object. */
    if ((lmath = logmath_init
         (cmd_ln_float64_r(config, "-logbase"), 0, 0)) == NULL) {
        E_FATAL("Failed to initialize log math\n");
    }

    /* Load the language model. */
    lmfn = cmd_ln_str_r(config, "-lm");
    if (lmfn == NULL
        || (lm = ngram_model_read(config, lmfn,
                                  NGRAM_AUTO, lmath)) == NULL) {
        E_FATAL("Failed to load language model from %s\n",
                cmd_ln_str_r(config, "-lm"));
    }
    if ((probdefn = cmd_ln_str_r(config, "-probdef")) != NULL)
        ngram_model_read_classdef(lm, probdefn);
    ngram_model_apply_weights(lm,
                              cmd_ln_float32_r(config, "-lw"),
                              cmd_ln_float32_r(config, "-wip"),
                              cmd_ln_float32_r(config, "-uw"));

    /* Now evaluate some text. */
    lsnfn = cmd_ln_str_r(config, "-lsn");
    text = cmd_ln_str_r(config, "-text");
    if (lsnfn) {
        evaluate_file(lm, lmath, lsnfn);
    }
    else if (text) {
        evaluate_string(lm, lmath, text);
    }

    /* Release everything before exiting. */
    ngram_model_free(lm);
    logmath_free(lmath);
    cmd_ln_free_r(config);
    return 0;
}
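
Assuming this page documents the sphinx_lm_eval tool (the binary name does not appear in the listing itself), typical invocations built from the flags in the defn[] table above might look like the following; the file names and weight values are placeholders:

    sphinx_lm_eval -lm model.lm -text "hello world"
    sphinx_lm_eval -lm model.lm -lsn transcripts.txt -lw 9.5 -wip 0.7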
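
The short comments in evaluate_file compress the actual computation. Spelled out, with b the -logbase value (default 1.0001) and N the word count after removing context cues and OOVs, the printed figures amount to:

    CH = -(log_2 b / N) * sum_{i=1..N} log_b P(w_i | h_i)   [bits],
    pplx = 2^CH

The log_to_log2 factor in the code is exactly the log_2 b term: it converts logmath units (integer logs in base b) into bits, so that pow(2.0, ch) yields the perplexity.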
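
For readers who want to score text programmatically rather than through this tool, here is a minimal sketch using the same SphinxBase calls. The model path model.lm is a placeholder and error handling is reduced to the bare minimum; this is an illustration under those assumptions, not part of the tool above.

    #include <stdio.h>

    #include <sphinxbase/logmath.h>
    #include <sphinxbase/ngram_model.h>

    int
    main(int argc, char *argv[])
    {
        logmath_t *lmath;
        ngram_model_t *lm;
        int32 score;

        /* Same defaults the tool uses: log base 1.0001, no shift, no add table. */
        lmath = logmath_init(1.0001, 0, 0);
        /* "model.lm" is a placeholder path; NGRAM_AUTO detects ARPA vs. binary. */
        lm = ngram_model_read(NULL, "model.lm", NGRAM_AUTO, lmath);
        if (lm == NULL) {
            logmath_free(lmath);
            return 1;
        }
        /* log P(world | hello) in logmath units; ngram_score() takes the
         * history words in reverse order, terminated by NULL. */
        score = ngram_score(lm, "world", "hello", NULL);
        printf("log P(world|hello) = %d, P = %f\n",
               score, logmath_exp(lmath, score));

        ngram_model_free(lm);
        logmath_free(lmath);
        return 0;
    }

This is the same lookup calc_entropy performs with ngram_ng_score(); the variadic ngram_score() form is just more convenient when the history is known at compile time.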