SphinxBase  0.6
src/libsphinxbase/lm/ngram_model_internal.h
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 /*
00038  * \file ngram_model_internal.h Internal structures for N-Gram models
00039  *
00040  * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
00041  */
00042 
00043 #ifndef __NGRAM_MODEL_INTERNAL_H__
00044 #define __NGRAM_MODEL_INTERNAL_H__
00045 
00046 #include "sphinxbase/ngram_model.h"
00047 #include "sphinxbase/hash_table.h"
00048 
/**
 * Common state for an N-gram language model.
 *
 * Presumably this is the structure behind the ngram_model_t typedef declared
 * in sphinxbase/ngram_model.h -- TODO confirm.
 *
 * NOTE(review): this header only declares the fields; the descriptions below
 * are inferred from field names and types and from the other declarations in
 * this file.  Confirm against the implementation before relying on them.
 */
00055 struct ngram_model_s {
00056     int refcount;       /**< Reference count (presumably for retain/free). */
00057     int32 *n_counts;    /**< Array of N-gram counts; presumably one entry per order 1..n -- TODO confirm. */
00058     int32 n_1g_alloc;   /**< Presumably the number of unigram/word slots allocated (cf. UG_ALLOC_STEP) -- confirm. */
00059     int32 n_words;      /**< Number of words; presumably the number of valid entries in word_str. */
00061     uint8 n;            /**< Presumably the model order N (cf. the `n` argument of ngram_model_init()). */
00062     uint8 n_classes;    /**< Number of word classes; NGRAM_CLASSID() can encode class IDs 0..127. */
00063     uint8 writable;     /**< Flag; presumably whether the word strings may be modified -- confirm. */
00064     uint8 flags;        /**< Additional flag bits; their meaning is not defined in this header. */
00066     logmath_t *lmath;   /**< Log-math object (one is also passed to ngram_model_init()). */
00067     float32 lw;         /**< Presumably the language weight (same name as the `lw` argument of apply_weights). */
00068     int32 log_wip;      /**< Presumably the word insertion penalty in the log domain. */
00069     int32 log_uw;       /**< Presumably the unigram weight in the log domain. */
00070     int32 log_uniform;  /**< Presumably the log of the uniform probability -- confirm. */
00071     int32 log_uniform_weight; /**< Presumably the log of the weight given to the uniform distribution -- confirm. */
00072     int32 log_zero;     /**< Presumably the log-domain value used for zero probability. */
00073     char **word_str;    /**< Array of word strings; presumably indexed by word ID. */
00074     hash_table_t *wid;  /**< Hash table; presumably maps a word string to its word ID. */
00075     int32 *tmp_wids;    /**< Scratch array of word IDs; its users are not visible in this header. */
00076     struct ngram_class_s **classes; /**< Array of pointers to word-class definitions. */
00077     struct ngram_funcs_s *funcs;   /**< Table of implementation-specific function pointers. */
00078 };
00079 
/**
 * Definition of one word class in a class-based language model.
 *
 * NOTE(review): field descriptions are inferred from names and types only;
 * the code that fills and reads them is not in this header.
 */
00083 struct ngram_class_s {
00084     int32 tag_wid;  /**< Presumably the word ID of the class tag in the base model -- confirm. */
00085     int32 start_wid; /**< Presumably the first word ID belonging to this class. */
00086     int32 n_words;   /**< Number of words; presumably the length of prob1. */
00087     int32 *prob1;    /**< Array of in-class probabilities; presumably indexed relative to start_wid. */
          /**
           * Entry type for nword_hash.  `next` is an int32 rather than a
           * pointer, so entries presumably chain by array index -- confirm.
           */
00091     struct ngram_hash_s {
00092         int32 wid;    /**< Word ID stored in this entry. */
00093         int32 prob1;  /**< Probability stored for that word. */
00094         int32 next;   /**< Presumably the index of the next entry in the chain. */
00095     } *nword_hash;
00096     int32 n_hash;       /**< Presumably the number of entries allocated in nword_hash. */
00097     int32 n_hash_inuse; /**< Presumably the number of nword_hash entries currently in use. */
00098 };
00099 
/* Hash size constant; presumably the initial size of a class's nword_hash
 * table -- confirm against the implementation. */
00100 #define NGRAM_HASH_SIZE 128
00101 
/*
 * Word-ID packing used for class words, as implied by the masks and shifts
 * below:
 *   bit  31     - set when the ID denotes a class word
 *   bits 24..30 - class ID (7 bits, 0..127)
 *   bits 0..23  - base word ID
 */
/* Extract the base word ID (low 24 bits). */
00102 #define NGRAM_BASEWID(wid) ((wid)&0xffffff)
/* Extract the 7-bit class ID from bits 24..30. */
00103 #define NGRAM_CLASSID(wid) (((wid)>>24) & 0x7f)
/* Build a class word ID.  Neither argument is masked, so `wid` must already
 * fit in 24 bits and `classid` in 7 bits.  Where int is 32 bits the constant
 * 0x80000000 has unsigned type, so the result is unsigned as well. */
00104 #define NGRAM_CLASSWID(wid,classid) (((classid)<<24) | 0x80000000 | (wid))
/* Non-zero when bit 31 is set.  The value is the masked bit (0x80000000),
 * not 1, so test it for truth rather than storing it in a narrow type. */
00105 #define NGRAM_IS_CLASSWID(wid) ((wid)&0x80000000)
00106 
/* Presumably the increment by which unigram storage is grown (cf. the
 * n_1g_alloc field of struct ngram_model_s) -- confirm. */
00107 #define UG_ALLOC_STEP 10
00108 
/**
 * Table of function pointers supplied by an N-gram model implementation.
 * struct ngram_model_s holds a pointer to one of these in its `funcs` field,
 * and ngram_model_init() takes one as a parameter.
 *
 * NOTE(review): only the signatures are visible here; the behavioural notes
 * below are inferred from names and should be confirmed.
 */
00110 typedef struct ngram_funcs_s {
          /** Release the model; presumably frees implementation-specific storage. */
00114     void (*free)(ngram_model_t *model);
          /**
           * Apply the weights lw, wip and uw to the model (presumably language
           * weight, word insertion penalty and unigram weight).  The meaning
           * of the int return value is not visible here.
           */
00118     int (*apply_weights)(ngram_model_t *model,
00119                          float32 lw,
00120                          float32 wip,
00121                          float32 uw);
          /**
           * Score word `wid` given `n_hist` word IDs in `history`.  `n_used`
           * is an output pointer; presumably it receives the N-gram order
           * actually used -- confirm.
           */
00125     int32 (*score)(ngram_model_t *model,
00126                    int32 wid,
00127                    int32 *history,
00128                    int32 n_hist,
00129                    int32 *n_used);
          /**
           * Same signature as `score`; presumably returns the score without
           * the weights from apply_weights -- confirm.
           */
00134     int32 (*raw_score)(ngram_model_t *model,
00135                        int32 wid,
00136                        int32 *history,
00137                        int32 n_hist,
00138                        int32 *n_used);
          /**
           * Presumably adds the unigram `wid` with weight `lweight` to the
           * model.  The meaning of the int32 return value is not visible here.
           */
00150     int32 (*add_ug)(ngram_model_t *model,
00151                     int32 wid, int32 lweight);
          /** Presumably flushes any internal caches held by the model. */
00155     void (*flush)(ngram_model_t *model);
00156 
          /** Return an iterator for word `wid` with the given history (presumably positioned on that N-gram). */
00160     ngram_iter_t * (*iter)(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist);
00161 
          /** Return an iterator selected by `m`; presumably over all M-grams of that order. */
00165     ngram_iter_t * (*mgrams)(ngram_model_t *model, int32 m);
00166 
          /** Return an iterator derived from `itor`; presumably over the successors of its current entry. */
00170     ngram_iter_t * (*successors)(ngram_iter_t *itor);
00171 
          /**
           * Read the iterator's current entry.  Returns a pointer to const
           * int32 (presumably the word IDs) and has two output pointers,
           * presumably for the score and the backoff weight.
           */
00175     int32 const * (*iter_get)(ngram_iter_t *itor,
00176                               int32 *out_score,
00177                               int32 *out_bowt);
00178 
          /** Advance the iterator; the end-of-iteration return value is not visible here (presumably NULL). */
00182     ngram_iter_t * (*iter_next)(ngram_iter_t *itor);
00183 
          /** Release the iterator. */
00187     void (*iter_free)(ngram_iter_t *itor);
00188 } ngram_funcs_t;
00189 
/**
 * Common part of an N-gram iterator.  The model, m and successor fields
 * mirror the parameters of ngram_iter_init().
 */
00193 struct ngram_iter_s {
00194     ngram_model_t *model; /**< Model being iterated over. */
00195     int32 *wids;      /**< Array of word IDs; presumably those of the current entry -- confirm. */
00196     int16 m;          /**< Presumably the order of the M-grams being iterated -- confirm. */
00197     int16 successor;  /**< Flag; presumably set when this is a successor iterator. */
00198 };
00199 
/**
 * Word-class definition: a list of words with one weight each.  Presumably
 * this is the unit produced by read_classdef_file() and released by
 * classdef_free() -- confirm.
 */
00203 typedef struct classdef_s {
00204     char **words;      /**< Array of word strings. */
00205     float32 *weights;  /**< Array of weights; presumably parallel to `words`. */
00206     int32 n_words;     /**< Number of words; presumably the length of both arrays. */
00207 } classdef_t;
00208 
/**
 * Initialize an ngram_model_t from a function table, a log-math object, an
 * order `n` and a unigram count `n_unigram`.  Presumably this fills in the
 * common fields of struct ngram_model_s; the meaning of the int32 return
 * value is not visible in this header.
 */
00212 int32
00213 ngram_model_init(ngram_model_t *model,
00214                  ngram_funcs_t *funcs,
00215                  logmath_t *lmath,
00216                  int32 n, int32 n_unigram);
00217 
/**
 * Read an N-gram model from `file_name`; going by the function name the
 * file is in ARPA format.  Returns a model pointer (the failure value is
 * not visible here, presumably NULL).
 */
00221 ngram_model_t *ngram_model_arpa_read(cmd_ln_t *config,
00222                                      const char *file_name,
00223                                      logmath_t *lmath);
/** Same as above; going by the function name the file is in DMP format. */
00227 ngram_model_t *ngram_model_dmp_read(cmd_ln_t *config,
00228                                     const char *file_name,
00229                                     logmath_t *lmath);
/** Same as above; going by the function name the file is in DMP32 format. */
00233 ngram_model_t *ngram_model_dmp32_read(cmd_ln_t *config,
00234                                      const char *file_name,
00235                                      logmath_t *lmath);
00236 
/**
 * Write `model` to `file_name`; going by the function name the output is in
 * ARPA format.  The meaning of the int return value is not visible here.
 */
00240 int ngram_model_arpa_write(ngram_model_t *model,
00241                            const char *file_name);
/** Same as above; going by the function name the output is in DMP format. */
00245 int ngram_model_dmp_write(ngram_model_t *model,
00246                           const char *file_name);
00247 
/**
 * Read the class definition file `classdef_file`.  Presumably the parsed
 * definitions are stored in the `classes` hash table as classdef_t entries
 * -- confirm.  The meaning of the int32 return value is not visible here.
 */
00251 int32 read_classdef_file(hash_table_t *classes, const char *classdef_file);
00252 
/** Release a class definition. */
00256 void classdef_free(classdef_t *classdef);
00257 
/**
 * Create a word class for `model`.  `tag_wid` and `start_wid` match the
 * fields of the same names in struct ngram_class_s; `classwords` is a list
 * whose element type is not visible in this header.
 */
00261 ngram_class_t *ngram_class_new(ngram_model_t *model, int32 tag_wid,
00262                                int32 start_wid, glist_t classwords);
00263 
/** Release a word class. */
00267 void ngram_class_free(ngram_class_t *lmclass);
00268 
/**
 * Look up the in-class probability of word `wid` in `lmclass`.
 * NOTE(review): the value returned for a word that is not in the class is
 * not visible in this header -- check the implementation.
 */
00274 int32 ngram_class_prob(ngram_class_t *lmclass, int32 wid);
00275 
/**
 * Initialize the common part of an iterator.  The `model`, `m` and
 * `successor` parameters correspond to the fields of the same names in
 * struct ngram_iter_s.
 */
00279 void ngram_iter_init(ngram_iter_t *itor, ngram_model_t *model,
00280                      int m, int successor);
00281 
00282 #endif /* __NGRAM_MODEL_INTERNAL_H__ */