SphinxBase  0.6
src/libsphinxbase/fe/yin.c
Go to the documentation of this file.
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /*
00003  * Copyright (c) 2008 Beyond Access, Inc.  All rights reserved.
00004  * 
00005  * Redistribution and use in source and binary forms, with or without
00006  * modification, are permitted provided that the following conditions
00007  * are met:
00008  * 
00009  * 1. Redistributions of source code must retain the above copyright
00010  *    notice, this list of conditions and the following disclaimer. 
00011  * 
00012  * 2. Redistributions in binary form must reproduce the above copyright
00013  *    notice, this list of conditions and the following disclaimer in
00014  *    the documentation and/or other materials provided with the
00015  *    distribution.
00016  * 
00017  * THIS SOFTWARE IS PROVIDED BY BEYOND ACCESS, INC. ``AS IS'' AND ANY
00018  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00020  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL BEYOND ACCESS, INC.  NOR
00021  * ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00022  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00023  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00024  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00025  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00026  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00027  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00028  */
00029 
00035 /* This implements part of the YIN algorithm:
00036  *
00037  * "YIN, a fundamental frequency estimator for speech and music".
00038  * Alain de Cheveigné and Hideki Kawahara.  Journal of the Acoustical
00039  * Society of America, 111 (4), April 2002.
00040  */
00041 
00042 #include "sphinxbase/prim_type.h"
00043 #include "sphinxbase/ckd_alloc.h"
00044 #include "sphinxbase/fixpoint.h"
00045 
00046 #include "sphinxbase/yin.h"
00047 
00048 #include <stdio.h>
00049 
00050 struct yin_s {
00051     uint16 frame_size;       
00052     uint16 search_threshold; 
00053     uint16 search_range;     
00054     uint16 nfr;              
00056     unsigned char wsize;    
00057     unsigned char wstart;   
00058     unsigned char wcur;     
00059     unsigned char endut;    
00061     fixed32 **diff_window;  
00062     uint16 *period_window;  
00063 };
00064 
00068 static void
00069 cmn_diff(int16 const *signal, int32 *out_diff, int ndiff)
00070 {
00071     uint32 cum, cshift;
00072     int32 t, tscale;
00073 
00074     out_diff[0] = 32768;
00075     cum = 0;
00076     cshift = 0;
00077 
00078     /* Determine how many bits we can scale t up by below. */
00079     for (tscale = 0; tscale < 32; ++tscale)
00080         if (ndiff & (1<<(31-tscale)))
00081             break;
00082     --tscale; /* Avoid teh overflowz. */
00083     /* printf("tscale is %d (ndiff - 1) << tscale is %d\n",
00084        tscale, (ndiff-1) << tscale); */
00085 
00086     /* Somewhat elaborate block floating point implementation.
00087      * The fp implementation of this is really a lot simpler. */
00088     for (t = 1; t < ndiff; ++t) {
00089         uint32 dd, dshift, norm;
00090         int j;
00091 
00092         dd = 0;
00093         dshift = 0;
00094         for (j = 0; j < ndiff; ++j) {
00095             int diff = signal[j] - signal[t + j];
00096             /* Guard against overflows. */
00097             if (dd > (1UL<<tscale)) {
00098                 dd >>= 1;
00099                 ++dshift;
00100             }
00101             dd += (diff * diff) >> dshift;
00102         }
00103         /* Make sure the diffs and cum are shifted to the same
00104          * scaling factor (usually dshift will be zero) */
00105         if (dshift > cshift) {
00106             cum += dd << (dshift-cshift);
00107         }
00108         else {
00109             cum += dd >> (cshift-dshift);
00110         }
00111 
00112         /* Guard against overflows and also ensure that (t<<tscale) > cum. */
00113         while (cum > (1UL<<tscale)) {
00114             cum >>= 1;
00115             ++cshift;
00116         }
00117         /* Avoid divide-by-zero! */
00118         if (cum == 0) cum = 1;
00119         /* Calculate the normalizer in high precision. */
00120         norm = (t << tscale) / cum;
00121         /* Do a long multiply and shift down to Q15. */
00122         out_diff[t] = (int32)(((long long)dd * norm)
00123                               >> (tscale - 15 + cshift - dshift));
00124         /* printf("dd %d cshift %d dshift %d scaledt %d cum %d norm %d cmn %d\n",
00125            dd, cshift, dshift, (t<<tscale), cum, norm, out_diff[t]); */
00126     }
00127 }
00128 
00129 yin_t *
00130 yin_init(int frame_size, float search_threshold,
00131          float search_range, int smooth_window)
00132 {
00133     yin_t *pe;
00134 
00135     pe = ckd_calloc(1, sizeof(*pe));
00136     pe->frame_size = frame_size;
00137     pe->search_threshold = (uint16)(search_threshold * 32768);
00138     pe->search_range = (uint16)(search_range * 32768);
00139     pe->wsize = smooth_window * 2 + 1;
00140     pe->diff_window = ckd_calloc_2d(pe->wsize,
00141                                     pe->frame_size / 2,
00142                                     sizeof(**pe->diff_window));
00143     pe->period_window = ckd_calloc(pe->wsize,
00144                                    sizeof(*pe->period_window));
00145     return pe;
00146 }
00147 
00148 void
00149 yin_free(yin_t *pe)
00150 {
00151     ckd_free_2d(pe->diff_window);
00152     ckd_free(pe->period_window);
00153     ckd_free(pe);
00154 }
00155 
00156 void
00157 yin_start(yin_t *pe)
00158 {
00159     /* Reset the circular window pointers. */
00160     pe->wstart = pe->endut = 0;
00161     pe->nfr = 0;
00162 }
00163 
00164 void
00165 yin_end(yin_t *pe)
00166 {
00167     pe->endut = 1;
00168 }
00169 
00170 int
00171 thresholded_search(int32 *diff_window, fixed32 threshold, int start, int end)
00172 {
00173     int i, min, argmin;
00174 
00175     min = INT_MAX;
00176     argmin = 0;
00177     for (i = start; i < end; ++i) {
00178         int diff = diff_window[i];
00179 
00180         if (diff < threshold) {
00181             min = diff;
00182             argmin = i;
00183             break;
00184         }
00185         if (diff < min) {
00186             min = diff;
00187             argmin = i;
00188         }
00189     }
00190     return argmin;
00191 }
00192 
00193 void
00194 yin_write(yin_t *pe, int16 const *frame)
00195 {
00196     int outptr, difflen;
00197 
00198     /* Rotate the window one frame forward. */
00199     ++pe->wstart;
00200     /* Fill in the frame before wstart. */
00201     outptr = pe->wstart - 1;
00202     /* Wrap around the window pointer. */
00203     if (pe->wstart == pe->wsize)
00204         pe->wstart = 0;
00205 
00206     /* Now calculate normalized difference function. */
00207     difflen = pe->frame_size / 2;
00208     cmn_diff(frame, pe->diff_window[outptr], difflen);
00209 
00210     /* Find the first point under threshold.  If not found, then
00211      * use the absolute minimum. */
00212     pe->period_window[outptr]
00213         = thresholded_search(pe->diff_window[outptr],
00214                              pe->search_threshold, 0, difflen);
00215 
00216     /* Increment total number of frames. */
00217     ++pe->nfr;
00218 }
00219 
00220 int
00221 yin_read(yin_t *pe, uint16 *out_period, uint16 *out_bestdiff)
00222 {
00223     int wstart, wlen, half_wsize, i;
00224     int best, best_diff, search_width, low_period, high_period;
00225 
00226     half_wsize = (pe->wsize-1)/2;
00227     /* Without any smoothing, just return the current value (don't
00228      * need to do anything to the current poitner either). */
00229     if (half_wsize == 0) {
00230         if (pe->endut)
00231             return 0;
00232         *out_period = pe->period_window[0];
00233         *out_bestdiff = pe->diff_window[0][pe->period_window[0]];
00234         return 1;
00235     }
00236 
00237     /* We can't do anything unless we have at least (wsize-1)/2 + 1
00238      * frames, unless we're at the end of the utterance. */
00239     if (pe->endut == 0 && pe->nfr < half_wsize + 1) {
00240         /* Don't increment the current pointer either. */
00241         return 0;
00242     }
00243 
00244     /* Establish the smoothing window. */
00245     /* End of utterance. */
00246     if (pe->endut) {
00247         /* We are done (no more data) when pe->wcur = pe->wstart. */
00248         if (pe->wcur == pe->wstart)
00249             return 0;
00250         /* I.e. pe->wcur (circular minus) half_wsize. */
00251         wstart = (pe->wcur + pe->wsize - half_wsize) % pe->wsize;
00252         /* Number of frames from wstart up to pe->wstart. */
00253         wlen = pe->wstart - wstart;
00254         if (wlen < 0) wlen += pe->wsize;
00255         /*printf("ENDUT! ");*/
00256     }
00257     /* Beginning of utterance. */
00258     else if (pe->nfr < pe->wsize) {
00259         wstart = 0;
00260         wlen = pe->nfr;
00261     }
00262     /* Normal case, it is what it is. */
00263     else {
00264         wstart = pe->wstart;
00265         wlen = pe->wsize;
00266     }
00267 
00268     /* Now (finally) look for the best local estimate. */
00269     /* printf("Searching for local estimate in %d frames around %d\n",
00270        wlen, pe->nfr + 1 - wlen); */
00271     best = pe->period_window[pe->wcur];
00272     best_diff = pe->diff_window[pe->wcur][best];
00273     for (i = 0; i < wlen; ++i) {
00274         int j = wstart + i;
00275         int diff;
00276 
00277         j %= pe->wsize;
00278         diff = pe->diff_window[j][pe->period_window[j]];
00279         /* printf("%.2f,%.2f ", 1.0 - (double)diff/32768,
00280            pe->period_window[j] ? 8000.0/pe->period_window[j] : 8000.0); */
00281         if (diff < best_diff) {
00282             best_diff = diff;
00283             best = pe->period_window[j];
00284         }
00285     }
00286     /* printf("best: %.2f, %.2f\n", 1.0 - (double)best_diff/32768,
00287        best ? 8000.0/best : 8000.0); */
00288     /* If it's the same as the current one then return it. */
00289     if (best == pe->period_window[pe->wcur]) {
00290         /* Increment the current pointer. */
00291         if (++pe->wcur == pe->wsize)
00292             pe->wcur = 0;
00293         *out_period = best;
00294         *out_bestdiff = best_diff;
00295         return 1;
00296     }
00297     /* Otherwise, redo the search inside a narrower range. */
00298     search_width = best * pe->search_range / 32768;
00299     /* printf("Search width = %d * %.2f = %d\n",
00300        best, (double)pe->search_range/32768, search_width); */
00301     if (search_width == 0) search_width = 1;
00302     low_period = best - search_width;
00303     high_period = best + search_width;
00304     if (low_period < 0) low_period = 0;
00305     if (high_period > pe->frame_size / 2) high_period = pe->frame_size / 2;
00306     /* printf("Searching from %d to %d\n", low_period, high_period); */
00307     best = thresholded_search(pe->diff_window[pe->wcur],
00308                               pe->search_threshold,
00309                               low_period, high_period);
00310     best_diff = pe->diff_window[pe->wcur][best];
00311 
00312     if (out_period)
00313         *out_period = (best > 65535) ? 65535 : best;
00314     if (out_bestdiff)
00315         *out_bestdiff = (best_diff > 65535) ? 65535 : best_diff;
00316 
00317     /* Increment the current pointer. */
00318     if (++pe->wcur == pe->wsize)
00319         pe->wcur = 0;
00320     return 1;
00321 }