SphinxBase  0.6
src/libsphinxbase/util/utf8.c
00001 /* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> */
00002 /* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. */
00003 
00004 /* Slightly modified to use Sphinx types and remove explicit inline. */
00005 
00006 #include "sphinxbase/prim_type.h"
00007 
/* Decoder state at the start of a sequence: utf8_decode() treats the next
 * byte as a lead byte when the state equals this value. */
00008 #define UTF8_ACCEPT 0
/* Decoder error state: every transition out of this state in the table
 * leads back to it, so the caller must reset the state to recover. */
00009 #define UTF8_REJECT 1
00010 
/*
 * Lookup table driving utf8_decode().
 *
 * Entries 0..255 are indexed by the input byte value and give that byte's
 * character class ("type", values 0x0..0xb).
 *
 * The entries after that form the state transition table: rows of 16
 * entries each, one row per decoder state, read as
 * utf8d[256 + state*16 + type] to obtain the next state.  There are nine
 * rows (states 0..8), for 400 entries in total.
 */
00011 static const uint8 utf8d[] = {
00012   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
00013   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
00014   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
00015   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
00016   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
00017   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
00018   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
00019   0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
00020   0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
00021   0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
00022   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
00023   1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
00024   1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
00025   1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
00026 };
00027 
00028 uint32
00029 utf8_decode(uint32 *state, uint32 *codep, uint32 byte) {
00030   uint32 type = utf8d[byte];
00031 
00032   *codep = (*state != UTF8_ACCEPT) ?
00033     (byte & 0x3fu) | (*codep << 6) :
00034     (0xff >> type) & (byte);
00035 
00036   *state = utf8d[256 + *state*16 + type];
00037   return *state;
00038 }
00039 
00040 /* CMU code starts here. */
00041 /* ====================================================================
00042  * Copyright (c) 2009 Carnegie Mellon University.  All rights
00043  * reserved.
00044  *
00045  * Redistribution and use in source and binary forms, with or without
00046  * modification, are permitted provided that the following conditions
00047  * are met:
00048  *
00049  * 1. Redistributions of source code must retain the above copyright
00050  *    notice, this list of conditions and the following disclaimer. 
00051  *
00052  * 2. Redistributions in binary form must reproduce the above copyright
00053  *    notice, this list of conditions and the following disclaimer in
00054  *    the documentation and/or other materials provided with the
00055  *    distribution.
00056  *
00057  * This work was supported in part by funding from the Defense Advanced 
00058  * Research Projects Agency and the National Science Foundation of the 
00059  * United States of America, and the CMU Sphinx Speech Consortium.
00060  *
00061  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00062  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00063  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00064  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00065  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00066  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00067  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00068  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00069  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00070  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00071  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00072  *
00073  * ====================================================================
00074  *
00075  */