Audacious  $Id: Doxyfile 4280 2007-03-21 04:39:00Z nenolod $
chardet.c
Go to the documentation of this file.
00001 /*  Audacious
00002  *  Copyright (C) 2005-2007  Audacious development team.
00003  *
00004  *  This program is free software; you can redistribute it and/or modify
00005  *  it under the terms of the GNU General Public License as published by
00006  *  the Free Software Foundation; under version 3 of the License.
00007  *
00008  *  This program is distributed in the hope that it will be useful,
00009  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00010  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00011  *  GNU General Public License for more details.
00012  *
00013  *  You should have received a copy of the GNU General Public License
00014  *  along with this program.  If not, see <http://www.gnu.org/licenses>.
00015  *
00016  *  The Audacious team does not consider modular code linking to
00017  *  Audacious or using our public API to be a derived work.
00018  */
00019 
00020 #include <glib.h>
00021 #include <string.h>
00022 #include <libaudcore/audstrings.h>
00023 
00024 #include "config.h"
00025 #include "debug.h"
00026 #include "i18n.h"
00027 #include "main.h"
00028 #include "misc.h"
00029 
00030 #ifdef USE_CHARDET
00031 #  include <libguess.h>
00032 #endif
00033 
/* Forward declaration: cd_str_to_utf8() calls this function before its
 * definition appears.  Parameter names match the definition. */
static char * cd_chardet_to_utf8 (const char * str, int len,
 int * arg_bytes_read, int * arg_bytes_write);
00036 
/*
 * Last-resort conversion: returns a newly allocated copy of str in which
 * every byte with the high bit set is replaced by '?', followed by a
 * (translated) "  (invalid UTF-8)" marker.
 *
 * str must not be NULL.  The caller owns the result and releases it with
 * g_free().
 */
static char * str_to_utf8_fallback (const char * str)
{
    const char * suffix = _("  (invalid UTF-8)");

    /* Only the untrusted input is masked.  Masking the suffix as well would
     * turn a translation containing non-ASCII characters into a run of '?'.
     * If the translated text is not valid UTF-8 itself, use the plain ASCII
     * message so the result is still guaranteed to be valid UTF-8. */
    if (! g_utf8_validate (suffix, -1, NULL))
        suffix = "  (invalid UTF-8)";

    size_t str_len = strlen (str);
    size_t suffix_len = strlen (suffix);
    char * out = g_malloc (str_len + suffix_len + 1);

    for (size_t i = 0; i < str_len; i ++)
        out[i] = ((unsigned char) str[i] & 0x80) ? '?' : str[i];

    /* Copies the suffix together with its terminating NUL. */
    memcpy (out + str_len, suffix, suffix_len + 1);

    return out;
}
00049 
/*
 * Converts a NUL-terminated string to UTF-8.
 *
 * Returns NULL when str is NULL.  Otherwise returns a newly allocated
 * string: a plain copy if the input already validates as UTF-8, else the
 * result of cd_chardet_to_utf8(), else the output of
 * str_to_utf8_fallback().
 */
static char * cd_str_to_utf8 (const char * str)
{
    if (str == NULL)
        return NULL;

    /* History of the validation guard below:
     *
     * 1. The playlist calls this function repeatedly, even on strings that
     *    were already converted.  chardet_to_utf8() would turn a valid
     *    UTF-8 string into a different UTF-8 string whenever fallback
     *    encodings were configured and the string could also be read in
     *    one of them, so g_utf8_validate() was placed in front of it.
     *
     * 2. g_utf8_validate() suffers from the so-called encapsulated UTF-8
     *    problem, so chardet_to_utf8() replaced it for a while.
     *
     * 3. With madplug, conversion from ISO-8859-1 to UTF-8 became an issue.
     *    The g_convert() near the end of chardet_to_utf8() can handle it,
     *    but only behind a UTF-8 validation guard.  dfa_validate_utf8()
     *    uses libguess' DFA engine, distinguishes encapsulated UTF-8
     *    correctly and is considered safe as that guard.
     */

    const size_t length = strlen (str);

    /* Input that is already UTF-8 is returned as an unmodified copy. */
#ifdef USE_CHARDET
    if (libguess_validate_utf8 (str, length))
#else
    if (g_utf8_validate (str, length, NULL))
#endif
        return g_strdup (str);

    /* Next, let the encoding detector and the configured fallbacks try. */
    char * converted = cd_chardet_to_utf8 (str, length, NULL, NULL);
    if (converted != NULL)
        return converted;

    /* Everything failed: replace character codes >= 128 with '?'. */
    return str_to_utf8_fallback (str);
}
00095 
00096 static char * cd_chardet_to_utf8 (const char * str, int len,
00097  int * arg_bytes_read, int * arg_bytes_write)
00098 {
00099     char *ret = NULL;
00100     int * bytes_read, * bytes_write;
00101     int my_bytes_read, my_bytes_write;
00102 
00103     bytes_read = arg_bytes_read != NULL ? arg_bytes_read : &my_bytes_read;
00104     bytes_write = arg_bytes_write != NULL ? arg_bytes_write : &my_bytes_write;
00105 
00106     g_return_val_if_fail(str != NULL, NULL);
00107 
00108 #ifdef USE_CHARDET
00109     if (libguess_validate_utf8(str, len))
00110 #else
00111     if (g_utf8_validate(str, len, NULL))
00112 #endif
00113     {
00114         if (len < 0)
00115             len = strlen (str);
00116 
00117         ret = g_malloc (len + 1);
00118         memcpy (ret, str, len);
00119         ret[len] = 0;
00120 
00121         if (arg_bytes_read != NULL)
00122             * arg_bytes_read = len;
00123         if (arg_bytes_write != NULL)
00124             * arg_bytes_write = len;
00125 
00126         return ret;
00127     }
00128 
00129 #ifdef USE_CHARDET
00130     char * det = get_string (NULL, "chardet_detector");
00131 
00132     if (det[0])
00133     {
00134         AUDDBG("guess encoding (%s) %s\n", det, str);
00135         const char * encoding = libguess_determine_encoding (str, len, det);
00136         AUDDBG("encoding = %s\n", encoding);
00137         if (encoding)
00138         {
00139             gsize read_gsize = 0, written_gsize = 0;
00140             ret = g_convert (str, len, "UTF-8", encoding, & read_gsize, & written_gsize, NULL);
00141             * bytes_read = read_gsize;
00142             * bytes_write = written_gsize;
00143         }
00144     }
00145 
00146     g_free (det);
00147 #endif
00148 
00149     /* If detection failed or was not enabled, try fallbacks (if there are any) */
00150     if (! ret)
00151     {
00152         char * fallbacks = get_string (NULL, "chardet_fallback");
00153         char * * split = g_strsplit_set (fallbacks, " ,:;|/", -1);
00154 
00155         for (char * * enc = split; * enc; enc ++)
00156         {
00157             gsize read_gsize = 0, written_gsize = 0;
00158             ret = g_convert (str, len, "UTF-8", * enc, & read_gsize, & written_gsize, NULL);
00159             * bytes_read = read_gsize;
00160             * bytes_write = written_gsize;
00161 
00162             if (len == *bytes_read)
00163                 break;
00164             else {
00165                 g_free(ret);
00166                 ret = NULL;
00167             }
00168         }
00169 
00170         g_strfreev (split);
00171         g_free (fallbacks);
00172     }
00173 
00174     /* First fallback: locale (duh!) */
00175     if (ret == NULL)
00176     {
00177         gsize read_gsize = 0, written_gsize = 0;
00178         ret = g_locale_to_utf8 (str, len, & read_gsize, & written_gsize, NULL);
00179         * bytes_read = read_gsize;
00180         * bytes_write = written_gsize;
00181     }
00182 
00183     /* The final fallback is ISO-8859-1, if no other is specified or conversions fail */
00184     if (ret == NULL)
00185     {
00186         gsize read_gsize = 0, written_gsize = 0;
00187         ret = g_convert (str, len, "UTF-8", "ISO-8859-1", & read_gsize, & written_gsize, NULL);
00188         * bytes_read = read_gsize;
00189         * bytes_write = written_gsize;
00190     }
00191 
00192     if (ret != NULL)
00193     {
00194         if (g_utf8_validate(ret, -1, NULL))
00195             return ret;
00196         else
00197         {
00198             g_warning("g_utf8_validate() failed for converted string in cd_chardet_to_utf8: '%s'", ret);
00199             g_free(ret);
00200             return NULL;
00201         }
00202     }
00203 
00204     return NULL; /* If we have no idea, return NULL. */
00205 }
00206 
00207 void chardet_init (void)
00208 {
00209 #ifdef USE_CHARDET
00210     libguess_determine_encoding(NULL, -1, "");
00211 #endif
00212     str_set_utf8_impl (cd_str_to_utf8, cd_chardet_to_utf8);
00213 }