Hubbub
detect.c
Go to the documentation of this file.
1 /*
2  * This file is part of Hubbub.
3  * Licensed under the MIT License,
4  * http://www.opensource.org/licenses/mit-license.php
5  * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
6  */
7 
8 #include <assert.h>
9 #include <stdbool.h>
10 #include <string.h>
11 #include <strings.h>
12 
13 #include <parserutils/charset/mibenum.h>
14 
15 #include <hubbub/types.h>
16 
17 #include "utils/utils.h"
18 
19 #include "detect.h"
20 
21 static uint16_t hubbub_charset_read_bom(const uint8_t *data, size_t len);
22 static uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len);
23 static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos,
24  const uint8_t *end);
25 static bool hubbub_charset_get_attribute(const uint8_t **data,
26  const uint8_t *end,
27  const uint8_t **name, uint32_t *namelen,
28  const uint8_t **value, uint32_t *valuelen);
29 
44 parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len,
45  uint16_t *mibenum, uint32_t *source)
46 {
47  uint16_t charset = 0;
48 
49  if (data == NULL || mibenum == NULL || source == NULL)
50  return PARSERUTILS_BADPARM;
51 
64  /* 1. */
65 
66  /* If the source is dictated, there's nothing for us to do */
67  if (*source == HUBBUB_CHARSET_CONFIDENT ||
68  *source == HUBBUB_CHARSET_TENTATIVE) {
69  return PARSERUTILS_OK;
70  }
71 
72  /* 2. */
73 
76  /* 3. */
77 
78  /* We need at least 3 bytes of data */
79  if (len < 3)
80  goto default_encoding;
81 
82  /* First, look for a BOM */
83  charset = hubbub_charset_read_bom(data, len);
84  if (charset != 0) {
85  *mibenum = charset;
86  *source = HUBBUB_CHARSET_CONFIDENT;
87 
88  return PARSERUTILS_OK;
89  }
90 
91  /* 4. */
92 
93  /* No BOM was found, so we must look for a meta charset within
94  * the document itself. */
95  charset = hubbub_charset_scan_meta(data, len);
96  if (charset != 0) {
97  /* Fix charsets according to HTML5,
98  * section 8.2.2.2. Character encoding requirements */
100 
101  /* If we've encountered a meta charset for a non-ASCII-
102  * compatible encoding, don't trust it.
103  *
104  * Firstly, it should have been sent with a BOM (and thus
105  * detected above).
106  *
107  * Secondly, we've just used an ASCII-only parser to
108  * extract the encoding from the document. Therefore,
109  * the document plainly isn't what the meta charset
110  * claims it is.
111  *
112  * What we do in this case is to ignore the meta charset's
113  * claims and leave the charset determination to the
114  * autodetection routines (or the fallback case if they
115  * fail).
116  */
117  if (charset != parserutils_charset_mibenum_from_name(
118  "UTF-32", SLEN("UTF-32")) &&
119  charset != parserutils_charset_mibenum_from_name(
120  "UTF-32LE", SLEN("UTF-32LE")) &&
121  charset != parserutils_charset_mibenum_from_name(
122  "UTF-32BE", SLEN("UTF-32BE"))) {
123 
124  *mibenum = charset;
125  *source = HUBBUB_CHARSET_TENTATIVE;
126 
127  return PARSERUTILS_OK;
128  }
129  }
130 
131  /* No charset was specified within the document, attempt to
132  * autodetect the encoding from the data that we have available. */
133 
136  /* We failed to autodetect a charset, so use the default fallback */
137 default_encoding:
138 
139  /* 7. */
140 
141  charset = parserutils_charset_mibenum_from_name("Windows-1252",
142  SLEN("Windows-1252"));
143  if (charset == 0)
144  charset = parserutils_charset_mibenum_from_name("ISO-8859-1",
145  SLEN("ISO-8859-1"));
146 
147  *mibenum = charset;
148  *source = HUBBUB_CHARSET_TENTATIVE;
149 
150  return PARSERUTILS_OK;
151 }
152 
153 
/**
 * Inspect the beginning of a buffer of data for the presence of a
 * UTF Byte Order Mark.
 *
 * \param data  Pointer to the buffer to inspect
 * \param len   Length of the buffer, in bytes
 * \return MIB enum of the encoding named by the BOM, or 0 if \a data is
 *         NULL, fewer than 3 bytes are available, or no BOM is present.
 */
uint16_t hubbub_charset_read_bom(const uint8_t *data, size_t len)
{
	static const uint8_t bom_utf16be[] = { 0xFE, 0xFF };
	static const uint8_t bom_utf16le[] = { 0xFF, 0xFE };
	static const uint8_t bom_utf8[] = { 0xEF, 0xBB, 0xBF };

	/* Three bytes are required, even though the UTF-16 marks are
	 * only two bytes long */
	if (data == NULL || len < 3)
		return 0;

	if (memcmp(data, bom_utf16be, sizeof bom_utf16be) == 0)
		return parserutils_charset_mibenum_from_name("UTF-16BE",
				SLEN("UTF-16BE"));

	if (memcmp(data, bom_utf16le, sizeof bom_utf16le) == 0)
		return parserutils_charset_mibenum_from_name("UTF-16LE",
				SLEN("UTF-16LE"));

	if (memcmp(data, bom_utf8, sizeof bom_utf8) == 0)
		return parserutils_charset_mibenum_from_name("UTF-8",
				SLEN("UTF-8"));

	return 0;
}
184 
/*
 * Scanning helpers. PEEK and ADVANCE operate on the variables `pos` and
 * `end` of the function they are used in; ADVANCE may also return 0 from
 * that function.
 */

/**
 * True if more than SLEN(a) bytes remain between pos and end and the bytes
 * at pos match the string literal \a a, ignoring case.
 *
 * The remaining length is computed as (end - pos) rather than by forming
 * (end - SLEN(a)), which could point before the start of a short buffer.
 */
#define PEEK(a)								\
	(pos < end && (size_t) (end - pos) > SLEN(a) &&			\
	strncasecmp((const char *) pos, (a), SLEN(a)) == 0)

/**
 * Move pos forwards until PEEK(a) succeeds. If the search reaches the point
 * where exactly SLEN(a) bytes remain, return 0 from the enclosing function.
 */
#define ADVANCE(a)							\
	do {								\
		while (pos < end && (size_t) (end - pos) > SLEN(a)) {	\
			if (PEEK(a))					\
				break;					\
			pos++;						\
		}							\
									\
		if (pos < end && (size_t) (end - pos) == SLEN(a))	\
			return 0;					\
	} while (0)

/**
 * True for tab, LF, FF, CR, space and '/'. The argument is evaluated more
 * than once, so it must not have side effects.
 */
#define ISSPACE(a)							\
	((a) == 0x09 || (a) == 0x0a || (a) == 0x0c ||			\
	(a) == 0x0d || (a) == 0x20 || (a) == 0x2f)
202 
/**
 * Search for a meta charset within a buffer of data.
 *
 * \param data  Pointer to the buffer to scan
 * \param len   Length of the buffer, in bytes
 * \return MIB enum of the charset found on a meta element, or 0 if
 *         \a data is NULL or no charset was found.
 *
 * At most the first 512 bytes of the buffer are examined.
 */
uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len)
{
	const uint8_t *pos = data;
	const uint8_t *end;
	uint16_t mibenum;

	if (data == NULL)
		return 0;

	end = pos + min(512, len);

	/* 1. */
	while (pos < end) {
		/* a: comment -- move on to its terminator */
		if (PEEK("<!--")) {
			pos += SLEN("<!--");
			ADVANCE("-->");
		/* b: "<meta"; only of interest if a space or '/' follows */
		} else if (PEEK("<meta")) {
			if (pos + SLEN("<meta") >= end - 1)
				return 0;

			if (ISSPACE(*(pos + SLEN("<meta")))) {
				/* 1 */
				pos += SLEN("<meta");

				/* Leaves pos wherever attribute parsing
				 * stopped */
				mibenum = hubbub_charset_parse_attributes(
						&pos, end);
				if (mibenum != 0)
					return mibenum;

				if (pos >= end)
					return 0;
			}
		/* c: "</" + ASCII letter, or "<" + ASCII letter. Clearing
		 * bit 0x20 folds lower case onto 'A'..'Z' */
		} else if ((PEEK("</") && (pos < end - 3 &&
				(0x41 <= (*(pos + 2) & ~ 0x20) &&
				(*(pos + 2) & ~ 0x20) <= 0x5A))) ||
				(pos < end - 2 && *pos == '<' &&
				(0x41 <= (*(pos + 1) & ~ 0x20) &&
				(*(pos + 1) & ~ 0x20) <= 0x5A))) {

			/* skip '<' */
			pos++;

			/* 1. Find the end of the tag name */
			while (pos < end) {
				if (ISSPACE(*pos) ||
						*pos == '>' || *pos == '<')
					break;
				pos++;
			}

			if (pos >= end)
				return 0;

			/* 3 */
			if (*pos != '<') {
				const uint8_t *n;
				const uint8_t *v;
				uint32_t nl, vl;

				/* Consume and discard the tag's attributes */
				while (hubbub_charset_get_attribute(&pos, end,
						&n, &nl, &v, &vl))
					; /* do nothing */
			/* 2: a new '<' -- reprocess it without advancing */
			} else
				continue;
		/* d */
		} else if (PEEK("<!") || PEEK("</") || PEEK("<?")) {
			pos++;
			ADVANCE(">");
		}

		/* e - do nothing */

		/* 2 */
		pos++;
	}

	return 0;
}
292 
/**
 * Parse attributes on a meta tag.
 *
 * \param pos  Pointer to the current position in the buffer; updated as
 *             attributes are consumed
 * \param end  Pointer to the end of the buffer
 * \return MIB enum of the first charset found in a charset= or content=
 *         attribute, or 0 if none was found or an argument is NULL.
 *
 * A result of UTF-16, UTF-16LE or UTF-16BE is reported as UTF-8.
 */
uint16_t hubbub_charset_parse_attributes(const uint8_t **pos,
		const uint8_t *end)
{
	const uint8_t *name;
	const uint8_t *value;
	uint32_t namelen, valuelen;
	uint16_t mibenum = 0;

	if (pos == NULL || *pos == NULL || end == NULL)
		return 0;

	/* 2 */
	while (hubbub_charset_get_attribute(pos, end,
			&name, &namelen, &value, &valuelen)) {
		/* 3 done by default */

		/* 4 */
		if (namelen == SLEN("charset") && valuelen > 0 &&
				strncasecmp((const char *) name, "charset",
					SLEN("charset")) == 0) {
			/* Strip the value. Both loops are bounded by
			 * valuelen, so an all-whitespace value can neither
			 * run off the end of the value nor wrap the
			 * length. */
			while (valuelen > 0 && ISSPACE(*value)) {
				value++;
				valuelen--;
			}

			while (valuelen > 0 && ISSPACE(value[valuelen - 1]))
				valuelen--;

			mibenum = parserutils_charset_mibenum_from_name(
					(const char *) value, valuelen);
		/* 5 */
		} else if (namelen == SLEN("content") && valuelen > 0 &&
				strncasecmp((const char *) name, "content",
					SLEN("content")) == 0) {
			mibenum = hubbub_charset_parse_content(value,
					valuelen);
		}

		/* 6: a UTF-16 claim is treated as UTF-8. Only applied to a
		 * real result: were a UTF-16 name lookup to return 0, a
		 * "nothing found" mibenum of 0 must not be turned into
		 * UTF-8. */
		if (mibenum != 0 &&
				(mibenum ==
				parserutils_charset_mibenum_from_name(
					"UTF-16LE", SLEN("UTF-16LE")) ||
				mibenum ==
				parserutils_charset_mibenum_from_name(
					"UTF-16BE", SLEN("UTF-16BE")) ||
				mibenum ==
				parserutils_charset_mibenum_from_name(
					"UTF-16", SLEN("UTF-16")))) {
			mibenum = parserutils_charset_mibenum_from_name(
					"UTF-8", SLEN("UTF-8"));
		}

		/* 7 */
		if (mibenum != 0) {
			/* confidence = tentative; */
			return mibenum;
		}
	}

	return 0;
}
361 
/**
 * Parse a content= attribute's value.
 *
 * \param value     Attribute's value
 * \param valuelen  Length of value, in bytes
 * \return MIB enum of the charset named after the first ';' in the value
 *         (in the form charset=NAME, with NAME optionally quoted), or 0 if
 *         \a value is NULL or no such charset was found.
 */
uint16_t hubbub_charset_parse_content(const uint8_t *value,
		uint32_t valuelen)
{
	const uint8_t *end;
	const uint8_t *tentative = NULL;
	uint32_t tentative_len = 0;

	if (value == NULL)
		return 0;

	end = value + valuelen;

	/* 1: move to just past the first ';' */
	while (value < end) {
		if (*value == ';') {
			value++;
			break;
		}

		value++;
	}

	if (value >= end)
		return 0;

	/* 2 */
	while (value < end && ISSPACE(*value)) {
		value++;
	}

	if (value >= end)
		return 0;

	/* 3: the next bytes must be "charset". Check the remaining length
	 * first so neither the comparison nor the advance below can go
	 * beyond the end of the value. */
	if ((size_t) (end - value) < SLEN("charset") ||
			strncasecmp((const char *) value,
				"charset", SLEN("charset")) != 0)
		return 0;

	value += SLEN("charset");

	/* 4 */
	while (value < end && ISSPACE(*value)) {
		value++;
	}

	if (value >= end)
		return 0;

	/* 5 */
	if (*value != '=')
		return 0;
	/* skip '=' */
	value++;

	/* 6 */
	while (value < end && ISSPACE(*value)) {
		value++;
	}

	if (value >= end)
		return 0;

	/* 7 */
	if (*value == '"' || *value == '\'') {
		/* a, b: quoted name; it runs up to the matching quote */
		const uint8_t quote = *value;

		tentative = value + 1;

		while (++value < end && *value != quote) {
			tentative_len++;
		}

		/* No closing quote: there is no charset */
		if (value >= end)
			tentative = NULL;
	} else {
		/* c: unquoted name; it runs up to whitespace, a ';' or the
		 * end of the value */
		tentative = value;

		while (value < end && !ISSPACE(*value) && *value != ';') {
			value++;
			tentative_len++;
		}
	}

	/* 8 */
	if (tentative != NULL) {
		return parserutils_charset_mibenum_from_name(
				(const char *) tentative, tentative_len);
	}

	/* 9 */
	return 0;
}
472 
/**
 * Extract an attribute from the data stream.
 *
 * \param data      Pointer to the current position in the buffer; updated
 *                  to where parsing stopped
 * \param end       Pointer to the end of the buffer
 * \param name      Pointer to location to receive the attribute name
 * \param namelen   Pointer to location to receive the name length
 * \param value     Pointer to location to receive the attribute value
 * \param valuelen  Pointer to location to receive the value length
 * \return true if an attribute was extracted, false otherwise (including
 *         when any argument is NULL or the buffer ran out first).
 *
 * \a name and \a value point into the buffer and are not NUL terminated.
 * An attribute with no value is returned with an empty value. The outputs
 * are only meaningful when true is returned.
 */
bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end,
		const uint8_t **name, uint32_t *namelen,
		const uint8_t **value, uint32_t *valuelen)
{
	const uint8_t *pos;

	if (data == NULL || *data == NULL || end == NULL || name == NULL ||
			namelen == NULL || value == NULL || valuelen == NULL)
		return false;

	pos = *data;

	/* 1. Skip leading spaces or '/' characters */
	while (pos < end && (ISSPACE(*pos) || *pos == '/')) {
		pos++;
	}

	if (pos >= end) {
		*data = pos;
		return false;
	}

	/* 2. Invalid element open character. Step back one byte so that a
	 * caller which advances by one afterwards lands on the '<' again */
	if (*pos == '<') {
		*data = pos - 1;
		return false;
	}

	/* 3. End of element */
	if (*pos == '>') {
		*data = pos;
		return false;
	}

	/* 4. Initialise name & value to empty string */
	*name = pos;
	*namelen = 0;
	*value = (const uint8_t *) "";
	*valuelen = 0;

	/* 5. Extract name */
	while (pos < end) {
		/* a, b: the name ends at '=' or a space */
		if (*pos == '=' || ISSPACE(*pos)) {
			break;
		}

		/* c: the name ends the attribute; it has no value */
		if (*pos == '/' || *pos == '<' || *pos == '>') {
			*data = pos;
			return true;
		}

		/* d is handled by strncasecmp in _parse_attributes */

		/* e */
		(*namelen)++;

		/* 6 */
		pos++;
	}

	if (pos >= end) {
		*data = pos;
		return false;
	}

	if (ISSPACE(*pos)) {
		/* 7. Skip trailing spaces */
		while (pos < end && ISSPACE(*pos)) {
			pos++;
		}

		if (pos >= end) {
			*data = pos;
			return false;
		}

		/* 8. Must be '='; if not, the attribute has no value. Leave
		 * the position on the last space skipped. */
		if (*pos != '=') {
			*data = pos - 1;
			return true;
		}
	}

	/* 9. Skip '=' */
	pos++;

	/* 10. Skip any spaces after '=' */
	while (pos < end && ISSPACE(*pos)) {
		pos++;
	}

	if (pos >= end) {
		*data = pos;
		return false;
	}

	/* 11. Extract value, if quoted */
	/* a */
	if (*pos == '\'' || *pos == '"') {
		/* 1 */
		const uint8_t *quote = pos;

		/* 2 */
		while (++pos < end) {
			/* 3 */
			if (*pos == *quote) {
				*value = (quote + 1);
				*data = ++pos;
				return true;
			}

			/* 4 is handled by strncasecmp */

			/* 5 */
			(*valuelen)++;

			/* 6 */
		}

		/* The buffer ended before the closing quote */
		*data = pos;
		return false;
	}

	/* b */
	if (*pos == '<' || *pos == '>') {
		*data = pos;
		return true;
	}

	/* c is handled by strncasecmp */

	/* d */
	*value = pos;

	while (pos < end) {
		/* 12. Extract unquoted value */
		/* a */
		if (ISSPACE(*pos) || *pos == '<' || *pos == '>') {
			*data = pos;
			return true;
		}

		/* b is handled by strncasecmp */

		/* c */
		(*valuelen)++;

		/* 13. Advance */
		pos++;
	}

	/* The buffer ended before the value was terminated */
	*data = pos;
	return false;
}
659 
/**
 * Fix charsets, according to the override table in HTML5,
 * section 8.2.2.2. Character encoding requirements.
 *
 * \param charset  Pointer to the MIB enum of the charset to fix; must not
 *                 be 0. Replaced in place when an override applies and the
 *                 replacement charset is known.
 */
void hubbub_charset_fix_charset(uint16_t *charset)
{
	/* Override table, searched in order; the first match wins */
	const struct {
		const char *from;
		size_t from_len;
		const char *to;
		size_t to_len;
	} overrides[] = {
		{ "ISO-8859-1", SLEN("ISO-8859-1"),
				"Windows-1252", SLEN("Windows-1252") },
		{ "ISO-8859-9", SLEN("ISO-8859-9"),
				"Windows-1254", SLEN("Windows-1254") },
		{ "ISO-8859-11", SLEN("ISO-8859-11"),
				"Windows-874", SLEN("Windows-874") },
		{ "KS_C_5601-1987", SLEN("KS_C_5601-1987"),
				"Windows-949", SLEN("Windows-949") },
		{ "EUC-KR", SLEN("EUC-KR"),
				"Windows-949", SLEN("Windows-949") },
		{ "TIS-620", SLEN("TIS-620"),
				"Windows-874", SLEN("Windows-874") },
		{ "x-x-big5", SLEN("x-x-big5"),
				"Big5", SLEN("Big5") },
		{ "GB2312", SLEN("GB2312"),
				"GBK", SLEN("GBK") },
		{ "GB_2312-80", SLEN("GB_2312-80"),
				"GBK", SLEN("GBK") }
	};
	uint16_t tmp = 0;
	size_t i;

	assert(*charset != 0);

	for (i = 0; i < sizeof overrides / sizeof overrides[0]; i++) {
		if (*charset != parserutils_charset_mibenum_from_name(
				overrides[i].from, overrides[i].from_len))
			continue;

		tmp = parserutils_charset_mibenum_from_name(
				overrides[i].to, overrides[i].to_len);

		/* Only the first override's target is mandatory */
		if (i == 0) {
			assert(tmp != 0 && "Windows-1252 MUST be supported");
		}

		break;
	}

	/* An unknown replacement leaves the charset as it was */
	if (tmp != 0)
		*charset = tmp;
}
Charset may be changed with further data.
Definition: types.h:24
#define SLEN(s)
Definition: utils.h:21
static uint16_t hubbub_charset_read_bom(const uint8_t *data, size_t len)
Inspect the beginning of a buffer of data for the presence of a UTF Byte Order Mark.
Definition: detect.c:162
#define ADVANCE(a)
Definition: detect.c:189
Charset definite.
Definition: types.h:26
static bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, const uint8_t **name, uint32_t *namelen, const uint8_t **value, uint32_t *valuelen)
Extract an attribute from the data stream.
Definition: detect.c:487
static uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len)
Search for a meta charset within a buffer of data.
Definition: detect.c:210
static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, const uint8_t *end)
Parse attributes on a meta tag.
Definition: detect.c:300
uint16_t hubbub_charset_parse_content(const uint8_t *value, uint32_t valuelen)
Parse a content= attribute's value.
Definition: detect.c:369
#define min(a, b)
Definition: utils.h:16
const char * name
Definition: initial.c:22
#define ISSPACE(a)
Definition: detect.c:199
size_t len
Definition: initial.c:23
#define PEEK(a)
Definition: detect.c:185
parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len, uint16_t *mibenum, uint32_t *source)
Extract a charset from a chunk of data.
Definition: detect.c:44
void hubbub_charset_fix_charset(uint16_t *charset)
Fix charsets, according to the override table in HTML5, section 8.2.2.2.
Definition: detect.c:667