Libparserutils
Data Structures | Macros | Typedefs | Functions
inputstream.c File Reference
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <parserutils/charset/mibenum.h>
#include <parserutils/charset/utf8.h>
#include <parserutils/input/inputstream.h>
#include "input/filter.h"
#include "utils/utils.h"

Go to the source code of this file.

Data Structures

struct  parserutils_inputstream_private
 Private input stream definition. More...
 

Macros

#define IS_ASCII(x)   (((x) & 0x80) == 0)
 
#define UTF32_BOM_LEN   (4)
 
#define UTF16_BOM_LEN   (2)
 
#define UTF8_BOM_LEN   (3)
 

Typedefs

typedef struct parserutils_inputstream_private parserutils_inputstream_private
 Private input stream definition. More...
 

Functions

static parserutils_error parserutils_inputstream_refill_buffer (parserutils_inputstream_private *stream)
 Refill the UTF-8 buffer from the raw buffer. More...
 
static parserutils_error parserutils_inputstream_strip_bom (uint16_t *mibenum, parserutils_buffer *buffer)
 Strip a BOM from a buffer in the given encoding. More...
 
parserutils_error parserutils_inputstream_create (const char *enc, uint32_t encsrc, parserutils_charset_detect_func csdetect, parserutils_inputstream **stream)
 Create an input stream. More...
 
parserutils_error parserutils_inputstream_destroy (parserutils_inputstream *stream)
 Destroy an input stream. More...
 
parserutils_error parserutils_inputstream_append (parserutils_inputstream *stream, const uint8_t *data, size_t len)
 Append data to an input stream. More...
 
parserutils_error parserutils_inputstream_insert (parserutils_inputstream *stream, const uint8_t *data, size_t len)
 Insert data into stream at current location. More...
 
parserutils_error parserutils_inputstream_peek_slow (parserutils_inputstream *stream, size_t offset, const uint8_t **ptr, size_t *length)
 Look at the character in the stream that starts at offset bytes from the cursor (slow version) More...
 
const char * parserutils_inputstream_read_charset (parserutils_inputstream *stream, uint32_t *source)
 Read the source charset of the input stream. More...
 
parserutils_error parserutils_inputstream_change_charset (parserutils_inputstream *stream, const char *enc, uint32_t source)
 Change the source charset of the input stream. More...
 

Macro Definition Documentation

#define IS_ASCII (   x)    (((x) & 0x80) == 0)

Definition at line 209 of file inputstream.c.

Referenced by parserutils_inputstream_peek_slow().

#define UTF16_BOM_LEN   (2)
#define UTF32_BOM_LEN   (4)
#define UTF8_BOM_LEN   (3)

Typedef Documentation

typedef struct parserutils_inputstream_private parserutils_inputstream_private

Private input stream definition.

Function Documentation

parserutils_error parserutils_inputstream_append ( parserutils_inputstream *  stream,
const uint8_t *  data,
size_t  len 
)

Append data to an input stream.

Parameters
stream: Input stream to append data to
data: Data to append (in document charset), or NULL to flag EOF
len: Length, in bytes, of data
Returns
PARSERUTILS_OK on success, appropriate error otherwise

Definition at line 169 of file inputstream.c.

References parserutils_inputstream::had_eof, PARSERUTILS_BADPARM, parserutils_buffer_append(), PARSERUTILS_OK, parserutils_inputstream_private::public, and parserutils_inputstream_private::raw.

parserutils_error parserutils_inputstream_change_charset ( parserutils_inputstream *  stream,
const char *  enc,
uint32_t  source 
)

Change the source charset of the input stream.

Parameters
stream: Input stream to modify
enc: Charset name
source: Charset source identifier
Returns
PARSERUTILS_OK on success, PARSERUTILS_BADPARM on invalid parameters, PARSERUTILS_INVALID if called after data has been read from stream, PARSERUTILS_BADENCODING if the encoding is unsupported, PARSERUTILS_NOMEM on memory exhaustion.

Definition at line 321 of file inputstream.c.

References parserutils_inputstream_private::done_first_chunk, parserutils_filter_optparams::encoding, parserutils_inputstream_private::encsrc, parserutils_inputstream_private::input, parserutils_inputstream_private::mibenum, parserutils_filter_optparams::name, parserutils__filter_setopt(), PARSERUTILS_BADENCODING, PARSERUTILS_BADPARM, parserutils_charset_mibenum_from_name(), PARSERUTILS_FILTER_SET_ENCODING, PARSERUTILS_INVALID, and PARSERUTILS_OK.

parserutils_error parserutils_inputstream_create ( const char *  enc,
uint32_t  encsrc,
parserutils_charset_detect_func  csdetect,
parserutils_inputstream **  stream 
)

Create an input stream.

Parameters
enc: Document charset, or NULL to autodetect
encsrc: Value for encoding source, if specified, or 0
csdetect: Charset detection function, or NULL
stream: Pointer to location to receive stream instance
Returns
PARSERUTILS_OK on success, PARSERUTILS_BADPARM on bad parameters, PARSERUTILS_NOMEM on memory exhaustion, PARSERUTILS_BADENCODING on unsupported encoding

The value 0 is defined as being the lowest priority encoding source (i.e. the default fallback encoding). Beyond this, no further interpretation is made upon the encoding source.

Definition at line 59 of file inputstream.c.

References parserutils_inputstream_private::csdetect, parserutils_inputstream::cursor, parserutils_inputstream_private::done_first_chunk, parserutils_filter_optparams::encoding, parserutils_inputstream_private::encsrc, parserutils_inputstream::had_eof, parserutils_inputstream_private::input, parserutils_inputstream_private::mibenum, parserutils_filter_optparams::name, parserutils__filter_create(), parserutils__filter_destroy(), parserutils__filter_setopt(), PARSERUTILS_BADENCODING, PARSERUTILS_BADPARM, parserutils_buffer_create(), parserutils_buffer_destroy(), parserutils_charset_mibenum_from_name(), PARSERUTILS_FILTER_SET_ENCODING, PARSERUTILS_NOMEM, PARSERUTILS_OK, parserutils_inputstream_private::public, parserutils_inputstream_private::raw, and parserutils_inputstream::utf8.

parserutils_error parserutils_inputstream_destroy ( parserutils_inputstream *  stream)

Destroy an input stream.

Parameters
stream: Input stream to destroy
Returns
PARSERUTILS_OK on success, appropriate error otherwise

Definition at line 144 of file inputstream.c.

References parserutils_inputstream_private::input, parserutils__filter_destroy(), PARSERUTILS_BADPARM, parserutils_buffer_destroy(), PARSERUTILS_OK, parserutils_inputstream_private::public, parserutils_inputstream_private::raw, and parserutils_inputstream::utf8.

parserutils_error parserutils_inputstream_insert ( parserutils_inputstream *  stream,
const uint8_t *  data,
size_t  len 
)

Insert data into stream at current location.

Parameters
stream: Input stream to insert into
data: Data to insert (UTF-8 encoded)
len: Length, in bytes, of data
Returns
PARSERUTILS_OK on success, appropriate error otherwise

Definition at line 195 of file inputstream.c.

References parserutils_inputstream::cursor, PARSERUTILS_BADPARM, parserutils_buffer_insert(), parserutils_inputstream_private::public, and parserutils_inputstream::utf8.

parserutils_error parserutils_inputstream_peek_slow ( parserutils_inputstream *  stream,
size_t  offset,
const uint8_t **  ptr,
size_t *  length 
)

Look at the character in the stream that starts at offset bytes from the cursor (slow version)

Parameters
stream: Stream to look in
offset: Byte offset of start of character
ptr: Pointer to location to receive pointer to character data
length: Pointer to location to receive character length (in bytes)
Returns
PARSERUTILS_OK on success, PARSERUTILS_NEEDDATA on reaching the end of available input, PARSERUTILS_EOF on reaching the end of all input, PARSERUTILS_BADENCODING if the input cannot be decoded, PARSERUTILS_NOMEM on memory exhaustion, PARSERUTILS_BADPARM if bad parameters are passed.

Once the character pointed to by the result of this call has been advanced past (i.e. parserutils_inputstream_advance has caused the stream cursor to pass over the character), then no guarantee is made as to the validity of the data pointed to. Thus, any attempt to dereference the pointer after advancing past the data it points to is a bug.

Definition at line 232 of file inputstream.c.

References parserutils_inputstream::cursor, parserutils_buffer::data, parserutils_inputstream::had_eof, IS_ASCII, len, parserutils_buffer::length, PARSERUTILS_BADPARM, parserutils_charset_utf8_char_byte_length(), PARSERUTILS_EOF, parserutils_inputstream_refill_buffer(), PARSERUTILS_NEEDDATA, PARSERUTILS_OK, parserutils_inputstream_private::public, parserutils_inputstream_private::raw, and parserutils_inputstream::utf8.

Referenced by parserutils_inputstream_peek().

const char * parserutils_inputstream_read_charset ( parserutils_inputstream *  stream,
uint32_t *  source 
)

Read the source charset of the input stream.

Parameters
stream: Input stream to query
source: Pointer to location to receive charset source identifier
Returns
Pointer to charset name (constant; do not free)

Definition at line 292 of file inputstream.c.

References parserutils_inputstream_private::encsrc, parserutils_inputstream_private::mibenum, and parserutils_charset_mibenum_to_name().

parserutils_error parserutils_inputstream_refill_buffer ( parserutils_inputstream_private *  stream)
inline static

Refill the UTF-8 buffer from the raw buffer.

parserutils_error parserutils_inputstream_strip_bom ( uint16_t *  mibenum,
parserutils_buffer *  buffer 
)
inline static

Strip a BOM from a buffer in the given encoding.

Parameters
mibenum: Pointer to the character set of the buffer, updated on exit
buffer: The buffer to process

Definition at line 496 of file inputstream.c.

References parserutils_buffer::data, parserutils_buffer::length, parserutils_buffer_discard(), parserutils_charset_mibenum_from_name(), PARSERUTILS_OK, SLEN, UTF16_BOM_LEN, UTF32_BOM_LEN, and UTF8_BOM_LEN.

Referenced by parserutils_inputstream_refill_buffer().