Zorba
The XQuery Processor
Documentation
Live Demo
Modules
Download
Tools
Blog
Code
Main Page
Related Pages
Namespaces
Classes
Files
Examples
File List
File Members
include
zorba
tokenizer.h
Go to the documentation of this file.
1
/*
2
* Copyright 2006-2008 The FLWOR Foundation.
3
*
4
* Licensed under the Apache License, Version 2.0 (the "License");
5
* you may not use this file except in compliance with the License.
6
* You may obtain a copy of the License at
7
*
8
* http://www.apache.org/licenses/LICENSE-2.0
9
*
10
* Unless required by applicable law or agreed to in writing, software
11
* distributed under the License is distributed on an "AS IS" BASIS,
12
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
* See the License for the specific language governing permissions and
14
* limitations under the License.
15
*/
16
17
#pragma once
18
#ifndef ZORBA_TOKENIZER_API_H
19
#define ZORBA_TOKENIZER_API_H
20
21
#include <vector>
22
23
#include <zorba/config.h>
24
#include <
zorba/locale.h
>
25
#include <
zorba/internal/unique_ptr.h
>
26
#include <
zorba/internal/ztd.h
>
27
28
namespace
zorba {
29
30
class
Item;
31
32
///////////////////////////////////////////////////////////////////////////////
33
34
/**
35
* A %Tokenizer breaks a string into a stream of word tokens. Each token is
36
* assigned a token, sentence, and paragraph number.
37
*
38
* A %Tokenizer determines word and sentence boundaries automatically, but must
39
* be told when to increment the paragraph number.
40
*/
41
class
ZORBA_DLL_PUBLIC
Tokenizer
{
42
public
:
43
typedef
std::unique_ptr<Tokenizer,internal::ztd::destroy_delete<Tokenizer>
>
44
ptr
;
45
46
typedef
unsigned
size_type
;
47
48
/////////////////////////////////////////////////////////////////////////////
49
50
/**
51
* A %State contains inter-Tokenizer state, currently the current token,
52
* sentence, and paragraph numbers.
53
*/
54
struct
State
{
55
typedef
Tokenizer::size_type
value_type
;
56
57
value_type
token
;
///< Token number.
58
value_type
sent
;
///< Sentence number.
59
value_type
para
;
///< Paragraph number.
60
61
/**
62
* Default constructor.
63
*/
64
State
();
65
};
66
67
/////////////////////////////////////////////////////////////////////////////
68
69
/**
70
* A %Callback is called once per token.
71
* This is only internally by Zorba.
72
* You do not need to derive from this class.
73
*/
74
class
Callback
{
75
public
:
76
typedef
Tokenizer::size_type
size_type
;
77
78
virtual
~
Callback
();
79
80
/**
81
* This member-function is called whenever an item that is being tokenized
82
* is entered or exited. The default implementation does nothing.
83
*
84
* @param item The item being entered or exited.
85
* @param entering If \c true, the item is being entered; if \c false, the
86
* item is being exited.
87
*/
88
virtual
void
item(
Item
const
&item,
bool
entering );
89
90
/**
91
* This member-function is called once per token.
92
*
93
* @param utf8_s The UTF-8 token string. It is not null-terminated.
94
* @param utf8_len The number of bytes in the token string.
95
* @param lang The language of the token.
96
* @param token_no The token number. Token numbers start at 0.
97
* @param sent_no The sentence number. Sentence numbers start at 1.
98
* @param para_no The paragraph number. Paragraph numbers start at 1.
99
* @param item The Item this token is from, if any.
100
*/
101
virtual
void
token(
char
const
*utf8_s,
size_type
utf8_len,
102
locale::iso639_1::type
lang,
103
size_type
token_no,
size_type
sent_no,
104
size_type
para_no,
Item
const
*item = 0 ) = 0;
105
};
106
107
/////////////////////////////////////////////////////////////////////////////
108
109
/**
110
* Various properties of this %Tokenizer.
111
*/
112
struct
Properties
{
113
typedef
std::vector<locale::iso639_1::type>
languages_type
;
114
115
/**
116
* If \c true, XML comments separate tokens. For example,
117
* \c net<!---->work would be 2 tokens instead of 1.
118
*/
119
bool
comments_separate_tokens
;
120
121
/**
122
* If \c true, XML elements separate tokens. For example,
123
* \c <b>B</b>old would be 2 tokens instead of 1.
124
*/
125
bool
elements_separate_tokens
;
126
127
/**
128
* If \c true, XML processing instructions separate tokens. For example,
129
* <code>net<?PI pi?>work</code> would be 2 tokens instead of 1.
130
*/
131
bool
processing_instructions_separate_tokens
;
132
133
/**
134
* The set of languages supported.
135
*/
136
languages_type
languages
;
137
138
/**
139
* The URI that uniquely identifies this %Tokenizer.
140
*/
141
char
const
*
uri
;
142
};
143
144
/**
145
* Gets the Properties of this %Tokenizer.
146
*
147
* @param result The Properties to populate.
148
*/
149
virtual
void
properties(
Properties
*result )
const
= 0;
150
151
/////////////////////////////////////////////////////////////////////////////
152
153
/**
154
* Destroys this %Tokenizer.
155
* This function is called by Zorba when the %Tokenizer is no longer needed.
156
*
157
* If your TokenizerProvider dynamically allocates %Tokenizer objects, then
158
* the implementation can simply be (and usually is) <code>delete this</code>.
159
*
160
* If your TokenizerProvider returns a pointer to a static %Tokenizer object,
161
* then the implementation should do nothing.
162
*/
163
virtual
void
destroy()
const
= 0;
164
165
/**
166
* Gets this %Tokenizer's associated State.
167
*
168
* @return Returns said State.
169
*/
170
State
& state();
171
172
/**
173
* Gets this %Tokenizer's associated State.
174
*
175
* @return Returns said State.
176
*/
177
State
const
& state()
const
;
178
179
/**
180
* Tokenizes the given node.
181
*
182
* @param node The node to tokenize.
183
* @param lang The default language to use.
184
* @param callback The Callback to call once per token.
185
*/
186
void
tokenize_node(
Item
const
&node,
locale::iso639_1::type
lang,
187
Callback
&callback );
188
189
/**
190
* Tokenizes the given string.
191
*
192
* @param utf8_s The UTF-8 string to tokenize. It need not be
193
* null-terminated.
194
* @param utf8_len The number of bytes in the string to be tokenized.
195
* @param lang The language of the string.
196
* @param wildcards If \c true, allows XQuery wildcard syntax characters to
197
* be part of tokens.
198
* @param callback The Callback to call once per token.
199
* @param item The Item this string is from, if any.
200
*/
201
virtual
void
tokenize_string(
char
const
*utf8_s,
size_type
utf8_len,
202
locale::iso639_1::type
lang,
bool
wildcards,
203
Callback
&callback,
Item
const
*item = 0 ) = 0;
204
205
/////////////////////////////////////////////////////////////////////////////
206
207
protected
:
208
/**
209
* Constructs a %Tokenizer.
210
*
211
* @param state the State to use.
212
*/
213
Tokenizer
(
State
&state );
214
215
/**
216
* Destroys a %Tokenizer.
217
*/
218
virtual
~
Tokenizer
() = 0;
219
220
/**
221
* Given an element, finds its \c xml:lang attribute, if any, and gets its
222
* value.
223
*
224
* @param element The element to check.
225
* @param lang A pointer to where to put the found language, if any.
226
* @return Returns \c true only if an \c xml:lang attribute is found and the
227
* value is a known language.
228
*/
229
bool
find_lang_attribute(
Item
const
&element,
locale::iso639_1::type
*lang );
230
231
/**
232
* This member-function is called whenever an item that is being tokenized is
233
* entered or exited.
234
*
235
* @param item The item being entered or exited.
236
* @param entering If \c true, the item is being entered; if \c false, the
237
* item is being exited.
238
*/
239
virtual
void
item(
Item
const
&item,
bool
entering );
240
241
/**
242
* Tokenizes the given node and all of its child nodes, if any. For each
243
* node, it is required that this function call the item() member function of
244
* both this %Tokenizer and of the Callback twice, once each for entrance and
245
* exit.
246
*
247
* @param node The node to tokenize.
248
* @param lang The default language to use.
249
* @param callback The Callback to call per token.
250
* @param tokenize_acp If \c true, additionally tokenize all attribute,
251
* comment, and processing-instruction nodes encountered;
252
* if \c false, skip them.
253
*/
254
virtual
void
tokenize_node_impl(
Item
const
&node,
255
locale::iso639_1::type
lang,
256
Callback
&callback,
bool
tokenize_acp );
257
258
private
:
259
State
*state_;
260
};
261
262
/**
 * Constructs a %Tokenizer that keeps a pointer to the given State.
 *
 * @param state The State to use; only its address is stored.
 */
inline Tokenizer::Tokenizer( State &state )
  : state_( &state )
{
}
264
265
/**
 * Gets the State associated with this %Tokenizer.
 *
 * @return Returns a reference to the State that \c state_ points to.
 */
inline Tokenizer::State& Tokenizer::state()
{
  return *state_;
}
268
269
/**
 * Gets the State associated with this %Tokenizer (read-only access).
 *
 * @return Returns a const reference to the State that \c state_ points to.
 */
inline Tokenizer::State const& Tokenizer::state() const
{
  return *state_;
}
272
273
inline
void
Tokenizer::tokenize_node
(
Item
const
&item,
274
locale::iso639_1::type
lang,
275
Callback
&callback ) {
276
tokenize_node_impl
( item, lang, callback,
true
);
277
}
278
279
///////////////////////////////////////////////////////////////////////////////
280
281
/**
282
* A %TokenizerProvider provides a Tokenizer for a given language.
283
*/
284
class
ZORBA_DLL_PUBLIC
TokenizerProvider
{
285
public
:
286
virtual
~
TokenizerProvider
();
287
288
/**
289
* Creates a new %Tokenizer.
290
*
291
* @param lang The language of the text that the tokenizer will tokenize.
292
* @param state The State to use. If \c null, \a t is not set.
293
* @param t If not \c null, set to point to a Tokenizer for \a lang.
294
* @return Returns \c true only if this provider can provide a tokenizer for
295
* \a lang.
296
*/
297
virtual
bool
getTokenizer(
locale::iso639_1::type
lang,
298
Tokenizer::State
*state = 0,
299
Tokenizer::ptr
*t = 0 )
const
= 0;
300
};
301
302
///////////////////////////////////////////////////////////////////////////////
303
304
}
// namespace zorba
305
#endif
/* ZORBA_TOKENIZER_API_H */
306
/* vim:set et sw=2 ts=2: */