00001 /* 00002 * Copyright 2006-2008 The FLWOR Foundation. 00003 * 00004 * Licensed under the Apache License, Version 2.0 (the "License"); 00005 * you may not use this file except in compliance with the License. 00006 * You may obtain a copy of the License at 00007 * 00008 * http://www.apache.org/licenses/LICENSE-2.0 00009 * 00010 * Unless required by applicable law or agreed to in writing, software 00011 * distributed under the License is distributed on an "AS IS" BASIS, 00012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00013 * See the License for the specific language governing permissions and 00014 * limitations under the License. 00015 */ 00016 00017 #pragma once 00018 #ifndef ZORBA_TOKENIZER_API_H 00019 #define ZORBA_TOKENIZER_API_H 00020 00021 #include <zorba/config.h> 00022 #include <zorba/locale.h> 00023 #include <zorba/internal/unique_ptr.h> 00024 #include <zorba/internal/ztd.h> 00025 00026 namespace zorba { 00027 00028 class Item; 00029 00030 /////////////////////////////////////////////////////////////////////////////// 00031 00032 /** 00033 * A %Tokenizer breaks a string into a stream of word tokens. Each token is 00034 * assigned a token, sentence, and paragraph number. 00035 * 00036 * A %Tokenizer determines word and sentence boundaries automatically, but must 00037 * be told when to increment the paragraph number. 00038 */ 00039 class ZORBA_DLL_PUBLIC Tokenizer { 00040 public: 00041 typedef std::unique_ptr<Tokenizer,internal::ztd::destroy_delete<Tokenizer> > 00042 ptr; 00043 00044 typedef unsigned size_type; 00045 00046 ///////////////////////////////////////////////////////////////////////////// 00047 00048 /** 00049 * A %Numbers contains the current token, sentence, and paragraph numbers. 00050 */ 00051 struct Numbers { 00052 typedef Tokenizer::size_type value_type; 00053 00054 value_type token; ///< Token number. 00055 value_type sent; ///< Sentence number. 00056 value_type para; ///< Paragraph number. 00057 00058 /** 00059 * Default constructor. 00060 */ 00061 Numbers(); 00062 }; 00063 00064 ///////////////////////////////////////////////////////////////////////////// 00065 00066 /** 00067 * A %Callback is called once per token. 00068 * This is only internally by Zorba. 00069 * You do not need to derive from this class. 00070 * The only thing you need to do is call the callback's \c operator() once 00071 * for each token you parse in \c tokenize(). 00072 */ 00073 class Callback { 00074 public: 00075 typedef Tokenizer::size_type size_type; 00076 00077 virtual ~Callback(); 00078 00079 /** 00080 * This member-function is called once per token. 00081 * 00082 * @param utf8_s The UTF-8 token string. It is not null-terminated. 00083 * @param utf8_len The number of bytes in the token string. 00084 * @param token_no The token number. Token numbers start at 0. 00085 * @param sent_no The sentence number. Sentence numbers start at 1. 00086 * @param para_no The paragraph number. Paragraph numbers start at 1. 00087 * @param payload Optional user-defined data. 00088 */ 00089 virtual void operator()( char const *utf8_s, size_type utf8_len, 00090 size_type token_no, size_type sent_no, 00091 size_type para_no, void *payload = 0 ) = 0; 00092 }; 00093 00094 ///////////////////////////////////////////////////////////////////////////// 00095 00096 /** 00097 * Destroys this %Tokenizer. 00098 * This function is called by Zorba when the %Tokenizer is no longer needed. 00099 * 00100 * If your TokenizerProvider dynamically allocates %Tokenizer objects, then 00101 * the implementation can simply be (and usually is) <code>delete this</code>. 00102 * 00103 * If your TokenizerProvider returns a pointer to a static %Tokenizer object, 00104 * then the implementation should do nothing. 00105 */ 00106 virtual void destroy() const = 0; 00107 00108 /** 00109 * Trace options for XML elements combined via bitwise-or. 00110 */ 00111 enum ElementTraceOptions { 00112 trace_none = 0x0, ///< Trace no elements. 00113 trace_begin = 0x1, ///< Trace the beginning of elements. 00114 trace_end = 0x2 ///< Trace the ending of elements. 00115 }; 00116 00117 /** 00118 * Gets the trace options. If the value is \c trace_none, then the paragraph 00119 * number will be incremented upon entering an XML element; if the value is 00120 * anything other than \c trace_none, then the tokenizer assumes 00121 * responsibility for incrementing the paragraph number. 00122 * 00123 * @return Returns said options. 00124 */ 00125 int trace_options() const { 00126 return trace_options_; 00127 } 00128 00129 /** 00130 * This function is called whenever an XML element is entered during 00131 * tokenization. Note that this function is called only if \c 00132 * trace_options() returns non-zero. 00133 * 00134 * @param qname The element's QName. 00135 * @param trace_options The bitwise-or of the trace option(s) in effect for a 00136 * particular call. 00137 * @see trace_options() 00138 */ 00139 virtual void element( Item const &qname, int trace_options ); 00140 00141 /** 00142 * Gets this %Tokenizer's associated Numbers. 00143 * 00144 * @return Returns said Numbers. 00145 */ 00146 Numbers& numbers(); 00147 00148 /** 00149 * Gets this %Tokenizer's associated Numbers. 00150 * 00151 * @return Returns said Numbers. 00152 */ 00153 Numbers const& numbers() const; 00154 00155 /** 00156 * Tokenizes the given string. 00157 * 00158 * @param utf8_s The UTF-8 string to tokenize. It need not be 00159 * null-terminated. 00160 * @param utf8_len The number of bytes in the string to be tokenized. 00161 * @param lang The language of the string. 00162 * @param wildcards If \c true, allows XQuery wildcard syntax characters to 00163 * be part of tokens. 00164 * @param callback The Callback to call once per token. 00165 * @param payload Optional user-defined data. 00166 */ 00167 virtual void tokenize( char const *utf8_s, size_type utf8_len, 00168 locale::iso639_1::type lang, bool wildcards, 00169 Callback &callback, void *payload = 0 ) = 0; 00170 00171 ///////////////////////////////////////////////////////////////////////////// 00172 00173 protected: 00174 /** 00175 * Constructs a %Tokenizer. 00176 * 00177 * @param numbers the Numbers to use. 00178 * @param trace_options The bitwise-or of the available trace options, if 00179 * any. 00180 */ 00181 Tokenizer( Numbers &numbers, int trace_options = trace_none ); 00182 00183 /** 00184 * Destroys a %Tokenizer. 00185 */ 00186 virtual ~Tokenizer() = 0; 00187 00188 private: 00189 int trace_options_; 00190 Numbers *no_; 00191 }; 00192 00193 inline Tokenizer::Numbers& Tokenizer::numbers() { 00194 return *no_; 00195 } 00196 00197 inline Tokenizer::Numbers const& Tokenizer::numbers() const { 00198 return *no_; 00199 } 00200 00201 /////////////////////////////////////////////////////////////////////////////// 00202 00203 /** 00204 * A %TokenizerProvider provides a Tokenizer for a given language. 00205 */ 00206 class ZORBA_DLL_PUBLIC TokenizerProvider { 00207 public: 00208 virtual ~TokenizerProvider(); 00209 00210 /** 00211 * Creates a new %Tokenizer. 00212 * 00213 * @param lang The language of the text that the tokenizer will tokenize. 00214 * @param numbers The Numbers to use. 00215 * @return Returns said %Tokenizer. 00216 */ 00217 virtual Tokenizer::ptr getTokenizer( locale::iso639_1::type lang, 00218 Tokenizer::Numbers &numbers ) const = 0; 00219 }; 00220 00221 /////////////////////////////////////////////////////////////////////////////// 00222 00223 } // namespace zorba 00224 #endif /* ZORBA_TOKENIZER_API_H */ 00225 /* vim:set et sw=2 ts=2: */