regexp.cpp
00001 // -*- c-basic-offset: 2 -*- 00002 /* 00003 * This file is part of the KDE libraries 00004 * Copyright (C) 1999-2001 Harri Porten (porten@kde.org) 00005 * Copyright (C) 2003,2004 Apple Computer, Inc. 00006 * Copyright (C) 2006 Maksim Orlovich (maksim@kde.org) 00007 * 00008 * This library is free software; you can redistribute it and/or 00009 * modify it under the terms of the GNU Lesser General Public 00010 * License as published by the Free Software Foundation; either 00011 * version 2 of the License, or (at your option) any later version. 00012 * 00013 * This library is distributed in the hope that it will be useful, 00014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 * Lesser General Public License for more details. 00017 * 00018 * You should have received a copy of the GNU Lesser General Public 00019 * License along with this library; if not, write to the Free Software 00020 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00021 * 00022 */ 00023 00024 #include "regexp.h" 00025 00026 #include "lexer.h" 00027 #include <assert.h> 00028 #include <stdio.h> 00029 #include <stdlib.h> 00030 #include <string.h> 00031 00032 using namespace KJS; 00033 00034 #ifdef PCRE_CONFIG_UTF8 00035 RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown; 00036 #endif 00037 00038 RegExp::RegExp(const UString &p, int f) 00039 : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0) 00040 { 00041 // Determine whether libpcre has unicode support if need be.. 00042 #ifdef PCRE_CONFIG_UTF8 00043 if (utf8Support == Unknown) { 00044 int supported; 00045 pcre_config(PCRE_CONFIG_UTF8, (void*)&supported); 00046 utf8Support = supported ? Supported : Unsupported; 00047 } 00048 #endif 00049 00050 nrSubPatterns = 0; // determined in match() with POSIX regex. 00051 00052 // JS regexps can contain Unicode escape sequences (\uxxxx) which 00053 // are rather uncommon elsewhere. As our regexp libs don't understand 00054 // them we do the unescaping ourselves internally. 00055 // Also make sure to expand out any nulls as pcre_compile 00056 // expects null termination.. 00057 UString intern; 00058 const char* const nil = "\\x00"; 00059 if (p.find('\\') >= 0 || p.find(KJS::UChar('\0')) >= 0) { 00060 bool escape = false; 00061 for (int i = 0; i < p.size(); ++i) { 00062 UChar c = p[i]; 00063 if (escape) { 00064 escape = false; 00065 // we only care about \u 00066 if (c == 'u') { 00067 // standard unicode escape sequence looks like \uxxxx but 00068 // other browsers also accept less then 4 hex digits 00069 unsigned short u = 0; 00070 int j = 0; 00071 for (j = 0; j < 4; ++j) { 00072 if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) { 00073 u = (u << 4) + Lexer::convertHex(p[i + 1].unicode()); 00074 ++i; 00075 } else { 00076 // sequence incomplete. restore index. 00077 // TODO: cleaner way to propagate warning 00078 fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j); 00079 i -= j; 00080 break; 00081 } 00082 } 00083 if (j < 4) { 00084 // sequence was incomplete. treat \u as u which IE always 00085 // and FF sometimes does. 00086 intern.append(UString('u')); 00087 } else { 00088 c = UChar(u); 00089 switch (u) { 00090 case 0: 00091 // Make sure to encode 0, to avoid terminating the string 00092 intern += UString(nil); 00093 break; 00094 case '^': 00095 case '$': 00096 case '\\': 00097 case '.': 00098 case '*': 00099 case '+': 00100 case '?': 00101 case '(': case ')': 00102 case '{': case '}': 00103 case '[': case ']': 00104 case '|': 00105 // escape pattern characters have to remain escaped 00106 intern.append(UString('\\')); 00107 // intentional fallthrough 00108 default: 00109 intern += UString(&c, 1); 00110 break; 00111 } 00112 } 00113 continue; 00114 } 00115 intern += UString('\\'); 00116 intern += UString(&c, 1); 00117 } else { 00118 if (c == '\\') 00119 escape = true; 00120 else if (c == '\0') 00121 intern += UString(nil); 00122 else 00123 intern += UString(&c, 1); 00124 } 00125 } 00126 } else { 00127 intern = p; 00128 } 00129 00130 #ifdef HAVE_PCREPOSIX 00131 int pcreflags = 0; 00132 const char *perrormsg; 00133 int errorOffset; 00134 00135 if (flgs & IgnoreCase) 00136 pcreflags |= PCRE_CASELESS; 00137 00138 if (flgs & Multiline) 00139 pcreflags |= PCRE_MULTILINE; 00140 00141 #ifdef PCRE_CONFIG_UTF8 00142 if (utf8Support == Supported) 00143 pcreflags |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK); 00144 #endif 00145 00146 // Fill our buffer with an encoded version, whether utf-8, or, 00147 // if PCRE is incapable, truncated. 00148 prepareMatch(intern); 00149 00150 pcregex = pcre_compile(buffer, pcreflags, 00151 &perrormsg, &errorOffset, NULL); 00152 doneMatch(); // Cleanup buffers 00153 if (!pcregex) { 00154 #ifndef NDEBUG 00155 fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg); 00156 #endif 00157 valid = false; 00158 return; 00159 } 00160 00161 #ifdef PCRE_INFO_CAPTURECOUNT 00162 // Get number of subpatterns that will be returned 00163 int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns); 00164 if (rc != 0) 00165 #endif 00166 nrSubPatterns = 0; // fallback. We always need the first pair of offsets. 00167 00168 #else /* HAVE_PCREPOSIX */ 00169 00170 int regflags = 0; 00171 #ifdef REG_EXTENDED 00172 regflags |= REG_EXTENDED; 00173 #endif 00174 #ifdef REG_ICASE 00175 if ( f & IgnoreCase ) 00176 regflags |= REG_ICASE; 00177 #endif 00178 00179 //NOTE: Multiline is not feasible with POSIX regex. 00180 //if ( f & Multiline ) 00181 // ; 00182 // Note: the Global flag is already handled by RegExpProtoFunc::execute 00183 00184 int errorCode = regcomp(&preg, intern.ascii(), regflags); 00185 if (errorCode != 0) { 00186 #ifndef NDEBUG 00187 char errorMessage[80]; 00188 regerror(errorCode, &preg, errorMessage, sizeof errorMessage); 00189 fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage); 00190 #endif 00191 valid = false; 00192 } 00193 #endif 00194 } 00195 00196 RegExp::~RegExp() 00197 { 00198 doneMatch(); // Be 100% sure buffers are freed 00199 #ifdef HAVE_PCREPOSIX 00200 if (pcregex) 00201 pcre_free(pcregex); 00202 #else 00203 /* TODO: is this really okay after an error ? */ 00204 regfree(&preg); 00205 #endif 00206 } 00207 00208 void RegExp::prepareUtf8(const UString& s) 00209 { 00210 // Allocate a buffer big enough to hold all the characters plus \0 00211 const int length = s.size(); 00212 buffer = new char[length * 3 + 1]; 00213 00214 // Also create buffer for positions. We need one extra character in there, 00215 // even past the \0 since the non-empty handling may jump one past the end 00216 originalPos = new int[length * 3 + 2]; 00217 00218 // Convert to runs of 8-bit characters, and generate indeces 00219 // Note that we do NOT combine surrogate pairs here, as 00220 // regexps operate on them as separate characters 00221 char *p = buffer; 00222 int *posOut = originalPos; 00223 const UChar *d = s.data(); 00224 for (int i = 0; i != length; ++i) { 00225 unsigned short c = d[i].unicode(); 00226 00227 int sequenceLen; 00228 if (c < 0x80) { 00229 *p++ = (char)c; 00230 sequenceLen = 1; 00231 } else if (c < 0x800) { 00232 *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8 00233 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set 00234 sequenceLen = 2; 00235 } else { 00236 *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8 00237 *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set 00238 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set 00239 sequenceLen = 3; 00240 } 00241 00242 while (sequenceLen > 0) { 00243 *posOut = i; 00244 ++posOut; 00245 --sequenceLen; 00246 } 00247 } 00248 00249 bufferSize = p - buffer; 00250 00251 *p++ = '\0'; 00252 00253 // Record positions for \0, and the fictional character after that. 00254 *posOut = length; 00255 *(posOut+1) = length+1; 00256 } 00257 00258 void RegExp::prepareASCII (const UString& s) 00259 { 00260 originalPos = 0; 00261 00262 // Best-effort attempt to get something done 00263 // when we don't have utf 8 available -- use 00264 // truncated version, and pray for the best 00265 CString truncated = s.cstring(); 00266 buffer = new char[truncated.size() + 1]; 00267 memcpy(buffer, truncated.c_str(), truncated.size()); 00268 buffer[truncated.size()] = '\0'; // For _compile use 00269 bufferSize = truncated.size(); 00270 } 00271 00272 void RegExp::prepareMatch(const UString &s) 00273 { 00274 delete[] originalPos; // Just to be sure.. 00275 delete[] buffer; 00276 #ifdef PCRE_CONFIG_UTF8 00277 if (utf8Support == Supported) 00278 prepareUtf8(s); 00279 else 00280 #endif 00281 prepareASCII(s); 00282 00283 #ifndef NDEBUG 00284 originalS = s; 00285 #endif 00286 } 00287 00288 void RegExp::doneMatch() 00289 { 00290 delete[] originalPos; originalPos = 0; 00291 delete[] buffer; buffer = 0; 00292 } 00293 00294 UString RegExp::match(const UString &s, int i, int *pos, int **ovector) 00295 { 00296 #ifndef NDEBUG 00297 assert(s.data() == originalS.data()); // Make sure prepareMatch got called right.. 00298 #endif 00299 assert(valid); 00300 00301 if (i < 0) 00302 i = 0; 00303 if (ovector) 00304 *ovector = 0L; 00305 int dummyPos; 00306 if (!pos) 00307 pos = &dummyPos; 00308 *pos = -1; 00309 if (i > s.size() || s.isNull()) 00310 return UString::null; 00311 00312 #ifdef HAVE_PCREPOSIX 00313 int ovecsize = (nrSubPatterns+1)*3; // see pcre docu 00314 if (ovector) *ovector = new int[ovecsize]; 00315 if (!pcregex) 00316 return UString::null; 00317 00318 int startPos; 00319 int nextPos; 00320 00321 #ifdef PCRE_CONFIG_UTF8 00322 if (utf8Support == Supported) { 00323 startPos = i; 00324 while (originalPos[startPos] < i) 00325 ++startPos; 00326 00327 nextPos = startPos; 00328 while (originalPos[nextPos] < (i + 1)) 00329 ++nextPos; 00330 } else 00331 #endif 00332 { 00333 startPos = i; 00334 nextPos = i + 1; 00335 } 00336 00337 int baseFlags = 00338 #ifdef PCRE_CONFIG_UTF8 00339 utf8Support == Supported ? PCRE_NO_UTF8_CHECK : 00340 #endif 00341 0; 00342 if (pcre_exec(pcregex, NULL, buffer, bufferSize, startPos, 00343 m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED | baseFlags) : baseFlags, // see man pcretest 00344 ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH) 00345 { 00346 // Failed to match. 00347 if ((flgs & Global) && m_notEmpty && ovector) 00348 { 00349 // We set m_notEmpty ourselves, to look for a non-empty match 00350 // (see man pcretest or pcretest.c for details). 00351 // So we don't stop here, we want to try again at i+1. 00352 #ifdef KJS_VERBOSE 00353 fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n"); 00354 #endif 00355 m_notEmpty = 0; 00356 if (pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, baseFlags, 00357 ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH) 00358 return UString::null; 00359 } 00360 else // done 00361 return UString::null; 00362 } 00363 00364 // Got a match, proceed with it. 00365 // But fix up the ovector if need be.. 00366 if (ovector && originalPos) { 00367 for (unsigned c = 0; c < 2 * (nrSubPatterns + 1); ++c) { 00368 if ((*ovector)[c] != -1) 00369 (*ovector)[c] = originalPos[(*ovector)[c]]; 00370 } 00371 } 00372 00373 if (!ovector) 00374 return UString::null; // don't rely on the return value if you pass ovector==0 00375 #else 00376 const uint maxMatch = 10; 00377 regmatch_t rmatch[maxMatch]; 00378 00379 char *str = strdup(s.ascii()); // TODO: why ??? 00380 if (regexec(&preg, str + i, maxMatch, rmatch, 0)) { 00381 free(str); 00382 return UString::null; 00383 } 00384 free(str); 00385 00386 if (!ovector) { 00387 *pos = rmatch[0].rm_so + i; 00388 return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so); 00389 } 00390 00391 // map rmatch array to ovector used in PCRE case 00392 nrSubPatterns = 0; 00393 for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) { 00394 nrSubPatterns++; 00395 // if the nonEmpty flag is set, return a failed match if any of the 00396 // subMatches happens to be an empty string. 00397 if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo) 00398 return UString::null; 00399 } 00400 // Allow an ovector slot to return the (failed) match result. 00401 if (nrSubPatterns == 0) nrSubPatterns = 1; 00402 00403 int ovecsize = (nrSubPatterns)*3; // see above 00404 *ovector = new int[ovecsize]; 00405 for (uint j = 0; j < nrSubPatterns; j++) { 00406 (*ovector)[2*j] = rmatch[j].rm_so + i; 00407 (*ovector)[2*j+1] = rmatch[j].rm_eo + i; 00408 } 00409 #endif 00410 00411 *pos = (*ovector)[0]; 00412 if ( *pos == (*ovector)[1] && (flgs & Global) ) 00413 { 00414 // empty match, next try will be with m_notEmpty=true 00415 m_notEmpty=true; 00416 } 00417 return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]); 00418 } 00419 00420 #if 0 // unused 00421 bool RegExp::test(const UString &s, int) 00422 { 00423 #ifdef HAVE_PCREPOSIX 00424 int ovector[300]; 00425 CString buffer(s.cstring()); 00426 00427 if (s.isNull() || 00428 pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0, 00429 0, ovector, 300) == PCRE_ERROR_NOMATCH) 00430 return false; 00431 else 00432 return true; 00433 00434 #else 00435 00436 char *str = strdup(s.ascii()); 00437 int r = regexec(&preg, str, 0, 0, 0); 00438 free(str); 00439 00440 return r == 0; 00441 #endif 00442 } 00443 #endif