34 #ifdef PCRE_CONFIG_UTF8 35 RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
38 RegExp::RegExp(
const UString &p,
int f)
39 : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
42 #ifdef PCRE_CONFIG_UTF8 43 if (utf8Support == Unknown) {
45 pcre_config(PCRE_CONFIG_UTF8, (
void*)&supported);
46 utf8Support = supported ? Supported : Unsupported;
58 const char*
const nil =
"\\x00";
61 for (
int i = 0; i < p.
size(); ++i) {
71 for (j = 0; j < 4; ++j) {
72 if (i + 1 < p.
size() && Lexer::isHexDigit(p[i + 1].unicode())) {
73 u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
78 fprintf(stderr,
"KJS: saw %d digit \\u sequence.\n", j);
130 #ifdef HAVE_PCREPOSIX 132 const char *perrormsg;
135 if (flgs & IgnoreCase)
136 pcreflags |= PCRE_CASELESS;
138 if (flgs & Multiline)
139 pcreflags |= PCRE_MULTILINE;
141 #ifdef PCRE_CONFIG_UTF8 142 if (utf8Support == Supported)
143 pcreflags |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
148 prepareMatch(intern);
150 pcregex = pcre_compile(buffer, pcreflags,
151 &perrormsg, &errorOffset, NULL);
155 fprintf(stderr,
"KJS: pcre_compile() failed with '%s'\n", perrormsg);
161 #ifdef PCRE_INFO_CAPTURECOUNT 163 int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
172 regflags |= REG_EXTENDED;
175 if ( f & IgnoreCase )
176 regflags |= REG_ICASE;
184 int errorCode = regcomp(&preg, intern.
ascii(), regflags);
185 if (errorCode != 0) {
187 char errorMessage[80];
188 regerror(errorCode, &preg, errorMessage,
sizeof errorMessage);
189 fprintf(stderr,
"KJS: regcomp failed with '%s'\n", errorMessage);
199 #ifdef HAVE_PCREPOSIX 208 void RegExp::prepareUtf8(
const UString& s)
211 const int length = s.
size();
212 buffer =
new char[length * 3 + 1];
216 originalPos =
new int[length * 3 + 2];
222 int *posOut = originalPos;
224 for (
int i = 0; i != length; ++i) {
225 unsigned short c = d[i].
unicode();
231 }
else if (c < 0x800) {
232 *p++ = (char)((c >> 6) | 0xC0);
233 *p++ = (char)((c | 0x80) & 0xBF);
236 *p++ = (char)((c >> 12) | 0xE0);
237 *p++ = (char)(((c >> 6) | 0x80) & 0xBF);
238 *p++ = (char)((c | 0x80) & 0xBF);
242 while (sequenceLen > 0) {
249 bufferSize = p - buffer;
255 *(posOut+1) = length+1;
258 void RegExp::prepareASCII (
const UString& s)
266 buffer =
new char[truncated.size() + 1];
267 memcpy(buffer, truncated.c_str(), truncated.size());
268 buffer[truncated.size()] =
'\0';
269 bufferSize = truncated.size();
272 void RegExp::prepareMatch(
const UString &s)
274 delete[] originalPos;
276 #ifdef PCRE_CONFIG_UTF8 277 if (utf8Support == Supported)
288 void RegExp::doneMatch()
290 delete[] originalPos; originalPos = 0;
291 delete[] buffer; buffer = 0;
294 UString RegExp::match(
const UString &s,
int i,
int *pos,
int **ovector)
297 assert(s.
data() == originalS.data());
312 #ifdef HAVE_PCREPOSIX 313 int ovecsize = (nrSubPatterns+1)*3;
314 if (ovector) *ovector =
new int[ovecsize];
321 #ifdef PCRE_CONFIG_UTF8 322 if (utf8Support == Supported) {
324 while (originalPos[startPos] < i)
329 while (originalPos[nextPos] < (i + 1))
336 nextPos = i + (i < s.
size() ? 1 : 0);
340 #ifdef PCRE_CONFIG_UTF8 341 utf8Support == Supported ? PCRE_NO_UTF8_CHECK :
344 int numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, startPos,
345 m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED | baseFlags) : baseFlags,
346 ovector ? *ovector : 0L, ovecsize);
350 if (numMatches == PCRE_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && ovector && startPos < nextPos)
356 fprintf(stderr,
"No match after m_notEmpty. +1 and keep going.\n");
359 numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, baseFlags,
360 ovector ? *ovector : 0L, ovecsize);
370 if (ovector && originalPos) {
371 for (
unsigned c = 0; c < 2 * TQMIN((
unsigned)numMatches, nrSubPatterns+1); ++c) {
372 if ((*ovector)[c] != -1)
373 (*ovector)[c] = originalPos[(*ovector)[c]];
380 const uint maxMatch = 10;
381 regmatch_t rmatch[maxMatch];
383 char *str = strdup(s.
ascii());
384 if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
391 *pos = rmatch[0].rm_so + i;
392 return s.
substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
397 for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) {
401 if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo)
405 if (nrSubPatterns == 0) nrSubPatterns = 1;
407 int ovecsize = (nrSubPatterns)*3;
408 *ovector =
new int[ovecsize];
409 for (uint j = 0; j < nrSubPatterns; j++) {
410 (*ovector)[2*j] = rmatch[j].rm_so + i;
411 (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
415 *pos = (*ovector)[0];
416 if ( *pos == (*ovector)[1] && (flgs & Global) )
421 return s.
substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
425 bool RegExp::test(
const UString &s,
int)
427 #ifdef HAVE_PCREPOSIX 432 pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
433 0, ovector, 300) == PCRE_ERROR_NOMATCH)
440 char *str = strdup(s.
ascii());
441 int r = regexec(&preg, str, 0, 0, 0);
UString substr(int pos=0, int len=-1) const
static UString null
Static instance of a null string.
int find(const UString &f, int pos=0) const
char * ascii() const
Convert the Unicode string to plain ASCII chars, chopping off any higher bytes.
8 bit char based string class
const UChar * data() const
unsigned short unicode() const
UString & append(const UString &)
Append another string.