• Skip to content
  • Skip to link menu
Trinity API Reference
  • Trinity API Reference
  • kjs
 

kjs

  • kjs
regexp.cpp
1 // -*- c-basic-offset: 2 -*-
2 /*
3  * This file is part of the KDE libraries
4  * Copyright (C) 1999-2001 Harri Porten (porten@kde.org)
5  * Copyright (C) 2003,2004 Apple Computer, Inc.
6  * Copyright (C) 2006 Maksim Orlovich (maksim@kde.org)
7  *
8  * This library is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2 of the License, or (at your option) any later version.
12  *
13  * This library is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with this library; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  *
22  */
23 
24 #include "regexp.h"
25 
26 #include "lexer.h"
27 #include <assert.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 
32 using namespace KJS;
33 
34 #ifdef PCRE_CONFIG_UTF8
35 RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
36 #endif
37 
38 RegExp::RegExp(const UString &p, int f)
39  : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
40 {
41  // Determine whether libpcre has unicode support if need be..
42 #ifdef PCRE_CONFIG_UTF8
43  if (utf8Support == Unknown) {
44  int supported;
45  pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
46  utf8Support = supported ? Supported : Unsupported;
47  }
48 #endif
49 
50  nrSubPatterns = 0; // determined in match() with POSIX regex.
51 
52  // JS regexps can contain Unicode escape sequences (\uxxxx) which
53  // are rather uncommon elsewhere. As our regexp libs don't understand
54  // them we do the unescaping ourselves internally.
55  // Also make sure to expand out any nulls as pcre_compile
56  // expects null termination..
57  UString intern;
58  const char* const nil = "\\x00";
59  if (p.find('\\') >= 0 || p.find(KJS::UChar('\0')) >= 0) {
60  bool escape = false;
61  for (int i = 0; i < p.size(); ++i) {
62  UChar c = p[i];
63  if (escape) {
64  escape = false;
65  // we only care about \u
66  if (c == 'u') {
67  // standard unicode escape sequence looks like \uxxxx but
68  // other browsers also accept less then 4 hex digits
69  unsigned short u = 0;
70  int j = 0;
71  for (j = 0; j < 4; ++j) {
72  if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
73  u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
74  ++i;
75  } else {
76  // sequence incomplete. restore index.
77  // TODO: cleaner way to propagate warning
78  fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
79  i -= j;
80  break;
81  }
82  }
83  if (j < 4) {
84  // sequence was incomplete. treat \u as u which IE always
85  // and FF sometimes does.
86  intern.append(UString('u'));
87  } else {
88  c = UChar(u);
89  switch (u) {
90  case 0:
91  // Make sure to encode 0, to avoid terminating the string
92  intern += UString(nil);
93  break;
94  case '^':
95  case '$':
96  case '\\':
97  case '.':
98  case '*':
99  case '+':
100  case '?':
101  case '(': case ')':
102  case '{': case '}':
103  case '[': case ']':
104  case '|':
105  // escape pattern characters have to remain escaped
106  intern.append(UString('\\'));
107  // intentional fallthrough
108  default:
109  intern += UString(&c, 1);
110  break;
111  }
112  }
113  continue;
114  }
115  intern += UString('\\');
116  intern += UString(&c, 1);
117  } else {
118  if (c == '\\')
119  escape = true;
120  else if (c == '\0')
121  intern += UString(nil);
122  else
123  intern += UString(&c, 1);
124  }
125  }
126  } else {
127  intern = p;
128  }
129 
130 #ifdef HAVE_PCREPOSIX
131  int pcreflags = 0;
132  const char *perrormsg;
133  int errorOffset;
134 
135  if (flgs & IgnoreCase)
136  pcreflags |= PCRE_CASELESS;
137 
138  if (flgs & Multiline)
139  pcreflags |= PCRE_MULTILINE;
140 
141 #ifdef PCRE_CONFIG_UTF8
142  if (utf8Support == Supported)
143  pcreflags |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
144 #endif
145 
146  // Fill our buffer with an encoded version, whether utf-8, or,
147  // if PCRE is incapable, truncated.
148  prepareMatch(intern);
149 
150  pcregex = pcre_compile(buffer, pcreflags,
151  &perrormsg, &errorOffset, NULL);
152  doneMatch(); // Cleanup buffers
153  if (!pcregex) {
154 #ifndef NDEBUG
155  fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
156 #endif
157  valid = false;
158  return;
159  }
160 
161 #ifdef PCRE_INFO_CAPTURECOUNT
162  // Get number of subpatterns that will be returned
163  int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
164  if (rc != 0)
165 #endif
166  nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
167 
168 #else /* HAVE_PCREPOSIX */
169 
170  int regflags = 0;
171 #ifdef REG_EXTENDED
172  regflags |= REG_EXTENDED;
173 #endif
174 #ifdef REG_ICASE
175  if ( f & IgnoreCase )
176  regflags |= REG_ICASE;
177 #endif
178 
179  //NOTE: Multiline is not feasible with POSIX regex.
180  //if ( f & Multiline )
181  // ;
182  // Note: the Global flag is already handled by RegExpProtoFunc::execute
183 
184  int errorCode = regcomp(&preg, intern.ascii(), regflags);
185  if (errorCode != 0) {
186 #ifndef NDEBUG
187  char errorMessage[80];
188  regerror(errorCode, &preg, errorMessage, sizeof errorMessage);
189  fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage);
190 #endif
191  valid = false;
192  }
193 #endif
194 }
195 
196 RegExp::~RegExp()
197 {
198  doneMatch(); // Be 100% sure buffers are freed
199 #ifdef HAVE_PCREPOSIX
200  if (pcregex)
201  pcre_free(pcregex);
202 #else
203  /* TODO: is this really okay after an error ? */
204  regfree(&preg);
205 #endif
206 }
207 
208 void RegExp::prepareUtf8(const UString& s)
209 {
210  // Allocate a buffer big enough to hold all the characters plus \0
211  const int length = s.size();
212  buffer = new char[length * 3 + 1];
213 
214  // Also create buffer for positions. We need one extra character in there,
215  // even past the \0 since the non-empty handling may jump one past the end
216  originalPos = new int[length * 3 + 2];
217 
218  // Convert to runs of 8-bit characters, and generate indeces
219  // Note that we do NOT combine surrogate pairs here, as
220  // regexps operate on them as separate characters
221  char *p = buffer;
222  int *posOut = originalPos;
223  const UChar *d = s.data();
224  for (int i = 0; i != length; ++i) {
225  unsigned short c = d[i].unicode();
226 
227  int sequenceLen;
228  if (c < 0x80) {
229  *p++ = (char)c;
230  sequenceLen = 1;
231  } else if (c < 0x800) {
232  *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
233  *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
234  sequenceLen = 2;
235  } else {
236  *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
237  *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
238  *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
239  sequenceLen = 3;
240  }
241 
242  while (sequenceLen > 0) {
243  *posOut = i;
244  ++posOut;
245  --sequenceLen;
246  }
247  }
248 
249  bufferSize = p - buffer;
250 
251  *p++ = '\0';
252 
253  // Record positions for \0, and the fictional character after that.
254  *posOut = length;
255  *(posOut+1) = length+1;
256 }
257 
258 void RegExp::prepareASCII (const UString& s)
259 {
260  originalPos = 0;
261 
262  // Best-effort attempt to get something done
263  // when we don't have utf 8 available -- use
264  // truncated version, and pray for the best
265  CString truncated = s.cstring();
266  buffer = new char[truncated.size() + 1];
267  memcpy(buffer, truncated.c_str(), truncated.size());
268  buffer[truncated.size()] = '\0'; // For _compile use
269  bufferSize = truncated.size();
270 }
271 
272 void RegExp::prepareMatch(const UString &s)
273 {
274  delete[] originalPos; // Just to be sure..
275  delete[] buffer;
276 #ifdef PCRE_CONFIG_UTF8
277  if (utf8Support == Supported)
278  prepareUtf8(s);
279  else
280 #endif
281  prepareASCII(s);
282 
283 #ifndef NDEBUG
284  originalS = s;
285 #endif
286 }
287 
288 void RegExp::doneMatch()
289 {
290  delete[] originalPos; originalPos = 0;
291  delete[] buffer; buffer = 0;
292 }
293 
294 UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
295 {
296 #ifndef NDEBUG
297  assert(s.data() == originalS.data()); // Make sure prepareMatch got called right..
298 #endif
299  assert(valid);
300 
301  if (i < 0)
302  i = 0;
303  if (ovector)
304  *ovector = 0L;
305  int dummyPos;
306  if (!pos)
307  pos = &dummyPos;
308  *pos = -1;
309  if (i > s.size() || s.isNull())
310  return UString::null;
311 
312 #ifdef HAVE_PCREPOSIX
313  int ovecsize = (nrSubPatterns+1)*3; // see pcre docu
314  if (ovector) *ovector = new int[ovecsize];
315  if (!pcregex)
316  return UString::null;
317 
318  int startPos;
319  int nextPos;
320 
321 #ifdef PCRE_CONFIG_UTF8
322  if (utf8Support == Supported) {
323  startPos = i;
324  while (originalPos[startPos] < i)
325  ++startPos;
326 
327  nextPos = startPos;
328  if (i < s.size()) {
329  while (originalPos[nextPos] < (i + 1))
330  ++nextPos;
331  }
332  } else
333 #endif
334  {
335  startPos = i;
336  nextPos = i + (i < s.size() ? 1 : 0);
337  }
338 
339  int baseFlags =
340 #ifdef PCRE_CONFIG_UTF8
341  utf8Support == Supported ? PCRE_NO_UTF8_CHECK :
342 #endif
343  0;
344  int numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, startPos,
345  m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED | baseFlags) : baseFlags, // see man pcretest
346  ovector ? *ovector : 0L, ovecsize);
347  if (numMatches < 0)
348  {
349  // Failed to match.
350  if (numMatches == PCRE_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && ovector && startPos < nextPos)
351  {
352  // We set m_notEmpty ourselves, to look for a non-empty match
353  // (see man pcretest or pcretest.c for details).
354  // So we don't stop here, we want to try again at i+1.
355 #ifdef KJS_VERBOSE
356  fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
357 #endif
358  m_notEmpty = 0;
359  numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, baseFlags,
360  ovector ? *ovector : 0L, ovecsize);
361  if (numMatches < 0)
362  return UString::null;
363  }
364  else // done
365  return UString::null;
366  }
367 
368  // Got a match, proceed with it.
369  // But fix up the ovector if need be..
370  if (ovector && originalPos) {
371  for (unsigned c = 0; c < 2 * TQMIN((unsigned)numMatches, nrSubPatterns+1); ++c) {
372  if ((*ovector)[c] != -1)
373  (*ovector)[c] = originalPos[(*ovector)[c]];
374  }
375  }
376 
377  if (!ovector)
378  return UString::null; // don't rely on the return value if you pass ovector==0
379 #else
380  const uint maxMatch = 10;
381  regmatch_t rmatch[maxMatch];
382 
383  char *str = strdup(s.ascii()); // TODO: why ???
384  if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
385  free(str);
386  return UString::null;
387  }
388  free(str);
389 
390  if (!ovector) {
391  *pos = rmatch[0].rm_so + i;
392  return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
393  }
394 
395  // map rmatch array to ovector used in PCRE case
396  nrSubPatterns = 0;
397  for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) {
398  nrSubPatterns++;
399  // if the nonEmpty flag is set, return a failed match if any of the
400  // subMatches happens to be an empty string.
401  if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo)
402  return UString::null;
403  }
404  // Allow an ovector slot to return the (failed) match result.
405  if (nrSubPatterns == 0) nrSubPatterns = 1;
406 
407  int ovecsize = (nrSubPatterns)*3; // see above
408  *ovector = new int[ovecsize];
409  for (uint j = 0; j < nrSubPatterns; j++) {
410  (*ovector)[2*j] = rmatch[j].rm_so + i;
411  (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
412  }
413 #endif
414 
415  *pos = (*ovector)[0];
416  if ( *pos == (*ovector)[1] && (flgs & Global) )
417  {
418  // empty match, next try will be with m_notEmpty=true
419  m_notEmpty=true;
420  }
421  return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
422 }
423 
#if 0 // unused
// Disabled helper: reports whether the expression matches s, without
// exposing any match details.
bool RegExp::test(const UString &s, int)
{
#ifdef HAVE_PCREPOSIX
  int ovector[300];
  CString buffer(s.cstring());

  if (s.isNull())
    return false;

  // Anything other than an explicit "no match" is reported as a match.
  const int rc = pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
                           0, ovector, 300);
  return rc != PCRE_ERROR_NOMATCH;

#else

  char *subject = strdup(s.ascii());
  const bool matched = regexec(&preg, subject, 0, 0, 0) == 0;
  free(subject);

  return matched;
#endif
}
#endif
KJS::UString::substr
UString substr(int pos=0, int len=-1) const
Definition: ustring.cpp:869
KJS::UString::null
static UString null
Static instance of a null string.
Definition: ustring.h:429
KJS::UString::find
int find(const UString &f, int pos=0) const
Definition: ustring.cpp:799
KJS::UString::ascii
char * ascii() const
Convert the Unicode string to plain ASCII chars, chopping off any higher bytes.
Definition: ustring.cpp:486
KJS::CString
8 bit char based string class
Definition: ustring.h:166
KJS::UString::data
const UChar * data() const
Definition: ustring.h:340
KJS::UString
Unicode string class.
Definition: ustring.h:190
KJS::UChar
Unicode character.
Definition: ustring.h:52
KJS
Definition: array_instance.h:28
KJS::UString::isNull
bool isNull() const
Definition: ustring.h:344
KJS::UChar::unicode
unsigned short unicode() const
Definition: ustring.h:82
KJS::UString::append
UString & append(const UString &)
Append another string.
Definition: ustring.cpp:458
KJS::UString::cstring
CString cstring() const
Definition: ustring.cpp:481
KJS::UString::size
int size() const
Definition: ustring.h:360

kjs

Skip menu "kjs"
  • Main Page
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Class Members
  • Related Pages

kjs

Skip menu "kjs"
  • arts
  • dcop
  • dnssd
  • interfaces
  •     interface
  •     library
  •   kspeech
  •   ktexteditor
  • kabc
  • kate
  • kcmshell
  • kdecore
  • kded
  • kdefx
  • kdeprint
  • kdesu
  • kdeui
  • kdoctools
  • khtml
  • kimgio
  • kinit
  • kio
  •   bookmarks
  •   httpfilter
  •   kfile
  •   kio
  •   kioexec
  •   kpasswdserver
  •   kssl
  • kioslave
  •   http
  • kjs
  • kmdi
  •   kmdi
  • knewstuff
  • kparts
  • krandr
  • kresources
  • kspell2
  • kunittest
  • kutils
  • kwallet
  • libkmid
  • libkscreensaver
Generated for kjs by doxygen 1.8.13
This website is maintained by Timothy Pearson.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. |