1// Copyright (c) 2010, Google Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8//     * Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10//     * Redistributions in binary form must reproduce the above
11// copyright notice, this list of conditions and the following disclaimer
12// in the documentation and/or other materials provided with the
13// distribution.
14//     * Neither the name of Google Inc. nor the names of its
15// contributors may be used to endorse or promote products derived from
16// this software without specific prior written permission.
17//
18// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29//
30// Author: Sanjay Ghemawat
31
32#ifdef HAVE_CONFIG_H
33#include "config.h"
34#endif
35
36#include <stdlib.h>
37#include <stdio.h>
38#include <ctype.h>
39#include <limits.h>      /* for SHRT_MIN, USHRT_MAX, etc */
40#include <string.h>      /* for memcpy */
41#include <assert.h>
42#include <errno.h>
43#include <string>
44#include <algorithm>
45
46#include "pcrecpp_internal.h"
47#include "pcre.h"
48#include "pcrecpp.h"
49#include "pcre_stringpiece.h"
50
51
52namespace pcrecpp {
53
// Upper bound on the number of capture arguments a single call accepts.
static const int kMaxArgs = 16;
// Length of the offset vector handed to PCRE: three ints for the whole
// match plus three for each possible argument (result pairs followed by
// PCRE's own workspace).
static const int kVecSize = 3 * (kMaxArgs + 1);
57
58// Special object that stands-in for no argument
59Arg RE::no_arg((void*)NULL);
60
61// This is for ABI compatibility with old versions of pcre (pre-7.6),
62// which defined a global no_arg variable instead of putting it in the
63// RE class.  This works on GCC >= 3, at least.  It definitely works
64// for ELF, but may not for other object formats (Mach-O, for
65// instance, does not support aliases.)  We could probably have a more
66// inclusive test if we ever needed it.  (Note that not only the
67// __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
68// gnu-specific.)
69#if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__)
70# define ULP_AS_STRING(x)            ULP_AS_STRING_INTERNAL(x)
71# define ULP_AS_STRING_INTERNAL(x)   #x
72# define USER_LABEL_PREFIX_STR       ULP_AS_STRING(__USER_LABEL_PREFIX__)
73extern Arg no_arg
74  __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE")));
75#endif
76
77// If a regular expression has no error, its error_ field points here
78static const string empty_string;
79
80// If the user doesn't ask for any options, we just use this one
81static RE_Options default_options;
82
83void RE::Init(const string& pat, const RE_Options* options) {
84  pattern_ = pat;
85  if (options == NULL) {
86    options_ = default_options;
87  } else {
88    options_ = *options;
89  }
90  error_ = &empty_string;
91  re_full_ = NULL;
92  re_partial_ = NULL;
93
94  re_partial_ = Compile(UNANCHORED);
95  if (re_partial_ != NULL) {
96    re_full_ = Compile(ANCHOR_BOTH);
97  }
98}
99
100void RE::Cleanup() {
101  if (re_full_ != NULL)         (*pcre_free)(re_full_);
102  if (re_partial_ != NULL)      (*pcre_free)(re_partial_);
103  if (error_ != &empty_string)  delete error_;
104}
105
106
107RE::~RE() {
108  Cleanup();
109}
110
111
112pcre* RE::Compile(Anchor anchor) {
113  // First, convert RE_Options into pcre options
114  int pcre_options = 0;
115  pcre_options = options_.all_options();
116
117  // Special treatment for anchoring.  This is needed because at
118  // runtime pcre only provides an option for anchoring at the
119  // beginning of a string (unless you use offset).
120  //
121  // There are three types of anchoring we want:
122  //    UNANCHORED      Compile the original pattern, and use
123  //                    a pcre unanchored match.
124  //    ANCHOR_START    Compile the original pattern, and use
125  //                    a pcre anchored match.
126  //    ANCHOR_BOTH     Tack a "\z" to the end of the original pattern
127  //                    and use a pcre anchored match.
128
129  const char* compile_error;
130  int eoffset;
131  pcre* re;
132  if (anchor != ANCHOR_BOTH) {
133    re = pcre_compile(pattern_.c_str(), pcre_options,
134                      &compile_error, &eoffset, NULL);
135  } else {
136    // Tack a '\z' at the end of RE.  Parenthesize it first so that
137    // the '\z' applies to all top-level alternatives in the regexp.
138    string wrapped = "(?:";  // A non-counting grouping operator
139    wrapped += pattern_;
140    wrapped += ")\\z";
141    re = pcre_compile(wrapped.c_str(), pcre_options,
142                      &compile_error, &eoffset, NULL);
143  }
144  if (re == NULL) {
145    if (error_ == &empty_string) error_ = new string(compile_error);
146  }
147  return re;
148}
149
150/***** Matching interfaces *****/
151
152bool RE::FullMatch(const StringPiece& text,
153                   const Arg& ptr1,
154                   const Arg& ptr2,
155                   const Arg& ptr3,
156                   const Arg& ptr4,
157                   const Arg& ptr5,
158                   const Arg& ptr6,
159                   const Arg& ptr7,
160                   const Arg& ptr8,
161                   const Arg& ptr9,
162                   const Arg& ptr10,
163                   const Arg& ptr11,
164                   const Arg& ptr12,
165                   const Arg& ptr13,
166                   const Arg& ptr14,
167                   const Arg& ptr15,
168                   const Arg& ptr16) const {
169  const Arg* args[kMaxArgs];
170  int n = 0;
171  if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
172  if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
173  if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
174  if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
175  if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
176  if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
177  if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
178  if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
179  if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
180  if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
181  if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
182  if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
183  if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
184  if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
185  if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
186  if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
187 done:
188
189  int consumed;
190  int vec[kVecSize];
191  return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
192}
193
194bool RE::PartialMatch(const StringPiece& text,
195                      const Arg& ptr1,
196                      const Arg& ptr2,
197                      const Arg& ptr3,
198                      const Arg& ptr4,
199                      const Arg& ptr5,
200                      const Arg& ptr6,
201                      const Arg& ptr7,
202                      const Arg& ptr8,
203                      const Arg& ptr9,
204                      const Arg& ptr10,
205                      const Arg& ptr11,
206                      const Arg& ptr12,
207                      const Arg& ptr13,
208                      const Arg& ptr14,
209                      const Arg& ptr15,
210                      const Arg& ptr16) const {
211  const Arg* args[kMaxArgs];
212  int n = 0;
213  if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
214  if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
215  if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
216  if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
217  if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
218  if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
219  if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
220  if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
221  if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
222  if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
223  if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
224  if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
225  if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
226  if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
227  if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
228  if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
229 done:
230
231  int consumed;
232  int vec[kVecSize];
233  return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
234}
235
236bool RE::Consume(StringPiece* input,
237                 const Arg& ptr1,
238                 const Arg& ptr2,
239                 const Arg& ptr3,
240                 const Arg& ptr4,
241                 const Arg& ptr5,
242                 const Arg& ptr6,
243                 const Arg& ptr7,
244                 const Arg& ptr8,
245                 const Arg& ptr9,
246                 const Arg& ptr10,
247                 const Arg& ptr11,
248                 const Arg& ptr12,
249                 const Arg& ptr13,
250                 const Arg& ptr14,
251                 const Arg& ptr15,
252                 const Arg& ptr16) const {
253  const Arg* args[kMaxArgs];
254  int n = 0;
255  if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
256  if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
257  if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
258  if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
259  if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
260  if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
261  if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
262  if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
263  if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
264  if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
265  if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
266  if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
267  if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
268  if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
269  if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
270  if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
271 done:
272
273  int consumed;
274  int vec[kVecSize];
275  if (DoMatchImpl(*input, ANCHOR_START, &consumed,
276                  args, n, vec, kVecSize)) {
277    input->remove_prefix(consumed);
278    return true;
279  } else {
280    return false;
281  }
282}
283
284bool RE::FindAndConsume(StringPiece* input,
285                        const Arg& ptr1,
286                        const Arg& ptr2,
287                        const Arg& ptr3,
288                        const Arg& ptr4,
289                        const Arg& ptr5,
290                        const Arg& ptr6,
291                        const Arg& ptr7,
292                        const Arg& ptr8,
293                        const Arg& ptr9,
294                        const Arg& ptr10,
295                        const Arg& ptr11,
296                        const Arg& ptr12,
297                        const Arg& ptr13,
298                        const Arg& ptr14,
299                        const Arg& ptr15,
300                        const Arg& ptr16) const {
301  const Arg* args[kMaxArgs];
302  int n = 0;
303  if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
304  if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
305  if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
306  if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
307  if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
308  if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
309  if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
310  if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
311  if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
312  if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
313  if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
314  if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
315  if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
316  if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
317  if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
318  if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
319 done:
320
321  int consumed;
322  int vec[kVecSize];
323  if (DoMatchImpl(*input, UNANCHORED, &consumed,
324                  args, n, vec, kVecSize)) {
325    input->remove_prefix(consumed);
326    return true;
327  } else {
328    return false;
329  }
330}
331
332bool RE::Replace(const StringPiece& rewrite,
333                 string *str) const {
334  int vec[kVecSize];
335  int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
336  if (matches == 0)
337    return false;
338
339  string s;
340  if (!Rewrite(&s, rewrite, *str, vec, matches))
341    return false;
342
343  assert(vec[0] >= 0);
344  assert(vec[1] >= 0);
345  str->replace(vec[0], vec[1] - vec[0], s);
346  return true;
347}
348
349// Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
350// Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
351// Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
352
353static int NewlineMode(int pcre_options) {
354  // TODO: if we can make it threadsafe, cache this var
355  int newline_mode = 0;
356  /* if (newline_mode) return newline_mode; */  // do this once it's cached
357  if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
358                      PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
359    newline_mode = (pcre_options &
360                    (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
361                     PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
362  } else {
363    int newline;
364    pcre_config(PCRE_CONFIG_NEWLINE, &newline);
365    if (newline == 10)
366      newline_mode = PCRE_NEWLINE_LF;
367    else if (newline == 13)
368      newline_mode = PCRE_NEWLINE_CR;
369    else if (newline == 3338)
370      newline_mode = PCRE_NEWLINE_CRLF;
371    else if (newline == -1)
372      newline_mode = PCRE_NEWLINE_ANY;
373    else if (newline == -2)
374      newline_mode = PCRE_NEWLINE_ANYCRLF;
375    else
376      assert(NULL == "Unexpected return value from pcre_config(NEWLINE)");
377  }
378  return newline_mode;
379}
380
381int RE::GlobalReplace(const StringPiece& rewrite,
382                      string *str) const {
383  int count = 0;
384  int vec[kVecSize];
385  string out;
386  int start = 0;
387  bool last_match_was_empty_string = false;
388
389  while (start <= static_cast<int>(str->length())) {
390    // If the previous match was for the empty string, we shouldn't
391    // just match again: we'll match in the same way and get an
392    // infinite loop.  Instead, we do the match in a special way:
393    // anchored -- to force another try at the same position --
394    // and with a flag saying that this time, ignore empty matches.
395    // If this special match returns, that means there's a non-empty
396    // match at this position as well, and we can continue.  If not,
397    // we do what perl does, and just advance by one.
398    // Notice that perl prints '@@@' for this;
399    //    perl -le '$_ = "aa"; s/b*|aa/@/g; print'
400    int matches;
401    if (last_match_was_empty_string) {
402      matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize);
403      if (matches <= 0) {
404        int matchend = start + 1;     // advance one character.
405        // If the current char is CR and we're in CRLF mode, skip LF too.
406        // Note it's better to call pcre_fullinfo() than to examine
407        // all_options(), since options_ could have changed bewteen
408        // compile-time and now, but this is simpler and safe enough.
409        // Modified by PH to add ANY and ANYCRLF.
410        if (matchend < static_cast<int>(str->length()) &&
411            (*str)[start] == '\r' && (*str)[matchend] == '\n' &&
412            (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
413             NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
414             NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) {
415          matchend++;
416        }
417        // We also need to advance more than one char if we're in utf8 mode.
418#ifdef SUPPORT_UTF8
419        if (options_.utf8()) {
420          while (matchend < static_cast<int>(str->length()) &&
421                 ((*str)[matchend] & 0xc0) == 0x80)
422            matchend++;
423        }
424#endif
425        if (start < static_cast<int>(str->length()))
426          out.append(*str, start, matchend - start);
427        start = matchend;
428        last_match_was_empty_string = false;
429        continue;
430      }
431    } else {
432      matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
433      if (matches <= 0)
434        break;
435    }
436    int matchstart = vec[0], matchend = vec[1];
437    assert(matchstart >= start);
438    assert(matchend >= matchstart);
439    out.append(*str, start, matchstart - start);
440    Rewrite(&out, rewrite, *str, vec, matches);
441    start = matchend;
442    count++;
443    last_match_was_empty_string = (matchstart == matchend);
444  }
445
446  if (count == 0)
447    return 0;
448
449  if (start < static_cast<int>(str->length()))
450    out.append(*str, start, str->length() - start);
451  swap(out, *str);
452  return count;
453}
454
455bool RE::Extract(const StringPiece& rewrite,
456                 const StringPiece& text,
457                 string *out) const {
458  int vec[kVecSize];
459  int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
460  if (matches == 0)
461    return false;
462  out->erase();
463  return Rewrite(out, rewrite, text, vec, matches);
464}
465
466/*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
467  string result;
468
469  // Escape any ascii character not in [A-Za-z_0-9].
470  //
471  // Note that it's legal to escape a character even if it has no
472  // special meaning in a regular expression -- so this function does
473  // that.  (This also makes it identical to the perl function of the
474  // same name; see `perldoc -f quotemeta`.)  The one exception is
475  // escaping NUL: rather than doing backslash + NUL, like perl does,
476  // we do '\0', because pcre itself doesn't take embedded NUL chars.
477  for (int ii = 0; ii < unquoted.size(); ++ii) {
478    // Note that using 'isalnum' here raises the benchmark time from
479    // 32ns to 58ns:
480    if (unquoted[ii] == '\0') {
481      result += "\\0";
482    } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
483               (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
484               (unquoted[ii] < '0' || unquoted[ii] > '9') &&
485               unquoted[ii] != '_' &&
486               // If this is the part of a UTF8 or Latin1 character, we need
487               // to copy this byte without escaping.  Experimentally this is
488               // what works correctly with the regexp library.
489               !(unquoted[ii] & 128)) {
490      result += '\\';
491      result += unquoted[ii];
492    } else {
493      result += unquoted[ii];
494    }
495  }
496
497  return result;
498}
499
500/***** Actual matching and rewriting code *****/
501
502int RE::TryMatch(const StringPiece& text,
503                 int startpos,
504                 Anchor anchor,
505                 bool empty_ok,
506                 int *vec,
507                 int vecsize) const {
508  pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
509  if (re == NULL) {
510    //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
511    return 0;
512  }
513
514  pcre_extra extra = { 0, 0, 0, 0, 0, 0 };
515  if (options_.match_limit() > 0) {
516    extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
517    extra.match_limit = options_.match_limit();
518  }
519  if (options_.match_limit_recursion() > 0) {
520    extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
521    extra.match_limit_recursion = options_.match_limit_recursion();
522  }
523
524  int options = 0;
525  if (anchor != UNANCHORED)
526    options |= PCRE_ANCHORED;
527  if (!empty_ok)
528    options |= PCRE_NOTEMPTY;
529
530  int rc = pcre_exec(re,              // The regular expression object
531                     &extra,
532                     (text.data() == NULL) ? "" : text.data(),
533                     text.size(),
534                     startpos,
535                     options,
536                     vec,
537                     vecsize);
538
539  // Handle errors
540  if (rc == PCRE_ERROR_NOMATCH) {
541    return 0;
542  } else if (rc < 0) {
543    //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
544    //        re, pattern_.c_str());
545    return 0;
546  } else if (rc == 0) {
547    // pcre_exec() returns 0 as a special case when the number of
548    // capturing subpatterns exceeds the size of the vector.
549    // When this happens, there is a match and the output vector
550    // is filled, but we miss out on the positions of the extra subpatterns.
551    rc = vecsize / 2;
552  }
553
554  return rc;
555}
556
557bool RE::DoMatchImpl(const StringPiece& text,
558                     Anchor anchor,
559                     int* consumed,
560                     const Arg* const* args,
561                     int n,
562                     int* vec,
563                     int vecsize) const {
564  assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
565  int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
566  assert(matches >= 0);  // TryMatch never returns negatives
567  if (matches == 0)
568    return false;
569
570  *consumed = vec[1];
571
572  if (n == 0 || args == NULL) {
573    // We are not interested in results
574    return true;
575  }
576
577  if (NumberOfCapturingGroups() < n) {
578    // RE has fewer capturing groups than number of arg pointers passed in
579    return false;
580  }
581
582  // If we got here, we must have matched the whole pattern.
583  // We do not need (can not do) any more checks on the value of 'matches' here
584  // -- see the comment for TryMatch.
585  for (int i = 0; i < n; i++) {
586    const int start = vec[2*(i+1)];
587    const int limit = vec[2*(i+1)+1];
588    if (!args[i]->Parse(text.data() + start, limit-start)) {
589      // TODO: Should we indicate what the error was?
590      return false;
591    }
592  }
593
594  return true;
595}
596
597bool RE::DoMatch(const StringPiece& text,
598                 Anchor anchor,
599                 int* consumed,
600                 const Arg* const args[],
601                 int n) const {
602  assert(n >= 0);
603  size_t const vecsize = (1 + n) * 3;  // results + PCRE workspace
604                                       // (as for kVecSize)
605  int space[21];   // use stack allocation for small vecsize (common case)
606  int* vec = vecsize <= 21 ? space : new int[vecsize];
607  bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, (int)vecsize);
608  if (vec != space) delete [] vec;
609  return retval;
610}
611
612bool RE::Rewrite(string *out, const StringPiece &rewrite,
613                 const StringPiece &text, int *vec, int veclen) const {
614  for (const char *s = rewrite.data(), *end = s + rewrite.size();
615       s < end; s++) {
616    int c = *s;
617    if (c == '\\') {
618      c = *++s;
619      if (isdigit(c)) {
620        int n = (c - '0');
621        if (n >= veclen) {
622          //fprintf(stderr, requested group %d in regexp %.*s\n",
623          //        n, rewrite.size(), rewrite.data());
624          return false;
625        }
626        int start = vec[2 * n];
627        if (start >= 0)
628          out->append(text.data() + start, vec[2 * n + 1] - start);
629      } else if (c == '\\') {
630        *out += '\\';
631      } else {
632        //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
633        //        rewrite.size(), rewrite.data());
634        return false;
635      }
636    } else {
637      *out += c;
638    }
639  }
640  return true;
641}
642
643// Return the number of capturing subpatterns, or -1 if the
644// regexp wasn't valid on construction.
645int RE::NumberOfCapturingGroups() const {
646  if (re_partial_ == NULL) return -1;
647
648  int result;
649  int pcre_retval = pcre_fullinfo(re_partial_,  // The regular expression object
650                                  NULL,         // We did not study the pattern
651                                  PCRE_INFO_CAPTURECOUNT,
652                                  &result);
653  assert(pcre_retval == 0);
654  return result;
655}
656
657/***** Parsers for various types *****/
658
659bool Arg::parse_null(const char* str, int n, void* dest) {
660  // We fail if somebody asked us to store into a non-NULL void* pointer
661  return (dest == NULL);
662}
663
664bool Arg::parse_string(const char* str, int n, void* dest) {
665  if (dest == NULL) return true;
666  reinterpret_cast<string*>(dest)->assign(str, n);
667  return true;
668}
669
670bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
671  if (dest == NULL) return true;
672  reinterpret_cast<StringPiece*>(dest)->set(str, n);
673  return true;
674}
675
676bool Arg::parse_char(const char* str, int n, void* dest) {
677  if (n != 1) return false;
678  if (dest == NULL) return true;
679  *(reinterpret_cast<char*>(dest)) = str[0];
680  return true;
681}
682
683bool Arg::parse_uchar(const char* str, int n, void* dest) {
684  if (n != 1) return false;
685  if (dest == NULL) return true;
686  *(reinterpret_cast<unsigned char*>(dest)) = str[0];
687  return true;
688}
689
// Largest number spec that we are willing to parse
static const int kMaxNumberLength = 32;

// Prepares the n-byte number text at "str" for the strtoxxx() routines.
// REQUIRES "buf" must have length at least kMaxNumberLength+1
// REQUIRES "n > 0"
// Returns one of:
//      a. "buf", holding a null-terminated copy of the text, whenever
//         the text fits (n <= kMaxNumberLength)
//      b. "str" if the text is too long to copy and the character right
//         after it cannot extend the number
//      c. "" if the input was invalid and has no hope of being parsed
//
// Text that fits is always copied.  Parsing straight out of "str" would
// let strtol() and friends read past the end of the text: for instance
// the text "0" followed by "x1f" is consumed as the hex number 0x1f in
// radix 16 or 0, and the caller's end-of-number check then fails.
// Copying also avoids looking at str[n], which lies outside the text.
static const char* TerminateNumber(char* buf, const char* str, int n) {
  if (n < 0) return "";   // Not a valid length; nothing to parse
  // The <ctype.h> functions require a value representable as unsigned
  // char, so convert before classifying.
  if ((n > 0) && isspace(static_cast<unsigned char>(*str))) {
    // We are less forgiving than the strtoxxx() routines and do not
    // allow leading spaces.
    return "";
  }

  if (n <= kMaxNumberLength) {
    memcpy(buf, str, n);
    buf[n] = '\0';
    return buf;
  }

  // Too long for "buf".  See if the character right after the input
  // text may potentially look like a digit; if it does, the text cannot
  // be delimited and is too big to be a valid number.
  const unsigned char next = static_cast<unsigned char>(str[n]);
  if (isdigit(next) ||
      ((next >= 'a') && (next <= 'f')) ||
      ((next >= 'A') && (next <= 'F'))) {
    return "";
  }

  // The following character delimits the number, so parse right out of
  // the supplied string.
  return str;
}
721
722bool Arg::parse_long_radix(const char* str,
723                           int n,
724                           void* dest,
725                           int radix) {
726  if (n == 0) return false;
727  char buf[kMaxNumberLength+1];
728  str = TerminateNumber(buf, str, n);
729  char* end;
730  errno = 0;
731  long r = strtol(str, &end, radix);
732  if (end != str + n) return false;   // Leftover junk
733  if (errno) return false;
734  if (dest == NULL) return true;
735  *(reinterpret_cast<long*>(dest)) = r;
736  return true;
737}
738
739bool Arg::parse_ulong_radix(const char* str,
740                            int n,
741                            void* dest,
742                            int radix) {
743  if (n == 0) return false;
744  char buf[kMaxNumberLength+1];
745  str = TerminateNumber(buf, str, n);
746  if (str[0] == '-') return false;    // strtoul() on a negative number?!
747  char* end;
748  errno = 0;
749  unsigned long r = strtoul(str, &end, radix);
750  if (end != str + n) return false;   // Leftover junk
751  if (errno) return false;
752  if (dest == NULL) return true;
753  *(reinterpret_cast<unsigned long*>(dest)) = r;
754  return true;
755}
756
757bool Arg::parse_short_radix(const char* str,
758                            int n,
759                            void* dest,
760                            int radix) {
761  long r;
762  if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
763  if (r < SHRT_MIN || r > SHRT_MAX) return false;       // Out of range
764  if (dest == NULL) return true;
765  *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
766  return true;
767}
768
769bool Arg::parse_ushort_radix(const char* str,
770                             int n,
771                             void* dest,
772                             int radix) {
773  unsigned long r;
774  if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
775  if (r > USHRT_MAX) return false;                      // Out of range
776  if (dest == NULL) return true;
777  *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
778  return true;
779}
780
781bool Arg::parse_int_radix(const char* str,
782                          int n,
783                          void* dest,
784                          int radix) {
785  long r;
786  if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
787  if (r < INT_MIN || r > INT_MAX) return false;         // Out of range
788  if (dest == NULL) return true;
789  *(reinterpret_cast<int*>(dest)) = r;
790  return true;
791}
792
793bool Arg::parse_uint_radix(const char* str,
794                           int n,
795                           void* dest,
796                           int radix) {
797  unsigned long r;
798  if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
799  if (r > UINT_MAX) return false;                       // Out of range
800  if (dest == NULL) return true;
801  *(reinterpret_cast<unsigned int*>(dest)) = r;
802  return true;
803}
804
805bool Arg::parse_longlong_radix(const char* str,
806                               int n,
807                               void* dest,
808                               int radix) {
809#ifndef HAVE_LONG_LONG
810  return false;
811#else
812  if (n == 0) return false;
813  char buf[kMaxNumberLength+1];
814  str = TerminateNumber(buf, str, n);
815  char* end;
816  errno = 0;
817#if defined HAVE_STRTOQ
818  long long r = strtoq(str, &end, radix);
819#elif defined HAVE_STRTOLL
820  long long r = strtoll(str, &end, radix);
821#elif defined HAVE__STRTOI64
822  long long r = _strtoi64(str, &end, radix);
823#elif defined HAVE_STRTOIMAX
824  long long r = strtoimax(str, &end, radix);
825#else
826#error parse_longlong_radix: cannot convert input to a long-long
827#endif
828  if (end != str + n) return false;   // Leftover junk
829  if (errno) return false;
830  if (dest == NULL) return true;
831  *(reinterpret_cast<long long*>(dest)) = r;
832  return true;
833#endif   /* HAVE_LONG_LONG */
834}
835
836bool Arg::parse_ulonglong_radix(const char* str,
837                                int n,
838                                void* dest,
839                                int radix) {
840#ifndef HAVE_UNSIGNED_LONG_LONG
841  return false;
842#else
843  if (n == 0) return false;
844  char buf[kMaxNumberLength+1];
845  str = TerminateNumber(buf, str, n);
846  if (str[0] == '-') return false;    // strtoull() on a negative number?!
847  char* end;
848  errno = 0;
849#if defined HAVE_STRTOQ
850  unsigned long long r = strtouq(str, &end, radix);
851#elif defined HAVE_STRTOLL
852  unsigned long long r = strtoull(str, &end, radix);
853#elif defined HAVE__STRTOI64
854  unsigned long long r = _strtoui64(str, &end, radix);
855#elif defined HAVE_STRTOIMAX
856  unsigned long long r = strtoumax(str, &end, radix);
857#else
858#error parse_ulonglong_radix: cannot convert input to a long-long
859#endif
860  if (end != str + n) return false;   // Leftover junk
861  if (errno) return false;
862  if (dest == NULL) return true;
863  *(reinterpret_cast<unsigned long long*>(dest)) = r;
864  return true;
865#endif   /* HAVE_UNSIGNED_LONG_LONG */
866}
867
868bool Arg::parse_double(const char* str, int n, void* dest) {
869  if (n == 0) return false;
870  static const int kMaxLength = 200;
871  char buf[kMaxLength];
872  if (n >= kMaxLength) return false;
873  memcpy(buf, str, n);
874  buf[n] = '\0';
875  errno = 0;
876  char* end;
877  double r = strtod(buf, &end);
878  if (end != buf + n) return false;   // Leftover junk
879  if (errno) return false;
880  if (dest == NULL) return true;
881  *(reinterpret_cast<double*>(dest)) = r;
882  return true;
883}
884
885bool Arg::parse_float(const char* str, int n, void* dest) {
886  double r;
887  if (!parse_double(str, n, &r)) return false;
888  if (dest == NULL) return true;
889  *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
890  return true;
891}
892
893
894#define DEFINE_INTEGER_PARSERS(name)                                    \
895  bool Arg::parse_##name(const char* str, int n, void* dest) {          \
896    return parse_##name##_radix(str, n, dest, 10);                      \
897  }                                                                     \
898  bool Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
899    return parse_##name##_radix(str, n, dest, 16);                      \
900  }                                                                     \
901  bool Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
902    return parse_##name##_radix(str, n, dest, 8);                       \
903  }                                                                     \
904  bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
905    return parse_##name##_radix(str, n, dest, 0);                       \
906  }
907
908DEFINE_INTEGER_PARSERS(short)      /*                                   */
909DEFINE_INTEGER_PARSERS(ushort)     /*                                   */
910DEFINE_INTEGER_PARSERS(int)        /* Don't use semicolons after these  */
911DEFINE_INTEGER_PARSERS(uint)       /* statements because they can cause */
912DEFINE_INTEGER_PARSERS(long)       /* compiler warnings if the checking */
913DEFINE_INTEGER_PARSERS(ulong)      /* level is turned up high enough.   */
914DEFINE_INTEGER_PARSERS(longlong)   /*                                   */
915DEFINE_INTEGER_PARSERS(ulonglong)  /*                                   */
916
917#undef DEFINE_INTEGER_PARSERS
918
919}   // namespace pcrecpp
920