1// Copyright (c) 2010, Google Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8//     * Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10//     * Redistributions in binary form must reproduce the above
11// copyright notice, this list of conditions and the following disclaimer
12// in the documentation and/or other materials provided with the
13// distribution.
14//     * Neither the name of Google Inc. nor the names of its
15// contributors may be used to endorse or promote products derived from
16// this software without specific prior written permission.
17//
18// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29//
30// Author: Sanjay Ghemawat
31
32#ifdef HAVE_CONFIG_H
33#include "config.h"
34#endif
35
36#include <stdlib.h>
37#include <stdio.h>
38#include <ctype.h>
39#include <limits.h>      /* for SHRT_MIN, USHRT_MAX, etc */
40#include <assert.h>
41#include <errno.h>
42#include <string>
43#include <algorithm>
44
45#include "pcrecpp_internal.h"
46#include "pcre.h"
47#include "pcrecpp.h"
48#include "pcre_stringpiece.h"
49
50
51namespace pcrecpp {
52
53// Maximum number of args we can set
54static const int kMaxArgs = 16;
55static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace
56
57// Special object that stands-in for no argument
58Arg RE::no_arg((void*)NULL);
59
60// This is for ABI compatibility with old versions of pcre (pre-7.6),
61// which defined a global no_arg variable instead of putting it in the
62// RE class.  This works on GCC >= 3, at least.  It definitely works
63// for ELF, but may not for other object formats (Mach-O, for
64// instance, does not support aliases.)  We could probably have a more
65// inclusive test if we ever needed it.  (Note that not only the
66// __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
67// gnu-specific.)
68#if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__)
69# define ULP_AS_STRING(x)            ULP_AS_STRING_INTERNAL(x)
70# define ULP_AS_STRING_INTERNAL(x)   #x
71# define USER_LABEL_PREFIX_STR       ULP_AS_STRING(__USER_LABEL_PREFIX__)
72extern Arg no_arg
73  __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE")));
74#endif
75
// Shared "no error" marker.  RE::Init() points error_ at this object, and
// RE::Compile()/RE::Cleanup() compare error_ against its address to tell
// whether a heap-allocated error message is currently owned.
static const string empty_string;

// Options used by RE::Init() when the caller passes a NULL options
// pointer.
static RE_Options default_options;
81
82void RE::Init(const string& pat, const RE_Options* options) {
83  pattern_ = pat;
84  if (options == NULL) {
85    options_ = default_options;
86  } else {
87    options_ = *options;
88  }
89  error_ = &empty_string;
90  re_full_ = NULL;
91  re_partial_ = NULL;
92
93  re_partial_ = Compile(UNANCHORED);
94  if (re_partial_ != NULL) {
95    re_full_ = Compile(ANCHOR_BOTH);
96  }
97}
98
99void RE::Cleanup() {
100  if (re_full_ != NULL)         (*pcre_free)(re_full_);
101  if (re_partial_ != NULL)      (*pcre_free)(re_partial_);
102  if (error_ != &empty_string)  delete error_;
103}
104
105
106RE::~RE() {
107  Cleanup();
108}
109
110
111pcre* RE::Compile(Anchor anchor) {
112  // First, convert RE_Options into pcre options
113  int pcre_options = 0;
114  pcre_options = options_.all_options();
115
116  // Special treatment for anchoring.  This is needed because at
117  // runtime pcre only provides an option for anchoring at the
118  // beginning of a string (unless you use offset).
119  //
120  // There are three types of anchoring we want:
121  //    UNANCHORED      Compile the original pattern, and use
122  //                    a pcre unanchored match.
123  //    ANCHOR_START    Compile the original pattern, and use
124  //                    a pcre anchored match.
125  //    ANCHOR_BOTH     Tack a "\z" to the end of the original pattern
126  //                    and use a pcre anchored match.
127
128  const char* compile_error;
129  int eoffset;
130  pcre* re;
131  if (anchor != ANCHOR_BOTH) {
132    re = pcre_compile(pattern_.c_str(), pcre_options,
133                      &compile_error, &eoffset, NULL);
134  } else {
135    // Tack a '\z' at the end of RE.  Parenthesize it first so that
136    // the '\z' applies to all top-level alternatives in the regexp.
137    string wrapped = "(?:";  // A non-counting grouping operator
138    wrapped += pattern_;
139    wrapped += ")\\z";
140    re = pcre_compile(wrapped.c_str(), pcre_options,
141                      &compile_error, &eoffset, NULL);
142  }
143  if (re == NULL) {
144    if (error_ == &empty_string) error_ = new string(compile_error);
145  }
146  return re;
147}
148
149/***** Matching interfaces *****/
150
151bool RE::FullMatch(const StringPiece& text,
152                   const Arg& ptr1,
153                   const Arg& ptr2,
154                   const Arg& ptr3,
155                   const Arg& ptr4,
156                   const Arg& ptr5,
157                   const Arg& ptr6,
158                   const Arg& ptr7,
159                   const Arg& ptr8,
160                   const Arg& ptr9,
161                   const Arg& ptr10,
162                   const Arg& ptr11,
163                   const Arg& ptr12,
164                   const Arg& ptr13,
165                   const Arg& ptr14,
166                   const Arg& ptr15,
167                   const Arg& ptr16) const {
168  const Arg* args[kMaxArgs];
169  int n = 0;
170  if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
171  if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
172  if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
173  if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
174  if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
175  if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
176  if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
177  if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
178  if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
179  if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
180  if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
181  if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
182  if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
183  if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
184  if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
185  if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
186 done:
187
188  int consumed;
189  int vec[kVecSize];
190  return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
191}
192
193bool RE::PartialMatch(const StringPiece& text,
194                      const Arg& ptr1,
195                      const Arg& ptr2,
196                      const Arg& ptr3,
197                      const Arg& ptr4,
198                      const Arg& ptr5,
199                      const Arg& ptr6,
200                      const Arg& ptr7,
201                      const Arg& ptr8,
202                      const Arg& ptr9,
203                      const Arg& ptr10,
204                      const Arg& ptr11,
205                      const Arg& ptr12,
206                      const Arg& ptr13,
207                      const Arg& ptr14,
208                      const Arg& ptr15,
209                      const Arg& ptr16) const {
210  const Arg* args[kMaxArgs];
211  int n = 0;
212  if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
213  if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
214  if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
215  if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
216  if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
217  if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
218  if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
219  if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
220  if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
221  if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
222  if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
223  if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
224  if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
225  if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
226  if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
227  if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
228 done:
229
230  int consumed;
231  int vec[kVecSize];
232  return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
233}
234
235bool RE::Consume(StringPiece* input,
236                 const Arg& ptr1,
237                 const Arg& ptr2,
238                 const Arg& ptr3,
239                 const Arg& ptr4,
240                 const Arg& ptr5,
241                 const Arg& ptr6,
242                 const Arg& ptr7,
243                 const Arg& ptr8,
244                 const Arg& ptr9,
245                 const Arg& ptr10,
246                 const Arg& ptr11,
247                 const Arg& ptr12,
248                 const Arg& ptr13,
249                 const Arg& ptr14,
250                 const Arg& ptr15,
251                 const Arg& ptr16) const {
252  const Arg* args[kMaxArgs];
253  int n = 0;
254  if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
255  if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
256  if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
257  if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
258  if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
259  if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
260  if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
261  if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
262  if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
263  if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
264  if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
265  if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
266  if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
267  if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
268  if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
269  if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
270 done:
271
272  int consumed;
273  int vec[kVecSize];
274  if (DoMatchImpl(*input, ANCHOR_START, &consumed,
275                  args, n, vec, kVecSize)) {
276    input->remove_prefix(consumed);
277    return true;
278  } else {
279    return false;
280  }
281}
282
283bool RE::FindAndConsume(StringPiece* input,
284                        const Arg& ptr1,
285                        const Arg& ptr2,
286                        const Arg& ptr3,
287                        const Arg& ptr4,
288                        const Arg& ptr5,
289                        const Arg& ptr6,
290                        const Arg& ptr7,
291                        const Arg& ptr8,
292                        const Arg& ptr9,
293                        const Arg& ptr10,
294                        const Arg& ptr11,
295                        const Arg& ptr12,
296                        const Arg& ptr13,
297                        const Arg& ptr14,
298                        const Arg& ptr15,
299                        const Arg& ptr16) const {
300  const Arg* args[kMaxArgs];
301  int n = 0;
302  if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
303  if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
304  if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
305  if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
306  if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
307  if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
308  if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
309  if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
310  if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
311  if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
312  if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
313  if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
314  if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
315  if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
316  if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
317  if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
318 done:
319
320  int consumed;
321  int vec[kVecSize];
322  if (DoMatchImpl(*input, UNANCHORED, &consumed,
323                  args, n, vec, kVecSize)) {
324    input->remove_prefix(consumed);
325    return true;
326  } else {
327    return false;
328  }
329}
330
331bool RE::Replace(const StringPiece& rewrite,
332                 string *str) const {
333  int vec[kVecSize];
334  int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
335  if (matches == 0)
336    return false;
337
338  string s;
339  if (!Rewrite(&s, rewrite, *str, vec, matches))
340    return false;
341
342  assert(vec[0] >= 0);
343  assert(vec[1] >= 0);
344  str->replace(vec[0], vec[1] - vec[0], s);
345  return true;
346}
347
348// Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
349// Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
350// Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
351
352static int NewlineMode(int pcre_options) {
353  // TODO: if we can make it threadsafe, cache this var
354  int newline_mode = 0;
355  /* if (newline_mode) return newline_mode; */  // do this once it's cached
356  if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
357                      PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
358    newline_mode = (pcre_options &
359                    (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
360                     PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
361  } else {
362    int newline;
363    pcre_config(PCRE_CONFIG_NEWLINE, &newline);
364    if (newline == 10)
365      newline_mode = PCRE_NEWLINE_LF;
366    else if (newline == 13)
367      newline_mode = PCRE_NEWLINE_CR;
368    else if (newline == 3338)
369      newline_mode = PCRE_NEWLINE_CRLF;
370    else if (newline == -1)
371      newline_mode = PCRE_NEWLINE_ANY;
372    else if (newline == -2)
373      newline_mode = PCRE_NEWLINE_ANYCRLF;
374    else
375      assert(NULL == "Unexpected return value from pcre_config(NEWLINE)");
376  }
377  return newline_mode;
378}
379
380int RE::GlobalReplace(const StringPiece& rewrite,
381                      string *str) const {
382  int count = 0;
383  int vec[kVecSize];
384  string out;
385  int start = 0;
386  int lastend = -1;
387  bool last_match_was_empty_string = false;
388
389  while (start <= static_cast<int>(str->length())) {
390    // If the previous match was for the empty string, we shouldn't
391    // just match again: we'll match in the same way and get an
392    // infinite loop.  Instead, we do the match in a special way:
393    // anchored -- to force another try at the same position --
394    // and with a flag saying that this time, ignore empty matches.
395    // If this special match returns, that means there's a non-empty
396    // match at this position as well, and we can continue.  If not,
397    // we do what perl does, and just advance by one.
398    // Notice that perl prints '@@@' for this;
399    //    perl -le '$_ = "aa"; s/b*|aa/@/g; print'
400    int matches;
401    if (last_match_was_empty_string) {
402      matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize);
403      if (matches <= 0) {
404        int matchend = start + 1;     // advance one character.
405        // If the current char is CR and we're in CRLF mode, skip LF too.
406        // Note it's better to call pcre_fullinfo() than to examine
407        // all_options(), since options_ could have changed bewteen
408        // compile-time and now, but this is simpler and safe enough.
409        // Modified by PH to add ANY and ANYCRLF.
410        if (matchend < static_cast<int>(str->length()) &&
411            (*str)[start] == '\r' && (*str)[matchend] == '\n' &&
412            (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
413             NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
414             NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) {
415          matchend++;
416        }
417        // We also need to advance more than one char if we're in utf8 mode.
418#ifdef SUPPORT_UTF8
419        if (options_.utf8()) {
420          while (matchend < static_cast<int>(str->length()) &&
421                 ((*str)[matchend] & 0xc0) == 0x80)
422            matchend++;
423        }
424#endif
425        if (start < static_cast<int>(str->length()))
426          out.append(*str, start, matchend - start);
427        start = matchend;
428        last_match_was_empty_string = false;
429        continue;
430      }
431    } else {
432      matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
433      if (matches <= 0)
434        break;
435    }
436    int matchstart = vec[0], matchend = vec[1];
437    assert(matchstart >= start);
438    assert(matchend >= matchstart);
439    out.append(*str, start, matchstart - start);
440    Rewrite(&out, rewrite, *str, vec, matches);
441    start = matchend;
442    lastend = matchend;
443    count++;
444    last_match_was_empty_string = (matchstart == matchend);
445  }
446
447  if (count == 0)
448    return 0;
449
450  if (start < static_cast<int>(str->length()))
451    out.append(*str, start, str->length() - start);
452  swap(out, *str);
453  return count;
454}
455
456bool RE::Extract(const StringPiece& rewrite,
457                 const StringPiece& text,
458                 string *out) const {
459  int vec[kVecSize];
460  int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
461  if (matches == 0)
462    return false;
463  out->erase();
464  return Rewrite(out, rewrite, text, vec, matches);
465}
466
467/*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
468  string result;
469
470  // Escape any ascii character not in [A-Za-z_0-9].
471  //
472  // Note that it's legal to escape a character even if it has no
473  // special meaning in a regular expression -- so this function does
474  // that.  (This also makes it identical to the perl function of the
475  // same name; see `perldoc -f quotemeta`.)  The one exception is
476  // escaping NUL: rather than doing backslash + NUL, like perl does,
477  // we do '\0', because pcre itself doesn't take embedded NUL chars.
478  for (int ii = 0; ii < unquoted.size(); ++ii) {
479    // Note that using 'isalnum' here raises the benchmark time from
480    // 32ns to 58ns:
481    if (unquoted[ii] == '\0') {
482      result += "\\0";
483    } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
484               (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
485               (unquoted[ii] < '0' || unquoted[ii] > '9') &&
486               unquoted[ii] != '_' &&
487               // If this is the part of a UTF8 or Latin1 character, we need
488               // to copy this byte without escaping.  Experimentally this is
489               // what works correctly with the regexp library.
490               !(unquoted[ii] & 128)) {
491      result += '\\';
492      result += unquoted[ii];
493    } else {
494      result += unquoted[ii];
495    }
496  }
497
498  return result;
499}
500
501/***** Actual matching and rewriting code *****/
502
503int RE::TryMatch(const StringPiece& text,
504                 int startpos,
505                 Anchor anchor,
506                 bool empty_ok,
507                 int *vec,
508                 int vecsize) const {
509  pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
510  if (re == NULL) {
511    //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
512    return 0;
513  }
514
515  pcre_extra extra = { 0, 0, 0, 0, 0, 0 };
516  if (options_.match_limit() > 0) {
517    extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
518    extra.match_limit = options_.match_limit();
519  }
520  if (options_.match_limit_recursion() > 0) {
521    extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
522    extra.match_limit_recursion = options_.match_limit_recursion();
523  }
524
525  int options = 0;
526  if (anchor != UNANCHORED)
527    options |= PCRE_ANCHORED;
528  if (!empty_ok)
529    options |= PCRE_NOTEMPTY;
530
531  int rc = pcre_exec(re,              // The regular expression object
532                     &extra,
533                     (text.data() == NULL) ? "" : text.data(),
534                     text.size(),
535                     startpos,
536                     options,
537                     vec,
538                     vecsize);
539
540  // Handle errors
541  if (rc == PCRE_ERROR_NOMATCH) {
542    return 0;
543  } else if (rc < 0) {
544    //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
545    //        re, pattern_.c_str());
546    return 0;
547  } else if (rc == 0) {
548    // pcre_exec() returns 0 as a special case when the number of
549    // capturing subpatterns exceeds the size of the vector.
550    // When this happens, there is a match and the output vector
551    // is filled, but we miss out on the positions of the extra subpatterns.
552    rc = vecsize / 2;
553  }
554
555  return rc;
556}
557
558bool RE::DoMatchImpl(const StringPiece& text,
559                     Anchor anchor,
560                     int* consumed,
561                     const Arg* const* args,
562                     int n,
563                     int* vec,
564                     int vecsize) const {
565  assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
566  int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
567  assert(matches >= 0);  // TryMatch never returns negatives
568  if (matches == 0)
569    return false;
570
571  *consumed = vec[1];
572
573  if (n == 0 || args == NULL) {
574    // We are not interested in results
575    return true;
576  }
577
578  if (NumberOfCapturingGroups() < n) {
579    // RE has fewer capturing groups than number of arg pointers passed in
580    return false;
581  }
582
583  // If we got here, we must have matched the whole pattern.
584  // We do not need (can not do) any more checks on the value of 'matches' here
585  // -- see the comment for TryMatch.
586  for (int i = 0; i < n; i++) {
587    const int start = vec[2*(i+1)];
588    const int limit = vec[2*(i+1)+1];
589    if (!args[i]->Parse(text.data() + start, limit-start)) {
590      // TODO: Should we indicate what the error was?
591      return false;
592    }
593  }
594
595  return true;
596}
597
598bool RE::DoMatch(const StringPiece& text,
599                 Anchor anchor,
600                 int* consumed,
601                 const Arg* const args[],
602                 int n) const {
603  assert(n >= 0);
604  size_t const vecsize = (1 + n) * 3;  // results + PCRE workspace
605                                       // (as for kVecSize)
606  int space[21];   // use stack allocation for small vecsize (common case)
607  int* vec = vecsize <= 21 ? space : new int[vecsize];
608  bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize);
609  if (vec != space) delete [] vec;
610  return retval;
611}
612
613bool RE::Rewrite(string *out, const StringPiece &rewrite,
614                 const StringPiece &text, int *vec, int veclen) const {
615  for (const char *s = rewrite.data(), *end = s + rewrite.size();
616       s < end; s++) {
617    int c = *s;
618    if (c == '\\') {
619      c = *++s;
620      if (isdigit(c)) {
621        int n = (c - '0');
622        if (n >= veclen) {
623          //fprintf(stderr, requested group %d in regexp %.*s\n",
624          //        n, rewrite.size(), rewrite.data());
625          return false;
626        }
627        int start = vec[2 * n];
628        if (start >= 0)
629          out->append(text.data() + start, vec[2 * n + 1] - start);
630      } else if (c == '\\') {
631        *out += '\\';
632      } else {
633        //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
634        //        rewrite.size(), rewrite.data());
635        return false;
636      }
637    } else {
638      *out += c;
639    }
640  }
641  return true;
642}
643
644// Return the number of capturing subpatterns, or -1 if the
645// regexp wasn't valid on construction.
646int RE::NumberOfCapturingGroups() const {
647  if (re_partial_ == NULL) return -1;
648
649  int result;
650  int pcre_retval = pcre_fullinfo(re_partial_,  // The regular expression object
651                                  NULL,         // We did not study the pattern
652                                  PCRE_INFO_CAPTURECOUNT,
653                                  &result);
654  assert(pcre_retval == 0);
655  return result;
656}
657
658/***** Parsers for various types *****/
659
660bool Arg::parse_null(const char* str, int n, void* dest) {
661  // We fail if somebody asked us to store into a non-NULL void* pointer
662  return (dest == NULL);
663}
664
665bool Arg::parse_string(const char* str, int n, void* dest) {
666  if (dest == NULL) return true;
667  reinterpret_cast<string*>(dest)->assign(str, n);
668  return true;
669}
670
671bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
672  if (dest == NULL) return true;
673  reinterpret_cast<StringPiece*>(dest)->set(str, n);
674  return true;
675}
676
677bool Arg::parse_char(const char* str, int n, void* dest) {
678  if (n != 1) return false;
679  if (dest == NULL) return true;
680  *(reinterpret_cast<char*>(dest)) = str[0];
681  return true;
682}
683
684bool Arg::parse_uchar(const char* str, int n, void* dest) {
685  if (n != 1) return false;
686  if (dest == NULL) return true;
687  *(reinterpret_cast<unsigned char*>(dest)) = str[0];
688  return true;
689}
690
// Largest number spec that we are willing to parse
static const int kMaxNumberLength = 32;

// Prepares the n bytes at "str" for parsing by a strtoxxx() routine,
// which stops only at the first character that cannot be part of a
// number.
//
// REQUIRES "buf" must have length at least kMaxNumberLength+1
// REQUIRES "n > 0"
// NOTE: str[n], the character right after the input, is always read.
//
// Returns one of:
//      a. "str" if no termination is needed, i.e. the character after
//         the input cannot be mistaken for a (hex) digit
//      b. "buf" if the string was copied and null-terminated
//      c. "" if the input was invalid and has no hope of being parsed
//         (leading whitespace, or too long to copy into "buf")
static const char* TerminateNumber(char* buf, const char* str, int n) {
  // The <ctype.h> classifiers require a value representable as unsigned
  // char (or EOF); passing a negative plain char is undefined behaviour,
  // so convert first.
  if ((n > 0) && isspace(static_cast<unsigned char>(*str))) {
    // We are less forgiving than the strtoxxx() routines and do not
    // allow leading spaces.
    return "";
  }

  // See if the character right after the input text may potentially
  // look like a digit.
  const char next = str[n];
  const bool next_looks_numeric =
      isdigit(static_cast<unsigned char>(next)) ||
      ((next >= 'a') && (next <= 'f')) ||
      ((next >= 'A') && (next <= 'F'));
  if (!next_looks_numeric) {
    // We can parse right out of the supplied string, so return it.
    return str;
  }

  if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
  memcpy(buf, str, n);
  buf[n] = '\0';
  return buf;
}
722
723bool Arg::parse_long_radix(const char* str,
724                           int n,
725                           void* dest,
726                           int radix) {
727  if (n == 0) return false;
728  char buf[kMaxNumberLength+1];
729  str = TerminateNumber(buf, str, n);
730  char* end;
731  errno = 0;
732  long r = strtol(str, &end, radix);
733  if (end != str + n) return false;   // Leftover junk
734  if (errno) return false;
735  if (dest == NULL) return true;
736  *(reinterpret_cast<long*>(dest)) = r;
737  return true;
738}
739
740bool Arg::parse_ulong_radix(const char* str,
741                            int n,
742                            void* dest,
743                            int radix) {
744  if (n == 0) return false;
745  char buf[kMaxNumberLength+1];
746  str = TerminateNumber(buf, str, n);
747  if (str[0] == '-') return false;    // strtoul() on a negative number?!
748  char* end;
749  errno = 0;
750  unsigned long r = strtoul(str, &end, radix);
751  if (end != str + n) return false;   // Leftover junk
752  if (errno) return false;
753  if (dest == NULL) return true;
754  *(reinterpret_cast<unsigned long*>(dest)) = r;
755  return true;
756}
757
758bool Arg::parse_short_radix(const char* str,
759                            int n,
760                            void* dest,
761                            int radix) {
762  long r;
763  if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
764  if (r < SHRT_MIN || r > SHRT_MAX) return false;       // Out of range
765  if (dest == NULL) return true;
766  *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
767  return true;
768}
769
770bool Arg::parse_ushort_radix(const char* str,
771                             int n,
772                             void* dest,
773                             int radix) {
774  unsigned long r;
775  if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
776  if (r > USHRT_MAX) return false;                      // Out of range
777  if (dest == NULL) return true;
778  *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
779  return true;
780}
781
782bool Arg::parse_int_radix(const char* str,
783                          int n,
784                          void* dest,
785                          int radix) {
786  long r;
787  if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
788  if (r < INT_MIN || r > INT_MAX) return false;         // Out of range
789  if (dest == NULL) return true;
790  *(reinterpret_cast<int*>(dest)) = r;
791  return true;
792}
793
794bool Arg::parse_uint_radix(const char* str,
795                           int n,
796                           void* dest,
797                           int radix) {
798  unsigned long r;
799  if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
800  if (r > UINT_MAX) return false;                       // Out of range
801  if (dest == NULL) return true;
802  *(reinterpret_cast<unsigned int*>(dest)) = r;
803  return true;
804}
805
806bool Arg::parse_longlong_radix(const char* str,
807                               int n,
808                               void* dest,
809                               int radix) {
810#ifndef HAVE_LONG_LONG
811  return false;
812#else
813  if (n == 0) return false;
814  char buf[kMaxNumberLength+1];
815  str = TerminateNumber(buf, str, n);
816  char* end;
817  errno = 0;
818#if defined HAVE_STRTOQ
819  long long r = strtoq(str, &end, radix);
820#elif defined HAVE_STRTOLL
821  long long r = strtoll(str, &end, radix);
822#elif defined HAVE__STRTOI64
823  long long r = _strtoi64(str, &end, radix);
824#elif defined HAVE_STRTOIMAX
825  long long r = strtoimax(str, &end, radix);
826#else
827#error parse_longlong_radix: cannot convert input to a long-long
828#endif
829  if (end != str + n) return false;   // Leftover junk
830  if (errno) return false;
831  if (dest == NULL) return true;
832  *(reinterpret_cast<long long*>(dest)) = r;
833  return true;
834#endif   /* HAVE_LONG_LONG */
835}
836
837bool Arg::parse_ulonglong_radix(const char* str,
838                                int n,
839                                void* dest,
840                                int radix) {
841#ifndef HAVE_UNSIGNED_LONG_LONG
842  return false;
843#else
844  if (n == 0) return false;
845  char buf[kMaxNumberLength+1];
846  str = TerminateNumber(buf, str, n);
847  if (str[0] == '-') return false;    // strtoull() on a negative number?!
848  char* end;
849  errno = 0;
850#if defined HAVE_STRTOQ
851  unsigned long long r = strtouq(str, &end, radix);
852#elif defined HAVE_STRTOLL
853  unsigned long long r = strtoull(str, &end, radix);
854#elif defined HAVE__STRTOI64
855  unsigned long long r = _strtoui64(str, &end, radix);
856#elif defined HAVE_STRTOIMAX
857  unsigned long long r = strtoumax(str, &end, radix);
858#else
859#error parse_ulonglong_radix: cannot convert input to a long-long
860#endif
861  if (end != str + n) return false;   // Leftover junk
862  if (errno) return false;
863  if (dest == NULL) return true;
864  *(reinterpret_cast<unsigned long long*>(dest)) = r;
865  return true;
866#endif   /* HAVE_UNSIGNED_LONG_LONG */
867}
868
869bool Arg::parse_double(const char* str, int n, void* dest) {
870  if (n == 0) return false;
871  static const int kMaxLength = 200;
872  char buf[kMaxLength];
873  if (n >= kMaxLength) return false;
874  memcpy(buf, str, n);
875  buf[n] = '\0';
876  errno = 0;
877  char* end;
878  double r = strtod(buf, &end);
879  if (end != buf + n) return false;   // Leftover junk
880  if (errno) return false;
881  if (dest == NULL) return true;
882  *(reinterpret_cast<double*>(dest)) = r;
883  return true;
884}
885
886bool Arg::parse_float(const char* str, int n, void* dest) {
887  double r;
888  if (!parse_double(str, n, &r)) return false;
889  if (dest == NULL) return true;
890  *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
891  return true;
892}
893
894
895#define DEFINE_INTEGER_PARSERS(name)                                    \
896  bool Arg::parse_##name(const char* str, int n, void* dest) {          \
897    return parse_##name##_radix(str, n, dest, 10);                      \
898  }                                                                     \
899  bool Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
900    return parse_##name##_radix(str, n, dest, 16);                      \
901  }                                                                     \
902  bool Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
903    return parse_##name##_radix(str, n, dest, 8);                       \
904  }                                                                     \
905  bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
906    return parse_##name##_radix(str, n, dest, 0);                       \
907  }
908
909DEFINE_INTEGER_PARSERS(short)      /*                                   */
910DEFINE_INTEGER_PARSERS(ushort)     /*                                   */
911DEFINE_INTEGER_PARSERS(int)        /* Don't use semicolons after these  */
912DEFINE_INTEGER_PARSERS(uint)       /* statements because they can cause */
913DEFINE_INTEGER_PARSERS(long)       /* compiler warnings if the checking */
914DEFINE_INTEGER_PARSERS(ulong)      /* level is turned up high enough.   */
915DEFINE_INTEGER_PARSERS(longlong)   /*                                   */
916DEFINE_INTEGER_PARSERS(ulonglong)  /*                                   */
917
918#undef DEFINE_INTEGER_PARSERS
919
920}   // namespace pcrecpp
921