// Copyright (c) 2010, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author: Sanjay Ghemawat

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <limits.h>      /* for SHRT_MIN, USHRT_MAX, etc */
#include <string.h>      /* for memcpy */
#include <assert.h>
#include <errno.h>
#include <string>
#include <algorithm>

#include "pcrecpp_internal.h"
#include "pcre.h"
#include "pcrecpp.h"
#include "pcre_stringpiece.h"


namespace pcrecpp {

// Maximum number of args we can set
static const int kMaxArgs = 16;
static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace

// Special object that stands-in for no argument
Arg RE::no_arg((void*)NULL);

// This is for ABI compatibility with old versions of pcre (pre-7.6),
// which defined a global no_arg variable instead of putting it in the
// RE class.  This works on GCC >= 3, at least.  It definitely works
// for ELF, but may not for other object formats (Mach-O, for
// instance, does not support aliases.)  We could probably have a more
// inclusive test if we ever needed it.  (Note that not only the
// __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
// gnu-specific.)
68#if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__) 69# define ULP_AS_STRING(x) ULP_AS_STRING_INTERNAL(x) 70# define ULP_AS_STRING_INTERNAL(x) #x 71# define USER_LABEL_PREFIX_STR ULP_AS_STRING(__USER_LABEL_PREFIX__) 72extern Arg no_arg 73 __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE"))); 74#endif 75 76// If a regular expression has no error, its error_ field points here 77static const string empty_string; 78 79// If the user doesn't ask for any options, we just use this one 80static RE_Options default_options; 81 82void RE::Init(const string& pat, const RE_Options* options) { 83 pattern_ = pat; 84 if (options == NULL) { 85 options_ = default_options; 86 } else { 87 options_ = *options; 88 } 89 error_ = &empty_string; 90 re_full_ = NULL; 91 re_partial_ = NULL; 92 93 re_partial_ = Compile(UNANCHORED); 94 if (re_partial_ != NULL) { 95 re_full_ = Compile(ANCHOR_BOTH); 96 } 97} 98 99void RE::Cleanup() { 100 if (re_full_ != NULL) (*pcre_free)(re_full_); 101 if (re_partial_ != NULL) (*pcre_free)(re_partial_); 102 if (error_ != &empty_string) delete error_; 103} 104 105 106RE::~RE() { 107 Cleanup(); 108} 109 110 111pcre* RE::Compile(Anchor anchor) { 112 // First, convert RE_Options into pcre options 113 int pcre_options = 0; 114 pcre_options = options_.all_options(); 115 116 // Special treatment for anchoring. This is needed because at 117 // runtime pcre only provides an option for anchoring at the 118 // beginning of a string (unless you use offset). 119 // 120 // There are three types of anchoring we want: 121 // UNANCHORED Compile the original pattern, and use 122 // a pcre unanchored match. 123 // ANCHOR_START Compile the original pattern, and use 124 // a pcre anchored match. 125 // ANCHOR_BOTH Tack a "\z" to the end of the original pattern 126 // and use a pcre anchored match. 
127 128 const char* compile_error; 129 int eoffset; 130 pcre* re; 131 if (anchor != ANCHOR_BOTH) { 132 re = pcre_compile(pattern_.c_str(), pcre_options, 133 &compile_error, &eoffset, NULL); 134 } else { 135 // Tack a '\z' at the end of RE. Parenthesize it first so that 136 // the '\z' applies to all top-level alternatives in the regexp. 137 string wrapped = "(?:"; // A non-counting grouping operator 138 wrapped += pattern_; 139 wrapped += ")\\z"; 140 re = pcre_compile(wrapped.c_str(), pcre_options, 141 &compile_error, &eoffset, NULL); 142 } 143 if (re == NULL) { 144 if (error_ == &empty_string) error_ = new string(compile_error); 145 } 146 return re; 147} 148 149/***** Matching interfaces *****/ 150 151bool RE::FullMatch(const StringPiece& text, 152 const Arg& ptr1, 153 const Arg& ptr2, 154 const Arg& ptr3, 155 const Arg& ptr4, 156 const Arg& ptr5, 157 const Arg& ptr6, 158 const Arg& ptr7, 159 const Arg& ptr8, 160 const Arg& ptr9, 161 const Arg& ptr10, 162 const Arg& ptr11, 163 const Arg& ptr12, 164 const Arg& ptr13, 165 const Arg& ptr14, 166 const Arg& ptr15, 167 const Arg& ptr16) const { 168 const Arg* args[kMaxArgs]; 169 int n = 0; 170 if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1; 171 if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2; 172 if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3; 173 if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4; 174 if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5; 175 if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6; 176 if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7; 177 if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8; 178 if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9; 179 if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10; 180 if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11; 181 if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12; 182 if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13; 183 if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14; 184 if (&ptr15 == &no_arg) goto done; args[n++] = 
&ptr15; 185 if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16; 186 done: 187 188 int consumed; 189 int vec[kVecSize]; 190 return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize); 191} 192 193bool RE::PartialMatch(const StringPiece& text, 194 const Arg& ptr1, 195 const Arg& ptr2, 196 const Arg& ptr3, 197 const Arg& ptr4, 198 const Arg& ptr5, 199 const Arg& ptr6, 200 const Arg& ptr7, 201 const Arg& ptr8, 202 const Arg& ptr9, 203 const Arg& ptr10, 204 const Arg& ptr11, 205 const Arg& ptr12, 206 const Arg& ptr13, 207 const Arg& ptr14, 208 const Arg& ptr15, 209 const Arg& ptr16) const { 210 const Arg* args[kMaxArgs]; 211 int n = 0; 212 if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1; 213 if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2; 214 if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3; 215 if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4; 216 if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5; 217 if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6; 218 if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7; 219 if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8; 220 if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9; 221 if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10; 222 if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11; 223 if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12; 224 if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13; 225 if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14; 226 if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15; 227 if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16; 228 done: 229 230 int consumed; 231 int vec[kVecSize]; 232 return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize); 233} 234 235bool RE::Consume(StringPiece* input, 236 const Arg& ptr1, 237 const Arg& ptr2, 238 const Arg& ptr3, 239 const Arg& ptr4, 240 const Arg& ptr5, 241 const Arg& ptr6, 242 const Arg& ptr7, 243 const Arg& ptr8, 244 const Arg& ptr9, 245 const Arg& ptr10, 246 const Arg& ptr11, 247 const Arg& 
ptr12, 248 const Arg& ptr13, 249 const Arg& ptr14, 250 const Arg& ptr15, 251 const Arg& ptr16) const { 252 const Arg* args[kMaxArgs]; 253 int n = 0; 254 if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1; 255 if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2; 256 if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3; 257 if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4; 258 if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5; 259 if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6; 260 if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7; 261 if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8; 262 if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9; 263 if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10; 264 if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11; 265 if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12; 266 if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13; 267 if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14; 268 if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15; 269 if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16; 270 done: 271 272 int consumed; 273 int vec[kVecSize]; 274 if (DoMatchImpl(*input, ANCHOR_START, &consumed, 275 args, n, vec, kVecSize)) { 276 input->remove_prefix(consumed); 277 return true; 278 } else { 279 return false; 280 } 281} 282 283bool RE::FindAndConsume(StringPiece* input, 284 const Arg& ptr1, 285 const Arg& ptr2, 286 const Arg& ptr3, 287 const Arg& ptr4, 288 const Arg& ptr5, 289 const Arg& ptr6, 290 const Arg& ptr7, 291 const Arg& ptr8, 292 const Arg& ptr9, 293 const Arg& ptr10, 294 const Arg& ptr11, 295 const Arg& ptr12, 296 const Arg& ptr13, 297 const Arg& ptr14, 298 const Arg& ptr15, 299 const Arg& ptr16) const { 300 const Arg* args[kMaxArgs]; 301 int n = 0; 302 if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1; 303 if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2; 304 if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3; 305 if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4; 306 if (&ptr5 == &no_arg) goto 
done; args[n++] = &ptr5; 307 if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6; 308 if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7; 309 if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8; 310 if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9; 311 if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10; 312 if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11; 313 if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12; 314 if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13; 315 if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14; 316 if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15; 317 if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16; 318 done: 319 320 int consumed; 321 int vec[kVecSize]; 322 if (DoMatchImpl(*input, UNANCHORED, &consumed, 323 args, n, vec, kVecSize)) { 324 input->remove_prefix(consumed); 325 return true; 326 } else { 327 return false; 328 } 329} 330 331bool RE::Replace(const StringPiece& rewrite, 332 string *str) const { 333 int vec[kVecSize]; 334 int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize); 335 if (matches == 0) 336 return false; 337 338 string s; 339 if (!Rewrite(&s, rewrite, *str, vec, matches)) 340 return false; 341 342 assert(vec[0] >= 0); 343 assert(vec[1] >= 0); 344 str->replace(vec[0], vec[1] - vec[0], s); 345 return true; 346} 347 348// Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF. 349// Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF. 350// Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF. 
351 352static int NewlineMode(int pcre_options) { 353 // TODO: if we can make it threadsafe, cache this var 354 int newline_mode = 0; 355 /* if (newline_mode) return newline_mode; */ // do this once it's cached 356 if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF| 357 PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) { 358 newline_mode = (pcre_options & 359 (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF| 360 PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)); 361 } else { 362 int newline; 363 pcre_config(PCRE_CONFIG_NEWLINE, &newline); 364 if (newline == 10) 365 newline_mode = PCRE_NEWLINE_LF; 366 else if (newline == 13) 367 newline_mode = PCRE_NEWLINE_CR; 368 else if (newline == 3338) 369 newline_mode = PCRE_NEWLINE_CRLF; 370 else if (newline == -1) 371 newline_mode = PCRE_NEWLINE_ANY; 372 else if (newline == -2) 373 newline_mode = PCRE_NEWLINE_ANYCRLF; 374 else 375 assert(NULL == "Unexpected return value from pcre_config(NEWLINE)"); 376 } 377 return newline_mode; 378} 379 380int RE::GlobalReplace(const StringPiece& rewrite, 381 string *str) const { 382 int count = 0; 383 int vec[kVecSize]; 384 string out; 385 int start = 0; 386 int lastend = -1; 387 bool last_match_was_empty_string = false; 388 389 while (start <= static_cast<int>(str->length())) { 390 // If the previous match was for the empty string, we shouldn't 391 // just match again: we'll match in the same way and get an 392 // infinite loop. Instead, we do the match in a special way: 393 // anchored -- to force another try at the same position -- 394 // and with a flag saying that this time, ignore empty matches. 395 // If this special match returns, that means there's a non-empty 396 // match at this position as well, and we can continue. If not, 397 // we do what perl does, and just advance by one. 
398 // Notice that perl prints '@@@' for this; 399 // perl -le '$_ = "aa"; s/b*|aa/@/g; print' 400 int matches; 401 if (last_match_was_empty_string) { 402 matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize); 403 if (matches <= 0) { 404 int matchend = start + 1; // advance one character. 405 // If the current char is CR and we're in CRLF mode, skip LF too. 406 // Note it's better to call pcre_fullinfo() than to examine 407 // all_options(), since options_ could have changed bewteen 408 // compile-time and now, but this is simpler and safe enough. 409 // Modified by PH to add ANY and ANYCRLF. 410 if (matchend < static_cast<int>(str->length()) && 411 (*str)[start] == '\r' && (*str)[matchend] == '\n' && 412 (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF || 413 NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY || 414 NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) { 415 matchend++; 416 } 417 // We also need to advance more than one char if we're in utf8 mode. 
418#ifdef SUPPORT_UTF8 419 if (options_.utf8()) { 420 while (matchend < static_cast<int>(str->length()) && 421 ((*str)[matchend] & 0xc0) == 0x80) 422 matchend++; 423 } 424#endif 425 if (start < static_cast<int>(str->length())) 426 out.append(*str, start, matchend - start); 427 start = matchend; 428 last_match_was_empty_string = false; 429 continue; 430 } 431 } else { 432 matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize); 433 if (matches <= 0) 434 break; 435 } 436 int matchstart = vec[0], matchend = vec[1]; 437 assert(matchstart >= start); 438 assert(matchend >= matchstart); 439 out.append(*str, start, matchstart - start); 440 Rewrite(&out, rewrite, *str, vec, matches); 441 start = matchend; 442 lastend = matchend; 443 count++; 444 last_match_was_empty_string = (matchstart == matchend); 445 } 446 447 if (count == 0) 448 return 0; 449 450 if (start < static_cast<int>(str->length())) 451 out.append(*str, start, str->length() - start); 452 swap(out, *str); 453 return count; 454} 455 456bool RE::Extract(const StringPiece& rewrite, 457 const StringPiece& text, 458 string *out) const { 459 int vec[kVecSize]; 460 int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize); 461 if (matches == 0) 462 return false; 463 out->erase(); 464 return Rewrite(out, rewrite, text, vec, matches); 465} 466 467/*static*/ string RE::QuoteMeta(const StringPiece& unquoted) { 468 string result; 469 470 // Escape any ascii character not in [A-Za-z_0-9]. 471 // 472 // Note that it's legal to escape a character even if it has no 473 // special meaning in a regular expression -- so this function does 474 // that. (This also makes it identical to the perl function of the 475 // same name; see `perldoc -f quotemeta`.) The one exception is 476 // escaping NUL: rather than doing backslash + NUL, like perl does, 477 // we do '\0', because pcre itself doesn't take embedded NUL chars. 
478 for (int ii = 0; ii < unquoted.size(); ++ii) { 479 // Note that using 'isalnum' here raises the benchmark time from 480 // 32ns to 58ns: 481 if (unquoted[ii] == '\0') { 482 result += "\\0"; 483 } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && 484 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && 485 (unquoted[ii] < '0' || unquoted[ii] > '9') && 486 unquoted[ii] != '_' && 487 // If this is the part of a UTF8 or Latin1 character, we need 488 // to copy this byte without escaping. Experimentally this is 489 // what works correctly with the regexp library. 490 !(unquoted[ii] & 128)) { 491 result += '\\'; 492 result += unquoted[ii]; 493 } else { 494 result += unquoted[ii]; 495 } 496 } 497 498 return result; 499} 500 501/***** Actual matching and rewriting code *****/ 502 503int RE::TryMatch(const StringPiece& text, 504 int startpos, 505 Anchor anchor, 506 bool empty_ok, 507 int *vec, 508 int vecsize) const { 509 pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_; 510 if (re == NULL) { 511 //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str()); 512 return 0; 513 } 514 515 pcre_extra extra = { 0, 0, 0, 0, 0, 0 }; 516 if (options_.match_limit() > 0) { 517 extra.flags |= PCRE_EXTRA_MATCH_LIMIT; 518 extra.match_limit = options_.match_limit(); 519 } 520 if (options_.match_limit_recursion() > 0) { 521 extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION; 522 extra.match_limit_recursion = options_.match_limit_recursion(); 523 } 524 525 int options = 0; 526 if (anchor != UNANCHORED) 527 options |= PCRE_ANCHORED; 528 if (!empty_ok) 529 options |= PCRE_NOTEMPTY; 530 531 int rc = pcre_exec(re, // The regular expression object 532 &extra, 533 (text.data() == NULL) ? 
"" : text.data(), 534 text.size(), 535 startpos, 536 options, 537 vec, 538 vecsize); 539 540 // Handle errors 541 if (rc == PCRE_ERROR_NOMATCH) { 542 return 0; 543 } else if (rc < 0) { 544 //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n", 545 // re, pattern_.c_str()); 546 return 0; 547 } else if (rc == 0) { 548 // pcre_exec() returns 0 as a special case when the number of 549 // capturing subpatterns exceeds the size of the vector. 550 // When this happens, there is a match and the output vector 551 // is filled, but we miss out on the positions of the extra subpatterns. 552 rc = vecsize / 2; 553 } 554 555 return rc; 556} 557 558bool RE::DoMatchImpl(const StringPiece& text, 559 Anchor anchor, 560 int* consumed, 561 const Arg* const* args, 562 int n, 563 int* vec, 564 int vecsize) const { 565 assert((1 + n) * 3 <= vecsize); // results + PCRE workspace 566 int matches = TryMatch(text, 0, anchor, true, vec, vecsize); 567 assert(matches >= 0); // TryMatch never returns negatives 568 if (matches == 0) 569 return false; 570 571 *consumed = vec[1]; 572 573 if (n == 0 || args == NULL) { 574 // We are not interested in results 575 return true; 576 } 577 578 if (NumberOfCapturingGroups() < n) { 579 // RE has fewer capturing groups than number of arg pointers passed in 580 return false; 581 } 582 583 // If we got here, we must have matched the whole pattern. 584 // We do not need (can not do) any more checks on the value of 'matches' here 585 // -- see the comment for TryMatch. 586 for (int i = 0; i < n; i++) { 587 const int start = vec[2*(i+1)]; 588 const int limit = vec[2*(i+1)+1]; 589 if (!args[i]->Parse(text.data() + start, limit-start)) { 590 // TODO: Should we indicate what the error was? 
591 return false; 592 } 593 } 594 595 return true; 596} 597 598bool RE::DoMatch(const StringPiece& text, 599 Anchor anchor, 600 int* consumed, 601 const Arg* const args[], 602 int n) const { 603 assert(n >= 0); 604 size_t const vecsize = (1 + n) * 3; // results + PCRE workspace 605 // (as for kVecSize) 606 int space[21]; // use stack allocation for small vecsize (common case) 607 int* vec = vecsize <= 21 ? space : new int[vecsize]; 608 bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize); 609 if (vec != space) delete [] vec; 610 return retval; 611} 612 613bool RE::Rewrite(string *out, const StringPiece &rewrite, 614 const StringPiece &text, int *vec, int veclen) const { 615 for (const char *s = rewrite.data(), *end = s + rewrite.size(); 616 s < end; s++) { 617 int c = *s; 618 if (c == '\\') { 619 c = *++s; 620 if (isdigit(c)) { 621 int n = (c - '0'); 622 if (n >= veclen) { 623 //fprintf(stderr, requested group %d in regexp %.*s\n", 624 // n, rewrite.size(), rewrite.data()); 625 return false; 626 } 627 int start = vec[2 * n]; 628 if (start >= 0) 629 out->append(text.data() + start, vec[2 * n + 1] - start); 630 } else if (c == '\\') { 631 *out += '\\'; 632 } else { 633 //fprintf(stderr, "invalid rewrite pattern: %.*s\n", 634 // rewrite.size(), rewrite.data()); 635 return false; 636 } 637 } else { 638 *out += c; 639 } 640 } 641 return true; 642} 643 644// Return the number of capturing subpatterns, or -1 if the 645// regexp wasn't valid on construction. 
646int RE::NumberOfCapturingGroups() const { 647 if (re_partial_ == NULL) return -1; 648 649 int result; 650 int pcre_retval = pcre_fullinfo(re_partial_, // The regular expression object 651 NULL, // We did not study the pattern 652 PCRE_INFO_CAPTURECOUNT, 653 &result); 654 assert(pcre_retval == 0); 655 return result; 656} 657 658/***** Parsers for various types *****/ 659 660bool Arg::parse_null(const char* str, int n, void* dest) { 661 // We fail if somebody asked us to store into a non-NULL void* pointer 662 return (dest == NULL); 663} 664 665bool Arg::parse_string(const char* str, int n, void* dest) { 666 if (dest == NULL) return true; 667 reinterpret_cast<string*>(dest)->assign(str, n); 668 return true; 669} 670 671bool Arg::parse_stringpiece(const char* str, int n, void* dest) { 672 if (dest == NULL) return true; 673 reinterpret_cast<StringPiece*>(dest)->set(str, n); 674 return true; 675} 676 677bool Arg::parse_char(const char* str, int n, void* dest) { 678 if (n != 1) return false; 679 if (dest == NULL) return true; 680 *(reinterpret_cast<char*>(dest)) = str[0]; 681 return true; 682} 683 684bool Arg::parse_uchar(const char* str, int n, void* dest) { 685 if (n != 1) return false; 686 if (dest == NULL) return true; 687 *(reinterpret_cast<unsigned char*>(dest)) = str[0]; 688 return true; 689} 690 691// Largest number spec that we are willing to parse 692static const int kMaxNumberLength = 32; 693 694// REQUIRES "buf" must have length at least kMaxNumberLength+1 695// REQUIRES "n > 0" 696// Copies "str" into "buf" and null-terminates if necessary. 697// Returns one of: 698// a. "str" if no termination is needed 699// b. "buf" if the string was copied and null-terminated 700// c. "" if the input was invalid and has no hope of being parsed 701static const char* TerminateNumber(char* buf, const char* str, int n) { 702 if ((n > 0) && isspace(*str)) { 703 // We are less forgiving than the strtoxxx() routines and do not 704 // allow leading spaces. 
705 return ""; 706 } 707 708 // See if the character right after the input text may potentially 709 // look like a digit. 710 if (isdigit(str[n]) || 711 ((str[n] >= 'a') && (str[n] <= 'f')) || 712 ((str[n] >= 'A') && (str[n] <= 'F'))) { 713 if (n > kMaxNumberLength) return ""; // Input too big to be a valid number 714 memcpy(buf, str, n); 715 buf[n] = '\0'; 716 return buf; 717 } else { 718 // We can parse right out of the supplied string, so return it. 719 return str; 720 } 721} 722 723bool Arg::parse_long_radix(const char* str, 724 int n, 725 void* dest, 726 int radix) { 727 if (n == 0) return false; 728 char buf[kMaxNumberLength+1]; 729 str = TerminateNumber(buf, str, n); 730 char* end; 731 errno = 0; 732 long r = strtol(str, &end, radix); 733 if (end != str + n) return false; // Leftover junk 734 if (errno) return false; 735 if (dest == NULL) return true; 736 *(reinterpret_cast<long*>(dest)) = r; 737 return true; 738} 739 740bool Arg::parse_ulong_radix(const char* str, 741 int n, 742 void* dest, 743 int radix) { 744 if (n == 0) return false; 745 char buf[kMaxNumberLength+1]; 746 str = TerminateNumber(buf, str, n); 747 if (str[0] == '-') return false; // strtoul() on a negative number?! 
748 char* end; 749 errno = 0; 750 unsigned long r = strtoul(str, &end, radix); 751 if (end != str + n) return false; // Leftover junk 752 if (errno) return false; 753 if (dest == NULL) return true; 754 *(reinterpret_cast<unsigned long*>(dest)) = r; 755 return true; 756} 757 758bool Arg::parse_short_radix(const char* str, 759 int n, 760 void* dest, 761 int radix) { 762 long r; 763 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse 764 if (r < SHRT_MIN || r > SHRT_MAX) return false; // Out of range 765 if (dest == NULL) return true; 766 *(reinterpret_cast<short*>(dest)) = static_cast<short>(r); 767 return true; 768} 769 770bool Arg::parse_ushort_radix(const char* str, 771 int n, 772 void* dest, 773 int radix) { 774 unsigned long r; 775 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse 776 if (r > USHRT_MAX) return false; // Out of range 777 if (dest == NULL) return true; 778 *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r); 779 return true; 780} 781 782bool Arg::parse_int_radix(const char* str, 783 int n, 784 void* dest, 785 int radix) { 786 long r; 787 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse 788 if (r < INT_MIN || r > INT_MAX) return false; // Out of range 789 if (dest == NULL) return true; 790 *(reinterpret_cast<int*>(dest)) = r; 791 return true; 792} 793 794bool Arg::parse_uint_radix(const char* str, 795 int n, 796 void* dest, 797 int radix) { 798 unsigned long r; 799 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse 800 if (r > UINT_MAX) return false; // Out of range 801 if (dest == NULL) return true; 802 *(reinterpret_cast<unsigned int*>(dest)) = r; 803 return true; 804} 805 806bool Arg::parse_longlong_radix(const char* str, 807 int n, 808 void* dest, 809 int radix) { 810#ifndef HAVE_LONG_LONG 811 return false; 812#else 813 if (n == 0) return false; 814 char buf[kMaxNumberLength+1]; 815 str = TerminateNumber(buf, str, n); 816 
char* end; 817 errno = 0; 818#if defined HAVE_STRTOQ 819 long long r = strtoq(str, &end, radix); 820#elif defined HAVE_STRTOLL 821 long long r = strtoll(str, &end, radix); 822#elif defined HAVE__STRTOI64 823 long long r = _strtoi64(str, &end, radix); 824#elif defined HAVE_STRTOIMAX 825 long long r = strtoimax(str, &end, radix); 826#else 827#error parse_longlong_radix: cannot convert input to a long-long 828#endif 829 if (end != str + n) return false; // Leftover junk 830 if (errno) return false; 831 if (dest == NULL) return true; 832 *(reinterpret_cast<long long*>(dest)) = r; 833 return true; 834#endif /* HAVE_LONG_LONG */ 835} 836 837bool Arg::parse_ulonglong_radix(const char* str, 838 int n, 839 void* dest, 840 int radix) { 841#ifndef HAVE_UNSIGNED_LONG_LONG 842 return false; 843#else 844 if (n == 0) return false; 845 char buf[kMaxNumberLength+1]; 846 str = TerminateNumber(buf, str, n); 847 if (str[0] == '-') return false; // strtoull() on a negative number?! 848 char* end; 849 errno = 0; 850#if defined HAVE_STRTOQ 851 unsigned long long r = strtouq(str, &end, radix); 852#elif defined HAVE_STRTOLL 853 unsigned long long r = strtoull(str, &end, radix); 854#elif defined HAVE__STRTOI64 855 unsigned long long r = _strtoui64(str, &end, radix); 856#elif defined HAVE_STRTOIMAX 857 unsigned long long r = strtoumax(str, &end, radix); 858#else 859#error parse_ulonglong_radix: cannot convert input to a long-long 860#endif 861 if (end != str + n) return false; // Leftover junk 862 if (errno) return false; 863 if (dest == NULL) return true; 864 *(reinterpret_cast<unsigned long long*>(dest)) = r; 865 return true; 866#endif /* HAVE_UNSIGNED_LONG_LONG */ 867} 868 869bool Arg::parse_double(const char* str, int n, void* dest) { 870 if (n == 0) return false; 871 static const int kMaxLength = 200; 872 char buf[kMaxLength]; 873 if (n >= kMaxLength) return false; 874 memcpy(buf, str, n); 875 buf[n] = '\0'; 876 errno = 0; 877 char* end; 878 double r = strtod(buf, &end); 879 if (end != 
buf + n) return false; // Leftover junk 880 if (errno) return false; 881 if (dest == NULL) return true; 882 *(reinterpret_cast<double*>(dest)) = r; 883 return true; 884} 885 886bool Arg::parse_float(const char* str, int n, void* dest) { 887 double r; 888 if (!parse_double(str, n, &r)) return false; 889 if (dest == NULL) return true; 890 *(reinterpret_cast<float*>(dest)) = static_cast<float>(r); 891 return true; 892} 893 894 895#define DEFINE_INTEGER_PARSERS(name) \ 896 bool Arg::parse_##name(const char* str, int n, void* dest) { \ 897 return parse_##name##_radix(str, n, dest, 10); \ 898 } \ 899 bool Arg::parse_##name##_hex(const char* str, int n, void* dest) { \ 900 return parse_##name##_radix(str, n, dest, 16); \ 901 } \ 902 bool Arg::parse_##name##_octal(const char* str, int n, void* dest) { \ 903 return parse_##name##_radix(str, n, dest, 8); \ 904 } \ 905 bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \ 906 return parse_##name##_radix(str, n, dest, 0); \ 907 } 908 909DEFINE_INTEGER_PARSERS(short) /* */ 910DEFINE_INTEGER_PARSERS(ushort) /* */ 911DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */ 912DEFINE_INTEGER_PARSERS(uint) /* statements because they can cause */ 913DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */ 914DEFINE_INTEGER_PARSERS(ulong) /* level is turned up high enough. */ 915DEFINE_INTEGER_PARSERS(longlong) /* */ 916DEFINE_INTEGER_PARSERS(ulonglong) /* */ 917 918#undef DEFINE_INTEGER_PARSERS 919 920} // namespace pcrecpp 921