1198090Srdivacky//===-- Regex.cpp - Regular Expression matcher implementation -------------===//
2198090Srdivacky//
3198090Srdivacky//                     The LLVM Compiler Infrastructure
4198090Srdivacky//
5198090Srdivacky// This file is distributed under the University of Illinois Open Source
6198090Srdivacky// License. See LICENSE.TXT for details.
7198090Srdivacky//
8198090Srdivacky//===----------------------------------------------------------------------===//
9198090Srdivacky//
10198090Srdivacky// This file implements a POSIX regular expression matcher.
11198090Srdivacky//
12198090Srdivacky//===----------------------------------------------------------------------===//
13198090Srdivacky
14198090Srdivacky#include "llvm/Support/Regex.h"
15249423Sdim#include "regex_impl.h"
16249423Sdim#include "llvm/ADT/SmallVector.h"
17198090Srdivacky#include "llvm/Support/ErrorHandling.h"
18198090Srdivacky#include "llvm/Support/raw_ostream.h"
19198090Srdivacky#include <string>
20198090Srdivackyusing namespace llvm;
21198090Srdivacky
22210299SedRegex::Regex(StringRef regex, unsigned Flags) {
23198090Srdivacky  unsigned flags = 0;
24198090Srdivacky  preg = new llvm_regex();
25198090Srdivacky  preg->re_endp = regex.end();
26198090Srdivacky  if (Flags & IgnoreCase)
27198090Srdivacky    flags |= REG_ICASE;
28198090Srdivacky  if (Flags & Newline)
29198090Srdivacky    flags |= REG_NEWLINE;
30249423Sdim  if (!(Flags & BasicRegex))
31249423Sdim    flags |= REG_EXTENDED;
32249423Sdim  error = llvm_regcomp(preg, regex.data(), flags|REG_PEND);
33198090Srdivacky}
34198090Srdivacky
35198090SrdivackyRegex::~Regex() {
36198090Srdivacky  llvm_regfree(preg);
37198090Srdivacky  delete preg;
38198090Srdivacky}
39198090Srdivacky
40198090Srdivackybool Regex::isValid(std::string &Error) {
41198090Srdivacky  if (!error)
42198090Srdivacky    return true;
43198090Srdivacky
44198090Srdivacky  size_t len = llvm_regerror(error, preg, NULL, 0);
45198090Srdivacky
46263508Sdim  Error.resize(len - 1);
47198090Srdivacky  llvm_regerror(error, preg, &Error[0], len);
48198090Srdivacky  return false;
49198090Srdivacky}
50198090Srdivacky
51198090Srdivacky/// getNumMatches - In a valid regex, return the number of parenthesized
52198090Srdivacky/// matches it contains.
53198090Srdivackyunsigned Regex::getNumMatches() const {
54198090Srdivacky  return preg->re_nsub;
55198090Srdivacky}
56198090Srdivacky
57210299Sedbool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){
58198090Srdivacky  unsigned nmatch = Matches ? preg->re_nsub+1 : 0;
59198090Srdivacky
60198090Srdivacky  // pmatch needs to have at least one element.
61198090Srdivacky  SmallVector<llvm_regmatch_t, 8> pm;
62198090Srdivacky  pm.resize(nmatch > 0 ? nmatch : 1);
63198090Srdivacky  pm[0].rm_so = 0;
64198090Srdivacky  pm[0].rm_eo = String.size();
65198090Srdivacky
66198090Srdivacky  int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND);
67198090Srdivacky
68198090Srdivacky  if (rc == REG_NOMATCH)
69198090Srdivacky    return false;
70198090Srdivacky  if (rc != 0) {
71198090Srdivacky    // regexec can fail due to invalid pattern or running out of memory.
72198090Srdivacky    error = rc;
73198090Srdivacky    return false;
74198090Srdivacky  }
75198090Srdivacky
76198090Srdivacky  // There was a match.
77198090Srdivacky
78198090Srdivacky  if (Matches) { // match position requested
79198090Srdivacky    Matches->clear();
80198090Srdivacky
81198090Srdivacky    for (unsigned i = 0; i != nmatch; ++i) {
82198090Srdivacky      if (pm[i].rm_so == -1) {
83198090Srdivacky        // this group didn't match
84198090Srdivacky        Matches->push_back(StringRef());
85198090Srdivacky        continue;
86198090Srdivacky      }
87221345Sdim      assert(pm[i].rm_eo >= pm[i].rm_so);
88198090Srdivacky      Matches->push_back(StringRef(String.data()+pm[i].rm_so,
89198090Srdivacky                                   pm[i].rm_eo-pm[i].rm_so));
90198090Srdivacky    }
91198090Srdivacky  }
92198090Srdivacky
93198090Srdivacky  return true;
94198090Srdivacky}
95204642Srdivacky
96204642Srdivackystd::string Regex::sub(StringRef Repl, StringRef String,
97204642Srdivacky                       std::string *Error) {
98204642Srdivacky  SmallVector<StringRef, 8> Matches;
99204642Srdivacky
100204642Srdivacky  // Reset error, if given.
101204642Srdivacky  if (Error && !Error->empty()) *Error = "";
102204642Srdivacky
103204642Srdivacky  // Return the input if there was no match.
104204642Srdivacky  if (!match(String, &Matches))
105204642Srdivacky    return String;
106204642Srdivacky
107204642Srdivacky  // Otherwise splice in the replacement string, starting with the prefix before
108204642Srdivacky  // the match.
109204642Srdivacky  std::string Res(String.begin(), Matches[0].begin());
110204642Srdivacky
111204642Srdivacky  // Then the replacement string, honoring possible substitutions.
112204642Srdivacky  while (!Repl.empty()) {
113204642Srdivacky    // Skip to the next escape.
114204642Srdivacky    std::pair<StringRef, StringRef> Split = Repl.split('\\');
115204642Srdivacky
116204642Srdivacky    // Add the skipped substring.
117204642Srdivacky    Res += Split.first;
118204642Srdivacky
119204642Srdivacky    // Check for terminimation and trailing backslash.
120204642Srdivacky    if (Split.second.empty()) {
121204642Srdivacky      if (Repl.size() != Split.first.size() &&
122204642Srdivacky          Error && Error->empty())
123204642Srdivacky        *Error = "replacement string contained trailing backslash";
124204642Srdivacky      break;
125204642Srdivacky    }
126204642Srdivacky
127204642Srdivacky    // Otherwise update the replacement string and interpret escapes.
128204642Srdivacky    Repl = Split.second;
129204642Srdivacky
130204642Srdivacky    // FIXME: We should have a StringExtras function for mapping C99 escapes.
131204642Srdivacky    switch (Repl[0]) {
132204642Srdivacky      // Treat all unrecognized characters as self-quoting.
133204642Srdivacky    default:
134204642Srdivacky      Res += Repl[0];
135204642Srdivacky      Repl = Repl.substr(1);
136204642Srdivacky      break;
137204642Srdivacky
138204642Srdivacky      // Single character escapes.
139204642Srdivacky    case 't':
140204642Srdivacky      Res += '\t';
141204642Srdivacky      Repl = Repl.substr(1);
142204642Srdivacky      break;
143204642Srdivacky    case 'n':
144204642Srdivacky      Res += '\n';
145204642Srdivacky      Repl = Repl.substr(1);
146204642Srdivacky      break;
147204642Srdivacky
148204642Srdivacky      // Decimal escapes are backreferences.
149204642Srdivacky    case '0': case '1': case '2': case '3': case '4':
150204642Srdivacky    case '5': case '6': case '7': case '8': case '9': {
151204642Srdivacky      // Extract the backreference number.
152204642Srdivacky      StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789"));
153204642Srdivacky      Repl = Repl.substr(Ref.size());
154204642Srdivacky
155204642Srdivacky      unsigned RefValue;
156204642Srdivacky      if (!Ref.getAsInteger(10, RefValue) &&
157204642Srdivacky          RefValue < Matches.size())
158204642Srdivacky        Res += Matches[RefValue];
159204642Srdivacky      else if (Error && Error->empty())
160204642Srdivacky        *Error = "invalid backreference string '" + Ref.str() + "'";
161204642Srdivacky      break;
162204642Srdivacky    }
163204642Srdivacky    }
164204642Srdivacky  }
165204642Srdivacky
166204642Srdivacky  // And finally the suffix.
167204642Srdivacky  Res += StringRef(Matches[0].end(), String.end() - Matches[0].end());
168204642Srdivacky
169204642Srdivacky  return Res;
170204642Srdivacky}
171263508Sdim
172263508Sdimbool Regex::isLiteralERE(StringRef Str) {
173263508Sdim  // Check for regex metacharacters.  This list was derived from our regex
174263508Sdim  // implementation in regcomp.c and double checked against the POSIX extended
175263508Sdim  // regular expression specification.
176263508Sdim  return Str.find_first_of("()^$|*+?.[]\\{}") == StringRef::npos;
177263508Sdim}
178