1198090Srdivacky//===-- Regex.cpp - Regular Expression matcher implementation -------------===// 2198090Srdivacky// 3198090Srdivacky// The LLVM Compiler Infrastructure 4198090Srdivacky// 5198090Srdivacky// This file is distributed under the University of Illinois Open Source 6198090Srdivacky// License. See LICENSE.TXT for details. 7198090Srdivacky// 8198090Srdivacky//===----------------------------------------------------------------------===// 9198090Srdivacky// 10198090Srdivacky// This file implements a POSIX regular expression matcher. 11198090Srdivacky// 12198090Srdivacky//===----------------------------------------------------------------------===// 13198090Srdivacky 14198090Srdivacky#include "llvm/Support/Regex.h" 15249423Sdim#include "regex_impl.h" 16249423Sdim#include "llvm/ADT/SmallVector.h" 17198090Srdivacky#include "llvm/Support/ErrorHandling.h" 18198090Srdivacky#include "llvm/Support/raw_ostream.h" 19198090Srdivacky#include <string> 20198090Srdivackyusing namespace llvm; 21198090Srdivacky 22210299SedRegex::Regex(StringRef regex, unsigned Flags) { 23198090Srdivacky unsigned flags = 0; 24198090Srdivacky preg = new llvm_regex(); 25198090Srdivacky preg->re_endp = regex.end(); 26198090Srdivacky if (Flags & IgnoreCase) 27198090Srdivacky flags |= REG_ICASE; 28198090Srdivacky if (Flags & Newline) 29198090Srdivacky flags |= REG_NEWLINE; 30249423Sdim if (!(Flags & BasicRegex)) 31249423Sdim flags |= REG_EXTENDED; 32249423Sdim error = llvm_regcomp(preg, regex.data(), flags|REG_PEND); 33198090Srdivacky} 34198090Srdivacky 35198090SrdivackyRegex::~Regex() { 36198090Srdivacky llvm_regfree(preg); 37198090Srdivacky delete preg; 38198090Srdivacky} 39198090Srdivacky 40198090Srdivackybool Regex::isValid(std::string &Error) { 41198090Srdivacky if (!error) 42198090Srdivacky return true; 43198090Srdivacky 44198090Srdivacky size_t len = llvm_regerror(error, preg, NULL, 0); 45198090Srdivacky 46263508Sdim Error.resize(len - 1); 47198090Srdivacky llvm_regerror(error, preg, &Error[0], len); 48198090Srdivacky return false; 49198090Srdivacky} 50198090Srdivacky 51198090Srdivacky/// getNumMatches - In a valid regex, return the number of parenthesized 52198090Srdivacky/// matches it contains. 53198090Srdivackyunsigned Regex::getNumMatches() const { 54198090Srdivacky return preg->re_nsub; 55198090Srdivacky} 56198090Srdivacky 57210299Sedbool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){ 58198090Srdivacky unsigned nmatch = Matches ? preg->re_nsub+1 : 0; 59198090Srdivacky 60198090Srdivacky // pmatch needs to have at least one element. 61198090Srdivacky SmallVector<llvm_regmatch_t, 8> pm; 62198090Srdivacky pm.resize(nmatch > 0 ? nmatch : 1); 63198090Srdivacky pm[0].rm_so = 0; 64198090Srdivacky pm[0].rm_eo = String.size(); 65198090Srdivacky 66198090Srdivacky int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND); 67198090Srdivacky 68198090Srdivacky if (rc == REG_NOMATCH) 69198090Srdivacky return false; 70198090Srdivacky if (rc != 0) { 71198090Srdivacky // regexec can fail due to invalid pattern or running out of memory. 72198090Srdivacky error = rc; 73198090Srdivacky return false; 74198090Srdivacky } 75198090Srdivacky 76198090Srdivacky // There was a match. 77198090Srdivacky 78198090Srdivacky if (Matches) { // match position requested 79198090Srdivacky Matches->clear(); 80198090Srdivacky 81198090Srdivacky for (unsigned i = 0; i != nmatch; ++i) { 82198090Srdivacky if (pm[i].rm_so == -1) { 83198090Srdivacky // this group didn't match 84198090Srdivacky Matches->push_back(StringRef()); 85198090Srdivacky continue; 86198090Srdivacky } 87221345Sdim assert(pm[i].rm_eo >= pm[i].rm_so); 88198090Srdivacky Matches->push_back(StringRef(String.data()+pm[i].rm_so, 89198090Srdivacky pm[i].rm_eo-pm[i].rm_so)); 90198090Srdivacky } 91198090Srdivacky } 92198090Srdivacky 93198090Srdivacky return true; 94198090Srdivacky} 95204642Srdivacky 96204642Srdivackystd::string Regex::sub(StringRef Repl, StringRef String, 97204642Srdivacky std::string *Error) { 98204642Srdivacky SmallVector<StringRef, 8> Matches; 99204642Srdivacky 100204642Srdivacky // Reset error, if given. 101204642Srdivacky if (Error && !Error->empty()) *Error = ""; 102204642Srdivacky 103204642Srdivacky // Return the input if there was no match. 104204642Srdivacky if (!match(String, &Matches)) 105204642Srdivacky return String; 106204642Srdivacky 107204642Srdivacky // Otherwise splice in the replacement string, starting with the prefix before 108204642Srdivacky // the match. 109204642Srdivacky std::string Res(String.begin(), Matches[0].begin()); 110204642Srdivacky 111204642Srdivacky // Then the replacement string, honoring possible substitutions. 112204642Srdivacky while (!Repl.empty()) { 113204642Srdivacky // Skip to the next escape. 114204642Srdivacky std::pair<StringRef, StringRef> Split = Repl.split('\\'); 115204642Srdivacky 116204642Srdivacky // Add the skipped substring. 117204642Srdivacky Res += Split.first; 118204642Srdivacky 119204642Srdivacky // Check for terminimation and trailing backslash. 120204642Srdivacky if (Split.second.empty()) { 121204642Srdivacky if (Repl.size() != Split.first.size() && 122204642Srdivacky Error && Error->empty()) 123204642Srdivacky *Error = "replacement string contained trailing backslash"; 124204642Srdivacky break; 125204642Srdivacky } 126204642Srdivacky 127204642Srdivacky // Otherwise update the replacement string and interpret escapes. 128204642Srdivacky Repl = Split.second; 129204642Srdivacky 130204642Srdivacky // FIXME: We should have a StringExtras function for mapping C99 escapes. 131204642Srdivacky switch (Repl[0]) { 132204642Srdivacky // Treat all unrecognized characters as self-quoting. 133204642Srdivacky default: 134204642Srdivacky Res += Repl[0]; 135204642Srdivacky Repl = Repl.substr(1); 136204642Srdivacky break; 137204642Srdivacky 138204642Srdivacky // Single character escapes. 139204642Srdivacky case 't': 140204642Srdivacky Res += '\t'; 141204642Srdivacky Repl = Repl.substr(1); 142204642Srdivacky break; 143204642Srdivacky case 'n': 144204642Srdivacky Res += '\n'; 145204642Srdivacky Repl = Repl.substr(1); 146204642Srdivacky break; 147204642Srdivacky 148204642Srdivacky // Decimal escapes are backreferences. 149204642Srdivacky case '0': case '1': case '2': case '3': case '4': 150204642Srdivacky case '5': case '6': case '7': case '8': case '9': { 151204642Srdivacky // Extract the backreference number. 152204642Srdivacky StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789")); 153204642Srdivacky Repl = Repl.substr(Ref.size()); 154204642Srdivacky 155204642Srdivacky unsigned RefValue; 156204642Srdivacky if (!Ref.getAsInteger(10, RefValue) && 157204642Srdivacky RefValue < Matches.size()) 158204642Srdivacky Res += Matches[RefValue]; 159204642Srdivacky else if (Error && Error->empty()) 160204642Srdivacky *Error = "invalid backreference string '" + Ref.str() + "'"; 161204642Srdivacky break; 162204642Srdivacky } 163204642Srdivacky } 164204642Srdivacky } 165204642Srdivacky 166204642Srdivacky // And finally the suffix. 167204642Srdivacky Res += StringRef(Matches[0].end(), String.end() - Matches[0].end()); 168204642Srdivacky 169204642Srdivacky return Res; 170204642Srdivacky} 171263508Sdim 172263508Sdimbool Regex::isLiteralERE(StringRef Str) { 173263508Sdim // Check for regex metacharacters. This list was derived from our regex 174263508Sdim // implementation in regcomp.c and double checked against the POSIX extended 175263508Sdim // regular expression specification. 176263508Sdim return Str.find_first_of("()^$|*+?.[]\\{}") == StringRef::npos; 177263508Sdim} 178