1// Copyright 2014 The Kyua Authors. 2// All rights reserved. 3// 4// Redistribution and use in source and binary forms, with or without 5// modification, are permitted provided that the following conditions are 6// met: 7// 8// * Redistributions of source code must retain the above copyright 9// notice, this list of conditions and the following disclaimer. 10// * Redistributions in binary form must reproduce the above copyright 11// notice, this list of conditions and the following disclaimer in the 12// documentation and/or other materials provided with the distribution. 13// * Neither the name of Google Inc. nor the names of its contributors 14// may be used to endorse or promote products derived from this software 15// without specific prior written permission. 16// 17// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 29#include "utils/text/regex.hpp" 30 31extern "C" { 32#include <sys/types.h> 33 34#include <regex.h> 35} 36 37#include "utils/auto_array.ipp" 38#include "utils/defs.hpp" 39#include "utils/format/macros.hpp" 40#include "utils/noncopyable.hpp" 41#include "utils/sanity.hpp" 42#include "utils/text/exceptions.hpp" 43 44namespace text = utils::text; 45 46 47namespace { 48 49 50static void throw_regex_error(const int, const ::regex_t*, const std::string&) 51 UTILS_NORETURN; 52 53 54/// Constructs and raises a regex_error. 55/// 56/// \param error The error code returned by regcomp(3) or regexec(3). 57/// \param preg The native regex object that caused this error. 58/// \param prefix Error message prefix string. 59/// 60/// \throw regex_error The constructed exception. 61static void 62throw_regex_error(const int error, const ::regex_t* preg, 63 const std::string& prefix) 64{ 65 char buffer[1024]; 66 67 // TODO(jmmv): Would be nice to handle the case where the message does 68 // not fit in the temporary buffer. 69 (void)::regerror(error, preg, buffer, sizeof(buffer)); 70 71 throw text::regex_error(F("%s: %s") % prefix % buffer); 72} 73 74 75} // anonymous namespace 76 77 78/// Internal implementation for regex_matches. 79struct utils::text::regex_matches::impl : utils::noncopyable { 80 /// String on which we are matching. 81 /// 82 /// In theory, we could take a reference here instead of a copy, and make 83 /// it a requirement for the caller to ensure that the lifecycle of the 84 /// input string outlasts the lifecycle of the regex_matches. However, that 85 /// contract is very easy to break with hardcoded strings (as we do in 86 /// tests). Just go for the safer case here. 87 const std::string _string; 88 89 /// Maximum number of matching groups we expect, including the full match. 90 /// 91 /// In other words, this is the size of the _matches array. 92 const std::size_t _nmatches; 93 94 /// Native regular expression match representation. 95 utils::auto_array< ::regmatch_t > _matches; 96 97 /// Constructor. 98 /// 99 /// This executes the regex on the given string and sets up the internal 100 /// class state based on the results. 101 /// 102 /// \param preg The native regex object. 103 /// \param str The string on which to execute the regex. 104 /// \param ngroups Number of capture groups in the regex. This is an upper 105 /// bound and may be greater than the actual matches. 106 /// 107 /// \throw regex_error If the call to regexec(3) fails. 108 impl(const ::regex_t* preg, const std::string& str, 109 const std::size_t ngroups) : 110 _string(str), 111 _nmatches(ngroups + 1), 112 _matches(new ::regmatch_t[_nmatches]) 113 { 114 const int error = ::regexec(preg, _string.c_str(), _nmatches, 115 _matches.get(), 0); 116 if (error == REG_NOMATCH) { 117 _matches.reset(NULL); 118 } else if (error != 0) { 119 throw_regex_error(error, preg, 120 F("regexec on '%s' failed") % _string); 121 } 122 } 123 124 /// Destructor. 125 ~impl(void) 126 { 127 } 128}; 129 130 131/// Constructor. 132/// 133/// \param pimpl Constructed implementation of the object. 134text::regex_matches::regex_matches(std::shared_ptr< impl > pimpl) : 135 _pimpl(pimpl) 136{ 137} 138 139 140/// Destructor. 141text::regex_matches::~regex_matches(void) 142{ 143} 144 145 146/// Returns the number of matches in this object. 147/// 148/// Note that this does not correspond to the number of groups provided at 149/// construction time. The returned value here accounts for only the returned 150/// valid matches. 151/// 152/// \return Number of matches, including the full match. 153std::size_t 154text::regex_matches::count(void) const 155{ 156 std::size_t total = 0; 157 if (_pimpl->_matches.get() != NULL) { 158 for (std::size_t i = 0; i < _pimpl->_nmatches; ++i) { 159 if (_pimpl->_matches[i].rm_so != -1) 160 ++total; 161 } 162 INV(total <= _pimpl->_nmatches); 163 } 164 return total; 165} 166 167 168/// Gets a match. 169/// 170/// \param index Number of the match to get. Index 0 always contains the match 171/// of the whole regex. 172/// 173/// \pre There regex must have matched the input string. 174/// \pre index must be lower than count(). 175/// 176/// \return The textual match. 177std::string 178text::regex_matches::get(const std::size_t index) const 179{ 180 PRE(*this); 181 PRE(index < count()); 182 183 const ::regmatch_t* match = &_pimpl->_matches[index]; 184 185 return std::string(_pimpl->_string.c_str() + match->rm_so, 186 match->rm_eo - match->rm_so); 187} 188 189 190/// Checks if there are any matches. 191/// 192/// \return True if the object contains one or more matches; false otherwise. 193text::regex_matches::operator bool(void) const 194{ 195 return _pimpl->_matches.get() != NULL; 196} 197 198 199/// Internal implementation for regex. 200struct utils::text::regex::impl : utils::noncopyable { 201 /// Native regular expression representation. 202 ::regex_t _preg; 203 204 /// Number of capture groups in the regular expression. This is an upper 205 /// bound and does NOT include the default full string match. 206 std::size_t _ngroups; 207 208 /// Constructor. 209 /// 210 /// This compiles the given regular expression. 211 /// 212 /// \param regex_ The regular expression to compile. 213 /// \param ngroups Number of capture groups in the regular expression. This 214 /// is an upper bound and does NOT include the default full string 215 /// match. 216 /// \param ignore_case Whether to ignore case during matching. 217 /// 218 /// \throw regex_error If the call to regcomp(3) fails. 219 impl(const std::string& regex_, const std::size_t ngroups, 220 const bool ignore_case) : 221 _ngroups(ngroups) 222 { 223 const int flags = REG_EXTENDED | (ignore_case ? REG_ICASE : 0); 224 const int error = ::regcomp(&_preg, regex_.c_str(), flags); 225 if (error != 0) 226 throw_regex_error(error, &_preg, F("regcomp on '%s' failed") 227 % regex_); 228 } 229 230 /// Destructor. 231 ~impl(void) 232 { 233 ::regfree(&_preg); 234 } 235}; 236 237 238/// Constructor. 239/// 240/// \param pimpl Constructed implementation of the object. 241text::regex::regex(std::shared_ptr< impl > pimpl) : _pimpl(pimpl) 242{ 243} 244 245 246/// Destructor. 247text::regex::~regex(void) 248{ 249} 250 251 252/// Compiles a new regular expression. 253/// 254/// \param regex_ The regular expression to compile. 255/// \param ngroups Number of capture groups in the regular expression. This is 256/// an upper bound and does NOT include the default full string match. 257/// \param ignore_case Whether to ignore case during matching. 258/// 259/// \return A new regular expression, ready to match strings. 260/// 261/// \throw regex_error If the regular expression is invalid and cannot be 262/// compiled. 263text::regex 264text::regex::compile(const std::string& regex_, const std::size_t ngroups, 265 const bool ignore_case) 266{ 267 return regex(std::shared_ptr< impl >(new impl(regex_, ngroups, 268 ignore_case))); 269} 270 271 272/// Matches the regular expression against a string. 273/// 274/// \param str String to match the regular expression against. 275/// 276/// \return A new regex_matches object with the results of the match. 277text::regex_matches 278text::regex::match(const std::string& str) const 279{ 280 std::shared_ptr< regex_matches::impl > pimpl(new regex_matches::impl( 281 &_pimpl->_preg, str, _pimpl->_ngroups)); 282 return regex_matches(pimpl); 283} 284 285 286/// Compiles and matches a regular expression once. 287/// 288/// This is syntactic sugar to simplify the instantiation of a new regex object 289/// and its subsequent match on a string. 290/// 291/// \param regex_ The regular expression to compile and match. 292/// \param str String to match the regular expression against. 293/// \param ngroups Number of capture groups in the regular expression. 294/// \param ignore_case Whether to ignore case during matching. 295/// 296/// \return A new regex_matches object with the results of the match. 297text::regex_matches 298text::match_regex(const std::string& regex_, const std::string& str, 299 const std::size_t ngroups, const bool ignore_case) 300{ 301 return regex::compile(regex_, ngroups, ignore_case).match(str); 302} 303