1// Copyright 2014 The Kyua Authors.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8// * Redistributions of source code must retain the above copyright
9//   notice, this list of conditions and the following disclaimer.
10// * Redistributions in binary form must reproduce the above copyright
11//   notice, this list of conditions and the following disclaimer in the
12//   documentation and/or other materials provided with the distribution.
13// * Neither the name of Google Inc. nor the names of its contributors
14//   may be used to endorse or promote products derived from this software
15//   without specific prior written permission.
16//
17// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29#include "utils/text/regex.hpp"
30
31extern "C" {
32#include <sys/types.h>
33
34#include <regex.h>
35}
36
37#include "utils/auto_array.ipp"
38#include "utils/defs.hpp"
39#include "utils/format/macros.hpp"
40#include "utils/noncopyable.hpp"
41#include "utils/sanity.hpp"
42#include "utils/text/exceptions.hpp"
43
44namespace text = utils::text;
45
46
47namespace {
48
49
50static void throw_regex_error(const int, const ::regex_t*, const std::string&)
51    UTILS_NORETURN;
52
53
54/// Constructs and raises a regex_error.
55///
56/// \param error The error code returned by regcomp(3) or regexec(3).
57/// \param preg The native regex object that caused this error.
58/// \param prefix Error message prefix string.
59///
60/// \throw regex_error The constructed exception.
61static void
62throw_regex_error(const int error, const ::regex_t* preg,
63                  const std::string& prefix)
64{
65    char buffer[1024];
66
67    // TODO(jmmv): Would be nice to handle the case where the message does
68    // not fit in the temporary buffer.
69    (void)::regerror(error, preg, buffer, sizeof(buffer));
70
71    throw text::regex_error(F("%s: %s") % prefix % buffer);
72}
73
74
75}  // anonymous namespace
76
77
78/// Internal implementation for regex_matches.
79struct utils::text::regex_matches::impl : utils::noncopyable {
80    /// String on which we are matching.
81    ///
82    /// In theory, we could take a reference here instead of a copy, and make
83    /// it a requirement for the caller to ensure that the lifecycle of the
84    /// input string outlasts the lifecycle of the regex_matches.  However, that
85    /// contract is very easy to break with hardcoded strings (as we do in
86    /// tests).  Just go for the safer case here.
87    const std::string _string;
88
89    /// Maximum number of matching groups we expect, including the full match.
90    ///
91    /// In other words, this is the size of the _matches array.
92    const std::size_t _nmatches;
93
94    /// Native regular expression match representation.
95    utils::auto_array< ::regmatch_t > _matches;
96
97    /// Constructor.
98    ///
99    /// This executes the regex on the given string and sets up the internal
100    /// class state based on the results.
101    ///
102    /// \param preg The native regex object.
103    /// \param str The string on which to execute the regex.
104    /// \param ngroups Number of capture groups in the regex.  This is an upper
105    ///     bound and may be greater than the actual matches.
106    ///
107    /// \throw regex_error If the call to regexec(3) fails.
108    impl(const ::regex_t* preg, const std::string& str,
109         const std::size_t ngroups) :
110        _string(str),
111        _nmatches(ngroups + 1),
112        _matches(new ::regmatch_t[_nmatches])
113    {
114        const int error = ::regexec(preg, _string.c_str(), _nmatches,
115                                    _matches.get(), 0);
116        if (error == REG_NOMATCH) {
117            _matches.reset(NULL);
118        } else if (error != 0) {
119            throw_regex_error(error, preg,
120                              F("regexec on '%s' failed") % _string);
121        }
122    }
123
124    /// Destructor.
125    ~impl(void)
126    {
127    }
128};
129
130
131/// Constructor.
132///
133/// \param pimpl Constructed implementation of the object.
134text::regex_matches::regex_matches(std::shared_ptr< impl > pimpl) :
135    _pimpl(pimpl)
136{
137}
138
139
140/// Destructor.
141text::regex_matches::~regex_matches(void)
142{
143}
144
145
146/// Returns the number of matches in this object.
147///
148/// Note that this does not correspond to the number of groups provided at
149/// construction time.  The returned value here accounts for only the returned
150/// valid matches.
151///
152/// \return Number of matches, including the full match.
153std::size_t
154text::regex_matches::count(void) const
155{
156    std::size_t total = 0;
157    if (_pimpl->_matches.get() != NULL) {
158        for (std::size_t i = 0; i < _pimpl->_nmatches; ++i) {
159            if (_pimpl->_matches[i].rm_so != -1)
160                ++total;
161        }
162        INV(total <= _pimpl->_nmatches);
163    }
164    return total;
165}
166
167
168/// Gets a match.
169///
170/// \param index Number of the match to get.  Index 0 always contains the match
171///     of the whole regex.
172///
173/// \pre There regex must have matched the input string.
174/// \pre index must be lower than count().
175///
176/// \return The textual match.
177std::string
178text::regex_matches::get(const std::size_t index) const
179{
180    PRE(*this);
181    PRE(index < count());
182
183    const ::regmatch_t* match = &_pimpl->_matches[index];
184
185    return std::string(_pimpl->_string.c_str() + match->rm_so,
186                       match->rm_eo - match->rm_so);
187}
188
189
190/// Checks if there are any matches.
191///
192/// \return True if the object contains one or more matches; false otherwise.
193text::regex_matches::operator bool(void) const
194{
195    return _pimpl->_matches.get() != NULL;
196}
197
198
199/// Internal implementation for regex.
200struct utils::text::regex::impl : utils::noncopyable {
201    /// Native regular expression representation.
202    ::regex_t _preg;
203
204    /// Number of capture groups in the regular expression.  This is an upper
205    /// bound and does NOT include the default full string match.
206    std::size_t _ngroups;
207
208    /// Constructor.
209    ///
210    /// This compiles the given regular expression.
211    ///
212    /// \param regex_ The regular expression to compile.
213    /// \param ngroups Number of capture groups in the regular expression.  This
214    ///     is an upper bound and does NOT include the default full string
215    ///     match.
216    /// \param ignore_case Whether to ignore case during matching.
217    ///
218    /// \throw regex_error If the call to regcomp(3) fails.
219    impl(const std::string& regex_, const std::size_t ngroups,
220         const bool ignore_case) :
221        _ngroups(ngroups)
222    {
223        const int flags = REG_EXTENDED | (ignore_case ? REG_ICASE : 0);
224        const int error = ::regcomp(&_preg, regex_.c_str(), flags);
225        if (error != 0)
226            throw_regex_error(error, &_preg, F("regcomp on '%s' failed")
227                              % regex_);
228    }
229
230    /// Destructor.
231    ~impl(void)
232    {
233        ::regfree(&_preg);
234    }
235};
236
237
238/// Constructor.
239///
240/// \param pimpl Constructed implementation of the object.
241text::regex::regex(std::shared_ptr< impl > pimpl) : _pimpl(pimpl)
242{
243}
244
245
246/// Destructor.
247text::regex::~regex(void)
248{
249}
250
251
252/// Compiles a new regular expression.
253///
254/// \param regex_ The regular expression to compile.
255/// \param ngroups Number of capture groups in the regular expression.  This is
256///     an upper bound and does NOT include the default full string match.
257/// \param ignore_case Whether to ignore case during matching.
258///
259/// \return A new regular expression, ready to match strings.
260///
261/// \throw regex_error If the regular expression is invalid and cannot be
262///     compiled.
263text::regex
264text::regex::compile(const std::string& regex_, const std::size_t ngroups,
265                     const bool ignore_case)
266{
267    return regex(std::shared_ptr< impl >(new impl(regex_, ngroups,
268                                                  ignore_case)));
269}
270
271
272/// Matches the regular expression against a string.
273///
274/// \param str String to match the regular expression against.
275///
276/// \return A new regex_matches object with the results of the match.
277text::regex_matches
278text::regex::match(const std::string& str) const
279{
280    std::shared_ptr< regex_matches::impl > pimpl(new regex_matches::impl(
281        &_pimpl->_preg, str, _pimpl->_ngroups));
282    return regex_matches(pimpl);
283}
284
285
286/// Compiles and matches a regular expression once.
287///
288/// This is syntactic sugar to simplify the instantiation of a new regex object
289/// and its subsequent match on a string.
290///
291/// \param regex_ The regular expression to compile and match.
292/// \param str String to match the regular expression against.
293/// \param ngroups Number of capture groups in the regular expression.
294/// \param ignore_case Whether to ignore case during matching.
295///
296/// \return A new regex_matches object with the results of the match.
297text::regex_matches
298text::match_regex(const std::string& regex_, const std::string& str,
299                  const std::size_t ngroups, const bool ignore_case)
300{
301    return regex::compile(regex_, ngroups, ignore_case).match(str);
302}
303