regex_compiler.h revision 1.1.1.13
1// class template regex -*- C++ -*-
2
3// Copyright (C) 2010-2022 Free Software Foundation, Inc.
4//
5// This file is part of the GNU ISO C++ Library.  This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23// <http://www.gnu.org/licenses/>.
24
25/**
26 *  @file bits/regex_compiler.h
27 *  This is an internal header file, included by other library headers.
28 *  Do not attempt to use it directly. @headername{regex}
29 */
30
31namespace std _GLIBCXX_VISIBILITY(default)
32{
33_GLIBCXX_BEGIN_NAMESPACE_VERSION
34_GLIBCXX_BEGIN_NAMESPACE_CXX11
35
36  template<typename>
37    class regex_traits;
38
39_GLIBCXX_END_NAMESPACE_CXX11
40
41namespace __detail
42{
43  /**
44   * @addtogroup regex-detail
45   * @{
46   */
47
48  template<typename, bool, bool>
49    struct _BracketMatcher;
50
51  /**
52   * @brief Builds an NFA from an input iterator range.
53   *
54   * The %_TraitsT type should fulfill requirements [28.3].
55   */
56  template<typename _TraitsT>
57    class _Compiler
58    {
59    public:
60      typedef typename _TraitsT::char_type        _CharT;
61      typedef _NFA<_TraitsT>              	  _RegexT;
62      typedef regex_constants::syntax_option_type _FlagT;
63
64      _Compiler(const _CharT* __b, const _CharT* __e,
65		const typename _TraitsT::locale_type& __traits, _FlagT __flags);
66
67      shared_ptr<const _RegexT>
68      _M_get_nfa() noexcept
69      { return std::move(_M_nfa); }
70
71    private:
72      typedef _Scanner<_CharT>               _ScannerT;
73      typedef typename _TraitsT::string_type _StringT;
74      typedef typename _ScannerT::_TokenT    _TokenT;
75      typedef _StateSeq<_TraitsT>            _StateSeqT;
76      typedef std::stack<_StateSeqT>         _StackT;
77      typedef std::ctype<_CharT>             _CtypeT;
78
79      // accepts a specific token or returns false.
80      bool
81      _M_match_token(_TokenT __token);
82
83      void
84      _M_disjunction();
85
86      void
87      _M_alternative();
88
89      bool
90      _M_term();
91
92      bool
93      _M_assertion();
94
95      bool
96      _M_quantifier();
97
98      bool
99      _M_atom();
100
101      bool
102      _M_bracket_expression();
103
104      template<bool __icase, bool __collate>
105	void
106	_M_insert_any_matcher_ecma();
107
108      template<bool __icase, bool __collate>
109	void
110	_M_insert_any_matcher_posix();
111
112      template<bool __icase, bool __collate>
113	void
114	_M_insert_char_matcher();
115
116      template<bool __icase, bool __collate>
117	void
118	_M_insert_character_class_matcher();
119
120      template<bool __icase, bool __collate>
121	void
122	_M_insert_bracket_matcher(bool __neg);
123
124      // Cache of the last atom seen in a bracketed range expression.
125      struct _BracketState
126      {
127	enum class _Type : char { _None, _Char, _Class } _M_type = _Type::_None;
128	_CharT _M_char = _CharT();
129
130	void
131	set(_CharT __c) noexcept { _M_type = _Type::_Char; _M_char = __c; }
132
133	_GLIBCXX_NODISCARD _CharT
134	get() const noexcept { return _M_char; }
135
136	void
137	reset(_Type __t = _Type::_None) noexcept { _M_type = __t; }
138
139	explicit operator bool() const noexcept
140	{ return _M_type != _Type::_None; }
141
142	// Previous token was a single character.
143	_GLIBCXX_NODISCARD bool
144	_M_is_char() const noexcept { return _M_type == _Type::_Char; }
145
146	// Previous token was a character class, equivalent class,
147	// collating symbol etc.
148	_GLIBCXX_NODISCARD bool
149	_M_is_class() const noexcept { return _M_type == _Type::_Class; }
150      };
151
152      template<bool __icase, bool __collate>
153	using _BracketMatcher
154	  = std::__detail::_BracketMatcher<_TraitsT, __icase, __collate>;
155
156      // Returns true if successfully parsed one term and should continue
157      // compiling a bracket expression.
158      // Returns false if the compiler should move on.
159      template<bool __icase, bool __collate>
160	bool
161	_M_expression_term(_BracketState& __last_char,
162			   _BracketMatcher<__icase, __collate>& __matcher);
163
164      int
165      _M_cur_int_value(int __radix);
166
167      bool
168      _M_try_char();
169
170      _StateSeqT
171      _M_pop()
172      {
173	auto ret = _M_stack.top();
174	_M_stack.pop();
175	return ret;
176      }
177
178      static _FlagT
179      _S_validate(_FlagT __f)
180      {
181	using namespace regex_constants;
182	switch (__f & (ECMAScript|basic|extended|awk|grep|egrep))
183	  {
184	  case ECMAScript:
185	  case basic:
186	  case extended:
187	  case awk:
188	  case grep:
189	  case egrep:
190	    return __f;
191	  case _FlagT(0):
192	    return __f | ECMAScript;
193	  default:
194	    std::__throw_regex_error(_S_grammar, "conflicting grammar options");
195	  }
196      }
197
198      _FlagT              _M_flags;
199      _ScannerT           _M_scanner;
200      shared_ptr<_RegexT> _M_nfa;
201      _StringT            _M_value;
202      _StackT             _M_stack;
203      const _TraitsT&     _M_traits;
204      const _CtypeT&      _M_ctype;
205    };
206
207  // [28.13.14]
208  template<typename _TraitsT, bool __icase, bool __collate>
209    class _RegexTranslatorBase
210    {
211    public:
212      typedef typename _TraitsT::char_type	      _CharT;
213      typedef typename _TraitsT::string_type	      _StringT;
214      typedef _StringT _StrTransT;
215
216      explicit
217      _RegexTranslatorBase(const _TraitsT& __traits)
218      : _M_traits(__traits)
219      { }
220
221      _CharT
222      _M_translate(_CharT __ch) const
223      {
224	if _GLIBCXX17_CONSTEXPR (__icase)
225	  return _M_traits.translate_nocase(__ch);
226	else if _GLIBCXX17_CONSTEXPR (__collate)
227	  return _M_traits.translate(__ch);
228	else
229	  return __ch;
230      }
231
232      _StrTransT
233      _M_transform(_CharT __ch) const
234      {
235	_StrTransT __str(1, __ch);
236	return _M_traits.transform(__str.begin(), __str.end());
237      }
238
239      // See LWG 523. It's not efficiently implementable when _TraitsT is not
240      // std::regex_traits<>, and __collate is true. See specializations for
241      // implementations of other cases.
242      bool
243      _M_match_range(const _StrTransT& __first, const _StrTransT& __last,
244		     const _StrTransT& __s) const
245      { return __first <= __s && __s <= __last; }
246
247    protected:
248      bool _M_in_range_icase(_CharT __first, _CharT __last, _CharT __ch) const
249      {
250	typedef std::ctype<_CharT> __ctype_type;
251	const auto& __fctyp = use_facet<__ctype_type>(this->_M_traits.getloc());
252	auto __lower = __fctyp.tolower(__ch);
253	auto __upper = __fctyp.toupper(__ch);
254	return (__first <= __lower && __lower <= __last)
255	  || (__first <= __upper && __upper <= __last);
256      }
257
258      const _TraitsT& _M_traits;
259    };
260
261  template<typename _TraitsT, bool __icase, bool __collate>
262    class _RegexTranslator
263    : public _RegexTranslatorBase<_TraitsT, __icase, __collate>
264    {
265    public:
266      typedef _RegexTranslatorBase<_TraitsT, __icase, __collate> _Base;
267      using _Base::_Base;
268    };
269
270  template<typename _TraitsT, bool __icase>
271    class _RegexTranslator<_TraitsT, __icase, false>
272    : public _RegexTranslatorBase<_TraitsT, __icase, false>
273    {
274    public:
275      typedef _RegexTranslatorBase<_TraitsT, __icase, false> _Base;
276      typedef typename _Base::_CharT _CharT;
277      typedef _CharT _StrTransT;
278
279      using _Base::_Base;
280
281      _StrTransT
282      _M_transform(_CharT __ch) const
283      { return __ch; }
284
285      bool
286      _M_match_range(_CharT __first, _CharT __last, _CharT __ch) const
287      {
288	if _GLIBCXX17_CONSTEXPR (!__icase)
289	  return __first <= __ch && __ch <= __last;
290	else
291	  return this->_M_in_range_icase(__first, __last, __ch);
292      }
293    };
294
295  template<typename _CharType>
296    class _RegexTranslator<std::regex_traits<_CharType>, true, true>
297    : public _RegexTranslatorBase<std::regex_traits<_CharType>, true, true>
298    {
299    public:
300      typedef _RegexTranslatorBase<std::regex_traits<_CharType>, true, true>
301	_Base;
302      typedef typename _Base::_CharT _CharT;
303      typedef typename _Base::_StrTransT _StrTransT;
304
305      using _Base::_Base;
306
307      bool
308      _M_match_range(const _StrTransT& __first, const _StrTransT& __last,
309		     const _StrTransT& __str) const
310      {
311	__glibcxx_assert(__first.size() == 1);
312	__glibcxx_assert(__last.size() == 1);
313	__glibcxx_assert(__str.size() == 1);
314	return this->_M_in_range_icase(__first[0], __last[0], __str[0]);
315      }
316    };
317
318  template<typename _TraitsT>
319    class _RegexTranslator<_TraitsT, false, false>
320    {
321    public:
322      typedef typename _TraitsT::char_type _CharT;
323      typedef _CharT                       _StrTransT;
324
325      explicit
326      _RegexTranslator(const _TraitsT&)
327      { }
328
329      _CharT
330      _M_translate(_CharT __ch) const
331      { return __ch; }
332
333      _StrTransT
334      _M_transform(_CharT __ch) const
335      { return __ch; }
336
337      bool
338      _M_match_range(_CharT __first, _CharT __last, _CharT __ch) const
339      { return __first <= __ch && __ch <= __last; }
340    };
341
342  template<typename _TraitsT, bool __is_ecma, bool __icase, bool __collate>
343    struct _AnyMatcher;
344
345  template<typename _TraitsT, bool __icase, bool __collate>
346    struct _AnyMatcher<_TraitsT, false, __icase, __collate>
347    {
348      typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
349      typedef typename _TransT::_CharT                       _CharT;
350
351      explicit
352      _AnyMatcher(const _TraitsT& __traits)
353      : _M_translator(__traits)
354      { }
355
356      bool
357      operator()(_CharT __ch) const
358      {
359	static auto __nul = _M_translator._M_translate('\0');
360	return _M_translator._M_translate(__ch) != __nul;
361      }
362
363      _TransT _M_translator;
364    };
365
366  template<typename _TraitsT, bool __icase, bool __collate>
367    struct _AnyMatcher<_TraitsT, true, __icase, __collate>
368    {
369      typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
370      typedef typename _TransT::_CharT                       _CharT;
371
372      explicit
373      _AnyMatcher(const _TraitsT& __traits)
374      : _M_translator(__traits)
375      { }
376
377      bool
378      operator()(_CharT __ch) const
379      { return _M_apply(__ch, typename is_same<_CharT, char>::type()); }
380
381      bool
382      _M_apply(_CharT __ch, true_type) const
383      {
384	auto __c = _M_translator._M_translate(__ch);
385	auto __n = _M_translator._M_translate('\n');
386	auto __r = _M_translator._M_translate('\r');
387	return __c != __n && __c != __r;
388      }
389
390      bool
391      _M_apply(_CharT __ch, false_type) const
392      {
393	auto __c = _M_translator._M_translate(__ch);
394	auto __n = _M_translator._M_translate('\n');
395	auto __r = _M_translator._M_translate('\r');
396	auto __u2028 = _M_translator._M_translate(u'\u2028');
397	auto __u2029 = _M_translator._M_translate(u'\u2029');
398	return __c != __n && __c != __r && __c != __u2028 && __c != __u2029;
399      }
400
401      _TransT _M_translator;
402    };
403
404  template<typename _TraitsT, bool __icase, bool __collate>
405    struct _CharMatcher
406    {
407      typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
408      typedef typename _TransT::_CharT                       _CharT;
409
410      _CharMatcher(_CharT __ch, const _TraitsT& __traits)
411      : _M_translator(__traits), _M_ch(_M_translator._M_translate(__ch))
412      { }
413
414      bool
415      operator()(_CharT __ch) const
416      { return _M_ch == _M_translator._M_translate(__ch); }
417
418      _TransT _M_translator;
419      _CharT  _M_ch;
420    };
421
422  /// Matches a character range (bracket expression)
423  template<typename _TraitsT, bool __icase, bool __collate>
424    struct _BracketMatcher
425    {
426    public:
427      typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
428      typedef typename _TransT::_CharT                       _CharT;
429      typedef typename _TransT::_StrTransT                   _StrTransT;
430      typedef typename _TraitsT::string_type                 _StringT;
431      typedef typename _TraitsT::char_class_type             _CharClassT;
432
433    public:
434      _BracketMatcher(bool __is_non_matching,
435		      const _TraitsT& __traits)
436      : _M_class_set(0), _M_translator(__traits), _M_traits(__traits),
437      _M_is_non_matching(__is_non_matching)
438      { }
439
440      bool
441      operator()(_CharT __ch) const
442      {
443	_GLIBCXX_DEBUG_ASSERT(_M_is_ready);
444	return _M_apply(__ch, _UseCache());
445      }
446
447      void
448      _M_add_char(_CharT __c)
449      {
450	_M_char_set.push_back(_M_translator._M_translate(__c));
451	_GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
452      }
453
454      _StringT
455      _M_add_collate_element(const _StringT& __s)
456      {
457	auto __st = _M_traits.lookup_collatename(__s.data(),
458						 __s.data() + __s.size());
459	if (__st.empty())
460	  __throw_regex_error(regex_constants::error_collate,
461			      "Invalid collate element.");
462	_M_char_set.push_back(_M_translator._M_translate(__st[0]));
463	_GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
464	return __st;
465      }
466
467      void
468      _M_add_equivalence_class(const _StringT& __s)
469      {
470	auto __st = _M_traits.lookup_collatename(__s.data(),
471						 __s.data() + __s.size());
472	if (__st.empty())
473	  __throw_regex_error(regex_constants::error_collate,
474			      "Invalid equivalence class.");
475	__st = _M_traits.transform_primary(__st.data(),
476					   __st.data() + __st.size());
477	_M_equiv_set.push_back(__st);
478	_GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
479      }
480
481      // __neg should be true for \D, \S and \W only.
482      void
483      _M_add_character_class(const _StringT& __s, bool __neg)
484      {
485	auto __mask = _M_traits.lookup_classname(__s.data(),
486						 __s.data() + __s.size(),
487						 __icase);
488	if (__mask == 0)
489	  __throw_regex_error(regex_constants::error_collate,
490			      "Invalid character class.");
491	if (!__neg)
492	  _M_class_set |= __mask;
493	else
494	  _M_neg_class_set.push_back(__mask);
495	_GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
496      }
497
498      void
499      _M_make_range(_CharT __l, _CharT __r)
500      {
501	if (__l > __r)
502	  __throw_regex_error(regex_constants::error_range,
503			      "Invalid range in bracket expression.");
504	_M_range_set.push_back(make_pair(_M_translator._M_transform(__l),
505					 _M_translator._M_transform(__r)));
506	_GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
507      }
508
509      void
510      _M_ready()
511      {
512	std::sort(_M_char_set.begin(), _M_char_set.end());
513	auto __end = std::unique(_M_char_set.begin(), _M_char_set.end());
514	_M_char_set.erase(__end, _M_char_set.end());
515	_M_make_cache(_UseCache());
516	_GLIBCXX_DEBUG_ONLY(_M_is_ready = true);
517      }
518
519    private:
520      // Currently we only use the cache for char
521      using _UseCache = typename std::is_same<_CharT, char>::type;
522
523      static constexpr size_t
524      _S_cache_size =
525	1ul << (sizeof(_CharT) * __CHAR_BIT__ * int(_UseCache::value));
526
527      struct _Dummy { };
528      using _CacheT = std::__conditional_t<_UseCache::value,
529					   std::bitset<_S_cache_size>,
530					   _Dummy>;
531      using _UnsignedCharT = typename std::make_unsigned<_CharT>::type;
532
533      bool
534      _M_apply(_CharT __ch, false_type) const;
535
536      bool
537      _M_apply(_CharT __ch, true_type) const
538      { return _M_cache[static_cast<_UnsignedCharT>(__ch)]; }
539
540      void
541      _M_make_cache(true_type)
542      {
543	for (unsigned __i = 0; __i < _M_cache.size(); __i++)
544	  _M_cache[__i] = _M_apply(static_cast<_CharT>(__i), false_type());
545      }
546
547      void
548      _M_make_cache(false_type)
549      { }
550
551    private:
552      _GLIBCXX_STD_C::vector<_CharT>            _M_char_set;
553      _GLIBCXX_STD_C::vector<_StringT>          _M_equiv_set;
554      _GLIBCXX_STD_C::vector<pair<_StrTransT, _StrTransT>> _M_range_set;
555      _GLIBCXX_STD_C::vector<_CharClassT>       _M_neg_class_set;
556      _CharClassT                               _M_class_set;
557      _TransT                                   _M_translator;
558      const _TraitsT&                           _M_traits;
559      bool                                      _M_is_non_matching;
560      _CacheT					_M_cache;
561#ifdef _GLIBCXX_DEBUG
562      bool                                      _M_is_ready = false;
563#endif
564    };
565
566 ///@} regex-detail
567} // namespace __detail
568_GLIBCXX_END_NAMESPACE_VERSION
569} // namespace std
570
571#include <bits/regex_compiler.tcc>
572