// regex_scanner.h revision 1.6
// class template regex -*- C++ -*-

// Copyright (C) 2013-2020 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

/**
 *  @file bits/regex_scanner.h
 *  This is an internal header file, included by other library headers.
 *  Do not attempt to use it directly. @headername{regex}
 */

31namespace std _GLIBCXX_VISIBILITY(default)
32{
33_GLIBCXX_BEGIN_NAMESPACE_VERSION
34
35namespace __detail
36{
  /**
   * @addtogroup regex-detail
   * @{
   */
41
/**
 * @brief Non-template base of the regex scanner.
 *
 * Owns the token enumeration, the syntax flags, and the escape and
 * special-character tables that the constructor selects from those
 * flags.  All tables here are expressed in plain @c char.
 */
struct _ScannerBase
{
public:
  /// Token types returned from the scanner.
  enum _TokenT : unsigned
  {
    _S_token_anychar,
    _S_token_ord_char,
    _S_token_oct_num,
    _S_token_hex_num,
    _S_token_backref,
    _S_token_subexpr_begin,
    _S_token_subexpr_no_group_begin,
    _S_token_subexpr_lookahead_begin, // neg if _M_value[0] == 'n'
    _S_token_subexpr_end,
    _S_token_bracket_begin,
    _S_token_bracket_neg_begin,
    _S_token_bracket_end,
    _S_token_interval_begin,
    _S_token_interval_end,
    _S_token_quoted_class,
    _S_token_char_class_name,
    _S_token_collsymbol,
    _S_token_equiv_class_name,
    _S_token_opt,
    _S_token_or,
    _S_token_closure0,
    _S_token_closure1,
    _S_token_line_begin,
    _S_token_line_end,
    _S_token_word_bound, // neg if _M_value[0] == 'n'
    _S_token_comma,
    _S_token_dup_count,
    _S_token_eof,
    _S_token_bracket_dash,
    _S_token_unknown = -1u
  };

protected:
  typedef regex_constants::syntax_option_type _FlagT;

  /// Scanner states.  The names parallel the _M_scan_normal,
  /// _M_scan_in_brace and _M_scan_in_bracket routines that _Scanner
  /// declares.
  enum _StateT
  {
    _S_state_normal,
    _S_state_in_brace,
    _S_state_in_bracket,
  };

protected:
  /**
   * @brief Picks the escape and special-character tables for a grammar.
   * @param __flags  Syntax options.  One of ECMAScript, basic,
   *                 extended, grep, egrep or awk has to be set,
   *                 otherwise no special-character set is found and
   *                 the __glibcxx_assert in the body is violated.
   *
   * The escape table is the ECMAScript one when ECMAScript is set and
   * the awk one in every other case.  The special-character set is
   * taken from the first flag that matches, tested in the order
   * ECMAScript, basic, extended, grep, egrep, awk; the grep and egrep
   * sets are the basic and extended sets with '\n' appended.
   *
   * The scanner starts in the normal state with the current token set
   * to _S_token_unknown.
   */
  _ScannerBase(_FlagT __flags)
  : _M_state(_S_state_normal),
    _M_flags(__flags),
    // Start from a defined token value so that _M_token is never
    // read while indeterminate.
    _M_token(_S_token_unknown),
    _M_escape_tbl(_M_is_ecma()
                  ? _M_ecma_escape_tbl
                  : _M_awk_escape_tbl),
    _M_spec_char(_M_is_ecma()
                 ? _M_ecma_spec_char
                 : _M_flags & regex_constants::basic
                 ? _M_basic_spec_char
                 : _M_flags & regex_constants::extended
                 ? _M_extended_spec_char
                 : _M_flags & regex_constants::grep
                 ?  ".[\\*^$\n"
                 : _M_flags & regex_constants::egrep
                 ? ".[\\()*+?{|^$\n"
                 : _M_flags & regex_constants::awk
                 ? _M_extended_spec_char
                 : nullptr),
    _M_at_bracket_start(false)
  { __glibcxx_assert(_M_spec_char); }

protected:
  /**
   * @brief Looks up a character in the active escape table.
   * @param __c  The character to look up.
   * @return Pointer to the mapped character stored in the table, or
   *         nullptr when @p __c has no entry.
   *
   * The table ends with an entry whose key is '\0'.  That sentinel is
   * tested before the comparison, so a @p __c of '\0' never matches.
   */
  const char*
  _M_find_escape(char __c)
  {
    auto __it = _M_escape_tbl;
    for (; __it->first != '\0'; ++__it)
      if (__it->first == __c)
        return &__it->second;
    return nullptr;
  }

  /// True when the ECMAScript flag is set.
  bool
  _M_is_ecma() const
  { return _M_flags & regex_constants::ECMAScript; }

  /// True when the basic or the grep flag is set.
  bool
  _M_is_basic() const
  { return _M_flags & (regex_constants::basic | regex_constants::grep); }

  /// True when the extended, egrep or awk flag is set.
  bool
  _M_is_extended() const
  {
    return _M_flags & (regex_constants::extended
                       | regex_constants::egrep
                       | regex_constants::awk);
  }

  /// True when the grep or the egrep flag is set.
  bool
  _M_is_grep() const
  { return _M_flags & (regex_constants::grep | regex_constants::egrep); }

  /// True when the awk flag is set.
  bool
  _M_is_awk() const
  { return _M_flags & regex_constants::awk; }

protected:
  // TODO: Make them static in the next abi change.
  //
  // The tables below are ordinary data members, so every scanner object
  // carries its own copy.  They are declared ahead of _M_escape_tbl and
  // _M_spec_char on purpose: members are initialised in declaration
  // order, so the tables are ready by the time the constructor reads
  // them.  Do not reorder.

  /// Single-character operators and their tokens; ends with a '\0' key.
  const std::pair<char, _TokenT> _M_token_tbl[9] =
    {
      {'^', _S_token_line_begin},
      {'$', _S_token_line_end},
      {'.', _S_token_anychar},
      {'*', _S_token_closure0},
      {'+', _S_token_closure1},
      {'?', _S_token_opt},
      {'|', _S_token_or},
      {'\n', _S_token_or}, // grep and egrep
      {'\0', _S_token_or},
    };

  /// ECMAScript escape letters and the characters they stand for;
  /// ends with a '\0' key.
  const std::pair<char, char> _M_ecma_escape_tbl[8] =
    {
      {'0', '\0'},
      {'b', '\b'},
      {'f', '\f'},
      {'n', '\n'},
      {'r', '\r'},
      {'t', '\t'},
      {'v', '\v'},
      {'\0', '\0'},
    };

  /// awk escape letters and the characters they stand for; ends with
  /// a '\0' key.  Also the table chosen for every non-ECMAScript
  /// grammar.
  const std::pair<char, char> _M_awk_escape_tbl[11] =
    {
      {'"', '"'},
      {'/', '/'},
      {'\\', '\\'},
      {'a', '\a'},
      {'b', '\b'},
      {'f', '\f'},
      {'n', '\n'},
      {'r', '\r'},
      {'t', '\t'},
      {'v', '\v'},
      {'\0', '\0'},
    };

  /// Special-character sets, one per grammar family.
  const char* _M_ecma_spec_char = "^$\\.*+?()[]{}|";
  const char* _M_basic_spec_char = ".[\\*^$";
  const char* _M_extended_spec_char = ".[\\()*+?{|^$";

  _StateT                       _M_state;            ///< Current scanner state.
  _FlagT                        _M_flags;            ///< Syntax options given at construction.
  _TokenT                       _M_token;            ///< Current token.
  const std::pair<char, char>*  _M_escape_tbl;       ///< Escape table in use.
  const char*                   _M_spec_char;        ///< Special-character set in use.
  bool                          _M_at_bracket_start; ///< Set to false on construction.
};
198
199  /**
200   * @brief Scans an input range for regex tokens.
201   *
202   * The %_Scanner class interprets the regular expression pattern in
203   * the input range passed to its constructor as a sequence of parse
204   * tokens passed to the regular expression compiler.  The sequence
205   * of tokens provided depends on the flag settings passed to the
206   * constructor: different regular expression grammars will interpret
207   * the same input pattern in syntactically different ways.
208   */
209  template<typename _CharT>
210    class _Scanner
211    : public _ScannerBase
212    {
213    public:
214      typedef const _CharT*                                       _IterT;
215      typedef std::basic_string<_CharT>                           _StringT;
216      typedef regex_constants::syntax_option_type                 _FlagT;
217      typedef const std::ctype<_CharT>                            _CtypeT;
218
219      _Scanner(_IterT __begin, _IterT __end,
220	       _FlagT __flags, std::locale __loc);
221
222      void
223      _M_advance();
224
225      _TokenT
226      _M_get_token() const
227      { return _M_token; }
228
229      const _StringT&
230      _M_get_value() const
231      { return _M_value; }
232
233#ifdef _GLIBCXX_DEBUG
234      std::ostream&
235      _M_print(std::ostream&);
236#endif
237
238    private:
239      void
240      _M_scan_normal();
241
242      void
243      _M_scan_in_bracket();
244
245      void
246      _M_scan_in_brace();
247
248      void
249      _M_eat_escape_ecma();
250
251      void
252      _M_eat_escape_posix();
253
254      void
255      _M_eat_escape_awk();
256
257      void
258      _M_eat_class(char);
259
260      _IterT                        _M_current;
261      _IterT                        _M_end;
262      _CtypeT&                      _M_ctype;
263      _StringT                      _M_value;
264      void (_Scanner::* _M_eat_escape)();
265    };
266
 ///@} regex-detail
268} // namespace __detail
269_GLIBCXX_END_NAMESPACE_VERSION
270} // namespace std
271
272#include <bits/regex_scanner.tcc>
273