libstdc++
regex_scanner.tcc
Go to the documentation of this file.
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2015 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex_scanner.tcc
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 // FIXME make comments doxygen format.
32 
33 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
34 // and awk
35 // 1) grep is basic except '\n' is treated as '|'
36 // 2) egrep is extended except '\n' is treated as '|'
37 // 3) awk is extended except special escaping rules, and there's no
38 // back-reference.
39 //
40 // References:
41 //
42 // ECMAScript: ECMA-262 15.10
43 //
44 // basic, extended:
45 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
46 //
47 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
48 
49 namespace std _GLIBCXX_VISIBILITY(default)
50 {
51 namespace __detail
52 {
53 _GLIBCXX_BEGIN_NAMESPACE_VERSION
54 
55  template<typename _CharT>
56  _Scanner<_CharT>::
57  _Scanner(typename _Scanner::_IterT __begin,
58  typename _Scanner::_IterT __end,
59  _FlagT __flags, std::locale __loc)
60  : _ScannerBase(__flags),
61  _M_current(__begin), _M_end(__end),
62  _M_ctype(std::use_facet<_CtypeT>(__loc)),
63  _M_eat_escape(_M_is_ecma()
64  ? &_Scanner::_M_eat_escape_ecma
65  : &_Scanner::_M_eat_escape_posix)
66  { _M_advance(); }
67 
68  template<typename _CharT>
69  void
70  _Scanner<_CharT>::
71  _M_advance()
72  {
73  if (_M_current == _M_end)
74  {
75  _M_token = _S_token_eof;
76  return;
77  }
78 
79  if (_M_state == _S_state_normal)
80  _M_scan_normal();
81  else if (_M_state == _S_state_in_bracket)
82  _M_scan_in_bracket();
83  else if (_M_state == _S_state_in_brace)
84  _M_scan_in_brace();
85  else
86  {
87  _GLIBCXX_DEBUG_ASSERT(false);
88  }
89  }
90 
91  // Differences between styles:
92  // 1) "\(", "\)", "\{" in basic. It's not escaping.
93  // 2) "(?:", "(?=", "(?!" in ECMAScript.
94  template<typename _CharT>
95  void
96  _Scanner<_CharT>::
97  _M_scan_normal()
98  {
99  auto __c = *_M_current++;
100  const char* __pos;
101 
102  if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')) == nullptr)
103  {
104  _M_token = _S_token_ord_char;
105  _M_value.assign(1, __c);
106  return;
107  }
108  if (__c == '\\')
109  {
110  if (_M_current == _M_end)
111  __throw_regex_error(regex_constants::error_escape);
112 
113  if (!_M_is_basic()
114  || (*_M_current != '('
115  && *_M_current != ')'
116  && *_M_current != '{'))
117  {
118  (this->*_M_eat_escape)();
119  return;
120  }
121  __c = *_M_current++;
122  }
123  if (__c == '(')
124  {
125  if (_M_is_ecma() && *_M_current == '?')
126  {
127  if (++_M_current == _M_end)
128  __throw_regex_error(regex_constants::error_paren);
129 
130  if (*_M_current == ':')
131  {
132  ++_M_current;
133  _M_token = _S_token_subexpr_no_group_begin;
134  }
135  else if (*_M_current == '=')
136  {
137  ++_M_current;
138  _M_token = _S_token_subexpr_lookahead_begin;
139  _M_value.assign(1, 'p');
140  }
141  else if (*_M_current == '!')
142  {
143  ++_M_current;
144  _M_token = _S_token_subexpr_lookahead_begin;
145  _M_value.assign(1, 'n');
146  }
147  else
148  __throw_regex_error(regex_constants::error_paren);
149  }
150  else if (_M_flags & regex_constants::nosubs)
151  _M_token = _S_token_subexpr_no_group_begin;
152  else
153  _M_token = _S_token_subexpr_begin;
154  }
155  else if (__c == ')')
156  _M_token = _S_token_subexpr_end;
157  else if (__c == '[')
158  {
159  _M_state = _S_state_in_bracket;
160  _M_at_bracket_start = true;
161  if (_M_current != _M_end && *_M_current == '^')
162  {
163  _M_token = _S_token_bracket_neg_begin;
164  ++_M_current;
165  }
166  else
167  _M_token = _S_token_bracket_begin;
168  }
169  else if (__c == '{')
170  {
171  _M_state = _S_state_in_brace;
172  _M_token = _S_token_interval_begin;
173  }
174  else if (((__pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')))
175  != nullptr
176  && *__pos != '\0'
177  && __c != ']'
178  && __c != '}')
179  || (_M_is_grep() && __c == '\n'))
180  {
181  auto __it = _M_token_tbl;
182  auto __narrowc = _M_ctype.narrow(__c, '\0');
183  for (; __it->first != '\0'; ++__it)
184  if (__it->first == __narrowc)
185  {
186  _M_token = __it->second;
187  return;
188  }
189  _GLIBCXX_DEBUG_ASSERT(false);
190  }
191  else
192  {
193  _M_token = _S_token_ord_char;
194  _M_value.assign(1, __c);
195  }
196  }
197 
198  // Differences between styles:
199  // 1) different semantics of "[]" and "[^]".
200  // 2) Escaping in bracket expr.
201  template<typename _CharT>
202  void
203  _Scanner<_CharT>::
204  _M_scan_in_bracket()
205  {
206  if (_M_current == _M_end)
207  __throw_regex_error(regex_constants::error_brack);
208 
209  auto __c = *_M_current++;
210 
211  if (__c == '[')
212  {
213  if (_M_current == _M_end)
214  __throw_regex_error(regex_constants::error_brack);
215 
216  if (*_M_current == '.')
217  {
218  _M_token = _S_token_collsymbol;
219  _M_eat_class(*_M_current++);
220  }
221  else if (*_M_current == ':')
222  {
223  _M_token = _S_token_char_class_name;
224  _M_eat_class(*_M_current++);
225  }
226  else if (*_M_current == '=')
227  {
228  _M_token = _S_token_equiv_class_name;
229  _M_eat_class(*_M_current++);
230  }
231  else
232  {
233  _M_token = _S_token_ord_char;
234  _M_value.assign(1, __c);
235  }
236  }
237  // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
238  // literally. So "[]]" and "[^]]" are valid regexes. See the testcases
239  // `*/empty_range.cc`.
240  else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
241  {
242  _M_token = _S_token_bracket_end;
243  _M_state = _S_state_normal;
244  }
245  // ECMAScript and awk permits escaping in bracket.
246  else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
247  (this->*_M_eat_escape)();
248  else
249  {
250  _M_token = _S_token_ord_char;
251  _M_value.assign(1, __c);
252  }
253  _M_at_bracket_start = false;
254  }
255 
256  // Differences between styles:
257  // 1) "\}" in basic style.
258  template<typename _CharT>
259  void
260  _Scanner<_CharT>::
261  _M_scan_in_brace()
262  {
263  if (_M_current == _M_end)
264  __throw_regex_error(regex_constants::error_brace);
265 
266  auto __c = *_M_current++;
267 
268  if (_M_ctype.is(_CtypeT::digit, __c))
269  {
270  _M_token = _S_token_dup_count;
271  _M_value.assign(1, __c);
272  while (_M_current != _M_end
273  && _M_ctype.is(_CtypeT::digit, *_M_current))
274  _M_value += *_M_current++;
275  }
276  else if (__c == ',')
277  _M_token = _S_token_comma;
278  // basic use \}.
279  else if (_M_is_basic())
280  {
281  if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
282  {
283  _M_state = _S_state_normal;
284  _M_token = _S_token_interval_end;
285  ++_M_current;
286  }
287  else
288  __throw_regex_error(regex_constants::error_badbrace);
289  }
290  else if (__c == '}')
291  {
292  _M_state = _S_state_normal;
293  _M_token = _S_token_interval_end;
294  }
295  else
296  __throw_regex_error(regex_constants::error_badbrace);
297  }
298 
299  template<typename _CharT>
300  void
301  _Scanner<_CharT>::
302  _M_eat_escape_ecma()
303  {
304  if (_M_current == _M_end)
305  __throw_regex_error(regex_constants::error_escape);
306 
307  auto __c = *_M_current++;
308  auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
309 
310  if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
311  {
312  _M_token = _S_token_ord_char;
313  _M_value.assign(1, *__pos);
314  }
315  else if (__c == 'b')
316  {
317  _M_token = _S_token_word_bound;
318  _M_value.assign(1, 'p');
319  }
320  else if (__c == 'B')
321  {
322  _M_token = _S_token_word_bound;
323  _M_value.assign(1, 'n');
324  }
325  // N3376 28.13
326  else if (__c == 'd'
327  || __c == 'D'
328  || __c == 's'
329  || __c == 'S'
330  || __c == 'w'
331  || __c == 'W')
332  {
333  _M_token = _S_token_quoted_class;
334  _M_value.assign(1, __c);
335  }
336  else if (__c == 'c')
337  {
338  if (_M_current == _M_end)
339  __throw_regex_error(regex_constants::error_escape);
340  _M_token = _S_token_ord_char;
341  _M_value.assign(1, *_M_current++);
342  }
343  else if (__c == 'x' || __c == 'u')
344  {
345  _M_value.erase();
346  for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++)
347  {
348  if (_M_current == _M_end
349  || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
350  __throw_regex_error(regex_constants::error_escape);
351  _M_value += *_M_current++;
352  }
353  _M_token = _S_token_hex_num;
354  }
355  // ECMAScript recognizes multi-digit back-references.
356  else if (_M_ctype.is(_CtypeT::digit, __c))
357  {
358  _M_value.assign(1, __c);
359  while (_M_current != _M_end
360  && _M_ctype.is(_CtypeT::digit, *_M_current))
361  _M_value += *_M_current++;
362  _M_token = _S_token_backref;
363  }
364  else
365  {
366  _M_token = _S_token_ord_char;
367  _M_value.assign(1, __c);
368  }
369  }
370 
371  // Differences between styles:
372  // 1) Extended doesn't support backref, but basic does.
373  template<typename _CharT>
374  void
375  _Scanner<_CharT>::
376  _M_eat_escape_posix()
377  {
378  if (_M_current == _M_end)
379  __throw_regex_error(regex_constants::error_escape);
380 
381  auto __c = *_M_current;
382  auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
383 
384  if (__pos != nullptr && *__pos != '\0')
385  {
386  _M_token = _S_token_ord_char;
387  _M_value.assign(1, __c);
388  }
389  // We MUST judge awk before handling backrefs. There's no backref in awk.
390  else if (_M_is_awk())
391  {
392  _M_eat_escape_awk();
393  return;
394  }
395  else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
396  {
397  _M_token = _S_token_backref;
398  _M_value.assign(1, __c);
399  }
400  else
401  {
402 #ifdef __STRICT_ANSI__
403  // POSIX says it is undefined to escape ordinary characters
404  __throw_regex_error(regex_constants::error_escape);
405 #else
406  _M_token = _S_token_ord_char;
407  _M_value.assign(1, __c);
408 #endif
409  }
410  ++_M_current;
411  }
412 
413  template<typename _CharT>
414  void
415  _Scanner<_CharT>::
416  _M_eat_escape_awk()
417  {
418  auto __c = *_M_current++;
419  auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
420 
421  if (__pos != nullptr)
422  {
423  _M_token = _S_token_ord_char;
424  _M_value.assign(1, *__pos);
425  }
426  // \ddd for oct representation
427  else if (_M_ctype.is(_CtypeT::digit, __c)
428  && __c != '8'
429  && __c != '9')
430  {
431  _M_value.assign(1, __c);
432  for (int __i = 0;
433  __i < 2
434  && _M_current != _M_end
435  && _M_ctype.is(_CtypeT::digit, *_M_current)
436  && *_M_current != '8'
437  && *_M_current != '9';
438  __i++)
439  _M_value += *_M_current++;
440  _M_token = _S_token_oct_num;
441  return;
442  }
443  else
444  __throw_regex_error(regex_constants::error_escape);
445  }
446 
447  // Eats a character class or throws an exception.
448  // __ch could be ':', '.' or '=', _M_current is the char after ']' when
449  // returning.
450  template<typename _CharT>
451  void
452  _Scanner<_CharT>::
453  _M_eat_class(char __ch)
454  {
455  for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
456  _M_value += *_M_current++;
457  if (_M_current == _M_end
458  || *_M_current++ != __ch
459  || _M_current == _M_end // skip __ch
460  || *_M_current++ != ']') // skip ']'
461  {
462  if (__ch == ':')
463  __throw_regex_error(regex_constants::error_ctype);
464  else
465  __throw_regex_error(regex_constants::error_collate);
466  }
467  }
468 
469 #ifdef _GLIBCXX_DEBUG
470  template<typename _CharT>
471  std::ostream&
472  _Scanner<_CharT>::
473  _M_print(std::ostream& ostr)
474  {
475  switch (_M_token)
476  {
477  case _S_token_anychar:
478  ostr << "any-character\n";
479  break;
480  case _S_token_backref:
481  ostr << "backref\n";
482  break;
483  case _S_token_bracket_begin:
484  ostr << "bracket-begin\n";
485  break;
486  case _S_token_bracket_neg_begin:
487  ostr << "bracket-neg-begin\n";
488  break;
489  case _S_token_bracket_end:
490  ostr << "bracket-end\n";
491  break;
492  case _S_token_char_class_name:
493  ostr << "char-class-name \"" << _M_value << "\"\n";
494  break;
495  case _S_token_closure0:
496  ostr << "closure0\n";
497  break;
498  case _S_token_closure1:
499  ostr << "closure1\n";
500  break;
501  case _S_token_collsymbol:
502  ostr << "collsymbol \"" << _M_value << "\"\n";
503  break;
504  case _S_token_comma:
505  ostr << "comma\n";
506  break;
507  case _S_token_dup_count:
508  ostr << "dup count: " << _M_value << "\n";
509  break;
510  case _S_token_eof:
511  ostr << "EOF\n";
512  break;
513  case _S_token_equiv_class_name:
514  ostr << "equiv-class-name \"" << _M_value << "\"\n";
515  break;
516  case _S_token_interval_begin:
517  ostr << "interval begin\n";
518  break;
519  case _S_token_interval_end:
520  ostr << "interval end\n";
521  break;
522  case _S_token_line_begin:
523  ostr << "line begin\n";
524  break;
525  case _S_token_line_end:
526  ostr << "line end\n";
527  break;
528  case _S_token_opt:
529  ostr << "opt\n";
530  break;
531  case _S_token_or:
532  ostr << "or\n";
533  break;
534  case _S_token_ord_char:
535  ostr << "ordinary character: \"" << _M_value << "\"\n";
536  break;
537  case _S_token_subexpr_begin:
538  ostr << "subexpr begin\n";
539  break;
540  case _S_token_subexpr_no_group_begin:
541  ostr << "no grouping subexpr begin\n";
542  break;
543  case _S_token_subexpr_lookahead_begin:
544  ostr << "lookahead subexpr begin\n";
545  break;
546  case _S_token_subexpr_end:
547  ostr << "subexpr end\n";
548  break;
549  case _S_token_unknown:
550  ostr << "-- unknown token --\n";
551  break;
552  case _S_token_oct_num:
553  ostr << "oct number " << _M_value << "\n";
554  break;
555  case _S_token_hex_num:
556  ostr << "hex number " << _M_value << "\n";
557  break;
558  case _S_token_quoted_class:
559  ostr << "quoted class " << "\\" << _M_value << "\n";
560  break;
561  default:
562  _GLIBCXX_DEBUG_ASSERT(false);
563  }
564  return ostr;
565  }
566 #endif
567 
568 _GLIBCXX_END_NAMESPACE_VERSION
569 } // namespace __detail
570 } // namespace
constexpr error_type error_ctype(_S_error_ctype)
constexpr syntax_option_type nosubs
constexpr error_type error_brack(_S_error_brack)
constexpr error_type error_collate(_S_error_collate)
constexpr error_type error_badbrace(_S_error_badbrace)
constexpr error_type error_escape(_S_error_escape)
constexpr error_type error_brace(_S_error_brace)
Container class for localization functionality.The locale class is first a class wrapper for C librar...
constexpr error_type error_paren(_S_error_paren)
const _Facet & use_facet(const locale &__loc)
Return a facet.use_facet looks for and returns a reference to a facet of type Facet where Facet is th...