libstdc++
regex_nfa.h
Go to the documentation of this file.
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2010-2013 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex_nfa.h
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 namespace std _GLIBCXX_VISIBILITY(default)
32 {
33 namespace __detail
34 {
35 _GLIBCXX_BEGIN_NAMESPACE_VERSION
36 
37  /**
38  * @addtogroup regex-detail
39  * @{
40  */
41 
42  /// Provides a generic facade for a templated match_results.
    ///
    /// Abstract interface: every member except the destructor is declared
    /// pure virtual here, so this header only fixes the signatures; the
    /// behaviour is supplied by implementers that are not part of this file.
43  struct _Results
44  {
    // Virtual destructor, so an implementer can be destroyed through a
    // _Results pointer (for example the unique_ptr returned by _M_clone()).
45  virtual
46  ~_Results()
47  { }
    // Within this file the taggers are the only callers: _StartTagger passes
    // (_M_index, 0, cursor) and _EndTagger passes (_M_index, 1, cursor).
48  virtual void _M_set_pos(int __i, int __j, const _PatternCursor& __p) = 0;
    // NOTE(review): presumably marks entry __i as matched / not matched; the
    // contract is not visible in this file -- confirm against the implementer.
49  virtual void _M_set_matched(int __i, bool __is_matched) = 0;
    // Returns a _Results owned by a std::unique_ptr (a copy of *this, judging
    // by the name -- confirm against the implementer).
50  virtual std::unique_ptr<_Results> _M_clone() const = 0;
    // Takes another _Results by const reference; presumably copies its state
    // into *this -- confirm against the implementer.
51  virtual void _M_assign(const _Results& __rhs) = 0;
52  };
53 
54  class _Grep_matcher;
    // Forward declarations. _Automaton is defined right below; _Grep_matcher
    // is only declared, never defined, in this header listing.
55  class _Automaton;
56 
57  /// Generic shared pointer to an automaton.
    // NOTE(review): listing line 58, which the comment above documents, is
    // absent from this listing. _AutomatonPtr is used further down without a
    // visible declaration, so it was presumably declared on that line --
    // confirm against the upstream header.
59 
60  /// Base class for, um, automata. Could be an NFA or a DFA. Your choice.
    ///
    /// Abstract interface: the member functions visibly marked `= 0` below
    /// are pure virtual. _Nfa, later in this file, is the only derived class
    /// visible here.
61  class _Automaton
62  {
63  public:
    // Unsigned type returned by _M_sub_count().
64  typedef unsigned int _SizeT;
65 
66  public:
    // Virtual destructor for deletion through a base-class pointer.
67  virtual
68  ~_Automaton() { }
69 
    // _Nfa implements this by returning its sub-expression counter.
70  virtual _SizeT
71  _M_sub_count() const = 0;
72 
    // NOTE(review): listing lines 73 and 77 are absent, so this declaration
    // is shown without its return type and without the end of its parameter
    // list. The visible text is not the full signature -- consult the
    // upstream header before relying on it.
74  _M_get_matcher(_PatternCursor& __p,
75  _Results& __r,
76  const _AutomatonPtr& __automaton,
78 
    // Declared only when _GLIBCXX_DEBUG is defined. Takes and returns a
    // std::ostream reference; _State::_M_dot in this file is documented as
    // printing graphviz dot commands, so this presumably does the same for
    // the whole automaton -- confirm against the definition.
79 #ifdef _GLIBCXX_DEBUG
80  virtual std::ostream&
81  _M_dot(std::ostream& __ostr) const = 0;
82 #endif
83  };
84 
85  /// Operation codes that define the type of transitions within the base NFA
86  /// that represents the regular expression.
    ///
    /// One of these values is stored in _State::_M_opcode by the _State
    /// constructors later in this file.
87  enum _Opcode
88  {
    // Not assigned by any code visible in this file.
89  _S_opcode_unknown = 0,
    // State with two successors; set by _State(_StateIdT, _StateIdT).
90  _S_opcode_alternative = 1,
    // Sub-expression begin / end states; created with a _Tagger by
    // _Nfa::_M_insert_subexpr_begin() / _Nfa::_M_insert_subexpr_end().
91  _S_opcode_subexpr_begin = 4,
92  _S_opcode_subexpr_end = 5,
    // State guarded by a _Matcher predicate; set by _State(const _Matcher&).
93  _S_opcode_match = 100,
    // Accepting state; created by _Nfa::_M_insert_accept().
94  _S_opcode_accept = 255
95  };
96 
97  /// Tags current state (for subexpr begin/end).
    ///
    /// Type-erased callable receiving the current cursor (read-only) and the
    /// results object (mutable). _StartTagger and _EndTagger in this file
    /// provide a call operator with this parameter list.
98  typedef std::function<void (const _PatternCursor&, _Results&)> _Tagger;
99 
100  /// Start state tag.
     ///
     /// Function object that remembers an index and, when invoked, reports the
     /// cursor to the results object with a second argument of 0.
     // NOTE(review): listing line 102 (the line that names this type) is
     // absent from this listing; the constructor below shows the name is
     // _StartTagger. Neither template parameter is used in the visible body.
101  template<typename _FwdIterT, typename _TraitsT>
103  {
     // Stores the index that is later forwarded to _M_set_pos().
104  explicit
105  _StartTagger(int __i)
106  : _M_index(__i)
107  { }
108 
     // Forwards (stored index, 0, cursor) to the results object. Not declared
     // const, unlike the matchers' call operators further down.
109  void
110  operator()(const _PatternCursor& __pc, _Results& __r)
111  { __r._M_set_pos(_M_index, 0, __pc); }
112 
     // Index given at construction; public data member.
113  int _M_index;
114  };
115 
116  /// End state tag.
     ///
     /// Function object that remembers an index and, when invoked, reports the
     /// cursor to the results object with a second argument of 1. Apart from
     /// that constant it is identical to _StartTagger. Neither template
     /// parameter is used in the body.
117  template<typename _FwdIterT, typename _TraitsT>
118  struct _EndTagger
119  {
     // Stores the index that is later forwarded to _M_set_pos().
120  explicit
121  _EndTagger(int __i)
122  : _M_index(__i)
123  { }
124 
     // Forwards (stored index, 1, cursor) to the results object. Not declared
     // const.
125  void
126  operator()(const _PatternCursor& __pc, _Results& __r)
127  { __r._M_set_pos(_M_index, 1, __pc); }
128 
     // Index given at construction; public data member.
129  int _M_index;
130  };
131 
132  /// Indicates if current state matches cursor current.
     ///
     /// Type-erased predicate over a read-only cursor. _CharMatcher and
     /// _RangeMatcher in this file provide a const call operator with this
     /// parameter list; _State stores one of these in _M_matches.
133  typedef std::function<bool (const _PatternCursor&)> _Matcher;
134 
135  /// Matches any character
     // NOTE(review): listing line 137 (the function name and parameter list)
     // is absent from this listing, so the declarator cannot be documented
     // from here. The visible body ignores its input and always returns true.
136  inline bool
138  { return true; }
139 
140  /// Matches a single character
     ///
     /// Predicate that compares the traits-translated character under the
     /// cursor with a traits-translated character fixed at construction.
     // NOTE(review): listing line 142 (the line that names this type) is
     // absent from this listing; the constructor below shows the name is
     // _CharMatcher.
141  template<typename _InIterT, typename _TraitsT>
143  {
144  typedef typename _TraitsT::char_type char_type;
145 
     // Stores a reference to the traits object and the translated form of
     // __c. _M_traits is declared before _M_c, so it is already bound when
     // the _M_c initializer calls _M_traits.translate().
     // NOTE(review): _M_traits is a reference member. When the default
     // argument `_TraitsT()` is used, it is bound to a temporary that is
     // destroyed at the end of the constructor call, so later uses of
     // _M_traits would refer to a dead object -- confirm that every caller
     // passes a traits object that outlives the matcher.
146  explicit
147  _CharMatcher(char_type __c, const _TraitsT& __t = _TraitsT())
148  : _M_traits(__t), _M_c(_M_traits.translate(__c))
149  { }
150 
     // Returns true when the translated current character equals _M_c.
151  bool
152  operator()(const _PatternCursor& __pc) const
153  {
154  typedef const _SpecializedCursor<_InIterT>& _CursorT;
     // static_cast performs no run-time check: the code assumes every cursor
     // handed to this matcher really is a _SpecializedCursor<_InIterT>.
155  _CursorT __c = static_cast<_CursorT>(__pc);
156  return _M_traits.translate(__c._M_current()) == _M_c;
157  }
158 
     // Traits object, held by reference (not owned).
159  const _TraitsT& _M_traits;
     // Character to match, already passed through translate().
160  char_type _M_c;
161  };
162 
163  /// Matches a character range (bracket expression)
     ///
     /// As written in this listing the type is a stub: the call operator
     /// always returns true and every _M_add_* / _M_make_range member has an
     /// empty body, so nothing added to the matcher influences the result.
     // NOTE(review): listing lines 165 (the line that names this type) and
     // 168 are absent. The constructor shows the name is _RangeMatcher;
     // _StringT is used below without a visible declaration and was
     // presumably declared on line 168 -- confirm against the upstream header.
164  template<typename _InIterT, typename _TraitsT>
166  {
167  typedef typename _TraitsT::char_type _CharT;
169 
     // Stores the traits reference and the negation flag.
     // NOTE(review): _M_traits is a reference member; with the default
     // argument `_TraitsT()` it is bound to a temporary that is destroyed at
     // the end of the constructor call. It is not read anywhere in this
     // listing, but any future use would touch a dead object -- confirm.
170  explicit
171  _RangeMatcher(bool __is_non_matching, const _TraitsT& __t = _TraitsT())
172  : _M_traits(__t), _M_is_non_matching(__is_non_matching)
173  { }
174 
     // Always returns true. The cursor is cast (unchecked static_cast) but the
     // resulting __c is never used, and _M_is_non_matching is not consulted.
175  bool
176  operator()(const _PatternCursor& __pc) const
177  {
178  typedef const _SpecializedCursor<_InIterT>& _CursorT;
179  _CursorT __c = static_cast<_CursorT>(__pc);
180  return true;
181  }
182 
     // The five members below accept their arguments and do nothing.
183  void
184  _M_add_char(_CharT __c)
185  { }
186 
187  void
188  _M_add_collating_element(const _StringT& __s)
189  { }
190 
191  void
192  _M_add_equivalence_class(const _StringT& __s)
193  { }
194 
195  void
196  _M_add_character_class(const _StringT& __s)
197  { }
198 
199  void
200  _M_make_range()
201  { }
202 
     // Traits object, held by reference (not owned).
203  const _TraitsT& _M_traits;
     // Flag given at construction; written once and never read in this file.
204  bool _M_is_non_matching;
205  };
206 
207  /// Identifies a state in the NFA.
     ///
     /// The _Nfa::_M_insert_* functions in this file return `size()-1` of the
     /// state vector as a _StateIdT, i.e. the index of the state just added.
208  typedef int _StateIdT;
209 
210  /// The special case in which a state identifier is not an index.
     ///
     /// Used by the _State constructors as the initial value of _M_next.
     /// Declared `static const` at namespace scope in a header, so it has
     /// internal linkage (one copy per translation unit).
211  static const _StateIdT _S_invalid_state_id = -1;
212 
213 
214  /**
215  * @brief struct _State
216  *
217  * An individual state in an NFA
218  *
219  * In this case a "state" is an entry in the NFA definition coupled
220  * with its outgoing transition(s). All states have a single outgoing
221  * transition, except for accepting states (which have no outgoing
222  * transitions) and alt states, which have two outgoing transitions.
     *
     * Each constructor below sets only the members relevant to the kind of
     * state it builds; see the notes on the individual constructors for the
     * members that are left without an initializer.
223  */
224  struct _State
225  {
     // Opcode storage type; the constructors assign _Opcode enumerators to it.
226  typedef int _OpcodeT;
227 
228  _OpcodeT _M_opcode; // type of outgoing transition
229  _StateIdT _M_next; // outgoing transition
230  _StateIdT _M_alt; // for _S_opcode_alternative
231  unsigned int _M_subexpr; // for _S_opcode_subexpr_*
232  _Tagger _M_tagger; // for _S_opcode_subexpr_*
233  _Matcher _M_matches; // for _S_opcode_match
234 
     // Builds a state with the given opcode and no successor yet
     // (_M_next = _S_invalid_state_id). _Nfa::_M_insert_accept() uses this
     // with _S_opcode_accept.
     // NOTE(review): _M_alt and _M_subexpr have no initializer here and are
     // scalar types, so they hold indeterminate values; the two
     // std::function members are default-constructed (empty).
235  explicit _State(_OpcodeT __opcode)
236  : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
237  { }
238 
     // Builds a _S_opcode_match state guarded by predicate __m. Not declared
     // explicit, so a _Matcher converts implicitly to a _State.
     // NOTE(review): _M_alt and _M_subexpr are left uninitialized.
239  _State(const _Matcher& __m)
240  : _M_opcode(_S_opcode_match), _M_next(_S_invalid_state_id), _M_matches(__m)
241  { }
242 
     // Builds a sub-expression state: __opcode is supplied by the caller (the
     // _Nfa inserters pass _S_opcode_subexpr_begin / _S_opcode_subexpr_end),
     // __s is the sub-expression number and __t the tagger to store.
     // NOTE(review): _M_alt is left uninitialized.
243  _State(_OpcodeT __opcode, unsigned int __s, const _Tagger& __t)
244  : _M_opcode(__opcode), _M_next(_S_invalid_state_id), _M_subexpr(__s),
245  _M_tagger(__t)
246  { }
247 
     // Builds a _S_opcode_alternative state with both successors given.
     // NOTE(review): _M_subexpr is left uninitialized.
248  _State(_StateIdT __next, _StateIdT __alt)
249  : _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt)
250  { }
251 
     // Debug-only helpers, declared here and defined elsewhere (not in this
     // listing).
252 #ifdef _GLIBCXX_DEBUG
     // Takes and returns a stream reference. Note the parameter is spelled
     // `ostr`, without the `__` prefix used by every other parameter here.
253  std::ostream&
254  _M_print(std::ostream& ostr) const;
255 
256  // Prints graphviz dot commands for state.
257  std::ostream&
258  _M_dot(std::ostream& __ostr, _StateIdT __id) const;
259 #endif
260  };
261 
262 
263  /// The Grep Matcher works on sets of states. Here are sets of states.
265 
266  /**
267  * @brief struct _Nfa
268  *
269  * A collection of all states making up an NFA.
270  *
271  * An NFA is a 4-tuple M = (K, S, s, F), where
272  * K is a finite set of states,
273  * S is the alphabet of the NFA,
274  * s is the initial state,
275  * F is a set of final (accepting) states.
276  *
277  * This NFA class is templated on S, a type that will hold values of the
278  * underlying alphabet (without regard to semantics of that alphabet). The
279  * other elements of the tuple are generated during construction of the NFA
280  * and are available through accessor member functions.
     *
     * NOTE(review): as declared below, _Nfa is an ordinary class, not a
     * template, so the "templated on S" sentence above does not describe
     * this declaration.
     *
     * The class derives publicly from std::vector<_State>: the states are
     * the vector's elements and every _M_insert_* member returns the index
     * of the element it appended.
281  */
282  class _Nfa
283  : public _Automaton, public std::vector<_State>
284  {
285  public:
286  typedef _State _StateT;
287  typedef unsigned int _SizeT;
     // NOTE(review): listing line 288 is absent. _FlagT is used below without
     // a visible declaration and was presumably declared there -- confirm
     // against the upstream header.
289 
     // Stores the flags and starts with start state 0, no sub-expressions and
     // the back-reference flag cleared. Not declared explicit.
290  _Nfa(_FlagT __f)
291  : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0),
292  // TODO: BFS by default. Your choice. Need to be set by the compiler.
293  _M_has_back_ref(false)
294  { }
295 
296  ~_Nfa()
297  { }
298 
     // Returns the flags given at construction.
299  _FlagT
300  _M_options() const
301  { return _M_flags; }
302 
     // Returns the start state id; within this file it is only ever set to 0
     // by the constructor.
303  _StateIdT
304  _M_start() const
305  { return _M_start_state; }
306 
     // Returns the set of ids recorded by _M_insert_accept().
307  const _StateSet&
308  _M_final_states() const
309  { return _M_accepting_states; }
310 
     // Number of sub-expressions begun so far; same signature as the pure
     // virtual _Automaton::_M_sub_count().
311  _SizeT
312  _M_sub_count() const
313  { return _M_subexpr_count; }
314 
     // Appends an accepting state, records its index in the accepting set and
     // returns that index. In all _M_insert_* members, `this->size()-1` is an
     // unsigned size converted to _StateIdT (int) on return.
315  _StateIdT
316  _M_insert_accept()
317  {
318  this->push_back(_StateT(_S_opcode_accept));
319  _M_accepting_states.insert(this->size()-1);
320  return this->size()-1;
321  }
322 
     // Appends an alternative state with successors __next and __alt and
     // returns its index.
323  _StateIdT
324  _M_insert_alt(_StateIdT __next, _StateIdT __alt)
325  {
326  this->push_back(_StateT(__next, __alt));
327  return this->size()-1;
328  }
329 
     // Appends a match state guarded by __m (taken by value) and returns its
     // index.
330  _StateIdT
331  _M_insert_matcher(_Matcher __m)
332  {
333  this->push_back(_StateT(__m));
334  return this->size()-1;
335  }
336 
     // Appends a sub-expression-begin state and returns its index. The state
     // is numbered with the counter's value before the post-increment, so the
     // first sub-expression gets number 0.
337  _StateIdT
338  _M_insert_subexpr_begin(const _Tagger& __t)
339  {
340  this->push_back(_StateT(_S_opcode_subexpr_begin, _M_subexpr_count++,
341  __t));
342  return this->size()-1;
343  }
344 
     // Appends a sub-expression-end state numbered __i (supplied by the
     // caller; the counter is not touched) and returns its index.
345  _StateIdT
346  _M_insert_subexpr_end(unsigned int __i, const _Tagger& __t)
347  {
348  this->push_back(_StateT(_S_opcode_subexpr_end, __i, __t));
349  return this->size()-1;
350  }
351 
     // Sets the back-reference flag. The flag is not read anywhere in this
     // listing.
352  void
353  _M_set_back_ref(bool __b)
354  { _M_has_back_ref = __b; }
355 
     // NOTE(review): listing lines 356 and 360 are absent, so this
     // declaration is shown without its return type and without the end of
     // its parameter list. The visible parameters mirror
     // _Automaton::_M_get_matcher.
357  _M_get_matcher(_PatternCursor& __p,
358  _Results& __r,
359  const _AutomatonPtr& __automaton,
361 
     // Debug-only; declared here, defined elsewhere (not in this listing).
362 #ifdef _GLIBCXX_DEBUG
363  std::ostream&
364  _M_dot(std::ostream& __ostr) const;
365 #endif
366 
367  private:
     // Flags given at construction.
368  _FlagT _M_flags;
     // Id of the start state.
369  _StateIdT _M_start_state;
     // Ids of the states added through _M_insert_accept().
370  _StateSet _M_accepting_states;
     // Incremented once per _M_insert_subexpr_begin() call.
371  _SizeT _M_subexpr_count;
     // Written by _M_set_back_ref().
372  bool _M_has_back_ref;
373  };
374 
375  /// Describes a sequence of one or more %_State, its current start
376  /// and end(s). This structure contains fragments of an NFA during
377  /// construction.
     ///
     /// Holds a reference to the _Nfa being built plus three state ids: one
     /// start and two ends. The two "split" constructors have a side effect:
     /// each inserts a new alternative state into the referenced _Nfa.
378  class _StateSeq
379  {
380  public:
381  // Constructs a single-node sequence
     // NOTE(review): listing line 382 (this constructor's name and parameter
     // list) is absent from this listing. From the initializer list: __ss is
     // the NFA, __s becomes both the start and the first end, and __e becomes
     // the second end.
383  : _M_nfa(__ss), _M_start(__s), _M_end1(__s), _M_end2(__e)
384  { }
385  // Constructs a split sequence from two other sequences
     // Inserts an alternative state whose successors are the starts of __e1
     // and __e2; that new state becomes this sequence's start. The ends are
     // taken from __e1._M_end1 and __e2._M_end1 (the inputs' _M_end2 values
     // are not carried over). _M_nfa is declared before _M_start, so it is
     // already bound when the _M_start initializer calls into it.
386  _StateSeq(const _StateSeq& __e1, const _StateSeq& __e2)
387  : _M_nfa(__e1._M_nfa),
388  _M_start(_M_nfa._M_insert_alt(__e1._M_start, __e2._M_start)),
389  _M_end1(__e1._M_end1), _M_end2(__e2._M_end1)
390  { }
391 
392  // Constructs a split sequence from a single sequence
     // Inserts an alternative state whose `next` successor is __id and whose
     // `alt` successor is the start of __e; that new state becomes the start.
     // The ends are __id and __e._M_end1.
393  _StateSeq(const _StateSeq& __e, _StateIdT __id)
394  : _M_nfa(__e._M_nfa),
395  _M_start(_M_nfa._M_insert_alt(__id, __e._M_start)),
396  _M_end1(__id), _M_end2(__e._M_end1)
397  { }
398 
399  // Constructs a copy of a %_StateSeq
     // Member-wise copy; the copy refers to the same _Nfa and no state is
     // inserted.
400  _StateSeq(const _StateSeq& __rhs)
401  : _M_nfa(__rhs._M_nfa), _M_start(__rhs._M_start),
402  _M_end1(__rhs._M_end1), _M_end2(__rhs._M_end2)
403  { }
404 
405 
     // Declared only; the definition is not in this listing. _M_nfa is a
     // reference member and therefore cannot be rebound by assignment.
406  _StateSeq& operator=(const _StateSeq& __rhs);
407 
     // Returns the id of the sequence's start state.
408  _StateIdT
409  _M_front() const
410  { return _M_start; }
411 
     // The members below are declared only; their definitions are not in
     // this listing.
412  // Extends a sequence by one.
413  void
414  _M_push_back(_StateIdT __id);
415 
416  // Extends and maybe joins a sequence.
417  void
418  _M_append(_StateIdT __id);
419 
     // Overload taking another sequence by non-const reference.
420  void
421  _M_append(_StateSeq& __rhs);
422 
423  // Clones an entire sequence.
424  _StateIdT
425  _M_clone();
426 
427  private:
     // NFA this fragment belongs to (not owned).
428  _Nfa& _M_nfa;
     // Id of the first state of the fragment.
429  _StateIdT _M_start;
     // Ids of the fragment's (up to two) end states.
430  _StateIdT _M_end1;
431  _StateIdT _M_end2;
432 
433  };
434 
435  //@} regex-detail
436 _GLIBCXX_END_NAMESPACE_VERSION
437 } // namespace __detail
438 } // namespace std
439 
440 #include <bits/regex_nfa.tcc>
441