PageRenderTime 60ms CodeModel.GetById 21ms app.highlight 33ms RepoModel.GetById 1ms app.codeStats 0ms

/Src/Dependencies/Boost/boost/spirit/home/support/detail/lexer/parser/parser.hpp

http://hadesmem.googlecode.com/
C++ Header | 511 lines | 398 code | 70 blank | 43 comment | 65 complexity | a016dfdaa8dfa86881f8ba87cf5273a4 MD5 | raw file
  1// parser.hpp
  2// Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
  3//
  4// Distributed under the Boost Software License, Version 1.0. (See accompanying
  5// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6#ifndef BOOST_LEXER_PARSER_HPP
  7#define BOOST_LEXER_PARSER_HPP
  8
  9#include <boost/assert.hpp>
 10#include "tree/end_node.hpp"
 11#include "tree/iteration_node.hpp"
 12#include "tree/leaf_node.hpp"
 13#include "../runtime_error.hpp"
 14#include "tree/selection_node.hpp"
 15#include "tree/sequence_node.hpp"
 16#include "../size_t.hpp"
 17#include "tokeniser/re_tokeniser.hpp"
 18
 19namespace boost
 20{
 21namespace lexer
 22{
 23namespace detail
 24{
 25template<typename CharT>
 26class basic_parser
 27{
 28public:
 29    typedef basic_re_tokeniser<CharT> tokeniser;
 30    typedef typename tokeniser::string string;
 31    typedef std::map<string, const node *> macro_map;
 32    typedef node::node_ptr_vector node_ptr_vector;
 33    typedef typename tokeniser::num_token token;
 34
 35/*
 36    General principles of regex parsing:
 37    - Every regex is a sequence of sub-regexes.
 38    - Regexes consist of operands and operators
 39    - All operators decompose to sequence, selection ('|') and iteration ('*')
 40    - Regex tokens are stored on the stack.
 41    - When a complete sequence of regex tokens is on the stack it is processed.
 42
 43Grammar:
 44
 45<REGEX>      -> <OREXP>
 46<OREXP>      -> <SEQUENCE> | <OREXP>'|'<SEQUENCE>
 47<SEQUENCE>   -> <SUB>
 48<SUB>        -> <EXPRESSION> | <SUB><EXPRESSION>
 49<EXPRESSION> -> <REPEAT>
 50<REPEAT>     -> charset | macro | '('<REGEX>')' | <REPEAT><DUPLICATE>
 51<DUPLICATE>  -> '?' | '*' | '+' | '{n[,[m]]}'
 52*/
 53    static node *parse (const CharT *start_, const CharT * const end_,
 54        const std::size_t id_, const std::size_t unique_id_,
 55        const std::size_t dfa_state_, const regex_flags flags_,
 56        const std::locale &locale_, node_ptr_vector &node_ptr_vector_,
 57        const macro_map &macromap_, typename tokeniser::token_map &map_,
 58        bool &seen_BOL_assertion_, bool &seen_EOL_assertion_)
 59    {
 60        node *root_ = 0;
 61        state state_ (start_, end_, flags_, locale_);
 62        token lhs_token_;
 63        token rhs_token_;
 64        token_stack token_stack_;
 65        tree_node_stack tree_node_stack_;
 66        char action_ = 0;
 67
 68        token_stack_.push (rhs_token_);
 69        tokeniser::next (state_, map_, rhs_token_);
 70
 71        do
 72        {
 73            lhs_token_ = token_stack_.top ();
 74            action_ = lhs_token_.precedence (rhs_token_._type);
 75
 76            switch (action_)
 77            {
 78            case '<':
 79            case '=':
 80                token_stack_.push (rhs_token_);
 81                tokeniser::next (state_, map_, rhs_token_);
 82                break;
 83            case '>':
 84                reduce (token_stack_, macromap_, node_ptr_vector_,
 85                    tree_node_stack_);
 86                break;
 87            default:
 88                std::ostringstream ss_;
 89
 90                ss_ << "A syntax error occurred: '" <<
 91                    lhs_token_.precedence_string () <<
 92                    "' against '" << rhs_token_.precedence_string () <<
 93                    "' at index " << state_.index () << ".";
 94                throw runtime_error (ss_.str ().c_str ());
 95                break;
 96            }
 97        } while (!token_stack_.empty ());
 98
 99        if (tree_node_stack_.empty ())
100        {
101            throw runtime_error ("Empty rules are not allowed.");
102        }
103
104        BOOST_ASSERT(tree_node_stack_.size () == 1);
105
106        node *lhs_node_ = tree_node_stack_.top ();
107
108        tree_node_stack_.pop ();
109
110        if (id_ == 0)
111        {
112            // Macros have no end state...
113            root_ = lhs_node_;
114        }
115        else
116        {
117            node_ptr_vector_->push_back (static_cast<end_node *>(0));
118
119            node *rhs_node_ = new end_node (id_, unique_id_, dfa_state_);
120
121            node_ptr_vector_->back () = rhs_node_;
122            node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
123            node_ptr_vector_->back () = new sequence_node
124                (lhs_node_, rhs_node_);
125            root_ = node_ptr_vector_->back ();
126        }
127
128        // Done this way as bug in VC++ 6 prevents |= operator working
129        // properly!
130        if (state_._seen_BOL_assertion) seen_BOL_assertion_ = true;
131
132        if (state_._seen_EOL_assertion) seen_EOL_assertion_ = true;
133
134        return root_;
135    }
136
137private:
138    typedef typename tokeniser::state state;
139    typedef std::stack<token> token_stack;
140    typedef node::node_stack tree_node_stack;
141
142    static void reduce (token_stack &token_stack_,
143        const macro_map &macromap_, node_ptr_vector &node_vector_ptr_,
144        tree_node_stack &tree_node_stack_)
145    {
146        typename tokeniser::num_token lhs_;
147        typename tokeniser::num_token rhs_;
148        token_stack handle_;
149        char action_ = 0;
150
151        do
152        {
153            rhs_ = token_stack_.top ();
154            token_stack_.pop ();
155            handle_.push (rhs_);
156
157            if (!token_stack_.empty ())
158            {
159                lhs_ = token_stack_.top ();
160                action_ = lhs_.precedence (rhs_._type);
161            }
162        } while (!token_stack_.empty () && action_ == '=');
163
164        BOOST_ASSERT(token_stack_.empty () || action_ == '<');
165
166        switch (rhs_._type)
167        {
168        case token::BEGIN:
169            // finished processing so exit
170            break;
171        case token::REGEX:
172            // finished parsing, nothing to do
173            break;
174        case token::OREXP:
175            orexp (handle_, token_stack_, node_vector_ptr_, tree_node_stack_);
176            break;
177        case token::SEQUENCE:
178            token_stack_.push (token::OREXP);
179            break;
180        case token::SUB:
181            sub (handle_, token_stack_, node_vector_ptr_, tree_node_stack_);
182            break;
183        case token::EXPRESSION:
184            token_stack_.push (token::SUB);
185            break;
186        case token::REPEAT:
187            repeat (handle_, token_stack_);
188            break;
189        case token::CHARSET:
190            charset (handle_, token_stack_, node_vector_ptr_,
191                tree_node_stack_);
192            break;
193        case token::MACRO:
194            macro (handle_, token_stack_, macromap_, node_vector_ptr_,
195                tree_node_stack_);
196            break;
197        case token::OPENPAREN:
198            openparen (handle_, token_stack_);
199            break;
200        case token::OPT:
201        case token::AOPT:
202            optional (rhs_._type == token::OPT, node_vector_ptr_,
203                tree_node_stack_);
204            token_stack_.push (token::DUP);
205            break;
206        case token::ZEROORMORE:
207        case token::AZEROORMORE:
208            zero_or_more (rhs_._type == token::ZEROORMORE, node_vector_ptr_,
209                tree_node_stack_);
210            token_stack_.push (token::DUP);
211            break;
212        case token::ONEORMORE:
213        case token::AONEORMORE:
214            one_or_more (rhs_._type == token::ONEORMORE, node_vector_ptr_,
215                tree_node_stack_);
216            token_stack_.push (token::DUP);
217            break;
218        case token::REPEATN:
219        case token::AREPEATN:
220            repeatn (rhs_._type == token::REPEATN, handle_.top (),
221                node_vector_ptr_, tree_node_stack_);
222            token_stack_.push (token::DUP);
223            break;
224        default:
225            throw runtime_error
226                ("Internal error regex_parser::reduce");
227            break;
228        }
229    }
230
231    static void orexp (token_stack &handle_, token_stack &token_stack_,
232        node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
233    {
234        BOOST_ASSERT(handle_.top ()._type == token::OREXP &&
235            (handle_.size () == 1 || handle_.size () == 3));
236
237        if (handle_.size () == 1)
238        {
239            token_stack_.push (token::REGEX);
240        }
241        else
242        {
243            handle_.pop ();
244            BOOST_ASSERT(handle_.top ()._type == token::OR);
245            handle_.pop ();
246            BOOST_ASSERT(handle_.top ()._type == token::SEQUENCE);
247            perform_or (node_ptr_vector_, tree_node_stack_);
248            token_stack_.push (token::OREXP);
249        }
250    }
251
252    static void sub (token_stack &handle_, token_stack &token_stack_,
253        node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
254    {
255        BOOST_ASSERT(handle_.top ()._type == token::SUB &&
256            (handle_.size () == 1 || handle_.size () == 2));
257
258        if (handle_.size () == 1)
259        {
260            token_stack_.push (token::SEQUENCE);
261        }
262        else
263        {
264            handle_.pop ();
265            BOOST_ASSERT(handle_.top ()._type == token::EXPRESSION);
266            // perform join
267            sequence (node_ptr_vector_, tree_node_stack_);
268            token_stack_.push (token::SUB);
269        }
270    }
271
272    static void repeat (token_stack &handle_, token_stack &token_stack_)
273    {
274        BOOST_ASSERT(handle_.top ()._type == token::REPEAT &&
275            handle_.size () >= 1 && handle_.size () <= 3);
276
277        if (handle_.size () == 1)
278        {
279            token_stack_.push (token::EXPRESSION);
280        }
281        else
282        {
283            handle_.pop ();
284            BOOST_ASSERT(handle_.top ()._type == token::DUP);
285            token_stack_.push (token::REPEAT);
286        }
287    }
288
289    static void charset (token_stack &handle_, token_stack &token_stack_,
290        node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
291    {
292        BOOST_ASSERT(handle_.top ()._type == token::CHARSET &&
293            handle_.size () == 1);
294        // store charset
295        node_ptr_vector_->push_back (static_cast<leaf_node *>(0));
296
297        const size_t id_ = handle_.top ()._id;
298
299        node_ptr_vector_->back () = new leaf_node (id_, true);
300        tree_node_stack_.push (node_ptr_vector_->back ());
301        token_stack_.push (token::REPEAT);
302    }
303
304    static void macro (token_stack &handle_, token_stack &token_stack_,
305        const macro_map &macromap_, node_ptr_vector &node_ptr_vector_,
306        tree_node_stack &tree_node_stack_)
307    {
308        token &top_ = handle_.top ();
309
310        BOOST_ASSERT(top_._type == token::MACRO && handle_.size () == 1);
311
312        typename macro_map::const_iterator iter_ =
313            macromap_.find (top_._macro);
314
315        if (iter_ == macromap_.end ())
316        {
317            const CharT *name_ = top_._macro;
318            std::basic_stringstream<CharT> ss_;
319            std::ostringstream os_;
320
321            os_ << "Unknown MACRO name '";
322
323            while (*name_)
324            {
325                os_ << ss_.narrow (*name_++, ' ');
326            }
327
328            os_ << "'.";
329            throw runtime_error (os_.str ());
330        }
331
332        tree_node_stack_.push (iter_->second->copy (node_ptr_vector_));
333        token_stack_.push (token::REPEAT);
334    }
335
336    static void openparen (token_stack &handle_, token_stack &token_stack_)
337    {
338        BOOST_ASSERT(handle_.top ()._type == token::OPENPAREN &&
339            handle_.size () == 3);
340        handle_.pop ();
341        BOOST_ASSERT(handle_.top ()._type == token::REGEX);
342        handle_.pop ();
343        BOOST_ASSERT(handle_.top ()._type == token::CLOSEPAREN);
344        token_stack_.push (token::REPEAT);
345    }
346
347    static void perform_or (node_ptr_vector &node_ptr_vector_,
348        tree_node_stack &tree_node_stack_)
349    {
350        // perform or
351        node *rhs_ = tree_node_stack_.top ();
352
353        tree_node_stack_.pop ();
354
355        node *lhs_ = tree_node_stack_.top ();
356
357        node_ptr_vector_->push_back (static_cast<selection_node *>(0));
358        node_ptr_vector_->back () = new selection_node (lhs_, rhs_);
359        tree_node_stack_.top () = node_ptr_vector_->back ();
360    }
361
362    static void sequence (node_ptr_vector &node_ptr_vector_,
363        tree_node_stack &tree_node_stack_)
364    {
365        node *rhs_ = tree_node_stack_.top ();
366
367        tree_node_stack_.pop ();
368
369        node *lhs_ = tree_node_stack_.top ();
370
371        node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
372        node_ptr_vector_->back () = new sequence_node (lhs_, rhs_);
373        tree_node_stack_.top () = node_ptr_vector_->back ();
374    }
375
376    static void optional (const bool greedy_,
377        node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
378    {
379        // perform ?
380        node *lhs_ = tree_node_stack_.top ();
381        // You don't know if lhs_ is a leaf_node, so get firstpos.
382        node::node_vector &firstpos_ = lhs_->firstpos ();
383
384        for (node::node_vector::iterator iter_ = firstpos_.begin (),
385            end_ = firstpos_.end (); iter_ != end_; ++iter_)
386        {
387            // These are leaf_nodes!
388            (*iter_)->greedy (greedy_);
389        }
390
391        node_ptr_vector_->push_back (static_cast<leaf_node *>(0));
392
393        node *rhs_ = new leaf_node (null_token, greedy_);
394
395        node_ptr_vector_->back () = rhs_;
396        node_ptr_vector_->push_back (static_cast<selection_node *>(0));
397        node_ptr_vector_->back () = new selection_node (lhs_, rhs_);
398        tree_node_stack_.top () = node_ptr_vector_->back ();
399    }
400
401    static void zero_or_more (const bool greedy_,
402        node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
403    {
404        // perform *
405        node *ptr_ = tree_node_stack_.top ();
406
407        node_ptr_vector_->push_back (static_cast<iteration_node *>(0));
408        node_ptr_vector_->back () = new iteration_node (ptr_, greedy_);
409        tree_node_stack_.top () = node_ptr_vector_->back ();
410    }
411
412    static void one_or_more (const bool greedy_,
413        node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
414    {
415        // perform +
416        node *lhs_ = tree_node_stack_.top ();
417        node *copy_ = lhs_->copy (node_ptr_vector_);
418
419        node_ptr_vector_->push_back (static_cast<iteration_node *>(0));
420
421        node *rhs_ = new iteration_node (copy_, greedy_);
422
423        node_ptr_vector_->back () = rhs_;
424        node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
425        node_ptr_vector_->back () = new sequence_node (lhs_, rhs_);
426        tree_node_stack_.top () = node_ptr_vector_->back ();
427    }
428
429    // This is one of the most mind bending routines in this code...
430    static void repeatn (const bool greedy_, const token &token_,
431        node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
432    {
433        // perform {n[,[m]]}
434        // Semantic checks have already been performed.
435        // {0,}  = *
436        // {0,1} = ?
437        // {1,}  = +
438        // therefore we do not check for these cases.
439        if (!(token_._min == 1 && !token_._comma))
440        {
441            const std::size_t top_ = token_._min > 0 ?
442                token_._min : token_._max;
443
444            if (token_._min == 0)
445            {
446                optional (greedy_, node_ptr_vector_, tree_node_stack_);
447            }
448
449            node *prev_ = tree_node_stack_.top ()->copy (node_ptr_vector_);
450            node *curr_ = 0;
451
452            for (std::size_t i_ = 2; i_ < top_; ++i_)
453            {
454                curr_ = prev_->copy (node_ptr_vector_);
455                tree_node_stack_.push (static_cast<node *>(0));
456                tree_node_stack_.top () = prev_;
457                sequence (node_ptr_vector_, tree_node_stack_);
458                prev_ = curr_;
459            }
460
461            if (token_._comma && token_._min > 0)
462            {
463                if (token_._min > 1)
464                {
465                    curr_ = prev_->copy (node_ptr_vector_);
466                    tree_node_stack_.push (static_cast<node *>(0));
467                    tree_node_stack_.top () = prev_;
468                    sequence (node_ptr_vector_, tree_node_stack_);
469                    prev_ = curr_;
470                }
471
472                if (token_._comma && token_._max)
473                {
474                    tree_node_stack_.push (static_cast<node *>(0));
475                    tree_node_stack_.top () = prev_;
476                    optional (greedy_, node_ptr_vector_, tree_node_stack_);
477                    prev_ = tree_node_stack_.top ();
478                    tree_node_stack_.pop ();
479
480                    const std::size_t count_ = token_._max - token_._min;
481
482                    for (std::size_t i_ = 1; i_ < count_; ++i_)
483                    {
484                        curr_ = prev_->copy (node_ptr_vector_);
485                        tree_node_stack_.push (static_cast<node *>(0));
486                        tree_node_stack_.top () = prev_;
487                        sequence (node_ptr_vector_, tree_node_stack_);
488                        prev_ = curr_;
489                    }
490                }
491                else
492                {
493                    tree_node_stack_.push (static_cast<node *>(0));
494                    tree_node_stack_.top () = prev_;
495                    zero_or_more (greedy_, node_ptr_vector_, tree_node_stack_);
496                    prev_ = tree_node_stack_.top ();
497                    tree_node_stack_.pop ();
498                }
499            }
500
501            tree_node_stack_.push (static_cast<node *>(0));
502            tree_node_stack_.top () = prev_;
503            sequence (node_ptr_vector_, tree_node_stack_);
504        }
505    }
506};
507}
508}
509}
510
511#endif