PageRenderTime 23ms CodeModel.GetById 12ms app.highlight 6ms RepoModel.GetById 1ms app.codeStats 1ms

/src/lib/regular_expression/low_level/posix_regular_expression_builder.e

http://github.com/tybor/Liberty
Specman e | 560 lines | 468 code | 20 blank | 72 comment | 74 complexity | 437cd7412991a95745c9e744c5b28276 MD5 | raw file
  1-- This file is part of a Liberty Eiffel library.
  2-- See the full copyright at the end.
  3--
  4class POSIX_REGULAR_EXPRESSION_BUILDER
  5   --
  6   -- Parses POSIX regular expressions and builds their matchable form
  7   --
  8   -- regular-expression ::= alternative
  9   -- alternative ::= sequence [ '|' sequence ]...
 10   -- sequence    ::= term [ term ]...
 11   -- term        ::= factor [ repeat-spec ]
 12   -- repeat-spec ::= '?' | '*' | '+' | '{' integer [',' [integer]] '}'
 13   -- factor      ::= group | union | '.' | '^' | '$' | escaped | text
 14   -- group       ::= '(' alternative ')'
 15   -- union       ::= '[' union ']'
 16   -- union       ::= '[' ['^'] union_term... ']'
 17   -- union_term  ::= union_factor ['-' union_factor]
 18   -- union_factor::= '[.' TEXT '.]' | '[:' CLASS ':]' | '[:<:]' | '[:>:]' | CHARACTER
 19   -- escaped     ::= '\' CHARACTER
 20   -- text        ::= A SEQUENCE NOT FOLLOWED BY EITHER '*', '+', '?', '{' OF NOT ESCAPED CHARACTERS
 21
 22inherit
 23   BACKTRACKING_REGULAR_EXPRESSION_BUILDER
 24
 25create {ANY}
 26   make
 27
 28feature {BACKTRACKING_REGULAR_EXPRESSION_BUILDER} -- parsing
 29   internal_parse
 30         -- Main parse of a POSIX regular expression.
 31      do
 32         if end_of_input then
 33            set_error(once "empty regular expression")
 34         else
 35            set_greedy
 36            parse_alternative
 37            if not has_error and then not end_of_input then
 38               set_error(once "extra character(s) found")
 39            end
 40         end
 41      end
 42
 43feature {POSIX_REGULAR_EXPRESSION_BUILDER} -- parsing
 44   parse_alternative
 45         -- Parses an alternative of sequences.
 46         -- alternative ::= sequence [ '|' sequence ]...
 47      require
 48         has_no_error: not has_error
 49         not_at_end: not end_of_input
 50      local
 51         has_empty: BOOLEAN
 52      do
 53         begin_collect
 54         from
 55            if last_character = '|' or else last_character = ')' then
 56               has_empty := True
 57            else
 58               parse_sequence
 59            end
 60         until
 61            has_error or else end_of_input or else last_character /= '|'
 62         loop
 63            read_character
 64            if end_of_input or else last_character = '|' or else last_character = ')' then
 65               has_empty := True
 66            else
 67               parse_sequence
 68            end
 69         end
 70         if not has_error then
 71            if is_collect_empty then
 72               end_collect_true
 73               --set_error(once "empty expression is not allowed")
 74            else
 75               end_collect_or
 76               if has_empty then
 77                  emit_controled_or_true
 78               end
 79            end
 80         end
 81      ensure
 82         error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
 83         state_known: has_error or else end_of_input or else last_character = ')'
 84      end
 85
 86   parse_sequence
 87         -- Parses a sequence of terms.
 88         -- sequence ::= term [ term ]...
 89      require
 90         has_no_error: not has_error
 91         not_at_end: not end_of_input
 92         end_excluded: last_character /= '|' and then last_character /= ')'
 93      do
 94         begin_collect
 95         from
 96            parse_term
 97         until
 98            has_error or else end_of_input or else last_character = '|' or else last_character = ')'
 99         loop
100            parse_term
101         end
102         if not has_error then
103            if is_collect_empty then
104               --emit(the_true_node)
105               set_error(once "empty expression is not allowed")
106            else
107               end_collect_and
108            end
109         end
110      ensure
111         error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
112         state_known: has_error or else end_of_input or else last_character = '|' or else last_character = ')'
113      end
114
115   parse_term
116         -- Parses a term.
117         -- term        ::= factor [ repeat-spec ]
118         -- repeat-spec ::= '?' | '*' | '+' | '{' integer [',' [integer]] '}'
119      require
120         has_no_error: not has_error
121         not_at_end: not end_of_input
122         end_excluded: last_character /= '|' and then last_character /= ')'
123      local
124         mini, maxi: INTEGER
125      do
126         parse_factor
127         if not has_error and then not end_of_input then
128            inspect
129               last_character
130            when '*' then
131               read_character
132               emit_repeat(0, Repeat_infiny)
133            when '+' then
134               read_character
135               emit_repeat(1, Repeat_infiny)
136            when '?' then
137               read_character
138               emit_repeat(0, 1)
139            when '{' then
140               save_position
141               read_character
142               if end_of_input or else not last_character.is_decimal_digit then
143                  restore_saved_position
144               else
145                  read_integer
146                  mini := last_integer
147                  if not end_of_input then
148                     if last_character /= ',' then
149                        maxi := mini
150                     else
151                        read_character
152                        if end_of_input or else not last_character.is_decimal_digit then
153                           maxi := Repeat_infiny
154                        else
155                           read_integer
156                           maxi := last_integer
157                        end
158                     end
159                  end
160                  if end_of_input or else last_character /= '}' then
161                     set_error(once "expected '}' not found")
162                  elseif maxi /= Repeat_infiny and then maxi < mini then
163                     set_error(once "repeat count error (lower > upper is not allowed)")
164                  else
165                     read_character
166                     emit_repeat(mini, maxi)
167                  end
168               end
169            else
170            end
171         end
172      ensure
173         error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
174      end
175
176   parse_factor
177         -- Parses a factor.
178         -- factor      ::= group | union | '.' | '^' | '$' | escaped | text
179      require
180         has_no_error: not has_error
181         not_at_end: not end_of_input
182         end_excluded: last_character /= '|' and then last_character /= ')'
183      do
184         inspect
185            last_character
186         when '(' then
187            parse_group
188         when '[' then
189            parse_union
190         when '.' then
191            emit_any_character
192            read_character
193         when '^' then
194            emit_begin_of_line
195            read_character
196         when '$' then
197            emit_end_of_line
198            read_character
199         when '\' then
200            parse_escaped
201         when '*', '+', '?', '{' then
202            set_error(once "unescaped reserved char")
203         else
204            parse_text
205         end
206      ensure
207         error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
208      end
209
210   parse_group
211         -- Parses a group.
212         -- group       ::= '(' alternative ')'
213      require
214         has_no_error: not has_error
215         not_at_end: not end_of_input
216         begin_with_open_parenthesis: last_character = '('
217      do
218         read_character
219         if not end_of_input then
220            prepare_group
221            parse_alternative
222         end
223         if not has_error then
224            if end_of_input or else last_character /= ')' then
225               set_error(once "expected ')' not found")
226            else
227               read_character
228               emit_group
229            end
230         end
231      ensure
232         error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
233      end
234
235   parse_escaped
236         -- Parses an escaped character.
237         -- escaped     ::= '\' CHARACTER
238      require
239         has_no_error: not has_error
240         not_at_end: not end_of_input
241         begin_with_escape: last_character = '\'
242      do
243         read_character
244         if end_of_input then
245            set_error(once "invalid '\' at the end of the expression")
246         else
247            emit_match_single(last_character)
248            read_character
249         end
250      ensure
251         error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
252      end
253
254   parse_text
255         -- Parses a text.
256         -- text        ::= A SEQUENCE NOT FOLLOWED BY EITHER '*', '+', '?', '{' OF NOT ESCAPED CHARACTERS
257      require
258         has_no_error: not has_error
259         not_at_end: not end_of_input
260         current_character_is_valid: valid_last_character and then not (once "([.^$\*+?{").has(last_character)
261      local
262         stop: BOOLEAN
263      do
264         from
265            last_string.clear_count
266            last_string.add_last(last_character)
267            read_character
268            if not end_of_input then
269               inspect
270                  last_character
271               when '*', '+', '?', '{' then
272                  stop := True
273               else
274               end
275            end
276         until
277            end_of_input or else stop
278         loop
279            inspect
280               last_character
281            when '.', '(', ')', '[', '^', '$', '|', '\' then
282               stop := True
283            else
284               if valid_next_character then
285                  inspect
286                     next_character
287                  when '*', '+', '?', '{' then
288                     stop := True
289                  else
290                     last_string.add_last(last_character)
291                     read_character
292                  end
293               else
294                  last_string.add_last(last_character)
295                  read_character
296               end
297            end
298         end
299         check
300            last_string.count > 0
301         end
302         if last_string.count = 1 then
303            emit_match_single(last_string.first)
304         else
305            emit_match_text(last_string)
306         end
307      ensure
308         error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
309      end
310
311   parse_union
312         -- Parses a union.
313         -- union       ::= '[' ['^'] union_term... ']'
314      require
315         has_no_error: not has_error
316         not_at_end: not end_of_input
317         begin_with_open_bracket: last_character = '['
318      local
319         negate: BOOLEAN
320      do
321         read_character
322         if not end_of_input and then last_character = '^' then
323            negate := True
324            read_character
325         end
326         if not end_of_input then
327            begin_collect
328            from
329               parse_union_term
330            until
331               has_error or else end_of_input or else last_character = ']'
332            loop
333               parse_union_term
334            end
335         end
336         if not has_error then
337            if end_of_input then
338               set_error(once "expected ']' not found")
339            else
340               end_collect_or
341               check
342                  last_character = ']'
343               end
344               read_character
345               if negate then
346                  emit_not_then_any
347               end
348            end
349         end
350      ensure
351         error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
352      end
353
354feature {POSIX_REGULAR_EXPRESSION_BUILDER} -- sub parts of union
355   parse_union_term
356         -- Parses a union term.
357         -- union_term  ::= union_factor ['-' union_factor]
358      require
359         has_no_error: not has_error
360         not_at_end: not end_of_input
361      local
362         mini, maxi: CHARACTER
363      do
364         parse_union_factor
365         if not has_error then
366            if end_of_input or else last_character /= '-' or else valid_next_character and then next_character = ']' then
367               emit_recorded
368            else
369               if recorded_item /= Void then
370                  set_error(once "first factor of an interval must be a single character")
371               else
372                  read_character
373                  if end_of_input then
374                     set_error(once "unterminated interval")
375                  else
376                     mini := recorded_character
377                     parse_union_factor
378                     if not has_error then
379                        if recorded_item /= Void then
380                           set_error(once "second factor of an interval must be a single character")
381                        else
382                           maxi := recorded_character
383                           if mini > maxi then
384                              set_error(once "invalid interval because the first factor has a character code greater than the last factor one")
385                           else
386                              emit_match_range(mini, maxi)
387                           end
388                        end
389                     end
390                  end
391               end
392            end
393         end
394      ensure
395         error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
396      end
397
398   parse_union_factor
399         -- Parses a union factor.
400         -- union_factor::= '[.' TEXT '.]' | '[:' CLASS ':]' | '[:<:]' | '[:>:]' | CHARACTER
401      require
402         has_no_error: not has_error
403         not_at_end: not end_of_input
404      do
405         inspect
406            last_character
407         when '[' then
408            read_character
409            if end_of_input then
410               set_recorded_character('[')
411            else
412               inspect
413                  last_character
414               when '.' then
415                  read_embedded
416                  if not has_error then
417                     inspect
418                        last_string.count
419                     when 0 then
420                        set_error(once "empty merge list")
421                     when 1 then
422                        set_recorded_character(last_string.first)
423                     else
424                        emit_match_text(last_string)
425                        set_recorded_item(unemit)
426                     end
427                  end
428               when ':' then
429                  read_embedded
430                  if not has_error then
431                     inspect
432                        last_string
433                     when "<" then
434                        set_recorded_item(the_begin_of_word_item)
435                     when ">" then
436                        set_recorded_item(the_end_of_word_item)
437                     else
438                        if has_named_posix_item(last_string) then
439                           set_recorded_item(named_posix_item(last_string))
440                        else
441                           set_error(once "unkwon posix class")
442                        end
443                     end
444                  end
445               when '=' then
446                  set_error(once "unimplemented class expression '[=....=]'")
447               else
448                  set_recorded_character('[')
449               end
450            end
451         else
452            set_recorded_character(last_character)
453            read_character
454         end
455      end
456
457   read_embedded
458         -- Parses the text embedded in one of '[.' TEXT '.]',
459         -- '[:' TEXT ':]' or '[=' TEXT '=]'.
460         -- The parsed text is put in feature 'last_string'.
461      require
462         has_no_error: not has_error
463         not_at_end: not end_of_input
464         previous_character_is_open_brace: valid_previous_character and then previous_character = '['
465         current_character_is_valid: valid_last_character and then (once ".:=").has(last_character)
466      local
467         tag: CHARACTER; stop: BOOLEAN
468      do
469         from
470            last_string.clear_count
471            tag := last_character
472            read_character
473         until
474            stop
475         loop
476            from
477            until
478               end_of_input or else last_character = tag
479            loop
480               last_string.add_last(last_character)
481               read_character
482            end
483            if end_of_input then
484               set_error(once "unmatched '[.' or '[:' or '[='")
485               stop := True
486            else
487               read_character
488               if not end_of_input then
489                  if last_character = ']' then
490                     read_character
491                     stop := True
492                  else
493                     last_string.add_last(tag)
494                  end
495               end
496            end
497         end
498      end
499
500   recorded_character: CHARACTER
501         -- Last union_factor's character recorded.
502
503   recorded_item: BACKTRACKING_NODE
504         -- Last union_factor's item (complex expression) recorded.
505
506   set_recorded_character (value: CHARACTER)
507         -- Records the union_factor's character 'value'.
508      do
509         recorded_item := Void
510         recorded_character := value
511      ensure
512         recorded_item = Void
513         recorded_character = value
514      end
515
516   set_recorded_item (value: BACKTRACKING_NODE)
517         -- Records the union_factor's item (complex expression) 'value'.
518      require
519         item_not_void: value /= Void
520      do
521         recorded_item := value
522      ensure
523         recorded_item /= Void
524         recorded_item = value
525      end
526
527   emit_recorded
528         -- Emits the last union_factor's recorded character or item,
529         -- depending on its kind.
530      do
531         if recorded_item = Void then
532            emit_match_single(recorded_character)
533         else
534            emit(recorded_item)
535         end
536      ensure
537         incremented_by_one: stack.count = old stack.count + 1
538      end
539
540end -- class POSIX_REGULAR_EXPRESSION_BUILDER
541--
542-- Copyright (C) 2009-2017: by all the people cited in the AUTHORS file.
543--
544-- Permission is hereby granted, free of charge, to any person obtaining a copy
545-- of this software and associated documentation files (the "Software"), to deal
546-- in the Software without restriction, including without limitation the rights
547-- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
548-- copies of the Software, and to permit persons to whom the Software is
549-- furnished to do so, subject to the following conditions:
550--
551-- The above copyright notice and this permission notice shall be included in
552-- all copies or substantial portions of the Software.
553--
554-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
555-- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
556-- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
557-- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
558-- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
559-- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
560-- THE SOFTWARE.