PageRenderTime 24ms CodeModel.GetById 17ms app.highlight 3ms RepoModel.GetById 1ms app.codeStats 0ms

/src/lib/regular_expression/low_level/perl5_regular_expression_builder.e

http://github.com/tybor/Liberty
Specman e | 578 lines | 258 code | 19 blank | 301 comment | 16 complexity | b79482d96a49f2d9610205b94a1eef97 MD5 | raw file
-- This file is part of a Liberty Eiffel library.
-- See the full copyright at the end.
--
  4class PERL5_REGULAR_EXPRESSION_BUILDER
  5
  6inherit
  7   POSIX_REGULAR_EXPRESSION_BUILDER
  8      redefine emit_repeat, parse_alternative, parse_group, parse_escaped, parse_union_factor, read_character, read_integer,
  9         set_default_options
 10      end
 11
 12create {ANY}
 13   make
 14
 15feature {ANY} -- options
 16   has_extended_legibility: BOOLEAN
 17         -- Is the extended legibility active?
 18
 19   has_extended_ligibility: BOOLEAN
 20      obsolete "Use `has_extended_legibility' instead."
 21      do
 22         Result := has_extended_legibility
 23      end
 24
 25   set_extended_legibility
 26         -- Activate extended legibility.
 27      do
 28         has_extended_legibility := True
 29      ensure
 30         definition: has_extended_legibility = True
 31      end
 32
 33   set_extended_ligibility
 34      obsolete "Use `set_extended_legibility' instead."
 35      do
 36         set_extended_legibility
 37      ensure
 38         definition: has_extended_legibility = True
 39      end
 40
 41   set_no_extended_legibility
 42         -- Deactivate extended legibility.
 43      do
 44         has_extended_legibility := False
 45      ensure
 46         definition: has_extended_legibility = False
 47      end
 48
 49   set_no_extended_ligibility
 50      obsolete "Use `set_no_extended_legibility' instead."
 51      do
 52         set_no_extended_legibility
 53      ensure
 54         definition: has_extended_legibility = False
 55      end
 56
 57   set_default_options
 58         -- Set the default options
 59      do
 60         Precursor
 61         set_no_extended_legibility
 62      ensure then
 63         not has_extended_legibility
 64      end
 65
 66feature {PERL5_REGULAR_EXPRESSION_BUILDER} -- scanning
 67   has_unterminated_comment: BOOLEAN
 68         -- was an unterminated comment sequence (?#... detected
 69
 70   skip_blanks_and_comments
 71         -- Skips the blanks and comments when the extended legibility
 72         -- option is set.
 73      require
 74         has_no_error: not has_error
 75      local
 76         stop: BOOLEAN
 77      do
 78         from
 79         until
 80            end_of_input or else stop
 81         loop
 82            if last_character = '(' and then expression.valid_index(position + 2) and then expression.item(position + 1) = '?' and then expression.item(position + 2) = '#' then
 83               from
 84                  goto_position(position + 3)
 85               until
 86                  end_of_input or else stop
 87               loop
 88                  stop := last_character = ')'
 89                  goto_position(position + 1)
 90               end
 91               has_unterminated_comment := not stop
 92               stop := False
 93            elseif has_extended_legibility then
 94               inspect
 95                  last_character
 96               when ' ', '%T', '%N', '%R' then
 97                  goto_position(position + 1)
 98               when '#' then
 99                  from
100                     goto_position(position + 1)
101                  until
102                     end_of_input or else last_character = '%N'
103                  loop
104                     goto_position(position + 1)
105                  end
106               else
107                  stop := True
108               end
109            else
110               stop := True
111            end
112         end
113      ensure
114         has_no_error: not has_error
115      end
116
117feature {BACKTRACKING_REGULAR_EXPRESSION_BUILDER} -- parsing
118   read_character
119         -- Goto to the next character that is not a blank or a comment.
120      do
121         Precursor
122         skip_blanks_and_comments
123      end
124
125   read_integer
126         -- Reads in 'last_integer' the current integer values and
127         -- then goto to the next character that is not a blank or a comment.
128      do
129         Precursor
130         skip_blanks_and_comments
131      end
132
133   emit_repeat (mini, maxi: INTEGER)
134         -- Takes the top of the stack and replace it with
135         -- a construction that will evaluate the repeating of
136         -- it from 'mini' to 'maxi' times.
137         -- If current character is '?' it means that the repeat
138         -- is not greedy.
139      do
140         if not end_of_input and then last_character = '?' then
141            read_character
142            set_not_greedy
143         end
144         Precursor(mini, maxi)
145         set_greedy
146      end
147
148feature {POSIX_REGULAR_EXPRESSION_BUILDER} -- parsing
149   parse_alternative
150      local
151         saved_is_case_insensitive, saved_does_match_line_boundary, saved_does_any_match_newline,
152         saved_has_extended_legibility: BOOLEAN
153      do
154         has_unterminated_comment := False
155         saved_is_case_insensitive := is_case_insensitive
156         saved_does_match_line_boundary := does_match_line_boundary
157         saved_does_any_match_newline := does_any_match_newline
158         saved_has_extended_legibility := has_extended_legibility
159         Precursor
160         is_case_insensitive := saved_is_case_insensitive
161         does_match_line_boundary := saved_does_match_line_boundary
162         does_any_match_newline := saved_does_any_match_newline
163         has_extended_legibility := saved_has_extended_legibility
164         if has_unterminated_comment then
165            set_error(once "unterminated comment sequence (?#...")
166         end
167      end
168
169   parse_group
170         -- Parses a group. A group is either a POSIX group
171         -- or an extended pattern group.
172      local
173         saved_is_case_insensitive, saved_does_match_line_boundary, saved_does_any_match_newline,
174         saved_has_extended_legibility: BOOLEAN
175      do
176         if valid_next_character and then next_character = '?' then
177            parse_extended_pattern
178         else
179            saved_is_case_insensitive := is_case_insensitive
180            saved_does_match_line_boundary := does_match_line_boundary
181            saved_does_any_match_newline := does_any_match_newline
182            saved_has_extended_legibility := has_extended_legibility
183            Precursor
184            is_case_insensitive := saved_is_case_insensitive
185            does_match_line_boundary := saved_does_match_line_boundary
186            does_any_match_newline := saved_does_any_match_newline
187            has_extended_legibility := saved_has_extended_legibility
188         end
189      end
190
191   parse_escaped
192         -- Parses an escaped character.
193         -- escaped     ::= '\' CHARACTER
194      do
195         internal_parse_escaped(False)
196         if not has_error then
197            emit_recorded
198         end
199      end
200
201   parse_union_factor
202         -- Parses a union factor.
203         -- union_factor::= '[.' TEXT '.]' | '[:' CLASS ':]' | '[:<:]' | '[:>:]' | CHARACTER
204      local
205         not_class: BOOLEAN
206      do
207         inspect
208            last_character
209         when '[' then
210            read_character
211            if end_of_input then
212               set_recorded_character('[')
213            else
214               inspect
215                  last_character
216               when '.' then
217                  read_embedded
218                  if not has_error then
219                     inspect
220                        last_string.count
221                     when 0 then
222                        set_error(once "empty merge list")
223                     when 1 then
224                        set_recorded_character(last_string.first)
225                     else
226                        emit_match_text(last_string)
227                        set_recorded_item(unemit)
228                     end
229                  end
230               when ':' then
231                  read_embedded
232                  if not has_error then
233                     if last_string.first = '^' then
234                        last_string.remove_first
235                        not_class := True
236                     end
237                     inspect
238                        last_string
239                     when "<" then
240                        set_recorded_item(the_begin_of_word_item)
241                     when ">" then
242                        set_recorded_item(the_end_of_word_item)
243                     else
244                        if has_named_posix_item(last_string) then
245                           set_recorded_item(named_posix_item(last_string))
246                        else
247                           set_error(once "unkwon posix class")
248                        end
249                     end
250                     if not_class and then not has_error then
251                        emit(recorded_item)
252                        emit_not_then_any
253                        set_recorded_item(unemit)
254                     end
255                  end
256               when '=' then
257                  set_error(once "unimplemented class expression '[=....=]'")
258               else
259                  set_recorded_character('[')
260               end
261            end
262         when '\' then
263            internal_parse_escaped(True)
264         else
265            set_recorded_character(last_character)
266            read_character
267         end
268      end
269
270feature {} -- parsing
271   internal_parse_escaped (in_union: BOOLEAN)
272      require
273         has_no_error: not has_error
274         not_at_end: not end_of_input
275         begin_with_escape: last_character = '\'
276      do
277         read_character
278         if end_of_input then
279            set_error(once "invalid '\' at the end of the expression")
280         elseif in_union then
281            inspect
282               last_character
283            when 'b', 'B', 'A', 'Z', 'z', '0' .. '9', '<', '>' then
284               set_error(once "prohibited in unions")
285            else
286            end
287         end
288         if not has_error then
289            inspect
290               last_character
291            when 'b', 'B' then
292               -- word boundary or not
293               begin_collect
294               emit(the_begin_of_word_item)
295               emit(the_end_of_word_item)
296               end_collect_or
297               if last_character = 'B' then
298                  emit_not
299               end
300               set_recorded_item(unemit)
301               read_character
302            when '<' then
303               -- begin of word
304               set_recorded_item(the_begin_of_word_item)
305               read_character
306            when '>' then
307               -- end of word
308               set_recorded_item(the_end_of_word_item)
309               read_character
310            when 'A' then
311               -- begin of text
312               set_recorded_item(the_begin_of_text_item)
313               read_character
314            when 'Z' then
315               -- end of text
316               set_recorded_item(the_end_of_text_item)
317               read_character
318            when 'z' then
319               -- end of text
320               set_recorded_item(the_real_end_of_text_item)
321               read_character
322            when 'w', 'W' then
323               -- word or not word
324               emit(the_is_posix_word_item)
325               if last_character = 'W' then
326                  emit_not_then_any
327               end
328               set_recorded_item(unemit)
329               read_character
330            when 's', 'S' then
331               -- space or not space
332               emit(the_is_posix_space_item)
333               if last_character = 'S' then
334                  emit_not_then_any
335               end
336               set_recorded_item(unemit)
337               read_character
338            when 'd', 'D' then
339               -- space or not space
340               emit(the_is_posix_digit_item)
341               if last_character = 'D' then
342                  emit_not_then_any
343               end
344               set_recorded_item(unemit)
345               read_character
346            when '0' .. '9' then
347               -- backtrack match
348               read_integer
349               if last_integer.in_range(1, last_group_count) and then not group_stack.has(last_integer) then
350                  emit_match_previous_group(last_integer)
351                  set_recorded_item(unemit)
352               else
353                  set_error(once "unsupported forward group number")
354               end
355            when 'p' then
356               -- positive POSIX indication
357               read_character
358               parse_posix_indication
359               if not has_error then
360                  set_recorded_item(unemit)
361               end
362            when 'P' then
363               -- negative POSIX indication
364               read_character
365               parse_posix_indication
366               if not has_error then
367                  emit_not_then_any
368                  set_recorded_item(unemit)
369               end
370            else
371               set_recorded_character(last_character)
372               read_character
373            end
374         end
375      end
376
377   parse_posix_indication
378      do
379         if end_of_input then
380            set_error(once "class missing in \p or \P construct")
381         else
382            inspect
383               last_character
384            when '{' then
385               from
386                  last_string.clear_count
387                  read_character
388               until
389                  end_of_input or else last_character = '}'
390               loop
391                  last_string.add_last(last_character)
392                  read_character
393               end
394               if end_of_input then
395                  set_error(once "unmatched '{'")
396               else
397                  if not has_named_perl_item(last_string) then
398                     set_error(once "invalid perl class name")
399                  else
400                     emit(named_perl_item(last_string))
401                     read_character
402                  end
403               end
404            else
405               set_error(once "currently, only \p{..} or \P{..} construct is allowed")
406            end
407         end
408      end
409
410   parse_extended_pattern
411      require
412         has_no_error: not has_error
413         not_at_end: not end_of_input
414         begin_with_open_parenthesis: last_character = '('
415         followed_with_question_mark: valid_next_character and next_character = '?'
416      local
417         dont_restore, saved_is_case_insensitive, saved_does_match_line_boundary, saved_does_any_match_newline,
418         saved_has_extended_legibility: BOOLEAN
419      do
420         -- skip known characters
421         read_character
422         read_character
423         -- save the state of the flags
424         saved_is_case_insensitive := is_case_insensitive
425         saved_does_match_line_boundary := does_match_line_boundary
426         saved_does_any_match_newline := does_any_match_newline
427         saved_has_extended_legibility := has_extended_legibility
428         -- read the flags
429         read_modifiers(True)
430         if not end_of_input and then last_character = '-' then
431            read_character
432            read_modifiers(False)
433         end
434         if not end_of_input then
435            inspect
436               last_character
437            when ')' then
438               -- flag alteration only
439               emit(the_true_node)
440               dont_restore := True
441            when '#' then
442               -- comment
443               emit(the_true_node)
444               from
445               until
446                  end_of_input or else last_character = ')'
447               loop
448                  read_character
449               end
450            when ':' then
451               -- not capturing
452               read_character
453               if not end_of_input then
454                  parse_alternative
455               end
456            when '=' then
457               -- zero width positive look-ahead
458               parse_looking(True)
459            when '!' then
460               -- zero width negative look-ahead
461               parse_looking(True)
462            when '<' then
463               -- zero width look-behind
464               read_character
465               if not end_of_input then
466                  inspect
467                     last_character
468                  when '=' then
469                     -- zero width positive look-behind
470                     parse_looking(False)
471                  when '!' then
472                     -- zero width negative look-behind
473                     parse_looking(False)
474                  else
475                     set_error(once "bad zero width look-behind")
476                  end
477               end
478            when '{', '?', '(', '>' then
479               -- unsupported
480               set_error(once "unsupported experimental extended pattern")
481            else
482               set_error(once "unknown extended pattern")
483            end
484         end
485         if not has_error then
486            if end_of_input or else last_character /= ')' then
487               set_error(once "extended pattern not finished")
488            else
489               if dont_restore then
490               else
491                  -- restore the flags
492                  is_case_insensitive := saved_is_case_insensitive
493                  does_match_line_boundary := saved_does_match_line_boundary
494                  does_any_match_newline := saved_does_any_match_newline
495                  has_extended_legibility := saved_has_extended_legibility
496               end
497               read_character
498            end
499         end
500      end
501
502   parse_looking (ahead: BOOLEAN)
503      require
504         has_no_error: not has_error
505         not_at_end: not end_of_input
506         begin_with: last_character = '=' or else last_character = '!'
507      do
508         if is_looking_around then
509            set_error(once "nested mix look-ahead / look-behind not implemented")
510         else
511            is_looking_ahead := ahead
512            is_looking_behind := not ahead
513            is_looking_positive := last_character = '='
514            read_character
515            if not end_of_input then
516               parse_alternative
517               if not has_error then
518                  emit_looking
519               end
520            end
521            is_looking_ahead := False
522            is_looking_behind := False
523         end
524      end
525
526   read_modifiers (level: BOOLEAN)
527      require
528         has_no_error: not has_error
529      local
530         stop: BOOLEAN
531      do
532         from
533         until
534            end_of_input or else stop
535         loop
536            inspect
537               last_character
538            when 'i' then
539               is_case_insensitive := level
540               read_character
541            when 'm' then
542               does_match_line_boundary := level
543               read_character
544            when 's' then
545               does_any_match_newline := level
546               read_character
547            when 'x' then
548               has_extended_legibility := level
549               read_character
550            else
551               stop := True
552            end
553         end
554      ensure
555         has_no_error: not has_error
556      end
557
558end -- class PERL5_REGULAR_EXPRESSION_BUILDER
--
-- Copyright (C) 2009-2017: by all the people cited in the AUTHORS file.
--
-- Permission is hereby granted, free of charge, to any person obtaining a copy
-- of this software and associated documentation files (the "Software"), to deal
-- in the Software without restriction, including without limitation the rights
-- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-- copies of the Software, and to permit persons to whom the Software is
-- furnished to do so, subject to the following conditions:
--
-- The above copyright notice and this permission notice shall be included in
-- all copies or substantial portions of the Software.
--
-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-- THE SOFTWARE.