/src/lib/regular_expression/low_level/perl5_regular_expression_builder.e
Specman e | 578 lines | 258 code | 19 blank | 301 comment | 16 complexity | b79482d96a49f2d9610205b94a1eef97 MD5 | raw file
-- This file is part of a Liberty Eiffel library.
-- See the full copyright at the end.
--
class PERL5_REGULAR_EXPRESSION_BUILDER
   --
   -- Regular expression builder that extends the POSIX builder with the
   -- Perl5 constructs handled below: the extended patterns `(?...)'
   -- (flag alteration, comments, non-capturing groups, look-ahead and
   -- look-behind), the non-greedy repeat suffix `?', the escapes
   -- \b \B \< \> \A \Z \z \w \W \s \S \d \D \p{..} \P{..}, back-references
   -- \1 .. \9, and the "extended legibility" option (flag `x') that lets
   -- blanks and `#' comments appear inside the expression.
   --

inherit
   POSIX_REGULAR_EXPRESSION_BUILDER
      redefine emit_repeat, parse_alternative, parse_group, parse_escaped, parse_union_factor, read_character, read_integer,
         set_default_options
      end

create {ANY}
   make

feature {ANY} -- options
   has_extended_legibility: BOOLEAN
         -- Is the extended legibility active?

   has_extended_ligibility: BOOLEAN
      obsolete "Use `has_extended_legibility' instead."
      do
         Result := has_extended_legibility
      end

   set_extended_legibility
         -- Activate extended legibility.
      do
         has_extended_legibility := True
      ensure
         definition: has_extended_legibility = True
      end

   set_extended_ligibility
      obsolete "Use `set_extended_legibility' instead."
      do
         set_extended_legibility
      ensure
         definition: has_extended_legibility = True
      end

   set_no_extended_legibility
         -- Deactivate extended legibility.
      do
         has_extended_legibility := False
      ensure
         definition: has_extended_legibility = False
      end

   set_no_extended_ligibility
      obsolete "Use `set_no_extended_legibility' instead."
      do
         set_no_extended_legibility
      ensure
         definition: has_extended_legibility = False
      end

   set_default_options
         -- Set the default options: those of the precursor, plus
         -- extended legibility turned off.
      do
         Precursor
         set_no_extended_legibility
      ensure then
         not has_extended_legibility
      end

feature {PERL5_REGULAR_EXPRESSION_BUILDER} -- scanning
   has_unterminated_comment: BOOLEAN
         -- Was an unterminated comment sequence (?#... detected?

   skip_blanks_and_comments
         -- Skips the `(?#...)' comments and, when the extended legibility
         -- option is set, also the blanks and the `#' comments that run
         -- up to the end of the line.
         -- Sets `has_unterminated_comment' when the end of input is
         -- reached inside a `(?#' comment.
      require
         has_no_error: not has_error
      local
         stop: BOOLEAN
      do
         from
         until
            end_of_input or else stop
         loop
            if last_character = '(' and then expression.valid_index(position + 2)
               and then expression.item(position + 1) = '?'
               and then expression.item(position + 2) = '#'
            then
               -- Embedded comment "(?#...)": consume up to and including
               -- the closing parenthesis.
               from
                  goto_position(position + 3)
               until
                  end_of_input or else stop
               loop
                  stop := last_character = ')'
                  goto_position(position + 1)
               end
               -- Here `stop' tells whether the closing ')' was seen.
               has_unterminated_comment := not stop
               -- Re-arm the outer loop: more blanks/comments may follow.
               stop := False
            elseif has_extended_legibility then
               inspect
                  last_character
               when ' ', '%T', '%N', '%R' then
                  goto_position(position + 1)
               when '#' then
                  -- Comment running up to the end of the line; the
                  -- new-line itself is skipped as a blank on the next
                  -- iteration.
                  from
                     goto_position(position + 1)
                  until
                     end_of_input or else last_character = '%N'
                  loop
                     goto_position(position + 1)
                  end
               else
                  stop := True
               end
            else
               stop := True
            end
         end
      ensure
         has_no_error: not has_error
      end

feature {BACKTRACKING_REGULAR_EXPRESSION_BUILDER} -- parsing
   read_character
         -- Go to the next character that is not a blank or a comment.
      do
         Precursor
         skip_blanks_and_comments
      end

   read_integer
         -- Reads in 'last_integer' the current integer value and
         -- then goes to the next character that is not a blank or a comment.
      do
         Precursor
         skip_blanks_and_comments
      end

   emit_repeat (mini, maxi: INTEGER)
         -- Takes the top of the stack and replace it with
         -- a construction that will evaluate the repeating of
         -- it from 'mini' to 'maxi' times.
         -- If current character is '?' it means that the repeat
         -- is not greedy.
      do
         if not end_of_input and then last_character = '?' then
            read_character
            set_not_greedy
         end
         Precursor(mini, maxi)
         -- Greedy is set back unconditionally after each repeat.
         set_greedy
      end

feature {POSIX_REGULAR_EXPRESSION_BUILDER} -- parsing
   parse_alternative
         -- Parses an alternative through the precursor, then restores the
         -- option flags that were in force on entry, so that flag changes
         -- made inside the alternative do not leak out of it.
         -- Sets an error if a `(?#' comment was left unterminated.
      local
         saved_is_case_insensitive, saved_does_match_line_boundary, saved_does_any_match_newline,
         saved_has_extended_legibility: BOOLEAN
      do
         has_unterminated_comment := False
         saved_is_case_insensitive := is_case_insensitive
         saved_does_match_line_boundary := does_match_line_boundary
         saved_does_any_match_newline := does_any_match_newline
         saved_has_extended_legibility := has_extended_legibility
         Precursor
         is_case_insensitive := saved_is_case_insensitive
         does_match_line_boundary := saved_does_match_line_boundary
         does_any_match_newline := saved_does_any_match_newline
         has_extended_legibility := saved_has_extended_legibility
         if has_unterminated_comment then
            set_error(once "unterminated comment sequence (?#...")
         end
      end

   parse_group
         -- Parses a group. A group is either a POSIX group
         -- or an extended pattern group.
         -- For a POSIX group the option flags are saved before and
         -- restored after the precursor; an extended pattern handles its
         -- own flags in `parse_extended_pattern'.
      local
         saved_is_case_insensitive, saved_does_match_line_boundary, saved_does_any_match_newline,
         saved_has_extended_legibility: BOOLEAN
      do
         if valid_next_character and then next_character = '?' then
            parse_extended_pattern
         else
            saved_is_case_insensitive := is_case_insensitive
            saved_does_match_line_boundary := does_match_line_boundary
            saved_does_any_match_newline := does_any_match_newline
            saved_has_extended_legibility := has_extended_legibility
            Precursor
            is_case_insensitive := saved_is_case_insensitive
            does_match_line_boundary := saved_does_match_line_boundary
            does_any_match_newline := saved_does_any_match_newline
            has_extended_legibility := saved_has_extended_legibility
         end
      end

   parse_escaped
         -- Parses an escaped character.
         -- escaped ::= '\' CHARACTER
         -- The recorded result is emitted unless an error occurred.
      do
         internal_parse_escaped(False)
         if not has_error then
            emit_recorded
         end
      end

   parse_union_factor
         -- Parses a union factor.
         -- union_factor::= '[.' TEXT '.]' | '[:' CLASS ':]' | '[:<:]' | '[:>:]' | CHARACTER
         -- A CLASS starting with '^' is negated.
      local
         not_class: BOOLEAN
      do
         inspect
            last_character
         when '[' then
            read_character
            if end_of_input then
               set_recorded_character('[')
            else
               inspect
                  last_character
               when '.' then
                  read_embedded
                  if not has_error then
                     inspect
                        last_string.count
                     when 0 then
                        set_error(once "empty merge list")
                     when 1 then
                        set_recorded_character(last_string.first)
                     else
                        emit_match_text(last_string)
                        set_recorded_item(unemit)
                     end
                  end
               when ':' then
                  read_embedded
                  if not has_error then
                     -- Guard against an empty class name ("[::]"):
                     -- `first' must not be called on an empty string.
                     if last_string.count > 0 and then last_string.first = '^' then
                        last_string.remove_first
                        not_class := True
                     end
                     inspect
                        last_string
                     when "<" then
                        set_recorded_item(the_begin_of_word_item)
                     when ">" then
                        set_recorded_item(the_end_of_word_item)
                     else
                        if has_named_posix_item(last_string) then
                           set_recorded_item(named_posix_item(last_string))
                        else
                           set_error(once "unknown posix class")
                        end
                     end
                     if not_class and then not has_error then
                        -- Negated class: wrap the recorded item.
                        emit(recorded_item)
                        emit_not_then_any
                        set_recorded_item(unemit)
                     end
                  end
               when '=' then
                  set_error(once "unimplemented class expression '[=....=]'")
               else
                  set_recorded_character('[')
               end
            end
         when '\' then
            internal_parse_escaped(True)
         else
            set_recorded_character(last_character)
            read_character
         end
      end

feature {} -- parsing
   internal_parse_escaped (in_union: BOOLEAN)
         -- Parses the escape sequence starting at the current '\' and
         -- records the corresponding item or character.
         -- When `in_union' is True, the anchors \b \B \A \Z \z \< \>
         -- and the back-references \0 .. \9 are rejected with an error.
      require
         has_no_error: not has_error
         not_at_end: not end_of_input
         begin_with_escape: last_character = '\'
      do
         read_character
         if end_of_input then
            set_error(once "invalid '\' at the end of the expression")
         elseif in_union then
            inspect
               last_character
            when 'b', 'B', 'A', 'Z', 'z', '0' .. '9', '<', '>' then
               set_error(once "prohibited in unions")
            else
               -- Every other escape is accepted inside a union.
            end
         end
         if not has_error then
            inspect
               last_character
            when 'b', 'B' then
               -- word boundary or not
               begin_collect
               emit(the_begin_of_word_item)
               emit(the_end_of_word_item)
               end_collect_or
               if last_character = 'B' then
                  emit_not
               end
               set_recorded_item(unemit)
               read_character
            when '<' then
               -- begin of word
               set_recorded_item(the_begin_of_word_item)
               read_character
            when '>' then
               -- end of word
               set_recorded_item(the_end_of_word_item)
               read_character
            when 'A' then
               -- begin of text
               set_recorded_item(the_begin_of_text_item)
               read_character
            when 'Z' then
               -- end of text
               set_recorded_item(the_end_of_text_item)
               read_character
            when 'z' then
               -- real end of text
               set_recorded_item(the_real_end_of_text_item)
               read_character
            when 'w', 'W' then
               -- word or not word
               emit(the_is_posix_word_item)
               if last_character = 'W' then
                  emit_not_then_any
               end
               set_recorded_item(unemit)
               read_character
            when 's', 'S' then
               -- space or not space
               emit(the_is_posix_space_item)
               if last_character = 'S' then
                  emit_not_then_any
               end
               set_recorded_item(unemit)
               read_character
            when 'd', 'D' then
               -- digit or not digit
               emit(the_is_posix_digit_item)
               if last_character = 'D' then
                  emit_not_then_any
               end
               set_recorded_item(unemit)
               read_character
            when '0' .. '9' then
               -- backtrack match: the number must designate an existing
               -- group that is not in `group_stack'
               read_integer
               if last_integer.in_range(1, last_group_count) and then not group_stack.has(last_integer) then
                  emit_match_previous_group(last_integer)
                  set_recorded_item(unemit)
               else
                  set_error(once "unsupported forward group number")
               end
            when 'p' then
               -- positive POSIX indication
               read_character
               parse_posix_indication
               if not has_error then
                  set_recorded_item(unemit)
               end
            when 'P' then
               -- negative POSIX indication
               read_character
               parse_posix_indication
               if not has_error then
                  emit_not_then_any
                  set_recorded_item(unemit)
               end
            else
               -- Any other character stands for itself.
               set_recorded_character(last_character)
               read_character
            end
         end
      end

   parse_posix_indication
         -- Parses the `{NAME}' part of a \p{NAME} or \P{NAME} construct
         -- and emits the perl item of that name.
         -- Sets an error if the class is missing, the brace is unmatched,
         -- the name is unknown or the construct is not of the `{..}' form.
      do
         if end_of_input then
            set_error(once "class missing in \p or \P construct")
         else
            inspect
               last_character
            when '{' then
               -- Collect the class name up to the closing brace.
               from
                  last_string.clear_count
                  read_character
               until
                  end_of_input or else last_character = '}'
               loop
                  last_string.add_last(last_character)
                  read_character
               end
               if end_of_input then
                  set_error(once "unmatched '{'")
               else
                  if not has_named_perl_item(last_string) then
                     set_error(once "invalid perl class name")
                  else
                     emit(named_perl_item(last_string))
                     read_character
                  end
               end
            else
               set_error(once "currently, only \p{..} or \P{..} construct is allowed")
            end
         end
      end

   parse_extended_pattern
         -- Parses an extended pattern `(?...)': optional modifiers
         -- (`imsx', optionally followed by `-' and the modifiers to
         -- switch off) followed by one of
         --    ')'           flag alteration only (flags stay altered)
         --    '#'           comment
         --    ':'           non capturing group
         --    '=' or '!'    zero width positive/negative look-ahead
         --    '<=' or '<!'  zero width positive/negative look-behind
         -- Except for the flag alteration form, the flags in force on
         -- entry are restored when the pattern is closed.
      require
         has_no_error: not has_error
         not_at_end: not end_of_input
         begin_with_open_parenthesis: last_character = '('
         followed_with_question_mark: valid_next_character and next_character = '?'
      local
         dont_restore, saved_is_case_insensitive, saved_does_match_line_boundary, saved_does_any_match_newline,
         saved_has_extended_legibility: BOOLEAN
      do
         -- skip known characters
         read_character
         read_character
         -- save the state of the flags
         saved_is_case_insensitive := is_case_insensitive
         saved_does_match_line_boundary := does_match_line_boundary
         saved_does_any_match_newline := does_any_match_newline
         saved_has_extended_legibility := has_extended_legibility
         -- read the flags
         read_modifiers(True)
         if not end_of_input and then last_character = '-' then
            read_character
            read_modifiers(False)
         end
         if not end_of_input then
            inspect
               last_character
            when ')' then
               -- flag alteration only
               emit(the_true_node)
               dont_restore := True
            when '#' then
               -- comment
               emit(the_true_node)
               from
               until
                  end_of_input or else last_character = ')'
               loop
                  read_character
               end
            when ':' then
               -- not capturing
               read_character
               if not end_of_input then
                  parse_alternative
               end
            when '=' then
               -- zero width positive look-ahead
               parse_looking(True)
            when '!' then
               -- zero width negative look-ahead
               parse_looking(True)
            when '<' then
               -- zero width look-behind
               read_character
               if not end_of_input then
                  inspect
                     last_character
                  when '=' then
                     -- zero width positive look-behind
                     parse_looking(False)
                  when '!' then
                     -- zero width negative look-behind
                     parse_looking(False)
                  else
                     set_error(once "bad zero width look-behind")
                  end
               end
            when '{', '?', '(', '>' then
               -- unsupported
               set_error(once "unsupported experimental extended pattern")
            else
               set_error(once "unknown extended pattern")
            end
         end
         if not has_error then
            if end_of_input or else last_character /= ')' then
               set_error(once "extended pattern not finished")
            else
               if not dont_restore then
                  -- restore the flags
                  is_case_insensitive := saved_is_case_insensitive
                  does_match_line_boundary := saved_does_match_line_boundary
                  does_any_match_newline := saved_does_any_match_newline
                  has_extended_legibility := saved_has_extended_legibility
               end
               read_character
            end
         end
      end

   parse_looking (ahead: BOOLEAN)
         -- Parses the body of a zero width assertion: a look-ahead when
         -- `ahead' is True, a look-behind otherwise. The assertion is
         -- positive when the current character is '=' and negative
         -- when it is '!'.
         -- Sets an error when already inside a look-around.
      require
         has_no_error: not has_error
         not_at_end: not end_of_input
         begin_with: last_character = '=' or else last_character = '!'
      do
         if is_looking_around then
            set_error(once "nested mix look-ahead / look-behind not implemented")
         else
            is_looking_ahead := ahead
            is_looking_behind := not ahead
            is_looking_positive := last_character = '='
            read_character
            if not end_of_input then
               parse_alternative
               if not has_error then
                  emit_looking
               end
            end
            is_looking_ahead := False
            is_looking_behind := False
         end
      end

   read_modifiers (level: BOOLEAN)
         -- Reads a possibly empty sequence of the modifier letters
         -- `i', `m', `s' and `x' and sets the corresponding option
         -- (`is_case_insensitive', `does_match_line_boundary',
         -- `does_any_match_newline', `has_extended_legibility') to `level'.
         -- Stops at the first character that is not a modifier.
      require
         has_no_error: not has_error
      local
         stop: BOOLEAN
      do
         from
         until
            end_of_input or else stop
         loop
            inspect
               last_character
            when 'i' then
               is_case_insensitive := level
               read_character
            when 'm' then
               does_match_line_boundary := level
               read_character
            when 's' then
               does_any_match_newline := level
               read_character
            when 'x' then
               has_extended_legibility := level
               read_character
            else
               stop := True
            end
         end
      ensure
         has_no_error: not has_error
      end

end -- class PERL5_REGULAR_EXPRESSION_BUILDER
--
-- Copyright (C) 2009-2017: by all the people cited in the AUTHORS file.
--
-- Permission is hereby granted, free of charge, to any person obtaining a copy
-- of this software and associated documentation files (the "Software"), to deal
-- in the Software without restriction, including without limitation the rights
-- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-- copies of the Software, and to permit persons to whom the Software is
-- furnished to do so, subject to the following conditions:
--
-- The above copyright notice and this permission notice shall be included in
-- all copies or substantial portions of the Software.
--
-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-- THE SOFTWARE.