/src/lib/regular_expression/low_level/posix_regular_expression_builder.e
Specman e | 560 lines | 468 code | 20 blank | 72 comment | 74 complexity | 437cd7412991a95745c9e744c5b28276 MD5 | raw file
-- This file is part of a Liberty Eiffel library.
-- See the full copyright at the end.
--
class POSIX_REGULAR_EXPRESSION_BUILDER
   --
   -- Parses POSIX regular expressions and builds their matchable form.
   --
   -- The parser is a recursive descent over the following grammar; each
   -- `parse_*' feature below handles the production of the same name.
   --
   --  regular-expression ::= alternative
   --  alternative ::= sequence [ '|' sequence ]...
   --  sequence    ::= term [ term ]...
   --  term        ::= factor [ repeat-spec ]
   --  repeat-spec ::= '?' | '*' | '+' | '{' integer [',' [integer]] '}'
   --  factor      ::= group | union | '.' | '^' | '$' | escaped | text
   --  group       ::= '(' alternative ')'
   --  union       ::= '[' ['^'] union_term... ']'
   --  union_term  ::= union_factor ['-' union_factor]
   --  union_factor::= '[.' TEXT '.]' | '[:' CLASS ':]' | '[:<:]' | '[:>:]' | CHARACTER
   --  escaped     ::= '\' CHARACTER
   --  text        ::= A SEQUENCE NOT FOLLOWED BY EITHER '*', '+', '?', '{' OF NOT ESCAPED CHARACTERS
   --
   -- NOTE(review): the scanning features (`last_character', `read_character',
   -- `end_of_input', `save_position', ...) and the emitting features
   -- (`emit_*', `begin_collect', `end_collect_*', `stack', ...) are not
   -- defined in this class; they are presumably inherited from
   -- BACKTRACKING_REGULAR_EXPRESSION_BUILDER -- confirm there.
   --

inherit
   BACKTRACKING_REGULAR_EXPRESSION_BUILDER

create {ANY}
   make

feature {BACKTRACKING_REGULAR_EXPRESSION_BUILDER} -- parsing
   internal_parse
         -- Main parse of a POSIX regular expression.
         -- Reports an error when the input is empty or when characters
         -- remain once the top-level alternative has been parsed.
      do
         if end_of_input then
            set_error(once "empty regular expression")
         else
            set_greedy
            parse_alternative
            -- `parse_alternative' also stops on an unmatched ')', which is
            -- reported here as extra input.
            if not has_error and then not end_of_input then
               set_error(once "extra character(s) found")
            end
         end
      end

feature {POSIX_REGULAR_EXPRESSION_BUILDER} -- parsing
   parse_alternative
         -- Parses an alternative of sequences.
         -- alternative ::= sequence [ '|' sequence ]...
         -- A branch that is empty (the current character is '|' or ')', or
         -- the input ends right after a '|') is not parsed as a sequence;
         -- it is only remembered in `has_empty'.
      require
         has_no_error: not has_error
         not_at_end: not end_of_input
      local
         has_empty: BOOLEAN
      do
         begin_collect
         from
            -- First branch.
            if last_character = '|' or else last_character = ')' then
               has_empty := True
            else
               parse_sequence
            end
         until
            has_error or else end_of_input or else last_character /= '|'
         loop
            -- Skip the '|' and handle the next branch.
            read_character
            if end_of_input or else last_character = '|' or else last_character = ')' then
               has_empty := True
            else
               parse_sequence
            end
         end
         if not has_error then
            if is_collect_empty then
               -- Only empty branches were seen.
               end_collect_true
               --set_error(once "empty expression is not allowed")
            else
               end_collect_or
               if has_empty then
                  emit_controled_or_true
               end
            end
         end
      ensure
         error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
         state_known: has_error or else end_of_input or else last_character = ')'
      end

   parse_sequence
         -- Parses a sequence of terms.
         -- sequence ::= term [ term ]...
         -- Terms are parsed until an error occurs, the input ends, or the
         -- current character is '|' or ')'.
      require
         has_no_error: not has_error
         not_at_end: not end_of_input
         end_excluded: last_character /= '|' and then last_character /= ')'
      do
         begin_collect
         from
            parse_term
         until
            has_error or else end_of_input or else last_character = '|' or else last_character = ')'
         loop
            parse_term
         end
         if not has_error then
            if is_collect_empty then
               --emit(the_true_node)
               set_error(once "empty expression is not allowed")
            else
               end_collect_and
            end
         end
      ensure
         error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
         state_known: has_error or else end_of_input or else last_character = '|' or else last_character = ')'
      end

   parse_term
         -- Parses a term.
         -- term ::= factor [ repeat-spec ]
         -- repeat-spec ::= '?' | '*' | '+' | '{' integer [',' [integer]] '}'
         -- A '{' that is not immediately followed by a decimal digit is not
         -- taken as a repeat-spec: the position saved before it is restored
         -- and the '{' is left for the caller.
      require
         has_no_error: not has_error
         not_at_end: not end_of_input
         end_excluded: last_character /= '|' and then last_character /= ')'
      local
         mini, maxi: INTEGER
      do
         parse_factor
         if not has_error and then not end_of_input then
            inspect
               last_character
            when '*' then
               read_character
               emit_repeat(0, Repeat_infiny)
            when '+' then
               read_character
               emit_repeat(1, Repeat_infiny)
            when '?' then
               read_character
               emit_repeat(0, 1)
            when '{' then
               save_position
               read_character
               if end_of_input or else not last_character.is_decimal_digit then
                  restore_saved_position
               else
                  read_integer
                  mini := last_integer
                  if not end_of_input then
                     if last_character /= ',' then
                        -- "{n}": exactly n times.
                        maxi := mini
                     else
                        read_character
                        if end_of_input or else not last_character.is_decimal_digit then
                           -- "{n,}": no upper bound.
                           maxi := Repeat_infiny
                        else
                           -- "{n,m}"
                           read_integer
                           maxi := last_integer
                        end
                     end
                  end
                  if end_of_input or else last_character /= '}' then
                     set_error(once "expected '}' not found")
                  elseif maxi /= Repeat_infiny and then maxi < mini then
                     set_error(once "repeat count error (lower > upper is not allowed)")
                  else
                     read_character
                     emit_repeat(mini, maxi)
                  end
               end
            else
               -- No repeat-spec: the factor stands alone.
            end
         end
      ensure
         error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
      end

   parse_factor
         -- Parses a factor.
         -- factor ::= group | union | '.' | '^' | '$' | escaped | text
         -- Dispatches on the current character; a repeat operator found
         -- where a factor is expected is reported as an error.
      require
         has_no_error: not has_error
         not_at_end: not end_of_input
         end_excluded: last_character /= '|' and then last_character /= ')'
      do
         inspect
            last_character
         when '(' then
            parse_group
         when '[' then
            parse_union
         when '.' then
            emit_any_character
            read_character
         when '^' then
            emit_begin_of_line
            read_character
         when '$' then
            emit_end_of_line
            read_character
         when '\' then
            parse_escaped
         when '*', '+', '?', '{' then
            set_error(once "unescaped reserved char")
         else
            parse_text
         end
      ensure
         error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
      end

   parse_group
         -- Parses a group.
         -- group ::= '(' alternative ')'
      require
         has_no_error: not has_error
         not_at_end: not end_of_input
         begin_with_open_parenthesis: last_character = '('
      do
         -- Skip the '('.
         read_character
         if not end_of_input then
            prepare_group
            parse_alternative
         end
         if not has_error then
            if end_of_input or else last_character /= ')' then
               set_error(once "expected ')' not found")
            else
               read_character
               emit_group
            end
         end
      ensure
         error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
      end

   parse_escaped
         -- Parses an escaped character.
         -- escaped ::= '\' CHARACTER
         -- The character following the '\' is emitted as a single-character
         -- match, whatever it is.
      require
         has_no_error: not has_error
         not_at_end: not end_of_input
         begin_with_escape: last_character = '\'
      do
         -- Skip the '\'.
         read_character
         if end_of_input then
            set_error(once "invalid '\' at the end of the expression")
         else
            emit_match_single(last_character)
            read_character
         end
      ensure
         error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
      end

   parse_text
         -- Parses a text.
         -- text ::= A SEQUENCE NOT FOLLOWED BY EITHER '*', '+', '?', '{' OF NOT ESCAPED CHARACTERS
         -- The characters are accumulated in `last_string'. A character
         -- directly followed by a repeat operator is not appended to a longer
         -- text: it is left in the input so that it becomes a factor of its
         -- own and the repeat applies to that character only.
      require
         has_no_error: not has_error
         not_at_end: not end_of_input
         current_character_is_valid: valid_last_character and then not (once "([.^$\*+?{").has(last_character)
      local
         stop: BOOLEAN
      do
         from
            -- The first character always belongs to the text.
            last_string.clear_count
            last_string.add_last(last_character)
            read_character
            if not end_of_input then
               inspect
                  last_character
               when '*', '+', '?', '{' then
                  -- One-character text directly followed by a repeat operator.
                  stop := True
               else
               end
            end
         until
            end_of_input or else stop
         loop
            inspect
               last_character
            when '.', '(', ')', '[', '^', '$', '|', '\' then
               -- Start of another kind of factor, or end of the sequence.
               stop := True
            else
               if valid_next_character then
                  inspect
                     next_character
                  when '*', '+', '?', '{' then
                     -- Leave the current character for the next term.
                     stop := True
                  else
                     last_string.add_last(last_character)
                     read_character
                  end
               else
                  last_string.add_last(last_character)
                  read_character
               end
            end
         end
         check
            last_string.count > 0
         end
         if last_string.count = 1 then
            emit_match_single(last_string.first)
         else
            emit_match_text(last_string)
         end
      ensure
         error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
      end

   parse_union
         -- Parses a union.
         -- union ::= '[' ['^'] union_term... ']'
         -- At least one union_term is parsed before ']' is looked for, so a
         -- ']' placed first (possibly after '^') is handled by
         -- `parse_union_term' and not taken as the end of the union.
      require
         has_no_error: not has_error
         not_at_end: not end_of_input
         begin_with_open_bracket: last_character = '['
      local
         negate: BOOLEAN
      do
         -- Skip the '['.
         read_character
         if not end_of_input and then last_character = '^' then
            negate := True
            read_character
         end
         if not end_of_input then
            begin_collect
            from
               parse_union_term
            until
               has_error or else end_of_input or else last_character = ']'
            loop
               parse_union_term
            end
         end
         if not has_error then
            if end_of_input then
               set_error(once "expected ']' not found")
            else
               end_collect_or
               check
                  last_character = ']'
               end
               read_character
               if negate then
                  emit_not_then_any
               end
            end
         end
      ensure
         error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
      end

feature {POSIX_REGULAR_EXPRESSION_BUILDER} -- sub parts of union
   parse_union_term
         -- Parses a union term.
         -- union_term ::= union_factor ['-' union_factor]
         -- A '-' directly followed by ']' does not start an interval; the
         -- factor just parsed is emitted alone and the '-' is left in the
         -- input.
      require
         has_no_error: not has_error
         not_at_end: not end_of_input
      local
         mini, maxi: CHARACTER
      do
         parse_union_factor
         if not has_error then
            if end_of_input or else last_character /= '-' or else (valid_next_character and then next_character = ']') then
               -- Not an interval.
               emit_recorded
            else
               if recorded_item /= Void then
                  set_error(once "first factor of an interval must be a single character")
               else
                  -- Skip the '-'.
                  read_character
                  if end_of_input then
                     set_error(once "unterminated interval")
                  else
                     -- Save the lower bound before the next factor overwrites
                     -- `recorded_character'.
                     mini := recorded_character
                     parse_union_factor
                     if not has_error then
                        if recorded_item /= Void then
                           set_error(once "second factor of an interval must be a single character")
                        else
                           maxi := recorded_character
                           if mini > maxi then
                              set_error(once "invalid interval because the first factor has a character code greater than the last factor one")
                           else
                              emit_match_range(mini, maxi)
                           end
                        end
                     end
                  end
               end
            end
         end
      ensure
         error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
      end

   parse_union_factor
         -- Parses a union factor.
         -- union_factor::= '[.' TEXT '.]' | '[:' CLASS ':]' | '[:<:]' | '[:>:]' | CHARACTER
         -- Nothing is emitted here: the result is recorded either in
         -- `recorded_character' (single character) or in `recorded_item'
         -- (complex item), for `parse_union_term' to use.
      require
         has_no_error: not has_error
         not_at_end: not end_of_input
      do
         inspect
            last_character
         when '[' then
            read_character
            if end_of_input then
               set_recorded_character('[')
            else
               inspect
                  last_character
               when '.' then
                  read_embedded
                  if not has_error then
                     inspect
                        last_string.count
                     when 0 then
                        set_error(once "empty merge list")
                     when 1 then
                        set_recorded_character(last_string.first)
                     else
                        -- Build the text node, then take it back so that it
                        -- can be recorded instead of staying emitted.
                        emit_match_text(last_string)
                        set_recorded_item(unemit)
                     end
                  end
               when ':' then
                  read_embedded
                  if not has_error then
                     inspect
                        last_string
                     when "<" then
                        set_recorded_item(the_begin_of_word_item)
                     when ">" then
                        set_recorded_item(the_end_of_word_item)
                     else
                        if has_named_posix_item(last_string) then
                           set_recorded_item(named_posix_item(last_string))
                        else
                           set_error(once "unknown posix class")
                        end
                     end
                  end
               when '=' then
                  set_error(once "unimplemented class expression '[=....=]'")
               else
                  -- A plain '[': the character after it is left in the input.
                  set_recorded_character('[')
               end
            end
         else
            set_recorded_character(last_character)
            read_character
         end
      end

   read_embedded
         -- Parses the text embedded in one of '[.' TEXT '.]',
         -- '[:' TEXT ':]' or '[=' TEXT '=]'.
         -- The parsed text is put in feature 'last_string'.
         -- The opening character ('.', ':' or '=') is used as the closing
         -- tag: the text ends at the first occurrence of that tag directly
         -- followed by ']'. A tag not followed by ']' is part of the text.
      require
         has_no_error: not has_error
         not_at_end: not end_of_input
         previous_character_is_open_brace: valid_previous_character and then previous_character = '['
         current_character_is_valid: valid_last_character and then (once ".:=").has(last_character)
      local
         tag: CHARACTER
         stop: BOOLEAN
      do
         from
            last_string.clear_count
            tag := last_character
            read_character
         until
            stop
         loop
            -- Accumulate up to the next occurrence of the tag.
            from
            until
               end_of_input or else last_character = tag
            loop
               last_string.add_last(last_character)
               read_character
            end
            if end_of_input then
               set_error(once "unmatched '[.' or '[:' or '[='")
               stop := True
            else
               -- Skip the tag and look for the closing ']'.
               read_character
               if not end_of_input then
                  if last_character = ']' then
                     read_character
                     stop := True
                  else
                     -- The tag was ordinary text.
                     last_string.add_last(tag)
                  end
               end
               -- When the input ends right after the tag, the next
               -- iteration reports the error.
            end
         end
      end

   recorded_character: CHARACTER
         -- Last union_factor's character recorded.
         -- Meaningful only while `recorded_item' is Void.

   recorded_item: BACKTRACKING_NODE
         -- Last union_factor's item (complex expression) recorded.
         -- Void when the last union_factor was a single character.

   set_recorded_character (value: CHARACTER)
         -- Records the union_factor's character 'value'.
         -- Also resets `recorded_item' to Void.
      do
         recorded_item := Void
         recorded_character := value
      ensure
         recorded_item = Void
         recorded_character = value
      end

   set_recorded_item (value: BACKTRACKING_NODE)
         -- Records the union_factor's item (complex expression) 'value'.
      require
         item_not_void: value /= Void
      do
         recorded_item := value
      ensure
         recorded_item /= Void
         recorded_item = value
      end

   emit_recorded
         -- Emits the last union_factor's recorded character or item,
         -- depending on its kind.
      do
         if recorded_item = Void then
            emit_match_single(recorded_character)
         else
            emit(recorded_item)
         end
      ensure
         incremented_by_one: stack.count = old stack.count + 1
      end

end -- class POSIX_REGULAR_EXPRESSION_BUILDER
--
-- Copyright (C) 2009-2017: by all the people cited in the AUTHORS file.
--
-- Permission is hereby granted, free of charge, to any person obtaining a copy
-- of this software and associated documentation files (the "Software"), to deal
-- in the Software without restriction, including without limitation the rights
-- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-- copies of the Software, and to permit persons to whom the Software is
-- furnished to do so, subject to the following conditions:
--
-- The above copyright notice and this permission notice shall be included in
-- all copies or substantial portions of the Software.
--
-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-- THE SOFTWARE.