/src/lib/regular_expression/low_level/perl5_regular_expression_builder.e

http://github.com/tybor/Liberty · Specman e · 578 lines · 258 code · 19 blank · 301 comment · 16 complexity · b79482d96a49f2d9610205b94a1eef97 MD5 · raw file

  1. -- This file is part of a Liberty Eiffel library.
  2. -- See the full copyright at the end.
  3. --
  4. class PERL5_REGULAR_EXPRESSION_BUILDER
  5. inherit
  6. POSIX_REGULAR_EXPRESSION_BUILDER
  7. redefine emit_repeat, parse_alternative, parse_group, parse_escaped, parse_union_factor, read_character, read_integer,
  8. set_default_options
  9. end
  10. create {ANY}
  11. make
  12. feature {ANY} -- options
  has_extended_legibility: BOOLEAN
      -- Is the extended legibility option active?
      -- While it is True, `skip_blanks_and_comments' also skips blanks
      -- and '#' comments running to the end of the line.
      -- Changed by the `set_...' features of this class and by the
      -- 'x' modifier handled in `read_modifiers'.
  has_extended_ligibility: BOOLEAN
      -- Obsolete, misspelled name of `has_extended_legibility';
      -- returns the same value.
    obsolete "Use `has_extended_legibility' instead."
    do
      Result := has_extended_legibility
    end
  set_extended_legibility
      -- Switch the extended legibility option on.
    do
      has_extended_legibility := True
    ensure
      definition: has_extended_legibility
    end
  set_extended_ligibility
      -- Obsolete, misspelled name of `set_extended_legibility';
      -- simply delegates to it.
    obsolete "Use `set_extended_legibility' instead."
    do
      set_extended_legibility
    ensure
      definition: has_extended_legibility
    end
  set_no_extended_legibility
      -- Switch the extended legibility option off.
    do
      has_extended_legibility := False
    ensure
      definition: not has_extended_legibility
    end
  set_no_extended_ligibility
      -- Obsolete, misspelled name of `set_no_extended_legibility';
      -- simply delegates to it.
    obsolete "Use `set_no_extended_legibility' instead."
    do
      set_no_extended_legibility
    ensure
      definition: not has_extended_legibility
    end
  set_default_options
      -- Set the default options: the inherited defaults (via Precursor)
      -- plus extended legibility switched off.
    do
      Precursor
      set_no_extended_legibility
    ensure then
      not has_extended_legibility
    end
  56. feature {PERL5_REGULAR_EXPRESSION_BUILDER} -- scanning
  has_unterminated_comment: BOOLEAN
      -- Was a "(?#..." comment sequence found that reaches the end of
      -- the input without a closing ')'?
      -- Set by `skip_blanks_and_comments'; cleared and then checked by
      -- `parse_alternative'.
  skip_blanks_and_comments
      -- Move past everything that carries no meaning at the current
      -- position:
      --  * embedded "(?#...)" comments, whatever the options are;
      --  * blanks and '#' comments up to the end of the line, but only
      --    while `has_extended_legibility' is set.
      -- `has_unterminated_comment' is updated each time an embedded
      -- comment is skipped: True when the input ended before its ')'.
    require
      has_no_error: not has_error
    local
      done, closed: BOOLEAN
    do
      from
      until
        end_of_input or else done
      loop
        if last_character = '(' and then expression.valid_index(position + 2) and then expression.item(position + 1) = '?' and then expression.item(position + 2) = '#' then
          -- Embedded comment: consume up to and including the first ')'.
          from
            closed := False
            goto_position(position + 3)
          until
            end_of_input or else closed
          loop
            closed := last_character = ')'
            goto_position(position + 1)
          end
          has_unterminated_comment := not closed
        elseif not has_extended_legibility then
          -- Without extended legibility any other character is significant.
          done := True
        else
          inspect
            last_character
          when ' ', '%T', '%N', '%R' then
            goto_position(position + 1)
          when '#' then
            -- Line comment: stop on the newline itself; the next turn
            -- of the outer loop consumes it as a blank.
            from
              goto_position(position + 1)
            until
              end_of_input or else last_character = '%N'
            loop
              goto_position(position + 1)
            end
          else
            done := True
          end
        end
      end
    ensure
      has_no_error: not has_error
    end
  105. feature {BACKTRACKING_REGULAR_EXPRESSION_BUILDER} -- parsing
  read_character
      -- Go to the next character that is neither a blank nor a comment:
      -- advance with the inherited version, then call
      -- `skip_blanks_and_comments'.
    do
      Precursor
      skip_blanks_and_comments
    end
  read_integer
      -- Read the integer at the current position into `last_integer'
      -- (inherited behaviour), then go to the next character that is
      -- neither a blank nor a comment.
    do
      Precursor
      skip_blanks_and_comments
    end
  emit_repeat (mini, maxi: INTEGER)
      -- Replace the top of the stack with a construction repeating it
      -- from `mini' to `maxi' times.
      -- A '?' at the current position marks the repetition as not
      -- greedy; it is consumed here. Greedy mode is always re-established
      -- before returning.
    do
      if not end_of_input then
        if last_character = '?' then
          read_character
          set_not_greedy
        end
      end
      Precursor(mini, maxi)
      set_greedy
    end
  133. feature {POSIX_REGULAR_EXPRESSION_BUILDER} -- parsing
  parse_alternative
      -- Parse an alternative with the inherited parser while keeping
      -- option changes local to it: the four option flags are saved
      -- before and put back after the inherited parsing.
      -- Reports an error when an unterminated "(?#..." comment was met
      -- during the parsing.
    local
      old_case_insensitive, old_match_line_boundary, old_any_match_newline, old_extended_legibility: BOOLEAN
    do
      has_unterminated_comment := False
      -- remember the options in force on entry
      old_extended_legibility := has_extended_legibility
      old_any_match_newline := does_any_match_newline
      old_match_line_boundary := does_match_line_boundary
      old_case_insensitive := is_case_insensitive
      Precursor
      -- put the options back
      has_extended_legibility := old_extended_legibility
      does_any_match_newline := old_any_match_newline
      does_match_line_boundary := old_match_line_boundary
      is_case_insensitive := old_case_insensitive
      if has_unterminated_comment then
        set_error(once "unterminated comment sequence (?#...")
      end
    end
  parse_group
      -- Parse a group: an extended pattern group when the character
      -- following the current one is '?', a POSIX group otherwise.
      -- For a POSIX group the four option flags are saved before and
      -- put back after the inherited parsing.
    local
      old_case_insensitive, old_match_line_boundary, old_any_match_newline, old_extended_legibility: BOOLEAN
    do
      if not valid_next_character or else next_character /= '?' then
        -- plain POSIX group
        old_extended_legibility := has_extended_legibility
        old_any_match_newline := does_any_match_newline
        old_match_line_boundary := does_match_line_boundary
        old_case_insensitive := is_case_insensitive
        Precursor
        has_extended_legibility := old_extended_legibility
        does_any_match_newline := old_any_match_newline
        does_match_line_boundary := old_match_line_boundary
        is_case_insensitive := old_case_insensitive
      else
        parse_extended_pattern
      end
    end
  parse_escaped
      -- Parses an escaped character.
      -- escaped ::= '\' CHARACTER
      -- The escape is decoded by `internal_parse_escaped' (outside of a
      -- union); what it recorded is emitted unless an error occurred.
    do
      internal_parse_escaped(False)
      if not has_error then
        emit_recorded
      end
    end
  parse_union_factor
      -- Parses a union factor and records it (character or item).
      -- union_factor::= '[.' TEXT '.]' | '[:' CLASS ':]' | '[:<:]' | '[:>:]' | CHARACTER
      -- A CLASS may start with '^' to negate it. A '\' introduces an
      -- escape handled by `internal_parse_escaped'.
      -- Errors: empty '[..]' text, empty or unknown class name,
      -- unimplemented '[=...=]' expression.
    local
      not_class: BOOLEAN
    do
      inspect
        last_character
      when '[' then
        read_character
        if end_of_input then
          set_recorded_character('[')
        else
          inspect
            last_character
          when '.' then
            read_embedded
            if not has_error then
              inspect
                last_string.count
              when 0 then
                set_error(once "empty merge list")
              when 1 then
                set_recorded_character(last_string.first)
              else
                emit_match_text(last_string)
                set_recorded_item(unemit)
              end
            end
          when ':' then
            read_embedded
            if not has_error then
              -- `first' requires a non-empty string: "[::]" must not
              -- reach it.
              if not last_string.is_empty and then last_string.first = '^' then
                last_string.remove_first
                not_class := True
              end
              if last_string.is_empty then
                -- "[::]" or "[:^:]": no class name at all
                set_error(once "unknown posix class")
              else
                inspect
                  last_string
                when "<" then
                  set_recorded_item(the_begin_of_word_item)
                when ">" then
                  set_recorded_item(the_end_of_word_item)
                else
                  if has_named_posix_item(last_string) then
                    set_recorded_item(named_posix_item(last_string))
                  else
                    set_error(once "unknown posix class")
                  end
                end
              end
              if not_class and then not has_error then
                -- negated class: wrap the recorded item
                emit(recorded_item)
                emit_not_then_any
                set_recorded_item(unemit)
              end
            end
          when '=' then
            set_error(once "unimplemented class expression '[=....=]'")
          else
            -- a '[' that opens nothing special is an ordinary character
            set_recorded_character('[')
          end
        end
      when '\' then
        internal_parse_escaped(True)
      else
        set_recorded_character(last_character)
        read_character
      end
    end
  251. feature {} -- parsing
  internal_parse_escaped (in_union: BOOLEAN)
      -- Decode the escape sequence starting at the current '\' and
      -- record the corresponding character or item.
      -- `in_union' tells that the escape appears inside a union; the
      -- escapes \b \B \A \Z \z \< \> and \0..\9 are then rejected.
      -- On error nothing is recorded and `has_error' is set.
    require
      has_no_error: not has_error
      not_at_end: not end_of_input
      begin_with_escape: last_character = '\'
    do
      -- skip the '\'
      read_character
      if end_of_input then
        set_error(once "invalid '\' at the end of the expression")
      elseif in_union then
        inspect
          last_character
        when 'b', 'B', 'A', 'Z', 'z', '0' .. '9', '<', '>' then
          set_error(once "prohibited in unions")
        else
        end
      end
      if not has_error then
        inspect
          last_character
        when 'b', 'B' then
          -- word boundary or not: begin of word OR end of word,
          -- negated for 'B'
          begin_collect
          emit(the_begin_of_word_item)
          emit(the_end_of_word_item)
          end_collect_or
          if last_character = 'B' then
            emit_not
          end
          set_recorded_item(unemit)
          read_character
        when '<' then
          -- begin of word
          set_recorded_item(the_begin_of_word_item)
          read_character
        when '>' then
          -- end of word
          set_recorded_item(the_end_of_word_item)
          read_character
        when 'A' then
          -- begin of text
          set_recorded_item(the_begin_of_text_item)
          read_character
        when 'Z' then
          -- end of text
          set_recorded_item(the_end_of_text_item)
          read_character
        when 'z' then
          -- real end of text (uses `the_real_end_of_text_item', distinct
          -- from the item used for 'Z')
          set_recorded_item(the_real_end_of_text_item)
          read_character
        when 'w', 'W' then
          -- word or not word
          emit(the_is_posix_word_item)
          if last_character = 'W' then
            emit_not_then_any
          end
          set_recorded_item(unemit)
          read_character
        when 's', 'S' then
          -- space or not space
          emit(the_is_posix_space_item)
          if last_character = 'S' then
            emit_not_then_any
          end
          set_recorded_item(unemit)
          read_character
        when 'd', 'D' then
          -- digit or not digit
          emit(the_is_posix_digit_item)
          if last_character = 'D' then
            emit_not_then_any
          end
          set_recorded_item(unemit)
          read_character
        when '0' .. '9' then
          -- back-reference to a previous group: the number must lie in
          -- 1 .. last_group_count and must not be in `group_stack'
          -- (presumably the groups still being parsed -- to be confirmed)
          read_integer
          if last_integer.in_range(1, last_group_count) and then not group_stack.has(last_integer) then
            emit_match_previous_group(last_integer)
            set_recorded_item(unemit)
          else
            set_error(once "unsupported forward group number")
          end
        when 'p' then
          -- positive POSIX indication
          read_character
          parse_posix_indication
          if not has_error then
            set_recorded_item(unemit)
          end
        when 'P' then
          -- negative POSIX indication
          read_character
          parse_posix_indication
          if not has_error then
            emit_not_then_any
            set_recorded_item(unemit)
          end
        else
          -- any other escaped character stands for itself
          set_recorded_character(last_character)
          read_character
        end
      end
    end
  parse_posix_indication
      -- Parse the "{NAME}" part that follows \p or \P and emit the
      -- item registered for the perl class NAME.
      -- Sets an error when the input ends, when the construct does not
      -- start with '{', when the closing '}' is missing or when NAME is
      -- not a known perl class name.
    do
      if end_of_input then
        set_error(once "class missing in \p or \P construct")
      elseif last_character /= '{' then
        set_error(once "currently, only \p{..} or \P{..} construct is allowed")
      else
        -- collect the class name into `last_string'
        from
          last_string.clear_count
          read_character
        until
          end_of_input or else last_character = '}'
        loop
          last_string.add_last(last_character)
          read_character
        end
        if end_of_input then
          set_error(once "unmatched '{'")
        elseif has_named_perl_item(last_string) then
          emit(named_perl_item(last_string))
          -- skip the closing '}'
          read_character
        else
          set_error(once "invalid perl class name")
        end
      end
    end
  parse_extended_pattern
      -- Parse an extended pattern group "(?...)": optional modifiers
      -- ("imsx", optionally followed by '-' and modifiers to switch
      -- off), then one of
      --   ')'        modifiers only: they stay in force after the group
      --   '#'        comment
      --   ':'        non capturing group
      --   '=' '!'    zero width positive / negative look-ahead
      --   '<=' '<!'  zero width positive / negative look-behind
      -- The constructs introduced by '{', '?', '(' and '>' are rejected
      -- as unsupported; anything else is an unknown pattern.
      -- Except for the modifiers-only form, the option flags in force
      -- on entry are put back once the closing ')' has been read.
    require
      has_no_error: not has_error
      not_at_end: not end_of_input
      begin_with_open_parenthesis: last_character = '('
      followed_with_question_mark: valid_next_character and then next_character = '?'
    local
      keep_flags, old_case_insensitive, old_match_line_boundary, old_any_match_newline, old_extended_legibility: BOOLEAN
    do
      -- skip the known characters "(?"
      read_character
      read_character
      -- save the state of the flags
      old_case_insensitive := is_case_insensitive
      old_match_line_boundary := does_match_line_boundary
      old_any_match_newline := does_any_match_newline
      old_extended_legibility := has_extended_legibility
      -- read the flags to set, then the flags to clear
      read_modifiers(True)
      if not end_of_input and then last_character = '-' then
        read_character
        read_modifiers(False)
      end
      if not end_of_input then
        inspect
          last_character
        when ')' then
          -- flag alteration only
          emit(the_true_node)
          keep_flags := True
        when '#' then
          -- comment
          emit(the_true_node)
          from
          until
            end_of_input or else last_character = ')'
          loop
            read_character
          end
        when ':' then
          -- not capturing
          read_character
          if not end_of_input then
            parse_alternative
          end
        when '=', '!' then
          -- zero width look-ahead; `parse_looking' reads the current
          -- character to tell positive ('=') from negative ('!')
          parse_looking(True)
        when '<' then
          -- zero width look-behind
          read_character
          if not end_of_input then
            inspect
              last_character
            when '=', '!' then
              parse_looking(False)
            else
              set_error(once "bad zero width look-behind")
            end
          end
        when '{', '?', '(', '>' then
          -- unsupported
          set_error(once "unsupported experimental extended pattern")
        else
          set_error(once "unknown extended pattern")
        end
      end
      if not has_error then
        if not end_of_input and then last_character = ')' then
          if not keep_flags then
            -- restore the flags
            is_case_insensitive := old_case_insensitive
            does_match_line_boundary := old_match_line_boundary
            does_any_match_newline := old_any_match_newline
            has_extended_legibility := old_extended_legibility
          end
          -- skip the closing ')'
          read_character
        else
          set_error(once "extended pattern not finished")
        end
      end
    end
  parse_looking (ahead: BOOLEAN)
      -- Parse the body of a zero width look-around assertion and emit
      -- it: look-ahead when `ahead' is True, look-behind otherwise.
      -- The current character tells the polarity: '=' is positive,
      -- '!' is negative.
      -- Fails with an error when already inside a look-around.
    require
      has_no_error: not has_error
      not_at_end: not end_of_input
      begin_with: last_character = '=' or else last_character = '!'
    do
      if not is_looking_around then
        is_looking_positive := last_character = '='
        is_looking_ahead := ahead
        is_looking_behind := not ahead
        read_character
        if not end_of_input then
          parse_alternative
          if not has_error then
            emit_looking
          end
        end
        -- leave look-around mode, whatever the outcome
        is_looking_ahead := False
        is_looking_behind := False
      else
        set_error(once "nested mix look-ahead / look-behind not implemented")
      end
    end
  read_modifiers (level: BOOLEAN)
      -- Read a run of modifier letters and give `level' to the option
      -- each one designates:
      --   'i' -> `is_case_insensitive'
      --   'm' -> `does_match_line_boundary'
      --   's' -> `does_any_match_newline'
      --   'x' -> `has_extended_legibility'
      -- Stops, without consuming it, at the first character that is
      -- not a modifier letter, or at the end of the input.
    require
      has_no_error: not has_error
    local
      done: BOOLEAN
    do
      from
      until
        end_of_input or else done
      loop
        inspect
          last_character
        when 'i' then
          is_case_insensitive := level
        when 'm' then
          does_match_line_boundary := level
        when 's' then
          does_any_match_newline := level
        when 'x' then
          has_extended_legibility := level
        else
          done := True
        end
        if not done then
          -- consume the modifier letter just handled
          read_character
        end
      end
    ensure
      has_no_error: not has_error
    end
  534. end -- class PERL5_REGULAR_EXPRESSION_BUILDER
  535. --
  536. -- Copyright (C) 2009-2017: by all the people cited in the AUTHORS file.
  537. --
  538. -- Permission is hereby granted, free of charge, to any person obtaining a copy
  539. -- of this software and associated documentation files (the "Software"), to deal
  540. -- in the Software without restriction, including without limitation the rights
  541. -- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  542. -- copies of the Software, and to permit persons to whom the Software is
  543. -- furnished to do so, subject to the following conditions:
  544. --
  545. -- The above copyright notice and this permission notice shall be included in
  546. -- all copies or substantial portions of the Software.
  547. --
  548. -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  549. -- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  550. -- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  551. -- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  552. -- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  553. -- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  554. -- THE SOFTWARE.