/src/lib/regular_expression/low_level/posix_regular_expression_builder.e

http://github.com/tybor/Liberty · Specman e · 560 lines · 468 code · 20 blank · 72 comment · 74 complexity · 437cd7412991a95745c9e744c5b28276 MD5 · raw file

  1. -- This file is part of a Liberty Eiffel library.
  2. -- See the full copyright at the end.
  3. --
  4. class POSIX_REGULAR_EXPRESSION_BUILDER
  5. --
  6. -- Parses POSIX regular expressions and builds their matchable form
  7. --
  8. -- regular-expression ::= alternative
  9. -- alternative ::= sequence [ '|' sequence ]...
  10. -- sequence ::= term [ term ]...
  11. -- term ::= factor [ repeat-spec ]
  12. -- repeat-spec ::= '?' | '*' | '+' | '{' integer [',' [integer]] '}'
  13. -- factor ::= group | union | '.' | '^' | '$' | escaped | text
  14. -- group ::= '(' alternative ')'
  15. -- union ::= '[' union ']'
  16. -- union ::= '[' ['^'] union_term... ']'
  17. -- union_term ::= union_factor ['-' union_factor]
  18. -- union_factor::= '[.' TEXT '.]' | '[:' CLASS ':]' | '[:<:]' | '[:>:]' | CHARACTER
  19. -- escaped ::= '\' CHARACTER
  20. -- text ::= A SEQUENCE OF NOT ESCAPED CHARACTERS THAT IS NOT FOLLOWED BY EITHER '*', '+', '?' OR '{'
  21. inherit
  22. BACKTRACKING_REGULAR_EXPRESSION_BUILDER
  23. create {ANY}
  24. make
  25. feature {BACKTRACKING_REGULAR_EXPRESSION_BUILDER} -- parsing
  internal_parse
      -- Entry point of the parser for a POSIX regular expression.
      -- Reports an error when the input is empty, or when characters
      -- remain unread after the top-level alternative has been parsed.
    do
      if not end_of_input then
        set_greedy
        parse_alternative
        if not (has_error or else end_of_input) then
          -- The alternative was parsed without error but did not
          -- reach the end of the input.
          set_error(once "extra character(s) found")
        end
      else
        set_error(once "empty regular expression")
      end
    end
  39. feature {POSIX_REGULAR_EXPRESSION_BUILDER} -- parsing
  parse_alternative
      -- Parses an alternative of sequences.
      --    alternative ::= sequence [ '|' sequence ]...
      -- A branch that holds no sequence (the alternative starts with
      -- '|' or ')', or a '|' is directly followed by '|', ')' or the
      -- end of the input) is not parsed; it is only remembered in
      -- `empty_branch_seen'.
    require
      has_no_error: not has_error
      not_at_end: not end_of_input
    local
      empty_branch_seen: BOOLEAN
    do
      begin_collect
      from
        inspect
          last_character
        when '|', ')' then
          empty_branch_seen := True
        else
          parse_sequence
        end
      until
        has_error or else end_of_input or else last_character /= '|'
      loop
        -- Step over the '|' separating two branches.
        read_character
        if end_of_input then
          empty_branch_seen := True
        else
          inspect
            last_character
          when '|', ')' then
            empty_branch_seen := True
          else
            parse_sequence
          end
        end
      end
      if not has_error then
        if not is_collect_empty then
          end_collect_or
          if empty_branch_seen then
            emit_controled_or_true
          end
        else
          -- Nothing was collected: the collection is closed with
          -- `end_collect_true' rather than reported as an error.
          end_collect_true
        end
      end
    ensure
      error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
      state_known: has_error or else end_of_input or else last_character = ')'
    end
  parse_sequence
      -- Parses a sequence of terms.
      --    sequence ::= term [ term ]...
      -- Terms are parsed one after the other until an error occurs,
      -- the input is exhausted, or a '|' or ')' is reached.
    require
      has_no_error: not has_error
      not_at_end: not end_of_input
      end_excluded: last_character /= '|' and then last_character /= ')'
    do
      begin_collect
      from
        parse_term
      until
        has_error or else end_of_input or else (once "|)").has(last_character)
      loop
        parse_term
      end
      if not has_error then
        if not is_collect_empty then
          end_collect_and
        else
          set_error(once "empty expression is not allowed")
        end
      end
    ensure
      error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
      state_known: has_error or else end_of_input or else last_character = '|' or else last_character = ')'
    end
  parse_term
      -- Parses a term: a factor optionally followed by a repeat
      -- specification.
      --    term ::= factor [ repeat-spec ]
      --    repeat-spec ::= '?' | '*' | '+' | '{' integer [',' [integer]] '}'
      -- A '{' that is not directly followed by a decimal digit is not
      -- taken as a repeat specification: the saved position is
      -- restored and nothing is emitted for it here.
    require
      has_no_error: not has_error
      not_at_end: not end_of_input
      end_excluded: last_character /= '|' and then last_character /= ')'
    local
      lower, upper: INTEGER
    do
      parse_factor
      if not (has_error or else end_of_input) then
        inspect
          last_character
        when '?' then
          read_character
          emit_repeat(0, 1)
        when '+' then
          read_character
          emit_repeat(1, Repeat_infiny)
        when '*' then
          read_character
          emit_repeat(0, Repeat_infiny)
        when '{' then
          save_position
          read_character
          if not end_of_input and then last_character.is_decimal_digit then
            read_integer
            lower := last_integer
            if not end_of_input then
              if last_character = ',' then
                read_character
                if not end_of_input and then last_character.is_decimal_digit then
                  read_integer
                  upper := last_integer
                else
                  -- "{n,": no upper bound given.
                  upper := Repeat_infiny
                end
              else
                -- "{n": exact count.
                upper := lower
              end
            end
            if end_of_input or else last_character /= '}' then
              set_error(once "expected '}' not found")
            elseif upper /= Repeat_infiny and then upper < lower then
              set_error(once "repeat count error (lower > upper is not allowed)")
            else
              read_character
              emit_repeat(lower, upper)
            end
          else
            restore_saved_position
          end
        else
          -- No repeat specification follows the factor.
        end
      end
    ensure
      error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
    end
  parse_factor
      -- Parses a factor, dispatching on the current character.
      --    factor ::= group | union | '.' | '^' | '$' | escaped | text
      -- A factor that starts with '*', '+', '?' or '{' is an error.
    require
      has_no_error: not has_error
      not_at_end: not end_of_input
      end_excluded: last_character /= '|' and then last_character /= ')'
    do
      inspect
        last_character
      when '*', '+', '?', '{' then
        set_error(once "unescaped reserved char")
      when '\' then
        parse_escaped
      when '[' then
        parse_union
      when '(' then
        parse_group
      when '$' then
        emit_end_of_line
        read_character
      when '^' then
        emit_begin_of_line
        read_character
      when '.' then
        emit_any_character
        read_character
      else
        parse_text
      end
    ensure
      error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
    end
  parse_group
      -- Parses a parenthesised group.
      --    group ::= '(' alternative ')'
      -- Reports an error when the closing ')' is missing, which
      -- includes the case where the input ends right after '('.
    require
      has_no_error: not has_error
      not_at_end: not end_of_input
      begin_with_open_parenthesis: last_character = '('
    do
      -- Step over the opening '('.
      read_character
      if not end_of_input then
        prepare_group
        parse_alternative
      end
      if not has_error then
        if not end_of_input and then last_character = ')' then
          read_character
          emit_group
        else
          set_error(once "expected ')' not found")
        end
      end
    ensure
      error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
    end
  parse_escaped
      -- Parses an escaped character: the character following the
      -- '\' is passed as is to `emit_match_single'.
      --    escaped ::= '\' CHARACTER
      -- A '\' that ends the expression is an error.
    require
      has_no_error: not has_error
      not_at_end: not end_of_input
      begin_with_escape: last_character = '\'
    do
      -- Step over the '\'.
      read_character
      if not end_of_input then
        emit_match_single(last_character)
        read_character
      else
        set_error(once "invalid '\' at the end of the expression")
      end
    ensure
      error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
    end
  parse_text
      -- Parses a run of plain (not escaped) characters into
      -- `last_string' and emits it.
      -- The first character is always taken. Further characters are
      -- appended until one of '.', '(', ')', '[', '^', '$', '|', '\'
      -- is met, or until the character about to be appended is itself
      -- followed by '*', '+', '?' or '{' (that character is left for
      -- the next term), or until the input ends.
      -- A single character is emitted with `emit_match_single',
      -- anything else with `emit_match_text'.
    require
      has_no_error: not has_error
      not_at_end: not end_of_input
      current_character_is_valid: valid_last_character and then not (once "([.^$\*+?{").has(last_character)
    local
      done: BOOLEAN
    do
      from
        last_string.clear_count
        last_string.add_last(last_character)
        read_character
        if not end_of_input then
          -- The first character is directly followed by a repeat
          -- specification: the text is that single character.
          done := (once "*+?{").has(last_character)
        end
      until
        end_of_input or else done
      loop
        if (once "\.()[^$|").has(last_character) then
          done := True
        elseif valid_next_character and then (once "*+?{").has(next_character) then
          done := True
        else
          last_string.add_last(last_character)
          read_character
        end
      end
      check
        last_string.count > 0
      end
      if last_string.count /= 1 then
        emit_match_text(last_string)
      else
        emit_match_single(last_string.first)
      end
    ensure
      error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
    end
  parse_union
      -- Parses a bracket expression.
      --    union ::= '[' ['^'] union_term... ']'
      -- At least one union term is parsed before ']' is looked for.
      -- When the union starts with '^', `emit_not_then_any' is called
      -- once the union has been closed.
    require
      has_no_error: not has_error
      not_at_end: not end_of_input
      begin_with_open_bracket: last_character = '['
    local
      complemented: BOOLEAN
    do
      -- Step over the opening '['.
      read_character
      if not end_of_input and then last_character = '^' then
        complemented := True
        read_character
      end
      if not end_of_input then
        begin_collect
        from
          parse_union_term
        until
          has_error or else end_of_input or else last_character = ']'
        loop
          parse_union_term
        end
      end
      if not has_error then
        if not end_of_input then
          end_collect_or
          check
            last_character = ']'
          end
          read_character
          if complemented then
            emit_not_then_any
          end
        else
          set_error(once "expected ']' not found")
        end
      end
    ensure
      error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
    end
  342. feature {POSIX_REGULAR_EXPRESSION_BUILDER} -- sub parts of union
  parse_union_term
      -- Parses a union term: a single union factor or an interval.
      --    union_term ::= union_factor ['-' union_factor]
      -- A '-' that is directly followed by ']' does not start an
      -- interval: the factor before it is emitted on its own.
      -- Both bounds of an interval must be single characters, and the
      -- first one must not be greater than the second one.
    require
      has_no_error: not has_error
      not_at_end: not end_of_input
    local
      low, high: CHARACTER
    do
      parse_union_factor
      if not has_error then
        if end_of_input or else last_character /= '-' or else (valid_next_character and then next_character = ']') then
          emit_recorded
        elseif recorded_item /= Void then
          set_error(once "first factor of an interval must be a single character")
        else
          -- Step over the '-'.
          read_character
          if end_of_input then
            set_error(once "unterminated interval")
          else
            low := recorded_character
            parse_union_factor
            if not has_error then
              if recorded_item = Void then
                high := recorded_character
                if low <= high then
                  emit_match_range(low, high)
                else
                  set_error(once "invalid interval because the first factor has a character code greater than the last factor one")
                end
              else
                set_error(once "second factor of an interval must be a single character")
              end
            end
          end
        end
      end
    ensure
      error_or_stack_incremented_by_one: has_error or else stack.count = old stack.count + 1
    end
  parse_union_factor
      -- Parses a union factor and records it, either as a single
      -- character (`set_recorded_character') or as an item
      -- (`set_recorded_item'). Nothing is left emitted here; see
      -- `emit_recorded'.
      --    union_factor ::= '[.' TEXT '.]' | '[:' CLASS ':]' | '[:<:]' | '[:>:]' | CHARACTER
      -- A '[' that ends the input, or that is followed by anything
      -- other than '.', ':' or '=', is recorded as the plain character
      -- '['; in the latter case the following character is not consumed.
      -- Errors are reported for an empty '[..]', for an unknown class
      -- name in '[:' CLASS ':]', and for '[=' (not implemented).
    require
      has_no_error: not has_error
      not_at_end: not end_of_input
    do
      inspect
        last_character
      when '[' then
        read_character
        if end_of_input then
          set_recorded_character('[')
        else
          inspect
            last_character
          when '.' then
            read_embedded
            if not has_error then
              inspect
                last_string.count
              when 0 then
                set_error(once "empty merge list")
              when 1 then
                set_recorded_character(last_string.first)
              else
                -- The text node is emitted, then taken back with
                -- `unemit' so that it is recorded rather than left
                -- emitted (presumably `unemit' returns the node just
                -- emitted -- confirm against the parent class).
                emit_match_text(last_string)
                set_recorded_item(unemit)
              end
            end
          when ':' then
            read_embedded
            if not has_error then
              inspect
                last_string
              when "<" then
                set_recorded_item(the_begin_of_word_item)
              when ">" then
                set_recorded_item(the_end_of_word_item)
              else
                if has_named_posix_item(last_string) then
                  set_recorded_item(named_posix_item(last_string))
                else
                  set_error(once "unknown posix class")
                end
              end
            end
          when '=' then
            set_error(once "unimplemented class expression '[=....=]'")
          else
            set_recorded_character('[')
          end
        end
      else
        set_recorded_character(last_character)
        read_character
      end
    end
  read_embedded
      -- Reads the text embedded in one of '[.' TEXT '.]',
      -- '[:' TEXT ':]' or '[=' TEXT '=]' into `last_string'.
      -- The current character ('.', ':' or '=') is the delimiter. The
      -- text ends at the first delimiter directly followed by ']'; a
      -- delimiter followed by any other character is kept as part of
      -- the text. Reaching the end of the input first is an error.
    require
      has_no_error: not has_error
      not_at_end: not end_of_input
      previous_character_is_open_brace: valid_previous_character and then previous_character = '['
      current_character_is_valid: valid_last_character and then (once ".:=").has(last_character)
    local
      delimiter: CHARACTER; finished: BOOLEAN
    do
      last_string.clear_count
      delimiter := last_character
      read_character
      from
      until
        finished
      loop
        -- Collect everything up to the next delimiter.
        from
        until
          end_of_input or else last_character = delimiter
        loop
          last_string.add_last(last_character)
          read_character
        end
        if not end_of_input then
          -- Step over the delimiter and look at what follows it.
          read_character
          if not end_of_input then
            if last_character /= ']' then
              last_string.add_last(delimiter)
            else
              read_character
              finished := True
            end
          end
        else
          set_error(once "unmatched '[.' or '[:' or '[='")
          finished := True
        end
      end
    end
  485. recorded_character: CHARACTER
  486. -- Character stored by the last call to `set_recorded_character'; `emit_recorded' uses it only when `recorded_item' is Void.
  487. recorded_item: BACKTRACKING_NODE
  488. -- Item (complex expression) stored by the last call to `set_recorded_item'; reset to Void by `set_recorded_character'.
  set_recorded_character (value: CHARACTER)
      -- Records `value' as the last union factor, of the
      -- single-character kind: `recorded_item' is reset to Void.
    do
      recorded_character := value
      recorded_item := Void
    ensure
      recorded_item = Void
      recorded_character = value
    end
  set_recorded_item (value: BACKTRACKING_NODE)
      -- Records `value' as the last union factor, of the item
      -- (complex expression) kind. `recorded_character' is left
      -- untouched.
    require
      item_not_void: value /= Void
    do
      recorded_item := value
    ensure
      recorded_item /= Void
      recorded_item = value
    end
  emit_recorded
      -- Emits the last recorded union factor: `recorded_item' when
      -- there is one, otherwise a single-character match of
      -- `recorded_character'.
    do
      if recorded_item /= Void then
        emit(recorded_item)
      else
        emit_match_single(recorded_character)
      end
    ensure
      incremented_by_one: stack.count = old stack.count + 1
    end
  520. end -- class POSIX_REGULAR_EXPRESSION_BUILDER
  521. --
  522. -- Copyright (C) 2009-2017: by all the people cited in the AUTHORS file.
  523. --
  524. -- Permission is hereby granted, free of charge, to any person obtaining a copy
  525. -- of this software and associated documentation files (the "Software"), to deal
  526. -- in the Software without restriction, including without limitation the rights
  527. -- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  528. -- copies of the Software, and to permit persons to whom the Software is
  529. -- furnished to do so, subject to the following conditions:
  530. --
  531. -- The above copyright notice and this permission notice shall be included in
  532. -- all copies or substantial portions of the Software.
  533. --
  534. -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  535. -- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  536. -- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  537. -- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  538. -- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  539. -- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  540. -- THE SOFTWARE.