PageRenderTime 50ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/re.ml

http://github.com/avsm/ocaml-re
OCaml | 953 lines | 742 code | 91 blank | 120 comment | 92 complexity | 119428c917ee0ffbe22339ae3ca1b5fe MD5 | raw file
Possible License(s): LGPL-2.1
  1. (*
  2. RE - A regular expression library
  3. Copyright (C) 2001 Jerome Vouillon
  4. email: Jerome.Vouillon@pps.jussieu.fr
  5. This library is free software; you can redistribute it and/or
  6. modify it under the terms of the GNU Lesser General Public
  7. License as published by the Free Software Foundation; either
  8. version 2 of the License, or (at your option) any later version.
  9. This library is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. Lesser General Public License for more details.
  13. You should have received a copy of the GNU Lesser General Public
  14. License along with this library; if not, write to the Free Software
  15. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  16. *)
  17. let rec first f l =
  18. match l with
  19. [] -> None
  20. | x :: r -> match f x with
  21. None -> first f r
  22. | Some _ as res -> res
  23. let rec iter n f v = if n = 0 then v else iter (n - 1) f (f v)
  24. (****)
  25. let unknown = -2
  26. let break = -3
  27. type 'a match_info =
  28. [ `Match of 'a
  29. | `Failed
  30. | `Running ]
  31. type state =
  32. { idx : int;
  33. (* Index of the current position in the position table.
  34. Not yet computed transitions point to a dummy state where
  35. [idx] is set to [unknown];
  36. If [idx] is set to [break] for states that either always
  37. succeed or always fail. *)
  38. real_idx : int;
  39. (* The real index, in case [idx] is set to [break] *)
  40. next : state array;
  41. (* Transition table, indexed by color *)
  42. mutable final :
  43. (Automata.category *
  44. (Automata.idx * Automata.mark_infos match_info)) list;
  45. (* Mapping from the category of the next character to
  46. - the index where the next position should be saved
  47. - possibly, the list of marks (and the corresponding indices)
  48. corresponding to the best match *)
  49. desc : Automata.state
  50. (* Description of this state of the automata *) }
  51. (* Automata (compiled regular expression) *)
  52. type re =
  53. { initial : Automata.expr;
  54. (* The whole regular expression *)
  55. mutable initial_states : (int * state) list;
  56. (* Initial states, indexed by initial category *)
  57. cols : string;
  58. (* Color table *)
  59. col_repr : string;
  60. (* Table from colors to one character of this color *)
  61. ncol : int;
  62. (* Number of colors *)
  63. lnl : int;
  64. (* Color of the last newline *)
  65. mutable tbl : Automata.working_area;
  66. (* Temporary table used to compute the first available index
  67. when computing a new state *)
  68. states : state Automata.States.t;
  69. (* States of the deterministic automata *)
  70. group_count : int
  71. (* Number of groups in the regular expression *) }
  72. let print_re ch re = Automata.print_expr ch re.initial
  73. (* Information used during matching *)
  74. type info =
  75. { re : re;
  76. (* The automata *)
  77. i_cols : string;
  78. (* Color table ([x.i_cols = x.re.cols])
  79. Sortcut used for performance reasons *)
  80. mutable positions : int array;
  81. (* Array of mark positions
  82. The mark are off by one for performance reasons *)
  83. mutable pos : int;
  84. (* Position where the match is started *)
  85. mutable last : int
  86. (* Position where the match should stop *) }
  87. (****)
  88. let cat_inexistant = 1
  89. let cat_letter = 2
  90. let cat_not_letter = 4
  91. let cat_newline = 8
  92. let cat_lastnewline = 16
  93. let cat_search_boundary = 32
  94. let category re c =
  95. if c = -1 then cat_inexistant else
  96. (* Special category for the last newline *)
  97. if c = re.lnl then cat_lastnewline lor cat_newline lor cat_not_letter else
  98. match re.col_repr.[c] with
  99. 'a'..'z' | 'A'..'Z' ->
  100. cat_letter
  101. | '\n' ->
  102. cat_not_letter lor cat_newline
  103. | _ ->
  104. cat_not_letter
  105. (****)
  106. let dummy_next = [||]
  107. let unknown_state =
  108. { idx = unknown; real_idx = 0;
  109. next = dummy_next; final = [];
  110. desc = Automata.dummy_state }
  111. let count = ref 0
  112. let mk_state ncol ((idx, _, _, _, _) as desc) =
  113. let break_state =
  114. match Automata.status desc with
  115. `Running -> false
  116. | _ -> true
  117. in
  118. { idx = if break_state then break else idx;
  119. real_idx = idx;
  120. next = if break_state then dummy_next else Array.make ncol unknown_state;
  121. final = [];
  122. desc = desc }
  123. let find_state re desc =
  124. try
  125. Automata.States.find re.states desc
  126. with Not_found ->
  127. let st = mk_state re.ncol desc in
  128. Automata.States.add re.states desc st;
  129. st
  130. (**** Match with marks ****)
  131. let delta info cat c st =
  132. let (idx, _, _, _, _) as desc = Automata.delta info.re.tbl cat c st.desc in
  133. let len = Array.length info.positions in
  134. if idx = len && len > 0 then begin
  135. let pos = info.positions in
  136. info.positions <- Array.make (2 * len) 0;
  137. Array.blit pos 0 info.positions 0 len
  138. end;
  139. desc
  140. let validate info s pos st =
  141. let c = Char.code info.i_cols.[Char.code s.[pos]] in
  142. let cat = category info.re c in
  143. let desc' = delta info cat c st in
  144. let st' = find_state info.re desc' in
  145. st.next.(c) <- st'
  146. (*
  147. let rec loop info s pos st =
  148. if pos < info.last then
  149. let st' = st.next.(Char.code info.i_cols.[Char.code s.[pos]]) in
  150. let idx = st'.idx in
  151. if idx >= 0 then begin
  152. info.positions.(idx) <- pos;
  153. loop info s (pos + 1) st'
  154. end else if idx = break then begin
  155. info.positions.(st'.real_idx) <- pos;
  156. st'
  157. end else begin (* Unknown *)
  158. validate info s pos st;
  159. loop info s pos st
  160. end
  161. else
  162. st
  163. *)
  164. let rec loop info s pos st =
  165. if pos < info.last then
  166. let st' = st.next.(Char.code info.i_cols.[Char.code s.[pos]]) in
  167. loop2 info s pos st st'
  168. else
  169. st
  170. and loop2 info s pos st st' =
  171. let idx = st'.idx in
  172. if idx >= 0 then begin
  173. let pos = pos + 1 in
  174. if pos < info.last then begin
  175. (* It is important to place these reads before the write *)
  176. (* But then, we don't have enough registers left to store the
  177. right position. So, we store the position plus one. *)
  178. let st'' = st'.next.(Char.code info.i_cols.[Char.code s.[pos]]) in
  179. info.positions.(idx) <- pos;
  180. loop2 info s pos st' st''
  181. end else begin
  182. info.positions.(idx) <- pos;
  183. st'
  184. end
  185. end else if idx = break then begin
  186. info.positions.(st'.real_idx) <- pos + 1;
  187. st'
  188. end else begin (* Unknown *)
  189. validate info s pos st;
  190. loop info s pos st
  191. end
  192. let rec loop_no_mark info s pos last st =
  193. if pos < last then
  194. let st' = st.next.(Char.code info.i_cols.[Char.code s.[pos]]) in
  195. let idx = st'.idx in
  196. if idx >= 0 then
  197. loop_no_mark info s (pos + 1) last st'
  198. else if idx = break then
  199. st'
  200. else begin (* Unknown *)
  201. validate info s pos st;
  202. loop_no_mark info s pos last st
  203. end
  204. else
  205. st
  206. let final info st cat =
  207. try
  208. List.assq cat st.final
  209. with Not_found ->
  210. let (idx, _, _, _, _) as st' = delta info cat (-1) st in
  211. let res = (idx, Automata.status st') in
  212. st.final <- (cat, res) :: st.final;
  213. res
  214. let find_initial_state re cat =
  215. try
  216. List.assq cat re.initial_states
  217. with Not_found ->
  218. let st =
  219. find_state re (Automata.create_state cat re.initial)
  220. in
  221. re.initial_states <- (cat, st) :: re.initial_states;
  222. st
  223. let dummy_substrings = `Match ("", [], [||], 0)
  224. let get_color re s pos =
  225. if pos < 0 then -1 else
  226. let slen = String.length s in
  227. if pos >= slen then -1 else
  228. (* Special case for the last newline *)
  229. if pos = slen - 1 && re.lnl <> -1 && s.[pos] = '\n' then re.lnl else
  230. Char.code re.cols.[Char.code s.[pos]]
  231. let rec handle_last_newline info pos st groups =
  232. let st' = st.next.(info.re.lnl) in
  233. let idx = st'.idx in
  234. if idx >= 0 then begin
  235. if groups then info.positions.(idx) <- pos + 1;
  236. st'
  237. end else if idx = break then begin
  238. if groups then info.positions.(st'.real_idx) <- pos + 1;
  239. st'
  240. end else begin (* Unknown *)
  241. let c = info.re.lnl in
  242. let real_c = Char.code info.i_cols.[Char.code '\n'] in
  243. let cat = category info.re c in
  244. let desc' = delta info cat real_c st in
  245. let st' = find_state info.re desc' in
  246. st.next.(c) <- st';
  247. handle_last_newline info pos st groups
  248. end
  249. let rec scan_str info s initial_state groups =
  250. let pos = info.pos in
  251. let last = info.last in
  252. if
  253. last = String.length s &&
  254. info.re.lnl <> -1 &&
  255. last > pos &&
  256. s.[last - 1] = '\n'
  257. then begin
  258. info.last <- last - 1;
  259. let st = scan_str info s initial_state groups in
  260. if st.idx = break then
  261. st
  262. else
  263. handle_last_newline info (last - 1) st groups
  264. end else if groups then
  265. loop info s pos initial_state
  266. else
  267. loop_no_mark info s pos last initial_state
  268. let match_str groups re s pos len =
  269. let slen = String.length s in
  270. let last = if len = -1 then slen else pos + len in
  271. let info =
  272. { re = re; i_cols = re.cols; pos = pos; last = last;
  273. positions =
  274. if groups then begin
  275. let n = Automata.index_count re.tbl + 1 in
  276. if n <= 10 then
  277. [|0;0;0;0;0;0;0;0;0;0|]
  278. else
  279. Array.make n 0
  280. end else
  281. [||] }
  282. in
  283. let initial_cat =
  284. if pos = 0 then
  285. cat_search_boundary lor cat_inexistant
  286. else
  287. cat_search_boundary lor category re (get_color re s (pos - 1)) in
  288. let initial_state = find_initial_state re initial_cat in
  289. let st = scan_str info s initial_state groups in
  290. let res =
  291. if st.idx = break then
  292. Automata.status st.desc
  293. else
  294. let final_cat =
  295. if last = slen then
  296. cat_search_boundary lor cat_inexistant
  297. else
  298. cat_search_boundary lor category re (get_color re s last) in
  299. let (idx, res) = final info st final_cat in
  300. if groups then info.positions.(idx) <- last + 1;
  301. res
  302. in
  303. match res with
  304. `Match m ->
  305. `Match (s, m, info.positions, re.group_count)
  306. | (`Failed | `Running) as res ->
  307. res
  308. let mk_re init cols col_repr ncol lnl group_count =
  309. { initial = init;
  310. initial_states = [];
  311. cols = cols;
  312. col_repr = col_repr;
  313. ncol = ncol;
  314. lnl = lnl;
  315. tbl = Automata.create_working_area ();
  316. states = Automata.States.create 97;
  317. group_count = group_count }
  318. (**** Character sets ****)
  319. let cany = [0, 255]
  320. let cseq c c' = Cset.seq (Char.code c) (Char.code c')
  321. let cadd c s = Cset.add (Char.code c) s
  322. let csingle c = Cset.single (Char.code c)
  323. let rec interval i j = if i > j then [] else i :: interval (i + 1) j
  324. let rec cset_hash_rec l =
  325. match l with
  326. [] -> 0
  327. | (i, j)::r -> i + 13 * j + 257 * cset_hash_rec r
  328. let cset_hash l = (cset_hash_rec l) land 0x3FFFFFFF
  329. module CSetMap =
  330. Map.Make
  331. (struct
  332. type t = int * (int * int) list
  333. let compare (i, u) (j, v) =
  334. let c = compare i j in if c <> 0 then c else compare u v
  335. end)
  336. let trans_set cache cm s =
  337. match s with
  338. [i, j] when i = j ->
  339. csingle cm.[i]
  340. | _ ->
  341. let v = (cset_hash_rec s, s) in
  342. try
  343. CSetMap.find v !cache
  344. with Not_found ->
  345. let l =
  346. List.fold_right
  347. (fun (i, j) l -> Cset.union (cseq cm.[i] cm.[j]) l)
  348. s Cset.empty
  349. in
  350. cache := CSetMap.add v l !cache;
  351. l
  352. (****)
  353. type sem_status = Compulsory | Indicative
  354. type regexp =
  355. Set of Cset.t
  356. | Sequence of regexp list
  357. | Alternative of regexp list
  358. | Repeat of regexp * int * int option
  359. | Beg_of_line | End_of_line
  360. | Beg_of_word | End_of_word | Not_bound
  361. | Beg_of_str | End_of_str
  362. | Last_end_of_line | Start | Stop
  363. | Sem of Automata.sem * regexp
  364. | Sem_greedy of Automata.rep_kind * regexp
  365. | Group of regexp | No_group of regexp | Nest of regexp
  366. | Case of regexp | No_case of regexp
  367. | Intersection of regexp list
  368. | Complement of regexp list
  369. | Difference of regexp * regexp
  370. let rec is_charset r =
  371. match r with
  372. Set _ ->
  373. true
  374. | Alternative l | Intersection l | Complement l ->
  375. List.for_all is_charset l
  376. | Difference (r, r') ->
  377. is_charset r && is_charset r'
  378. | Sem (_, r) | Sem_greedy (_, r)
  379. | No_group r | Case r | No_case r ->
  380. is_charset r
  381. | Sequence _ | Repeat _ | Beg_of_line | End_of_line
  382. | Beg_of_word | End_of_word | Beg_of_str | End_of_str
  383. | Not_bound | Last_end_of_line | Start | Stop | Group _ | Nest _ ->
  384. false
  385. (**** Colormap ****)
  386. (*XXX Use a better algorithm allowing non-contiguous regions? *)
  387. let rec split s cm =
  388. match s with
  389. [] -> ()
  390. | (i, j)::r -> cm.[i] <- '\001'; cm.[j + 1] <- '\001'; split r cm
  391. let cupper =
  392. Cset.union (cseq 'A' 'Z') (Cset.union (cseq '\192' '\214') (cseq '\216' '\222'))
  393. let clower = Cset.offset 32 cupper
  394. let calpha = cadd '\170' (cadd '\186' (Cset.union clower cupper))
  395. let cdigit = cseq '0' '9'
  396. let calnum = Cset.union calpha cdigit
  397. let cword = cadd '_' calnum
  398. let rec colorize c regexp =
  399. let lnl = ref false in
  400. let rec colorize regexp =
  401. match regexp with
  402. Set s -> split s c
  403. | Sequence l -> List.iter colorize l
  404. | Alternative l -> List.iter colorize l
  405. | Repeat (r, _, _) -> colorize r
  406. | Beg_of_line | End_of_line -> split (csingle '\n') c
  407. | Beg_of_word | End_of_word
  408. | Not_bound -> split cword c
  409. | Beg_of_str | End_of_str
  410. | Start | Stop | Not_bound -> ()
  411. | Last_end_of_line -> lnl := true
  412. | Sem (_, r)
  413. | Sem_greedy (_, r)
  414. | Group r | No_group r
  415. | Nest r -> colorize r
  416. | Case _ | No_case _
  417. | Intersection _
  418. | Complement _
  419. | Difference _ -> assert false
  420. in
  421. colorize regexp;
  422. !lnl
  423. let make_cmap () = String.make 257 '\000'
  424. let flatten_cmap cm =
  425. let c = String.create 256 in
  426. let col_repr = String.create 256 in
  427. let v = ref 0 in
  428. c.[0] <- '\000';
  429. col_repr.[0] <- '\000';
  430. for i = 1 to 255 do
  431. if cm.[i] <> '\000' then incr v;
  432. c.[i] <- Char.chr !v;
  433. col_repr.[!v] <- Char.chr i
  434. done;
  435. (c, String.sub col_repr 0 (!v + 1), !v + 1)
  436. (**** Compilation ****)
  437. let sequence l =
  438. match l with
  439. [x] -> x
  440. | l -> Sequence l
  441. let rec merge_sequences l =
  442. match l with
  443. [] ->
  444. l
  445. | Alternative l' :: r ->
  446. merge_sequences (l' @ r)
  447. | Sequence (x :: y) :: r ->
  448. begin match merge_sequences r with
  449. Sequence (x' :: y') :: r' when x = x' ->
  450. Sequence [x; Alternative [sequence y; sequence y']] :: r'
  451. | r' ->
  452. Sequence (x :: y) :: r'
  453. end
  454. | x :: r ->
  455. x :: merge_sequences r
  456. module A = Automata
  457. let enforce_kind ids kind kind' cr =
  458. match kind, kind' with
  459. `First, `First -> cr
  460. | `First, k -> A.seq ids k cr (A.eps ids)
  461. | _ -> cr
  462. (* XXX should probably compute a category mask *)
  463. let rec translate ids kind ign_group ign_case greedy pos cache c r =
  464. match r with
  465. Set s ->
  466. (A.cst ids (trans_set cache c s), kind)
  467. | Sequence l ->
  468. (trans_seq ids kind ign_group ign_case greedy pos cache c l, kind)
  469. | Alternative l ->
  470. begin match merge_sequences l with
  471. [r'] ->
  472. let (cr, kind') =
  473. translate ids kind ign_group ign_case greedy pos cache c r' in
  474. (enforce_kind ids kind kind' cr, kind)
  475. | l' ->
  476. (A.alt ids
  477. (List.map
  478. (fun r' ->
  479. let (cr, kind') =
  480. translate ids kind ign_group ign_case greedy
  481. pos cache c r' in
  482. enforce_kind ids kind kind' cr)
  483. (merge_sequences l)),
  484. kind)
  485. end
  486. | Repeat (r', i, j) ->
  487. let (cr, kind') =
  488. translate ids kind ign_group ign_case greedy pos cache c r' in
  489. let rem =
  490. match j with
  491. None ->
  492. A.rep ids greedy kind' cr
  493. | Some j ->
  494. let f =
  495. match greedy with
  496. `Greedy ->
  497. fun rem ->
  498. A.alt ids
  499. [A.seq ids kind' (A.rename ids cr) rem; A.eps ids]
  500. | `Non_greedy ->
  501. fun rem ->
  502. A.alt ids
  503. [A.eps ids; A.seq ids kind' (A.rename ids cr) rem]
  504. in
  505. iter (j - i) f (A.eps ids)
  506. in
  507. (iter i (fun rem -> A.seq ids kind' (A.rename ids cr) rem) rem, kind)
  508. | Beg_of_line ->
  509. (A.after ids (cat_inexistant lor cat_newline), kind)
  510. | End_of_line ->
  511. (A.before ids (cat_inexistant lor cat_newline), kind)
  512. | Beg_of_word ->
  513. (A.seq ids `First
  514. (A.after ids (cat_inexistant lor cat_not_letter))
  515. (A.before ids (cat_inexistant lor cat_letter)),
  516. kind)
  517. | End_of_word ->
  518. (A.seq ids `First
  519. (A.after ids (cat_inexistant lor cat_letter))
  520. (A.before ids (cat_inexistant lor cat_not_letter)),
  521. kind)
  522. | Not_bound ->
  523. (A.alt ids [A.seq ids `First
  524. (A.after ids cat_letter)
  525. (A.before ids cat_letter);
  526. A.seq ids `First
  527. (A.after ids cat_letter)
  528. (A.before ids cat_letter)],
  529. kind)
  530. | Beg_of_str ->
  531. (A.after ids cat_inexistant, kind)
  532. | End_of_str ->
  533. (A.before ids cat_inexistant, kind)
  534. | Last_end_of_line ->
  535. (A.before ids (cat_inexistant lor cat_lastnewline), kind)
  536. | Start ->
  537. (A.after ids cat_search_boundary, kind)
  538. | Stop ->
  539. (A.before ids cat_search_boundary, kind)
  540. | Sem (kind', r') ->
  541. let (cr, kind'') =
  542. translate ids kind' ign_group ign_case greedy pos cache c r' in
  543. (enforce_kind ids kind' kind'' cr,
  544. kind')
  545. | Sem_greedy (greedy', r') ->
  546. translate ids kind ign_group ign_case greedy' pos cache c r'
  547. | Group r' ->
  548. if ign_group then
  549. translate ids kind ign_group ign_case greedy pos cache c r'
  550. else
  551. let p = !pos in
  552. pos := !pos + 2;
  553. let (cr, kind') =
  554. translate ids kind ign_group ign_case greedy pos cache c r' in
  555. (A.seq ids `First (A.mark ids p) (
  556. A.seq ids `First cr (A.mark ids (p + 1))),
  557. kind')
  558. | No_group r' ->
  559. translate ids kind true ign_case greedy pos cache c r'
  560. | Nest r' ->
  561. let b = !pos in
  562. let (cr, kind') =
  563. translate ids kind ign_group ign_case greedy pos cache c r'
  564. in
  565. let e = !pos - 1 in
  566. if e < b then
  567. (cr, kind')
  568. else
  569. (A.seq ids `First (A.erase ids b e) cr, kind')
  570. | Difference _ | Complement _ | Intersection _ | No_case _ | Case _ ->
  571. assert false
  572. and trans_seq ids kind ign_group ign_case greedy pos cache c l =
  573. match l with
  574. [] ->
  575. A.eps ids
  576. | [r] ->
  577. let (cr', kind') =
  578. translate ids kind ign_group ign_case greedy pos cache c r in
  579. enforce_kind ids kind kind' cr'
  580. | r :: rem ->
  581. let (cr', kind') =
  582. translate ids kind ign_group ign_case greedy pos cache c r in
  583. let cr'' =
  584. trans_seq ids kind ign_group ign_case greedy pos cache c rem in
  585. if A.def cr'' = A.Eps then
  586. cr'
  587. else if A.def cr' = A.Eps then
  588. cr''
  589. else
  590. A.seq ids kind' cr' cr''
  591. (**** Case ****)
  592. let case_insens s =
  593. Cset.union s (Cset.union (Cset.offset 32 (Cset.inter s cupper))
  594. (Cset.offset (-32) (Cset.inter s clower)))
  595. let as_set r =
  596. match r with
  597. Set s -> s
  598. | _ -> assert false
  599. (* XXX Should split alternatives into (1) charsets and (2) more
  600. complex regular expressions; alternative should therefore probably
  601. be flatten here *)
  602. let rec handle_case ign_case r =
  603. match r with
  604. Set s ->
  605. Set (if ign_case then case_insens s else s)
  606. | Sequence l ->
  607. Sequence (List.map (handle_case ign_case) l)
  608. | Alternative l ->
  609. let l' = List.map (handle_case ign_case) l in
  610. if is_charset (Alternative l') then
  611. Set (List.fold_left (fun s r -> Cset.union s (as_set r)) Cset.empty l')
  612. else
  613. Alternative l'
  614. | Repeat (r, i, j) ->
  615. Repeat (handle_case ign_case r, i, j)
  616. | Beg_of_line | End_of_line | Beg_of_word | End_of_word | Not_bound
  617. | Beg_of_str | End_of_str | Last_end_of_line | Start | Stop ->
  618. r
  619. | Sem (k, r) ->
  620. let r' = handle_case ign_case r in
  621. if is_charset r' then r' else
  622. Sem (k, r')
  623. | Sem_greedy (k, r) ->
  624. let r' = handle_case ign_case r in
  625. if is_charset r' then r' else
  626. Sem_greedy (k, r')
  627. | Group r ->
  628. Group (handle_case ign_case r)
  629. | No_group r ->
  630. let r' = handle_case ign_case r in
  631. if is_charset r' then r' else
  632. No_group r'
  633. | Nest r ->
  634. let r' = handle_case ign_case r in
  635. if is_charset r' then r' else
  636. Nest r'
  637. | Case r ->
  638. handle_case false r
  639. | No_case r ->
  640. handle_case true r
  641. | Intersection l ->
  642. let l' = List.map (fun r -> handle_case ign_case r) l in
  643. Set (List.fold_left (fun s r -> Cset.inter s (as_set r)) cany l')
  644. | Complement l ->
  645. let l' = List.map (fun r -> handle_case ign_case r) l in
  646. Set (Cset.diff cany
  647. (List.fold_left (fun s r -> Cset.union s (as_set r))
  648. Cset.empty l'))
  649. | Difference (r, r') ->
  650. Set (Cset.inter (as_set (handle_case ign_case r))
  651. (Cset.diff cany (as_set (handle_case ign_case r'))))
  652. (****)
  653. let compile_1 regexp =
  654. let regexp = handle_case false regexp in
  655. let c = make_cmap () in
  656. let need_lnl = colorize c regexp in
  657. let (col, col_repr, ncol) = flatten_cmap c in
  658. let lnl = if need_lnl then ncol else -1 in
  659. let ncol = if need_lnl then ncol + 1 else ncol in
  660. let ids = A.create_ids () in
  661. let pos = ref 0 in
  662. let (r, kind) =
  663. translate ids
  664. `First false false `Greedy pos (ref CSetMap.empty) col regexp in
  665. let r = enforce_kind ids `First kind r in
  666. (*Format.eprintf "<%d %d>@." !ids ncol;*)
  667. mk_re r col col_repr ncol lnl (!pos / 2)
  668. (****)
  669. type t = regexp
  670. let str s =
  671. let l = ref [] in
  672. for i = String.length s - 1 downto 0 do
  673. l := Set (csingle s.[i]) :: !l
  674. done;
  675. Sequence !l
  676. let char c = Set (csingle c)
  677. let alt l =
  678. match l with
  679. [r] -> r
  680. | _ -> Alternative l
  681. let seq l =
  682. match l with
  683. [r] -> r
  684. | _ -> Sequence l
  685. let empty = alt []
  686. let epsilon = seq []
  687. let repn r i j =
  688. if i < 0 then invalid_arg "Re.repn";
  689. begin match j with Some j when j < i -> invalid_arg "Re.repn" | _ -> () end;
  690. Repeat (r, i, j)
  691. let rep r = repn r 0 None
  692. let rep1 r = repn r 1 None
  693. let opt r = repn r 0 (Some 1)
  694. let bol = Beg_of_line
  695. let eol = End_of_line
  696. let bow = Beg_of_word
  697. let eow = End_of_word
  698. let word r = seq [bow; r; eow]
  699. let not_boundary = Not_bound
  700. let bos = Beg_of_str
  701. let eos = End_of_str
  702. let leol = Last_end_of_line
  703. let start = Start
  704. let stop = Stop
  705. let longest r = Sem (`Longest, r)
  706. let shortest r = Sem (`Shortest, r)
  707. let first r = Sem (`First, r)
  708. let greedy r = Sem_greedy (`Greedy, r)
  709. let non_greedy r = Sem_greedy (`Non_greedy, r)
  710. let group r = Group r
  711. let no_group r = No_group r
  712. let nest r = Nest r
  713. let set str =
  714. let s = ref [] in
  715. for i = 0 to String.length str - 1 do
  716. s := Cset.union (csingle str.[i]) !s
  717. done;
  718. Set !s
  719. let rg c c' = Set (cseq c c')
  720. let inter l =
  721. let r = Intersection l in
  722. if is_charset r then r else
  723. invalid_arg "Re.inter"
  724. let compl l =
  725. let r = Complement l in
  726. if is_charset r then r else
  727. invalid_arg "Re.compl"
  728. let diff r r' =
  729. let r'' = Difference (r, r') in
  730. if is_charset r'' then r'' else
  731. invalid_arg "Re.compl"
  732. let any = Set cany
  733. let notnl = Set (Cset.diff cany (csingle '\n'))
  734. let lower = alt [rg 'a' 'z'; char '\181'; rg '\223' '\246'; rg '\248' '\255']
  735. let upper = alt [rg 'A' 'Z'; rg '\192' '\214'; rg '\216' '\222']
  736. let alpha = alt [lower; upper; char '\170'; char '\186']
  737. let digit = rg '0' '9'
  738. let alnum = alt [alpha; digit]
  739. let ascii = rg '\000' '\127'
  740. let blank = set "\t "
  741. let cntrl = alt [rg '\000' '\031'; rg '\127' '\159']
  742. let graph = alt [rg '\033' '\126'; rg '\160' '\255']
  743. let print = alt [rg '\032' '\126'; rg '\160' '\255']
  744. let punct =
  745. alt [rg '\033' '\047'; rg '\058' '\064'; rg '\091' '\096';
  746. rg '\123' '\126'; rg '\160' '\169'; rg '\171' '\180';
  747. rg '\182' '\185'; rg '\187' '\191'; char '\215'; char '\247']
  748. let space = alt [char ' '; rg '\009' '\013']
  749. let xdigit = alt [digit; rg 'a' 'f'; rg 'A' 'Z']
  750. let case r = Case r
  751. let no_case r = No_case r
  752. (****)
  753. type substrings = (string * Automata.mark_infos * int array * int)
  754. let compile r = compile_1 (seq [shortest (rep any); group r])
  755. let exec ?(pos = 0) ?(len = -1) re s =
  756. if pos < 0 || len < -1 || pos + len > String.length s then
  757. invalid_arg "Re.exec";
  758. match match_str true re s pos len with
  759. `Match substr -> substr
  760. | _ -> raise Not_found
  761. let execp ?(pos = 0) ?(len = -1) re s =
  762. if pos < 0 || len < -1 || pos + len > String.length s then
  763. invalid_arg "Re.execp";
  764. match match_str false re s pos len with
  765. `Match substr -> true
  766. | _ -> false
  767. let exec_partial ?(pos = 0) ?(len = -1) re s =
  768. if pos < 0 || len < -1 || pos + len > String.length s then
  769. invalid_arg "Re.exec_partial";
  770. match match_str false re s pos len with
  771. `Match _ -> `Full
  772. | `Running -> `Partial
  773. | `Failed -> `Mismatch
  774. let rec find_mark (i : int) l =
  775. match l with
  776. [] ->
  777. raise Not_found
  778. | (j, idx) :: r ->
  779. if i = j then idx else find_mark i r
  780. let get (s, marks, pos, _) i =
  781. if 2 * i + 1 >= Array.length marks then raise Not_found;
  782. let m1 = marks.(2 * i) in
  783. if m1 = -1 then raise Not_found;
  784. let p1 = pos.(m1) - 1 in
  785. let p2 = pos.(marks.(2 * i + 1)) - 1 in
  786. String.sub s p1 (p2 - p1)
  787. let get_ofs (s, marks, pos, _) i =
  788. if 2 * i + 1 >= Array.length marks then raise Not_found;
  789. let m1 = marks.(2 * i) in
  790. if m1 = -1 then raise Not_found;
  791. let p1 = pos.(m1) - 1 in
  792. let p2 = pos.(marks.(2 * i + 1)) - 1 in
  793. (p1, p2)
  794. let test (s, marks, pos, _) i =
  795. if 2 * i >= Array.length marks then false else
  796. let idx = marks.(2 * i) in
  797. idx <> -1
  798. let dummy_offset = (-1, -1)
  799. let get_all_ofs ((s, marks, pos, count) as m) =
  800. let res = Array.make count dummy_offset in
  801. for i = 0 to Array.length marks / 2 - 1 do
  802. let m1 = marks.(2 * i) in
  803. if m1 <> -1 then begin
  804. let p1 = pos.(m1) in
  805. let p2 = pos.(marks.(2 * i + 1)) in
  806. res.(i) <- (p1 - 1, p2 - 1)
  807. end
  808. done;
  809. res
  810. let dummy_string = ""
  811. let get_all ((s, marks, pos, count) as m) =
  812. let res = Array.make count dummy_string in
  813. for i = 0 to Array.length marks / 2 - 1 do
  814. let m1 = marks.(2 * i) in
  815. if m1 <> -1 then begin
  816. let p1 = pos.(m1) in
  817. let p2 = pos.(marks.(2 * i + 1)) in
  818. res.(i) <- String.sub s (p1 - 1) (p2 - p1)
  819. end
  820. done;
  821. res
  822. (**********************************)
  823. (*
  824. Information about the previous character:
  825. - does not exists
  826. - is a letter
  827. - is not a letter
  828. - is a newline
  829. - is last newline
  830. Beginning of word:
  831. - previous is not a letter or does not exist
  832. - current is a letter or does not exist
  833. End of word:
  834. - previous is a letter or does not exist
  835. - current is not a letter or does not exist
  836. Beginning of line:
  837. - previous is a newline or does not exist
  838. Beginning of buffer:
  839. - previous does not exist
  840. End of buffer
  841. - current does not exist
  842. End of line
  843. - current is a newline or does not exist
  844. *)
  845. (*
  846. Rep: e = T,e | ()
  847. - semantics of the comma (shortest/longest/first)
  848. - semantics of the union (greedy/non-greedy)
  849. Bounded repetition
  850. a{0,3} = (a,(a,a?)?)?
  851. *)