PageRenderTime 48ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/re_perl.ml

http://github.com/avsm/ocaml-re
OCaml | 226 lines | 193 code | 6 blank | 27 comment | 74 complexity | 67374eec2f750bc47154e7ddd8d482bf MD5 | raw file
Possible License(s): LGPL-2.1
  1. (*
  2. RE - A regular expression library
  3. Copyright (C) 2001 Jerome Vouillon
  4. email: Jerome.Vouillon@pps.jussieu.fr
  5. This library is free software; you can redistribute it and/or
  6. modify it under the terms of the GNU Lesser General Public
  7. License as published by the Free Software Foundation; either
  8. version 2 of the License, or (at your option) any later version.
  9. This library is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. Lesser General Public License for more details.
  13. You should have received a copy of the GNU Lesser General Public
  14. License along with this library; if not, write to the Free Software
  15. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  16. *)
  (** Raised by [parse] (and therefore by [re] and [compile_pat]) when the
      pattern is syntactically malformed. *)
  exception Parse_error
  (** Raised by [parse] when the pattern uses syntax that is recognised but
      not implemented here, such as a backslash followed by a digit. *)
  exception Not_supported
  (** [parse multiline dollar_endonly dotall ungreedy s] parses the Perl-style
      pattern [s] with a recursive-descent parser and returns the
      corresponding [Re] expression.

      - [multiline]: when true, '^' becomes [Re.bol] and '$' becomes
        [Re.eol]; otherwise '^' becomes [Re.bos].
      - [dollar_endonly]: only consulted when [multiline] is false; '$'
        becomes [Re.leol] when it is true and [Re.eos] when it is false.
      - [dotall]: when true, '.' becomes [Re.any], otherwise [Re.notnl].
      - [ungreedy]: inverts the meaning of the '?' suffix on quantifiers.

      @raise Parse_error if [s] is malformed, including a pattern that ends
        inside an escape sequence or inside an inline comment.
      @raise Not_supported on a backslash followed by a digit and on POSIX
        class, equivalence-class and multi-character collating syntax inside
        a bracket expression. *)
  let parse multiline dollar_endonly dotall ungreedy s =
    (* Cursor into [s]; every helper below reads or advances it. *)
    let i = ref 0 in
    let l = String.length s in
    let eos () = !i = l in
    (* [test c]: is the next character [c]?  Consumes nothing. *)
    let test c = not (eos ()) && s.[!i] = c in
    (* [accept c]: consume the next character if and only if it is [c]. *)
    let accept c = let r = test c in if r then incr i; r in
    (* [get ()]: consume and return the next character.  Callers must rule
       out end of input first. *)
    let get () = let r = s.[!i] in incr i; r in
    let unget () = decr i in
    (* Consume an optional '?' after a quantifier and tag [r] as greedy or
       non-greedy accordingly; [ungreedy] flips the choice. *)
    let greedy_mod r =
      let gr = accept '?' in
      let gr = if ungreedy then not gr else gr in
      if gr then Re.non_greedy r else Re.greedy r
    in
    (* regexp ::= branch ('|' branch)* *)
    let rec regexp () = regexp' (branch ())
    and regexp' left =
      if accept '|' then regexp' (Re.alt [left; branch ()]) else left
    (* branch ::= piece*, terminated by '|', ')' or end of input. *)
    and branch () = branch' []
    and branch' left =
      if eos () || test '|' || test ')' then Re.seq (List.rev left)
      else branch' (piece () :: left)
    (* piece ::= atom followed by an optional quantifier. *)
    and piece () =
      let r = atom () in
      if accept '*' then greedy_mod (Re.rep r) else
      if accept '+' then greedy_mod (Re.rep1 r) else
      if accept '?' then greedy_mod (Re.opt r) else
      if accept '{' then begin
        match integer () with
        | Some lo ->
          (* Without a comma the upper bound equals the lower bound; a comma
             with no number after it leaves the upper bound open. *)
          let hi = if accept ',' then integer () else Some lo in
          if not (accept '}') then raise Parse_error;
          begin match hi with
          | Some hi when hi < lo -> raise Parse_error
          | _ -> ()
          end;
          greedy_mod (Re.repn r lo hi)
        | None ->
          (* Not a repetition count: give the brace back so that the next
             atom sees it. *)
          unget (); r
      end else
        r
    and atom () =
      if accept '.' then begin
        if dotall then Re.any else Re.notnl
      end else if accept '(' then begin
        if accept '?' then begin
          if accept ':' then begin
            (* Non-capturing group. *)
            let r = regexp () in
            if not (accept ')') then raise Parse_error;
            r
          end else if accept '#' then begin
            (* Inline comment. *)
            comment ()
          end else
            raise Parse_error
        end else begin
          (* Capturing group. *)
          let r = regexp () in
          if not (accept ')') then raise Parse_error;
          Re.group r
        end
      end else
      if accept '^' then begin
        if multiline then Re.bol else Re.bos
      end else if accept '$' then begin
        (* NOTE(review): PCRE documents DOLLAR_ENDONLY as restricting '$' to
           the very end of the subject, which looks like the opposite of
           this mapping; behaviour kept as is, to be confirmed. *)
        if multiline then Re.eol else if dollar_endonly then Re.leol else Re.eos
      end else if accept '[' then begin
        if accept '^' then
          Re.compl (bracket [])
        else
          Re.alt (bracket [])
      end else if accept '\\' then begin
        (* XXX
           - Back-references
           - \cx (control-x), \e, \f, \n, \r, \t, \xhh, \ddd
        *)
        if eos () then raise Parse_error;
        match get () with
        | 'w' ->
          Re.alt [Re.alnum; Re.char '_']
        | 'W' ->
          Re.compl [Re.alnum; Re.char '_']
        | 's' ->
          Re.space
        | 'S' ->
          Re.compl [Re.space]
        | 'd' ->
          Re.digit
        | 'D' ->
          Re.compl [Re.digit]
        | 'b' ->
          Re.alt [Re.bow; Re.eow]
        | 'B' ->
          Re.not_boundary
        | 'A' ->
          Re.bos
        | 'Z' ->
          Re.leol
        | 'z' ->
          Re.eos
        | 'G' ->
          Re.start
        | 'a'..'z' | 'A'..'Z' ->
          (* Any other letter escape is rejected. *)
          raise Parse_error
        | '0'..'9' ->
          raise Not_supported
        | c ->
          Re.char c
      end else begin
        if eos () then raise Parse_error;
        match get () with
        | '*' | '+' | '?' | '{' | '\\' -> raise Parse_error
        | c -> Re.char c
      end
    (* Read a decimal number; [None], with nothing consumed, if the next
       character is not a digit. *)
    and integer () =
      if eos () then None else
      match get () with
      | '0'..'9' as d -> integer' (Char.code d - Char.code '0')
      | _ -> unget (); None
    and integer' n =
      if eos () then Some n else
      match get () with
      | '0'..'9' as d ->
        let n' = 10 * n + (Char.code d - Char.code '0') in
        (* A decreasing value is treated as integer overflow. *)
        if n' < n then raise Parse_error;
        integer' n'
      | _ ->
        unget (); Some n
    (* Collect the members of a character class up to the closing ']'.
       A ']' in first position is taken literally, hence the [s <> []]
       guard. *)
    and bracket s =
      if s <> [] && accept ']' then s else begin
        match char () with
        | `Char c ->
          if accept '-' then begin
            (* A '-' right before the closing ']' is a literal dash. *)
            if accept ']' then Re.char c :: Re.char '-' :: s else begin
              match char () with
              | `Char c' ->
                bracket (Re.rg c c' :: s)
              | `Set st' ->
                (* A class escape cannot end a range, so the dash is
                   literal.  Keep scanning: the class is not closed yet. *)
                bracket (Re.char c :: Re.char '-' :: st' :: s)
            end
          end else
            bracket (Re.char c :: s)
        | `Set st ->
          bracket (st :: s)
      end
    (* Read one member of a character class: a single character or a
       predefined set introduced by a backslash. *)
    and char () =
      if eos () then raise Parse_error;
      let c = get () in
      if c = '[' then begin
        if accept '=' || accept ':' then raise Not_supported;
        if accept '.' then begin
          if eos () then raise Parse_error;
          let c = get () in
          if not (accept '.') then raise Not_supported;
          if not (accept ']') then raise Parse_error;
          `Char c
        end else
          `Char c
      end else if c = '\\' then begin
        (* A backslash at the very end of the pattern is a syntax error. *)
        if eos () then raise Parse_error;
        let c = get () in
        (* XXX
           \127, ...
        *)
        match c with
        | 'b' -> `Char '\008'
        | 'n' -> `Char '\n' (*XXX*)
        | 'r' -> `Char '\r' (*XXX*)
        | 't' -> `Char '\t' (*XXX*)
        | 'w' -> `Set (Re.alt [Re.alnum; Re.char '_'])
        | 'W' -> `Set (Re.compl [Re.alnum; Re.char '_'])
        | 's' -> `Set (Re.space)
        | 'S' -> `Set (Re.compl [Re.space])
        | 'd' -> `Set (Re.digit)
        | 'D' -> `Set (Re.compl [Re.digit])
        | 'a'..'z' | 'A'..'Z' ->
          raise Parse_error
        | '0'..'9' ->
          raise Not_supported
        | _ ->
          `Char c
      end else
        `Char c
    (* Skip an inline comment up to and including its closing ')'. *)
    and comment () =
      (* An unterminated comment must not run the cursor past the end. *)
      if eos () then raise Parse_error;
      if accept ')' then Re.epsilon else begin incr i; comment () end
    in
    let res = regexp () in
    (* Leftover input here means an unmatched ')'. *)
    if not (eos ()) then raise Parse_error;
    res
  (** Flags accepted in the [opts] list of [re] and [compile_pat]. *)
  type opt =
    [ `Anchored
    | `Caseless
    | `Dollar_endonly
    | `Dotall
    | `Multiline
    | `Ungreedy ]
  (** [re ?opts s] parses the Perl-style pattern [s] into a [Re] expression.
      The [`Multiline], [`Dollar_endonly], [`Dotall] and [`Ungreedy] flags
      are forwarded to [parse]; [`Anchored] prefixes the result with
      [Re.start] and [`Caseless] wraps it in [Re.no_case].  [opts] defaults
      to the empty list.  Exceptions raised by [parse] propagate. *)
  let re ?(opts = []) s =
    let enabled flag = List.memq flag opts in
    let parsed =
      parse
        (enabled `Multiline) (enabled `Dollar_endonly)
        (enabled `Dotall) (enabled `Ungreedy)
        s
    in
    let anchored =
      if enabled `Anchored then Re.seq [Re.start; parsed] else parsed
    in
    if enabled `Caseless then Re.no_case anchored else anchored
  (** [compile r] forwards [r] to [Re.compile] and returns its result. *)
  let compile r = Re.compile r
  (** [compile_pat ?opts s] parses [s] with [re], passing [opts] through
      (default: the empty list), and compiles the result with [compile]. *)
  let compile_pat ?(opts = []) s =
    let parsed = re ~opts s in
    compile parsed