PageRenderTime 43ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/src/core/extlib/uCharParser.ml

http://github.com/pelzlpj/batteries
OCaml | 157 lines | 93 code | 33 blank | 31 comment | 7 complexity | ecd07522a4bddbe5e1652b25836570c1 MD5 | raw file
Possible License(s): LGPL-2.1
  1. (*
  2. * CharParser - Parsing character strings
  3. * Copyright (C) 2008 David Teller
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2.1 of the License, or (at your option) any later version,
  9. * with the special exception on linking described in file LICENSE.
  10. *
  11. * This library is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with this library; if not, write to the Free Software
  18. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  19. *)
  20. open ExtUChar
  21. open ExtInt
  22. open ExtFloat
  23. open ExtUTF8
  24. open ExtString
  25. open ParserCo
  26. open ExtList
  27. open ExtPrintf
  28. let string_of_uchar c = (UTF8.to_string (UTF8.of_char c))
  29. (** {6 Entry point} *)
  30. type position = CharParser.position = { offset : int; line : int; }
  31. let start_position =
  32. { CharParser.offset = 1;
  33. CharParser.line = 1 }
  34. let advance c p =
  35. if UChar.is_newline c then((*Printf.eprintf "[Have reached line %i]\n%!" (p.line + 1);*) { CharParser.offset = 1; CharParser.line = p.CharParser.line + 1})
  36. else { (p) with CharParser.offset = p.CharParser.offset + 1}
  37. let source_of_enum s = Source.of_enum s start_position advance
  38. let source_of_rope s = source_of_enum (Rope.enum s)
  39. let source_of_utf8 s = source_of_enum (UTF8.enum s)
  40. let parse_rope p s =
  41. run p (source_of_rope s)
  42. let parse_utf8 p s =
  43. run p (source_of_utf8 s)
  44. (** {6 Utilities}*)
  45. let char c = label ("\"" ^ ( string_of_uchar c ) ^ "\"") (exactly c)
  46. let ustring s = label ("\"" ^ ( UTF8.to_string s ) ^ "\"") (
  47. let len = UTF8.length s in
  48. let rec aux i =
  49. if i < len then exactly (UTF8.get s i) >>= fun _ -> aux ( i + 1 )
  50. else return s
  51. in aux 0
  52. )
  53. let string s = label ("\"" ^ s ^ "\"") (
  54. let len = String.length s in
  55. let rec aux i =
  56. if i < len then exactly (UChar.of_char (String.get s i)) >>= fun _ -> aux ( i + 1 )
  57. else return s
  58. in aux 0
  59. )
  60. let rope s = label ("\"" ^ ( UTF8.to_string (Rope.to_ustring s) ) ^ "\"") (
  61. Enum.fold (fun c (acc:(_, _, _) ParserCo.t) -> (exactly c) >>> acc ) (return s) (Rope.backwards s)
  62. )
  63. let case_char c =
  64. let utf8 = UTF8.of_char c in
  65. either [ustring (UTF8.uppercase utf8); ustring (UTF8.lowercase utf8)]
  66. (*That one is somewhat harder as the lower-cased/upper-cased string don't necessarily
  67. have the same length...*)
  68. let case_rope s = label ("case insensitive \"" ^ (UTF8.to_string (Rope.to_ustring s)) ^ "\"") (
  69. let lower_rope = Rope.lowercase s in (*lowercase the original string*)
  70. let pick enum =
  71. match Enum.get enum with
  72. | None -> raise Not_found
  73. | Some x -> x
  74. in
  75. let rec aux enum acc =
  76. if Enum.is_empty enum then return acc
  77. else
  78. ParserCo.any >>= fun c -> (*lowercase the next char*)
  79. let lower_char = UTF8.lowercase (UTF8.of_char c) in (*check that's what follows in the string*)
  80. let char_enum = UTF8.enum lower_char in
  81. match try Some (Enum.for_all (fun c -> pick enum = c) char_enum)
  82. with Not_found -> None
  83. with Some false (*The substring doesn't match. *)
  84. | None -> fail (*We have reached the end of the enumeration but not that of [s]*)
  85. | Some true -> aux enum (Rope.append (Rope.of_ustring lower_char) acc)
  86. (* in aux (Rope.get 0 lower_rope)*)
  87. in aux (Rope.enum lower_rope) s
  88. )
  89. let case_ustring s = (case_rope (Rope.of_ustring s)) >>= fun s' -> return (Rope.to_ustring s')
  90. let case_string s = (case_rope (Rope.of_ustring (UTF8.of_string s))) >>= fun s' -> return (UTF8.to_string (Rope.to_ustring s'))
  91. let whitespace = satisfy UChar.is_whitespace
  92. let uppercase = label "upper case char" (satisfy UChar.is_uppercase)
  93. let lowercase = label "lower case char" (satisfy UChar.is_lowercase)
  94. let digit = label "digit"
  95. ( satisfy (fun c -> match UChar.category c with `Nd -> true | _ -> false) )
  96. let first s = String.get s 0
  97. let not_char c = label ("anything but '" ^ string_of_uchar c ^ "'")
  98. (satisfy (fun x -> x <> c) (*>>=
  99. fun x -> Printf.eprintf "(%c)\n" x; return x*)
  100. )
  101. let none_of l = (*label (
  102. String.of_list (Vect.to_list (Vect.append ']'
  103. (List.fold_left (fun acc x -> Vect.append x acc)
  104. (Vect.of_list (String.to_list "anything but ['"))
  105. l))))*)
  106. (* label (Printf.sprintf2 "anything but [%a]" (List.print *)
  107. label (List.sprint
  108. ~first:"anything but ["
  109. ~sep:"; "
  110. ~last:"]"
  111. (fun out c -> (Printf.fprintf out "'%a'" UChar.print c))
  112. l
  113. )
  114. (none_of l)
  115. let newline = satisfy UChar.is_newline
  116. let hex_uchars =
  117. List.map UChar.of_char
  118. [ '0'; '1'; '2'; '3'; '4'; '5'; '6'; '7'; '8'; '9';
  119. 'a'; 'b'; 'c'; 'd'; 'e'; 'f';
  120. 'A'; 'B'; 'C'; 'D'; 'E'; 'F' ]
  121. let hex = label "hex" (one_of hex_uchars)
  122. (* ( satisfy (fun x -> ( '0' <= x && x <= '9' ) || ('a' <= x && x <= 'f') || ('A' <= x && x <= 'F')))*)
  123. let letter = satisfy (fun c -> match UChar.category c with `Lu | `Ll | `Lt -> true | _ -> false )
  124. let parse p s = run p (source_of_rope s)