uCharParser.ml | searchcode

/src/core/extlib/uCharParser.ml

http://github.com/pelzlpj/batteries · OCaml · 157 lines · 93 code · 33 blank · 31 comment · 7 complexity · ecd07522a4bddbe5e1652b25836570c1 MD5 · raw file

(* 
 * CharParser - Parsing character strings
 * Copyright (C) 2008 David Teller
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version,
 * with the special exception on linking described in file LICENSE.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *)



open ExtUChar
open ExtInt
open ExtFloat
open ExtUTF8
open ExtString
open ParserCo
open ExtList
open ExtPrintf

let string_of_uchar c = (UTF8.to_string (UTF8.of_char c))

(** {6 Entry point} *)
type position = CharParser.position = { offset : int; line : int; }

let start_position =
{ CharParser.offset = 1;
  CharParser.line   = 1 }

let advance c p =
  if UChar.is_newline c then((*Printf.eprintf "[Have reached line %i]\n%!" (p.line + 1);*) { CharParser.offset = 1; CharParser.line = p.CharParser.line + 1})
  else                      { (p) with CharParser.offset = p.CharParser.offset + 1}

let source_of_enum   s = Source.of_enum s start_position advance

let source_of_rope   s = source_of_enum (Rope.enum s)

let source_of_utf8   s = source_of_enum (UTF8.enum s)

let parse_rope p s =
  run p (source_of_rope s)

let parse_utf8 p s =
  run p (source_of_utf8 s)



(** {6 Utilities}*)
let char   c = label ("\"" ^ ( string_of_uchar c ) ^ "\"") (exactly c)

let ustring s = label ("\"" ^ ( UTF8.to_string s ) ^ "\"") (
  let len = UTF8.length s in
  let rec aux i = 
    if i < len then exactly (UTF8.get s i) >>= fun _ -> aux ( i + 1 )
    else return s
  in aux 0
)

let string s = label ("\"" ^ s ^ "\"") (
  let len = String.length s in
  let rec aux i = 
    if i < len then exactly (UChar.of_char (String.get s i)) >>= fun _ -> aux ( i + 1 )
    else return s
  in aux 0
)

let rope s = label ("\"" ^ ( UTF8.to_string (Rope.to_ustring s) ) ^ "\"") (
  Enum.fold (fun c (acc:(_, _, _) ParserCo.t) -> (exactly c) >>> acc ) (return s) (Rope.backwards s)
)

let case_char c =
  let utf8 = UTF8.of_char c in
  either [ustring (UTF8.uppercase utf8); ustring (UTF8.lowercase utf8)]


(*That one is somewhat harder as the lower-cased/upper-cased string don't necessarily
  have the same length...*)
let case_rope s = label ("case insensitive \"" ^ (UTF8.to_string (Rope.to_ustring s)) ^ "\"") (
  let lower_rope  = Rope.lowercase s       in                          (*lowercase the original string*)
  let pick enum   =
    match Enum.get enum with
      | None   -> raise Not_found
      | Some x -> x
  in
  let rec aux enum acc =
    if Enum.is_empty enum then return acc
    else
      ParserCo.any >>= fun c ->                                         (*lowercase the next char*)
	let lower_char = UTF8.lowercase (UTF8.of_char c) in             (*check that's what follows in the string*)
	let char_enum  = UTF8.enum lower_char            in
	match try Some (Enum.for_all (fun c -> pick enum = c) char_enum)
	      with Not_found -> None
	with Some false         (*The substring doesn't match.                                  *)
	  |  None       -> fail (*We have reached the end of the enumeration but not that of [s]*)
	  |  Some true  -> aux enum (Rope.append (Rope.of_ustring lower_char) acc)
(*  in aux (Rope.get 0 lower_rope)*)
  in aux (Rope.enum lower_rope) s
)

let case_ustring s = (case_rope (Rope.of_ustring s)) >>= fun s' -> return (Rope.to_ustring s')

let case_string s = (case_rope (Rope.of_ustring (UTF8.of_string s))) >>= fun s' -> return (UTF8.to_string (Rope.to_ustring s'))

let whitespace = satisfy UChar.is_whitespace

let uppercase = label "upper case char" (satisfy UChar.is_uppercase)
let lowercase = label "lower case char" (satisfy UChar.is_lowercase)
let digit = label "digit"
  ( satisfy (fun c -> match UChar.category c with `Nd -> true | _ -> false) )

let first s = String.get s 0

let not_char c = label ("anything but '" ^ string_of_uchar c ^ "'")
  (satisfy (fun x -> x <> c) (*>>=
     fun x -> Printf.eprintf "(%c)\n" x; return x*)
)

let none_of l = (*label (
  String.of_list (Vect.to_list (Vect.append ']'
    (List.fold_left (fun acc x -> Vect.append x acc)
       (Vect.of_list (String.to_list "anything but ['"))
       l))))*)
(*  label (Printf.sprintf2 "anything but [%a]" (List.print *)
  label (List.sprint 
	   ~first:"anything but ["
	   ~sep:"; "
	   ~last:"]" 
	   (fun out c -> (Printf.fprintf out "'%a'" UChar.print c))
	   l
	)
	  (none_of l)

let newline = satisfy UChar.is_newline

let hex_uchars =
  List.map UChar.of_char
  [ '0'; '1'; '2'; '3'; '4'; '5'; '6'; '7'; '8'; '9';
    'a'; 'b'; 'c'; 'd'; 'e'; 'f';
    'A'; 'B'; 'C'; 'D'; 'E'; 'F' ]

let hex = label "hex" (one_of hex_uchars)
(*  ( satisfy (fun x -> ( '0' <= x && x <= '9' ) || ('a' <= x && x <= 'f') || ('A' <= x && x <= 'F')))*)

let letter = satisfy (fun c -> match UChar.category c with `Lu | `Ll | `Lt -> true | _ -> false )

let parse p s = run p (source_of_rope s)