/interpreter/ghc/libraries/utf8-string/Codec/Binary/UTF8/Generic.hs

https://github.com/khskrede/mehh · Haskell · 296 lines · 172 code · 35 blank · 89 comment · 18 complexity · 1dd807440a0748e81164e4d2c9465a3f MD5 · raw file

  1. {-# LANGUAGE MultiParamTypeClasses, FunctionalDependencies, FlexibleInstances #-}
  2. --
  3. -- |
  4. -- Module : Codec.Binary.UTF8.Generic
  5. -- Copyright : (c) Iavor S. Diatchki 2009
  6. -- License : BSD3-style (see LICENSE)
  7. --
  8. -- Maintainer : emertens@galois.com
  9. -- Stability : experimental
  10. -- Portability : portable
  11. --
  12. module Codec.Binary.UTF8.Generic
  13. ( UTF8Bytes(..)
  14. , decode
  15. , replacement_char
  16. , uncons
  17. , splitAt
  18. , take
  19. , drop
  20. , span
  21. , break
  22. , fromString
  23. , toString
  24. , foldl
  25. , foldr
  26. , length
  27. , lines
  28. , lines'
  29. ) where
  30. import Data.Bits
  31. import Data.Int
  32. import Data.Word
  33. import qualified Data.ByteString as B
  34. import qualified Data.ByteString.Lazy as L
  35. import qualified Data.List as List
  36. import Prelude hiding (take,drop,splitAt,span,break,foldr,foldl,length,lines,null,tail)
  37. import Codec.Binary.UTF8.String(encode)
  38. #ifdef BYTESTRING_IN_BASE
  39. import Data.ByteString.Base (unsafeHead, unsafeTail)
  40. #endif
  41. class (Num s, Ord s) => UTF8Bytes b s | b -> s where
  42. bsplit :: s -> b -> (b,b)
  43. bdrop :: s -> b -> b
  44. buncons :: b -> Maybe (Word8,b)
  45. elemIndex :: Word8 -> b -> Maybe s
  46. empty :: b
  47. null :: b -> Bool
  48. pack :: [Word8] -> b
  49. tail :: b -> b
  50. instance UTF8Bytes B.ByteString Int where
  51. bsplit = B.splitAt
  52. bdrop = B.drop
  53. buncons = unconsB
  54. elemIndex = B.elemIndex
  55. empty = B.empty
  56. null = B.null
  57. pack = B.pack
  58. tail = B.tail
  59. instance UTF8Bytes L.ByteString Int64 where
  60. bsplit = L.splitAt
  61. bdrop = L.drop
  62. buncons = unconsL
  63. elemIndex = L.elemIndex
  64. empty = L.empty
  65. null = L.null
  66. pack = L.pack
  67. tail = L.tail
  68. instance UTF8Bytes [Word8] Int where
  69. bsplit = List.splitAt
  70. bdrop = List.drop
  71. buncons (x:xs) = Just (x,xs)
  72. buncons [] = Nothing
  73. elemIndex x xs = List.elemIndex (toEnum (fromEnum x)) xs
  74. empty = []
  75. null = List.null
  76. pack = id
  77. tail = List.tail
  78. -- | Converts a Haskell string into a UTF8 encoded bytestring.
  79. {-# SPECIALIZE fromString :: String -> B.ByteString #-}
  80. {-# SPECIALIZE fromString :: String -> L.ByteString #-}
  81. {-# SPECIALIZE fromString :: String -> [Word8] #-}
  82. fromString :: UTF8Bytes b s => String -> b
  83. fromString xs = pack (encode xs)
  84. -- | Convert a UTF8 encoded bytestring into a Haskell string.
  85. -- Invalid characters are replaced with '\xFFFD'.
  86. {-# SPECIALIZE toString :: B.ByteString -> String #-}
  87. {-# SPECIALIZE toString :: L.ByteString -> String #-}
  88. {-# SPECIALIZE toString :: [Word8] -> String #-}
  89. toString :: UTF8Bytes b s => b -> String
  90. toString bs = foldr (:) [] bs
  91. -- | This character is used to mark errors in a UTF8 encoded string.
  92. replacement_char :: Char
  93. replacement_char = '\xfffd'
  94. -- | Try to extract a character from a byte string.
  95. -- Returns 'Nothing' if there are no more bytes in the byte string.
  96. -- Otherwise, it returns a decoded character and the number of
  97. -- bytes used in its representation.
  98. -- Errors are replaced by character '\0xFFFD'.
  99. -- XXX: Should we combine sequences of errors into a single replacement
  100. -- character?
  101. {-# SPECIALIZE decode :: B.ByteString -> Maybe (Char,Int) #-}
  102. {-# SPECIALIZE decode :: L.ByteString -> Maybe (Char,Int64) #-}
  103. {-# SPECIALIZE decode :: [Word8] -> Maybe (Char,Int) #-}
  104. decode :: UTF8Bytes b s => b -> Maybe (Char,s)
  105. decode bs = do (c,cs) <- buncons bs
  106. return (choose (fromEnum c) cs)
  107. where
  108. choose c cs
  109. | c < 0x80 = (toEnum $ fromEnum c, 1)
  110. | c < 0xc0 = (replacement_char, 1)
  111. | c < 0xe0 = bytes2 (mask c 0x1f) cs
  112. | c < 0xf0 = bytes3 (mask c 0x0f) cs
  113. | c < 0xf8 = bytes4 (mask c 0x07) cs
  114. | otherwise = (replacement_char, 1)
  115. mask c m = fromEnum (c .&. m)
  116. combine acc r = shiftL acc 6 .|. fromEnum (r .&. 0x3f)
  117. follower acc r | r .&. 0xc0 == 0x80 = Just (combine acc r)
  118. follower _ _ = Nothing
  119. {-# INLINE get_follower #-}
  120. get_follower acc cs = do (x,xs) <- buncons cs
  121. acc1 <- follower acc x
  122. return (acc1,xs)
  123. bytes2 c cs = case get_follower c cs of
  124. Just (d, _) | d >= 0x80 -> (toEnum d, 2)
  125. | otherwise -> (replacement_char, 1)
  126. _ -> (replacement_char, 1)
  127. bytes3 c cs =
  128. case get_follower c cs of
  129. Just (d1, cs1) ->
  130. case get_follower d1 cs1 of
  131. Just (d, _) | (d >= 0x800 && d < 0xd800) ||
  132. (d > 0xdfff && d < 0xfffe) -> (toEnum d, 3)
  133. | otherwise -> (replacement_char, 3)
  134. _ -> (replacement_char, 2)
  135. _ -> (replacement_char, 1)
  136. bytes4 c cs =
  137. case get_follower c cs of
  138. Just (d1, cs1) ->
  139. case get_follower d1 cs1 of
  140. Just (d2, cs2) ->
  141. case get_follower d2 cs2 of
  142. Just (d,_) | d >= 0x10000 && d < 0x110000 -> (toEnum d, 4)
  143. | otherwise -> (replacement_char, 4)
  144. _ -> (replacement_char, 3)
  145. _ -> (replacement_char, 2)
  146. _ -> (replacement_char, 1)
  147. -- | Split after a given number of characters.
  148. -- Negative values are treated as if they are 0.
  149. {-# SPECIALIZE splitAt :: Int -> B.ByteString -> (B.ByteString,B.ByteString) #-}
  150. {-# SPECIALIZE splitAt :: Int64 -> L.ByteString -> (L.ByteString,L.ByteString) #-}
  151. {-# SPECIALIZE splitAt :: Int -> [Word8] -> ([Word8],[Word8]) #-}
  152. splitAt :: UTF8Bytes b s => s -> b -> (b,b)
  153. splitAt x bs = loop 0 x bs
  154. where loop a n _ | n <= 0 = bsplit a bs
  155. loop a n bs1 = case decode bs1 of
  156. Just (_,y) -> loop (a+y) (n-1) (bdrop y bs1)
  157. Nothing -> (bs, empty)
  158. -- | @take n s@ returns the first @n@ characters of @s@.
  159. -- If @s@ has less then @n@ characters, then we return the whole of @s@.
  160. {-# INLINE take #-}
  161. take :: UTF8Bytes b s => s -> b -> b
  162. take n bs = fst (splitAt n bs)
  163. -- | @drop n s@ returns the @s@ without its first @n@ characters.
  164. -- If @s@ has less then @n@ characters, then we return the an empty string.
  165. {-# INLINE drop #-}
  166. drop :: UTF8Bytes b s => s -> b -> b
  167. drop n bs = snd (splitAt n bs)
  168. -- | Split a string into two parts: the first is the longest prefix
  169. -- that contains only characters that satisfy the predicate; the second
  170. -- part is the rest of the string.
  171. -- Invalid characters are passed as '\0xFFFD' to the predicate.
  172. {-# SPECIALIZE span :: (Char -> Bool) -> B.ByteString -> (B.ByteString,B.ByteString) #-}
  173. {-# SPECIALIZE span :: (Char -> Bool) -> L.ByteString -> (L.ByteString,L.ByteString) #-}
  174. {-# SPECIALIZE span :: (Char -> Bool) -> [Word8] -> ([Word8],[Word8]) #-}
  175. span :: UTF8Bytes b s => (Char -> Bool) -> b -> (b,b)
  176. span p bs = loop 0 bs
  177. where loop a cs = case decode cs of
  178. Just (c,n) | p c -> loop (a+n) (bdrop n cs)
  179. _ -> bsplit a bs
  180. -- | Split a string into two parts: the first is the longest prefix
  181. -- that contains only characters that do not satisfy the predicate; the second
  182. -- part is the rest of the string.
  183. -- Invalid characters are passed as '\0xFFFD' to the predicate.
  184. {-# INLINE break #-}
  185. break :: UTF8Bytes b s => (Char -> Bool) -> b -> (b,b)
  186. break p bs = span (not . p) bs
  187. -- | Get the first character of a byte string, if any.
  188. -- Malformed characters are replaced by '\0xFFFD'.
  189. {-# INLINE uncons #-}
  190. uncons :: UTF8Bytes b s => b -> Maybe (Char,b)
  191. uncons bs = do (c,n) <- decode bs
  192. return (c, bdrop n bs)
  193. -- | Traverse a bytestring (right biased).
  194. {-# SPECIALIZE foldr :: (Char -> a -> a) -> a -> B.ByteString -> a #-}
  195. {-# SPECIALIZE foldr :: (Char -> a -> a) -> a -> L.ByteString -> a #-}
  196. {-# SPECIALIZE foldr :: (Char -> a -> a) -> a -> [Word8] -> a #-}
  197. foldr :: UTF8Bytes b s => (Char -> a -> a) -> a -> b -> a
  198. foldr cons nil cs = case uncons cs of
  199. Just (a,as) -> cons a (foldr cons nil as)
  200. Nothing -> nil
  201. -- | Traverse a bytestring (left biased).
  202. -- This fuction is strict in the acumulator.
  203. {-# SPECIALIZE foldl :: (a -> Char -> a) -> a -> B.ByteString -> a #-}
  204. {-# SPECIALIZE foldl :: (a -> Char -> a) -> a -> L.ByteString -> a #-}
  205. {-# SPECIALIZE foldl :: (a -> Char -> a) -> a -> [Word8] -> a #-}
  206. foldl :: UTF8Bytes b s => (a -> Char -> a) -> a -> b -> a
  207. foldl add acc cs = case uncons cs of
  208. Just (a,as) -> let v = add acc a
  209. in seq v (foldl add v as)
  210. Nothing -> acc
  211. -- | Counts the number of characters encoded in the bytestring.
  212. -- Note that this includes replacment characters.
  213. {-# SPECIALIZE length :: B.ByteString -> Int #-}
  214. {-# SPECIALIZE length :: L.ByteString -> Int64 #-}
  215. {-# SPECIALIZE length :: [Word8] -> Int #-}
  216. length :: UTF8Bytes b s => b -> s
  217. length b = loop 0 b
  218. where loop n xs = case decode xs of
  219. Just (_,m) -> loop (n+1) (bdrop m xs)
  220. Nothing -> n
  221. -- | Split a string into a list of lines.
  222. -- Lines are termianted by '\n' or the end of the string.
  223. -- Empty line may not be terminated by the end of the string.
  224. -- See also 'lines\''.
  225. {-# SPECIALIZE lines :: B.ByteString -> [B.ByteString] #-}
  226. {-# SPECIALIZE lines :: L.ByteString -> [L.ByteString] #-}
  227. {-# SPECIALIZE lines :: [Word8] -> [[Word8]] #-}
  228. lines :: UTF8Bytes b s => b -> [b]
  229. lines bs | null bs = []
  230. lines bs = case elemIndex 10 bs of
  231. Just x -> let (xs,ys) = bsplit x bs
  232. in xs : lines (tail ys)
  233. Nothing -> [bs]
  234. -- | Split a string into a list of lines.
  235. -- Lines are termianted by '\n' or the end of the string.
  236. -- Empty line may not be terminated by the end of the string.
  237. -- This function preserves the terminators.
  238. -- See also 'lines'.
  239. {-# SPECIALIZE lines' :: B.ByteString -> [B.ByteString] #-}
  240. {-# SPECIALIZE lines' :: L.ByteString -> [L.ByteString] #-}
  241. {-# SPECIALIZE lines' :: [Word8] -> [[Word8]] #-}
  242. lines' :: UTF8Bytes b s => b -> [b]
  243. lines' bs | null bs = []
  244. lines' bs = case elemIndex 10 bs of
  245. Just x -> let (xs,ys) = bsplit (x+1) bs
  246. in xs : lines' ys
  247. Nothing -> [bs]
  248. -----------
  249. -- Compatibility functions for base-2
  250. unconsB :: B.ByteString -> Maybe (Word8,B.ByteString)
  251. unconsL :: L.ByteString -> Maybe (Word8,L.ByteString)
  252. #ifdef BYTESTRING_IN_BASE
  253. unconsB bs | B.null bs = Nothing
  254. | otherwise = Just (unsafeHead bs, unsafeTail bs)
  255. unconsL bs = case L.toChunks bs of
  256. (x:xs) | not (B.null x) -> Just (unsafeHead x, L.fromChunks (unsafeTail x:xs))
  257. _ -> Nothing
  258. #else
  259. unconsB = B.uncons
  260. unconsL = L.uncons
  261. #endif