/interpreter/ghc/libraries/utf8-string/Data/ByteString/UTF8.hs

https://github.com/khskrede/mehh · Haskell · 209 lines · 126 code · 27 blank · 56 comment · 17 complexity · 6bba9fb7e9fb31d897094cb7aaa6bcc8 MD5 · raw file

  1. --
  2. -- |
  3. -- Module : Data.ByteString.UTF8
  4. -- Copyright : (c) Iavor S. Diatchki 2009
  5. -- License : BSD3-style (see LICENSE)
  6. --
  7. -- Maintainer : emertens@galois.com
  8. -- Stability : experimental
  9. -- Portability : portable
  10. --
  11. -- This module provides fast, validated encoding and decoding functions
  12. -- between 'ByteString's and 'String's. It does not exactly match the
  13. -- output of the Codec.Binary.UTF8.String output for invalid encodings
  14. -- as the number of replacement characters is sometimes longer.
  15. module Data.ByteString.UTF8
  16. ( B.ByteString
  17. , decode
  18. , replacement_char
  19. , uncons
  20. , splitAt
  21. , take
  22. , drop
  23. , span
  24. , break
  25. , fromString
  26. , toString
  27. , foldl
  28. , foldr
  29. , length
  30. , lines
  31. , lines'
  32. ) where
  33. import Data.Bits
  34. import Data.Word
  35. import qualified Data.ByteString as B
  36. import Prelude hiding (take,drop,splitAt,span,break,foldr,foldl,length,lines)
  37. import Codec.Binary.UTF8.String(encode)
  38. import Codec.Binary.UTF8.Generic (buncons)
  39. -- | Converts a Haskell string into a UTF8 encoded bytestring.
  40. fromString :: String -> B.ByteString
  41. fromString xs = B.pack (encode xs)
  42. -- | Convert a UTF8 encoded bytestring into a Haskell string.
  43. -- Invalid characters are replaced with '\xFFFD'.
  44. toString :: B.ByteString -> String
  45. toString bs = foldr (:) [] bs
  46. -- | This character is used to mark errors in a UTF8 encoded string.
  47. replacement_char :: Char
  48. replacement_char = '\xfffd'
  49. -- | Try to extract a character from a byte string.
  50. -- Returns 'Nothing' if there are no more bytes in the byte string.
  51. -- Otherwise, it returns a decoded character and the number of
  52. -- bytes used in its representation.
  53. -- Errors are replaced by character '\0xFFFD'.
  54. -- XXX: Should we combine sequences of errors into a single replacement
  55. -- character?
  56. decode :: B.ByteString -> Maybe (Char,Int)
  57. decode bs = do (c,cs) <- buncons bs
  58. return (choose (fromEnum c) cs)
  59. where
  60. choose :: Int -> B.ByteString -> (Char, Int)
  61. choose c cs
  62. | c < 0x80 = (toEnum $ fromEnum c, 1)
  63. | c < 0xc0 = (replacement_char, 1)
  64. | c < 0xe0 = bytes2 (mask c 0x1f) cs
  65. | c < 0xf0 = bytes3 (mask c 0x0f) cs
  66. | c < 0xf8 = bytes4 (mask c 0x07) cs
  67. | otherwise = (replacement_char, 1)
  68. mask :: Int -> Int -> Int
  69. mask c m = fromEnum (c .&. m)
  70. combine :: Int -> Word8 -> Int
  71. combine acc r = shiftL acc 6 .|. fromEnum (r .&. 0x3f)
  72. follower :: Int -> Word8 -> Maybe Int
  73. follower acc r | r .&. 0xc0 == 0x80 = Just (combine acc r)
  74. follower _ _ = Nothing
  75. {-# INLINE get_follower #-}
  76. get_follower :: Int -> B.ByteString -> Maybe (Int, B.ByteString)
  77. get_follower acc cs = do (x,xs) <- buncons cs
  78. acc1 <- follower acc x
  79. return (acc1,xs)
  80. bytes2 :: Int -> B.ByteString -> (Char, Int)
  81. bytes2 c cs = case get_follower c cs of
  82. Just (d, _) | d >= 0x80 -> (toEnum d, 2)
  83. | otherwise -> (replacement_char, 1)
  84. _ -> (replacement_char, 1)
  85. bytes3 :: Int -> B.ByteString -> (Char, Int)
  86. bytes3 c cs =
  87. case get_follower c cs of
  88. Just (d1, cs1) ->
  89. case get_follower d1 cs1 of
  90. Just (d, _) | (d >= 0x800 && d < 0xd800) ||
  91. (d > 0xdfff && d < 0xfffe) -> (toEnum d, 3)
  92. | otherwise -> (replacement_char, 3)
  93. _ -> (replacement_char, 2)
  94. _ -> (replacement_char, 1)
  95. bytes4 :: Int -> B.ByteString -> (Char, Int)
  96. bytes4 c cs =
  97. case get_follower c cs of
  98. Just (d1, cs1) ->
  99. case get_follower d1 cs1 of
  100. Just (d2, cs2) ->
  101. case get_follower d2 cs2 of
  102. Just (d,_) | d >= 0x10000 && d < 0x110000 -> (toEnum d, 4)
  103. | otherwise -> (replacement_char, 4)
  104. _ -> (replacement_char, 3)
  105. _ -> (replacement_char, 2)
  106. _ -> (replacement_char, 1)
  107. -- | Split after a given number of characters.
  108. -- Negative values are treated as if they are 0.
  109. splitAt :: Int -> B.ByteString -> (B.ByteString,B.ByteString)
  110. splitAt x bs = loop 0 x bs
  111. where loop a n _ | n <= 0 = B.splitAt a bs
  112. loop a n bs1 = case decode bs1 of
  113. Just (_,y) -> loop (a+y) (n-1) (B.drop y bs1)
  114. Nothing -> (bs, B.empty)
  115. -- | @take n s@ returns the first @n@ characters of @s@.
  116. -- If @s@ has less then @n@ characters, then we return the whole of @s@.
  117. take :: Int -> B.ByteString -> B.ByteString
  118. take n bs = fst (splitAt n bs)
  119. -- | @drop n s@ returns the @s@ without its first @n@ characters.
  120. -- If @s@ has less then @n@ characters, then we return the an empty string.
  121. drop :: Int -> B.ByteString -> B.ByteString
  122. drop n bs = snd (splitAt n bs)
  123. -- | Split a string into two parts: the first is the longest prefix
  124. -- that contains only characters that satisfy the predicate; the second
  125. -- part is the rest of the string.
  126. -- Invalid characters are passed as '\0xFFFD' to the predicate.
  127. span :: (Char -> Bool) -> B.ByteString -> (B.ByteString, B.ByteString)
  128. span p bs = loop 0 bs
  129. where loop a cs = case decode cs of
  130. Just (c,n) | p c -> loop (a+n) (B.drop n cs)
  131. _ -> B.splitAt a bs
  132. -- | Split a string into two parts: the first is the longest prefix
  133. -- that contains only characters that do not satisfy the predicate; the second
  134. -- part is the rest of the string.
  135. -- Invalid characters are passed as '\0xFFFD' to the predicate.
  136. break :: (Char -> Bool) -> B.ByteString -> (B.ByteString, B.ByteString)
  137. break p bs = span (not . p) bs
  138. -- | Get the first character of a byte string, if any.
  139. -- Malformed characters are replaced by '\0xFFFD'.
  140. uncons :: B.ByteString -> Maybe (Char,B.ByteString)
  141. uncons bs = do (c,n) <- decode bs
  142. return (c, B.drop n bs)
  143. -- | Traverse a bytestring (right biased).
  144. foldr :: (Char -> a -> a) -> a -> B.ByteString -> a
  145. foldr cons nil cs = case uncons cs of
  146. Just (a,as) -> cons a (foldr cons nil as)
  147. Nothing -> nil
  148. -- | Traverse a bytestring (left biased).
  149. -- This fuction is strict in the acumulator.
  150. foldl :: (a -> Char -> a) -> a -> B.ByteString -> a
  151. foldl add acc cs = case uncons cs of
  152. Just (a,as) -> let v = add acc a
  153. in seq v (foldl add v as)
  154. Nothing -> acc
  155. -- | Counts the number of characters encoded in the bytestring.
  156. -- Note that this includes replacment characters.
  157. length :: B.ByteString -> Int
  158. length b = loop 0 b
  159. where loop n xs = case decode xs of
  160. Just (_,m) -> loop (n+1) (B.drop m xs)
  161. Nothing -> n
  162. -- | Split a string into a list of lines.
  163. -- Lines are termianted by '\n' or the end of the string.
  164. -- Empty line may not be terminated by the end of the string.
  165. -- See also 'lines\''.
  166. lines :: B.ByteString -> [B.ByteString]
  167. lines bs | B.null bs = []
  168. lines bs = case B.elemIndex 10 bs of
  169. Just x -> let (xs,ys) = B.splitAt x bs
  170. in xs : lines (B.tail ys)
  171. Nothing -> [bs]
  172. -- | Split a string into a list of lines.
  173. -- Lines are termianted by '\n' or the end of the string.
  174. -- Empty line may not be terminated by the end of the string.
  175. -- This function preserves the terminators.
  176. -- See also 'lines'.
  177. lines' :: B.ByteString -> [B.ByteString]
  178. lines' bs | B.null bs = []
  179. lines' bs = case B.elemIndex 10 bs of
  180. Just x -> let (xs,ys) = B.splitAt (x+1) bs
  181. in xs : lines' ys
  182. Nothing -> [bs]