PageRenderTime 57ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/Data/Attoparsec/Text/Internal.hs

http://github.com/bos/attoparsec
Haskell | 549 lines | 305 code | 58 blank | 186 comment | 30 complexity | 24744606d403bbb8ee3575ce8ec9e394 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. {-# LANGUAGE BangPatterns, FlexibleInstances, GADTs, OverloadedStrings,
  2. Rank2Types, RecordWildCards, TypeFamilies, TypeSynonymInstances #-}
  3. {-# OPTIONS_GHC -fno-warn-orphans #-}
  4. -- |
  5. -- Module : Data.Attoparsec.Text.Internal
  6. -- Copyright : Bryan O'Sullivan 2007-2015
  7. -- License : BSD3
  8. --
  9. -- Maintainer : bos@serpentine.com
  10. -- Stability : experimental
  11. -- Portability : unknown
  12. --
  13. -- Simple, efficient parser combinators for 'Text' strings, loosely
  14. -- based on the Parsec library.
  15. module Data.Attoparsec.Text.Internal
  16. (
  17. -- * Parser types
  18. Parser
  19. , Result
  20. -- * Running parsers
  21. , parse
  22. , parseOnly
  23. -- * Combinators
  24. , module Data.Attoparsec.Combinator
  25. -- * Parsing individual characters
  26. , satisfy
  27. , satisfyWith
  28. , anyChar
  29. , skip
  30. , char
  31. , notChar
  32. -- ** Lookahead
  33. , peekChar
  34. , peekChar'
  35. -- ** Character classes
  36. , inClass
  37. , notInClass
  38. -- * Efficient string handling
  39. , skipWhile
  40. , string
  41. , stringCI
  42. , asciiCI
  43. , take
  44. , scan
  45. , runScanner
  46. , takeWhile
  47. , takeWhile1
  48. , takeTill
  49. -- ** Consume all remaining input
  50. , takeText
  51. , takeLazyText
  52. -- * Utilities
  53. , endOfLine
  54. , endOfInput
  55. , match
  56. , atEnd
  57. ) where
  58. import Control.Applicative ((<|>), (<$>), pure, (*>))
  59. import Control.Monad (when)
  60. import Data.Attoparsec.Combinator ((<?>))
  61. import Data.Attoparsec.Internal
  62. import Data.Attoparsec.Internal.Types hiding (Parser, Failure, Success)
  63. import qualified Data.Attoparsec.Text.Buffer as Buf
  64. import Data.Attoparsec.Text.Buffer (Buffer, buffer)
  65. import Data.Char (isAsciiUpper, isAsciiLower, toUpper, toLower)
  66. import Data.List (intercalate)
  67. import Data.String (IsString(..))
  68. import Data.Text.Internal (Text(..))
  69. import Prelude hiding (getChar, succ, take, takeWhile)
  70. import qualified Data.Attoparsec.Internal.Types as T
  71. import qualified Data.Attoparsec.Text.FastSet as Set
  72. import qualified Data.Text as T
  73. import qualified Data.Text.Lazy as L
  74. import qualified Data.Text.Unsafe as T
  75. type Parser = T.Parser Text
  76. type Result = IResult Text
  77. type Failure r = T.Failure Text Buffer r
  78. type Success a r = T.Success Text Buffer a r
  79. instance (a ~ Text) => IsString (Parser a) where
  80. fromString = string . T.pack
  81. -- | The parser @satisfy p@ succeeds for any character for which the
  82. -- predicate @p@ returns 'True'. Returns the character that is
  83. -- actually parsed.
  84. --
  85. -- >digit = satisfy isDigit
  86. -- > where isDigit c = c >= '0' && c <= '9'
  87. satisfy :: (Char -> Bool) -> Parser Char
  88. satisfy p = do
  89. (k,c) <- ensure 1
  90. let !h = T.unsafeHead c
  91. if p h
  92. then advance k >> return h
  93. else fail "satisfy"
  94. {-# INLINE satisfy #-}
  95. -- | The parser @skip p@ succeeds for any character for which the
  96. -- predicate @p@ returns 'True'.
  97. --
  98. -- >skipDigit = skip isDigit
  99. -- > where isDigit c = c >= '0' && c <= '9'
  100. skip :: (Char -> Bool) -> Parser ()
  101. skip p = do
  102. (k,s) <- ensure 1
  103. if p (T.unsafeHead s)
  104. then advance k
  105. else fail "skip"
  106. -- | The parser @satisfyWith f p@ transforms a character, and succeeds
  107. -- if the predicate @p@ returns 'True' on the transformed value. The
  108. -- parser returns the transformed character that was parsed.
  109. satisfyWith :: (Char -> a) -> (a -> Bool) -> Parser a
  110. satisfyWith f p = do
  111. (k,s) <- ensure 1
  112. let c = f $! T.unsafeHead s
  113. if p c
  114. then advance k >> return c
  115. else fail "satisfyWith"
  116. {-# INLINE satisfyWith #-}
  117. -- | Consume @n@ characters of input, but succeed only if the
  118. -- predicate returns 'True'.
  119. takeWith :: Int -> (Text -> Bool) -> Parser Text
  120. takeWith n p = do
  121. (k,s) <- ensure n
  122. if p s
  123. then advance k >> return s
  124. else fail "takeWith"
  125. -- | Consume exactly @n@ characters of input.
  126. take :: Int -> Parser Text
  127. take n = takeWith (max n 0) (const True)
  128. {-# INLINE take #-}
  129. -- | @string s@ parses a sequence of characters that identically match
  130. -- @s@. Returns the parsed string (i.e. @s@). This parser consumes no
  131. -- input if it fails (even if a partial match).
  132. --
  133. -- /Note/: The behaviour of this parser is different to that of the
  134. -- similarly-named parser in Parsec, as this one is all-or-nothing.
  135. -- To illustrate the difference, the following parser will fail under
  136. -- Parsec given an input of @\"for\"@:
  137. --
  138. -- >string "foo" <|> string "for"
  139. --
  140. -- The reason for its failure is that the first branch is a
  141. -- partial match, and will consume the letters @\'f\'@ and @\'o\'@
  142. -- before failing. In attoparsec, the above parser will /succeed/ on
  143. -- that input, because the failed first branch will consume nothing.
  144. string :: Text -> Parser Text
  145. string s = string_ (stringSuspended id) id s
  146. {-# INLINE string #-}
  147. string_ :: (forall r. Text -> Text -> Buffer -> Pos -> More
  148. -> Failure r -> Success Text r -> Result r)
  149. -> (Text -> Text)
  150. -> Text -> Parser Text
  151. string_ suspended f s0 = T.Parser $ \t pos more lose succ ->
  152. let s = f s0
  153. ft = f (Buf.unbufferAt (fromPos pos) t)
  154. in case T.commonPrefixes s ft of
  155. Nothing
  156. | T.null s -> succ t pos more T.empty
  157. | T.null ft -> suspended s s t pos more lose succ
  158. | otherwise -> lose t pos more [] "string"
  159. Just (pfx,ssfx,tsfx)
  160. | T.null ssfx -> let l = Pos (T.lengthWord16 pfx)
  161. in succ t (pos + l) more (substring pos l t)
  162. | not (T.null tsfx) -> lose t pos more [] "string"
  163. | otherwise -> suspended s ssfx t pos more lose succ
  164. {-# INLINE string_ #-}
  165. stringSuspended :: (Text -> Text)
  166. -> Text -> Text -> Buffer -> Pos -> More
  167. -> Failure r
  168. -> Success Text r
  169. -> Result r
  170. stringSuspended f s000 s0 t0 pos0 more0 lose0 succ0 =
  171. runParser (demandInput_ >>= go) t0 pos0 more0 lose0 succ0
  172. where
  173. go s' = T.Parser $ \t pos more lose succ ->
  174. let s = f s'
  175. in case T.commonPrefixes s0 s of
  176. Nothing -> lose t pos more [] "string"
  177. Just (_pfx,ssfx,tsfx)
  178. | T.null ssfx -> let l = Pos (T.lengthWord16 s000)
  179. in succ t (pos + l) more (substring pos l t)
  180. | T.null tsfx -> stringSuspended f s000 ssfx t pos more lose succ
  181. | otherwise -> lose t pos more [] "string"
  182. -- | Satisfy a literal string, ignoring case.
  183. --
  184. -- Note: this function is currently quite inefficient. Unicode case
  185. -- folding can change the length of a string (\"&#223;\" becomes
  186. -- "ss"), which makes a simple, efficient implementation tricky. We
  187. -- have (for now) chosen simplicity over efficiency.
  188. stringCI :: Text -> Parser Text
  189. stringCI s = go 0
  190. where
  191. go !n
  192. | n > T.length fs = fail "stringCI"
  193. | otherwise = do
  194. (k,t) <- ensure n
  195. if T.toCaseFold t == fs
  196. then advance k >> return t
  197. else go (n+1)
  198. fs = T.toCaseFold s
  199. {-# INLINE stringCI #-}
  200. {-# DEPRECATED stringCI "this is very inefficient, use asciiCI instead" #-}
  201. -- | Satisfy a literal string, ignoring case for characters in the ASCII range.
  202. asciiCI :: Text -> Parser Text
  203. asciiCI s = fmap fst $ match $ T.foldr ((*>) . asciiCharCI) (pure ()) s
  204. {-# INLINE asciiCI #-}
  205. asciiCharCI :: Char -> Parser Char
  206. asciiCharCI c
  207. | isAsciiUpper c = char c <|> char (toLower c)
  208. | isAsciiLower c = char c <|> char (toUpper c)
  209. | otherwise = char c
  210. {-# INLINE asciiCharCI #-}
  211. -- | Skip past input for as long as the predicate returns 'True'.
  212. skipWhile :: (Char -> Bool) -> Parser ()
  213. skipWhile p = go
  214. where
  215. go = do
  216. t <- T.takeWhile p <$> get
  217. continue <- inputSpansChunks (size t)
  218. when continue go
  219. {-# INLINE skipWhile #-}
  220. -- | Consume input as long as the predicate returns 'False'
  221. -- (i.e. until it returns 'True'), and return the consumed input.
  222. --
  223. -- This parser does not fail. It will return an empty string if the
  224. -- predicate returns 'True' on the first character of input.
  225. --
  226. -- /Note/: Because this parser does not fail, do not use it with
  227. -- combinators such as 'Control.Applicative.many', because such
  228. -- parsers loop until a failure occurs. Careless use will thus result
  229. -- in an infinite loop.
  230. takeTill :: (Char -> Bool) -> Parser Text
  231. takeTill p = takeWhile (not . p)
  232. {-# INLINE takeTill #-}
  233. -- | Consume input as long as the predicate returns 'True', and return
  234. -- the consumed input.
  235. --
  236. -- This parser does not fail. It will return an empty string if the
  237. -- predicate returns 'False' on the first character of input.
  238. --
  239. -- /Note/: Because this parser does not fail, do not use it with
  240. -- combinators such as 'Control.Applicative.many', because such
  241. -- parsers loop until a failure occurs. Careless use will thus result
  242. -- in an infinite loop.
  243. takeWhile :: (Char -> Bool) -> Parser Text
  244. takeWhile p = do
  245. h <- T.takeWhile p <$> get
  246. continue <- inputSpansChunks (size h)
  247. -- only use slow concat path if necessary
  248. if continue
  249. then takeWhileAcc p [h]
  250. else return h
  251. {-# INLINE takeWhile #-}
  252. takeWhileAcc :: (Char -> Bool) -> [Text] -> Parser Text
  253. takeWhileAcc p = go
  254. where
  255. go acc = do
  256. h <- T.takeWhile p <$> get
  257. continue <- inputSpansChunks (size h)
  258. if continue
  259. then go (h:acc)
  260. else return $ concatReverse (h:acc)
  261. {-# INLINE takeWhileAcc #-}
  262. takeRest :: Parser [Text]
  263. takeRest = go []
  264. where
  265. go acc = do
  266. input <- wantInput
  267. if input
  268. then do
  269. s <- get
  270. advance (size s)
  271. go (s:acc)
  272. else return (reverse acc)
  273. -- | Consume all remaining input and return it as a single string.
  274. takeText :: Parser Text
  275. takeText = T.concat `fmap` takeRest
  276. -- | Consume all remaining input and return it as a single string.
  277. takeLazyText :: Parser L.Text
  278. takeLazyText = L.fromChunks `fmap` takeRest
  279. data Scan s = Continue s
  280. | Finished s {-# UNPACK #-} !Int Text
  281. scan_ :: (s -> [Text] -> Parser r) -> s -> (s -> Char -> Maybe s) -> Parser r
  282. scan_ f s0 p = go [] s0
  283. where
  284. scanner s !n t =
  285. case T.uncons t of
  286. Just (c,t') -> case p s c of
  287. Just s' -> scanner s' (n+1) t'
  288. Nothing -> Finished s n t
  289. Nothing -> Continue s
  290. go acc s = do
  291. input <- get
  292. case scanner s 0 input of
  293. Continue s' -> do continue <- inputSpansChunks (size input)
  294. if continue
  295. then go (input : acc) s'
  296. else f s' (input : acc)
  297. Finished s' n t -> do advance (size input - size t)
  298. f s' (T.take n input : acc)
  299. {-# INLINE scan_ #-}
  300. -- | A stateful scanner. The predicate consumes and transforms a
  301. -- state argument, and each transformed state is passed to successive
  302. -- invocations of the predicate on each character of the input until one
  303. -- returns 'Nothing' or the input ends.
  304. --
  305. -- This parser does not fail. It will return an empty string if the
  306. -- predicate returns 'Nothing' on the first character of input.
  307. --
  308. -- /Note/: Because this parser does not fail, do not use it with
  309. -- combinators such as 'Control.Applicative.many', because such
  310. -- parsers loop until a failure occurs. Careless use will thus result
  311. -- in an infinite loop.
  312. scan :: s -> (s -> Char -> Maybe s) -> Parser Text
  313. scan = scan_ $ \_ chunks -> return $! concatReverse chunks
  314. {-# INLINE scan #-}
  315. -- | Like 'scan', but generalized to return the final state of the
  316. -- scanner.
  317. runScanner :: s -> (s -> Char -> Maybe s) -> Parser (Text, s)
  318. runScanner = scan_ $ \s xs -> let !sx = concatReverse xs in return (sx, s)
  319. {-# INLINE runScanner #-}
  320. -- | Consume input as long as the predicate returns 'True', and return
  321. -- the consumed input.
  322. --
  323. -- This parser requires the predicate to succeed on at least one
  324. -- character of input: it will fail if the predicate never returns
  325. -- 'True' or if there is no input left.
  326. takeWhile1 :: (Char -> Bool) -> Parser Text
  327. takeWhile1 p = do
  328. (`when` demandInput) =<< endOfChunk
  329. h <- T.takeWhile p <$> get
  330. let size' = size h
  331. when (size' == 0) $ fail "takeWhile1"
  332. advance size'
  333. eoc <- endOfChunk
  334. if eoc
  335. then takeWhileAcc p [h]
  336. else return h
  337. {-# INLINE takeWhile1 #-}
  338. -- | Match any character in a set.
  339. --
  340. -- >vowel = inClass "aeiou"
  341. --
  342. -- Range notation is supported.
  343. --
  344. -- >halfAlphabet = inClass "a-nA-N"
  345. --
  346. -- To add a literal @\'-\'@ to a set, place it at the beginning or end
  347. -- of the string.
  348. inClass :: String -> Char -> Bool
  349. inClass s = (`Set.member` mySet)
  350. where mySet = Set.charClass s
  351. {-# NOINLINE mySet #-}
  352. {-# INLINE inClass #-}
  353. -- | Match any character not in a set.
  354. notInClass :: String -> Char -> Bool
  355. notInClass s = not . inClass s
  356. {-# INLINE notInClass #-}
  357. -- | Match any character.
  358. anyChar :: Parser Char
  359. anyChar = satisfy $ const True
  360. {-# INLINE anyChar #-}
  361. -- | Match a specific character.
  362. char :: Char -> Parser Char
  363. char c = satisfy (== c) <?> show c
  364. {-# INLINE char #-}
  365. -- | Match any character except the given one.
  366. notChar :: Char -> Parser Char
  367. notChar c = satisfy (/= c) <?> "not " ++ show c
  368. {-# INLINE notChar #-}
  369. -- | Match any character, to perform lookahead. Returns 'Nothing' if
  370. -- end of input has been reached. Does not consume any input.
  371. --
  372. -- /Note/: Because this parser does not fail, do not use it with
  373. -- combinators such as 'Control.Applicative.many', because such
  374. -- parsers loop until a failure occurs. Careless use will thus result
  375. -- in an infinite loop.
  376. peekChar :: Parser (Maybe Char)
  377. peekChar = T.Parser $ \t pos more _lose succ ->
  378. case () of
  379. _| pos < lengthOf t ->
  380. let T.Iter !c _ = Buf.iter t (fromPos pos)
  381. in succ t pos more (Just c)
  382. | more == Complete ->
  383. succ t pos more Nothing
  384. | otherwise ->
  385. let succ' t' pos' more' =
  386. let T.Iter !c _ = Buf.iter t' (fromPos pos')
  387. in succ t' pos' more' (Just c)
  388. lose' t' pos' more' = succ t' pos' more' Nothing
  389. in prompt t pos more lose' succ'
  390. {-# INLINE peekChar #-}
  391. -- | Match any character, to perform lookahead. Does not consume any
  392. -- input, but will fail if end of input has been reached.
  393. peekChar' :: Parser Char
  394. peekChar' = do
  395. (_,s) <- ensure 1
  396. return $! T.unsafeHead s
  397. {-# INLINE peekChar' #-}
  398. -- | Match either a single newline character @\'\\n\'@, or a carriage
  399. -- return followed by a newline character @\"\\r\\n\"@.
  400. endOfLine :: Parser ()
  401. endOfLine = (char '\n' >> return ()) <|> (string "\r\n" >> return ())
  402. -- | Terminal failure continuation.
  403. failK :: Failure a
  404. failK t (Pos pos) _more stack msg = Fail (Buf.dropWord16 pos t) stack msg
  405. {-# INLINE failK #-}
  406. -- | Terminal success continuation.
  407. successK :: Success a a
  408. successK t (Pos pos) _more a = Done (Buf.dropWord16 pos t) a
  409. {-# INLINE successK #-}
  410. -- | Run a parser.
  411. parse :: Parser a -> Text -> Result a
  412. parse m s = runParser m (buffer s) 0 Incomplete failK successK
  413. {-# INLINE parse #-}
  414. -- | Run a parser that cannot be resupplied via a 'Partial' result.
  415. --
  416. -- This function does not force a parser to consume all of its input.
  417. -- Instead, any residual input will be discarded. To force a parser
  418. -- to consume all of its input, use something like this:
  419. --
  420. -- @
  421. --'parseOnly' (myParser 'Control.Applicative.<*' 'endOfInput')
  422. -- @
  423. parseOnly :: Parser a -> Text -> Either String a
  424. parseOnly m s = case runParser m (buffer s) 0 Complete failK successK of
  425. Fail _ [] err -> Left err
  426. Fail _ ctxs err -> Left (intercalate " > " ctxs ++ ": " ++ err)
  427. Done _ a -> Right a
  428. _ -> error "parseOnly: impossible error!"
  429. {-# INLINE parseOnly #-}
  430. get :: Parser Text
  431. get = T.Parser $ \t pos more _lose succ ->
  432. succ t pos more (Buf.dropWord16 (fromPos pos) t)
  433. {-# INLINE get #-}
  434. endOfChunk :: Parser Bool
  435. endOfChunk = T.Parser $ \t pos more _lose succ ->
  436. succ t pos more (pos == lengthOf t)
  437. {-# INLINE endOfChunk #-}
  438. inputSpansChunks :: Pos -> Parser Bool
  439. inputSpansChunks i = T.Parser $ \t pos_ more _lose succ ->
  440. let pos = pos_ + i
  441. in if pos < lengthOf t || more == Complete
  442. then succ t pos more False
  443. else let lose' t' pos' more' = succ t' pos' more' False
  444. succ' t' pos' more' = succ t' pos' more' True
  445. in prompt t pos more lose' succ'
  446. {-# INLINE inputSpansChunks #-}
  447. advance :: Pos -> Parser ()
  448. advance n = T.Parser $ \t pos more _lose succ -> succ t (pos+n) more ()
  449. {-# INLINE advance #-}
  450. ensureSuspended :: Int -> Buffer -> Pos -> More
  451. -> Failure r -> Success (Pos, Text) r
  452. -> Result r
  453. ensureSuspended n t pos more lose succ =
  454. runParser (demandInput >> go) t pos more lose succ
  455. where go = T.Parser $ \t' pos' more' lose' succ' ->
  456. case lengthAtLeast pos' n t' of
  457. Just n' -> succ' t' pos' more' (n', substring pos n' t')
  458. Nothing -> runParser (demandInput >> go) t' pos' more' lose' succ'
  459. -- | If at least @n@ elements of input are available, return the
  460. -- current input, otherwise fail.
  461. ensure :: Int -> Parser (Pos, Text)
  462. ensure n = T.Parser $ \t pos more lose succ ->
  463. case lengthAtLeast pos n t of
  464. Just n' -> succ t pos more (n', substring pos n' t)
  465. -- The uncommon case is kept out-of-line to reduce code size:
  466. Nothing -> ensureSuspended n t pos more lose succ
  467. {-# INLINE ensure #-}
  468. -- | Return both the result of a parse and the portion of the input
  469. -- that was consumed while it was being parsed.
  470. match :: Parser a -> Parser (Text, a)
  471. match p = T.Parser $ \t pos more lose succ ->
  472. let succ' t' pos' more' a = succ t' pos' more'
  473. (substring pos (pos'-pos) t', a)
  474. in runParser p t pos more lose succ'
  475. -- | Ensure that at least @n@ code points of input are available.
  476. -- Returns the number of words consumed while traversing.
  477. lengthAtLeast :: Pos -> Int -> Buffer -> Maybe Pos
  478. lengthAtLeast pos n t = go 0 (fromPos pos)
  479. where go i !p
  480. | i == n = Just (Pos p - pos)
  481. | p == len = Nothing
  482. | otherwise = go (i+1) (p + Buf.iter_ t p)
  483. Pos len = lengthOf t
  484. {-# INLINE lengthAtLeast #-}
  485. substring :: Pos -> Pos -> Buffer -> Text
  486. substring (Pos pos) (Pos n) = Buf.substring pos n
  487. {-# INLINE substring #-}
  488. lengthOf :: Buffer -> Pos
  489. lengthOf = Pos . Buf.length
  490. size :: Text -> Pos
  491. size (Text _ _ l) = Pos l