PageRenderTime 50ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/Data/Attoparsec/ByteString/Internal.hs

http://github.com/bos/attoparsec
Haskell | 590 lines | 345 code | 58 blank | 187 comment | 53 complexity | c5754fd02091031f8a071a69342a26a1 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. {-# LANGUAGE BangPatterns, CPP, GADTs, OverloadedStrings, RankNTypes,
  2. RecordWildCards #-}
  3. -- |
  4. -- Module : Data.Attoparsec.ByteString.Internal
  5. -- Copyright : Bryan O'Sullivan 2007-2015
  6. -- License : BSD3
  7. --
  8. -- Maintainer : bos@serpentine.com
  9. -- Stability : experimental
  10. -- Portability : unknown
  11. --
  12. -- Simple, efficient parser combinators for 'ByteString' strings,
  13. -- loosely based on the Parsec library.
  14. module Data.Attoparsec.ByteString.Internal
  15. (
  16. -- * Parser types
  17. Parser
  18. , Result
  19. -- * Running parsers
  20. , parse
  21. , parseOnly
  22. -- * Combinators
  23. , module Data.Attoparsec.Combinator
  24. -- * Parsing individual bytes
  25. , satisfy
  26. , satisfyWith
  27. , anyWord8
  28. , skip
  29. , word8
  30. , notWord8
  31. -- ** Lookahead
  32. , peekWord8
  33. , peekWord8'
  34. -- ** Byte classes
  35. , inClass
  36. , notInClass
  37. -- * Parsing more complicated structures
  38. , storable
  39. -- * Efficient string handling
  40. , skipWhile
  41. , string
  42. , stringCI
  43. , take
  44. , scan
  45. , runScanner
  46. , takeWhile
  47. , takeWhile1
  48. , takeWhileIncluding
  49. , takeTill
  50. , getChunk
  51. -- ** Consume all remaining input
  52. , takeByteString
  53. , takeLazyByteString
  54. -- * Utilities
  55. , endOfLine
  56. , endOfInput
  57. , match
  58. , atEnd
  59. ) where
  60. #if !MIN_VERSION_base(4,8,0)
  61. import Control.Applicative ((<$>))
  62. #endif
  63. import Control.Applicative ((<|>))
  64. import Control.Monad (when)
  65. import Data.Attoparsec.ByteString.Buffer (Buffer, buffer)
  66. import Data.Attoparsec.ByteString.FastSet (charClass, memberWord8)
  67. import Data.Attoparsec.Combinator ((<?>))
  68. import Data.Attoparsec.Internal
  69. import Data.Attoparsec.Internal.Compat
  70. import Data.Attoparsec.Internal.Fhthagn (inlinePerformIO)
  71. import Data.Attoparsec.Internal.Types hiding (Parser, Failure, Success)
  72. import Data.ByteString (ByteString)
  73. import Data.List (intercalate)
  74. import Data.Word (Word8)
  75. import Foreign.ForeignPtr (withForeignPtr)
  76. import Foreign.Ptr (castPtr, minusPtr, plusPtr)
  77. import Foreign.Storable (Storable(peek, sizeOf))
  78. import Prelude hiding (getChar, succ, take, takeWhile)
  79. import qualified Data.Attoparsec.ByteString.Buffer as Buf
  80. import qualified Data.Attoparsec.Internal.Types as T
  81. import qualified Data.ByteString as B8
  82. import qualified Data.ByteString.Char8 as B
  83. import qualified Data.ByteString.Internal as B
  84. import qualified Data.ByteString.Lazy as L
  85. import qualified Data.ByteString.Unsafe as B
  86. type Parser = T.Parser ByteString
  87. type Result = IResult ByteString
  88. type Failure r = T.Failure ByteString Buffer r
  89. type Success a r = T.Success ByteString Buffer a r
  90. -- | The parser @satisfy p@ succeeds for any byte for which the
  91. -- predicate @p@ returns 'True'. Returns the byte that is actually
  92. -- parsed.
  93. --
  94. -- >digit = satisfy isDigit
  95. -- > where isDigit w = w >= 48 && w <= 57
  96. satisfy :: (Word8 -> Bool) -> Parser Word8
  97. satisfy p = do
  98. h <- peekWord8'
  99. if p h
  100. then advance 1 >> return h
  101. else fail "satisfy"
  102. {-# INLINE satisfy #-}
  103. -- | The parser @skip p@ succeeds for any byte for which the predicate
  104. -- @p@ returns 'True'.
  105. --
  106. -- >skipDigit = skip isDigit
  107. -- > where isDigit w = w >= 48 && w <= 57
  108. skip :: (Word8 -> Bool) -> Parser ()
  109. skip p = do
  110. h <- peekWord8'
  111. if p h
  112. then advance 1
  113. else fail "skip"
  114. -- | The parser @satisfyWith f p@ transforms a byte, and succeeds if
  115. -- the predicate @p@ returns 'True' on the transformed value. The
  116. -- parser returns the transformed byte that was parsed.
  117. satisfyWith :: (Word8 -> a) -> (a -> Bool) -> Parser a
  118. satisfyWith f p = do
  119. h <- peekWord8'
  120. let c = f h
  121. if p c
  122. then advance 1 >> return c
  123. else fail "satisfyWith"
  124. {-# INLINE satisfyWith #-}
  125. storable :: Storable a => Parser a
  126. storable = hack undefined
  127. where
  128. hack :: Storable b => b -> Parser b
  129. hack dummy = do
  130. (fp,o,_) <- B.toForeignPtr `fmap` take (sizeOf dummy)
  131. return . inlinePerformIO . withForeignPtr fp $ \p ->
  132. peek (castPtr $ p `plusPtr` o)
  133. -- | Consume exactly @n@ bytes of input.
  134. take :: Int -> Parser ByteString
  135. take n0 = do
  136. let n = max n0 0
  137. s <- ensure n
  138. advance n >> return s
  139. {-# INLINE take #-}
  140. -- | @string s@ parses a sequence of bytes that identically match
  141. -- @s@. Returns the parsed string (i.e. @s@). This parser consumes no
  142. -- input if it fails (even if a partial match).
  143. --
  144. -- /Note/: The behaviour of this parser is different to that of the
  145. -- similarly-named parser in Parsec, as this one is all-or-nothing.
  146. -- To illustrate the difference, the following parser will fail under
  147. -- Parsec given an input of @\"for\"@:
  148. --
  149. -- >string "foo" <|> string "for"
  150. --
  151. -- The reason for its failure is that the first branch is a
  152. -- partial match, and will consume the letters @\'f\'@ and @\'o\'@
  153. -- before failing. In attoparsec, the above parser will /succeed/ on
  154. -- that input, because the failed first branch will consume nothing.
  155. string :: ByteString -> Parser ByteString
  156. string s = string_ (stringSuspended id) id s
  157. {-# INLINE string #-}
  158. -- ASCII-specific but fast, oh yes.
  159. toLower :: Word8 -> Word8
  160. toLower w | w >= 65 && w <= 90 = w + 32
  161. | otherwise = w
  162. -- | Satisfy a literal string, ignoring case.
  163. stringCI :: ByteString -> Parser ByteString
  164. stringCI s = string_ (stringSuspended lower) lower s
  165. where lower = B8.map toLower
  166. {-# INLINE stringCI #-}
  167. string_ :: (forall r. ByteString -> ByteString -> Buffer -> Pos -> More
  168. -> Failure r -> Success ByteString r -> Result r)
  169. -> (ByteString -> ByteString)
  170. -> ByteString -> Parser ByteString
  171. string_ suspended f s0 = T.Parser $ \t pos more lose succ ->
  172. let n = B.length s
  173. s = f s0
  174. in if lengthAtLeast pos n t
  175. then let t' = substring pos (Pos n) t
  176. in if s == f t'
  177. then succ t (pos + Pos n) more t'
  178. else lose t pos more [] "string"
  179. else let t' = Buf.unsafeDrop (fromPos pos) t
  180. in if f t' `B.isPrefixOf` s
  181. then suspended s (B.drop (B.length t') s) t pos more lose succ
  182. else lose t pos more [] "string"
  183. {-# INLINE string_ #-}
  184. stringSuspended :: (ByteString -> ByteString)
  185. -> ByteString -> ByteString -> Buffer -> Pos -> More
  186. -> Failure r
  187. -> Success ByteString r
  188. -> Result r
  189. stringSuspended f s0 s t pos more lose succ =
  190. runParser (demandInput_ >>= go) t pos more lose succ
  191. where go s'0 = T.Parser $ \t' pos' more' lose' succ' ->
  192. let m = B.length s
  193. s' = f s'0
  194. n = B.length s'
  195. in if n >= m
  196. then if B.unsafeTake m s' == s
  197. then let o = Pos (B.length s0)
  198. in succ' t' (pos' + o) more'
  199. (substring pos' o t')
  200. else lose' t' pos' more' [] "string"
  201. else if s' == B.unsafeTake n s
  202. then stringSuspended f s0 (B.unsafeDrop n s)
  203. t' pos' more' lose' succ'
  204. else lose' t' pos' more' [] "string"
  205. -- | Skip past input for as long as the predicate returns 'True'.
  206. skipWhile :: (Word8 -> Bool) -> Parser ()
  207. skipWhile p = go
  208. where
  209. go = do
  210. t <- B8.takeWhile p <$> get
  211. continue <- inputSpansChunks (B.length t)
  212. when continue go
  213. {-# INLINE skipWhile #-}
  214. -- | Consume input as long as the predicate returns 'False'
  215. -- (i.e. until it returns 'True'), and return the consumed input.
  216. --
  217. -- This parser does not fail. It will return an empty string if the
  218. -- predicate returns 'True' on the first byte of input.
  219. --
  220. -- /Note/: Because this parser does not fail, do not use it with
  221. -- combinators such as 'Control.Applicative.many', because such
  222. -- parsers loop until a failure occurs. Careless use will thus result
  223. -- in an infinite loop.
  224. takeTill :: (Word8 -> Bool) -> Parser ByteString
  225. takeTill p = takeWhile (not . p)
  226. {-# INLINE takeTill #-}
  227. -- | Consume input as long as the predicate returns 'True', and return
  228. -- the consumed input.
  229. --
  230. -- This parser does not fail. It will return an empty string if the
  231. -- predicate returns 'False' on the first byte of input.
  232. --
  233. -- /Note/: Because this parser does not fail, do not use it with
  234. -- combinators such as 'Control.Applicative.many', because such
  235. -- parsers loop until a failure occurs. Careless use will thus result
  236. -- in an infinite loop.
  237. takeWhile :: (Word8 -> Bool) -> Parser ByteString
  238. takeWhile p = do
  239. s <- B8.takeWhile p <$> get
  240. continue <- inputSpansChunks (B.length s)
  241. if continue
  242. then takeWhileAcc p [s]
  243. else return s
  244. {-# INLINE takeWhile #-}
  245. takeWhileAcc :: (Word8 -> Bool) -> [ByteString] -> Parser ByteString
  246. takeWhileAcc p = go
  247. where
  248. go acc = do
  249. s <- B8.takeWhile p <$> get
  250. continue <- inputSpansChunks (B.length s)
  251. if continue
  252. then go (s:acc)
  253. else return $ concatReverse (s:acc)
  254. {-# INLINE takeWhileAcc #-}
  255. -- | Consume input until immediately after the predicate returns 'True', and return
  256. -- the consumed input.
  257. --
  258. -- This parser will consume at least one 'Word8' or fail.
  259. takeWhileIncluding :: (Word8 -> Bool) -> Parser B.ByteString
  260. takeWhileIncluding p = do
  261. (s', t) <- B8.span p <$> get
  262. case B8.uncons t of
  263. -- Since we reached a break point and managed to get the next byte,
  264. -- input can not have been exhausted thus we succed and advance unconditionally.
  265. Just (h, _) -> do
  266. let s = s' `B8.snoc` h
  267. advance (B8.length s)
  268. return s
  269. -- The above isn't true so either we ran out of input or we need to process the next chunk.
  270. Nothing -> do
  271. continue <- inputSpansChunks (B8.length s')
  272. if continue
  273. then takeWhileIncAcc p [s']
  274. -- Our spec says that if we run out of input we fail.
  275. else fail "takeWhileIncluding reached end of input"
  276. {-# INLINE takeWhileIncluding #-}
  277. takeWhileIncAcc :: (Word8 -> Bool) -> [B.ByteString] -> Parser B.ByteString
  278. takeWhileIncAcc p = go
  279. where
  280. go acc = do
  281. (s', t) <- B8.span p <$> get
  282. case B8.uncons t of
  283. Just (h, _) -> do
  284. let s = s' `B8.snoc` h
  285. advance (B8.length s)
  286. return (concatReverse $ s:acc)
  287. Nothing -> do
  288. continue <- inputSpansChunks (B8.length s')
  289. if continue
  290. then go (s':acc)
  291. else fail "takeWhileIncAcc reached end of input"
  292. {-# INLINE takeWhileIncAcc #-}
  293. takeRest :: Parser [ByteString]
  294. takeRest = go []
  295. where
  296. go acc = do
  297. input <- wantInput
  298. if input
  299. then do
  300. s <- get
  301. advance (B.length s)
  302. go (s:acc)
  303. else return (reverse acc)
  304. -- | Consume all remaining input and return it as a single string.
  305. takeByteString :: Parser ByteString
  306. takeByteString = B.concat `fmap` takeRest
  307. -- | Consume all remaining input and return it as a single string.
  308. takeLazyByteString :: Parser L.ByteString
  309. takeLazyByteString = L.fromChunks `fmap` takeRest
  310. -- | Return the rest of the current chunk without consuming anything.
  311. --
  312. -- If the current chunk is empty, then ask for more input.
  313. -- If there is no more input, then return 'Nothing'
  314. getChunk :: Parser (Maybe ByteString)
  315. getChunk = do
  316. input <- wantInput
  317. if input
  318. then Just <$> get
  319. else return Nothing
  320. data T s = T {-# UNPACK #-} !Int s
  321. scan_ :: (s -> [ByteString] -> Parser r) -> s -> (s -> Word8 -> Maybe s)
  322. -> Parser r
  323. scan_ f s0 p = go [] s0
  324. where
  325. go acc s1 = do
  326. let scanner bs = withPS bs $ \fp off len ->
  327. withForeignPtr fp $ \ptr0 -> do
  328. let start = ptr0 `plusPtr` off
  329. end = start `plusPtr` len
  330. inner ptr !s
  331. | ptr < end = do
  332. w <- peek ptr
  333. case p s w of
  334. Just s' -> inner (ptr `plusPtr` 1) s'
  335. _ -> done (ptr `minusPtr` start) s
  336. | otherwise = done (ptr `minusPtr` start) s
  337. done !i !s = return (T i s)
  338. inner start s1
  339. bs <- get
  340. let T i s' = inlinePerformIO $ scanner bs
  341. !h = B.unsafeTake i bs
  342. continue <- inputSpansChunks i
  343. if continue
  344. then go (h:acc) s'
  345. else f s' (h:acc)
  346. {-# INLINE scan_ #-}
  347. -- | A stateful scanner. The predicate consumes and transforms a
  348. -- state argument, and each transformed state is passed to successive
  349. -- invocations of the predicate on each byte of the input until one
  350. -- returns 'Nothing' or the input ends.
  351. --
  352. -- This parser does not fail. It will return an empty string if the
  353. -- predicate returns 'Nothing' on the first byte of input.
  354. --
  355. -- /Note/: Because this parser does not fail, do not use it with
  356. -- combinators such as 'Control.Applicative.many', because such
  357. -- parsers loop until a failure occurs. Careless use will thus result
  358. -- in an infinite loop.
  359. scan :: s -> (s -> Word8 -> Maybe s) -> Parser ByteString
  360. scan = scan_ $ \_ chunks -> return $! concatReverse chunks
  361. {-# INLINE scan #-}
  362. -- | Like 'scan', but generalized to return the final state of the
  363. -- scanner.
  364. runScanner :: s -> (s -> Word8 -> Maybe s) -> Parser (ByteString, s)
  365. runScanner = scan_ $ \s xs -> let !sx = concatReverse xs in return (sx, s)
  366. {-# INLINE runScanner #-}
  367. -- | Consume input as long as the predicate returns 'True', and return
  368. -- the consumed input.
  369. --
  370. -- This parser requires the predicate to succeed on at least one byte
  371. -- of input: it will fail if the predicate never returns 'True' or if
  372. -- there is no input left.
  373. takeWhile1 :: (Word8 -> Bool) -> Parser ByteString
  374. takeWhile1 p = do
  375. (`when` demandInput) =<< endOfChunk
  376. s <- B8.takeWhile p <$> get
  377. let len = B.length s
  378. if len == 0
  379. then fail "takeWhile1"
  380. else do
  381. advance len
  382. eoc <- endOfChunk
  383. if eoc
  384. then takeWhileAcc p [s]
  385. else return s
  386. {-# INLINE takeWhile1 #-}
  387. -- | Match any byte in a set.
  388. --
  389. -- >vowel = inClass "aeiou"
  390. --
  391. -- Range notation is supported.
  392. --
  393. -- >halfAlphabet = inClass "a-nA-N"
  394. --
  395. -- To add a literal @\'-\'@ to a set, place it at the beginning or end
  396. -- of the string.
  397. inClass :: String -> Word8 -> Bool
  398. inClass s = (`memberWord8` mySet)
  399. where mySet = charClass s
  400. {-# NOINLINE mySet #-}
  401. {-# INLINE inClass #-}
  402. -- | Match any byte not in a set.
  403. notInClass :: String -> Word8 -> Bool
  404. notInClass s = not . inClass s
  405. {-# INLINE notInClass #-}
  406. -- | Match any byte.
  407. anyWord8 :: Parser Word8
  408. anyWord8 = satisfy $ const True
  409. {-# INLINE anyWord8 #-}
  410. -- | Match a specific byte.
  411. word8 :: Word8 -> Parser Word8
  412. word8 c = satisfy (== c) <?> show c
  413. {-# INLINE word8 #-}
  414. -- | Match any byte except the given one.
  415. notWord8 :: Word8 -> Parser Word8
  416. notWord8 c = satisfy (/= c) <?> "not " ++ show c
  417. {-# INLINE notWord8 #-}
  418. -- | Match any byte, to perform lookahead. Returns 'Nothing' if end of
  419. -- input has been reached. Does not consume any input.
  420. --
  421. -- /Note/: Because this parser does not fail, do not use it with
  422. -- combinators such as 'Control.Applicative.many', because such
  423. -- parsers loop until a failure occurs. Careless use will thus result
  424. -- in an infinite loop.
  425. peekWord8 :: Parser (Maybe Word8)
  426. peekWord8 = T.Parser $ \t pos@(Pos pos_) more _lose succ ->
  427. case () of
  428. _| pos_ < Buf.length t ->
  429. let !w = Buf.unsafeIndex t pos_
  430. in succ t pos more (Just w)
  431. | more == Complete ->
  432. succ t pos more Nothing
  433. | otherwise ->
  434. let succ' t' pos' more' = let !w = Buf.unsafeIndex t' pos_
  435. in succ t' pos' more' (Just w)
  436. lose' t' pos' more' = succ t' pos' more' Nothing
  437. in prompt t pos more lose' succ'
  438. {-# INLINE peekWord8 #-}
  439. -- | Match any byte, to perform lookahead. Does not consume any
  440. -- input, but will fail if end of input has been reached.
  441. peekWord8' :: Parser Word8
  442. peekWord8' = T.Parser $ \t pos more lose succ ->
  443. if lengthAtLeast pos 1 t
  444. then succ t pos more (Buf.unsafeIndex t (fromPos pos))
  445. else let succ' t' pos' more' bs' = succ t' pos' more' $! B.unsafeHead bs'
  446. in ensureSuspended 1 t pos more lose succ'
  447. {-# INLINE peekWord8' #-}
  448. -- | Match either a single newline character @\'\\n\'@, or a carriage
  449. -- return followed by a newline character @\"\\r\\n\"@.
  450. endOfLine :: Parser ()
  451. endOfLine = (word8 10 >> return ()) <|> (string "\r\n" >> return ())
  452. -- | Terminal failure continuation.
  453. failK :: Failure a
  454. failK t (Pos pos) _more stack msg = Fail (Buf.unsafeDrop pos t) stack msg
  455. {-# INLINE failK #-}
  456. -- | Terminal success continuation.
  457. successK :: Success a a
  458. successK t (Pos pos) _more a = Done (Buf.unsafeDrop pos t) a
  459. {-# INLINE successK #-}
  460. -- | Run a parser.
  461. parse :: Parser a -> ByteString -> Result a
  462. parse m s = T.runParser m (buffer s) (Pos 0) Incomplete failK successK
  463. {-# INLINE parse #-}
  464. -- | Run a parser that cannot be resupplied via a 'Partial' result.
  465. --
  466. -- This function does not force a parser to consume all of its input.
  467. -- Instead, any residual input will be discarded. To force a parser
  468. -- to consume all of its input, use something like this:
  469. --
  470. -- @
  471. --'parseOnly' (myParser 'Control.Applicative.<*' 'endOfInput')
  472. -- @
  473. parseOnly :: Parser a -> ByteString -> Either String a
  474. parseOnly m s = case T.runParser m (buffer s) (Pos 0) Complete failK successK of
  475. Fail _ [] err -> Left err
  476. Fail _ ctxs err -> Left (intercalate " > " ctxs ++ ": " ++ err)
  477. Done _ a -> Right a
  478. _ -> error "parseOnly: impossible error!"
  479. {-# INLINE parseOnly #-}
  480. get :: Parser ByteString
  481. get = T.Parser $ \t pos more _lose succ ->
  482. succ t pos more (Buf.unsafeDrop (fromPos pos) t)
  483. {-# INLINE get #-}
  484. endOfChunk :: Parser Bool
  485. endOfChunk = T.Parser $ \t pos more _lose succ ->
  486. succ t pos more (fromPos pos == Buf.length t)
  487. {-# INLINE endOfChunk #-}
  488. inputSpansChunks :: Int -> Parser Bool
  489. inputSpansChunks i = T.Parser $ \t pos_ more _lose succ ->
  490. let pos = pos_ + Pos i
  491. in if fromPos pos < Buf.length t || more == Complete
  492. then succ t pos more False
  493. else let lose' t' pos' more' = succ t' pos' more' False
  494. succ' t' pos' more' = succ t' pos' more' True
  495. in prompt t pos more lose' succ'
  496. {-# INLINE inputSpansChunks #-}
  497. advance :: Int -> Parser ()
  498. advance n = T.Parser $ \t pos more _lose succ ->
  499. succ t (pos + Pos n) more ()
  500. {-# INLINE advance #-}
  501. ensureSuspended :: Int -> Buffer -> Pos -> More
  502. -> Failure r
  503. -> Success ByteString r
  504. -> Result r
  505. ensureSuspended n t pos more lose succ =
  506. runParser (demandInput >> go) t pos more lose succ
  507. where go = T.Parser $ \t' pos' more' lose' succ' ->
  508. if lengthAtLeast pos' n t'
  509. then succ' t' pos' more' (substring pos (Pos n) t')
  510. else runParser (demandInput >> go) t' pos' more' lose' succ'
  511. -- | If at least @n@ elements of input are available, return the
  512. -- current input, otherwise fail.
  513. ensure :: Int -> Parser ByteString
  514. ensure n = T.Parser $ \t pos more lose succ ->
  515. if lengthAtLeast pos n t
  516. then succ t pos more (substring pos (Pos n) t)
  517. -- The uncommon case is kept out-of-line to reduce code size:
  518. else ensureSuspended n t pos more lose succ
  519. {-# INLINE ensure #-}
  520. -- | Return both the result of a parse and the portion of the input
  521. -- that was consumed while it was being parsed.
  522. match :: Parser a -> Parser (ByteString, a)
  523. match p = T.Parser $ \t pos more lose succ ->
  524. let succ' t' pos' more' a =
  525. succ t' pos' more' (substring pos (pos'-pos) t', a)
  526. in runParser p t pos more lose succ'
  527. lengthAtLeast :: Pos -> Int -> Buffer -> Bool
  528. lengthAtLeast (Pos pos) n bs = Buf.length bs >= pos + n
  529. {-# INLINE lengthAtLeast #-}
  530. substring :: Pos -> Pos -> Buffer -> ByteString
  531. substring (Pos pos) (Pos n) = Buf.substring pos n
  532. {-# INLINE substring #-}