PageRenderTime 40ms CodeModel.GetById 10ms RepoModel.GetById 0ms app.codeStats 0ms

/Data/Attoparsec/ByteString.hs

http://github.com/bos/attoparsec
Haskell | 232 lines | 83 code | 20 blank | 129 comment | 0 complexity | 04439c78b550e94fe8d2e64abfd1b091 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. {-# LANGUAGE CPP #-}
  2. #if __GLASGOW_HASKELL__ >= 702
  3. {-# LANGUAGE Trustworthy #-}
  4. #endif
  5. -- |
  6. -- Module : Data.Attoparsec.ByteString
  7. -- Copyright : Bryan O'Sullivan 2007-2015
  8. -- License : BSD3
  9. --
  10. -- Maintainer : bos@serpentine.com
  11. -- Stability : experimental
  12. -- Portability : unknown
  13. --
  14. -- Simple, efficient combinator parsing for 'B.ByteString' strings,
  15. -- loosely based on the Parsec library.
  16. module Data.Attoparsec.ByteString
  17. (
  18. -- * Differences from Parsec
  19. -- $parsec
  20. -- * Incremental input
  21. -- $incremental
  22. -- * Performance considerations
  23. -- $performance
  24. -- * Parser types
  25. I.Parser
  26. , Result
  27. , T.IResult(..)
  28. , I.compareResults
  29. -- * Running parsers
  30. , parse
  31. , feed
  32. , I.parseOnly
  33. , parseWith
  34. , parseTest
  35. -- ** Result conversion
  36. , maybeResult
  37. , eitherResult
  38. -- * Parsing individual bytes
  39. , I.word8
  40. , I.anyWord8
  41. , I.notWord8
  42. , I.satisfy
  43. , I.satisfyWith
  44. , I.skip
  45. -- ** Lookahead
  46. , I.peekWord8
  47. , I.peekWord8'
  48. -- ** Byte classes
  49. , I.inClass
  50. , I.notInClass
  51. -- * Efficient string handling
  52. , I.string
  53. , I.skipWhile
  54. , I.take
  55. , I.scan
  56. , I.runScanner
  57. , I.takeWhile
  58. , I.takeWhile1
  59. , I.takeWhileIncluding
  60. , I.takeTill
  61. , I.getChunk
  62. -- ** Consume all remaining input
  63. , I.takeByteString
  64. , I.takeLazyByteString
  65. -- * Combinators
  66. , try
  67. , (<?>)
  68. , choice
  69. , count
  70. , option
  71. , many'
  72. , many1
  73. , many1'
  74. , manyTill
  75. , manyTill'
  76. , sepBy
  77. , sepBy'
  78. , sepBy1
  79. , sepBy1'
  80. , skipMany
  81. , skipMany1
  82. , eitherP
  83. , I.match
  84. -- * State observation and manipulation functions
  85. , I.endOfInput
  86. , I.atEnd
  87. ) where
  88. import Data.Attoparsec.Combinator
  89. import Data.List (intercalate)
  90. import qualified Data.Attoparsec.ByteString.Internal as I
  91. import qualified Data.Attoparsec.Internal as I
  92. import qualified Data.ByteString as B
  93. import Data.Attoparsec.ByteString.Internal (Result, parse)
  94. import qualified Data.Attoparsec.Internal.Types as T
  95. -- $parsec
  96. --
  97. -- Compared to Parsec 3, attoparsec makes several tradeoffs. It is
  98. -- not intended for, or ideal for, all possible uses.
  99. --
  100. -- * While attoparsec can consume input incrementally, Parsec cannot.
  101. -- Incremental input is a huge deal for efficient and secure network
  102. -- and system programming, since it gives much more control to users
  103. -- of the library over matters such as resource usage and the I/O
  104. -- model to use.
  105. --
  106. -- * Much of the performance advantage of attoparsec is gained via
  107. -- high-performance parsers such as 'I.takeWhile' and 'I.string'.
  108. -- If you use complicated combinators that return lists of bytes or
  109. -- characters, there is less performance difference between the two
  110. -- libraries.
  111. --
  112. -- * Unlike Parsec 3, attoparsec does not support being used as a
  113. -- monad transformer.
  114. --
  115. -- * attoparsec is specialised to deal only with strict 'B.ByteString'
  116. -- input. Efficiency concerns rule out both lists and lazy
  117. -- bytestrings. The usual use for lazy bytestrings would be to
  118. -- allow consumption of very large input without a large footprint.
  119. -- For this need, attoparsec's incremental input provides an
  120. -- excellent substitute, with much more control over when input
  121. -- takes place. If you must use lazy bytestrings, see the
  122. -- "Data.Attoparsec.ByteString.Lazy" module, which feeds lazy chunks
  123. -- to a regular parser.
  124. --
  125. -- * Parsec parsers can produce more helpful error messages than
  126. -- attoparsec parsers. This is a matter of focus: attoparsec avoids
  127. -- the extra book-keeping in favour of higher performance.
  128. -- $incremental
  129. --
  130. -- attoparsec supports incremental input, meaning that you can feed it
  131. -- a bytestring that represents only part of the expected total amount
  132. -- of data to parse. If your parser reaches the end of a fragment of
  133. -- input and could consume more input, it will suspend parsing and
  134. -- return a 'T.Partial' continuation.
  135. --
  136. -- Supplying the 'T.Partial' continuation with a bytestring will
  137. -- resume parsing at the point where it was suspended, with the
  138. -- bytestring you supplied used as new input at the end of the
  139. -- existing input. You must be prepared for the result of the resumed
  140. -- parse to be another 'T.Partial' continuation.
  141. --
  142. -- To indicate that you have no more input, supply the 'T.Partial'
  143. -- continuation with an empty bytestring.
  144. --
  145. -- Remember that some parsing combinators will not return a result
  146. -- until they reach the end of input. They may thus cause 'T.Partial'
  147. -- results to be returned.
  148. --
  149. -- If you do not need support for incremental input, consider using
  150. -- the 'I.parseOnly' function to run your parser. It will never
  151. -- prompt for more input.
  152. --
  153. -- /Note/: incremental input does /not/ imply that attoparsec will
  154. -- release portions of its internal state for garbage collection as it
  155. -- proceeds. Its internal representation is equivalent to a single
  156. -- 'B.ByteString': if you feed incremental input to a parser, it will
  157. -- require memory proportional to the amount of input you supply.
  158. -- (This is necessary to support arbitrary backtracking.)
  159. -- $performance
  160. --
  161. -- If you write an attoparsec-based parser carefully, it can be
  162. -- realistic to expect it to perform similarly to a hand-rolled C
  163. -- parser (measuring megabytes parsed per second).
  164. --
  165. -- To actually achieve high performance, there are a few guidelines
  166. -- that it is useful to follow.
  167. --
  168. -- Use the 'B.ByteString'-oriented parsers whenever possible,
  169. -- e.g. 'I.takeWhile1' instead of 'many1' 'I.anyWord8'. There is
  170. -- about a factor of 100 difference in performance between the two
  171. -- kinds of parser.
  172. --
  173. -- For very simple byte-testing predicates, write them by hand instead
  174. -- of using 'I.inClass' or 'I.notInClass'. For instance, both of
  175. -- these predicates test for an end-of-line byte, but the first is
  176. -- much faster than the second:
  177. --
  178. -- >endOfLine_fast w = w == 13 || w == 10
  179. -- >endOfLine_slow = inClass "\r\n"
  180. --
  181. -- Make active use of benchmarking and profiling tools to measure,
  182. -- find the problems with, and improve the performance of your parser.
  183. -- | Run a parser and print its result to standard output.
  184. parseTest :: (Show a) => I.Parser a -> B.ByteString -> IO ()
  185. parseTest p s = print (parse p s)
  186. -- | Run a parser with an initial input string, and a monadic action
  187. -- that can supply more input if needed.
  188. parseWith :: Monad m =>
  189. (m B.ByteString)
  190. -- ^ An action that will be executed to provide the parser
  191. -- with more input, if necessary. The action must return an
  192. -- 'B.empty' string when there is no more input available.
  193. -> I.Parser a
  194. -> B.ByteString
  195. -- ^ Initial input for the parser.
  196. -> m (Result a)
  197. parseWith refill p s = step $ parse p s
  198. where step (T.Partial k) = (step . k) =<< refill
  199. step r = return r
  200. {-# INLINE parseWith #-}
  201. -- | Convert a 'Result' value to a 'Maybe' value. A 'T.Partial' result
  202. -- is treated as failure.
  203. maybeResult :: Result r -> Maybe r
  204. maybeResult (T.Done _ r) = Just r
  205. maybeResult _ = Nothing
  206. -- | Convert a 'Result' value to an 'Either' value. A 'T.Partial'
  207. -- result is treated as failure.
  208. eitherResult :: Result r -> Either String r
  209. eitherResult (T.Done _ r) = Right r
  210. eitherResult (T.Fail _ [] msg) = Left msg
  211. eitherResult (T.Fail _ ctxs msg) = Left (intercalate " > " ctxs ++ ": " ++ msg)
  212. eitherResult _ = Left "Result: incomplete input"