ByteString.hs - This is a Haskell module for parsing binary…

/Data/Attoparsec/ByteString.hs

http://github.com/bos/attoparsec · Haskell · 232 lines · 83 code · 20 blank · 129 comment · 0 complexity · 04439c78b550e94fe8d2e64abfd1b091 MD5 · raw file


{-# LANGUAGE CPP #-}
#if __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
-- |
-- Module      :  Data.Attoparsec.ByteString
-- Copyright   :  Bryan O'Sullivan 2007-2015
-- License     :  BSD3
--
-- Maintainer  :  bos@serpentine.com
-- Stability   :  experimental
-- Portability :  unknown
--
-- Simple, efficient combinator parsing for 'B.ByteString' strings,
-- loosely based on the Parsec library.

module Data.Attoparsec.ByteString
    (
    -- * Differences from Parsec
    -- $parsec

    -- * Incremental input
    -- $incremental

    -- * Performance considerations
    -- $performance

    -- * Parser types
      I.Parser
    , Result
    , T.IResult(..)
    , I.compareResults

    -- * Running parsers
    , parse
    , feed
    , I.parseOnly
    , parseWith
    , parseTest

    -- ** Result conversion
    , maybeResult
    , eitherResult

    -- * Parsing individual bytes
    , I.word8
    , I.anyWord8
    , I.notWord8
    , I.satisfy
    , I.satisfyWith
    , I.skip

    -- ** Lookahead
    , I.peekWord8
    , I.peekWord8'

    -- ** Byte classes
    , I.inClass
    , I.notInClass

    -- * Efficient string handling
    , I.string
    , I.skipWhile
    , I.take
    , I.scan
    , I.runScanner
    , I.takeWhile
    , I.takeWhile1
    , I.takeWhileIncluding
    , I.takeTill
    , I.getChunk

    -- ** Consume all remaining input
    , I.takeByteString
    , I.takeLazyByteString

    -- * Combinators
    , try
    , (<?>)
    , choice
    , count
    , option
    , many'
    , many1
    , many1'
    , manyTill
    , manyTill'
    , sepBy
    , sepBy'
    , sepBy1
    , sepBy1'
    , skipMany
    , skipMany1
    , eitherP
    , I.match
    -- * State observation and manipulation functions
    , I.endOfInput
    , I.atEnd
    ) where

import Data.Attoparsec.Combinator
import Data.List (intercalate)
import qualified Data.Attoparsec.ByteString.Internal as I
import qualified Data.Attoparsec.Internal as I
import qualified Data.ByteString as B
import Data.Attoparsec.ByteString.Internal (Result, parse)
import qualified Data.Attoparsec.Internal.Types as T

-- $parsec
--
-- Compared to Parsec 3, attoparsec makes several tradeoffs.  It is
-- not intended for, or ideal for, all possible uses.
--
-- * While attoparsec can consume input incrementally, Parsec cannot.
--   Incremental input is a huge deal for efficient and secure network
--   and system programming, since it gives much more control to users
--   of the library over matters such as resource usage and the I/O
--   model to use.
--
-- * Much of the performance advantage of attoparsec is gained via
--   high-performance parsers such as 'I.takeWhile' and 'I.string'.
--   If you use complicated combinators that return lists of bytes or
--   characters, there is less performance difference between the two
--   libraries.
--
-- * Unlike Parsec 3, attoparsec does not support being used as a
--   monad transformer.
--
-- * attoparsec is specialised to deal only with strict 'B.ByteString'
--   input.  Efficiency concerns rule out both lists and lazy
--   bytestrings.  The usual use for lazy bytestrings would be to
--   allow consumption of very large input without a large footprint.
--   For this need, attoparsec's incremental input provides an
--   excellent substitute, with much more control over when input
--   takes place.  If you must use lazy bytestrings, see the
--   "Data.Attoparsec.ByteString.Lazy" module, which feeds lazy chunks
--   to a regular parser.
--
-- * Parsec parsers can produce more helpful error messages than
--   attoparsec parsers.  This is a matter of focus: attoparsec avoids
--   the extra book-keeping in favour of higher performance.

-- $incremental
--
-- attoparsec supports incremental input, meaning that you can feed it
-- a bytestring that represents only part of the expected total amount
-- of data to parse. If your parser reaches the end of a fragment of
-- input and could consume more input, it will suspend parsing and
-- return a 'T.Partial' continuation.
--
-- Supplying the 'T.Partial' continuation with a bytestring will
-- resume parsing at the point where it was suspended, with the
-- bytestring you supplied used as new input at the end of the
-- existing input. You must be prepared for the result of the resumed
-- parse to be another 'T.Partial' continuation.
--
-- To indicate that you have no more input, supply the 'T.Partial'
-- continuation with an empty bytestring.
--
-- Remember that some parsing combinators will not return a result
-- until they reach the end of input.  They may thus cause 'T.Partial'
-- results to be returned.
--
-- If you do not need support for incremental input, consider using
-- the 'I.parseOnly' function to run your parser.  It will never
-- prompt for more input.
--
-- /Note/: incremental input does /not/ imply that attoparsec will
-- release portions of its internal state for garbage collection as it
-- proceeds.  Its internal representation is equivalent to a single
-- 'ByteString': if you feed incremental input to a parser, it will
-- require memory proportional to the amount of input you supply.
-- (This is necessary to support arbitrary backtracking.)

-- $performance
--
-- If you write an attoparsec-based parser carefully, it can be
-- realistic to expect it to perform similarly to a hand-rolled C
-- parser (measuring megabytes parsed per second).
--
-- To actually achieve high performance, there are a few guidelines
-- that it is useful to follow.
--
-- Use the 'B.ByteString'-oriented parsers whenever possible,
-- e.g. 'I.takeWhile1' instead of 'many1' 'I.anyWord8'.  There is
-- about a factor of 100 difference in performance between the two
-- kinds of parser.
--
-- For very simple byte-testing predicates, write them by hand instead
-- of using 'I.inClass' or 'I.notInClass'.  For instance, both of
-- these predicates test for an end-of-line byte, but the first is
-- much faster than the second:
--
-- >endOfLine_fast w = w == 13 || w == 10
-- >endOfLine_slow   = inClass "\r\n"
--
-- Make active use of benchmarking and profiling tools to measure,
-- find the problems with, and improve the performance of your parser.

-- | Run a parser and print its result to standard output.
parseTest :: (Show a) => I.Parser a -> B.ByteString -> IO ()
parseTest p s = print (parse p s)

-- | Run a parser with an initial input string, and a monadic action
-- that can supply more input if needed.
parseWith :: Monad m =>
             (m B.ByteString)
          -- ^ An action that will be executed to provide the parser
          -- with more input, if necessary.  The action must return an
          -- 'B.empty' string when there is no more input available.
          -> I.Parser a
          -> B.ByteString
          -- ^ Initial input for the parser.
          -> m (Result a)
parseWith refill p s = step $ parse p s
  where step (T.Partial k) = (step . k) =<< refill
        step r             = return r
{-# INLINE parseWith #-}

-- | Convert a 'Result' value to a 'Maybe' value. A 'T.Partial' result
-- is treated as failure.
maybeResult :: Result r -> Maybe r
maybeResult (T.Done _ r) = Just r
maybeResult _            = Nothing

-- | Convert a 'Result' value to an 'Either' value. A 'T.Partial'
-- result is treated as failure.
eitherResult :: Result r -> Either String r
eitherResult (T.Done _ r)        = Right r
eitherResult (T.Fail _ [] msg)   = Left msg
eitherResult (T.Fail _ ctxs msg) = Left (intercalate " > " ctxs ++ ": " ++ msg)
eitherResult _                   = Left "Result: incomplete input"