Text.hs - This Haskell code defines a parser for various ma…

/Data/Attoparsec/Text.hs

http://github.com/bos/attoparsec · Haskell · 502 lines · 177 code · 49 blank · 276 comment · 20 complexity · 2202aeaa3a566c0b3bb2bee715536a0d MD5 · raw file

{-# LANGUAGE BangPatterns, CPP, FlexibleInstances, TypeSynonymInstances #-}
#if __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-} -- Imports internal modules
#endif
{-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}

-- |
-- Module      :  Data.Attoparsec.Text
-- Copyright   :  Bryan O'Sullivan 2007-2015
-- License     :  BSD3
--
-- Maintainer  :  bos@serpentine.com
-- Stability   :  experimental
-- Portability :  unknown
--
-- Simple, efficient combinator parsing for 'Text' strings,
-- loosely based on the Parsec library.

module Data.Attoparsec.Text
    (
    -- * Differences from Parsec
    -- $parsec

    -- * Incremental input
    -- $incremental

    -- * Performance considerations
    -- $performance

    -- * Parser types
      Parser
    , Result
    , T.IResult(..)
    , I.compareResults

    -- * Running parsers
    , parse
    , feed
    , I.parseOnly
    , parseWith
    , parseTest

    -- ** Result conversion
    , maybeResult
    , eitherResult

    -- * Parsing individual characters
    , I.char
    , I.anyChar
    , I.notChar
    , I.satisfy
    , I.satisfyWith
    , I.skip

    -- ** Lookahead
    , I.peekChar
    , I.peekChar'

    -- ** Special character parsers
    , digit
    , letter
    , space

    -- ** Character classes
    , I.inClass
    , I.notInClass

    -- * Efficient string handling
    , I.string
    , I.stringCI
    , I.asciiCI
    , skipSpace
    , I.skipWhile
    , I.scan
    , I.runScanner
    , I.take
    , I.takeWhile
    , I.takeWhile1
    , I.takeTill

    -- ** String combinators
    -- $specalt
    , (.*>)
    , (<*.)

    -- ** Consume all remaining input
    , I.takeText
    , I.takeLazyText

    -- * Text parsing
    , I.endOfLine
    , isEndOfLine
    , isHorizontalSpace

    -- * Numeric parsers
    , decimal
    , hexadecimal
    , signed
    , double
    , Number(..)
    , number
    , rational
    , scientific

    -- * Combinators
    , try
    , (<?>)
    , choice
    , count
    , option
    , many'
    , many1
    , many1'
    , manyTill
    , manyTill'
    , sepBy
    , sepBy'
    , sepBy1
    , sepBy1'
    , skipMany
    , skipMany1
    , eitherP
    , I.match
    -- * State observation and manipulation functions
    , I.endOfInput
    , I.atEnd
    ) where

#if !MIN_VERSION_base(4,8,0)
import Control.Applicative (pure, (*>), (<*), (<$>))
import Data.Word (Word)
#endif
import Control.Applicative ((<|>))
import Data.Attoparsec.Combinator
import Data.Attoparsec.Number (Number(..))
import Data.Scientific (Scientific)
import qualified Data.Scientific as Sci
import Data.Attoparsec.Text.Internal (Parser, Result, parse, takeWhile1)
import Data.Bits (Bits, (.|.), shiftL)
import Data.Char (isAlpha, isDigit, isSpace, ord)
import Data.Int (Int8, Int16, Int32, Int64)
import Data.List (intercalate)
import Data.Text (Text)
import Data.Word (Word8, Word16, Word32, Word64)
import qualified Data.Attoparsec.Internal as I
import qualified Data.Attoparsec.Internal.Types as T
import qualified Data.Attoparsec.Text.Internal as I
import qualified Data.Text as T

-- $parsec
--
-- Compared to Parsec 3, attoparsec makes several tradeoffs.  It is
-- not intended for, or ideal for, all possible uses.
--
-- * While attoparsec can consume input incrementally, Parsec cannot.
--   Incremental input is a huge deal for efficient and secure network
--   and system programming, since it gives much more control to users
--   of the library over matters such as resource usage and the I/O
--   model to use.
--
-- * Much of the performance advantage of attoparsec is gained via
--   high-performance parsers such as 'I.takeWhile' and 'I.string'.
--   If you use complicated combinators that return lists of
--   characters, there is less performance difference between the two
--   libraries.
--
-- * Unlike Parsec 3, attoparsec does not support being used as a
--   monad transformer.
--
-- * attoparsec is specialised to deal only with strict 'Text'
--   input.  Efficiency concerns rule out both lists and lazy text.
--   The usual use for lazy text would be to allow consumption of very
--   large input without a large footprint.  For this need,
--   attoparsec's incremental input provides an excellent substitute,
--   with much more control over when input takes place.  If you must
--   use lazy text, see the 'Lazy' module, which feeds lazy chunks to
--   a regular parser.
--
-- * Parsec parsers can produce more helpful error messages than
--   attoparsec parsers.  This is a matter of focus: attoparsec avoids
--   the extra book-keeping in favour of higher performance.

-- $incremental
--
-- attoparsec supports incremental input, meaning that you can feed it
-- a 'Text' that represents only part of the expected total amount
-- of data to parse. If your parser reaches the end of a fragment of
-- input and could consume more input, it will suspend parsing and
-- return a 'T.Partial' continuation.
--
-- Supplying the 'T.Partial' continuation with another string will
-- resume parsing at the point where it was suspended, with the string
-- you supplied used as new input at the end of the existing
-- input. You must be prepared for the result of the resumed parse to
-- be another 'Partial' continuation.
--
-- To indicate that you have no more input, supply the 'Partial'
-- continuation with an 'T.empty' 'Text'.
--
-- Remember that some parsing combinators will not return a result
-- until they reach the end of input.  They may thus cause 'T.Partial'
-- results to be returned.
--
-- If you do not need support for incremental input, consider using
-- the 'I.parseOnly' function to run your parser.  It will never
-- prompt for more input.
--
-- /Note/: incremental input does /not/ imply that attoparsec will
-- release portions of its internal state for garbage collection as it
-- proceeds.  Its internal representation is equivalent to a single
-- 'Text': if you feed incremental input to an a parser, it will
-- require memory proportional to the amount of input you supply.
-- (This is necessary to support arbitrary backtracking.)

-- $performance
--
-- If you write an attoparsec-based parser carefully, it can be
-- realistic to expect it to perform similarly to a hand-rolled C
-- parser (measuring megabytes parsed per second).
--
-- To actually achieve high performance, there are a few guidelines
-- that it is useful to follow.
--
-- Use the 'Text'-oriented parsers whenever possible,
-- e.g. 'I.takeWhile1' instead of 'many1' 'I.anyChar'.  There is
-- about a factor of 100 difference in performance between the two
-- kinds of parser.
--
-- For very simple character-testing predicates, write them by hand
-- instead of using 'I.inClass' or 'I.notInClass'.  For instance, both
-- of these predicates test for an end-of-line character, but the
-- first is much faster than the second:
--
-- >endOfLine_fast c = c == '\r' || c == '\n'
-- >endOfLine_slow   = inClass "\r\n"
--
-- Make active use of benchmarking and profiling tools to measure,
-- find the problems with, and improve the performance of your parser.

-- | Run a parser and print its result to standard output.
parseTest :: (Show a) => I.Parser a -> Text -> IO ()
parseTest p s = print (parse p s)

-- | Run a parser with an initial input string, and a monadic action
-- that can supply more input if needed.
parseWith :: Monad m =>
             (m Text)
          -- ^ An action that will be executed to provide the parser
          -- with more input, if necessary.  The action must return an
          -- 'T.empty' string when there is no more input available.
          -> I.Parser a
          -> Text
          -- ^ Initial input for the parser.
          -> m (Result a)
parseWith refill p s = step $ parse p s
  where step (T.Partial k) = (step . k) =<< refill
        step r           = return r
{-# INLINE parseWith #-}

-- | Convert a 'Result' value to a 'Maybe' value. A 'Partial' result
-- is treated as failure.
maybeResult :: Result r -> Maybe r
maybeResult (T.Done _ r) = Just r
maybeResult _            = Nothing

-- | Convert a 'Result' value to an 'Either' value. A 'Partial' result
-- is treated as failure.
eitherResult :: Result r -> Either String r
eitherResult (T.Done _ r)        = Right r
eitherResult (T.Fail _ [] msg)   = Left msg
eitherResult (T.Fail _ ctxs msg) = Left (intercalate " > " ctxs ++ ": " ++ msg)
eitherResult _                   = Left "Result: incomplete input"

-- | A predicate that matches either a carriage return @\'\\r\'@ or
-- newline @\'\\n\'@ character.
isEndOfLine :: Char -> Bool
isEndOfLine c = c == '\n' || c == '\r'
{-# INLINE isEndOfLine #-}

-- | A predicate that matches either a space @\' \'@ or horizontal tab
-- @\'\\t\'@ character.
isHorizontalSpace :: Char -> Bool
isHorizontalSpace c = c == ' ' || c == '\t'
{-# INLINE isHorizontalSpace #-}

-- | Parse and decode an unsigned hexadecimal number.  The hex digits
-- @\'a\'@ through @\'f\'@ may be upper or lower case.
--
-- This parser does not accept a leading @\"0x\"@ string.
hexadecimal :: (Integral a, Bits a) => Parser a
hexadecimal = T.foldl' step 0 `fmap` takeWhile1 isHexDigit
  where
    isHexDigit c = (c >= '0' && c <= '9') ||
                   (c >= 'a' && c <= 'f') ||
                   (c >= 'A' && c <= 'F')
    step a c | w >= 48 && w <= 57  = (a `shiftL` 4) .|. fromIntegral (w - 48)
             | w >= 97             = (a `shiftL` 4) .|. fromIntegral (w - 87)
             | otherwise           = (a `shiftL` 4) .|. fromIntegral (w - 55)
      where w = ord c
{-# SPECIALISE hexadecimal :: Parser Int #-}
{-# SPECIALISE hexadecimal :: Parser Int8 #-}
{-# SPECIALISE hexadecimal :: Parser Int16 #-}
{-# SPECIALISE hexadecimal :: Parser Int32 #-}
{-# SPECIALISE hexadecimal :: Parser Int64 #-}
{-# SPECIALISE hexadecimal :: Parser Integer #-}
{-# SPECIALISE hexadecimal :: Parser Word #-}
{-# SPECIALISE hexadecimal :: Parser Word8 #-}
{-# SPECIALISE hexadecimal :: Parser Word16 #-}
{-# SPECIALISE hexadecimal :: Parser Word32 #-}
{-# SPECIALISE hexadecimal :: Parser Word64 #-}

-- | Parse and decode an unsigned decimal number.
decimal :: Integral a => Parser a
decimal = T.foldl' step 0 `fmap` takeWhile1 isDecimal
  where step a c = a * 10 + fromIntegral (ord c - 48)
{-# SPECIALISE decimal :: Parser Int #-}
{-# SPECIALISE decimal :: Parser Int8 #-}
{-# SPECIALISE decimal :: Parser Int16 #-}
{-# SPECIALISE decimal :: Parser Int32 #-}
{-# SPECIALISE decimal :: Parser Int64 #-}
{-# SPECIALISE decimal :: Parser Integer #-}
{-# SPECIALISE decimal :: Parser Word #-}
{-# SPECIALISE decimal :: Parser Word8 #-}
{-# SPECIALISE decimal :: Parser Word16 #-}
{-# SPECIALISE decimal :: Parser Word32 #-}
{-# SPECIALISE decimal :: Parser Word64 #-}

isDecimal :: Char -> Bool
isDecimal c = c >= '0' && c <= '9'
{-# INLINE isDecimal #-}

-- | Parse a number with an optional leading @\'+\'@ or @\'-\'@ sign
-- character.
signed :: Num a => Parser a -> Parser a
{-# SPECIALISE signed :: Parser Int -> Parser Int #-}
{-# SPECIALISE signed :: Parser Int8 -> Parser Int8 #-}
{-# SPECIALISE signed :: Parser Int16 -> Parser Int16 #-}
{-# SPECIALISE signed :: Parser Int32 -> Parser Int32 #-}
{-# SPECIALISE signed :: Parser Int64 -> Parser Int64 #-}
{-# SPECIALISE signed :: Parser Integer -> Parser Integer #-}
signed p = (negate <$> (I.char '-' *> p))
       <|> (I.char '+' *> p)
       <|> p

-- | Parse a rational number.
--
-- The syntax accepted by this parser is the same as for 'double'.
--
-- /Note/: this parser is not safe for use with inputs from untrusted
-- sources.  An input with a suitably large exponent such as
-- @"1e1000000000"@ will cause a huge 'Integer' to be allocated,
-- resulting in what is effectively a denial-of-service attack.
--
-- In most cases, it is better to use 'double' or 'scientific'
-- instead.
rational :: Fractional a => Parser a
{-# SPECIALIZE rational :: Parser Double #-}
{-# SPECIALIZE rational :: Parser Float #-}
{-# SPECIALIZE rational :: Parser Rational #-}
{-# SPECIALIZE rational :: Parser Scientific #-}
rational = scientifically realToFrac

-- | Parse a 'Double'.
--
-- This parser accepts an optional leading sign character, followed by
-- at most one decimal digit.  The syntax is similar to that accepted by
-- the 'read' function, with the exception that a trailing @\'.\'@ is
-- consumed.
--
-- === Examples
--
-- These examples use this helper:
--
-- @
-- r :: 'Parser' a -> 'Data.Text.Text' -> 'Data.Attoparsec.Text.Result' a
-- r p s = 'feed' ('Data.Attoparsec.parse' p s) 'mempty'
-- @
--
-- Examples with behaviour identical to 'read', if you feed an empty
-- continuation to the first result:
--
-- > r double "3"     == Done "" 3.0
-- > r double "3.1"   == Done "" 3.1
-- > r double "3e4"   == Done "" 30000.0
-- > r double "3.1e4" == Done "" 31000.0
-- > r double "3e"    == Done "e" 3.0
--
-- Examples with behaviour identical to 'read':
--
-- > r double ".3"    == Fail ".3" _ _
-- > r double "e3"    == Fail "e3" _ _
--
-- Example of difference from 'read':
--
-- > r double "3.foo" == Done "foo" 3.0
--
-- This function does not accept string representations of \"NaN\" or
-- \"Infinity\".
double :: Parser Double
double = scientifically Sci.toRealFloat

-- | Parse a number, attempting to preserve both speed and precision.
--
-- The syntax accepted by this parser is the same as for 'double'.
--
-- This function does not accept string representations of \"NaN\" or
-- \"Infinity\".
number :: Parser Number
number = scientifically $ \s ->
            let e = Sci.base10Exponent s
                c = Sci.coefficient s
            in if e >= 0
               then I (c * 10 ^ e)
               else D (Sci.toRealFloat s)
{-# DEPRECATED number "Use 'scientific' instead." #-}

-- | Parse a scientific number.
--
-- The syntax accepted by this parser is the same as for 'double'.
scientific :: Parser Scientific
scientific = scientifically id

-- A strict pair
data SP = SP !Integer {-# UNPACK #-}!Int

{-# INLINE scientifically #-}
scientifically :: (Scientific -> a) -> Parser a
scientifically h = do
  !positive <- ((== '+') <$> I.satisfy (\c -> c == '-' || c == '+')) <|>
               pure True

  n <- decimal

  let f fracDigits = SP (T.foldl' step n fracDigits)
                        (negate $ T.length fracDigits)
      step a c = a * 10 + fromIntegral (ord c - 48)

  SP c e <- (I.satisfy (=='.') *> (f <$> I.takeWhile isDigit)) <|>
            pure (SP n 0)

  let !signedCoeff | positive  =  c
                   | otherwise = -c

  (I.satisfy (\w -> w == 'e' || w == 'E') *>
      fmap (h . Sci.scientific signedCoeff . (e +)) (signed decimal)) <|>
    return (h $ Sci.scientific signedCoeff    e)

-- | Parse a single digit, as recognised by 'isDigit'.
digit :: Parser Char
digit = I.satisfy isDigit <?> "digit"
{-# INLINE digit #-}

-- | Parse a letter, as recognised by 'isAlpha'.
letter :: Parser Char
letter = I.satisfy isAlpha <?> "letter"
{-# INLINE letter #-}

-- | Parse a space character, as recognised by 'isSpace'.
space :: Parser Char
space = I.satisfy isSpace <?> "space"
{-# INLINE space #-}

-- | Skip over white space.
skipSpace :: Parser ()
skipSpace = I.skipWhile isSpace
{-# INLINE skipSpace #-}

-- $specalt
--
-- If you enable the @OverloadedStrings@ language extension, you can
-- use the '*>' and '<*' combinators to simplify the common task of
-- matching a statically known string, then immediately parsing
-- something else.
--
-- Instead of writing something like this:
--
-- @
--'I.string' \"foo\" '*>' wibble
-- @
--
-- Using @OverloadedStrings@, you can omit the explicit use of
-- 'I.string', and write a more compact version:
--
-- @
-- \"foo\" '*>' wibble
-- @
--
-- (Note: the '.*>' and '<*.' combinators that were originally
-- provided for this purpose are obsolete and unnecessary, and will be
-- removed in the next major version.)

-- | /Obsolete/. A type-specialized version of '*>' for 'Text'. Use
-- '*>' instead.
(.*>) :: Text -> Parser a -> Parser a
s .*> f = I.string s *> f
{-# DEPRECATED (.*>) "This is no longer necessary, and will be removed. Use '*>' instead." #-}

-- | /Obsolete/. A type-specialized version of '<*' for 'Text'. Use
-- '*>' instead.
(<*.) :: Parser a -> Text -> Parser a
f <*. s = f <* I.string s
{-# DEPRECATED (<*.) "This is no longer necessary, and will be removed. Use '<*' instead." #-}
Summary ✨

This Haskell code defines a parser for various mathematical and scientific notation formats, including numbers, letters, spaces, and strings. It provides functions to parse specific types of input, such as digits, letters, and scientific notation, and combines them to create more complex parsers. The code is designed to work with the Scientific type, which represents a number in scientific notation.
Alerts (4)

Complexity hotspot; lines 293 to 296 (total complexity: 4)
293 294 295 296