PageRenderTime 38ms CodeModel.GetById 6ms RepoModel.GetById 1ms app.codeStats 0ms

/Data/Attoparsec/Text.hs

http://github.com/bos/attoparsec
Haskell | 502 lines | 177 code | 49 blank | 276 comment | 19 complexity | 2202aeaa3a566c0b3bb2bee715536a0d MD5 | raw file
Possible License(s): BSD-3-Clause
  1. {-# LANGUAGE BangPatterns, CPP, FlexibleInstances, TypeSynonymInstances #-}
  2. #if __GLASGOW_HASKELL__ >= 702
  3. {-# LANGUAGE Trustworthy #-} -- Imports internal modules
  4. #endif
  5. {-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}
  6. -- |
  7. -- Module : Data.Attoparsec.Text
  8. -- Copyright : Bryan O'Sullivan 2007-2015
  9. -- License : BSD3
  10. --
  11. -- Maintainer : bos@serpentine.com
  12. -- Stability : experimental
  13. -- Portability : unknown
  14. --
  15. -- Simple, efficient combinator parsing for 'Text' strings,
  16. -- loosely based on the Parsec library.
  17. module Data.Attoparsec.Text
  18. (
  19. -- * Differences from Parsec
  20. -- $parsec
  21. -- * Incremental input
  22. -- $incremental
  23. -- * Performance considerations
  24. -- $performance
  25. -- * Parser types
  26. Parser
  27. , Result
  28. , T.IResult(..)
  29. , I.compareResults
  30. -- * Running parsers
  31. , parse
  32. , feed
  33. , I.parseOnly
  34. , parseWith
  35. , parseTest
  36. -- ** Result conversion
  37. , maybeResult
  38. , eitherResult
  39. -- * Parsing individual characters
  40. , I.char
  41. , I.anyChar
  42. , I.notChar
  43. , I.satisfy
  44. , I.satisfyWith
  45. , I.skip
  46. -- ** Lookahead
  47. , I.peekChar
  48. , I.peekChar'
  49. -- ** Special character parsers
  50. , digit
  51. , letter
  52. , space
  53. -- ** Character classes
  54. , I.inClass
  55. , I.notInClass
  56. -- * Efficient string handling
  57. , I.string
  58. , I.stringCI
  59. , I.asciiCI
  60. , skipSpace
  61. , I.skipWhile
  62. , I.scan
  63. , I.runScanner
  64. , I.take
  65. , I.takeWhile
  66. , I.takeWhile1
  67. , I.takeTill
  68. -- ** String combinators
  69. -- $specalt
  70. , (.*>)
  71. , (<*.)
  72. -- ** Consume all remaining input
  73. , I.takeText
  74. , I.takeLazyText
  75. -- * Text parsing
  76. , I.endOfLine
  77. , isEndOfLine
  78. , isHorizontalSpace
  79. -- * Numeric parsers
  80. , decimal
  81. , hexadecimal
  82. , signed
  83. , double
  84. , Number(..)
  85. , number
  86. , rational
  87. , scientific
  88. -- * Combinators
  89. , try
  90. , (<?>)
  91. , choice
  92. , count
  93. , option
  94. , many'
  95. , many1
  96. , many1'
  97. , manyTill
  98. , manyTill'
  99. , sepBy
  100. , sepBy'
  101. , sepBy1
  102. , sepBy1'
  103. , skipMany
  104. , skipMany1
  105. , eitherP
  106. , I.match
  107. -- * State observation and manipulation functions
  108. , I.endOfInput
  109. , I.atEnd
  110. ) where
  111. #if !MIN_VERSION_base(4,8,0)
  112. import Control.Applicative (pure, (*>), (<*), (<$>))
  113. import Data.Word (Word)
  114. #endif
  115. import Control.Applicative ((<|>))
  116. import Data.Attoparsec.Combinator
  117. import Data.Attoparsec.Number (Number(..))
  118. import Data.Scientific (Scientific)
  119. import qualified Data.Scientific as Sci
  120. import Data.Attoparsec.Text.Internal (Parser, Result, parse, takeWhile1)
  121. import Data.Bits (Bits, (.|.), shiftL)
  122. import Data.Char (isAlpha, isDigit, isSpace, ord)
  123. import Data.Int (Int8, Int16, Int32, Int64)
  124. import Data.List (intercalate)
  125. import Data.Text (Text)
  126. import Data.Word (Word8, Word16, Word32, Word64)
  127. import qualified Data.Attoparsec.Internal as I
  128. import qualified Data.Attoparsec.Internal.Types as T
  129. import qualified Data.Attoparsec.Text.Internal as I
  130. import qualified Data.Text as T
  131. -- $parsec
  132. --
  133. -- Compared to Parsec 3, attoparsec makes several tradeoffs. It is
  134. -- not intended for, or ideal for, all possible uses.
  135. --
  136. -- * While attoparsec can consume input incrementally, Parsec cannot.
  137. -- Incremental input is a huge deal for efficient and secure network
  138. -- and system programming, since it gives much more control to users
  139. -- of the library over matters such as resource usage and the I/O
  140. -- model to use.
  141. --
  142. -- * Much of the performance advantage of attoparsec is gained via
  143. -- high-performance parsers such as 'I.takeWhile' and 'I.string'.
  144. -- If you use complicated combinators that return lists of
  145. -- characters, there is less performance difference between the two
  146. -- libraries.
  147. --
  148. -- * Unlike Parsec 3, attoparsec does not support being used as a
  149. -- monad transformer.
  150. --
  151. -- * attoparsec is specialised to deal only with strict 'Text'
  152. -- input. Efficiency concerns rule out both lists and lazy text.
  153. -- The usual use for lazy text would be to allow consumption of very
  154. -- large input without a large footprint. For this need,
  155. -- attoparsec's incremental input provides an excellent substitute,
  156. -- with much more control over when input takes place. If you must
  157. -- use lazy text, see the 'Lazy' module, which feeds lazy chunks to
  158. -- a regular parser.
  159. --
  160. -- * Parsec parsers can produce more helpful error messages than
  161. -- attoparsec parsers. This is a matter of focus: attoparsec avoids
  162. -- the extra book-keeping in favour of higher performance.
  163. -- $incremental
  164. --
  165. -- attoparsec supports incremental input, meaning that you can feed it
  166. -- a 'Text' that represents only part of the expected total amount
  167. -- of data to parse. If your parser reaches the end of a fragment of
  168. -- input and could consume more input, it will suspend parsing and
  169. -- return a 'T.Partial' continuation.
  170. --
  171. -- Supplying the 'T.Partial' continuation with another string will
  172. -- resume parsing at the point where it was suspended, with the string
  173. -- you supplied used as new input at the end of the existing
  174. -- input. You must be prepared for the result of the resumed parse to
  175. -- be another 'Partial' continuation.
  176. --
  177. -- To indicate that you have no more input, supply the 'Partial'
  178. -- continuation with an 'T.empty' 'Text'.
  179. --
  180. -- Remember that some parsing combinators will not return a result
  181. -- until they reach the end of input. They may thus cause 'T.Partial'
  182. -- results to be returned.
  183. --
  184. -- If you do not need support for incremental input, consider using
  185. -- the 'I.parseOnly' function to run your parser. It will never
  186. -- prompt for more input.
  187. --
  188. -- /Note/: incremental input does /not/ imply that attoparsec will
  189. -- release portions of its internal state for garbage collection as it
  190. -- proceeds. Its internal representation is equivalent to a single
  191. -- 'Text': if you feed incremental input to an a parser, it will
  192. -- require memory proportional to the amount of input you supply.
  193. -- (This is necessary to support arbitrary backtracking.)
  194. -- $performance
  195. --
  196. -- If you write an attoparsec-based parser carefully, it can be
  197. -- realistic to expect it to perform similarly to a hand-rolled C
  198. -- parser (measuring megabytes parsed per second).
  199. --
  200. -- To actually achieve high performance, there are a few guidelines
  201. -- that it is useful to follow.
  202. --
  203. -- Use the 'Text'-oriented parsers whenever possible,
  204. -- e.g. 'I.takeWhile1' instead of 'many1' 'I.anyChar'. There is
  205. -- about a factor of 100 difference in performance between the two
  206. -- kinds of parser.
  207. --
  208. -- For very simple character-testing predicates, write them by hand
  209. -- instead of using 'I.inClass' or 'I.notInClass'. For instance, both
  210. -- of these predicates test for an end-of-line character, but the
  211. -- first is much faster than the second:
  212. --
  213. -- >endOfLine_fast c = c == '\r' || c == '\n'
  214. -- >endOfLine_slow = inClass "\r\n"
  215. --
  216. -- Make active use of benchmarking and profiling tools to measure,
  217. -- find the problems with, and improve the performance of your parser.
  218. -- | Run a parser and print its result to standard output.
  219. parseTest :: (Show a) => I.Parser a -> Text -> IO ()
  220. parseTest p s = print (parse p s)
  221. -- | Run a parser with an initial input string, and a monadic action
  222. -- that can supply more input if needed.
  223. parseWith :: Monad m =>
  224. (m Text)
  225. -- ^ An action that will be executed to provide the parser
  226. -- with more input, if necessary. The action must return an
  227. -- 'T.empty' string when there is no more input available.
  228. -> I.Parser a
  229. -> Text
  230. -- ^ Initial input for the parser.
  231. -> m (Result a)
  232. parseWith refill p s = step $ parse p s
  233. where step (T.Partial k) = (step . k) =<< refill
  234. step r = return r
  235. {-# INLINE parseWith #-}
  236. -- | Convert a 'Result' value to a 'Maybe' value. A 'Partial' result
  237. -- is treated as failure.
  238. maybeResult :: Result r -> Maybe r
  239. maybeResult (T.Done _ r) = Just r
  240. maybeResult _ = Nothing
  241. -- | Convert a 'Result' value to an 'Either' value. A 'Partial' result
  242. -- is treated as failure.
  243. eitherResult :: Result r -> Either String r
  244. eitherResult (T.Done _ r) = Right r
  245. eitherResult (T.Fail _ [] msg) = Left msg
  246. eitherResult (T.Fail _ ctxs msg) = Left (intercalate " > " ctxs ++ ": " ++ msg)
  247. eitherResult _ = Left "Result: incomplete input"
  248. -- | A predicate that matches either a carriage return @\'\\r\'@ or
  249. -- newline @\'\\n\'@ character.
  250. isEndOfLine :: Char -> Bool
  251. isEndOfLine c = c == '\n' || c == '\r'
  252. {-# INLINE isEndOfLine #-}
  253. -- | A predicate that matches either a space @\' \'@ or horizontal tab
  254. -- @\'\\t\'@ character.
  255. isHorizontalSpace :: Char -> Bool
  256. isHorizontalSpace c = c == ' ' || c == '\t'
  257. {-# INLINE isHorizontalSpace #-}
  258. -- | Parse and decode an unsigned hexadecimal number. The hex digits
  259. -- @\'a\'@ through @\'f\'@ may be upper or lower case.
  260. --
  261. -- This parser does not accept a leading @\"0x\"@ string.
  262. hexadecimal :: (Integral a, Bits a) => Parser a
  263. hexadecimal = T.foldl' step 0 `fmap` takeWhile1 isHexDigit
  264. where
  265. isHexDigit c = (c >= '0' && c <= '9') ||
  266. (c >= 'a' && c <= 'f') ||
  267. (c >= 'A' && c <= 'F')
  268. step a c | w >= 48 && w <= 57 = (a `shiftL` 4) .|. fromIntegral (w - 48)
  269. | w >= 97 = (a `shiftL` 4) .|. fromIntegral (w - 87)
  270. | otherwise = (a `shiftL` 4) .|. fromIntegral (w - 55)
  271. where w = ord c
  272. {-# SPECIALISE hexadecimal :: Parser Int #-}
  273. {-# SPECIALISE hexadecimal :: Parser Int8 #-}
  274. {-# SPECIALISE hexadecimal :: Parser Int16 #-}
  275. {-# SPECIALISE hexadecimal :: Parser Int32 #-}
  276. {-# SPECIALISE hexadecimal :: Parser Int64 #-}
  277. {-# SPECIALISE hexadecimal :: Parser Integer #-}
  278. {-# SPECIALISE hexadecimal :: Parser Word #-}
  279. {-# SPECIALISE hexadecimal :: Parser Word8 #-}
  280. {-# SPECIALISE hexadecimal :: Parser Word16 #-}
  281. {-# SPECIALISE hexadecimal :: Parser Word32 #-}
  282. {-# SPECIALISE hexadecimal :: Parser Word64 #-}
  283. -- | Parse and decode an unsigned decimal number.
  284. decimal :: Integral a => Parser a
  285. decimal = T.foldl' step 0 `fmap` takeWhile1 isDecimal
  286. where step a c = a * 10 + fromIntegral (ord c - 48)
  287. {-# SPECIALISE decimal :: Parser Int #-}
  288. {-# SPECIALISE decimal :: Parser Int8 #-}
  289. {-# SPECIALISE decimal :: Parser Int16 #-}
  290. {-# SPECIALISE decimal :: Parser Int32 #-}
  291. {-# SPECIALISE decimal :: Parser Int64 #-}
  292. {-# SPECIALISE decimal :: Parser Integer #-}
  293. {-# SPECIALISE decimal :: Parser Word #-}
  294. {-# SPECIALISE decimal :: Parser Word8 #-}
  295. {-# SPECIALISE decimal :: Parser Word16 #-}
  296. {-# SPECIALISE decimal :: Parser Word32 #-}
  297. {-# SPECIALISE decimal :: Parser Word64 #-}
  298. isDecimal :: Char -> Bool
  299. isDecimal c = c >= '0' && c <= '9'
  300. {-# INLINE isDecimal #-}
  301. -- | Parse a number with an optional leading @\'+\'@ or @\'-\'@ sign
  302. -- character.
  303. signed :: Num a => Parser a -> Parser a
  304. {-# SPECIALISE signed :: Parser Int -> Parser Int #-}
  305. {-# SPECIALISE signed :: Parser Int8 -> Parser Int8 #-}
  306. {-# SPECIALISE signed :: Parser Int16 -> Parser Int16 #-}
  307. {-# SPECIALISE signed :: Parser Int32 -> Parser Int32 #-}
  308. {-# SPECIALISE signed :: Parser Int64 -> Parser Int64 #-}
  309. {-# SPECIALISE signed :: Parser Integer -> Parser Integer #-}
  310. signed p = (negate <$> (I.char '-' *> p))
  311. <|> (I.char '+' *> p)
  312. <|> p
  313. -- | Parse a rational number.
  314. --
  315. -- The syntax accepted by this parser is the same as for 'double'.
  316. --
  317. -- /Note/: this parser is not safe for use with inputs from untrusted
  318. -- sources. An input with a suitably large exponent such as
  319. -- @"1e1000000000"@ will cause a huge 'Integer' to be allocated,
  320. -- resulting in what is effectively a denial-of-service attack.
  321. --
  322. -- In most cases, it is better to use 'double' or 'scientific'
  323. -- instead.
  324. rational :: Fractional a => Parser a
  325. {-# SPECIALIZE rational :: Parser Double #-}
  326. {-# SPECIALIZE rational :: Parser Float #-}
  327. {-# SPECIALIZE rational :: Parser Rational #-}
  328. {-# SPECIALIZE rational :: Parser Scientific #-}
  329. rational = scientifically realToFrac
  330. -- | Parse a 'Double'.
  331. --
  332. -- This parser accepts an optional leading sign character, followed by
  333. -- at most one decimal digit. The syntax is similar to that accepted by
  334. -- the 'read' function, with the exception that a trailing @\'.\'@ is
  335. -- consumed.
  336. --
  337. -- === Examples
  338. --
  339. -- These examples use this helper:
  340. --
  341. -- @
  342. -- r :: 'Parser' a -> 'Data.Text.Text' -> 'Data.Attoparsec.Text.Result' a
  343. -- r p s = 'feed' ('Data.Attoparsec.parse' p s) 'mempty'
  344. -- @
  345. --
  346. -- Examples with behaviour identical to 'read', if you feed an empty
  347. -- continuation to the first result:
  348. --
  349. -- > r double "3" == Done "" 3.0
  350. -- > r double "3.1" == Done "" 3.1
  351. -- > r double "3e4" == Done "" 30000.0
  352. -- > r double "3.1e4" == Done "" 31000.0
  353. -- > r double "3e" == Done "e" 3.0
  354. --
  355. -- Examples with behaviour identical to 'read':
  356. --
  357. -- > r double ".3" == Fail ".3" _ _
  358. -- > r double "e3" == Fail "e3" _ _
  359. --
  360. -- Example of difference from 'read':
  361. --
  362. -- > r double "3.foo" == Done "foo" 3.0
  363. --
  364. -- This function does not accept string representations of \"NaN\" or
  365. -- \"Infinity\".
  366. double :: Parser Double
  367. double = scientifically Sci.toRealFloat
  368. -- | Parse a number, attempting to preserve both speed and precision.
  369. --
  370. -- The syntax accepted by this parser is the same as for 'double'.
  371. --
  372. -- This function does not accept string representations of \"NaN\" or
  373. -- \"Infinity\".
  374. number :: Parser Number
  375. number = scientifically $ \s ->
  376. let e = Sci.base10Exponent s
  377. c = Sci.coefficient s
  378. in if e >= 0
  379. then I (c * 10 ^ e)
  380. else D (Sci.toRealFloat s)
  381. {-# DEPRECATED number "Use 'scientific' instead." #-}
  382. -- | Parse a scientific number.
  383. --
  384. -- The syntax accepted by this parser is the same as for 'double'.
  385. scientific :: Parser Scientific
  386. scientific = scientifically id
  387. -- A strict pair
  388. data SP = SP !Integer {-# UNPACK #-}!Int
  389. {-# INLINE scientifically #-}
  390. scientifically :: (Scientific -> a) -> Parser a
  391. scientifically h = do
  392. !positive <- ((== '+') <$> I.satisfy (\c -> c == '-' || c == '+')) <|>
  393. pure True
  394. n <- decimal
  395. let f fracDigits = SP (T.foldl' step n fracDigits)
  396. (negate $ T.length fracDigits)
  397. step a c = a * 10 + fromIntegral (ord c - 48)
  398. SP c e <- (I.satisfy (=='.') *> (f <$> I.takeWhile isDigit)) <|>
  399. pure (SP n 0)
  400. let !signedCoeff | positive = c
  401. | otherwise = -c
  402. (I.satisfy (\w -> w == 'e' || w == 'E') *>
  403. fmap (h . Sci.scientific signedCoeff . (e +)) (signed decimal)) <|>
  404. return (h $ Sci.scientific signedCoeff e)
  405. -- | Parse a single digit, as recognised by 'isDigit'.
  406. digit :: Parser Char
  407. digit = I.satisfy isDigit <?> "digit"
  408. {-# INLINE digit #-}
  409. -- | Parse a letter, as recognised by 'isAlpha'.
  410. letter :: Parser Char
  411. letter = I.satisfy isAlpha <?> "letter"
  412. {-# INLINE letter #-}
  413. -- | Parse a space character, as recognised by 'isSpace'.
  414. space :: Parser Char
  415. space = I.satisfy isSpace <?> "space"
  416. {-# INLINE space #-}
  417. -- | Skip over white space.
  418. skipSpace :: Parser ()
  419. skipSpace = I.skipWhile isSpace
  420. {-# INLINE skipSpace #-}
  421. -- $specalt
  422. --
  423. -- If you enable the @OverloadedStrings@ language extension, you can
  424. -- use the '*>' and '<*' combinators to simplify the common task of
  425. -- matching a statically known string, then immediately parsing
  426. -- something else.
  427. --
  428. -- Instead of writing something like this:
  429. --
  430. -- @
  431. --'I.string' \"foo\" '*>' wibble
  432. -- @
  433. --
  434. -- Using @OverloadedStrings@, you can omit the explicit use of
  435. -- 'I.string', and write a more compact version:
  436. --
  437. -- @
  438. -- \"foo\" '*>' wibble
  439. -- @
  440. --
  441. -- (Note: the '.*>' and '<*.' combinators that were originally
  442. -- provided for this purpose are obsolete and unnecessary, and will be
  443. -- removed in the next major version.)
  444. -- | /Obsolete/. A type-specialized version of '*>' for 'Text'. Use
  445. -- '*>' instead.
  446. (.*>) :: Text -> Parser a -> Parser a
  447. s .*> f = I.string s *> f
  448. {-# DEPRECATED (.*>) "This is no longer necessary, and will be removed. Use '*>' instead." #-}
  449. -- | /Obsolete/. A type-specialized version of '<*' for 'Text'. Use
  450. -- '*>' instead.
  451. (<*.) :: Parser a -> Text -> Parser a
  452. f <*. s = f <* I.string s
  453. {-# DEPRECATED (<*.) "This is no longer necessary, and will be removed. Use '<*' instead." #-}