PageRenderTime 55ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/compiler/parser/Lexer.x

https://bitbucket.org/carter/ghc
Alex | 1356 lines | 1174 code | 182 blank | 0 comment | 0 complexity | d4d61ffd76a3bafcc0cd15839d0ef0ad MD5 | raw file
  1. -----------------------------------------------------------------------------
  2. -- (c) The University of Glasgow, 2006
  3. --
  4. -- GHC's lexer.
  5. --
  6. -- This is a combination of an Alex-generated lexer from a regex
  7. -- definition, with some hand-coded bits.
  8. --
  9. -- Completely accurate information about token-spans within the source
  10. -- file is maintained. Every token has a start and end RealSrcLoc
  11. -- attached to it.
  12. --
  13. -----------------------------------------------------------------------------
  14. -- ToDo / known bugs:
  15. -- - parsing integers is a bit slow
  16. -- - readRational is a bit slow
  17. --
  18. -- Known bugs, that were also in the previous version:
  19. -- - M... should be 3 tokens, not 1.
  20. -- - pragma-end should be only valid in a pragma
  21. -- qualified operator NOTES.
  22. --
  23. -- - If M.(+) is a single lexeme, then..
  24. -- - Probably (+) should be a single lexeme too, for consistency.
  25. -- Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
  26. -- - But we have to rule out reserved operators, otherwise (..) becomes
  27. -- a different lexeme.
  28. -- - Should we therefore also rule out reserved operators in the qualified
  29. -- form? This is quite difficult to achieve. We don't do it for
  30. -- qualified varids.
  31. {
  32. -- XXX The above flags turn off warnings in the generated code:
  33. {-# LANGUAGE BangPatterns #-}
  34. {-# OPTIONS_GHC -fno-warn-unused-matches #-}
  35. {-# OPTIONS_GHC -fno-warn-unused-binds #-}
  36. {-# OPTIONS_GHC -fno-warn-unused-imports #-}
  37. {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
  38. -- But alex still generates some code that causes the "lazy unlifted bindings"
  39. -- warning, and old compilers don't know about it so we can't easily turn
  40. -- it off, so for now we use the sledge hammer:
  41. {-# OPTIONS_GHC -w #-}
  42. {-# OPTIONS_GHC -funbox-strict-fields #-}
  43. module Lexer (
  44. Token(..), lexer, pragState, mkPState, PState(..),
  45. P(..), ParseResult(..), getSrcLoc,
  46. getPState, getDynFlags, withThisPackage,
  47. failLocMsgP, failSpanMsgP, srcParseFail,
  48. getMessages,
  49. popContext, pushCurrentContext, setLastToken, setSrcLoc,
  50. activeContext, nextIsEOF,
  51. getLexState, popLexState, pushLexState,
  52. extension, bangPatEnabled, datatypeContextsEnabled,
  53. traditionalRecordSyntaxEnabled,
  54. typeLiteralsEnabled,
  55. explicitNamespacesEnabled, sccProfilingOn, hpcEnabled,
  56. addWarning,
  57. lexTokenStream
  58. ) where
  59. import Bag
  60. import ErrUtils
  61. import Outputable
  62. import StringBuffer
  63. import FastString
  64. import SrcLoc
  65. import UniqFM
  66. import DynFlags
  67. import Module
  68. import Ctype
  69. import BasicTypes ( InlineSpec(..), RuleMatchInfo(..), FractionalLit(..) )
  70. import Util ( readRational )
  71. import Control.Monad
  72. import Data.Bits
  73. import Data.Char
  74. import Data.List
  75. import Data.Maybe
  76. import Data.Map (Map)
  77. import qualified Data.Map as Map
  78. import Data.Ratio
  79. import Data.Word
  80. }
-- -----------------------------------------------------------------------------
-- Character-set and regex macros.
-- The \x01..\x06 character classes are single-byte stand-ins: alexGetChar
-- classifies real Unicode characters and maps each Unicode category onto
-- one of these placeholder bytes so that Alex's 8-bit tables can handle them.

$unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
$whitechar   = [\ \n\r\f\v $unispace]
$white_no_nl = $whitechar # \n
$tab         = \t

$ascdigit = 0-9
$unidigit = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
$decdigit = $ascdigit -- for now, should really be $digit (ToDo)
$digit    = [$ascdigit $unidigit]

$special   = [\(\)\,\;\[\]\`\{\}]
$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
$unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
$symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']

$unilarge = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
$asclarge = [A-Z]
$large    = [$asclarge $unilarge]

$unismall = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
$ascsmall = [a-z]
$small    = [$ascsmall $unismall \_]

$unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
$graphic    = [$small $large $symbol $digit $special $unigraphic \:\"\']

$octit      = 0-7
$hexit      = [$decdigit A-F a-f]
$symchar    = [$symbol \:]
$nl         = [\n\r]
$idchar     = [$small $large $digit \']
$pragmachar = [$small $large $digit]

-- Characters that introduce a Haddock comment: -- |, -- ^, -- *, -- $.
$docsym = [\| \^ \* \$]

-- Named regular expressions for the lexical grammar.
@varid  = $small $idchar*
@conid  = $large $idchar*
@varsym = $symbol $symchar*
@consym = \: $symchar*

@decimal     = $decdigit+
@octal       = $octit+
@hexadecimal = $hexit+
@exponent    = [eE] [\-\+]? @decimal

-- we support the hierarchical module name extension:
@qual = (@conid \.)+

@floating_point = @decimal \. @decimal @exponent? | @decimal @exponent

-- normal signed numerical literals can only be explicitly negative,
-- not explicitly positive (contrast @exponent)
@negative = \-
@signed = @negative ?
haskell :-

-- everywhere: skip whitespace and comments
$white_no_nl+ ;
$tab+ { warn Opt_WarnTabs (text "Tab character") }

-- Everywhere: deal with nested comments. We explicitly rule out
-- pragmas, "{-#", so that we don't accidentally treat them as comments.
-- (this can happen even though pragmas will normally take precedence due to
-- longest-match, because pragmas aren't valid in every state, but comments
-- are). We also rule out nested Haddock comments, if the -haddock flag is
-- set.
"{-" / { isNormalComment } { nested_comment lexToken }

-- Single-line comments are a bit tricky. Haskell 98 says that two or
-- more dashes followed by a symbol should be parsed as a varsym, so we
-- have to exclude those.
-- Since Haddock comments aren't valid in every state, we need to rule them
-- out here.

-- The following two rules match comments that begin with two dashes, but
-- continue with a different character. The rules test that this character
-- is not a symbol (in which case we'd have a varsym), and that it's not a
-- space followed by a Haddock comment symbol (docsym) (in which case we'd
-- have a Haddock comment). The rules then munch the rest of the line.
"-- " ~[$docsym \#] .* { lineCommentToken }
"--" [^$symbol : \ ] .* { lineCommentToken }

-- Next, match Haddock comments if no -haddock flag
"-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }

-- Now, when we've matched comments that begin with 2 dashes and continue
-- with a different character, we need to match comments that begin with three
-- or more dashes (which clearly can't be Haddock comments). We only need to
-- make sure that the first non-dash character isn't a symbol, and munch the
-- rest of the line.
"---"\-* [^$symbol :] .* { lineCommentToken }

-- Since the previous rules all match dashes followed by at least one
-- character, we also need to match a whole line filled with just dashes.
"--"\-* / { atEOL } { lineCommentToken }

-- We need this rule since none of the other single line comment rules
-- actually match this case.
"-- " / { atEOL } { lineCommentToken }

-- 'bol' state: beginning of a line. Slurp up all the whitespace (including
-- blank lines) until we find a non-whitespace character, then do layout
-- processing.
--
-- One slight wibble here: what if the line begins with {-#? In
-- theory, we have to lex the pragma to see if it's one we recognise,
-- and if it is, then we backtrack and do_bol, otherwise we treat it
-- as a nested comment. We don't bother with this: if the line begins
-- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
<bol> {
  \n ;
  ^\# (line)? { begin line_prag1 }
  ^\# pragma .* \n ; -- GCC 3.3 CPP generated, apparently
  ^\# \! .* \n ; -- #!, for scripts
  () { do_bol }
}
-- after a layout keyword (let, where, do, of), we begin a new layout
-- context if the curly brace is missing.
-- Careful! This stuff is quite delicate.
<layout, layout_do> {
  \{ / { notFollowedBy '-' } { hopefully_open_brace }
  -- we might encounter {-# here, but {- has been handled already
  \n ;
  ^\# (line)? { begin line_prag1 }
}

-- do is treated in a subtly different way, see new_layout_context
<layout> () { new_layout_context True }
<layout_do> () { new_layout_context False }

-- after a new layout context which was found to be to the left of the
-- previous context, we have generated a '{' token, and we now need to
-- generate a matching '}' token.
<layout_left> () { do_layout_left }

<0,option_prags> \n { begin bol }

"{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
  { dispatch_pragmas linePrags }

-- single-line line pragmas, of the form
-- # <line> "<file>" <extra-stuff> \n
<line_prag1> $decdigit+ { setLine line_prag1a }
<line_prag1a> \" [$graphic \ ]* \" { setFile line_prag1b }
<line_prag1b> .* { pop }

-- Haskell-style line pragmas, of the form
-- {-# LINE <line> "<file>" #-}
<line_prag2> $decdigit+ { setLine line_prag2a }
<line_prag2a> \" [$graphic \ ]* \" { setFile line_prag2b }
<line_prag2b> "#-}"|"-}" { pop }
-- NOTE: accept -} at the end of a LINE pragma, for compatibility
-- with older versions of GHC which generated these.

<0,option_prags> {
  "{-#" $whitechar* $pragmachar+
        $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
    { dispatch_pragmas twoWordPrags }

  "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
    { dispatch_pragmas oneWordPrags }

  -- We ignore all these pragmas, but don't generate a warning for them
  "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
    { dispatch_pragmas ignoredPrags }

  -- ToDo: should only be valid inside a pragma:
  "#-}" { endPrag }
}

<option_prags> {
  "{-#" $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
    { dispatch_pragmas fileHeaderPrags }

  "-- #" { multiline_doc_comment }
}

<0> {
  -- In the "0" mode we ignore these pragmas
  "{-#" $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
    { nested_comment lexToken }
}

<0> {
  "-- #" .* { lineCommentToken }
}

<0,option_prags> {
  "{-#" { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
                   (nested_comment lexToken) }
}
-- '0' state: ordinary lexemes

-- Haddock comments
<0,option_prags> {
  "-- " $docsym / { ifExtension haddockEnabled } { multiline_doc_comment }
  "{-" \ ? $docsym / { ifExtension haddockEnabled } { nested_doc_comment }
}

-- "special" symbols

<0> {
  -- Parallel-array brackets, -XParallelArrays only.
  "[:" / { ifExtension parrEnabled } { token ITopabrack }
  ":]" / { ifExtension parrEnabled } { token ITcpabrack }
}

<0> {
  -- Template Haskell quotation brackets and splices.
  "[|" / { ifExtension thEnabled } { token ITopenExpQuote }
  "[e|" / { ifExtension thEnabled } { token ITopenExpQuote }
  "[p|" / { ifExtension thEnabled } { token ITopenPatQuote }
  "[d|" / { ifExtension thEnabled } { layout_token ITopenDecQuote }
  "[t|" / { ifExtension thEnabled } { token ITopenTypQuote }
  "|]" / { ifExtension thEnabled } { token ITcloseQuote }
  \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
  "$(" / { ifExtension thEnabled } { token ITparenEscape }

  -- For backward compatibility, accept the old dollar syntax
  "[$" @varid "|" / { ifExtension qqEnabled }
    { lex_quasiquote_tok }

  "[" @varid "|" / { ifExtension qqEnabled }
    { lex_quasiquote_tok }

  -- qualified quasi-quote (#5555)
  "[" @qual @varid "|" / { ifExtension qqEnabled }
    { lex_qquasiquote_tok }
}

<0> {
  -- Arrow-notation banana brackets; (| must not run into an operator.
  "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
    { special IToparenbar }
  "|)" / { ifExtension arrowsEnabled } { special ITcparenbar }
}

<0> {
  -- Implicit parameters: ?x
  \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
}

<0> {
  -- Unboxed tuples; (# must not run into an operator.
  "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
    { token IToubxparen }
  "#)" / { ifExtension unboxedTuplesEnabled }
    { token ITcubxparen }
}

<0,option_prags> {
  \( { special IToparen }
  \) { special ITcparen }
  \[ { special ITobrack }
  \] { special ITcbrack }
  \, { special ITcomma }
  \; { special ITsemi }
  \` { special ITbackquote }
  \{ { open_brace }
  \} { close_brace }
}

<0,option_prags> {
  @qual @varid { idtoken qvarid }
  @qual @conid { idtoken qconid }
  @varid { varid }
  @conid { idtoken conid }
}

<0> {
  -- MagicHash variants: identifiers ending in one or more '#'.
  @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
  @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
  @varid "#"+ / { ifExtension magicHashEnabled } { varid }
  @conid "#"+ / { ifExtension magicHashEnabled } { idtoken conid }
}

-- ToDo: - move `var` and (sym) into lexical syntax?
--       - remove backquote from $special?
<0> {
  @qual @varsym { idtoken qvarsym }
  @qual @consym { idtoken qconsym }
  @varsym { varsym }
  @consym { consym }
}

-- For the normal boxed literals we need to be careful
-- when trying to be close to Haskell98
<0> {
  -- Normal integral literals (:: Num a => a, from Integer)
  @decimal { tok_num positive 0 0 decimal }
  0[oO] @octal { tok_num positive 2 2 octal }
  0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }

  -- Normal rational literals (:: Fractional a => a, from Rational)
  @floating_point { strtoken tok_float }
}

<0> {
  -- Unboxed ints (:: Int#) and words (:: Word#)
  -- It's simpler (and faster?) to give separate cases to the negatives,
  -- especially considering octal/hexadecimal prefixes.
  @decimal \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
  0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
  0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
  @negative @decimal \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
  @negative 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
  @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }

  @decimal \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
  0[oO] @octal \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
  0[xX] @hexadecimal \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }

  -- Unboxed floats and doubles (:: Float#, :: Double#)
  -- prim_{float,double} work with signed literals
  @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
  @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
}

-- Strings and chars are lexed by hand-written code. The reason is
-- that even if we recognise the string or char here in the regex
-- lexer, we would still have to parse the string afterward in order
-- to convert it to a String.
<0> {
  \' { lex_char_tok }
  \" { lex_string_tok }
}
  346. {
-- -----------------------------------------------------------------------------
-- The token type

-- | The tokens produced by the lexer.  Grouped by category: keywords,
-- pragmas, reserved symbols, brackets/punctuation, identifiers, literals,
-- Template Haskell, arrow notation, and documentation comments.
data Token
  = ITas -- Haskell keywords
  | ITcase
  | ITclass
  | ITdata
  | ITdefault
  | ITderiving
  | ITdo
  | ITelse
  | IThiding
  | ITif
  | ITimport
  | ITin
  | ITinfix
  | ITinfixl
  | ITinfixr
  | ITinstance
  | ITlet
  | ITmodule
  | ITnewtype
  | ITof
  | ITqualified
  | ITthen
  | ITtype
  | ITwhere

  | ITscc -- ToDo: remove (we use {-# SCC "..." #-} now)

  | ITforall -- GHC extension keywords
  | ITforeign
  | ITexport
  | ITlabel
  | ITdynamic
  | ITsafe
  | ITinterruptible
  | ITunsafe
  | ITstdcallconv
  | ITccallconv
  | ITcapiconv
  | ITprimcallconv
  | ITmdo
  | ITfamily
  | ITgroup
  | ITby
  | ITusing

  -- Pragmas
  | ITinline_prag InlineSpec RuleMatchInfo
  | ITspec_prag -- SPECIALISE
  | ITspec_inline_prag Bool -- SPECIALISE INLINE (or NOINLINE)
  | ITsource_prag
  | ITrules_prag
  | ITwarning_prag
  | ITdeprecated_prag
  | ITline_prag
  | ITscc_prag
  | ITgenerated_prag
  | ITcore_prag -- hdaume: core annotations
  | ITunpack_prag
  | ITnounpack_prag
  | ITann_prag
  | ITclose_prag
  | IToptions_prag String
  | ITinclude_prag String
  | ITlanguage_prag
  | ITvect_prag
  | ITvect_scalar_prag
  | ITnovect_prag
  | ITctype

  | ITdotdot -- reserved symbols
  | ITcolon
  | ITdcolon
  | ITequal
  | ITlam
  | ITlcase
  | ITvbar
  | ITlarrow
  | ITrarrow
  | ITat
  | ITtilde
  | ITtildehsh
  | ITdarrow
  | ITminus
  | ITbang
  | ITstar
  | ITdot

  | ITbiglam -- GHC-extension symbols

  | ITocurly -- special symbols
  | ITccurly
  | ITvocurly -- virtual open curly, inserted by the layout algorithm
  | ITvccurly -- virtual close curly, inserted by the layout algorithm
  | ITobrack
  | ITopabrack -- [:, for parallel arrays with -XParallelArrays
  | ITcpabrack -- :], for parallel arrays with -XParallelArrays
  | ITcbrack
  | IToparen
  | ITcparen
  | IToubxparen
  | ITcubxparen
  | ITsemi
  | ITcomma
  | ITunderscore
  | ITbackquote
  | ITsimpleQuote -- '

  | ITvarid FastString -- identifiers
  | ITconid FastString
  | ITvarsym FastString
  | ITconsym FastString
  | ITqvarid (FastString,FastString) -- (qualifier, name)
  | ITqconid (FastString,FastString)
  | ITqvarsym (FastString,FastString)
  | ITqconsym (FastString,FastString)
  | ITprefixqvarsym (FastString,FastString)
  | ITprefixqconsym (FastString,FastString)

  | ITdupipvarid FastString -- GHC extension: implicit param: ?x

  | ITchar Char
  | ITstring FastString
  | ITinteger Integer
  | ITrational FractionalLit
  | ITprimchar Char
  | ITprimstring FastBytes
  | ITprimint Integer
  | ITprimword Integer
  | ITprimfloat FractionalLit
  | ITprimdouble FractionalLit

  -- Template Haskell extension tokens
  | ITopenExpQuote -- [| or [e|
  | ITopenPatQuote -- [p|
  | ITopenDecQuote -- [d|
  | ITopenTypQuote -- [t|
  | ITcloseQuote -- |]
  | ITidEscape FastString -- $x
  | ITparenEscape -- $(
  | ITtyQuote -- ''
  | ITquasiQuote (FastString,FastString,RealSrcSpan)
    -- ITquasiQuote(quoter, quote, loc)
    -- represents a quasi-quote of the form
    -- [quoter| quote |]
  | ITqQuasiQuote (FastString,FastString,FastString,RealSrcSpan)
    -- ITqQuasiQuote(Qual, quoter, quote, loc)
    -- represents a qualified quasi-quote of the form
    -- [Qual.quoter| quote |]

  -- Arrow notation extension
  | ITproc
  | ITrec
  | IToparenbar -- (|
  | ITcparenbar -- |)
  | ITlarrowtail -- -<
  | ITrarrowtail -- >-
  | ITLarrowtail -- -<<
  | ITRarrowtail -- >>-

  | ITunknown String -- Used when the lexer can't make sense of it
  | ITeof -- end of file token

  -- Documentation annotations
  | ITdocCommentNext String -- something beginning '-- |'
  | ITdocCommentPrev String -- something beginning '-- ^'
  | ITdocCommentNamed String -- something beginning '-- $'
  | ITdocSection Int String -- a section heading
  | ITdocOptions String -- doc options (prune, ignore-exports, etc)
  | ITdocOptionsOld String -- doc options declared "-- # ..."-style
  | ITlineComment String -- comment starting by "--"
  | ITblockComment String -- comment in {- -}
  deriving Show
  509. -- the bitmap provided as the third component indicates whether the
  510. -- corresponding extension keyword is valid under the extension options
  511. -- provided to the compiler; if the extension corresponding to *any* of the
  512. -- bits set in the bitmap is enabled, the keyword is valid (this setup
  513. -- facilitates using a keyword in two different extensions that can be
  514. -- activated independently)
  515. --
-- | Map from reserved (and extension) words to their token and an extension
-- bitmap.  A bitmap of 0 means the word is always reserved; otherwise the
-- word is reserved only when at least one of the listed extension bits is on
-- (see the comment above about the bitmap semantics).
reservedWordsFM :: UniqFM (Token, Int)
reservedWordsFM = listToUFM $
    map (\(x, y, z) -> (mkFastString x, (y, z)))
        [( "_", ITunderscore, 0 ),
         ( "as", ITas, 0 ),
         ( "case", ITcase, 0 ),
         ( "class", ITclass, 0 ),
         ( "data", ITdata, 0 ),
         ( "default", ITdefault, 0 ),
         ( "deriving", ITderiving, 0 ),
         ( "do", ITdo, 0 ),
         ( "else", ITelse, 0 ),
         ( "hiding", IThiding, 0 ),
         ( "if", ITif, 0 ),
         ( "import", ITimport, 0 ),
         ( "in", ITin, 0 ),
         ( "infix", ITinfix, 0 ),
         ( "infixl", ITinfixl, 0 ),
         ( "infixr", ITinfixr, 0 ),
         ( "instance", ITinstance, 0 ),
         ( "let", ITlet, 0 ),
         ( "module", ITmodule, 0 ),
         ( "newtype", ITnewtype, 0 ),
         ( "of", ITof, 0 ),
         ( "qualified", ITqualified, 0 ),
         ( "then", ITthen, 0 ),
         ( "type", ITtype, 0 ),
         ( "where", ITwhere, 0 ),
         ( "_scc_", ITscc, 0 ), -- ToDo: remove
         ( "forall", ITforall, bit explicitForallBit .|.
                               bit inRulePragBit),
         ( "mdo", ITmdo, bit recursiveDoBit),
         ( "family", ITfamily, bit tyFamBit),
         ( "group", ITgroup, bit transformComprehensionsBit),
         ( "by", ITby, bit transformComprehensionsBit),
         ( "using", ITusing, bit transformComprehensionsBit),
         ( "foreign", ITforeign, bit ffiBit),
         ( "export", ITexport, bit ffiBit),
         ( "label", ITlabel, bit ffiBit),
         ( "dynamic", ITdynamic, bit ffiBit),
         ( "safe", ITsafe, bit ffiBit .|.
                           bit safeHaskellBit),
         ( "interruptible", ITinterruptible, bit interruptibleFfiBit),
         ( "unsafe", ITunsafe, bit ffiBit),
         ( "stdcall", ITstdcallconv, bit ffiBit),
         ( "ccall", ITccallconv, bit ffiBit),
         ( "capi", ITcapiconv, bit cApiFfiBit),
         ( "prim", ITprimcallconv, bit ffiBit),
         ( "rec", ITrec, bit arrowsBit .|.
                         bit recursiveDoBit),
         ( "proc", ITproc, bit arrowsBit)
        ]
-- | Map from reserved operators to their token and a predicate on the
-- extension bitmap saying when the operator is reserved.
reservedSymsFM :: UniqFM (Token, Int -> Bool)
reservedSymsFM = listToUFM $
    map (\ (x,y,z) -> (mkFastString x,(y,z)))
      [ ("..", ITdotdot, always)
        -- (:) is a reserved op, meaning only list cons
       ,(":", ITcolon, always)
       ,("::", ITdcolon, always)
       ,("=", ITequal, always)
       ,("\\", ITlam, always)
       ,("|", ITvbar, always)
       ,("<-", ITlarrow, always)
       ,("->", ITrarrow, always)
       ,("@", ITat, always)
       ,("~", ITtilde, always)
       ,("~#", ITtildehsh, always)
       ,("=>", ITdarrow, always)
       ,("-", ITminus, always)
       ,("!", ITbang, always)

        -- For data T (a::*) = MkT
       ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
        -- For 'forall a . t'
       ,(".", ITdot, always) -- \i -> explicitForallEnabled i || inRulePrag i)

       ,("-<", ITlarrowtail, arrowsEnabled)
       ,(">-", ITrarrowtail, arrowsEnabled)
       ,("-<<", ITLarrowtail, arrowsEnabled)
       ,(">>-", ITRarrowtail, arrowsEnabled)

       ,("∷", ITdcolon, unicodeSyntaxEnabled)
       ,("⇒", ITdarrow, unicodeSyntaxEnabled)
       ,("∀", ITforall, \i -> unicodeSyntaxEnabled i &&
                              explicitForallEnabled i)
       ,("→", ITrarrow, unicodeSyntaxEnabled)
       ,("←", ITlarrow, unicodeSyntaxEnabled)

       ,("⤙", ITlarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
       ,("⤚", ITrarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
       ,("⤛", ITLarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)
       ,("⤜", ITRarrowtail, \i -> unicodeSyntaxEnabled i && arrowsEnabled i)

       ,("★", ITstar, unicodeSyntaxEnabled)

        -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
        -- form part of a large operator. This would let us have a better
        -- syntax for kinds: ɑ∷★→★ would be a legal kind signature. (maybe).
      ]
  609. -- -----------------------------------------------------------------------------
  610. -- Lexer actions
  611. type Action = RealSrcSpan -> StringBuffer -> Int -> P (RealLocated Token)
  612. special :: Token -> Action
  613. special tok span _buf _len = return (L span tok)
  614. token, layout_token :: Token -> Action
  615. token t span _buf _len = return (L span t)
  616. layout_token t span _buf _len = pushLexState layout >> return (L span t)
  617. idtoken :: (StringBuffer -> Int -> Token) -> Action
  618. idtoken f span buf len = return (L span $! (f buf len))
  619. skip_one_varid :: (FastString -> Token) -> Action
  620. skip_one_varid f span buf len
  621. = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
  622. strtoken :: (String -> Token) -> Action
  623. strtoken f span buf len =
  624. return (L span $! (f $! lexemeToString buf len))
  625. init_strtoken :: Int -> (String -> Token) -> Action
  626. -- like strtoken, but drops the last N character(s)
  627. init_strtoken drop f span buf len =
  628. return (L span $! (f $! lexemeToString buf (len-drop)))
  629. begin :: Int -> Action
  630. begin code _span _str _len = do pushLexState code; lexToken
  631. pop :: Action
  632. pop _span _buf _len = do _ <- popLexState
  633. lexToken
  634. hopefully_open_brace :: Action
  635. hopefully_open_brace span buf len
  636. = do relaxed <- extension relaxedLayout
  637. ctx <- getContext
  638. (AI l _) <- getInput
  639. let offset = srcLocCol l
  640. isOK = relaxed ||
  641. case ctx of
  642. Layout prev_off : _ -> prev_off < offset
  643. _ -> True
  644. if isOK then pop_and open_brace span buf len
  645. else failSpanMsgP (RealSrcSpan span) (text "Missing block")
  646. pop_and :: Action -> Action
  647. pop_and act span buf len = do _ <- popLexState
  648. act span buf len
  649. {-# INLINE nextCharIs #-}
  650. nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
  651. nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
  652. {-# INLINE nextCharIsNot #-}
  653. nextCharIsNot :: StringBuffer -> (Char -> Bool) -> Bool
  654. nextCharIsNot buf p = not (nextCharIs buf p)
  655. notFollowedBy :: Char -> AlexAccPred Int
  656. notFollowedBy char _ _ _ (AI _ buf)
  657. = nextCharIsNot buf (== char)
  658. notFollowedBySymbol :: AlexAccPred Int
  659. notFollowedBySymbol _ _ _ (AI _ buf)
  660. = nextCharIsNot buf (`elem` "!#$%&*+./<=>?@\\^|-~")
  661. -- We must reject doc comments as being ordinary comments everywhere.
  662. -- In some cases the doc comment will be selected as the lexeme due to
  663. -- maximal munch, but not always, because the nested comment rule is
  664. -- valid in all states, but the doc-comment rules are only valid in
  665. -- the non-layout states.
  666. isNormalComment :: AlexAccPred Int
  667. isNormalComment bits _ _ (AI _ buf)
  668. | haddockEnabled bits = notFollowedByDocOrPragma
  669. | otherwise = nextCharIsNot buf (== '#')
  670. where
  671. notFollowedByDocOrPragma
  672. = afterOptionalSpace buf (\b -> nextCharIsNot b (`elem` "|^*$#"))
  673. afterOptionalSpace :: StringBuffer -> (StringBuffer -> Bool) -> Bool
  674. afterOptionalSpace buf p
  675. = if nextCharIs buf (== ' ')
  676. then p (snd (nextChar buf))
  677. else p buf
  678. atEOL :: AlexAccPred Int
  679. atEOL _ _ _ (AI _ buf) = atEnd buf || currentChar buf == '\n'
  680. ifExtension :: (Int -> Bool) -> AlexAccPred Int
  681. ifExtension pred bits _ _ _ = pred bits
-- | Lex a (possibly multi-line) Haddock comment starting "-- |", "-- ^",
-- "-- $", "-- *" or "-- #".  Consecutive "--" lines are merged into one
-- doc comment, except for section headings ('oneLine'), which stop at the
-- first newline.
multiline_doc_comment :: Action
multiline_doc_comment span buf _len = withLexedDocType (worker "")
  where
    -- Accumulate the comment text (in reverse) until the comment ends.
    -- NOTE: the inner 'Just input ->' deliberately shadows 'input'.
    worker commentAcc input docType oneLine = case alexGetChar' input of
      Just ('\n', input')
        | oneLine -> docCommentEnd input commentAcc docType buf span
        | otherwise -> case checkIfCommentLine input' of
            Just input -> worker ('\n':commentAcc) input docType False
            Nothing -> docCommentEnd input commentAcc docType buf span
      Just (c, input) -> worker (c:commentAcc) input docType oneLine
      Nothing -> docCommentEnd input commentAcc docType buf span

    -- Does the next line continue the comment?  It must start (after
    -- horizontal whitespace) with exactly two dashes not followed by a
    -- third.  Returns the input positioned after the "--" on success.
    checkIfCommentLine input = check (dropNonNewlineSpace input)
      where
        check input = case alexGetChar' input of
          Just ('-', input) -> case alexGetChar' input of
            Just ('-', input) -> case alexGetChar' input of
              Just (c, _) | c /= '-' -> Just input
              _ -> Nothing
            _ -> Nothing
          _ -> Nothing

        -- Skip horizontal (non-newline) whitespace.
        dropNonNewlineSpace input = case alexGetChar' input of
          Just (c, input')
            | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
            | otherwise -> input
          Nothing -> input
  707. lineCommentToken :: Action
  708. lineCommentToken span buf len = do
  709. b <- extension rawTokenStreamEnabled
  710. if b then strtoken ITlineComment span buf len else lexToken
  711. {-
  712. nested comments require traversing by hand, they can't be parsed
  713. using regular expressions.
  714. -}
  715. nested_comment :: P (RealLocated Token) -> Action
  716. nested_comment cont span _str _len = do
  717. input <- getInput
  718. go "" (1::Int) input
  719. where
  720. go commentAcc 0 input = do setInput input
  721. b <- extension rawTokenStreamEnabled
  722. if b
  723. then docCommentEnd input commentAcc ITblockComment _str span
  724. else cont
  725. go commentAcc n input = case alexGetChar' input of
  726. Nothing -> errBrace input span
  727. Just ('-',input) -> case alexGetChar' input of
  728. Nothing -> errBrace input span
  729. Just ('\125',input) -> go commentAcc (n-1) input
  730. Just (_,_) -> go ('-':commentAcc) n input
  731. Just ('\123',input) -> case alexGetChar' input of
  732. Nothing -> errBrace input span
  733. Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
  734. Just (_,_) -> go ('\123':commentAcc) n input
  735. Just (c,input) -> go (c:commentAcc) n input
nested_doc_comment :: Action
-- A nested documentation comment ("{-|", "{-^", ...).  The doc flavour
-- is chosen by 'withLexedDocType' from the character preceding the
-- current position; 'go' then accumulates the comment body (in reverse)
-- until the matching "-}".
nested_doc_comment span buf _len = withLexedDocType (go "")
  where
    go commentAcc input docType _ = case alexGetChar' input of
      Nothing -> errBrace input span
      Just ('-',input) -> case alexGetChar' input of
        Nothing -> errBrace input span
        -- '\125' is '}': "-}" terminates the doc comment
        Just ('\125',input) ->
          docCommentEnd input commentAcc docType buf span
        Just (_,_) -> go ('-':commentAcc) input docType False
      -- '\123' is '{': a "{-" inside the doc comment starts an ordinary
      -- nested comment; skip it with nested_comment and then resume
      -- accumulating here.
      Just ('\123', input) -> case alexGetChar' input of
        Nothing -> errBrace input span
        Just ('-',input) -> do
          setInput input
          let cont = do input <- getInput; go commentAcc input docType False
          nested_comment cont span buf _len
        Just (_,_) -> go ('\123':commentAcc) input docType False
      Just (c,input) -> go (c:commentAcc) input docType False
withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (RealLocated Token))
                 -> P (RealLocated Token)
-- Inspect the character just before the current position (the marker
-- character that followed the comment opener) to decide which doc-comment
-- token constructor to pass to the given lexing continuation.  The Bool
-- argument appears to flag the section-heading case — TODO confirm
-- against the callers.
withLexedDocType lexDocComment = do
  input@(AI _ buf) <- getInput
  case prevChar buf ' ' of
    '|' -> lexDocComment input ITdocCommentNext False   -- documents the following decl
    '^' -> lexDocComment input ITdocCommentPrev False   -- documents the preceding decl
    '$' -> lexDocComment input ITdocCommentNamed False  -- named documentation chunk
    '*' -> lexDocSection 1 input                        -- section heading; count the stars
    '#' -> lexDocComment input ITdocOptionsOld False
    _ -> panic "withLexedDocType: Bad doc type"
  where
    -- Consume consecutive '*'s; their count becomes the section level.
    lexDocSection n input = case alexGetChar' input of
      Just ('*', input) -> lexDocSection (n+1) input
      Just (_, _) -> lexDocComment input (ITdocSection n) True
      Nothing -> do setInput input; lexToken -- eof reached, lex it normally
  770. -- RULES pragmas turn on the forall and '.' keywords, and we turn them
  771. -- off again at the end of the pragma.
  772. rulePrag :: Action
  773. rulePrag span _buf _len = do
  774. setExts (.|. bit inRulePragBit)
  775. return (L span ITrules_prag)
  776. endPrag :: Action
  777. endPrag span _buf _len = do
  778. setExts (.&. complement (bit inRulePragBit))
  779. return (L span ITclose_prag)
-- docCommentEnd
-------------------------------------------------------------------------------
-- This function is quite tricky. We can't just return a new token, we also
-- need to update the state of the parser. Why? Because the token is longer
-- than what was lexed by Alex, and the lexToken function doesn't know this, so
-- it writes the wrong token length to the parser state. This function is
-- called afterwards, so it can just update the state.
docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
                 RealSrcSpan -> P (RealLocated Token)
docCommentEnd input commentAcc docType buf span = do
  setInput input
  let (AI loc nextBuf) = input
      comment = reverse commentAcc   -- the accumulator was built in reverse
      -- widen the span to cover the whole comment, not just what Alex saw
      span' = mkRealSrcSpan (realSrcSpanStart span) loc
      last_len = byteDiff buf nextBuf  -- true byte length of the comment
  span `seq` setLastToken span' last_len
  return (L span' (docType comment))
errBrace :: AlexInput -> RealSrcSpan -> P a
-- Report an unterminated nested comment, spanning from the opening "{-"
-- to the position where input ran out.
errBrace (AI end _) span = failLocMsgP (realSrcSpanStart span) end "unterminated `{-'"
  799. open_brace, close_brace :: Action
  800. open_brace span _str _len = do
  801. ctx <- getContext
  802. setContext (NoLayout:ctx)
  803. return (L span ITocurly)
  804. close_brace span _str _len = do
  805. popContext
  806. return (L span ITccurly)
  807. qvarid, qconid :: StringBuffer -> Int -> Token
  808. qvarid buf len = ITqvarid $! splitQualName buf len False
  809. qconid buf len = ITqconid $! splitQualName buf len False
splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
-- takes a StringBuffer and a length, and returns the module name
-- and identifier parts of a qualified name. Splits at the *last* dot,
-- because of hierarchical module names.
--
-- The Bool is True for the parenthesised operator form (e.g. M.(+)),
-- in which case the parentheses are excluded from the identifier part.
splitQualName orig_buf len parens = split orig_buf orig_buf
  where
    -- 'buf' walks the lexeme; 'dot_buf' remembers the position just
    -- after the most recent qualifying dot seen so far.
    split buf dot_buf
      | orig_buf `byteDiff` buf >= len = done dot_buf   -- consumed the lexeme
      | c == '.' = found_dot buf'
      | otherwise = split buf' dot_buf
      where
        (c,buf') = nextChar buf
    -- careful, we might get names like M....
    -- so, if the character after the dot is not upper-case, this is
    -- the end of the qualifier part.
    found_dot buf -- buf points after the '.'
      | isUpper c = split buf' buf
      | otherwise = done buf
      where
        (c,buf') = nextChar buf
    -- Build the (module, identifier) pair from the final dot position.
    done dot_buf =
      (lexemeToFastString orig_buf (qual_size - 1),
       if parens -- Prelude.(+)
         then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
         else lexemeToFastString dot_buf (len - qual_size))
      where
        qual_size = orig_buf `byteDiff` dot_buf
varid :: Action
-- Lex a variable identifier.  Reserved words become keyword tokens, but
-- only when any extension bits attached to them in reservedWordsFM are
-- enabled; otherwise the lexeme stays an ordinary ITvarid.
varid span buf len =
  case lookupUFM reservedWordsFM fs of
    Just (ITcase, _) -> do
      -- "case" is special-cased: with LambdaCase enabled and the previous
      -- token being a lambda, "\case" lexes as ITlcase.
      lambdaCase <- extension lambdaCaseEnabled
      keyword <- if lambdaCase
                 then do
                   lastTk <- getLastTk
                   return $ case lastTk of
                     Just ITlam -> ITlcase
                     _ -> ITcase
                 else
                   return ITcase
      maybe_layout keyword
      return $ L span keyword
    Just (keyword, 0) -> do
      -- unconditional keyword: no extension bits required
      maybe_layout keyword
      return $ L span keyword
    Just (keyword, exts) -> do
      -- keyword only if at least one of its extension bits is on
      extsEnabled <- extension $ \i -> exts .&. i /= 0
      if extsEnabled
        then do
          maybe_layout keyword
          return $ L span keyword
        else
          return $ L span $ ITvarid fs
    Nothing ->
      return $ L span $ ITvarid fs
  where
    -- forced eagerly so the FastString is built exactly once
    !fs = lexemeToFastString buf len
  867. conid :: StringBuffer -> Int -> Token
  868. conid buf len = ITconid $! lexemeToFastString buf len
qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
-- Qualified operator lexemes.  The "prefix" variants are for the
-- parenthesised form (e.g. M.(+)); the True flag tells splitQualName to
-- strip the parentheses from the identifier part.
qvarsym buf len = ITqvarsym $! splitQualName buf len False
qconsym buf len = ITqconsym $! splitQualName buf len False
prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
varsym, consym :: Action
varsym = sym ITvarsym
consym = sym ITconsym

sym :: (FastString -> Token) -> Action
-- Lex an operator symbol.  Reserved operators (from reservedSymsFM)
-- become their keyword token when the associated extension predicate
-- holds; otherwise fall back to the supplied constructor
-- (ITvarsym / ITconsym).
sym con span buf len =
  case lookupUFM reservedSymsFM fs of
    Just (keyword, exts) -> do
      extsEnabled <- extension exts
      let !tk | extsEnabled = keyword
              | otherwise = con fs
      return $ L span tk
    Nothing ->
      return $ L span $! con fs
  where
    -- forced eagerly so the FastString is built exactly once
    !fs = lexemeToFastString buf len
-- Variations on the integral numeric literal.
tok_integral :: (Integer -> Token)            -- token constructor
             -> (Integer -> Integer)          -- sign adjustment (positive/negative)
             -> Int -> Int                    -- bytes to skip at front / subtract from length
             -> (Integer, (Char -> Int))      -- radix and digit-value function
             -> Action
tok_integral itint transint transbuf translen (radix,char_to_int) span buf len
  = return $ L span $ itint $! transint $ parseUnsignedInteger
      (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
-- some conveniences for use with tok_integral
tok_num :: (Integer -> Integer)
        -> Int -> Int
        -> (Integer, (Char->Int)) -> Action
tok_num = tok_integral ITinteger
tok_primint :: (Integer -> Integer)
            -> Int -> Int
            -> (Integer, (Char->Int)) -> Action
tok_primint = tok_integral ITprimint
tok_primword :: Int -> Int
             -> (Integer, (Char->Int)) -> Action
-- words are unsigned, so the sign transformer is fixed to 'positive'
tok_primword = tok_integral ITprimword positive
  910. positive, negative :: (Integer -> Integer)
  911. positive = id
  912. negative = negate
decimal, octal, hexadecimal :: (Integer, Char -> Int)
-- (radix, digit-value) pairs fed to tok_integral and friends.
decimal = (10,octDecDigit)
octal = (8,octDecDigit)
hexadecimal = (16,hexDigit)
-- readRational can understand negative rationals, exponents, everything.
tok_float, tok_primfloat, tok_primdouble :: String -> Token
tok_float str = ITrational $! readFractionalLit str
tok_primfloat str = ITprimfloat $! readFractionalLit str
tok_primdouble str = ITprimdouble $! readFractionalLit str

readFractionalLit :: String -> FractionalLit
-- Keep the original source text alongside the parsed Rational so the
-- literal can be reproduced exactly; both components are forced to WHNF.
readFractionalLit str = (FL $! str) $! readRational str
-- -----------------------------------------------------------------------------
-- Layout processing

-- we're at the first token on a line, insert layout tokens if necessary
do_bol :: Action
-- Compare the column of the first token on the line against the current
-- layout context (getOffside) and emit the implicit '}' or ';' required
-- by the layout rule.
do_bol span _str _len = do
  pos <- getOffside
  case pos of
    LT -> do
      --trace "layout: inserting '}'" $ do
      popContext
      -- do NOT pop the lex state, we might have a ';' to insert
      return (L span ITvccurly)
    EQ -> do
      --trace "layout: inserting ';'" $ do
      _ <- popLexState
      return (L span ITsemi)
    GT -> do
      -- token is further right than the context: nothing to insert
      _ <- popLexState
      lexToken
  943. -- certain keywords put us in the "layout" state, where we might
  944. -- add an opening curly brace.
  945. maybe_layout :: Token -> P ()
  946. maybe_layout t = do -- If the alternative layout rule is enabled then
  947. -- we never create an implicit layout context here.
  948. -- Layout is handled XXX instead.
  949. -- The code for closing implicit contexts, or
  950. -- inserting implicit semi-colons, is therefore
  951. -- irrelevant as it only applies in an implicit
  952. -- context.
  953. alr <- extension alternativeLayoutRule
  954. unless alr $ f t
  955. where f ITdo = pushLexState layout_do
  956. f ITmdo = pushLexState layout_do
  957. f ITof = pushLexState layout
  958. f ITlcase = pushLexState layout
  959. f ITlet = pushLexState layout
  960. f ITwhere = pushLexState layout
  961. f ITrec = pushLexState layout
  962. f _ = return ()
-- Pushing a new implicit layout context. If the indentation of the
-- next token is not greater than the previous layout context, then
-- Haskell 98 says that the new layout context should be empty; that is
-- the lexer must generate {}.
--
-- We are slightly more lenient than this: when the new context is started
-- by a 'do', then we allow the new context to be at the same indentation as
-- the previous context. This is what the 'strict' argument is for.
--
new_layout_context :: Bool -> Action
new_layout_context strict span _buf _len = do
  _ <- popLexState
  (AI l _) <- getInput
  let offset = srcLocCol l   -- column of the token that opens the new context
  ctx <- getContext
  -- NondecreasingIndentation relaxes the 'do' case; without it every
  -- context is treated strictly.
  nondecreasing <- extension nondecreasingIndentation
  let strict' = strict || not nondecreasing
  case ctx of
    Layout prev_off : _ |
      (strict' && prev_off >= offset ||
       not strict' && prev_off > offset) -> do
        -- token is indented to the left of the previous context.
        -- we must generate a {} sequence now.
        pushLexState layout_left
        return (L span ITvocurly)
    _ -> do
      setContext (Layout offset : ctx)
      return (L span ITvocurly)
  991. do_layout_left :: Action
  992. do_layout_left span _buf _len = do
  993. _ <- popLexState
  994. pushLexState bol -- we must be at the start of a line
  995. return (L span ITvccurly)
-- -----------------------------------------------------------------------------
-- LINE pragmas

setLine :: Int -> Action
-- Handle the numeric part of a LINE pragma: reset the recorded source
-- location.  'code' is the lexer state to continue in — presumably the
-- state that expects the following file name; TODO confirm.
setLine code span buf len = do
  let line = parseUnsignedInteger buf len 10 octDecDigit
  setSrcLoc (mkRealSrcLoc (srcSpanFile span) (fromIntegral line - 1) 1)
  -- subtract one: the line number refers to the *following* line
  _ <- popLexState
  pushLexState code
  lexToken
setFile :: Int -> Action
-- Handle the file-name part of a LINE pragma: record the new file name
-- (undoubling backslash escapes) and reset the source location.  The
-- lexeme includes the surrounding quotes, hence stepOn / len-2.
setFile code span buf len = do
  let file = mkFastString (go (lexemeToString (stepOn buf) (len-2)))
        where go ('\\':c:cs) = c : go cs
              go (c:cs) = c : go cs
              go [] = []
        -- decode escapes in the filename. e.g. on Windows
        -- when our filenames have backslashes in, gcc seems to
        -- escape the backslashes. One symptom of not doing this
        -- is that filenames in error messages look a bit strange:
        -- C:\\foo\bar.hs
        -- only the first backslash is doubled, because we apply
        -- System.FilePath.normalise before printing out
        -- filenames and it does not remove duplicate
        -- backslashes after the drive letter (should it?).
  setAlrLastLoc $ alrInitialLoc file
  setSrcLoc (mkRealSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
  addSrcFile file
  _ <- popLexState
  pushLexState code
  lexToken
alrInitialLoc :: FastString -> RealSrcSpan
-- Initial "last location" for the alternative layout rule when a new
-- file is entered.
alrInitialLoc file = mkRealSrcSpan loc loc
  where -- This is a hack to ensure that the first line in a file
        -- looks like it is after the initial location:
        loc = mkRealSrcLoc file (-1) (-1)
-- -----------------------------------------------------------------------------
-- Options, includes and language pragmas.

lex_string_prag :: (String -> Token) -> Action
-- Collect the raw text of a pragma (e.g. OPTIONS) up to the closing
-- "#-}", wrapping it with the given token constructor.  The accumulator
-- is built in reverse.
lex_string_prag mkTok span _buf _len
  = do input <- getInput
       start <- getSrcLoc
       tok <- go [] input
       end <- getSrcLoc
       return (L (mkRealSrcSpan start end) tok)
  where go acc input
          = if isString input "#-}"
              then do setInput input
                      return (mkTok (reverse acc))
              else case alexGetChar input of
                     Just (c,i) -> go (c:acc) i
                     Nothing -> err input
        -- Does the input start with the given string?  (Only peeks;
        -- the caller leaves 'input' unchanged on match.)
        isString _ [] = True
        isString i (x:xs)
          = case alexGetChar i of
              Just (c,i') | c == x -> isString i' xs
              _other -> False
        err (AI end _) = failLocMsgP (realSrcSpanStart span) end "unterminated options pragma"
-- -----------------------------------------------------------------------------
-- Strings & Chars

-- This stuff is horrible. I hates it.

lex_string_tok :: Action
-- Entry point for string literals: lex_string does the work; the span is
-- then widened to cover the whole literal, which is longer than what
-- Alex matched (just the opening quote).
lex_string_tok span _buf _len = do
  tok <- lex_string ""
  end <- getSrcLoc
  return (L (mkRealSrcSpan (realSrcSpanStart span) end) tok)
lex_string :: String -> P Token
-- Accumulate the characters of a string literal (in reverse) until the
-- closing quote.  Handles escapes, the empty escape "\&", and string gaps.
lex_string s = do
  i <- getInput
  case alexGetChar' i of
    Nothing -> lit_error i
    Just ('"',i) -> do
      -- closing quote.  With MagicHash, a trailing '#' makes this a
      -- primitive string literal, whose characters must all be <= '\xFF'.
      setInput i
      magicHash <- extension magicHashEnabled
      if magicHash
        then do
          i <- getInput
          case alexGetChar' i of
            Just ('#',i) -> do
              setInput i
              if any (> '\xFF') s
                then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
                else let fb = unsafeMkFastBytesString (reverse s)
                     in return (ITprimstring fb)
            _other ->
              return (ITstring (mkFastString (reverse s)))
        else
          return (ITstring (mkFastString (reverse s)))
    Just ('\\',i)
      -- "\&" contributes no character; backslash-whitespace begins a
      -- string gap.  If neither guard matches, the match falls through
      -- to the general case below, which handles ordinary escapes.
      | Just ('&',i) <- next -> do
          setInput i; lex_string s
      | Just (c,i) <- next, c <= '\x7f' && is_space c -> do
          -- is_space only works for <= '\x7f' (#3751, #5425)
          setInput i; lex_stringgap s
      where next = alexGetChar' i
    Just (c, i1) -> do
      case c of
        '\\' -> do setInput i1; c' <- lex_escape; lex_string (c':s)
        c | isAny c -> do setInput i1; lex_string (c:s)
        _other -> lit_error i
lex_stringgap :: String -> P Token
-- Inside a string gap (backslash, whitespace, backslash): skip the
-- whitespace until the closing backslash, then resume lex_string with
-- the accumulator unchanged.
lex_stringgap s = do
  i <- getInput
  c <- getCharOrFail i
  case c of
    '\\' -> lex_string s
    c | c <= '\x7f' && is_space c -> lex_stringgap s
    -- is_space only works for <= '\x7f' (#3751, #5425)
    _other -> lit_error i
lex_char_tok :: Action
-- Here we are basically parsing character literals, such as 'x' or '\n'
-- but, when Template Haskell is on, we additionally spot
-- 'x and ''T, returning ITsimpleQuote and ITtyQuote respectively,
-- but WITHOUT CONSUMING the x or T part (the parser does that).
-- So we have to do two characters of lookahead: when we see 'x we need to
-- see if there's a trailing quote
lex_char_tok span _buf _len = do -- We've seen '
  i1 <- getInput -- Look ahead to first character
  let loc = realSrcSpanStart span
  case alexGetChar' i1 of
    Nothing -> lit_error i1
    Just ('\'', i2@(AI end2 _)) -> do -- We've seen ''
      setInput i2
      return (L (mkRealSrcSpan loc end2) ITtyQuote)
    Just ('\\', i2@(AI _end2 _)) -> do -- We've seen 'backslash
      setInput i2
      lit_ch <- lex_escape
      i3 <- getInput
      mc <- getCharOrFail i3 -- Trailing quote
      if mc == '\'' then finish_char_tok loc lit_ch
                    else lit_error i3
    Just (c, i2@(AI _end2 _))
      | not (isAny c) -> lit_error i1
      | otherwise ->
          -- We've seen 'x, where x is a valid character
          -- (i.e. not newline etc) but not a quote or backslash
          case alexGetChar' i2 of -- Look ahead one more character
            Just ('\'', i3) -> do -- We've seen 'x'
              setInput i3
              finish_char_tok loc c
            _other -> do -- We've seen 'x not followed by quote
                         -- (including the possibility of EOF)
                         -- If TH is on, just parse the quote only
              let (AI end _) = i1
              return (L (mkRealSrcSpan loc end) ITsimpleQuote)
finish_char_tok :: RealSrcLoc -> Char -> P (RealLocated Token)
finish_char_tok loc ch -- We've already seen the closing quote
  -- Just need to check for trailing # (MagicHash primitive char)
  = do magicHash <- extension magicHashEnabled
       i@(AI end _) <- getInput
       if magicHash then do
           case alexGetChar' i of
             -- the inner binding of 'end' (after the '#') shadows the
             -- outer one, extending the span over the '#'
             Just ('#',i@(AI end _)) -> do
               setInput i
               return (L (mkRealSrcSpan loc end) (ITprimchar ch))
             _other ->
               return (L (mkRealSrcSpan loc end) (ITchar ch))
         else do
           return (L (mkRealSrcSpan loc end) (ITchar ch))
  1155. isAny :: Char -> Bool
  1156. isAny c | c > '\x7f' = isPrint c
  1157. | otherwise = is_any c
  1158. lex_escape :: P Char
  1159. lex_escape = do
  1160. i0 <- getInput
  1161. c <- getCharOrFail i0
  1162. case c of
  1163. 'a' -> return '\a'
  1164. 'b' -> return '\b'
  1165. 'f' -> return '\f'
  1166. 'n' -> return '\n'
  1167. 'r' -> return '\r'
  1168. 't' -> return '\t'
  1169. 'v' -> return '\v'
  1170. '\\' -> return '\\'
  1171. '"' -> return '\"'
  1172. '\'' -> return '\''
  1173. '^' -> do i1 <- getInput
  1174. c <- getCharOrFail i1