/src/Database/HsSqlPpp/Parsing/Lexer.lhs

http://github.com/JakeWheat/hssqlppp · Haskell · 362 lines · 298 code · 64 blank · 0 comment · 34 complexity · f03a63fe6d9520a9aab959016e617e2a MD5 · raw file

This file contains the lexer for sql source text.

Lexicon:

~~~~
string
identifier or keyword
symbols - operators and ;,()[]
positional arg
int
float
copy payload (used to lex copy from stdin data)
~~~~
  12. > module Database.HsSqlPpp.Parsing.Lexer (
  13. > Token
  14. > ,Tok(..)
  15. > ,lexSqlFile
  16. > ,lexSqlText
  17. > ,lexSqlTextWithPosition
  18. > ,identifierString
  19. > ,LexState
  20. > ) where
  21. > import Text.Parsec hiding(many, optional, (<|>))
  22. > import qualified Text.Parsec.Token as P
  23. > import Text.Parsec.Language
  24. > --import Text.Parsec.String
  25. > import Text.Parsec.Pos
  26. >
  27. > import Control.Applicative
  28. > import Control.Monad.Identity
  29. > import Data.Maybe
  30. >
  31. > import Database.HsSqlPpp.Parsing.ParseErrors
  32. > import Database.HsSqlPpp.Utils.Utils
  33. > -- import Database.HsSqlPpp.Ast.Name
================================================================================

= data types
  36. > type Token = (SourcePos, Tok)
  37. >
  38. > data Tok = StringTok String String --delim, value (delim will one of
  39. > --', $$, $[stuff]$
  40. > | IdStringTok String -- either a identifier component (without .) or a *
  41. > | QIdStringTok String -- same as IdStringTok with quotes
  42. > | SymbolTok String -- operators, and ()[],;: and also .
  43. > -- '*' is currently always lexed as an id
  44. > -- rather than an operator
  45. > -- this gets fixed in the parsing stage
  46. > | PositionalArgTok Integer -- used for $1, etc.
  47. Use a numbertok with a string to parse numbers. This is mainly so that
  48. numeric constants can be parsed accurately - if they are parsed to
  49. floats in the ast then converted back to numeric, then the accuracy
  50. can be lost (e.g. something like "0.2" parsing to 0.199999999 float.
  51. > | NumberTok String
  52. > | CopyPayloadTok String -- support copy from stdin; with inline data
  53. > deriving (Eq,Show)
  54. >
  55. > type LexState = [Tok]
  56. > type Parser = ParsecT String LexState Identity
  57. >
  58. > lexSqlFile :: FilePath -> IO (Either ParseErrorExtra [Token])
  59. > lexSqlFile f = do
  60. > te <- readFile f
  61. > let x = runParser sqlTokens [] f te
  62. > return $ toParseErrorExtra x Nothing te
  63. >
  64. > lexSqlText :: String -> String -> Either ParseErrorExtra [Token]
  65. > lexSqlText f s = toParseErrorExtra (runParser sqlTokens [] f s) Nothing s
  66. >
  67. > lexSqlTextWithPosition :: String -> Int -> Int -> String
  68. > -> Either ParseErrorExtra [Token]
  69. > lexSqlTextWithPosition f l c s =
  70. > toParseErrorExtra (runParser (do
  71. > setPosition (newPos f l c)
  72. > sqlTokens) [] f s) (Just (l,c)) s
================================================================================

= lexers

lexer for tokens, contains a hack for copy from stdin with inline
table data.
  77. > sqlTokens :: Parser [Token]
  78. > sqlTokens =
  79. > setState [] >>
  80. > whiteSpace >>
  81. > many sqlToken <* eof
Lexer for an individual token.

It could lex lazily: when the lexer reads a copy from stdin statement,
it would switch lexers to lex the inline table data, then switch back.
Don't know how to do this in parsec, or even if it is possible, so as a
workaround the user state tracks whether we've just seen 'from stdin;';
if so, the copy payload is read as one big token, otherwise a normal
token is read.
  89. > sqlToken :: Parser Token
  90. > sqlToken = do
  91. > sp <- getPosition
  92. > sta <- getState
  93. > t <- if sta == [ft,st,mt]
  94. > then copyPayload
  95. > else try sqlNumber
  96. > <|> try sqlString
  97. > <|> try idString
  98. > <|> try qidString
  99. > <|> try positionalArg
  100. > <|> try sqlSymbol
  101. > updateState $ \stt ->
  102. > case () of
  103. > _ | stt == [] && t == ft -> [ft]
  104. > | stt == [ft] && t == st -> [ft,st]
  105. > | stt == [ft,st] && t == mt -> [ft,st,mt]
  106. > | otherwise -> []
  107. >
  108. > return (sp,t)
  109. > where
  110. > ft = IdStringTok "from"
  111. > st = IdStringTok "stdin"
  112. > mt = SymbolTok ";"
== specialized token parsers
  114. > sqlString :: Parser Tok
  115. > sqlString = stringQuotes <|> stringLD
  116. > where
  117. > --parse a string delimited by single quotes
  118. > stringQuotes = StringTok "\'" <$> stringPar
  119. > stringPar = optional (char 'E') *> char '\''
  120. > *> readQuoteEscape <* whiteSpace
  121. > --(readquoteescape reads the trailing ')
  122. have to read two consecutive single quotes as a quote character
  123. instead of the end of the string, probably an easier way to do this
  124. other escapes (e.g. \n \t) are left unprocessed
  125. > readQuoteEscape = do
  126. > x <- anyChar
  127. > if x == '\''
  128. > then try ((x:) <$> (char '\'' *> readQuoteEscape))
  129. > <|> return ""
  130. > else (x:) <$> readQuoteEscape
  131. parse a dollar quoted string
  132. > stringLD = do
  133. > -- cope with $$ as well as $[identifier]$
  134. > tag <- try (char '$' *> ((char '$' *> return "")
  135. > <|> (identifierString <* char '$')))
  136. > s <- lexeme $ manyTill anyChar
  137. > (try $ char '$' <* string tag <* char '$')
  138. > return $ StringTok ("$" ++ tag ++ "$") s
  139. >
  140. > idString :: Parser Tok
  141. > idString = IdStringTok <$> identifierString
  142. > qidString :: Parser Tok
  143. > qidString = QIdStringTok <$> qidentifierString
  144. >
  145. > positionalArg :: Parser Tok
  146. > positionalArg = char '$' >> PositionalArgTok <$> integer
Lexing symbols:

~~~~
approach 1:
try to keep multi symbol operators as single lexical items
(e.g. "==", "~==")

approach 2:
make each character a separate element
e.g. == lexes to ['=', '=']
then the parser sorts this out

Sort of using approach 1 at the moment, see below

== notes on symbols in pg operators

pg symbols can be made from:
=_*/<>=~!@#%^&|`?

no --, /* in symbols

can't end in + or - unless it contains one of
~!@#%^&|?

Most of this isn't relevant for the current lexer.

== sql symbols for this lexer:

sql symbol is one of
()[],; - single character
+-*/<>=~!@#%^&|`? string - one or more of these, parsed until hitting a
char which isn't one of these (including whitespace). This will parse
some standard sql expressions wrongly at the moment; the workaround is
to add whitespace. E.g. I think 3*-4 is valid sql and should lex as '3'
'*' '-' '4', but it will currently lex as '3' '*-' '4'. This is planned
to be fixed in the parser.
.. := :: : - other special cases

A single * will lex as an identifier rather than a symbol; the parser
deals with this.
~~~~
  177. > sqlSymbol :: Parser Tok
  178. > sqlSymbol =
  179. > SymbolTok <$> lexeme (choice [
  180. > replicate 1 <$> oneOf "()[],;"
  181. > ,try $ string ".."
  182. > ,string "."
  183. > ,try $ string "::"
  184. > ,try $ string ":="
  185. > ,string ":"
  186. > ,try $ string "$(" -- antiquote standard splice
  187. > ,try $ string "$s(" -- antiquote string splice
  188. > ,string "$i(" -- antiquote identifier splice
  189. > ,many1 (oneOf "+-*/<>=~!@#%^&|`?")
  190. > ])
  191. >
parse a number:

~~~~
digits
digits.[digits][e[+-]digits]
[digits].digits[e[+-]digits]
digitse[+-]digits
~~~~

I'm sure the implementation can be simpler than this.
  198. > sqlNumber :: Parser Tok
  199. > sqlNumber = NumberTok <$> lexeme (
  200. > choice [do
  201. > -- starts with digits
  202. > d <- digits
  203. > suff <- choice [-- complete fractional part
  204. > try fracPart
  205. > ,-- dot followed by optional exp
  206. > -- check for .. symbol
  207. > choice [try $ do
  208. > _ <- lookAhead $ string ".."
  209. > return []
  210. > ,do
  211. > _ <- char '.'
  212. > e <- optionMaybe expn
  213. > return $ concat $ catMaybes
  214. > [Just "."
  215. > ,e]
  216. > ]
  217. > ,--no dot then expn
  218. > expn
  219. > -- just an integer
  220. > ,return ""
  221. > ]
  222. > return $ d ++ suff
  223. > ,fracPart
  224. > ])
  225. > where
  226. > fracPart = do
  227. > _ <- char '.'
  228. > d <- digits
  229. > e <- optionMaybe expn
  230. > return $ concat $ catMaybes
  231. > [Just "."
  232. > ,Just d
  233. > ,e]
  234. > expn = do
  235. > _ <- char 'e'
  236. > s <- optionMaybe (char '+' <|> char '-')
  237. > d <- digits
  238. > return $ concat $ catMaybes [Just "e"
  239. > ,fmap (:[]) s
  240. > ,Just d]
  241. > digits = many1 digit
================================================================================

additional parser bits and pieces

include * in identifier strings during lexing. This parser is also
used for keywords, so identifiers and keywords aren't distinguished
until during proper parsing, and * isn't really examined until type
checking.
  248. > identifierString :: Parser String
  249. > identifierString = lexeme $ (letter <|> char '_')
  250. > <:> many (alphaNum <|> char '_')
todo:
select adrelid as "a""a" from pg_attrdef;
creates a column named 'a"a' with a double quote in it - doubled
quotes inside quoted identifiers are not handled yet.
  254. > qidentifierString :: Parser String
  255. > qidentifierString = lexeme $ char '"' *> many (noneOf "\"") <* char '"'
parse the block of inline data for a copy from stdin; it ends with \.
on its own on a line
  258. > copyPayload :: Parser Tok
  259. > copyPayload = CopyPayloadTok <$> lexeme (getLinesTillMatches "\\.\n")
  260. > where
  261. > getLinesTillMatches s = do
  262. > x <- getALine
  263. > if x == s
  264. > then return ""
  265. > else (x++) <$> getLinesTillMatches s
  266. > getALine = (++"\n") <$> manyTill anyChar (try newline)
  267. >
================================================================================

= parsec pass throughs
  270. > --symbol :: String -> Parser String
  271. > --symbol = P.symbol lexer
  272. >
  273. > integer :: Parser Integer
  274. > integer = lexeme $ P.integer lexer
  275. > whiteSpace :: Parser ()
  276. > whiteSpace = P.whiteSpace lexer
  277. >
  278. > lexeme :: Parser a -> Parser a
  279. > lexeme = P.lexeme lexer
this lexer isn't really used as much as it could be; probably some of
the fields are not used at all (like the identifier and operator stuff)
  282. > lexer :: P.GenTokenParser String LexState Identity
  283. > lexer = P.makeTokenParser (emptyDef {
  284. > P.commentStart = "/*"
  285. > ,P.commentEnd = "*/"
  286. > ,P.commentLine = "--"
  287. > ,P.nestedComments = False
  288. > ,P.identStart = letter <|> char '_'
  289. > ,P.identLetter = alphaNum <|> oneOf "_"
  290. > ,P.opStart = P.opLetter emptyDef
  291. > ,P.opLetter = oneOf opLetters
  292. > ,P.reservedOpNames= []
  293. > ,P.reservedNames = []
  294. > ,P.caseSensitive = False
  295. > })
  296. >
  297. > opLetters :: String
  298. > opLetters = ".:^*/%+-<>=|!"