/src/Database/HsSqlPpp/Parsing/Lexer.lhs
This file contains the lexer for sql source text.

Lexicon:

~~~~
string
identifier or keyword
symbols - operators and ;,()[]
positional arg
int
float
copy payload (used to lex copy from stdin data)
~~~~

> module Database.HsSqlPpp.Parsing.Lexer (
>     Token
>    ,Tok(..)
>    ,lexSqlFile
>    ,lexSqlText
>    ,lexSqlTextWithPosition
>    ,identifierString
>    ,LexState
>    ) where
> import Text.Parsec hiding (many, optional, (<|>))
> import qualified Text.Parsec.Token as P
> import Text.Parsec.Language
> --import Text.Parsec.String
> import Text.Parsec.Pos
>
> import Control.Applicative
> import Control.Monad.Identity
>
> import Data.Maybe
>
> import Database.HsSqlPpp.Parsing.ParseErrors
> import Database.HsSqlPpp.Utils.Utils
> -- import Database.HsSqlPpp.Ast.Name

================================================================================

= data types

> type Token = (SourcePos, Tok)
>
> data Tok = StringTok String String -- delimiter and value; the delimiter
>                                    -- will be one of ', $$, $[stuff]$
>
>          | IdStringTok String  -- either an identifier component (without .) or a *
>          | QIdStringTok String -- same as IdStringTok, but quoted
>
>          | SymbolTok String -- operators, and ()[],;: and also .
>                             -- '*' is currently always lexed as an id
>                             -- rather than an operator;
>                             -- this gets fixed in the parsing stage
>
>          | PositionalArgTok Integer -- used for $1, etc.

NumberTok carries the number as a string. This is mainly so that
numeric constants can be parsed accurately: if they were parsed to
floats in the ast and then converted back to numeric, accuracy could
be lost (e.g. something like "0.2" parsing to the float 0.199999999).

>          | NumberTok String
>
>          | CopyPayloadTok String -- supports copy from stdin with inline data
>            deriving (Eq,Show)
>
> type LexState = [Tok]
> type Parser = ParsecT String LexState Identity
>
> lexSqlFile :: FilePath -> IO (Either ParseErrorExtra [Token])
> lexSqlFile f = do
>   te <- readFile f
>   let x = runParser sqlTokens [] f te
>   return $ toParseErrorExtra x Nothing te
>
> lexSqlText :: String -> String -> Either ParseErrorExtra [Token]
> lexSqlText f s = toParseErrorExtra (runParser sqlTokens [] f s) Nothing s
>
> lexSqlTextWithPosition :: String -> Int -> Int -> String
>                        -> Either ParseErrorExtra [Token]
> lexSqlTextWithPosition f l c s =
>   toParseErrorExtra (runParser (do
>                                  setPosition (newPos f l c)
>                                  sqlTokens) [] f s) (Just (l,c)) s

================================================================================

= lexers

The token lexer contains a hack for copy from stdin with inline table
data.

> sqlTokens :: Parser [Token]
> sqlTokens =
>   setState [] >>
>   whiteSpace >>
>   many sqlToken <* eof

Lexer for an individual token.

Ideally this would lex lazily: when the lexer reads a copy from stdin
statement, it would switch to a lexer for the inline table data, then
switch back. It isn't clear how to do that in parsec, or even whether
it is possible, so as a workaround the user state records whether we
have just seen 'from stdin;'. If we have, the copy payload is read as
one big token, otherwise a normal token is read.
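
For example, given input like this (a made-up copy statement, purely
to illustrate when the lexer switches modes):

~~~~
copy t(a, b) from stdin;
1  one
2  two
\.
~~~~

the user state accumulates the tokens for 'from', 'stdin' and ';' as
they go past, so the call to sqlToken that follows the ';' takes the
copyPayload branch below and returns the two data lines (everything up
to, but not including, the line containing only \.) as a single
CopyPayloadTok.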

> sqlToken :: Parser Token
> sqlToken = do
>   sp <- getPosition
>   sta <- getState
>   t <- if sta == [ft,st,mt]
>        then copyPayload
>        else try sqlNumber
>             <|> try sqlString
>             <|> try idString
>             <|> try qidString
>             <|> try positionalArg
>             <|> try sqlSymbol
>   updateState $ \stt ->
>     case () of
>       _ | stt == [] && t == ft -> [ft]
>         | stt == [ft] && t == st -> [ft,st]
>         | stt == [ft,st] && t == mt -> [ft,st,mt]
>         | otherwise -> []
>   return (sp,t)
>   where
>     ft = IdStringTok "from"
>     st = IdStringTok "stdin"
>     mt = SymbolTok ";"

== specialized token parsers

> sqlString :: Parser Tok
> sqlString = stringQuotes <|> stringLD
>   where
>     --parse a string delimited by single quotes
>     stringQuotes = StringTok "\'" <$> stringPar
>     stringPar = optional (char 'E') *> char '\''
>                 *> readQuoteEscape <* whiteSpace
>     --(readQuoteEscape reads the trailing ')

Two consecutive single quotes have to be read as a quote character
rather than as the end of the string; there is probably an easier way
to do this.

Other escapes (e.g. \n, \t) are left unprocessed.

>     readQuoteEscape = do
>       x <- anyChar
>       if x == '\''
>         then try ((x:) <$> (char '\'' *> readQuoteEscape))
>              <|> return ""
>         else (x:) <$> readQuoteEscape

parse a dollar quoted string

>     stringLD = do
>       -- cope with $$ as well as $[identifier]$
>       tag <- try (char '$' *> ((char '$' *> return "")
>                                <|> (identifierString <* char '$')))
>       s <- lexeme $ manyTill anyChar
>                     (try $ char '$' <* string tag <* char '$')
>       return $ StringTok ("$" ++ tag ++ "$") s
>
> idString :: Parser Tok
> idString = IdStringTok <$> identifierString
>
> qidString :: Parser Tok
> qidString = QIdStringTok <$> qidentifierString
>
> positionalArg :: Parser Tok
> positionalArg = char '$' >> PositionalArgTok <$> integer
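
Some examples of how string and positional argument literals tokenise,
worked out from the definitions above rather than captured from a test
run:

~~~~
'it''s'                  -> StringTok "'" "it's"
E'a\nb'                  -> StringTok "'" "a\\nb"  (the backslash escape is kept, not interpreted)
$$a string$$             -> StringTok "$$" "a string"
$tag$with 'quotes'$tag$  -> StringTok "$tag$" "with 'quotes'"
$3                       -> PositionalArgTok 3
~~~~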

Lexing symbols:

~~~~
approach 1:
try to keep multi symbol operators as single lexical items
(e.g. "==", "~==")

approach 2:
make each character a separate element
e.g. == lexes to ['=', '=']
then the parser sorts this out

Sort of using approach 1 at the moment, see below

== notes on symbols in pg operators
pg symbols can be made from:

+-*/<>=~!@#%^&|`?

no --, /* in symbols

can't end in + or - unless contains
~!@#%^&|?

Most of this isn't relevant for the current lexer.

== sql symbols for this lexer:

sql symbol is one of
()[],; - single character
+-*/<>=~!@#%^&|`? string - one or more of these, parsed until we hit a
char which isn't one of these (including whitespace). This will parse
some standard sql expressions wrongly at the moment; the workaround is
to add whitespace: e.g. 3*-4 is (I think) valid sql and should lex as
'3' '*' '-' '4', but will currently lex as '3' '*-' '4'. This is
planned to be fixed in the parser.
.. := :: : - other special cases
A single * will lex as an identifier rather than a symbol; the parser
deals with this.

~~~~

> sqlSymbol :: Parser Tok
> sqlSymbol =
>   SymbolTok <$> lexeme (choice [
>                          replicate 1 <$> oneOf "()[],;"
>                         ,try $ string ".."
>                         ,string "."
>                         ,try $ string "::"
>                         ,try $ string ":="
>                         ,string ":"
>                         ,try $ string "$(" -- antiquote standard splice
>                         ,try $ string "$s(" -- antiquote string splice
>                         ,string "$i(" -- antiquote identifier splice
>                         ,many1 (oneOf "+-*/<>=~!@#%^&|`?")
>                         ])
>

parse a number:
digits
digits.[digits][e[+-]digits]
[digits].digits[e[+-]digits]
digitse[+-]digits

I'm sure the implementation can be simpler than this.

> sqlNumber :: Parser Tok
> sqlNumber = NumberTok <$> lexeme (
>   choice [do
>           -- starts with digits
>           d <- digits
>           suff <- choice [-- complete fractional part
>                           try fracPart
>                          ,-- dot followed by optional exp
>                           -- check for .. symbol
>                           choice [try $ do
>                                         _ <- lookAhead $ string ".."
>                                         return []
>                                  ,do
>                                   _ <- char '.'
>                                   e <- optionMaybe expn
>                                   return $ concat $ catMaybes
>                                     [Just "."
>                                     ,e]
>                                  ]
>                          ,--no dot then expn
>                           expn
>                           -- just an integer
>                          ,return ""
>                          ]
>           return $ d ++ suff
>          ,fracPart
>          ])
>   where
>     fracPart = do
>       _ <- char '.'
>       d <- digits
>       e <- optionMaybe expn
>       return $ concat $ catMaybes
>         [Just "."
>         ,Just d
>         ,e]
>     expn = do
>       _ <- char 'e'
>       s <- optionMaybe (char '+' <|> char '-')
>       d <- digits
>       return $ concat $ catMaybes [Just "e"
>                                   ,fmap (:[]) s
>                                   ,Just d]
>     digits = many1 digit
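
Some examples of what this number parser accepts, worked out from the
definition above (a sketch, not output captured from a test run):

~~~~
42       -> NumberTok "42"
42.      -> NumberTok "42."
.5       -> NumberTok ".5"
3.14e-2  -> NumberTok "3.14e-2"
2e10     -> NumberTok "2e10"
1..10    -> NumberTok "1", SymbolTok "..", NumberTok "10"
            (the lookAhead for ".." stops the first number at the dots)
~~~~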

================================================================================

additional parser bits and pieces

include * in identifier strings during lexing. This parser is also
used for keywords, so identifiers and keywords aren't distinguished
until proper parsing, and * isn't really examined until type checking.

> identifierString :: Parser String
> identifierString = lexeme $ (letter <|> char '_')
>                             <:> many (alphaNum <|> char '_')

todo:
select adrelid as "a""a" from pg_attrdef;
creates a column named 'a"a', with a double quote in it

> qidentifierString :: Parser String
> qidentifierString = lexeme $ char '"' *> many (noneOf "\"") <* char '"'

parse the block of inline data for a copy from stdin; it ends with \.
on its own on a line

> copyPayload :: Parser Tok
> copyPayload = CopyPayloadTok <$> lexeme (getLinesTillMatches "\\.\n")
>   where
>     getLinesTillMatches s = do
>       x <- getALine
>       if x == s
>         then return ""
>         else (x++) <$> getLinesTillMatches s
>     getALine = (++"\n") <$> manyTill anyChar (try newline)

================================================================================

= parsec pass throughs

> --symbol :: String -> Parser String
> --symbol = P.symbol lexer
>
> integer :: Parser Integer
> integer = lexeme $ P.integer lexer
>
> whiteSpace :: Parser ()
> whiteSpace = P.whiteSpace lexer
>
> lexeme :: Parser a -> Parser a
> lexeme = P.lexeme lexer

This token parser isn't used as much as it could be; some of its
fields are probably not used at all (e.g. the identifier and operator
settings).

> lexer :: P.GenTokenParser String LexState Identity
> lexer = P.makeTokenParser (emptyDef {
>                             P.commentStart = "/*"
>                            ,P.commentEnd = "*/"
>                            ,P.commentLine = "--"
>                            ,P.nestedComments = False
>                            ,P.identStart = letter <|> char '_'
>                            ,P.identLetter = alphaNum <|> oneOf "_"
>                            ,P.opStart = P.opLetter emptyDef
>                            ,P.opLetter = oneOf opLetters
>                            ,P.reservedOpNames = []
>                            ,P.reservedNames = []
>                            ,P.caseSensitive = False
>                            })
>
> opLetters :: String
> opLetters = ".:^*/%+-<>=|!"
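
A rough usage sketch, assuming this module is loaded in ghci; the
token listing below is worked out from the parsers above rather than
pasted from a real session:

~~~~
ghci> let Right toks = lexSqlText "" "select a, $1 from t -- c\nwhere b = 'x';"
ghci> mapM_ (print . snd) toks
IdStringTok "select"
IdStringTok "a"
SymbolTok ","
PositionalArgTok 1
IdStringTok "from"
IdStringTok "t"
IdStringTok "where"
IdStringTok "b"
SymbolTok "="
StringTok "'" "x"
SymbolTok ";"
~~~~

The -- comment and the newline produce no tokens because they are
consumed by whiteSpace, and the source position of each Token is
dropped here by taking snd.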