/src/Database/HsSqlPpp/Parsing/Lexer.lhs
http://github.com/JakeWheat/hssqlppp · Haskell · 362 lines · 298 code · 64 blank · 0 comment · 34 complexity · f03a63fe6d9520a9aab959016e617e2a MD5 · raw file
- This file contains the lexer for sql source text.
- Lexicon:
- ~~~~
- string
- identifier or keyword
- symbols - operators and ;,()[]
- positional arg
- int
- float
- copy payload (used to lex copy from stdin data)
- ~~~~
- > module Database.HsSqlPpp.Parsing.Lexer (
- > Token
- > ,Tok(..)
- > ,lexSqlFile
- > ,lexSqlText
- > ,lexSqlTextWithPosition
- > ,identifierString
- > ,LexState
- > ) where
- > import Text.Parsec hiding(many, optional, (<|>))
- > import qualified Text.Parsec.Token as P
- > import Text.Parsec.Language
- > --import Text.Parsec.String
- > import Text.Parsec.Pos
- >
- > import Control.Applicative
- > import Control.Monad.Identity
- > import Data.Maybe
- >
- > import Database.HsSqlPpp.Parsing.ParseErrors
- > import Database.HsSqlPpp.Utils.Utils
- > -- import Database.HsSqlPpp.Ast.Name
- ================================================================================
- = data types
- > -- | A lexed token tagged with the source position at which it starts.
- > type Token = (SourcePos, Tok)
- >
- > data Tok = StringTok String String --delim, value (the delim will be one of
- >                                    --', $$, $[stuff]$)
- >          | IdStringTok String -- either an identifier component (without .) or a *
- >          | QIdStringTok String -- same as IdStringTok but lexed with double quotes
- >          | SymbolTok String -- operators, and ()[],;: and also .
- >                             -- '*' is currently always lexed as an id
- >                             -- rather than an operator
- >                             -- this gets fixed in the parsing stage
- >          | PositionalArgTok Integer -- used for $1, etc.
- We use a NumberTok carrying the raw source string to represent
- numbers. This is mainly so that numeric constants can be parsed
- accurately - if they were parsed to floats in the ast and then
- converted back to numeric, accuracy could be lost (e.g. something
- like "0.2" parsing to a 0.199999999 float).
- >          | NumberTok String -- raw source text of a numeric literal
- >          | CopyPayloadTok String -- support copy from stdin; with inline data
- >          deriving (Eq,Show)
- >
- > -- | Lexer user state: the recent tokens, used by 'sqlToken' to spot
- > -- the 'from' 'stdin' ';' sequence that precedes a copy payload.
- > type LexState = [Tok]
- > type Parser = ParsecT String LexState Identity
- >
- > -- | Lex the contents of a sql source file.
- > lexSqlFile :: FilePath -> IO (Either ParseErrorExtra [Token])
- > lexSqlFile fn =
- >     readFile fn >>= \src ->
- >     return $ toParseErrorExtra (runParser sqlTokens [] fn src) Nothing src
- >
- > -- | Lex sql source text; the first argument names the source for
- > -- error messages.
- > lexSqlText :: String -> String -> Either ParseErrorExtra [Token]
- > lexSqlText fn src = toParseErrorExtra res Nothing src
- >   where
- >     res = runParser sqlTokens [] fn src
- >
- > -- | Lex sql source text as if it started at the given line and
- > -- column (used when lexing a fragment embedded in a larger file).
- > lexSqlTextWithPosition :: String -> Int -> Int -> String
- >                        -> Either ParseErrorExtra [Token]
- > lexSqlTextWithPosition fn line col src =
- >     toParseErrorExtra (runParser p [] fn src) (Just (line,col)) src
- >   where
- >     p = setPosition (newPos fn line col) >> sqlTokens
- ================================================================================
- = lexers
- lexer for tokens, contains a hack for copy from stdin with inline
- table data.
- > -- | Lex a whole input: reset the user state, skip any leading
- > -- whitespace, then read tokens until end of input.
- > sqlTokens :: Parser [Token]
- > sqlTokens = do
- >     setState []
- >     whiteSpace
- >     ts <- many sqlToken
- >     eof
- >     return ts
- Lexer for an individual token.
- Could lex lazily and when the lexer reads a copy from stdin statement,
- it switches lexers to lex the inline table data, then switches
- back. Don't know how to do this in parsec, or even if it is possible,
- so as a work around, we use the state to track whether we've just
- seen 'from stdin;'. If so, we read the copy payload as one big token;
- otherwise we read a normal token.
- > -- | Lex a single token. The user state records how much of the
- > -- sequence 'from' 'stdin' ';' we have just seen; once all three are
- > -- in the state the next "token" is the entire copy payload.
- > sqlToken :: Parser Token
- > sqlToken = do
- >   sp <- getPosition
- >   sta <- getState
- >   t <- if sta == [ft,st,mt]
- >        then copyPayload
- >        else try sqlNumber
- >             <|> try sqlString
- >             <|> try idString
- >             <|> try qidString
- >             <|> try positionalArg
- >             <|> try sqlSymbol
- >   -- advance the 'from stdin;' recognizer by one token, resetting it
- >   -- whenever the token just read doesn't continue the sequence
- >   updateState $ \stt ->
- >     case () of
- >       _ | stt == [] && t == ft -> [ft]
- >         | stt == [ft] && t == st -> [ft,st]
- >         | stt == [ft,st] && t == mt -> [ft,st,mt]
- >         | otherwise -> []
- >
- >   return (sp,t)
- >   where
- >     ft = IdStringTok "from"
- >     st = IdStringTok "stdin"
- >     mt = SymbolTok ";"
- == specialized token parsers
- > -- | Lex a string constant: either a single-quoted string (with an
- > -- optional escape-string prefix) or a dollar-quoted string.
- > sqlString :: Parser Tok
- > sqlString = stringQuotes <|> stringLD
- >   where
- >     --parse a string delimited by single quotes
- >     stringQuotes = StringTok "\'" <$> stringPar
- >     -- fix: accept both E'...' and e'...'; the escape-string prefix
- >     -- is case insensitive in sql, but only 'E' was accepted before
- >     stringPar = optional (oneOf "eE") *> char '\''
- >                 *> readQuoteEscape <* whiteSpace
- >     --(readQuoteEscape reads the trailing ')
- We have to read two consecutive single quotes as a quote character
- instead of as the end of the string; other escapes (e.g. \n \t) are
- left unprocessed.
- >     readQuoteEscape = do
- >       x <- anyChar
- >       if x == '\''
- >         then try ((x:) <$> (char '\'' *> readQuoteEscape))
- >              <|> return ""
- >         else (x:) <$> readQuoteEscape
- Parse a dollar quoted string.
- >     stringLD = do
- >       -- cope with $$ as well as $[identifier]$
- >       tag <- try (char '$' *> ((char '$' *> return "")
- >                                <|> (identifierString <* char '$')))
- >       s <- lexeme $ manyTill anyChar
- >                     (try $ char '$' <* string tag <* char '$')
- >       return $ StringTok ("$" ++ tag ++ "$") s
- >
- > -- | Lex an unquoted identifier (or keyword) component.
- > idString :: Parser Tok
- > idString = do
- >     i <- identifierString
- >     return $ IdStringTok i
- > -- | Lex a double-quoted identifier component.
- > qidString :: Parser Tok
- > qidString = fmap QIdStringTok qidentifierString
- >
- > -- | Lex a positional argument such as $1, $2, ...
- > positionalArg :: Parser Tok
- > positionalArg = PositionalArgTok <$> (char '$' *> integer)
- Lexing symbols:
- ~~~~
- approach 1:
- try to keep multi symbol operators as single lexical items
- (e.g. "==", "~==")
- approach 2:
- make each character a separate element
- e.g. == lexes to ['=', '=']
- then the parser sorts this out
- Sort of using approach 1 at the moment, see below
- == notes on symbols in pg operators
- pg symbols can be made from:
- =_*/<>=~!@#%^&|`?
- no --, /* in symbols
- can't end in + or - unless contains
- ~!@#%^&|?
- Most of this isn't relevant for the current lexer.
- == sql symbols for this lexer:
- sql symbol is one of
- ()[],; - single character
- +-*/<>=~!@#%^&|`? string - one or more of these, parsed until hit char
- which isn't one of these (including whitespace). This will parse some
- standard sql expressions wrongly at the moment, work around is to add
- whitespace, e.g. I think 3*-4 is valid sql, should lex as '3' '*' '-'
- '4', but will currently lex as '3' '*-' '4'. This is planned to be
- fixed in the parser.
- .. := :: : - other special cases
- A single * will lex as an identifier rather than a symbol, the parser
- deals with this.
- ~~~~
- > -- | Lex a symbol token; see the notes above for the supported forms.
- > -- The ordering of the alternatives matters: longer symbols such as
- > -- ".." and "::" must be tried before their prefixes "." and ":".
- > sqlSymbol :: Parser Tok
- > sqlSymbol =
- >   SymbolTok <$> lexeme (choice [
- >                          replicate 1 <$> oneOf "()[],;"
- >                         ,try $ string ".."
- >                         ,string "."
- >                         ,try $ string "::"
- >                         ,try $ string ":="
- >                         ,string ":"
- >                         ,try $ string "$(" -- antiquote standard splice
- >                         ,try $ string "$s(" -- antiquote string splice
- >                         ,string "$i(" -- antiquote identifier splice
- >                         ,many1 (oneOf "+-*/<>=~!@#%^&|`?")
- >                         ])
- >
- parse a number:
- digits
- digits.[digits][e[+-]digits]
- [digits].digits[e[+-]digits]
- digitse[+-]digits
- I'm sure the implementation can be simpler than this
- > -- | Lex a number token. The raw source text is kept as a string so
- > -- that no precision is lost; see the note on NumberTok above.
- > sqlNumber :: Parser Tok
- > sqlNumber = NumberTok <$> lexeme (
- >   choice [do
- >           -- starts with digits
- >           d <- digits
- >           suff <- choice [-- complete fractional part
- >                           try fracPart
- >                          ,-- dot followed by optional exp
- >                           -- check for .. symbol
- >                           choice [try $ do
- >                                         _ <- lookAhead $ string ".."
- >                                         return []
- >                                  ,do
- >                                   _ <- char '.'
- >                                   e <- optionMaybe expn
- >                                   return $ concat $ catMaybes
- >                                     [Just "."
- >                                     ,e]
- >                                  ]
- >                          ,--no dot then expn
- >                           expn
- >                           -- just an integer
- >                          ,return ""
- >                          ]
- >           return $ d ++ suff
- >          ,fracPart
- >          ])
- >   where
- >     -- .digits[e[+-]digits]
- >     fracPart = do
- >       _ <- char '.'
- >       d <- digits
- >       e <- optionMaybe expn
- >       return $ concat $ catMaybes
- >         [Just "."
- >         ,Just d
- >         ,e]
- >     -- exponent part; fix: accept both 'e' and 'E' (previously only
- >     -- lowercase, so valid sql like 1E5 was rejected); the matched
- >     -- character is kept so the token text mirrors the source
- >     expn = do
- >       ec <- oneOf "eE"
- >       s <- optionMaybe (char '+' <|> char '-')
- >       d <- digits
- >       return $ concat $ catMaybes [Just [ec]
- >                                   ,fmap (:[]) s
- >                                   ,Just d]
- >     digits = many1 digit
- ================================================================================
- additional parser bits and pieces
- include * in identifier strings during lexing. This parser is also
- used for keywords, so identifiers and keywords aren't distinguished
- until during proper parsing, and * isn't really examined until type
- checking
- > -- | Lex one identifier/keyword component: a letter or underscore
- > -- followed by any number of letters, digits, and underscores.
- > -- Keywords are not distinguished from identifiers here.
- > identifierString :: Parser String
- > identifierString = lexeme $ do
- >     c <- letter <|> char '_'
- >     cs <- many (alphaNum <|> char '_')
- >     return (c:cs)
- todo:
- select adrelid as "a""a" from pg_attrdef;
- creates a column named: 'a"a' with a double quote in it
- > -- | Lex a double-quoted identifier. A doubled "" inside the quotes
- > -- is read as a literal " character, so "a""a" yields the name a"a
- > -- (fixes the todo above; mirrors readQuoteEscape in sqlString).
- > qidentifierString :: Parser String
- > qidentifierString = lexeme $ char '"' *> readDoubleQuoteEscape
- >   where
- >     -- reads up to and including the closing double quote
- >     readDoubleQuoteEscape = do
- >       x <- anyChar
- >       if x == '"'
- >         then try ((x:) <$> (char '"' *> readDoubleQuoteEscape))
- >              <|> return ""
- >         else (x:) <$> readDoubleQuoteEscape
- parse the block of inline data for a copy from stdin, ends with \. on
- its own on a line
- > -- | Lex the inline table data after 'copy ... from stdin;' as one
- > -- big token. The payload ends at a line containing only \. - that
- > -- terminator line is consumed but excluded from the token's text.
- > copyPayload :: Parser Tok
- > copyPayload = CopyPayloadTok <$> lexeme (getLinesTillMatches "\\.\n")
- >   where
- >     getLinesTillMatches s = do
- >       x <- getALine
- >       if x == s
- >         then return ""
- >         else (x++) <$> getLinesTillMatches s
- >     -- one line of input, including its trailing newline
- >     getALine = (++"\n") <$> manyTill anyChar (try newline)
- >
- ================================================================================
- = parsec pass throughs
- > --symbol :: String -> Parser String
- > --symbol = P.symbol lexer
- >
- > -- | Lex an integer literal, skipping any trailing whitespace.
- > integer :: Parser Integer
- > integer = lexeme $ P.integer lexer
- > -- | Skip whitespace and sql comments (-- and /* */).
- > whiteSpace :: Parser ()
- > whiteSpace = P.whiteSpace lexer
- >
- > -- | Run a parser, then skip any trailing whitespace and comments.
- > lexeme :: Parser a -> Parser a
- > lexeme = P.lexeme lexer
- this lexer isn't really used as much as it could be, probably some of
- the fields are not used at all (like identifier and operator stuff)
- > -- | Underlying parsec token parser; mainly used for its whitespace
- > -- and comment handling. Some of the fields (e.g. the identifier and
- > -- operator settings) are probably not used at all.
- > lexer :: P.GenTokenParser String LexState Identity
- > lexer = P.makeTokenParser (emptyDef {
- >                             P.commentStart = "/*"
- >                            ,P.commentEnd = "*/"
- >                            ,P.commentLine = "--"
- >                            ,P.nestedComments = False
- >                            ,P.identStart = letter <|> char '_'
- >                            ,P.identLetter = alphaNum <|> oneOf "_"
- >                            ,P.opStart = P.opLetter emptyDef
- >                            ,P.opLetter = oneOf opLetters
- >                            ,P.reservedOpNames= []
- >                            ,P.reservedNames = []
- >                            ,P.caseSensitive = False
- >                            })
- >
- > -- | Characters which can appear in operator symbols.
- > opLetters :: String
- > opLetters = ".:^*/%+-<>=|!"