PageRenderTime 22ms CodeModel.GetById 18ms app.highlight 2ms RepoModel.GetById 0ms app.codeStats 0ms

/src/Database/HsSqlPpp/Parsing/Lexer.lhs

http://github.com/JakeWheat/hssqlppp
Haskell | 362 lines | 298 code | 64 blank | 0 comment | 34 complexity | f03a63fe6d9520a9aab959016e617e2a MD5 | raw file
Possible License(s): BSD-3-Clause
  1
  2This file contains the lexer for sql source text.
  3
  4Lexicon:
  5
  6~~~~
  7string
  8identifier or keyword
  9symbols - operators and ;,()[]
 10positional arg
 11int
 12float
 13copy payload (used to lex copy from stdin data)
 14~~~~
 15
 16> module Database.HsSqlPpp.Parsing.Lexer (
 17>               Token
 18>              ,Tok(..)
 19>              ,lexSqlFile
 20>              ,lexSqlText
 21>              ,lexSqlTextWithPosition
 22>              ,identifierString
 23>              ,LexState
 24>              ) where
 25> import Text.Parsec hiding(many, optional, (<|>))
 26> import qualified Text.Parsec.Token as P
 27> import Text.Parsec.Language
 28> --import Text.Parsec.String
 29> import Text.Parsec.Pos
 30>
 31> import Control.Applicative
 32> import Control.Monad.Identity
 33
 34> import Data.Maybe
 35>
 36> import Database.HsSqlPpp.Parsing.ParseErrors
 37> import Database.HsSqlPpp.Utils.Utils
 38> -- import Database.HsSqlPpp.Ast.Name
 39
 40================================================================================
 41
 42= data types
 43
A Token is the token value paired with the source position where it started.

  44> type Token = (SourcePos, Tok)
  45>
  46> data Tok = StringTok String String --delim, value (delim will be one of
  47>                                    --', $$, $[stuff]$)
  48
  49>          | IdStringTok String -- either an identifier component (without .) or a *
  50>          | QIdStringTok String -- same as IdStringTok with quotes
  51
  52>          | SymbolTok String -- operators, and ()[],;: and also .
  53>                             -- '*' is currently always lexed as an id
  54>                             --   rather than an operator
  55>                             -- this gets fixed in the parsing stage
  56
  57>          | PositionalArgTok Integer -- used for $1, etc.
  58
  59Use a numbertok with a string to parse numbers. This is mainly so that
  60numeric constants can be parsed accurately - if they are parsed to
  61floats in the ast then converted back to numeric, then the accuracy
  62can be lost (e.g. something like "0.2" parsing to a 0.199999999 float).
  63
  64>          | NumberTok String

  66>          | CopyPayloadTok String -- support copy from stdin; with inline data
  67>            deriving (Eq,Show)
  68>
  69> type LexState = [Tok] -- recently-seen tokens, used to spot 'from stdin;' (see sqlToken)
  70> type Parser = ParsecT String LexState Identity
 71>
 72> lexSqlFile :: FilePath -> IO (Either ParseErrorExtra [Token])
 73> lexSqlFile f = do
 74>   te <- readFile f
 75>   let x = runParser sqlTokens [] f te
 76>   return $ toParseErrorExtra x Nothing te
 77>
 78> lexSqlText :: String -> String -> Either ParseErrorExtra [Token]
 79> lexSqlText f s = toParseErrorExtra (runParser sqlTokens [] f s) Nothing s
 80>
 81> lexSqlTextWithPosition :: String -> Int -> Int -> String
 82>                        -> Either ParseErrorExtra [Token]
 83> lexSqlTextWithPosition f l c s =
 84>   toParseErrorExtra (runParser (do
 85>                                 setPosition (newPos f l c)
 86>                                 sqlTokens) [] f s) (Just (l,c)) s
 87
 88================================================================================
 89
 90= lexers
 91
 92lexer for tokens, contains a hack for copy from stdin with inline
 93table data.
 94
 95> sqlTokens :: Parser [Token]
 96> sqlTokens =
 97>   setState [] >>
 98>   whiteSpace >>
 99>   many sqlToken <* eof
100
101Lexer for an individual token.
102
103Could lex lazily and when the lexer reads a copy from stdin statement,
104it switches lexers to lex the inline table data, then switches
105back. Don't know how to do this in parsec, or even if it is possible,
 106so as a workaround, we use the state to track whether we have just seen
 107'from stdin;': if so, we read the copy payload as one big token;
 108otherwise we read a normal token.
109
> -- Lex one token, recording its start position. The user state is a
> -- recogniser for the token sequence 'from' 'stdin' ';' - once all
> -- three have been seen in order, the next "token" is the whole
> -- copy-from-stdin payload.
 110> sqlToken :: Parser Token
 111> sqlToken = do
 112>            sp <- getPosition
 113>            sta <- getState
>            -- state [ft,st,mt] means we just lexed 'from stdin;',
>            -- so read the inline copy data as a single token
 114>            t <- if sta == [ft,st,mt]
 115>                 then copyPayload
 116>                 else try sqlNumber
 117>                  <|> try sqlString
 118>                  <|> try idString
 119>                  <|> try qidString
 120>                  <|> try positionalArg
 121>                  <|> try sqlSymbol
>            -- advance the recogniser: extend the match if this token
>            -- is the next expected one, otherwise reset to empty
 122>            updateState $ \stt ->
 123>              case () of
 124>                      _ | stt == [] && t == ft -> [ft]
 125>                        | stt == [ft] && t == st -> [ft,st]
 126>                        | stt == [ft,st] && t == mt -> [ft,st,mt]
 127>                        | otherwise -> []
 128>
 129>            return (sp,t)
 130>            where
>              -- the three tokens that trigger copy-payload lexing
 131>              ft = IdStringTok "from"
 132>              st = IdStringTok "stdin"
 133>              mt = SymbolTok ";"
134
135== specialized token parsers
136
> -- Lex a string literal: either single-quoted (with optional E
> -- prefix) or dollar-quoted ($$..$$ or $tag$..$tag$). The delimiter
> -- is kept in the StringTok so the parser can reproduce it.
 137> sqlString :: Parser Tok
 138> sqlString = stringQuotes <|> stringLD
 139>   where
 140>     --parse a string delimited by single quotes
 141>     stringQuotes = StringTok "\'" <$> stringPar
 142>     stringPar = optional (char 'E') *> char '\''
 143>                 *> readQuoteEscape <* whiteSpace
 144>     --(readquoteescape reads the trailing ')
 145
 146have to read two consecutive single quotes as a quote character
 147instead of the end of the string, probably an easier way to do this
 148
 149other escapes (e.g. \n \t) are left unprocessed
 150
 151>     readQuoteEscape = do
 152>                       x <- anyChar
>                       -- on ': either a doubled '' (keep one quote and
>                       -- continue) or the closing delimiter (stop)
 153>                       if x == '\''
 154>                         then try ((x:) <$> (char '\'' *> readQuoteEscape))
 155>                              <|> return ""
 156>                         else (x:) <$> readQuoteEscape
 157
 158parse a dollar quoted string
 159
 160>     stringLD = do
 161>                -- cope with $$ as well as $[identifier]$
 162>                tag <- try (char '$' *> ((char '$' *> return "")
 163>                                    <|> (identifierString <* char '$')))
>                -- body runs until the matching $tag$ terminator
 164>                s <- lexeme $ manyTill anyChar
 165>                       (try $ char '$' <* string tag <* char '$')
 166>                return $ StringTok ("$" ++ tag ++ "$") s
167>
> -- | Lex an identifier or keyword component as an IdStringTok.
> idString :: Parser Tok
> idString = fmap IdStringTok identifierString
170
> -- | Lex a double-quoted identifier as a QIdStringTok.
> qidString :: Parser Tok
> qidString = fmap QIdStringTok qidentifierString
173
174
175>
> -- | Lex a positional argument such as $1 or $2.
> positionalArg :: Parser Tok
> positionalArg = PositionalArgTok <$> (char '$' *> integer)
178
179
180Lexing symbols:
181
182~~~~
183approach 1:
184try to keep multi symbol operators as single lexical items
 185(e.g. "==", "~==")
186
187approach 2:
188make each character a separate element
189e.g. == lexes to ['=', '=']
190then the parser sorts this out
191
192Sort of using approach 1 at the moment, see below
193
194== notes on symbols in pg operators
195pg symbols can be made from:
196
197=_*/<>=~!@#%^&|`?
198
199no --, /* in symbols
200
201can't end in + or - unless contains
202~!@#%^&|?
203
204Most of this isn't relevant for the current lexer.
205
206== sql symbols for this lexer:
207
208sql symbol is one of
209()[],; - single character
210+-*/<>=~!@#%^&|`? string - one or more of these, parsed until hit char
211which isn't one of these (including whitespace). This will parse some
212standard sql expressions wrongly at the moment, work around is to add
213whitespace e.g. i think 3*-4 is valid sql, should lex as '3' '*' '-'
214'4', but will currently lex as '3' '*-' '4'. This is planned to be
215fixed in the parser.
216.. := :: : - other special cases
217A single * will lex as an identifier rather than a symbol, the parser
218deals with this.
219
220~~~~
221
> -- Lex a symbol token. The alternatives are ordered so the longer
> -- symbols (.., ::, :=, and the $-antiquote openers) are tried, with
> -- backtracking, before their single-character prefixes.
 222> sqlSymbol :: Parser Tok
 223> sqlSymbol =
 224>   SymbolTok <$> lexeme (choice [
 225>                          replicate 1 <$> oneOf "()[],;"
 226>                         ,try $ string ".."
 227>                         ,string "."
 228>                         ,try $ string "::"
 229>                         ,try $ string ":="
 230>                         ,string ":"
 231>                         ,try $ string "$(" -- antiquote standard splice
 232>                         ,try $ string "$s(" -- antiquote string splice
 233>                         ,string "$i(" -- antiquote identifier splice
>                         -- fallback: greedily take a run of operator chars
 234>                         ,many1 (oneOf "+-*/<>=~!@#%^&|`?")
 235>                         ])
236>
237
238parse a number:
239digits
240digits.[digits][e[+-]digits]
241[digits].digits[e[+-]digits]
242digitse[+-]digits
243
244I'm sure the implementation can be simpler than this
245
> -- Lex a numeric literal, keeping the exact source text (see the
> -- NumberTok note above: parsing to float would lose precision).
 246> sqlNumber :: Parser Tok
 247> sqlNumber = NumberTok <$> lexeme (
 248>   choice [do
 249>           -- starts with digits
 250>           d <- digits
 251>           suff <- choice [-- complete fractional part
 252>                           try fracPart
 253>                          ,-- dot followed by optional exp
 254>                           -- check for .. symbol
>                           -- (a following .. must stay a symbol, e.g.
>                           -- in ranges, so stop the number before it)
 255>                           choice [try $ do
 256>                                         _ <- lookAhead $ string ".."
 257>                                         return []
 258>                                  ,do
 259>                                   _ <- char '.'
 260>                                   e <- optionMaybe expn
 261>                                   return $ concat $ catMaybes
 262>                                     [Just "."
 263>                                     ,e]
 264>                                   ]
 265>                          ,--no dot then expn
 266>                           expn
 267>                           -- just an integer
 268>                          ,return ""
 269>                          ]
 270>           return $ d ++ suff
>          -- number starting with . (e.g. .5)
 271>          ,fracPart
 272>          ])
 273>   where
>      -- .digits with optional exponent
 274>      fracPart = do
 275>           _ <- char '.'
 276>           d <- digits
 277>           e <- optionMaybe expn
 278>           return $ concat $ catMaybes
 279>             [Just "."
 280>             ,Just d
 281>             ,e]
>      -- exponent: e, optional sign, digits
 282>      expn = do
 283>        _ <- char 'e'
 284>        s <- optionMaybe (char '+' <|> char '-')
 285>        d <- digits
 286>        return $ concat $ catMaybes [Just "e"
 287>                                    ,fmap (:[]) s
 288>                                    ,Just d]
 289>      digits = many1 digit
290
291================================================================================
292
293additional parser bits and pieces
294
295include * in identifier strings during lexing. This parser is also
296used for keywords, so identifiers and keywords aren't distinguished
297until during proper parsing, and * isn't really examined until type
298checking
299
> -- | Lex an identifier: a letter or underscore followed by any number
> -- of alphanumerics/underscores. Keywords also match here; they are
> -- only distinguished later, in the parser proper.
> identifierString :: Parser String
> identifierString = lexeme $ do
>   c <- letter <|> char '_'
>   rest <- many (alphaNum <|> char '_')
>   return (c : rest)
303
304todo:
305select adrelid as "a""a" from pg_attrdef;
306creates a column named: 'a"a' with a double quote in it
307
> -- | Lex a double-quoted identifier and return its contents without
> -- the quotes (embedded "" escapes not yet handled - see todo above).
> qidentifierString :: Parser String
> qidentifierString =
>   lexeme $ between (char '"') (char '"') (many (noneOf "\""))
310
311
312parse the block of inline data for a copy from stdin, ends with \. on
313its own on a line
314
> -- | Read the inline table data following 'copy ... from stdin;'.
> -- Everything up to (but not including) a line containing only \.
> -- is returned as a single token.
> copyPayload :: Parser Tok
> copyPayload = CopyPayloadTok <$> lexeme readPayload
>   where
>     terminator = "\\.\n"
>     -- accumulate whole lines until the terminator line is reached
>     readPayload = do
>       ln <- nextLine
>       if ln == terminator
>         then return ""
>         else (ln ++) <$> readPayload
>     nextLine = (++ "\n") <$> manyTill anyChar (try newline)
324>
325
326================================================================================
327
328= parsec pass throughs
329
330> --symbol :: String -> Parser String
331> --symbol = P.symbol lexer
332>
333
> -- parsec pass-through: lex an integer literal (used by positionalArg)
 334> integer :: Parser Integer
 335> integer = lexeme $ P.integer lexer
336
> -- skip whitespace and comments (-- and /* */, per the lexer config below)
 337> whiteSpace :: Parser ()
 338> whiteSpace = P.whiteSpace lexer
339>
> -- run a parser, then skip any trailing whitespace/comments
 340> lexeme :: Parser a -> Parser a
 341> lexeme = P.lexeme lexer
342
343this lexer isn't really used as much as it could be, probably some of
344the fields are not used at all (like identifier and operator stuff)
345
> -- The underlying parsec token parser for the pass-throughs above.
> -- Mainly the comment/whitespace handling is relied on; the
> -- identifier and operator fields are probably unused.
> -- NOTE(review): opStart is taken from emptyDef's opLetter rather
> -- than the opLetters string below - looks inconsistent, but appears
> -- to be harmless if the operator fields are indeed unused; confirm.
 346> lexer :: P.GenTokenParser String LexState Identity
 347> lexer = P.makeTokenParser (emptyDef {
 348>                             P.commentStart = "/*"
 349>                            ,P.commentEnd = "*/"
 350>                            ,P.commentLine = "--"
 351>                            ,P.nestedComments = False
 352>                            ,P.identStart = letter <|> char '_'
 353>                            ,P.identLetter    = alphaNum <|> oneOf "_"
 354>                            ,P.opStart        = P.opLetter emptyDef
 355>                            ,P.opLetter       = oneOf opLetters
 356>                            ,P.reservedOpNames= []
 357>                            ,P.reservedNames  = []
 358>                            ,P.caseSensitive  = False
 359>                            })
360>
> -- characters allowed in operators, referenced by the lexer config's
> -- opLetter field above
 361> opLetters :: String
 362> opLetters = ".:^*/%+-<>=|!"