/lexers/lexer.lua
Lua | 1171 lines | 331 code | 32 blank | 808 comment | 66 complexity | 3b6380780c66f3efacf5aaaedfaca6ba MD5 | raw file
Possible License(s): ISC
- -- Copyright 2006-2010 Mitchell mitchell<att>caladbolg.net. See LICENSE.
- ---
- -- Performs lexing of Scintilla documents.
- module('lexer', package.seeall)
- -- Markdown:
- -- ## Overview
- --
- -- Dynamic lexers are more flexible than Scintilla's static ones. They are often
- -- more readable as well. This document provides all the information necessary
- -- in order to write a new lexer. For illustrative purposes, a Lua lexer will be
- -- created. Lexers are written using Parsing Expression Grammars or PEGs with
- -- the Lua [LPeg library][LPeg]. Please familiarize yourself with LPeg's
- -- documentation before proceeding.
- --
- -- [LPeg]: http://www.inf.puc-rio.br/~roberto/lpeg/lpeg.html
- --
- -- ## Writing a Dynamic Lexer
- --
- -- Rather than writing a lexer from scratch, first see if your language is
- -- similar to any of the 70+ languages supported. If so, you can copy and modify
- -- that lexer, saving some time and effort.
- --
- -- #### Introduction
- --
- -- All lexers are contained in the `lexers/` directory. To begin, create a Lua
- -- script with the name of your lexer and open it for editing.
- --
- -- $> cd lexers
- -- $> textadept lua.lua
- --
- -- Inside the lexer, the heading should look like the following:
- --
- -- -- Lua LPeg lexer
- --
- -- local l = lexer
- -- local token, word_match = l.token, l.word_match
- -- local P, R, S, V = l.lpeg.P, l.lpeg.R, l.lpeg.S, l.lpeg.V
- --
- -- module(...)
- --
- -- Each lexer is a module so the global namespace is not cluttered with lexer
- -- patterns and variables. The `...` is there for a reason! Do not replace it
- -- with the name of your lexer. This is done by Lua automatically.
- --
- -- The local variables above the module give easy access to the many useful
- -- functions available for creating lexers.
- --
- -- #### Lexer Language Structure
- --
- -- It is important to spend some time considering the structure of the language
- -- you are creating the lexer for. What kinds of tokens does it have? Comments,
- -- strings, keywords, etc.? Lua has 9 tokens: whitespace, comments, strings,
- -- numbers, keywords, functions, constants, identifiers, and operators.
- --
- -- #### Tokens
- --
- -- In a lexer, tokens are comprised of a token type followed by an LPeg pattern.
- -- They are created using the [`token()`](#token) function. The `lexer` (`l`)
- -- module provides a number of default token types:
- --
- -- * `DEFAULT`
- -- * `WHITESPACE`
- -- * `COMMENT`
- -- * `STRING`
- -- * `NUMBER`
- -- * `KEYWORD`
- -- * `IDENTIFIER`
- -- * `OPERATOR`
- -- * `ERROR`
- -- * `PREPROCESSOR`
- -- * `CONSTANT`
- -- * `VARIABLE`
- -- * `FUNCTION`
- -- * `CLASS`
- -- * `TYPE`
- --
- -- Please note you are not limited to just these token types; you can create
- -- your own. If you create your own, you will have to specify how they are
- -- colored. The procedure is discussed later.
- --
- -- A whitespace token typically looks like:
- --
- -- local ws = token(l.WHITESPACE, S('\t\v\f\n\r ')^1)
- --
- -- It is difficult to remember that a space character is either a `\t`, `\v`,
- -- `\f`, `\n`, `\r`, or ` `. The `lexer` module also provides you with a
- -- shortcut for this and many other character sequences. They are:
- --
- -- * `any`: Matches any single character.
- -- * `ascii`: Matches any ASCII character (`0`..`127`).
- -- * `extend`: Matches any ASCII extended character (`0`..`255`).
- -- * `alpha`: Matches any alphabetic character (`A-Z`, `a-z`).
- -- * `digit`: Matches any digit (`0-9`).
- -- * `alnum`: Matches any alphanumeric character (`A-Z`, `a-z`, `0-9`).
- -- * `lower`: Matches any lowercase character (`a-z`).
- -- * `upper`: Matches any uppercase character (`A-Z`).
- -- * `xdigit`: Matches any hexadecimal digit (`0-9`, `A-F`, `a-f`).
- -- * `cntrl`: Matches any control character (`0`..`31`).
- -- * `graph`: Matches any graphical character (`!` to `~`).
- -- * `print`: Matches any printable character (space to `~`).
- -- * `punct`: Matches any punctuation character not alphanumeric (`!` to `/`,
- -- `:` to `@`, `[` to `` ` ``, `{` to `~`).
- -- * `space`: Matches any whitespace character (`\t`, `\v`, `\f`, `\n`, `\r`,
- -- space).
- -- * `newline`: Matches any newline characters.
- -- * `nonnewline`: Matches any non-newline character.
- -- * `nonnewline_esc`: Matches any non-newline character excluding newlines
- -- escaped with `\\`.
- -- * `dec_num`: Matches a decimal number.
- -- * `hex_num`: Matches a hexadecimal number.
- -- * `oct_num`: Matches an octal number.
- -- * `integer`: Matches a decimal, hexadecimal, or octal number.
- -- * `float`: Matches a floating point number.
- -- * `word`: Matches a typical word starting with a letter or underscore and
- -- then any alphanumeric or underscore characters.
- --
- -- The above whitespace token can be rewritten more simply as:
- --
- -- local ws = token(l.WHITESPACE, l.space^1)
- --
- -- The next Lua token is a comment. Short comments beginning with `--` are easy
- -- to express with LPeg:
- --
- -- local line_comment = '--' * l.nonnewline^0
- --
- -- On the other hand, long comments are more difficult to express because they
- -- have levels. See the [Lua Reference Manual][lexical_conventions] for more
- -- information. As a result, a functional pattern is necessary:
- --
- -- local longstring = #('[[' + ('[' * P('=')^0 * '[')) *
- -- P(function(input, index)
- -- local level = input:match('^%[(=*)%[', index)
- -- if level then
- -- local _, stop = input:find(']'..level..']', index, true)
- -- return stop and stop + 1 or #input + 1
- -- end
- -- end)
- -- local block_comment = '--' * longstring
- --
- -- The token for a comment is then:
- --
- -- local comment = token(l.COMMENT, line_comment + block_comment)
- --
- -- [lexical_conventions]: http://www.lua.org/manual/5.1/manual.html#2.1
- --
- -- It is worth noting that while token names are arbitrary, you are encouraged
- -- to use the ones listed in the [`tokens`](#tokens) table because a standard
- -- color theme is applied to them. If you wish to create a unique token, no
- -- problem. You can specify how it will be colored later on.
- --
- -- Lua strings should be easy to express because they are just characters
- -- surrounded by `'` or `"` characters, right? Not quite. Lua strings contain
- -- escape sequences (`\`*`char`*) so a `\'` sequence in a single-quoted string
- -- does not indicate the end of a string and must be handled appropriately.
- -- Fortunately, this is a common occurrence in many programming languages, so a
- -- convenient function is provided: [`delimited_range()`](#delimited_range).
- --
- -- local sq_str = l.delimited_range("'", '\\', true)
- -- local dq_str = l.delimited_range('"', '\\', true)
- --
- -- Lua also has multi-line strings, but they have the same format as block
- -- comments. All strings can be combined into a single token:
- --
- -- local string = token(l.STRING, sq_str + dq_str + longstring)
- --
- -- Numbers are easy in Lua using `lexer`'s predefined patterns.
- --
- -- local lua_integer = P('-')^-1 * (l.hex_num + l.dec_num)
- -- local number = token(l.NUMBER, l.float + lua_integer)
- --
- -- Keep in mind that the predefined patterns may not be completely accurate for
- -- your language, so you may have to create your own variants. In the above
- -- case, Lua integers do not have octal sequences, so the `l.integer` pattern is
- -- not used.
- --
- -- Depending on the number of keywords for a particular language, a simple
- -- `P(keyword1) + P(keyword2) + ... + P(keywordN)` pattern can get quite large.
- -- In fact, LPeg has a limit on pattern size. Also, if the keywords are not case
- -- sensitive, additional complexity arises, so a better approach is necessary.
- -- Once again, `lexer` has a shortcut function: [`word_match()`](#word_match).
- --
- -- local keyword = token(l.KEYWORD, word_match {
- -- 'and', 'break', 'do', 'else', 'elseif', 'end', 'false', 'for',
- -- 'function', 'if', 'in', 'local', 'nil', 'not', 'or', 'repeat',
- -- 'return', 'then', 'true', 'until', 'while'
- -- })
- --
- -- If keywords were case-insensitive, an additional parameter would be specified
- -- in the call to [`word_match()`](#word_match); no other action is needed.
- --
- -- Lua functions and constants are specified like keywords:
- --
- -- local func = token(l.FUNCTION, word_match {
- -- 'assert', 'collectgarbage', 'dofile', 'error', 'getfenv',
- -- 'getmetatable', 'gcinfo', 'ipairs', 'loadfile', 'loadlib',
- -- 'loadstring', 'next', 'pairs', 'pcall', 'print', 'rawequal',
- -- 'rawget', 'rawset', 'require', 'setfenv', 'setmetatable',
- -- 'tonumber', 'tostring', 'type', 'unpack', 'xpcall'
- -- })
- --
- -- local constant = token(l.CONSTANT, word_match {
- -- '_G', '_VERSION', 'LUA_PATH', '_LOADED', '_REQUIREDNAME', '_ALERT',
- -- '_ERRORMESSAGE', '_PROMPT'
- -- })
- --
- -- Unlike most programming languages, Lua allows an additional range of
- -- characters in its identifier names (variables, functions, modules, etc.) so
- -- the usual `l.word` cannot be used. Instead, identifiers are represented by:
- --
- -- local word = (R('AZ', 'az', '\127\255') + '_') * (l.alnum + '_')^0
- -- local identifier = token(l.IDENTIFIER, word)
- --
- -- Finally, an operator character is one of the following:
- --
- -- local operator = token(l.OPERATOR, '~=' + S('+-*/%^#=<>;:,.{}[]()'))
- --
- -- #### Rules
- --
- -- Rules are just a combination of tokens. In Lua, all rules consist of a
- -- single token, but other languages may have two or more tokens in a rule.
- -- For example, an HTML tag consists of an element token followed by an
- -- optional set of attribute tokens. This allows each part of the tag to be
- -- colored distinctly.
- --
- -- The set of rules that comprises Lua is specified in a `_rules` table for the
- -- lexer.
- --
- -- _rules = {
- -- { 'whitespace', ws },
- -- { 'keyword', keyword },
- -- { 'function', func },
- -- { 'constant', constant },
- -- { 'identifier', identifier },
- -- { 'string', string },
- -- { 'comment', comment },
- -- { 'number', number },
- -- { 'operator', operator },
- -- { 'any_char', l.any_char },
- -- }
- --
- -- Each entry is a rule name and its associated pattern. Please note that the
- -- names of the rules can be completely different than the names of the tokens
- -- contained within them.
- --
- -- The order of the rules is important because of the nature of LPeg. LPeg tries
- -- to apply the first rule to the current position in the text it is matching.
- -- If there is a match, it colors that section appropriately and moves on. If
- -- there is not a match, it tries the next rule, and so on. Suppose instead that
- -- the `identifier` rule was before the `keyword` rule. It can be seen that all
- -- keywords satisfy the requirements for being an identifier, so any keywords
- -- would be incorrectly colored as identifiers. This is why `identifier` is
- -- where it is in the `_rules` table.
- --
- -- You might be wondering what that `any_char` is doing at the bottom of
- -- `_rules`. Its purpose is to match anything not accounted for in the above
- -- rules. For example, suppose the `!` character is in the input text. It will
- -- not be matched by any of the first 9 rules, so without `any_char`, the text
- -- would not match at all, and no coloring would occur. `any_char` matches one
- -- single character and moves on. It may be colored red (indicating a syntax
- -- error) if desired because it is a token, not just a pattern.
- --
- -- #### Summary
- --
- -- The above method of defining tokens and rules is sufficient for a majority of
- -- lexers. The `lexer` module provides many useful patterns and functions for
- -- constructing a working lexer quickly and efficiently. In most cases, the
- -- amount of knowledge of LPeg required to write a lexer is minimal.
- --
- -- As long as you used the default token types provided by `lexer`, you do not
- -- have to specify any coloring (or styling) information in the lexer; it is
- -- taken care of by the user's color theme.
- --
- -- The rest of this document is devoted to more complex lexer techniques.
- --
- -- #### Styling Tokens
- --
- -- The term for coloring text is styling. Just like with predefined LPeg
- -- patterns in `lexer`, predefined styles are available.
- --
- -- * `style_nothing`: Typically used for whitespace.
- -- * `style_char`: Typically used for character literals.
- -- * `style_class`: Typically used for class definitions.
- -- * `style_comment`: Typically used for code comments.
- -- * `style_constant`: Typically used for constants.
- -- * `style_definition`: Typically used for definitions.
- -- * `style_error`: Typically used for erroneous syntax.
- -- * `style_function`: Typically used for function definitions.
- -- * `style_keyword`: Typically used for language keywords.
- -- * `style_number`: Typically used for numbers.
- -- * `style_operator`: Typically used for operators.
- -- * `style_string`: Typically used for strings.
- -- * `style_preproc`: Typically used for preprocessor statements.
- -- * `style_tag`: Typically used for markup tags.
- -- * `style_type`: Typically used for static types.
- -- * `style_variable`: Typically used for variables.
- -- * `style_embedded`: Typically used for embedded code.
- -- * `style_identifier`: Typically used for identifier words.
- --
- -- Each style consists of a set of attributes:
- --
- -- + `font`: The style's font name.
- -- + `size`: The style's font size.
- -- + `bold`: Flag indicating whether or not the font is boldface.
- -- + `italic`: Flag indicating whether or not the font is italic.
- -- + `underline`: Flag indicating whether or not the font is underlined.
- -- + `fore`: The color of the font face.
- -- + `back`: The color of the font background.
- -- + `eolfilled`: Flag indicating whether or not to color the end of the line.
- -- + `characterset`: The character set of the font.
- -- + `case`: The case of the font. 1 for upper case, 2 for lower case, 0 for
- -- normal case.
- -- + `visible`: Flag indicating whether or not the text is visible.
- -- + `changable`: Flag indicating whether or not the text is read-only.
- -- + `hotspot`: Flag indicating whether or not the style is clickable.
- --
- -- Styles are created with [`style()`](#style). For example:
- --
- -- -- style with default theme settings
- -- local style_nothing = l.style { }
- --
- -- -- style with bold text with default theme font
- -- local style_bold = l.style { bold = true }
- --
- -- -- style with bold italic text with default theme font
- -- local style_bold_italic = l.style { bold = true, italic = true }
- --
- -- The `style_bold_italic` style can be rewritten in terms of `style_bold`:
- --
- -- local style_bold_italic = style_bold..{ italic = true }
- --
- -- In this way you can build on previously defined styles without having to
- -- rewrite them. Note the previous style is left unchanged.
- --
- -- Style colors are different than the #rrggbb RGB notation you may be familiar
- -- with. Instead, create a color using [`color()`](#color).
- --
- -- local red = l.color('FF', '00', '00')
- -- local green = l.color('00', 'FF', '00')
- -- local blue = l.color('00', '00', 'FF')
- --
- -- As you might have guessed, `lexer` has a set of default colors.
- --
- -- * `green`
- -- * `blue`
- -- * `red`
- -- * `yellow`
- -- * `teal`
- -- * `white`
- -- * `black`
- -- * `grey`
- -- * `purple`
- -- * `orange`
- --
- -- It is recommended to use them to stay consistent with a user's color theme.
- --
- -- Finally, styles are assigned to tokens via a `_tokenstyles` table in the
- -- lexer. Styles do not have to be assigned to the default tokens; it is done
- -- automatically. You only have to assign styles for tokens you create. For
- -- example:
- --
- -- local lua = token('lua', P('lua'))
- --
- -- -- ... other patterns and tokens ...
- --
- -- _tokenstyles = {
- -- { 'lua', l.style_keyword },
- -- }
- --
- -- Each entry is the token name the style is for and the style itself. The order
- -- of styles in `_tokenstyles` does not matter.
- --
- -- For examples of how styles are created, please see the theme files in the
- -- `lexers/themes/` folder.
- --
- -- #### Line Lexer
- --
- -- Sometimes it is advantageous to lex input text line by line rather than a
- -- chunk at a time. This occurs particularly in diff, patch, or make files. Put
- --
- -- _LEXBYLINE = true
- --
- -- somewhere in your lexer in order to do this.
- --
- -- #### Embedded Lexers
- --
- -- A particular advantage that dynamic lexers have over static ones is that
- -- lexers can be embedded within one another very easily, requiring minimal
- -- effort. There are two kinds of embedded lexers: a parent lexer that embeds
- -- other child lexers in it, and a child lexer that embeds itself within a
- -- parent lexer.
- --
- -- #### Parent Lexer with Children
- --
- -- An example of this kind of lexer is HTML with embedded CSS and Javascript.
- -- After creating the parent lexer, load the children lexers in it using
- -- [`lexer.load()`](#load). For example:
- --
- -- local css = l.load('css')
- --
- -- There needs to be a transition from the parent HTML lexer to the child CSS
- -- lexer. This is something of the form `<style type="text/css">`. Similarly,
- -- the transition from child to parent is `</style>`.
- --
- -- local css_start_rule = #(P('<') * P('style') *
- -- P(function(input, index)
- -- if input:find('[^>]+type%s*=%s*(["\'])text/css%1') then
- -- return index
- -- end
- -- end)) * tag
- -- local css_end_rule = #(P('</') * P('style') * ws^0 * P('>')) * tag
- --
- -- where `tag` and `ws` have been previously defined in the HTML lexer.
- --
- -- Now the CSS lexer can be embedded using [`embed_lexer()`](#embed_lexer):
- --
- -- l.embed_lexer(_M, css, css_start_rule, css_end_rule)
- --
- -- What is `_M`? It is the parent HTML lexer object, not the string `...` or
- -- `'html'`. The lexer object is needed by [`embed_lexer()`](#embed_lexer).
- --
- -- The same procedure can be done for Javascript.
- --
- -- local js = l.load('javascript')
- --
- -- local js_start_rule = #(P('<') * P('script') *
- -- P(function(input, index)
- -- if input:find('[^>]+type%s*=%s*(["\'])text/javascript%1') then
- -- return index
- -- end
- -- end)) * tag
- -- local js_end_rule = #('</' * P('script') * ws^0 * '>') * tag
- -- l.embed_lexer(_M, js, js_start_rule, js_end_rule)
- --
- -- #### Child Lexer Within Parent
- --
- -- An example of this kind of lexer is PHP embedded in HTML. After creating the
- -- child lexer, load the parent lexer. As an example:
- --
- -- local html = l.load('hypertext')
- --
- -- Since HTML should be the main lexer, (PHP is just a preprocessing language),
- -- the following statement changes the main lexer from PHP to HTML:
- --
- -- _lexer = html
- --
- -- Like in the previous section, transitions from HTML to PHP and back are
- -- specified:
- --
- -- local php_start_rule = token('php_tag', '<?' * ('php' * l.space)^-1)
- -- local php_end_rule = token('php_tag', '?>')
- --
- -- And PHP is embedded:
- --
- -- l.embed_lexer(html, _M, php_start_rule, php_end_rule)
- --
- -- #### Code Folding (Optional)
- --
- -- It is sometimes convenient to "fold", or not show blocks of text. These
- -- blocks can be functions, classes, comments, etc. A folder iterates over each
- -- line of input text and assigns a fold level to it. Certain lines can be
- -- specified as fold points that fold subsequent lines with a higher fold level.
- --
- -- In order to implement a folder, define the following function in your lexer:
- --
- -- function _fold(input, start_pos, start_line, start_level)
- --
- -- end
- --
- -- + `input`: The text to fold.
- -- + `start_pos`: Current position in the buffer of the text (used for obtaining
- -- style information from the document).
- -- + `start_line`: The line number the text starts at.
- -- + `start_level`: The fold level of the text at `start_line`.
- --
- -- The function must return a table whose indices are line numbers and whose
- -- values are tables containing the fold level and optionally a fold flag.
- --
- -- The following Scintilla fold flags are available:
- --
- -- * `SC_FOLDLEVELBASE`: The initial (root) fold level.
- -- * `SC_FOLDLEVELWHITEFLAG`: Flag indicating that the line is blank.
- -- * `SC_FOLDLEVELHEADERFLAG`: Flag indicating the line is a fold point.
- -- * `SC_FOLDLEVELNUMBERMASK`: Flag used with `SCI_GETFOLDLEVEL(line)` to get
- -- the fold level of a line.
- --
- -- Have your fold function iterate over each line, setting fold levels. You can
- -- use the [`get_style_at()`](#get_style_at), [`get_property()`](#get_property),
- -- [`get_fold_level()`](#get_fold_level), and
- -- [`get_indent_amount()`](#get_indent_amount) functions as necessary to
- -- determine the fold level for each line. The following example sets fold
- -- points by changes in indentation.
- --
- -- function _fold(input, start_pos, start_line, start_level)
- -- local folds = {}
- -- local current_line = start_line
- -- local prev_level = start_level
- -- for indent, line in input:gmatch('([\t ]*)(.-)\r?\n') do
- -- if #line > 0 then
- -- local current_level = l.get_indent_amount(current_line)
- -- if current_level > prev_level then -- next level
- -- local i = current_line - 1
- -- while folds[i] and folds[i][2] == l.SC_FOLDLEVELWHITEFLAG do
- -- i = i - 1
- -- end
- -- if folds[i] then
- -- folds[i][2] = l.SC_FOLDLEVELHEADERFLAG -- low indent
- -- end
- -- folds[current_line] = { current_level } -- high indent
- -- elseif current_level < prev_level then -- prev level
- -- if folds[current_line - 1] then
- -- folds[current_line - 1][1] = prev_level -- high indent
- -- end
- -- folds[current_line] = { current_level } -- low indent
- -- else -- same level
- -- folds[current_line] = { prev_level }
- -- end
- -- prev_level = current_level
- -- else
- -- folds[current_line] = { prev_level, l.SC_FOLDLEVELWHITEFLAG }
- -- end
- -- current_line = current_line + 1
- -- end
- -- return folds
- -- end
- --
- -- SciTE users note: do not use `get_property` for getting fold options from a
- -- `.properties` file because SciTE is not set up to forward them to your lexer.
- -- Instead, you can provide options that can be set at the top of the lexer.
- --
- -- #### Using the Lexer with SciTE
- --
- -- Create a `.properties` file for your lexer and `import` it in either your
- -- `SciTEUser.properties` or `SciTEGlobal.properties`. The contents of the
- -- `.properties` file should contain:
- --
- -- file.patterns.[lexer_name]=[file_patterns]
- -- lexer.$(file.patterns.[lexer_name])=[lexer_name]
- --
- -- where [lexer\_name] is the name of your lexer (minus the `.lua` extension)
- -- and [file\_patterns] is a set of file extensions matched to your lexer.
- --
- -- Please note any styling information in `.properties` files is ignored.
- --
- -- #### Using the Lexer with Textadept
- --
- -- Put your lexer in your [`~/.textadept/`][user]`lexers/` directory. That way
- -- your lexer will not be overwritten when upgrading. Also, lexers in this
- -- directory override default lexers. (A user `lua` lexer would be loaded
- -- instead of the default `lua` lexer. This is convenient if you wish to tweak
- -- a default lexer to your liking.) Do not forget to add a
- -- [mime-type](textadept.mime_types.html) for your lexer.
- --
- -- [user]: http://caladbolg.net/luadoc/textadept/manual/5_FolderStructure.html
- --
- -- #### Optimization
- --
- -- Lexers can usually be optimized for speed by re-arranging tokens so that the
- -- most common ones are recognized first. Keep in mind the issue that was raised
- -- earlier: if you put similar tokens like `identifier`s before `keyword`s, the
- -- latter will not be styled correctly.
- --
- -- #### Troubleshooting
- --
- -- Errors in lexers can be tricky to debug. Lua errors are printed to STDERR
- -- and `_G.print()` statements in lexers are printed to STDOUT.
- --
- -- #### Limitations
- --
- -- True embedded preprocessor language highlighting is not available. For most
- -- cases this will not be noticed, but code like
- --
- -- <div id="<?php echo $id; ?>">
- --
- -- or
- --
- -- <div <?php if ($odd) { echo 'class="odd"'; } ?>>
- --
- -- will not highlight correctly.
- --
- -- #### Performance
- --
- -- There might be some slight overhead when initializing a lexer, but loading a
- -- file from disk into Scintilla is usually more expensive.
- --
- -- On modern computer systems, I see no difference in speed between LPeg lexers
- -- and Scintilla's C++ ones.
- --
- -- #### Risks
- --
- -- Poorly written lexers have the ability to crash Scintilla, so unsaved data
- -- might be lost. However, these crashes have only been observed in early lexer
- -- development, when syntax errors or pattern errors are present. Once the lexer
- -- actually starts styling text (either correctly or incorrectly; it does not
- -- matter), no crashes have occurred.
- --
- -- #### Acknowledgements
- --
- -- Thanks to Peter Odding for his [lexer post][post] on the Lua mailing list
- -- that inspired me, and of course thanks to Roberto Ierusalimschy for LPeg.
- --
- -- [post]: http://lua-users.org/lists/lua-l/2007-04/msg00116.html
- local lpeg = require 'lpeg'
- local lpeg_P, lpeg_R, lpeg_S, lpeg_V = lpeg.P, lpeg.R, lpeg.S, lpeg.V
- local lpeg_Ct, lpeg_Cc, lpeg_Cp = lpeg.Ct, lpeg.Cc, lpeg.Cp
- local lpeg_match = lpeg.match
- package.path = _LEXERHOME..'/?.lua'
-- Appends a rule to a lexer, recording both the pattern and the order in which
-- it was registered.
-- @param lexer The lexer to add the given rule to.
-- @param id The name associated with this rule. Other lexers use it to look
--   the rule up in the lexer's `_RULES` table. It does not have to be the same
--   as the name passed to `token`.
-- @param rule The LPeg pattern of the rule.
local function add_rule(lexer, id, rule)
  local rules = lexer._RULES
  if not rules then
    ---
    -- Map of rule names to their LPeg patterns for a specific lexer.
    -- Other lexers may read it when embedding this lexer.
    -- @class table
    -- @name _RULES
    rules = {}
    lexer._RULES = rules
    -- Sequence of rule names in registration order. Together with
    -- lexer._RULES it is used to build _TOKENRULE.
    lexer._RULEORDER = {}
  end
  rules[id] = rule
  local order = lexer._RULEORDER
  order[#order + 1] = id
end
-- Registers a style for a token under the lexer's next free style number.
-- The token name is mapped to that number in `lexer._TOKENS` and the style is
-- stored under the same number in `lexer._STYLES`.
-- @param lexer The lexer to add the given style to.
-- @param token_name The name of the token associated with this style.
-- @param style A Scintilla style created from style().
-- @see style
local function add_style(lexer, token_name, style)
  local styles = lexer._STYLES
  local num = styles.len
  -- Numbers 32 through 39 are reserved for the predefined styles; jump to 40.
  if num == 32 then num = 40 end
  -- Only a warning is printed; the style is still recorded below.
  if num >= 128 then _G.print('Too many styles defined (128 MAX)') end
  lexer._TOKENS[token_name] = num
  styles[num] = style
  styles.len = num + 1
end
-- (Re)constructs lexer._TOKENRULE by combining every registered rule with `+`
-- in registration order.
-- @param lexer The lexer whose `_RULES` and `_RULEORDER` are combined.
-- @return The combined rule, which is also stored in `lexer._TOKENRULE`.
local function join_tokens(lexer)
  local rules, names = lexer._RULES, lexer._RULEORDER
  local combined = rules[names[1]]
  local count = #names
  local i = 2
  while i <= count do
    combined = combined + rules[names[i]]
    i = i + 1
  end
  lexer._TOKENRULE = combined
  return combined
end
-- (Re)constructs lexer._GRAMMAR.
-- A lexer without children gets a plain repetition of its token rule. A lexer
-- with children gets an LPeg grammar with one entry for the parent and two per
-- child: `_<child>` (entered from the parent via the child's start rule) and
-- `<child>` (used when lexing starts inside the child).
-- @param lexer The parent lexer.
-- @param initial_rule The name of the rule to start lexing with. Defaults to
--   lexer._NAME. Multilang lexers use this to start with a child rule if
--   necessary.
local function build_grammar(lexer, initial_rule)
  local main_rule = join_tokens(lexer)
  local children = lexer._CHILDREN
  if not children then
    lexer._GRAMMAR = lpeg_Ct(main_rule^0)
    return
  end

  local parent_name = lexer._NAME
  initial_rule = initial_rule or parent_name
  local grammar = { initial_rule, [parent_name] = main_rule^0 }
  for _, child in ipairs(children) do
    local name = child._NAME
    local embedded_name = '_'..name
    local embedded = child._EMBEDDEDRULES[parent_name]
    -- Child tokens repeat until the end rule matches; the end rule itself is
    -- optional so that text ending inside the child still lexes.
    local child_tokens = (-embedded.end_rule * embedded.token_rule)^0
    local optional_end = embedded.end_rule^-1
    grammar[embedded_name] = embedded.start_rule * child_tokens * optional_end
    -- Each child is tried before everything accumulated so far.
    main_rule = lpeg_V(embedded_name) + main_rule
    grammar[name] = child_tokens * optional_end * lpeg_V(parent_name)
  end
  grammar[parent_name] = main_rule^0
  lexer._INITIALRULE = initial_rule
  lexer._GRAMMAR = lpeg_Ct(lpeg_P(grammar))
end
-- Default tokens.
-- Maps each default token identifier to its style number.
-- @class table
-- @name tokens
-- @field default The default type (0).
-- @field whitespace The whitespace type (1).
-- @field comment The comment type (2).
-- @field string The string type (3).
-- @field number The number type (4).
-- @field keyword The keyword type (5).
-- @field identifier The identifier type (6).
-- @field operator The operator type (7).
-- @field error The error type (8).
-- @field preprocessor The preprocessor type (9).
-- @field constant The constant type (10).
-- @field variable The variable type (11).
-- @field function The function type (12).
-- @field class The class type (13).
-- @field type The type type (14).
local tokens = {
  default = 0, whitespace = 1, comment = 2, string = 3, number = 4,
  keyword = 5, identifier = 6, operator = 7, error = 8, preprocessor = 9,
  constant = 10, variable = 11, ['function'] = 12, class = 13, type = 14,
}
local string_upper = string.upper
-- Export an upper-case constant for every default token; its value is the
-- lower-case token name (for example, `_M.COMMENT` is 'comment').
for name in pairs(tokens) do
  _M[string_upper(name)] = name
end
---
-- Initializes the specified lexer.
-- Requires the lexer module, attaches the default token and style tables to
-- it, registers its custom token styles and rules, builds its grammar, and
-- stores the result in `_G._LEXER`. As a side effect `_M.WHITESPACE` is set to
-- `lexer_name..'_whitespace'` before the lexer module is required.
-- If the required module defines `_lexer` (a child lexer that embeds itself in
-- a parent), the child's rules and token styles are appended to the parent and
-- the parent becomes the lexer that is initialized and returned.
-- @param lexer_name The name of the lexing language. Defaults to 'null'.
-- @return The initialized lexer object.
function load(lexer_name)
  -- Apply the fallback before the name is first used. Previously the
  -- concatenation below raised an error for a nil name, so the 'null' fallback
  -- passed to require() could never take effect.
  lexer_name = lexer_name or 'null'
  _M.WHITESPACE = lexer_name..'_whitespace'
  local lexer = require(lexer_name)
  if not lexer then error('Lexer '..lexer_name..' does not exist') end
  -- Note: this is the module-level `tokens` table itself, not a copy.
  lexer._TOKENS = tokens
  lexer._STYLES = {
    [0] = style_nothing,
    [1] = style_whitespace,
    [2] = style_comment,
    [3] = style_string,
    [4] = style_number,
    [5] = style_keyword,
    [6] = style_identifier,
    [7] = style_operator,
    [8] = style_error,
    [9] = style_preproc,
    [10] = style_constant,
    [11] = style_variable,
    [12] = style_function,
    [13] = style_class,
    [14] = style_type,
    -- Next free style number; advanced by add_style().
    len = 15,
    -- Predefined styles.
    [32] = style_default,
    [33] = style_line_number,
    [34] = style_bracelight,
    [35] = style_bracebad,
    [36] = style_controlchar,
    [37] = style_indentguide,
    [38] = style_calltip,
  }
  if lexer._lexer then
    local l, _r, _s = lexer._lexer, lexer._rules, lexer._tokenstyles
    if not l._tokenstyles then l._tokenstyles = {} end
    for _, r in ipairs(_r or {}) do
      -- Prevent rule id clashes.
      l._rules[#l._rules + 1] = { lexer._NAME..'_'..r[1], r[2] }
    end
    for _, s in ipairs(_s or {}) do l._tokenstyles[#l._tokenstyles + 1] = s end
    -- Each lexer that is loaded with l.load() has its _STYLES modified through
    -- add_style(). Reset _lexer's _STYLES accordingly.
    -- For example: RHTML loads HTML (which loads CSS and Javascript). CSS's
    -- styles are added to css._STYLES and JS's styles are added to js._STYLES.
    -- HTML adds its styles to html._STYLES as well as CSS's and JS's styles.
    -- RHTML adds its styles, HTML's styles, CSS's styles, and JS's styles to
    -- rhtml._STYLES. The problem is that rhtml == _lexer == html. Therefore
    -- html._STYLES would contain duplicate styles. Compensate by setting
    -- html._STYLES to rhtml._STYLES.
    l._STYLES = lexer._STYLES
    lexer = l
  end
  if lexer._rules then
    for _, s in ipairs(lexer._tokenstyles or {}) do
      add_style(lexer, s[1], s[2])
    end
    for _, r in ipairs(lexer._rules) do add_rule(lexer, r[1], r[2]) end
    build_grammar(lexer)
  end
  -- Every lexer gets its own whitespace token, named after the lexer.
  add_style(lexer, lexer._NAME..'_whitespace', style_whitespace)
  _G._LEXER = lexer
  return lexer
end
---
-- Lexes the given text.
-- Called by LexLPeg.cxx; do not call from Lua.
-- If the lexer has a _LEXBYLINE flag set, the text is lexed one line at a time.
-- Otherwise the text is lexed as a whole.
-- @param text The text to lex.
-- @param init_style The current style. Multilang lexers use this to determine
--   which language to start lexing in.
-- @return The table of tokens produced by matching the lexer's grammar. An
--   empty table is returned if the current lexer has no grammar.
function lex(text, init_style)
  local lexer = _G._LEXER
  if not lexer._GRAMMAR then return {} end
  if not lexer._LEXBYLINE then
    -- For multilang lexers, build a new grammar whose initial_rule is the
    -- current language.
    if lexer._CHILDREN then
      for style, style_num in pairs(lexer._TOKENS) do
        if style_num == init_style then
          -- A '<name>_whitespace' style identifies the language in effect;
          -- any other style means the parent language.
          local lexer_name = style:match('^(.+)_whitespace') or lexer._NAME
          if lexer._INITIALRULE ~= lexer_name then
            build_grammar(lexer, lexer_name)
          end
          break
        end
      end
    end
    return lpeg_match(lexer._GRAMMAR, text)
  end

  -- Line-by-line lexing: match each line separately and shift the positions
  -- of its tokens by the length of the text that precedes the line.
  local result = {}
  local offset = 0
  local grammar = lexer._GRAMMAR
  for line in text:gmatch('[^\r\n]*[\r\n]*') do
    local line_tokens = lpeg_match(grammar, line)
    if line_tokens then
      for _, token in ipairs(line_tokens) do
        token[2] = token[2] + offset
        result[#result + 1] = token
      end
    end
    offset = offset + #line
    -- Use the default style to the end of the line if none was specified.
    -- The end of the line is position `offset + 1` (the same value the
    -- default token below carries), so compare against that rather than
    -- `offset`. Also tolerate an empty token list (e.g. empty text), which
    -- previously raised an "attempt to index nil" error.
    local last = result[#result]
    if not last or last[2] ~= offset + 1 then
      result[#result + 1] = { 'default', offset + 1 }
    end
  end
  return result
end
---
-- Folds the given text.
-- Called by LexLPeg.cxx; do not call from Lua.
-- If the current lexer has a _fold function, its result is returned as is.
-- Otherwise folding by indentation is performed if the 'fold.by.indentation'
-- property is set; if it is not, an empty table is returned.
-- @param text The document text to fold.
-- @param start_pos The position in the document text starts at.
-- @param start_line The line number text starts on.
-- @param start_level The fold level text starts on.
-- @return Table of fold levels. For indentation folding it is keyed by line
--   number and each entry is `{ level }` or `{ level, flag }`.
function fold(text, start_pos, start_line, start_level)
  local folds = {}
  local lexer = _G._LEXER
  if lexer._fold then
    return lexer._fold(text, start_pos, start_line, start_level)
  elseif GetProperty('fold.by.indentation', 1) == 1 then
    local GetIndentAmount = GetIndentAmount
    local SC_FOLDLEVELHEADERFLAG, SC_FOLDLEVELWHITEFLAG =
      SC_FOLDLEVELHEADERFLAG, SC_FOLDLEVELWHITEFLAG
    -- Indentation based folding.
    local current_line = start_line
    local prev_level = start_level
    -- Only newline-terminated lines are visited; the first capture (the
    -- leading blanks) is not needed because the indent is queried per line.
    for _, line in text:gmatch('([\t ]*)(.-)\r?\n') do
      if #line > 0 then
        local current_level = GetIndentAmount(current_line)
        if current_level > prev_level then -- next level
          -- The closest preceding non-blank line becomes the fold header.
          local i = current_line - 1
          while folds[i] and folds[i][2] == SC_FOLDLEVELWHITEFLAG do
            i = i - 1
          end
          if folds[i] then
            folds[i][2] = SC_FOLDLEVELHEADERFLAG -- low indent
          end
          folds[current_line] = { current_level } -- high indent
        elseif current_level < prev_level then -- prev level
          if folds[current_line - 1] then
            folds[current_line - 1][1] = prev_level -- high indent
          end
          folds[current_line] = { current_level } -- low indent
        else -- same level
          folds[current_line] = { prev_level }
        end
        prev_level = current_level
      else
        -- Blank (or whitespace-only) line: keep the previous level.
        folds[current_line] = { prev_level, SC_FOLDLEVELWHITEFLAG }
      end
      current_line = current_line + 1
    end
  end
  -- Always hand back a table, even when no folding was performed.
  return folds
end
-- The following are utility functions lexers will have access to.

-- Common patterns.
any = lpeg_P(1)
ascii = lpeg_R('\000\127')
extend = lpeg_R('\000\255')
alpha = lpeg_R('AZ', 'az')
digit = lpeg_R('09')
alnum = lpeg_R('AZ', 'az', '09')
lower = lpeg_R('az')
upper = lpeg_R('AZ')
xdigit = lpeg_R('09', 'AF', 'af')
cntrl = lpeg_R('\000\031')
graph = lpeg_R('!~')
-- Note: inside this module `print` is this pattern, not the global function.
print = lpeg_R(' ~')
-- The third range runs from '[' to '`' (i.e. [ \ ] ^ _ `). A range whose
-- upper bound sorts below its lower bound, such as '[' .. "'", is empty.
punct = lpeg_R('!/', ':@', '[`', '{~')
space = lpeg_S('\t\v\f\n\r ')
-- One or more consecutive line-break characters.
newline = lpeg_S('\r\n\f')^1
nonnewline = 1 - newline
-- Any non-newline character other than a backslash, or a backslash followed
-- by any character.
nonnewline_esc = 1 - (newline + '\\') + '\\' * any
dec_num = digit^1
hex_num = '0' * lpeg_S('xX') * xdigit^1
oct_num = '0' * lpeg_R('07')^1
integer = lpeg_S('+-')^-1 * (hex_num + oct_num + dec_num)
-- Optional sign, mantissa with or without a decimal point, and an exponent.
-- The exponent part is required by this pattern.
float = lpeg_S('+-')^-1 *
  (digit^0 * '.' * digit^1 + digit^1 * '.' * digit^0 + digit^1) *
  lpeg_S('eE') * lpeg_S('+-')^-1 * digit^1
word = (alpha + '_') * (alnum + '_')^0
---
-- Wraps a pattern so that a successful match produces a table capture
-- holding the token's name followed by the position captured right after the
-- pattern.
-- @param name The name of token. If this name is not in `l.tokens` then you
--   will have to specify a style for it in `lexer._tokenstyles`.
-- @param patt The LPeg pattern associated with the token.
-- @return The LPeg table-capture pattern.
-- @usage local ws = token(l.WHITESPACE, l.space^1)
-- @usage php_start_rule = token('php_tag', '<?' * ('php' * l.space)^-1)
function token(name, patt)
  -- Report a missing name on the real print function (`_G.print`).
  if not name then
    _G.print('noname')
  end
  local named_match = lpeg_Cc(name) * patt
  return lpeg_Ct(named_match * lpeg_Cp())
end
-- Common tokens.
-- Catch-all token: any single character, tagged with the name 'default'.
any_char = token('default', any)
---
-- Creates a Scintilla style from a table of style properties.
-- The table passed in is returned itself after being given a metatable that
-- makes the `..` operator merge two style tables into a new one. Properties
-- of the right operand override those of the left operand; neither operand is
-- modified.
-- @param style_table A table of style properties.
--   Style properties available:
--     font         = [string]
--     size         = [integer]
--     bold         = [boolean]
--     italic       = [boolean]
--     underline    = [boolean]
--     fore         = [integer]*
--     back         = [integer]*
--     eolfilled    = [boolean]
--     characterset = ?
--     case         = [integer]
--     visible      = [boolean]
--     changeable   = [boolean]
--     hotspot      = [boolean]
--   * Use the value returned by `color()`.
-- @return `style_table`.
-- @usage local bold_italic = style { bold = true, italic = true }
-- @see color
function style(style_table)
  setmetatable(style_table, {
    __concat = function(t1, t2)
      -- Lua also dispatches here when only the right operand is a style, in
      -- which case t1 has no metatable. Fall back to t2's so the merged table
      -- can itself be concatenated again.
      local merged = setmetatable({}, getmetatable(t1) or getmetatable(t2))
      for k, v in pairs(t1) do merged[k] = v end
      for k, v in pairs(t2) do merged[k] = v end
      return merged
    end
  })
  return style_table
end
---
-- Creates a Scintilla color.
-- The three components are joined in blue-green-red order and the resulting
-- string is parsed as one hexadecimal number.
-- @param r The string red component of the hexadecimal color.
-- @param g The string green component of the color.
-- @param b The string blue component of the color.
-- @return The color as a number, or nil if the joined string is not valid
--   hexadecimal.
-- @usage local red = color('FF', '00', '00')
function color(r, g, b)
  local bgr = b..g..r
  return tonumber(bgr, 16)
end
---
-- Creates an LPeg pattern that matches a range of characters delimited by a
-- specific character(s).
-- This can be used to match a string, parenthesis, etc.
-- @param chars The character(s) that bound the matched range. With two
--   characters the first opens the range and the second closes it; otherwise
--   the first character does both.
-- @param escape Optional escape character. This parameter may be omitted, nil,
--   or the empty string.
-- @param end_optional Optional flag indicating whether or not an ending
--   delimiter is optional or not. If true, the range begun by the start
--   delimiter matches until an end delimiter or the end of the input is
--   reached. This flag is not consulted when a balanced range is built.
-- @param balanced Optional flag indicating whether or not a balanced range is
--   matched, like `%b` in Lua's `string.find`. This flag only applies if
--   `chars` consists of two different characters (e.g. '()').
-- @param forbidden Optional string of characters forbidden in a delimited
--   range. Each character is part of the set.
-- @usage local sq_str_noescapes = delimited_range("'")
-- @usage local sq_str_escapes = delimited_range("'", '\\', true)
-- @usage local unbalanced_parens = delimited_range('()', '\\', true)
-- @usage local balanced_parens = delimited_range('()', '\\', true, true)
function delimited_range(chars, escape, end_optional, balanced, forbidden)
  local open = chars:sub(1, 1)
  local close = open
  if #chars == 2 then close = chars:sub(2, 2) end

  -- Characters that may not appear unescaped inside the range: the closing
  -- delimiter, the forbidden set and, for balanced ranges, the opener.
  local stop = lpeg_S(close..(forbidden or '')..(balanced and open or ''))
  local content
  if escape and escape ~= '' then
    -- Either an ordinary character, or the escape followed by anything.
    content = any - (stop + escape) + escape * any
  else
    content = any - stop
  end

  if balanced and open ~= close then
    -- Recursive grammar: a range may contain complete nested ranges.
    return lpeg_P{ open * (content + lpeg_V(1))^0 * close }
  end
  local terminator = end_optional and lpeg_P(close)^-1 or close
  return open * content^0 * terminator
end
---
-- Creates an LPeg pattern that only matches the given pattern where a line
-- begins: at the very start of the input, or directly after a '\n', '\r' or
-- '\f' character.
-- @param patt The LPeg pattern to match at the beginning of a line.
-- @return The combined LPeg pattern.
-- @usage local preproc = token(l.PREPROCESSOR, #P('#') * l.starts_line('#' *
--   l.nonnewline^0))
function starts_line(patt)
  local line_breaks = { ['\n'] = true, ['\r'] = true, ['\f'] = true }
  -- Match-time check: succeeds without consuming input when `pos` is the
  -- first character of a line, and fails (returns nothing) otherwise.
  local function at_line_start(subject, pos)
    if pos == 1 or line_breaks[subject:sub(pos - 1, pos - 1)] then
      return pos
    end
  end
  return lpeg_P(at_line_start) * patt
end
---
-- Similar to `delimited_range()`, but allows for multi-character delimiters.
-- This is useful for lexers with tokens such as nested block comments. With
-- single-character delimiters, this function is identical to
-- `delimited_range(start_chars..end_chars, nil, end_optional, true)`.
-- @param start_chars The string starting a nested sequence.
-- @param end_chars The string ending a nested sequence.
-- @param end_optional Optional flag indicating whether or not an ending
--   delimiter is optional or not. If true, the range begun by the start
--   delimiter matches until an end delimiter or the end of the input is
--   reached.
-- @return The recursive LPeg pattern.
-- @usage local nested_comment = l.nested_pair('/*', '*/', true)
function nested_pair(start_chars, end_chars, end_optional)
  local closer = end_chars
  if end_optional then closer = lpeg_P(end_chars)^-1 end
  -- Any character that begins neither a start nor an end delimiter.
  local filler = any - start_chars - end_chars
  return lpeg_P{ start_chars * (filler + lpeg_V(1))^0 * closer }
end
---
-- Creates an LPeg pattern that matches a set of words.
-- The pattern reads the longest run of word characters at the current
-- position and succeeds only if that whole run is one of the given words.
-- @param words A table of words.
-- @param word_chars Optional string of additional characters considered to be
--   part of a word (default is `%w_`).
-- @param case_insensitive Optional boolean flag indicating whether the word
--   match is case-insensitive.
-- @return The LPeg pattern.
-- @usage local keyword = token(l.KEYWORD, word_match { 'foo', 'bar', 'baz' })
-- @usage local keyword = token(l.KEYWORD, word_match({ 'foo-bar', 'foo-baz',
--   'bar-foo', 'bar-baz', 'baz-foo', 'baz-bar' }, '-', true))
function word_match(words, word_chars, case_insensitive)
  local word_list = {}
  for _, word in ipairs(words) do
    -- Candidates are lower-cased before lookup when matching without regard
    -- to case, so the keys have to be lower-cased as well.
    word_list[case_insensitive and word:lower() or word] = true
  end
  local chars = '%w_'
  -- escape 'magic' characters
  -- TODO: append chars to the end so ^_ can be passed for not including '_'s
  if word_chars then
    local escaped = word_chars:gsub('([%^%]%-])', '%%%1')
    chars = chars..escaped
  end
  local word_patt = '^(['..chars..']+)'
  return lpeg_P(function(input, index)
    local _, e, word = input:find(word_patt, index)
    if word then
      if case_insensitive then word = word:lower() end
      return word_list[word] and e + 1 or nil
    end
  end)
end
---
-- Embeds a child lexer language in a parent one.
-- The child records its start, token and end rules under the parent's name,
-- is appended to the parent's list of children, and has its whitespace style
-- and token styles appended to the parent's token styles.
-- Raises an error if the child has neither built rules nor a `_rules` list.
-- @param parent The parent lexer.
-- @param child The child lexer.
-- @param start_rule The token that signals the beginning of the embedded
--   lexer.
-- @param end_rule The token that signals the end of the embedded lexer.
-- @usage embed_lexer(_M, css, css_start_rule, css_end_rule)
-- @usage embed_lexer(html, _M, php_start_rule, php_end_rule)
-- @usage embed_lexer(html, ruby, ruby_start_rule, rule_end_rule)
function embed_lexer(parent, child, start_rule, end_rule)
  -- Add child rules.
  if not child._EMBEDDEDRULES then
    ---
    -- Set of rules for an embedded lexer.
    -- For a parent lexer name, contains child's `start_rule`, `token_rule`, and
    -- `end_rule` patterns.
    -- @class table
    -- @name _EMBEDDEDRULES
    child._EMBEDDEDRULES = {}
  end
  if not child._RULES then -- creating a child lexer to be embedded
    if not child._rules then error('Cannot embed language with no rules') end
    for _, r in ipairs(child._rules) do add_rule(child, r[1], r[2]) end
  end
  child._EMBEDDEDRULES[parent._NAME] = {
    start_rule = start_rule,
    token_rule = join_tokens(child),
    end_rule = end_rule
  }
  if not parent._CHILDREN then parent._CHILDREN = {} end
  local children = parent._CHILDREN
  children[#children + 1] = child
  -- Add child styles.
  -- A parent is allowed to have no token styles of its own; create the list
  -- on demand instead of failing on a nil value.
  if not parent._tokenstyles then parent._tokenstyles = {} end
  local tokenstyles = parent._tokenstyles
  tokenstyles[#tokenstyles + 1] = { child._NAME..'_whitespace',
                                    style_whitespace }
  for _, style in ipairs(child._tokenstyles or {}) do
    tokenstyles[#tokenstyles + 1] = style
  end
  -- Add child's embedded lexers.
  -- local children2 = child._CHILDREN
  -- if children2 then
  --   for _, child2 in ipairs(children2) do
  --     child2._EMBEDDEDRULES[parent._NAME] = child2._EMBEDDEDRULES[child._NAME]
  --     children[#children + 1] = child2
  --   end
  -- end
end
-- Registered functions and constants.
-- Each function below is first declared with an empty body, which carries the
-- documented signature, and is then rebound to the global of the matching
-- CamelCase name.

---
-- Returns the integer style number at a given position.
-- @param pos The position to get the style for.
function get_style_at(pos) end
get_style_at = GetStyleAt

---
-- Returns an integer property value for a given key.
-- @param key The property key.
-- @param default Optional integer value to return if key is not set.
function get_property(key, default) end
get_property = GetProperty

---
-- Returns the fold level for a given line.
-- This level already has `SC_FOLDLEVELBASE` added to it, so you do not need to
-- add it yourself.
-- @param line The line number to get the fold level of.
function get_fold_level(line) end
get_fold_level = GetFoldLevel

---
-- Returns the indent amount of text for a given line.
-- @param line The line number to get the indent amount of.
function get_indent_amount(line) end
get_indent_amount = GetIndentAmount

-- Make the fold level constants available as fields of this module.
_M.SC_FOLDLEVELBASE, _M.SC_FOLDLEVELWHITEFLAG =
  SC_FOLDLEVELBASE, SC_FOLDLEVELWHITEFLAG
_M.SC_FOLDLEVELHEADERFLAG, _M.SC_FOLDLEVELNUMBERMASK =
  SC_FOLDLEVELHEADERFLAG, SC_FOLDLEVELNUMBERMASK
-- Load theme.
-- A `_THEME` value without any '/' or '\' is taken as the name of a stock
-- theme under `_LEXERHOME/themes/`; any other value is used as the path of
-- the theme file as given. A failure to run the theme file is printed rather
-- than raised.
if _THEME and _THEME ~= '' then
  local theme_file = _THEME
  if not theme_file:find('[/\\]') then
    theme_file = _LEXERHOME..'/themes/'.._THEME..'.lua'
  end
  local ok, errmsg = pcall(dofile, theme_file)
  -- `print` is a pattern inside this module, so use the global function.
  if not ok and errmsg then _G.print(errmsg) end
end