
/lexers/lexer.lua

https://code.google.com/p/scite-for-lua/
-- Copyright 2006-2010 Mitchell mitchell<att>caladbolg.net. See LICENSE.
---
-- Performs lexing of Scintilla documents.
module('lexer', package.seeall)
-- Markdown:
-- ## Overview
--
-- Dynamic lexers are more flexible than Scintilla's static ones. They are often
-- more readable as well. This document provides all the information necessary
-- to write a new lexer. For illustrative purposes, a Lua lexer will be created.
-- Lexers are written using Parsing Expression Grammars, or PEGs, with the Lua
-- [LPeg library][LPeg]. Please familiarize yourself with LPeg's documentation
-- before proceeding.
--
-- [LPeg]: http://www.inf.puc-rio.br/~roberto/lpeg/lpeg.html
--
-- ## Writing a Dynamic Lexer
--
-- Rather than writing a lexer from scratch, first check whether your language
-- is similar to any of the 70+ languages already supported. If so, you can copy
-- and modify that lexer, saving some time and effort.
--
-- #### Introduction
--
-- All lexers are contained in the `lexers/` directory. To begin, create a Lua
-- script with the name of your lexer and open it for editing.
--
--     $> cd lexers
--     $> textadept lua.lua
--
-- Inside the lexer, the heading should look like the following:
--
--     -- Lua LPeg lexer
--
--     local l = lexer
--     local token, word_match = l.token, l.word_match
--     local P, R, S, V = l.lpeg.P, l.lpeg.R, l.lpeg.S, l.lpeg.V
--
--     module(...)
--
-- Each lexer is a module, so the global namespace is not cluttered with lexer
-- patterns and variables. The `...` is there for a reason! Do not replace it
-- with the name of your lexer; Lua fills it in automatically.
--
-- The local variables above the module declaration give easy access to the
-- many useful functions available for creating lexers.
--
-- #### Lexer Language Structure
--
-- It is important to spend some time considering the structure of the language
-- you are creating the lexer for. What kinds of tokens does it have? Comments,
-- strings, keywords, etc.? Lua has 9 tokens: whitespace, comments, strings,
-- numbers, keywords, functions, constants, identifiers, and operators.
--
-- #### Tokens
--
-- In a lexer, a token is composed of a token type followed by an LPeg pattern.
-- Tokens are created using the [`token()`](#token) function. The `lexer` (`l`)
-- module provides a number of default token types:
--
-- * `DEFAULT`
-- * `WHITESPACE`
-- * `COMMENT`
-- * `STRING`
-- * `NUMBER`
-- * `KEYWORD`
-- * `IDENTIFIER`
-- * `OPERATOR`
-- * `ERROR`
-- * `PREPROCESSOR`
-- * `CONSTANT`
-- * `VARIABLE`
-- * `FUNCTION`
-- * `CLASS`
-- * `TYPE`
--
-- Please note you are not limited to just these token types; you can create
-- your own. If you create your own, you will have to specify how they are
-- colored; the procedure is discussed later.
--
-- A whitespace token typically looks like:
--
--     local ws = token(l.WHITESPACE, S('\t\v\f\n\r ')^1)
--
-- It can be difficult to remember that a space character is either a `\t`,
-- `\v`, `\f`, `\n`, `\r`, or ` `. The `lexer` module also provides you with a
-- shortcut for this and many other character sequences. They are:
--
-- * `any`: Matches any single character.
-- * `ascii`: Matches any ASCII character (`0`..`127`).
-- * `extend`: Matches any ASCII extended character (`0`..`255`).
-- * `alpha`: Matches any alphabetic character (`A-Z`, `a-z`).
-- * `digit`: Matches any digit (`0-9`).
-- * `alnum`: Matches any alphanumeric character (`A-Z`, `a-z`, `0-9`).
-- * `lower`: Matches any lowercase character (`a-z`).
-- * `upper`: Matches any uppercase character (`A-Z`).
-- * `xdigit`: Matches any hexadecimal digit (`0-9`, `A-F`, `a-f`).
-- * `cntrl`: Matches any control character (`0`..`31`).
-- * `graph`: Matches any graphical character (`!` to `~`).
-- * `print`: Matches any printable character (space to `~`).
-- * `punct`: Matches any punctuation character that is not alphanumeric (`!`
--   to `/`, `:` to `@`, `[` to `` ` ``, `{` to `~`).
-- * `space`: Matches any whitespace character (`\t`, `\v`, `\f`, `\n`, `\r`,
--   space).
-- * `newline`: Matches any newline characters.
-- * `nonnewline`: Matches any non-newline character.
-- * `nonnewline_esc`: Matches any non-newline character excluding newlines
--   escaped with `\\`.
-- * `dec_num`: Matches a decimal number.
-- * `hex_num`: Matches a hexadecimal number.
-- * `oct_num`: Matches an octal number.
-- * `integer`: Matches a decimal, hexadecimal, or octal number.
-- * `float`: Matches a floating point number.
-- * `word`: Matches a typical word starting with a letter or underscore and
--   then any alphanumeric or underscore characters.
--
-- The above whitespace token can be rewritten more simply as:
--
--     local ws = token(l.WHITESPACE, l.space^1)
--
-- The next Lua token is a comment. Short comments beginning with `--` are easy
-- to express with LPeg:
--
--     local line_comment = '--' * l.nonnewline^0
--
-- On the other hand, long comments are more difficult to express because they
-- have levels. See the [Lua Reference Manual][lexical_conventions] for more
-- information. As a result, a functional pattern is necessary:
--
--     local longstring = #('[[' + ('[' * P('=')^0 * '[')) *
--       P(function(input, index)
--         local level = input:match('^%[(=*)%[', index)
--         if level then
--           local _, stop = input:find(']'..level..']', index, true)
--           return stop and stop + 1 or #input + 1
--         end
--       end)
--     local block_comment = '--' * longstring
--
-- The token for a comment is then:
--
--     local comment = token(l.COMMENT, line_comment + block_comment)
--
-- [lexical_conventions]: http://www.lua.org/manual/5.1/manual.html#2.1
--
-- It is worth noting that while token names are arbitrary, you are encouraged
-- to use the ones listed in the [`tokens`](#tokens) table because a standard
-- color theme is applied to them. If you wish to create a unique token, that
-- is no problem; you can specify how it will be colored later on.
--
-- Lua strings should be easy to express because they are just characters
-- surrounded by `'` or `"` characters, right? Not quite. Lua strings contain
-- escape sequences (`\`*`char`*), so a `\'` sequence in a single-quoted string
-- does not indicate the end of the string and must be handled appropriately.
-- Fortunately, this is a common occurrence in many programming languages, so a
-- convenient function is provided: [`delimited_range()`](#delimited_range).
--
--     local sq_str = l.delimited_range("'", '\\', true)
--     local dq_str = l.delimited_range('"', '\\', true)
--
-- Lua also has multi-line strings, but they have the same format as block
-- comments. All three can be combined into a single string token:
--
--     local string = token(l.STRING, sq_str + dq_str + longstring)
--
-- Numbers are easy to express in Lua using `lexer`'s predefined patterns.
--
--     local lua_integer = P('-')^-1 * (l.hex_num + l.dec_num)
--     local number = token(l.NUMBER, l.float + lua_integer)
--
-- Keep in mind that the predefined patterns may not be completely accurate for
-- your language, so you may have to create your own variants. In the above
-- case, Lua integers do not have octal sequences, so the `l.integer` pattern is
-- not used.
--
-- Depending on the number of keywords for a particular language, a simple
-- `P(keyword1) + P(keyword2) + ... + P(keywordN)` pattern can get quite large.
-- In fact, LPeg has a limit on pattern size. Also, if the keywords are not
-- case-sensitive, additional complexity arises, so a better approach is
-- necessary. Once again, `lexer` has a shortcut function:
-- [`word_match()`](#word_match).
--
--     local keyword = token(l.KEYWORD, word_match {
--       'and', 'break', 'do', 'else', 'elseif', 'end', 'false', 'for',
--       'function', 'if', 'in', 'local', 'nil', 'not', 'or', 'repeat',
--       'return', 'then', 'true', 'until', 'while'
--     })
--
-- If keywords were case-insensitive, an additional parameter would be
-- specified in the call to [`word_match()`](#word_match); no other action is
-- needed, as the sketch below shows.
--
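-- The hypothetical variant below rewrites the keyword token above with an
-- abbreviated word list, passing `nil` for the `word_chars` parameter and
-- `true` for `case_insensitive`:
--
--     local keyword_ci = token(l.KEYWORD, word_match({
--       'and', 'break', 'do', 'end', 'if', 'while'
--     }, nil, true))
--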
-- Lua functions and constants are specified like keywords:
--
--     local func = token(l.FUNCTION, word_match {
--       'assert', 'collectgarbage', 'dofile', 'error', 'getfenv',
--       'getmetatable', 'gcinfo', 'ipairs', 'loadfile', 'loadlib',
--       'loadstring', 'next', 'pairs', 'pcall', 'print', 'rawequal',
--       'rawget', 'rawset', 'require', 'setfenv', 'setmetatable',
--       'tonumber', 'tostring', 'type', 'unpack', 'xpcall'
--     })
--
--     local constant = token(l.CONSTANT, word_match {
--       '_G', '_VERSION', 'LUA_PATH', '_LOADED', '_REQUIREDNAME', '_ALERT',
--       '_ERRORMESSAGE', '_PROMPT'
--     })
--
-- Unlike most programming languages, Lua allows an additional range of
-- characters in its identifier names (variables, functions, modules, etc.), so
-- the usual `l.word` cannot be used. Instead, identifiers are represented by:
--
--     local word = (R('AZ', 'az', '\127\255') + '_') * (l.alnum + '_')^0
--     local identifier = token(l.IDENTIFIER, word)
--
-- Finally, an operator character is one of the following:
--
--     local operator = token(l.OPERATOR, '~=' + S('+-*/%^#=<>;:,.{}[]()'))
--
-- #### Rules
--
-- Rules are just a combination of tokens. In Lua, all rules consist of a
-- single token, but other languages may have two or more tokens in a rule.
-- For example, an HTML tag consists of an element token followed by an
-- optional set of attribute tokens. This allows each part of the tag to be
-- colored distinctly.
--
-- The set of rules that comprises Lua is specified in a `_rules` table for the
-- lexer.
--
--     _rules = {
--       { 'whitespace', ws },
--       { 'keyword', keyword },
--       { 'function', func },
--       { 'constant', constant },
--       { 'identifier', identifier },
--       { 'string', string },
--       { 'comment', comment },
--       { 'number', number },
--       { 'operator', operator },
--       { 'any_char', l.any_char },
--     }
--
-- Each entry is a rule name and its associated pattern. Please note that the
-- names of the rules can be completely different from the names of the tokens
-- contained within them.
--
-- The order of the rules is important because of the nature of LPeg. LPeg
-- tries to apply the first rule to the current position in the text it is
-- matching. If there is a match, it colors that section appropriately and
-- moves on. If there is not a match, it tries the next rule, and so on.
-- Suppose instead that the `identifier` rule came before the `keyword` rule:
-- since every keyword also satisfies the requirements for being an identifier,
-- all keywords would be incorrectly colored as identifiers. This is why
-- `identifier` is placed where it is in the `_rules` table.
--
-- You might be wondering what that `any_char` is doing at the bottom of
-- `_rules`. Its purpose is to match anything not accounted for in the above
-- rules. For example, suppose the `!` character is in the input text. It will
-- not be matched by any of the first 9 rules, so without `any_char`, the text
-- would not match at all, and no coloring would occur. `any_char` matches one
-- single character and moves on. Because it is a token, not just a pattern, it
-- may be colored red (indicating a syntax error) if desired; a sketch follows.
--
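-- For instance, a lexer that wants stray characters flagged as errors could
-- define its own catch-all token using the `ERROR` token type instead of the
-- default `l.any_char` (a sketch; the token name is arbitrary):
--
--     local any_error = token(l.ERROR, l.any)
--     -- then use { 'any_char', any_error } as the last entry in _rules
--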
-- #### Summary
--
-- The above method of defining tokens and rules is sufficient for a majority
-- of lexers; a condensed skeleton follows. The `lexer` module provides many
-- useful patterns and functions for constructing a working lexer quickly and
-- efficiently. In most cases, the amount of LPeg knowledge required to write a
-- lexer is minimal.
--
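-- The skeleton below is assembled from the pieces developed above (with an
-- abbreviated keyword list); it is a minimal but complete sketch, not the full
-- Lua lexer:
--
--     -- Mini Lua LPeg lexer
--
--     local l = lexer
--     local token, word_match = l.token, l.word_match
--     local P, R, S, V = l.lpeg.P, l.lpeg.R, l.lpeg.S, l.lpeg.V
--
--     module(...)
--
--     local ws = token(l.WHITESPACE, l.space^1)
--     local comment = token(l.COMMENT, '--' * l.nonnewline^0)
--     local keyword = token(l.KEYWORD, word_match { 'and', 'break', 'do' })
--     local identifier = token(l.IDENTIFIER, l.word)
--
--     _rules = {
--       { 'whitespace', ws },
--       { 'keyword', keyword },
--       { 'identifier', identifier },
--       { 'comment', comment },
--       { 'any_char', l.any_char },
--     }
--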
-- As long as you use the default token types provided by `lexer`, you do not
-- have to specify any coloring (or styling) information in the lexer; it is
-- taken care of by the user's color theme.
--
-- The rest of this document is devoted to more complex lexer techniques.
--
-- #### Styling Tokens
--
-- The term for coloring text is styling. Just like with predefined LPeg
-- patterns in `lexer`, predefined styles are available.
--
-- * `style_nothing`: Typically used for whitespace.
-- * `style_char`: Typically used for character literals.
-- * `style_class`: Typically used for class definitions.
-- * `style_comment`: Typically used for code comments.
-- * `style_constant`: Typically used for constants.
-- * `style_definition`: Typically used for definitions.
-- * `style_error`: Typically used for erroneous syntax.
-- * `style_function`: Typically used for function definitions.
-- * `style_keyword`: Typically used for language keywords.
-- * `style_number`: Typically used for numbers.
-- * `style_operator`: Typically used for operators.
-- * `style_string`: Typically used for strings.
-- * `style_preproc`: Typically used for preprocessor statements.
-- * `style_tag`: Typically used for markup tags.
-- * `style_type`: Typically used for static types.
-- * `style_variable`: Typically used for variables.
-- * `style_embedded`: Typically used for embedded code.
-- * `style_identifier`: Typically used for identifier words.
--
-- Each style consists of a set of attributes:
--
-- + `font`: The style's font name.
-- + `size`: The style's font size.
-- + `bold`: Flag indicating whether or not the font is boldface.
-- + `italic`: Flag indicating whether or not the font is italic.
-- + `underline`: Flag indicating whether or not the font is underlined.
-- + `fore`: The color of the font face.
-- + `back`: The color of the font background.
-- + `eolfilled`: Flag indicating whether or not to color the end of the line.
-- + `characterset`: The character set of the font.
-- + `case`: The case of the font: 1 for upper case, 2 for lower case, 0 for
--   normal case.
-- + `visible`: Flag indicating whether or not the text is visible.
-- + `changeable`: Flag indicating whether the text is changeable or read-only.
-- + `hotspot`: Flag indicating whether or not the style is clickable.
--
-- Styles are created with [`style()`](#style). For example:
--
--     -- style with default theme settings
--     local style_nothing = l.style { }
--
--     -- style with bold text with default theme font
--     local style_bold = l.style { bold = true }
--
--     -- style with bold italic text with default theme font
--     local style_bold_italic = l.style { bold = true, italic = true }
--
-- The `style_bold_italic` style can be rewritten in terms of `style_bold`:
--
--     local style_bold_italic = style_bold..{ italic = true }
--
-- In this way you can build on previously defined styles without having to
-- rewrite them. Note that the previous style is left unchanged.
--
-- Style colors are different from the `#rrggbb` RGB notation you may be
-- familiar with. Instead, create a color using [`color()`](#color).
--
--     local red = l.color('FF', '00', '00')
--     local green = l.color('00', 'FF', '00')
--     local blue = l.color('00', '00', 'FF')
--
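-- Such a color can then be used in a style's `fore` or `back` attribute, e.g.
-- (a sketch building on the colors above):
--
--     local style_red_italic = l.style { fore = red, italic = true }
--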
-- As you might have guessed, `lexer` has a set of default colors.
--
-- * `green`
-- * `blue`
-- * `red`
-- * `yellow`
-- * `teal`
-- * `white`
-- * `black`
-- * `grey`
-- * `purple`
-- * `orange`
--
-- It is recommended to use them to stay consistent with a user's color theme.
--
-- Finally, styles are assigned to tokens via a `_tokenstyles` table in the
-- lexer. Styles do not have to be assigned to the default tokens; that is done
-- automatically. You only have to assign styles to tokens you create. For
-- example:
--
--     local lua = token('lua', P('lua'))
--
--     -- ... other patterns and tokens ...
--
--     _tokenstyles = {
--       { 'lua', l.style_keyword },
--     }
--
-- Each entry is the token name the style applies to and the style itself. The
-- order of styles in `_tokenstyles` does not matter.
--
-- For examples of how styles are created, please see the theme files in the
-- `lexers/themes/` folder.
--
-- #### Line Lexer
--
-- Sometimes it is advantageous to lex input text line by line rather than a
-- chunk at a time. This is particularly useful for diff, patch, or make files.
-- Put
--
--     _LEXBYLINE = true
--
-- somewhere in your lexer in order to do this.
--
-- #### Embedded Lexers
--
-- A particular advantage that dynamic lexers have over static ones is that
-- lexers can be embedded within one another very easily, requiring minimal
-- effort. There are two kinds of embedded lexers: a parent lexer that embeds
-- other child lexers in it, and a child lexer that embeds itself within a
-- parent lexer.
--
-- #### Parent Lexer with Children
--
-- An example of this kind of lexer is HTML with embedded CSS and JavaScript.
-- After creating the parent lexer, load the child lexers in it using
-- [`lexer.load()`](#load). For example:
--
--     local css = l.load('css')
--
-- There needs to be a transition from the parent HTML lexer to the child CSS
-- lexer. This is something of the form `<style type="text/css">`. Similarly,
-- the transition from child to parent is `</style>`.
--
--     local css_start_rule = #(P('<') * P('style') *
--       P(function(input, index)
--         if input:find('[^>]+type%s*=%s*(["\'])text/css%1') then
--           return index
--         end
--       end)) * tag
--     local css_end_rule = #(P('</') * P('style') * ws^0 * P('>')) * tag
--
-- where `tag` and `ws` have been previously defined in the HTML lexer.
--
-- Now the CSS lexer can be embedded using [`embed_lexer()`](#embed_lexer):
--
--     l.embed_lexer(_M, css, css_start_rule, css_end_rule)
--
-- What is `_M`? It is the parent HTML lexer object, not the string `...` or
-- `'html'`. The lexer object is needed by [`embed_lexer()`](#embed_lexer).
--
-- The same procedure can be done for JavaScript.
--
--     local js = l.load('javascript')
--
--     local js_start_rule = #(P('<') * P('script') *
--       P(function(input, index)
--         if input:find('[^>]+type%s*=%s*(["\'])text/javascript%1') then
--           return index
--         end
--       end)) * tag
--     local js_end_rule = #('</' * P('script') * ws^0 * '>') * tag
--     l.embed_lexer(_M, js, js_start_rule, js_end_rule)
--
-- #### Child Lexer Within Parent
--
-- An example of this kind of lexer is PHP embedded in HTML. After creating the
-- child lexer, load the parent lexer. As an example:
--
--     local html = l.load('hypertext')
--
-- Since HTML should be the main lexer (PHP is just a preprocessing language),
-- the following statement changes the main lexer from PHP to HTML:
--
--     _lexer = html
--
-- As in the previous section, transitions from HTML to PHP and back are
-- specified:
--
--     local php_start_rule = token('php_tag', '<?' * ('php' * l.space)^-1)
--     local php_end_rule = token('php_tag', '?>')
--
-- And PHP is embedded:
--
--     l.embed_lexer(html, _M, php_start_rule, php_end_rule)
--
-- #### Code Folding (Optional)
--
-- It is sometimes convenient to "fold", or hide, blocks of text. These blocks
-- can be functions, classes, comments, etc. A folder iterates over each line
-- of input text and assigns a fold level to it. Certain lines can be specified
-- as fold points that fold subsequent lines with a higher fold level.
--
-- In order to implement a folder, define the following function in your lexer:
--
--     function _fold(input, start_pos, start_line, start_level)
--
--     end
--
-- + `input`: The text to fold.
-- + `start_pos`: The current position in the buffer of the text (used for
--   obtaining style information from the document).
-- + `start_line`: The line number the text starts at.
-- + `start_level`: The fold level of the text at `start_line`.
--
-- The function must return a table whose indices are line numbers and whose
-- values are tables containing the fold level and optionally a fold flag.
--
-- The following Scintilla fold flags are available:
--
-- * `SC_FOLDLEVELBASE`: The initial (root) fold level.
-- * `SC_FOLDLEVELWHITEFLAG`: Flag indicating that the line is blank.
-- * `SC_FOLDLEVELHEADERFLAG`: Flag indicating that the line is a fold point.
-- * `SC_FOLDLEVELNUMBERMASK`: Mask used with `SCI_GETFOLDLEVEL(line)` to
--   obtain the fold level of a line.
--
-- Have your fold function iterate over each line, setting fold levels. You can
-- use the [`get_style_at()`](#get_style_at), [`get_property()`](#get_property),
-- [`get_fold_level()`](#get_fold_level), and
-- [`get_indent_amount()`](#get_indent_amount) functions as necessary to
-- determine the fold level for each line. The following example sets fold
-- points by changes in indentation.
--
--     function _fold(input, start_pos, start_line, start_level)
--       local folds = {}
--       local current_line = start_line
--       local prev_level = start_level
--       for indent, line in input:gmatch('([\t ]*)(.-)\r?\n') do
--         if #line > 0 then
--           local current_level = l.get_indent_amount(current_line)
--           if current_level > prev_level then -- next level
--             local i = current_line - 1
--             while folds[i] and folds[i][2] == l.SC_FOLDLEVELWHITEFLAG do
--               i = i - 1
--             end
--             if folds[i] then
--               folds[i][2] = l.SC_FOLDLEVELHEADERFLAG -- low indent
--             end
--             folds[current_line] = { current_level } -- high indent
--           elseif current_level < prev_level then -- prev level
--             if folds[current_line - 1] then
--               folds[current_line - 1][1] = prev_level -- high indent
--             end
--             folds[current_line] = { current_level } -- low indent
--           else -- same level
--             folds[current_line] = { prev_level }
--           end
--           prev_level = current_level
--         else
--           folds[current_line] = { prev_level, l.SC_FOLDLEVELWHITEFLAG }
--         end
--         current_line = current_line + 1
--       end
--       return folds
--     end
--
-- SciTE users note: do not use `get_property` for getting fold options from a
-- `.properties` file because SciTE is not set up to forward them to your
-- lexer. Instead, you can provide options that can be set at the top of the
-- lexer, as sketched below.
--
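-- For example, such an option could be a plain local flag near the top of the
-- lexer file that your `_fold()` function consults (the name is hypothetical):
--
--     local fold_comments = true -- edit here rather than in .properties
--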
-- #### Using the Lexer with SciTE
--
-- Create a `.properties` file for your lexer and `import` it in either your
-- `SciTEUser.properties` or `SciTEGlobal.properties`. The `.properties` file
-- should contain:
--
--     file.patterns.[lexer_name]=[file_patterns]
--     lexer.$(file.patterns.[lexer_name])=[lexer_name]
--
-- where [lexer\_name] is the name of your lexer (minus the `.lua` extension)
-- and [file\_patterns] is a set of file extensions matched to your lexer.
--
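-- For the Lua lexer developed in this document, the entries might look like
-- the following (the file pattern is illustrative):
--
--     file.patterns.lua=*.lua
--     lexer.$(file.patterns.lua)=lua
--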
-- Please note any styling information in `.properties` files is ignored.
--
-- #### Using the Lexer with Textadept
--
-- Put your lexer in your [`~/.textadept/`][user]`lexers/` directory. That way
-- your lexer will not be overwritten when upgrading. Also, lexers in this
-- directory override the default lexers. (A user `lua` lexer would be loaded
-- instead of the default `lua` lexer. This is convenient if you wish to tweak
-- a default lexer to your liking.) Do not forget to add a
-- [mime-type](textadept.mime_types.html) for your lexer.
--
-- [user]: http://caladbolg.net/luadoc/textadept/manual/5_FolderStructure.html
--
-- #### Optimization
--
-- Lexers can usually be optimized for speed by re-arranging rules so that the
-- most common tokens are recognized first. Keep in mind the issue raised
-- earlier: if you put similar tokens like `identifier`s before `keyword`s, the
-- latter will not be styled correctly. A reordering sketch follows.
--
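-- For a comment-heavy language, for instance, the Lua `_rules` table from
-- earlier might be reordered as follows (a sketch; note that `keyword` still
-- precedes `identifier`):
--
--     _rules = {
--       { 'whitespace', ws },
--       { 'comment', comment },
--       { 'string', string },
--       { 'keyword', keyword },
--       { 'function', func },
--       { 'constant', constant },
--       { 'identifier', identifier },
--       { 'number', number },
--       { 'operator', operator },
--       { 'any_char', l.any_char },
--     }
--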
-- #### Troubleshooting
--
-- Errors in lexers can be tricky to debug. Lua errors are printed to STDERR,
-- and `_G.print()` statements in lexers are printed to STDOUT.
--
-- #### Limitations
--
-- True embedded preprocessor language highlighting is not available. For most
-- cases this will not be noticed, but code like
--
--     <div id="<?php echo $id; ?>">
--
-- or
--
--     <div <?php if ($odd) { echo 'class="odd"'; } ?>>
--
-- will not highlight correctly.
--
-- #### Performance
--
-- There might be some slight overhead when initializing a lexer, but loading a
-- file from disk into Scintilla is usually more expensive.
--
-- On modern computer systems, I see no difference in speed between LPeg lexers
-- and Scintilla's C++ ones.
--
-- #### Risks
--
-- Poorly written lexers can crash Scintilla, so unsaved data might be lost.
-- However, these crashes have only been observed in early lexer development,
-- when syntax errors or pattern errors are present. Once the lexer actually
-- starts styling text (either correctly or incorrectly; it does not matter),
-- no crashes have occurred.
--
-- #### Acknowledgements
--
-- Thanks to Peter Odding for his [lexer post][post] on the Lua mailing list
-- that inspired me, and of course thanks to Roberto Ierusalimschy for LPeg.
--
-- [post]: http://lua-users.org/lists/lua-l/2007-04/msg00116.html
local lpeg = require 'lpeg'
local lpeg_P, lpeg_R, lpeg_S, lpeg_V = lpeg.P, lpeg.R, lpeg.S, lpeg.V
local lpeg_Ct, lpeg_Cc, lpeg_Cp = lpeg.Ct, lpeg.Cc, lpeg.Cp
local lpeg_match = lpeg.match
package.path = _LEXERHOME..'/?.lua'
-- Adds a rule to a lexer's current ordered list of rules.
-- @param lexer The lexer to add the given rule to.
-- @param id The name associated with this rule. It is used for other lexers
--   to access this particular rule from the lexer's `_RULES` table. It does
--   not have to be the same as the name passed to `token`.
-- @param rule The LPeg pattern of the rule.
local function add_rule(lexer, id, rule)
  if not lexer._RULES then
    ---
    -- List of rule names with associated LPeg patterns for a specific lexer.
    -- It is accessible to other lexers for embedded lexer applications.
    -- @class table
    -- @name _RULES
    lexer._RULES = {}
    -- Contains an ordered list (by numerical index) of rule names. This is
    -- used in conjunction with lexer._RULES for building _TOKENRULE.
    lexer._RULEORDER = {}
  end
  lexer._RULES[id] = rule
  lexer._RULEORDER[#lexer._RULEORDER + 1] = id
end
-- Adds a new Scintilla style to Scintilla.
-- @param lexer The lexer to add the given style to.
-- @param token_name The name of the token associated with this style.
-- @param style A Scintilla style created from style().
-- @see style
local function add_style(lexer, token_name, style)
  local len = lexer._STYLES.len
  if len == 32 then len = len + 8 end -- skip predefined styles
  if len >= 128 then _G.print('Too many styles defined (128 MAX)') end
  lexer._TOKENS[token_name] = len
  lexer._STYLES[len] = style
  lexer._STYLES.len = len + 1
end
-- (Re)constructs lexer._TOKENRULE.
-- @param lexer The parent lexer.
local function join_tokens(lexer)
  local patterns, order = lexer._RULES, lexer._RULEORDER
  local token_rule = patterns[order[1]]
  for i = 2, #order do token_rule = token_rule + patterns[order[i]] end
  lexer._TOKENRULE = token_rule
  return lexer._TOKENRULE
end

-- (Re)constructs lexer._GRAMMAR.
-- @param lexer The parent lexer.
-- @param initial_rule The name of the rule to start lexing with. Defaults to
--   lexer._NAME. Multilang lexers use this to start with a child rule if
--   necessary.
local function build_grammar(lexer, initial_rule)
  local token_rule = join_tokens(lexer)
  local children = lexer._CHILDREN
  if children then
    local lexer_name = lexer._NAME
    if not initial_rule then initial_rule = lexer_name end
    local grammar = { initial_rule, [lexer_name] = token_rule^0 }
    for _, child in ipairs(children) do
      local child_name = child._NAME
      local embedded_child = '_'..child_name
      local rules = child._EMBEDDEDRULES[lexer_name]
      grammar[embedded_child] = rules.start_rule * (-rules.end_rule *
        rules.token_rule)^0 * rules.end_rule^-1
      token_rule = lpeg_V(embedded_child) + token_rule
      grammar[child_name] = (-rules.end_rule * rules.token_rule)^0 *
        rules.end_rule^-1 * lpeg_V(lexer_name)
    end
    grammar[lexer_name] = token_rule^0
    lexer._INITIALRULE = initial_rule
    lexer._GRAMMAR = lpeg_Ct(lpeg_P(grammar))
  else
    lexer._GRAMMAR = lpeg_Ct(token_rule^0)
  end
end
-- Default tokens.
-- Contains token identifiers and associated style numbers.
-- @class table
-- @name tokens
-- @field default The default type (0).
-- @field whitespace The whitespace type (1).
-- @field comment The comment type (2).
-- @field string The string type (3).
-- @field number The number type (4).
-- @field keyword The keyword type (5).
-- @field identifier The identifier type (6).
-- @field operator The operator type (7).
-- @field error The error type (8).
-- @field preprocessor The preprocessor type (9).
-- @field constant The constant type (10).
-- @field variable The variable type (11).
-- @field function The function type (12).
-- @field class The class type (13).
-- @field type The type type (14).
local tokens = {
  default = 0,
  whitespace = 1,
  comment = 2,
  string = 3,
  number = 4,
  keyword = 5,
  identifier = 6,
  operator = 7,
  error = 8,
  preprocessor = 9,
  constant = 10,
  variable = 11,
  ['function'] = 12,
  class = 13,
  type = 14,
}
local string_upper = string.upper
for k, v in pairs(tokens) do _M[string_upper(k)] = k end
---
-- Initializes the specified lexer.
-- @param lexer_name The name of the lexing language.
function load(lexer_name)
  _M.WHITESPACE = lexer_name..'_whitespace'
  local lexer = require(lexer_name or 'null')
  if not lexer then error('Lexer '..lexer_name..' does not exist') end
  lexer._TOKENS = tokens
  lexer._STYLES = {
    [0] = style_nothing,
    [1] = style_whitespace,
    [2] = style_comment,
    [3] = style_string,
    [4] = style_number,
    [5] = style_keyword,
    [6] = style_identifier,
    [7] = style_operator,
    [8] = style_error,
    [9] = style_preproc,
    [10] = style_constant,
    [11] = style_variable,
    [12] = style_function,
    [13] = style_class,
    [14] = style_type,
    len = 15,
    -- Predefined styles.
    [32] = style_default,
    [33] = style_line_number,
    [34] = style_bracelight,
    [35] = style_bracebad,
    [36] = style_controlchar,
    [37] = style_indentguide,
    [38] = style_calltip,
  }
  if lexer._lexer then
    local l, _r, _s = lexer._lexer, lexer._rules, lexer._tokenstyles
    if not l._tokenstyles then l._tokenstyles = {} end
    for _, r in ipairs(_r or {}) do
      -- Prevent rule id clashes.
      l._rules[#l._rules + 1] = { lexer._NAME..'_'..r[1], r[2] }
    end
    for _, s in ipairs(_s or {}) do l._tokenstyles[#l._tokenstyles + 1] = s end
    -- Each lexer that is loaded with l.load() has its _STYLES modified through
    -- add_style(). Reset _lexer's _STYLES accordingly.
    -- For example: RHTML loads HTML (which loads CSS and JavaScript). CSS's
    -- styles are added to css._STYLES and JS's styles are added to js._STYLES.
    -- HTML adds its styles to html._STYLES as well as CSS's and JS's styles.
    -- RHTML adds its styles, HTML's styles, CSS's styles, and JS's styles to
    -- rhtml._STYLES. The problem is that rhtml == _lexer == html. Therefore
    -- html._STYLES would contain duplicate styles. Compensate by setting
    -- html._STYLES to rhtml._STYLES.
    l._STYLES = lexer._STYLES
    lexer = l
  end
  if lexer._rules then
    for _, s in ipairs(lexer._tokenstyles or {}) do
      add_style(lexer, s[1], s[2])
    end
    for _, r in ipairs(lexer._rules) do add_rule(lexer, r[1], r[2]) end
    build_grammar(lexer)
  end
  add_style(lexer, lexer._NAME..'_whitespace', style_whitespace)
  _G._LEXER = lexer
  return lexer
end
---
-- Lexes the given text.
-- Called by LexLPeg.cxx; do not call from Lua.
-- If the lexer has a _LEXBYLINE flag set, the text is lexed one line at a
-- time. Otherwise the text is lexed as a whole.
-- @param text The text to lex.
-- @param init_style The current style. Multilang lexers use this to determine
--   which language to start lexing in.
function lex(text, init_style)
  local lexer = _G._LEXER
  if not lexer._GRAMMAR then return {} end
  if not lexer._LEXBYLINE then
    -- For multilang lexers, build a new grammar whose initial_rule is the
    -- current language.
    if lexer._CHILDREN then
      for style, style_num in pairs(lexer._TOKENS) do
        if style_num == init_style then
          local lexer_name = style:match('^(.+)_whitespace') or lexer._NAME
          if lexer._INITIALRULE ~= lexer_name then
            build_grammar(lexer, lexer_name)
          end
          break
        end
      end
    end
    return lpeg_match(lexer._GRAMMAR, text)
  else
    local tokens = {}
    local function append(tokens, line_tokens, offset)
      for _, token in ipairs(line_tokens) do
        token[2] = token[2] + offset
        tokens[#tokens + 1] = token
      end
    end
    local offset = 0
    local grammar = lexer._GRAMMAR
    for line in text:gmatch('[^\r\n]*[\r\n]*') do
      local line_tokens = lpeg_match(grammar, line)
      if line_tokens then append(tokens, line_tokens, offset) end
      offset = offset + #line
      -- Use the default style to the end of the line if none was specified.
      if tokens[#tokens][2] ~= offset then
        tokens[#tokens + 1] = { 'default', offset + 1 }
      end
    end
    return tokens
  end
end
---
-- Folds the given text.
-- Called by LexLPeg.cxx; do not call from Lua.
-- If the current lexer has no _fold function, folding by indentation is
-- performed if the 'fold.by.indentation' property is set.
-- @param text The document text to fold.
-- @param start_pos The position in the document the text starts at.
-- @param start_line The line number the text starts on.
-- @param start_level The fold level the text starts on.
-- @return Table of fold levels.
function fold(text, start_pos, start_line, start_level)
  local folds = {}
  local lexer = _G._LEXER
  if lexer._fold then
    return lexer._fold(text, start_pos, start_line, start_level)
  elseif GetProperty('fold.by.indentation', 1) == 1 then
    local GetIndentAmount, GetFoldLevel, SetFoldLevel =
      GetIndentAmount, GetFoldLevel, SetFoldLevel
    local SC_FOLDLEVELHEADERFLAG, SC_FOLDLEVELWHITEFLAG =
      SC_FOLDLEVELHEADERFLAG, SC_FOLDLEVELWHITEFLAG
    -- Indentation-based folding.
    local current_line = start_line
    local prev_level = start_level
    for indent, line in text:gmatch('([\t ]*)(.-)\r?\n') do
      if #line > 0 then
        local current_level = GetIndentAmount(current_line)
        if current_level > prev_level then -- next level
          local i = current_line - 1
          while folds[i] and folds[i][2] == SC_FOLDLEVELWHITEFLAG do
            i = i - 1
          end
          if folds[i] then
            folds[i][2] = SC_FOLDLEVELHEADERFLAG -- low indent
          end
          folds[current_line] = { current_level } -- high indent
        elseif current_level < prev_level then -- prev level
          if folds[current_line - 1] then
            folds[current_line - 1][1] = prev_level -- high indent
          end
          folds[current_line] = { current_level } -- low indent
        else -- same level
          folds[current_line] = { prev_level }
        end
        prev_level = current_level
      else
        folds[current_line] = { prev_level, SC_FOLDLEVELWHITEFLAG }
      end
      current_line = current_line + 1
    end
    return folds
  end
end
-- The following are utility functions lexers will have access to.
-- Common patterns.
any = lpeg_P(1)
ascii = lpeg_R('\000\127')
extend = lpeg_R('\000\255')
alpha = lpeg_R('AZ', 'az')
digit = lpeg_R('09')
alnum = lpeg_R('AZ', 'az', '09')
lower = lpeg_R('az')
upper = lpeg_R('AZ')
xdigit = lpeg_R('09', 'AF', 'af')
cntrl = lpeg_R('\000\031')
graph = lpeg_R('!~')
print = lpeg_R(' ~')
punct = lpeg_R('!/', ':@', '[`', '{~')
space = lpeg_S('\t\v\f\n\r ')
newline = lpeg_S('\r\n\f')^1
nonnewline = 1 - newline
nonnewline_esc = 1 - (newline + '\\') + '\\' * any
dec_num = digit^1
hex_num = '0' * lpeg_S('xX') * xdigit^1
oct_num = '0' * lpeg_R('07')^1
integer = lpeg_S('+-')^-1 * (hex_num + oct_num + dec_num)
float = lpeg_S('+-')^-1 *
  (digit^0 * '.' * digit^1 + digit^1 * '.' * digit^0 + digit^1) *
  lpeg_S('eE') * lpeg_S('+-')^-1 * digit^1
word = (alpha + '_') * (alnum + '_')^0
---
-- Creates an LPeg capture table index with the name and position of the token.
-- @param name The name of the token. If this name is not in `l.tokens` then
--   you will have to specify a style for it in `lexer._tokenstyles`.
-- @param patt The LPeg pattern associated with the token.
-- @usage local ws = token(l.WHITESPACE, l.space^1)
-- @usage php_start_rule = token('php_tag', '<?' * ('php' * l.space)^-1)
function token(name, patt)
  if not name then _G.print('noname') end
  return lpeg_Ct(lpeg_Cc(name) * patt * lpeg_Cp())
end

-- Common tokens.
any_char = token('default', any)
---
-- Creates a Scintilla style from a table of style properties.
-- @param style_table A table of style properties.
-- Style properties available:
--   font = [string]
--   size = [integer]
--   bold = [boolean]
--   italic = [boolean]
--   underline = [boolean]
--   fore = [integer]*
--   back = [integer]*
--   eolfilled = [boolean]
--   characterset = ?
--   case = [integer]
--   visible = [boolean]
--   changeable = [boolean]
--   hotspot = [boolean]
-- * Use the value returned by `color()`.
-- @usage local bold_italic = style { bold = true, italic = true }
-- @see color
function style(style_table)
  setmetatable(style_table, {
    __concat = function(t1, t2)
      local t = setmetatable({}, getmetatable(t1)) -- duplicate t1
      for k, v in pairs(t1) do t[k] = v end
      for k, v in pairs(t2) do t[k] = v end
      return t
    end
  })
  return style_table
end
---
-- Creates a Scintilla color.
-- @param r The string red component of the hexadecimal color.
-- @param g The string green component of the color.
-- @param b The string blue component of the color.
-- @usage local red = color('FF', '00', '00')
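-- Scintilla stores colors in BGR order, hence the reversed concatenation
-- below.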
function color(r, g, b) return tonumber(b..g..r, 16) end
---
-- Creates an LPeg pattern that matches a range of characters delimited by a
-- specific character(s).
-- This can be used to match a string, parenthesis, etc.
-- @param chars The character(s) that bound the matched range.
-- @param escape Optional escape character. This parameter may be omitted, nil,
--   or the empty string.
-- @param end_optional Optional flag indicating whether or not an ending
--   delimiter is optional. If true, the range begun by the start delimiter
--   matches until an end delimiter or the end of the input is reached.
-- @param balanced Optional flag indicating whether or not a balanced range is
--   matched, like `%b` in Lua's `string.find`. This flag only applies if
--   `chars` consists of two different characters (e.g. '()').
-- @param forbidden Optional string of characters forbidden in a delimited
--   range. Each character is part of the set.
-- @usage local sq_str_noescapes = delimited_range("'")
-- @usage local sq_str_escapes = delimited_range("'", '\\', true)
-- @usage local unbalanced_parens = delimited_range('()', '\\', true)
-- @usage local balanced_parens = delimited_range('()', '\\', true, true)
function delimited_range(chars, escape, end_optional, balanced, forbidden)
  local s = chars:sub(1, 1)
  local e = #chars == 2 and chars:sub(2, 2) or s
  local range
  local b = balanced and s or ''
  local f = forbidden or ''
  if not escape or escape == '' then
    local invalid = lpeg_S(e..f..b)
    range = any - invalid
  else
    local invalid = lpeg_S(e..f..b) + escape
    range = any - invalid + escape * any
  end
  if balanced and s ~= e then
    return lpeg_P{ s * (range + lpeg_V(1))^0 * e }
  else
    if end_optional then e = lpeg_P(e)^-1 end
    return s * range^0 * e
  end
end
---
-- Creates an LPeg pattern from a given pattern that matches the beginning of a
-- line and returns it.
-- @param patt The LPeg pattern to match at the beginning of a line.
-- @usage local preproc = token(l.PREPROCESSOR, #P('#') * l.starts_line('#' *
--   l.nonnewline^0))
function starts_line(patt)
  return lpeg_P(function(input, idx)
    if idx == 1 then return idx end
    local char = input:sub(idx - 1, idx - 1)
    if char == '\n' or char == '\r' or char == '\f' then return idx end
  end) * patt
end
---
-- Similar to `delimited_range()`, but allows for multi-character delimiters.
-- This is useful for lexers with tokens such as nested block comments. With
-- single-character delimiters, this function is identical to
-- `delimited_range(start_chars..end_chars, nil, end_optional, true)`.
-- @param start_chars The string starting a nested sequence.
-- @param end_chars The string ending a nested sequence.
-- @param end_optional Optional flag indicating whether or not an ending
--   delimiter is optional. If true, the range begun by the start delimiter
--   matches until an end delimiter or the end of the input is reached.
-- @usage local nested_comment = l.nested_pair('/*', '*/', true)
function nested_pair(start_chars, end_chars, end_optional)
  local s, e = start_chars, end_optional and lpeg_P(end_chars)^-1 or end_chars
  return lpeg_P{ s * (any - s - end_chars + lpeg_V(1))^0 * e }
end
---
-- Creates an LPeg pattern that matches a set of words.
-- @param words A table of words.
-- @param word_chars Optional string of additional characters considered to be
--   part of a word (default is `%w_`).
-- @param case_insensitive Optional boolean flag indicating whether the word
--   match is case-insensitive.
-- @usage local keyword = token(l.KEYWORD, word_match { 'foo', 'bar', 'baz' })
-- @usage local keyword = token(l.KEYWORD, word_match({ 'foo-bar', 'foo-baz',
--   'bar-foo', 'bar-baz', 'baz-foo', 'baz-bar' }, '-', true))
function word_match(words, word_chars, case_insensitive)
  local word_list = {}
  for _, word in ipairs(words) do word_list[word] = true end
  local chars = '%w_'
  -- Escape 'magic' characters.
  -- TODO: append chars to the end so ^_ can be passed for not including '_'s.
  if word_chars then chars = chars..word_chars:gsub('([%^%]%-])', '%%%1') end
  return lpeg_P(function(input, index)
    local s, e, word = input:find('^(['..chars..']+)', index)
    if word then
      if case_insensitive then word = word:lower() end
      return word_list[word] and e + 1 or nil
    end
  end)
end
---
-- Embeds a child lexer language in a parent one.
-- @param parent The parent lexer.
-- @param child The child lexer.
-- @param start_rule The token that signals the beginning of the embedded
--   lexer.
-- @param end_rule The token that signals the end of the embedded lexer.
-- @usage embed_lexer(_M, css, css_start_rule, css_end_rule)
-- @usage embed_lexer(html, _M, php_start_rule, php_end_rule)
-- @usage embed_lexer(html, ruby, ruby_start_rule, ruby_end_rule)
function embed_lexer(parent, child, start_rule, end_rule)
  -- Add child rules.
  if not child._EMBEDDEDRULES then
    ---
    -- Set of rules for an embedded lexer.
    -- For a parent lexer name, contains the child's `start_rule`,
    -- `token_rule`, and `end_rule` patterns.
    -- @class table
    -- @name _EMBEDDEDRULES
    child._EMBEDDEDRULES = {}
  end
  if not child._RULES then -- creating a child lexer to be embedded
    if not child._rules then error('Cannot embed language with no rules') end
    for _, r in ipairs(child._rules) do add_rule(child, r[1], r[2]) end
  end
  child._EMBEDDEDRULES[parent._NAME] = {
    ['start_rule'] = start_rule,
    token_rule = join_tokens(child),
    ['end_rule'] = end_rule
  }
  if not parent._CHILDREN then parent._CHILDREN = {} end
  local children = parent._CHILDREN
  children[#children + 1] = child
  -- Add child styles.
  local tokenstyles = parent._tokenstyles
  tokenstyles[#tokenstyles + 1] = { child._NAME..'_whitespace',
    style_whitespace }
  for _, style in ipairs(child._tokenstyles or {}) do
    tokenstyles[#tokenstyles + 1] = style
  end
  -- Add child's embedded lexers.
  -- local children2 = child._CHILDREN
  -- if children2 then
  --   for _, child2 in ipairs(children2) do
  --     child2._EMBEDDEDRULES[parent._NAME] =
  --       child2._EMBEDDEDRULES[child._NAME]
  --     children[#children + 1] = child2
  --   end
  -- end
end
-- Registered functions and constants.
---
-- Returns the integer style number at a given position.
-- @param pos The position to get the style for.
function get_style_at(pos) end
get_style_at = GetStyleAt
---
-- Returns an integer property value for a given key.
-- @param key The property key.
-- @param default Optional integer value to return if key is not set.
function get_property(key, default) end
get_property = GetProperty
---
-- Returns the fold level for a given line.
-- This level already has `SC_FOLDLEVELBASE` added to it, so you do not need to
-- add it yourself.
-- @param line The line number to get the fold level of.
function get_fold_level(line) end
get_fold_level = GetFoldLevel
---
-- Returns the indent amount of text for a given line.
-- @param line The line number to get the indent amount of.
function get_indent_amount(line) end
get_indent_amount = GetIndentAmount
_M.SC_FOLDLEVELBASE = SC_FOLDLEVELBASE
_M.SC_FOLDLEVELWHITEFLAG = SC_FOLDLEVELWHITEFLAG
_M.SC_FOLDLEVELHEADERFLAG = SC_FOLDLEVELHEADERFLAG
_M.SC_FOLDLEVELNUMBERMASK = SC_FOLDLEVELNUMBERMASK
-- Load theme.
if _THEME and _THEME ~= '' then
  local ret, errmsg
  if not _THEME:find('[/\\]') then -- name of stock theme
    ret, errmsg = pcall(dofile, _LEXERHOME..'/themes/'.._THEME..'.lua')
  else -- absolute path of a theme
    ret, errmsg = pcall(dofile, _THEME)
  end
  if not ret and errmsg then _G.print(errmsg) end
end