
/lib/linguist/tokenizer.rb

https://gitlab.com/Aaeinstein54/linguist
require 'strscan'

module Linguist
  # Generic programming language tokenizer.
  #
  # Tokens are designed for use in the language Bayes classifier.
  # It strips any data strings or comments and preserves significant
  # language symbols.
  class Tokenizer
    # Public: Extract tokens from data
    #
    # data - String to tokenize
    #
    # Returns Array of token Strings.
    def self.tokenize(data)
      new.extract_tokens(data)
    end
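
    # For instance, line comments and quoted string contents should be
    # stripped, while significant punctuation is kept:
    #
    #   Tokenizer.tokenize("// note\nputs('hi')")
    #   # => ["puts", "(", ")"]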

    # Read up to 100KB
    BYTE_LIMIT = 100_000

    # Start state on token, ignore anything till the next newline
    SINGLE_LINE_COMMENTS = [
      '//', # C
      '--', # Ada, Haskell, AppleScript
      '#',  # Ruby
      '%',  # TeX
      '"',  # Vim
    ]

    # Start state on opening token, ignore anything until the closing
    # token is reached.
    MULTI_LINE_COMMENTS = [
      ['/*', '*/'],    # C
      ['<!--', '-->'], # XML
      ['{-', '-}'],    # Haskell
      ['(*', '*)'],    # Coq
      ['"""', '"""'],  # Python
      ["'''", "'''"]   # Python
    ]

    START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
      "\s*#{Regexp.escape(c)} "
    }.join("|"))

    START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
      Regexp.escape(c[0])
    }.join("|"))
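
    # For example, START_MULTI_LINE_COMMENT should match the opening
    # token of a C block comment:
    #
    #   START_MULTI_LINE_COMMENT.match("/* note */")[0]
    #   # => "/*"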

    # Internal: Extract generic tokens from data.
    #
    # data - String to scan.
    #
    # Examples
    #
    #   extract_tokens("printf('Hello')")
    #   # => ['printf', '(', ')']
    #
    # Returns Array of token Strings.
    def extract_tokens(data)
      s = StringScanner.new(data)

      tokens = []
      until s.eos?
        break if s.pos >= BYTE_LIMIT

        if token = s.scan(/^#!.+$/)
          if name = extract_shebang(token)
            tokens << "SHEBANG#!#{name}"
          end

        # Single line comment
        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
          # tokens << token.strip
          s.skip_until(/\n|\Z/)

        # Multiline comments
        elsif token = s.scan(START_MULTI_LINE_COMMENT)
          # tokens << token
          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
          # tokens << close_token

        # Skip single or double quoted strings
        elsif s.scan(/"/)
          if s.peek(1) == "\""
            s.getch
          else
            s.skip_until(/(?<!\\)"/)
          end
        elsif s.scan(/'/)
          if s.peek(1) == "'"
            s.getch
          else
            s.skip_until(/(?<!\\)'/)
          end

        # Skip number literals
        elsif s.scan(/(0x\h(\h|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)/)

        # SGML style brackets
        elsif token = s.scan(/<[^\s<>][^<>]*>/)
          extract_sgml_tokens(token).each { |t| tokens << t }

        # Common programming punctuation
        elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
          tokens << token

        # Regular token
        elsif token = s.scan(/[\w\.@#\/\*]+/)
          tokens << token

        # Common operators
        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
          tokens << token

        else
          s.getch
        end
      end

      tokens
    end
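
    # Number literals are skipped without emitting a token, and "=" is
    # matched by no branch above, so an assignment should reduce to:
    #
    #   extract_tokens("x = 0x1f + 2.5")
    #   # => ["x", "+"]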

    # Internal: Extract normalized shebang command token.
    #
    # Examples
    #
    #   extract_shebang("#!/usr/bin/ruby")
    #   # => "ruby"
    #
    #   extract_shebang("#!/usr/bin/env node")
    #   # => "node"
    #
    #   extract_shebang("#!/usr/bin/env A=B foo=bar awk -f")
    #   # => "awk"
    #
    # Returns String token or nil if it couldn't be parsed.
    def extract_shebang(data)
      s = StringScanner.new(data)

      if path = s.scan(/^#!\s*\S+/)
        script = path.split('/').last
        if script == 'env'
          s.scan(/\s+/)
          s.scan(/.*=[^\s]+\s+/)
          script = s.scan(/\S+/)
        end
        script = script[/[^\d]+/, 0] if script
        return script
      end

      nil
    end
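
    # The digit-stripping step should also normalize versioned
    # interpreter names:
    #
    #   extract_shebang("#!/usr/bin/python2.7")
    #   # => "python"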

    # Internal: Extract tokens from inside SGML tag.
    #
    # data - SGML tag String.
    #
    # Examples
    #
    #   extract_sgml_tokens("<a href='' class=foo>")
    #   # => ["<a>", "href=", "class="]
    #
    # Returns Array of token Strings.
    def extract_sgml_tokens(data)
      s = StringScanner.new(data)

      tokens = []

      until s.eos?
        # Emit start token
        if token = s.scan(/<\/?[^\s>]+/)
          tokens << "#{token}>"

        # Emit attributes with trailing =
        elsif token = s.scan(/\w+=/)
          tokens << token

          # Then skip over attribute value
          if s.scan(/"/)
            s.skip_until(/[^\\]"/)
          elsif s.scan(/'/)
            s.skip_until(/[^\\]'/)
          else
            s.skip_until(/\w+/)
          end

        # Emit lone attributes
        elsif token = s.scan(/\w+/)
          tokens << token

        # Stop at the end of the tag
        elsif s.scan(/>/)
          s.terminate

        else
          s.getch
        end
      end

      tokens
    end
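
    # A further extract_sgml_tokens example: closing tags take the same
    # start-token branch and should be emitted whole.
    #
    #   extract_sgml_tokens("</div>")
    #   # => ["</div>"]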
  end
end
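
A quick end-to-end check, assuming the linguist gem (or this file) is on the load path; the expected output below follows from the branch rules in extract_tokens:

require 'linguist/tokenizer'

# Comments, string contents and number literals vanish; structural tokens remain.
Linguist::Tokenizer.tokenize("#include <stdio.h>\nint main() { return 0; }")
# => ["#include", "<stdio.h>", "int", "main", "(", ")", "{", "return", ";", "}"]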