PageRenderTime 58ms CodeModel.GetById 28ms RepoModel.GetById 0ms app.codeStats 0ms

/tokenization.py

https://github.com/stereotype441/nurfer_python
Python | 61 lines | 54 code | 4 blank | 3 comment | 0 complexity | 3d1b5352d706aced2d2ae3ff0b154ab2 MD5 | raw file
  1. from span import Span
  2. import re
  # --- String literal scanning -------------------------------------------------

  # Scans the inside of a single-quoted/double-quoted (one-character) string.
  # Each match is one of: a backslash followed by any character (including a
  # newline, because of DOTALL), a quote character (group 'quote'), or the end
  # of a line (group 'eol', because of MULTILINE).
  SHORT_STRING_CONTENTS_REGEXP = re.compile('\\\\.|(?P<quote>\'|")|(?P<eol>$)', re.DOTALL | re.MULTILINE)

  # Scans the inside of a triple-quoted string.  Each match is either a
  # backslash followed by any character or a run of three identical quotes
  # (group 'quote').
  LONG_STRING_CONTENTS_REGEXP = re.compile('\\\\.|(?P<quote>\'{3}|"{3})', re.DOTALL)

  # Opening quote of a string: one quote, optionally followed by two more of
  # the same kind (i.e. a triple quote).
  SIMPLE_STRING_START_PATTERN = '(?P<quote>\'(\'\')?|"("")?)' # Ignores leading 'u'/'r'

  # --- Token patterns ----------------------------------------------------------

  # An identifier without a named group, so it can be embedded several times in
  # one pattern (see TARGET_PATTERN).
  ANONYMOUS_IDENTIFIER_PATTERN = r'[a-zA-Z_]\w*'
  IDENTIFIER_PATTERN = r'(?P<identifier>%s)' % ANONYMOUS_IDENTIFIER_PATTERN

  # A comment running from '#' to the end of the line (needs re.MULTILINE).
  COMMENT_PATTERN = r'(?P<comment>#.*$)'

  # Operators and punctuation.  Regex alternation is ordered and takes the first
  # alternative that matches, so longer operators MUST be listed before their
  # own prefixes ('**=' before '**', '//=' before '//', '<<=' before '<<',
  # '>>=' before '>>'); otherwise the three-character forms can never match as
  # one token.  Single characters come last, as a character class.
  SYMBOL_PATTERN = (
      r'(?P<symbol>'
      r'\*\*=|//=|<<=|>>='
      r'|!=|%=|&=|\*\*|\*=|\+=|-=|//|/=|<<|<=|<>|==|>=|>>|\^=|\|='
      r'|[!%&()*+,\-./:;<=>@\[\]^`{|}~]'
      r')'
  )

  # Matches one token: a string start, an identifier, a comment, or a symbol.
  # Match.lastgroup tells which alternative matched.
  TOKEN_REGEXP = re.compile('%s|%s|%s|%s' % (SIMPLE_STRING_START_PATTERN, IDENTIFIER_PATTERN, COMMENT_PATTERN, SYMBOL_PATTERN), re.MULTILINE)

  # --- Coarse scanning helpers -------------------------------------------------

  # Start of a comment only (just the '#'), as opposed to COMMENT_PATTERN which
  # covers the whole comment.
  COMMENT_START_PATTERN = r'(?P<comment>#)'
  STRING_OR_COMMENT_START_REGEXP = re.compile('%s|%s' % (SIMPLE_STRING_START_PATTERN, COMMENT_START_PATTERN))

  # Zero-width match at the end of any line.
  EOL_REGEXP = re.compile('$', re.MULTILINE)

  # Opening / closing brackets.  The '[' inside the character class is escaped
  # because an unescaped '[[' triggers "FutureWarning: Possible nested set" on
  # Python 3.7+; the set of matched characters is unchanged.
  NESTING_OPERATOR_REGEXP = re.compile(r'(?P<open>[\[({])|(?P<close>[\])}])')

  # Optional leading whitespace followed by one of these keywords as a whole word.
  STATEMENT_CONTINUATION_REGEXP = re.compile(r'\s*(elif|else|except|finally)\b')
  BLOCK_INTRO_REGEXP = re.compile(r'\s*(def|class)\b')

  # One or more identifiers separated by commas (whitespace allowed around the
  # commas).
  TARGET_PATTERN = r'%s(\s*,\s*%s)*' % (ANONYMOUS_IDENTIFIER_PATTERN, ANONYMOUS_IDENTIFIER_PATTERN)
  class Token(Span):
      """A Span that additionally records what kind of token it covers.

      The location and text handling come from Span; this class only adds
      the token type.  Possible types are:

          'IDENTIFIER'
          'LITERAL' (e.g. string)
          any keyword (lowercase)
          any symbol
      """

      def __init__(self, full_text, token_type, start, end):
          """Create a token of kind token_type.

          full_text, start and end are passed straight through to
          Span.__init__ (in that order); token_type is stored on the token
          and exposed through the read-only `type` property.
          """
          # Explicit base-class call is kept as-is: Span is defined in another
          # module and its class style is not visible from here.
          Span.__init__(self, full_text, start, end)
          self._token_type = token_type

      @property
      def type(self):
          """The token type given at construction time (read-only)."""
          return self._token_type

      def __str__(self):
          """Return the token's text, i.e. the `sub_text` inherited from Span."""
          return self.sub_text
  def find_string_end(text, pos, quote_type):
      """Find where a string literal ends, scanning text from index pos.

      quote_type is the literal's opening quote text.  A one-character
      quote selects the short-string scanner; anything else selects the
      long (triple-quoted) scanner.  pos is presumably the index just after
      the opening quote -- confirm against the caller.

      Returns:
          - the index just past the closing quote, when a quote equal to
            quote_type is found;
          - the index of the end of the line, when a short string reaches
            end-of-line before being closed;
          - len(text) when nothing terminates the string.
      """
      if len(quote_type) == 1:
          contents_regexp = SHORT_STRING_CONTENTS_REGEXP
      else:
          contents_regexp = LONG_STRING_CONTENTS_REGEXP
      text_length = len(text)
      cursor = pos
      while cursor <= text_length:
          found = contents_regexp.search(text, cursor)
          if found is None:
              break
          # lastgroup is 'quote', 'eol', or None (a backslash escape), so the
          # two checks below are mutually exclusive.
          matched_kind = found.lastgroup
          if matched_kind == 'eol':
              return found.start()
          if matched_kind == 'quote' and found.group() == quote_type:
              return found.end()
          # An escape sequence or a quote of the other kind: skip past it.
          cursor = found.end()
      return text_length # Unterminated string.