PageRenderTime 42ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 1ms

/Bel/Source/Tokenizers/Regex.cpp

https://bitbucket.org/beleza/beleza
C++ | 93 lines | 71 code | 20 blank | 2 comment | 8 complexity | 47dbf97786063c6feab8fed2688ed4eb MD5 | raw file
  1. #include <string>
  2. #include <vector>
  3. using namespace std;
  4. #include "Regex.h"
  5. namespace Bel
  6. {
  7. int RegexTokenizer::addTokenType ()
  8. {
  9. return mNumTokenTypes++;
  10. }
  11. int RegexTokenizer::addTokenType (const string& regex)
  12. {
  13. mTokenTypes.push_back(TokenType(mNumTokenTypes,regex));
  14. return mNumTokenTypes++;
  15. }
  16. void RegexTokenizer::addFluffType (const string& regex)
  17. {
  18. mTokenTypes.push_back(TokenType(regex));
  19. }
  20. TokenList RegexTokenizer::tokenizeInput (const string& input) const
  21. {
  22. TokenList tokens;
  23. tokens.push_back({1, 0, 0});
  24. for (int start = 0; start < input.size();)
  25. {
  26. int length = 0;
  27. TokenType type;
  28. // Greedily find the longest matching token pattern
  29. for (const TokenType& tokenType: mTokenTypes)
  30. {
  31. match_results<string::const_iterator> result;
  32. if (regex_search(input.begin()+start, input.end(), result, tokenType.mPattern, regex_constants::match_continuous))
  33. {
  34. if (result.length() > length)
  35. {
  36. length = result.length();
  37. type = tokenType;
  38. }
  39. }
  40. }
  41. // Add another token if appropriate
  42. if (length == 0) { throw new TokenError(start); }
  43. if (!type.mFluff) { tokens.push_back({type.mType, start, start+length-1}); }
  44. start += length;
  45. }
  46. tokens.push_back({0, (int)input.size(), (int)input.size()});
  47. return tokens;
  48. }
  49. void RegexTokenizer::print () const
  50. {
  51. for (const TokenType& tokenType: mTokenTypes)
  52. {
  53. tokenType.print();
  54. }
  55. }
  56. RegexTokenizer::TokenType::TokenType (const string& pattern) :
  57. mPatternString(pattern),
  58. mPattern(pattern, regex::nosubs|regex::optimize),
  59. mFluff(true)
  60. {
  61. }
  62. RegexTokenizer::TokenType::TokenType (int type, const string& pattern) :
  63. mPatternString(pattern),
  64. mPattern(pattern, regex::nosubs|regex::optimize),
  65. mType(type),
  66. mFluff(false)
  67. {
  68. }
  69. void RegexTokenizer::TokenType::print () const
  70. {
  71. printf("%d: %s\n", mType, mPatternString.c_str());
  72. }
  73. }