/Bel/Source/Tokenizers/Regex.cpp
C++ | 93 lines | 71 code | 20 blank | 2 comment | 8 complexity | 47dbf97786063c6feab8fed2688ed4eb MD5 | raw file
- #include <string>
- #include <vector>
- using namespace std;
- #include "Regex.h"
- namespace Bel
- {
- int RegexTokenizer::addTokenType ()
- {
- return mNumTokenTypes++;
- }
- int RegexTokenizer::addTokenType (const string& regex)
- {
- mTokenTypes.push_back(TokenType(mNumTokenTypes,regex));
- return mNumTokenTypes++;
- }
- void RegexTokenizer::addFluffType (const string& regex)
- {
- mTokenTypes.push_back(TokenType(regex));
- }
- TokenList RegexTokenizer::tokenizeInput (const string& input) const
- {
- TokenList tokens;
- tokens.push_back({1, 0, 0});
- for (int start = 0; start < input.size();)
- {
- int length = 0;
- TokenType type;
- // Greedily find the longest matching token pattern
- for (const TokenType& tokenType: mTokenTypes)
- {
- match_results<string::const_iterator> result;
- if (regex_search(input.begin()+start, input.end(), result, tokenType.mPattern, regex_constants::match_continuous))
- {
- if (result.length() > length)
- {
- length = result.length();
- type = tokenType;
- }
- }
- }
- // Add another token if appropriate
- if (length == 0) { throw new TokenError(start); }
- if (!type.mFluff) { tokens.push_back({type.mType, start, start+length-1}); }
- start += length;
- }
- tokens.push_back({0, (int)input.size(), (int)input.size()});
- return tokens;
- }
- void RegexTokenizer::print () const
- {
- for (const TokenType& tokenType: mTokenTypes)
- {
- tokenType.print();
- }
- }
- RegexTokenizer::TokenType::TokenType (const string& pattern) :
- mPatternString(pattern),
- mPattern(pattern, regex::nosubs|regex::optimize),
- mFluff(true)
- {
- }
- RegexTokenizer::TokenType::TokenType (int type, const string& pattern) :
- mPatternString(pattern),
- mPattern(pattern, regex::nosubs|regex::optimize),
- mType(type),
- mFluff(false)
- {
- }
- void RegexTokenizer::TokenType::print () const
- {
- printf("%d: %s\n", mType, mPatternString.c_str());
- }
- }