/Library/Regex/Regex.cpp
C++ | 678 lines | 598 code | 60 blank | 20 comment | 64 complexity | 151040f3345d8f00e7f4b19287b32d4e MD5 | raw file
- #include "Regex.h"
- #include "RegexExpression.h"
- #include "RegexPure.h"
- #include "RegexRich.h"
- #include "..\Collections\OperationCopyFrom.h"
-
- namespace vl
- {
- namespace regex
- {
- using namespace collections;
- using namespace regex_internal;
-
- /***********************************************************************
- RegexString
- ***********************************************************************/
-
- RegexString::RegexString(vint _start)
- :start(_start)
- ,length(0)
- {
- }
-
- RegexString::RegexString(const WString& _string, vint _start, vint _length)
- :value(_length==0?L"":_string.Sub(_start, _length))
- ,start(_start)
- ,length(_length)
- {
- }
-
- vint RegexString::Start()const
- {
- return start;
- }
-
- vint RegexString::Length()const
- {
- return length;
- }
-
- const WString& RegexString::Value()const
- {
- return value;
- }
-
- bool RegexString::operator==(const RegexString& string)const
- {
- return start==string.start && length==string.length && value==string.value;
- }
-
- /***********************************************************************
- RegexMatch
- ***********************************************************************/
-
- RegexMatch::RegexMatch(const WString& _string, PureResult* _result)
- :success(true)
- ,result(_string, _result->start, _result->length)
- {
- }
-
- RegexMatch::RegexMatch(const WString& _string, RichResult* _result, RichInterpretor* _rich)
- :success(true)
- ,result(_string, _result->start, _result->length)
- {
- for(vint i=0;i<_result->captures.Count();i++)
- {
- CaptureRecord& capture=_result->captures[i];
- if(capture.capture==-1)
- {
- captures.Add(RegexString(_string, capture.start, capture.length));
- }
- else
- {
- groups.Add(_rich->CaptureNames()[capture.capture], RegexString(_string, capture.start, capture.length));
- }
- }
- }
-
- RegexMatch::RegexMatch(const RegexString& _result)
- :success(false)
- ,result(_result)
- {
- }
-
- bool RegexMatch::Success()const
- {
- return success;
- }
-
- const RegexString& RegexMatch::Result()const
- {
- return result;
- }
-
- const RegexMatch::CaptureList& RegexMatch::Captures()const
- {
- return captures.Wrap();
- }
-
- const RegexMatch::CaptureGroup& RegexMatch::Groups()const
- {
- return groups.Wrap();
- }
-
- /***********************************************************************
- Regex
- ***********************************************************************/
-
- void Regex::Process(const WString& text, bool keepEmpty, bool keepSuccess, bool keepFail, RegexMatch::List& matches)const
- {
- if(rich)
- {
- const wchar_t* start=text.Buffer();
- const wchar_t* input=start;
- RichResult result;
- while(rich->Match(input, start, result))
- {
- vint offset=input-start;
- if(keepFail)
- {
- if(result.start>offset || keepEmpty)
- {
- matches.Add(new RegexMatch(RegexString(text, offset, result.start-offset)));
- }
- }
- if(keepSuccess)
- {
- matches.Add(new RegexMatch(text, &result, rich));
- }
- input=start+result.start+result.length;
- }
- if(keepFail)
- {
- vint remain=input-start;
- vint length=text.Length()-remain;
- if(length || keepEmpty)
- {
- matches.Add(new RegexMatch(RegexString(text, remain, length)));
- }
- }
- }
- else
- {
- const wchar_t* start=text.Buffer();
- const wchar_t* input=start;
- PureResult result;
- while(pure->Match(input, start, result))
- {
- vint offset=input-start;
- if(keepFail)
- {
- if(result.start>offset || keepEmpty)
- {
- matches.Add(new RegexMatch(RegexString(text, offset, result.start-offset)));
- }
- }
- if(keepSuccess)
- {
- matches.Add(new RegexMatch(text, &result));
- }
- input=start+result.start+result.length;
- }
- if(keepFail)
- {
- vint remain=input-start;
- vint length=text.Length()-remain;
- if(length || keepEmpty)
- {
- matches.Add(new RegexMatch(RegexString(text, remain, length)));
- }
- }
- }
- }
-
- Regex::Regex(const WString& code, bool preferPure)
- :pure(0)
- ,rich(0)
- {
- CharRange::List subsets;
- RegexExpression::Ref regex=ParseRegexExpression(code);
- Expression::Ref expression=regex->Merge();
- expression->NormalizeCharSet(subsets);
-
- bool pureRequired=false;
- bool richRequired=false;
- if(preferPure)
- {
- if(expression->HasNoExtension())
- {
- pureRequired=true;
- }
- else
- {
- if(expression->CanTreatAsPure())
- {
- pureRequired=true;
- richRequired=true;
- }
- else
- {
- richRequired=true;
- }
- }
- }
- else
- {
- richRequired=true;
- }
-
- try
- {
- if(pureRequired)
- {
- Dictionary<State*, State*> nfaStateMap;
- Group<State*, State*> dfaStateMap;
- Automaton::Ref eNfa=expression->GenerateEpsilonNfa();
- Automaton::Ref nfa=EpsilonNfaToNfa(eNfa, PureEpsilonChecker, nfaStateMap);
- Automaton::Ref dfa=NfaToDfa(nfa, dfaStateMap);
- pure=new PureInterpretor(dfa, subsets);
- }
- if(richRequired)
- {
- Dictionary<State*, State*> nfaStateMap;
- Group<State*, State*> dfaStateMap;
- Automaton::Ref eNfa=expression->GenerateEpsilonNfa();
- Automaton::Ref nfa=EpsilonNfaToNfa(eNfa, RichEpsilonChecker, nfaStateMap);
- Automaton::Ref dfa=NfaToDfa(nfa, dfaStateMap);
- rich=new RichInterpretor(dfa);
- }
- }
- catch(...)
- {
- if(pure)delete pure;
- if(rich)delete rich;
- throw;
- }
- }
-
- Regex::~Regex()
- {
- if(pure)delete pure;
- if(rich)delete rich;
- }
-
- bool Regex::IsPureMatch()const
- {
- return rich?false:true;
- }
-
- bool Regex::IsPureTest()const
- {
- return pure?true:false;
- }
-
- RegexMatch::Ref Regex::MatchHead(const WString& text)const
- {
- if(rich)
- {
- RichResult result;
- if(rich->MatchHead(text.Buffer(), text.Buffer(), result))
- {
- return new RegexMatch(text, &result, rich);
- }
- else
- {
- return 0;
- }
- }
- else
- {
- PureResult result;
- if(pure->MatchHead(text.Buffer(), text.Buffer(), result))
- {
- return new RegexMatch(text, &result);
- }
- else
- {
- return 0;
- }
- }
- }
-
- RegexMatch::Ref Regex::Match(const WString& text)const
- {
- if(rich)
- {
- RichResult result;
- if(rich->Match(text.Buffer(), text.Buffer(), result))
- {
- return new RegexMatch(text, &result, rich);
- }
- else
- {
- return 0;
- }
- }
- else
- {
- PureResult result;
- if(pure->Match(text.Buffer(), text.Buffer(), result))
- {
- return new RegexMatch(text, &result);
- }
- else
- {
- return 0;
- }
- }
- }
-
- bool Regex::TestHead(const WString& text)const
- {
- if(pure)
- {
- PureResult result;
- return pure->MatchHead(text.Buffer(), text.Buffer(), result);
- }
- else
- {
- RichResult result;
- return rich->MatchHead(text.Buffer(), text.Buffer(), result);
- }
- }
-
- bool Regex::Test(const WString& text)const
- {
- if(pure)
- {
- PureResult result;
- return pure->Match(text.Buffer(), text.Buffer(), result);
- }
- else
- {
- RichResult result;
- return rich->Match(text.Buffer(), text.Buffer(), result);
- }
- }
-
- void Regex::Search(const WString& text, RegexMatch::List& matches)const
- {
- Process(text, false, true, false, matches);
- }
-
- void Regex::Split(const WString& text, bool keepEmptyMatch, RegexMatch::List& matches)const
- {
- Process(text, keepEmptyMatch, false, true, matches);
- }
-
- void Regex::Cut(const WString& text, bool keepEmptyMatch, RegexMatch::List& matches)const
- {
- Process(text, keepEmptyMatch, true, true, matches);
- }
-
- /***********************************************************************
- RegexTokens
- ***********************************************************************/
-
- bool RegexToken::operator==(const RegexToken& _token)const
- {
- return length==_token.length && token==_token.token && reading==_token.reading;
- }
-
- bool RegexToken::operator==(const wchar_t* _token)const
- {
- return wcslen(_token)==length && wcsncmp(reading, _token, length)==0;
- }
-
- class RegexTokenEnumerator : public Object, public IEnumerator<RegexToken>
- {
- protected:
- bool available;
- RegexToken token;
- vint index;
- PureInterpretor* pure;
- Array<vint>& stateTokens;
- const wchar_t* reading;
- const wchar_t* start;
- vint lineIndex;
- vint lineStart;
- vint codeIndex;
- bool cacheAvailable;
- RegexToken cacheToken;
-
- void Read()
- {
- if(cacheAvailable || *reading)
- {
- if(cacheAvailable)
- {
- token=cacheToken;
- cacheAvailable=false;
- }
- else
- {
- token.reading=reading;
- token.start=0;
- token.length=0;
- token.token=-2;
- }
- token.lineIndex=lineIndex;
- token.lineStart=lineStart;
- token.codeIndex=codeIndex;
-
- PureResult result;
- while(*reading)
- {
- vint id=-1;
- if(!pure->MatchHead(reading, start, result))
- {
- result.start=reading-start;
- result.length=1;
- }
- else
- {
- id=stateTokens[result.finalState];
- }
- if(token.token==-2)
- {
- token.start=result.start;
- token.length=result.length;
- token.token=id;
- }
- else if(token.token==id && id==-1)
- {
- token.length+=result.length;
- }
- else
- {
- cacheAvailable=true;
- cacheToken.reading=reading;
- cacheToken.start=result.start;
- cacheToken.length=result.length;
- cacheToken.codeIndex=codeIndex;
- cacheToken.token=id;
- }
- reading+=result.length;
- if(cacheAvailable)
- {
- break;
- }
- }
-
- index++;
- available=true;
-
- for(vint i=0;i<token.length;i++)
- {
- if(token.reading[i]==L'\n')
- {
- lineIndex++;
- lineStart=0;
- }
- else
- {
- lineStart++;
- }
- }
- }
- else
- {
- available=false;
- }
- }
- public:
- RegexTokenEnumerator(const RegexTokenEnumerator& enumerator)
- :available(enumerator.available)
- ,token(enumerator.token)
- ,index(enumerator.index)
- ,pure(enumerator.pure)
- ,stateTokens(enumerator.stateTokens)
- ,reading(enumerator.reading)
- ,start(enumerator.start)
- ,lineIndex(enumerator.lineIndex)
- ,lineStart(enumerator.lineStart)
- ,codeIndex(enumerator.codeIndex)
- ,cacheAvailable(enumerator.cacheAvailable)
- ,cacheToken(enumerator.cacheToken)
- {
- }
-
- RegexTokenEnumerator(PureInterpretor* _pure, Array<vint>& _stateTokens, const wchar_t* _start, vint _codeIndex)
- :available(true)
- ,index(-1)
- ,pure(_pure)
- ,stateTokens(_stateTokens)
- ,reading(_start)
- ,start(_start)
- ,lineIndex(0)
- ,lineStart(0)
- ,codeIndex(_codeIndex)
- ,cacheAvailable(false)
- {
- Read();
- }
-
- IEnumerator<RegexToken>* Clone()const
- {
- return new RegexTokenEnumerator(*this);
- }
-
- const RegexToken& Current()const
- {
- return token;
- }
-
- vint Index()const
- {
- return index;
- }
-
- bool Next()
- {
- Read();
- return available;
- }
-
- bool Available()const
- {
- return available;
- }
-
- void Reset()
- {
- index=-1;
- reading=start;
- cacheAvailable=false;
- Read();
- }
-
- void ReadToEnd(List<RegexToken>& tokens, bool(*discard)(vint))
- {
- while(available)
- {
- if(!discard(token.token))
- {
- tokens.Add(token);
- }
- Read();
- }
- }
- };
-
- RegexTokens::RegexTokens(PureInterpretor* _pure, Array<vint>& _stateTokens, const WString& _code, vint _codeIndex)
- :pure(_pure)
- ,stateTokens(_stateTokens)
- ,code(_code)
- ,codeIndex(_codeIndex)
- {
- }
-
- IEnumerator<RegexToken>* RegexTokens::CreateEnumerator()const
- {
- return new RegexTokenEnumerator(pure, stateTokens, code.Buffer(), codeIndex);
- }
-
- bool DefaultDiscard(vint token)
- {
- return false;
- }
-
- void RegexTokens::ReadToEnd(collections::List<RegexToken>& tokens, bool(*discard)(vint))const
- {
- if(discard==0)
- {
- discard=&DefaultDiscard;
- }
- RegexTokenEnumerator(pure, stateTokens, code.Buffer(), codeIndex).ReadToEnd(tokens, discard);
- }
-
- /***********************************************************************
- RegexLexer
- ***********************************************************************/
-
- RegexLexer::RegexLexer(const collections::IEnumerable<WString>& tokens)
- :pure(0)
- {
- //构造所有DFA
- List<Expression::Ref> expressions;
- List<Automaton::Ref> dfas;
- CharRange::List subsets;
- Ptr<IEnumerator<WString>> enumerator=tokens.CreateEnumerator();
- while(enumerator->Available())
- {
- const WString& code=enumerator->Current();
- enumerator->Next();
-
- RegexExpression::Ref regex=ParseRegexExpression(code);
- Expression::Ref expression=regex->Merge();
- expression->CollectCharSet(subsets);
- expressions.Add(expression);
- }
- for(vint i=0;i<expressions.Count();i++)
- {
- Dictionary<State*, State*> nfaStateMap;
- Group<State*, State*> dfaStateMap;
- Expression::Ref expression=expressions[i];
- expression->ApplyCharSet(subsets);
- Automaton::Ref eNfa=expression->GenerateEpsilonNfa();
- Automaton::Ref nfa=EpsilonNfaToNfa(eNfa, PureEpsilonChecker, nfaStateMap);
- Automaton::Ref dfa=NfaToDfa(nfa, dfaStateMap);
- dfas.Add(dfa);
- }
-
- //为每一个DFA设置标记
- for(vint i=0;i<dfas.Count();i++)
- {
- Automaton::Ref dfa=dfas[i];
- for(vint j=0;j<dfa->states.Count();j++)
- {
- if(dfa->states[j]->finalState)
- {
- dfa->states[j]->userData=(void*)i;
- }
- else
- {
- dfa->states[j]->userData=(void*)dfas.Count();
- }
- }
- }
-
- //将DFA组合成大的e-NFA
- Automaton::Ref bigEnfa=new Automaton;
- for(vint i=0;i<dfas.Count();i++)
- {
- CopyFrom(bigEnfa->states.Wrap(), dfas[i]->states.Wrap());
- CopyFrom(bigEnfa->transitions.Wrap(), dfas[i]->transitions.Wrap());
- }
- bigEnfa->startState=bigEnfa->NewState();
- for(vint i=0;i<dfas.Count();i++)
- {
- bigEnfa->NewEpsilon(bigEnfa->startState, dfas[i]->startState);
- }
-
- //转换成DFA
- Dictionary<State*, State*> nfaStateMap;
- Group<State*, State*> dfaStateMap;
- Automaton::Ref bigNfa=EpsilonNfaToNfa(bigEnfa, PureEpsilonChecker, nfaStateMap);
- for(vint i=0;i<nfaStateMap.Keys().Count();i++)
- {
- void* userData=nfaStateMap.Values()[i]->userData;
- nfaStateMap.Keys()[i]->userData=userData;
- }
- Automaton::Ref bigDfa=NfaToDfa(bigNfa, dfaStateMap);
- for(vint i=0;i<dfaStateMap.Keys().Count();i++)
- {
- void* userData=dfaStateMap.GetByIndex(i)[0]->userData;
- for(vint j=1;j<dfaStateMap.GetByIndex(i).Count();j++)
- {
- void* newData=dfaStateMap.GetByIndex(i)[j]->userData;
- if(userData>newData)
- {
- userData=newData;
- }
- }
- dfaStateMap.Keys()[i]->userData=userData;
- }
-
- //构造状态机
- pure=new PureInterpretor(bigDfa, subsets);
- stateTokens.Resize(bigDfa->states.Count());
- for(vint i=0;i<stateTokens.Count();i++)
- {
- void* userData=bigDfa->states[i]->userData;
- stateTokens[i]=(vint)userData;
- }
- }
-
- RegexLexer::~RegexLexer()
- {
- if(pure)delete pure;
- }
-
- RegexTokens RegexLexer::Parse(const WString& code, vint codeIndex)
- {
- return RegexTokens(pure, stateTokens, code, codeIndex);
- }
- }
- }