PageRenderTime 63ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/Library/Regex/Regex.cpp

#
C++ | 678 lines | 598 code | 60 blank | 20 comment | 64 complexity | 151040f3345d8f00e7f4b19287b32d4e MD5 | raw file
  1. #include "Regex.h"
  2. #include "RegexExpression.h"
  3. #include "RegexPure.h"
  4. #include "RegexRich.h"
  5. #include "..\Collections\OperationCopyFrom.h"
  6. namespace vl
  7. {
  8. namespace regex
  9. {
  10. using namespace collections;
  11. using namespace regex_internal;
  12. /***********************************************************************
  13. RegexString
  14. ***********************************************************************/
  15. RegexString::RegexString(vint _start)
  16. :start(_start)
  17. ,length(0)
  18. {
  19. }
  20. RegexString::RegexString(const WString& _string, vint _start, vint _length)
  21. :value(_length==0?L"":_string.Sub(_start, _length))
  22. ,start(_start)
  23. ,length(_length)
  24. {
  25. }
  26. vint RegexString::Start()const
  27. {
  28. return start;
  29. }
  30. vint RegexString::Length()const
  31. {
  32. return length;
  33. }
  34. const WString& RegexString::Value()const
  35. {
  36. return value;
  37. }
  38. bool RegexString::operator==(const RegexString& string)const
  39. {
  40. return start==string.start && length==string.length && value==string.value;
  41. }
  42. /***********************************************************************
  43. RegexMatch
  44. ***********************************************************************/
  45. RegexMatch::RegexMatch(const WString& _string, PureResult* _result)
  46. :success(true)
  47. ,result(_string, _result->start, _result->length)
  48. {
  49. }
  50. RegexMatch::RegexMatch(const WString& _string, RichResult* _result, RichInterpretor* _rich)
  51. :success(true)
  52. ,result(_string, _result->start, _result->length)
  53. {
  54. for(vint i=0;i<_result->captures.Count();i++)
  55. {
  56. CaptureRecord& capture=_result->captures[i];
  57. if(capture.capture==-1)
  58. {
  59. captures.Add(RegexString(_string, capture.start, capture.length));
  60. }
  61. else
  62. {
  63. groups.Add(_rich->CaptureNames()[capture.capture], RegexString(_string, capture.start, capture.length));
  64. }
  65. }
  66. }
  67. RegexMatch::RegexMatch(const RegexString& _result)
  68. :success(false)
  69. ,result(_result)
  70. {
  71. }
  72. bool RegexMatch::Success()const
  73. {
  74. return success;
  75. }
  76. const RegexString& RegexMatch::Result()const
  77. {
  78. return result;
  79. }
  80. const RegexMatch::CaptureList& RegexMatch::Captures()const
  81. {
  82. return captures.Wrap();
  83. }
  84. const RegexMatch::CaptureGroup& RegexMatch::Groups()const
  85. {
  86. return groups.Wrap();
  87. }
  88. /***********************************************************************
  89. Regex
  90. ***********************************************************************/
  91. void Regex::Process(const WString& text, bool keepEmpty, bool keepSuccess, bool keepFail, RegexMatch::List& matches)const
  92. {
  93. if(rich)
  94. {
  95. const wchar_t* start=text.Buffer();
  96. const wchar_t* input=start;
  97. RichResult result;
  98. while(rich->Match(input, start, result))
  99. {
  100. vint offset=input-start;
  101. if(keepFail)
  102. {
  103. if(result.start>offset || keepEmpty)
  104. {
  105. matches.Add(new RegexMatch(RegexString(text, offset, result.start-offset)));
  106. }
  107. }
  108. if(keepSuccess)
  109. {
  110. matches.Add(new RegexMatch(text, &result, rich));
  111. }
  112. input=start+result.start+result.length;
  113. }
  114. if(keepFail)
  115. {
  116. vint remain=input-start;
  117. vint length=text.Length()-remain;
  118. if(length || keepEmpty)
  119. {
  120. matches.Add(new RegexMatch(RegexString(text, remain, length)));
  121. }
  122. }
  123. }
  124. else
  125. {
  126. const wchar_t* start=text.Buffer();
  127. const wchar_t* input=start;
  128. PureResult result;
  129. while(pure->Match(input, start, result))
  130. {
  131. vint offset=input-start;
  132. if(keepFail)
  133. {
  134. if(result.start>offset || keepEmpty)
  135. {
  136. matches.Add(new RegexMatch(RegexString(text, offset, result.start-offset)));
  137. }
  138. }
  139. if(keepSuccess)
  140. {
  141. matches.Add(new RegexMatch(text, &result));
  142. }
  143. input=start+result.start+result.length;
  144. }
  145. if(keepFail)
  146. {
  147. vint remain=input-start;
  148. vint length=text.Length()-remain;
  149. if(length || keepEmpty)
  150. {
  151. matches.Add(new RegexMatch(RegexString(text, remain, length)));
  152. }
  153. }
  154. }
  155. }
  156. Regex::Regex(const WString& code, bool preferPure)
  157. :pure(0)
  158. ,rich(0)
  159. {
  160. CharRange::List subsets;
  161. RegexExpression::Ref regex=ParseRegexExpression(code);
  162. Expression::Ref expression=regex->Merge();
  163. expression->NormalizeCharSet(subsets);
  164. bool pureRequired=false;
  165. bool richRequired=false;
  166. if(preferPure)
  167. {
  168. if(expression->HasNoExtension())
  169. {
  170. pureRequired=true;
  171. }
  172. else
  173. {
  174. if(expression->CanTreatAsPure())
  175. {
  176. pureRequired=true;
  177. richRequired=true;
  178. }
  179. else
  180. {
  181. richRequired=true;
  182. }
  183. }
  184. }
  185. else
  186. {
  187. richRequired=true;
  188. }
  189. try
  190. {
  191. if(pureRequired)
  192. {
  193. Dictionary<State*, State*> nfaStateMap;
  194. Group<State*, State*> dfaStateMap;
  195. Automaton::Ref eNfa=expression->GenerateEpsilonNfa();
  196. Automaton::Ref nfa=EpsilonNfaToNfa(eNfa, PureEpsilonChecker, nfaStateMap);
  197. Automaton::Ref dfa=NfaToDfa(nfa, dfaStateMap);
  198. pure=new PureInterpretor(dfa, subsets);
  199. }
  200. if(richRequired)
  201. {
  202. Dictionary<State*, State*> nfaStateMap;
  203. Group<State*, State*> dfaStateMap;
  204. Automaton::Ref eNfa=expression->GenerateEpsilonNfa();
  205. Automaton::Ref nfa=EpsilonNfaToNfa(eNfa, RichEpsilonChecker, nfaStateMap);
  206. Automaton::Ref dfa=NfaToDfa(nfa, dfaStateMap);
  207. rich=new RichInterpretor(dfa);
  208. }
  209. }
  210. catch(...)
  211. {
  212. if(pure)delete pure;
  213. if(rich)delete rich;
  214. throw;
  215. }
  216. }
  217. Regex::~Regex()
  218. {
  219. if(pure)delete pure;
  220. if(rich)delete rich;
  221. }
  222. bool Regex::IsPureMatch()const
  223. {
  224. return rich?false:true;
  225. }
  226. bool Regex::IsPureTest()const
  227. {
  228. return pure?true:false;
  229. }
  230. RegexMatch::Ref Regex::MatchHead(const WString& text)const
  231. {
  232. if(rich)
  233. {
  234. RichResult result;
  235. if(rich->MatchHead(text.Buffer(), text.Buffer(), result))
  236. {
  237. return new RegexMatch(text, &result, rich);
  238. }
  239. else
  240. {
  241. return 0;
  242. }
  243. }
  244. else
  245. {
  246. PureResult result;
  247. if(pure->MatchHead(text.Buffer(), text.Buffer(), result))
  248. {
  249. return new RegexMatch(text, &result);
  250. }
  251. else
  252. {
  253. return 0;
  254. }
  255. }
  256. }
  257. RegexMatch::Ref Regex::Match(const WString& text)const
  258. {
  259. if(rich)
  260. {
  261. RichResult result;
  262. if(rich->Match(text.Buffer(), text.Buffer(), result))
  263. {
  264. return new RegexMatch(text, &result, rich);
  265. }
  266. else
  267. {
  268. return 0;
  269. }
  270. }
  271. else
  272. {
  273. PureResult result;
  274. if(pure->Match(text.Buffer(), text.Buffer(), result))
  275. {
  276. return new RegexMatch(text, &result);
  277. }
  278. else
  279. {
  280. return 0;
  281. }
  282. }
  283. }
  284. bool Regex::TestHead(const WString& text)const
  285. {
  286. if(pure)
  287. {
  288. PureResult result;
  289. return pure->MatchHead(text.Buffer(), text.Buffer(), result);
  290. }
  291. else
  292. {
  293. RichResult result;
  294. return rich->MatchHead(text.Buffer(), text.Buffer(), result);
  295. }
  296. }
  297. bool Regex::Test(const WString& text)const
  298. {
  299. if(pure)
  300. {
  301. PureResult result;
  302. return pure->Match(text.Buffer(), text.Buffer(), result);
  303. }
  304. else
  305. {
  306. RichResult result;
  307. return rich->Match(text.Buffer(), text.Buffer(), result);
  308. }
  309. }
  310. void Regex::Search(const WString& text, RegexMatch::List& matches)const
  311. {
  312. Process(text, false, true, false, matches);
  313. }
  314. void Regex::Split(const WString& text, bool keepEmptyMatch, RegexMatch::List& matches)const
  315. {
  316. Process(text, keepEmptyMatch, false, true, matches);
  317. }
  318. void Regex::Cut(const WString& text, bool keepEmptyMatch, RegexMatch::List& matches)const
  319. {
  320. Process(text, keepEmptyMatch, true, true, matches);
  321. }
  322. /***********************************************************************
  323. RegexTokens
  324. ***********************************************************************/
  325. bool RegexToken::operator==(const RegexToken& _token)const
  326. {
  327. return length==_token.length && token==_token.token && reading==_token.reading;
  328. }
  329. bool RegexToken::operator==(const wchar_t* _token)const
  330. {
  331. return wcslen(_token)==length && wcsncmp(reading, _token, length)==0;
  332. }
  333. class RegexTokenEnumerator : public Object, public IEnumerator<RegexToken>
  334. {
  335. protected:
  336. bool available;
  337. RegexToken token;
  338. vint index;
  339. PureInterpretor* pure;
  340. Array<vint>& stateTokens;
  341. const wchar_t* reading;
  342. const wchar_t* start;
  343. vint lineIndex;
  344. vint lineStart;
  345. vint codeIndex;
  346. bool cacheAvailable;
  347. RegexToken cacheToken;
  348. void Read()
  349. {
  350. if(cacheAvailable || *reading)
  351. {
  352. if(cacheAvailable)
  353. {
  354. token=cacheToken;
  355. cacheAvailable=false;
  356. }
  357. else
  358. {
  359. token.reading=reading;
  360. token.start=0;
  361. token.length=0;
  362. token.token=-2;
  363. }
  364. token.lineIndex=lineIndex;
  365. token.lineStart=lineStart;
  366. token.codeIndex=codeIndex;
  367. PureResult result;
  368. while(*reading)
  369. {
  370. vint id=-1;
  371. if(!pure->MatchHead(reading, start, result))
  372. {
  373. result.start=reading-start;
  374. result.length=1;
  375. }
  376. else
  377. {
  378. id=stateTokens[result.finalState];
  379. }
  380. if(token.token==-2)
  381. {
  382. token.start=result.start;
  383. token.length=result.length;
  384. token.token=id;
  385. }
  386. else if(token.token==id && id==-1)
  387. {
  388. token.length+=result.length;
  389. }
  390. else
  391. {
  392. cacheAvailable=true;
  393. cacheToken.reading=reading;
  394. cacheToken.start=result.start;
  395. cacheToken.length=result.length;
  396. cacheToken.codeIndex=codeIndex;
  397. cacheToken.token=id;
  398. }
  399. reading+=result.length;
  400. if(cacheAvailable)
  401. {
  402. break;
  403. }
  404. }
  405. index++;
  406. available=true;
  407. for(vint i=0;i<token.length;i++)
  408. {
  409. if(token.reading[i]==L'\n')
  410. {
  411. lineIndex++;
  412. lineStart=0;
  413. }
  414. else
  415. {
  416. lineStart++;
  417. }
  418. }
  419. }
  420. else
  421. {
  422. available=false;
  423. }
  424. }
  425. public:
  426. RegexTokenEnumerator(const RegexTokenEnumerator& enumerator)
  427. :available(enumerator.available)
  428. ,token(enumerator.token)
  429. ,index(enumerator.index)
  430. ,pure(enumerator.pure)
  431. ,stateTokens(enumerator.stateTokens)
  432. ,reading(enumerator.reading)
  433. ,start(enumerator.start)
  434. ,lineIndex(enumerator.lineIndex)
  435. ,lineStart(enumerator.lineStart)
  436. ,codeIndex(enumerator.codeIndex)
  437. ,cacheAvailable(enumerator.cacheAvailable)
  438. ,cacheToken(enumerator.cacheToken)
  439. {
  440. }
  441. RegexTokenEnumerator(PureInterpretor* _pure, Array<vint>& _stateTokens, const wchar_t* _start, vint _codeIndex)
  442. :available(true)
  443. ,index(-1)
  444. ,pure(_pure)
  445. ,stateTokens(_stateTokens)
  446. ,reading(_start)
  447. ,start(_start)
  448. ,lineIndex(0)
  449. ,lineStart(0)
  450. ,codeIndex(_codeIndex)
  451. ,cacheAvailable(false)
  452. {
  453. Read();
  454. }
  455. IEnumerator<RegexToken>* Clone()const
  456. {
  457. return new RegexTokenEnumerator(*this);
  458. }
  459. const RegexToken& Current()const
  460. {
  461. return token;
  462. }
  463. vint Index()const
  464. {
  465. return index;
  466. }
  467. bool Next()
  468. {
  469. Read();
  470. return available;
  471. }
  472. bool Available()const
  473. {
  474. return available;
  475. }
  476. void Reset()
  477. {
  478. index=-1;
  479. reading=start;
  480. cacheAvailable=false;
  481. Read();
  482. }
  483. void ReadToEnd(List<RegexToken>& tokens, bool(*discard)(vint))
  484. {
  485. while(available)
  486. {
  487. if(!discard(token.token))
  488. {
  489. tokens.Add(token);
  490. }
  491. Read();
  492. }
  493. }
  494. };
  495. RegexTokens::RegexTokens(PureInterpretor* _pure, Array<vint>& _stateTokens, const WString& _code, vint _codeIndex)
  496. :pure(_pure)
  497. ,stateTokens(_stateTokens)
  498. ,code(_code)
  499. ,codeIndex(_codeIndex)
  500. {
  501. }
  502. IEnumerator<RegexToken>* RegexTokens::CreateEnumerator()const
  503. {
  504. return new RegexTokenEnumerator(pure, stateTokens, code.Buffer(), codeIndex);
  505. }
  506. bool DefaultDiscard(vint token)
  507. {
  508. return false;
  509. }
  510. void RegexTokens::ReadToEnd(collections::List<RegexToken>& tokens, bool(*discard)(vint))const
  511. {
  512. if(discard==0)
  513. {
  514. discard=&DefaultDiscard;
  515. }
  516. RegexTokenEnumerator(pure, stateTokens, code.Buffer(), codeIndex).ReadToEnd(tokens, discard);
  517. }
  518. /***********************************************************************
  519. RegexLexer
  520. ***********************************************************************/
  521. RegexLexer::RegexLexer(const collections::IEnumerable<WString>& tokens)
  522. :pure(0)
  523. {
  524. //构造所有DFA
  525. List<Expression::Ref> expressions;
  526. List<Automaton::Ref> dfas;
  527. CharRange::List subsets;
  528. Ptr<IEnumerator<WString>> enumerator=tokens.CreateEnumerator();
  529. while(enumerator->Available())
  530. {
  531. const WString& code=enumerator->Current();
  532. enumerator->Next();
  533. RegexExpression::Ref regex=ParseRegexExpression(code);
  534. Expression::Ref expression=regex->Merge();
  535. expression->CollectCharSet(subsets);
  536. expressions.Add(expression);
  537. }
  538. for(vint i=0;i<expressions.Count();i++)
  539. {
  540. Dictionary<State*, State*> nfaStateMap;
  541. Group<State*, State*> dfaStateMap;
  542. Expression::Ref expression=expressions[i];
  543. expression->ApplyCharSet(subsets);
  544. Automaton::Ref eNfa=expression->GenerateEpsilonNfa();
  545. Automaton::Ref nfa=EpsilonNfaToNfa(eNfa, PureEpsilonChecker, nfaStateMap);
  546. Automaton::Ref dfa=NfaToDfa(nfa, dfaStateMap);
  547. dfas.Add(dfa);
  548. }
  549. //为每一个DFA设置标记
  550. for(vint i=0;i<dfas.Count();i++)
  551. {
  552. Automaton::Ref dfa=dfas[i];
  553. for(vint j=0;j<dfa->states.Count();j++)
  554. {
  555. if(dfa->states[j]->finalState)
  556. {
  557. dfa->states[j]->userData=(void*)i;
  558. }
  559. else
  560. {
  561. dfa->states[j]->userData=(void*)dfas.Count();
  562. }
  563. }
  564. }
  565. //将DFA组合成大的e-NFA
  566. Automaton::Ref bigEnfa=new Automaton;
  567. for(vint i=0;i<dfas.Count();i++)
  568. {
  569. CopyFrom(bigEnfa->states.Wrap(), dfas[i]->states.Wrap());
  570. CopyFrom(bigEnfa->transitions.Wrap(), dfas[i]->transitions.Wrap());
  571. }
  572. bigEnfa->startState=bigEnfa->NewState();
  573. for(vint i=0;i<dfas.Count();i++)
  574. {
  575. bigEnfa->NewEpsilon(bigEnfa->startState, dfas[i]->startState);
  576. }
  577. //转换成DFA
  578. Dictionary<State*, State*> nfaStateMap;
  579. Group<State*, State*> dfaStateMap;
  580. Automaton::Ref bigNfa=EpsilonNfaToNfa(bigEnfa, PureEpsilonChecker, nfaStateMap);
  581. for(vint i=0;i<nfaStateMap.Keys().Count();i++)
  582. {
  583. void* userData=nfaStateMap.Values()[i]->userData;
  584. nfaStateMap.Keys()[i]->userData=userData;
  585. }
  586. Automaton::Ref bigDfa=NfaToDfa(bigNfa, dfaStateMap);
  587. for(vint i=0;i<dfaStateMap.Keys().Count();i++)
  588. {
  589. void* userData=dfaStateMap.GetByIndex(i)[0]->userData;
  590. for(vint j=1;j<dfaStateMap.GetByIndex(i).Count();j++)
  591. {
  592. void* newData=dfaStateMap.GetByIndex(i)[j]->userData;
  593. if(userData>newData)
  594. {
  595. userData=newData;
  596. }
  597. }
  598. dfaStateMap.Keys()[i]->userData=userData;
  599. }
  600. //构造状态机
  601. pure=new PureInterpretor(bigDfa, subsets);
  602. stateTokens.Resize(bigDfa->states.Count());
  603. for(vint i=0;i<stateTokens.Count();i++)
  604. {
  605. void* userData=bigDfa->states[i]->userData;
  606. stateTokens[i]=(vint)userData;
  607. }
  608. }
  609. RegexLexer::~RegexLexer()
  610. {
  611. if(pure)delete pure;
  612. }
  613. RegexTokens RegexLexer::Parse(const WString& code, vint codeIndex)
  614. {
  615. return RegexTokens(pure, stateTokens, code, codeIndex);
  616. }
  617. }
  618. }