PageRenderTime 46ms CodeModel.GetById 12ms RepoModel.GetById 0ms app.codeStats 0ms

/src/regex.cc

https://github.com/sinya8282/Regen
C++ | 780 lines | 673 code | 69 blank | 38 comment | 218 complexity | f0f3655118304cafcb042dec27f95b1c MD5 | raw file
Possible License(s): BSD-3-Clause
  1. #include "regex.h"
  2. namespace regen {
  3. Regex::Regex(const Regen::StringPiece& pattern, const Regen::Options flags):
  4. regex_(pattern.as_string()),
  5. flag_(flags),
  6. recursion_depth_(0),
  7. involved_char_(std::bitset<256>()),
  8. olevel_(Regen::Options::Onone),
  9. dfa_failure_(false),
  10. dfa_(flags)
  11. {
  12. Parse();
  13. dfa_.set_expr_info(expr_info_);
  14. }
  15. StateExpr* Regex::CombineStateExpr(StateExpr *e1, StateExpr *e2, ExprPool *p)
  16. {
  17. StateExpr *s;
  18. CharClass *cc = p->alloc<CharClass>(e1, e2);
  19. if (cc->count() == 256) {
  20. //delete cc;
  21. s = p->alloc<Dot>();
  22. } else if (cc->count() == 1) {
  23. //delete cc;
  24. char c;
  25. switch (e1->type()) {
  26. case Expr::kLiteral:
  27. c = ((Literal*)e1)->literal();
  28. break;
  29. default: exitmsg("Invalid Expr Type: %d", e1->type());
  30. }
  31. s = p->alloc<Literal>(c);
  32. } else {
  33. s = cc;
  34. }
  35. return s;
  36. }
  37. Expr* Regex::PatchBackRef(Lexer *lexer, Expr *e, ExprPool *p)
  38. {
  39. std::set<std::size_t> &backrefs = lexer->backrefs();
  40. for (std::set<std::size_t>::iterator iter = backrefs.begin();
  41. iter != backrefs.end(); ++iter) {
  42. std::size_t ref_id = *iter;
  43. Expr* referee = lexer->groups()[ref_id];
  44. Operator *patch = p->alloc<Operator>(Operator::kBackRef);
  45. patch->set_id(ref_id);
  46. patch->set_parent(referee->parent());
  47. switch (referee->parent()->type()) {
  48. case Expr::kConcat: case Expr::kUnion:
  49. case Expr::kIntersection: case Expr::kXOR: {
  50. BinaryExpr* p = static_cast<BinaryExpr*>(referee->parent());
  51. if (p->lhs() == referee) {
  52. p->set_lhs(patch);
  53. } else if (p->rhs() == referee) {
  54. p->set_rhs(patch);
  55. } else {
  56. exitmsg("inconsistency parent-child pointer");
  57. }
  58. break;
  59. }
  60. case Expr::kQmark: case Expr::kStar: case Expr::kPlus: {
  61. UnaryExpr *p = static_cast<UnaryExpr*>(referee->parent());
  62. if (p->lhs() == referee) {
  63. p->set_lhs(patch);
  64. } else {
  65. exitmsg("inconsistency parent-child pointer");
  66. }
  67. break;
  68. }
  69. default: exitmsg("invalid types");
  70. }
  71. std::vector<Expr*> patches;
  72. referee->Serialize(patches, p);
  73. //delete referee;
  74. std::vector<Expr*> roots;
  75. roots.push_back(e);
  76. while (patches.size() > roots.size()) {
  77. roots.push_back(e->Clone(p));
  78. }
  79. for (std::size_t i = 0; i < roots.size(); i++) {
  80. roots[i]->PatchBackRef(patches[i], ref_id, p);
  81. }
  82. e = roots[0];
  83. for (std::size_t i = 1; i < roots.size(); i++) {
  84. e = p->alloc<Union>(e, roots[i]);
  85. }
  86. }
  87. return e;
  88. }
  89. void Regex::Parse()
  90. {
  91. const unsigned char *begin = (const unsigned char*)regex_.c_str(),
  92. *end = begin + regex_.length();
  93. Lexer lexer(begin, end, flag_);
  94. lexer.Consume();
  95. Expr* e;
  96. e = e0(&lexer, &pool_);
  97. if (e->type() == Expr::kNone) exitmsg("Inavlid pattern.");
  98. if (lexer.token() != Lexer::kEOP) exitmsg("Expected end of pattern.");
  99. if (!lexer.backrefs().empty()) e = PatchBackRef(&lexer, e, &pool_);
  100. expr_info_.orig_root = e;
  101. e->set_nonnullable(flag_.non_nullable());
  102. if (flag_.filtered_match()) e->FillKeywords(&expr_info_.key, &expr_info_.involve);
  103. if (!flag_.prefix_match()) {
  104. // rewrite expression R when Prefix-free Matching is required
  105. // R -> .*?R
  106. Expr *dotstar = pool_.alloc<Star>(pool_.alloc<Dot>(true), true);
  107. e = pool_.alloc<Concat>(dotstar, e, flag_.reverse_regex());
  108. }
  109. expr_info_.eop = pool_.alloc<EOP>();
  110. e = pool_.alloc<Concat>(e, expr_info_.eop);
  111. expr_info_.expr_root = e;
  112. e->FillPosition(&expr_info_);
  113. expr_info_.min_length = expr_info_.orig_root->min_length();
  114. expr_info_.max_length = expr_info_.orig_root->max_length();
  115. e->FillTransition();
  116. }
  117. /* Regen parsing rules
  118. * RE ::= e0 EOP
  119. * e0 ::= e1 ('||' e1)* # shuffle
  120. * e1 ::= e2 ('&&' e2)* # xor
  121. * e2 ::= e3 ('|' e3)* # union
  122. * e3 ::= e4 ('&' e4)* # intersection
  123. * e4 ::= e5+ # concatenation
  124. * e5 ::= e6 ([?+*]|{N,N}|{,}|{,N}|{N,})* # repetition
  125. * e6 ::= ATOM | '(' e0 ')' | '!' e0 | '#' e0 # ATOM, grouped, complement, permutation
  126. */
  127. Expr* Regex::e0(Lexer *lexer, ExprPool *pool)
  128. {
  129. Expr *e, *f;
  130. e = e1(lexer, pool);
  131. while (lexer->token() == Lexer::kShuffle) {
  132. lexer->Consume();
  133. f = e1(lexer, pool);
  134. e = Expr::Shuffle(e, f, pool);
  135. }
  136. return e;
  137. }
  138. Expr* Regex::e1(Lexer *lexer, ExprPool *pool)
  139. {
  140. Expr *e, *f;
  141. e = e2(lexer, pool);
  142. while (lexer->token() == Lexer::kXOR) {
  143. lexer->Consume();
  144. f = e2(lexer, pool);
  145. e = pool->alloc<XOR>(e, f, pool);
  146. }
  147. return e;
  148. }
  149. Expr* Regex::e2(Lexer *lexer, ExprPool *pool)
  150. {
  151. Expr *e, *f;
  152. e = e3(lexer, pool);
  153. while (lexer->token() == Lexer::kUnion) {
  154. lexer->Consume();
  155. f = e3(lexer, pool);
  156. e = pool->alloc<Union>(e, f);
  157. }
  158. return e;
  159. }
  160. Expr* Regex::e3(Lexer *lexer, ExprPool *pool)
  161. {
  162. Expr *e, *f;
  163. e = e4(lexer, pool);
  164. while (lexer->token() == Lexer::kIntersection) {
  165. lexer->Consume();
  166. f = e4(lexer, pool);
  167. e = pool->alloc<Intersection>(e, f, pool);
  168. }
  169. return e;
  170. }
  171. Expr* Regex::e4(Lexer *lexer, ExprPool *pool)
  172. {
  173. Expr *e, *f;
  174. e = e5(lexer, pool);
  175. while (lexer->Concatenated()) {
  176. f = e5(lexer, pool);
  177. e = pool->alloc<Concat>(e, f, flag_.reverse_regex());
  178. }
  179. return e;
  180. }
  // e5 ::= e6 ([?+*]|{N,N}|{,}|{,N}|{N,})*
  //
  // Parses one e6 operand followed by any number of quantifiers. Each
  // quantifier wraps or expands the expression built so far, so stacked
  // quantifiers apply left to right. All nodes are allocated from `pool`.
  Expr* Regex::e5(Lexer *lexer, ExprPool *pool)
  {
    Expr *e;
    e = e6(lexer, pool);
    while (lexer->Quantifier()) {
      bool non_greedy = false;
      // Capture the quantifier's token and probability before Consume()
      // moves the lexer on to the next token.
      Lexer::Type token = lexer->token();
      double probability = lexer->probability();
      lexer->Consume();
      // A '?' token directly after a quantifier marks it as non-greedy.
      if (lexer->token() == Lexer::kQmark) {
        non_greedy = true;
        lexer->Consume();
      }
      switch (token) {
        case Lexer::kStar: {
          e = pool->alloc<Star>(e, non_greedy, probability);
          break;
        }
        case Lexer::kPlus: {
          if (non_greedy) {
            // Non-greedy plus is expanded to: clone(e) followed by a
            // non-greedy Star of e.
            Expr* e_ = e->Clone(pool);
            e = pool->alloc<Concat>(e_, pool->alloc<Star>(e, non_greedy, probability), flag_.reverse_regex());
          } else {
            e = pool->alloc<Plus>(e, probability);
          }
          break;
        }
        case Lexer::kQmark: {
          e = pool->alloc<Qmark>(e, non_greedy, probability);
          break;
        }
        case Lexer::kRepetition: {
          // r.first is the lower bound, r.second the upper bound; an upper
          // bound of -1 selects the unbounded branch below.
          std::pair<int, int> r = lexer->repetition();
          int lower_repetition = r.first, upper_repetition = r.second;
          if (lower_repetition == 0 && upper_repetition == 0) {
            // {0,0}: the operand is dropped entirely.
            //delete e;
            e = pool->alloc<Epsilon>();
          } else if (upper_repetition == -1) {
            // {N,}: e plus (N-1) clones, followed by a Star of another clone.
            // NOTE(review): for lower == 0 the first loop runs zero times, so
            // the result is "e e*" (one mandatory copy). Presumably the lexer
            // reports {0,} as kStar instead of kRepetition - confirm.
            Expr* f = e;
            for (int i = 0; i < lower_repetition - 1; i++) {
              e = pool->alloc<Concat>(e, f->Clone(pool), flag_.reverse_regex());
            }
            e = pool->alloc<Concat>(e, pool->alloc<Star>(f->Clone(pool), non_greedy, probability), flag_.reverse_regex());
          } else if (upper_repetition == lower_repetition) {
            // {N,N}: e plus (N-1) clones. When a probability is set, the
            // appended clones are of Qmark(e) rather than of e itself.
            Expr *f;
            if (probability == 0.0) {
              f = e;
            } else {
              f = pool->alloc<Qmark>(e, non_greedy, probability);
            }
            for (int i = 0; i < lower_repetition - 1; i++) {
              e = pool->alloc<Concat>(e, f->Clone(pool), flag_.reverse_regex());
            }
          } else {
            // {N,M}: N mandatory copies followed by (M-N) optional copies.
            Expr *f = e;
            for (int i = 0; i < lower_repetition - 1; i++) {
              e = pool->alloc<Concat>(e, f->Clone(pool), flag_.reverse_regex());
            }
            if (lower_repetition == 0) {
              // With no mandatory copy, the initial e itself becomes the
              // first optional copy and is counted as such.
              e = pool->alloc<Qmark>(e, non_greedy, probability);
              lower_repetition++;
            }
            for (int i = 0; i < (upper_repetition - lower_repetition); i++) {
              e = pool->alloc<Concat>(e, pool->alloc<Qmark>(f->Clone(pool), non_greedy, probability), flag_.reverse_regex());
            }
          }
          break;
        }
        default:
          break;
      }
    }
    return e;
  }
  /// Returns the length in bytes of the UTF-8 sequence introduced by lead
  /// byte `c`, or 0 when `c` cannot start a sequence.
  ///
  ///   0x00-0x7F -> 1      0xC0-0xDF -> 2      0xE0-0xEF -> 3
  ///   0xF0-0xF7 -> 4      0x80-0xBF and 0xF8-0xFF -> 0
  std::size_t UTF8ByteLength(const unsigned char c)
  {
    if (c < 0x80) return 1;  // single-byte range
    if (c < 0xC0) return 0;  // 0x80-0xBF never starts a sequence
    if (c < 0xE0) return 2;
    if (c < 0xF0) return 3;
    if (c < 0xF8) return 4;
    return 0;                // 0xF8-0xFF
  }
  271. bool IsReagalUTF8Sequence(const unsigned char *s)
  272. {
  273. std::size_t len = UTF8ByteLength(*s);
  274. if (len == 0) return false;
  275. for (std::size_t i = 1; i < len; i++) {
  276. if (!(0x80 <= *(s+i) && *(s+i) < 0xC0)) return false;
  277. }
  278. return true;
  279. }
  // e6 ::= ATOM | '(' e0 ')' | '!' e0 | '#' e0
  //
  // Parses one atom or prefixed/grouped sub-expression, dispatching on the
  // current lexer token. Cases that end in `break` fall through to the shared
  // lexer->Consume() at the bottom, which steps past the atom's own token.
  // The complement, permutation, reverse and recursion cases `return`
  // directly and therefore skip that trailing Consume().
  Expr* Regex::e6(Lexer *lexer, ExprPool *pool)
  {
    Expr *e;
    switch(lexer->token()) {
      case Lexer::kLiteral:
        if (flag_.encoding_utf8()) {
          // In UTF-8 mode a multi-byte character is consumed as one unit:
          // one Literal per byte, chained together with Concat.
          std::size_t len = UTF8ByteLength(lexer->literal());
          if (IsReagalUTF8Sequence(lexer->ptr()-1)) {
            e = pool->alloc<Literal>(lexer->literal());
            // NOTE(review): the Literal for the newly consumed (later) byte
            // is passed as the first Concat operand and the expression built
            // so far as the second, whereas e4 passes (earlier, later).
            // Confirm this operand order is intended.
            while (--len > 0) {
              lexer->Consume();
              e = pool->alloc<Concat>(pool->alloc<Literal>(lexer->literal()), e, flag_.reverse_regex());
            }
          } else {
            exitmsg("Invalid UTF8 byte sequence.");
          }
        } else {
          e = pool->alloc<Literal>(lexer->literal());
        }
        break;
      case Lexer::kBegLine:
        e = pool->alloc<Anchor>(Anchor::kBegLine);
        break;
      case Lexer::kEndLine:
        e = pool->alloc<Anchor>(Anchor::kEndLine);
        break;
      case Lexer::kDot:
        e = pool->alloc<Dot>();
        break;
      case Lexer::kByteRange: {
        // The lexer already holds the byte table for this token.
        e = pool->alloc<CharClass>(lexer->table());
        break;
      }
      case Lexer::kCharClass: {
        CharClass *cc = pool->alloc<CharClass>();
        BuildCharClass(lexer, cc);
        // Simplify degenerate classes. For a one-member class,
        // BuildCharClass has stored that member via lexer->literal(lastc).
        if (cc->count() == 1) {
          e = pool->alloc<Literal>(lexer->literal());
          //delete cc;
        } else if (cc->count() == 256) {
          e = pool->alloc<Dot>();
          //delete cc;
        } else {
          e = cc;
        }
        break;
      }
      case Lexer::kNone:
        e = pool->alloc<None>();
        break;
      case Lexer::kEpsilon:
        e = pool->alloc<Epsilon>();
        break;
      case Lexer::kBackRef: {
        // Only groups that have already been opened can be referenced.
        if (lexer->backref() >= lexer->groups().size()) exitmsg("Invalid back reference");
        if (lexer->weakref()) {
          // Weak reference: substitute a copy of the group's expression.
          e = lexer->groups()[lexer->backref()]->Clone(pool);
        } else {
          // Strong reference: record the id and emit a placeholder Operator;
          // Parse() hands recorded ids to PatchBackRef() afterwards.
          lexer->backrefs().insert(lexer->backref());
          Operator *o = pool->alloc<Operator>(Operator::kBackRef);
          o->set_id(lexer->backref());
          e = o;
        }
        break;
      }
      case Lexer::kLpar: {
        lexer->Consume();
        // Reserve the group slot before parsing the body so that nested
        // groups receive later indices.
        std::size_t ngroup = lexer->groups().size();
        lexer->groups().push_back(0);
        if (lexer->token() == Lexer::kRpar) {
          // "()" is an empty group.
          e = pool->alloc<Epsilon>();
        } else {
          e = e0(lexer, pool);
        }
        if (lexer->token() != Lexer::kRpar) exitmsg("expected a ')'");
        lexer->groups()[ngroup] = e;
        break;
      }
      case Lexer::kComplement: {
        // Consecutive complement tokens cancel in pairs; only an odd count
        // leaves the operand complemented.
        bool complement = false;
        do {
          complement = !complement;
          lexer->Consume();
        } while (lexer->token() == Lexer::kComplement);
        e = e6(lexer, pool);
        if (complement) {
          if (e->type() == Expr::kNone) {
            // The complement of None is built directly as Star(Dot).
            //delete e;
            return pool->alloc<Star>(pool->alloc<Dot>());
          }
          Expr *dotstar = pool->alloc<Star>(pool->alloc<Dot>());
          e = pool->alloc<XOR>(dotstar, e, pool); /* R xor .* == !R */
        }
        return e;
      }
      case Lexer::kPermutation: {
        lexer->Consume();
        // The operand is parsed into a temporary pool; Expr::Permutation
        // receives the caller's pool for the expression it returns.
        ExprPool tmp_pool;
        e = e6(lexer, &tmp_pool);
        e = Expr::Permutation(e, pool);
        return e;
      }
      case Lexer::kReverse: {
        lexer->Consume();
        // Toggle the reverse flag while the operand is parsed, then restore.
        bool reverse = flag_.reverse_regex();
        flag_.reverse_regex(!reverse);
        e = e6(lexer, pool);
        flag_.reverse_regex(reverse);
        return e;
      }
      case Lexer::kRecursion: {
        lexer->Consume();
        // Recursion limits default to {1,1}; an optional repetition token
        // right after the recursion token overrides them.
        std::pair<int, int> recursion_limit;
        recursion_limit.first = recursion_limit.second = 1;
        if (lexer->token() == Lexer::kRepetition) {
          recursion_limit = lexer->repetition();
          lexer->Consume();
        }
        if (recursion_limit.second == -1) {
          exitmsg("disallow infinite recursion.");
        }
        if (recursion_depth_ < static_cast<std::size_t>(recursion_limit.second)) {
          // Re-lex the range lexer->begin()..lexer->end() with a fresh Lexer
          // and parse it as a nested expression.
          recursion_depth_++;
          Lexer l(lexer->begin(), lexer->end(), flag_);
          l.Consume();
          e = e0(&l, pool);
          // Levels deeper than the lower limit are optional.
          if (recursion_depth_ > static_cast<std::size_t>(recursion_limit.first)) {
            e = pool->alloc<Qmark>(e);
          }
          recursion_depth_--;
        } else {
          // Upper limit reached: the recursion contributes nothing more.
          e = pool->alloc<Epsilon>();
        }
        return e;
      }
      // The remaining cases are all error reports.
      case Lexer::kRpar:
        exitmsg("expected a '('!");
      case Lexer::kEOP:
        exitmsg("expected none-nullable expression.");
      default:
        exitmsg("can't handle Expr: %s (%c)", lexer->TokenToString(),lexer->literal());
    }
    // Step past the token of the atom that was just handled.
    lexer->Consume();
    return e;
  }
  425. CharClass* Regex::BuildCharClass(Lexer *lexer, CharClass *cc)
  426. {
  427. std::bitset<256>& table = cc->table();
  428. bool range;
  429. unsigned char lastc = '\0';
  430. lexer->Consume();
  431. if (lexer->token() == Lexer::kBegLine) {
  432. lexer->Consume();
  433. cc->set_negative(true);
  434. }
  435. if (lexer->literal() == '-' ||
  436. lexer->literal() == ']') {
  437. table.set(lexer->literal());
  438. lastc = lexer->literal();
  439. lexer->Consume();
  440. }
  441. for (range = false; lexer->token() != Lexer::kEOP && lexer->literal() != ']'; lexer->Consume()) {
  442. if (!range && lexer->literal() == '-') {
  443. range = true;
  444. continue;
  445. }
  446. if (lexer->token() == Lexer::kByteRange) {
  447. table |= lexer->table();
  448. } else {
  449. table.set(lexer->literal());
  450. }
  451. if (range) {
  452. for (std::size_t c = lexer->literal() - 1; c > lastc; c--) {
  453. table.set(c);
  454. }
  455. range = false;
  456. }
  457. lastc = lexer->literal();
  458. }
  459. if (lexer->token() == Lexer::kEOP) exitmsg(" [ ] imbalance");
  460. if (range) {
  461. table.set('-');
  462. }
  463. if (cc->count() == 1) {
  464. lexer->literal(lastc);
  465. } else if (cc->count() >= 128 && !cc->negative()) {
  466. cc->set_negative(true);
  467. cc->flip();
  468. }
  469. return cc;
  470. }
  // Converts a DFA to a regular expression using a GNFA (state elimination).
  // see http://en.wikipedia.org/wiki/Generalized_nondeterministic_finite-state_machine
  //
  // The GNFA is stored as one map per source state: target state -> label
  // expression. A NULL label is treated as "no symbol": the code below skips
  // NULL operands when concatenating and turns the other operand into a Qmark
  // when merging. Two extra indices are used next to the DFA states:
  //   GSTART  - synthetic start state (last row of gnfa_transition)
  //   GACCEPT - synthetic accept state (appears only as a map key)
  //
  // On return, info->expr_root holds the resulting expression (None when the
  // start->accept label is NULL); info->eop is set only in the non-None case.
  // All nodes are allocated from pool `p`.
  void Regex::CreateRegexFromDFA(const DFA &dfa, ExprInfo *info, ExprPool *p)
  {
    int GSTART = dfa.size();
    int GACCEPT = GSTART+1;
    typedef std::map<int, Expr*> GNFATrans;
    // GACCEPT rows: one per DFA state plus one for GSTART.
    std::vector<GNFATrans> gnfa_transition(GACCEPT);
    // Step 1: turn each DFA state's byte table into labelled GNFA edges.
    for (std::size_t i = 0; i < dfa.size(); i++) {
      const DFA::Transition &transition = dfa.GetTransition(i);
      GNFATrans &gtransition = gnfa_transition[i];
      for (int c = 0; c < 256; c++) {
        DFA::state_t next = transition[c];
        if (next != DFA::REJECT) {
          Expr *e;
          if (c < 255 && next == transition[c+1]) {
            // A run of consecutive bytes leads to the same target: extend c
            // to the end of the run and emit one expression for all of it.
            int begin = c;
            while (++c < 255) {
              if (transition[c] != transition[c+1]) break;
            }
            int end = c;
            if (begin == 0 && end == 255) {
              e = p->alloc<Dot>();
            } else {
              std::bitset<256> table;
              bool negative = false;
              for (int j = begin; j <= end; j++) {
                table.set(j);
              }
              // Same convention as BuildCharClass: a table with at least 128
              // members is flipped and flagged as negative.
              if (table.count() >= 128) {
                negative = true;
                table.flip();
              }
              e = p->alloc<CharClass>(table, negative);
            }
          } else {
            e = p->alloc<Literal>(c);
          }
          // Merge with a label already present for the same target.
          if (gtransition.find(next) != gtransition.end()) {
            Expr* f = gtransition[next];
            if (Expr::SuperTypeOf(e) == Expr::kStateExpr &&
                Expr::SuperTypeOf(f) == Expr::kStateExpr) {
              Expr *e_ = CombineStateExpr((StateExpr*)e, (StateExpr*)f, p);
              //delete e;
              //delete f;
              e = e_;
            } else {
              e = p->alloc<Union>(e, f);
            }
          }
          gtransition[next] = e;
        }
      }
    }
    // Step 2: connect accepting states to GACCEPT. End-line states need an
    // end-of-line anchor on the edge; plain accept states get a NULL label.
    for (std::size_t i = 0; i < dfa.size(); i++) {
      if (dfa.IsAcceptOrEndlineState(i)) {
        if (dfa.IsEndlineState(i)) {
          gnfa_transition[i][GACCEPT] = p->alloc<Anchor>(Anchor::kEndLine);
        } else {
          gnfa_transition[i][GACCEPT] = NULL;
        }
      }
    }
    // Step 3: GSTART enters DFA state 0 through a NULL-labelled edge.
    gnfa_transition[GSTART][0] = NULL;
    // Step 4: eliminate DFA states 0..GSTART-1 one at a time.
    for (int i = 0; i < GSTART; i++) {
      Expr* loop = NULL;
      GNFATrans &gtransition = gnfa_transition[i];
      // A self loop on i becomes a Star and is removed from i's edges.
      if (gtransition.find(i) != gtransition.end()) {
        loop = p->alloc<Star>(gtransition[i]);
        gtransition.erase(i);
      }
      // Reroute every edge j -> i (j not yet eliminated) around i.
      for (int j = i+1; j <= GSTART; j++) {
        if (gnfa_transition[j].find(i) != gnfa_transition[j].end()) {
          Expr* regex1 = gnfa_transition[j][i];
          gnfa_transition[j].erase(i);
          GNFATrans::iterator iter = gtransition.begin();
          while (iter != gtransition.end()) {
            // New label for j -> k: regex1, then loop, then (i -> k),
            // leaving out whichever parts are NULL.
            Expr* regex2 = (*iter).second;
            if (loop != NULL) {
              if (regex2 != NULL) {
                regex2 = p->alloc<Concat>(loop, regex2);
              } else {
                regex2 = loop;
              }
            }
            if (regex1 != NULL) {
              if (regex2 != NULL) {
                regex2 = p->alloc<Concat>(regex1, regex2);
              } else {
                regex2 = regex1;
              }
            }
            // Clone the new label before it is stored under j.
            if (regex2 != NULL) {
              regex2 = regex2->Clone(p);
            }
            // Merge with an existing j -> k edge, if there is one:
            //   both labels non-NULL -> combined state expression or Union
            //   exactly one NULL     -> Qmark of the non-NULL label
            //   both NULL            -> NULL
            if (gnfa_transition[j].find((*iter).first) != gnfa_transition[j].end()) {
              if (gnfa_transition[j][(*iter).first] != NULL) {
                if (regex2 != NULL) {
                  Expr* e = gnfa_transition[j][(*iter).first];
                  Expr* f = regex2;
                  if (Expr::SuperTypeOf(e) == Expr::kStateExpr &&
                      Expr::SuperTypeOf(f) == Expr::kStateExpr) {
                    Expr *e_ = CombineStateExpr((StateExpr*)e, (StateExpr*)f, p);
                    //delete e;
                    //delete f;
                    e = e_;
                  } else {
                    e = p->alloc<Union>(e, f);
                  }
                  gnfa_transition[j][(*iter).first] = e;
                } else {
                  gnfa_transition[j][(*iter).first] =
                      p->alloc<Qmark>(gnfa_transition[j][(*iter).first]);
                }
              } else {
                if (regex2 != NULL) {
                  gnfa_transition[j][(*iter).first] = p->alloc<Qmark>(regex2);
                } else {
                  gnfa_transition[j][(*iter).first] = regex2;
                }
              }
            } else {
              gnfa_transition[j][(*iter).first] = regex2;
            }
            ++iter;
          }
        }
      }
    }
    // Step 5: only GSTART -> GACCEPT is left. operator[] yields NULL both
    // when that edge is absent and when its label is NULL; either way the
    // result is None.
    if(gnfa_transition[GSTART][GACCEPT] == NULL) {
      info->expr_root = p->alloc<None>();
    } else {
      info->eop = p->alloc<EOP>();
      info->expr_root = p->alloc<Concat>(gnfa_transition[GSTART][GACCEPT], info->eop);
    }
  }
  607. /*
  608. * - slower -
  609. * Onone: On-The-Fly DFA based matching
  610. * O0: DFA based matching
  611. * ~ Xbyak(JIT library) required ~
  612. * O1: JIT-ed DFA based matching
  613. * O2: transition rule optimized-JIT-ed DFA based mathing
  614. * O3: transition rule & dfa reduction optimized-JIT-ed DFA based mathing
  615. * - faster -
  616. */
  617. bool Regex::Compile(Regen::Options::CompileFlag olevel) {
  618. if (olevel == Regen::Options::Onone || olevel_ >= olevel) return true;
  619. if (!dfa_failure_ && !dfa_.Complete()) {
  620. /* try create DFA. */
  621. std::size_t limit = state_exprs_.size();
  622. limit = 1000; // default limitation is 1000 (it's may finish within a second).
  623. dfa_failure_ = !dfa_.Construct(limit);
  624. }
  625. if (dfa_failure_) {
  626. /* can not create DFA. (too many states) */
  627. return false;
  628. }
  629. if (!dfa_.Compile(olevel)) {
  630. olevel_ = dfa_.olevel();
  631. } else {
  632. olevel_ = olevel;
  633. }
  634. return olevel_ == olevel;
  635. }
  636. bool Regex::Match(const Regen::StringPiece& string, Regen::StringPiece *result) const {
  637. return dfa_.Match(string, result);
  638. }
  639. /* Thompson-NFA based matching */
  640. bool Regex::NFAMatch(const Regen::StringPiece& string, Regen::StringPiece *result) const
  641. {
  642. typedef std::vector<StateExpr*> NFA;
  643. std::size_t nfa_size = state_exprs_.size();
  644. std::vector<uint32_t> next_states_flag(nfa_size);
  645. uint32_t step = 1;
  646. NFA::iterator iter;
  647. std::set<StateExpr*>::iterator next_iter;
  648. NFA states, next_states;
  649. states.insert(states.begin(), expr_info_.expr_root->transition().first.begin(), expr_info_.expr_root->transition().first.end());
  650. for (const unsigned char *p = string.ubegin(); p < string.uend(); p++, step++) {
  651. for (iter = states.begin(); iter != states.end(); ++iter) {
  652. StateExpr *s = *iter;
  653. if (s->Match(*p)) {
  654. for (next_iter = s->transition().follow.begin();
  655. next_iter != s->transition().follow.end();
  656. ++next_iter) {
  657. if (next_states_flag[(*next_iter)->state_id()] != step) {
  658. next_states_flag[(*next_iter)->state_id()] = step;
  659. next_states.push_back(*next_iter);
  660. }
  661. }
  662. }
  663. }
  664. states.swap(next_states);
  665. if (states.empty()) break;
  666. next_states.clear();
  667. }
  668. bool match = false;
  669. for (iter = states.begin(); iter != states.end(); ++iter) {
  670. if ((*iter)->type() == Expr::kEOP) {
  671. match = true;
  672. break;
  673. }
  674. }
  675. return match;
  676. }
  677. void Regex::PrintRegex() const
  678. {
  679. if (dfa_.Complete()) {
  680. ExprPool p;
  681. ExprInfo i;
  682. CreateRegexFromDFA(dfa_, &i, &p);
  683. PrintRegexVisitor::Print(i.expr_root);
  684. } else {
  685. PrintRegexVisitor::Print(expr_info_.expr_root);
  686. }
  687. }
  688. void Regex::PrintRegex(const DFA &dfa)
  689. {
  690. ExprPool p;
  691. ExprInfo i;
  692. CreateRegexFromDFA(dfa, &i, &p);
  693. PrintRegexVisitor::Print(i.expr_root);
  694. }
  695. void Regex::PrintParseTree() const
  696. {
  697. PrintParseTreeVisitor::Print(expr_info_.expr_root);
  698. }
  699. void Regex::DumpExprTree() const
  700. {
  701. DumpExprVisitor::Dump(expr_info_.expr_root);
  702. }
  703. void Regex::PrintText(Expr::GenOpt opt, std::size_t n) const
  704. {
  705. std::set<std::string> g;
  706. expr_info_.expr_root->Generate(g, opt, n);
  707. for (std::set<std::string>::iterator iter = g.begin(); iter != g.end(); ++iter) {
  708. printf("%s\n", iter->c_str());
  709. }
  710. }
  711. } // namespace regen