/core/regex.d

http://github.com/wilkie/djehuty · D · 1702 lines · 1162 code · 282 blank · 258 comment · 460 complexity · 72824d970fe155d6a80b7f8454cf27e5 MD5 · raw file

  1. /*
  2. * regex.d
  3. *
  4. * This file contains the logic behind a regular expression parser.
  5. *
  6. * Author: Dave Wilkinson
  7. * Originated: May 9th, 2009
  8. * Inspiration: "Albino 2" by Mark Knight
  9. *
  10. */
  11. module core.regex;
  12. import core.string;
  13. import core.definitions;
  14. import synch.thread;
  15. import io.console;
  16. import data.stack;
  17. import data.list;
  18. // These functions give the calling thread access to the values recorded by the
  19. // static Regex.eval: the match position and the text of capture groups 1 through 9.
  20. uint _position() {
  21. // Start index saved for the calling thread by the static Regex.eval, or uint.max when nothing is recorded.
  22. auto caller = Thread.current();
  23. if (!(caller in Regex.regexPos)) { return uint.max; }
  24. return Regex.regexPos[caller];
  25. }
  26. string _1() {
  27. // Text of capture group 1 from the calling thread's latest static Regex.eval, or "" when no groups are recorded.
  28. auto caller = Thread.current();
  29. if (!(caller in Regex.regexRefs)) { return (""); }
  30. return Regex.regexRefs[caller][0];
  31. }
  32. string _2() {
  33. // Text of capture group 2 from the calling thread's latest static Regex.eval, or "" when no groups are recorded.
  34. auto caller = Thread.current();
  35. if (!(caller in Regex.regexRefs)) { return (""); }
  36. return Regex.regexRefs[caller][1];
  37. }
  38. string _3() {
  39. // Text of capture group 3 from the calling thread's latest static Regex.eval, or "" when no groups are recorded.
  40. auto caller = Thread.current();
  41. if (!(caller in Regex.regexRefs)) { return (""); }
  42. return Regex.regexRefs[caller][2];
  43. }
  44. string _4() {
  45. // Text of capture group 4 from the calling thread's latest static Regex.eval, or "" when no groups are recorded.
  46. auto caller = Thread.current();
  47. if (!(caller in Regex.regexRefs)) { return (""); }
  48. return Regex.regexRefs[caller][3];
  49. }
  50. string _5() {
  51. // Text of capture group 5 from the calling thread's latest static Regex.eval, or "" when no groups are recorded.
  52. auto caller = Thread.current();
  53. if (!(caller in Regex.regexRefs)) { return (""); }
  54. return Regex.regexRefs[caller][4];
  55. }
  56. string _6() {
  57. // Text of capture group 6 from the calling thread's latest static Regex.eval, or "" when no groups are recorded.
  58. auto caller = Thread.current();
  59. if (!(caller in Regex.regexRefs)) { return (""); }
  60. return Regex.regexRefs[caller][5];
  61. }
  62. string _7() {
  63. // Text of capture group 7 from the calling thread's latest static Regex.eval, or "" when no groups are recorded.
  64. auto caller = Thread.current();
  65. if (!(caller in Regex.regexRefs)) { return (""); }
  66. return Regex.regexRefs[caller][6];
  67. }
  68. string _8() {
  69. // Text of capture group 8 from the calling thread's latest static Regex.eval, or "" when no groups are recorded.
  70. auto caller = Thread.current();
  71. if (!(caller in Regex.regexRefs)) { return (""); }
  72. return Regex.regexRefs[caller][7];
  73. }
  74. string _9() {
  75. // Text of capture group 9 from the calling thread's latest static Regex.eval, or "" when no groups are recorded.
  76. auto caller = Thread.current();
  77. if (!(caller in Regex.regexRefs)) { return (""); }
  78. return Regex.regexRefs[caller][8];
  79. }
  80. class Regex {
  81. // Description: Creates a Regex instance for the given pattern: the pattern is stored in regularExpression and buildDFA(false) is run right away.
  82. // regex: The regular expression this instance will evaluate.
  83. this(string regex) {
  84. this.regularExpression = regex;
  85. this.buildDFA(false);
  86. }
  87. // Description: Searches str for the first match of regex with a backtracking matcher (single-use evaluations such as this one do not build a DFA). Capture groups 1-9 are stored for the calling thread in regexRefs and the start of a non-empty match in regexPos. The options string may contain 'm' so that '^' also matches right after a '\n' or '\r'; other option characters are ignored.
  88. // str: The string to run the regular expression upon.
  89. // regex: The regular expression to use.
  90. // Returns: The matched substring, "" for an empty match, or null when no match could be found.
  91. static string eval(string str, string regex, string options = "") {
  92. RegexInfo regexInfo;
  93. /*
  94. static RegexInfo[string] oldRuns;
  95. string oldRunIndex = (regex.tostring() ~ "_" ~ options);
  96. if (oldRunIndex in oldRuns) {
  97. regexInfo = oldRuns[oldRunIndex];
  98. }*/
  99. regexInfo.memoizer = new int[][](str.length, regex.length); // one cell per (string position, regex position) pair
  100. int strPos;
  101. int regexPos;
  102. int currentGroupIdx = -1;
  103. int strPosStart;
  104. int regexPosStart;
  105. int regexGroupStart = int.max;
  106. int regexFlagPotential;
  107. int nextUnionPos = -1;
  108. int currentUnionPos = -1;
  109. int currentClassStart;
  110. int groupCount;
  111. int flags;
  112. const int PLUS_OK = 1;
  113. const int LAZY_KLEENE = 2;
  114. const int KLEENE_MATCHED = 4;
  115. bool multiline;
  116. foreach(chr; options) {
  117. switch(chr) {
  118. case 'm':
  119. multiline = true;
  120. break;
  121. default:
  122. break;
  123. }
  124. }
  125. // This is a stack of the groupings currently in context.
  126. Stack!(int) groupStart = new Stack!(int)();
  127. Stack!(int) stack = new Stack!(int)();
  128. // Running flags
  129. bool running = true;
  130. bool matchMade = true;
  131. bool backtrack = false;
  132. bool noMatch = false;
  133. bool matchClass = false;
  134. bool matchInverse = false;
  135. bool matchRange = false;
  136. bool noMatchClass = false;
  137. bool backtrackedOnCaret = false;
  138. regexRefs[Thread.current()] = new string [](9); // fresh capture slots for groups 1-9 of the calling thread
  139. // Suppresses group matching until a position is reached.
  140. int noMatchUntilClosedAtPos = -1;
  141. int noMatchUntilUnionForPos = -1;
  142. // Saves a backtracking point: the regex and string positions to resume from plus the current group bookkeeping (popped in reverse order when backtracking).
  143. void setBacktrack(int newRegexPos, int newStrPos) {
  144. stack.push(newRegexPos);
  145. stack.push(newStrPos);
  146. stack.push(regexGroupStart);
  147. stack.push(currentGroupIdx);
  148. stack.push(regexFlagPotential);
  149. }
  150. // This function finds the regex position that will undo the last move.
  151. int findBackupRegexPosition() {
  152. int ret = regexPos - 1;
  153. if (ret in regexInfo.groupInfo) {
  154. ret = regexInfo.groupInfo[ret].startPos;
  155. }
  156. else if (ret < regex.length && regex[ret] == ']' && ret in regexInfo.operatorFlag) {
  157. ret = regexInfo.operatorFlag[ret];
  158. }
  159. else {
  160. if (ret > 0 && regex[ret-1] == '\\') {
  161. ret--;
  162. }
  163. }
  164. return ret;
  165. }
  166. // Like above, but for the working position.
  167. int findBackupPosition() {
  168. if (regexPos-1 in regexInfo.groupInfo) {
  169. return regexInfo.groupInfo[regexInfo.groupInfo[regexPos-1].startPos].strStartPos;
  170. }
  171. else {
  172. return strPos-1;
  173. }
  174. }
  175. // Set a backtrack that will return to the front of both strings.
  176. setBacktrack(0,0);
  177. // Alright, main loop! This won't be broken until either a match is
  178. // found or nothing can be found.
  179. while(running) {
  180. // This is the mechanics for the memoizer. If a valid match has
  181. // been made and the regex positions are valid, set this position
  182. // pair in the memoizer indicating we've done this work before.
  183. if (strPos < str.length && regexPos < regex.length && matchMade && !noMatch) {
  184. if (regexInfo.memoizer[strPos][regexPos] == 1) {
  185. // we have been here before
  186. backtrack = true;
  187. }
  188. else {
  189. regexInfo.memoizer[strPos][regexPos] = 1;
  190. }
  191. }
  192. // If we are meant to backtrack this turn, this code path is taken.
  193. if (backtrack) {
  194. // steps are saved after successful matches
  195. // therefore the matchMade flag is always set
  196. matchMade = true;
  197. int oldRegexPos = regexPos;
  198. regexFlagPotential = stack.pop();
  199. currentGroupIdx = stack.pop();
  200. regexGroupStart = stack.pop();
  201. strPos = stack.pop();
  202. regexPos = stack.pop();
  203. if (regexPos == 0) {
  204. // We have gone back to the beginning...
  205. // we could attempt to find a union
  206. noMatch = true;
  207. noMatchUntilClosedAtPos = -1;
  208. noMatchUntilUnionForPos = -1;
  209. regexPos = oldRegexPos;
  210. }
  211. // OMG; Do not want to backtrack twice!
  212. backtrack = false;
  213. }
  214. if (regexPos >= regex.length) {
  215. // The regex has been consumed.
  216. if (noMatch) {
  217. if (noMatchUntilClosedAtPos == -1) {
  218. // No union, so just start the regex at the next character in the string.
  219. // UNLESS the backtrack happened at a 'caret' character in the regex
  220. if (backtrackedOnCaret) {
  221. matchMade = false;
  222. running = false;
  223. break;
  224. }
  225. strPosStart++;
  226. strPos = strPosStart;
  227. if (strPosStart >= str.length) {
  228. // bad
  229. matchMade = false;
  230. running = false;
  231. continue;
  232. }
  233. // start from a good state
  234. matchMade = true;
  235. // turn off find mode
  236. noMatch = false;
  237. regexPos = 0;
  238. // Set the backtrack to point to the start of the regex
  239. // with the new working position.
  240. setBacktrack(0, strPos);
  241. }
  242. else {
  243. // bad
  244. matchMade = false;
  245. running = false;
  246. }
  247. }
  248. else if (matchMade) {
  249. // good
  250. running = false;
  251. break;
  252. }
  253. else {
  254. // backtrack
  255. //regexPos = findBackupRegexPosition();
  256. //strPos = findBackupPosition();
  257. backtrack = true;
  258. }
  259. continue;
  260. }
  261. else if (noMatch && regex[regexPos] == '\\') {
  262. regexPos+=2;
  263. continue;
  264. }
  265. else if (noMatch && noMatchClass) {
  266. if (regex[regexPos] == ']') {
  267. noMatchClass = false;
  268. }
  269. regexPos++;
  270. }
  271. else if (noMatch && regex[regexPos] == '[') {
  272. // ignore!
  273. noMatchClass = true;
  274. continue;
  275. }
  276. else if (regex[regexPos] == '|') {
  277. // A union operator.
  278. if (currentGroupIdx >= 0) {
  279. if (regexInfo.groupInfo[currentGroupIdx].unionPos >= 0) {
  280. // the current group already has at least one union
  281. // use the current unionPos to append to the list
  282. if (!(currentUnionPos in regexInfo.operatorFlag) && regexPos > currentUnionPos) {
  283. regexInfo.operatorFlag[currentUnionPos] = regexPos;
  284. }
  285. }
  286. else {
  287. // this is the first union of the current group
  288. regexInfo.groupInfo[currentGroupIdx].unionPos = regexPos;
  289. }
  290. if (noMatch && noMatchUntilUnionForPos != -1 && currentGroupIdx == noMatchUntilClosedAtPos) {
  291. // turn off find mode
  292. noMatch = false;
  293. // start from a good state
  294. matchMade = true;
  295. }
  296. else if (matchMade && !noMatch) {
  297. // do not take this union
  298. // declare this group as good
  299. // but set a backtrack just in case
  300. // this will start the regular expression search from the next regex
  301. // point, but undoing the actions of the group thus far
  302. setBacktrack(regexPos+1, regexInfo.groupInfo[currentGroupIdx].strStartPos);
  303. if (regexInfo.groupInfo[currentGroupIdx].endPos >= 0) {
  304. regexPos = regexInfo.groupInfo[currentGroupIdx].endPos-1;
  305. }
  306. else {
  307. noMatch = true;
  308. noMatchUntilClosedAtPos = currentGroupIdx;
  309. noMatchUntilUnionForPos = -1;
  310. }
  311. }
  312. else if (!noMatch) {
  313. // undo actions
  314. strPos = regexInfo.groupInfo[currentGroupIdx].strStartPos;
  315. noMatch = false;
  316. matchMade = true;
  317. }
  318. }
  319. else {
  320. // union operator is in the main regex (top level)
  321. // If we are searching for a union to continue a failed search
  322. // We will enter the next code path. We have found a top level
  323. // union operator.
  324. if (noMatch && noMatchUntilClosedAtPos == -1 && noMatchUntilUnionForPos == -1) {
  325. // Set the backtrack to point to the start of the regex
  326. // with the new working position.
  327. setBacktrack(0, strPos);
  328. // turn off find mode
  329. noMatch = false;
  330. // start from a good state
  331. matchMade = true;
  332. }
  333. else if (noMatch && noMatchUntilUnionForPos != -1) {
  334. // turn off find mode
  335. noMatch = false;
  336. // start from a good state
  337. matchMade = true;
  338. }
  339. else if (matchMade) {
  340. // accept the regular expression
  341. running = false;
  342. break;
  343. }
  344. else {
  345. // we start anew, but at this regular expression
  346. strPos = strPosStart;
  347. }
  348. }
  349. currentUnionPos = regexPos;
  350. regexPos++;
  351. }
  352. else if (regex[regexPos] == '(' && (matchMade || noMatch) ) {
  353. // The start of a grouping.
  354. bool isNew;
  355. if (!(regexPos in regexInfo.groupInfo)) {
  356. GroupInfo newGroup;
  357. newGroup.startPos = regexPos;
  358. newGroup.endPos = -1;
  359. newGroup.strPos = strPos;
  360. newGroup.strStartPos = strPos;
  361. newGroup.parent = currentGroupIdx;
  362. newGroup.unionPos = -1;
  363. // This assumes that all groups will be visited
  364. // in order from left to right.
  365. newGroup.groupId = groupCount;
  366. groupCount++;
  367. regexInfo.groupInfo[regexPos] = newGroup;
  368. isNew = true;
  369. }
  370. regexInfo.groupInfo[regexPos].strStartPos = strPos;
  371. regexInfo.groupInfo[regexPos].strPos = strPos;
  372. currentGroupIdx = regexPos;
  373. regexPos++;
  374. if (regexPos < regex.length - 1 && regex[regexPos] == '?') {
  375. switch(regex[regexPos+1]) {
  376. case '#':
  377. // comments
  378. if (regexInfo.groupInfo[currentGroupIdx].endPos > 0) {
  379. regexPos = regexInfo.groupInfo[currentGroupIdx].endPos;
  380. }
  381. else {
  382. // find the end of the group, ignoring everything
  383. while(regexPos < regex.length && regex[regexPos] != ')') {
  384. regexPos++;
  385. }
  386. // save the result
  387. regexInfo.groupInfo[currentGroupIdx].endPos = regexPos;
  388. }
  389. break;
  390. case '>':
  391. // atomic grouping
  392. break;
  393. case ':':
  394. // non-capturing
  395. if (isNew) {
  396. regexInfo.groupInfo[currentGroupIdx].groupId = int.max;
  397. groupCount--;
  398. }
  399. regexPos+=2;
  400. break;
  401. case '=':
  402. // zero-width positive lookahead
  403. break;
  404. case '!':
  405. // zero-width negative lookahead
  406. break;
  407. case '<':
  408. // zero-width lookbehind
  409. if (regexPos < regex.length - 3) {
  410. if (regex[regexPos+3] == '=') {
  411. // positive
  412. }
  413. else if (regex[regexPos+3] == '!') {
  414. // negative
  415. }
  416. }
  417. regexPos+=2;
  418. break;
  419. default:
  420. break;
  421. }
  422. }
  423. }
  424. else if (regex[regexPos] == ')') {
  425. // A group is ending.
  426. if (!(regexPos in regexInfo.groupInfo)) {
  427. regexInfo.groupInfo[currentGroupIdx].endPos = regexPos;
  428. regexInfo.groupInfo[regexPos] = regexInfo.groupInfo[currentGroupIdx];
  429. if (currentGroupIdx == noMatchUntilClosedAtPos) {
  430. noMatch = false;
  431. }
  432. }
  433. if (noMatch && noMatchUntilClosedAtPos == currentGroupIdx) {
  434. noMatch = false;
  435. }
  436. if (matchMade || noMatch) {
  437. regexInfo.groupInfo[regexInfo.groupInfo[regexPos].startPos].strPos = strPos;
  438. regexGroupStart = regexInfo.groupInfo[regexInfo.groupInfo[regexPos].startPos].groupId;
  439. // set consumption string
  440. if (!noMatch) {
  441. if (regexGroupStart < 9) {
  442. string consumed = (str[regexInfo.groupInfo[regexInfo.groupInfo[regexPos].startPos].strStartPos..strPos]);
  443. regexRefs[Thread.current()][regexGroupStart] = consumed;
  444. regexGroupStart++;
  445. }
  446. }
  447. }
  448. else {
  449. // if we can backtrack to make another decision in this group, do so
  450. // that would effectively undo moves that this group had made
  451. strPos = regexInfo.groupInfo[regexInfo.groupInfo[regexPos].startPos].strPos;
  452. //backtrack = true;
  453. }
  454. currentGroupIdx = regexInfo.groupInfo[regexPos].parent;
  455. regexPos++;
  456. }
  457. else if (noMatch) {
  458. regexPos++;
  459. }
  460. else if (regex[regexPos] == '*') {
  461. // Kleene star operator.
  462. if (regexPos < regex.length - 1 && regex[regexPos+1] == '?') {
  463. // this is a lazy kleene
  464. // it may have matched something, but it should ignore the work
  465. // for now that it had done and save it as part of the lazy operator
  466. if (matchMade) {
  467. // set backtrack to do another computation
  468. setBacktrack(findBackupRegexPosition(), strPos);
  469. //if (!(regexPos in regexInfo.operatorFlag)) {
  470. if (regexFlagPotential < regexPos) {
  471. // we have made a match, but have not attempted
  472. // to try not matching anything first
  473. // set the flag so that this operator knows that it has
  474. // already found a match
  475. regexInfo.operatorFlag[regexPos] = strPos;
  476. regexFlagPotential = regexPos;
  477. // set backtrack to start where this one would have
  478. // continued to
  479. setBacktrack(regexPos+2, strPos);
  480. // and then start all over by assuming nothing is taken
  481. strPos = findBackupPosition();
  482. regexPos+=2;
  483. }
  484. else {
  485. // we have already found a match
  486. // just continue on our way
  487. regexPos+=2;
  488. }
  489. }
  490. else {
  491. // the group fails, it is ok
  492. matchMade = true;
  493. regexPos+=2;
  494. }
  495. }
  496. else if (matchMade) {
  497. // this is a greedy kleene
  498. // the backtrack will suggest to just go to the next regex
  499. // character at this same string. this computation path,
  500. // however, will be attempting to match the previous group
  501. // as much as possible
  502. // we need to set a backtrack for having not matched anything even though
  503. // something was just matched. It could be that what we matched belongs to
  504. // another section of the regex.
  505. if (!(regexPos in regexInfo.operatorFlag) || regexFlagPotential < regexPos) {
  506. // set a backtrack for having nothing found
  507. setBacktrack(regexPos+1,findBackupPosition());
  508. }
  509. regexInfo.operatorFlag[regexPos] = 1;
  510. setBacktrack(regexPos+1, strPos);
  511. regexPos--;
  512. if (regexPos in regexInfo.groupInfo) {
  513. regexPos = regexInfo.groupInfo[regexPos].startPos;
  514. currentGroupIdx = regexPos;
  515. }
  516. else if (regexPos < regex.length && regex[regexPos] == ']' && regexPos in regexInfo.operatorFlag) {
  517. regexPos = regexInfo.operatorFlag[regexPos];
  518. }
  519. else {
  520. if (regexPos > 0 && regex[regexPos-1] == '\\') {
  521. regexPos--;
  522. }
  523. }
  524. }
  525. else {
  526. // it is ok
  527. matchMade = true;
  528. regexPos++;
  529. }
  530. }
  531. else if (regex[regexPos] == '+') {
  532. // Kleene plus operator.
  533. if (regexPos < regex.length - 1 && regex[regexPos+1] == '?') {
  534. // this is a lazy kleene
  535. if (matchMade) {
  536. // good, continue and set a backtrack to attempt another
  537. // match on this kleene
  538. // set the flag so that this operator knows that it has
  539. // already found a match
  540. regexInfo.operatorFlag[regexPos] = 1;
  541. regexFlagPotential = regexPos;
  542. // set the backtrace
  543. int newRegexPos = regexPos+2;
  544. regexPos--;
  545. if (regexPos in regexInfo.groupInfo) {
  546. regexPos = regexInfo.groupInfo[regexPos].startPos;
  547. currentGroupIdx = regexPos;
  548. }
  549. else if (regexPos < regex.length && regex[regexPos] == ']' && regexPos in regexInfo.operatorFlag) {
  550. regexPos = regexInfo.operatorFlag[regexPos];
  551. }
  552. else {
  553. if (regexPos > 0 && regex[regexPos-1] == '\\') {
  554. regexPos--;
  555. }
  556. }
  557. setBacktrack(regexPos, strPos);
  558. regexPos = newRegexPos;
  559. }
  560. else {
  561. if (regexFlagPotential < regexPos) {
  562. // we have not found any matches at all
  563. // fail the op
  564. //regexPos = findBackupRegexPosition();
  565. //strPos = findBackupPosition();
  566. backtrack = true;
  567. continue;
  568. }
  569. else {
  570. // it is ok, we found at least one
  571. matchMade = true;
  572. regexPos+=2;
  573. }
  574. }
  575. }
  576. else if (matchMade) {
  577. // this is a greedy kleene
  578. // the backtrack will suggest to just go to the next regex
  579. // character at this same string. this computation path,
  580. // however, will be attempting to match the previous group
  581. // as much as possible
  582. setBacktrack(regexPos+1, strPos);
  583. // set the flag so that this operator knows that it has
  584. // already found a match
  585. regexInfo.operatorFlag[regexPos] = 1;
  586. regexFlagPotential = regexPos;
  587. regexPos--;
  588. if (regexPos in regexInfo.groupInfo) {
  589. regexPos = regexInfo.groupInfo[regexPos].startPos;
  590. currentGroupIdx = regexPos;
  591. }
  592. else if (regexPos < regex.length && regex[regexPos] == ']' && regexPos in regexInfo.operatorFlag) {
  593. regexPos = regexInfo.operatorFlag[regexPos];
  594. }
  595. else {
  596. if (regexPos > 0 && regex[regexPos-1] == '\\') {
  597. regexPos--;
  598. }
  599. }
  600. }
  601. else {
  602. // it is ok
  603. if (regexPos in regexInfo.operatorFlag && regexFlagPotential >= regexPos) {
  604. // good
  605. matchMade = true;
  606. regexPos++;
  607. }
  608. else {
  609. // fail the op
  610. //regexPos = findBackupRegexPosition();
  611. //strPos = findBackupPosition();
  612. backtrack = true;
  613. continue;
  614. }
  615. }
  616. }
  617. else if (regex[regexPos] == '?') {
  618. // option
  619. regexPos++;
  620. if (regexPos < regex.length && regex[regexPos] == '?') {
  621. // lazy option
  622. regexPos++;
  623. if (matchMade) {
  624. // unfortunately, this work that has been done
  625. // has been done in vain. We want to attempt to
  626. // not consume this option.
  627. // set the backtrack to backtrack to the current
  628. // situation (taking the option)
  629. setBacktrack(regexPos, strPos);
  630. // now, attempt to carry on to the next part of
  631. // the regex while undoing the last group
  632. strPos = findBackupPosition();
  633. }
  634. else {
  635. // very good, only one possible outcome: no match
  636. matchMade = true;
  637. }
  638. }
  639. else if (matchMade) {
  640. // greedy option
  641. // backtrack to not taking the option
  642. setBacktrack(regexPos, findBackupPosition());
  643. }
  644. else {
  645. // greedy option
  646. matchMade = true;
  647. }
  648. }
  649. else if (!matchMade) {
  650. // the group fails if a concatenation fails
  651. if (currentGroupIdx >= 0) {
  652. int curUnionPos = -1;
  653. if (regexInfo.groupInfo[currentGroupIdx].unionPos >= 0) {
  654. curUnionPos = regexInfo.groupInfo[currentGroupIdx].unionPos;
  655. while(curUnionPos < regexPos && curUnionPos in regexInfo.operatorFlag) {
  656. curUnionPos = regexInfo.operatorFlag[curUnionPos];
  657. }
  658. if (curUnionPos < regexPos) {
  659. curUnionPos = -1;
  660. }
  661. }
  662. strPos = regexInfo.groupInfo[currentGroupIdx].strStartPos;
  663. if (curUnionPos >= 0) {
  664. regexPos = curUnionPos;
  665. }
  666. else if (regexInfo.groupInfo[currentGroupIdx].endPos >= 0) {
  667. regexPos = regexInfo.groupInfo[currentGroupIdx].endPos;
  668. }
  669. else {
  670. // need to find either a union for this group
  671. // or the group end
  672. noMatch = true;
  673. noMatchUntilClosedAtPos = currentGroupIdx;
  674. noMatchUntilUnionForPos = currentGroupIdx;
  675. }
  676. }
  677. else {
  678. backtrack = true;
  679. continue;
  680. }
  681. }
  682. else if (regex[regexPos] == '$') {
  683. // dollar anchor
  684. if (strPos == str.length || str[strPos] == '\n' || str[strPos] == '\r') {
  685. matchMade = true;
  686. }
  687. else {
  688. //regexPos = findBackupRegexPosition();
  689. //strPos = findBackupPosition();
  690. backtrack = true;
  691. continue;
  692. }
  693. regexPos++;
  694. }
  695. else if (regex[regexPos] == '^') {
  696. // caret anchor
  697. if (multiline) {
  698. if (strPos == 0 || str[strPos-1] == '\n' || str[strPos-1] == '\r') {
  699. matchMade = true;
  700. }
  701. else {
  702. // Multiline option:
  703. backtrack = true;
  704. continue;
  705. }
  706. }
  707. else {
  708. if (strPos == 0) {
  709. matchMade = true;
  710. }
  711. else {
  712. backtrackedOnCaret = true;
  713. // Nonmultiline option:
  714. backtrack = true;
  715. continue;
  716. }
  717. }
  718. regexPos++;
  719. }
  720. else if (((regexPos + 1) < regex.length) && (regex[regexPos] == '\\') && (regex[regexPos+1] == 'b')) {
  721. // word boundary anchor
  722. // Check for boundary
  723. if (strPos == 0) {
  724. // Anchored to the beginning of the string
  725. // The first character should be a word character (an empty string has none, so str[0] must not be read)
  726. if ( str.length > 0 && ((str[strPos] >= 'a' && str[strPos] <= 'z') ||
  727. (str[strPos] >= 'A' && str[strPos] <= 'Z') ||
  728. (str[strPos] >= '0' && str[strPos] <= '9') ||
  729. (str[strPos] == '_'))) {
  730. matchMade = true;
  731. }
  732. else {
  733. backtrack = true;
  734. continue;
  735. }
  736. }
  737. else if ((strPos == str.length) && (str.length > 0)) {
  738. // Anchored at end of string
  739. matchMade = true;
  740. // The last character should be a word character
  741. if ( (str[strPos-1] >= 'a' && str[strPos-1] <= 'z') ||
  742. (str[strPos-1] >= 'A' && str[strPos-1] <= 'Z') ||
  743. (str[strPos-1] >= '0' && str[strPos-1] <= '9') ||
  744. (str[strPos-1] == '_')) {
  745. matchMade = true;
  746. }
  747. else {
  748. backtrack = true;
  749. continue;
  750. }
  751. }
  752. else {
  753. // It is between two characters
  754. // One or the other (exclusive) should be a word character
  755. bool firstWordCharacter = false;
  756. if ( (str[strPos-1] >= 'a' && str[strPos-1] <= 'z') ||
  757. (str[strPos-1] >= 'A' && str[strPos-1] <= 'Z') ||
  758. (str[strPos-1] >= '0' && str[strPos-1] <= '9') ||
  759. (str[strPos-1] == '_')) {
  760. firstWordCharacter = true;
  761. }
  762. if ( (str[strPos] >= 'a' && str[strPos] <= 'z') ||
  763. (str[strPos] >= 'A' && str[strPos] <= 'Z') ||
  764. (str[strPos] >= '0' && str[strPos] <= '9') ||
  765. (str[strPos] == '_')) {
  766. if (!firstWordCharacter) {
  767. matchMade = true;
  768. }
  769. else {
  770. backtrack = true;
  771. continue;
  772. }
  773. }
  774. else if (firstWordCharacter) {
  775. matchMade = true;
  776. }
  777. else {
  778. backtrack = true;
  779. continue;
  780. }
  781. }
  782. regexPos+=2;
  783. }
  784. else {
  785. // concatenation
  786. if (regex[regexPos] == '[') {
  787. currentClassStart = regexPos;
  788. matchClass = true;
  789. regexPos++;
  790. if (regexPos < regex.length && regex[regexPos] == '^') {
  791. matchInverse = true;
  792. regexPos++;
  793. }
  794. else {
  795. matchInverse = false;
  796. }
  797. // cancel when we run out of space
  798. if (regexPos == regex.length) {
  799. continue;
  800. }
  801. }
  802. do {
  803. if (matchClass && regex[regexPos] == ']') {
  804. regexInfo.operatorFlag[currentClassStart] = regexPos;
  805. regexInfo.operatorFlag[regexPos] = currentClassStart;
  806. if (matchInverse && !matchMade) {
  807. matchMade = true;
  808. matchInverse = false;
  809. }
  810. matchClass = false;
  811. }
  812. else if (matchClass && regexPos < regex.length - 1 && regex[regexPos+1] == '-') {
  813. // character class range, use the last character
  814. // and build a range of possible values
  815. matchRange = true;
  816. regexPos+=2;
  817. continue;
  818. }
  819. else if (matchRange) {
  820. matchMade = strPos < str.length && str[strPos] >= regex[regexPos-2] && str[strPos] <= regex[regexPos];
  821. // no more ranges!
  822. matchRange = false;
  823. }
  824. else if (regex[regexPos] == '\\' && regexPos < regex.length-1) {
  825. regexPos++;
  826. if (strPos >= str.length) {
  827. matchMade = false;
  828. }
  829. else {
  830. switch(regex[regexPos]) {
  831. case '1':
  832. case '2':
  833. case '3':
  834. case '4':
  835. case '5':
  836. case '6':
  837. case '7':
  838. case '8':
  839. case '9':
  840. int refIndex = cast(uint)regex[regexPos] - cast(uint)'1';
  841. // forward and backward references
  842. if (Thread.current() in regexRefs) {
  843. if (regexRefs[Thread.current()][refIndex] !is null) {
  844. matchMade = true;
  845. foreach(int i, chr; regexRefs[Thread.current()][refIndex]) {
  846. if (strPos >= str.length) {
  847. matchMade = false;
  848. break;
  849. }
  850. if (str[strPos] != chr) {
  851. matchMade = false;
  852. break;
  853. }
  854. strPos++;
  855. }
  856. if (matchMade) {
  857. strPos--;
  858. }
  859. }
  860. else {
  861. matchMade = false;
  862. }
  863. }
  864. else {
  865. matchMade = false;
  866. }
  867. break;
  868. case 'd':
  869. matchMade = (str[strPos] >= '0' && str[strPos] <= '9');
  870. break;
  871. case 'D':
  872. matchMade = !(str[strPos] >= '0' && str[strPos] <= '9');
  873. break;
  874. case 's':
  875. matchMade = (str[strPos] == ' '
  876. || str[strPos] == '\t'
  877. || str[strPos] == '\r'
  878. || str[strPos] == '\n'
  879. || str[strPos] == '\v'
  880. || str[strPos] == '\f');
  881. break;
  882. case 'S':
  883. matchMade = (str[strPos] != ' '
  884. && str[strPos] != '\t'
  885. && str[strPos] != '\r'
  886. && str[strPos] != '\n'
  887. && str[strPos] != '\v'
  888. && str[strPos] != '\f');
  889. break;
  890. case 'w': // word character: underscore, letter or digit (same set the word boundary anchor uses)
  891. matchMade = (str[strPos] == '_'
  892. || (str[strPos] >= 'a' && str[strPos] <= 'z')
  893. || (str[strPos] >= 'A' && str[strPos] <= 'Z') || (str[strPos] >= '0' && str[strPos] <= '9'));
  894. break;
  895. case 'W': // anything that is not a word character
  896. matchMade = (str[strPos] != '_'
  897. && (str[strPos] < 'a' || str[strPos] > 'z')
  898. && (str[strPos] < 'A' || str[strPos] > 'Z') && (str[strPos] < '0' || str[strPos] > '9'));
  899. break;
  900. case 'b':
  901. // backspace
  902. matchMade = str[strPos] == '\b';
  903. break;
  904. case 'n':
  905. // newline
  906. matchMade = str[strPos] == '\n';
  907. break;
  908. case 'e':
  909. // escape
  910. matchMade = str[strPos] == '\x1b';
  911. break;
  912. case 'v':
  913. matchMade = str[strPos] == '\v';
  914. break;
  915. case 't':
  916. matchMade = str[strPos] == '\t';
  917. break;
  918. case 'r':
  919. matchMade = str[strPos] == '\r';
  920. break;
  921. case 'a':
  922. matchMade = str[strPos] == '\a';
  923. break;
  924. case '0':
  925. matchMade = str[strPos] == '\0';
  926. break;
  927. case '\0':
  928. matchMade = str[strPos] == '\0';
  929. break;
  930. default:
  931. matchMade = str[strPos] == regex[regexPos];
  932. break;
  933. }
  934. }
  935. }
  936. else if (regexPos < regex.length && strPos < str.length
  937. && ((str[strPos] == regex[regexPos])
  938. || (!matchClass && regex[regexPos] == '.'
  939. && str[strPos] != '\n' && str[strPos] != '\r'))) {
  940. // match made
  941. matchMade = true;
  942. }
  943. else {
  944. // no match made
  945. matchMade = false;
  946. }
  947. if ((matchMade && matchInverse) || (matchInverse && strPos >= str.length)) {
  948. matchMade = false;
  949. break;
  950. }
  951. if (matchClass && !matchMade && regexPos < regex.length) {
  952. regexPos++;
  953. continue;
  954. }
  955. break;
  956. } while (true);
  957. matchRange = false;
  958. matchInverse = false;
  959. if (matchClass) {
  960. matchClass = false;
  961. if (currentClassStart in regexInfo.operatorFlag) {
  962. regexPos = regexInfo.operatorFlag[currentClassStart];
  963. }
  964. else {
  965. // dang, need to search for it
  966. regexPos++;
  967. for(;regexPos < regex.length && regex[regexPos] != ']'; regexPos++) {
  968. if (regex[regexPos] == '\\') { regexPos++; }
  969. }
  970. if (regexPos >= regex.length) { continue; }
  971. regexInfo.operatorFlag[currentClassStart] = regexPos;
  972. regexInfo.operatorFlag[regexPos] = currentClassStart;
  973. }
  974. }
  975. if (matchMade) {
  976. // consume input string
  977. strPos++;
  978. }
  979. // consume
  980. regexPos++;
  981. }
  982. }
  983. // Null out any outstanding groups
  984. if (Thread.current() in regexRefs) {
  985. for( ; regexGroupStart < 9 ; regexGroupStart++ ) {
  986. // regexRefs[Thread.current()][regexGroupStart]
  987. // = null;
  988. }
  989. }
  990. /*
  991. if (!(oldRunIndex in oldRuns)) {
  992. oldRuns[oldRunIndex] = regexInfo;
  993. }*/
  994. // Return the result
  995. if (matchMade && strPosStart <= str.length) {
  996. if (strPos-strPosStart == 0) {
  997. return ("");
  998. }
  999. // Save the position where the string was consumed (the static table is named through the class because the local regexPos shadows it)
  1000. Regex.regexPos[Thread.current()] = strPosStart;
  1001. // Slice and return the consumed string
  1002. return str.substring(strPosStart, strPos-strPosStart);
  1003. }
  1004. return null;
  1005. }
  1006. // Description: This function will return a matched regular expression on the given string. Instances of a Regex will use a DFA based approach: the state machine is walked from startingState, the longest accepted run from the current start position is preferred, and the start position slides forward by one character whenever a run dies without an accept.
  1007. // str: The string to run the regular expression upon.
  1008. // Returns: The matched substring or null when no match could be found.
  1009. string eval(string str) {
  1010. State currentState = startingState;
  1011. uint strPos;
  1012. uint startingStrPos;
  1013. State acceptState;
  1014. uint acceptStrEnd;
  1015. dchar chr;
  1016. for (strPos = startingStrPos; strPos <= str.length; strPos++) {
  1017. // Reaching str.length is handled like a missing transition, so a run cut short by the end of the input can still fall back to an earlier accept or slide to the next start position.
  1018. if (strPos < str.length) { chr = str[strPos]; }
  1019. // Console.putln("chr ... ", str[strPos]);
  1020. if (strPos < str.length && chr in currentState.transitions) {
  1021. // Take transition
  1022. //Console.putln("taking transition ", chr, " from ", currentState.id, " to ", currentState.transitions[chr].id);
  1023. currentState = currentState.transitions[chr];
  1024. if (currentState.accept) {
  1025. //Console.putln("found accept at ", strPos, " from ", startingStrPos);
  1026. acceptStrEnd = strPos + 1;
  1027. acceptState = currentState;
  1028. }
  1029. }
  1030. else {
  1031. // No transition
  1032. if (acceptStrEnd > startingStrPos) {
  1033. // Fall back to the last accept seen on this run.
  1034. strPos = acceptStrEnd;
  1035. currentState = acceptState;
  1036. }
  1037. // Is this an accept state?
  1038. if (currentState.accept) {
  1039. break;
  1040. }
  1041. // Start over
  1042. if (startingStrPos >= str.length) {
  1043. // No more to search
  1044. return null;
  1045. }
  1046. // Next turn, strPos will be startingStrPos + 1
  1047. // (because of loop iteration)
  1048. strPos = startingStrPos;
  1049. // We are sliding down the string by one character
  1050. startingStrPos++;
  1051. // We go back to the beginning
  1052. currentState = startingState;
  1053. }
  1054. }
  1055. if (acceptStrEnd > startingStrPos) {
  1056. // Use the last accept seen on this run as the end of the match.
  1057. strPos = acceptStrEnd;
  1058. currentState = acceptState;
  1059. }
  1060. // Return consumed string
  1061. if (currentState.accept) {
  1062. return str.substring(startingStrPos, strPos - startingStrPos);
  1063. }
  1064. // No match
  1065. return null;
  1066. }
  1067. protected:
  1068. // These instance variables contain the data structures
  1069. // that will build and maintain the DFA for the regular expression
  1070. // Holds the regular expression for the instance
  1071. string regularExpression;
  1072. // For DFA regex operations
  // Describes one edge that arrives at a State: the state the edge
  // leaves from and the character that labels it. Instances are stored
  // in State.incomingList.
  class Link {
    // The state the edge originates from.
    State from;

    // The character labelling the edge.
    dchar transition;
  }
  // A single node of the DFA built for a Regex instance.
  static class State {
    // Outgoing edges, keyed by the input character.
    State[dchar] transitions;

    // List!(Group) groupStarts;
    // List!(Group) groupEnds;

    // Characters whose outgoing edge has been recorded as a loop or
    // backward edge by the DFA builder.
    List!(dchar) backwardList;

    // Edges recorded by the builder as arriving at this state.
    List!(Link) incomingList;

    // Whether reaching this state constitutes a match.
    bool accept;

    // Identifier; defaults to the number of states created so far.
    int id;

    this() {
      this(count);
    }

    this(int id) {
      this.id = id;
      this.backwardList = new List!(dchar);
      this.incomingList = new List!(Link);
      debugThis();
    }

    // Debugging block

    // Number of states constructed so far.
    static int count = 0;

    // Every state constructed so far, in creation order.
    static List!(State) all;

    static this() {
      all = new List!(State);
    }

    // Counts this state and registers it in the global list.
    void debugThis() {
      count++;
      all.add(this);
    }

    // Returns: A one-line description of the state: its id, an "A"
    // marker when accepting, and every outgoing edge. Edges to a state
    // whose id is not greater than this one are drawn "<>", all other
    // edges "->".
    string tostring() {
      string ret = "State " ~ toStr(id) ~ ": [";
      ret ~= accept ? "A] " : " ] ";

      foreach(key; transitions.keys) {
        State target = transitions[key];
        string arrow = (target.id <= id) ? "<>" : "->";
        ret ~= toStr(key) ~ arrow ~ toStr(target.id) ~ " ";
      }

      return ret;
    }

    // Prints every state created so far to the console.
    static void printall() {
      foreach(entry; all) {
        Console.putln(entry);
      }
    }
  }
  1128. State startingState;
  // Description: Builds the DFA for this instance's regular expression and stores its entry state in startingState.
  // useDFARules: Stored in _DFARules before the build starts.
  void buildDFA(bool useDFARules = false) {
    this._DFARules = useDFARules;

    // Walk the regular expression and construct the automaton.
    this.startingState = this.buildDFA(this.regularExpression);
  }
  1134. private:
  1135. bool _DFARules;
  // Per-group bookkeeping filled in by fillGroupInfo. Entries are stored
  // in _groupInfo keyed by the index of the group's '(' in the regular
  // expression.
  struct DFAGroupInfo {
    // True when the group's ')' is immediately followed by '*'.
    bool hasKleene;

    // Index of the group's closing ')' in the regular expression.
    int endPos;
  }
  1140. DFAGroupInfo[int] _groupInfo;
  // Description: Scans regularExpression once and rebuilds _groupInfo. Every '(' gets an entry keyed by its index; when the matching ')' is found the entry's endPos is set and hasKleene is raised if a '*' follows directly. A backslash skips the character after it, and a '\0' ends the scan.
  void fillGroupInfo() {
    _groupInfo = null;

    dchar ch;

    // Indices of the '(' characters that have not been closed yet.
    List!(int) groupStack = new List!(int);

    for (uint i; i < regularExpression.length; i++) {
      // Trace output is only compiled in with -debug.
      debug Console.putln("foo ", i);

      ch = regularExpression[i];
      switch (ch) {
        case '\0':
          return;

        case '\\':
          // Skip the escaped character (the loop increment skips
          // the backslash itself).
          i++;
          continue;

        case '(':
          groupStack.add(cast(int)i);
          DFAGroupInfo dgi;
          _groupInfo[cast(int)i] = dgi;
          break;

        case ')':
          if (groupStack.empty()) {
            // Unmatched ')': there is no open group to close, so do
            // not pop from the empty stack.
            break;
          }

          int startPos = groupStack.remove();
          if (startPos in _groupInfo) {
            _groupInfo[startPos].endPos = cast(int)i;
            if ((i + 1 < regularExpression.length) && regularExpression[i+1] == '*') {
              debug Console.putln("HAS KLEENE");
              _groupInfo[startPos].hasKleene = true;

              // Consume the '*' as well.
              i++;
            }
          }
          break;

        default:
          // Any other character found while no group is open
          // discards the information gathered so far.
          if (groupStack.empty()) {
            debug Console.putln("NULLED");
            _groupInfo = null;
          }
          break;
      }
    }
  }
  // Description: Entry point of the recursive DFA construction. Refreshes the group information and then builds the automaton for the given expression, starting at its first character with an empty set of current states.
  // regex: The regular expression to build.
  // Returns: The first state of the constructed automaton.
  State buildDFA(string regex) {
    fillGroupInfo();

    // Both are passed by reference to the recursive builder, so they
    // have to be named variables.
    List!(State) frontier = new List!(State);
    uint cursor = 0;

    return buildDFA(regex, cursor, frontier);
  }
  // Description: Builds the part of the DFA that corresponds to the regular expression starting at regexPos. Plain characters are concatenated onto every state in current, '*' adds a loop for the preceding character or group, and '(' recurses to build the group. The call returns early when it meets a '*' that directly follows a ')'; otherwise it runs to the end of the expression and marks every state left in current as accepting.
  // regex: The regular expression being built.
  // regexPos: Index of the next character to read; advanced as characters are consumed. After an early return it indexes the '*'; otherwise it ends up past the end of regex.
  // current: The states that new transitions are attached to; updated in place as the expression is consumed.
  // isKleene: Not read by this function.
  // Returns: The state created at the start of this call.
  State buildDFA(string regex, ref uint regexPos, ref List!(State) current, bool isKleene = false) {
    State startState = new State();

    // All trace output in this function is only compiled in with -debug.
    debug Console.putln("Start State: ", startState.id);

    dchar lastChar = '\0';
    dchar thisChar;

    // The character waiting to be attached by the next concatenation
    // or Kleene step ('\0' when there is none).
    dchar lastConcatChar = '\0';

    enum Operation {
      None,
      Kleene,
      Concat
    }

    Operation lastOp = Operation.None;

    // Remember the states that were current on entry; they are
    // restored when a starred group ends.
    List!(State) old = current.dup();
    current.add(startState);

    // Prime the loop with the first character.
    if (regexPos < regex.length) {
      lastChar = regex[regexPos];
      if (lastChar == '*') {
        // error
      }
      else if (lastChar == '(') {
        // group
        regexPos++;
        buildDFA(regex, regexPos, current);

        // The recursive call leaves regexPos past the end of regex
        // unless it stopped on a '*', so check the bound before
        // indexing.
        if (regexPos < regex.length && regex[regexPos] == '*') {
          debug Console.putln("Inner Group Kleened");
          lastOp = Operation.Kleene;
        }
      }
      else {
        lastConcatChar = lastChar;
      }
      regexPos++;
    }

    // One extra iteration runs at regexPos == regex.length with
    // thisChar == '\0' so that the final pending character is attached.
    while (regexPos <= regex.length) {
      if (regexPos == regex.length) {
        thisChar = '\0';
      }
      else {
        thisChar = regex[regexPos];
      }

      if (thisChar == '*') {
        // Kleene Star
        if (lastChar == ')') {
          // A starred group ends here: connect every current state
          // back to the start of the group and return to the caller.
          debug Console.putln("Kleene Group End, connecting ", lastConcatChar, " to ", startState.id);

          foreach(state; current) {
            State ret = concat(state, lastConcatChar, startState);
            if (ret is startState && startState.id <= state.id) {
              // Record the new edge as a backward edge on both ends.
              state.backwardList.add(lastConcatChar);

              Link link = new Link();
              link.from = state;
              link.transition = lastConcatChar;
              startState.incomingList.add(link);
            }
          }

          old.add(startState);
          current = old;

          debug State.printall();
          return startState;
        }
        else {
          // Single Character Kleene
          // ex. "a*" => [p] -> 'a' -> [p]
          debug Console.putln("Single Character Kleene (", lastConcatChar, ")");

          List!(State) newStateList = current.dup;

          // Shared looping state, created on demand for states that
          // already have backward edges.
          State loopState;

          foreach(state; current) {
            if (state.backwardList.empty) {
              // Follow existing edges for this character, then put
              // the self-loop on the state reached.
              while (lastConcatChar in state.transitions) {
                state = concat(state, lastConcatChar, state);
              }
              state.transitions[lastConcatChar] = state;
              state.backwardList.add(lastConcatChar);
            }
            else {
              if (loopState is null) {
                loopState = new State();
                loopState.transitions[lastConcatChar] = loopState;
                loopState.backwardList.add(lastConcatChar);
              }
              concat(state, lastConcatChar, loopState);
            }
          }

          current = newStateList;
          if (loopState !is null) {
            current.add(loopState);
          }
        }

        lastOp = Operation.Kleene;
        lastConcatChar = '\0';
      }
      else {
        // concatenation
        if (lastConcatChar != '\0' && thisChar != ')') {
          debug {
            Console.putln("-=-=-=-=-");
            Console.putln("boo: ", lastOp == Operation.Kleene);
          }

          // Target shared by every state that does not already have
          // an edge for this character; created by concat on demand.
          State concatState;
          List!(State) newStateList = new List!(State);

          foreach(state; current) {
            State ret = concat(state, lastConcatChar, concatState, lastOp == Operation.Kleene);
            if (ret !is concatState && ret !is null) {
              newStateList.add(ret);
            }
          }

          if (concatState !is null) {
            newStateList.add(concatState);
          }

          current = newStateList;

          debug {
            Console.putln("Concat Character (", lastConcatChar, ")");
            State.printall();
            Console.putln("-=-=-=-=-");
            foreach(state; current) {
              Console.put(state.id, " ... ");
            }
            Console.putln;
          }

          lastOp = Operation.Concat;
        }

        if (thisChar == '(') {
          // group start
          debug Console.putln("Inner Group Found");

          regexPos += 1;
          buildDFA(regex, regexPos, current, false);

          // See the note on the first group above: regexPos may be
          // past the end of regex here.
          if (regexPos < regex.length && regex[regexPos] == '*') {
            debug Console.putln("Inner Group Kleened");
            lastOp = Operation.Kleene;
          }
          lastConcatChar = '\0';
        }
        else if (thisChar != ')') {
          lastConcatChar = thisChar;
        }
      }

      lastChar = thisChar;
      regexPos++;
    }

    // End of the expression: every remaining current state accepts.
    foreach(state; current) {
      isolate(state);
      state.accept = true;
    }

    debug {
      Console.putln("Done");
      State.printall();
    }

    return startState;
  }
  // Description: Connects start to the state 'to' on the given character, unless start already has an edge for that character.
  // start: The state the edge leaves from.
  // transition: The character labelling the edge.
  // to: The destination state. When null and a new edge is needed, a fresh State is created and stored here for the caller.
  // doNotUnroll: When true, start is not isolated and unrolled before the edge is added.
  // Returns: The state reached from start on the given character: the existing destination when the edge was already present, otherwise 'to'.
  State concat(State start, dchar transition, ref State to, bool doNotUnroll = false) {
    // Trace output is only compiled in with -debug.
    debug {
      if (to !is null) {
        Console.putln(start.id, " to ", to.id);
      }
      else {
        Console.putln(start.id, " to null");
      }
    }

    // When the destination is new or has a higher id than start,
    // start is isolated and unrolled first.
    if (!doNotUnroll && ((to is null) || (to.id > start.id))) {
      isolate(start);
      unroll(start);
      isolate(start);
    }

    // Reuse an existing edge rather than overwrite it.
    if (transition in start.transitions) {
      return start.transitions[transition];
    }

    if (to is null) {
      to = new State();
    }
    start.transitions[transition] = to;

    return to;
  }
  // Description: Replaces every edge listed in the state's backwardList with an edge to a newly created state. The new state receives a copy of the old destination's outgoing edges, each recorded as a backward edge of the new state and as an incoming link on its target. Afterwards the state's backwardList is empty.
  // state: The state whose backward edges are unrolled.
  void unroll(State state) {
    // Trace output is only compiled in with -debug.
    debug Console.putln("unrolling ", state.id);

    foreach(backwardTrans; state.backwardList) {
      State newState = new State();
      State destState = state.transitions[backwardTrans];

      // Redirect the edge before copying: when destState is state
      // itself, the copy below then sees the redirected edge.
      state.transitions[backwardTrans] = newState;

      foreach(transition; destState.transitions.keys) {
        State toState = destState.transitions[transition];

        newState.transitions[transition] = toState;
        newState.backwardList.add(transition);

        Link link = new Link();
        link.from = newState;
        link.transition = transition;
        toState.incomingList.add(link);
      }
    }

    state.backwardList = new List!(dchar);
  }
  // Description: Unrolls the source state of every link recorded in the state's incomingList and then replaces that list with an empty one.
  // state: The state to isolate.
  void isolate(State state) {
    // Trace output is only compiled in with -debug.
    debug Console.putln("isolating ", state.id);

    foreach(link; state.incomingList) {
      unroll(link.from);
    }

    state.incomingList = new List!(Link);
  }
  // Placeholder; the body is empty.
  public static void test() {
  }
  // Placeholder; the body is empty and both parameters are unused.
  static void my_unroll(State state, dchar chr) {
  }
  1393. // Common
  1394. static string[][Thread] regexRefs;
  1395. static uint[Thread] regexPos;
  1396. // For backtracking regex operations
  // Bookkeeping for one group of the regular expression, used by the
  // backtracking matcher (stored in RegexInfo.groupInfo).
  // NOTE(review): the per-field notes below are inferred from the field
  // names only; the code that fills them is not in this part of the
  // file, so confirm against the matcher.
  struct GroupInfo {
    // presumably the positions in the regex where the group starts and ends
    int startPos;
    int endPos;

    // presumably positions in the input string associated with the group
    int strStartPos;
    int strPos;

    // presumably identifies the enclosing group
    int parent;

    // presumably a regex position related to a union within the group
    int unionPos;

    // presumably the group's number
    int groupId;
  }
  // Per-expression data gathered by the backtracking matcher while it
  // walks a regular expression.
  struct RegexInfo {
    // This hash table contains information about a grouping
    // for a specific position in the regex.
    GroupInfo[int] groupInfo;

    // This hash table contains information that aids operators
    // for a specific position in the regex. For example, the matcher
    // stores the position of a character class's closing ']' under the
    // position where the class starts, and the reverse, so that the
    // bracket only has to be searched for once.
    int[int] operatorFlag;

    // This structure hopes to minimize work already done by merely setting
    // a flag whenever a position in each string is reached. Since this
    // denotes that the regex will be parsing from the same state, and the
    // regex is pure, it will not have to repeat the work.
    int[][] memoizer;
  }
  1419. }