PageRenderTime 118ms CodeModel.GetById 26ms RepoModel.GetById 1ms app.codeStats 0ms

/std/regexp.d

http://github.com/jcd/phobos
D | 2207 lines | 1447 code | 256 blank | 504 comment | 319 complexity | e1195e03ce2e2a4902931d040408c37a MD5 | raw file
  1. // Written in the D programming language.
  2. // Regular Expressions.
  3. /**
  4. * $(RED Deprecated. It will be removed in March 2012.
  5. * Please use $(LINK2 std_regex.html, std.regex) instead.)
  6. *
  7. * $(LINK2 http://www.digitalmars.com/ctg/regular.html, Regular
  8. * expressions) are a powerful method of string pattern matching. The
  9. * regular expression language used in this library is the same as
  10. * that commonly used, however, some of the very advanced forms may
  11. * behave slightly differently. The standard observed is the $(WEB
  12. * www.ecma-international.org/publications/standards/Ecma-262.htm,
  13. * ECMA standard) for regular expressions.
  14. *
  15. * std.regexp is designed to work only with valid UTF strings as input.
  16. * To validate untrusted input, use std.utf.validate().
  17. *
  18. * In the following guide, $(I pattern)[] refers to a
  19. * $(LINK2 http://www.digitalmars.com/ctg/regular.html, regular expression).
  20. * The $(I attributes)[] refers to
  21. * a string controlling the interpretation
  22. * of the regular expression.
  23. * It consists of a sequence of one or more
  24. * of the following characters:
  25. *
  26. * <table border=1 cellspacing=0 cellpadding=5>
  27. * <caption>Attribute Characters</caption>
  28. * $(TR $(TH Attribute) $(TH Action))
  29. * <tr>
  30. * $(TD $(B g))
  31. * $(TD global; repeat over the whole input string)
  32. * </tr>
  33. * <tr>
  34. * $(TD $(B i))
  35. * $(TD case insensitive)
  36. * </tr>
  37. * <tr>
  38. * $(TD $(B m))
  39. * $(TD treat as multiple lines separated by newlines)
  40. * </tr>
  41. * </table>
  42. *
  43. * The $(I format)[] string has the formatting characters:
  44. *
  45. * <table border=1 cellspacing=0 cellpadding=5>
  46. * <caption>Formatting Characters</caption>
  47. * $(TR $(TH Format) $(TH Replaced With))
  48. * $(TR
  49. * $(TD $(B $$)) $(TD $)
  50. * )
  51. * $(TR
  52. * $(TD $(B $&amp;)) $(TD The matched substring.)
  53. * )
  54. * $(TR
  55. * $(TD $(B $`)) $(TD The portion of string that precedes the matched substring.)
  56. * )
  57. * $(TR
  58. * $(TD $(B $')) $(TD The portion of string that follows the matched substring.)
  59. * )
  60. * $(TR
  61. * $(TD $(B $(DOLLAR))$(I n)) $(TD The $(I n)th capture, where $(I n)
  62. * is a single digit 1-9
  63. * and $$(I n) is not followed by a decimal digit.)
  64. * )
  65. * $(TR
  66. * $(TD $(B $(DOLLAR))$(I nn)) $(TD The $(I nn)th capture, where $(I nn)
  67. * is a two-digit decimal
  68. * number 01-99.
  69. * If $(I nn)th capture is undefined or more than the number
  70. * of parenthesized subexpressions, use the empty
  71. * string instead.)
  72. * )
  73. * </table>
  74. *
  75. * Any other $ are left as is.
  76. *
  77. * References:
  78. * $(LINK2 http://en.wikipedia.org/wiki/Regular_expressions, Wikipedia)
  79. * Macros:
  80. * WIKI = StdRegexp
  81. * DOLLAR = $
  82. *
  83. * Copyright: Copyright Digital Mars 2000 - 2011.
  84. * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
  85. * Authors: $(WEB digitalmars.com, Walter Bright)
  86. * Source: $(PHOBOSSRC std/_regexp.d)
  87. */
  88. /* Copyright Digital Mars 2000 - 2011.
  89. * Distributed under the Boost Software License, Version 1.0.
  90. * (See accompanying file LICENSE_1_0.txt or copy at
  91. * http://www.boost.org/LICENSE_1_0.txt)
  92. */
  93. /*
  94. Escape sequences:
  95. \nnn starts out a 1, 2 or 3 digit octal sequence,
  96. where n is an octal digit. If nnn is larger than
  97. 0377, then the 3rd digit is not part of the sequence
  98. and is not consumed.
  99. For maximal portability, use exactly 3 digits.
  100. \xXX starts out a 1 or 2 digit hex sequence. X
  101. is a hex character. If the first character after the \x
  102. is not a hex character, the value of the sequence is 'x'
  103. and the XX are not consumed.
  104. For maximal portability, use exactly 2 digits.
  105. \uUUUU is a unicode sequence. There are exactly
  106. 4 hex characters after the \u, if any are not, then
  107. the value of the sequence is 'u', and the UUUU are not
  108. consumed.
  109. Character classes:
  110. [a-b], where a is greater than b, will produce
  111. an error.
  112. References:
  113. http://www.unicode.org/unicode/reports/tr18/
  114. */
  115. module std.regexp;
  116. pragma(msg, "Notice: As of Phobos 2.055, std.regexp has been deprecated. " ~
  117. "It will be removed in March 2012. Please use std.regex instead.");
  118. //debug = regexp; // uncomment to turn on debugging printf's
  119. private
  120. {
  121. import core.stdc.stdio;
  122. import core.stdc.stdlib;
  123. import core.stdc.string;
  124. import std.array;
  125. import std.stdio;
  126. import std.string;
  127. import std.ascii;
  128. import std.outbuffer;
  129. import std.bitmanip;
  130. import std.utf;
  131. import std.algorithm;
  132. import std.array;
  133. import std.traits;
  134. }
  135. deprecated:
  /**
   * Regular expression pattern for extracting an _email address.
   *
   * References:
   *  $(LINK2 http://www.regular-expressions.info/email.html, How to Find or Validate an Email Address)$(BR)
   *  $(LINK2 http://tools.ietf.org/html/rfc2822#section-3.4.1, RFC 2822 Internet Message Format)
   */
  string email = r"[a-zA-Z]([.]?([[a-zA-Z0-9_]-]+)*)?@([[a-zA-Z0-9_]\-_]+\.)+[a-zA-Z]{2,6}";

  /**
   * Regular expression pattern for extracting a _url.
   */
  string url =
      r"(([h|H][t|T]|[f|F])[t|T][p|P]([s|S]?)\:\/\/|~/|/)?([\w]+:\w+@)?(([a-zA-Z]{1}([\w\-]+\.)+([\w]{2,5}))(:[\d]{1,5})?)?((/?\w+/)+|/?)(\w+\.[\w]{3,4})?([,]\w+)*((\?\w+=\w+)?(&\w+=\w+)*([,]\w*)*)?";
  /************************************
   * Exception type thrown when a regular expression fails to compile.
   */
  class RegExpException : Exception
  {
      /// Construct the exception with the diagnostic text msg.
      this(string msg)
      {
          super(msg);
      }
  }
  /// Start and end indices of one match (or sub-match) within the searched string.
  struct regmatch_t
  {
      /// Index of the first character of the match.
      sizediff_t rm_so;
      /// Index one past the last character of the match.
      sizediff_t rm_eo;
  }

  // Character type used by the matcher; aliased so that a wchar version can be made.
  private alias char rchar;
  /******************************************************
   * Replace matches of a regular expression in a string.
   *
   * Compiles pattern with attributes and returns s with the first match
   * (or every match, when the $(B g) attribute is given) replaced by the
   * text generated from format.
   *
   * Params:
   *  s = String to search.
   *  pattern = Regular expression pattern.
   *  format = Replacement string format.
   *  attributes = Regular expression attributes.
   * Returns:
   *  the resulting string
   * Example:
   *  Replace the letters 'a' with the letters 'ZZ'.
   * ---
   * s = "Strap a rocket engine on a chicken."
   * sub(s, "a", "ZZ")      // result: StrZZp a rocket engine on a chicken.
   * sub(s, "a", "ZZ", "g") // result: StrZZp ZZ rocket engine on ZZ chicken.
   * ---
   *  The replacement format can reference the matches using
   *  the $&amp;, $$, $', $`, $0 .. $99 notation:
   * ---
   * sub(s, "[ar]", "[$&]", "g") // result: St[r][a]p [a] [r]ocket engine on [a] chicken.
   * ---
   */
  string sub(string s, string pattern, string format, string attributes = null)
  {
      auto re = new RegExp(pattern, attributes);
      string replaced = re.replace(s, format);
      delete re;
      return replaced;
  }

  unittest
  {
      debug(regexp) printf("regexp.sub.unittest\n");

      assert(sub("hello", "ll", "ss") == "hesso");
  }
  /*******************************************************
   * Replace matches of a regular expression in a string with text
   * computed by a delegate.
   *
   * Compiles pattern with attributes, then passes each match to dg and
   * replaces that match with the string dg returns. Without the $(B g)
   * attribute only the first match is processed.
   *
   * dg is invoked once for every match that is replaced, so it may return
   * a different string each time.
   *
   * Params:
   *  s = String to search.
   *  pattern = Regular expression pattern.
   *  dg = Delegate receiving the RegExp positioned on the current match;
   *       its return value is the replacement text.
   *  attributes = Regular expression attributes.
   * Returns: the resulting string.
   * Example:
   * Capitalize the letters 'a' and 'r':
   * ---
   * s = "Strap a rocket engine on a chicken.";
   * sub(s, "[ar]",
   *     delegate string (RegExp m)
   *     {
   *         return toUpper(m[0]);
   *     },
   *     "g"); // result: StRAp A Rocket engine on A chicken.
   * ---
   */
  string sub(string s, string pattern, string delegate(RegExp) dg, string attributes = null)
  {
      auto r = new RegExp(pattern, attributes);
      scope(exit) delete r;   // release the matcher even if dg throws

      string result = s;
      size_t lastindex = 0;   // where the next search in s starts
      size_t offset = 0;      // shift between indices in s and in result

      while (r.test(s, lastindex))
      {
          auto so = r.pmatch[0].rm_so;
          auto eo = r.pmatch[0].rm_eo;
          string replacement = dg(r);

          // NOTE: an earlier shortcut switched to a literal replace of the
          // whole result whenever the matched text equalled the pattern
          // source. That is only valid for patterns without special
          // characters; for e.g. "a.c" matching the text "a.c" it dropped
          // all remaining matches and called dg only once. Every match is
          // now handled individually.
          result = replaceSlice(result, result[offset + so .. offset + eo], replacement);

          if (!(r.attributes & RegExp.REA.global))
              break;

          offset += replacement.length - (eo - so);
          if (lastindex == eo)
              lastindex++;        // always consume some source
          else
              lastindex = eo;
      }
      return result;
  }

  unittest
  {
      debug(regexp) printf("regexp.sub.unittest\n");

      auto r = sub("hello", "ll", delegate string(RegExp r) { return "ss"; });
      assert(r == "hesso");

      r = sub("hello", "l", delegate string(RegExp r) { return "l"; }, "g");
      assert(r == "hello");

      auto s = sub("Strap a rocket engine on a chicken.",
              "[ar]",
              delegate string (RegExp m)
              {
                  return std.string.toUpper(m[0]);
              },
              "g");
      assert(s == "StRAp A Rocket engine on A chicken.");

      // The delegate is consulted for every match of a literal pattern.
      int calls;
      r = sub("hello", "l",
              delegate string(RegExp m) { return calls++ == 0 ? "1" : "2"; },
              "g");
      assert(r == "he12o");
      assert(calls == 2);

      // A match whose text equals the pattern source must not end the scan.
      r = sub("abc a.c axc", "a.c", delegate string(RegExp m) { return "X"; }, "g");
      assert(r == "X X X");
  }
  /*************************************************
   * Locate the first match of $(D_PARAM pattern) in $(D_PARAM s[]).
   * Params:
   *  s = String to search.
   *  pattern = Regular expression pattern.
   * Returns:
   *  index into s[] of match if found, -1 if no match.
   * Example:
   * ---
   * auto s = "abcabcabab";
   * find(s, RegExp("b"));    // match, returns 1
   * find(s, RegExp("f"));    // no match, returns -1
   * ---
   */
  sizediff_t find(string s, RegExp pattern)
  {
      if (!pattern.test(s))
          return -1;
      return pattern.pmatch[0].rm_so;
  }

  unittest
  {
      debug(regexp) printf("regexp.find.unittest\n");

      assert(find("xabcy", RegExp("abc")) == 1);
      assert(find("cba", RegExp("abc")) == -1);
  }
  /**
     Returns:
     Same as $(D_PARAM find(s, RegExp(pattern, attributes))).

     WARNING:
     This function is scheduled for deprecation due to unnecessary
     ambiguity with the homonym function in std.string. Instead of
     $(D_PARAM std.regexp.find(s, p, a)), you may want to use $(D_PARAM
     find(s, RegExp(p, a))).
  */
  sizediff_t find(string s, string pattern, string attributes = null)
  {
      auto re = new RegExp(pattern, attributes);
      scope(exit) delete re;   // the temporary matcher is released on every exit path

      if (re.test(s))
          return re.pmatch[0].rm_so;
      return -1;
  }

  unittest
  {
      debug(regexp) printf("regexp.find.unittest\n");

      assert(find("xabcy", "abc") == 1);
      assert(find("cba", "abc") == -1);
  }
  /*************************************************
   * Locate the last match of $(D_PARAM pattern) in $(D_PARAM s[]).
   *
   * The string is scanned front to back and the start index of the most
   * recent match is remembered.
   * Params:
   *  s = String to search.
   *  pattern = Regular expression pattern.
   * Returns:
   *  index into s[] of match if found, -1 if no match.
   * Example:
   * ---
   * auto s = "abcabcabab";
   * rfind(s, RegExp("b"));    // match, returns 9
   * rfind(s, RegExp("f"));    // no match, returns -1
   * ---
   */
  sizediff_t rfind(string s, RegExp pattern)
  {
      sizediff_t found = -1;   // start of the latest match seen so far
      sizediff_t next = 0;     // where the next search starts

      while (pattern.test(s, next))
      {
          auto m = pattern.pmatch[0];
          found = m.rm_so;
          // always consume some source, even after an empty match
          next = (next == m.rm_eo) ? next + 1 : m.rm_eo;
      }
      return found;
  }

  unittest
  {
      debug(regexp) printf("regexp.rfind.unittest\n");

      assert(rfind("abcdefcdef", RegExp("c")) == 6);
      assert(rfind("abcdefcdef", RegExp("cd")) == 6);
      assert(rfind("abcdefcdef", RegExp("x")) == -1);
      assert(rfind("abcdefcdef", RegExp("xy")) == -1);
      assert(rfind("abcdefcdef", RegExp("")) == 10);
  }
  /*************************************************
     Returns:
     Same as $(D_PARAM rfind(s, RegExp(pattern, attributes))).

     WARNING:
     This function is scheduled for deprecation due to unnecessary
     ambiguity with the homonym function in std.string. Instead of
     $(D_PARAM std.regexp.rfind(s, p, a)), you may want to use $(D_PARAM
     rfind(s, RegExp(p, a))).
  */
  sizediff_t rfind(string s, string pattern, string attributes = null)
  {
      typeof(return) found = -1;   // start of the latest match seen so far
      typeof(return) next = 0;     // where the next search starts
      auto re = new RegExp(pattern, attributes);

      while (re.test(s, next))
      {
          auto m = re.pmatch[0];
          found = m.rm_so;
          // always consume some source, even after an empty match
          next = (next == m.rm_eo) ? next + 1 : m.rm_eo;
      }
      delete re;
      return found;
  }

  unittest
  {
      debug(regexp) printf("regexp.rfind.unittest\n");

      assert(rfind("abcdefcdef", "c") == 6);
      assert(rfind("abcdefcdef", "cd") == 6);
      assert(rfind("abcdefcdef", "x") == -1);
      assert(rfind("abcdefcdef", "xy") == -1);
      assert(rfind("abcdefcdef", "") == 10);
  }
  /********************************************
   * Break s[] into pieces, treating every match of the regular
   * expression $(D_PARAM pattern) as a separator.
   * Params:
   *  s = String to search.
   *  pattern = Regular expression pattern.
   * Returns:
   *  array of slices into s[]
   * Example:
   * ---
   * foreach (s; split("abcabcabab", RegExp("C.", "i")))
   * {
   *     writefln("s = '%s'", s);
   * }
   * // Prints:
   * // s = 'ab'
   * // s = 'b'
   * // s = 'bab'
   * ---
   */
  string[] split(string s, RegExp pattern)
  {
      string[] pieces = pattern.split(s);
      return pieces;
  }

  unittest
  {
      debug(regexp) printf("regexp.split.unittest()\n");

      string[] result = split("ab", RegExp("a*"));
      assert(result.length == 2);
      assert(result[0] == "");
      assert(result[1] == "b");

      string[] expected = ["ab", "b", "bab"];
      foreach (i, s; split("abcabcabab", RegExp("C.", "i")))
      {
          //writefln("s[%d] = '%s'", i, s);
          assert(i < expected.length);
          assert(s == expected[i]);
      }
  }
  /********************************************
     Returns:
     Same as $(D_PARAM split(s, RegExp(pattern, attributes))).

     WARNING:
     This function is scheduled for deprecation due to unnecessary
     ambiguity with the homonym function in std.string. Instead of
     $(D_PARAM std.regexp.split(s, p, a)), you may want to use $(D_PARAM
     split(s, RegExp(p, a))).
  */
  string[] split(string s, string pattern, string attributes = null)
  {
      auto re = new RegExp(pattern, attributes);
      string[] pieces = re.split(s);
      delete re;
      return pieces;
  }

  unittest
  {
      debug(regexp) printf("regexp.split.unittest()\n");

      string[] result = split("ab", "a*");
      assert(result.length == 2);
      assert(result[0] == "");
      assert(result[1] == "b");

      string[] expected = ["ab", "b", "bab"];
      foreach (i, s; split("abcabcabab", "C.", "i"))
      {
          //writefln("s[%d] = '%s'", i, s.length, s.ptr);
          assert(i < expected.length);
          assert(s == expected[i]);
      }
  }
  /****************************************************
   * Look for the first match of pattern[] (compiled with attributes[])
   * in s[].
   * Params:
   *  s = String to search.
   *  pattern = Regular expression pattern.
   *  attributes = Regular expression attributes.
   * Returns:
   *  corresponding RegExp if found, null if not.
   * Example:
   * ---
   * import std.stdio;
   * import std.regexp;
   *
   * void main()
   * {
   *     if (auto m = std.regexp.search("abcdef", "c"))
   *     {
   *         writefln("%s[%s]%s", m.pre, m[0], m.post);
   *     }
   * }
   * // Prints:
   * // ab[c]def
   * ---
   */
  RegExp search(string s, string pattern, string attributes = null)
  {
      auto r = new RegExp(pattern, attributes);
      if (r.test(s))
          return r;

      // No match: dispose of the matcher and hand back the (now null) reference.
      delete r;
      assert(r is null);
      return r;
  }

  unittest
  {
      debug(regexp) printf("regexp.string.unittest()\n");

      auto m = std.regexp.search("abcdef", "c()");
      if (!m)
          assert(0);
      auto result = std.string.format("%s[%s]%s", m.pre, m[0], m.post);
      assert(result == "ab[c]def");
      assert(m[1] == null);
      assert(m[2] == null);

      if (auto n = std.regexp.search("abcdef", "g"))
      {
          assert(0);
      }
  }
  540. /* ********************************* RegExp ******************************** */
  541. /*****************************
  542. * RegExp is a class to handle regular expressions.
  543. *
  544. * It is the core foundation for adding powerful string pattern matching
  545. * capabilities to programs like grep, text editors, awk, sed, etc.
  546. */
  547. class RegExp
  548. {
  /*****
   * Create a RegExp by compiling pattern with <i>attributes</i> into
   * an internal form for fast execution.
   * Params:
   *  pattern = regular expression
   *  attributes = _attributes
   * Throws: RegExpException if there are any compilation errors.
   * Example:
   *  Declare two variables and assign to them a RegExp object:
   * ---
   * auto r = new RegExp("pattern");
   * auto s = new RegExp(r"p[1-5]\s*");
   * ---
   */
  public this(string pattern, string attributes = null)
  {
      // Start with a one-element match array backed by gmatch; compile()
      // replaces it when the pattern has parenthesized subexpressions.
      pmatch = (&gmatch)[0 .. 1];
      compile(pattern, attributes);
  }
  /*****
   * Factory form of the constructor: generate an instance of RegExp.
   * Params:
   *  pattern = regular expression
   *  attributes = _attributes
   * Throws: RegExpException if there are any compilation errors.
   * Example:
   *  Declare two variables and assign to them a RegExp object:
   * ---
   * auto r = RegExp("pattern");
   * auto s = RegExp(r"p[1-5]\s*");
   * ---
   */
  public static RegExp opCall(string pattern, string attributes = null)
  {
      auto instance = new RegExp(pattern, attributes);
      return instance;
  }

  unittest
  {
      debug(regexp) printf("regexp.opCall.unittest()\n");

      auto r1 = RegExp("hello", "m");

      // An unknown attribute letter must be rejected with a RegExpException.
      string msg;
      try
      {
          auto r2 = RegExp("hello", "q");
          assert(0);
      }
      catch (RegExpException ree)
      {
          msg = ree.toString();
          //writefln("message: %s", ree);
      }
      assert(std.algorithm.countUntil(msg, "unrecognized attribute") >= 0);
  }
  /************************************
   * Prepare this RegExp for a foreach loop over the matches in string[].
   * Returns:
   *  search() returns instance of RegExp set up to _search string[].
   * Example:
   * ---
   * import std.stdio;
   * import std.regexp;
   *
   * void main()
   * {
   *     foreach(m; RegExp("ab").search("abcabcabab"))
   *     {
   *         writefln("%s[%s]%s", m.pre, m[0], m.post);
   *     }
   * }
   * // Prints:
   * // [ab]cabcabab
   * // abc[ab]cabab
   * // abcabc[ab]ab
   * // abcabcab[ab]
   * ---
   */
  public RegExp search(string string)
  {
      // Remember the text and reset the end of the previous match to 0.
      pmatch[0].rm_eo = 0;
      input = string;
      return this;
  }
  /** ditto */
  public int opApply(scope int delegate(ref RegExp) dg)
  {
      // Calls dg with this object after each successful test(); a non-zero
      // result from dg stops the iteration and is returned.
      RegExp self = this;
      int status = 0;
      while (test())
      {
          status = dg(self);
          if (status != 0)
              break;
      }
      return status;
  }

  unittest
  {
      debug(regexp) printf("regexp.search.unittest()\n");

      string[] expected =
      [
          "[ab]cabcabab",
          "abc[ab]cabab",
          "abcabc[ab]ab",
          "abcabcab[ab]",
      ];
      size_t seen;
      foreach(m; RegExp("ab").search("abcabcabab"))
      {
          assert(seen < expected.length);
          auto s = std.string.format("%s[%s]%s", m.pre, m[0], m.post);
          assert(s == expected[seen]);
          seen++;
      }
  }
  /******************
   * Retrieve match n.
   *
   * n==0 means the matched substring, n>0 means the
   * n'th parenthesized subexpression.
   * If n is larger than the number of parenthesized subexpressions,
   * or the requested match is empty, null is returned.
   */
  public string opIndex(size_t n)
  {
      if (n >= pmatch.length)
          return null;

      auto m = pmatch[n];
      if (m.rm_so == m.rm_eo)
          return null;
      return input[m.rm_so .. m.rm_eo];
  }
  /**
     Same as $(D_PARAM opIndex(n)).

     WARNING:
     Scheduled for deprecation due to confusion with overloaded
     $(D_PARAM match(string)). Instead of $(D_PARAM regex.match(n))
     you may want to use $(D_PARAM regex[n]).
  */
  public string match(size_t n)
  {
      return opIndex(n);
  }
  /*******************
   * The part of the input that comes before the matched substring,
   * returned as a slice of the input.
   */
  public @property string pre()
  {
      auto matchStart = pmatch[0].rm_so;
      return input[0 .. matchStart];
  }
  /*******************
   * The part of the input that comes after the matched substring,
   * returned as a slice of the input.
   */
  public @property string post()
  {
      auto matchEnd = pmatch[0].rm_eo;
      return input[matchEnd .. input.length];
  }
  uint re_nsub;           // number of parenthesized subexpression matches
  regmatch_t[] pmatch;    // array [re_nsub + 1]; element 0 is the whole match

  string input;           // the string to search

  // per instance:
  string pattern;         // source text of the regular expression
  string flags;           // source text of the attributes parameter

  int errors;             // reset to 0 at the start of each compile()
  uint attributes;        // bitwise OR of the REA flags parsed by compile()

  /// Bit flags stored in attributes.
  enum REA
  {
      global     = 1 << 0,   // has the g attribute
      ignoreCase = 1 << 1,   // has the i attribute
      multiline  = 1 << 2,   // if treat as multiple lines separated
                             // by newlines, or as a single line
      dotmatchlf = 1 << 3,   // if . matches \n
  }
  private:

  size_t src;               // current source index in input[]
  size_t src_start;         // starting index for match in input[]
  size_t p;                 // position of parser in pattern[]

  // Match for the entire regular expression
  // (serves as storage for pmatch[0]).
  regmatch_t gmatch;

  const(ubyte)[] program;   // pattern[] compiled into regular expression program
  OutBuffer buf;            // output buffer that compile() fills and then takes program from
  /******************************************/
  // Opcodes of the compiled regular expression program.
  // Values are assigned implicitly, so the order of the members matters.
  enum : ubyte
  {
      REend,              // end of program

      // single characters and strings
      REchar,             // single character
      REichar,            // single character, case insensitive
      REdchar,            // single UCS character
      REidchar,           // single wide character, case insensitive
      REanychar,          // any character
      REanystar,          // ".*"
      REstring,           // string of characters
      REistring,          // string of characters, case insensitive

      // character sets
      REtestbit,          // any in bitmap, non-consuming
      REbit,              // any in the bit map
      REnotbit,           // any not in the bit map
      RErange,            // any in the string
      REnotrange,         // any not in the string

      // alternation and repetition
      REor,               // a | b
      REplus,             // 1 or more
      REstar,             // 0 or more
      REquest,            // 0 or 1
      REnm,               // n..m
      REnmq,              // n..m, non-greedy version

      // anchors, grouping and control flow
      REbol,              // beginning of line
      REeol,              // end of line
      REparen,            // parenthesized subexpression
      REgoto,             // goto offset

      // remaining opcodes (not commented in the original source)
      REwordboundary,
      REnotwordboundary,
      REdigit,
      REnotdigit,
      REspace,
      REnotspace,
      REword,
      REnotword,
      REbackref,
  }
  // Returns non-zero when c is an underscore or isAlphaNum(c) holds.
  // BUG: should this include '$'?
  private int isword(dchar c)
  {
      return (c == '_' || isAlphaNum(c)) ? 1 : 0;
  }

  // All bits set.
  private uint inf = ~0u;
  /* ********************************
   * Compile pattern with attributes, replacing whatever this object
   * held before.
   *
   * Throws RegExpException on error
   */
  public void compile(string pattern, string attributes)
  {
      //printf("RegExp.compile('%.*s', '%.*s')\n", pattern.length, pattern.ptr, attributes.length, attributes.ptr);

      // Turn the attribute letters into REA bits; unknown or repeated
      // letters are reported through error().
      this.attributes = 0;
      foreach (rchar c; attributes)
      {
          REA att;
          if (c == 'g')
              att = REA.global;
          else if (c == 'i')
              att = REA.ignoreCase;
          else if (c == 'm')
              att = REA.multiline;
          else
          {
              error("unrecognized attribute");
              return;
          }

          if (this.attributes & att)
          {
              error("redundant attribute");
              return;
          }
          this.attributes |= att;
      }

      // Reset the per-pattern state.
      input = null;
      this.pattern = pattern;
      this.flags = attributes;

      uint previousNsub = re_nsub;
      re_nsub = 0;
      errors = 0;

      // Parse the pattern into a fresh output buffer.
      buf = new OutBuffer();
      buf.reserve(pattern.length * 8);
      p = 0;
      parseRegexp();
      if (p < pattern.length)
      {
          error("unmatched ')'");
      }

      // @@@ SKIPPING OPTIMIZATION SOLVES BUG 941 @@@
      //optimize();

      // Take the buffer's data as the program, then discard the buffer.
      program = buf.data;
      buf.data = null;
      delete buf;

      // Grow the match array when this pattern has more subexpressions
      // than the previous one; the gmatch-backed slice is dropped first.
      if (re_nsub > previousNsub)
      {
          if (pmatch.ptr is &gmatch)
              pmatch = null;
          pmatch.length = re_nsub + 1;
      }
      pmatch[0].rm_so = 0;
      pmatch[0].rm_eo = 0;
  }
  /********************************************
   * Break s[] into pieces, treating each match of this regular
   * expression as a separator. The text of every parenthesized
   * subexpression of a separator is appended after the piece that
   * precedes it (an empty slice when the subexpression is empty).
   * Returns:
   *  array of slices into s[]
   */
  public string[] split(string s)
  {
      debug(regexp) printf("regexp.split()\n");

      string[] pieces;

      if (!s.length)
      {
          // Empty input: one empty piece unless the pattern matches it.
          if (!test(s))
              pieces ~= s;
          return pieces;
      }

      sizediff_t pieceStart;   // start of the piece being collected
      sizediff_t probe;        // where the next match attempt begins

      probe = pieceStart;
      while (probe != s.length)
      {
          if (test(s, probe))
          {
              probe = pmatch[0].rm_so;
              auto sepEnd = pmatch[0].rm_eo;
              if (sepEnd != pieceStart)
              {
                  pieces ~= s[pieceStart .. probe];
                  foreach (i; 1 .. pmatch.length)
                  {
                      auto so = pmatch[i].rm_so;
                      auto eo = pmatch[i].rm_eo;
                      if (so == eo)
                      {
                          so = 0;     // -1 gives array bounds error
                          eo = 0;
                      }
                      pieces ~= s[so .. eo];
                  }
                  probe = pieceStart = sepEnd;
                  continue;
              }
          }
          probe++;
      }
      pieces ~= s[pieceStart .. s.length];
      return pieces;
  }

  unittest
  {
      debug(regexp) printf("regexp.split.unittest()\n");

      string[] result;
      string j;
      int i;

      auto r = new RegExp("a*?", null);
      result = r.split("ab");
      assert(result.length == 2);
      assert(std.string.cmp(result[0], "a") == 0);
      assert(std.string.cmp(result[1], "b") == 0);

      r = new RegExp("a*", null);
      result = r.split("ab");
      assert(result.length == 2);
      assert(std.string.cmp(result[0], "") == 0);
      assert(std.string.cmp(result[1], "b") == 0);

      r = new RegExp("<(\\/)?([^<>]+)>", null);
      result = r.split("a<b>font</b>bar<TAG>hello</TAG>");
      debug(regexp)
      {
          for (i = 0; i < result.length; i++)
              printf("result[%d] = '%.*s'\n", i, result[i].length, result[i].ptr);
      }
      j = join(result, ",");
      //printf("j = '%.*s'\n", j.length, j.ptr);
      assert(std.string.cmp(j, "a,,b,font,/,b,bar,,TAG,hello,/,TAG,") == 0);

      r = new RegExp("a[bc]", null);
      result = r.match("123ab");
      j = join(result, ",");
      assert(std.string.cmp(j, "ab") == 0);

      result = r.match("ac");
      j = join(result, ",");
      assert(std.string.cmp(j, "ac") == 0);
  }
  /*************************************************
   * Look for a match of this regular expression in string[].
   * Returns:
   *  index of match if successful, -1 if not found
   */
  public sizediff_t find(string string)
  {
      if (!test(string))
          return -1;              // no match
      return pmatch[0].rm_so;
  }

  //deprecated alias find search;

  unittest
  {
      debug(regexp) printf("regexp.find.unittest()\n");

      RegExp r = new RegExp("abc", null);
      assert(r.find("xabcy") == 1);
      assert(r.find("cba") == -1);
  }
  /*************************************************
   * Search s[] for match.
   * Returns:
   *  If the global attribute is set, an array holding the text of every
   *  match found in s[].
   *  If the global attribute is not set, the same value as exec(s).
   */
  public string[] match(string s)
  {
      if (!(attributes & REA.global))
          return exec(s);

      string[] found;
      sizediff_t lastindex = 0;
      while (test(s, lastindex))
      {
          auto so = pmatch[0].rm_so;
          auto eo = pmatch[0].rm_eo;
          found ~= input[so .. eo];
          // always consume some source, even after an empty match
          lastindex = (lastindex == eo) ? lastindex + 1 : eo;
      }
      return found;
  }

  unittest
  {
      debug(regexp) printf("regexp.match.unittest()\n");

      RegExp r;
      string[] result;
      string j;

      r = new RegExp("a[bc]", null);
      result = r.match("1ab2ac3");
      j = join(result, ",");
      assert(std.string.cmp(j, "ab") == 0);

      r = new RegExp("a[bc]", "g");
      result = r.match("1ab2ac3");
      j = join(result, ",");
      assert(std.string.cmp(j, "ab,ac") == 0);
  }
  /*************************************************
   * Find regular expression matches in s[]. Replace those matches
   * with a new string composed of format[] merged with the result of the
   * matches.
   * If global, replace all matches. Otherwise, replace first match.
   * Params:
   *  s = the string to search
   *  format = replacement format, expanded per match by replace(format)
   * Returns: the new string
   */
  public string replace(string s, string format)
  {
      // %.*s takes an int precision, so the lengths are cast explicitly.
      debug(regexp) printf("string = %.*s, format = %.*s\n",
              cast(int)s.length, s.ptr, cast(int)format.length, format.ptr);

      string result = s;
      sizediff_t lastindex = 0;

      // Matching always runs against the untouched s, while result grows
      // or shrinks as replacements are spliced in; offset is the running
      // difference that maps indices in s to indices in result.
      size_t offset = 0;

      // NOTE: an earlier shortcut handed the whole job to
      // std.array.replace when the matched text happened to equal the
      // pattern text. That is unsound for patterns with metacharacters
      // that also match themselves literally (e.g. "a.c" against
      // "abc a.c axc"): only literal occurrences were replaced and the
      // loop stopped early. Every match now goes through replaceSlice.
      while (test(s, lastindex))
      {
          auto so = pmatch[0].rm_so;
          auto eo = pmatch[0].rm_eo;
          string replacement = replace(format);

          result = replaceSlice(result, result[offset + so .. offset + eo], replacement);

          if (!(attributes & REA.global))
              break;              // only the first match is replaced

          offset += replacement.length - (eo - so);

          if (lastindex == eo)
              lastindex++;        // always consume some source
          else
              lastindex = eo;
      }
      return result;
  }
  unittest
  {
      debug(regexp) printf("regexp.replace.unittest()\n");

      // "$&" in the format stands for the matched text.
      auto withFormat = new RegExp("a[bc]", "g");
      string replaced = withFormat.replace("1ab2ac3", "x$&y");
      assert(std.string.cmp(replaced, "1xaby2xacy3") == 0);

      // Plain pattern and plain format.
      auto plain = new RegExp("ab", "g");
      replaced = plain.replace("1ab2ac3", "xy");
      assert(std.string.cmp(replaced, "1xy2ac3") == 0);
  }
  /*************************************************
   * Search s[] for a match, starting from its beginning.
   * Stores s as the current input and resets the recorded match
   * position before delegating to exec().
   * Returns:
   *  array of slices into s[] representing matches
   */
  public string[] exec(string s)
  {
      debug(regexp) printf("regexp.exec(string = '%.*s')\n", s.length, s.ptr);

      input = s;

      // Rewind so that exec() starts scanning at index 0.
      pmatch[0].rm_eo = 0;
      pmatch[0].rm_so = 0;

      return exec();
  }
  /*************************************************
   * Pick up where last exec(string) or exec() left off,
   * searching the stored input for the next match.
   * Returns:
   *  null if there is no further match; otherwise an array with one
   *  entry per pmatch[] slot, each a slice of the input, or null
   *  where the slot's start and end offsets are equal
   */
  public string[] exec()
  {
      if (!test())
          return null;

      // Freshly allocated string elements are already null, so only
      // slots whose start and end differ need to be filled in.
      auto captures = new string[pmatch.length];
      foreach (idx, ref slot; captures)
      {
          auto so = pmatch[idx].rm_so;
          auto eo = pmatch[idx].rm_eo;
          if (so != eo)
              slot = input[so .. eo];
      }
      return captures;
  }
  1074. /************************************************
  1075. * Search s[] for match.
   * Returns: false for no match, true for match
  1077. * Example:
  1078. ---
  1079. import std.stdio;
  1080. import std.regexp;
  1081. import std.string;
  1082. int grep(int delegate(char[]) pred, char[][] list)
  1083. {
  1084. int count;
  1085. foreach (s; list)
  1086. { if (pred(s))
  1087. ++count;
  1088. }
  1089. return count;
  1090. }
  1091. void main()
  1092. {
  1093. auto x = grep(&RegExp("[Ff]oo").test,
  1094. std.string.split("mary had a foo lamb"));
  1095. writefln(x);
  1096. }
  1097. ---
  1098. * which prints: 1
  1099. */
  1100. //@@@
  public bool test(string s)
  {
      // Always scan from the start of s (not from pmatch[0].rm_eo).
      auto matched = test(s, 0);
      return matched != 0;
  }
  /************************************************
   * Pick up where last test(string) or test() left off, and search again.
   * The search resumes in the stored input at the end offset of the
   * previously recorded match.
   * Returns: 0 for no match, !=0 for match
   */
  public int test()
  {
      auto resumeAt = pmatch[0].rm_eo;
      return test(input, resumeAt);
  }
  /************************************************
   * Test s[] starting at startindex against regular expression.
   * On success pmatch[0] holds the start and end offsets of the match.
   * Params:
   *  s = string to search; becomes the stored input
   *  startindex = offset in s at which to begin scanning
   * Returns: 0 for no match, !=0 for match
   */
  public int test(string s, size_t startindex)
  {
      input = s;
      debug (regexp) printf("RegExp.test(input[] = '%.*s', startindex = %zd)\n", input.length, input.ptr, startindex);

      pmatch[0].rm_so = 0;
      pmatch[0].rm_eo = 0;

      // startindex is unsigned, so only the upper bound can be violated.
      if (startindex > input.length)
          return 0;                       // fail

      //debug(regexp) printProgram(program);

      // First character optimization: when the program begins with a
      // literal character, skip straight to its next occurrence.
      char firstc = 0;
      if (program[0] == REchar)
      {
          firstc = program[1];
          if ((attributes & REA.ignoreCase) && isAlpha(firstc))
              firstc = 0;                 // either case may start a match
      }

      for (size_t si = startindex; ; si++)
      {
          if (firstc)
          {
              if (si == input.length)
                  break;                  // no match
              if (input[si] != firstc)
              {
                  si++;
                  if (!chr(si, firstc))   // if first character not found
                      break;              // no match
              }
          }

          // Forget the bounds left over from the previous attempt.
          foreach (ref m; pmatch[0 .. re_nsub + 1])
          {
              m.rm_so = -1;
              m.rm_eo = -1;
          }

          src_start = src = si;
          if (trymatch(0, program.length))
          {
              pmatch[0].rm_so = si;
              pmatch[0].rm_eo = src;
              //debug(regexp) printf("start = %d, end = %d\n", gmatch.rm_so, gmatch.rm_eo);
              return 1;
          }

          // If possible match must start at beginning, we are done
          if (program[0] == REbol || program[0] == REanystar)
          {
              if (!(attributes & REA.multiline))
                  break;
              // Scan for the next \n
              if (!chr(si, '\n'))
                  break;                  // no match if '\n' not found
          }

          if (si == input.length)
              break;

          debug(regexp)
          {
              auto sss = input[si + 1 .. input.length];
              printf("Starting new try: '%.*s'\n", sss.length, sss.ptr);
          }
      }
      return 0;                           // no match
  }
  1185. /**
  1186. Returns whether string $(D_PARAM s) matches $(D_PARAM this).
  1187. */
  1188. alias test opEquals;
  1189. // bool opEquals(string s)
  1190. // {
  1191. // return test(s);
  1192. // }
  unittest
  {
      // Three characters with 'b' in the middle: matches.
      assert("abc" == RegExp(".b."));

      // Needs a fourth character that "abc" does not have.
      assert(!("abc" == RegExp(".b..")));
  }
  /**
   * Advance si through the stored input until input[si] == c.
   * Params:
   *  si = scan position, updated in place; left at the index of the
   *       character found, or at input.length if there is none
   *  c = character to look for
   * Returns: 1 if c was found, 0 otherwise
   */
  int chr(ref size_t si, rchar c)
  {
      while (si < input.length)
      {
          if (input[si] == c)
              return 1;
          ++si;
      }
      return 0;
  }
  /**
   * Print a listing of a compiled program to stdout, one opcode per
   * line, each prefixed with its byte offset in prog[].
   * The listing stops at the first REend opcode or at the end of prog[].
   * Params:
   *  prog = compiled program bytes to list
   *
   * size_t values are cast to the type the printf conversion expects,
   * since printf is a C variadic function.
   */
  void printProgram(const(ubyte)[] prog)
  {
      //debug(regexp)
      {
          size_t len;
          uint n;
          uint m;
          ushort *pu;
          uint *puint;
          char[] str;

          printf("printProgram()\n");
          for (size_t pc = 0; pc < prog.length; )
          {
              printf("%3d: ", cast(int)pc);

              //printf("prog[pc] = %d, REchar = %d, REnmq = %d\n", prog[pc], REchar, REnmq);
              switch (prog[pc])
              {
                  case REchar:
                      printf("\tREchar '%c'\n", prog[pc + 1]);
                      pc += 1 + char.sizeof;
                      break;

                  case REichar:
                      printf("\tREichar '%c'\n", prog[pc + 1]);
                      pc += 1 + char.sizeof;
                      break;

                  case REdchar:
                      printf("\tREdchar '%c'\n", *cast(dchar *)&prog[pc + 1]);
                      pc += 1 + dchar.sizeof;
                      break;

                  case REidchar:
                      printf("\tREidchar '%c'\n", *cast(dchar *)&prog[pc + 1]);
                      pc += 1 + dchar.sizeof;
                      break;

                  case REanychar:
                      printf("\tREanychar\n");
                      pc++;
                      break;

                  case REstring:
                      // operand: size_t length, then the characters
                      len = *cast(size_t *)&prog[pc + 1];
                      str = (cast(char*)&prog[pc + 1 + size_t.sizeof])[0 .. len];
                      printf("\tREstring x%x, '%.*s'\n", cast(uint)len, cast(int)str.length, str.ptr);
                      pc += 1 + size_t.sizeof + len * rchar.sizeof;
                      break;

                  case REistring:
                      len = *cast(size_t *)&prog[pc + 1];
                      str = (cast(char*)&prog[pc + 1 + size_t.sizeof])[0 .. len];
                      printf("\tREistring x%x, '%.*s'\n", cast(uint)len, cast(int)str.length, str.ptr);
                      pc += 1 + size_t.sizeof + len * rchar.sizeof;
                      break;

                  case REtestbit:
                      // operand: two ushorts, then pu[1] bytes of bitmap
                      pu = cast(ushort *)&prog[pc + 1];
                      printf("\tREtestbit %d, %d\n", pu[0], pu[1]);
                      len = pu[1];
                      pc += 1 + 2 * ushort.sizeof + len;
                      break;

                  case REbit:
                      pu = cast(ushort *)&prog[pc + 1];
                      len = pu[1];
                      printf("\tREbit cmax=%02x, len=%d:", pu[0], cast(int)len);
                      for (n = 0; n < len; n++)
                          printf(" %02x", prog[pc + 1 + 2 * ushort.sizeof + n]);
                      printf("\n");
                      pc += 1 + 2 * ushort.sizeof + len;
                      break;

                  case REnotbit:
                      pu = cast(ushort *)&prog[pc + 1];
                      printf("\tREnotbit %d, %d\n", pu[0], pu[1]);
                      len = pu[1];
                      pc += 1 + 2 * ushort.sizeof + len;
                      break;

                  case RErange:
                      len = *cast(uint *)&prog[pc + 1];
                      printf("\tRErange %d\n", cast(int)len);
                      // BUG: REAignoreCase?
                      pc += 1 + uint.sizeof + len;
                      break;

                  case REnotrange:
                      len = *cast(uint *)&prog[pc + 1];
                      printf("\tREnotrange %d\n", cast(int)len);
                      // BUG: REAignoreCase?
                      pc += 1 + uint.sizeof + len;
                      break;

                  case REbol:
                      printf("\tREbol\n");
                      pc++;
                      break;

                  case REeol:
                      printf("\tREeol\n");
                      pc++;
                      break;

                  case REor:
                      // Only the header is skipped; the operands that
                      // follow are listed as ordinary opcodes.
                      len = *cast(uint *)&prog[pc + 1];
                      printf("\tREor %d, pc=>%d\n", cast(int)len,
                              cast(int)(pc + 1 + uint.sizeof + len));
                      pc += 1 + uint.sizeof;
                      break;

                  case REgoto:
                      len = *cast(uint *)&prog[pc + 1];
                      printf("\tREgoto %d, pc=>%d\n", cast(int)len,
                              cast(int)(pc + 1 + uint.sizeof + len));
                      pc += 1 + uint.sizeof;
                      break;

                  case REanystar:
                      printf("\tREanystar\n");
                      pc++;
                      break;

                  case REnm:
                  case REnmq:
                      // len, n, m, ()
                      puint = cast(uint *)&prog[pc + 1];
                      len = puint[0];
                      n = puint[1];
                      m = puint[2];
                      printf("\tREnm%s len=%d, n=%u, m=%u, pc=>%d\n",
                              (prog[pc] == REnmq) ? "q".ptr : " ".ptr,
                              cast(int)len, n, m,
                              cast(int)(pc + 1 + uint.sizeof * 3 + len));
                      pc += 1 + uint.sizeof * 3;
                      break;

                  case REparen:
                      // len, n, ()
                      puint = cast(uint *)&prog[pc + 1];
                      len = puint[0];
                      n = puint[1];
                      printf("\tREparen len=%d n=%d, pc=>%d\n", cast(int)len, n,
                              cast(int)(pc + 1 + uint.sizeof * 2 + len));
                      pc += 1 + uint.sizeof * 2;
                      break;

                  case REend:
                      printf("\tREend\n");
                      return;

                  case REwordboundary:
                      printf("\tREwordboundary\n");
                      pc++;
                      break;

                  case REnotwordboundary:
                      printf("\tREnotwordboundary\n");
                      pc++;
                      break;

                  case REdigit:
                      printf("\tREdigit\n");
                      pc++;
                      break;

                  case REnotdigit:
                      printf("\tREnotdigit\n");
                      pc++;
                      break;

                  case REspace:
                      printf("\tREspace\n");
                      pc++;
                      break;

                  case REnotspace:
                      printf("\tREnotspace\n");
                      pc++;
                      break;

                  case REword:
                      printf("\tREword\n");
                      pc++;
                      break;

                  case REnotword:
                      printf("\tREnotword\n");
                      pc++;
                      break;

                  case REbackref:
                      // The group number is the byte after this opcode,
                      // i.e. prog[pc + 1] (previously printed prog[1]).
                      printf("\tREbackref %d\n", prog[pc + 1]);
                      pc += 2;
                      break;

                  default:
                      assert(0);
              }
          }
      }
  }
  /**************************************************
   * Match input against a section of the program[].
   * Matching starts at the current src position and advances src past
   * whatever is consumed. On failure src is restored to the value it
   * had on entry.
   * Params:
   *  pc = offset in program[] of the first opcode to execute
   *  pcend = offset in program[] at which this section ends
   * Returns:
   *  1 if successful match
   *  0 no match
   */
  int trymatch(size_t pc, size_t pcend)
  {
      size_t len;
      size_t n;
      size_t m;
      size_t count;
      size_t pop;
      size_t ss;
      regmatch_t *psave;
      size_t c1;
      size_t c2;
      ushort* pu;
      uint* puint;

      debug(regexp)
      {
          auto sss = input[src .. input.length];
          printf("RegExp.trymatch(pc = %zd, src = '%.*s', pcend = %zd)\n", pc, sss.length, sss.ptr, pcend);
      }
      auto srcsave = src;     // restored at Lnomatch
      psave = null;           // allocated lazily by REnm/REnmq
      for (;;)
      {
          if (pc == pcend)            // if done matching
          {
              debug(regexp) printf("\tprogend\n");
              return 1;
          }

          //printf("\top = %d\n", program[pc]);
          switch (program[pc])
          {
              case REchar:
                  if (src == input.length)
                      goto Lnomatch;
                  debug(regexp) printf("\tREchar '%c', src = '%c'\n", program[pc + 1], input[src]);
                  if (program[pc + 1] != input[src])
                      goto Lnomatch;
                  src++;
                  pc += 1 + char.sizeof;
                  break;

              case REichar:
                  if (src == input.length)
                      goto Lnomatch;
                  debug(regexp) printf("\tREichar '%c', src = '%c'\n", program[pc + 1], input[src]);
                  c1 = program[pc + 1];
                  c2 = input[src];
                  if (c1 != c2)
                  {
                      // Only a lower-case input character is folded
                      // before the second comparison.
                      if (isLower(cast(rchar)c2))
                          c2 = std.ascii.toUpper(cast(rchar)c2);
                      else
                          goto Lnomatch;
                      if (c1 != c2)
                          goto Lnomatch;
                  }
                  src++;
                  pc += 1 + char.sizeof;
                  break;

              case REdchar:
                  // Bounds check first: the debug trace reads input[src].
                  if (src == input.length)
                      goto Lnomatch;
                  debug(regexp) printf("\tREdchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]);
                  if (*(cast(dchar *)&program[pc + 1]) != input[src])
                      goto Lnomatch;
                  src++;
                  pc += 1 + dchar.sizeof;
                  break;

              case REidchar:
                  // Bounds check first: the debug trace reads input[src].
                  if (src == input.length)
                      goto Lnomatch;
                  debug(regexp) printf("\tREidchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]);
                  c1 = *(cast(dchar *)&program[pc + 1]);
                  c2 = input[src];
                  if (c1 != c2)
                  {
                      if (isLower(cast(rchar)c2))
                          c2 = std.ascii.toUpper(cast(rchar)c2);
                      else
                          goto Lnomatch;
                      if (c1 != c2)
                          goto Lnomatch;
                  }
                  src++;
                  pc += 1 + dchar.sizeof;
                  break;

              case REanychar:
                  debug(regexp) printf("\tREanychar\n");
                  if (src == input.length)
                      goto Lnomatch;
                  if (!(attributes & REA.dotmatchlf) && input[src] == cast(rchar)'\n')
                      goto Lnomatch;
                  // Step over one whole UTF-8 sequence, not just one byte.
                  src += std.utf.stride(input, src);
                  //src++;
                  pc++;
                  break;

              case REstring:
                  len = *cast(size_t *)&program[pc + 1];
                  debug(regexp)
                  {
                      auto sss2 = (&program[pc + 1 + size_t.sizeof])[0 .. len];
                      printf("\tREstring x%x, '%.*s'\n", len, sss2.length, sss2.ptr);
                  }
                  if (src + len > input.length)
                      goto Lnomatch;
                  if (memcmp(&program[pc + 1 + size_t.sizeof], &input[src], len * rchar.sizeof))
                      goto Lnomatch;
                  src += len;
                  pc += 1 + size_t.sizeof + len * rchar.sizeof;
                  break;

              case REistring:
                  len = *cast(size_t *)&program[pc + 1];
                  debug(regexp)
                  {
                      auto sss2 = (&program[pc + 1 + size_t.sizeof])[0 .. len];
                      printf("\tREistring x%x, '%.*s'\n", len, sss2.length, sss2.ptr);
                  }
                  if (src + len > input.length)
                      goto Lnomatch;
                  if (icmp((cast(char*)&program[pc + 1 + size_t.sizeof])[0..len],
                           input[src .. src + len]))
                      goto Lnomatch;
                  src += len;
                  pc += 1 + size_t.sizeof + len * rchar.sizeof;
                  break;

              case REtestbit:
                  // Checks the current character against the bitmap
                  // without consuming it (src is not advanced).
                  pu = (cast(ushort *)&program[pc + 1]);
                  if (src == input.length)
                      goto Lnomatch;
                  debug(regexp) printf("\tREtestbit %d, %d, '%c', x%02x\n",
                          pu[0], pu[1], input[src], input[src]);
                  len = pu[1];
                  c1 = input[src];
                  //printf("[x%02x]=x%02x, x%02x\n", c1 >> 3, ((&program[pc + 1 + 4])[c1 >> 3] ), (1 << (c1 & 7)));
                  if (c1 <= pu[0] &&
                      !((&(program[pc + 1 + 4]))[c1 >> 3] & (1 << (c1 & 7))))
                      goto Lnomatch;
                  pc += 1 + 2 * ushort.sizeof + len;
                  break;

              case REbit:
                  pu = (cast(ushort *)&program[pc + 1]);
                  if (src == input.length)
                      goto Lnomatch;
                  debug(regexp) printf("\tREbit %d, %d, '%c'\n",
                          pu[0], pu[1], input[src]);
                  len = pu[1];
                  c1 = input[src];
                  if (c1 > pu[0])
                      goto Lnomatch;
                  if (!((&program[pc + 1 + 4])[c1 >> 3] & (1 << (c1 & 7))))
                      goto Lnomatch;
                  src++;
                  pc += 1 + 2 * ushort.sizeof + len;
                  break;

              case REnotbit:
                  pu = (cast(ushort *)&program[pc + 1]);
                  if (src == input.length)
                      goto Lnomatch;
                  debug(regexp) printf("\tREnotbit %d, %d, '%c'\n",
                          pu[0], pu[1], input[src]);
                  len = pu[1];
                  c1 = input[src];
                  if (c1 <= pu[0] &&
                      ((&program[pc + 1 + 4])[c1 >> 3] & (1 << (c1 & 7))))
                      goto Lnomatch;
                  src++;
                  pc += 1 + 2 * ushort.sizeof + len;
                  break;

              case RErange:
                  len = *cast(uint *)&program[pc + 1];
                  debug(regexp) printf("\tRErange %d\n", len);
                  if (src == input.length)
                      goto Lnomatch;
                  // BUG: REA.ignoreCase?
                  if (memchr(cast(char*)&program[pc + 1 + uint.sizeof], input[src], len) == null)
                      goto Lnomatch;
                  src++;
                  pc += 1 + uint.sizeof + len;
                  break;

              case REnotrange:
                  len = *cast(uint *)&program[pc + 1];
                  debug(regexp) printf("\tREnotrange %d\n", len);
                  if (src == input.length)
                      goto Lnomatch;
                  // BUG: REA.ignoreCase?
                  if (memchr(cast(char*)&program[pc + 1 + uint.sizeof], input[src], len) != null)
                      goto Lnomatch;
                  src++;
                  pc += 1 + uint.sizeof + len;
                  break;

              case REbol:
                  debug(regexp) printf("\tREbol\n");
                  if (src == 0)
                  {
                  }
                  else if (attributes & REA.multiline)
                  {
                      if (input[src - 1] != '\n')
                          goto Lnomatch;
                  }
                  else
                      goto Lnomatch;
                  pc++;
                  break;

              case REeol:
                  debug(regexp) printf("\tREeol\n");
                  if (src == input.length)
                  {
                  }
                  else if (attributes & REA.multiline && input[src] == '\n')
                      src++;
                  else
                      goto Lnomatch;
                  pc++;
                  break;

              case REor:
                  len = (cast(uint *)&program[pc + 1])[0];
                  debug(regexp) printf("\tREor %d\n", len);
                  pop = pc + 1 + uint.sizeof;
                  ss = src;
                  if (trymatch(pop, pcend))
                  {
                      if (pcend != program.length)
                      {
                          auto s = src;
                          if (trymatch(pcend, program.length))
                          {
                              debug(regexp) printf("\tfirst operand matched\n");
                              src = s;
                              return 1;
                          }
                          else
                          {
                              // If second branch doesn't match to end, take first anyway
                              src = ss;
                              if (!trymatch(pop + len, program.length))
                              {
                                  debug(regexp) printf("\tfirst operand matched\n");
                                  src = s;
                                  return 1;
                              }
                          }
                          src = ss;
                      }
                      else
                      {
                          debug(regexp) printf("\tfirst operand matched\n");
                          return 1;
                      }
                  }
                  pc = pop + len;     // proceed with 2nd branch
                  break;

              case REgoto:
                  debug(regexp) printf("\tREgoto\n");
                  len = (cast(uint *)&program[pc + 1])[0];
                  pc += 1 + uint.sizeof + len;
                  break;

              case REanystar:
                  debug(regexp) printf("\tREanystar\n");
                  pc++;
                  for (;;)
                  {
                      auto s1 = src;
                      if (src == input.length)
                          break;
                      if (!(attributes & REA.dotmatchlf) && input[src] == '\n')
                          break;
                      src++;
                      auto s2 = src;

                      // If no match after consumption, but it
                      // did match before, then no match
                      if (!trymatch(pc, program.length))
                      {
                          src = s1;
                          // BUG: should we save/restore pmatch[]?
                          if (trymatch(pc, program.length))
                          {
                              src = s1;       // no match
                              break;
                          }
                      }
                      src = s2;
                  }
                  break;

              case REnm:
              case REnmq:
                  // len, n, m, ()
                  puint = cast(uint *)&program[pc + 1];
                  len = puint[0];
                  n = puint[1];
                  m = puint[2];
                  debug(regexp) printf("\tREnm%s len=%d, n=%u, m=%u\n",
                          (program[pc] == REnmq) ? "q".ptr : "".ptr, len, n, m);
                  pop = pc + 1 + uint.sizeof * 3;

                  // The first n repetitions are mandatory.
                  for (count = 0; count < n; count++)
                  {
                      if (!trymatch(pop, pop + len))
                          goto Lnomatch;
                  }
                  if (!psave && count < m)
                  {
                      //version (Win32)
                      psave = cast(regmatch_t *)alloca((re_nsub + 1) * regmatch_t.sizeof);
                      //else
                      //psave = new regmatch_t[re_nsub + 1];
                  }
                  if (program[pc] == REnmq)   // if minimal munch
                  {
                      for (; count < m; count++)
                      {
                          memcpy(psave, pmatch.ptr, (re_nsub + 1) * regmatch_t.sizeof);
                          auto s1 = src;

                          // Stop repeating as soon as the rest matches.
                          if (trymatch(pop + len, program.length))
                          {
                              src = s1;
                              memcpy(pmatch.ptr, psave, (re_nsub + 1) * regmatch_t.sizeof);
                              break;
                          }

                          if (!trymatch(pop, pop + len))
                          {
                              debug(regexp) printf("\tdoesn't match subexpression\n");
                              break;
                          }

                          // If source is not consumed, don't
                          // infinite loop on the match
                          if (s1 == src)
                          {
                              debug(regexp) printf("\tsource is not consumed\n");
                              break;
                          }
                      }
                  }
                  else    // maximal munch
                  {
                      for (; count < m; count++)
                      {
                          memcpy(psave, pmatch.ptr, (re_nsub + 1) * regmatch_t.sizeof);
                          auto s1 = src;
                          if (!trymatch(pop, pop + len))
                          {
                              debug(regexp) printf("\tdoesn't match subexpression\n");
                              break;
                          }
                          auto s2 = src;

                          // If source is not consumed, don't
                          // infinite loop on the match
                          if (s1 == s2)
                          {
                              debug(regexp) printf("\tsource is not consumed\n");
                              break;
                          }

                          // If no match after consumption, but it
                          // did match before, then no match
                          if (!trymatch(pop + len, program.length))
                          {
                              src = s1;
                              if (trymatch(pop + len, program.length))
                              {
                                  src = s1;       // no match
                                  memcpy(pmatch.ptr, psave, (re_nsub + 1) * regmatch_t.sizeof);
                                  break;
                              }
                          }
                          src = s2;
                      }
                  }
                  debug(regexp) printf("\tREnm len=%d, n=%u, m=%u, DONE count=%d\n", len, n, m, count);
                  pc = pop + len;
                  break;

              case REparen:
                  // len, ()
                  debug(regexp) printf("\tREparen\n");
                  puint = cast(uint *)&program[pc + 1];
                  len = puint[0];
                  n = puint[1];
                  pop = pc + 1 + uint.sizeof * 2;
                  ss = src;
                  if (!trymatch(pop, pop + len))
                      goto Lnomatch;
                  // Group n is recorded in slot n + 1; slot 0 is the whole match.
                  pmatch[n + 1].rm_so = ss;
                  pmatch[n + 1].rm_eo = src;
                  pc = pop + len;
                  break;

              case REend:
                  debug(regexp) printf("\tREend\n");
                  return 1;           // successful match

              case REwordboundary:
                  debug(regexp) printf("\tREwordboundary\n");
                  if (src > 0 && src < input.length)
                  {
                      c1 = input[src - 1];
                      c2 = input[src];
                      if (!(
                            (isword(cast(rchar)c1) && !isword(cast(rchar)c2)) ||
                            (!isword(cast(rchar)c1) && isword(cast(rchar)c2))
                           )
                         )
                          goto Lnomatch;
                  }
                  pc++;
                  break;

              case REnotwordboundary:
                  debug(regexp) printf("\tREnotwordboundary\n");
                  if (src == 0 || src == input.length)
                      goto Lnomatch;
                  c1 = input[src - 1];
                  c2 = input[src];
                  if (
                      (isword(cast(rchar)c1) && !isword(cast(rchar)c2)) ||
                      (!isword(cast(rchar)c1) && isword(cast(rchar)c2))
                     )
                      goto Lnomatch;
                  pc++;
                  break;

              case REdigit:
                  debug(regexp) printf("\tREdigit\n");
                  if (src == input.length)
                      goto Lnomatch;
                  if (!isDigit(input[src]))
                      goto Lnomatch;
                  src++;
                  pc++;
                  break;

              case REnotdigit:
                  debug(regexp) printf("\tREnotdigit\n");
                  if (src == input.length)
                      goto Lnomatch;
                  if (isDigit(input[src]))
                      goto Lnomatch;
                  src++;
                  pc++;
                  break;

              case REspace:
                  debug(regexp) printf("\tREspace\n");
                  if (src == input.length)
                      goto Lnomatch;
                  if (!isWhite(input[src]))
                      goto Lnomatch;
                  src++;
                  pc++;
                  break;

              case REnotspace:
                  debug(regexp) printf("\tREnotspace\n");
                  if (src == input.length)
                      goto Lnomatch;
                  if (isWhite(input[src]))
                      goto Lnomatch;
                  src++;
                  pc++;
                  break;

              case REword:
                  debug(regexp) printf("\tREword\n");
                  if (src == input.length)
                      goto Lnomatch;
                  if (!isword(input[src]))
                      goto Lnomatch;
                  src++;
                  pc++;
                  break;

              case REnotword:
                  debug(regexp) printf("\tREnotword\n");
                  if (src == input.length)
                      goto Lnomatch;
                  if (isword(input[src]))
                      goto Lnomatch;
                  src++;
                  pc++;
                  break;

              case REbackref:
              {
                  n = program[pc + 1];
                  debug(regexp) printf("\tREbackref %d\n", n);

                  // Compare the input at src with the text group n captured.
                  auto so = pmatch[n + 1].rm_so;
                  auto eo = pmatch[n + 1].rm_eo;
                  len = eo - so;
                  if (src + len > input.length)
                      goto Lnomatch;
                  else if (attributes & REA.ignoreCase)
                  {
                      if (icmp(input[src .. src + len], input[so .. eo]))
                          goto Lnomatch;
                  }
                  else if (memcmp(&input[src], &input[so], len * rchar.sizeof))
                      goto Lnomatch;
                  src += len;
                  pc += 2;
                  break;
              }

              default:
                  assert(0);
          }
      }

  Lnomatch:
      debug(regexp) printf("\tnomatch pc=%d\n", pc);
      src = srcsave;
      return 0;
  }
  1870. /* =================== Compiler ================== */
  /**
   * Compile pattern[] from position p into buf, up to an unmatched ')'
   * or the end of the pattern, handling '|' alternation at this level.
   * REend is written when the end of the pattern is reached.
   * For "A|B" the emitted layout is:
   *   REor, length of first branch, A, REgoto, length of B, B
   * Returns: 1
   */
  int parseRegexp()
  {
      debug(regexp)
      {
          auto sss = pattern[p .. pattern.length];
          printf("parseRegexp() '%.*s'\n", sss.length, sss.ptr);
      }

      // Where this (sub)expression starts in buf; the REor header is
      // inserted here when an alternation is found.
      auto start = buf.offset;

      for (;;)
      {
          assert(p <= pattern.length);
          if (p == pattern.length)
          {
              buf.write(REend);
              return 1;
          }

          auto ch = pattern[p];

          if (ch == ')')
              return 1;

          if (ch != '|')
          {
              parsePiece();
              continue;
          }

          // Alternation: everything emitted since start is the first branch.
          p++;
          size_t gotoPos = buf.offset;
          buf.write(REgoto);
          buf.write(cast(uint)0);             // placeholder, patched below
          uint firstLen = cast(uint)(buf.offset - start);

          // Open a gap at start for the REor opcode and its operand;
          // the REgoto just written shifts by the same amount.
          buf.spread(start, 1 + uint.sizeof);
          gotoPos += 1 + uint.sizeof;

          parseRegexp();                      // second branch
          uint secondLen = cast(uint)(buf.offset - (gotoPos + 1 + uint.sizeof));

          buf.data[start] = REor;
          *cast(uint *)&buf.data[start + 1] = firstLen;
          *cast(uint *)&buf.data[gotoPos + 1] = secondLen;
      }
  }
  1913. int parsePiece()
  1914. {
  1915. uint len;
  1916. uint n;
  1917. uint m;
  1918. ubyte op;
  1919. auto plength = pattern.length;
  1920. debug(regexp)
  1921. {
  1922. auto sss = pattern[p .. pattern.length];
  1923. printf("parsePiece() '%.*s'\n", sss.length, sss.ptr);
  1924. }
  1925. auto offset = buf.offset;
  1926. parseAtom();
  1927. if (p == plength)
  1928. return 1;
  1929. switch (pattern[p])
  1930. {
  1931. case '*':
  1932. // Speci