PageRenderTime 182ms CodeModel.GetById 20ms app.highlight 149ms RepoModel.GetById 1ms app.codeStats 1ms

/std/regexp.d

http://github.com/jcd/phobos
D | 2207 lines | 1447 code | 256 blank | 504 comment | 319 complexity | e1195e03ce2e2a4902931d040408c37a MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1// Written in the D programming language.
   2// Regular Expressions.
   3
   4/**
   5 * $(RED Deprecated. It will be removed in March 2012.
   6 *       Please use $(LINK2 std_regex.html, std.regex) instead.)
   7 *
   8 * $(LINK2 http://www.digitalmars.com/ctg/regular.html, Regular
   9 * expressions) are a powerful method of string pattern matching.  The
  10 * regular expression language used in this library is the same as
  11 * that commonly used, however, some of the very advanced forms may
  12 * behave slightly differently. The standard observed is the $(WEB
  13 * www.ecma-international.org/publications/standards/Ecma-262.htm,
  14 * ECMA standard) for regular expressions.
  15 *
  16 * std.regexp is designed to work only with valid UTF strings as input.
  17 * To validate untrusted input, use std.utf.validate().
  18 *
  19 * In the following guide, $(I pattern)[] refers to a
  20 * $(LINK2 http://www.digitalmars.com/ctg/regular.html, regular expression).
  21 * The $(I attributes)[] refers to
  22 * a string controlling the interpretation
  23 * of the regular expression.
  24 * It consists of a sequence of one or more
  25 * of the following characters:
  26 *
  27 * <table border=1 cellspacing=0 cellpadding=5>
  28 * <caption>Attribute Characters</caption>
  29 * $(TR $(TH Attribute) $(TH Action))
  30 * <tr>
  31 * $(TD $(B g))
  32 * $(TD global; repeat over the whole input string)
  33 * </tr>
  34 * <tr>
  35 * $(TD $(B i))
  36 * $(TD case insensitive)
  37 * </tr>
  38 * <tr>
  39 * $(TD $(B m))
  40 * $(TD treat as multiple lines separated by newlines)
  41 * </tr>
  42 * </table>
  43 *
  44 * The $(I format)[] string has the formatting characters:
  45 *
  46 * <table border=1 cellspacing=0 cellpadding=5>
  47 * <caption>Formatting Characters</caption>
  48 * $(TR $(TH Format) $(TH Replaced With))
  49 * $(TR
  50 * $(TD $(B $$))    $(TD $)
  51 * )
  52 * $(TR
  53 * $(TD $(B $&amp;))    $(TD The matched substring.)
  54 * )
  55 * $(TR
  56 * $(TD $(B $`))    $(TD The portion of string that precedes the matched substring.)
  57 * )
  58 * $(TR
  59 * $(TD $(B $'))    $(TD The portion of string that follows the matched substring.)
  60 * )
  61 * $(TR
  62 * $(TD $(B $(DOLLAR))$(I n)) $(TD The $(I n)th capture, where $(I n)
  63 *      is a single digit 1-9
  64 *      and $$(I n) is not followed by a decimal digit.)
  65 * )
  66 * $(TR
  67 * $(TD $(B $(DOLLAR))$(I nn)) $(TD The $(I nn)th capture, where $(I nn)
  68 *      is a two-digit decimal
  69 *      number 01-99.
  70 *      If $(I nn)th capture is undefined or more than the number
  71 *      of parenthesized subexpressions, use the empty
  72 *      string instead.)
  73 * )
  74 * </table>
  75 *
  76 * Any other $ are left as is.
  77 *
  78 * References:
  79 *  $(LINK2 http://en.wikipedia.org/wiki/Regular_expressions, Wikipedia)
  80 * Macros:
  81 *  WIKI = StdRegexp
  82 *  DOLLAR = $
  83 *
  84 * Copyright: Copyright Digital Mars 2000 - 2011.
  85 * License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
  86 * Authors:   $(WEB digitalmars.com, Walter Bright)
  87 * Source:    $(PHOBOSSRC std/_regexp.d)
  88 */
  89/*          Copyright Digital Mars 2000 - 2011.
  90 * Distributed under the Boost Software License, Version 1.0.
  91 *    (See accompanying file LICENSE_1_0.txt or copy at
  92 *          http://www.boost.org/LICENSE_1_0.txt)
  93 */
  94
  95/*
  96  Escape sequences:
  97
  98  \nnn starts out a 1, 2 or 3 digit octal sequence,
  99  where n is an octal digit. If nnn is larger than
 100  0377, then the 3rd digit is not part of the sequence
 101  and is not consumed.
 102  For maximal portability, use exactly 3 digits.
 103
 104  \xXX starts out a 1 or 2 digit hex sequence. X
 105  is a hex character. If the first character after the \x
 106  is not a hex character, the value of the sequence is 'x'
 107  and the XX are not consumed.
 108  For maximal portability, use exactly 2 digits.
 109
 110  \uUUUU is a unicode sequence. There are exactly
 111  4 hex characters after the \u, if any are not, then
 112  the value of the sequence is 'u', and the UUUU are not
 113  consumed.
 114
 115  Character classes:
 116
 117  [a-b], where a is greater than b, will produce
 118  an error.
 119
 120  References:
 121
 122  http://www.unicode.org/unicode/reports/tr18/
 123*/
 124
 125module std.regexp;
 126
 127pragma(msg, "Notice: As of Phobos 2.055, std.regexp has been deprecated. " ~
 128            "It will be removed in March 2012. Please use std.regex instead.");
 129
 130//debug = regexp;       // uncomment to turn on debugging printf's
 131
 132private
 133{
 134    import core.stdc.stdio;
 135    import core.stdc.stdlib;
 136    import core.stdc.string;
 137    import std.array;
 138    import std.stdio;
 139    import std.string;
 140    import std.ascii;
 141    import std.outbuffer;
 142    import std.bitmanip;
 143    import std.utf;
 144    import std.algorithm;
 145    import std.array;
 146    import std.traits;
 147}
 148
 149deprecated:
 150
 151/** Regular expression to extract an _email address.
 152 * References:
 153 *  $(LINK2 http://www.regular-expressions.info/email.html, How to Find or Validate an Email Address)$(BR)
 154 *  $(LINK2 http://tools.ietf.org/html/rfc2822#section-3.4.1, RFC 2822 Internet Message Format)
 155 */
 156string email =
 157    r"[a-zA-Z]([.]?([[a-zA-Z0-9_]-]+)*)?@([[a-zA-Z0-9_]\-_]+\.)+[a-zA-Z]{2,6}";
 158
 159/** Regular expression to extract a _url */
 160string url = r"(([h|H][t|T]|[f|F])[t|T][p|P]([s|S]?)\:\/\/|~/|/)?([\w]+:\w+@)?(([a-zA-Z]{1}([\w\-]+\.)+([\w]{2,5}))(:[\d]{1,5})?)?((/?\w+/)+|/?)(\w+\.[\w]{3,4})?([,]\w+)*((\?\w+=\w+)?(&\w+=\w+)*([,]\w*)*)?";
 161
 162/************************************
 163 * One of these gets thrown on compilation errors
 164 */
 165
 166class RegExpException : Exception
 167{
 168    this(string msg)
 169    {
 170        super(msg);
 171    }
 172}
 173
 174struct regmatch_t
 175{
 176    sizediff_t rm_so; // index of start of match
 177    sizediff_t rm_eo; // index past end of match
 178}
 179
 180private alias char rchar;   // so we can make a wchar version
 181
 182/******************************************************
 183 * Search string for matches with regular expression
 184 * pattern with attributes.
 185 * Replace each match with string generated from format.
 186 * Params:
 187 *  s = String to search.
 188 *  pattern = Regular expression pattern.
 189 *  format = Replacement string format.
 190 *  attributes = Regular expression attributes.
 191 * Returns:
 192 *  the resulting string
 193 * Example:
 194 *  Replace the letters 'a' with the letters 'ZZ'.
 195 * ---
 196 * s = "Strap a rocket engine on a chicken."
 197 * sub(s, "a", "ZZ")        // result: StrZZp a rocket engine on a chicken.
 198 * sub(s, "a", "ZZ", "g")   // result: StrZZp ZZ rocket engine on ZZ chicken.
 199 * ---
 200 *  The replacement format can reference the matches using
 201 *  the $&amp;, $$, $', $`, $0 .. $99 notation:
 202 * ---
  203 * sub(s, "[ar]", "[$&]", "g") // result: St[r][a]p [a] [r]ocket engine on [a] chicken.
 204 * ---
 205 */
 206
 207string sub(string s, string pattern, string format, string attributes = null)
 208{
 209    auto r = new RegExp(pattern, attributes);
 210    auto result = r.replace(s, format);
 211    delete r;
 212    return result;
 213}
 214
 215unittest
 216{
 217    debug(regexp) printf("regexp.sub.unittest\n");
 218
 219    string r = sub("hello", "ll", "ss");
 220    assert(r == "hesso");
 221}
 222
 223/*******************************************************
 224 * Search string for matches with regular expression
 225 * pattern with attributes.
 226 * Pass each match to delegate dg.
 227 * Replace each match with the return value from dg.
 228 * Params:
 229 *  s = String to search.
 230 *  pattern = Regular expression pattern.
 231 *  dg = Delegate
 232 *  attributes = Regular expression attributes.
 233 * Returns: the resulting string.
 234 * Example:
 235 * Capitalize the letters 'a' and 'r':
 236 * ---
 237 * s = "Strap a rocket engine on a chicken.";
 238 * sub(s, "[ar]",
  239 *    delegate string (RegExp m)
 240 *    {
 241 *         return toUpper(m[0]);
 242 *    },
 243 *    "g");    // result: StRAp A Rocket engine on A chicken.
 244 * ---
 245 */
 246
 247string sub(string s, string pattern, string delegate(RegExp) dg, string attributes = null)
 248{
 249    auto r = new RegExp(pattern, attributes);
 250
 251    string result = s;
 252    size_t lastindex = 0;
 253    size_t offset = 0;
 254
 255    while (r.test(s, lastindex))
 256    {
 257        auto so = r.pmatch[0].rm_so;
 258        auto eo = r.pmatch[0].rm_eo;
 259
 260        string replacement = dg(r);
 261
 262        // Optimize by using std.string.replace if possible - Dave Fladebo
 263        string slice = result[offset + so .. offset + eo];
 264        if (r.attributes & RegExp.REA.global &&     // global, so replace all
 265                !(r.attributes & RegExp.REA.ignoreCase) &&  // not ignoring case
 266                !(r.attributes & RegExp.REA.multiline) &&   // not multiline
 267                pattern == slice)               // simple pattern (exact match, no special characters)
 268        {
 269            debug(regexp)
 270                printf("result: %.*s, pattern: %.*s, slice: %.*s, replacement: %.*s\n",
 271                        result.length,      result.ptr,
 272                        pattern.length,     pattern.ptr,
 273                        slice.length,       slice.ptr,
 274                        replacement.length, replacement.ptr);
 275            result = replace(result,slice,replacement);
 276            break;
 277        }
 278
 279        result = replaceSlice(result, result[offset + so .. offset + eo], replacement);
 280
 281        if (r.attributes & RegExp.REA.global)
 282        {
 283            offset += replacement.length - (eo - so);
 284
 285            if (lastindex == eo)
 286                lastindex++;        // always consume some source
 287            else
 288                lastindex = eo;
 289        }
 290        else
 291            break;
 292    }
 293    delete r;
 294
 295    return result;
 296}
 297
 298unittest
 299{
 300    debug(regexp) printf("regexp.sub.unittest\n");
 301
 302    string foo(RegExp r) { return "ss"; }
 303
 304    auto r = sub("hello", "ll", delegate string(RegExp r) { return "ss"; });
 305    assert(r == "hesso");
 306
 307    r = sub("hello", "l", delegate string(RegExp r) { return "l"; }, "g");
 308    assert(r == "hello");
 309
 310    auto s = sub("Strap a rocket engine on a chicken.",
 311            "[ar]",
 312            delegate string (RegExp m)
 313            {
 314                return std.string.toUpper(m[0]);
 315            },
 316            "g");
 317    assert(s == "StRAp A Rocket engine on A chicken.");
 318}
 319
 320
 321/*************************************************
 322 * Search $(D_PARAM s[]) for first match with $(D_PARAM pattern).
 323 * Params:
 324 *  s = String to search.
 325 *  pattern = Regular expression pattern.
 326 * Returns:
 327 *  index into s[] of match if found, -1 if no match.
 328 * Example:
 329 * ---
 330 * auto s = "abcabcabab";
 331 * find(s, RegExp("b"));    // match, returns 1
 332 * find(s, RegExp("f"));    // no match, returns -1
 333 * ---
 334 */
 335
 336sizediff_t find(string s, RegExp pattern)
 337{
 338    return pattern.test(s)
 339        ? pattern.pmatch[0].rm_so
 340        : -1;
 341}
 342
 343unittest
 344{
 345    debug(regexp) printf("regexp.find.unittest\n");
 346
 347    auto i = find("xabcy", RegExp("abc"));
 348    assert(i == 1);
 349    i = find("cba", RegExp("abc"));
 350    assert(i == -1);
 351}
 352
 353/**
 354   Returns:
 355
 356   Same as $(D_PARAM find(s, RegExp(pattern, attributes))).
 357
 358   WARNING:
 359
 360   This function is scheduled for deprecation due to unnecessary
 361   ambiguity with the homonym function in std.string. Instead of
 362   $(D_PARAM std.regexp.find(s, p, a)), you may want to use $(D_PARAM
 363   find(s, RegExp(p, a))).
 364*/
 365
 366sizediff_t
 367find(string s, string pattern, string attributes = null)
 368{
 369    auto r = new RegExp(pattern, attributes);
 370    scope(exit) delete r;
 371    return r.test(s) ? r.pmatch[0].rm_so : -1;
 372}
 373
 374unittest
 375{
 376    debug(regexp) printf("regexp.find.unittest\n");
 377
 378    auto i = find("xabcy", "abc");
 379    assert(i == 1);
 380    i = find("cba", "abc");
 381    assert(i == -1);
 382}
 383
 384/*************************************************
 385 * Search $(D_PARAM s[]) for last match with $(D_PARAM pattern).
 386 * Params:
 387 *  s = String to search.
 388 *  pattern = Regular expression pattern.
 389 * Returns:
 390 *  index into s[] of match if found, -1 if no match.
 391 * Example:
 392 * ---
 393 * auto s = "abcabcabab";
 394 * rfind(s, RegExp("b"));    // match, returns 9
 395 * rfind(s, RegExp("f"));    // no match, returns -1
 396 * ---
 397 */
 398
 399sizediff_t rfind(string s, RegExp pattern)
 400{
 401    sizediff_t i = -1, lastindex = 0;
 402
 403    while (pattern.test(s, lastindex))
 404    {
 405        auto eo = pattern.pmatch[0].rm_eo;
 406        i = pattern.pmatch[0].rm_so;
 407        if (lastindex == eo)
 408            lastindex++;        // always consume some source
 409        else
 410            lastindex = eo;
 411    }
 412    return i;
 413}
 414
 415unittest
 416{
 417    sizediff_t i;
 418
 419    debug(regexp) printf("regexp.rfind.unittest\n");
 420    i = rfind("abcdefcdef", RegExp("c"));
 421    assert(i == 6);
 422    i = rfind("abcdefcdef", RegExp("cd"));
 423    assert(i == 6);
 424    i = rfind("abcdefcdef", RegExp("x"));
 425    assert(i == -1);
 426    i = rfind("abcdefcdef", RegExp("xy"));
 427    assert(i == -1);
 428    i = rfind("abcdefcdef", RegExp(""));
 429    assert(i == 10);
 430}
 431
 432/*************************************************
 433Returns:
 434
 435  Same as $(D_PARAM rfind(s, RegExp(pattern, attributes))).
 436
 437WARNING:
 438
 439This function is scheduled for deprecation due to unnecessary
 440ambiguity with the homonym function in std.string. Instead of
 441$(D_PARAM std.regexp.rfind(s, p, a)), you may want to use $(D_PARAM
 442rfind(s, RegExp(p, a))).
 443*/
 444
 445sizediff_t
 446rfind(string s, string pattern, string attributes = null)
 447{
 448    typeof(return) i = -1, lastindex = 0;
 449
 450    auto r = new RegExp(pattern, attributes);
 451    while (r.test(s, lastindex))
 452    {
 453        auto eo = r.pmatch[0].rm_eo;
 454        i = r.pmatch[0].rm_so;
 455        if (lastindex == eo)
 456            lastindex++;        // always consume some source
 457        else
 458            lastindex = eo;
 459    }
 460    delete r;
 461    return i;
 462}
 463
 464unittest
 465{
 466    sizediff_t i;
 467
 468    debug(regexp) printf("regexp.rfind.unittest\n");
 469    i = rfind("abcdefcdef", "c");
 470    assert(i == 6);
 471    i = rfind("abcdefcdef", "cd");
 472    assert(i == 6);
 473    i = rfind("abcdefcdef", "x");
 474    assert(i == -1);
 475    i = rfind("abcdefcdef", "xy");
 476    assert(i == -1);
 477    i = rfind("abcdefcdef", "");
 478    assert(i == 10);
 479}
 480
 481
 482/********************************************
 483 * Split s[] into an array of strings, using the regular
 484 * expression $(D_PARAM pattern) as the separator.
 485 * Params:
 486 *  s = String to search.
 487 *  pattern = Regular expression pattern.
 488 * Returns:
 489 *  array of slices into s[]
 490 * Example:
 491 * ---
 492 * foreach (s; split("abcabcabab", RegExp("C.", "i")))
 493 * {
 494 *     writefln("s = '%s'", s);
 495 * }
 496 * // Prints:
 497 * // s = 'ab'
 498 * // s = 'b'
 499 * // s = 'bab'
 500 * ---
 501 */
 502
 503string[] split(string s, RegExp pattern)
 504{
 505    return pattern.split(s);
 506}
 507
 508unittest
 509{
 510    debug(regexp) printf("regexp.split.unittest()\n");
 511    string[] result;
 512
 513    result = split("ab", RegExp("a*"));
 514    assert(result.length == 2);
 515    assert(result[0] == "");
 516    assert(result[1] == "b");
 517
 518    foreach (i, s; split("abcabcabab", RegExp("C.", "i")))
 519    {
 520        //writefln("s[%d] = '%s'", i, s);
 521        if (i == 0) assert(s == "ab");
 522        else if (i == 1) assert(s == "b");
 523        else if (i == 2) assert(s == "bab");
 524        else assert(0);
 525    }
 526}
 527
 528/********************************************
 529  Returns:
 530    Same as $(D_PARAM split(s, RegExp(pattern, attributes))).
 531
 532WARNING:
 533
 534This function is scheduled for deprecation due to unnecessary
 535ambiguity with the homonym function in std.string. Instead of
 536$(D_PARAM std.regexp.split(s, p, a)), you may want to use $(D_PARAM
 537split(s, RegExp(p, a))).
 538*/
 539
 540string[] split(string s, string pattern, string attributes = null)
 541{
 542    auto r = new RegExp(pattern, attributes);
 543    auto result = r.split(s);
 544    delete r;
 545    return result;
 546}
 547
 548unittest
 549{
 550    debug(regexp) printf("regexp.split.unittest()\n");
 551    string[] result;
 552
 553    result = split("ab", "a*");
 554    assert(result.length == 2);
 555    assert(result[0] == "");
 556    assert(result[1] == "b");
 557
 558    foreach (i, s; split("abcabcabab", "C.", "i"))
 559    {
 560        //writefln("s[%d] = '%s'", i, s.length, s.ptr);
 561        if (i == 0) assert(s == "ab");
 562        else if (i == 1) assert(s == "b");
 563        else if (i == 2) assert(s == "bab");
 564        else assert(0);
 565    }
 566}
 567
 568/****************************************************
 569 * Search s[] for first match with pattern[] with attributes[].
 570 * Params:
 571 *  s = String to search.
 572 *  pattern = Regular expression pattern.
 573 *  attributes = Regular expression attributes.
 574 * Returns:
 575 *  corresponding RegExp if found, null if not.
 576 * Example:
 577 * ---
 578 * import std.stdio;
 579 * import std.regexp;
 580 *
 581 * void main()
 582 * {
 583 *     if (auto m = std.regexp.search("abcdef", "c"))
 584 *     {
 585 *         writefln("%s[%s]%s", m.pre, m[0], m.post);
 586 *     }
 587 * }
 588 * // Prints:
 589 * // ab[c]def
 590 * ---
 591 */
 592
 593RegExp search(string s, string pattern, string attributes = null)
 594{
 595    auto r = new RegExp(pattern, attributes);
 596    if (!r.test(s))
 597    {   delete r;
 598        assert(r is null);
 599    }
 600    return r;
 601}
 602
 603unittest
 604{
 605    debug(regexp) printf("regexp.string.unittest()\n");
 606
 607    if (auto m = std.regexp.search("abcdef", "c()"))
 608    {
 609        auto result = std.string.format("%s[%s]%s", m.pre, m[0], m.post);
 610        assert(result == "ab[c]def");
 611        assert(m[1] == null);
 612        assert(m[2] == null);
 613    }
 614    else
 615    assert(0);
 616
 617    if (auto n = std.regexp.search("abcdef", "g"))
 618    {
 619        assert(0);
 620    }
 621}
 622
 623/* ********************************* RegExp ******************************** */
 624
 625/*****************************
 626 * RegExp is a class to handle regular expressions.
 627 *
 628 * It is the core foundation for adding powerful string pattern matching
 629 * capabilities to programs like grep, text editors, awk, sed, etc.
 630 */
 631class RegExp
 632{
 633    /*****
 634     * Construct a RegExp object. Compile pattern
 635     * with <i>attributes</i> into
 636     * an internal form for fast execution.
 637     * Params:
 638     *  pattern = regular expression
 639     *  attributes = _attributes
 640     * Throws: RegExpException if there are any compilation errors.
 641     * Example:
 642     *  Declare two variables and assign to them a RegExp object:
 643     * ---
 644     * auto r = new RegExp("pattern");
 645     * auto s = new RegExp(r"p[1-5]\s*");
 646     * ---
 647     */
 648    public this(string pattern, string attributes = null)
 649    {
 650        pmatch = (&gmatch)[0 .. 1];
 651        compile(pattern, attributes);
 652    }
 653
 654    /*****
 655     * Generate instance of RegExp.
 656     * Params:
 657     *  pattern = regular expression
 658     *  attributes = _attributes
 659     * Throws: RegExpException if there are any compilation errors.
 660     * Example:
 661     *  Declare two variables and assign to them a RegExp object:
 662     * ---
 663     * auto r = RegExp("pattern");
 664     * auto s = RegExp(r"p[1-5]\s*");
 665     * ---
 666     */
 667    public static RegExp opCall(string pattern, string attributes = null)
 668    {
 669        return new RegExp(pattern, attributes);
 670    }
 671
 672    unittest
 673    {
 674        debug(regexp) printf("regexp.opCall.unittest()\n");
 675        auto r1 = RegExp("hello", "m");
 676        string msg;
 677        try
 678        {
 679            auto r2 = RegExp("hello", "q");
 680            assert(0);
 681        }
 682        catch (RegExpException ree)
 683        {
 684            msg = ree.toString();
 685            //writefln("message: %s", ree);
 686        }
 687        assert(std.algorithm.countUntil(msg, "unrecognized attribute") >= 0);
 688    }
 689
 690    /************************************
 691     * Set up for start of foreach loop.
 692     * Returns:
 693     *  search() returns instance of RegExp set up to _search string[].
 694     * Example:
 695     * ---
 696     * import std.stdio;
 697     * import std.regexp;
 698     *
 699     * void main()
 700     * {
 701     *     foreach(m; RegExp("ab").search("abcabcabab"))
 702     *     {
 703     *         writefln("%s[%s]%s", m.pre, m[0], m.post);
 704     *     }
 705     * }
 706     * // Prints:
 707     * // [ab]cabcabab
 708     * // abc[ab]cabab
 709     * // abcabc[ab]ab
 710     * // abcabcab[ab]
 711     * ---
 712     */
 713
 714    public RegExp search(string string)
 715    {
 716        input = string;
 717        pmatch[0].rm_eo = 0;
 718        return this;
 719    }
 720
 721    /** ditto */
 722    public int opApply(scope int delegate(ref RegExp) dg)
 723    {
 724        int result;
 725        RegExp r = this;
 726
 727        while (test())
 728        {
 729            result = dg(r);
 730            if (result)
 731                break;
 732        }
 733
 734        return result;
 735    }
 736
 737    unittest
 738    {
 739        debug(regexp) printf("regexp.search.unittest()\n");
 740
 741        int i;
 742        foreach(m; RegExp("ab").search("abcabcabab"))
 743        {
 744            auto s = std.string.format("%s[%s]%s", m.pre, m[0], m.post);
 745            if (i == 0) assert(s == "[ab]cabcabab");
 746            else if (i == 1) assert(s == "abc[ab]cabab");
 747            else if (i == 2) assert(s == "abcabc[ab]ab");
 748            else if (i == 3) assert(s == "abcabcab[ab]");
 749            else assert(0);
 750            i++;
 751        }
 752    }
 753
 754    /******************
 755     * Retrieve match n.
 756     *
 757     * n==0 means the matched substring, n>0 means the
 758     * n'th parenthesized subexpression.
 759     * if n is larger than the number of parenthesized subexpressions,
 760     * null is returned.
 761     */
 762    public string opIndex(size_t n)
 763    {
 764        if (n >= pmatch.length)
 765            return null;
 766        else
 767        {
 768            auto rm_so = pmatch[n].rm_so;
 769            auto rm_eo = pmatch[n].rm_eo;
 770            if (rm_so == rm_eo)
 771                return null;
 772            return input[rm_so .. rm_eo];
 773        }
 774    }
 775
 776    /**
 777       Same as $(D_PARAM opIndex(n)).
 778
 779       WARNING:
 780
 781       Scheduled for deprecation due to confusion with overloaded
 782       $(D_PARAM match(string)). Instead of $(D_PARAM regex.match(n))
 783       you may want to use $(D_PARAM regex[n]).
 784    */
 785    public string match(size_t n)
 786    {
 787        return this[n];
 788    }
 789
 790    /*******************
 791     * Return the slice of the input that precedes the matched substring.
 792     */
 793    public @property string pre()
 794    {
 795        return input[0 .. pmatch[0].rm_so];
 796    }
 797
 798    /*******************
 799     * Return the slice of the input that follows the matched substring.
 800     */
 801    public @property string post()
 802    {
 803        return input[pmatch[0].rm_eo .. $];
 804    }
 805
 806    uint re_nsub;       // number of parenthesized subexpression matches
 807    regmatch_t[] pmatch;    // array [re_nsub + 1]
 808
 809    string input;       // the string to search
 810
 811    // per instance:
 812
 813    string pattern;     // source text of the regular expression
 814
 815    string flags;       // source text of the attributes parameter
 816
 817    int errors;
 818
 819    uint attributes;
 820
 821    enum REA
 822    {
 823        global      = 1,    // has the g attribute
 824            ignoreCase  = 2,    // has the i attribute
 825            multiline   = 4,    // if treat as multiple lines separated
 826        // by newlines, or as a single line
 827            dotmatchlf  = 8,    // if . matches \n
 828            }
 829
 830
 831private:
 832    size_t src;         // current source index in input[]
 833    size_t src_start;       // starting index for match in input[]
 834    size_t p;           // position of parser in pattern[]
 835    regmatch_t gmatch;      // match for the entire regular expression
 836    // (serves as storage for pmatch[0])
 837
 838    const(ubyte)[] program; // pattern[] compiled into regular expression program
 839    OutBuffer buf;
 840
 841
 842
 843
 844/******************************************/
 845
 846// Opcodes
 847
 848    enum : ubyte
 849    {
 850        REend,      // end of program
 851            REchar,     // single character
 852            REichar,        // single character, case insensitive
 853            REdchar,        // single UCS character
 854            REidchar,       // single wide character, case insensitive
 855            REanychar,      // any character
 856            REanystar,      // ".*"
 857            REstring,       // string of characters
 858            REistring,      // string of characters, case insensitive
 859            REtestbit,      // any in bitmap, non-consuming
 860            REbit,      // any in the bit map
 861            REnotbit,       // any not in the bit map
 862            RErange,        // any in the string
 863            REnotrange,     // any not in the string
 864            REor,       // a | b
 865            REplus,     // 1 or more
 866            REstar,     // 0 or more
 867            REquest,        // 0 or 1
 868            REnm,       // n..m
 869            REnmq,      // n..m, non-greedy version
 870            REbol,      // beginning of line
 871            REeol,      // end of line
 872            REparen,        // parenthesized subexpression
 873            REgoto,     // goto offset
 874
 875            REwordboundary,
 876            REnotwordboundary,
 877            REdigit,
 878            REnotdigit,
 879            REspace,
 880            REnotspace,
 881            REword,
 882            REnotword,
 883            REbackref,
 884            };
 885
 886// BUG: should this include '$'?
 887    private int isword(dchar c) { return isAlphaNum(c) || c == '_'; }
 888
 889    private uint inf = ~0u;
 890
 891/* ********************************
 892 * Throws RegExpException on error
 893 */
 894
 895    public void compile(string pattern, string attributes)
 896    {
 897        //printf("RegExp.compile('%.*s', '%.*s')\n", pattern.length, pattern.ptr, attributes.length, attributes.ptr);
 898
 899        this.attributes = 0;
 900        foreach (rchar c; attributes)
 901        {   REA att;
 902
 903            switch (c)
 904            {
 905            case 'g': att = REA.global;     break;
 906            case 'i': att = REA.ignoreCase; break;
 907            case 'm': att = REA.multiline;  break;
 908            default:
 909                error("unrecognized attribute");
 910                return;
 911            }
 912            if (this.attributes & att)
 913            {   error("redundant attribute");
 914                return;
 915            }
 916            this.attributes |= att;
 917        }
 918
 919        input = null;
 920
 921        this.pattern = pattern;
 922        this.flags = attributes;
 923
 924        uint oldre_nsub = re_nsub;
 925        re_nsub = 0;
 926        errors = 0;
 927
 928        buf = new OutBuffer();
 929        buf.reserve(pattern.length * 8);
 930        p = 0;
 931        parseRegexp();
 932        if (p < pattern.length)
 933        {   error("unmatched ')'");
 934        }
 935        // @@@ SKIPPING OPTIMIZATION SOLVES BUG 941 @@@
 936        //optimize();
 937        program = buf.data;
 938        buf.data = null;
 939        delete buf;
 940
 941        if (re_nsub > oldre_nsub)
 942        {
 943            if (pmatch.ptr is &gmatch)
 944                pmatch = null;
 945            pmatch.length = re_nsub + 1;
 946        }
 947        pmatch[0].rm_so = 0;
 948        pmatch[0].rm_eo = 0;
 949    }
 950
 951/********************************************
 952 * Split s[] into an array of strings, using the regular
 953 * expression as the separator.
 954 * Returns:
 955 *  array of slices into s[]
 956 */
 957
 958    public string[] split(string s)
 959    {
 960        debug(regexp) printf("regexp.split()\n");
 961
 962        string[] result;
 963
 964        if (s.length)
 965        {
 966            sizediff_t p, q;
 967            for (q = p; q != s.length;)
 968            {
 969                if (test(s, q))
 970                {
 971                    q = pmatch[0].rm_so;
 972                    auto e = pmatch[0].rm_eo;
 973                    if (e != p)
 974                    {
 975                        result ~= s[p .. q];
 976                        for (size_t i = 1; i < pmatch.length; i++)
 977                        {
 978                            auto so = pmatch[i].rm_so;
 979                            auto eo = pmatch[i].rm_eo;
 980                            if (so == eo)
 981                            {   so = 0; // -1 gives array bounds error
 982                                eo = 0;
 983                            }
 984                            result ~= s[so .. eo];
 985                        }
 986                        q = p = e;
 987                        continue;
 988                    }
 989                }
 990                q++;
 991            }
 992            result ~= s[p .. s.length];
 993        }
 994        else if (!test(s))
 995            result ~= s;
 996        return result;
 997    }
 998
 999    unittest
1000    {
1001        debug(regexp) printf("regexp.split.unittest()\n");
1002
1003        auto r = new RegExp("a*?", null);
1004        string[] result;
1005        string j;
1006        int i;
1007
1008        result = r.split("ab");
1009
1010        assert(result.length == 2);
1011        i = std.string.cmp(result[0], "a");
1012        assert(i == 0);
1013        i = std.string.cmp(result[1], "b");
1014        assert(i == 0);
1015
1016        r = new RegExp("a*", null);
1017        result = r.split("ab");
1018        assert(result.length == 2);
1019        i = std.string.cmp(result[0], "");
1020        assert(i == 0);
1021        i = std.string.cmp(result[1], "b");
1022        assert(i == 0);
1023
1024        r = new RegExp("<(\\/)?([^<>]+)>", null);
1025        result = r.split("a<b>font</b>bar<TAG>hello</TAG>");
1026
1027        debug(regexp)
1028        {
1029            for (i = 0; i < result.length; i++)
1030                printf("result[%d] = '%.*s'\n", i, result[i].length, result[i].ptr);
1031        }
1032
1033        j = join(result, ",");
1034        //printf("j = '%.*s'\n", j.length, j.ptr);
1035        i = std.string.cmp(j, "a,,b,font,/,b,bar,,TAG,hello,/,TAG,");
1036        assert(i == 0);
1037
1038        r = new RegExp("a[bc]", null);
1039        result = r.match("123ab");
1040        j = join(result, ",");
1041        i = std.string.cmp(j, "ab");
1042        assert(i == 0);
1043
1044        result = r.match("ac");
1045        j = join(result, ",");
1046        i = std.string.cmp(j, "ac");
1047        assert(i == 0);
1048    }
1049
/*************************************************
 * Search string[] for the first match of this regular expression.
 * The search is delegated to test(string); on success the start offset
 * recorded in pmatch[0] is reported.
 * Returns:
 *  index of match if successful, -1 if not found
 */

    public sizediff_t find(string string)
    {
        // pmatch[0].rm_so is only read when test() reported a match
        return test(string) ? pmatch[0].rm_so : -1;
    }
1063
1064//deprecated alias find search;
1065
1066    unittest
1067    {
1068        debug(regexp) printf("regexp.find.unittest()\n");
1069
1070        RegExp r = new RegExp("abc", null);
1071        auto i = r.find("xabcy");
1072        assert(i == 1);
1073        i = r.find("cba");
1074        assert(i == -1);
1075    }
1076
1077
/*************************************************
 * Search s[] for match.
 * Returns:
 *  If global attribute, return array of all matches (one entry per
 *  match, holding the text of the whole match).
 *  If not global attribute, return same value as exec(s).
 */

    public string[] match(string s)
    {
        // Without the global attribute this is exactly exec(s).
        if (!(attributes & REA.global))
            return exec(s);

        string[] result;
        sizediff_t lastindex = 0;

        while (test(s, lastindex))
        {
            auto so = pmatch[0].rm_so;
            auto eo = pmatch[0].rm_eo;

            result ~= input[so .. eo];

            // Always consume some source: an empty match (so == eo) would be
            // found again at the same position, so resume one past it.
            // Comparing so with eo (rather than lastindex with eo) also covers
            // an empty match located beyond lastindex, which would otherwise
            // be reported twice.
            if (so == eo)
                lastindex = eo + 1;
            else
                lastindex = eo;
        }
        return result;
    }
1110
1111    unittest
1112    {
1113        debug(regexp) printf("regexp.match.unittest()\n");
1114
1115        int i;
1116        string[] result;
1117        string j;
1118        RegExp r;
1119
1120        r = new RegExp("a[bc]", null);
1121        result = r.match("1ab2ac3");
1122        j = join(result, ",");
1123        i = std.string.cmp(j, "ab");
1124        assert(i == 0);
1125
1126        r = new RegExp("a[bc]", "g");
1127        result = r.match("1ab2ac3");
1128        j = join(result, ",");
1129        i = std.string.cmp(j, "ab,ac");
1130        assert(i == 0);
1131    }
1132
1133
/*************************************************
 * Find regular expression matches in s[]. Replace those matches
 * with a new string composed of format[] merged with the result of the
 * matches.
 * If global, replace all matches. Otherwise, replace first match.
 * Params:
 *  s = string to search
 *  format = replacement format; it is expanded for each match by
 *           replace(format)
 * Returns: the new string
 */

    public string replace(string s, string format)
    {
        debug(regexp) printf("string = %.*s, format = %.*s\n", s.length, s.ptr, format.length, format.ptr);

        string result = s;
        sizediff_t lastindex = 0;
        size_t offset = 0;      // shift of result[] relative to s[] caused by earlier replacements

        // The plain-text shortcut below is only correct when the pattern is
        // pure literal text. Merely observing that the first matched text
        // equals the pattern is not enough: "a.b" matches the text "a.b" but
        // also "axb", which a literal search and replace would skip.
        bool literalPattern = true;
    Lscan:
        foreach (char pc; pattern)
        {
            foreach (char mc; "\\^$.|?*+()[]{}")
            {
                if (pc == mc)
                {
                    literalPattern = false;
                    break Lscan;
                }
            }
        }

        for (;;)
        {
            if (!test(s, lastindex))
                break;

            auto so = pmatch[0].rm_so;
            auto eo = pmatch[0].rm_eo;

            string replacement = replace(format);

            // Optimize by using replace if possible - Dave Fladebo
            string slice = result[offset + so .. offset + eo];
            if (literalPattern &&                       // no regexp metacharacters
                    (attributes & REA.global) &&        // global, so replace all
                    !(attributes & REA.ignoreCase) &&   // not ignoring case
                    !(attributes & REA.multiline) &&    // not multiline
                    pattern == slice &&         // simple pattern (exact match, no special characters)
                    format == replacement)      // simple format, not $ formats
            {
                debug(regexp)
                {
                    auto sss = result[offset + so .. offset + eo];
                    printf("pattern: %.*s, slice: %.*s, format: %.*s, replacement: %.*s\n",
                            pattern.length, pattern.ptr, sss.length, sss.ptr, format.length, format.ptr, replacement.length, replacement.ptr);
                }
                result = std.array.replace(result,slice,replacement);
                break;
            }

            result = replaceSlice(result, slice, replacement);

            if (!(attributes & REA.global))
                break;          // only the first match is replaced

            offset += replacement.length - (eo - so);

            // Always consume some source: after an empty match (so == eo)
            // resume one past it, otherwise the same empty match would be
            // found, and replaced, a second time.
            if (so == eo)
                lastindex = eo + 1;
            else
                lastindex = eo;
        }

        return result;
    }
1195
1196    unittest
1197    {
1198        debug(regexp) printf("regexp.replace.unittest()\n");
1199
1200        int i;
1201        string result;
1202        RegExp r;
1203
1204        r = new RegExp("a[bc]", "g");
1205        result = r.replace("1ab2ac3", "x$&y");
1206        i = std.string.cmp(result, "1xaby2xacy3");
1207        assert(i == 0);
1208
1209        r = new RegExp("ab", "g");
1210        result = r.replace("1ab2ac3", "xy");
1211        i = std.string.cmp(result, "1xy2ac3");
1212        assert(i == 0);
1213    }
1214
1215
/*************************************************
 * Search s[] for match, starting from the beginning of s[].
 * s[] becomes the current input and the recorded position of the
 * previous match is reset before exec() is called.
 * Returns:
 *  array of slices into s[] representing matches
 */

    public string[] exec(string s)
    {
        debug(regexp) printf("regexp.exec(string = '%.*s')\n", s.length, s.ptr);

        // exec() resumes from pmatch[0].rm_eo, so clear the previous match first
        pmatch[0].rm_so = pmatch[0].rm_eo = 0;
        input = s;
        return exec();
    }
1230
/*************************************************
 * Pick up where last exec(string) or exec() left off,
 * searching string[] for next match.
 * Returns:
 *  null if test() finds no match; otherwise an array with one entry per
 *  pmatch[] element, each a slice into the input, or null where the
 *  recorded start and end offsets are equal
 */

    public string[] exec()
    {
        if (!test())
            return null;

        auto slices = new string[pmatch.length];
        foreach (idx, m; pmatch)
        {
            // equal offsets (an empty or unset match) are reported as null
            slices[idx] = (m.rm_so == m.rm_eo) ? null : input[m.rm_so .. m.rm_eo];
        }

        return slices;
    }
1254
1255/************************************************
1256 * Search s[] for match.
1257 * Returns: 0 for no match, !=0 for match
1258 * Example:
1259---
1260import std.stdio;
1261import std.regexp;
1262import std.string;
1263
1264int grep(int delegate(char[]) pred, char[][] list)
1265{
1266  int count;
1267  foreach (s; list)
1268  {  if (pred(s))
1269       ++count;
1270  }
1271  return count;
1272}
1273
1274void main()
1275{
1276  auto x = grep(&RegExp("[Ff]oo").test,
1277                std.string.split("mary had a foo lamb"));
1278  writefln(x);
1279}
1280---
1281* which prints: 1
1282*/
1283                //@@@
public bool test(string s)
    {
        // Always search from index 0 (not from the end of the previous match)
        // and narrow the int result of test(s, startindex) to bool.
        immutable found = test(s, 0 /*pmatch[0].rm_eo*/);
        return found != 0;
    }
1288
/************************************************
 * Pick up where last test(string) or test() left off, and search again.
 * The search resumes in the current input at the end offset recorded
 * for the previous match (pmatch[0].rm_eo).
 * Returns: 0 for no match, !=0 for match
 */

    public int test()
    {
        immutable resumeAt = pmatch[0].rm_eo;
        return test(input, resumeAt);
    }
1298
/************************************************
 * Test s[] starting at startindex against regular expression.
 * s[] becomes the current input. Candidate start positions are tried
 * from startindex onwards; on success pmatch[0] holds the start and
 * end offsets of the match.
 * Params:
 *  s = string to search
 *  startindex = offset in s[] at which to start looking; a value greater
 *               than s.length is reported as no match
 * Returns: 0 for no match, !=0 for match
 */

    public int test(string s, size_t startindex)
    {
        char firstc;

        input = s;
        debug (regexp) printf("RegExp.test(input[] = '%.*s', startindex = %zd)\n", input.length, input.ptr, startindex);
        pmatch[0].rm_so = 0;
        pmatch[0].rm_eo = 0;
        // startindex is unsigned, so only the upper bound needs checking; a
        // negative signed index passed by a caller arrives as a huge value
        // and is rejected here as well.
        if (startindex > input.length)
        {
            return 0;           // fail
        }
        //debug(regexp) printProgram(program);

        // First character optimization: when the program starts with a
        // literal character, candidate positions are limited to where
        // that character occurs.
        firstc = 0;
        if (program[0] == REchar)
        {
            firstc = program[1];
            // not usable when case is ignored and the character has two cases
            if (attributes & REA.ignoreCase && isAlpha(firstc))
                firstc = 0;
        }

        for (auto si = startindex; ; si++)
        {
            if (firstc)
            {
                if (si == input.length)
                    break;          // no match
                if (input[si] != firstc)
                {
                    si++;
                    if (!chr(si, firstc))   // if first character not found
                        break;      // no match
                }
            }
            // Forget all sub-matches from the previous attempt
            for (size_t i = 0; i < re_nsub + 1; i++)
            {
                pmatch[i].rm_so = -1;
                pmatch[i].rm_eo = -1;
            }
            src_start = src = si;
            if (trymatch(0, program.length))
            {
                // trymatch() leaves src at the end of the matched text
                pmatch[0].rm_so = si;
                pmatch[0].rm_eo = src;
                //debug(regexp) printf("start = %d, end = %d\n", gmatch.rm_so, gmatch.rm_eo);
                return 1;
            }
            // If possible match must start at beginning, we are done
            if (program[0] == REbol || program[0] == REanystar)
            {
                if (attributes & REA.multiline)
                {
                    // Scan for the next \n
                    if (!chr(si, '\n'))
                        break;      // no match if '\n' not found
                }
                else
                    break;
            }
            if (si == input.length)
                break;
            debug(regexp)
            {
                auto sss = input[si + 1 .. input.length];
                printf("Starting new try: '%.*s'\n", sss.length, sss.ptr);
            }
        }
        return 0;       // no match
    }
1375
1376    /**
1377       Returns whether string $(D_PARAM s) matches $(D_PARAM this).
1378    */
1379    alias test opEquals;
1380//     bool opEquals(string s)
1381//     {
1382//         return test(s);
1383//     }
1384
    unittest
    {
        // Exercises the opEquals alias of test(): a string compares equal
        // to a RegExp when test() finds a match in it.
        assert("abc" == RegExp(".b."));
        // ".b.." cannot match in the three-character string "abc"
        assert("abc" != RegExp(".b.."));
    }
1390
1391    int chr(ref size_t si, rchar c)
1392    {
1393        for (; si < input.length; si++)
1394        {
1395            if (input[si] == c)
1396                return 1;
1397        }
1398        return 0;
1399    }
1400
1401
1402    void printProgram(const(ubyte)[] prog)
1403    {
1404        //debug(regexp)
1405        {
1406            size_t len;
1407            uint n;
1408            uint m;
1409            ushort *pu;
1410            uint *puint;
1411            char[] str;
1412
1413            printf("printProgram()\n");
1414            for (size_t pc = 0; pc < prog.length; )
1415            {
1416                printf("%3d: ", pc);
1417
1418                //printf("prog[pc] = %d, REchar = %d, REnmq = %d\n", prog[pc], REchar, REnmq);
1419                switch (prog[pc])
1420                {
1421                case REchar:
1422                    printf("\tREchar '%c'\n", prog[pc + 1]);
1423                    pc += 1 + char.sizeof;
1424                    break;
1425
1426                case REichar:
1427                    printf("\tREichar '%c'\n", prog[pc + 1]);
1428                    pc += 1 + char.sizeof;
1429                    break;
1430
1431                case REdchar:
1432                    printf("\tREdchar '%c'\n", *cast(dchar *)&prog[pc + 1]);
1433                    pc += 1 + dchar.sizeof;
1434                    break;
1435
1436                case REidchar:
1437                    printf("\tREidchar '%c'\n", *cast(dchar *)&prog[pc + 1]);
1438                    pc += 1 + dchar.sizeof;
1439                    break;
1440
1441                case REanychar:
1442                    printf("\tREanychar\n");
1443                    pc++;
1444                    break;
1445
1446                case REstring:
1447                    len = *cast(size_t *)&prog[pc + 1];
1448                    str = (cast(char*)&prog[pc + 1 + size_t.sizeof])[0 .. len];
1449                    printf("\tREstring x%x, '%.*s'\n", len, str.length, str.ptr);
1450                    pc += 1 + size_t.sizeof + len * rchar.sizeof;
1451                    break;
1452
1453                case REistring:
1454                    len = *cast(size_t *)&prog[pc + 1];
1455                    str = (cast(char*)&prog[pc + 1 + size_t.sizeof])[0 .. len];
1456                    printf("\tREistring x%x, '%.*s'\n", len, str.length, str.ptr);
1457                    pc += 1 + size_t.sizeof + len * rchar.sizeof;
1458                    break;
1459
1460                case REtestbit:
1461                    pu = cast(ushort *)&prog[pc + 1];
1462                    printf("\tREtestbit %d, %d\n", pu[0], pu[1]);
1463                    len = pu[1];
1464                    pc += 1 + 2 * ushort.sizeof + len;
1465                    break;
1466
1467                case REbit:
1468                    pu = cast(ushort *)&prog[pc + 1];
1469                    len = pu[1];
1470                    printf("\tREbit cmax=%02x, len=%d:", pu[0], len);
1471                    for (n = 0; n < len; n++)
1472                        printf(" %02x", prog[pc + 1 + 2 * ushort.sizeof + n]);
1473                    printf("\n");
1474                    pc += 1 + 2 * ushort.sizeof + len;
1475                    break;
1476
1477                case REnotbit:
1478                    pu = cast(ushort *)&prog[pc + 1];
1479                    printf("\tREnotbit %d, %d\n", pu[0], pu[1]);
1480                    len = pu[1];
1481                    pc += 1 + 2 * ushort.sizeof + len;
1482                    break;
1483
1484                case RErange:
1485                    len = *cast(uint *)&prog[pc + 1];
1486                    printf("\tRErange %d\n", len);
1487                    // BUG: REAignoreCase?
1488                    pc += 1 + uint.sizeof + len;
1489                    break;
1490
1491                case REnotrange:
1492                    len = *cast(uint *)&prog[pc + 1];
1493                    printf("\tREnotrange %d\n", len);
1494                    // BUG: REAignoreCase?
1495                    pc += 1 + uint.sizeof + len;
1496                    break;
1497
1498                case REbol:
1499                    printf("\tREbol\n");
1500                    pc++;
1501                    break;
1502
1503                case REeol:
1504                    printf("\tREeol\n");
1505                    pc++;
1506                    break;
1507
1508                case REor:
1509                    len = *cast(uint *)&prog[pc + 1];
1510                    printf("\tREor %d, pc=>%d\n", len, pc + 1 + uint.sizeof + len);
1511                    pc += 1 + uint.sizeof;
1512                    break;
1513
1514                case REgoto:
1515                    len = *cast(uint *)&prog[pc + 1];
1516                    printf("\tREgoto %d, pc=>%d\n", len, pc + 1 + uint.sizeof + len);
1517                    pc += 1 + uint.sizeof;
1518                    break;
1519
1520                case REanystar:
1521                    printf("\tREanystar\n");
1522                    pc++;
1523                    break;
1524
1525                case REnm:
1526                case REnmq:
1527                    // len, n, m, ()
1528                    puint = cast(uint *)&prog[pc + 1];
1529                    len = puint[0];
1530                    n = puint[1];
1531                    m = puint[2];
1532                    printf("\tREnm%s len=%d, n=%u, m=%u, pc=>%d\n",
1533                            (prog[pc] == REnmq) ? "q".ptr : " ".ptr,
1534                            len, n, m, pc + 1 + uint.sizeof * 3 + len);
1535                    pc += 1 + uint.sizeof * 3;
1536                    break;
1537
1538                case REparen:
1539                    // len, n, ()
1540                    puint = cast(uint *)&prog[pc + 1];
1541                    len = puint[0];
1542                    n = puint[1];
1543                    printf("\tREparen len=%d n=%d, pc=>%d\n", len, n, pc + 1 + uint.sizeof * 2 + len);
1544                    pc += 1 + uint.sizeof * 2;
1545                    break;
1546
1547                case REend:
1548                    printf("\tREend\n");
1549                    return;
1550
1551                case REwordboundary:
1552                    printf("\tREwordboundary\n");
1553                    pc++;
1554                    break;
1555
1556                case REnotwordboundary:
1557                    printf("\tREnotwordboundary\n");
1558                    pc++;
1559                    break;
1560
1561                case REdigit:
1562                    printf("\tREdigit\n");
1563                    pc++;
1564                    break;
1565
1566                case REnotdigit:
1567                    printf("\tREnotdigit\n");
1568                    pc++;
1569                    break;
1570
1571                case REspace:
1572                    printf("\tREspace\n");
1573                    pc++;
1574                    break;
1575
1576                case REnotspace:
1577                    printf("\tREnotspace\n");
1578                    pc++;
1579                    break;
1580
1581                case REword:
1582                    printf("\tREword\n");
1583                    pc++;
1584                    break;
1585
1586                case REnotword:
1587                    printf("\tREnotword\n");
1588                    pc++;
1589                    break;
1590
1591                case REbackref:
1592                    printf("\tREbackref %d\n", prog[1]);
1593                    pc += 2;
1594                    break;
1595
1596                default:
1597                    assert(0);
1598                }
1599            }
1600        }
1601    }
1602
1603
1604/**************************************************
1605 * Match input against a section of the program[].
1606 * Returns:
1607 *  1 if successful match
1608 *  0 no match
1609 */
1610
1611    int trymatch(size_t pc, size_t pcend)
1612    {
1613        size_t len;
1614        size_t n;
1615        size_t m;
1616        size_t count;
1617        size_t pop;
1618        size_t ss;
1619        regmatch_t *psave;
1620        size_t c1;
1621        size_t c2;
1622        ushort* pu;
1623        uint* puint;
1624
1625        debug(regexp)
1626        {
1627            auto sss = input[src .. input.length];
1628            printf("RegExp.trymatch(pc = %zd, src = '%.*s', pcend = %zd)\n", pc, sss.length, sss.ptr, pcend);
1629        }
1630        auto srcsave = src;
1631        psave = null;
1632        for (;;)
1633        {
1634            if (pc == pcend)        // if done matching
1635            {   debug(regex) printf("\tprogend\n");
1636                return 1;
1637            }
1638
1639            //printf("\top = %d\n", program[pc]);
1640            switch (program[pc])
1641            {
1642            case REchar:
1643                if (src == input.length)
1644                    goto Lnomatch;
1645                debug(regexp) printf("\tREchar '%c', src = '%c'\n", program[pc + 1], input[src]);
1646                if (program[pc + 1] != input[src])
1647                    goto Lnomatch;
1648                src++;
1649                pc += 1 + char.sizeof;
1650                break;
1651
1652            case REichar:
1653                if (src == input.length)
1654                    goto Lnomatch;
1655                debug(regexp) printf("\tREichar '%c', src = '%c'\n", program[pc + 1], input[src]);
1656                c1 = program[pc + 1];
1657                c2 = input[src];
1658                if (c1 != c2)
1659                {
1660                    if (isLower(cast(rchar)c2))
1661                        c2 = std.ascii.toUpper(cast(rchar)c2);
1662                    else
1663                        goto Lnomatch;
1664                    if (c1 != c2)
1665                        goto Lnomatch;
1666                }
1667                src++;
1668                pc += 1 + char.sizeof;
1669                break;
1670
1671            case REdchar:
1672                debug(regexp) printf("\tREdchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]);
1673                if (src == input.length)
1674                    goto Lnomatch;
1675                if (*(cast(dchar *)&program[pc + 1]) != input[src])
1676                    goto Lnomatch;
1677                src++;
1678                pc += 1 + dchar.sizeof;
1679                break;
1680
1681            case REidchar:
1682                debug(regexp) printf("\tREidchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]);
1683                if (src == input.length)
1684                    goto Lnomatch;
1685                c1 = *(cast(dchar *)&program[pc + 1]);
1686                c2 = input[src];
1687                if (c1 != c2)
1688                {
1689                    if (isLower(cast(rchar)c2))
1690                        c2 = std.ascii.toUpper(cast(rchar)c2);
1691                    else
1692                        goto Lnomatch;
1693                    if (c1 != c2)
1694                        goto Lnomatch;
1695                }
1696                src++;
1697                pc += 1 + dchar.sizeof;
1698                break;
1699
1700            case REanychar:
1701                debug(regexp) printf("\tREanychar\n");
1702                if (src == input.length)
1703                    goto Lnomatch;
1704                if (!(attributes & REA.dotmatchlf) && input[src] == cast(rchar)'\n')
1705                    goto Lnomatch;
1706                src += std.utf.stride(input, src);
1707                //src++;
1708                pc++;
1709                break;
1710
1711            case REstring:
1712                len = *cast(size_t *)&program[pc + 1];
1713                debug(regexp)
1714                {
1715                    auto sss2 = (&program[pc + 1 + size_t.sizeof])[0 .. len];
1716                    printf("\tREstring x%x, '%.*s'\n", len, sss2.length, sss2.ptr);
1717                }
1718                if (src + len > input.length)
1719                    goto Lnomatch;
1720                if (memcmp(&program[pc + 1 + size_t.sizeof], &input[src], len * rchar.sizeof))
1721                    goto Lnomatch;
1722                src += len;
1723                pc += 1 + size_t.sizeof + len * rchar.sizeof;
1724                break;
1725
1726            case REistring:
1727                len = *cast(size_t *)&program[pc + 1];
1728                debug(regexp)
1729                {
1730                    auto sss2 = (&program[pc + 1 + size_t.sizeof])[0 .. len];
1731                    printf("\tREistring x%x, '%.*s'\n", len, sss2.length, sss2.ptr);
1732                }
1733                if (src + len > input.length)
1734                    goto Lnomatch;
1735                if (icmp((cast(char*)&program[pc + 1 + size_t.sizeof])[0..len],
1736                                input[src .. src + len]))
1737                    goto Lnomatch;
1738                src += len;
1739                pc += 1 + size_t.sizeof + len * rchar.sizeof;
1740                break;
1741
1742            case REtestbit:
1743                pu = (cast(ushort *)&program[pc + 1]);
1744                if (src == input.length)
1745                    goto Lnomatch;
1746                debug(regexp) printf("\tREtestbit %d, %d, '%c', x%02x\n",
1747                        pu[0], pu[1], input[src], input[src]);
1748                len = pu[1];
1749                c1 = input[src];
1750                //printf("[x%02x]=x%02x, x%02x\n", c1 >> 3, ((&program[pc + 1 + 4])[c1 >> 3] ), (1 << (c1 & 7)));
1751                if (c1 <= pu[0] &&
1752                        !((&(program[pc + 1 + 4]))[c1 >> 3] & (1 << (c1 & 7))))
1753                    goto Lnomatch;
1754                pc += 1 + 2 * ushort.sizeof + len;
1755                break;
1756
1757    

Large files are truncated, but you can click here to view the full file