PageRenderTime 165ms CodeModel.GetById 18ms app.highlight 134ms RepoModel.GetById 1ms app.codeStats 0ms

/regexp/syntax/parse.go

https://code.google.com/p/appengine-go-backports/
Go | 1861 lines | 1411 code | 149 blank | 301 comment | 571 complexity | ed9cc54c2b086c0f7b00f6258e064853 MD5 | raw file
   1// Copyright 2011 The Go Authors.  All rights reserved.
   2// Use of this source code is governed by a BSD-style
   3// license that can be found in the LICENSE file.
   4
   5package syntax
   6
   7import (
   8	"os"
   9	"sort"
  10	"strings"
  11	"unicode"
  12	"utf8"
  13)
  14
  15// An Error describes a failure to parse a regular expression
  16// and gives the offending expression.
  17type Error struct {
  18	Code ErrorCode
  19	Expr string
  20}
  21
  22func (e *Error) String() string {
  23	return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
  24}
  25
  26// An ErrorCode describes a failure to parse a regular expression.
  27type ErrorCode string
  28
  29const (
  30	// Unexpected error
  31	ErrInternalError ErrorCode = "regexp/syntax: internal error"
  32
  33	// Parse errors
  34	ErrInvalidCharClass      ErrorCode = "invalid character class"
  35	ErrInvalidCharRange      ErrorCode = "invalid character class range"
  36	ErrInvalidEscape         ErrorCode = "invalid escape sequence"
  37	ErrInvalidNamedCapture   ErrorCode = "invalid named capture"
  38	ErrInvalidPerlOp         ErrorCode = "invalid or unsupported Perl syntax"
  39	ErrInvalidRepeatOp       ErrorCode = "invalid nested repetition operator"
  40	ErrInvalidRepeatSize     ErrorCode = "invalid repeat count"
  41	ErrInvalidUTF8           ErrorCode = "invalid UTF-8"
  42	ErrMissingBracket        ErrorCode = "missing closing ]"
  43	ErrMissingParen          ErrorCode = "missing closing )"
  44	ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
  45	ErrTrailingBackslash     ErrorCode = "trailing backslash at end of expression"
  46)
  47
  48func (e ErrorCode) String() string {
  49	return string(e)
  50}
  51
  52// Flags control the behavior of the parser and record information about regexp context.
  53type Flags uint16
  54
  55const (
  56	FoldCase      Flags = 1 << iota // case-insensitive match
  57	Literal                         // treat pattern as literal string
  58	ClassNL                         // allow character classes like [^a-z] and [[:space:]] to match newline
  59	DotNL                           // allow . to match newline
  60	OneLine                         // treat ^ and $ as only matching at beginning and end of text
  61	NonGreedy                       // make repetition operators default to non-greedy
  62	PerlX                           // allow Perl extensions
  63	UnicodeGroups                   // allow \p{Han}, \P{Han} for Unicode group and negation
  64	WasDollar                       // regexp OpEndText was $, not \z
  65	Simple                          // regexp contains no counted repetition
  66
  67	MatchNL = ClassNL | DotNL
  68
  69	Perl        = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
  70	POSIX Flags = 0                                         // POSIX syntax
  71)
  72
  73// Pseudo-ops for parsing stack.
  74const (
  75	opLeftParen = opPseudo + iota
  76	opVerticalBar
  77)
  78
  79type parser struct {
  80	flags       Flags     // parse mode flags
  81	stack       []*Regexp // stack of parsed expressions
  82	free        *Regexp
  83	numCap      int // number of capturing groups seen
  84	wholeRegexp string
  85	tmpClass    []int // temporary char class work space
  86}
  87
  88func (p *parser) newRegexp(op Op) *Regexp {
  89	re := p.free
  90	if re != nil {
  91		p.free = re.Sub0[0]
  92		*re = Regexp{}
  93	} else {
  94		re = new(Regexp)
  95	}
  96	re.Op = op
  97	return re
  98}
  99
 100func (p *parser) reuse(re *Regexp) {
 101	re.Sub0[0] = p.free
 102	p.free = re
 103}
 104
 105// Parse stack manipulation.
 106
 107// push pushes the regexp re onto the parse stack and returns the regexp.
 108func (p *parser) push(re *Regexp) *Regexp {
 109	if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
 110		// Single rune.
 111		if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
 112			return nil
 113		}
 114		re.Op = OpLiteral
 115		re.Rune = re.Rune[:1]
 116		re.Flags = p.flags &^ FoldCase
 117	} else if re.Op == OpCharClass && len(re.Rune) == 4 &&
 118		re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
 119		unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
 120		unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
 121		re.Op == OpCharClass && len(re.Rune) == 2 &&
 122			re.Rune[0]+1 == re.Rune[1] &&
 123			unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
 124			unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
 125		// Case-insensitive rune like [Aa] or [Î&#x201D;δ].
 126		if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
 127			return nil
 128		}
 129
 130		// Rewrite as (case-insensitive) literal.
 131		re.Op = OpLiteral
 132		re.Rune = re.Rune[:1]
 133		re.Flags = p.flags | FoldCase
 134	} else {
 135		// Incremental concatenation.
 136		p.maybeConcat(-1, 0)
 137	}
 138
 139	p.stack = append(p.stack, re)
 140	return re
 141}
 142
 143// maybeConcat implements incremental concatenation
 144// of literal runes into string nodes.  The parser calls this
 145// before each push, so only the top fragment of the stack
 146// might need processing.  Since this is called before a push,
 147// the topmost literal is no longer subject to operators like *
 148// (Otherwise ab* would turn into (ab)*.)
 149// If r >= 0 and there's a node left over, maybeConcat uses it
 150// to push r with the given flags.
 151// maybeConcat reports whether r was pushed.
 152func (p *parser) maybeConcat(r int, flags Flags) bool {
 153	n := len(p.stack)
 154	if n < 2 {
 155		return false
 156	}
 157
 158	re1 := p.stack[n-1]
 159	re2 := p.stack[n-2]
 160	if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
 161		return false
 162	}
 163
 164	// Push re1 into re2.
 165	re2.Rune = append(re2.Rune, re1.Rune...)
 166
 167	// Reuse re1 if possible.
 168	if r >= 0 {
 169		re1.Rune = re1.Rune0[:1]
 170		re1.Rune[0] = r
 171		re1.Flags = flags
 172		return true
 173	}
 174
 175	p.stack = p.stack[:n-1]
 176	p.reuse(re1)
 177	return false // did not push r
 178}
 179
 180// newLiteral returns a new OpLiteral Regexp with the given flags
 181func (p *parser) newLiteral(r int, flags Flags) *Regexp {
 182	re := p.newRegexp(OpLiteral)
 183	re.Flags = flags
 184	if flags&FoldCase != 0 {
 185		r = minFoldRune(r)
 186	}
 187	re.Rune0[0] = r
 188	re.Rune = re.Rune0[:1]
 189	return re
 190}
 191
 192// minFoldRune returns the minimum rune fold-equivalent to r.
 193func minFoldRune(r int) int {
 194	if r < minFold || r > maxFold {
 195		return r
 196	}
 197	min := r
 198	r0 := r
 199	for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
 200		if min > r {
 201			min = r
 202		}
 203	}
 204	return min
 205}
 206
 207// literal pushes a literal regexp for the rune r on the stack
 208// and returns that regexp.
 209func (p *parser) literal(r int) {
 210	p.push(p.newLiteral(r, p.flags))
 211}
 212
 213// op pushes a regexp with the given op onto the stack
 214// and returns that regexp.
 215func (p *parser) op(op Op) *Regexp {
 216	re := p.newRegexp(op)
 217	re.Flags = p.flags
 218	return p.push(re)
 219}
 220
 221// repeat replaces the top stack element with itself repeated according to op, min, max.
 222// before is the regexp suffix starting at the repetition operator.
 223// after is the regexp suffix following after the repetition operator.
 224// repeat returns an updated 'after' and an error, if any.
 225func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, os.Error) {
 226	flags := p.flags
 227	if p.flags&PerlX != 0 {
 228		if len(after) > 0 && after[0] == '?' {
 229			after = after[1:]
 230			flags ^= NonGreedy
 231		}
 232		if lastRepeat != "" {
 233			// In Perl it is not allowed to stack repetition operators:
 234			// a** is a syntax error, not a doubled star, and a++ means
 235			// something else entirely, which we don't support!
 236			return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}
 237		}
 238	}
 239	n := len(p.stack)
 240	if n == 0 {
 241		return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
 242	}
 243	sub := p.stack[n-1]
 244	if sub.Op >= opPseudo {
 245		return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
 246	}
 247	re := p.newRegexp(op)
 248	re.Min = min
 249	re.Max = max
 250	re.Flags = flags
 251	re.Sub = re.Sub0[:1]
 252	re.Sub[0] = sub
 253	p.stack[n-1] = re
 254	return after, nil
 255}
 256
 257// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
 258func (p *parser) concat() *Regexp {
 259	p.maybeConcat(-1, 0)
 260
 261	// Scan down to find pseudo-operator | or (.
 262	i := len(p.stack)
 263	for i > 0 && p.stack[i-1].Op < opPseudo {
 264		i--
 265	}
 266	subs := p.stack[i:]
 267	p.stack = p.stack[:i]
 268
 269	// Empty concatenation is special case.
 270	if len(subs) == 0 {
 271		return p.push(p.newRegexp(OpEmptyMatch))
 272	}
 273
 274	return p.push(p.collapse(subs, OpConcat))
 275}
 276
 277// alternate replaces the top of the stack (above the topmost '(') with its alternation.
 278func (p *parser) alternate() *Regexp {
 279	// Scan down to find pseudo-operator (.
 280	// There are no | above (.
 281	i := len(p.stack)
 282	for i > 0 && p.stack[i-1].Op < opPseudo {
 283		i--
 284	}
 285	subs := p.stack[i:]
 286	p.stack = p.stack[:i]
 287
 288	// Make sure top class is clean.
 289	// All the others already are (see swapVerticalBar).
 290	if len(subs) > 0 {
 291		cleanAlt(subs[len(subs)-1])
 292	}
 293
 294	// Empty alternate is special case
 295	// (shouldn't happen but easy to handle).
 296	if len(subs) == 0 {
 297		return p.push(p.newRegexp(OpNoMatch))
 298	}
 299
 300	return p.push(p.collapse(subs, OpAlternate))
 301}
 302
 303// cleanAlt cleans re for eventual inclusion in an alternation.
 304func cleanAlt(re *Regexp) {
 305	switch re.Op {
 306	case OpCharClass:
 307		re.Rune = cleanClass(&re.Rune)
 308		if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
 309			re.Rune = nil
 310			re.Op = OpAnyChar
 311			return
 312		}
 313		if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
 314			re.Rune = nil
 315			re.Op = OpAnyCharNotNL
 316			return
 317		}
 318		if cap(re.Rune)-len(re.Rune) > 100 {
 319			// re.Rune will not grow any more.
 320			// Make a copy or inline to reclaim storage.
 321			re.Rune = append(re.Rune0[:0], re.Rune...)
 322		}
 323	}
 324}
 325
 326// collapse returns the result of applying op to sub.
 327// If sub contains op nodes, they all get hoisted up
 328// so that there is never a concat of a concat or an
 329// alternate of an alternate.
 330func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
 331	if len(subs) == 1 {
 332		return subs[0]
 333	}
 334	re := p.newRegexp(op)
 335	re.Sub = re.Sub0[:0]
 336	for _, sub := range subs {
 337		if sub.Op == op {
 338			re.Sub = append(re.Sub, sub.Sub...)
 339			p.reuse(sub)
 340		} else {
 341			re.Sub = append(re.Sub, sub)
 342		}
 343	}
 344	if op == OpAlternate {
 345		re.Sub = p.factor(re.Sub, re.Flags)
 346		if len(re.Sub) == 1 {
 347			old := re
 348			re = re.Sub[0]
 349			p.reuse(old)
 350		}
 351	}
 352	return re
 353}
 354
 355// factor factors common prefixes from the alternation list sub.
 356// It returns a replacement list that reuses the same storage and
 357// frees (passes to p.reuse) any removed *Regexps.
 358//
 359// For example,
 360//     ABC|ABD|AEF|BCX|BCY
 361// simplifies by literal prefix extraction to
 362//     A(B(C|D)|EF)|BC(X|Y)
 363// which simplifies by character class introduction to
 364//     A(B[CD]|EF)|BC[XY]
 365//
 366func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
 367	if len(sub) < 2 {
 368		return sub
 369	}
 370
 371	// Round 1: Factor out common literal prefixes.
 372	var str []int
 373	var strflags Flags
 374	start := 0
 375	out := sub[:0]
 376	for i := 0; i <= len(sub); i++ {
 377		// Invariant: the Regexps that were in sub[0:start] have been
 378		// used or marked for reuse, and the slice space has been reused
 379		// for out (len(out) <= start).
 380		//
 381		// Invariant: sub[start:i] consists of regexps that all begin
 382		// with str as modified by strflags.
 383		var istr []int
 384		var iflags Flags
 385		if i < len(sub) {
 386			istr, iflags = p.leadingString(sub[i])
 387			if iflags == strflags {
 388				same := 0
 389				for same < len(str) && same < len(istr) && str[same] == istr[same] {
 390					same++
 391				}
 392				if same > 0 {
 393					// Matches at least one rune in current range.
 394					// Keep going around.
 395					str = str[:same]
 396					continue
 397				}
 398			}
 399		}
 400
 401		// Found end of a run with common leading literal string:
 402		// sub[start:i] all begin with str[0:len(str)], but sub[i]
 403		// does not even begin with str[0].
 404		//
 405		// Factor out common string and append factored expression to out.
 406		if i == start {
 407			// Nothing to do - run of length 0.
 408		} else if i == start+1 {
 409			// Just one: don't bother factoring.
 410			out = append(out, sub[start])
 411		} else {
 412			// Construct factored form: prefix(suffix1|suffix2|...)
 413			prefix := p.newRegexp(OpLiteral)
 414			prefix.Flags = strflags
 415			prefix.Rune = append(prefix.Rune[:0], str...)
 416
 417			for j := start; j < i; j++ {
 418				sub[j] = p.removeLeadingString(sub[j], len(str))
 419			}
 420			suffix := p.collapse(sub[start:i], OpAlternate) // recurse
 421
 422			re := p.newRegexp(OpConcat)
 423			re.Sub = append(re.Sub[:0], prefix, suffix)
 424			out = append(out, re)
 425		}
 426
 427		// Prepare for next iteration.
 428		start = i
 429		str = istr
 430		strflags = iflags
 431	}
 432	sub = out
 433
 434	// Round 2: Factor out common complex prefixes,
 435	// just the first piece of each concatenation,
 436	// whatever it is.  This is good enough a lot of the time.
 437	start = 0
 438	out = sub[:0]
 439	var first *Regexp
 440	for i := 0; i <= len(sub); i++ {
 441		// Invariant: the Regexps that were in sub[0:start] have been
 442		// used or marked for reuse, and the slice space has been reused
 443		// for out (len(out) <= start).
 444		//
 445		// Invariant: sub[start:i] consists of regexps that all begin with ifirst.
 446		var ifirst *Regexp
 447		if i < len(sub) {
 448			ifirst = p.leadingRegexp(sub[i])
 449			if first != nil && first.Equal(ifirst) {
 450				continue
 451			}
 452		}
 453
 454		// Found end of a run with common leading regexp:
 455		// sub[start:i] all begin with first but sub[i] does not.
 456		//
 457		// Factor out common regexp and append factored expression to out.
 458		if i == start {
 459			// Nothing to do - run of length 0.
 460		} else if i == start+1 {
 461			// Just one: don't bother factoring.
 462			out = append(out, sub[start])
 463		} else {
 464			// Construct factored form: prefix(suffix1|suffix2|...)
 465			prefix := first
 466			for j := start; j < i; j++ {
 467				reuse := j != start // prefix came from sub[start] 
 468				sub[j] = p.removeLeadingRegexp(sub[j], reuse)
 469			}
 470			suffix := p.collapse(sub[start:i], OpAlternate) // recurse
 471
 472			re := p.newRegexp(OpConcat)
 473			re.Sub = append(re.Sub[:0], prefix, suffix)
 474			out = append(out, re)
 475		}
 476
 477		// Prepare for next iteration.
 478		start = i
 479		first = ifirst
 480	}
 481	sub = out
 482
 483	// Round 3: Collapse runs of single literals into character classes.
 484	start = 0
 485	out = sub[:0]
 486	for i := 0; i <= len(sub); i++ {
 487		// Invariant: the Regexps that were in sub[0:start] have been
 488		// used or marked for reuse, and the slice space has been reused
 489		// for out (len(out) <= start).
 490		//
 491		// Invariant: sub[start:i] consists of regexps that are either
 492		// literal runes or character classes.
 493		if i < len(sub) && isCharClass(sub[i]) {
 494			continue
 495		}
 496
 497		// sub[i] is not a char or char class;
 498		// emit char class for sub[start:i]...
 499		if i == start {
 500			// Nothing to do - run of length 0.
 501		} else if i == start+1 {
 502			out = append(out, sub[start])
 503		} else {
 504			// Make new char class.
 505			// Start with most complex regexp in sub[start].
 506			max := start
 507			for j := start + 1; j < i; j++ {
 508				if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
 509					max = j
 510				}
 511			}
 512			sub[start], sub[max] = sub[max], sub[start]
 513
 514			for j := start + 1; j < i; j++ {
 515				mergeCharClass(sub[start], sub[j])
 516				p.reuse(sub[j])
 517			}
 518			cleanAlt(sub[start])
 519			out = append(out, sub[start])
 520		}
 521
 522		// ... and then emit sub[i].
 523		if i < len(sub) {
 524			out = append(out, sub[i])
 525		}
 526		start = i + 1
 527	}
 528	sub = out
 529
 530	// Round 4: Collapse runs of empty matches into a single empty match.
 531	start = 0
 532	out = sub[:0]
 533	for i := range sub {
 534		if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
 535			continue
 536		}
 537		out = append(out, sub[i])
 538	}
 539	sub = out
 540
 541	return sub
 542}
 543
 544// leadingString returns the leading literal string that re begins with.
 545// The string refers to storage in re or its children.
 546func (p *parser) leadingString(re *Regexp) ([]int, Flags) {
 547	if re.Op == OpConcat && len(re.Sub) > 0 {
 548		re = re.Sub[0]
 549	}
 550	if re.Op != OpLiteral {
 551		return nil, 0
 552	}
 553	return re.Rune, re.Flags & FoldCase
 554}
 555
 556// removeLeadingString removes the first n leading runes
 557// from the beginning of re.  It returns the replacement for re.
 558func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
 559	if re.Op == OpConcat && len(re.Sub) > 0 {
 560		// Removing a leading string in a concatenation
 561		// might simplify the concatenation.
 562		sub := re.Sub[0]
 563		sub = p.removeLeadingString(sub, n)
 564		re.Sub[0] = sub
 565		if sub.Op == OpEmptyMatch {
 566			p.reuse(sub)
 567			switch len(re.Sub) {
 568			case 0, 1:
 569				// Impossible but handle.
 570				re.Op = OpEmptyMatch
 571				re.Sub = nil
 572			case 2:
 573				old := re
 574				re = re.Sub[1]
 575				p.reuse(old)
 576			default:
 577				copy(re.Sub, re.Sub[1:])
 578				re.Sub = re.Sub[:len(re.Sub)-1]
 579			}
 580		}
 581		return re
 582	}
 583
 584	if re.Op == OpLiteral {
 585		re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
 586		if len(re.Rune) == 0 {
 587			re.Op = OpEmptyMatch
 588		}
 589	}
 590	return re
 591}
 592
 593// leadingRegexp returns the leading regexp that re begins with.
 594// The regexp refers to storage in re or its children.
 595func (p *parser) leadingRegexp(re *Regexp) *Regexp {
 596	if re.Op == OpEmptyMatch {
 597		return nil
 598	}
 599	if re.Op == OpConcat && len(re.Sub) > 0 {
 600		sub := re.Sub[0]
 601		if sub.Op == OpEmptyMatch {
 602			return nil
 603		}
 604		return sub
 605	}
 606	return re
 607}
 608
 609// removeLeadingRegexp removes the leading regexp in re.
 610// It returns the replacement for re.
 611// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
 612func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
 613	if re.Op == OpConcat && len(re.Sub) > 0 {
 614		if reuse {
 615			p.reuse(re.Sub[0])
 616		}
 617		re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
 618		switch len(re.Sub) {
 619		case 0:
 620			re.Op = OpEmptyMatch
 621			re.Sub = nil
 622		case 1:
 623			old := re
 624			re = re.Sub[0]
 625			p.reuse(old)
 626		}
 627		return re
 628	}
 629	if reuse {
 630		p.reuse(re)
 631	}
 632	return p.newRegexp(OpEmptyMatch)
 633}
 634
 635func literalRegexp(s string, flags Flags) *Regexp {
 636	re := &Regexp{Op: OpLiteral}
 637	re.Flags = flags
 638	re.Rune = re.Rune0[:0] // use local storage for small strings
 639	for _, c := range s {
 640		if len(re.Rune) >= cap(re.Rune) {
 641			// string is too long to fit in Rune0.  let Go handle it
 642			re.Rune = []int(s)
 643			break
 644		}
 645		re.Rune = append(re.Rune, c)
 646	}
 647	return re
 648}
 649
 650// Parsing.
 651
 652func Parse(s string, flags Flags) (*Regexp, os.Error) {
 653	if flags&Literal != 0 {
 654		// Trivial parser for literal string.
 655		if err := checkUTF8(s); err != nil {
 656			return nil, err
 657		}
 658		return literalRegexp(s, flags), nil
 659	}
 660
 661	// Otherwise, must do real work.
 662	var (
 663		p          parser
 664		err        os.Error
 665		c          int
 666		op         Op
 667		lastRepeat string
 668		min, max   int
 669	)
 670	p.flags = flags
 671	p.wholeRegexp = s
 672	t := s
 673	for t != "" {
 674		repeat := ""
 675	BigSwitch:
 676		switch t[0] {
 677		default:
 678			if c, t, err = nextRune(t); err != nil {
 679				return nil, err
 680			}
 681			p.literal(c)
 682
 683		case '(':
 684			if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
 685				// Flag changes and non-capturing groups.
 686				if t, err = p.parsePerlFlags(t); err != nil {
 687					return nil, err
 688				}
 689				break
 690			}
 691			p.numCap++
 692			p.op(opLeftParen).Cap = p.numCap
 693			t = t[1:]
 694		case '|':
 695			if err = p.parseVerticalBar(); err != nil {
 696				return nil, err
 697			}
 698			t = t[1:]
 699		case ')':
 700			if err = p.parseRightParen(); err != nil {
 701				return nil, err
 702			}
 703			t = t[1:]
 704		case '^':
 705			if p.flags&OneLine != 0 {
 706				p.op(OpBeginText)
 707			} else {
 708				p.op(OpBeginLine)
 709			}
 710			t = t[1:]
 711		case '$':
 712			if p.flags&OneLine != 0 {
 713				p.op(OpEndText).Flags |= WasDollar
 714			} else {
 715				p.op(OpEndLine)
 716			}
 717			t = t[1:]
 718		case '.':
 719			if p.flags&DotNL != 0 {
 720				p.op(OpAnyChar)
 721			} else {
 722				p.op(OpAnyCharNotNL)
 723			}
 724			t = t[1:]
 725		case '[':
 726			if t, err = p.parseClass(t); err != nil {
 727				return nil, err
 728			}
 729		case '*', '+', '?':
 730			before := t
 731			switch t[0] {
 732			case '*':
 733				op = OpStar
 734			case '+':
 735				op = OpPlus
 736			case '?':
 737				op = OpQuest
 738			}
 739			after := t[1:]
 740			if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
 741				return nil, err
 742			}
 743			repeat = before
 744			t = after
 745		case '{':
 746			op = OpRepeat
 747			before := t
 748			min, max, after, ok := p.parseRepeat(t)
 749			if !ok {
 750				// If the repeat cannot be parsed, { is a literal.
 751				p.literal('{')
 752				t = t[1:]
 753				break
 754			}
 755			if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
 756				// Numbers were too big, or max is present and min > max.
 757				return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
 758			}
 759			if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
 760				return nil, err
 761			}
 762			repeat = before
 763			t = after
 764		case '\\':
 765			if p.flags&PerlX != 0 && len(t) >= 2 {
 766				switch t[1] {
 767				case 'A':
 768					p.op(OpBeginText)
 769					t = t[2:]
 770					break BigSwitch
 771				case 'b':
 772					p.op(OpWordBoundary)
 773					t = t[2:]
 774					break BigSwitch
 775				case 'B':
 776					p.op(OpNoWordBoundary)
 777					t = t[2:]
 778					break BigSwitch
 779				case 'C':
 780					// any byte; not supported
 781					return nil, &Error{ErrInvalidEscape, t[:2]}
 782				case 'Q':
 783					// \Q ... \E: the ... is always literals
 784					var lit string
 785					if i := strings.Index(t, `\E`); i < 0 {
 786						lit = t[2:]
 787						t = ""
 788					} else {
 789						lit = t[2:i]
 790						t = t[i+2:]
 791					}
 792					p.push(literalRegexp(lit, p.flags))
 793					break BigSwitch
 794				case 'z':
 795					p.op(OpEndText)
 796					t = t[2:]
 797					break BigSwitch
 798				}
 799			}
 800
 801			re := p.newRegexp(OpCharClass)
 802			re.Flags = p.flags
 803
 804			// Look for Unicode character group like \p{Han}
 805			if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
 806				r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
 807				if err != nil {
 808					return nil, err
 809				}
 810				if r != nil {
 811					re.Rune = r
 812					t = rest
 813					p.push(re)
 814					break BigSwitch
 815				}
 816			}
 817
 818			// Perl character class escape.
 819			if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
 820				re.Rune = r
 821				t = rest
 822				p.push(re)
 823				break BigSwitch
 824			}
 825			p.reuse(re)
 826
 827			// Ordinary single-character escape.
 828			if c, t, err = p.parseEscape(t); err != nil {
 829				return nil, err
 830			}
 831			p.literal(c)
 832		}
 833		lastRepeat = repeat
 834	}
 835
 836	p.concat()
 837	if p.swapVerticalBar() {
 838		// pop vertical bar
 839		p.stack = p.stack[:len(p.stack)-1]
 840	}
 841	p.alternate()
 842
 843	n := len(p.stack)
 844	if n != 1 {
 845		return nil, &Error{ErrMissingParen, s}
 846	}
 847	return p.stack[0], nil
 848}
 849
 850// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
 851// If s is not of that form, it returns ok == false.
 852// If s has the right form but the values are too big, it returns min == -1, ok == true.
 853func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
 854	if s == "" || s[0] != '{' {
 855		return
 856	}
 857	s = s[1:]
 858	var ok1 bool
 859	if min, s, ok1 = p.parseInt(s); !ok1 {
 860		return
 861	}
 862	if s == "" {
 863		return
 864	}
 865	if s[0] != ',' {
 866		max = min
 867	} else {
 868		s = s[1:]
 869		if s == "" {
 870			return
 871		}
 872		if s[0] == '}' {
 873			max = -1
 874		} else if max, s, ok1 = p.parseInt(s); !ok1 {
 875			return
 876		} else if max < 0 {
 877			// parseInt found too big a number
 878			min = -1
 879		}
 880	}
 881	if s == "" || s[0] != '}' {
 882		return
 883	}
 884	rest = s[1:]
 885	ok = true
 886	return
 887}
 888
 889// parsePerlFlags parses a Perl flag setting or non-capturing group or both,
 890// like (?i) or (?: or (?i:.  It removes the prefix from s and updates the parse state.
 891// The caller must have ensured that s begins with "(?".
 892func (p *parser) parsePerlFlags(s string) (rest string, err os.Error) {
 893	t := s
 894
 895	// Check for named captures, first introduced in Python's regexp library.
 896	// As usual, there are three slightly different syntaxes:
 897	//
 898	//   (?P<name>expr)   the original, introduced by Python
 899	//   (?<name>expr)    the .NET alteration, adopted by Perl 5.10
 900	//   (?'name'expr)    another .NET alteration, adopted by Perl 5.10
 901	//
 902	// Perl 5.10 gave in and implemented the Python version too,
 903	// but they claim that the last two are the preferred forms.
 904	// PCRE and languages based on it (specifically, PHP and Ruby)
 905	// support all three as well.  EcmaScript 4 uses only the Python form.
 906	//
 907	// In both the open source world (via Code Search) and the
 908	// Google source tree, (?P<expr>name) is the dominant form,
 909	// so that's the one we implement.  One is enough.
 910	if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
 911		// Pull out name.
 912		end := strings.IndexRune(t, '>')
 913		if end < 0 {
 914			if err = checkUTF8(t); err != nil {
 915				return "", err
 916			}
 917			return "", &Error{ErrInvalidNamedCapture, s}
 918		}
 919
 920		capture := t[:end+1] // "(?P<name>"
 921		name := t[4:end]     // "name"
 922		if err = checkUTF8(name); err != nil {
 923			return "", err
 924		}
 925		if !isValidCaptureName(name) {
 926			return "", &Error{ErrInvalidNamedCapture, capture}
 927		}
 928
 929		// Like ordinary capture, but named.
 930		p.numCap++
 931		re := p.op(opLeftParen)
 932		re.Cap = p.numCap
 933		re.Name = name
 934		return t[end+1:], nil
 935	}
 936
 937	// Non-capturing group.  Might also twiddle Perl flags.
 938	var c int
 939	t = t[2:] // skip (?
 940	flags := p.flags
 941	sign := +1
 942	sawFlag := false
 943Loop:
 944	for t != "" {
 945		if c, t, err = nextRune(t); err != nil {
 946			return "", err
 947		}
 948		switch c {
 949		default:
 950			break Loop
 951
 952		// Flags.
 953		case 'i':
 954			flags |= FoldCase
 955			sawFlag = true
 956		case 'm':
 957			flags &^= OneLine
 958			sawFlag = true
 959		case 's':
 960			flags |= DotNL
 961			sawFlag = true
 962		case 'U':
 963			flags |= NonGreedy
 964			sawFlag = true
 965
 966		// Switch to negation.
 967		case '-':
 968			if sign < 0 {
 969				break Loop
 970			}
 971			sign = -1
 972			// Invert flags so that | above turn into &^ and vice versa.
 973			// We'll invert flags again before using it below.
 974			flags = ^flags
 975			sawFlag = false
 976
 977		// End of flags, starting group or not.
 978		case ':', ')':
 979			if sign < 0 {
 980				if !sawFlag {
 981					break Loop
 982				}
 983				flags = ^flags
 984			}
 985			if c == ':' {
 986				// Open new group
 987				p.op(opLeftParen)
 988			}
 989			p.flags = flags
 990			return t, nil
 991		}
 992	}
 993
 994	return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
 995}
 996
 997// isValidCaptureName reports whether name
 998// is a valid capture name: [A-Za-z0-9_]+.
 999// PCRE limits names to 32 bytes.
1000// Python rejects names starting with digits.
1001// We don't enforce either of those.
1002func isValidCaptureName(name string) bool {
1003	if name == "" {
1004		return false
1005	}
1006	for _, c := range name {
1007		if c != '_' && !isalnum(c) {
1008			return false
1009		}
1010	}
1011	return true
1012}
1013
1014// parseInt parses a decimal integer.
1015func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
1016	if s == "" || s[0] < '0' || '9' < s[0] {
1017		return
1018	}
1019	// Disallow leading zeros.
1020	if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
1021		return
1022	}
1023	t := s
1024	for s != "" && '0' <= s[0] && s[0] <= '9' {
1025		s = s[1:]
1026	}
1027	rest = s
1028	ok = true
1029	// Have digits, compute value.
1030	t = t[:len(t)-len(s)]
1031	for i := 0; i < len(t); i++ {
1032		// Avoid overflow.
1033		if n >= 1e8 {
1034			n = -1
1035			break
1036		}
1037		n = n*10 + int(t[i]) - '0'
1038	}
1039	return
1040}
1041
1042// can this be represented as a character class?
1043// single-rune literal string, char class, ., and .|\n.
1044func isCharClass(re *Regexp) bool {
1045	return re.Op == OpLiteral && len(re.Rune) == 1 ||
1046		re.Op == OpCharClass ||
1047		re.Op == OpAnyCharNotNL ||
1048		re.Op == OpAnyChar
1049}
1050
1051// does re match r?
1052func matchRune(re *Regexp, r int) bool {
1053	switch re.Op {
1054	case OpLiteral:
1055		return len(re.Rune) == 1 && re.Rune[0] == r
1056	case OpCharClass:
1057		for i := 0; i < len(re.Rune); i += 2 {
1058			if re.Rune[i] <= r && r <= re.Rune[i+1] {
1059				return true
1060			}
1061		}
1062		return false
1063	case OpAnyCharNotNL:
1064		return r != '\n'
1065	case OpAnyChar:
1066		return true
1067	}
1068	return false
1069}
1070
1071// parseVerticalBar handles a | in the input.
1072func (p *parser) parseVerticalBar() os.Error {
1073	p.concat()
1074
1075	// The concatenation we just parsed is on top of the stack.
1076	// If it sits above an opVerticalBar, swap it below
1077	// (things below an opVerticalBar become an alternation).
1078	// Otherwise, push a new vertical bar.
1079	if !p.swapVerticalBar() {
1080		p.op(opVerticalBar)
1081	}
1082
1083	return nil
1084}
1085
1086// mergeCharClass makes dst = dst|src.
1087// The caller must ensure that dst.Op >= src.Op,
1088// to reduce the amount of copying.
1089func mergeCharClass(dst, src *Regexp) {
1090	switch dst.Op {
1091	case OpAnyChar:
1092		// src doesn't add anything.
1093	case OpAnyCharNotNL:
1094		// src might add \n
1095		if matchRune(src, '\n') {
1096			dst.Op = OpAnyChar
1097		}
1098	case OpCharClass:
1099		// src is simpler, so either literal or char class
1100		if src.Op == OpLiteral {
1101			dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
1102		} else {
1103			dst.Rune = appendClass(dst.Rune, src.Rune)
1104		}
1105	case OpLiteral:
1106		// both literal
1107		if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
1108			break
1109		}
1110		dst.Op = OpCharClass
1111		dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
1112		dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
1113	}
1114}
1115
1116// If the top of the stack is an element followed by an opVerticalBar
1117// swapVerticalBar swaps the two and returns true.
1118// Otherwise it returns false.
1119func (p *parser) swapVerticalBar() bool {
1120	// If above and below vertical bar are literal or char class,
1121	// can merge into a single char class.
1122	n := len(p.stack)
1123	if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
1124		re1 := p.stack[n-1]
1125		re3 := p.stack[n-3]
1126		// Make re3 the more complex of the two.
1127		if re1.Op > re3.Op {
1128			re1, re3 = re3, re1
1129			p.stack[n-3] = re3
1130		}
1131		mergeCharClass(re3, re1)
1132		p.reuse(re1)
1133		p.stack = p.stack[:n-1]
1134		return true
1135	}
1136
1137	if n >= 2 {
1138		re1 := p.stack[n-1]
1139		re2 := p.stack[n-2]
1140		if re2.Op == opVerticalBar {
1141			if n >= 3 {
1142				// Now out of reach.
1143				// Clean opportunistically.
1144				cleanAlt(p.stack[n-3])
1145			}
1146			p.stack[n-2] = re1
1147			p.stack[n-1] = re2
1148			return true
1149		}
1150	}
1151	return false
1152}
1153
1154// parseRightParen handles a ) in the input.
1155func (p *parser) parseRightParen() os.Error {
1156	p.concat()
1157	if p.swapVerticalBar() {
1158		// pop vertical bar
1159		p.stack = p.stack[:len(p.stack)-1]
1160	}
1161	p.alternate()
1162
1163	n := len(p.stack)
1164	if n < 2 {
1165		return &Error{ErrInternalError, ""}
1166	}
1167	re1 := p.stack[n-1]
1168	re2 := p.stack[n-2]
1169	p.stack = p.stack[:n-2]
1170	if re2.Op != opLeftParen {
1171		return &Error{ErrMissingParen, p.wholeRegexp}
1172	}
1173	// Restore flags at time of paren.
1174	p.flags = re2.Flags
1175	if re2.Cap == 0 {
1176		// Just for grouping.
1177		p.push(re1)
1178	} else {
1179		re2.Op = OpCapture
1180		re2.Sub = re2.Sub0[:1]
1181		re2.Sub[0] = re1
1182		p.push(re2)
1183	}
1184	return nil
1185}
1186
1187// parseEscape parses an escape sequence at the beginning of s
1188// and returns the rune.
1189func (p *parser) parseEscape(s string) (r int, rest string, err os.Error) {
1190	t := s[1:]
1191	if t == "" {
1192		return 0, "", &Error{ErrTrailingBackslash, ""}
1193	}
1194	c, t, err := nextRune(t)
1195	if err != nil {
1196		return 0, "", err
1197	}
1198
1199Switch:
1200	switch c {
1201	default:
1202		if c < utf8.RuneSelf && !isalnum(c) {
1203			// Escaped non-word characters are always themselves.
1204			// PCRE is not quite so rigorous: it accepts things like
1205			// \q, but we don't.  We once rejected \_, but too many
1206			// programs and people insist on using it, so allow \_.
1207			return c, t, nil
1208		}
1209
1210	// Octal escapes.
1211	case '1', '2', '3', '4', '5', '6', '7':
1212		// Single non-zero digit is a backreference; not supported
1213		if t == "" || t[0] < '0' || t[0] > '7' {
1214			break
1215		}
1216		fallthrough
1217	case '0':
1218		// Consume up to three octal digits; already have one.
1219		r = c - '0'
1220		for i := 1; i < 3; i++ {
1221			if t == "" || t[0] < '0' || t[0] > '7' {
1222				break
1223			}
1224			r = r*8 + int(t[0]) - '0'
1225			t = t[1:]
1226		}
1227		return r, t, nil
1228
1229	// Hexadecimal escapes.
1230	case 'x':
1231		if t == "" {
1232			break
1233		}
1234		if c, t, err = nextRune(t); err != nil {
1235			return 0, "", err
1236		}
1237		if c == '{' {
1238			// Any number of digits in braces.
1239			// Perl accepts any text at all; it ignores all text
1240			// after the first non-hex digit.  We require only hex digits,
1241			// and at least one.
1242			nhex := 0
1243			r = 0
1244			for {
1245				if t == "" {
1246					break Switch
1247				}
1248				if c, t, err = nextRune(t); err != nil {
1249					return 0, "", err
1250				}
1251				if c == '}' {
1252					break
1253				}
1254				v := unhex(c)
1255				if v < 0 {
1256					break Switch
1257				}
1258				r = r*16 + v
1259				if r > unicode.MaxRune {
1260					break Switch
1261				}
1262				nhex++
1263			}
1264			if nhex == 0 {
1265				break Switch
1266			}
1267			return r, t, nil
1268		}
1269
1270		// Easy case: two hex digits.
1271		x := unhex(c)
1272		if c, t, err = nextRune(t); err != nil {
1273			return 0, "", err
1274		}
1275		y := unhex(c)
1276		if x < 0 || y < 0 {
1277			break
1278		}
1279		return x*16 + y, t, nil
1280
1281	// C escapes.  There is no case 'b', to avoid misparsing
1282	// the Perl word-boundary \b as the C backspace \b
1283	// when in POSIX mode.  In Perl, /\b/ means word-boundary
1284	// but /[\b]/ means backspace.  We don't support that.
1285	// If you want a backspace, embed a literal backspace
1286	// character or use \x08.
1287	case 'a':
1288		return '\a', t, err
1289	case 'f':
1290		return '\f', t, err
1291	case 'n':
1292		return '\n', t, err
1293	case 'r':
1294		return '\r', t, err
1295	case 't':
1296		return '\t', t, err
1297	case 'v':
1298		return '\v', t, err
1299	}
1300	return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
1301}
1302
1303// parseClassChar parses a character class character at the beginning of s
1304// and returns it.
1305func (p *parser) parseClassChar(s, wholeClass string) (r int, rest string, err os.Error) {
1306	if s == "" {
1307		return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
1308	}
1309
1310	// Allow regular escape sequences even though
1311	// many need not be escaped in this context.
1312	if s[0] == '\\' {
1313		return p.parseEscape(s)
1314	}
1315
1316	return nextRune(s)
1317}
1318
// charGroup describes a predefined group of characters, as looked up in
// the perlGroup and posixGroup tables.
type charGroup struct {
	// sign is negative when the group stands for the complement of class.
	// Table lookups treat the zero value as "no such group".
	sign  int
	// class lists the group's ranges as consecutive lo, hi pairs.
	class []int
}
1323
1324// parsePerlClassEscape parses a leading Perl character class escape like \d
1325// from the beginning of s.  If one is present, it appends the characters to r
1326// and returns the new slice r and the remainder of the string.
1327func (p *parser) parsePerlClassEscape(s string, r []int) (out []int, rest string) {
1328	if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
1329		return
1330	}
1331	g := perlGroup[s[0:2]]
1332	if g.sign == 0 {
1333		return
1334	}
1335	return p.appendGroup(r, g), s[2:]
1336}
1337
1338// parseNamedClass parses a leading POSIX named character class like [:alnum:]
1339// from the beginning of s.  If one is present, it appends the characters to r
1340// and returns the new slice r and the remainder of the string.
1341func (p *parser) parseNamedClass(s string, r []int) (out []int, rest string, err os.Error) {
1342	if len(s) < 2 || s[0] != '[' || s[1] != ':' {
1343		return
1344	}
1345
1346	i := strings.Index(s[2:], ":]")
1347	if i < 0 {
1348		return
1349	}
1350	i += 2
1351	name, s := s[0:i+2], s[i+2:]
1352	g := posixGroup[name]
1353	if g.sign == 0 {
1354		return nil, "", &Error{ErrInvalidCharRange, name}
1355	}
1356	return p.appendGroup(r, g), s, nil
1357}
1358
1359func (p *parser) appendGroup(r []int, g charGroup) []int {
1360	if p.flags&FoldCase == 0 {
1361		if g.sign < 0 {
1362			r = appendNegatedClass(r, g.class)
1363		} else {
1364			r = appendClass(r, g.class)
1365		}
1366	} else {
1367		tmp := p.tmpClass[:0]
1368		tmp = appendFoldedClass(tmp, g.class)
1369		p.tmpClass = tmp
1370		tmp = cleanClass(&p.tmpClass)
1371		if g.sign < 0 {
1372			r = appendNegatedClass(r, tmp)
1373		} else {
1374			r = appendClass(r, tmp)
1375		}
1376	}
1377	return r
1378}
1379
// anyTable is the range table that contains every code point from 0
// through unicode.MaxRune.
//
// The fields are keyed so that the literal does not depend on the exact
// number and order of fields in the unicode package's structs.
var anyTable = &unicode.RangeTable{
	R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},
	R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
}
1384
1385// unicodeTable returns the unicode.RangeTable identified by name
1386// and the table of additional fold-equivalent code points.
1387func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
1388	// Special case: "Any" means any.
1389	if name == "Any" {
1390		return anyTable, anyTable
1391	}
1392	if t := unicode.Categories[name]; t != nil {
1393		return t, unicode.FoldCategory[name]
1394	}
1395	if t := unicode.Scripts[name]; t != nil {
1396		return t, unicode.FoldScript[name]
1397	}
1398	return nil, nil
1399}
1400
1401// parseUnicodeClass parses a leading Unicode character class like \p{Han}
1402// from the beginning of s.  If one is present, it appends the characters to r
1403// and returns the new slice r and the remainder of the string.
1404func (p *parser) parseUnicodeClass(s string, r []int) (out []int, rest string, err os.Error) {
1405	if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
1406		return
1407	}
1408
1409	// Committed to parse or return error.
1410	sign := +1
1411	if s[1] == 'P' {
1412		sign = -1
1413	}
1414	t := s[2:]
1415	c, t, err := nextRune(t)
1416	if err != nil {
1417		return
1418	}
1419	var seq, name string
1420	if c != '{' {
1421		// Single-letter name.
1422		seq = s[:len(s)-len(t)]
1423		name = seq[2:]
1424	} else {
1425		// Name is in braces.
1426		end := strings.IndexRune(s, '}')
1427		if end < 0 {
1428			if err = checkUTF8(s); err != nil {
1429				return
1430			}
1431			return nil, "", &Error{ErrInvalidCharRange, s}
1432		}
1433		seq, t = s[:end+1], s[end+1:]
1434		name = s[3:end]
1435		if err = checkUTF8(name); err != nil {
1436			return
1437		}
1438	}
1439
1440	// Group can have leading negation too.  \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
1441	if name != "" && name[0] == '^' {
1442		sign = -sign
1443		name = name[1:]
1444	}
1445
1446	tab, fold := unicodeTable(name)
1447	if tab == nil {
1448		return nil, "", &Error{ErrInvalidCharRange, seq}
1449	}
1450
1451	if p.flags&FoldCase == 0 || fold == nil {
1452		if sign > 0 {
1453			r = appendTable(r, tab)
1454		} else {
1455			r = appendNegatedTable(r, tab)
1456		}
1457	} else {
1458		// Merge and clean tab and fold in a temporary buffer.
1459		// This is necessary for the negative case and just tidy
1460		// for the positive case.
1461		tmp := p.tmpClass[:0]
1462		tmp = appendTable(tmp, tab)
1463		tmp = appendTable(tmp, fold)
1464		p.tmpClass = tmp
1465		tmp = cleanClass(&p.tmpClass)
1466		if sign > 0 {
1467			r = appendClass(r, tmp)
1468		} else {
1469			r = appendNegatedClass(r, tmp)
1470		}
1471	}
1472	return r, t, nil
1473}
1474
1475// parseClass parses a character class at the beginning of s
1476// and pushes it onto the parse stack.
1477func (p *parser) parseClass(s string) (rest string, err os.Error) {
1478	t := s[1:] // chop [
1479	re := p.newRegexp(OpCharClass)
1480	re.Flags = p.flags
1481	re.Rune = re.Rune0[:0]
1482
1483	sign := +1
1484	if t != "" && t[0] == '^' {
1485		sign = -1
1486		t = t[1:]
1487
1488		// If character class does not match \n, add it here,
1489		// so that negation later will do the right thing.
1490		if p.flags&ClassNL == 0 {
1491			re.Rune = append(re.Rune, '\n', '\n')
1492		}
1493	}
1494
1495	class := re.Rune
1496	first := true // ] and - are okay as first char in class
1497	for t == "" || t[0] != ']' || first {
1498		// POSIX: - is only okay unescaped as first or last in class.
1499		// Perl: - is okay anywhere.
1500		if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
1501			_, size := utf8.DecodeRuneInString(t[1:])
1502			return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
1503		}
1504		first = false
1505
1506		// Look for POSIX [:alnum:] etc.
1507		if len(t) > 2 && t[0] == '[' && t[1] == ':' {
1508			nclass, nt, err := p.parseNamedClass(t, class)
1509			if err != nil {
1510				return "", err
1511			}
1512			if nclass != nil {
1513				class, t = nclass, nt
1514				continue
1515			}
1516		}
1517
1518		// Look for Unicode character group like \p{Han}.
1519		nclass, nt, err := p.parseUnicodeClass(t, class)
1520		if err != nil {
1521			return "", err
1522		}
1523		if nclass != nil {
1524			class, t = nclass, nt
1525			continue
1526		}
1527
1528		// Look for Perl character class symbols (extension).
1529		if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
1530			class, t = nclass, nt
1531			continue
1532		}
1533
1534		// Single character or simple range.
1535		rng := t
1536		var lo, hi int
1537		if lo, t, err = p.parseClassChar(t, s); err != nil {
1538			return "", err
1539		}
1540		hi = lo
1541		// [a-] means (a|-) so check for final ].
1542		if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
1543			t = t[1:]
1544			if hi, t, err = p.parseClassChar(t, s); err != nil {
1545				return "", err
1546			}
1547			if hi < lo {
1548				rng = rng[:len(rng)-len(t)]
1549				return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
1550			}
1551		}
1552		if p.flags&FoldCase == 0 {
1553			class = appendRange(class, lo, hi)
1554		} else {
1555			class = appendFoldedRange(class, lo, hi)
1556		}
1557	}
1558	t = t[1:] // chop ]
1559
1560	// Use &re.Rune instead of &class to avoid allocation.
1561	re.Rune = class
1562	class = cleanClass(&re.Rune)
1563	if sign < 0 {
1564		class = negateClass(class)
1565	}
1566	re.Rune = class
1567	p.push(re)
1568	return t, nil
1569}
1570
1571// cleanClass sorts the ranges (pairs of elements of r),
1572// merges them, and eliminates duplicates.
1573func cleanClass(rp *[]int) []int {
1574
1575	// Sort by lo increasing, hi decreasing to break ties.
1576	sort.Sort(ranges{rp})
1577
1578	r := *rp
1579	if len(r) < 2 {
1580		return r
1581	}
1582
1583	// Merge abutting, overlapping.
1584	w := 2 // write index
1585	for i := 2; i < len(r); i += 2 {
1586		lo, hi := r[i], r[i+1]
1587		if lo <= r[w-1]+1 {
1588			// merge with previous range
1589			if hi > r[w-1] {
1590				r[w-1] = hi
1591			}
1592			continue
1593		}
1594		// new disjoint range
1595		r[w] = lo
1596		r[w+1] = hi
1597		w += 2
1598	}
1599
1600	return r[:w]
1601}
1602
1603// appendLiteral returns the result of appending the literal x to the class r.
1604func appendLiteral(r []int, x int, flags Flags) []int {
1605	if flags&FoldCase != 0 {
1606		return appendFoldedRange(r, x, x)
1607	}
1608	return appendRange(r, x, x)
1609}
1610
// appendRange returns the result of appending the range lo-hi to the class r.
// If the new range overlaps or abuts the last or the next-to-last range
// already in r, that range is widened in place instead of a new pair
// being appended.
func appendRange(r []int, lo, hi int) []int {
	// Checking two ranges helps when appending case-folded
	// alphabets, so that one range can be expanding A-Z and the
	// other expanding a-z.
	n := len(r)
	for _, back := range [...]int{2, 4} {
		if n < back {
			break
		}
		start := n - back
		rlo, rhi := r[start], r[start+1]
		if lo > rhi+1 || rlo > hi+1 {
			// Neither overlapping nor adjacent.
			continue
		}
		if lo < rlo {
			r[start] = lo
		}
		if hi > rhi {
			r[start+1] = hi
		}
		return r
	}

	return append(r, lo, hi)
}
1635
// minFold and maxFold are the minimum and maximum runes involved in
// folding; appendFoldedRange skips fold lookups outside this interval.
// checked during test.
const (
	minFold = 0x41
	maxFold = 0x1044F
)
1642
1643// appendFoldedRange returns the result of appending the range lo-hi
1644// and its case folding-equivalent runes to the class r.
1645func appendFoldedRange(r []int, lo, hi int) []int {
1646	// Optimizations.
1647	if lo <= minFold && hi >= maxFold {
1648		// Range is full: folding can't add more.
1649		return appendRange(r, lo, hi)
1650	}
1651	if hi < minFold || lo > maxFold {
1652		// Range is outside folding possibilities.
1653		return appendRange(r, lo, hi)
1654	}
1655	if lo < minFold {
1656		// [lo, minFold-1] needs no folding.
1657		r = appendRange(r, lo, minFold-1)
1658		lo = minFold
1659	}
1660	if hi > maxFold {
1661		// [maxFold+1, hi] needs no folding.
1662		r = appendRange(r, maxFold+1, hi)
1663		hi = maxFold
1664	}
1665
1666	// Brute force.  Depend on appendRange to coalesce ranges on the fly.
1667	for c := lo; c <= hi; c++ {
1668		r = appendRange(r, c, c)
1669		f := unicode.SimpleFold(c)
1670		for f != c {
1671			r = appendRange(r, f, f)
1672			f = unicode.SimpleFold(f)
1673		}
1674	}
1675	return r
1676}
1677
1678// appendClass returns the result of appending the class x to the class r.
1679// It assume x is clean.
1680func appendClass(r []int, x []int) []int {
1681	for i := 0; i < len(x); i += 2 {
1682		r = appendRange(r, x[i], x[i+1])
1683	}
1684	return r
1685}
1686
1687// appendFolded returns the result of appending the case folding of the class x to the class r.
1688func appendFoldedClass(r []int, x []int) []int {
1689	for i := 0; i < len(x); i += 2 {
1690		r = appendFoldedRange(r, x[i], x[i+1])
1691	}
1692	return r
1693}
1694
1695// appendNegatedClass returns the result of appending the negation of the class x to the class r.
1696// It assumes x is clean.
1697func appendNegatedClass(r []int, x []int) []int {
1698	nextLo := 0
1699	for i := 0; i < len(x); i += 2 {
1700		lo, hi := x[i], x[i+1]
1701		if nextLo <= lo-1 {
1702			r = appendRange(r, nextLo, lo-1)
1703		}
1704		nextLo = hi + 1
1705	}
1706	if nextLo <= unicode.MaxRune {
1707		r = appendRange(r, nextLo, unicode.MaxRune)
1708	}
1709	return r
1710}
1711
1712// appendTable returns the result of appending x to the class r.
1713func appendTable(r []int, x *unicode.RangeTable) []int {
1714	for _, xr := range x.R16 {
1715		lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride)
1716		if stride == 1 {
1717			r = appendRange(r, lo, hi)
1718			continue
1719		}
1720		for c := lo; c <= hi; c += stride {
1721			r = appendRange(r, c, c)
1722		}
1723	}
1724	for _, xr := range x.R32 {
1725		lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride)
1726		if stride == 1 {
1727			r = appendRange(r, lo, hi)
1728			continue
1729		}
1730		for c := lo; c <= hi; c += stride {
1731			r = appendRange(r, c, c)
1732		}
1733	}
1734	return r
1735}
1736
1737// appendNegatedTable returns the result of appending the negation of x to the class r.
1738func appendNegatedTable(r []int, x *unicode.RangeTable) []int {
1739	nextLo := 0 // lo end of next class to add
1740	for _, xr := range x.R16 {
1741		lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride)
1742		if stride == 1 {
1743			if nextLo <= lo-1 {
1744				r = appendRange(r, nextLo, lo-1)
1745			}
1746			nextLo = hi + 1
1747			continue
1748		}
1749		for c := lo; c <= hi; c += stride {
1750			if nextLo <= c-1 {
1751				r = appendRange(r, nextLo, c-1)
1752			}
1753			nextLo = c + 1
1754		}
1755	}
1756	for _, xr := range x.R32 {
1757		lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride)
1758		if stride == 1 {
1759			if nextLo <= lo-1 {
1760				r = appendRange(r, nextLo, lo-1)
1761			}
1762			nextLo = hi + 1
1763			continue
1764		}
1765		for c := lo; c <= hi; c += stride {
1766			if nextLo <= c-1 {
1767				r = appendRange(r, nextLo, c-1)
1768			}
1769			nextLo = c + 1
1770		}
1771	}
1772	if nextLo <= unicode.MaxRune {
1773		r = appendRange(r, nextLo, unicode.MaxRune)
1774	}
1775	return r
1776}
1777
// negateClass overwrites r and returns r's negation.
// It assumes the class r is already clean.
func negateClass(r []int) []int {
	// out reuses r's storage.  It never gets ahead of the read index,
	// because each input pair produces at most one output pair.
	out := r[:0]
	next := 0 // lo end of next range to add
	for i := 0; i < len(r); i += 2 {
		lo, hi := r[i], r[i+1]
		if next <= lo-1 {
			out = append(out, next, lo-1)
		}
		next = hi + 1
	}
	if next <= unicode.MaxRune {
		// It's possible for the negation to have one more
		// range - this one - than the original class, so this
		// append may have to grow the slice.
		out = append(out, next, unicode.MaxRune)
	}
	return out
}
1800
// ranges implements sort.Interface on a slice of lo, hi rune pairs,
// treating each pair as one element.
// The choice of receiver type definition is strange
// but avoids an allocation since we already have
// a *[]int.
type ranges struct {
	p *[]int
}

// Less orders pairs by lo increasing, breaking ties by hi decreasing.
func (ra ranges) Less(i, j int) bool {
	p := *ra.p
	loI, hiI := p[2*i], p[2*i+1]
	loJ, hiJ := p[2*j], p[2*j+1]
	if loI != loJ {
		return loI < loJ
	}
	return hiI > hiJ
}

// Len returns the number of pairs.
func (ra ranges) Len() int {
	return len(*ra.p) / 2
}

// Swap exchanges the pairs at positions i and j.
func (ra ranges) Swap(i, j int) {
	p := *ra.p
	a, b := 2*i, 2*j
	p[a], p[b] = p[b], p[a]
	p[a+1], p[b+1] = p[b+1], p[a+1]
}
1826
1827func checkUTF8(s string) os.Error {
1828	for s != "" {
1829		rune, size := utf8.DecodeRuneInString(s)
1830		if rune == utf8.RuneError && size == 1 {
1831			return &Error{Code: ErrInvalidUTF8, Expr: s}
1832		}
1833		s = s[size:]
1834	}
1835	return nil
1836}
1837
1838func nextRune(s string) (c int, t string, err os.Error) {
1839	c, size := utf8.DecodeRuneInString(s)
1840	if c == utf8.RuneError && size == 1 {
1841		return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
1842	}
1843	return c, s[size:], nil
1844}
1845
// isalnum reports whether c is an ASCII letter or digit.
func isalnum(c int) bool {
	switch {
	case '0' <= c && c <= '9',
		'A' <= c && c <= 'Z',
		'a' <= c && c <= 'z':
		return true
	}
	return false
}
1849
// unhex returns the value of the hexadecimal digit c (either case),
// or -1 if c is not a hexadecimal digit.
func unhex(c int) int {
	switch {
	case '0' <= c && c <= '9':
		return c - '0'
	case 'a' <= c && c <= 'f':
		return c - 'a' + 10
	case 'A' <= c && c <= 'F':
		return c - 'A' + 10
	}
	return -1
}