/regexp/syntax/parse.go

https://code.google.com/p/appengine-go-backports/ · Go · 1861 lines · 1411 code · 149 blank · 301 comment · 571 complexity · ed9cc54c2b086c0f7b00f6258e064853 MD5 · raw file

  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package syntax
  5. import (
  6. "os"
  7. "sort"
  8. "strings"
  9. "unicode"
  10. "utf8"
  11. )
// An Error describes a failure to parse a regular expression
// and gives the offending expression.
type Error struct {
	Code ErrorCode // which kind of parse failure occurred
	Expr string    // the expression text reported alongside the failure
}
  18. func (e *Error) String() string {
  19. return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
  20. }
  21. // An ErrorCode describes a failure to parse a regular expression.
  22. type ErrorCode string
  23. const (
  24. // Unexpected error
  25. ErrInternalError ErrorCode = "regexp/syntax: internal error"
  26. // Parse errors
  27. ErrInvalidCharClass ErrorCode = "invalid character class"
  28. ErrInvalidCharRange ErrorCode = "invalid character class range"
  29. ErrInvalidEscape ErrorCode = "invalid escape sequence"
  30. ErrInvalidNamedCapture ErrorCode = "invalid named capture"
  31. ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax"
  32. ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator"
  33. ErrInvalidRepeatSize ErrorCode = "invalid repeat count"
  34. ErrInvalidUTF8 ErrorCode = "invalid UTF-8"
  35. ErrMissingBracket ErrorCode = "missing closing ]"
  36. ErrMissingParen ErrorCode = "missing closing )"
  37. ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
  38. ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression"
  39. )
  40. func (e ErrorCode) String() string {
  41. return string(e)
  42. }
  43. // Flags control the behavior of the parser and record information about regexp context.
  44. type Flags uint16
  45. const (
  46. FoldCase Flags = 1 << iota // case-insensitive match
  47. Literal // treat pattern as literal string
  48. ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline
  49. DotNL // allow . to match newline
  50. OneLine // treat ^ and $ as only matching at beginning and end of text
  51. NonGreedy // make repetition operators default to non-greedy
  52. PerlX // allow Perl extensions
  53. UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation
  54. WasDollar // regexp OpEndText was $, not \z
  55. Simple // regexp contains no counted repetition
  56. MatchNL = ClassNL | DotNL
  57. Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
  58. POSIX Flags = 0 // POSIX syntax
  59. )
// Pseudo-ops for parsing stack.
// They are numbered from opPseudo upward; the stack-scanning code in this
// file treats any node whose Op is >= opPseudo as a marker rather than
// as a parsed expression.
const (
	opLeftParen = opPseudo + iota // stack marker pushed for an opening parenthesis
	opVerticalBar // stack marker pushed for a '|'
)
// parser holds the state of one parse: the mode flags, the stack of
// partially built expressions, and a free list of recycled Regexp nodes.
type parser struct {
	flags Flags // parse mode flags
	stack []*Regexp // stack of parsed expressions
	free *Regexp // head of the free list of recycled nodes, chained through Sub0[0] (see newRegexp and reuse)
	numCap int // number of capturing groups seen
	wholeRegexp string // the complete pattern being parsed (set by Parse)
	tmpClass []int // temporary char class work space
}
  73. func (p *parser) newRegexp(op Op) *Regexp {
  74. re := p.free
  75. if re != nil {
  76. p.free = re.Sub0[0]
  77. *re = Regexp{}
  78. } else {
  79. re = new(Regexp)
  80. }
  81. re.Op = op
  82. return re
  83. }
  84. func (p *parser) reuse(re *Regexp) {
  85. re.Sub0[0] = p.free
  86. p.free = re
  87. }
  88. // Parse stack manipulation.
  89. // push pushes the regexp re onto the parse stack and returns the regexp.
  90. func (p *parser) push(re *Regexp) *Regexp {
  91. if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
  92. // Single rune.
  93. if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
  94. return nil
  95. }
  96. re.Op = OpLiteral
  97. re.Rune = re.Rune[:1]
  98. re.Flags = p.flags &^ FoldCase
  99. } else if re.Op == OpCharClass && len(re.Rune) == 4 &&
  100. re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
  101. unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
  102. unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
  103. re.Op == OpCharClass && len(re.Rune) == 2 &&
  104. re.Rune[0]+1 == re.Rune[1] &&
  105. unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
  106. unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
  107. // Case-insensitive rune like [Aa] or [Î&#x201D;δ].
  108. if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
  109. return nil
  110. }
  111. // Rewrite as (case-insensitive) literal.
  112. re.Op = OpLiteral
  113. re.Rune = re.Rune[:1]
  114. re.Flags = p.flags | FoldCase
  115. } else {
  116. // Incremental concatenation.
  117. p.maybeConcat(-1, 0)
  118. }
  119. p.stack = append(p.stack, re)
  120. return re
  121. }
  122. // maybeConcat implements incremental concatenation
  123. // of literal runes into string nodes. The parser calls this
  124. // before each push, so only the top fragment of the stack
  125. // might need processing. Since this is called before a push,
  126. // the topmost literal is no longer subject to operators like *
  127. // (Otherwise ab* would turn into (ab)*.)
  128. // If r >= 0 and there's a node left over, maybeConcat uses it
  129. // to push r with the given flags.
  130. // maybeConcat reports whether r was pushed.
  131. func (p *parser) maybeConcat(r int, flags Flags) bool {
  132. n := len(p.stack)
  133. if n < 2 {
  134. return false
  135. }
  136. re1 := p.stack[n-1]
  137. re2 := p.stack[n-2]
  138. if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
  139. return false
  140. }
  141. // Push re1 into re2.
  142. re2.Rune = append(re2.Rune, re1.Rune...)
  143. // Reuse re1 if possible.
  144. if r >= 0 {
  145. re1.Rune = re1.Rune0[:1]
  146. re1.Rune[0] = r
  147. re1.Flags = flags
  148. return true
  149. }
  150. p.stack = p.stack[:n-1]
  151. p.reuse(re1)
  152. return false // did not push r
  153. }
  154. // newLiteral returns a new OpLiteral Regexp with the given flags
  155. func (p *parser) newLiteral(r int, flags Flags) *Regexp {
  156. re := p.newRegexp(OpLiteral)
  157. re.Flags = flags
  158. if flags&FoldCase != 0 {
  159. r = minFoldRune(r)
  160. }
  161. re.Rune0[0] = r
  162. re.Rune = re.Rune0[:1]
  163. return re
  164. }
  165. // minFoldRune returns the minimum rune fold-equivalent to r.
  166. func minFoldRune(r int) int {
  167. if r < minFold || r > maxFold {
  168. return r
  169. }
  170. min := r
  171. r0 := r
  172. for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
  173. if min > r {
  174. min = r
  175. }
  176. }
  177. return min
  178. }
  179. // literal pushes a literal regexp for the rune r on the stack
  180. // and returns that regexp.
  181. func (p *parser) literal(r int) {
  182. p.push(p.newLiteral(r, p.flags))
  183. }
  184. // op pushes a regexp with the given op onto the stack
  185. // and returns that regexp.
  186. func (p *parser) op(op Op) *Regexp {
  187. re := p.newRegexp(op)
  188. re.Flags = p.flags
  189. return p.push(re)
  190. }
  191. // repeat replaces the top stack element with itself repeated according to op, min, max.
  192. // before is the regexp suffix starting at the repetition operator.
  193. // after is the regexp suffix following after the repetition operator.
  194. // repeat returns an updated 'after' and an error, if any.
  195. func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, os.Error) {
  196. flags := p.flags
  197. if p.flags&PerlX != 0 {
  198. if len(after) > 0 && after[0] == '?' {
  199. after = after[1:]
  200. flags ^= NonGreedy
  201. }
  202. if lastRepeat != "" {
  203. // In Perl it is not allowed to stack repetition operators:
  204. // a** is a syntax error, not a doubled star, and a++ means
  205. // something else entirely, which we don't support!
  206. return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}
  207. }
  208. }
  209. n := len(p.stack)
  210. if n == 0 {
  211. return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
  212. }
  213. sub := p.stack[n-1]
  214. if sub.Op >= opPseudo {
  215. return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
  216. }
  217. re := p.newRegexp(op)
  218. re.Min = min
  219. re.Max = max
  220. re.Flags = flags
  221. re.Sub = re.Sub0[:1]
  222. re.Sub[0] = sub
  223. p.stack[n-1] = re
  224. return after, nil
  225. }
  226. // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
  227. func (p *parser) concat() *Regexp {
  228. p.maybeConcat(-1, 0)
  229. // Scan down to find pseudo-operator | or (.
  230. i := len(p.stack)
  231. for i > 0 && p.stack[i-1].Op < opPseudo {
  232. i--
  233. }
  234. subs := p.stack[i:]
  235. p.stack = p.stack[:i]
  236. // Empty concatenation is special case.
  237. if len(subs) == 0 {
  238. return p.push(p.newRegexp(OpEmptyMatch))
  239. }
  240. return p.push(p.collapse(subs, OpConcat))
  241. }
  242. // alternate replaces the top of the stack (above the topmost '(') with its alternation.
  243. func (p *parser) alternate() *Regexp {
  244. // Scan down to find pseudo-operator (.
  245. // There are no | above (.
  246. i := len(p.stack)
  247. for i > 0 && p.stack[i-1].Op < opPseudo {
  248. i--
  249. }
  250. subs := p.stack[i:]
  251. p.stack = p.stack[:i]
  252. // Make sure top class is clean.
  253. // All the others already are (see swapVerticalBar).
  254. if len(subs) > 0 {
  255. cleanAlt(subs[len(subs)-1])
  256. }
  257. // Empty alternate is special case
  258. // (shouldn't happen but easy to handle).
  259. if len(subs) == 0 {
  260. return p.push(p.newRegexp(OpNoMatch))
  261. }
  262. return p.push(p.collapse(subs, OpAlternate))
  263. }
  264. // cleanAlt cleans re for eventual inclusion in an alternation.
  265. func cleanAlt(re *Regexp) {
  266. switch re.Op {
  267. case OpCharClass:
  268. re.Rune = cleanClass(&re.Rune)
  269. if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
  270. re.Rune = nil
  271. re.Op = OpAnyChar
  272. return
  273. }
  274. if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
  275. re.Rune = nil
  276. re.Op = OpAnyCharNotNL
  277. return
  278. }
  279. if cap(re.Rune)-len(re.Rune) > 100 {
  280. // re.Rune will not grow any more.
  281. // Make a copy or inline to reclaim storage.
  282. re.Rune = append(re.Rune0[:0], re.Rune...)
  283. }
  284. }
  285. }
  286. // collapse returns the result of applying op to sub.
  287. // If sub contains op nodes, they all get hoisted up
  288. // so that there is never a concat of a concat or an
  289. // alternate of an alternate.
  290. func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
  291. if len(subs) == 1 {
  292. return subs[0]
  293. }
  294. re := p.newRegexp(op)
  295. re.Sub = re.Sub0[:0]
  296. for _, sub := range subs {
  297. if sub.Op == op {
  298. re.Sub = append(re.Sub, sub.Sub...)
  299. p.reuse(sub)
  300. } else {
  301. re.Sub = append(re.Sub, sub)
  302. }
  303. }
  304. if op == OpAlternate {
  305. re.Sub = p.factor(re.Sub, re.Flags)
  306. if len(re.Sub) == 1 {
  307. old := re
  308. re = re.Sub[0]
  309. p.reuse(old)
  310. }
  311. }
  312. return re
  313. }
// factor factors common prefixes from the alternation list sub.
// It returns a replacement list that reuses the same storage and
// frees (passes to p.reuse) any removed *Regexps.
//
// For example,
//	ABC|ABD|AEF|BCX|BCY
// simplifies by literal prefix extraction to
//	A(B(C|D)|EF)|BC(X|Y)
// which simplifies by character class introduction to
//	A(B[CD]|EF)|BC[XY]
//
// The work is done in four passes over sub, each of which writes its
// output back into the front of the same slice (out := sub[:0]).
// The flags parameter is not referenced by the body.
func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
	if len(sub) < 2 {
		return sub
	}
	// Round 1: Factor out common literal prefixes.
	var str []int
	var strflags Flags
	start := 0
	out := sub[:0]
	// The loop runs one step past the end (i == len(sub)) so that the
	// final run is flushed by the same code that handles every other run.
	for i := 0; i <= len(sub); i++ {
		// Invariant: the Regexps that were in sub[0:start] have been
		// used or marked for reuse, and the slice space has been reused
		// for out (len(out) <= start).
		//
		// Invariant: sub[start:i] consists of regexps that all begin
		// with str as modified by strflags.
		var istr []int
		var iflags Flags
		if i < len(sub) {
			istr, iflags = p.leadingString(sub[i])
			if iflags == strflags {
				same := 0
				for same < len(str) && same < len(istr) && str[same] == istr[same] {
					same++
				}
				if same > 0 {
					// Matches at least one rune in current range.
					// Keep going around.
					str = str[:same]
					continue
				}
			}
		}
		// Found end of a run with common leading literal string:
		// sub[start:i] all begin with str[0:len(str)], but sub[i]
		// does not even begin with str[0].
		//
		// Factor out common string and append factored expression to out.
		if i == start {
			// Nothing to do - run of length 0.
		} else if i == start+1 {
			// Just one: don't bother factoring.
			out = append(out, sub[start])
		} else {
			// Construct factored form: prefix(suffix1|suffix2|...)
			prefix := p.newRegexp(OpLiteral)
			prefix.Flags = strflags
			prefix.Rune = append(prefix.Rune[:0], str...)
			for j := start; j < i; j++ {
				sub[j] = p.removeLeadingString(sub[j], len(str))
			}
			suffix := p.collapse(sub[start:i], OpAlternate) // recurse
			re := p.newRegexp(OpConcat)
			re.Sub = append(re.Sub[:0], prefix, suffix)
			out = append(out, re)
		}
		// Prepare for next iteration.
		start = i
		str = istr
		strflags = iflags
	}
	sub = out
	// Round 2: Factor out common complex prefixes,
	// just the first piece of each concatenation,
	// whatever it is. This is good enough a lot of the time.
	start = 0
	out = sub[:0]
	var first *Regexp
	for i := 0; i <= len(sub); i++ {
		// Invariant: the Regexps that were in sub[0:start] have been
		// used or marked for reuse, and the slice space has been reused
		// for out (len(out) <= start).
		//
		// Invariant: sub[start:i] consists of regexps that all begin with ifirst.
		var ifirst *Regexp
		if i < len(sub) {
			ifirst = p.leadingRegexp(sub[i])
			if first != nil && first.Equal(ifirst) {
				continue
			}
		}
		// Found end of a run with common leading regexp:
		// sub[start:i] all begin with first but sub[i] does not.
		//
		// Factor out common regexp and append factored expression to out.
		if i == start {
			// Nothing to do - run of length 0.
		} else if i == start+1 {
			// Just one: don't bother factoring.
			out = append(out, sub[start])
		} else {
			// Construct factored form: prefix(suffix1|suffix2|...)
			prefix := first
			for j := start; j < i; j++ {
				reuse := j != start // prefix came from sub[start]
				sub[j] = p.removeLeadingRegexp(sub[j], reuse)
			}
			suffix := p.collapse(sub[start:i], OpAlternate) // recurse
			re := p.newRegexp(OpConcat)
			re.Sub = append(re.Sub[:0], prefix, suffix)
			out = append(out, re)
		}
		// Prepare for next iteration.
		start = i
		first = ifirst
	}
	sub = out
	// Round 3: Collapse runs of single literals into character classes.
	start = 0
	out = sub[:0]
	for i := 0; i <= len(sub); i++ {
		// Invariant: the Regexps that were in sub[0:start] have been
		// used or marked for reuse, and the slice space has been reused
		// for out (len(out) <= start).
		//
		// Invariant: sub[start:i] consists of regexps that are either
		// literal runes or character classes.
		if i < len(sub) && isCharClass(sub[i]) {
			continue
		}
		// sub[i] is not a char or char class;
		// emit char class for sub[start:i]...
		if i == start {
			// Nothing to do - run of length 0.
		} else if i == start+1 {
			out = append(out, sub[start])
		} else {
			// Make new char class.
			// Start with most complex regexp in sub[start].
			max := start
			for j := start + 1; j < i; j++ {
				if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
					max = j
				}
			}
			// Move the chosen node to the front of the run so that every
			// other member is merged into it.
			sub[start], sub[max] = sub[max], sub[start]
			for j := start + 1; j < i; j++ {
				mergeCharClass(sub[start], sub[j])
				p.reuse(sub[j])
			}
			cleanAlt(sub[start])
			out = append(out, sub[start])
		}
		// ... and then emit sub[i].
		if i < len(sub) {
			out = append(out, sub[i])
		}
		start = i + 1
	}
	sub = out
	// Round 4: Collapse runs of empty matches into a single empty match.
	// (start is reset here but not read again in this round.)
	start = 0
	out = sub[:0]
	for i := range sub {
		// Skip an empty match that is immediately followed by another one,
		// so only the last of each run survives.
		if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
			continue
		}
		out = append(out, sub[i])
	}
	sub = out
	return sub
}
  487. // leadingString returns the leading literal string that re begins with.
  488. // The string refers to storage in re or its children.
  489. func (p *parser) leadingString(re *Regexp) ([]int, Flags) {
  490. if re.Op == OpConcat && len(re.Sub) > 0 {
  491. re = re.Sub[0]
  492. }
  493. if re.Op != OpLiteral {
  494. return nil, 0
  495. }
  496. return re.Rune, re.Flags & FoldCase
  497. }
  498. // removeLeadingString removes the first n leading runes
  499. // from the beginning of re. It returns the replacement for re.
  500. func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
  501. if re.Op == OpConcat && len(re.Sub) > 0 {
  502. // Removing a leading string in a concatenation
  503. // might simplify the concatenation.
  504. sub := re.Sub[0]
  505. sub = p.removeLeadingString(sub, n)
  506. re.Sub[0] = sub
  507. if sub.Op == OpEmptyMatch {
  508. p.reuse(sub)
  509. switch len(re.Sub) {
  510. case 0, 1:
  511. // Impossible but handle.
  512. re.Op = OpEmptyMatch
  513. re.Sub = nil
  514. case 2:
  515. old := re
  516. re = re.Sub[1]
  517. p.reuse(old)
  518. default:
  519. copy(re.Sub, re.Sub[1:])
  520. re.Sub = re.Sub[:len(re.Sub)-1]
  521. }
  522. }
  523. return re
  524. }
  525. if re.Op == OpLiteral {
  526. re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
  527. if len(re.Rune) == 0 {
  528. re.Op = OpEmptyMatch
  529. }
  530. }
  531. return re
  532. }
  533. // leadingRegexp returns the leading regexp that re begins with.
  534. // The regexp refers to storage in re or its children.
  535. func (p *parser) leadingRegexp(re *Regexp) *Regexp {
  536. if re.Op == OpEmptyMatch {
  537. return nil
  538. }
  539. if re.Op == OpConcat && len(re.Sub) > 0 {
  540. sub := re.Sub[0]
  541. if sub.Op == OpEmptyMatch {
  542. return nil
  543. }
  544. return sub
  545. }
  546. return re
  547. }
  548. // removeLeadingRegexp removes the leading regexp in re.
  549. // It returns the replacement for re.
  550. // If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
  551. func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
  552. if re.Op == OpConcat && len(re.Sub) > 0 {
  553. if reuse {
  554. p.reuse(re.Sub[0])
  555. }
  556. re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
  557. switch len(re.Sub) {
  558. case 0:
  559. re.Op = OpEmptyMatch
  560. re.Sub = nil
  561. case 1:
  562. old := re
  563. re = re.Sub[0]
  564. p.reuse(old)
  565. }
  566. return re
  567. }
  568. if reuse {
  569. p.reuse(re)
  570. }
  571. return p.newRegexp(OpEmptyMatch)
  572. }
  573. func literalRegexp(s string, flags Flags) *Regexp {
  574. re := &Regexp{Op: OpLiteral}
  575. re.Flags = flags
  576. re.Rune = re.Rune0[:0] // use local storage for small strings
  577. for _, c := range s {
  578. if len(re.Rune) >= cap(re.Rune) {
  579. // string is too long to fit in Rune0. let Go handle it
  580. re.Rune = []int(s)
  581. break
  582. }
  583. re.Rune = append(re.Rune, c)
  584. }
  585. return re
  586. }
  587. // Parsing.
  588. func Parse(s string, flags Flags) (*Regexp, os.Error) {
  589. if flags&Literal != 0 {
  590. // Trivial parser for literal string.
  591. if err := checkUTF8(s); err != nil {
  592. return nil, err
  593. }
  594. return literalRegexp(s, flags), nil
  595. }
  596. // Otherwise, must do real work.
  597. var (
  598. p parser
  599. err os.Error
  600. c int
  601. op Op
  602. lastRepeat string
  603. min, max int
  604. )
  605. p.flags = flags
  606. p.wholeRegexp = s
  607. t := s
  608. for t != "" {
  609. repeat := ""
  610. BigSwitch:
  611. switch t[0] {
  612. default:
  613. if c, t, err = nextRune(t); err != nil {
  614. return nil, err
  615. }
  616. p.literal(c)
  617. case '(':
  618. if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
  619. // Flag changes and non-capturing groups.
  620. if t, err = p.parsePerlFlags(t); err != nil {
  621. return nil, err
  622. }
  623. break
  624. }
  625. p.numCap++
  626. p.op(opLeftParen).Cap = p.numCap
  627. t = t[1:]
  628. case '|':
  629. if err = p.parseVerticalBar(); err != nil {
  630. return nil, err
  631. }
  632. t = t[1:]
  633. case ')':
  634. if err = p.parseRightParen(); err != nil {
  635. return nil, err
  636. }
  637. t = t[1:]
  638. case '^':
  639. if p.flags&OneLine != 0 {
  640. p.op(OpBeginText)
  641. } else {
  642. p.op(OpBeginLine)
  643. }
  644. t = t[1:]
  645. case '$':
  646. if p.flags&OneLine != 0 {
  647. p.op(OpEndText).Flags |= WasDollar
  648. } else {
  649. p.op(OpEndLine)
  650. }
  651. t = t[1:]
  652. case '.':
  653. if p.flags&DotNL != 0 {
  654. p.op(OpAnyChar)
  655. } else {
  656. p.op(OpAnyCharNotNL)
  657. }
  658. t = t[1:]
  659. case '[':
  660. if t, err = p.parseClass(t); err != nil {
  661. return nil, err
  662. }
  663. case '*', '+', '?':
  664. before := t
  665. switch t[0] {
  666. case '*':
  667. op = OpStar
  668. case '+':
  669. op = OpPlus
  670. case '?':
  671. op = OpQuest
  672. }
  673. after := t[1:]
  674. if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
  675. return nil, err
  676. }
  677. repeat = before
  678. t = after
  679. case '{':
  680. op = OpRepeat
  681. before := t
  682. min, max, after, ok := p.parseRepeat(t)
  683. if !ok {
  684. // If the repeat cannot be parsed, { is a literal.
  685. p.literal('{')
  686. t = t[1:]
  687. break
  688. }
  689. if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
  690. // Numbers were too big, or max is present and min > max.
  691. return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
  692. }
  693. if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
  694. return nil, err
  695. }
  696. repeat = before
  697. t = after
  698. case '\\':
  699. if p.flags&PerlX != 0 && len(t) >= 2 {
  700. switch t[1] {
  701. case 'A':
  702. p.op(OpBeginText)
  703. t = t[2:]
  704. break BigSwitch
  705. case 'b':
  706. p.op(OpWordBoundary)
  707. t = t[2:]
  708. break BigSwitch
  709. case 'B':
  710. p.op(OpNoWordBoundary)
  711. t = t[2:]
  712. break BigSwitch
  713. case 'C':
  714. // any byte; not supported
  715. return nil, &Error{ErrInvalidEscape, t[:2]}
  716. case 'Q':
  717. // \Q ... \E: the ... is always literals
  718. var lit string
  719. if i := strings.Index(t, `\E`); i < 0 {
  720. lit = t[2:]
  721. t = ""
  722. } else {
  723. lit = t[2:i]
  724. t = t[i+2:]
  725. }
  726. p.push(literalRegexp(lit, p.flags))
  727. break BigSwitch
  728. case 'z':
  729. p.op(OpEndText)
  730. t = t[2:]
  731. break BigSwitch
  732. }
  733. }
  734. re := p.newRegexp(OpCharClass)
  735. re.Flags = p.flags
  736. // Look for Unicode character group like \p{Han}
  737. if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
  738. r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
  739. if err != nil {
  740. return nil, err
  741. }
  742. if r != nil {
  743. re.Rune = r
  744. t = rest
  745. p.push(re)
  746. break BigSwitch
  747. }
  748. }
  749. // Perl character class escape.
  750. if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
  751. re.Rune = r
  752. t = rest
  753. p.push(re)
  754. break BigSwitch
  755. }
  756. p.reuse(re)
  757. // Ordinary single-character escape.
  758. if c, t, err = p.parseEscape(t); err != nil {
  759. return nil, err
  760. }
  761. p.literal(c)
  762. }
  763. lastRepeat = repeat
  764. }
  765. p.concat()
  766. if p.swapVerticalBar() {
  767. // pop vertical bar
  768. p.stack = p.stack[:len(p.stack)-1]
  769. }
  770. p.alternate()
  771. n := len(p.stack)
  772. if n != 1 {
  773. return nil, &Error{ErrMissingParen, s}
  774. }
  775. return p.stack[0], nil
  776. }
  777. // parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
  778. // If s is not of that form, it returns ok == false.
  779. // If s has the right form but the values are too big, it returns min == -1, ok == true.
  780. func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
  781. if s == "" || s[0] != '{' {
  782. return
  783. }
  784. s = s[1:]
  785. var ok1 bool
  786. if min, s, ok1 = p.parseInt(s); !ok1 {
  787. return
  788. }
  789. if s == "" {
  790. return
  791. }
  792. if s[0] != ',' {
  793. max = min
  794. } else {
  795. s = s[1:]
  796. if s == "" {
  797. return
  798. }
  799. if s[0] == '}' {
  800. max = -1
  801. } else if max, s, ok1 = p.parseInt(s); !ok1 {
  802. return
  803. } else if max < 0 {
  804. // parseInt found too big a number
  805. min = -1
  806. }
  807. }
  808. if s == "" || s[0] != '}' {
  809. return
  810. }
  811. rest = s[1:]
  812. ok = true
  813. return
  814. }
// parsePerlFlags parses a Perl flag setting or non-capturing group or both,
// like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state.
// The caller must have ensured that s begins with "(?".
//
// On success it returns the text following the consumed prefix.
// On failure it returns "" and an error: ErrInvalidNamedCapture for a
// malformed (?P<name> group, ErrInvalidPerlOp for an unrecognized flag
// sequence, or the error reported by checkUTF8 / nextRune.
func (p *parser) parsePerlFlags(s string) (rest string, err os.Error) {
	t := s
	// Check for named captures, first introduced in Python's regexp library.
	// As usual, there are three slightly different syntaxes:
	//
	// (?P<name>expr) the original, introduced by Python
	// (?<name>expr) the .NET alteration, adopted by Perl 5.10
	// (?'name'expr) another .NET alteration, adopted by Perl 5.10
	//
	// Perl 5.10 gave in and implemented the Python version too,
	// but they claim that the last two are the preferred forms.
	// PCRE and languages based on it (specifically, PHP and Ruby)
	// support all three as well. EcmaScript 4 uses only the Python form.
	//
	// In both the open source world (via Code Search) and the
	// Google source tree, (?P<name>expr) is the dominant form,
	// so that's the one we implement. One is enough.
	if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
		// Pull out name.
		end := strings.IndexRune(t, '>')
		if end < 0 {
			// No closing '>': report bad UTF-8 first if that is the
			// underlying problem, otherwise an invalid named capture.
			if err = checkUTF8(t); err != nil {
				return "", err
			}
			return "", &Error{ErrInvalidNamedCapture, s}
		}
		capture := t[:end+1] // "(?P<name>"
		name := t[4:end] // "name"
		if err = checkUTF8(name); err != nil {
			return "", err
		}
		if !isValidCaptureName(name) {
			return "", &Error{ErrInvalidNamedCapture, capture}
		}
		// Like ordinary capture, but named.
		p.numCap++
		re := p.op(opLeftParen)
		re.Cap = p.numCap
		re.Name = name
		return t[end+1:], nil
	}
	// Non-capturing group. Might also twiddle Perl flags.
	var c int
	t = t[2:] // skip (?
	flags := p.flags
	sign := +1 // becomes -1 once a '-' has been seen
	sawFlag := false // whether a flag letter has been seen since the start or since '-'
Loop:
	for t != "" {
		if c, t, err = nextRune(t); err != nil {
			return "", err
		}
		switch c {
		default:
			break Loop
		// Flags.
		case 'i':
			flags |= FoldCase
			sawFlag = true
		case 'm':
			flags &^= OneLine
			sawFlag = true
		case 's':
			flags |= DotNL
			sawFlag = true
		case 'U':
			flags |= NonGreedy
			sawFlag = true
		// Switch to negation.
		case '-':
			if sign < 0 {
				// A second '-' is not allowed.
				break Loop
			}
			sign = -1
			// Invert flags so that | above turn into &^ and vice versa.
			// We'll invert flags again before using it below.
			flags = ^flags
			sawFlag = false
		// End of flags, starting group or not.
		case ':', ')':
			if sign < 0 {
				if !sawFlag {
					// '-' must be followed by at least one flag letter.
					break Loop
				}
				flags = ^flags
			}
			if c == ':' {
				// Open new group
				p.op(opLeftParen)
			}
			p.flags = flags
			return t, nil
		}
	}
	// Reached on an unknown flag character, a misplaced '-', or running
	// out of input: report the prefix consumed so far.
	return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
}
  914. // isValidCaptureName reports whether name
  915. // is a valid capture name: [A-Za-z0-9_]+.
  916. // PCRE limits names to 32 bytes.
  917. // Python rejects names starting with digits.
  918. // We don't enforce either of those.
  919. func isValidCaptureName(name string) bool {
  920. if name == "" {
  921. return false
  922. }
  923. for _, c := range name {
  924. if c != '_' && !isalnum(c) {
  925. return false
  926. }
  927. }
  928. return true
  929. }
  930. // parseInt parses a decimal integer.
  931. func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
  932. if s == "" || s[0] < '0' || '9' < s[0] {
  933. return
  934. }
  935. // Disallow leading zeros.
  936. if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
  937. return
  938. }
  939. t := s
  940. for s != "" && '0' <= s[0] && s[0] <= '9' {
  941. s = s[1:]
  942. }
  943. rest = s
  944. ok = true
  945. // Have digits, compute value.
  946. t = t[:len(t)-len(s)]
  947. for i := 0; i < len(t); i++ {
  948. // Avoid overflow.
  949. if n >= 1e8 {
  950. n = -1
  951. break
  952. }
  953. n = n*10 + int(t[i]) - '0'
  954. }
  955. return
  956. }
  957. // can this be represented as a character class?
  958. // single-rune literal string, char class, ., and .|\n.
  959. func isCharClass(re *Regexp) bool {
  960. return re.Op == OpLiteral && len(re.Rune) == 1 ||
  961. re.Op == OpCharClass ||
  962. re.Op == OpAnyCharNotNL ||
  963. re.Op == OpAnyChar
  964. }
  965. // does re match r?
  966. func matchRune(re *Regexp, r int) bool {
  967. switch re.Op {
  968. case OpLiteral:
  969. return len(re.Rune) == 1 && re.Rune[0] == r
  970. case OpCharClass:
  971. for i := 0; i < len(re.Rune); i += 2 {
  972. if re.Rune[i] <= r && r <= re.Rune[i+1] {
  973. return true
  974. }
  975. }
  976. return false
  977. case OpAnyCharNotNL:
  978. return r != '\n'
  979. case OpAnyChar:
  980. return true
  981. }
  982. return false
  983. }
  984. // parseVerticalBar handles a | in the input.
  985. func (p *parser) parseVerticalBar() os.Error {
  986. p.concat()
  987. // The concatenation we just parsed is on top of the stack.
  988. // If it sits above an opVerticalBar, swap it below
  989. // (things below an opVerticalBar become an alternation).
  990. // Otherwise, push a new vertical bar.
  991. if !p.swapVerticalBar() {
  992. p.op(opVerticalBar)
  993. }
  994. return nil
  995. }
  996. // mergeCharClass makes dst = dst|src.
  997. // The caller must ensure that dst.Op >= src.Op,
  998. // to reduce the amount of copying.
  999. func mergeCharClass(dst, src *Regexp) {
  1000. switch dst.Op {
  1001. case OpAnyChar:
  1002. // src doesn't add anything.
  1003. case OpAnyCharNotNL:
  1004. // src might add \n
  1005. if matchRune(src, '\n') {
  1006. dst.Op = OpAnyChar
  1007. }
  1008. case OpCharClass:
  1009. // src is simpler, so either literal or char class
  1010. if src.Op == OpLiteral {
  1011. dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
  1012. } else {
  1013. dst.Rune = appendClass(dst.Rune, src.Rune)
  1014. }
  1015. case OpLiteral:
  1016. // both literal
  1017. if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
  1018. break
  1019. }
  1020. dst.Op = OpCharClass
  1021. dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
  1022. dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
  1023. }
  1024. }
  1025. // If the top of the stack is an element followed by an opVerticalBar
  1026. // swapVerticalBar swaps the two and returns true.
  1027. // Otherwise it returns false.
  1028. func (p *parser) swapVerticalBar() bool {
  1029. // If above and below vertical bar are literal or char class,
  1030. // can merge into a single char class.
  1031. n := len(p.stack)
  1032. if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
  1033. re1 := p.stack[n-1]
  1034. re3 := p.stack[n-3]
  1035. // Make re3 the more complex of the two.
  1036. if re1.Op > re3.Op {
  1037. re1, re3 = re3, re1
  1038. p.stack[n-3] = re3
  1039. }
  1040. mergeCharClass(re3, re1)
  1041. p.reuse(re1)
  1042. p.stack = p.stack[:n-1]
  1043. return true
  1044. }
  1045. if n >= 2 {
  1046. re1 := p.stack[n-1]
  1047. re2 := p.stack[n-2]
  1048. if re2.Op == opVerticalBar {
  1049. if n >= 3 {
  1050. // Now out of reach.
  1051. // Clean opportunistically.
  1052. cleanAlt(p.stack[n-3])
  1053. }
  1054. p.stack[n-2] = re1
  1055. p.stack[n-1] = re2
  1056. return true
  1057. }
  1058. }
  1059. return false
  1060. }
  1061. // parseRightParen handles a ) in the input.
  1062. func (p *parser) parseRightParen() os.Error {
  1063. p.concat()
  1064. if p.swapVerticalBar() {
  1065. // pop vertical bar
  1066. p.stack = p.stack[:len(p.stack)-1]
  1067. }
  1068. p.alternate()
  1069. n := len(p.stack)
  1070. if n < 2 {
  1071. return &Error{ErrInternalError, ""}
  1072. }
  1073. re1 := p.stack[n-1]
  1074. re2 := p.stack[n-2]
  1075. p.stack = p.stack[:n-2]
  1076. if re2.Op != opLeftParen {
  1077. return &Error{ErrMissingParen, p.wholeRegexp}
  1078. }
  1079. // Restore flags at time of paren.
  1080. p.flags = re2.Flags
  1081. if re2.Cap == 0 {
  1082. // Just for grouping.
  1083. p.push(re1)
  1084. } else {
  1085. re2.Op = OpCapture
  1086. re2.Sub = re2.Sub0[:1]
  1087. re2.Sub[0] = re1
  1088. p.push(re2)
  1089. }
  1090. return nil
  1091. }
// parseEscape parses an escape sequence at the beginning of s
// and returns the rune.
//
// s[0] is skipped unconditionally, so the caller is expected to pass a
// string that starts with the backslash. On success it returns the decoded
// rune and the text following the escape. On failure it returns an *Error:
// ErrTrailingBackslash when nothing follows the backslash, the error from
// nextRune when the text is not valid UTF-8, or ErrInvalidEscape quoting
// the part of s consumed so far.
func (p *parser) parseEscape(s string) (r int, rest string, err os.Error) {
	t := s[1:]
	if t == "" {
		return 0, "", &Error{ErrTrailingBackslash, ""}
	}
	c, t, err := nextRune(t)
	if err != nil {
		return 0, "", err
	}

	// Any break out of this switch (plain or labeled) falls through to the
	// ErrInvalidEscape return at the bottom of the function.
Switch:
	switch c {
	default:
		if c < utf8.RuneSelf && !isalnum(c) {
			// Escaped non-word characters are always themselves.
			// PCRE is not quite so rigorous: it accepts things like
			// \q, but we don't. We once rejected \_, but too many
			// programs and people insist on using it, so allow \_.
			return c, t, nil
		}

	// Octal escapes.
	case '1', '2', '3', '4', '5', '6', '7':
		// Single non-zero digit is a backreference; not supported
		if t == "" || t[0] < '0' || t[0] > '7' {
			break
		}
		fallthrough
	case '0':
		// Consume up to three octal digits; already have one.
		r = c - '0'
		for i := 1; i < 3; i++ {
			if t == "" || t[0] < '0' || t[0] > '7' {
				break
			}
			r = r*8 + int(t[0]) - '0'
			t = t[1:]
		}
		return r, t, nil

	// Hexadecimal escapes.
	case 'x':
		if t == "" {
			break
		}
		if c, t, err = nextRune(t); err != nil {
			return 0, "", err
		}
		if c == '{' {
			// Any number of digits in braces.
			// Perl accepts any text at all; it ignores all text
			// after the first non-hex digit.  We require only hex digits,
			// and at least one.
			nhex := 0
			r = 0
			for {
				if t == "" {
					// Ran out of input before the closing brace.
					break Switch
				}
				if c, t, err = nextRune(t); err != nil {
					return 0, "", err
				}
				if c == '}' {
					break
				}
				v := unhex(c)
				if v < 0 {
					break Switch
				}
				r = r*16 + v
				// Checking after every digit keeps r bounded.
				if r > unicode.MaxRune {
					break Switch
				}
				nhex++
			}
			if nhex == 0 {
				break Switch
			}
			return r, t, nil
		}

		// Easy case: two hex digits.
		// A missing or non-hex second character is caught by the
		// unhex checks below.
		x := unhex(c)
		if c, t, err = nextRune(t); err != nil {
			return 0, "", err
		}
		y := unhex(c)
		if x < 0 || y < 0 {
			break
		}
		return x*16 + y, t, nil

	// C escapes.  There is no case 'b', to avoid misparsing
	// the Perl word-boundary \b as the C backspace \b
	// when in POSIX mode.  In Perl, /\b/ means word-boundary
	// but /[\b]/ means backspace.  We don't support that.
	// If you want a backspace, embed a literal backspace
	// character or use \x08.
	// (err is nil in each of these returns: the nextRune call above
	// succeeded and nothing has assigned err since.)
	case 'a':
		return '\a', t, err
	case 'f':
		return '\f', t, err
	case 'n':
		return '\n', t, err
	case 'r':
		return '\r', t, err
	case 't':
		return '\t', t, err
	case 'v':
		return '\v', t, err
	}
	// Report the backslash plus everything consumed after it.
	return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
}
  1202. // parseClassChar parses a character class character at the beginning of s
  1203. // and returns it.
  1204. func (p *parser) parseClassChar(s, wholeClass string) (r int, rest string, err os.Error) {
  1205. if s == "" {
  1206. return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
  1207. }
  1208. // Allow regular escape sequences even though
  1209. // many need not be escaped in this context.
  1210. if s[0] == '\\' {
  1211. return p.parseEscape(s)
  1212. }
  1213. return nextRune(s)
  1214. }
// charGroup describes a predefined character class, as looked up in the
// perlGroup and posixGroup tables.
type charGroup struct {
	sign  int   // negative: the group is the negation of class; zero is treated as "no such group"
	class []int // ranges, stored as lo, hi pairs
}
  1219. // parsePerlClassEscape parses a leading Perl character class escape like \d
  1220. // from the beginning of s. If one is present, it appends the characters to r
  1221. // and returns the new slice r and the remainder of the string.
  1222. func (p *parser) parsePerlClassEscape(s string, r []int) (out []int, rest string) {
  1223. if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
  1224. return
  1225. }
  1226. g := perlGroup[s[0:2]]
  1227. if g.sign == 0 {
  1228. return
  1229. }
  1230. return p.appendGroup(r, g), s[2:]
  1231. }
  1232. // parseNamedClass parses a leading POSIX named character class like [:alnum:]
  1233. // from the beginning of s. If one is present, it appends the characters to r
  1234. // and returns the new slice r and the remainder of the string.
  1235. func (p *parser) parseNamedClass(s string, r []int) (out []int, rest string, err os.Error) {
  1236. if len(s) < 2 || s[0] != '[' || s[1] != ':' {
  1237. return
  1238. }
  1239. i := strings.Index(s[2:], ":]")
  1240. if i < 0 {
  1241. return
  1242. }
  1243. i += 2
  1244. name, s := s[0:i+2], s[i+2:]
  1245. g := posixGroup[name]
  1246. if g.sign == 0 {
  1247. return nil, "", &Error{ErrInvalidCharRange, name}
  1248. }
  1249. return p.appendGroup(r, g), s, nil
  1250. }
  1251. func (p *parser) appendGroup(r []int, g charGroup) []int {
  1252. if p.flags&FoldCase == 0 {
  1253. if g.sign < 0 {
  1254. r = appendNegatedClass(r, g.class)
  1255. } else {
  1256. r = appendClass(r, g.class)
  1257. }
  1258. } else {
  1259. tmp := p.tmpClass[:0]
  1260. tmp = appendFoldedClass(tmp, g.class)
  1261. p.tmpClass = tmp
  1262. tmp = cleanClass(&p.tmpClass)
  1263. if g.sign < 0 {
  1264. r = appendNegatedClass(r, tmp)
  1265. } else {
  1266. r = appendClass(r, tmp)
  1267. }
  1268. }
  1269. return r
  1270. }
  1271. var anyTable = &unicode.RangeTable{
  1272. []unicode.Range16{{0, 1<<16 - 1, 1}},
  1273. []unicode.Range32{{1 << 16, unicode.MaxRune, 1}},
  1274. }
  1275. // unicodeTable returns the unicode.RangeTable identified by name
  1276. // and the table of additional fold-equivalent code points.
  1277. func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
  1278. // Special case: "Any" means any.
  1279. if name == "Any" {
  1280. return anyTable, anyTable
  1281. }
  1282. if t := unicode.Categories[name]; t != nil {
  1283. return t, unicode.FoldCategory[name]
  1284. }
  1285. if t := unicode.Scripts[name]; t != nil {
  1286. return t, unicode.FoldScript[name]
  1287. }
  1288. return nil, nil
  1289. }
  1290. // parseUnicodeClass parses a leading Unicode character class like \p{Han}
  1291. // from the beginning of s. If one is present, it appends the characters to r
  1292. // and returns the new slice r and the remainder of the string.
  1293. func (p *parser) parseUnicodeClass(s string, r []int) (out []int, rest string, err os.Error) {
  1294. if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
  1295. return
  1296. }
  1297. // Committed to parse or return error.
  1298. sign := +1
  1299. if s[1] == 'P' {
  1300. sign = -1
  1301. }
  1302. t := s[2:]
  1303. c, t, err := nextRune(t)
  1304. if err != nil {
  1305. return
  1306. }
  1307. var seq, name string
  1308. if c != '{' {
  1309. // Single-letter name.
  1310. seq = s[:len(s)-len(t)]
  1311. name = seq[2:]
  1312. } else {
  1313. // Name is in braces.
  1314. end := strings.IndexRune(s, '}')
  1315. if end < 0 {
  1316. if err = checkUTF8(s); err != nil {
  1317. return
  1318. }
  1319. return nil, "", &Error{ErrInvalidCharRange, s}
  1320. }
  1321. seq, t = s[:end+1], s[end+1:]
  1322. name = s[3:end]
  1323. if err = checkUTF8(name); err != nil {
  1324. return
  1325. }
  1326. }
  1327. // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
  1328. if name != "" && name[0] == '^' {
  1329. sign = -sign
  1330. name = name[1:]
  1331. }
  1332. tab, fold := unicodeTable(name)
  1333. if tab == nil {
  1334. return nil, "", &Error{ErrInvalidCharRange, seq}
  1335. }
  1336. if p.flags&FoldCase == 0 || fold == nil {
  1337. if sign > 0 {
  1338. r = appendTable(r, tab)
  1339. } else {
  1340. r = appendNegatedTable(r, tab)
  1341. }
  1342. } else {
  1343. // Merge and clean tab and fold in a temporary buffer.
  1344. // This is necessary for the negative case and just tidy
  1345. // for the positive case.
  1346. tmp := p.tmpClass[:0]
  1347. tmp = appendTable(tmp, tab)
  1348. tmp = appendTable(tmp, fold)
  1349. p.tmpClass = tmp
  1350. tmp = cleanClass(&p.tmpClass)
  1351. if sign > 0 {
  1352. r = appendClass(r, tmp)
  1353. } else {
  1354. r = appendNegatedClass(r, tmp)
  1355. }
  1356. }
  1357. return r, t, nil
  1358. }
// parseClass parses a character class at the beginning of s
// and pushes it onto the parse stack.
//
// s[0] is skipped unconditionally, so the caller is expected to pass a
// string that starts with the opening bracket. On success it returns the
// text following the closing bracket. On failure it returns an empty
// string and the error produced by the failing sub-parser, or an
// ErrInvalidCharRange error for a misplaced - or a reversed range.
func (p *parser) parseClass(s string) (rest string, err os.Error) {
	t := s[1:] // chop [
	re := p.newRegexp(OpCharClass)
	re.Flags = p.flags
	re.Rune = re.Rune0[:0]

	sign := +1
	if t != "" && t[0] == '^' {
		sign = -1
		t = t[1:]

		// If character class does not match \n, add it here,
		// so that negation later will do the right thing.
		if p.flags&ClassNL == 0 {
			re.Rune = append(re.Rune, '\n', '\n')
		}
	}

	class := re.Rune
	first := true // ] and - are okay as first char in class
	// Loop until the closing ]. An empty t also enters the body so that
	// parseClassChar can report the missing bracket.
	for t == "" || t[0] != ']' || first {
		// POSIX: - is only okay unescaped as first or last in class.
		// Perl: - is okay anywhere.
		if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
			// Quote the - and the rune that follows it.
			_, size := utf8.DecodeRuneInString(t[1:])
			return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
		}
		first = false

		// Look for POSIX [:alnum:] etc.
		if len(t) > 2 && t[0] == '[' && t[1] == ':' {
			nclass, nt, err := p.parseNamedClass(t, class)
			if err != nil {
				return "", err
			}
			if nclass != nil {
				class, t = nclass, nt
				continue
			}
		}

		// Look for Unicode character group like \p{Han}.
		// Note: this declares an err local to the loop body; the
		// assignments to err below use it, and every error path
		// returns it explicitly.
		nclass, nt, err := p.parseUnicodeClass(t, class)
		if err != nil {
			return "", err
		}
		if nclass != nil {
			class, t = nclass, nt
			continue
		}

		// Look for Perl character class symbols (extension).
		if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
			class, t = nclass, nt
			continue
		}

		// Single character or simple range.
		rng := t // remembered so a bad range can be quoted in the error
		var lo, hi int
		if lo, t, err = p.parseClassChar(t, s); err != nil {
			return "", err
		}
		hi = lo
		// [a-] means (a|-) so check for final ].
		if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
			t = t[1:]
			if hi, t, err = p.parseClassChar(t, s); err != nil {
				return "", err
			}
			if hi < lo {
				rng = rng[:len(rng)-len(t)]
				return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
			}
		}
		if p.flags&FoldCase == 0 {
			class = appendRange(class, lo, hi)
		} else {
			class = appendFoldedRange(class, lo, hi)
		}
	}
	t = t[1:] // chop ]

	// Use &re.Rune instead of &class to avoid allocation.
	re.Rune = class
	class = cleanClass(&re.Rune)
	if sign < 0 {
		class = negateClass(class)
	}
	re.Rune = class
	p.push(re)
	return t, nil
}
  1446. // cleanClass sorts the ranges (pairs of elements of r),
  1447. // merges them, and eliminates duplicates.
  1448. func cleanClass(rp *[]int) []int {
  1449. // Sort by lo increasing, hi decreasing to break ties.
  1450. sort.Sort(ranges{rp})
  1451. r := *rp
  1452. if len(r) < 2 {
  1453. return r
  1454. }
  1455. // Merge abutting, overlapping.
  1456. w := 2 // write index
  1457. for i := 2; i < len(r); i += 2 {
  1458. lo, hi := r[i], r[i+1]
  1459. if lo <= r[w-1]+1 {
  1460. // merge with previous range
  1461. if hi > r[w-1] {
  1462. r[w-1] = hi
  1463. }
  1464. continue
  1465. }
  1466. // new disjoint range
  1467. r[w] = lo
  1468. r[w+1] = hi
  1469. w += 2
  1470. }
  1471. return r[:w]
  1472. }
  1473. // appendLiteral returns the result of appending the literal x to the class r.
  1474. func appendLiteral(r []int, x int, flags Flags) []int {
  1475. if flags&FoldCase != 0 {
  1476. return appendFoldedRange(r, x, x)
  1477. }
  1478. return appendRange(r, x, x)
  1479. }
  1480. // appendRange returns the result of appending the range lo-hi to the class r.
  1481. func appendRange(r []int, lo, hi int) []int {
  1482. // Expand last range or next to last range if it overlaps or abuts.
  1483. // Checking two ranges helps when appending case-folded
  1484. // alphabets, so that one range can be expanding A-Z and the
  1485. // other expanding a-z.
  1486. n := len(r)
  1487. for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
  1488. if n >= i {
  1489. rlo, rhi := r[n-i], r[n-i+1]
  1490. if lo <= rhi+1 && rlo <= hi+1 {
  1491. if lo < rlo {
  1492. r[n-i] = lo
  1493. }
  1494. if hi > rhi {
  1495. r[n-i+1] = hi
  1496. }
  1497. return r
  1498. }
  1499. }
  1500. }
  1501. return append(r, lo, hi)
  1502. }
const (
	// minFold and maxFold are the minimum and maximum runes involved in
	// folding; appendFoldedRange only tries to fold runes inside
	// [minFold, maxFold].
	// checked during test.
	minFold = 0x0041
	maxFold = 0x1044f
)
  1509. // appendFoldedRange returns the result of appending the range lo-hi
  1510. // and its case folding-equivalent runes to the class r.
  1511. func appendFoldedRange(r []int, lo, hi int) []int {
  1512. // Optimizations.
  1513. if lo <= minFold && hi >= maxFold {
  1514. // Range is full: folding can't add more.
  1515. return appendRange(r, lo, hi)
  1516. }
  1517. if hi < minFold || lo > maxFold {
  1518. // Range is outside folding possibilities.
  1519. return appendRange(r, lo, hi)
  1520. }
  1521. if lo < minFold {
  1522. // [lo, minFold-1] needs no folding.
  1523. r = appendRange(r, lo, minFold-1)
  1524. lo = minFold
  1525. }
  1526. if hi > maxFold {
  1527. // [maxFold+1, hi] needs no folding.
  1528. r = appendRange(r, maxFold+1, hi)
  1529. hi = maxFold
  1530. }
  1531. // Brute force. Depend on appendRange to coalesce ranges on the fly.
  1532. for c := lo; c <= hi; c++ {
  1533. r = appendRange(r, c, c)
  1534. f := unicode.SimpleFold(c)
  1535. for f != c {
  1536. r = appendRange(r, f, f)
  1537. f = unicode.SimpleFold(f)
  1538. }
  1539. }
  1540. return r
  1541. }
  1542. // appendClass returns the result of appending the class x to the class r.
  1543. // It assume x is clean.
  1544. func appendClass(r []int, x []int) []int {
  1545. for i := 0; i < len(x); i += 2 {
  1546. r = appendRange(r, x[i], x[i+1])
  1547. }
  1548. return r
  1549. }
  1550. // appendFolded returns the result of appending the case folding of the class x to the class r.
  1551. func appendFoldedClass(r []int, x []int) []int {
  1552. for i := 0; i < len(x); i += 2 {
  1553. r = appendFoldedRange(r, x[i], x[i+1])
  1554. }
  1555. return r
  1556. }
  1557. // appendNegatedClass returns the result of appending the negation of the class x to the class r.
  1558. // It assumes x is clean.
  1559. func appendNegatedClass(r []int, x []int) []int {
  1560. nextLo := 0
  1561. for i := 0; i < len(x); i += 2 {
  1562. lo, hi := x[i], x[i+1]
  1563. if nextLo <= lo-1 {
  1564. r = appendRange(r, nextLo, lo-1)
  1565. }
  1566. nextLo = hi + 1
  1567. }
  1568. if nextLo <= unicode.MaxRune {
  1569. r = appendRange(r, nextLo, unicode.MaxRune)
  1570. }
  1571. return r
  1572. }
  1573. // appendTable returns the result of appending x to the class r.
  1574. func appendTable(r []int, x *unicode.RangeTable) []int {
  1575. for _, xr := range x.R16 {
  1576. lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride)
  1577. if stride == 1 {
  1578. r = appendRange(r, lo, hi)
  1579. continue
  1580. }
  1581. for c := lo; c <= hi; c += stride {
  1582. r = appendRange(r, c, c)
  1583. }
  1584. }
  1585. for _, xr := range x.R32 {
  1586. lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride)
  1587. if stride == 1 {
  1588. r = appendRange(r, lo, hi)
  1589. continue
  1590. }
  1591. for c := lo; c <= hi; c += stride {
  1592. r = appendRange(r, c, c)
  1593. }
  1594. }
  1595. return r
  1596. }
  1597. // appendNegatedTable returns the result of appending the negation of x to the class r.
  1598. func appendNegatedTable(r []int, x *unicode.RangeTable) []int {
  1599. nextLo := 0 // lo end of next class to add
  1600. for _, xr := range x.R16 {
  1601. lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride)
  1602. if stride == 1 {
  1603. if nextLo <= lo-1 {
  1604. r = appendRange(r, nextLo, lo-1)
  1605. }
  1606. nextLo = hi + 1
  1607. continue
  1608. }
  1609. for c := lo; c <= hi; c += stride {
  1610. if nextLo <= c-1 {
  1611. r = appendRange(r, nextLo, c-1)
  1612. }
  1613. nextLo = c + 1
  1614. }
  1615. }
  1616. for _, xr := range x.R32 {
  1617. lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride)
  1618. if stride == 1 {
  1619. if nextLo <= lo-1 {
  1620. r = appendRange(r, nextLo, lo-1)
  1621. }
  1622. nextLo = hi + 1
  1623. continue
  1624. }
  1625. for c := lo; c <= hi; c += stride {
  1626. if nextLo <= c-1 {
  1627. r = appendRange(r, nextLo, c-1)
  1628. }
  1629. nextLo = c + 1
  1630. }
  1631. }
  1632. if nextLo <= unicode.MaxRune {
  1633. r = appendRange(r, nextLo, unicode.MaxRune)
  1634. }
  1635. return r
  1636. }
  1637. // negateClass overwrites r and returns r's negation.
  1638. // It assumes the class r is already clean.
  1639. func negateClass(r []int) []int {
  1640. nextLo := 0 // lo end of next class to add
  1641. w := 0 // write index
  1642. for i := 0; i < len(r); i += 2 {
  1643. lo, hi := r[i], r[i+1]
  1644. if nextLo <= lo-1 {
  1645. r[w] = nextLo
  1646. r[w+1] = lo - 1
  1647. w += 2
  1648. }
  1649. nextLo = hi + 1
  1650. }
  1651. r = r[:w]
  1652. if nextLo <= unicode.MaxRune {
  1653. // It's possible for the negation to have one more
  1654. // range - this one - than the original class, so use append.
  1655. r = append(r, nextLo, unicode.MaxRune)
  1656. }
  1657. return r
  1658. }
  1659. // ranges implements sort.Interface on a []rune.
  1660. // The choice of receiver type definition is strange
  1661. // but avoids an allocation since we already have
  1662. // a *[]int.
  1663. type ranges struct {
  1664. p *[]int
  1665. }
  1666. func (ra ranges) Less(i, j int) bool {
  1667. p := *ra.p
  1668. i *= 2
  1669. j *= 2
  1670. return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
  1671. }
  1672. func (ra ranges) Len() int {
  1673. return len(*ra.p) / 2
  1674. }
  1675. func (ra ranges) Swap(i, j int) {
  1676. p := *ra.p
  1677. i *= 2
  1678. j *= 2
  1679. p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
  1680. }
  1681. func checkUTF8(s string) os.Error {
  1682. for s != "" {
  1683. rune, size := utf8.DecodeRuneInString(s)
  1684. if rune == utf8.RuneError && size == 1 {
  1685. return &Error{Code: ErrInvalidUTF8, Expr: s}
  1686. }
  1687. s = s[size:]
  1688. }
  1689. return nil
  1690. }
  1691. func nextRune(s string) (c int, t string, err os.Error) {
  1692. c, size := utf8.DecodeRuneInString(s)
  1693. if c == utf8.RuneError && size == 1 {
  1694. return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
  1695. }
  1696. return c, s[size:], nil
  1697. }
  1698. func isalnum(c int) bool {
  1699. return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
  1700. }
  1701. func unhex(c int) int {
  1702. if '0' <= c && c <= '9' {
  1703. return c - '0'
  1704. }
  1705. if 'a' <= c && c <= 'f' {
  1706. return c - 'a' + 10
  1707. }
  1708. if 'A' <= c && c <= 'F' {
  1709. return c - 'A' + 10
  1710. }
  1711. return -1
  1712. }