PageRenderTime 30ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 1ms

/re2/regexp.cc

https://gitlab.com/Blueprint-Marketing/re2
C++ | 935 lines | 829 code | 60 blank | 46 comment | 105 complexity | 80b535d88fe0890ed0adaf192a72744a MD5 | raw file
  1. // Copyright 2006 The RE2 Authors. All Rights Reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Regular expression representation.
  5. // Tested by parse_test.cc
  6. #include "util/util.h"
  7. #include "re2/regexp.h"
  8. #include "re2/stringpiece.h"
  9. #include "re2/walker-inl.h"
  10. namespace re2 {
  11. // Constructor. Allocates vectors as appropriate for operator.
  12. Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
  13. : op_(static_cast<uint8>(op)),
  14. simple_(false),
  15. parse_flags_(static_cast<uint16>(parse_flags)),
  16. ref_(1),
  17. nsub_(0),
  18. down_(NULL) {
  19. subone_ = NULL;
  20. memset(the_union_, 0, sizeof the_union_);
  21. }
  22. // Destructor. Assumes already cleaned up children.
  23. // Private: use Decref() instead of delete to destroy Regexps.
  24. // Can't call Decref on the sub-Regexps here because
  25. // that could cause arbitrarily deep recursion, so
  26. // required Decref() to have handled them for us.
  27. Regexp::~Regexp() {
  28. if (nsub_ > 0)
  29. LOG(DFATAL) << "Regexp not destroyed.";
  30. switch (op_) {
  31. default:
  32. break;
  33. case kRegexpCapture:
  34. delete name_;
  35. break;
  36. case kRegexpLiteralString:
  37. delete[] runes_;
  38. break;
  39. case kRegexpCharClass:
  40. if (cc_)
  41. cc_->Delete();
  42. delete ccb_;
  43. break;
  44. }
  45. }
  46. // If it's possible to destroy this regexp without recurring,
  47. // do so and return true. Else return false.
  48. bool Regexp::QuickDestroy() {
  49. if (nsub_ == 0) {
  50. delete this;
  51. return true;
  52. }
  53. return false;
  54. }
  55. // Lazily allocated.
  56. static Mutex* ref_mutex;
  57. static map<Regexp*, int>* ref_map;
  58. int Regexp::Ref() {
  59. if (ref_ < kMaxRef)
  60. return ref_;
  61. MutexLock l(ref_mutex);
  62. return (*ref_map)[this];
  63. }
  64. // Increments reference count, returns object as convenience.
  65. Regexp* Regexp::Incref() {
  66. if (ref_ >= kMaxRef-1) {
  67. static std::once_flag ref_once;
  68. std::call_once(ref_once, []() {
  69. ref_mutex = new Mutex;
  70. ref_map = new map<Regexp*, int>;
  71. });
  72. // Store ref count in overflow map.
  73. MutexLock l(ref_mutex);
  74. if (ref_ == kMaxRef) {
  75. // already overflowed
  76. (*ref_map)[this]++;
  77. } else {
  78. // overflowing now
  79. (*ref_map)[this] = kMaxRef;
  80. ref_ = kMaxRef;
  81. }
  82. return this;
  83. }
  84. ref_++;
  85. return this;
  86. }
  87. // Decrements reference count and deletes this object if count reaches 0.
  88. void Regexp::Decref() {
  89. if (ref_ == kMaxRef) {
  90. // Ref count is stored in overflow map.
  91. MutexLock l(ref_mutex);
  92. int r = (*ref_map)[this] - 1;
  93. if (r < kMaxRef) {
  94. ref_ = static_cast<uint16>(r);
  95. ref_map->erase(this);
  96. } else {
  97. (*ref_map)[this] = r;
  98. }
  99. return;
  100. }
  101. ref_--;
  102. if (ref_ == 0)
  103. Destroy();
  104. }
  105. // Deletes this object; ref count has count reached 0.
  106. void Regexp::Destroy() {
  107. if (QuickDestroy())
  108. return;
  109. // Handle recursive Destroy with explicit stack
  110. // to avoid arbitrarily deep recursion on process stack [sigh].
  111. down_ = NULL;
  112. Regexp* stack = this;
  113. while (stack != NULL) {
  114. Regexp* re = stack;
  115. stack = re->down_;
  116. if (re->ref_ != 0)
  117. LOG(DFATAL) << "Bad reference count " << re->ref_;
  118. if (re->nsub_ > 0) {
  119. Regexp** subs = re->sub();
  120. for (int i = 0; i < re->nsub_; i++) {
  121. Regexp* sub = subs[i];
  122. if (sub == NULL)
  123. continue;
  124. if (sub->ref_ == kMaxRef)
  125. sub->Decref();
  126. else
  127. --sub->ref_;
  128. if (sub->ref_ == 0 && !sub->QuickDestroy()) {
  129. sub->down_ = stack;
  130. stack = sub;
  131. }
  132. }
  133. if (re->nsub_ > 1)
  134. delete[] subs;
  135. re->nsub_ = 0;
  136. }
  137. delete re;
  138. }
  139. }
  140. void Regexp::AddRuneToString(Rune r) {
  141. DCHECK(op_ == kRegexpLiteralString);
  142. if (nrunes_ == 0) {
  143. // start with 8
  144. runes_ = new Rune[8];
  145. } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
  146. // double on powers of two
  147. Rune *old = runes_;
  148. runes_ = new Rune[nrunes_ * 2];
  149. for (int i = 0; i < nrunes_; i++)
  150. runes_[i] = old[i];
  151. delete[] old;
  152. }
  153. runes_[nrunes_++] = r;
  154. }
  155. Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
  156. Regexp* re = new Regexp(kRegexpHaveMatch, flags);
  157. re->match_id_ = match_id;
  158. return re;
  159. }
  160. Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
  161. if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
  162. return sub;
  163. Regexp* re = new Regexp(kRegexpPlus, flags);
  164. re->AllocSub(1);
  165. re->sub()[0] = sub;
  166. return re;
  167. }
  168. Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
  169. if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
  170. return sub;
  171. Regexp* re = new Regexp(kRegexpStar, flags);
  172. re->AllocSub(1);
  173. re->sub()[0] = sub;
  174. return re;
  175. }
  176. Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
  177. if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
  178. return sub;
  179. Regexp* re = new Regexp(kRegexpQuest, flags);
  180. re->AllocSub(1);
  181. re->sub()[0] = sub;
  182. return re;
  183. }
  184. Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
  185. ParseFlags flags, bool can_factor) {
  186. if (nsub == 1)
  187. return sub[0];
  188. if (nsub == 0) {
  189. if (op == kRegexpAlternate)
  190. return new Regexp(kRegexpNoMatch, flags);
  191. else
  192. return new Regexp(kRegexpEmptyMatch, flags);
  193. }
  194. Regexp** subcopy = NULL;
  195. if (op == kRegexpAlternate && can_factor) {
  196. // Going to edit sub; make a copy so we don't step on caller.
  197. subcopy = new Regexp*[nsub];
  198. memmove(subcopy, sub, nsub * sizeof sub[0]);
  199. sub = subcopy;
  200. nsub = FactorAlternation(sub, nsub, flags);
  201. if (nsub == 1) {
  202. Regexp* re = sub[0];
  203. delete[] subcopy;
  204. return re;
  205. }
  206. }
  207. if (nsub > kMaxNsub) {
  208. // Too many subexpressions to fit in a single Regexp.
  209. // Make a two-level tree. Two levels gets us to 65535^2.
  210. int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
  211. Regexp* re = new Regexp(op, flags);
  212. re->AllocSub(nbigsub);
  213. Regexp** subs = re->sub();
  214. for (int i = 0; i < nbigsub - 1; i++)
  215. subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
  216. subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
  217. nsub - (nbigsub-1)*kMaxNsub, flags,
  218. false);
  219. delete[] subcopy;
  220. return re;
  221. }
  222. Regexp* re = new Regexp(op, flags);
  223. re->AllocSub(nsub);
  224. Regexp** subs = re->sub();
  225. for (int i = 0; i < nsub; i++)
  226. subs[i] = sub[i];
  227. delete[] subcopy;
  228. return re;
  229. }
  230. Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
  231. return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
  232. }
  233. Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
  234. return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
  235. }
  236. Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
  237. return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
  238. }
  239. Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
  240. Regexp* re = new Regexp(kRegexpCapture, flags);
  241. re->AllocSub(1);
  242. re->sub()[0] = sub;
  243. re->cap_ = cap;
  244. return re;
  245. }
  246. Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
  247. Regexp* re = new Regexp(kRegexpRepeat, flags);
  248. re->AllocSub(1);
  249. re->sub()[0] = sub;
  250. re->min_ = min;
  251. re->max_ = max;
  252. return re;
  253. }
  254. Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
  255. Regexp* re = new Regexp(kRegexpLiteral, flags);
  256. re->rune_ = rune;
  257. return re;
  258. }
  259. Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
  260. if (nrunes <= 0)
  261. return new Regexp(kRegexpEmptyMatch, flags);
  262. if (nrunes == 1)
  263. return NewLiteral(runes[0], flags);
  264. Regexp* re = new Regexp(kRegexpLiteralString, flags);
  265. for (int i = 0; i < nrunes; i++)
  266. re->AddRuneToString(runes[i]);
  267. return re;
  268. }
  269. Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
  270. Regexp* re = new Regexp(kRegexpCharClass, flags);
  271. re->cc_ = cc;
  272. return re;
  273. }
  274. // Swaps this and that in place.
  275. void Regexp::Swap(Regexp* that) {
  276. // Can use memmove because Regexp is just a struct (no vtable).
  277. char tmp[sizeof *this];
  278. memmove(tmp, this, sizeof tmp);
  279. memmove(this, that, sizeof tmp);
  280. memmove(that, tmp, sizeof tmp);
  281. }
  282. // Tests equality of all top-level structure but not subregexps.
  283. static bool TopEqual(Regexp* a, Regexp* b) {
  284. if (a->op() != b->op())
  285. return false;
  286. switch (a->op()) {
  287. case kRegexpNoMatch:
  288. case kRegexpEmptyMatch:
  289. case kRegexpAnyChar:
  290. case kRegexpAnyByte:
  291. case kRegexpBeginLine:
  292. case kRegexpEndLine:
  293. case kRegexpWordBoundary:
  294. case kRegexpNoWordBoundary:
  295. case kRegexpBeginText:
  296. return true;
  297. case kRegexpEndText:
  298. // The parse flags remember whether it's \z or (?-m:$),
  299. // which matters when testing against PCRE.
  300. return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
  301. case kRegexpLiteral:
  302. return a->rune() == b->rune() &&
  303. ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
  304. case kRegexpLiteralString:
  305. return a->nrunes() == b->nrunes() &&
  306. ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
  307. memcmp(a->runes(), b->runes(),
  308. a->nrunes() * sizeof a->runes()[0]) == 0;
  309. case kRegexpAlternate:
  310. case kRegexpConcat:
  311. return a->nsub() == b->nsub();
  312. case kRegexpStar:
  313. case kRegexpPlus:
  314. case kRegexpQuest:
  315. return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
  316. case kRegexpRepeat:
  317. return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
  318. a->min() == b->min() &&
  319. a->max() == b->max();
  320. case kRegexpCapture:
  321. return a->cap() == b->cap() && a->name() == b->name();
  322. case kRegexpHaveMatch:
  323. return a->match_id() == b->match_id();
  324. case kRegexpCharClass: {
  325. CharClass* acc = a->cc();
  326. CharClass* bcc = b->cc();
  327. return acc->size() == bcc->size() &&
  328. acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
  329. memcmp(acc->begin(), bcc->begin(),
  330. (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
  331. }
  332. }
  333. LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
  334. return 0;
  335. }
  336. bool Regexp::Equal(Regexp* a, Regexp* b) {
  337. if (a == NULL || b == NULL)
  338. return a == b;
  339. if (!TopEqual(a, b))
  340. return false;
  341. // Fast path:
  342. // return without allocating vector if there are no subregexps.
  343. switch (a->op()) {
  344. case kRegexpAlternate:
  345. case kRegexpConcat:
  346. case kRegexpStar:
  347. case kRegexpPlus:
  348. case kRegexpQuest:
  349. case kRegexpRepeat:
  350. case kRegexpCapture:
  351. break;
  352. default:
  353. return true;
  354. }
  355. // Committed to doing real work.
  356. // The stack (vector) has pairs of regexps waiting to
  357. // be compared. The regexps are only equal if
  358. // all the pairs end up being equal.
  359. vector<Regexp*> stk;
  360. for (;;) {
  361. // Invariant: TopEqual(a, b) == true.
  362. Regexp* a2;
  363. Regexp* b2;
  364. switch (a->op()) {
  365. default:
  366. break;
  367. case kRegexpAlternate:
  368. case kRegexpConcat:
  369. for (int i = 0; i < a->nsub(); i++) {
  370. a2 = a->sub()[i];
  371. b2 = b->sub()[i];
  372. if (!TopEqual(a2, b2))
  373. return false;
  374. stk.push_back(a2);
  375. stk.push_back(b2);
  376. }
  377. break;
  378. case kRegexpStar:
  379. case kRegexpPlus:
  380. case kRegexpQuest:
  381. case kRegexpRepeat:
  382. case kRegexpCapture:
  383. a2 = a->sub()[0];
  384. b2 = b->sub()[0];
  385. if (!TopEqual(a2, b2))
  386. return false;
  387. // Really:
  388. // stk.push_back(a2);
  389. // stk.push_back(b2);
  390. // break;
  391. // but faster to assign directly and loop.
  392. a = a2;
  393. b = b2;
  394. continue;
  395. }
  396. size_t n = stk.size();
  397. if (n == 0)
  398. break;
  399. DCHECK_GE(n, 2);
  400. a = stk[n-2];
  401. b = stk[n-1];
  402. stk.resize(n-2);
  403. }
  404. return true;
  405. }
  // Human-readable text for each status code, indexed by code value.
  // Keep in sync with enum RegexpStatusCode in regexp.h
  static const char* const kErrorStrings[] = {
    "no error",
    "unexpected error",
    "invalid escape sequence",
    "invalid character class",
    "invalid character class range",
    "missing ]",
    "missing )",
    "trailing \\",
    "no argument for repetition operator",
    "invalid repetition size",
    "bad repetition operator",
    "invalid perl operator",
    "invalid UTF-8",
    "invalid named capture group",
  };
  423. string RegexpStatus::CodeText(enum RegexpStatusCode code) {
  424. if (code < 0 || code >= arraysize(kErrorStrings))
  425. code = kRegexpInternalError;
  426. return kErrorStrings[code];
  427. }
  428. string RegexpStatus::Text() const {
  429. if (error_arg_.empty())
  430. return CodeText(code_);
  431. string s;
  432. s.append(CodeText(code_));
  433. s.append(": ");
  434. s.append(error_arg_.data(), error_arg_.size());
  435. return s;
  436. }
  437. void RegexpStatus::Copy(const RegexpStatus& status) {
  438. code_ = status.code_;
  439. error_arg_ = status.error_arg_;
  440. }
  441. typedef int Ignored; // Walker<void> doesn't exist
  442. // Walker subclass to count capturing parens in regexp.
  443. class NumCapturesWalker : public Regexp::Walker<Ignored> {
  444. public:
  445. NumCapturesWalker() : ncapture_(0) {}
  446. int ncapture() { return ncapture_; }
  447. virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
  448. if (re->op() == kRegexpCapture)
  449. ncapture_++;
  450. return ignored;
  451. }
  452. virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
  453. // Should never be called: we use Walk not WalkExponential.
  454. LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
  455. return ignored;
  456. }
  457. private:
  458. int ncapture_;
  459. DISALLOW_COPY_AND_ASSIGN(NumCapturesWalker);
  460. };
  461. int Regexp::NumCaptures() {
  462. NumCapturesWalker w;
  463. w.Walk(this, 0);
  464. return w.ncapture();
  465. }
  466. // Walker class to build map of named capture groups and their indices.
  467. class NamedCapturesWalker : public Regexp::Walker<Ignored> {
  468. public:
  469. NamedCapturesWalker() : map_(NULL) {}
  470. ~NamedCapturesWalker() { delete map_; }
  471. map<string, int>* TakeMap() {
  472. map<string, int>* m = map_;
  473. map_ = NULL;
  474. return m;
  475. }
  476. Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
  477. if (re->op() == kRegexpCapture && re->name() != NULL) {
  478. // Allocate map once we find a name.
  479. if (map_ == NULL)
  480. map_ = new map<string, int>;
  481. // Record first occurrence of each name.
  482. // (The rule is that if you have the same name
  483. // multiple times, only the leftmost one counts.)
  484. if (map_->find(*re->name()) == map_->end())
  485. (*map_)[*re->name()] = re->cap();
  486. }
  487. return ignored;
  488. }
  489. virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
  490. // Should never be called: we use Walk not WalkExponential.
  491. LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
  492. return ignored;
  493. }
  494. private:
  495. map<string, int>* map_;
  496. DISALLOW_COPY_AND_ASSIGN(NamedCapturesWalker);
  497. };
  498. map<string, int>* Regexp::NamedCaptures() {
  499. NamedCapturesWalker w;
  500. w.Walk(this, 0);
  501. return w.TakeMap();
  502. }
  503. // Walker class to build map from capture group indices to their names.
  504. class CaptureNamesWalker : public Regexp::Walker<Ignored> {
  505. public:
  506. CaptureNamesWalker() : map_(NULL) {}
  507. ~CaptureNamesWalker() { delete map_; }
  508. map<int, string>* TakeMap() {
  509. map<int, string>* m = map_;
  510. map_ = NULL;
  511. return m;
  512. }
  513. Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
  514. if (re->op() == kRegexpCapture && re->name() != NULL) {
  515. // Allocate map once we find a name.
  516. if (map_ == NULL)
  517. map_ = new map<int, string>;
  518. (*map_)[re->cap()] = *re->name();
  519. }
  520. return ignored;
  521. }
  522. virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
  523. // Should never be called: we use Walk not WalkExponential.
  524. LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
  525. return ignored;
  526. }
  527. private:
  528. map<int, string>* map_;
  529. DISALLOW_COPY_AND_ASSIGN(CaptureNamesWalker);
  530. };
  531. map<int, string>* Regexp::CaptureNames() {
  532. CaptureNamesWalker w;
  533. w.Walk(this, 0);
  534. return w.TakeMap();
  535. }
  536. // Determines whether regexp matches must be anchored
  537. // with a fixed string prefix. If so, returns the prefix and
  538. // the regexp that remains after the prefix. The prefix might
  539. // be ASCII case-insensitive.
  540. bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
  541. // No need for a walker: the regexp must be of the form
  542. // 1. some number of ^ anchors
  543. // 2. a literal char or string
  544. // 3. the rest
  545. prefix->clear();
  546. *foldcase = false;
  547. *suffix = NULL;
  548. if (op_ != kRegexpConcat)
  549. return false;
  550. // Some number of anchors, then a literal or concatenation.
  551. int i = 0;
  552. Regexp** sub = this->sub();
  553. while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
  554. i++;
  555. if (i == 0 || i >= nsub_)
  556. return false;
  557. Regexp* re = sub[i];
  558. switch (re->op_) {
  559. default:
  560. return false;
  561. case kRegexpLiteralString:
  562. // Convert to string in proper encoding.
  563. if (re->parse_flags() & Latin1) {
  564. prefix->resize(re->nrunes_);
  565. for (int j = 0; j < re->nrunes_; j++)
  566. (*prefix)[j] = static_cast<char>(re->runes_[j]);
  567. } else {
  568. // Convert to UTF-8 in place.
  569. // Assume worst-case space and then trim.
  570. prefix->resize(re->nrunes_ * UTFmax);
  571. char *p = &(*prefix)[0];
  572. for (int j = 0; j < re->nrunes_; j++) {
  573. Rune r = re->runes_[j];
  574. if (r < Runeself)
  575. *p++ = static_cast<char>(r);
  576. else
  577. p += runetochar(p, &r);
  578. }
  579. prefix->resize(p - &(*prefix)[0]);
  580. }
  581. break;
  582. case kRegexpLiteral:
  583. if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
  584. prefix->append(1, static_cast<char>(re->rune_));
  585. } else {
  586. char buf[UTFmax];
  587. prefix->append(buf, runetochar(buf, &re->rune_));
  588. }
  589. break;
  590. }
  591. *foldcase = (sub[i]->parse_flags() & FoldCase) != 0;
  592. i++;
  593. // The rest.
  594. if (i < nsub_) {
  595. for (int j = i; j < nsub_; j++)
  596. sub[j]->Incref();
  597. re = Concat(sub + i, nsub_ - i, parse_flags());
  598. } else {
  599. re = new Regexp(kRegexpEmptyMatch, parse_flags());
  600. }
  601. *suffix = re;
  602. return true;
  603. }
  604. // Character class builder is a balanced binary tree (STL set)
  605. // containing non-overlapping, non-abutting RuneRanges.
  606. // The less-than operator used in the tree treats two
  607. // ranges as equal if they overlap at all, so that
  608. // lookups for a particular Rune are possible.
  609. CharClassBuilder::CharClassBuilder() {
  610. nrunes_ = 0;
  611. upper_ = 0;
  612. lower_ = 0;
  613. }
  614. // Add lo-hi to the class; return whether class got bigger.
  615. bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
  616. if (hi < lo)
  617. return false;
  618. if (lo <= 'z' && hi >= 'A') {
  619. // Overlaps some alpha, maybe not all.
  620. // Update bitmaps telling which ASCII letters are in the set.
  621. Rune lo1 = max<Rune>(lo, 'A');
  622. Rune hi1 = min<Rune>(hi, 'Z');
  623. if (lo1 <= hi1)
  624. upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
  625. lo1 = max<Rune>(lo, 'a');
  626. hi1 = min<Rune>(hi, 'z');
  627. if (lo1 <= hi1)
  628. lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
  629. }
  630. { // Check whether lo, hi is already in the class.
  631. iterator it = ranges_.find(RuneRange(lo, lo));
  632. if (it != end() && it->lo <= lo && hi <= it->hi)
  633. return false;
  634. }
  635. // Look for a range abutting lo on the left.
  636. // If it exists, take it out and increase our range.
  637. if (lo > 0) {
  638. iterator it = ranges_.find(RuneRange(lo-1, lo-1));
  639. if (it != end()) {
  640. lo = it->lo;
  641. if (it->hi > hi)
  642. hi = it->hi;
  643. nrunes_ -= it->hi - it->lo + 1;
  644. ranges_.erase(it);
  645. }
  646. }
  647. // Look for a range abutting hi on the right.
  648. // If it exists, take it out and increase our range.
  649. if (hi < Runemax) {
  650. iterator it = ranges_.find(RuneRange(hi+1, hi+1));
  651. if (it != end()) {
  652. hi = it->hi;
  653. nrunes_ -= it->hi - it->lo + 1;
  654. ranges_.erase(it);
  655. }
  656. }
  657. // Look for ranges between lo and hi. Take them out.
  658. // This is only safe because the set has no overlapping ranges.
  659. // We've already removed any ranges abutting lo and hi, so
  660. // any that overlap [lo, hi] must be contained within it.
  661. for (;;) {
  662. iterator it = ranges_.find(RuneRange(lo, hi));
  663. if (it == end())
  664. break;
  665. nrunes_ -= it->hi - it->lo + 1;
  666. ranges_.erase(it);
  667. }
  668. // Finally, add [lo, hi].
  669. nrunes_ += hi - lo + 1;
  670. ranges_.insert(RuneRange(lo, hi));
  671. return true;
  672. }
  673. void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
  674. for (iterator it = cc->begin(); it != cc->end(); ++it)
  675. AddRange(it->lo, it->hi);
  676. }
  677. bool CharClassBuilder::Contains(Rune r) {
  678. return ranges_.find(RuneRange(r, r)) != end();
  679. }
  680. // Does the character class behave the same on A-Z as on a-z?
  681. bool CharClassBuilder::FoldsASCII() {
  682. return ((upper_ ^ lower_) & AlphaMask) == 0;
  683. }
  684. CharClassBuilder* CharClassBuilder::Copy() {
  685. CharClassBuilder* cc = new CharClassBuilder;
  686. for (iterator it = begin(); it != end(); ++it)
  687. cc->ranges_.insert(RuneRange(it->lo, it->hi));
  688. cc->upper_ = upper_;
  689. cc->lower_ = lower_;
  690. cc->nrunes_ = nrunes_;
  691. return cc;
  692. }
  693. void CharClassBuilder::RemoveAbove(Rune r) {
  694. if (r >= Runemax)
  695. return;
  696. if (r < 'z') {
  697. if (r < 'a')
  698. lower_ = 0;
  699. else
  700. lower_ &= AlphaMask >> ('z' - r);
  701. }
  702. if (r < 'Z') {
  703. if (r < 'A')
  704. upper_ = 0;
  705. else
  706. upper_ &= AlphaMask >> ('Z' - r);
  707. }
  708. for (;;) {
  709. iterator it = ranges_.find(RuneRange(r + 1, Runemax));
  710. if (it == end())
  711. break;
  712. RuneRange rr = *it;
  713. ranges_.erase(it);
  714. nrunes_ -= rr.hi - rr.lo + 1;
  715. if (rr.lo <= r) {
  716. rr.hi = r;
  717. ranges_.insert(rr);
  718. nrunes_ += rr.hi - rr.lo + 1;
  719. }
  720. }
  721. }
  722. void CharClassBuilder::Negate() {
  723. // Build up negation and then copy in.
  724. // Could edit ranges in place, but C++ won't let me.
  725. vector<RuneRange> v;
  726. v.reserve(ranges_.size() + 1);
  727. // In negation, first range begins at 0, unless
  728. // the current class begins at 0.
  729. iterator it = begin();
  730. if (it == end()) {
  731. v.push_back(RuneRange(0, Runemax));
  732. } else {
  733. int nextlo = 0;
  734. if (it->lo == 0) {
  735. nextlo = it->hi + 1;
  736. ++it;
  737. }
  738. for (; it != end(); ++it) {
  739. v.push_back(RuneRange(nextlo, it->lo - 1));
  740. nextlo = it->hi + 1;
  741. }
  742. if (nextlo <= Runemax)
  743. v.push_back(RuneRange(nextlo, Runemax));
  744. }
  745. ranges_.clear();
  746. for (size_t i = 0; i < v.size(); i++)
  747. ranges_.insert(v[i]);
  748. upper_ = AlphaMask & ~upper_;
  749. lower_ = AlphaMask & ~lower_;
  750. nrunes_ = Runemax+1 - nrunes_;
  751. }
  752. // Character class is a sorted list of ranges.
  753. // The ranges are allocated in the same block as the header,
  754. // necessitating a special allocator and Delete method.
  755. CharClass* CharClass::New(int maxranges) {
  756. CharClass* cc;
  757. uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
  758. cc = reinterpret_cast<CharClass*>(data);
  759. cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
  760. cc->nranges_ = 0;
  761. cc->folds_ascii_ = false;
  762. cc->nrunes_ = 0;
  763. return cc;
  764. }
  765. void CharClass::Delete() {
  766. uint8 *data = reinterpret_cast<uint8*>(this);
  767. delete[] data;
  768. }
  769. CharClass* CharClass::Negate() {
  770. CharClass* cc = CharClass::New(nranges_+1);
  771. cc->folds_ascii_ = folds_ascii_;
  772. cc->nrunes_ = Runemax + 1 - nrunes_;
  773. int n = 0;
  774. int nextlo = 0;
  775. for (CharClass::iterator it = begin(); it != end(); ++it) {
  776. if (it->lo == nextlo) {
  777. nextlo = it->hi + 1;
  778. } else {
  779. cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
  780. nextlo = it->hi + 1;
  781. }
  782. }
  783. if (nextlo <= Runemax)
  784. cc->ranges_[n++] = RuneRange(nextlo, Runemax);
  785. cc->nranges_ = n;
  786. return cc;
  787. }
  788. bool CharClass::Contains(Rune r) {
  789. RuneRange* rr = ranges_;
  790. int n = nranges_;
  791. while (n > 0) {
  792. int m = n/2;
  793. if (rr[m].hi < r) {
  794. rr += m+1;
  795. n -= m+1;
  796. } else if (r < rr[m].lo) {
  797. n = m;
  798. } else { // rr[m].lo <= r && r <= rr[m].hi
  799. return true;
  800. }
  801. }
  802. return false;
  803. }
  804. CharClass* CharClassBuilder::GetCharClass() {
  805. CharClass* cc = CharClass::New(static_cast<int>(ranges_.size()));
  806. int n = 0;
  807. for (iterator it = begin(); it != end(); ++it)
  808. cc->ranges_[n++] = *it;
  809. cc->nranges_ = n;
  810. DCHECK_LE(n, static_cast<int>(ranges_.size()));
  811. cc->nrunes_ = nrunes_;
  812. cc->folds_ascii_ = FoldsASCII();
  813. return cc;
  814. }
  815. } // namespace re2