/src/3rdparty/javascriptcore/JavaScriptCore/wrec/WRECParser.cpp

https://bitbucket.org/ultra_iter/qt-vtl · C++ · 643 lines · 481 code · 114 blank · 48 comment · 75 complexity · 286894955f1bdb58208d16092a2fff24 MD5 · raw file

  1. /*
  2. * Copyright (C) 2008 Apple Inc. All rights reserved.
  3. *
  4. * Redistribution and use in source and binary forms, with or without
  5. * modification, are permitted provided that the following conditions
  6. * are met:
  7. * 1. Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * 2. Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in the
  11. * documentation and/or other materials provided with the distribution.
  12. *
  13. * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  14. * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  16. * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
  17. * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  18. * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  19. * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  20. * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  21. * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24. */
  25. #include "config.h"
  26. #include "WRECParser.h"
  27. #if ENABLE(WREC)
  28. #include "CharacterClassConstructor.h"
  29. #include "WRECFunctors.h"
  30. using namespace WTF;
  31. namespace JSC { namespace WREC {
  32. // These error messages match the error messages used by PCRE.
  33. const char* Parser::QuantifierOutOfOrder = "numbers out of order in {} quantifier";
  34. const char* Parser::QuantifierWithoutAtom = "nothing to repeat";
  35. const char* Parser::ParenthesesUnmatched = "unmatched parentheses";
  36. const char* Parser::ParenthesesTypeInvalid = "unrecognized character after (?";
  37. const char* Parser::ParenthesesNotSupported = ""; // Not a user-visible syntax error -- just signals a syntax that WREC doesn't support yet.
  38. const char* Parser::CharacterClassUnmatched = "missing terminating ] for character class";
  39. const char* Parser::CharacterClassOutOfOrder = "range out of order in character class";
  40. const char* Parser::EscapeUnterminated = "\\ at end of pattern";
  41. class PatternCharacterSequence {
  42. typedef Generator::JumpList JumpList;
  43. public:
  44. PatternCharacterSequence(Generator& generator, JumpList& failures)
  45. : m_generator(generator)
  46. , m_failures(failures)
  47. {
  48. }
  49. size_t size() { return m_sequence.size(); }
  50. void append(int ch)
  51. {
  52. m_sequence.append(ch);
  53. }
  54. void flush()
  55. {
  56. if (!m_sequence.size())
  57. return;
  58. m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size());
  59. m_sequence.clear();
  60. }
  61. void flush(const Quantifier& quantifier)
  62. {
  63. if (!m_sequence.size())
  64. return;
  65. m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size() - 1);
  66. switch (quantifier.type) {
  67. case Quantifier::None:
  68. case Quantifier::Error:
  69. ASSERT_NOT_REACHED();
  70. break;
  71. case Quantifier::Greedy: {
  72. GeneratePatternCharacterFunctor functor(m_sequence.last());
  73. m_generator.generateGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max);
  74. break;
  75. }
  76. case Quantifier::NonGreedy: {
  77. GeneratePatternCharacterFunctor functor(m_sequence.last());
  78. m_generator.generateNonGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max);
  79. break;
  80. }
  81. }
  82. m_sequence.clear();
  83. }
  84. private:
  85. Generator& m_generator;
  86. JumpList& m_failures;
  87. Vector<int, 8> m_sequence;
  88. };
  89. ALWAYS_INLINE Quantifier Parser::consumeGreedyQuantifier()
  90. {
  91. switch (peek()) {
  92. case '?':
  93. consume();
  94. return Quantifier(Quantifier::Greedy, 0, 1);
  95. case '*':
  96. consume();
  97. return Quantifier(Quantifier::Greedy, 0);
  98. case '+':
  99. consume();
  100. return Quantifier(Quantifier::Greedy, 1);
  101. case '{': {
  102. SavedState state(*this);
  103. consume();
  104. // Accept: {n}, {n,}, {n,m}.
  105. // Reject: {n,m} where n > m.
  106. // Ignore: Anything else, such as {n, m}.
  107. if (!peekIsDigit()) {
  108. state.restore();
  109. return Quantifier();
  110. }
  111. unsigned min = consumeNumber();
  112. unsigned max = min;
  113. if (peek() == ',') {
  114. consume();
  115. max = peekIsDigit() ? consumeNumber() : Quantifier::Infinity;
  116. }
  117. if (peek() != '}') {
  118. state.restore();
  119. return Quantifier();
  120. }
  121. consume();
  122. if (min > max) {
  123. setError(QuantifierOutOfOrder);
  124. return Quantifier(Quantifier::Error);
  125. }
  126. return Quantifier(Quantifier::Greedy, min, max);
  127. }
  128. default:
  129. return Quantifier(); // No quantifier.
  130. }
  131. }
  132. Quantifier Parser::consumeQuantifier()
  133. {
  134. Quantifier q = consumeGreedyQuantifier();
  135. if ((q.type == Quantifier::Greedy) && (peek() == '?')) {
  136. consume();
  137. q.type = Quantifier::NonGreedy;
  138. }
  139. return q;
  140. }
  141. bool Parser::parseCharacterClassQuantifier(JumpList& failures, const CharacterClass& charClass, bool invert)
  142. {
  143. Quantifier q = consumeQuantifier();
  144. switch (q.type) {
  145. case Quantifier::None: {
  146. m_generator.generateCharacterClass(failures, charClass, invert);
  147. break;
  148. }
  149. case Quantifier::Greedy: {
  150. GenerateCharacterClassFunctor functor(&charClass, invert);
  151. m_generator.generateGreedyQuantifier(failures, functor, q.min, q.max);
  152. break;
  153. }
  154. case Quantifier::NonGreedy: {
  155. GenerateCharacterClassFunctor functor(&charClass, invert);
  156. m_generator.generateNonGreedyQuantifier(failures, functor, q.min, q.max);
  157. break;
  158. }
  159. case Quantifier::Error:
  160. return false;
  161. }
  162. return true;
  163. }
  164. bool Parser::parseBackreferenceQuantifier(JumpList& failures, unsigned subpatternId)
  165. {
  166. Quantifier q = consumeQuantifier();
  167. switch (q.type) {
  168. case Quantifier::None: {
  169. m_generator.generateBackreference(failures, subpatternId);
  170. break;
  171. }
  172. case Quantifier::Greedy:
  173. case Quantifier::NonGreedy:
  174. m_generator.generateBackreferenceQuantifier(failures, q.type, subpatternId, q.min, q.max);
  175. return true;
  176. case Quantifier::Error:
  177. return false;
  178. }
  179. return true;
  180. }
  181. bool Parser::parseParentheses(JumpList& failures)
  182. {
  183. ParenthesesType type = consumeParenthesesType();
  184. // FIXME: WREC originally failed to backtrack correctly in cases such as
  185. // "c".match(/(.*)c/). Now, most parentheses handling is disabled. For
  186. // unsupported parentheses, we fall back on PCRE.
  187. switch (type) {
  188. case Generator::Assertion: {
  189. m_generator.generateParenthesesAssertion(failures);
  190. if (consume() != ')') {
  191. setError(ParenthesesUnmatched);
  192. return false;
  193. }
  194. Quantifier quantifier = consumeQuantifier();
  195. if (quantifier.type != Quantifier::None && quantifier.min == 0) {
  196. setError(ParenthesesNotSupported);
  197. return false;
  198. }
  199. return true;
  200. }
  201. case Generator::InvertedAssertion: {
  202. m_generator.generateParenthesesInvertedAssertion(failures);
  203. if (consume() != ')') {
  204. setError(ParenthesesUnmatched);
  205. return false;
  206. }
  207. Quantifier quantifier = consumeQuantifier();
  208. if (quantifier.type != Quantifier::None && quantifier.min == 0) {
  209. setError(ParenthesesNotSupported);
  210. return false;
  211. }
  212. return true;
  213. }
  214. default:
  215. setError(ParenthesesNotSupported);
  216. return false;
  217. }
  218. }
  219. bool Parser::parseCharacterClass(JumpList& failures)
  220. {
  221. bool invert = false;
  222. if (peek() == '^') {
  223. consume();
  224. invert = true;
  225. }
  226. CharacterClassConstructor constructor(m_ignoreCase);
  227. int ch;
  228. while ((ch = peek()) != ']') {
  229. switch (ch) {
  230. case EndOfPattern:
  231. setError(CharacterClassUnmatched);
  232. return false;
  233. case '\\': {
  234. consume();
  235. Escape escape = consumeEscape(true);
  236. switch (escape.type()) {
  237. case Escape::PatternCharacter: {
  238. int character = PatternCharacterEscape::cast(escape).character();
  239. if (character == '-')
  240. constructor.flushBeforeEscapedHyphen();
  241. constructor.put(character);
  242. break;
  243. }
  244. case Escape::CharacterClass: {
  245. const CharacterClassEscape& characterClassEscape = CharacterClassEscape::cast(escape);
  246. ASSERT(!characterClassEscape.invert());
  247. constructor.append(characterClassEscape.characterClass());
  248. break;
  249. }
  250. case Escape::Error:
  251. return false;
  252. case Escape::Backreference:
  253. case Escape::WordBoundaryAssertion: {
  254. ASSERT_NOT_REACHED();
  255. break;
  256. }
  257. }
  258. break;
  259. }
  260. default:
  261. consume();
  262. constructor.put(ch);
  263. }
  264. }
  265. consume();
  266. // lazily catch reversed ranges ([z-a])in character classes
  267. if (constructor.isUpsideDown()) {
  268. setError(CharacterClassOutOfOrder);
  269. return false;
  270. }
  271. constructor.flush();
  272. CharacterClass charClass = constructor.charClass();
  273. return parseCharacterClassQuantifier(failures, charClass, invert);
  274. }
  275. bool Parser::parseNonCharacterEscape(JumpList& failures, const Escape& escape)
  276. {
  277. switch (escape.type()) {
  278. case Escape::PatternCharacter:
  279. ASSERT_NOT_REACHED();
  280. return false;
  281. case Escape::CharacterClass:
  282. return parseCharacterClassQuantifier(failures, CharacterClassEscape::cast(escape).characterClass(), CharacterClassEscape::cast(escape).invert());
  283. case Escape::Backreference:
  284. return parseBackreferenceQuantifier(failures, BackreferenceEscape::cast(escape).subpatternId());
  285. case Escape::WordBoundaryAssertion:
  286. m_generator.generateAssertionWordBoundary(failures, WordBoundaryAssertionEscape::cast(escape).invert());
  287. return true;
  288. case Escape::Error:
  289. return false;
  290. }
  291. ASSERT_NOT_REACHED();
  292. return false;
  293. }
  294. Escape Parser::consumeEscape(bool inCharacterClass)
  295. {
  296. switch (peek()) {
  297. case EndOfPattern:
  298. setError(EscapeUnterminated);
  299. return Escape(Escape::Error);
  300. // Assertions
  301. case 'b':
  302. consume();
  303. if (inCharacterClass)
  304. return PatternCharacterEscape('\b');
  305. return WordBoundaryAssertionEscape(false); // do not invert
  306. case 'B':
  307. consume();
  308. if (inCharacterClass)
  309. return PatternCharacterEscape('B');
  310. return WordBoundaryAssertionEscape(true); // invert
  311. // CharacterClassEscape
  312. case 'd':
  313. consume();
  314. return CharacterClassEscape(CharacterClass::digits(), false);
  315. case 's':
  316. consume();
  317. return CharacterClassEscape(CharacterClass::spaces(), false);
  318. case 'w':
  319. consume();
  320. return CharacterClassEscape(CharacterClass::wordchar(), false);
  321. case 'D':
  322. consume();
  323. return inCharacterClass
  324. ? CharacterClassEscape(CharacterClass::nondigits(), false)
  325. : CharacterClassEscape(CharacterClass::digits(), true);
  326. case 'S':
  327. consume();
  328. return inCharacterClass
  329. ? CharacterClassEscape(CharacterClass::nonspaces(), false)
  330. : CharacterClassEscape(CharacterClass::spaces(), true);
  331. case 'W':
  332. consume();
  333. return inCharacterClass
  334. ? CharacterClassEscape(CharacterClass::nonwordchar(), false)
  335. : CharacterClassEscape(CharacterClass::wordchar(), true);
  336. // DecimalEscape
  337. case '1':
  338. case '2':
  339. case '3':
  340. case '4':
  341. case '5':
  342. case '6':
  343. case '7':
  344. case '8':
  345. case '9': {
  346. if (peekDigit() > m_numSubpatterns || inCharacterClass) {
  347. // To match Firefox, we parse an invalid backreference in the range [1-7]
  348. // as an octal escape.
  349. return peekDigit() > 7 ? PatternCharacterEscape('\\') : PatternCharacterEscape(consumeOctal());
  350. }
  351. int value = 0;
  352. do {
  353. unsigned newValue = value * 10 + peekDigit();
  354. if (newValue > m_numSubpatterns)
  355. break;
  356. value = newValue;
  357. consume();
  358. } while (peekIsDigit());
  359. return BackreferenceEscape(value);
  360. }
  361. // Octal escape
  362. case '0':
  363. consume();
  364. return PatternCharacterEscape(consumeOctal());
  365. // ControlEscape
  366. case 'f':
  367. consume();
  368. return PatternCharacterEscape('\f');
  369. case 'n':
  370. consume();
  371. return PatternCharacterEscape('\n');
  372. case 'r':
  373. consume();
  374. return PatternCharacterEscape('\r');
  375. case 't':
  376. consume();
  377. return PatternCharacterEscape('\t');
  378. case 'v':
  379. consume();
  380. return PatternCharacterEscape('\v');
  381. // ControlLetter
  382. case 'c': {
  383. SavedState state(*this);
  384. consume();
  385. int control = consume();
  386. // To match Firefox, inside a character class, we also accept numbers
  387. // and '_' as control characters.
  388. if ((!inCharacterClass && !isASCIIAlpha(control)) || (!isASCIIAlphanumeric(control) && control != '_')) {
  389. state.restore();
  390. return PatternCharacterEscape('\\');
  391. }
  392. return PatternCharacterEscape(control & 31);
  393. }
  394. // HexEscape
  395. case 'x': {
  396. consume();
  397. SavedState state(*this);
  398. int x = consumeHex(2);
  399. if (x == -1) {
  400. state.restore();
  401. return PatternCharacterEscape('x');
  402. }
  403. return PatternCharacterEscape(x);
  404. }
  405. // UnicodeEscape
  406. case 'u': {
  407. consume();
  408. SavedState state(*this);
  409. int x = consumeHex(4);
  410. if (x == -1) {
  411. state.restore();
  412. return PatternCharacterEscape('u');
  413. }
  414. return PatternCharacterEscape(x);
  415. }
  416. // IdentityEscape
  417. default:
  418. return PatternCharacterEscape(consume());
  419. }
  420. }
  421. void Parser::parseAlternative(JumpList& failures)
  422. {
  423. PatternCharacterSequence sequence(m_generator, failures);
  424. while (1) {
  425. switch (peek()) {
  426. case EndOfPattern:
  427. case '|':
  428. case ')':
  429. sequence.flush();
  430. return;
  431. case '*':
  432. case '+':
  433. case '?':
  434. case '{': {
  435. Quantifier q = consumeQuantifier();
  436. if (q.type == Quantifier::None) {
  437. sequence.append(consume());
  438. continue;
  439. }
  440. if (q.type == Quantifier::Error)
  441. return;
  442. if (!sequence.size()) {
  443. setError(QuantifierWithoutAtom);
  444. return;
  445. }
  446. sequence.flush(q);
  447. continue;
  448. }
  449. case '^':
  450. consume();
  451. sequence.flush();
  452. m_generator.generateAssertionBOL(failures);
  453. continue;
  454. case '$':
  455. consume();
  456. sequence.flush();
  457. m_generator.generateAssertionEOL(failures);
  458. continue;
  459. case '.':
  460. consume();
  461. sequence.flush();
  462. if (!parseCharacterClassQuantifier(failures, CharacterClass::newline(), true))
  463. return;
  464. continue;
  465. case '[':
  466. consume();
  467. sequence.flush();
  468. if (!parseCharacterClass(failures))
  469. return;
  470. continue;
  471. case '(':
  472. consume();
  473. sequence.flush();
  474. if (!parseParentheses(failures))
  475. return;
  476. continue;
  477. case '\\': {
  478. consume();
  479. Escape escape = consumeEscape(false);
  480. if (escape.type() == Escape::PatternCharacter) {
  481. sequence.append(PatternCharacterEscape::cast(escape).character());
  482. continue;
  483. }
  484. sequence.flush();
  485. if (!parseNonCharacterEscape(failures, escape))
  486. return;
  487. continue;
  488. }
  489. default:
  490. sequence.append(consume());
  491. continue;
  492. }
  493. }
  494. }
  495. /*
  496. TOS holds index.
  497. */
  498. void Parser::parseDisjunction(JumpList& failures)
  499. {
  500. parseAlternative(failures);
  501. if (peek() != '|')
  502. return;
  503. JumpList successes;
  504. do {
  505. consume();
  506. m_generator.terminateAlternative(successes, failures);
  507. parseAlternative(failures);
  508. } while (peek() == '|');
  509. m_generator.terminateDisjunction(successes);
  510. }
  511. Generator::ParenthesesType Parser::consumeParenthesesType()
  512. {
  513. if (peek() != '?')
  514. return Generator::Capturing;
  515. consume();
  516. switch (consume()) {
  517. case ':':
  518. return Generator::NonCapturing;
  519. case '=':
  520. return Generator::Assertion;
  521. case '!':
  522. return Generator::InvertedAssertion;
  523. default:
  524. setError(ParenthesesTypeInvalid);
  525. return Generator::Error;
  526. }
  527. }
  528. } } // namespace JSC::WREC
  529. #endif // ENABLE(WREC)