/src/Symfony/Component/CssSelector/Tokenizer.php

https://github.com/sebio/symfony · PHP · 168 lines · 121 code · 23 blank · 24 comment · 25 complexity · fc1294bd98f3c0d6933beded55b2aa20 MD5 · raw file

  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien.potencier@symfony-project.com>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Component\CssSelector;
  /**
   * Tokenizer lexes a CSS Selector to tokens.
   *
   * This component is a port of the Python lxml library,
   * which is copyright Infrae and distributed under the BSD license.
   *
   * @author Fabien Potencier <fabien.potencier@symfony-project.com>
   */
  class Tokenizer
  {
      /**
       * Splits a CSS selector into a flat list of tokens.
       *
       * When mbstring overloading of the string functions is active
       * (bit 2 of mbstring.func_overload), the internal encoding is switched
       * to ASCII for the duration of the call so that strlen()/substr()
       * count bytes. The previous encoding is restored before returning,
       * including when tokenization fails with an exception.
       *
       * @param string $s The CSS selector to tokenize
       *
       * @return array An array of Token instances, in source order
       *
       * @throws SyntaxError When a string is not closed or an unexpected symbol is found
       */
      public function tokenize($s)
      {
          $mbEncoding = null;
          if (function_exists('mb_internal_encoding') && ((int) ini_get('mbstring.func_overload')) & 2) {
              $mbEncoding = mb_internal_encoding();
              mb_internal_encoding('ASCII');
          }

          try {
              $tokens = $this->doTokenize($s);
          } catch (\Exception $e) {
              // catch + rethrow instead of "finally" to stay compatible with PHP 5.3
              if (null !== $mbEncoding) {
                  mb_internal_encoding($mbEncoding);
              }

              throw $e;
          }

          if (null !== $mbEncoding) {
              mb_internal_encoding($mbEncoding);
          }

          return $tokens;
      }

      /**
       * Runs the actual lexing loop; tokenize() wraps it to manage the mbstring encoding.
       *
       * @param string $s The CSS selector to tokenize
       *
       * @return array An array of Token instances
       *
       * @throws SyntaxError When a string is not closed or an unexpected symbol is found
       */
      private function doTokenize($s)
      {
          $tokens = array();
          $pos = 0;

          // strip /* ... */ comments before lexing
          $s = preg_replace('#/\*.*?\*/#s', '', $s);

          while (true) {
              // 0 doubles as "no whitespace seen": leading whitespace at offset 0
              // therefore never produces a ' ' token below
              $whitespacePos = 0;
              if (preg_match('#\s+#A', $s, $match, 0, $pos)) {
                  $whitespacePos = $pos;
                  $pos += strlen($match[0]);
              }

              if ($pos >= strlen($s)) {
                  return $tokens;
              }

              // "an+b" style expressions (2n+1, -n+3, ...); a lone "n" is left
              // to the generic symbol handling at the bottom of the loop
              if (preg_match('#[+-]?\d*n(?:[+-]\d+)?#A', $s, $match, 0, $pos) && 'n' !== $match[0]) {
                  $tokens[] = new Token('Symbol', $match[0], $pos);
                  $pos += strlen($match[0]);

                  continue;
              }

              $c = $s[$pos];
              $c2 = substr($s, $pos, 2);

              // two-character operators take precedence over single-character ones
              if (in_array($c2, array('~=', '|=', '^=', '$=', '*=', '::', '!='), true)) {
                  $tokens[] = new Token('Token', $c2, $pos);
                  $pos += 2;

                  continue;
              }

              if (in_array($c, array('>', '+', '~', ',', '.', '*', '=', '[', ']', '(', ')', '|', ':', '#'), true)) {
                  // whitespace right before ".", "#" or "[" is recorded as an explicit ' ' token
                  if ($whitespacePos > 0 && in_array($c, array('.', '#', '['), true)) {
                      $tokens[] = new Token('Token', ' ', $whitespacePos);
                  }
                  $tokens[] = new Token('Token', $c, $pos);
                  ++$pos;

                  continue;
              }

              if ('"' === $c || "'" === $c) {
                  // Quoted string
                  $start = $pos;
                  list($sym, $pos) = $this->tokenizeEscapedString($s, $pos);
                  $tokens[] = new Token('String', $sym, $start);

                  continue;
              }

              // anything else must be a symbol (identifier-like run of characters)
              $start = $pos;
              list($sym, $pos) = $this->tokenizeSymbol($s, $pos);
              $tokens[] = new Token('Symbol', $sym, $start);
          }
      }

      /**
       * Reads a quoted string starting at the given position.
       *
       * A quote is treated as escaped only when it is preceded by an odd number
       * of backslashes, so a string ending in an escaped backslash ("a\\") is
       * closed correctly, and an empty string ("") is handled without reading
       * an out-of-range offset.
       *
       * @param string  $s   The selector being tokenized
       * @param integer $pos The position of the opening quote in $s
       *
       * @return array The unescaped string content and the position right after the closing quote
       *
       * @throws SyntaxError When expected closing is not found
       */
      protected function tokenizeEscapedString($s, $pos)
      {
          $quote = $s[$pos];
          $start = $pos + 1;
          $pos = $start;

          while (true) {
              $next = strpos($s, $quote, $pos);
              if (false === $next) {
                  throw new SyntaxError(sprintf('Expected closing %s for string in: %s', $quote, substr($s, $start)));
              }

              // count the backslashes immediately preceding the candidate quote
              $backslashes = 0;
              for ($i = $next - 1; $i >= $start && '\\' === $s[$i]; --$i) {
                  ++$backslashes;
              }

              if (1 === $backslashes % 2) {
                  // next quote character is escaped
                  $pos = $next + 1;

                  continue;
              }

              $result = (string) substr($s, $start, $next - $start);
              if (false !== strpos($result, '\\')) {
                  $result = $this->unescapeStringLiteral($result);
              }

              return array($result, $next + 1);
          }
      }

      /**
       * Resolves CSS backslash escapes inside a string literal.
       *
       * - "\" followed by 1 to 6 hexadecimal digits (and one optional trailing
       *   whitespace) is replaced by the UTF-8 encoding of that code point;
       *   zero, surrogates and values above U+10FFFF become U+FFFD.
       * - "\" followed by a newline (line continuation) is removed.
       * - "\" followed by any other character yields that character.
       *
       * A lone backslash at the very end of the literal is left untouched.
       *
       * @param string $literal The raw string content, without the surrounding quotes
       *
       * @return string The unescaped string
       */
      protected function unescapeStringLiteral($literal)
      {
          return preg_replace_callback(
              '#\\\\(?:([A-Fa-f0-9]{1,6})(?:\r\n|\s)?|(\r\n|[^A-Fa-f0-9]))#',
              function ($matches) {
                  // group 2 is only present for the non-hexadecimal alternative
                  if (isset($matches[2])) {
                      $char = $matches[2];
                      if ("\r\n" === $char || "\n" === $char || "\r" === $char || "\f" === $char) {
                          return '';
                      }

                      return $char;
                  }

                  // the digits are hexadecimal, not decimal
                  $codePoint = hexdec($matches[1]);
                  if (0 === $codePoint || $codePoint > 0x10FFFF || ($codePoint >= 0xD800 && $codePoint <= 0xDFFF)) {
                      $codePoint = 0xFFFD;
                  }

                  // manual UTF-8 encoding: mb_chr()/IntlChar are not available on old PHP versions
                  if ($codePoint < 0x80) {
                      return chr($codePoint);
                  }
                  if ($codePoint < 0x800) {
                      return chr(0xC0 | ($codePoint >> 6))
                          .chr(0x80 | ($codePoint & 0x3F));
                  }
                  if ($codePoint < 0x10000) {
                      return chr(0xE0 | ($codePoint >> 12))
                          .chr(0x80 | (($codePoint >> 6) & 0x3F))
                          .chr(0x80 | ($codePoint & 0x3F));
                  }

                  return chr(0xF0 | ($codePoint >> 18))
                      .chr(0x80 | (($codePoint >> 12) & 0x3F))
                      .chr(0x80 | (($codePoint >> 6) & 0x3F))
                      .chr(0x80 | ($codePoint & 0x3F));
              },
              $literal
          );
      }

      /**
       * Reads a symbol (a run of word characters and dashes) starting at the given position.
       *
       * @param string  $s   The selector being tokenized
       * @param integer $pos The position at which the symbol starts
       *
       * @return array The symbol and the position of the first character after it
       *
       * @throws SyntaxError When Unexpected symbol is found
       */
      protected function tokenizeSymbol($s, $pos)
      {
          if (!preg_match('#[^\w\-]#', $s, $match, PREG_OFFSET_CAPTURE, $pos)) {
              // Goes to end of s
              return array(substr($s, $pos), strlen($s));
          }

          $end = $match[0][1];
          if ($end === $pos) {
              // the very first character cannot be part of a symbol
              throw new SyntaxError(sprintf('Unexpected symbol: %s at %s', $s[$pos], $pos));
          }

          return array(substr($s, $pos, $end - $pos), $end);
      }
  }