PageRenderTime 53ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 1ms

/moodle/lib/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php

https://bitbucket.org/geek745/moodle-db2
PHP | 495 lines | 354 code | 57 blank | 84 comment | 75 complexity | 18ae40c8ddddefe9ef5f6088ada08235 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1, BSD-3-Clause, LGPL-2.0
  1. <?php
  2. require_once 'HTMLPurifier/Lexer.php';
  3. HTMLPurifier_ConfigSchema::define(
  4. 'Core', 'DirectLexLineNumberSyncInterval', 0, 'int', '
  5. <p>
  6. Specifies the number of tokens the DirectLex line number tracking
  7. implementations should process before attempting to resyncronize the
  8. current line count by manually counting all previous new-lines. When
  9. at 0, this functionality is disabled. Lower values will decrease
  10. performance, and this is only strictly necessary if the counting
  11. algorithm is buggy (in which case you should report it as a bug).
  12. This has no effect when %Core.MaintainLineNumbers is disabled or DirectLex is
  13. not being used. This directive has been available since 2.0.0.
  14. </p>
  15. ');
  16. /**
  17. * Our in-house implementation of a parser.
  18. *
  19. * A pure PHP parser, DirectLex has absolutely no dependencies, making
  20. * it a reasonably good default for PHP4. Written with efficiency in mind,
  21. * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
  22. * pales in comparison to HTMLPurifier_Lexer_DOMLex.
  23. *
  24. * @todo Reread XML spec and document differences.
  25. */
  26. class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
  27. {
  28. /**
  29. * Whitespace characters for str(c)spn.
  30. * @protected
  31. */
  32. var $_whitespace = "\x20\x09\x0D\x0A";
  33. /**
  34. * Callback function for script CDATA fudge
  35. * @param $matches, in form of array(opening tag, contents, closing tag)
  36. * @static
  37. */
  38. function scriptCallback($matches) {
  39. return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
  40. }
  41. function tokenizeHTML($html, $config, &$context) {
  42. // special normalization for script tags without any armor
  43. // our "armor" heurstic is a < sign any number of whitespaces after
  44. // the first script tag
  45. if ($config->get('HTML', 'Trusted')) {
  46. $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
  47. array('HTMLPurifier_Lexer_DirectLex', 'scriptCallback'), $html);
  48. }
  49. $html = $this->normalize($html, $config, $context);
  50. $cursor = 0; // our location in the text
  51. $inside_tag = false; // whether or not we're parsing the inside of a tag
  52. $array = array(); // result array
  53. $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');
  54. if ($maintain_line_numbers === null) {
  55. // automatically determine line numbering by checking
  56. // if error collection is on
  57. $maintain_line_numbers = $config->get('Core', 'CollectErrors');
  58. }
  59. if ($maintain_line_numbers) $current_line = 1;
  60. else $current_line = false;
  61. $context->register('CurrentLine', $current_line);
  62. $nl = "\n";
  63. // how often to manually recalculate. This will ALWAYS be right,
  64. // but it's pretty wasteful. Set to 0 to turn off
  65. $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval');
  66. $e = false;
  67. if ($config->get('Core', 'CollectErrors')) {
  68. $e =& $context->get('ErrorCollector');
  69. }
  70. // infinite loop protection
  71. // has to be pretty big, since html docs can be big
  72. // we're allow two hundred thousand tags... more than enough?
  73. // NOTE: this is also used for synchronization, so watch out
  74. $loops = 0;
  75. while(true) {
  76. // infinite loop protection
  77. if (++$loops > 200000) return array();
  78. // recalculate lines
  79. if (
  80. $maintain_line_numbers && // line number tracking is on
  81. $synchronize_interval && // synchronization is on
  82. $cursor > 0 && // cursor is further than zero
  83. $loops % $synchronize_interval === 0 // time to synchronize!
  84. ) {
  85. $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
  86. }
  87. $position_next_lt = strpos($html, '<', $cursor);
  88. $position_next_gt = strpos($html, '>', $cursor);
  89. // triggers on "<b>asdf</b>" but not "asdf <b></b>"
  90. // special case to set up context
  91. if ($position_next_lt === $cursor) {
  92. $inside_tag = true;
  93. $cursor++;
  94. }
  95. if (!$inside_tag && $position_next_lt !== false) {
  96. // We are not inside tag and there still is another tag to parse
  97. $token = new
  98. HTMLPurifier_Token_Text(
  99. $this->parseData(
  100. substr(
  101. $html, $cursor, $position_next_lt - $cursor
  102. )
  103. )
  104. );
  105. if ($maintain_line_numbers) {
  106. $token->line = $current_line;
  107. $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
  108. }
  109. $array[] = $token;
  110. $cursor = $position_next_lt + 1;
  111. $inside_tag = true;
  112. continue;
  113. } elseif (!$inside_tag) {
  114. // We are not inside tag but there are no more tags
  115. // If we're already at the end, break
  116. if ($cursor === strlen($html)) break;
  117. // Create Text of rest of string
  118. $token = new
  119. HTMLPurifier_Token_Text(
  120. $this->parseData(
  121. substr(
  122. $html, $cursor
  123. )
  124. )
  125. );
  126. if ($maintain_line_numbers) $token->line = $current_line;
  127. $array[] = $token;
  128. break;
  129. } elseif ($inside_tag && $position_next_gt !== false) {
  130. // We are in tag and it is well formed
  131. // Grab the internals of the tag
  132. $strlen_segment = $position_next_gt - $cursor;
  133. if ($strlen_segment < 1) {
  134. // there's nothing to process!
  135. $token = new HTMLPurifier_Token_Text('<');
  136. $cursor++;
  137. continue;
  138. }
  139. $segment = substr($html, $cursor, $strlen_segment);
  140. if ($segment === false) {
  141. // somehow, we attempted to access beyond the end of
  142. // the string, defense-in-depth, reported by Nate Abele
  143. break;
  144. }
  145. // Check if it's a comment
  146. if (
  147. strncmp('!--', $segment, 3) === 0
  148. ) {
  149. // re-determine segment length, looking for -->
  150. $position_comment_end = strpos($html, '-->', $cursor);
  151. if ($position_comment_end === false) {
  152. // uh oh, we have a comment that extends to
  153. // infinity. Can't be helped: set comment
  154. // end position to end of string
  155. if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
  156. $position_comment_end = strlen($html);
  157. $end = true;
  158. } else {
  159. $end = false;
  160. }
  161. $strlen_segment = $position_comment_end - $cursor;
  162. $segment = substr($html, $cursor, $strlen_segment);
  163. $token = new HTMLPurifier_Token_Comment(substr($segment, 3));
  164. if ($maintain_line_numbers) {
  165. $token->line = $current_line;
  166. $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
  167. }
  168. $array[] = $token;
  169. $cursor = $end ? $position_comment_end : $position_comment_end + 3;
  170. $inside_tag = false;
  171. continue;
  172. }
  173. // Check if it's an end tag
  174. $is_end_tag = (strpos($segment,'/') === 0);
  175. if ($is_end_tag) {
  176. $type = substr($segment, 1);
  177. $token = new HTMLPurifier_Token_End($type);
  178. if ($maintain_line_numbers) {
  179. $token->line = $current_line;
  180. $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
  181. }
  182. $array[] = $token;
  183. $inside_tag = false;
  184. $cursor = $position_next_gt + 1;
  185. continue;
  186. }
  187. // Check leading character is alnum, if not, we may
  188. // have accidently grabbed an emoticon. Translate into
  189. // text and go our merry way
  190. if (!ctype_alpha($segment[0])) {
  191. // XML: $segment[0] !== '_' && $segment[0] !== ':'
  192. if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
  193. $token = new
  194. HTMLPurifier_Token_Text(
  195. '<' .
  196. $this->parseData(
  197. $segment
  198. ) .
  199. '>'
  200. );
  201. if ($maintain_line_numbers) {
  202. $token->line = $current_line;
  203. $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
  204. }
  205. $array[] = $token;
  206. $cursor = $position_next_gt + 1;
  207. $inside_tag = false;
  208. continue;
  209. }
  210. // Check if it is explicitly self closing, if so, remove
  211. // trailing slash. Remember, we could have a tag like <br>, so
  212. // any later token processing scripts must convert improperly
  213. // classified EmptyTags from StartTags.
  214. $is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
  215. if ($is_self_closing) {
  216. $strlen_segment--;
  217. $segment = substr($segment, 0, $strlen_segment);
  218. }
  219. // Check if there are any attributes
  220. $position_first_space = strcspn($segment, $this->_whitespace);
  221. if ($position_first_space >= $strlen_segment) {
  222. if ($is_self_closing) {
  223. $token = new HTMLPurifier_Token_Empty($segment);
  224. } else {
  225. $token = new HTMLPurifier_Token_Start($segment);
  226. }
  227. if ($maintain_line_numbers) {
  228. $token->line = $current_line;
  229. $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
  230. }
  231. $array[] = $token;
  232. $inside_tag = false;
  233. $cursor = $position_next_gt + 1;
  234. continue;
  235. }
  236. // Grab out all the data
  237. $type = substr($segment, 0, $position_first_space);
  238. $attribute_string =
  239. trim(
  240. substr(
  241. $segment, $position_first_space
  242. )
  243. );
  244. if ($attribute_string) {
  245. $attr = $this->parseAttributeString(
  246. $attribute_string
  247. , $config, $context
  248. );
  249. } else {
  250. $attr = array();
  251. }
  252. if ($is_self_closing) {
  253. $token = new HTMLPurifier_Token_Empty($type, $attr);
  254. } else {
  255. $token = new HTMLPurifier_Token_Start($type, $attr);
  256. }
  257. if ($maintain_line_numbers) {
  258. $token->line = $current_line;
  259. $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
  260. }
  261. $array[] = $token;
  262. $cursor = $position_next_gt + 1;
  263. $inside_tag = false;
  264. continue;
  265. } else {
  266. // inside tag, but there's no ending > sign
  267. if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
  268. $token = new
  269. HTMLPurifier_Token_Text(
  270. '<' .
  271. $this->parseData(
  272. substr($html, $cursor)
  273. )
  274. );
  275. if ($maintain_line_numbers) $token->line = $current_line;
  276. // no cursor scroll? Hmm...
  277. $array[] = $token;
  278. break;
  279. }
  280. break;
  281. }
  282. $context->destroy('CurrentLine');
  283. return $array;
  284. }
  285. /**
  286. * PHP 4 compatible substr_count that implements offset and length
  287. */
  288. function substrCount($haystack, $needle, $offset, $length) {
  289. static $oldVersion;
  290. if ($oldVersion === null) {
  291. $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
  292. }
  293. if ($oldVersion) {
  294. $haystack = substr($haystack, $offset, $length);
  295. return substr_count($haystack, $needle);
  296. } else {
  297. return substr_count($haystack, $needle, $offset, $length);
  298. }
  299. }
  300. /**
  301. * Takes the inside of an HTML tag and makes an assoc array of attributes.
  302. *
  303. * @param $string Inside of tag excluding name.
  304. * @returns Assoc array of attributes.
  305. */
  306. function parseAttributeString($string, $config, &$context) {
  307. $string = (string) $string; // quick typecast
  308. if ($string == '') return array(); // no attributes
  309. $e = false;
  310. if ($config->get('Core', 'CollectErrors')) {
  311. $e =& $context->get('ErrorCollector');
  312. }
  313. // let's see if we can abort as quickly as possible
  314. // one equal sign, no spaces => one attribute
  315. $num_equal = substr_count($string, '=');
  316. $has_space = strpos($string, ' ');
  317. if ($num_equal === 0 && !$has_space) {
  318. // bool attribute
  319. return array($string => $string);
  320. } elseif ($num_equal === 1 && !$has_space) {
  321. // only one attribute
  322. list($key, $quoted_value) = explode('=', $string);
  323. $quoted_value = trim($quoted_value);
  324. if (!$key) {
  325. if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
  326. return array();
  327. }
  328. if (!$quoted_value) return array($key => '');
  329. $first_char = @$quoted_value[0];
  330. $last_char = @$quoted_value[strlen($quoted_value)-1];
  331. $same_quote = ($first_char == $last_char);
  332. $open_quote = ($first_char == '"' || $first_char == "'");
  333. if ( $same_quote && $open_quote) {
  334. // well behaved
  335. $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
  336. } else {
  337. // not well behaved
  338. if ($open_quote) {
  339. if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
  340. $value = substr($quoted_value, 1);
  341. } else {
  342. $value = $quoted_value;
  343. }
  344. }
  345. if ($value === false) $value = '';
  346. return array($key => $value);
  347. }
  348. // setup loop environment
  349. $array = array(); // return assoc array of attributes
  350. $cursor = 0; // current position in string (moves forward)
  351. $size = strlen($string); // size of the string (stays the same)
  352. // if we have unquoted attributes, the parser expects a terminating
  353. // space, so let's guarantee that there's always a terminating space.
  354. $string .= ' ';
  355. // infinite loop protection
  356. $loops = 0;
  357. while(true) {
  358. // infinite loop protection
  359. if (++$loops > 1000) {
  360. trigger_error('Infinite loop detected in attribute parsing', E_USER_WARNING);
  361. return array();
  362. }
  363. if ($cursor >= $size) {
  364. break;
  365. }
  366. $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
  367. // grab the key
  368. $key_begin = $cursor; //we're currently at the start of the key
  369. // scroll past all characters that are the key (not whitespace or =)
  370. $cursor += strcspn($string, $this->_whitespace . '=', $cursor);
  371. $key_end = $cursor; // now at the end of the key
  372. $key = substr($string, $key_begin, $key_end - $key_begin);
  373. if (!$key) {
  374. if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
  375. $cursor += strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
  376. continue; // empty key
  377. }
  378. // scroll past all whitespace
  379. $cursor += strspn($string, $this->_whitespace, $cursor);
  380. if ($cursor >= $size) {
  381. $array[$key] = $key;
  382. break;
  383. }
  384. // if the next character is an equal sign, we've got a regular
  385. // pair, otherwise, it's a bool attribute
  386. $first_char = @$string[$cursor];
  387. if ($first_char == '=') {
  388. // key="value"
  389. $cursor++;
  390. $cursor += strspn($string, $this->_whitespace, $cursor);
  391. if ($cursor === false) {
  392. $array[$key] = '';
  393. break;
  394. }
  395. // we might be in front of a quote right now
  396. $char = @$string[$cursor];
  397. if ($char == '"' || $char == "'") {
  398. // it's quoted, end bound is $char
  399. $cursor++;
  400. $value_begin = $cursor;
  401. $cursor = strpos($string, $char, $cursor);
  402. $value_end = $cursor;
  403. } else {
  404. // it's not quoted, end bound is whitespace
  405. $value_begin = $cursor;
  406. $cursor += strcspn($string, $this->_whitespace, $cursor);
  407. $value_end = $cursor;
  408. }
  409. // we reached a premature end
  410. if ($cursor === false) {
  411. $cursor = $size;
  412. $value_end = $cursor;
  413. }
  414. $value = substr($string, $value_begin, $value_end - $value_begin);
  415. if ($value === false) $value = '';
  416. $array[$key] = $this->parseData($value);
  417. $cursor++;
  418. } else {
  419. // boolattr
  420. if ($key !== '') {
  421. $array[$key] = $key;
  422. } else {
  423. // purely theoretical
  424. if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
  425. }
  426. }
  427. }
  428. return $array;
  429. }
  430. }