PageRenderTime 48ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/modules/kotal/vendor/phptal/PHPTAL/Dom/SaxXmlParser.php

https://bitbucket.org/chrispiechowicz/zepto
PHP | 480 lines | 370 code | 56 blank | 54 comment | 62 complexity | 4ef8f372387c791ba39a4fdacf290b05 MD5 | raw file
Possible License(s): LGPL-2.1, MIT, BSD-3-Clause
<?php
/**
 * PHPTAL templating engine
 *
 * PHP Version 5
 *
 * @category HTML
 * @package  PHPTAL
 * @author   Laurent Bedubourg <lbedubourg@motion-twin.com>
 * @author   Kornel Lesiński <kornel@aardvarkmedia.co.uk>
 * @license  http://www.gnu.org/licenses/lgpl.html GNU Lesser General Public License
 * @version  SVN: $Id$
 * @link     http://phptal.org/
 */
  15. /**
  16. * Simple sax like xml parser for PHPTAL
  17. * ("Dom" in the class name comes from name of the directory, not mode of operation)
  18. *
  19. * At the time this parser was created, standard PHP libraries were not suitable
  20. * (could not retrieve doctypes, xml declaration, problems with comments and CDATA).
  21. *
  22. * There are still some problems: XML parsers don't care about exact format of enties
  23. * or CDATA sections (PHPTAL tries to preserve them),
  24. * <?php ?> blocks are not allowed in attributes.
  25. *
  26. * This parser failed to enforce some XML well-formedness constraints,
  27. * and there are ill-formed templates "in the wild" because of this.
  28. *
  29. * @package PHPTAL
  30. * @subpackage Dom
  31. * @see PHPTAL_DOM_DocumentBuilder
  32. */
  33. class PHPTAL_Dom_SaxXmlParser
  34. {
  35. private $_file;
  36. private $_line;
  37. private $_source;
  38. // available parser states
  39. const ST_ROOT = 0;
  40. const ST_TEXT = 1;
  41. const ST_LT = 2;
  42. const ST_TAG_NAME = 3;
  43. const ST_TAG_CLOSE = 4;
  44. const ST_TAG_SINGLE = 5;
  45. const ST_TAG_ATTRIBUTES = 6;
  46. const ST_TAG_BETWEEN_ATTRIBUTE = 7;
  47. const ST_CDATA = 8;
  48. const ST_COMMENT = 9;
  49. const ST_DOCTYPE = 10;
  50. const ST_XMLDEC = 11;
  51. const ST_PREPROC = 12;
  52. const ST_ATTR_KEY = 13;
  53. const ST_ATTR_EQ = 14;
  54. const ST_ATTR_QUOTE = 15;
  55. const ST_ATTR_VALUE = 16;
  56. const BOM_STR = "\xef\xbb\xbf";
  57. static $state_names = array(
  58. self::ST_ROOT => 'root node',
  59. self::ST_TEXT => 'text',
  60. self::ST_LT => 'start of tag',
  61. self::ST_TAG_NAME => 'tag name',
  62. self::ST_TAG_CLOSE => 'closing tag',
  63. self::ST_TAG_SINGLE => 'self-closing tag',
  64. self::ST_TAG_ATTRIBUTES => 'tag',
  65. self::ST_TAG_BETWEEN_ATTRIBUTE => 'tag attributes',
  66. self::ST_CDATA => 'CDATA',
  67. self::ST_COMMENT => 'comment',
  68. self::ST_DOCTYPE => 'doctype',
  69. self::ST_XMLDEC => 'XML declaration',
  70. self::ST_PREPROC => 'preprocessor directive',
  71. self::ST_ATTR_KEY => 'attribute name',
  72. self::ST_ATTR_EQ => 'attribute value',
  73. self::ST_ATTR_QUOTE => 'quoted attribute value',
  74. self::ST_ATTR_VALUE => 'unquoted attribute value',
  75. );
  76. private $input_encoding;
  77. public function __construct($input_encoding)
  78. {
  79. $this->input_encoding = $input_encoding;
  80. $this->_file = "<string>";
  81. }
  82. public function parseFile(PHPTAL_Dom_DocumentBuilder $builder, $src)
  83. {
  84. if (!file_exists($src)) {
  85. throw new PHPTAL_IOException("file $src not found");
  86. }
  87. return $this->parseString($builder, file_get_contents($src), $src);
  88. }
  89. public function parseString(PHPTAL_Dom_DocumentBuilder $builder, $src, $filename = '<string>')
  90. {
  91. try
  92. {
  93. $builder->setEncoding($this->input_encoding);
  94. $this->_file = $filename;
  95. $this->_line = 1;
  96. $state = self::ST_ROOT;
  97. $mark = 0;
  98. $len = strlen($src);
  99. $quoteStyle = '"';
  100. $tagname = "";
  101. $attribute = "";
  102. $attributes = array();
  103. $customDoctype = false;
  104. $builder->setSource($this->_file, $this->_line);
  105. $builder->onDocumentStart();
  106. $i=0;
  107. // remove BOM (UTF-8 byte order mark)...
  108. if (substr($src, 0, 3) === self::BOM_STR) {
  109. $i=3;
  110. }
  111. for (; $i<$len; $i++) {
  112. $c = $src[$i]; // Change to substr($src, $i, 1); if you want to use mb_string.func_overload
  113. if ($c === "\n") $builder->setSource($this->_file, ++$this->_line);
  114. switch ($state) {
  115. case self::ST_ROOT:
  116. if ($c === '<') {
  117. $mark = $i; // mark tag start
  118. $state = self::ST_LT;
  119. } elseif (!self::isWhiteChar($c)) {
  120. $this->raiseError("Characters found before beginning of the document! (wrap document in < tal:block > to avoid this error)");
  121. }
  122. break;
  123. case self::ST_TEXT:
  124. if ($c === '<') {
  125. if ($mark != $i) {
  126. $builder->onElementData($this->sanitizeEscapedText($this->checkEncoding(substr($src, $mark, $i-$mark))));
  127. }
  128. $mark = $i;
  129. $state = self::ST_LT;
  130. }
  131. break;
  132. case self::ST_LT:
  133. if ($c === '/') {
  134. $mark = $i+1;
  135. $state = self::ST_TAG_CLOSE;
  136. } elseif ($c === '?' and strtolower(substr($src, $i, 5)) === '?xml ') {
  137. $state = self::ST_XMLDEC;
  138. } elseif ($c === '?') {
  139. $state = self::ST_PREPROC;
  140. } elseif ($c === '!' and substr($src, $i, 3) === '!--') {
  141. $state = self::ST_COMMENT;
  142. } elseif ($c === '!' and substr($src, $i, 8) === '![CDATA[') {
  143. $state = self::ST_CDATA;
  144. $mark = $i+8; // past opening tag
  145. } elseif ($c === '!' and strtoupper(substr($src, $i, 8)) === '!DOCTYPE') {
  146. $state = self::ST_DOCTYPE;
  147. } elseif (self::isWhiteChar($c)) {
  148. $state = self::ST_TEXT;
  149. } else {
  150. $mark = $i; // mark node name start
  151. $attributes = array();
  152. $attribute = "";
  153. $state = self::ST_TAG_NAME;
  154. }
  155. break;
  156. case self::ST_TAG_NAME:
  157. if (self::isWhiteChar($c) || $c === '/' || $c === '>') {
  158. $tagname = substr($src, $mark, $i-$mark);
  159. if (!$this->isValidQName($tagname)) $this->raiseError("Invalid tag name '$tagname'");
  160. if ($c === '/') {
  161. $state = self::ST_TAG_SINGLE;
  162. } elseif ($c === '>') {
  163. $mark = $i+1; // mark text start
  164. $state = self::ST_TEXT;
  165. $builder->onElementStart($tagname, $attributes);
  166. } else /* isWhiteChar */ {
  167. $state = self::ST_TAG_ATTRIBUTES;
  168. }
  169. }
  170. break;
  171. case self::ST_TAG_CLOSE:
  172. if ($c === '>') {
  173. $tagname = rtrim(substr($src, $mark, $i-$mark));
  174. $builder->onElementClose($tagname);
  175. $mark = $i+1; // mark text start
  176. $state = self::ST_TEXT;
  177. }
  178. break;
  179. case self::ST_TAG_SINGLE:
  180. if ($c !== '>') {
  181. $this->raiseError("Expected '/>', but found '/$c' inside tag < $tagname >");
  182. }
  183. $mark = $i+1; // mark text start
  184. $state = self::ST_TEXT;
  185. $builder->onElementStart($tagname, $attributes);
  186. $builder->onElementClose($tagname);
  187. break;
  188. case self::ST_TAG_BETWEEN_ATTRIBUTE:
  189. case self::ST_TAG_ATTRIBUTES:
  190. if ($c === '>') {
  191. $mark = $i+1; // mark text start
  192. $state = self::ST_TEXT;
  193. $builder->onElementStart($tagname, $attributes);
  194. } elseif ($c === '/') {
  195. $state = self::ST_TAG_SINGLE;
  196. } elseif (self::isWhiteChar($c)) {
  197. $state = self::ST_TAG_ATTRIBUTES;
  198. } elseif ($state === self::ST_TAG_ATTRIBUTES && $this->isValidQName($c)) {
  199. $mark = $i; // mark attribute key start
  200. $state = self::ST_ATTR_KEY;
  201. } else $this->raiseError("Unexpected character '$c' between attributes of < $tagname >");
  202. break;
  203. case self::ST_COMMENT:
  204. if ($c === '>' && $i > $mark+4 && substr($src, $i-2, 2) === '--') {
  205. if (preg_match('/^-|--|-$/', substr($src, $mark +4, $i-$mark+1 -7))) {
  206. $this->raiseError("Ill-formed comment. XML comments are not allowed to contain '--' or start/end with '-': ".substr($src, $mark+4, $i-$mark+1-7));
  207. }
  208. $builder->onComment($this->checkEncoding(substr($src, $mark+4, $i-$mark+1-7)));
  209. $mark = $i+1; // mark text start
  210. $state = self::ST_TEXT;
  211. }
  212. break;
  213. case self::ST_CDATA:
  214. if ($c === '>' and substr($src, $i-2, 2) === ']]') {
  215. $builder->onCDATASection($this->checkEncoding(substr($src, $mark, $i-$mark-2)));
  216. $mark = $i+1; // mark text start
  217. $state = self::ST_TEXT;
  218. }
  219. break;
  220. case self::ST_XMLDEC:
  221. if ($c === '?' && substr($src, $i, 2) === '?>') {
  222. $builder->onXmlDecl($this->checkEncoding(substr($src, $mark, $i-$mark+2)));
  223. $i++; // skip '>'
  224. $mark = $i+1; // mark text start
  225. $state = self::ST_TEXT;
  226. }
  227. break;
  228. case self::ST_DOCTYPE:
  229. if ($c === '[') {
  230. $customDoctype = true;
  231. } elseif ($customDoctype && $c === '>' && substr($src, $i-1, 2) === ']>') {
  232. $customDoctype = false;
  233. $builder->onDocType($this->checkEncoding(substr($src, $mark, $i-$mark+1)));
  234. $mark = $i+1; // mark text start
  235. $state = self::ST_TEXT;
  236. } elseif (!$customDoctype && $c === '>') {
  237. $customDoctype = false;
  238. $builder->onDocType($this->checkEncoding(substr($src, $mark, $i-$mark+1)));
  239. $mark = $i+1; // mark text start
  240. $state = self::ST_TEXT;
  241. }
  242. break;
  243. case self::ST_PREPROC:
  244. if ($c === '>' and substr($src, $i-1, 1) === '?') {
  245. $builder->onProcessingInstruction($this->checkEncoding(substr($src, $mark, $i-$mark+1)));
  246. $mark = $i+1; // mark text start
  247. $state = self::ST_TEXT;
  248. }
  249. break;
  250. case self::ST_ATTR_KEY:
  251. if ($c === '=' || self::isWhiteChar($c)) {
  252. $attribute = substr($src, $mark, $i-$mark);
  253. if (!$this->isValidQName($attribute)) {
  254. $this->raiseError("Invalid attribute name '$attribute' in < $tagname >");
  255. }
  256. if (isset($attributes[$attribute])) {
  257. $this->raiseError("Attribute $attribute in < $tagname > is defined more than once");
  258. }
  259. if ($c === '=') $state = self::ST_ATTR_VALUE;
  260. else /* white char */ $state = self::ST_ATTR_EQ;
  261. } elseif ($c === '/' || $c==='>') {
  262. $attribute = substr($src, $mark, $i-$mark);
  263. if (!$this->isValidQName($attribute)) {
  264. $this->raiseError("Invalid attribute name '$attribute'");
  265. }
  266. $this->raiseError("Attribute $attribute does not have value (found end of tag instead of '=')");
  267. }
  268. break;
  269. case self::ST_ATTR_EQ:
  270. if ($c === '=') {
  271. $state = self::ST_ATTR_VALUE;
  272. } elseif (!self::isWhiteChar($c)) {
  273. $this->raiseError("Attribute $attribute in < $tagname > does not have value (found character '$c' instead of '=')");
  274. }
  275. break;
  276. case self::ST_ATTR_VALUE:
  277. if (self::isWhiteChar($c)) {
  278. } elseif ($c === '"' or $c === '\'') {
  279. $quoteStyle = $c;
  280. $state = self::ST_ATTR_QUOTE;
  281. $mark = $i+1; // mark attribute real value start
  282. } else {
  283. $this->raiseError("Value of attribute $attribute in < $tagname > is not in quotes (found character '$c' instead of quote)");
  284. }
  285. break;
  286. case self::ST_ATTR_QUOTE:
  287. if ($c === $quoteStyle) {
  288. $attributes[$attribute] = $this->sanitizeEscapedText($this->checkEncoding(substr($src, $mark, $i-$mark)));
  289. // PHPTAL's code generator assumes input is escaped for double-quoted strings. Single-quoted attributes need to be converted.
  290. // FIXME: it should be escaped at later stage.
  291. $attributes[$attribute] = str_replace('"',"&quot;", $attributes[$attribute]);
  292. $state = self::ST_TAG_BETWEEN_ATTRIBUTE;
  293. }
  294. break;
  295. }
  296. }
  297. if ($state === self::ST_TEXT) // allows text past root node, which is in violation of XML spec
  298. {
  299. if ($i > $mark) {
  300. $text = substr($src, $mark, $i-$mark);
  301. if (!ctype_space($text)) $this->raiseError("Characters found after end of the root element (wrap document in < tal:block > to avoid this error)");
  302. }
  303. } else {
  304. if ($state === self::ST_ROOT) {
  305. $msg = "Document does not have any tags";
  306. } else {
  307. $msg = "Finished document in unexpected state: ".self::$state_names[$state]." is not finished";
  308. }
  309. $this->raiseError($msg);
  310. }
  311. $builder->onDocumentEnd();
  312. }
  313. catch(PHPTAL_TemplateException $e)
  314. {
  315. $e->hintSrcPosition($this->_file, $this->_line);
  316. throw $e;
  317. }
  318. return $builder;
  319. }
  320. private function isValidQName($name)
  321. {
  322. $name = $this->checkEncoding($name);
  323. return preg_match('/^([a-z_\x80-\xff]+[a-z0-9._\x80-\xff-]*:)?[a-z_\x80-\xff]+[a-z0-9._\x80-\xff-]*$/i', $name);
  324. }
  325. private function checkEncoding($str)
  326. {
  327. if ($str === '') return '';
  328. if ($this->input_encoding === 'UTF-8') {
  329. // $match expression below somehow triggers quite deep recurrency and stack overflow in preg
  330. // to avoid this, check string bit by bit, omitting ASCII fragments.
  331. if (strlen($str) > 200) {
  332. $chunks = preg_split('/(?>[\x09\x0A\x0D\x20-\x7F]+)/',$str,null,PREG_SPLIT_NO_EMPTY);
  333. foreach ($chunks as $chunk) {
  334. if (strlen($chunk) < 200) {
  335. $this->checkEncoding($chunk);
  336. }
  337. }
  338. return $str;
  339. }
  340. // http://www.w3.org/International/questions/qa-forms-utf-8
  341. $match = '[\x09\x0A\x0D\x20-\x7F]' // ASCII
  342. . '|[\xC2-\xDF][\x80-\xBF]' // non-overlong 2-byte
  343. . '|\xE0[\xA0-\xBF][\x80-\xBF]' // excluding overlongs
  344. . '|[\xE1-\xEC\xEE\xEE][\x80-\xBF]{2}' // straight 3-byte (exclude FFFE and FFFF)
  345. . '|\xEF[\x80-\xBE][\x80-\xBF]' // straight 3-byte
  346. . '|\xEF\xBF[\x80-\xBD]' // straight 3-byte
  347. . '|\xED[\x80-\x9F][\x80-\xBF]' // excluding surrogates
  348. . '|\xF0[\x90-\xBF][\x80-\xBF]{2}' // planes 1-3
  349. . '|[\xF1-\xF3][\x80-\xBF]{3}' // planes 4-15
  350. . '|\xF4[\x80-\x8F][\x80-\xBF]{2}'; // plane 16
  351. if (!preg_match('/^(?:(?>'.$match.'))+$/s',$str)) {
  352. $res = preg_split('/((?>'.$match.')+)/s',$str,null,PREG_SPLIT_DELIM_CAPTURE);
  353. for($i=0; $i < count($res); $i+=2)
  354. {
  355. $res[$i] = self::convertBytesToEntities(array(1=>$res[$i]));
  356. }
  357. $this->raiseError("Invalid UTF-8 bytes: ".implode('', $res));
  358. }
  359. }
  360. if ($this->input_encoding === 'ISO-8859-1') {
  361. // http://www.w3.org/TR/2006/REC-xml11-20060816/#NT-RestrictedChar
  362. $forbid = '/((?>[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x84\x86-\x9F]+))/s';
  363. if (preg_match($forbid, $str)) {
  364. $str = preg_replace_callback($forbid, array('self', 'convertBytesToEntities'), $str);
  365. $this->raiseError("Invalid ISO-8859-1 characters: ".$str);
  366. }
  367. }
  368. return $str;
  369. }
  370. /**
  371. * preg callback
  372. * Changes all bytes to hexadecimal XML entities
  373. *
  374. * @param array $m first array element is used for input
  375. *
  376. * @return string
  377. */
  378. private static function convertBytesToEntities(array $m)
  379. {
  380. $m = $m[1]; $out = '';
  381. for($i=0; $i < strlen($m); $i++)
  382. {
  383. $out .= '&#X'.strtoupper(dechex(ord($m[$i]))).';';
  384. }
  385. return $out;
  386. }
  387. /**
  388. * This is where this parser violates XML and refuses to be an annoying bastard.
  389. */
  390. private function sanitizeEscapedText($str)
  391. {
  392. $str = str_replace('&apos;', '&#39;', $str); // PHP's html_entity_decode doesn't seem to support that!
  393. /* <?php ?> blocks can't reliably work in attributes (due to escaping impossible in XML)
  394. so they have to be converted into special TALES expression
  395. */
  396. $types = ini_get('short_open_tag')?'php|=|':'php';
  397. $str = preg_replace_callback("/<\?($types)(.*?)\?>/", array('self', 'convertPHPBlockToTALES'), $str);
  398. // corrects all non-entities and neutralizes potentially problematic CDATA end marker
  399. $str = strtr(preg_replace('/&(?!(?:#x?[a-f0-9]+|[a-z][a-z0-9]*);)/i', '&amp;', $str), array('<'=>'&lt;', ']]>'=>']]&gt;'));
  400. return $str;
  401. }
  402. private static function convertPHPBlockToTALES($m)
  403. {
  404. list(, $type, $code) = $m;
  405. if ($type === '=') $code = 'echo '.$code;
  406. return '${structure phptal-internal-php-block:'.rawurlencode($code).'}';
  407. }
  408. public function getSourceFile()
  409. {
  410. return $this->_file;
  411. }
  412. public function getLineNumber()
  413. {
  414. return $this->_line;
  415. }
  416. public static function isWhiteChar($c)
  417. {
  418. return strpos(" \t\n\r\0", $c) !== false;
  419. }
  420. protected function raiseError($errStr)
  421. {
  422. throw new PHPTAL_ParserException($errStr, $this->_file, $this->_line);
  423. }
  424. }