PageRenderTime 56ms CodeModel.GetById 23ms RepoModel.GetById 1ms app.codeStats 0ms

/classes/fXML.php

https://bitbucket.org/wbond/flourish/
PHP | 709 lines | 322 code | 108 blank | 279 comment | 63 complexity | 6935fe7792b241d65a323275453f482d MD5 | raw file
  1. <?php
  2. /**
  3. * Provides functionality for XML files
  4. *
  5. * This class is implemented to use the UTF-8 character encoding. Please see
  6. * http://flourishlib.com/docs/UTF-8 for more information.
  7. *
  8. * @copyright Copyright (c) 2007-2011 Will Bond, others
  9. * @author Will Bond [wb] <will@flourishlib.com>
  10. * @author Craig Ruksznis [cr-imarc] <craigruk@imarc.net>
  11. * @license http://flourishlib.com/license
  12. *
  13. * @package Flourish
  14. * @link http://flourishlib.com/fXML
  15. *
  16. * @version 1.0.0b8
  17. * @changes 1.0.0b8 Fixed a method signature [wb, 2011-08-24]
  18. * @changes 1.0.0b7 Added a workaround for iconv having issues in MAMP 1.9.4+ [wb, 2011-07-26]
  19. * @changes 1.0.0b6 Updated class to use fCore::startErrorCapture() instead of `error_reporting()` [wb, 2010-08-09]
  20. * @changes 1.0.0b5 Added the `$fix_entities_encoding` parameter to ::__construct() [cr-imarc+wb, 2010-08-08]
  21. * @changes 1.0.0b4 Updated the class to automatically add a `__` prefix for the default namespace and to use that for attribute and child element access [wb, 2010-04-06]
  22. * @changes 1.0.0b3 Added the `$http_timeout` parameter to ::__construct() [wb, 2009-09-16]
  23. * @changes 1.0.0b2 Added instance functionality for reading of XML files [wb, 2009-09-01]
  24. * @changes 1.0.0b The initial implementation [wb, 2008-01-13]
  25. */
  26. class fXML implements ArrayAccess
  27. {
  28. // The following constants allow for nice looking callbacks to static methods
  29. const encode = 'fXML::encode';
  30. const sendHeader = 'fXML::sendHeader';
  31. /**
  32. * Encodes content for display in a UTF-8 encoded XML document
  33. *
  34. * @param string $content The content to encode
  35. * @return string The encoded content
  36. */
  37. static public function encode($content)
  38. {
  39. return htmlspecialchars(html_entity_decode($content, ENT_QUOTES, 'UTF-8'), ENT_QUOTES, 'UTF-8');
  40. }
  41. /**
  42. * This works around a bug in MAMP 1.9.4+ and PHP 5.3 where iconv()
  43. * does not seem to properly assign the return value to a variable, but
  44. * does work when returning the value.
  45. *
  46. * @param string $in_charset The incoming character encoding
  47. * @param string $out_charset The outgoing character encoding
  48. * @param string $string The string to convert
  49. * @return string The converted string
  50. */
  51. static private function iconv($in_charset, $out_charset, $string)
  52. {
  53. return iconv($in_charset, $out_charset, $string);
  54. }
  55. /**
  56. * Sets the proper `Content-Type` HTTP header for a UTF-8 XML file
  57. *
  58. * @return void
  59. */
  60. static public function sendHeader()
  61. {
  62. header('Content-Type: text/xml; charset=utf-8');
  63. }
  64. /**
  65. * Custom prefix => namespace URI mappings
  66. *
  67. * @var array
  68. */
  69. protected $__custom_prefixes;
  70. /**
  71. * The dom element for this XML
  72. *
  73. * @var DOMElement
  74. */
  75. protected $__dom;
  76. /**
  77. * An XPath object for performing xpath lookups
  78. *
  79. * @var DOMXPath
  80. */
  81. protected $__xpath;
  82. /**
  83. * The XML string for serialization
  84. *
  85. * @var string
  86. */
  87. protected $__xml;
  88. /**
  89. * Create the XML object from a string, fFile or URL
  90. *
  91. * The `$default_namespace` will be used for any sort of methods calls,
  92. * member access or array access when the element or attribute name does
  93. * not include a `:`.
  94. *
  95. * @throws fValidationException When the source XML is invalid or does not exist
  96. *
  97. * @param fFile|string $source The source of the XML, either an fFile object, a string of XML, a file path or a URL
  98. * @param numeric $http_timeout The timeout to use in seconds when requesting an XML file from a URL
  99. * @param boolean $fix_entities_encoding This will fix two common XML authoring errors and should only be used when experiencing decoding issues - HTML entities that haven't been encoded as XML, and XML content published in ISO-8859-1 or Windows-1252 encoding without an explicit encoding attribute
  100. * @param fFile|string |$source
  101. * @param boolean |$fix_entities_encoding
  102. * @return fXML
  103. */
  104. public function __construct($source, $http_timeout=NULL, $fix_entities_encoding=NULL)
  105. {
  106. if (is_bool($http_timeout)) {
  107. $fix_entities_encoding = $http_timeout;
  108. $http_timeout = NULL;
  109. }
  110. // Prevent spitting out errors to we can throw exceptions
  111. $old_setting = libxml_use_internal_errors(TRUE);
  112. $exception_message = NULL;
  113. try {
  114. if ($source instanceof fFile && $fix_entities_encoding) {
  115. $source = $source->read();
  116. }
  117. if ($source instanceof DOMElement) {
  118. $this->__dom = $source;
  119. $xml = TRUE;
  120. } elseif ($source instanceof fFile) {
  121. $xml = simplexml_load_file($source->getPath());
  122. // This handles URLs specially by adding a reasonable timeout
  123. } elseif (preg_match('#^(?P<protocol>http(s)?)://#', $source, $matches)) {
  124. if ($http_timeout === NULL) {
  125. $http_timeout = ini_get('default_socket_timeout');
  126. }
  127. // We use the appropriate protocol here so PHP can supress IIS https:// warnings
  128. $context = stream_context_create(array(
  129. $matches['protocol'] => array('timeout' => $http_timeout)
  130. ));
  131. // If the URL is not loaded in time, this supresses the file_get_contents() warning
  132. fCore::startErrorCapture(E_WARNING);
  133. $xml = file_get_contents($source, 0, $context);
  134. fCore::stopErrorCapture();
  135. if (!$xml) {
  136. throw new fExpectedException('The URL specified, %s, could not be loaded', $source);
  137. }
  138. if ($fix_entities_encoding) {
  139. $xml = $this->fixEntitiesEncoding($xml);
  140. }
  141. $xml = new SimpleXMLElement($xml);
  142. } else {
  143. $is_path = $source && !preg_match('#^\s*<#', $source);
  144. if ($fix_entities_encoding) {
  145. if ($is_path) {
  146. $source = file_get_contents($source);
  147. $is_path = FALSE;
  148. }
  149. $source = $this->fixEntitiesEncoding($source);
  150. }
  151. $xml = new SimpleXMLElement($source, 0, $is_path);
  152. }
  153. } catch (Exception $e) {
  154. $exception_message = $e->getMessage();
  155. $xml = FALSE;
  156. }
  157. // We want it to be clear when XML parsing issues occur
  158. if ($xml === FALSE) {
  159. $errors = libxml_get_errors();
  160. foreach ($errors as $error) {
  161. $exception_message .= "\n" . rtrim($error->message);
  162. }
  163. // If internal errors were off before, turn them back off
  164. if (!$old_setting) {
  165. libxml_use_internal_errors(FALSE);
  166. }
  167. throw new fValidationException(str_replace('%', '%%', $exception_message));
  168. }
  169. if (!$old_setting) {
  170. libxml_use_internal_errors(FALSE);
  171. }
  172. if (!$this->__dom) {
  173. $this->__dom = dom_import_simplexml($xml);
  174. }
  175. if ($this->__dom->namespaceURI && $this->__dom->prefix == '') {
  176. $this->addCustomPrefix('__', $this->__dom->namespaceURI);
  177. }
  178. }
  179. /**
  180. * Allows access to the text content of a child tag
  181. *
  182. * The child element name (`$name`) may start with a namespace prefix and a
  183. * `:` to indicate what namespace it is part of. A blank namespace prefix
  184. * (i.e. an element name starting with `:`) is treated as the XML default
  185. * namespace.
  186. *
  187. * @internal
  188. *
  189. * @param string $name The child element to retrieve
  190. * @return fXML|NULL The child element requested
  191. */
  192. public function __get($name)
  193. {
  194. // Handle nice callback syntax
  195. static $methods = array(
  196. '__construct' => TRUE,
  197. '__get' => TRUE,
  198. '__isset' => TRUE,
  199. '__sleep' => TRUE,
  200. '__toString' => TRUE,
  201. '__wakeup' => TRUE,
  202. 'addCustomPrefix' => TRUE,
  203. 'getName' => TRUE,
  204. 'getNamespace' => TRUE,
  205. 'getPrefix' => TRUE,
  206. 'getText' => TRUE,
  207. 'offsetExists' => TRUE,
  208. 'offsetGet' => TRUE,
  209. 'offsetSet' => TRUE,
  210. 'offsetUnset' => TRUE,
  211. 'toXML' => TRUE,
  212. 'xpath' => TRUE
  213. );
  214. if (isset($methods[$name])) {
  215. return array($this, $name);
  216. }
  217. if ($this->__dom->namespaceURI && $this->__dom->prefix == '' && strpos($name, ':') === FALSE) {
  218. $name = '__:' . $name;
  219. }
  220. $first_child = $this->query($name . '[1]');
  221. if ($first_child->length) {
  222. return $first_child->item(0)->textContent;
  223. }
  224. return NULL;
  225. }
  226. /**
  227. * The child element name (`$name`) may start with a namespace prefix and a
  228. * `:` to indicate what namespace it is part of. A blank namespace prefix
  229. * (i.e. an element name starting with `:`) is treated as the XML default
  230. * namespace.
  231. *
  232. * @internal
  233. *
  234. * @param string $name The child element to check - see method description for details about namespaces
  235. * @return boolean If the child element is set
  236. */
  237. public function __isset($name)
  238. {
  239. if ($this->__dom->namespaceURI && $this->__dom->prefix == '' && strpos($name, ':') === FALSE) {
  240. $name = '__:' . $name;
  241. }
  242. return (boolean) $this->query($name . '[1]')->length;
  243. }
  244. /**
  245. * Prevents users from trying to set elements
  246. *
  247. * @internal
  248. *
  249. * @param string $name The element to set
  250. * @param mixed $value The value to set
  251. * @return void
  252. */
  253. public function __set($name, $value)
  254. {
  255. throw new fProgrammerException('The %s class does not support modifying XML', __CLASS__);
  256. }
  257. /**
  258. * The XML needs to be made into a string before being serialized
  259. *
  260. * @internal
  261. *
  262. * @return array The members to serialize
  263. */
  264. public function __sleep()
  265. {
  266. $this->__xml = $this->toXML();
  267. return array('__custom_prefixes', '__xml');
  268. }
  269. /**
  270. * Gets the string inside the root XML element
  271. *
  272. * @return string The text inside the root element
  273. */
  274. public function __toString()
  275. {
  276. return (string) $this->__dom->textContent;
  277. }
  278. /**
  279. * Prevents users from trying to unset elements
  280. *
  281. * @internal
  282. *
  283. * @param string $name The element to unset
  284. * @return void
  285. */
  286. public function __unset($name)
  287. {
  288. throw new fProgrammerException('The %s class does not support modifying XML', __CLASS__);
  289. }
  290. /**
  291. * The XML needs to be made into a DOMElement when woken up
  292. *
  293. * @internal
  294. *
  295. * @return void
  296. */
  297. public function __wakeup()
  298. {
  299. $this->__dom = dom_import_simplexml(new SimpleXMLElement($this->__xml));
  300. $this->__xml = NULL;
  301. }
  302. /**
  303. * Adds a custom namespace prefix to full namespace mapping
  304. *
  305. * This namespace prefix will be valid for any operation on this object,
  306. * including calls to ::xpath().
  307. *
  308. * @param string $ns_prefix The custom namespace prefix
  309. * @param string $namespace The full namespace it maps to
  310. * @return void
  311. */
  312. public function addCustomPrefix($ns_prefix, $namespace)
  313. {
  314. if (!$this->__custom_prefixes) {
  315. $this->__custom_prefixes = array();
  316. }
  317. $this->__custom_prefixes[$ns_prefix] = $namespace;
  318. if ($this->__xpath) {
  319. $this->__xpath->registerNamespace($ns_prefix, $namespace);
  320. }
  321. }
  322. /**
  323. * Fixes HTML entities that aren't XML encoded and fixes ISO-8859-1/Windows-1252 encoded content that does not have an encoding attribute
  324. *
  325. * @param string $xml The XML to fix
  326. * @return string The fixed XML
  327. */
  328. private function fixEntitiesEncoding($xml)
  329. {
  330. preg_match('#^<\?xml.*? encoding="([^"]+)".*?\?>#i', $xml, $match);
  331. $encoding = empty($match[1]) ? NULL : $match[1];
  332. // Try to detect the encoding via the BOM
  333. if ($encoding === NULL) {
  334. if (substr($xml, 0, 3) == "\x0\x0\xFE\xFF") {
  335. $encoding = 'UTF-32BE';
  336. } elseif (substr($xml, 0, 3) == "\xFF\xFE\x0\x0") {
  337. $encoding = 'UTF-32LE';
  338. } elseif (substr($xml, 0, 2) == "\xFE\xFF") {
  339. $encoding = 'UTF-16BE';
  340. } elseif (substr($xml, 0, 2) == "\xFF\xFE") {
  341. $encoding = 'UTF-16LE';
  342. } else {
  343. $encoding = 'UTF-8';
  344. }
  345. }
  346. // This fixes broken encodings where the XML author puts ISO-8859-1 or
  347. // Windows-1252 into an XML file without an encoding or UTF-8 encoding
  348. if (preg_replace('#[^a-z0-9]#', '', strtolower($encoding)) == 'utf8') {
  349. // Remove the UTF-8 BOM if present
  350. $xml = preg_replace("#^\xEF\xBB\xBF#", '', $xml);
  351. fCore::startErrorCapture(E_NOTICE);
  352. $cleaned = self::iconv('UTF-8', 'UTF-8', $xml);
  353. if ($cleaned != $xml) {
  354. $xml = self::iconv('Windows-1252', 'UTF-8', $xml);
  355. }
  356. fCore::stopErrorCapture();
  357. }
  358. $num_matches = preg_match_all('#&(?!gt|lt|amp|quot|apos)\w+;#', $xml, $matches, PREG_SET_ORDER);
  359. if ($num_matches) {
  360. // We convert non-UTF-* content to UTF-8 because some character sets
  361. // don't have characters for all HTML entities
  362. if (substr(strtolower($encoding), 0, 3) != 'utf') {
  363. $xml = self::iconv($encoding, 'UTF-8', $xml);
  364. $xml = preg_replace('#^(<\?xml.*?) encoding="[^"]+"(.*?\?>)#', '\1 encoding="UTF-8"\2', $xml);
  365. $encoding = 'UTF-8';
  366. }
  367. $entities = array();
  368. foreach ($matches as $match) {
  369. $entities[$match[0]] = html_entity_decode($match[0], ENT_COMPAT, $encoding);
  370. }
  371. $xml = strtr($xml, $entities);
  372. }
  373. return $xml;
  374. }
  375. /**
  376. * Returns the name of the current element
  377. *
  378. * @return string The name of the current element
  379. */
  380. public function getName()
  381. {
  382. return $this->__dom->localName;
  383. }
  384. /**
  385. * Returns the namespace of the current element
  386. *
  387. * @return string The namespace of the current element
  388. */
  389. public function getNamespace()
  390. {
  391. return $this->__dom->namespaceURI;
  392. }
  393. /**
  394. * Returns the namespace prefix of the current element
  395. *
  396. * @return string The namespace prefix of the current element
  397. */
  398. public function getPrefix()
  399. {
  400. return $this->__dom->prefix;
  401. }
  402. /**
  403. * Returns the string text of the current element
  404. *
  405. * @return string The string text of the current element
  406. */
  407. public function getText()
  408. {
  409. return (string) $this->__dom->textContent;
  410. }
  411. /**
  412. * Provides functionality for isset() and empty() (required by arrayaccess interface)
  413. *
  414. * Offsets refers to an attribute name. Attribute may start with a namespace
  415. * prefix and a `:` to indicate what namespace the attribute is part of. A
  416. * blank namespace prefix (i.e. an offset starting with `:`) is treated as
  417. * the XML default namespace.
  418. *
  419. * @internal
  420. *
  421. * @param string $offset The offset to check
  422. * @return boolean If the offset exists
  423. */
  424. public function offsetExists($offset)
  425. {
  426. return (boolean) $this->query('@' . $offset . '[1]')->length;
  427. }
  428. /**
  429. * Provides functionality for get [index] syntax (required by ArrayAccess interface)
  430. *
  431. * Offsets refers to an attribute name. Attribute may start with a namespace
  432. * prefix and a `:` to indicate what namespace the attribute is part of. A
  433. * blank namespace prefix (i.e. an offset starting with `:`) is treated as
  434. * the XML default namespace.
  435. *
  436. * @internal
  437. *
  438. * @param string $offset The attribute to retrieve the value for
  439. * @return string The value of the offset
  440. */
  441. public function offsetGet($offset)
  442. {
  443. $attribute = $this->query('@' . $offset . '[1]');
  444. if ($attribute->length) {
  445. return $attribute->item(0)->nodeValue;
  446. }
  447. return NULL;
  448. }
  449. /**
  450. * Required by ArrayAccess interface
  451. *
  452. * @internal
  453. *
  454. * @param integer|string $offset The offset to set
  455. * @return void
  456. */
  457. public function offsetSet($offset, $value)
  458. {
  459. throw new fProgrammerException('The %s class does not support modifying XML', __CLASS__);
  460. }
  461. /**
  462. * Required by ArrayAccess interface
  463. *
  464. * @internal
  465. *
  466. * @param integer|string $offset The offset to unset
  467. * @return void
  468. */
  469. public function offsetUnset($offset)
  470. {
  471. throw new fProgrammerException('The %s class does not support modifying XML', __CLASS__);
  472. }
  473. /**
  474. * Performs an XPath query on the current element, returning the raw results
  475. *
  476. * @param string $path The XPath path to query
  477. * @return array The matching elements
  478. */
  479. protected function query($path)
  480. {
  481. if (!$this->__xpath) {
  482. $this->__xpath = new DOMXPath($this->__dom->ownerDocument);
  483. if ($this->__custom_prefixes) {
  484. foreach ($this->__custom_prefixes as $prefix => $namespace) {
  485. $this->__xpath->registerNamespace($prefix, $namespace);
  486. }
  487. }
  488. }
  489. // Prevent spitting out errors to we can throw exceptions
  490. $old_setting = libxml_use_internal_errors(TRUE);
  491. $result = $this->__xpath->query($path, $this->__dom);
  492. // We want it to be clear when XML parsing issues occur
  493. if ($result === FALSE) {
  494. $errors = libxml_get_errors();
  495. $exception_message = '';
  496. foreach ($errors as $error) {
  497. $exception_message .= "\n" . $error->message;
  498. }
  499. // If internal errors were off before, turn them back off
  500. if (!$old_setting) {
  501. libxml_use_internal_errors(FALSE);
  502. }
  503. throw new fProgrammerException(str_replace('%', '%%', trim($exception_message)));
  504. }
  505. if (!$old_setting) {
  506. libxml_use_internal_errors(FALSE);
  507. }
  508. return $result;
  509. }
  510. /**
  511. * Returns a well-formed XML string from the current element
  512. *
  513. * @return string The XML
  514. */
  515. public function toXML()
  516. {
  517. return $this->__dom->ownerDocument->saveXML($this->__dom->parentNode === $this->__dom->ownerDocument ? $this->__dom->parentNode : $this->__dom);
  518. }
  519. /**
  520. * Executes an XPath query on the current element, returning an array of matching elements
  521. *
  522. * @param string $path The XPath path to query
  523. * @param boolean $first_only If only the first match should be returned
  524. * @return array|string|fXML An array of matching elements, or a string or fXML object if `$first_only` is `TRUE`
  525. */
  526. public function xpath($path, $first_only=FALSE)
  527. {
  528. $result = $this->query($path);
  529. if ($first_only) {
  530. if (!$result->length) { return NULL; }
  531. $result = array($result->item(0));
  532. } else {
  533. if (!$result->length) { return array(); }
  534. }
  535. $keys_to_remove = array();
  536. $output = array();
  537. foreach ($result as $element) {
  538. if ($element instanceof DOMElement) {
  539. $child = new fXML($element);
  540. $child->__custom_prefixes = $this->__custom_prefixes;
  541. if ($child->__dom->namespaceURI && $child->__dom->prefix == '') {
  542. $child->addCustomPrefix('__', $child->__dom->namespaceURI);
  543. }
  544. $output[] = $child;
  545. } elseif ($element instanceof DOMCharacterData) {
  546. $output[] = $element->data;
  547. } elseif ($element instanceof DOMAttr) {
  548. $key = $element->name;
  549. if ($element->prefix) {
  550. $key = $element->prefix . ':' . $key;
  551. }
  552. // We will create an attrname and attrname[0] key for each
  553. // attribute and if more than one is found we remove the
  554. // key attrname. If only one is found we remove attrname[0].
  555. $key_1 = $key . '[1]';
  556. if (isset($output[$key_1])) {
  557. $i = 1;
  558. while (isset($output[$key . '[' . $i . ']'])) {
  559. $i++;
  560. }
  561. // This removes the key without the array index if more than one was found
  562. unset($output[$key]);
  563. unset($keys_to_remove[$key_1]);
  564. $key = $key . '[' . $i . ']';
  565. } else {
  566. $output[$key_1] = $element->nodeValue;
  567. $keys_to_remove[$key_1] = TRUE;
  568. }
  569. $output[$key] = $element->nodeValue;
  570. }
  571. }
  572. foreach ($keys_to_remove as $key => $trash) {
  573. unset($output[$key]);
  574. }
  575. if ($first_only) {
  576. return current($output);
  577. }
  578. return $output;
  579. }
  580. }
  581. /**
  582. * Copyright (c) 2007-2011 Will Bond <will@flourishlib.com>, others
  583. *
  584. * Permission is hereby granted, free of charge, to any person obtaining a copy
  585. * of this software and associated documentation files (the "Software"), to deal
  586. * in the Software without restriction, including without limitation the rights
  587. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  588. * copies of the Software, and to permit persons to whom the Software is
  589. * furnished to do so, subject to the following conditions:
  590. *
  591. * The above copyright notice and this permission notice shall be included in
  592. * all copies or substantial portions of the Software.
  593. *
  594. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  595. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  596. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  597. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  598. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  599. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  600. * THE SOFTWARE.
  601. */