PageRenderTime 47ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/pkp/classes/xml/XMLParser.inc.php

https://github.com/lib-uoguelph-ca/ocs
PHP | 330 lines | 164 code | 58 blank | 108 comment | 30 complexity | 13b98c26fd2652964173c7781f14227d MD5 | raw file
Possible License(s): GPL-2.0
  1. <?php
  2. /**
  3. * @defgroup xml
  4. */
  5. /**
  6. * @file classes/xml/XMLParser.inc.php
  7. *
  8. * Copyright (c) 2000-2012 John Willinsky
  9. * Distributed under the GNU GPL v2. For full terms see the file docs/COPYING.
  10. *
  11. * @class XMLParser
  12. * @ingroup xml
  13. *
  14. * @brief Generic class for parsing an XML document into a data structure.
  15. */
  16. // $Id$
  17. // The default character encodings
  // Both encoding constants are read from the same i18n/client_charset config value.
  // createParser() passes the source encoding to xml_parser_create() and sets the
  // target encoding through the XML_OPTION_TARGET_ENCODING parser option.
  18. define('XML_PARSER_SOURCE_ENCODING', Config::getVar('i18n', 'client_charset'));
  19. define('XML_PARSER_TARGET_ENCODING', Config::getVar('i18n', 'client_charset'));
  // Load the handler class that XMLParser instantiates when no handler has been set.
  20. import('xml.XMLParserDOMHandler');
  21. class XMLParser {
  22. /** @var int original magic_quotes_runtime setting */
  23. var $magicQuotes;
  24. /** @var $handler object instance of XMLParserHandler */
  25. var $handler;
  26. /** @var $errors array List of error strings */
  27. var $errors;
  28. /**
  29. * Constructor.
  30. * Initialize parser and set parser options.
  31. */
  32. function XMLParser() {
  33. // magic_quotes_runtime must be disabled for XML parsing
  34. $this->magicQuotes = get_magic_quotes_runtime();
  35. if ($this->magicQuotes) set_magic_quotes_runtime(0);
  36. $this->errors = array();
  37. }
  38. function &parseText($text) {
  39. $parser =& $this->createParser();
  40. if (!isset($this->handler)) {
  41. // Use default handler for parsing
  42. $handler = new XMLParserDOMHandler();
  43. $this->setHandler($handler);
  44. }
  45. xml_set_object($parser, $this->handler);
  46. xml_set_element_handler($parser, "startElement", "endElement");
  47. xml_set_character_data_handler($parser, "characterData");
  48. // if the string contains non-UTF8 characters, convert it to UTF-8 for parsing
  49. if ( Config::getVar('i18n', 'charset_normalization') == 'On' && !String::utf8_compliant($text) ) {
  50. $text = String::utf8_normalize($text);
  51. // strip any invalid UTF-8 sequences
  52. $text = String::utf8_bad_strip($text);
  53. // convert named entities to numeric entities
  54. $text = strtr($text, String::getHTMLEntities());
  55. }
  56. // strip any invalid ASCII control characters
  57. $text = String::utf8_strip_ascii_ctrl($text);
  58. if (!xml_parse($parser, $text, true)) {
  59. $this->addError(xml_error_string(xml_get_error_code($parser)));
  60. }
  61. $result =& $this->handler->getResult();
  62. $this->destroyParser($parser);
  63. if (isset($handler)) {
  64. $handler->destroy();
  65. unset($handler);
  66. }
  67. return $result;
  68. }
  69. /**
  70. * Parse an XML file using the specified handler.
  71. * If no handler has been specified, XMLParserDOMHandler is used by default, returning a tree structure representing the document.
  72. * @param $file string full path to the XML file
  73. * @return object actual return type depends on the handler
  74. */
  75. function &parse($file) {
  76. $parser =& $this->createParser();
  77. if (!isset($this->handler)) {
  78. // Use default handler for parsing
  79. $handler = new XMLParserDOMHandler();
  80. $this->setHandler($handler);
  81. }
  82. xml_set_object($parser, $this->handler);
  83. xml_set_element_handler($parser, "startElement", "endElement");
  84. xml_set_character_data_handler($parser, "characterData");
  85. import('file.FileWrapper');
  86. $wrapper =& FileWrapper::wrapper($file);
  87. // Handle responses of various types
  88. while (true) {
  89. $newWrapper = $wrapper->open();
  90. if (is_object($newWrapper)) {
  91. // Follow a redirect
  92. unset($wrapper);
  93. $wrapper =& $newWrapper;
  94. unset ($newWrapper);
  95. } elseif (!$newWrapper) {
  96. // Could not open resource -- error
  97. $returner = false;
  98. return $returner;
  99. } else {
  100. // OK, we've found the end result
  101. break;
  102. }
  103. }
  104. if (!$wrapper) {
  105. $result = false;
  106. return $result;
  107. }
  108. while (!$wrapper->eof() && ($data = $wrapper->read()) !== false) {
  109. // if the string contains non-UTF8 characters, convert it to UTF-8 for parsing
  110. if ( Config::getVar('i18n', 'charset_normalization') == 'On' && !String::utf8_compliant($data) ) {
  111. $utf8_last = String::substr($data, String::strlen($data) - 1);
  112. // if the string ends in a "bad" UTF-8 character, maybe it's truncated
  113. while (!$wrapper->eof() && String::utf8_bad_find($utf8_last) === 0) {
  114. // read another chunk of data
  115. $data .= $wrapper->read();
  116. $utf8_last = String::substr($data, String::strlen($data) - 1);
  117. }
  118. $data = String::utf8_normalize($data);
  119. // strip any invalid UTF-8 sequences
  120. $data = String::utf8_bad_strip($data);
  121. // convert named entities to numeric entities
  122. $data = strtr($data, String::getHTMLEntities());
  123. }
  124. // strip any invalid ASCII control characters
  125. $data = String::utf8_strip_ascii_ctrl($data);
  126. if (!xml_parse($parser, $data, $wrapper->eof())) {
  127. $this->addError(xml_error_string(xml_get_error_code($parser)));
  128. }
  129. }
  130. $wrapper->close();
  131. $result =& $this->handler->getResult();
  132. $this->destroyParser($parser);
  133. if (isset($handler)) {
  134. $handler->destroy();
  135. unset($handler);
  136. }
  137. return $result;
  138. }
  139. /**
  140. * Add an error to the current error list
  141. * @param $error string
  142. */
  143. function addError($error) {
  144. array_push($this->errors, $error);
  145. }
  146. /**
  147. * Get the current list of errors
  148. */
  149. function getErrors() {
  150. return $this->errors;
  151. }
  152. /**
  153. * Determine whether or not the parser encountered an error (false)
  154. * or completed successfully (true)
  155. * @return boolean
  156. */
  157. function getStatus() {
  158. return empty($this->errors);
  159. }
  160. /**
  161. * Set the handler to use for parse(...).
  162. * @param $handler XMLParserHandler
  163. */
  164. function setHandler(&$handler) {
  165. $this->handler =& $handler;
  166. }
  167. /**
  168. * Parse XML data using xml_parse_into_struct and return data in an array.
  169. * This is best suited for XML documents with fairly simple structure.
  170. * @param $text string XML data
  171. * @param $tagsToMatch array optional, if set tags not in the array will be skipped
  172. * @return array a struct of the form ($TAG => array('attributes' => array( ... ), 'value' => $VALUE), ... )
  173. */
  174. function &parseTextStruct(&$text, $tagsToMatch = array()) {
  175. $parser =& $this->createParser();
  176. xml_parse_into_struct($parser, $text, $values, $tags);
  177. $this->destroyParser($parser);
  178. // Clean up data struct, removing undesired tags if necessary
  179. foreach ($tags as $key => $indices) {
  180. if (!empty($tagsToMatch) && !in_array($key, $tagsToMatch)) {
  181. continue;
  182. }
  183. $data[$key] = array();
  184. foreach ($indices as $index) {
  185. if (!isset($values[$index]['type']) || ($values[$index]['type'] != 'open' && $values[$index]['type'] != 'complete')) {
  186. continue;
  187. }
  188. $data[$key][] = array(
  189. 'attributes' => isset($values[$index]['attributes']) ? $values[$index]['attributes'] : array(),
  190. 'value' => isset($values[$index]['value']) ? trim($values[$index]['value']) : ''
  191. );
  192. }
  193. }
  194. return $data;
  195. }
  196. /**
  197. * Parse an XML file using xml_parse_into_struct and return data in an array.
  198. * This is best suited for XML documents with fairly simple structure.
  199. * @param $file string full path to the XML file
  200. * @param $tagsToMatch array optional, if set tags not in the array will be skipped
  201. * @return array a struct of the form ($TAG => array('attributes' => array( ... ), 'value' => $VALUE), ... )
  202. */
  203. function &parseStruct($file, $tagsToMatch = array()) {
  204. import('file.FileWrapper');
  205. $wrapper =& FileWrapper::wrapper($file);
  206. $fileContents = $wrapper->contents();
  207. if (!$fileContents) {
  208. $result = false;
  209. return $result;
  210. }
  211. $returner =& $this->parseTextStruct($fileContents, $tagsToMatch);
  212. return $returner;
  213. }
  214. /**
  215. * Initialize a new XML parser.
  216. * @return resource
  217. */
  218. function &createParser() {
  219. $parser = xml_parser_create(XML_PARSER_SOURCE_ENCODING);
  220. xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, XML_PARSER_TARGET_ENCODING);
  221. xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, false);
  222. return $parser;
  223. }
  224. /**
  225. * Destroy XML parser.
  226. * @param $parser resource
  227. */
  228. function destroyParser(&$parser) {
  229. xml_parser_free($parser);
  230. unset($parser);
  231. }
  232. /**
  233. * Perform required clean up for this object.
  234. */
  235. function destroy() {
  236. // Set magic_quotes_runtime back to original setting
  237. if ($this->magicQuotes) set_magic_quotes_runtime($this->magicQuotes);
  238. unset($this);
  239. }
  240. }
  241. /**
  242. * Interface for handler class used by XMLParser.
  243. * All XML parser handler classes must implement these methods.
  244. */
  245. class XMLParserHandler {
  246. /**
  247. * Callback function to act as the start element handler.
  248. */
  249. function startElement(&$parser, $tag, $attributes) {
  250. }
  251. /**
  252. * Callback function to act as the end element handler.
  253. */
  254. function endElement(&$parser, $tag) {
  255. }
  256. /**
  257. * Callback function to act as the character data handler.
  258. */
  259. function characterData(&$parser, $data) {
  260. }
  261. /**
  262. * Returns a resulting data structure representing the parsed content.
  263. * The format of this object is specific to the handler.
  264. * @return mixed
  265. */
  266. function &getResult() {
  267. // Default: Return null (must be by ref).
  268. $nullVar = null;
  269. return $nullVar;
  270. }
  271. }
  272. ?>