/lib/pkp/classes/xml/XMLParser.inc.php
PHP | 330 lines | 164 code | 58 blank | 108 comment | 30 complexity | 13b98c26fd2652964173c7781f14227d MD5 | raw file
Possible License(s): GPL-2.0
- <?php
- /**
- * @defgroup xml
- */
- /**
- * @file classes/xml/XMLParser.inc.php
- *
- * Copyright (c) 2000-2012 John Willinsky
- * Distributed under the GNU GPL v2. For full terms see the file docs/COPYING.
- *
- * @class XMLParser
- * @ingroup xml
- *
- * @brief Generic class for parsing an XML document into a data structure.
- */
- // $Id$
- // Default character encodings consumed by XMLParser::createParser(): the source and the target encoding are both read from Config::getVar('i18n', 'client_charset'), so they always hold the same value
- define('XML_PARSER_SOURCE_ENCODING', Config::getVar('i18n', 'client_charset'));
- define('XML_PARSER_TARGET_ENCODING', Config::getVar('i18n', 'client_charset'));
- import('xml.XMLParserDOMHandler');
class XMLParser {
	/** @var int original magic_quotes_runtime setting (0 when the runtime has no magic quotes support) */
	var $magicQuotes;

	/** @var $handler object instance of XMLParserHandler */
	var $handler;

	/** @var $errors array List of error strings */
	var $errors;

	/**
	 * Constructor.
	 * Disables magic_quotes_runtime (where the runtime still supports it)
	 * and resets the error list.
	 */
	function __construct() {
		// magic_quotes_runtime must be disabled for XML parsing. The feature
		// was dropped in PHP 5.4 and the accessor functions were later
		// deprecated and removed, so only touch them on runtimes that can
		// actually have the setting enabled.
		$this->magicQuotes = 0;
		if (version_compare(PHP_VERSION, '5.4.0', '<') && function_exists('get_magic_quotes_runtime')) {
			$this->magicQuotes = get_magic_quotes_runtime();
			if ($this->magicQuotes) set_magic_quotes_runtime(0);
		}

		$this->errors = array();
	}

	/**
	 * Legacy (PHP 4 style) constructor.
	 * Kept so that subclasses calling parent::XMLParser() keep working;
	 * it simply delegates to __construct().
	 */
	function XMLParser() {
		$this->__construct();
	}

	/**
	 * Parse an XML string using the specified handler.
	 * If no handler has been specified, XMLParserDOMHandler is used by default.
	 * Parse errors are recorded and can be inspected via getErrors()/getStatus().
	 * @param $text string the XML data to parse
	 * @return object actual return type depends on the handler
	 */
	function &parseText($text) {
		$parser =& $this->createParser();

		if (!isset($this->handler)) {
			// Use default handler for parsing
			$handler = new XMLParserDOMHandler();
			$this->setHandler($handler);
		}

		xml_set_object($parser, $this->handler);
		xml_set_element_handler($parser, "startElement", "endElement");
		xml_set_character_data_handler($parser, "characterData");

		// if the string contains non-UTF8 characters, convert it to UTF-8 for parsing
		if ( Config::getVar('i18n', 'charset_normalization') == 'On' && !String::utf8_compliant($text) ) {
			$text = String::utf8_normalize($text);

			// strip any invalid UTF-8 sequences
			$text = String::utf8_bad_strip($text);

			// convert named entities to numeric entities
			$text = strtr($text, String::getHTMLEntities());
		}

		// strip any invalid ASCII control characters
		$text = String::utf8_strip_ascii_ctrl($text);

		// The whole document is passed at once, hence the "final" flag
		if (!xml_parse($parser, $text, true)) {
			$this->addError(xml_error_string(xml_get_error_code($parser)));
		}

		$result =& $this->handler->getResult();
		$this->destroyParser($parser);

		// Only a handler created by this call is cleaned up here
		if (isset($handler)) {
			$handler->destroy();
			unset($handler);
		}
		return $result;
	}

	/**
	 * Parse an XML file using the specified handler.
	 * If no handler has been specified, XMLParserDOMHandler is used by default, returning a tree structure representing the document.
	 * Parse errors are recorded and can be inspected via getErrors()/getStatus().
	 * @param $file string full path to the XML file
	 * @return object actual return type depends on the handler; false if the resource could not be opened
	 */
	function &parse($file) {
		$parser =& $this->createParser();

		if (!isset($this->handler)) {
			// Use default handler for parsing
			$handler = new XMLParserDOMHandler();
			$this->setHandler($handler);
		}

		xml_set_object($parser, $this->handler);
		xml_set_element_handler($parser, "startElement", "endElement");
		xml_set_character_data_handler($parser, "characterData");

		import('file.FileWrapper');
		$wrapper =& FileWrapper::wrapper($file);

		// Handle responses of various types. The wrapper is checked before
		// open() is called on it, so a missing wrapper is treated like a
		// resource that could not be opened.
		$opened = false;
		while ($wrapper) {
			$newWrapper = $wrapper->open();
			if (is_object($newWrapper)) {
				// Follow a redirect
				unset($wrapper);
				$wrapper =& $newWrapper;
				unset($newWrapper);
			} elseif (!$newWrapper) {
				// Could not open resource -- error
				break;
			} else {
				// OK, we've found the end result
				$opened = true;
				break;
			}
		}

		if (!$opened) {
			// Release what was set up above before bailing out
			$this->destroyParser($parser);
			if (isset($handler)) {
				$handler->destroy();
				unset($handler);
			}
			$result = false;
			return $result;
		}

		// The configuration does not change while reading, so look it up once
		$normalizeCharset = (Config::getVar('i18n', 'charset_normalization') == 'On');

		while (!$wrapper->eof() && ($data = $wrapper->read()) !== false) {

			// if the string contains non-UTF8 characters, convert it to UTF-8 for parsing
			if ( $normalizeCharset && !String::utf8_compliant($data) ) {

				$utf8_last = String::substr($data, String::strlen($data) - 1);

				// if the string ends in a "bad" UTF-8 character, maybe it's truncated
				while (!$wrapper->eof() && String::utf8_bad_find($utf8_last) === 0) {
					// read another chunk of data
					$data .= $wrapper->read();
					$utf8_last = String::substr($data, String::strlen($data) - 1);
				}

				$data = String::utf8_normalize($data);

				// strip any invalid UTF-8 sequences
				$data = String::utf8_bad_strip($data);

				// convert named entities to numeric entities
				$data = strtr($data, String::getHTMLEntities());
			}

			// strip any invalid ASCII control characters
			$data = String::utf8_strip_ascii_ctrl($data);

			if (!xml_parse($parser, $data, $wrapper->eof())) {
				$this->addError(xml_error_string(xml_get_error_code($parser)));
				// Feeding more data to a failed parser only repeats the error
				break;
			}
		}

		$wrapper->close();
		$result =& $this->handler->getResult();
		$this->destroyParser($parser);

		// Only a handler created by this call is cleaned up here
		if (isset($handler)) {
			$handler->destroy();
			unset($handler);
		}
		return $result;
	}

	/**
	 * Add an error to the current error list
	 * @param $error string
	 */
	function addError($error) {
		array_push($this->errors, $error);
	}

	/**
	 * Get the current list of errors
	 * @return array list of error strings collected so far
	 */
	function getErrors() {
		return $this->errors;
	}

	/**
	 * Determine whether or not the parser encountered an error (false)
	 * or completed successfully (true)
	 * @return boolean
	 */
	function getStatus() {
		return empty($this->errors);
	}

	/**
	 * Set the handler to use for parse(...).
	 * The handler is stored by reference.
	 * @param $handler XMLParserHandler
	 */
	function setHandler(&$handler) {
		$this->handler =& $handler;
	}

	/**
	 * Parse XML data using xml_parse_into_struct and return data in an array.
	 * This is best suited for XML documents with fairly simple structure.
	 * A parse failure is recorded in the error list; whatever was collected is still returned.
	 * @param $text string XML data
	 * @param $tagsToMatch array optional, if set tags not in the array will be skipped
	 * @return array a struct of the form ($TAG => array('attributes' => array( ... ), 'value' => $VALUE), ... ); empty if nothing matched
	 */
	function &parseTextStruct(&$text, $tagsToMatch = array()) {
		$parser =& $this->createParser();

		$values = array();
		$tags = array();
		if (!xml_parse_into_struct($parser, $text, $values, $tags)) {
			$this->addError(xml_error_string(xml_get_error_code($parser)));
		}
		$this->destroyParser($parser);

		// Always hand back an array, even when no tag is collected
		$data = array();

		// Clean up data struct, removing undesired tags if necessary
		foreach ($tags as $key => $indices) {
			if (!empty($tagsToMatch) && !in_array($key, $tagsToMatch)) {
				continue;
			}

			$data[$key] = array();

			foreach ($indices as $index) {
				// Only "open" and "complete" entries are kept
				if (!isset($values[$index]['type']) || ($values[$index]['type'] != 'open' && $values[$index]['type'] != 'complete')) {
					continue;
				}

				$data[$key][] = array(
					'attributes' => isset($values[$index]['attributes']) ? $values[$index]['attributes'] : array(),
					'value' => isset($values[$index]['value']) ? trim($values[$index]['value']) : ''
				);
			}
		}

		return $data;
	}

	/**
	 * Parse an XML file using xml_parse_into_struct and return data in an array.
	 * This is best suited for XML documents with fairly simple structure.
	 * @param $file string full path to the XML file
	 * @param $tagsToMatch array optional, if set tags not in the array will be skipped
	 * @return array a struct of the form ($TAG => array('attributes' => array( ... ), 'value' => $VALUE), ... ); false if the file yields no contents
	 */
	function &parseStruct($file, $tagsToMatch = array()) {
		import('file.FileWrapper');
		$wrapper =& FileWrapper::wrapper($file);
		$fileContents = $wrapper->contents();
		if (!$fileContents) {
			$result = false;
			return $result;
		}
		$returner =& $this->parseTextStruct($fileContents, $tagsToMatch);
		return $returner;
	}

	/**
	 * Initialize a new XML parser.
	 * The parser uses XML_PARSER_SOURCE_ENCODING / XML_PARSER_TARGET_ENCODING
	 * and has case folding switched off.
	 * @return resource
	 */
	function &createParser() {
		$parser = xml_parser_create(XML_PARSER_SOURCE_ENCODING);
		xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, XML_PARSER_TARGET_ENCODING);
		xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, false);
		return $parser;
	}

	/**
	 * Destroy XML parser.
	 * The caller's variable is set to null afterwards.
	 * @param $parser resource
	 */
	function destroyParser(&$parser) {
		xml_parser_free($parser);
		// unset() would only drop the local alias; assign null so the
		// caller's variable no longer holds the freed parser.
		$parser = null;
	}

	/**
	 * Perform required clean up for this object.
	 */
	function destroy() {
		// Set magic_quotes_runtime back to original setting
		if ($this->magicQuotes && function_exists('set_magic_quotes_runtime')) {
			set_magic_quotes_runtime($this->magicQuotes);
		}
	}
}
/**
 * Base class describing the callbacks XMLParser expects from a handler.
 * Every method here is a stub; concrete handlers override them.
 */
class XMLParserHandler {

	/**
	 * Start-element callback (registered via xml_set_element_handler).
	 * The base implementation does nothing.
	 * @param $parser resource the XML parser issuing the callback
	 * @param $tag string name of the element being opened
	 * @param $attributes array attributes of the element
	 */
	function startElement(&$parser, $tag, $attributes) {
	}

	/**
	 * End-element callback (registered via xml_set_element_handler).
	 * The base implementation does nothing.
	 * @param $parser resource the XML parser issuing the callback
	 * @param $tag string name of the element being closed
	 */
	function endElement(&$parser, $tag) {
	}

	/**
	 * Character-data callback (registered via xml_set_character_data_handler).
	 * The base implementation does nothing.
	 * @param $parser resource the XML parser issuing the callback
	 * @param $data string the character data
	 */
	function characterData(&$parser, $data) {
	}

	/**
	 * Fetch the data structure built up while parsing.
	 * What is returned is up to the concrete handler; this base version
	 * yields null.
	 * @return mixed
	 */
	function &getResult() {
		// A variable is required because the method returns by reference.
		$emptyResult = null;
		return $emptyResult;
	}
}
- ?>