/plugins/wp-all-import/classes/chunk.php
PHP | 392 lines | 244 code | 59 blank | 89 comment | 37 complexity | 975ca1fe2ea62e85dc02bf8a10e5f11d MD5 | raw file
- <?php
- include __DIR__ . "/XmlStreamReader/autoload.php";
- use Prewk\XmlStringStreamer;
- use Prewk\XmlStringStreamer\Parser;
- use Prewk\XmlStringStreamer\Stream;
- /**
- * Chunk
- *
- * Reads a large file in as chunks for easier parsing.
- *
- *
- * @package default
- * @author Max Tsiplyakov
- */
class PMXI_Chunk {

    /**
     * options
     *
     * @var array Contains all major options
     * @access public
     */
    public $options = array(
        'path' => './', // string The path to check for $file in
        'element' => '', // string The XML element to return
        'type' => 'upload',
        'encoding' => 'UTF-8',
        'pointer' => 1, // integer 1-based index of the first matching element read() returns
        'chunkSize' => 1024, // integer Multiplied by the plugin 'chunk_size' option in the constructor
        'filter' => true, // boolean Whether get_file_path() may route the file through the preprocessxml stream filter
        'get_cloud' => false // boolean Build the element cloud even when 'element' is already set
    );

    /**
     * file
     *
     * @var string The filename being read
     * @access public
     */
    public $file = '';

    /**
     * reader
     *
     * @var object XMLReader when $parser_type is 'xmlreader' (or when $is_404 is set),
     *             otherwise the XmlStringStreamer instance built in the constructor
     * @access public
     */
    public $reader;

    /**
     * cloud
     *
     * @var array Element name => number of occurrences, filled by the constructor
     *            when no element was requested or 'get_cloud' is set
     * @access public
     */
    public $cloud = array();

    /**
     * loop
     *
     * @var integer Counter compared against $options['pointer'] while skipping matches in read()
     * @access public
     */
    public $loop = 1;

    /**
     * is_404
     *
     * @var boolean Set when the file starts with "<!DOCTYPE"
     *              (presumably an HTML error page instead of a feed - confirm with callers)
     * @access public
     */
    public $is_404 = false;

    /**
     * parser_type
     *
     * @var string|boolean 'xmlreader' or 'xmlstreamer' once the constructor has run
     * @access public
     */
    public $parser_type = false;

    /**
     * handle
     *
     * @var resource The fopen() resource (not used by any method visible in this class)
     * @access private
     */
    private $handle = null;

    /**
     * __construct
     *
     * Builds the Chunk object: merges the options, decides which parser engine
     * to use, optionally builds the element cloud / guesses the element to
     * read, and finally opens the reader that read() iterates over.
     *
     * @param string $file The filename to work with
     * @param array $options The options with which to parse the file
     * @param string|boolean $parser_type Preferred engine ('xmlreader' / 'xmlstreamer'), false for the default
     * @author Dom Hastings
     * @access public
     */
    public function __construct($file, $options = array(), $parser_type = false) {

        // merge the options together
        $this->options = array_merge($this->options, (is_array($options) ? $options : array()));
        $this->options['chunkSize'] *= PMXI_Plugin::getInstance()->getOption('chunk_size');

        // set the filename
        $this->file = $file;
        $this->parser_type = empty($parser_type) ? 'xmlreader' : $parser_type;

        if ($this->starts_with_doctype($file)) {
            // flag the file and only prepare a bare XMLReader; nothing else is initialised
            $path = $this->get_file_path();
            $this->is_404 = true;
            $this->reader = new XMLReader();
            @$this->reader->open($path);
            @$this->reader->setParserProperty(XMLReader::VALIDATE, false);
            return;
        }

        if (PMXI_Plugin::getInstance()->getOption('force_stream_reader')) {
            $this->parser_type = 'xmlstreamer';
        } else {
            $input = new PMXI_Input();
            $import_id = $input->get('id', 0);
            if (empty($import_id)) {
                $import_id = $input->get('import_id', 0);
            }

            if (!empty($import_id)) {
                $this->parser_type = empty($parser_type) ? 'xmlreader' : $parser_type;
                $import = new PMXI_Import_Record();
                $import->getById($import_id);
                if (!$import->isEmpty()) {
                    // the engine stored with the import record overrides the requested one
                    $this->parser_type = empty($import->options['xml_reader_engine']) ? 'xmlreader' : 'xmlstreamer';
                }
            } else {
                $this->parser_type = empty($parser_type) ? get_option('wpai_parser_type', 'xmlreader') : $parser_type;
            }
        }

        if (empty($this->options['element']) or $this->options['get_cloud']) {
            $this->build_cloud($this->get_file_path());

            if (!empty($this->cloud) and empty($this->options['element'])) {
                $this->guess_element();
            }
        }

        $path = $this->get_file_path();
        if ($this->parser_type == 'xmlreader') {
            $this->reader = new XMLReader();
            @$this->reader->open($path);
            @$this->reader->setParserProperty(XMLReader::VALIDATE, false);
        } else {
            $parseroptions = array(
                "uniqueNode" => $this->options['element']
            );
            $CHUNK_SIZE = 1024;
            $streamProvider = new Prewk\XmlStringStreamer\Stream\File($path, $CHUNK_SIZE);
            $parser = new Parser\UniqueNode($parseroptions);
            $this->reader = new XmlStringStreamer($parser, $streamProvider);
        }
    }

    /**
     * starts_with_doctype
     *
     * Reads up to the first 1024 bytes of the file and reports whether they
     * begin with "<!DOCTYPE".
     *
     * @param string $file The filename to inspect
     * @return boolean False as well when the file cannot be opened or read
     * @access private
     */
    private function starts_with_doctype($file) {
        $f = @fopen($file, "rb");
        if ($f === false) {
            // fopen() failed: handing false to fread()/fclose() raises a TypeError on PHP 8
            return false;
        }

        $chunk = @fread($f, 1024);
        @fclose($f);

        return is_string($chunk) && strpos($chunk, "<!DOCTYPE") === 0;
    }

    /**
     * build_cloud
     *
     * Walks the whole document once and fills $this->cloud with the number of
     * occurrences of every element name.
     *
     * @param string $path The path (or php://filter URL) returned by get_file_path()
     * @return void
     * @access private
     */
    private function build_cloud($path) {
        if ($this->parser_type == 'xmlreader') {
            $reader = new XMLReader();
            $reader->open($path);
            $reader->setParserProperty(XMLReader::VALIDATE, false);
            while (@$reader->read()) {
                if ($reader->nodeType != XMLReader::ELEMENT) {
                    continue;
                }
                // undo the "_colon_" placeholder, then store prefixed names as prefix_name
                $key = str_replace(":", "_", str_replace("_colon_", ":", $reader->localName));
                if (array_key_exists($key, $this->cloud)) {
                    $this->cloud[$key]++;
                } else {
                    $this->cloud[$key] = 1;
                }
            }
            unset($reader);
            return;
        }

        $CHUNK_SIZE = 1024;
        $streamProvider = new Prewk\XmlStringStreamer\Stream\File($path, $CHUNK_SIZE);
        $parseroptions = array(
            "extractContainer" => false, // Required option
        );
        // Works like an XmlReader, and walks the XML tree node by node. Captures by node depth setting.
        $parser = new Parser\StringWalker($parseroptions);
        // Create the streamer
        $streamer = new XmlStringStreamer($parser, $streamProvider);
        while ($node = $streamer->getNode()) {
            // nothing to do per node: the stream is only drained here; the counts
            // are read from the parser's cloud property afterwards
        }
        $this->cloud = $parser->cloud;
    }

    /**
     * guess_element
     *
     * Picks $this->options['element'] from a non-empty cloud: the most
     * frequent element whose name is one of the well-known record names, or
     * simply the most frequent element when none of them is present.
     *
     * @return void
     * @access private
     */
    private function guess_element() {
        arsort($this->cloud);
        $main_elements = array('node', 'product', 'job', 'deal', 'entry', 'item', 'property', 'listing', 'hotel', 'record', 'article', 'post', 'book', 'item_0');

        foreach ($this->cloud as $element_name => $value) {
            if (in_array(strtolower($element_name), $main_elements, true)) {
                $this->options['element'] = $element_name;
                break;
            }
        }

        if (empty($this->options['element'])) {
            // fall back to the first (most frequent) entry of the sorted cloud
            reset($this->cloud);
            $this->options['element'] = key($this->cloud);
        }
    }

    /**
     * get_file_path
     *
     * Returns the path the parser should open: either the plain file, or the
     * file wrapped in the 'preprocessxml' read filter.
     *
     * The filter is only used with the 'xmlreader' engine, when
     * $options['filter'] is set, stream filters are available and the
     * 'wp_all_import_is_enabled_stream_filter' hook does not disable it.
     * When HHVM_VERSION is defined the filter is registered but the plain
     * file path is returned.
     *
     * @return string
     * @access public
     */
    public function get_file_path()
    {
        $is_enabled_stream_filter = apply_filters('wp_all_import_is_enabled_stream_filter', true);
        if ( function_exists('stream_filter_register') and $this->options['filter'] and $is_enabled_stream_filter and $this->parser_type == 'xmlreader' )
        {
            stream_filter_register('preprocessxml', 'preprocessXml_filter');
            if (defined('HHVM_VERSION'))
                $path = $this->file;
            else
                $path = 'php://filter/read=preprocessxml/resource=' . $this->file;
        }
        else $path = $this->file;
        return $path;
    }

    /**
     * __destruct
     *
     * Cleans up
     *
     * @return void
     * @author Dom Hastings
     * @access public
     */
    public function __destruct() {
        // drop the reader so the underlying file can be released
        unset($this->reader);
    }

    /**
     * read
     *
     * Reads the next available occurrence of the XML element
     * $this->options['element'], skipping matches until $this->loop has
     * caught up with $this->options['pointer'].
     *
     * @param boolean $debug Not used by this method
     * @return string|boolean The XML string from $this->file, false when nothing (more) was found
     * @author Dom Hastings
     * @access public
     */
    public function read($debug = false) {
        // trim it
        $element = trim($this->options['element']);

        $xml = '';

        if ($this->parser_type == 'xmlreader') {
            $wanted = strtolower($element);
            try {
                while (@$this->reader->read()) {
                    if ($this->reader->nodeType != XMLReader::ELEMENT) {
                        continue;
                    }

                    $localName = str_replace("_colon_", ":", $this->reader->localName);
                    if (strtolower(str_replace(":", "_", $localName)) !== $wanted) {
                        continue;
                    }

                    if ($this->loop < $this->options['pointer']) {
                        // not at the requested occurrence yet: count it and move on.
                        // Guard clauses replace the old switch, where "continue"
                        // targeted the switch and raised a warning on PHP 7.3+.
                        $this->loop++;
                        continue;
                    }

                    $xml = @$this->reader->readOuterXML();
                    break;
                }
            } catch (XmlImportException $e) {
                $xml = false;
            }
        } else {
            $is_preprocess_enabled = apply_filters('is_xml_preprocess_enabled', true);
            while ($xml = $this->reader->getNode()) {
                if ($this->loop < $this->options['pointer']) {
                    $this->loop++;
                    continue;
                }
                if ($is_preprocess_enabled) {
                    // the & symbol is not valid in XML, so replace it with temporary word _ampersand_
                    $escaped = str_replace(":", "_colon_", str_replace("&", "_ampersand_", $xml));
                    $stripped = preg_replace('/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u', ' ', $escaped);
                    // preg_replace() yields null when the node is not valid UTF-8; keep the
                    // escaped node then, instead of reporting "no more nodes" to the caller
                    $xml = ($stripped === null) ? $escaped : $stripped;
                }
                break;
            }
        }

        if (empty($xml)) {
            return false;
        }

        // strip namespace declarations before normalising the prefixed names
        return self::removeColonsFromRSS(preg_replace('%xmlns.*=\s*([\'""]).*\1%sU', '', $xml));
    }

    /**
     * removeColonsFromRSS
     *
     * Restores the "_colon_" placeholders and then replaces the colon of
     * prefixed tag and attribute names with an underscore (<a:b> becomes
     * <a_b>). When preprocessing is enabled the "_ampersand_" placeholders
     * are turned back into "&".
     *
     * @param string $feed The XML fragment to normalise
     * @return string
     * @access public
     */
    public static function removeColonsFromRSS($feed) {

        $feed = str_replace("_colon_", ":", $feed);

        // applied one after another, in this order
        $patterns = array(
            '/(<\w+):([\w+|\.|-]+[ |>]{1})/i', // colons in start tags
            '/(<\/\w+):([\w+|\.|-]+>)/i',      // colons in end tags
            '/(\s+\w+):(\w+[=]{1})/i',         // colons in attribute names
            '/(<\w+):([\w+|\.|-]+\/>)/i',      // colons in self-closing elements
        );
        $feed = preg_replace($patterns, '$1_$2', $feed);

        $is_preprocess_enabled = apply_filters('is_xml_preprocess_enabled', true);
        if ($is_preprocess_enabled) {
            // replace temporary word _ampersand_ back to & symbol
            $feed = str_replace("_ampersand_", "&", $feed);
        }

        // NOTE(review): the return value is ignored, so preprocessXml() presumably
        // takes $feed by reference and fixes standalone & symbols in place - confirm
        PMXI_Import_Record::preprocessXml($feed);

        return $feed;
    }
}
/**
 * Stream filter registered as 'preprocessxml' by PMXI_Chunk::get_file_path().
 *
 * While the file is being read it replaces "&" with "_ampersand_" and ":"
 * with "_colon_", and collapses every run of characters outside the ranges
 * listed in the pattern below into a single space. The whole step is skipped
 * when the 'is_xml_preprocess_enabled' hook returns a falsy value.
 */
class preprocessXml_filter extends php_user_filter {

    /**
     * Processes every bucket of the incoming brigade and passes it on.
     *
     * @param resource $in Incoming bucket brigade
     * @param resource $out Outgoing bucket brigade
     * @param integer $consumed Incremented by the length of each processed bucket
     * @param boolean $closing Not used by this filter
     * @return integer Always PSFS_PASS_ON
     */
    #[\ReturnTypeWillChange]
    public function filter($in, $out, &$consumed, $closing)
    {
        while ($bucket = stream_bucket_make_writeable($in)) {
            $is_preprocess_enabled = apply_filters('is_xml_preprocess_enabled', true);
            if ($is_preprocess_enabled)
            {
                // the & symbol is not valid in XML, so replace it with temporary word _ampersand_
                $escaped = $this->replace_colons(str_replace("&", "_ampersand_", $bucket->data));
                $cleaned = preg_replace('/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u', ' ', $escaped);
                // preg_replace() returns null when the chunk is not valid UTF-8 (for
                // example a multi-byte character cut at a bucket boundary); keep the
                // escaped chunk then instead of assigning null to the bucket data
                $bucket->data = ($cleaned === null) ? $escaped : $cleaned;
            }
            $consumed += $bucket->datalen;
            stream_bucket_append($out, $bucket);
        }
        return PSFS_PASS_ON;
    }

    /**
     * Replaces every ":" with the "_colon_" placeholder.
     *
     * @param string $data
     * @return string
     */
    public function replace_colons($data)
    {
        return str_replace(":", "_colon_", $data);
    }
}