PageRenderTime 50ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/plugins/wp-all-import/classes/chunk.php

https://gitlab.com/hschoenburg/tlworks2
PHP | 392 lines | 244 code | 59 blank | 89 comment | 37 complexity | 975ca1fe2ea62e85dc02bf8a10e5f11d MD5 | raw file
  1. <?php
  2. include __DIR__ . "/XmlStreamReader/autoload.php";
  3. use Prewk\XmlStringStreamer;
  4. use Prewk\XmlStringStreamer\Parser;
  5. use Prewk\XmlStringStreamer\Stream;
  6. /**
  7. * Chunk
  8. *
  9. * Reads a large file in as chunks for easier parsing.
  10. *
  11. *
  12. * @package default
  13. * @author Max Tsiplyakov
  14. */
  15. class PMXI_Chunk {
  16. /**
  17. * options
  18. *
  19. * @var array Contains all major options
  20. * @access public
  21. */
  22. public $options = array(
  23. 'path' => './', // string The path to check for $file in
  24. 'element' => '', // string The XML element to return
  25. 'type' => 'upload',
  26. 'encoding' => 'UTF-8',
  27. 'pointer' => 1,
  28. 'chunkSize' => 1024,
  29. 'filter' => true,
  30. 'get_cloud' => false
  31. );
  32. /**
  33. * file
  34. *
  35. * @var string The filename being read
  36. * @access public
  37. */
  38. public $file = '';
  39. /**
  40. * pointer
  41. *
  42. * @var integer The current position the file is being read from
  43. * @access public
  44. */
  45. public $reader;
  46. public $cloud = array();
  47. public $loop = 1;
  48. public $is_404 = false;
  49. public $parser_type = false;
  50. /**
  51. * handle
  52. *
  53. * @var resource The fopen() resource
  54. * @access private
  55. */
  56. private $handle = null;
  57. /**
  58. * reading
  59. *
  60. * @var boolean Whether the script is currently reading the file
  61. * @access private
  62. */
  63. /**
  64. * __construct
  65. *
  66. * Builds the Chunk object
  67. *
  68. * @param string $file The filename to work with
  69. * @param array $options The options with which to parse the file
  70. * @author Dom Hastings
  71. * @access public
  72. */
  73. public function __construct($file, $options = array(), $parser_type = false) {
  74. // merge the options together
  75. $this->options = array_merge($this->options, (is_array($options) ? $options : array()));
  76. $this->options['chunkSize'] *= PMXI_Plugin::getInstance()->getOption('chunk_size');
  77. // set the filename
  78. $this->file = $file;
  79. $this->parser_type = empty($parser_type) ? 'xmlreader' : $parser_type;
  80. $is_html = false;
  81. $f = @fopen($file, "rb");
  82. while (!@feof($f)) {
  83. $chunk = @fread($f, 1024);
  84. if (strpos($chunk, "<!DOCTYPE") === 0) $is_html = true;
  85. break;
  86. }
  87. @fclose($f);
  88. if ($is_html)
  89. {
  90. $path = $this->get_file_path();
  91. $this->is_404 = true;
  92. $this->reader = new XMLReader();
  93. @$this->reader->open($path);
  94. @$this->reader->setParserProperty(XMLReader::VALIDATE, false);
  95. return;
  96. }
  97. if ( PMXI_Plugin::getInstance()->getOption('force_stream_reader') )
  98. {
  99. $this->parser_type = 'xmlstreamer';
  100. }
  101. else
  102. {
  103. $input = new PMXI_Input();
  104. $import_id = $input->get('id', 0);
  105. if ( empty($import_id)) $import_id = $input->get('import_id', 0);
  106. if ( ! empty($import_id) )
  107. {
  108. $this->parser_type = empty($parser_type) ? 'xmlreader' : $parser_type;
  109. $import = new PMXI_Import_Record();
  110. $import->getById($import_id);
  111. if ( ! $import->isEmpty() ){
  112. $this->parser_type = empty($import->options['xml_reader_engine']) ? 'xmlreader' : 'xmlstreamer';
  113. }
  114. }
  115. else
  116. {
  117. $this->parser_type = empty($parser_type) ? get_option('wpai_parser_type', 'xmlreader') : $parser_type;
  118. }
  119. }
  120. if (empty($this->options['element']) or $this->options['get_cloud'])
  121. {
  122. $path = $this->get_file_path();
  123. if ( $this->parser_type == 'xmlreader' )
  124. {
  125. $reader = new XMLReader();
  126. $reader->open($path);
  127. $reader->setParserProperty(XMLReader::VALIDATE, false);
  128. while ( @$reader->read() ) {
  129. switch ($reader->nodeType) {
  130. case (XMLREADER::ELEMENT):
  131. $localName = str_replace("_colon_", ":", $reader->localName);
  132. if (array_key_exists(str_replace(":", "_", $localName), $this->cloud))
  133. $this->cloud[str_replace(":", "_", $localName)]++;
  134. else
  135. $this->cloud[str_replace(":", "_", $localName)] = 1;
  136. break;
  137. default:
  138. break;
  139. }
  140. }
  141. unset($reader);
  142. }
  143. else
  144. {
  145. $CHUNK_SIZE = 1024;
  146. $streamProvider = new Prewk\XmlStringStreamer\Stream\File($path, $CHUNK_SIZE);
  147. $parseroptions = array(
  148. "extractContainer" => false, // Required option
  149. );
  150. // Works like an XmlReader, and walks the XML tree node by node. Captures by node depth setting.
  151. $parser = new Parser\StringWalker($parseroptions);
  152. // Create the streamer
  153. $streamer = new XmlStringStreamer($parser, $streamProvider);
  154. while ($node = $streamer->getNode()) {
  155. // $simpleXmlNode = simplexml_load_string($node);
  156. // echo (string)$simpleXmlNode->firstName;
  157. }
  158. $this->cloud = $parser->cloud;
  159. }
  160. if ( ! empty($this->cloud) and empty($this->options['element']) ){
  161. arsort($this->cloud);
  162. $main_elements = array('node', 'product', 'job', 'deal', 'entry', 'item', 'property', 'listing', 'hotel', 'record', 'article', 'post', 'book', 'item_0');
  163. foreach ($this->cloud as $element_name => $value) {
  164. if ( in_array(strtolower($element_name), $main_elements) ){
  165. $this->options['element'] = $element_name;
  166. break;
  167. }
  168. }
  169. if (empty($this->options['element'])){
  170. foreach ($this->cloud as $el => $count) {
  171. $this->options['element'] = $el;
  172. break;
  173. }
  174. }
  175. }
  176. }
  177. $path = $this->get_file_path();
  178. if ( $this->parser_type == 'xmlreader' )
  179. {
  180. $this->reader = new XMLReader();
  181. @$this->reader->open($path);
  182. @$this->reader->setParserProperty(XMLReader::VALIDATE, false);
  183. }
  184. else
  185. {
  186. $parseroptions = array(
  187. "uniqueNode" => $this->options['element']
  188. );
  189. $CHUNK_SIZE = 1024;
  190. $streamProvider = new Prewk\XmlStringStreamer\Stream\File($path, $CHUNK_SIZE);
  191. $parser = new Parser\UniqueNode($parseroptions);
  192. $this->reader = new XmlStringStreamer($parser, $streamProvider);
  193. }
  194. }
  195. function get_file_path()
  196. {
  197. $is_enabled_stream_filter = apply_filters('wp_all_import_is_enabled_stream_filter', true);
  198. if ( function_exists('stream_filter_register') and $this->options['filter'] and $is_enabled_stream_filter and $this->parser_type == 'xmlreader' )
  199. {
  200. stream_filter_register('preprocessxml', 'preprocessXml_filter');
  201. if (defined('HHVM_VERSION'))
  202. $path = $this->file;
  203. else
  204. $path = 'php://filter/read=preprocessxml/resource=' . $this->file;
  205. }
  206. else $path = $this->file;
  207. return $path;
  208. }
  209. /**
  210. * __destruct
  211. *
  212. * Cleans up
  213. *
  214. * @return void
  215. * @author Dom Hastings
  216. * @access public
  217. */
  218. public function __destruct() {
  219. // close the file resource
  220. unset($this->reader);
  221. }
  222. /**
  223. * read
  224. *
  225. * Reads the first available occurence of the XML element $this->options['element']
  226. *
  227. * @return string The XML string from $this->file
  228. * @author Dom Hastings
  229. * @access public
  230. */
  231. public function read($debug = false) {
  232. // trim it
  233. $element = trim($this->options['element']);
  234. $xml = '';
  235. if ( $this->parser_type == 'xmlreader' )
  236. {
  237. try {
  238. while ( @$this->reader->read() ) {
  239. switch ($this->reader->nodeType) {
  240. case (XMLREADER::ELEMENT):
  241. $localName = str_replace("_colon_", ":", $this->reader->localName);
  242. if ( strtolower(str_replace(":", "_", $localName)) == strtolower($element) ) {
  243. if ($this->loop < $this->options['pointer']){
  244. $this->loop++;
  245. continue;
  246. }
  247. $xml = @$this->reader->readOuterXML();
  248. break(2);
  249. }
  250. break;
  251. default:
  252. // code ...
  253. break;
  254. }
  255. }
  256. } catch (XmlImportException $e) {
  257. $xml = false;
  258. }
  259. }
  260. else
  261. {
  262. $is_preprocess_enabled = apply_filters('is_xml_preprocess_enabled', true);
  263. while ($xml = $this->reader->getNode()) {
  264. // $simpleXmlNode = simplexml_load_string($node);
  265. // echo (string)$simpleXmlNode->firstName;
  266. if ($this->loop < $this->options['pointer']){
  267. $this->loop++;
  268. continue;
  269. }
  270. if ($is_preprocess_enabled)
  271. {
  272. // the & symbol is not valid in XML, so replace it with temporary word _ampersand_
  273. $xml = str_replace("&", "_ampersand_", $xml);
  274. $xml = preg_replace('/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u', ' ', str_replace(":", "_colon_", $xml));
  275. }
  276. break;
  277. }
  278. }
  279. return ( ! empty($xml) ) ? self::removeColonsFromRSS(preg_replace('%xmlns.*=\s*([\'"&quot;]).*\1%sU', '', $xml)) : false;
  280. }
  281. public static function removeColonsFromRSS($feed) {
  282. $feed = str_replace("_colon_", ":", $feed);
  283. // pull out colons from start tags
  284. // (<\w+):(\w+>)
  285. $pattern = '/(<\w+):([\w+|\.|-]+[ |>]{1})/i';
  286. $replacement = '$1_$2';
  287. $feed = preg_replace($pattern, $replacement, $feed);
  288. // pull out colons from end tags
  289. // (<\/\w+):(\w+>)
  290. $pattern = '/(<\/\w+):([\w+|\.|-]+>)/i';
  291. $replacement = '$1_$2';
  292. $feed = preg_replace($pattern, $replacement, $feed);
  293. // pull out colons from attributes
  294. $pattern = '/(\s+\w+):(\w+[=]{1})/i';
  295. $replacement = '$1_$2';
  296. $feed = preg_replace($pattern, $replacement, $feed);
  297. // pull colons from single element
  298. // (<\w+):(\w+\/>)
  299. $pattern = '/(<\w+):([\w+|\.|-]+\/>)/i';
  300. $replacement = '$1_$2';
  301. $feed = preg_replace($pattern, $replacement, $feed);
  302. $is_preprocess_enabled = apply_filters('is_xml_preprocess_enabled', true);
  303. if ($is_preprocess_enabled)
  304. {
  305. // replace temporary word _ampersand_ back to & symbol
  306. $feed = str_replace("_ampersand_", "&", $feed);
  307. }
  308. // replace all standalone & symbols ( which is not in htmlentities e.q. &nbsp; and not wrapped in CDATA section ) to &amp;
  309. PMXI_Import_Record::preprocessXml($feed);
  310. return $feed;
  311. }
  312. }
  313. class preprocessXml_filter extends php_user_filter {
  314. function filter($in, $out, &$consumed, $closing)
  315. {
  316. while ($bucket = stream_bucket_make_writeable($in)) {
  317. $is_preprocess_enabled = apply_filters('is_xml_preprocess_enabled', true);
  318. if ($is_preprocess_enabled)
  319. {
  320. // the & symbol is not valid in XML, so replace it with temporary word _ampersand_
  321. $bucket->data = str_replace("&", "_ampersand_", $bucket->data);
  322. $bucket->data = preg_replace('/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u', ' ', $this->replace_colons($bucket->data));
  323. }
  324. $consumed += $bucket->datalen;
  325. stream_bucket_append($out, $bucket);
  326. }
  327. return PSFS_PASS_ON;
  328. }
  329. function replace_colons($data)
  330. {
  331. return str_replace(":", "_colon_", $data);
  332. }
  333. }