PageRenderTime 60ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 1ms

/lib/ext/simplehtmldom_1_5/simple_html_dom.php

https://bitbucket.org/pontikis/tolc
PHP | 1393 lines | 1051 code | 151 blank | 191 comment | 247 complexity | fa249f6c0a0bc274631b8d606817f49d MD5 | raw file
Possible License(s): LGPL-2.1

Large files are truncated, but you can click here to view the full file

  1. <?php
  2. /**
  3. * Website: http://sourceforge.net/projects/simplehtmldom/
  4. * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
  5. * Contributions by:
  6. * Yousuke Kumakura (Attribute filters)
  7. * Vadim Voituk (Negative indexes supports of "find" method)
  8. * Antcs (Constructor with automatically load contents either text or file/url)
  9. *
  10. * all affected sections have comments starting with "PaperG"
  11. *
  12. * Paperg - Added case insensitive testing of the value of the selector.
  13. * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately.
  14. * This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noise calls so it will not reflect the REAL position of the tag in the source,
  15. * it will almost always be smaller by some amount.
  16. * We use this to determine how far into the file the tag in question is. This "percentage will never be accurate as the $dom->size is the "real" number of bytes the dom was created from.
  17. * but for most purposes, it's a really good estimation.
  18. * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
  19. * Allow the user to tell us how much they trust the html.
  20. * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.
  21. * This allows for us to find tags based on the text they contain.
  22. * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
  23. * Paperg: added parse_charset so that we know about the character set of the source document.
  24. * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
  25. * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
  26. *
  27. * Licensed under The MIT License
  28. * Redistributions of files must retain the above copyright notice.
  29. *
  30. * @author S.C. Chen <me578022@gmail.com>
  31. * @author John Schlick
  32. * @author Rus Carroll
  33. * @version 1.11 ($Rev: 184 $)
  34. * @package PlaceLocalInclude
  35. * @subpackage simple_html_dom
  36. */
  /**
   * Constants shared by the simple_html_dom classes below.
   *
   * @author S.C. Chen <me578022@gmail.com>
   */

  // Node types, stored in simple_html_dom_node::$nodetype.
  define('HDOM_TYPE_ELEMENT', 1);
  define('HDOM_TYPE_COMMENT', 2);
  define('HDOM_TYPE_TEXT',    3);
  define('HDOM_TYPE_ENDTAG',  4);
  define('HDOM_TYPE_ROOT',    5);
  define('HDOM_TYPE_UNKNOWN', 6);

  // Attribute quoting styles, looked up per attribute when a tag is rebuilt.
  define('HDOM_QUOTE_DOUBLE', 0);
  define('HDOM_QUOTE_SINGLE', 1);
  define('HDOM_QUOTE_NO',     3);

  // Keys of the per-node bookkeeping array simple_html_dom_node::$_.
  define('HDOM_INFO_BEGIN',    0);
  define('HDOM_INFO_END',      1);
  define('HDOM_INFO_QUOTE',    2);
  define('HDOM_INFO_SPACE',    3);
  define('HDOM_INFO_TEXT',     4);
  define('HDOM_INFO_INNER',    5);
  define('HDOM_INFO_OUTER',    6);
  define('HDOM_INFO_ENDSPACE', 7);

  // Defaults used by the helper functions and the dom constructor.
  define('DEFAULT_TARGET_CHARSET', 'UTF-8');
  define('DEFAULT_BR_TEXT', "\r\n");
  60. // helper functions
  61. // -----------------------------------------------------------------------------
  /**
   * Build a DOM from the contents of a file or URL.
   *
   * The first five parameters mirror file_get_contents(); the rest are passed
   * to the simple_html_dom constructor and to simple_html_dom::load().
   *
   * @param string   $url              File name or URL to read.
   * @param bool     $use_include_path Passed to file_get_contents().
   * @param resource $context          Stream context, or null.
   * @param int      $offset           Byte offset to start reading at. Negative
   *                                   values (including the default -1) mean
   *                                   "from the beginning".
   * @param int      $maxLen           Maximum number of bytes to read; a
   *                                   negative value (the default) reads it all.
   * @param bool     $lowercase        Passed to the parser.
   * @param bool     $forceTagsClosed  Passed to the parser.
   * @param string   $target_charset   Passed to the parser.
   * @param bool     $stripRN          Replace \r and \n with spaces before parsing.
   * @param string   $defaultBRText    Passed to the parser.
   * @return simple_html_dom|false     The loaded DOM, or false when nothing could be read.
   */
  function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT)
  {
      // Pass every constructor argument in its declared slot; $stripRN sits
      // between $target_charset and $defaultBRText in the constructor signature.
      $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText);

      // Since PHP 7.1 a negative offset makes file_get_contents() seek from the
      // END of the stream, so the historical default of -1 would return only the
      // last byte. Treat any negative offset as "start of file".
      $start = ($offset < 0) ? 0 : $offset;

      // Only hand a length to file_get_contents() when the caller set one:
      // a negative length is rejected by newer PHP versions.
      if ($maxLen < 0) {
          $contents = file_get_contents($url, $use_include_path, $context, $start);
      } else {
          $contents = file_get_contents($url, $use_include_path, $context, $start, $maxLen);
      }

      if (empty($contents))
      {
          // Release the unused DOM, as str_get_html() does.
          $dom->clear();
          return false;
      }

      $dom->load($contents, $lowercase, $stripRN, $defaultBRText);
      return $dom;
  }
  /**
   * Build a DOM from an HTML string.
   *
   * @param string $str             HTML source.
   * @param bool   $lowercase       Passed to the parser.
   * @param bool   $forceTagsClosed Passed to the parser.
   * @param string $target_charset  Passed to the parser.
   * @param bool   $stripRN         Replace \r and \n with spaces before parsing.
   * @param string $defaultBRText   Passed to the parser.
   * @return simple_html_dom|false  The loaded DOM, or false when $str is empty.
   */
  function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT)
  {
      // Pass every constructor argument in its declared slot; previously
      // $defaultBRText landed in the constructor's $stripRN parameter.
      $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText);
      if (empty($str))
      {
          $dom->clear();
          return false;
      }
      // Forward $defaultBRText too, otherwise load() falls back to its own default.
      $dom->load($str, $lowercase, $stripRN, $defaultBRText);
      return $dom;
  }
  /**
   * Echo the tree below $node by delegating to the node's own dump() method.
   *
   * @param object $node      Node to dump; must provide dump($show_attr, $deep).
   * @param bool   $show_attr Whether attributes are printed.
   * @param int    $deep      Starting indentation depth.
   */
  function dump_html_tree($node, $show_attr=true, $deep=0)
  {
      // Forward the caller's options; the node itself must not be passed as
      // the first argument, since dump() would read it as $show_attr.
      $node->dump($show_attr, $deep);
  }
  /**
   * A single node of the parsed HTML tree.
   *
   * Every node registers itself in the owning dom's $nodes list on
   * construction. Per-node bookkeeping lives in $_ and is keyed by the
   * HDOM_INFO_* constants.
   *
   * PaperG - added ability for "find" routine to lowercase the value of the selector.
   * PaperG - added $tag_start to track the start position of the tag in the total byte index
   *
   * @package PlaceLocalInclude
   */
  class simple_html_dom_node {
      public $nodetype = HDOM_TYPE_TEXT;
      public $tag = 'text';
      public $attr = array();
      public $children = array();
      public $nodes = array();
      public $parent = null;
      public $_ = array();
      public $tag_start = 0;
      private $dom = null;

      /**
       * @param object $dom Owning dom; this node is appended to its $nodes list.
       */
      function __construct($dom)
      {
          $this->dom = $dom;
          $dom->nodes[] = $this;
      }

      function __destruct()
      {
          $this->clear();
      }

      function __toString()
      {
          return $this->outertext();
      }

      // clean up memory due to php5 circular references memory leak...
      function clear()
      {
          $this->dom = null;
          $this->nodes = null;
          $this->parent = null;
          $this->children = null;
      }

      /**
       * Echo this node and, recursively, everything in $this->nodes.
       *
       * @param bool $show_attr Whether attributes are printed.
       * @param int  $deep      Indentation depth of this node.
       */
      function dump($show_attr=true, $deep=0)
      {
          $lead = str_repeat(' ', $deep);
          echo $lead.$this->tag;
          if ($show_attr && count($this->attr)>0)
          {
              echo '(';
              foreach ($this->attr as $k=>$v)
                  echo "[$k]=>\"".$this->$k.'", ';
              echo ')';
          }
          echo "\n";
          foreach ($this->nodes as $c)
              $c->dump($show_attr, $deep+1);
      }

      // Debugging function to dump a single dom node with a bunch of information about it.
      function dump_node()
      {
          echo $this->tag;
          if (count($this->attr)>0)
          {
              echo '(';
              foreach ($this->attr as $k=>$v)
              {
                  echo "[$k]=>\"".$this->$k.'", ';
              }
              echo ')';
          }
          // Guard on the array that is actually printed: a node without
          // attributes can still carry bookkeeping entries in $_.
          if (count($this->_)>0)
          {
              echo ' $_ (';
              foreach ($this->_ as $k=>$v)
              {
                  if (is_array($v))
                  {
                      echo "[$k]=>(";
                      foreach ($v as $k2=>$v2)
                      {
                          echo "[$k2]=>\"".$v2.'", ';
                      }
                      echo ")";
                  } else {
                      echo "[$k]=>\"".$v.'", ';
                  }
              }
              echo ")";
          }
          if (isset($this->text))
          {
              echo " text: (" . $this->text . ")";
          }
          echo " children: " . count($this->children);
          echo " nodes: " . count($this->nodes);
          echo " tag_start: " . $this->tag_start;
          echo "\n";
      }

      // returns the parent of node
      function parent()
      {
          return $this->parent;
      }

      /**
       * Return all children, or the child at $idx.
       *
       * @param int $idx -1 (default) for the whole list, otherwise an index.
       * @return array|simple_html_dom_node|null null when $idx does not exist.
       */
      function children($idx=-1)
      {
          if ($idx===-1) return $this->children;
          if (isset($this->children[$idx])) return $this->children[$idx];
          return null;
      }

      // returns the first child of node, or null when there is none
      function first_child()
      {
          if (count($this->children)>0) return $this->children[0];
          return null;
      }

      // returns the last child of node, or null when there is none
      function last_child()
      {
          if (($count=count($this->children))>0) return $this->children[$count-1];
          return null;
      }

      // returns the next sibling of node, or null when there is none
      function next_sibling()
      {
          if ($this->parent===null) return null;
          $idx = 0;
          $count = count($this->parent->children);
          // Locate this node in the parent's child list.
          while ($idx<$count && $this!==$this->parent->children[$idx])
              ++$idx;
          if (++$idx>=$count) return null;
          return $this->parent->children[$idx];
      }

      // returns the previous sibling of node, or null when there is none
      function prev_sibling()
      {
          if ($this->parent===null) return null;
          $idx = 0;
          $count = count($this->parent->children);
          // Locate this node in the parent's child list.
          while ($idx<$count && $this!==$this->parent->children[$idx])
              ++$idx;
          if (--$idx<0) return null;
          return $this->parent->children[$idx];
      }

      /**
       * Walk from this node towards the root and return the first node
       * (this node included) whose tag equals $tag.
       *
       * @param string $tag Tag name to look for.
       * @return simple_html_dom_node|null null when no ancestor matches.
       */
      function find_ancestor_tag($tag)
      {
          global $debugObject;
          if (is_object($debugObject))
          {
              $debugObject->debugLogEntry(1);
          }
          // Start by including ourselves in the comparison.
          $returnDom = $this;
          while (!is_null($returnDom))
          {
              if (is_object($debugObject))
              {
                  $debugObject->debugLog(2, "Current tag is: " . $returnDom->tag);
              }
              if ($returnDom->tag == $tag)
              {
                  break;
              }
              $returnDom = $returnDom->parent;
          }
          return $returnDom;
      }

      // get dom node's inner html
      function innertext()
      {
          // An explicitly assigned inner text wins over the parsed content.
          if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
          if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
          $ret = '';
          foreach ($this->nodes as $n)
              $ret .= $n->outertext();
          return $ret;
      }

      // get dom node's outer text (with tag)
      function outertext()
      {
          global $debugObject;
          if (is_object($debugObject))
          {
              $text = '';
              if ($this->tag == 'text')
              {
                  if (!empty($this->text))
                  {
                      $text = " with text: " . $this->text;
                  }
              }
              $debugObject->debugLog(1, 'Innertext of tag: ' . $this->tag . $text);
          }
          if ($this->tag==='root') return $this->innertext();
          // trigger callback
          if ($this->dom && $this->dom->callback!==null)
          {
              call_user_func_array($this->dom->callback, array($this));
          }
          if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
          if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
          // render begin tag; isset() avoids undefined-index notices for nodes
          // that have no begin position recorded.
          if ($this->dom && isset($this->_[HDOM_INFO_BEGIN]) && isset($this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]))
          {
              $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
          } else {
              $ret = "";
          }
          // render inner text
          if (isset($this->_[HDOM_INFO_INNER]))
          {
              // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
              if ($this->tag != "br")
              {
                  $ret .= $this->_[HDOM_INFO_INNER];
              }
          } else {
              if ($this->nodes)
              {
                  foreach ($this->nodes as $n)
                  {
                      $ret .= $this->convert_text($n->outertext());
                  }
              }
          }
          // render end tag
          if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
              $ret .= '</'.$this->tag.'>';
          return $ret;
      }

      // get dom node's plain text
      function text()
      {
          if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
          switch ($this->nodetype)
          {
              case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
              case HDOM_TYPE_COMMENT: return '';
              case HDOM_TYPE_UNKNOWN: return '';
          }
          // script and style contents are not treated as text
          if (strcasecmp($this->tag, 'script')===0) return '';
          if (strcasecmp($this->tag, 'style')===0) return '';
          $ret = '';
          // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL.
          // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening.
          // WHY is this happening?
          if (!is_null($this->nodes))
          {
              foreach ($this->nodes as $n)
              {
                  $ret .= $this->convert_text($n->text());
              }
          }
          return $ret;
      }

      // inner text with the CDATA section markers removed
      function xmltext()
      {
          $ret = $this->innertext();
          $ret = str_ireplace('<![CDATA[', '', $ret);
          $ret = str_replace(']]>', '', $ret);
          return $ret;
      }

      // build node's text with tag
      function makeup()
      {
          // text, comment, unknown
          if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
          $ret = '<'.$this->tag;
          // $i tracks the attribute's position, which indexes the recorded
          // spacing and quoting information.
          $i = -1;
          foreach ($this->attr as $key=>$val)
          {
              ++$i;
              // skip removed attribute
              if ($val===null || $val===false)
                  continue;
              $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
              //no value attr: nowrap, checked selected...
              if ($val===true)
                  $ret .= $key;
              else {
                  switch ($this->_[HDOM_INFO_QUOTE][$i])
                  {
                      case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                      case HDOM_QUOTE_SINGLE: $quote = '\''; break;
                      default: $quote = '';
                  }
                  $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
              }
          }
          $ret = $this->dom->restore_noise($ret);
          return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
      }

      /**
       * Find descendant nodes by CSS-like selector.
       * PaperG - added ability for find to lowercase the value of the selector.
       *
       * @param string   $selector  Selector string; comma separates alternatives.
       * @param int|null $idx       null for all matches, otherwise the index of the
       *                            wanted match (negative counts from the end).
       * @param bool     $lowercase Compare selector values case-insensitively.
       * @return array|simple_html_dom_node|null Array when $idx is null, else one node or null.
       */
      function find($selector, $idx=null, $lowercase=false)
      {
          $selectors = $this->parse_selector($selector);
          if (($count=count($selectors))===0) return array();
          $found_keys = array();
          // find each selector
          for ($c=0; $c<$count; ++$c)
          {
              // The change on the below line was documented on the sourceforge code tracker id 2788009
              // used to be: if (($levle=count($selectors[0]))===0) return array();
              if (($levle=count($selectors[$c]))===0) return array();
              if (!isset($this->_[HDOM_INFO_BEGIN])) return array();
              $head = array($this->_[HDOM_INFO_BEGIN]=>1);
              // handle descendant selectors, no recursive!
              for ($l=0; $l<$levle; ++$l)
              {
                  $ret = array();
                  foreach ($head as $k=>$v)
                  {
                      // key -1 is the root, which is not addressed through the node list
                      $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
                      //PaperG - Pass this optional parameter on to the seek function.
                      $n->seek($selectors[$c][$l], $ret, $lowercase);
                  }
                  $head = $ret;
              }
              foreach ($head as $k=>$v)
              {
                  if (!isset($found_keys[$k]))
                      $found_keys[$k] = 1;
              }
          }
          // sort keys
          ksort($found_keys);
          $found = array();
          foreach ($found_keys as $k=>$v)
              $found[] = $this->dom->nodes[$k];
          // return nth-element or array
          if (is_null($idx)) return $found;
          else if ($idx<0) $idx = count($found) + $idx;
          return (isset($found[$idx])) ? $found[$idx] : null;
      }

      /**
       * Collect the node-list indexes of nodes below this one that satisfy a
       * single parsed selector step.
       * PaperG - added parameter to allow for case insensitive testing of the value of a selector.
       *
       * @param array $selector  array($tag, $key, $val, $exp, $no_key) from parse_selector().
       * @param array $ret       Output: matching indexes are added as keys (by reference).
       * @param bool  $lowercase Compare values case-insensitively.
       */
      protected function seek($selector, &$ret, $lowercase=false)
      {
          global $debugObject;
          if (is_object($debugObject))
          {
              $debugObject->debugLogEntry(1);
          }
          list($tag, $key, $val, $exp, $no_key) = $selector;
          // xpath index
          if ($tag && $key && is_numeric($key))
          {
              $count = 0;
              foreach ($this->children as $c)
              {
                  if ($tag==='*' || $tag===$c->tag) {
                      if (++$count==$key) {
                          $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                          return;
                      }
                  }
              }
              return;
          }
          $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
          if ($end==0) {
              // No end recorded for this node: borrow it from the nearest
              // ancestor that has one, stepping back once per level climbed.
              $parent = $this->parent;
              // Test for null first so a detached chain is never dereferenced.
              while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
                  $end -= 1;
                  $parent = $parent->parent;
              }
              if ($parent!==null) {
                  $end += $parent->_[HDOM_INFO_END];
              }
          }
          for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
              $node = $this->dom->nodes[$i];
              $pass = true;
              if ($tag==='*' && !$key) {
                  if (in_array($node, $this->children, true))
                      $ret[$i] = 1;
                  continue;
              }
              // compare tag
              if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
              // compare key
              if ($pass && $key) {
                  if ($no_key) {
                      if (isset($node->attr[$key])) $pass=false;
                  } else {
                      if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
                  }
              }
              // compare value
              if ($pass && $key && $val && $val!=='*') {
                  // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
                  if ($key == "plaintext") {
                      // $node->plaintext actually returns $node->text();
                      $nodeKeyValue = $node->text();
                  } else {
                      // this is a normal search, we want the value of that attribute of the tag.
                      $nodeKeyValue = $node->attr[$key];
                  }
                  if (is_object($debugObject)) {$debugObject->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}
                  //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
                  if ($lowercase) {
                      $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
                  } else {
                      $check = $this->match($exp, $val, $nodeKeyValue);
                  }
                  if (is_object($debugObject)) {$debugObject->debugLog(2, "after match: " . ($check ? "true" : "false"));}
                  // handle multiple class
                  if (!$check && strcasecmp($key, 'class')===0) {
                      foreach (explode(' ',$node->attr[$key]) as $k) {
                          // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
                          if (!empty($k)) {
                              if ($lowercase) {
                                  $check = $this->match($exp, strtolower($val), strtolower($k));
                              } else {
                                  $check = $this->match($exp, $val, $k);
                              }
                              if ($check) break;
                          }
                      }
                  }
                  if (!$check) $pass = false;
              }
              if ($pass) $ret[$i] = 1;
              unset($node);
          }
          // It's passed by reference so this is actually what this function returns.
          if (is_object($debugObject)) {$debugObject->debugLog(1, "EXIT - ret: ", $ret);}
      }

      /**
       * Test $value against $pattern using the selector operator $exp.
       *
       * @param string $exp     One of '=', '!=', '^=', '$=', '*='.
       * @param string $pattern Value from the selector.
       * @param string $value   Value found on the node.
       * @return bool|int Truthy on match; false for an unknown operator.
       */
      protected function match($exp, $pattern, $value) {
          global $debugObject;
          if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}
          switch ($exp) {
              case '=':
                  return ($value===$pattern);
              case '!=':
                  return ($value!==$pattern);
              case '^=':
                  return preg_match("/^".preg_quote($pattern,'/')."/", $value);
              case '$=':
                  return preg_match("/".preg_quote($pattern,'/')."$/", $value);
              case '*=':
                  // A pattern that already starts with '/' is used as a complete regex.
                  if ($pattern[0]=='/') {
                      return preg_match($pattern, $value);
                  }
                  return preg_match("/".$pattern."/i", $value);
          }
          return false;
      }

      /**
       * Split a selector string into alternatives (comma separated), each a
       * list of steps of the form array($tag, $key, $val, $exp, $no_key).
       *
       * @param string $selector_string Raw selector.
       * @return array List of alternatives, each a list of steps.
       */
      protected function parse_selector($selector_string) {
          global $debugObject;
          if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}
          // pattern of CSS selectors, modified from mootools
          // Paperg: Add the colon to the attribute, so that it properly finds <tag attr:ibute="something" > like google does.
          // Note: if you try to look at this attribute, you MUST use getAttribute since $dom->x:y will fail the php syntax check.
          // The \[@? part means an attribute selector may start with an @ sign that is NOT captured by the expression.
          // Further study is required to determine if this should be documented or removed.
          // The hyphens inside the character classes are escaped: PCRE2 (PHP >= 7.3)
          // refuses "[\w-:]" with "invalid range in character class", which made
          // every selector fail to parse.
          $pattern = "/([\w\-:\*]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
          preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
          if (is_object($debugObject)) {$debugObject->debugLog(2, "Matches Array: ", $matches);}
          $selectors = array();
          $result = array();
          foreach ($matches as $m) {
              $m[0] = trim($m[0]);
              if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
              // for browser generated xpath
              if ($m[1]==='tbody') continue;
              list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
              if (!empty($m[2])) {$key='id'; $val=$m[2];}
              if (!empty($m[3])) {$key='class'; $val=$m[3];}
              if (!empty($m[4])) {$key=$m[4];}
              if (!empty($m[5])) {$exp=$m[5];}
              if (!empty($m[6])) {$val=$m[6];}
              // convert to lowercase; the cast keeps a missing key as '' without
              // passing null to strtolower()
              if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower((string) $key);}
              //elements that do NOT have the specified attribute
              if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}
              $result[] = array($tag, $key, $val, $exp, $no_key);
              // a comma closes the current alternative
              if (trim($m[7])===',') {
                  $selectors[] = $result;
                  $result = array();
              }
          }
          if (count($result)>0)
              $selectors[] = $result;
          return $selectors;
      }

      /**
       * Magic read: an attribute value (charset-converted), one of the virtual
       * properties outertext/innertext/plaintext/xmltext, or otherwise whether
       * the attribute key exists.
       */
      function __get($name) {
          if (isset($this->attr[$name]))
          {
              return $this->convert_text($this->attr[$name]);
          }
          switch ($name) {
              case 'outertext': return $this->outertext();
              case 'innertext': return $this->innertext();
              case 'plaintext': return $this->text();
              case 'xmltext': return $this->xmltext();
              default: return array_key_exists($name, $this->attr);
          }
      }

      /**
       * Magic write: overrides outertext/innertext, or sets an attribute.
       */
      function __set($name, $value) {
          switch ($name) {
              case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
              case 'innertext':
                  if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
                  return $this->_[HDOM_INFO_INNER] = $value;
          }
          // New attribute: record default spacing and quoting for makeup().
          if (!isset($this->attr[$name])) {
              $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
              $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
          }
          $this->attr[$name] = $value;
      }

      function __isset($name) {
          switch ($name) {
              case 'outertext': return true;
              case 'innertext': return true;
              case 'plaintext': return true;
          }
          //no value attr: nowrap, checked selected...
          return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
      }

      function __unset($name) {
          if (isset($this->attr[$name]))
              unset($this->attr[$name]);
      }

      // PaperG - Function to convert the text from one character set to another if the two sets are not the same.
      function convert_text($text) {
          global $debugObject;
          if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}
          $converted_text = $text;
          $sourceCharset = "";
          $targetCharset = "";
          if ($this->dom) {
              $sourceCharset = strtoupper($this->dom->_charset);
              $targetCharset = strtoupper($this->dom->_target_charset);
          }
          if (is_object($debugObject)) {$debugObject->debugLog(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}
          if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))
          {
              // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
              if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text)))
              {
                  $converted_text = $text;
              }
              else
              {
                  $converted_text = iconv($sourceCharset, $targetCharset, $text);
              }
          }
          return $converted_text;
      }

      /**
       * Report whether $string is well-formed UTF-8.
       *
       * The previous utf8_encode(utf8_decode()) round trip only survived for
       * characters up to U+00FF, so valid UTF-8 containing any other character
       * was reported as not UTF-8. An empty pattern with the "u" modifier makes
       * PCRE validate the whole subject instead.
       *
       * @param string $string Text to check.
       * @return bool
       */
      function is_utf8($string)
      {
          return preg_match('//u', (string) $string) === 1;
      }

      // camel naming conventions
      function getAllAttributes() {return $this->attr;}
      function getAttribute($name) {return $this->__get($name);}
      function setAttribute($name, $value) {$this->__set($name, $value);}
      function hasAttribute($name) {return $this->__isset($name);}
      function removeAttribute($name) {$this->__set($name, null);}
      function getElementById($id) {return $this->find("#$id", 0);}
      function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
      function getElementByTagName($name) {return $this->find($name, 0);}
      function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);}
      function parentNode() {return $this->parent();}
      function childNodes($idx=-1) {return $this->children($idx);}
      function firstChild() {return $this->first_child();}
      function lastChild() {return $this->last_child();}
      function nextSibling() {return $this->next_sibling();}
      function previousSibling() {return $this->prev_sibling();}
  }
  669. /**
  670. * simple html dom parser
  671. * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
  672. * Paperg - change $size from protected to public so we can easily access it
  673. * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
  674. *
  675. * @package PlaceLocalInclude
  676. */
  677. class simple_html_dom {
  678. public $root = null;
  679. public $nodes = array();
  680. public $callback = null;
  681. public $lowercase = false;
  682. public $size;
  683. protected $pos;
  684. protected $doc;
  685. protected $char;
  686. protected $cursor;
  687. protected $parent;
  688. protected $noise = array();
  689. protected $token_blank = " \t\r\n";
  690. protected $token_equal = ' =/>';
  691. protected $token_slash = " />\r\n\t";
  692. protected $token_attr = ' >';
  693. protected $_charset = '';
  694. protected $_target_charset = '';
  695. protected $default_br_text = "";
  696. // use isset instead of in_array, performance boost about 30%...
  697. protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1);
  698. protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);
  699. // Known sourceforge issue #2977341
  700. // B tags that are not closed cause us to return everything to the end of the document.
  701. protected $optional_closing_tags = array(
  702. 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
  703. 'th'=>array('th'=>1),
  704. 'td'=>array('td'=>1),
  705. 'li'=>array('li'=>1),
  706. 'dt'=>array('dt'=>1, 'dd'=>1),
  707. 'dd'=>array('dd'=>1, 'dt'=>1),
  708. 'dl'=>array('dd'=>1, 'dt'=>1),
  709. 'p'=>array('p'=>1),
  710. 'nobr'=>array('nobr'=>1),
  711. 'b'=>array('b'=>1),
  712. );
  /**
   * Create a parser and optionally load a document straight away.
   *
   * @param string|null $str             HTML text, a file name or an http(s) URL; null or empty loads nothing.
   * @param bool        $lowercase       Passed to load() for HTML text.
   * @param bool        $forceTagsClosed When false, the optional-closing-tag table is emptied
   *                                     (i.e. the HTML is trusted as written).
   * @param string      $target_charset  Charset that output text is converted to.
   * @param bool        $stripRN         Passed to load() for HTML text.
   * @param string      $defaultBRText   Passed to load() for HTML text.
   */
  function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT) {
      // Apply the parser settings BEFORE loading, so they affect the parse below.
      // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
      if (!$forceTagsClosed) {
          // The property is $optional_closing_tags; assigning to a differently
          // named property would leave the table untouched.
          $this->optional_closing_tags = array();
      }
      $this->_target_charset = $target_charset;

      if ($str) {
          // URLs (http or https) and existing files are read from disk/network,
          // anything else is treated as HTML source.
          if (preg_match("/^https?:\/\//i", $str) || is_file($str))
              $this->load_file($str);
          else
              $this->load($str, $lowercase, $stripRN, $defaultBRText);
      }
  }

  // Release node references when the parser goes away.
  function __destruct() {
      $this->clear();
  }
  /**
   * Load and parse HTML from a string.
   *
   * The document is prepared, "noise" (comments, CDATA, scripts, styles,
   * <code> blocks, server-side and smarty tags) is taken out via
   * remove_noise(), and parse() is then called until it returns a falsy value.
   *
   * @param string $str           HTML source.
   * @param bool   $lowercase     Passed to prepare().
   * @param bool   $stripRN       Passed to prepare().
   * @param string $defaultBRText Passed to prepare().
   */
  function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT) {
      global $debugObject;

      $this->prepare($str, $lowercase, $stripRN, $defaultBRText);

      // Each entry: array(pattern, whether remove_noise() gets "true" as its second argument).
      // Order matters. Per sourceforge
      // http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
      // script tag removal precedes style tag removal.
      $noisePatterns = array(
          // comments
          array("'<!--(.*?)-->'is", false),
          // cdata
          array("'<!\[CDATA\[(.*?)\]\]>'is", true),
          // <script> tags
          array("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is", false),
          array("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is", false),
          // <style> tags
          array("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is", false),
          array("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is", false),
          // preformatted tags
          array("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is", false),
          // server side scripts
          array("'(<\?)(.*?)(\?>)'s", true),
          // smarty scripts
          array("'(\{\w)(.*?)(\})'s", true),
      );
      foreach ($noisePatterns as $entry) {
          if ($entry[1]) {
              $this->remove_noise($entry[0], true);
          } else {
              // single-argument call, so remove_noise() applies its own default
              $this->remove_noise($entry[0]);
          }
      }

      // parsing
      do {
          $more = $this->parse();
      } while ($more);

      // end
      $this->root->_[HDOM_INFO_END] = $this->cursor;
      $this->parse_charset();
  }
  /**
   * Load and parse HTML from a file or URL.
   *
   * Accepts the same arguments as file_get_contents().
   *
   * @return false|null false when the file could not be read (the dom is
   *                    cleared in that case); nothing on success.
   */
  function load_file() {
      $args = func_get_args();
      $contents = call_user_func_array('file_get_contents', $args);

      // Decide on the actual read result. error_get_last() also reports
      // unrelated errors raised earlier in the request, which used to wipe a
      // successfully loaded document.
      if ($contents === false) {
          $this->clear();
          return false;
      }

      $this->load($contents, true);
  }
  /**
   * Register a callable that nodes invoke (with the node as argument) while
   * rendering their outer text.
   *
   * @param callable $function_name
   */
  function set_callback($function_name) {
      $this->callback = $function_name;
  }

  // Drop the callback set with set_callback().
  function remove_callback() {
      $this->callback = null;
  }

  /**
   * Serialise the document back to a string, optionally writing it to a file.
   *
   * @param string $filepath When not '', the markup is also written there (with LOCK_EX).
   * @return string The root's inner text.
   */
  function save($filepath='') {
      $markup = $this->root->innertext();
      if ($filepath === '') {
          return $markup;
      }
      file_put_contents($filepath, $markup, LOCK_EX);
      return $markup;
  }

  /**
   * Find nodes by css selector, starting at the root node.
   * Paperg - allow us to specify that we want case insensitive testing of the value of the selector.
   *
   * @param string   $selector
   * @param int|null $idx
   * @param bool     $lowercase
   * @return mixed Whatever the root node's find() returns.
   */
  function find($selector, $idx=null, $lowercase=false) {
      $rootNode = $this->root;
      return $rootNode->find($selector, $idx, $lowercase);
  }
  /**
   * Break the references between the dom and its nodes.
   * (clean up memory due to php5 circular references memory leak...)
   */
  function clear() {
      foreach ($this->nodes as $node) {
          $node->clear();
          $node = null;
      }
      // This add next line is documented in the sourceforge repository. 2977248 as a fix for ongoing memory leaks that occur even with the use of clear.
      if (isset($this->children)) {
          foreach ($this->children as $child) {
              $child->clear();
              $child = null;
          }
      }
      if (isset($this->parent)) {
          $this->parent->clear();
          unset($this->parent);
      }
      if (isset($this->root)) {
          $this->root->clear();
          unset($this->root);
      }
      unset($this->doc);
      unset($this->noise);
  }

  /**
   * Echo the whole tree, starting at the root node.
   *
   * @param bool $show_attr Whether attributes are printed.
   */
  function dump($show_attr=true) {
      $rootNode = $this->root;
      $rootNode->dump($show_attr);
  }
  /**
   * Reset the parser state and install $str as the document to parse.
   *
   * @param string $str           HTML source.
   * @param bool   $lowercase     Stored in $this->lowercase.
   * @param bool   $stripRN       When true, every \r and \n is replaced by a space.
   * @param string $defaultBRText Stored in $this->default_br_text.
   */
  protected function prepare($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT) {
      $this->clear();

      // Record the length of the content before anything is done to it.
      $this->size = strlen($str);

      // Strip out the \r \n's before the string is stored, if we are told to.
      if ($stripRN) {
          $str = str_replace(array("\r", "\n"), " ", $str);
      }

      // parser state
      $this->doc = $str;
      $this->pos = 0;
      $this->cursor = 1;
      $this->noise = array();
      $this->nodes = array();
      $this->lowercase = $lowercase;
      $this->default_br_text = $defaultBRText;

      // fresh root node, which is also the current parent
      $rootNode = new simple_html_dom_node($this);
      $rootNode->tag = 'root';
      $rootNode->_[HDOM_INFO_BEGIN] = -1;
      $rootNode->nodetype = HDOM_TYPE_ROOT;
      $this->root = $rootNode;
      $this->parent = $rootNode;

      // prime the current character when there is any content
      if ($this->size > 0) {
          $this->char = $this->doc[0];
      }
  }
  // Parse the next piece of HTML content.
  // Text in front of the next '<' becomes a text node and true is returned;
  // when there is no such text, the result of read_tag() is returned instead.
  protected function parse()
  {
      $text = $this->copy_until_char('<');
      if ($text === '') {
          return $this->read_tag();
      }

      // text
      $text_node = new simple_html_dom_node($this);
      ++$this->cursor;
      $text_node->_[HDOM_INFO_TEXT] = $text;
      $this->link_nodes($text_node, false);

      return true;
  }
  // PaperG - dkchou - added this to try to identify the character set of the page we have just parsed,
  // so we know better how to spit it out later.
  // NOTE: If you provide a routine called get_last_retrieve_url_contents_content_type which returns the
  // CURLINFO_CONTENT_TYPE from the last curl_exec (or the content_type header from the last transfer),
  // we will parse THAT, and if a charset is specified, we will use it over any other mechanism.
  //
  // Lookup order:
  //   1. the content-type header supplied by the optional routine above,
  //   2. a <meta http-equiv=Content-Type> tag in the parsed document,
  //   3. mb_detect_encoding() on the document's plain text, falling back to UTF-8.
  // ISO-8859-1 / Latin1 / Latin-1 are reported as CP1252.
  // The result is stored in $this->_charset and returned.
  protected function parse_charset()
  {
      global $debugObject;

      $charset = null;

      if (function_exists('get_last_retrieve_url_contents_content_type'))
      {
          $contentTypeHeader = get_last_retrieve_url_contents_content_type();
          $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
          if ($success)
          {
              $charset = $matches[1];
              if (is_object($debugObject)) {$debugObject->debugLog(2, 'header content-type found charset of: ' . $charset);}
          }
      }

      if (empty($charset))
      {
          $el = $this->root->find('meta[http-equiv=Content-Type]',0);
          if (!empty($el))
          {
              $fullvalue = $el->content;
              // The original logged the undefined variable $fullValue here (wrong case).
              if (is_object($debugObject)) {$debugObject->debugLog(2, 'meta content-type tag found' . $fullvalue);}

              if (!empty($fullvalue))
              {
                  $success = preg_match('/charset=(.+)/', $fullvalue, $matches);
                  if ($success)
                  {
                      $charset = $matches[1];
                  }
                  else
                  {
                      // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
                      if (is_object($debugObject)) {$debugObject->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
                      $charset = 'ISO-8859-1';
                  }
              }
          }
      }

      // If we couldn't find a charset above, then let's try to detect one based on the text we got...
      if (empty($charset))
      {
          // Have php try to detect the encoding from the text given to us.
          $charset = mb_detect_encoding($this->root->plaintext . "ascii", array( "UTF-8", "CP1252" ) );
          if (is_object($debugObject)) {$debugObject->debugLog(2, 'mb_detect found: ' . $charset);}

          // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...
          if ($charset === false)
          {
              if (is_object($debugObject)) {$debugObject->debugLog(2, 'since mb_detect failed - using default of utf-8');}
              $charset = 'UTF-8';
          }
      }

      // Since CP1252 is a superset, if we get one of its subsets, we want it instead.
      if (in_array(strtolower($charset), array('iso-8859-1', 'latin1', 'latin-1'), true))
      {
          if (is_object($debugObject)) {$debugObject->debugLog(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
          $charset = 'CP1252';
      }

      if (is_object($debugObject)) {$debugObject->debugLog(1, 'EXIT - ' . $charset);}

      return $this->_charset = $charset;
  }
  // Read the tag that starts at the current '<' and link the resulting node
  // into the tree. The cases are handled in this order:
  //   - end tags ("</x>"), including unwinding of the open-parent chain,
  //   - "<!...>" constructs (comments and doctype/cdata-like declarations),
  //   - a name that runs into another '<' (kept as text),
  //   - a name that is not a valid tag name (kept as text),
  //   - a regular begin tag with its attributes.
  // Returns false, after recording the root's end cursor, when the current
  // character is not '<'. Otherwise returns true, except for unmatched end tags
  // where the result of as_text_node() is returned.
  protected function read_tag() {
      if ($this->char!=='<') {
          $this->root->_[HDOM_INFO_END] = $this->cursor;
          return false;
      }
      $begin_tag_pos = $this->pos;
      $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

      // end tag
      if ($this->char==='/') {
          $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
          // This represents the change in the simple_html_dom trunk from revision 180 to 181.
          // $this->skip($this->token_blank_t);
          $this->skip($this->token_blank);
          $tag = $this->copy_until_char('>');

          // skip attributes in end tag
          if (($pos = strpos($tag, ' '))!==false)
              $tag = substr($tag, 0, $pos);

          $parent_lower = strtolower($this->parent->tag);
          $tag_lower = strtolower($tag);

          // The end tag does not close the current parent: look further up the chain.
          if ($parent_lower!==$tag_lower) {
              if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                  $this->parent->_[HDOM_INFO_END] = 0;
                  $org_parent = $this->parent;

                  while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                      $this->parent = $this->parent->parent;

                  if (strtolower($this->parent->tag)!==$tag_lower) {
                      $this->parent = $org_parent; // restore original parent
                      if ($this->parent->parent) $this->parent = $this->parent->parent;
                      $this->parent->_[HDOM_INFO_END] = $this->cursor;
                      return $this->as_text_node($tag);
                  }
              }
              else if (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
                  $this->parent->_[HDOM_INFO_END] = 0;
                  $org_parent = $this->parent;

                  while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                      $this->parent = $this->parent->parent;

                  if (strtolower($this->parent->tag)!==$tag_lower) {
                      $this->parent = $org_parent; // restore original parent
                      $this->parent->_[HDOM_INFO_END] = $this->cursor;
                      return $this->as_text_node($tag);
                  }
              }
              else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower) {
                  $this->parent->_[HDOM_INFO_END] = 0;
                  $this->parent = $this->parent->parent;
              }
              else
                  return $this->as_text_node($tag);
          }

          $this->parent->_[HDOM_INFO_END] = $this->cursor;
          if ($this->parent->parent) $this->parent = $this->parent->parent;

          $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
          return true;
      }

      $node = new simple_html_dom_node($this);
      $node->_[HDOM_INFO_BEGIN] = $this->cursor;
      ++$this->cursor;
      $tag = $this->copy_until($this->token_slash);
      $node->tag_start = $begin_tag_pos;

      // doctype, cdata & comments...
      if (isset($tag[0]) && $tag[0]==='!') {
          $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

          if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
              $node->nodetype = HDOM_TYPE_COMMENT;
              $node->tag = 'comment';
          } else {
              $node->nodetype = HDOM_TYPE_UNKNOWN;
              $node->tag = 'unknown';
          }
          if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
          $this->link_nodes($node, true);
          $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
          return true;
      }

      // text: the would-be tag name itself contains a '<'
      // (the original also assigned an unused, precedence-confused $pos here)
      if (strpos($tag, '<')!==false) {
          $tag = '<' . substr($tag, 0, -1);
          $node->_[HDOM_INFO_TEXT] = $tag;
          $this->link_nodes($node, false);
          $this->char = $this->doc[--$this->pos]; // prev
          return true;
      }

      // Not a valid tag name: keep the raw markup as text.
      // The hyphen must be escaped: "[\w-:]" is rejected as an invalid range by
      // PCRE2 (PHP 7.3+), which made preg_match() fail and every tag land here.
      if (!preg_match('/^[\w\-:]+$/', $tag)) {
          $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
          if ($this->char==='<') {
              $this->link_nodes($node, false);
              return true;
          }

          if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
          $this->link_nodes($node, false);
          $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
          return true;
      }

      // begin tag
      $node->nodetype = HDOM_TYPE_ELEMENT;
      $tag_lower = strtolower($tag);
      $node->tag = ($this->lowercase) ? $tag_lower : $tag;

      // handle optional closing tags
      if (isset($this->optional_closing_tags[$tag_lower]) ) {
          while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
              $this->parent->_[HDOM_INFO_END] = 0;
              $this->parent = $this->parent->parent;
          }
          $node->parent = $this->parent;
      }

      $guard = 0; // prevent infinite loop
      $space = array($this->copy_skip($this->token_blank), '', '');

      // attributes
      do
      {
          if ($this->char!==null && $space[0]==='') break;
          $name = $this->copy_until($this->token_equal);
          if ($guard===$this->pos) {
              // No progress since the previous iteration: step one character forward.
              $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
              continue;
          }
          $guard = $this->pos;

          // handle endless '<'
          if ($this->pos>=$this->size-1 && $this->char!=='>') {
              $node->nodetype = HDOM_TYPE_TEXT;
              $node->_[HDOM_INFO_END] = 0;
              $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
              $node->tag = 'text';
              $this->link_nodes($node, false);
              return true;
          }

          // handle mismatch '<'
          if ($this->doc[$this->pos-1]=='<') {
              $node->nodetype = HDOM_TYPE_TEXT;
              $node->tag = 'text';
              $node->attr = array();
              $node->_[HDOM_INFO_END] = 0;
              $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
              $this->pos -= 2;
              $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
              $this->link_nodes($node, false);
              return true;
          }

          if ($name!=='/' && $name!=='') {
              $space[1] = $this->copy_skip($this->token_blank);
              $name = $this->restore_noise($name);
              if ($this->lowercase) $name = strtolower($name);
              if ($this->char==='=') {
                  $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                  $this->parse_attr($node, $name, $space);
              }
              else {
                  // no value attr: nowrap, checked, selected...
                  $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                  $node->attr[$name] = true;
                  if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
              }
              $node->_[HDOM_INFO_SPACE][] = $space;
              $space = array($this->copy_skip($this->token_blank), '', '');
          }
          else
              break;
      } while ($this->char!=='>' && $this->char!=='/');

      $this->link_nodes($node, true);
      $node->_[HDOM_INFO_ENDSPACE] = $space[0];

      // check self closing
      if ($this->copy_until_char_escape('>')==='/') {
          $node->_[HDOM_INFO_ENDSPACE] .= '/';
          $node->_[HDOM_INFO_END] = 0;
      }
      else {
          // reset parent
          if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
      }
      $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

      // If it's a BR tag, we need to set its text to the default text.
      // This way when we see it in plaintext, we can generate formatting that the user wants.
      if ($node->tag == "br") {
          $node->_[HDOM_INFO_INNER] = $this->default_br_text;
      }

      return true;
  }
  // Parse the value of attribute $name and store it on $node.
  // $space is taken by reference; its third slot receives the blanks skipped
  // in front of the value.
  protected function parse_attr($node, $name, &$space)
  {
      // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
      // If the attribute is already defined inside a tag, only pay attention to the first one as opposed to the last one.
      if (isset($node->attr[$name])) {
          return;
      }

      $space[2] = $this->copy_skip($this->token_blank);

      $quote = $this->char;
      if ($quote === '"' || $quote === '\'') {
          // Quoted value: read up to the matching (unescaped) quote character.
          $node->_[HDOM_INFO_QUOTE][] = ($quote === '"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;
          $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
          $value = $this->restore_noise($this->copy_until_char_escape($quote));
          $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
      } else {
          // Unquoted value: read up to the next attribute delimiter.
          $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
          $value = $this->restore_noise($this->copy_until($this->token_attr));
      }

      // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
      $value = str_replace(array("\r", "\n"), "", $value);

      // PaperG: If this is a "class" selector, let's get rid of the preceding and trailing space
      // since some people leave it in the multi class case.
      if ($name == "class") {
          $value = trim($value);
      }

      $node->attr[$name] = $value;
  }
  // Attach $node to the current parent.
  // The node is always appended to the parent's node list; it is also appended
  // to the parent's children list when $is_child is truthy.
  protected function link_nodes(&$node, $is_child)
  {
      $owner = $this->parent;

      $node->parent = $owner;
      $owner->nodes[] = $node;

      if ($is_child) {
          $owner->children[] = $node;
      }
  }
  1118. // as a text node
  1119. protected function as_text_node($tag) {
  1120. $node = new simple_html_dom_node($this);
  1121. ++$this->cursor;
  1122. $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
  1123. $this->link_nod

Large files are truncated, but you can click here to view the full file