PageRenderTime 39ms CodeModel.GetById 9ms RepoModel.GetById 0ms app.codeStats 0ms

/src/Symfony/Component/DomCrawler/Crawler.php

https://github.com/stepanets/symfony
PHP | 732 lines | 313 code | 82 blank | 337 comment | 40 complexity | a4ecf47c471968e01e6ef9b982a016b4 MD5 | raw file
  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien@symfony.com>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Component\DomCrawler;
  11. use Symfony\Component\CssSelector\CssSelector;
  12. /**
  13. * Crawler eases navigation of a list of \DOMNode objects.
  14. *
  15. * @author Fabien Potencier <fabien@symfony.com>
  16. *
  17. * @api
  18. */
  19. class Crawler extends \SplObjectStorage
  20. {
  21. /**
  22. * @var string The current URI or the base href value
  23. */
  24. protected $uri;
  25. /**
  26. * Constructor.
  27. *
  28. * @param mixed $node A Node to use as the base for the crawling
  29. * @param string $uri The current URI or the base href value
  30. *
  31. * @api
  32. */
  33. public function __construct($node = null, $uri = null)
  34. {
  35. $this->uri = $uri;
  36. $this->add($node);
  37. }
  38. /**
  39. * Removes all the nodes.
  40. *
  41. * @api
  42. */
  43. public function clear()
  44. {
  45. $this->removeAll($this);
  46. }
  47. /**
  48. * Adds a node to the current list of nodes.
  49. *
  50. * This method uses the appropriate specialized add*() method based
  51. * on the type of the argument.
  52. *
  53. * @param null|\DOMNodeList|array|\DOMNode $node A node
  54. *
  55. * @api
  56. */
  57. public function add($node)
  58. {
  59. if ($node instanceof \DOMNodeList) {
  60. $this->addNodeList($node);
  61. } elseif (is_array($node)) {
  62. $this->addNodes($node);
  63. } elseif (is_string($node)) {
  64. $this->addContent($node);
  65. } elseif (is_object($node)) {
  66. $this->addNode($node);
  67. }
  68. }
  69. /**
  70. * Adds HTML/XML content.
  71. *
  72. * @param string $content A string to parse as HTML/XML
  73. * @param null|string $type The content type of the string
  74. *
  75. * @return null|void
  76. */
  77. public function addContent($content, $type = null)
  78. {
  79. if (empty($type)) {
  80. $type = 'text/html';
  81. }
  82. // DOM only for HTML/XML content
  83. if (!preg_match('/(x|ht)ml/i', $type, $matches)) {
  84. return null;
  85. }
  86. $charset = 'ISO-8859-1';
  87. if (false !== $pos = strpos($type, 'charset=')) {
  88. $charset = substr($type, $pos + 8);
  89. if (false !== $pos = strpos($charset, ';')) {
  90. $charset = substr($charset, 0, $pos);
  91. }
  92. }
  93. if ('x' === $matches[1]) {
  94. $this->addXmlContent($content, $charset);
  95. } else {
  96. $this->addHtmlContent($content, $charset);
  97. }
  98. }
  99. /**
  100. * Adds an HTML content to the list of nodes.
  101. *
  102. * The libxml errors are disabled when the content is parsed.
  103. *
  104. * If you want to get parsing errors, be sure to enable
  105. * internal errors via libxml_use_internal_errors(true)
  106. * and then, get the errors via libxml_get_errors(). Be
  107. * sure to clear errors with libxml_clear_errors() afterward.
  108. *
  109. * @param string $content The HTML content
  110. * @param string $charset The charset
  111. *
  112. * @api
  113. */
  114. public function addHtmlContent($content, $charset = 'UTF-8')
  115. {
  116. $current = libxml_use_internal_errors(true);
  117. $disableEntities = libxml_disable_entity_loader(true);
  118. $dom = new \DOMDocument('1.0', $charset);
  119. $dom->validateOnParse = true;
  120. if (function_exists('mb_convert_encoding') && in_array(strtolower($charset), array_map('strtolower', mb_list_encodings()))) {
  121. $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
  122. }
  123. @$dom->loadHTML($content);
  124. libxml_use_internal_errors($current);
  125. libxml_disable_entity_loader($disableEntities);
  126. $this->addDocument($dom);
  127. $base = $this->filterXPath('descendant-or-self::base')->extract(array('href'));
  128. $baseHref = current($base);
  129. if (count($base) && !empty($baseHref)) {
  130. $this->uri = $baseHref;
  131. }
  132. }
  133. /**
  134. * Adds an XML content to the list of nodes.
  135. *
  136. * The libxml errors are disabled when the content is parsed.
  137. *
  138. * If you want to get parsing errors, be sure to enable
  139. * internal errors via libxml_use_internal_errors(true)
  140. * and then, get the errors via libxml_get_errors(). Be
  141. * sure to clear errors with libxml_clear_errors() afterward.
  142. *
  143. * @param string $content The XML content
  144. * @param string $charset The charset
  145. *
  146. * @api
  147. */
  148. public function addXmlContent($content, $charset = 'UTF-8')
  149. {
  150. $current = libxml_use_internal_errors(true);
  151. $disableEntities = libxml_disable_entity_loader(true);
  152. $dom = new \DOMDocument('1.0', $charset);
  153. $dom->validateOnParse = true;
  154. // remove the default namespace to make XPath expressions simpler
  155. @$dom->loadXML(str_replace('xmlns', 'ns', $content), LIBXML_NONET);
  156. libxml_use_internal_errors($current);
  157. libxml_disable_entity_loader($disableEntities);
  158. $this->addDocument($dom);
  159. }
  160. /**
  161. * Adds a \DOMDocument to the list of nodes.
  162. *
  163. * @param \DOMDocument $dom A \DOMDocument instance
  164. *
  165. * @api
  166. */
  167. public function addDocument(\DOMDocument $dom)
  168. {
  169. if ($dom->documentElement) {
  170. $this->addNode($dom->documentElement);
  171. }
  172. }
  173. /**
  174. * Adds a \DOMNodeList to the list of nodes.
  175. *
  176. * @param \DOMNodeList $nodes A \DOMNodeList instance
  177. *
  178. * @api
  179. */
  180. public function addNodeList(\DOMNodeList $nodes)
  181. {
  182. foreach ($nodes as $node) {
  183. $this->addNode($node);
  184. }
  185. }
  186. /**
  187. * Adds an array of \DOMNode instances to the list of nodes.
  188. *
  189. * @param \DOMNode[] $nodes An array of \DOMNode instances
  190. *
  191. * @api
  192. */
  193. public function addNodes(array $nodes)
  194. {
  195. foreach ($nodes as $node) {
  196. $this->add($node);
  197. }
  198. }
  199. /**
  200. * Adds a \DOMNode instance to the list of nodes.
  201. *
  202. * @param \DOMNode $node A \DOMNode instance
  203. *
  204. * @api
  205. */
  206. public function addNode(\DOMNode $node)
  207. {
  208. if ($node instanceof \DOMDocument) {
  209. $this->attach($node->documentElement);
  210. } else {
  211. $this->attach($node);
  212. }
  213. }
  214. /**
  215. * Returns a node given its position in the node list.
  216. *
  217. * @param integer $position The position
  218. *
  219. * @return Crawler A new instance of the Crawler with the selected node, or an empty Crawler if it does not exist.
  220. *
  221. * @api
  222. */
  223. public function eq($position)
  224. {
  225. foreach ($this as $i => $node) {
  226. if ($i == $position) {
  227. return new static($node, $this->uri);
  228. }
  229. }
  230. return new static(null, $this->uri);
  231. }
  232. /**
  233. * Calls an anonymous function on each node of the list.
  234. *
  235. * The anonymous function receives the position and the node as arguments.
  236. *
  237. * Example:
  238. *
  239. * $crawler->filter('h1')->each(function ($node, $i)
  240. * {
  241. * return $node->nodeValue;
  242. * });
  243. *
  244. * @param \Closure $closure An anonymous function
  245. *
  246. * @return array An array of values returned by the anonymous function
  247. *
  248. * @api
  249. */
  250. public function each(\Closure $closure)
  251. {
  252. $data = array();
  253. foreach ($this as $i => $node) {
  254. $data[] = $closure($node, $i);
  255. }
  256. return $data;
  257. }
  258. /**
  259. * Reduces the list of nodes by calling an anonymous function.
  260. *
  261. * To remove a node from the list, the anonymous function must return false.
  262. *
  263. * @param \Closure $closure An anonymous function
  264. *
  265. * @return Crawler A Crawler instance with the selected nodes.
  266. *
  267. * @api
  268. */
  269. public function reduce(\Closure $closure)
  270. {
  271. $nodes = array();
  272. foreach ($this as $i => $node) {
  273. if (false !== $closure($node, $i)) {
  274. $nodes[] = $node;
  275. }
  276. }
  277. return new static($nodes, $this->uri);
  278. }
  279. /**
  280. * Returns the first node of the current selection
  281. *
  282. * @return Crawler A Crawler instance with the first selected node
  283. *
  284. * @api
  285. */
  286. public function first()
  287. {
  288. return $this->eq(0);
  289. }
  290. /**
  291. * Returns the last node of the current selection
  292. *
  293. * @return Crawler A Crawler instance with the last selected node
  294. *
  295. * @api
  296. */
  297. public function last()
  298. {
  299. return $this->eq(count($this) - 1);
  300. }
  301. /**
  302. * Returns the siblings nodes of the current selection
  303. *
  304. * @return Crawler A Crawler instance with the sibling nodes
  305. *
  306. * @throws \InvalidArgumentException When current node is empty
  307. *
  308. * @api
  309. */
  310. public function siblings()
  311. {
  312. if (!count($this)) {
  313. throw new \InvalidArgumentException('The current node list is empty.');
  314. }
  315. return new static($this->sibling($this->getNode(0)->parentNode->firstChild), $this->uri);
  316. }
  317. /**
  318. * Returns the next siblings nodes of the current selection
  319. *
  320. * @return Crawler A Crawler instance with the next sibling nodes
  321. *
  322. * @throws \InvalidArgumentException When current node is empty
  323. *
  324. * @api
  325. */
  326. public function nextAll()
  327. {
  328. if (!count($this)) {
  329. throw new \InvalidArgumentException('The current node list is empty.');
  330. }
  331. return new static($this->sibling($this->getNode(0)), $this->uri);
  332. }
  333. /**
  334. * Returns the previous sibling nodes of the current selection
  335. *
  336. * @return Crawler A Crawler instance with the previous sibling nodes
  337. *
  338. * @throws \InvalidArgumentException
  339. *
  340. * @api
  341. */
  342. public function previousAll()
  343. {
  344. if (!count($this)) {
  345. throw new \InvalidArgumentException('The current node list is empty.');
  346. }
  347. return new static($this->sibling($this->getNode(0), 'previousSibling'), $this->uri);
  348. }
  349. /**
  350. * Returns the parents nodes of the current selection
  351. *
  352. * @return Crawler A Crawler instance with the parents nodes of the current selection
  353. *
  354. * @throws \InvalidArgumentException When current node is empty
  355. *
  356. * @api
  357. */
  358. public function parents()
  359. {
  360. if (!count($this)) {
  361. throw new \InvalidArgumentException('The current node list is empty.');
  362. }
  363. $node = $this->getNode(0);
  364. $nodes = array();
  365. while ($node = $node->parentNode) {
  366. if (1 === $node->nodeType && '_root' !== $node->nodeName) {
  367. $nodes[] = $node;
  368. }
  369. }
  370. return new static($nodes, $this->uri);
  371. }
  372. /**
  373. * Returns the children nodes of the current selection
  374. *
  375. * @return Crawler A Crawler instance with the children nodes
  376. *
  377. * @throws \InvalidArgumentException When current node is empty
  378. *
  379. * @api
  380. */
  381. public function children()
  382. {
  383. if (!count($this)) {
  384. throw new \InvalidArgumentException('The current node list is empty.');
  385. }
  386. return new static($this->sibling($this->getNode(0)->firstChild), $this->uri);
  387. }
  388. /**
  389. * Returns the attribute value of the first node of the list.
  390. *
  391. * @param string $attribute The attribute name
  392. *
  393. * @return string The attribute value
  394. *
  395. * @throws \InvalidArgumentException When current node is empty
  396. *
  397. * @api
  398. */
  399. public function attr($attribute)
  400. {
  401. if (!count($this)) {
  402. throw new \InvalidArgumentException('The current node list is empty.');
  403. }
  404. return $this->getNode(0)->getAttribute($attribute);
  405. }
  406. /**
  407. * Returns the node value of the first node of the list.
  408. *
  409. * @return string The node value
  410. *
  411. * @throws \InvalidArgumentException When current node is empty
  412. *
  413. * @api
  414. */
  415. public function text()
  416. {
  417. if (!count($this)) {
  418. throw new \InvalidArgumentException('The current node list is empty.');
  419. }
  420. return $this->getNode(0)->nodeValue;
  421. }
  422. /**
  423. * Extracts information from the list of nodes.
  424. *
  425. * You can extract attributes or/and the node value (_text).
  426. *
  427. * Example:
  428. *
  429. * $crawler->filter('h1 a')->extract(array('_text', 'href'));
  430. *
  431. * @param array $attributes An array of attributes
  432. *
  433. * @return array An array of extracted values
  434. *
  435. * @api
  436. */
  437. public function extract($attributes)
  438. {
  439. $attributes = (array) $attributes;
  440. $data = array();
  441. foreach ($this as $node) {
  442. $elements = array();
  443. foreach ($attributes as $attribute) {
  444. if ('_text' === $attribute) {
  445. $elements[] = $node->nodeValue;
  446. } else {
  447. $elements[] = $node->getAttribute($attribute);
  448. }
  449. }
  450. $data[] = count($attributes) > 1 ? $elements : $elements[0];
  451. }
  452. return $data;
  453. }
  454. /**
  455. * Filters the list of nodes with an XPath expression.
  456. *
  457. * @param string $xpath An XPath expression
  458. *
  459. * @return Crawler A new instance of Crawler with the filtered list of nodes
  460. *
  461. * @api
  462. */
  463. public function filterXPath($xpath)
  464. {
  465. $document = new \DOMDocument('1.0', 'UTF-8');
  466. $root = $document->appendChild($document->createElement('_root'));
  467. foreach ($this as $node) {
  468. $root->appendChild($document->importNode($node, true));
  469. }
  470. $domxpath = new \DOMXPath($document);
  471. return new static($domxpath->query($xpath), $this->uri);
  472. }
  473. /**
  474. * Filters the list of nodes with a CSS selector.
  475. *
  476. * This method only works if you have installed the CssSelector Symfony Component.
  477. *
  478. * @param string $selector A CSS selector
  479. *
  480. * @return Crawler A new instance of Crawler with the filtered list of nodes
  481. *
  482. * @throws \RuntimeException if the CssSelector Component is not available
  483. *
  484. * @api
  485. */
  486. public function filter($selector)
  487. {
  488. if (!class_exists('Symfony\\Component\\CssSelector\\CssSelector')) {
  489. // @codeCoverageIgnoreStart
  490. throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector is not installed (you can use filterXPath instead).');
  491. // @codeCoverageIgnoreEnd
  492. }
  493. return $this->filterXPath(CssSelector::toXPath($selector));
  494. }
  495. /**
  496. * Selects links by name or alt value for clickable images.
  497. *
  498. * @param string $value The link text
  499. *
  500. * @return Crawler A new instance of Crawler with the filtered list of nodes
  501. *
  502. * @api
  503. */
  504. public function selectLink($value)
  505. {
  506. $xpath = sprintf('//a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s)] ', static::xpathLiteral(' '.$value.' ')).
  507. sprintf('| //a/img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]/ancestor::a', static::xpathLiteral(' '.$value.' '));
  508. return $this->filterXPath($xpath);
  509. }
  510. /**
  511. * Selects a button by name or alt value for images.
  512. *
  513. * @param string $value The button text
  514. *
  515. * @return Crawler A new instance of Crawler with the filtered list of nodes
  516. *
  517. * @api
  518. */
  519. public function selectButton($value)
  520. {
  521. $xpath = sprintf('//input[((@type="submit" or @type="button") and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', static::xpathLiteral(' '.$value.' ')).
  522. sprintf('or (@type="image" and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id="%s" or @name="%s"] ', static::xpathLiteral(' '.$value.' '), $value, $value).
  523. sprintf('| //button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id="%s" or @name="%s"]', static::xpathLiteral(' '.$value.' '), $value, $value);
  524. return $this->filterXPath($xpath);
  525. }
  526. /**
  527. * Returns a Link object for the first node in the list.
  528. *
  529. * @param string $method The method for the link (get by default)
  530. *
  531. * @return Link A Link instance
  532. *
  533. * @throws \InvalidArgumentException If the current node list is empty
  534. *
  535. * @api
  536. */
  537. public function link($method = 'get')
  538. {
  539. if (!count($this)) {
  540. throw new \InvalidArgumentException('The current node list is empty.');
  541. }
  542. $node = $this->getNode(0);
  543. return new Link($node, $this->uri, $method);
  544. }
  545. /**
  546. * Returns an array of Link objects for the nodes in the list.
  547. *
  548. * @return Link[] An array of Link instances
  549. *
  550. * @api
  551. */
  552. public function links()
  553. {
  554. $links = array();
  555. foreach ($this as $node) {
  556. $links[] = new Link($node, $this->uri, 'get');
  557. }
  558. return $links;
  559. }
  560. /**
  561. * Returns a Form object for the first node in the list.
  562. *
  563. * @param array $values An array of values for the form fields
  564. * @param string $method The method for the form
  565. *
  566. * @return Form A Form instance
  567. *
  568. * @throws \InvalidArgumentException If the current node list is empty
  569. *
  570. * @api
  571. */
  572. public function form(array $values = null, $method = null)
  573. {
  574. if (!count($this)) {
  575. throw new \InvalidArgumentException('The current node list is empty.');
  576. }
  577. $form = new Form($this->getNode(0), $this->uri, $method);
  578. if (null !== $values) {
  579. $form->setValues($values);
  580. }
  581. return $form;
  582. }
  583. /**
  584. * Converts string for XPath expressions.
  585. *
  586. * Escaped characters are: quotes (") and apostrophe (').
  587. *
  588. * Examples:
  589. * <code>
  590. * echo Crawler::xpathLiteral('foo " bar');
  591. * //prints 'foo " bar'
  592. *
  593. * echo Crawler::xpathLiteral("foo ' bar");
  594. * //prints "foo ' bar"
  595. *
  596. * echo Crawler::xpathLiteral('a\'b"c');
  597. * //prints concat('a', "'", 'b"c')
  598. * </code>
  599. *
  600. * @param string $s String to be escaped
  601. *
  602. * @return string Converted string
  603. *
  604. */
  605. public static function xpathLiteral($s)
  606. {
  607. if (false === strpos($s, "'")) {
  608. return sprintf("'%s'", $s);
  609. }
  610. if (false === strpos($s, '"')) {
  611. return sprintf('"%s"', $s);
  612. }
  613. $string = $s;
  614. $parts = array();
  615. while (true) {
  616. if (false !== $pos = strpos($string, "'")) {
  617. $parts[] = sprintf("'%s'", substr($string, 0, $pos));
  618. $parts[] = "\"'\"";
  619. $string = substr($string, $pos + 1);
  620. } else {
  621. $parts[] = "'$string'";
  622. break;
  623. }
  624. }
  625. return sprintf("concat(%s)", implode($parts, ', '));
  626. }
  627. protected function getNode($position)
  628. {
  629. foreach ($this as $i => $node) {
  630. if ($i == $position) {
  631. return $node;
  632. }
  633. // @codeCoverageIgnoreStart
  634. }
  635. return null;
  636. // @codeCoverageIgnoreEnd
  637. }
  638. protected function sibling($node, $siblingDir = 'nextSibling')
  639. {
  640. $nodes = array();
  641. do {
  642. if ($node !== $this->getNode(0) && $node->nodeType === 1) {
  643. $nodes[] = $node;
  644. }
  645. } while ($node = $node->$siblingDir);
  646. return $nodes;
  647. }
  648. }