PageRenderTime 37ms CodeModel.GetById 12ms RepoModel.GetById 1ms app.codeStats 0ms

/src/Symfony/Component/DomCrawler/Crawler.php

https://github.com/Exercise/symfony
PHP | 695 lines | 305 code | 77 blank | 313 comment | 37 complexity | 656d3d838d2fbd1a80544fc287e2fc03 MD5 | raw file
  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien@symfony.com>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Component\DomCrawler;
  11. use Symfony\Component\CssSelector\CssSelector;
  12. /**
  13. * Crawler eases navigation of a list of \DOMNode objects.
  14. *
  15. * @author Fabien Potencier <fabien@symfony.com>
  16. *
  17. * @api
  18. */
  class Crawler extends \SplObjectStorage
  {
      /**
       * @var string The current URI or the base href value
       */
      private $uri;

      /**
       * Constructor.
       *
       * @param mixed  $node A node, node list, array of nodes or markup string used as the crawling base
       * @param string $uri  The current URI or the base href value
       *
       * @api
       */
      public function __construct($node = null, $uri = null)
      {
          $this->uri = $uri;
          $this->add($node);
      }

      /**
       * Removes all the nodes.
       *
       * @api
       */
      public function clear()
      {
          $this->removeAll($this);
      }

      /**
       * Adds a node to the current list of nodes.
       *
       * Dispatches to the specialized add*() method matching the argument type:
       * \DOMNodeList, array, string (parsed as markup) or any other object
       * (forwarded to addNode()). Any other value, including null, is ignored.
       *
       * @param null|\DOMNodeList|array|\DOMNode|string $node A node
       *
       * @api
       */
      public function add($node)
      {
          if ($node instanceof \DOMNodeList) {
              $this->addNodeList($node);
          } elseif (is_array($node)) {
              $this->addNodes($node);
          } elseif (is_string($node)) {
              $this->addContent($node);
          } elseif (is_object($node)) {
              $this->addNode($node);
          }
      }

      /**
       * Adds HTML/XML content.
       *
       * The content type decides which parser is used; it defaults to
       * "text/html". Types that are neither HTML nor XML are ignored. A
       * "charset=" parameter in the type overrides the default ISO-8859-1.
       *
       * @param string      $content A string to parse as HTML/XML
       * @param null|string $type    The content type of the string
       *
       * @return null|void
       */
      public function addContent($content, $type = null)
      {
          if (empty($type)) {
              $type = 'text/html';
          }

          // DOM only for HTML/XML content
          if (!preg_match('/(x|ht)ml/i', $type, $matches)) {
              return null;
          }

          $charset = 'ISO-8859-1';
          // Media type parameter names are case-insensitive ("Charset=" is valid).
          if (false !== $pos = stripos($type, 'charset=')) {
              $charset = substr($type, $pos + 8);
              if (false !== $pos = strpos($charset, ';')) {
                  $charset = substr($charset, 0, $pos);
              }
              // The value may be quoted and/or padded: charset="utf-8"
              $charset = trim($charset, " \t\"'");
          }

          // The pattern above is case-insensitive, so normalise the captured
          // prefix before comparing; otherwise "text/XML" is parsed as HTML.
          if ('x' === strtolower($matches[1])) {
              $this->addXmlContent($content, $charset);
          } else {
              $this->addHtmlContent($content, $charset);
          }
      }

      /**
       * Adds an HTML content to the list of nodes.
       *
       * The libxml errors are disabled when the content is parsed.
       *
       * If you want to get parsing errors, be sure to enable
       * internal errors via libxml_use_internal_errors(true)
       * and then, get the errors via libxml_get_errors(). Be
       * sure to clear errors with libxml_clear_errors() afterward.
       *
       * When the resulting node list contains a <base> element, its href
       * replaces the current URI.
       *
       * @param string $content The HTML content
       * @param string $charset The charset
       *
       * @api
       */
      public function addHtmlContent($content, $charset = 'UTF-8')
      {
          $dom = new \DOMDocument('1.0', $charset);
          $dom->validateOnParse = true;

          // Parsing an empty string only triggers a warning on old PHP versions
          // but throws a ValueError on PHP 8 (which "@" cannot silence), so the
          // parser is skipped and the document is simply left empty.
          if ('' !== (string) $content) {
              $current = libxml_use_internal_errors(true);
              @$dom->loadHTML($content);
              libxml_use_internal_errors($current);
          }

          $this->addDocument($dom);

          $base = $this->filterXPath('descendant-or-self::base')->extract(array('href'));
          if (count($base)) {
              $this->uri = current($base);
          }
      }

      /**
       * Adds an XML content to the list of nodes.
       *
       * The libxml errors are disabled when the content is parsed.
       *
       * If you want to get parsing errors, be sure to enable
       * internal errors via libxml_use_internal_errors(true)
       * and then, get the errors via libxml_get_errors(). Be
       * sure to clear errors with libxml_clear_errors() afterward.
       *
       * @param string $content The XML content
       * @param string $charset The charset
       *
       * @api
       */
      public function addXmlContent($content, $charset = 'UTF-8')
      {
          $dom = new \DOMDocument('1.0', $charset);
          $dom->validateOnParse = true;

          // Same empty-input guard as addHtmlContent(): loadXML('') throws on PHP 8.
          if ('' !== (string) $content) {
              $current = libxml_use_internal_errors(true);
              // Remove the default namespace to make XPath expressions simpler.
              // NOTE: this is a plain textual replacement, so every occurrence of
              // "xmlns" in the content is rewritten, not only the declaration.
              @$dom->loadXML(str_replace('xmlns', 'ns', $content));
              libxml_use_internal_errors($current);
          }

          $this->addDocument($dom);
      }

      /**
       * Adds a \DOMDocument to the list of nodes.
       *
       * Only the document element is added; an empty document adds nothing.
       *
       * @param \DOMDocument $dom A \DOMDocument instance
       *
       * @api
       */
      public function addDocument(\DOMDocument $dom)
      {
          if ($dom->documentElement) {
              $this->addNode($dom->documentElement);
          }
      }

      /**
       * Adds a \DOMNodeList to the list of nodes.
       *
       * @param \DOMNodeList $nodes A \DOMNodeList instance
       *
       * @api
       */
      public function addNodeList(\DOMNodeList $nodes)
      {
          foreach ($nodes as $node) {
              $this->addNode($node);
          }
      }

      /**
       * Adds an array of \DOMNode instances to the list of nodes.
       *
       * Each entry goes through add(), so nested arrays, node lists and
       * markup strings are accepted as well.
       *
       * @param array $nodes An array of \DOMNode instances
       *
       * @api
       */
      public function addNodes(array $nodes)
      {
          foreach ($nodes as $node) {
              $this->add($node);
          }
      }

      /**
       * Adds a \DOMNode instance to the list of nodes.
       *
       * A \DOMDocument is replaced by its document element; a document
       * without a document element is ignored.
       *
       * @param \DOMNode $node A \DOMNode instance
       *
       * @api
       */
      public function addNode(\DOMNode $node)
      {
          if ($node instanceof \DOMDocument) {
              // Delegate so that an empty document does not attach null.
              $this->addDocument($node);
          } else {
              $this->attach($node);
          }
      }

      /**
       * Returns a node given its position in the node list.
       *
       * @param integer $position The position
       *
       * @return Crawler A new instance of the Crawler with the selected node, or an empty Crawler if it does not exist.
       *
       * @api
       */
      public function eq($position)
      {
          foreach ($this as $i => $node) {
              if ($i == $position) {
                  return new static($node, $this->uri);
              }
          }

          return new static(null, $this->uri);
      }

      /**
       * Calls an anonymous function on each node of the list.
       *
       * The anonymous function receives the node and its position as arguments.
       *
       * Example:
       *
       *     $crawler->filter('h1')->each(function ($node, $i)
       *     {
       *         return $node->nodeValue;
       *     });
       *
       * @param \Closure $closure An anonymous function
       *
       * @return array An array of values returned by the anonymous function
       *
       * @api
       */
      public function each(\Closure $closure)
      {
          $data = array();
          foreach ($this as $i => $node) {
              $data[] = $closure($node, $i);
          }

          return $data;
      }

      /**
       * Reduces the list of nodes by calling an anonymous function.
       *
       * To remove a node from the list, the anonymous function must return
       * exactly false; any other return value keeps the node.
       *
       * @param \Closure $closure An anonymous function
       *
       * @return Crawler A Crawler instance with the selected nodes.
       *
       * @api
       */
      public function reduce(\Closure $closure)
      {
          $nodes = array();
          foreach ($this as $i => $node) {
              if (false !== $closure($node, $i)) {
                  $nodes[] = $node;
              }
          }

          return new static($nodes, $this->uri);
      }

      /**
       * Returns the first node of the current selection
       *
       * @return Crawler A Crawler instance with the first selected node
       *
       * @api
       */
      public function first()
      {
          return $this->eq(0);
      }

      /**
       * Returns the last node of the current selection
       *
       * @return Crawler A Crawler instance with the last selected node
       *
       * @api
       */
      public function last()
      {
          return $this->eq(count($this) - 1);
      }

      /**
       * Returns the siblings nodes of the current selection
       *
       * @return Crawler A Crawler instance with the sibling nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function siblings()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          // A detached node has no parent and therefore no siblings.
          $parent = $this->getNode(0)->parentNode;

          return new static($this->sibling($parent ? $parent->firstChild : null), $this->uri);
      }

      /**
       * Returns the next siblings nodes of the current selection
       *
       * @return Crawler A Crawler instance with the next sibling nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function nextAll()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return new static($this->sibling($this->getNode(0)), $this->uri);
      }

      /**
       * Returns the previous sibling nodes of the current selection
       *
       * @return Crawler A Crawler instance with the previous sibling nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function previousAll()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return new static($this->sibling($this->getNode(0), 'previousSibling'), $this->uri);
      }

      /**
       * Returns the parents nodes of the current selection
       *
       * Only element ancestors are returned, nearest first; the synthetic
       * "_root" wrapper created by filterXPath() is skipped.
       *
       * @return Crawler A Crawler instance with the parents nodes of the current selection
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function parents()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          $node = $this->getNode(0);
          $nodes = array();

          while ($node = $node->parentNode) {
              if (XML_ELEMENT_NODE === $node->nodeType && '_root' !== $node->nodeName) {
                  $nodes[] = $node;
              }
          }

          return new static($nodes, $this->uri);
      }

      /**
       * Returns the children nodes of the current selection
       *
       * @return Crawler A Crawler instance with the children nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function children()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          // firstChild is null for a childless node; sibling() copes with that.
          return new static($this->sibling($this->getNode(0)->firstChild), $this->uri);
      }

      /**
       * Returns the attribute value of the first node of the list.
       *
       * @param string $attribute The attribute name
       *
       * @return string The attribute value
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function attr($attribute)
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return $this->getNode(0)->getAttribute($attribute);
      }

      /**
       * Returns the node value of the first node of the list.
       *
       * @return string The node value
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function text()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return $this->getNode(0)->nodeValue;
      }

      /**
       * Extracts information from the list of nodes.
       *
       * You can extract attributes or/and the node value (_text).
       *
       * Example:
       *
       *     $crawler->filter('h1 a')->extract(array('_text', 'href'));
       *
       * When a single attribute is requested, each entry of the result is the
       * value itself; with several attributes each entry is an array of values
       * in the requested order.
       *
       * @param array $attributes An array of attributes
       *
       * @return array An array of extracted values
       *
       * @api
       */
      public function extract($attributes)
      {
          $attributes = (array) $attributes;
          $multiple = count($attributes) > 1;

          $data = array();
          foreach ($this as $node) {
              $elements = array();
              foreach ($attributes as $attribute) {
                  if ('_text' === $attribute) {
                      $elements[] = $node->nodeValue;
                  } else {
                      $elements[] = $node->getAttribute($attribute);
                  }
              }

              if ($multiple) {
                  $data[] = $elements;
              } else {
                  // An empty attribute list yields null instead of an
                  // undefined-offset notice.
                  $data[] = isset($elements[0]) ? $elements[0] : null;
              }
          }

          return $data;
      }

      /**
       * Filters the list of nodes with an XPath expression.
       *
       * The current nodes are deep-copied under a synthetic "_root" element of
       * a fresh document and the expression is evaluated against that
       * document, so the returned nodes are copies of the originals.
       *
       * @param string $xpath An XPath expression
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       *
       * @api
       */
      public function filterXPath($xpath)
      {
          $document = new \DOMDocument('1.0', 'UTF-8');
          $root = $document->appendChild($document->createElement('_root'));
          foreach ($this as $node) {
              $root->appendChild($document->importNode($node, true));
          }

          $domxpath = new \DOMXPath($document);

          return new static($domxpath->query($xpath), $this->uri);
      }

      /**
       * Filters the list of nodes with a CSS selector.
       *
       * This method only works if you have installed the CssSelector Symfony Component.
       *
       * @param string $selector A CSS selector
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       *
       * @throws \RuntimeException if the CssSelector Component is not available
       *
       * @api
       */
      public function filter($selector)
      {
          if (!class_exists('Symfony\\Component\\CssSelector\\CssSelector')) {
              // @codeCoverageIgnoreStart
              throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector is not installed (you can use filterXPath instead).');
              // @codeCoverageIgnoreEnd
          }

          return $this->filterXPath(CssSelector::toXPath($selector));
      }

      /**
       * Selects links by name or alt value for clickable images.
       *
       * @param string $value The link text
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       *
       * @api
       */
      public function selectLink($value)
      {
          // Padding with spaces on both sides makes contains() match whole words.
          $padded = static::xpathLiteral(' '.$value.' ');

          $xpath = sprintf('//a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s)] ', $padded).
                   sprintf('| //a/img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]/ancestor::a', $padded);

          return $this->filterXPath($xpath);
      }

      /**
       * Selects a button by name or alt value for images.
       *
       * Matches submit/button inputs by value, image inputs by alt text and
       * <button> elements by content; the id and name attributes are also
       * compared against the value.
       *
       * @param string $value The button text
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       *
       * @api
       */
      public function selectButton($value)
      {
          $padded = static::xpathLiteral(' '.$value.' ');
          // The id/name comparisons must use an XPath literal as well: a raw
          // value containing a double quote would break (or alter) the query.
          $exact = static::xpathLiteral($value);

          $xpath = sprintf('//input[((@type="submit" or @type="button") and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $padded).
                   sprintf('or (@type="image" and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id=%s or @name=%s] ', $padded, $exact, $exact).
                   sprintf('| //button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id=%s or @name=%s]', $padded, $exact, $exact);

          return $this->filterXPath($xpath);
      }

      /**
       * Returns a Link object for the first node in the list.
       *
       * @param string $method The method for the link (get by default)
       *
       * @return Link A Link instance
       *
       * @throws \InvalidArgumentException If the current node list is empty
       *
       * @api
       */
      public function link($method = 'get')
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          $node = $this->getNode(0);

          return new Link($node, $this->uri, $method);
      }

      /**
       * Returns an array of Link objects for the nodes in the list.
       *
       * @return array An array of Link instances
       *
       * @api
       */
      public function links()
      {
          $links = array();
          foreach ($this as $node) {
              $links[] = new Link($node, $this->uri, 'get');
          }

          return $links;
      }

      /**
       * Returns a Form object for the first node in the list.
       *
       * @param array  $values An array of values for the form fields
       * @param string $method The method for the form
       *
       * @return Form A Form instance
       *
       * @throws \InvalidArgumentException If the current node list is empty
       *
       * @api
       */
      public function form(array $values = null, $method = null)
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          $form = new Form($this->getNode(0), $this->uri, $method);

          if (null !== $values) {
              $form->setValues($values);
          }

          return $form;
      }

      /**
       * Converts a string into an XPath literal.
       *
       * XPath 1.0 has no escape sequence inside string literals, so the
       * quoting style depends on the content: single quotes when the string
       * has no apostrophe, double quotes when it has no double quote, and a
       * concat() of separately quoted pieces when it contains both.
       *
       * @param string $s The string to convert
       *
       * @return string An XPath literal or concat() expression
       */
      public static function xpathLiteral($s)
      {
          if (false === strpos($s, "'")) {
              return sprintf("'%s'", $s);
          }

          if (false === strpos($s, '"')) {
              return sprintf('"%s"', $s);
          }

          $string = $s;
          $parts = array();
          while (true) {
              if (false !== $pos = strpos($string, "'")) {
                  $parts[] = sprintf("'%s'", substr($string, 0, $pos));
                  $parts[] = "\"'\"";
                  $string = substr($string, $pos + 1);
              } else {
                  $parts[] = "'$string'";
                  break;
              }
          }

          // Separator first: the legacy implode(array, string) order is
          // rejected by PHP 8.
          return sprintf("concat(%s)", implode(', ', $parts));
      }

      /**
       * Returns the raw node stored at the given position.
       *
       * @param integer $position The position
       *
       * @return \DOMNode|null The node, or null when the position does not exist
       */
      private function getNode($position)
      {
          foreach ($this as $i => $node) {
              if ($i == $position) {
                  return $node;
              }
          }

          // @codeCoverageIgnoreStart
          return null;
          // @codeCoverageIgnoreEnd
      }

      /**
       * Collects the element nodes reachable from a node in one direction.
       *
       * The start node itself is included when it is an element, but the
       * first node of the current selection is always left out.
       *
       * @param \DOMNode|null $node       The node to start from (null yields an empty list)
       * @param string        $siblingDir The DOMNode property to follow ("nextSibling" or "previousSibling")
       *
       * @return array An array of \DOMNode instances
       */
      private function sibling($node, $siblingDir = 'nextSibling')
      {
          $nodes = array();
          // Loop-invariant: look the reference node up once.
          $reference = $this->getNode(0);

          while (null !== $node) {
              if ($node !== $reference && XML_ELEMENT_NODE === $node->nodeType) {
                  $nodes[] = $node;
              }
              $node = $node->$siblingDir;
          }

          return $nodes;
      }
  }