PageRenderTime 53ms CodeModel.GetById 24ms RepoModel.GetById 1ms app.codeStats 0ms

/src/Symfony/Components/DomCrawler/Crawler.php

https://github.com/come/symfony
PHP | 617 lines | 310 code | 77 blank | 230 comment | 37 complexity | 1e576e5225d80fa272472ca1858f3a55 MD5 | raw file
Possible License(s): ISC
  1. <?php
  2. namespace Symfony\Components\DomCrawler;
  3. use Symfony\Components\CssSelector\Parser as CssParser;
  4. /*
  5. * This file is part of the Symfony package.
  6. *
  7. * (c) Fabien Potencier <fabien.potencier@symfony-project.com>
  8. *
  9. * For the full copyright and license information, please view the LICENSE
  10. * file that was distributed with this source code.
  11. */
  /**
   * Crawler eases navigation of a list of \DOMNode objects.
   *
   * The crawler is an \SplObjectStorage holding DOM nodes. Every traversal or
   * filtering method returns a new Crawler instance (created with `new static`)
   * that carries the same base URI as the instance it was called on.
   *
   * @package Symfony
   * @subpackage Components_DomCrawler
   * @author Fabien Potencier <fabien.potencier@symfony-project.com>
   */
  class Crawler extends \SplObjectStorage
  {
      /**
       * @var string|null The base URI given to the constructor
       */
      protected $uri;

      /**
       * @var string|null The scheme and authority part of the base URI (null when the URI is not HTTP(S))
       */
      protected $host;

      /**
       * @var string The directory part of the base URI path (always ends with "/")
       */
      protected $path;

      /**
       * Constructor.
       *
       * @param mixed  $node A Node to use as the base for the crawling
       * @param string $uri  The base URI to use for absolute links or form actions
       */
      public function __construct($node = null, $uri = null)
      {
          $this->uri = $uri;
          list($this->host, $this->path) = $this->parseUri($this->uri);

          $this->add($node);
      }

      /**
       * Removes all the nodes.
       */
      public function clear()
      {
          $this->removeAll($this);
      }

      /**
       * Adds a node to the current list of nodes.
       *
       * This method uses the appropriate specialized add*() method based
       * on the type of the argument: a \DOMNodeList goes to addNodeList(),
       * an array to addNodes(), a string to addContent() and any other
       * object to addNode(). Every other value (null included) is ignored.
       *
       * @param null|\DOMNodeList|array|string|\DOMNode $node A node
       */
      public function add($node)
      {
          if ($node instanceof \DOMNodeList) {
              $this->addNodeList($node);
          } elseif (is_array($node)) {
              $this->addNodes($node);
          } elseif (is_string($node)) {
              $this->addContent($node);
          } elseif (is_object($node)) {
              $this->addNode($node);
          }
      }

      /**
       * Adds HTML or XML content to the list of nodes.
       *
       * The parser is chosen from the content type: a type containing "xml"
       * (but not "html") is parsed as XML, a type containing "html" as HTML.
       * Content of any other type is ignored.
       *
       * @param string $content The content
       * @param string $type    The content type, optionally with a charset parameter
       *                        (defaults to "text/html")
       *
       * @return null
       */
      public function addContent($content, $type = null)
      {
          if (empty($type)) {
              $type = 'text/html';
          }

          // DOM only for HTML/XML content
          if (!preg_match('/(x|ht)ml/i', $type, $matches)) {
              return null;
          }

          // only keep the charset token itself: the parameter may be quoted
          // and may be followed by other parameters
          $charset = 'ISO-8859-1';
          if (preg_match('/charset\s*=\s*["\']?([^"\';\s]+)/i', $type, $charsetMatches)) {
              $charset = $charsetMatches[1];
          }

          // the match above is case-insensitive, so the comparison must be too
          if ('x' === strtolower($matches[1])) {
              $this->addXmlContent($content, $charset);
          } else {
              $this->addHtmlContent($content, $charset);
          }
      }

      /**
       * Adds an HTML content to the list of nodes.
       *
       * Parsing warnings are silenced on purpose (real-world HTML is rarely
       * valid). Empty content adds nothing.
       *
       * @param string $content The HTML content
       * @param string $charset The charset
       */
      public function addHtmlContent($content, $charset = 'UTF-8')
      {
          $content = (string) $content;
          if ('' === $content) {
              // the parser rejects an empty source, and there is nothing to add anyway
              return;
          }

          $dom = new \DOMDocument('1.0', $charset);
          $dom->validateOnParse = true;

          @$dom->loadHTML($content);
          $this->addDocument($dom);
      }

      /**
       * Adds an XML content to the list of nodes.
       *
       * Parsing warnings are silenced on purpose. Empty content adds nothing.
       *
       * @param string $content The XML content
       * @param string $charset The charset
       */
      public function addXmlContent($content, $charset = 'UTF-8')
      {
          $content = (string) $content;
          if ('' === $content) {
              // the parser rejects an empty source, and there is nothing to add anyway
              return;
          }

          $dom = new \DOMDocument('1.0', $charset);
          $dom->validateOnParse = true;

          // remove the default namespace to make XPath expressions simpler
          @$dom->loadXML(str_replace('xmlns', 'ns', $content));
          $this->addDocument($dom);
      }

      /**
       * Adds a \DOMDocument to the list of nodes.
       *
       * Only the document element is added; a document without one is ignored.
       *
       * @param \DOMDocument $dom A \DOMDocument instance
       */
      public function addDocument(\DOMDocument $dom)
      {
          if ($dom->documentElement) {
              $this->addNode($dom->documentElement);
          }
      }

      /**
       * Adds a \DOMNodeList to the list of nodes.
       *
       * @param \DOMNodeList $nodes A \DOMNodeList instance
       */
      public function addNodeList(\DOMNodeList $nodes)
      {
          foreach ($nodes as $node) {
              $this->addNode($node);
          }
      }

      /**
       * Adds an array of \DOMNode instances to the list of nodes.
       *
       * Each entry goes through add(), so nested lists are accepted too.
       *
       * @param array $nodes An array of \DOMNode instances
       */
      public function addNodes(array $nodes)
      {
          foreach ($nodes as $node) {
              $this->add($node);
          }
      }

      /**
       * Adds a \DOMNode instance to the list of nodes.
       *
       * A \DOMDocument is replaced by its document element; an empty document
       * (no document element) adds nothing.
       *
       * @param \DOMNode $node A \DOMNode instance
       */
      public function addNode(\DOMNode $node)
      {
          if ($node instanceof \DOMDocument) {
              // addDocument() skips documents that have no document element
              $this->addDocument($node);
          } else {
              $this->attach($node);
          }
      }

      /**
       * Returns a node given its position in the node list.
       *
       * @param integer $position The position
       *
       * @return Crawler A new instance of the Crawler with the selected node, or an empty Crawler if it does not exist.
       */
      public function eq($position)
      {
          foreach ($this as $i => $node) {
              if ($i == $position) {
                  return new static($node, $this->uri);
              }
          }

          return new static(null, $this->uri);
      }

      /**
       * Calls an anonymous function on each node of the list.
       *
       * The anonymous function receives the node and its position as arguments.
       *
       * Example:
       *
       *     $crawler->filter('h1')->each(function ($node, $i)
       *     {
       *         return $node->nodeValue;
       *     });
       *
       * @param \Closure $closure An anonymous function
       *
       * @return array An array of values returned by the anonymous function
       */
      public function each(\Closure $closure)
      {
          $data = array();
          foreach ($this as $i => $node) {
              $data[] = $closure($node, $i);
          }

          return $data;
      }

      /**
       * Reduces the list of nodes by calling an anonymous function.
       *
       * The anonymous function receives the node and its position as arguments.
       * To remove a node from the list, the anonymous function must return false
       * (strictly; any other return value keeps the node).
       *
       * @param \Closure $closure An anonymous function
       *
       * @return Crawler A Crawler instance with the selected nodes.
       */
      public function reduce(\Closure $closure)
      {
          $nodes = array();
          foreach ($this as $i => $node) {
              if (false !== $closure($node, $i)) {
                  $nodes[] = $node;
              }
          }

          return new static($nodes, $this->uri);
      }

      /**
       * Returns the first node of the current selection
       *
       * @return Crawler A Crawler instance with the first selected node
       */
      public function first()
      {
          return $this->eq(0);
      }

      /**
       * Returns the last node of the current selection
       *
       * @return Crawler A Crawler instance with the last selected node
       */
      public function last()
      {
          return $this->eq(count($this) - 1);
      }

      /**
       * Returns the siblings nodes of the current selection
       *
       * Only element nodes are returned, and the first node of the selection
       * itself is excluded.
       *
       * @return Crawler A Crawler instance with the sibling nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       */
      public function siblings()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          // a detached node has no parent, hence no siblings
          $parent = $this->getNode(0)->parentNode;
          $first = $parent ? $parent->firstChild : null;

          return new static($this->sibling($first), $this->uri);
      }

      /**
       * Returns the next siblings nodes of the current selection
       *
       * @return Crawler A Crawler instance with the next sibling nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       */
      public function nextAll()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return new static($this->sibling($this->getNode(0)), $this->uri);
      }

      /**
       * Returns the previous sibling nodes of the current selection
       *
       * @return Crawler A Crawler instance with the previous sibling nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       */
      public function previousAll()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return new static($this->sibling($this->getNode(0), 'previousSibling'), $this->uri);
      }

      /**
       * Returns the parents nodes of the current selection
       *
       * Ancestors are listed from the closest one outwards. Only element nodes
       * are kept, and the artificial "_root" element created by filterXPath()
       * is skipped.
       *
       * @return Crawler A Crawler instance with the parents nodes of the current selection
       *
       * @throws \InvalidArgumentException When current node is empty
       */
      public function parents()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          $node = $this->getNode(0);
          $nodes = array();

          while ($node = $node->parentNode) {
              if (1 === $node->nodeType && '_root' !== $node->nodeName) {
                  $nodes[] = $node;
              }
          }

          return new static($nodes, $this->uri);
      }

      /**
       * Returns the children nodes of the current selection
       *
       * Only element children of the first node are returned; a node without
       * children yields an empty Crawler.
       *
       * @return Crawler A Crawler instance with the children nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       */
      public function children()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          $first = $this->getNode(0)->firstChild;

          return new static($first ? $this->sibling($first) : array(), $this->uri);
      }

      /**
       * Returns the attribute value of the first node of the list.
       *
       * @param string $attribute The attribute name
       *
       * @return string The attribute value
       *
       * @throws \InvalidArgumentException When current node is empty
       */
      public function attr($attribute)
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return $this->getNode(0)->getAttribute($attribute);
      }

      /**
       * Returns the node value of the first node of the list.
       *
       * @return string The node value
       *
       * @throws \InvalidArgumentException When current node is empty
       */
      public function text()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return $this->getNode(0)->nodeValue;
      }

      /**
       * Extracts information from the list of nodes.
       *
       * You can extract attributes or/and the node value (_text).
       *
       * Example:
       *
       *     $crawler->filter('h1 a')->extract(array('_text', 'href'));
       *
       * @param array|string $attributes An array of attributes (a single attribute name is accepted too)
       *
       * @return array An array of extracted values: one entry per node, which is the
       *               value itself when a single attribute is requested and an array
       *               of values otherwise
       */
      public function extract($attributes)
      {
          if (!is_array($attributes)) {
              $attributes = array($attributes);
          }

          $data = array();
          foreach ($this as $node) {
              $elements = array();
              foreach ($attributes as $attribute) {
                  if ('_text' === $attribute) {
                      $elements[] = $node->nodeValue;
                  } else {
                      $elements[] = $node->getAttribute($attribute);
                  }
              }

              $data[] = count($attributes) > 1 ? $elements : $elements[0];
          }

          return $data;
      }

      /**
       * Filters the list of nodes with an XPath expression.
       *
       * The nodes are deep-copied under an artificial "_root" element of a
       * fresh document, and the expression is evaluated against that document.
       *
       * @param string $xpath An XPath expression
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       */
      public function filterXPath($xpath)
      {
          $document = new \DOMDocument('1.0', 'UTF-8');
          $root = $document->appendChild($document->createElement('_root'));
          foreach ($this as $node) {
              $root->appendChild($document->importNode($node, true));
          }

          $domxpath = new \DOMXPath($document);

          return new static($domxpath->query($xpath), $this->uri);
      }

      /**
       * Filters the list of nodes with a CSS selector.
       *
       * This method only works if you have installed the CssSelector Symfony Component.
       *
       * @param string $selector A CSS selector
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       *
       * @throws \RuntimeException if the CssSelector Component is not available
       */
      public function filter($selector)
      {
          if (!class_exists('Symfony\\Components\\CssSelector\\Parser')) {
              // @codeCoverageIgnoreStart
              throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector is not installed (you can use filterXPath instead).');
              // @codeCoverageIgnoreEnd
          }

          return $this->filterXPath(CssParser::cssToXpath($selector));
      }

      /**
       * Selects links by name or alt value for clickable images.
       *
       * @param string $value The link text
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       */
      public function selectLink($value)
      {
          $padded = static::xpathLiteral(' '.$value.' ');

          $xpath = sprintf('//a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s)] ', $padded).
                   sprintf('| //a/img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]/ancestor::a', $padded);

          return $this->filterXPath($xpath);
      }

      /**
       * Selects a button by name or alt value for images.
       *
       * Matches submit/button inputs by value, image inputs by alt text,
       * <button> elements by content, and any of them by id or name.
       *
       * @param string $value The button text
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       */
      public function selectButton($value)
      {
          $padded = static::xpathLiteral(' '.$value.' ');
          // the id/name comparisons must be quoted as well, otherwise a value
          // containing a double quote produces an invalid expression
          $exact = static::xpathLiteral($value);

          $xpath = sprintf('//input[((@type="submit" or @type="button") and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $padded).
                   sprintf('or (@type="image" and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id=%s or @name=%s] ', $padded, $exact, $exact).
                   sprintf('| //button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id=%s or @name=%s]', $padded, $exact, $exact);

          return $this->filterXPath($xpath);
      }

      /**
       * Returns a Link object for the first node in the list.
       *
       * @param string $method The method for the link (get by default)
       *
       * @return Link A Link instance
       *
       * @throws \InvalidArgumentException If the current node list is empty
       */
      public function link($method = 'get')
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          $node = $this->getNode(0);

          return new Link($node, $method, $this->host, $this->path);
      }

      /**
       * Returns an array of Link objects for the nodes in the list.
       *
       * @return array An array of Link instances
       */
      public function links()
      {
          $links = array();
          foreach ($this as $node) {
              $links[] = new Link($node, 'get', $this->host, $this->path);
          }

          return $links;
      }

      /**
       * Returns a Form object for the first node in the list.
       *
       * @param array  $values An array of values for the form fields
       * @param string $method The method for the form
       *
       * @return Form A Form instance
       *
       * @throws \InvalidArgumentException If the current node list is empty
       */
      public function form(array $values = null, $method = null)
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          $form = new Form($this->getNode(0), $method, $this->host, $this->path);

          if (null !== $values) {
              $form->setValues($values);
          }

          return $form;
      }

      /**
       * Returns the node stored at the given position.
       *
       * @param integer $position The position
       *
       * @return \DOMNode|null The node, or null when there is no node at this position
       */
      protected function getNode($position)
      {
          foreach ($this as $i => $node) {
              if ($i == $position) {
                  return $node;
              }
          // @codeCoverageIgnoreStart
          }

          return null;
          // @codeCoverageIgnoreEnd
      }

      /**
       * Splits a base URI into a host part and a directory path.
       *
       * @param string $uri The base URI
       *
       * @return array A two-element array: the scheme and authority (null when the
       *               URI does not start with "http") and the directory part of the
       *               path, always ending with "/"
       */
      protected function parseUri($uri)
      {
          $uri = (string) $uri;
          if ('http' !== substr($uri, 0, 4)) {
              return array(null, '/');
          }

          // parse_url() gives null when the URI has no path at all (http://host)
          $path = (string) parse_url($uri, PHP_URL_PATH);
          if ('' === $path) {
              $path = '/';
          } elseif ('/' !== substr($path, -1)) {
              // drop the file name, keep the directory
              $path = substr($path, 0, strrpos($path, '/') + 1);
          }

          return array(preg_replace('#^(.*?//[^/]+)\/.*$#', '$1', $uri), $path);
      }

      /**
       * Collects element nodes by walking a sibling chain.
       *
       * The walk starts at $node (included) and follows $siblingDir until the
       * chain ends. The first node of the current selection is never returned.
       *
       * @param \DOMNode|null $node       The node to start from (null yields an empty list)
       * @param string        $siblingDir The property to follow: nextSibling or previousSibling
       *
       * @return array An array of \DOMNode instances
       */
      protected function sibling($node, $siblingDir = 'nextSibling')
      {
          $nodes = array();
          if (null === $node) {
              return $nodes;
          }

          // loop invariant: look the reference node up once
          $current = $this->getNode(0);

          do {
              if ($node !== $current && $node->nodeType === 1) {
                  $nodes[] = $node;
              }
          } while ($node = $node->$siblingDir);

          return $nodes;
      }

      /**
       * Converts a string to an XPath literal.
       *
       * XPath 1.0 has no escape sequence for quotes: a string without single
       * quotes is wrapped in single quotes, a string without double quotes is
       * wrapped in double quotes, and a string containing both is rebuilt with
       * concat() from pieces that each contain only one kind of quote.
       *
       * @param string $s The string to convert
       *
       * @return string An XPath expression evaluating to the given string
       */
      static public function xpathLiteral($s)
      {
          if (false === strpos($s, "'")) {
              return sprintf("'%s'", $s);
          }

          if (false === strpos($s, '"')) {
              return sprintf('"%s"', $s);
          }

          $string = $s;
          $parts = array();
          while (true) {
              if (false !== $pos = strpos($string, "'")) {
                  $parts[] = sprintf("'%s'", substr($string, 0, $pos));
                  $parts[] = "\"'\"";
                  $string = substr($string, $pos + 1);
              } else {
                  $parts[] = "'$string'";
                  break;
              }
          }

          // separator first: the reversed legacy argument order is rejected by PHP 8
          return sprintf("concat(%s)", implode(', ', $parts));
      }
  }