PageRenderTime 51ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/vendor/symfony/dom-crawler/Crawler.php

https://gitlab.com/ealexis.t/trends
PHP | 1038 lines | 487 code | 135 blank | 416 comment | 61 complexity | fa9fd5f0b18902750064bea21a6293f1 MD5 | raw file
  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien@symfony.com>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Component\DomCrawler;
  11. use Symfony\Component\CssSelector\CssSelectorConverter;
  /**
   * Crawler eases navigation of a list of \DOMNode objects.
   *
   * All nodes held by one crawler belong to the same \DOMDocument. The class
   * offers helpers to load HTML/XML content, to traverse the node list
   * (eq, siblings, parents, children, ...), to filter it with XPath or CSS
   * expressions and to extract values from the selected nodes.
   *
   * @author Fabien Potencier <fabien@symfony.com>
   */
  class Crawler implements \Countable, \IteratorAggregate
  {
      /**
       * @var string The current URI
       */
      protected $uri;

      /**
       * @var string The default namespace prefix to be used with XPath and CSS expressions
       */
      private $defaultNamespacePrefix = 'default';

      /**
       * @var array A map of manually registered namespaces (prefix => namespace URI)
       */
      private $namespaces = array();

      /**
       * @var string The base href value
       */
      private $baseHref;

      /**
       * @var \DOMDocument|null The document every node of this crawler belongs to
       */
      private $document;

      /**
       * @var \DOMElement[]
       */
      private $nodes = array();

      /**
       * Whether the Crawler contains HTML or XML content (used when converting CSS to XPath).
       *
       * @var bool
       */
      private $isHtml = true;

      /**
       * Constructor.
       *
       * @param mixed  $node       A Node to use as the base for the crawling
       * @param string $currentUri The current URI
       * @param string $baseHref   The base href value (falls back to $currentUri when empty)
       */
      public function __construct($node = null, $currentUri = null, $baseHref = null)
      {
          $this->uri = $currentUri;
          $this->baseHref = $baseHref ?: $currentUri;

          $this->add($node);
      }

      /**
       * Removes all the nodes and forgets the document they belonged to.
       */
      public function clear()
      {
          $this->nodes = array();
          $this->document = null;
      }

      /**
       * Adds a node to the current list of nodes.
       *
       * This method uses the appropriate specialized add*() method based
       * on the type of the argument. A null argument is silently ignored.
       *
       * @param \DOMNodeList|\DOMNode|array|string|null $node A node
       *
       * @throws \InvalidArgumentException When node is not the expected type.
       */
      public function add($node)
      {
          if ($node instanceof \DOMNodeList) {
              $this->addNodeList($node);
          } elseif ($node instanceof \DOMNode) {
              $this->addNode($node);
          } elseif (is_array($node)) {
              $this->addNodes($node);
          } elseif (is_string($node)) {
              $this->addContent($node);
          } elseif (null !== $node) {
              throw new \InvalidArgumentException(sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', is_object($node) ? get_class($node) : gettype($node)));
          }
      }

      /**
       * Adds HTML/XML content.
       *
       * If the charset is not set via the content type, it is assumed
       * to be ISO-8859-1, which is the default charset defined by the
       * HTTP 1.1 specification.
       *
       * Content whose type is neither HTML nor XML is ignored.
       *
       * @param string      $content A string to parse as HTML/XML
       * @param null|string $type    The content type of the string
       */
      public function addContent($content, $type = null)
      {
          if (empty($type)) {
              // Guess the type from the XML declaration when none is given
              $type = 0 === strpos($content, '<?xml') ? 'application/xml' : 'text/html';
          }

          // DOM only for HTML/XML content
          if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
              return;
          }

          $charset = null;
          if (false !== $pos = stripos($type, 'charset=')) {
              // 8 === strlen('charset='); keep only the value up to the next parameter
              $charset = substr($type, $pos + 8);
              if (false !== $pos = strpos($charset, ';')) {
                  $charset = substr($charset, 0, $pos);
              }
          }

          // http://www.w3.org/TR/encoding/#encodings
          // http://www.w3.org/TR/REC-xml/#NT-EncName
          if (null === $charset &&
              preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) {
              $charset = $matches[1];
          }

          if (null === $charset) {
              $charset = 'ISO-8859-1';
          }

          if ('x' === $xmlMatches[1]) {
              $this->addXmlContent($content, $charset);
          } else {
              $this->addHtmlContent($content, $charset);
          }
      }

      /**
       * Adds an HTML content to the list of nodes.
       *
       * The libxml errors are disabled when the content is parsed.
       *
       * If you want to get parsing errors, be sure to enable
       * internal errors via libxml_use_internal_errors(true)
       * and then, get the errors via libxml_get_errors(). Be
       * sure to clear errors with libxml_clear_errors() afterward.
       *
       * When the document contains a <base href>, the crawler's base href is
       * updated from it (resolved against the previous base href if there is one).
       *
       * @param string $content The HTML content
       * @param string $charset The charset
       */
      public function addHtmlContent($content, $charset = 'UTF-8')
      {
          $internalErrors = libxml_use_internal_errors(true);
          $disableEntities = $this->disableEntityLoader();

          $dom = new \DOMDocument('1.0', $charset);
          $dom->validateOnParse = true;

          // Turn any warning/notice raised by the conversion into an exception so that
          // a failed conversion simply leaves $content untouched.
          set_error_handler(function () {throw new \Exception();});

          try {
              // Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML()
              $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
          } catch (\Exception $e) {
          } catch (\ValueError $e) {
              // PHP 8 throws ValueError for an unknown charset; swallowing it here
              // guarantees the handler and libxml state below are always restored.
          }

          restore_error_handler();

          if ('' !== trim($content)) {
              @$dom->loadHTML($content);
          }

          libxml_use_internal_errors($internalErrors);
          $this->restoreEntityLoader($disableEntities);

          $this->addDocument($dom);

          $base = $this->filterRelativeXPath('descendant-or-self::base')->extract(array('href'));

          $baseHref = current($base);
          if (count($base) && !empty($baseHref)) {
              if ($this->baseHref) {
                  // Resolve the (possibly relative) <base href> against the current base
                  $linkNode = $dom->createElement('a');
                  $linkNode->setAttribute('href', $baseHref);
                  $link = new Link($linkNode, $this->baseHref);
                  $this->baseHref = $link->getUri();
              } else {
                  $this->baseHref = $baseHref;
              }
          }
      }

      /**
       * Adds an XML content to the list of nodes.
       *
       * The libxml errors are disabled when the content is parsed.
       *
       * If you want to get parsing errors, be sure to enable
       * internal errors via libxml_use_internal_errors(true)
       * and then, get the errors via libxml_get_errors(). Be
       * sure to clear errors with libxml_clear_errors() afterward.
       *
       * @param string $content The XML content
       * @param string $charset The charset
       * @param int    $options Bitwise OR of the libxml option constants
       *                        LIBXML_PARSEHUGE is dangerous, see
       *                        http://symfony.com/blog/security-release-symfony-2-0-17-released
       */
      public function addXmlContent($content, $charset = 'UTF-8', $options = LIBXML_NONET)
      {
          // remove the default namespace if it's the only namespace to make XPath expressions simpler
          if (!preg_match('/xmlns:/', $content)) {
              $content = str_replace('xmlns', 'ns', $content);
          }

          $internalErrors = libxml_use_internal_errors(true);
          $disableEntities = $this->disableEntityLoader();

          $dom = new \DOMDocument('1.0', $charset);
          $dom->validateOnParse = true;

          if ('' !== trim($content)) {
              @$dom->loadXML($content, $options);
          }

          libxml_use_internal_errors($internalErrors);
          $this->restoreEntityLoader($disableEntities);

          $this->addDocument($dom);

          $this->isHtml = false;
      }

      /**
       * Adds a \DOMDocument to the list of nodes.
       *
       * Only the document element is added; an empty document adds nothing.
       *
       * @param \DOMDocument $dom A \DOMDocument instance
       */
      public function addDocument(\DOMDocument $dom)
      {
          if ($dom->documentElement) {
              $this->addNode($dom->documentElement);
          }
      }

      /**
       * Adds a \DOMNodeList to the list of nodes.
       *
       * @param \DOMNodeList $nodes A \DOMNodeList instance
       */
      public function addNodeList(\DOMNodeList $nodes)
      {
          foreach ($nodes as $node) {
              if ($node instanceof \DOMNode) {
                  $this->addNode($node);
              }
          }
      }

      /**
       * Adds an array of \DOMNode instances to the list of nodes.
       *
       * Each entry goes through add(), so nested arrays, node lists and
       * strings are accepted as well.
       *
       * @param \DOMNode[] $nodes An array of \DOMNode instances
       */
      public function addNodes(array $nodes)
      {
          foreach ($nodes as $node) {
              $this->add($node);
          }
      }

      /**
       * Adds a \DOMNode instance to the list of nodes.
       *
       * A \DOMDocument is replaced by its document element; a document without
       * a document element adds nothing. Nodes already present are not added twice.
       *
       * @param \DOMNode $node A \DOMNode instance
       *
       * @throws \InvalidArgumentException When the node belongs to another document than the nodes already held
       */
      public function addNode(\DOMNode $node)
      {
          if ($node instanceof \DOMDocument) {
              $node = $node->documentElement;

              // An empty document has no root element: nothing to add (mirrors addDocument())
              if (null === $node) {
                  return;
              }
          }

          if (null !== $this->document && $this->document !== $node->ownerDocument) {
              throw new \InvalidArgumentException('Attaching DOM nodes from multiple documents in the same crawler is forbidden.');
          }

          if (null === $this->document) {
              $this->document = $node->ownerDocument;
          }

          // Don't add duplicate nodes in the Crawler
          if (in_array($node, $this->nodes, true)) {
              return;
          }

          $this->nodes[] = $node;
      }

      /**
       * Returns a node given its position in the node list.
       *
       * @param int $position The position
       *
       * @return Crawler A new instance of the Crawler with the selected node, or an empty Crawler if it does not exist.
       */
      public function eq($position)
      {
          if (isset($this->nodes[$position])) {
              return $this->createSubCrawler($this->nodes[$position]);
          }

          return $this->createSubCrawler(null);
      }

      /**
       * Calls an anonymous function on each node of the list.
       *
       * The anonymous function receives the node wrapped in a Crawler
       * instance and its position as arguments.
       *
       * Example:
       *
       *     $crawler->filter('h1')->each(function ($node, $i) {
       *         return $node->text();
       *     });
       *
       * @param \Closure $closure An anonymous function
       *
       * @return array An array of values returned by the anonymous function
       */
      public function each(\Closure $closure)
      {
          $data = array();
          foreach ($this->nodes as $i => $node) {
              $data[] = $closure($this->createSubCrawler($node), $i);
          }

          return $data;
      }

      /**
       * Slices the list of nodes by $offset and $length.
       *
       * @param int $offset
       * @param int $length
       *
       * @return Crawler A Crawler instance with the sliced nodes
       */
      public function slice($offset = 0, $length = null)
      {
          return $this->createSubCrawler(array_slice($this->nodes, $offset, $length));
      }

      /**
       * Reduces the list of nodes by calling an anonymous function.
       *
       * To remove a node from the list, the anonymous function must return false
       * (strictly; any other return value keeps the node).
       *
       * @param \Closure $closure An anonymous function
       *
       * @return Crawler A Crawler instance with the selected nodes.
       */
      public function reduce(\Closure $closure)
      {
          $nodes = array();
          foreach ($this->nodes as $i => $node) {
              if (false !== $closure($this->createSubCrawler($node), $i)) {
                  $nodes[] = $node;
              }
          }

          return $this->createSubCrawler($nodes);
      }

      /**
       * Returns the first node of the current selection.
       *
       * @return Crawler A Crawler instance with the first selected node
       */
      public function first()
      {
          return $this->eq(0);
      }

      /**
       * Returns the last node of the current selection.
       *
       * @return Crawler A Crawler instance with the last selected node
       */
      public function last()
      {
          return $this->eq(count($this->nodes) - 1);
      }

      /**
       * Returns the siblings nodes of the current selection.
       *
       * @return Crawler A Crawler instance with the sibling nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       */
      public function siblings()
      {
          $this->assertNotEmpty();

          // Walk forward from the parent's first child; sibling() skips the node itself
          return $this->createSubCrawler($this->sibling($this->getNode(0)->parentNode->firstChild));
      }

      /**
       * Returns the next siblings nodes of the current selection.
       *
       * @return Crawler A Crawler instance with the next sibling nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       */
      public function nextAll()
      {
          $this->assertNotEmpty();

          return $this->createSubCrawler($this->sibling($this->getNode(0)));
      }

      /**
       * Returns the previous sibling nodes of the current selection.
       *
       * @return Crawler A Crawler instance with the previous sibling nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       */
      public function previousAll()
      {
          $this->assertNotEmpty();

          return $this->createSubCrawler($this->sibling($this->getNode(0), 'previousSibling'));
      }

      /**
       * Returns the parents nodes of the current selection.
       *
       * Only element ancestors of the first node are returned, closest first.
       *
       * @return Crawler A Crawler instance with the parents nodes of the current selection
       *
       * @throws \InvalidArgumentException When current node is empty
       */
      public function parents()
      {
          $this->assertNotEmpty();

          $node = $this->getNode(0);
          $nodes = array();

          while ($node = $node->parentNode) {
              if (XML_ELEMENT_NODE === $node->nodeType) {
                  $nodes[] = $node;
              }
          }

          return $this->createSubCrawler($nodes);
      }

      /**
       * Returns the children nodes of the current selection.
       *
       * @return Crawler A Crawler instance with the children nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       */
      public function children()
      {
          $this->assertNotEmpty();

          $node = $this->getNode(0)->firstChild;

          return $this->createSubCrawler($node ? $this->sibling($node) : array());
      }

      /**
       * Returns the attribute value of the first node of the list.
       *
       * @param string $attribute The attribute name
       *
       * @return string|null The attribute value or null if the attribute does not exist
       *
       * @throws \InvalidArgumentException When current node is empty
       */
      public function attr($attribute)
      {
          $this->assertNotEmpty();

          $node = $this->getNode(0);

          return $node->hasAttribute($attribute) ? $node->getAttribute($attribute) : null;
      }

      /**
       * Returns the node name of the first node of the list.
       *
       * @return string The node name
       *
       * @throws \InvalidArgumentException When current node is empty
       */
      public function nodeName()
      {
          $this->assertNotEmpty();

          return $this->getNode(0)->nodeName;
      }

      /**
       * Returns the node value of the first node of the list.
       *
       * @return string The node value
       *
       * @throws \InvalidArgumentException When current node is empty
       */
      public function text()
      {
          $this->assertNotEmpty();

          return $this->getNode(0)->nodeValue;
      }

      /**
       * Returns the first node of the list as HTML.
       *
       * The result is the serialization of the node's children (inner HTML).
       *
       * @return string The node html
       *
       * @throws \InvalidArgumentException When current node is empty
       */
      public function html()
      {
          $this->assertNotEmpty();

          $html = '';
          foreach ($this->getNode(0)->childNodes as $child) {
              $html .= $child->ownerDocument->saveHTML($child);
          }

          return $html;
      }

      /**
       * Extracts information from the list of nodes.
       *
       * You can extract attributes or/and the node value (_text).
       *
       * Example:
       *
       *     $crawler->filter('h1 a')->extract(array('_text', 'href'));
       *
       * With exactly one requested attribute each entry of the result is the
       * value itself; otherwise each entry is an array of values (an empty
       * array when no attribute is requested).
       *
       * @param array $attributes An array of attributes
       *
       * @return array An array of extracted values
       */
      public function extract($attributes)
      {
          $attributes = (array) $attributes;
          $count = count($attributes);

          $data = array();
          foreach ($this->nodes as $node) {
              $elements = array();
              foreach ($attributes as $attribute) {
                  if ('_text' === $attribute) {
                      $elements[] = $node->nodeValue;
                  } else {
                      $elements[] = $node->getAttribute($attribute);
                  }
              }

              // Unwrap only when there is exactly one value, so that an empty
              // attribute list never reads the undefined offset 0.
              $data[] = 1 === $count ? $elements[0] : $elements;
          }

          return $data;
      }

      /**
       * Filters the list of nodes with an XPath expression.
       *
       * The XPath expression is evaluated in the context of the crawler, which
       * is considered as a fake parent of the elements inside it.
       * This means that a child selector "div" or "./div" will match only
       * the div elements of the current crawler, not their children.
       *
       * @param string $xpath An XPath expression
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       */
      public function filterXPath($xpath)
      {
          $xpath = $this->relativize($xpath);

          // If we dropped all expressions in the XPath while preparing it, there would be no match
          if ('' === $xpath) {
              return $this->createSubCrawler(null);
          }

          return $this->filterRelativeXPath($xpath);
      }

      /**
       * Filters the list of nodes with a CSS selector.
       *
       * This method only works if you have installed the CssSelector Symfony Component.
       *
       * @param string $selector A CSS selector
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       *
       * @throws \RuntimeException if the CssSelector Component is not available
       */
      public function filter($selector)
      {
          if (!class_exists('Symfony\\Component\\CssSelector\\CssSelectorConverter')) {
              throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector 2.8+ is not installed (you can use filterXPath instead).');
          }

          $converter = new CssSelectorConverter($this->isHtml);

          // The CssSelector already prefixes the selector with descendant-or-self::
          return $this->filterRelativeXPath($converter->toXPath($selector));
      }

      /**
       * Selects links by name or alt value for clickable images.
       *
       * The value is matched as a whole, space-delimited token sequence inside the
       * whitespace-normalized link text (or image alt text).
       *
       * @param string $value The link text
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       */
      public function selectLink($value)
      {
          $padded = static::xpathLiteral(' '.$value.' ');

          $xpath = sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) ', $padded).
                   sprintf('or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]]', $padded);

          return $this->filterRelativeXPath($xpath);
      }

      /**
       * Selects a button by name or alt value for images.
       *
       * Matches submit/button inputs by value, image inputs by alt text, button
       * elements by their text, and any of them by id or name.
       *
       * @param string $value The button text
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       */
      public function selectButton($value)
      {
          // XPath 1.0 has no lower-case(); translate() makes the @type comparison case-insensitive
          $translate = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';
          $padded = static::xpathLiteral(' '.$value.' ');
          $exact = static::xpathLiteral($value);

          $xpath = sprintf('descendant-or-self::input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $translate, $translate, $padded).
                   sprintf('or (contains(%s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id=%s or @name=%s] ', $translate, $padded, $exact, $exact).
                   sprintf('| descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id=%s or @name=%s]', $padded, $exact, $exact);

          return $this->filterRelativeXPath($xpath);
      }

      /**
       * Returns a Link object for the first node in the list.
       *
       * @param string $method The method for the link (get by default)
       *
       * @return Link A Link instance
       *
       * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
       */
      public function link($method = 'get')
      {
          $this->assertNotEmpty();

          $node = $this->getNode(0);

          if (!$node instanceof \DOMElement) {
              throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($node)));
          }

          return new Link($node, $this->baseHref, $method);
      }

      /**
       * Returns an array of Link objects for the nodes in the list.
       *
       * @return Link[] An array of Link instances
       *
       * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
       */
      public function links()
      {
          $links = array();
          foreach ($this->nodes as $node) {
              if (!$node instanceof \DOMElement) {
                  throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_class($node)));
              }

              $links[] = new Link($node, $this->baseHref, 'get');
          }

          return $links;
      }

      /**
       * Returns a Form object for the first node in the list.
       *
       * @param array  $values An array of values for the form fields
       * @param string $method The method for the form
       *
       * @return Form A Form instance
       *
       * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
       */
      public function form(array $values = null, $method = null)
      {
          $this->assertNotEmpty();

          $node = $this->getNode(0);

          if (!$node instanceof \DOMElement) {
              throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($node)));
          }

          $form = new Form($node, $this->uri, $method, $this->baseHref);

          if (null !== $values) {
              $form->setValues($values);
          }

          return $form;
      }

      /**
       * Overloads a default namespace prefix to be used with XPath and CSS expressions.
       *
       * @param string $prefix
       */
      public function setDefaultNamespacePrefix($prefix)
      {
          $this->defaultNamespacePrefix = $prefix;
      }

      /**
       * Registers a namespace to be used when this prefix appears in an XPath expression.
       *
       * @param string $prefix
       * @param string $namespace
       */
      public function registerNamespace($prefix, $namespace)
      {
          $this->namespaces[$prefix] = $namespace;
      }

      /**
       * Converts string for XPath expressions.
       *
       * Escaped characters are: quotes (") and apostrophe (').
       *
       *  Examples:
       *  <code>
       *     echo Crawler::xpathLiteral('foo " bar');
       *     //prints 'foo " bar'
       *
       *     echo Crawler::xpathLiteral("foo ' bar");
       *     //prints "foo ' bar"
       *
       *     echo Crawler::xpathLiteral('a\'b"c');
       *     //prints concat('a', "'", 'b"c')
       *  </code>
       *
       * @param string $s String to be escaped
       *
       * @return string Converted string
       */
      public static function xpathLiteral($s)
      {
          if (false === strpos($s, "'")) {
              return sprintf("'%s'", $s);
          }

          if (false === strpos($s, '"')) {
              return sprintf('"%s"', $s);
          }

          // Both quote kinds are present: XPath 1.0 has no escape sequence, so split on
          // apostrophes and glue the pieces back together with concat().
          $string = $s;
          $parts = array();
          while (true) {
              if (false !== $pos = strpos($string, "'")) {
                  $parts[] = sprintf("'%s'", substr($string, 0, $pos));
                  $parts[] = "\"'\"";
                  $string = substr($string, $pos + 1);
              } else {
                  $parts[] = "'$string'";
                  break;
              }
          }

          // Separator first: the (array, string) argument order was removed in PHP 8
          return sprintf('concat(%s)', implode(', ', $parts));
      }

      /**
       * Filters the list of nodes with an XPath expression.
       *
       * The XPath expression should already be processed to apply it in the context of each node.
       *
       * @param string $xpath
       *
       * @return Crawler
       */
      private function filterRelativeXPath($xpath)
      {
          $prefixes = $this->findNamespacePrefixes($xpath);

          $crawler = $this->createSubCrawler(null);

          foreach ($this->nodes as $node) {
              $domxpath = $this->createDOMXPath($node->ownerDocument, $prefixes);
              $crawler->add($domxpath->query($xpath, $node));
          }

          return $crawler;
      }

      /**
       * Make the XPath relative to the current context.
       *
       * The returned XPath will match elements matching the XPath inside the current crawler
       * when running in the context of a node of the crawler.
       *
       * @param string $xpath
       *
       * @return string
       */
      private function relativize($xpath)
      {
          $expressions = array();

          // Matches a "|" that is not inside a [...] predicate
          $unionPattern = '/\|(?![^\[]*\])/';
          // An expression which will never match to replace expressions which cannot match in the crawler.
          // Such expressions are replaced rather than dropped (presumably so that the preserved
          // opening parentheses and the union structure stay intact).
          $nonMatchingExpression = 'a[name() = "b"]';

          // Split any unions into individual expressions.
          foreach (preg_split($unionPattern, $xpath) as $expression) {
              $expression = trim($expression);
              $parenthesis = '';

              // If the union is inside some braces, we need to preserve the opening braces and apply
              // the change only inside it.
              if (preg_match('/^[\(\s*]+/', $expression, $matches)) {
                  $parenthesis = $matches[0];
                  $expression = substr($expression, strlen($parenthesis));
              }

              if (0 === strpos($expression, 'self::*/')) {
                  $expression = './'.substr($expression, 8);
              }

              // add prefix before absolute element selector
              if (empty($expression)) {
                  $expression = $nonMatchingExpression;
              } elseif (0 === strpos($expression, '//')) {
                  $expression = 'descendant-or-self::'.substr($expression, 2);
              } elseif (0 === strpos($expression, './/')) {
                  $expression = 'descendant-or-self::'.substr($expression, 3);
              } elseif (0 === strpos($expression, './')) {
                  $expression = 'self::'.substr($expression, 2);
              } elseif (0 === strpos($expression, 'child::')) {
                  $expression = 'self::'.substr($expression, 7);
              } elseif ('/' === $expression[0] || '.' === $expression[0] || 0 === strpos($expression, 'self::')) {
                  $expression = $nonMatchingExpression;
              } elseif (0 === strpos($expression, 'descendant::')) {
                  $expression = 'descendant-or-self::'.substr($expression, strlen('descendant::'));
              } elseif (preg_match('/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/', $expression)) {
                  // the fake root has no parent, preceding or following nodes and also no attributes (even no namespace attributes)
                  $expression = $nonMatchingExpression;
              } elseif (0 !== strpos($expression, 'descendant-or-self::')) {
                  $expression = 'self::'.$expression;
              }
              $expressions[] = $parenthesis.$expression;
          }

          return implode(' | ', $expressions);
      }

      /**
       * Returns the raw node stored at the given position.
       *
       * @param int $position
       *
       * @return \DOMElement|null The node, or null when the position does not exist
       */
      public function getNode($position)
      {
          if (isset($this->nodes[$position])) {
              return $this->nodes[$position];
          }

          return null;
      }

      /**
       * Returns the number of nodes held by the crawler.
       *
       * @return int
       */
      #[\ReturnTypeWillChange]
      public function count()
      {
          return count($this->nodes);
      }

      /**
       * Returns an iterator over the raw nodes held by the crawler.
       *
       * @return \ArrayIterator
       */
      #[\ReturnTypeWillChange]
      public function getIterator()
      {
          return new \ArrayIterator($this->nodes);
      }

      /**
       * Collects the element nodes reachable from $node in one sibling direction.
       *
       * The starting node itself is included when it is an element, but the
       * first node of the crawler is always left out.
       *
       * @param \DOMElement $node
       * @param string      $siblingDir Property to follow: 'nextSibling' or 'previousSibling'
       *
       * @return array
       */
      protected function sibling($node, $siblingDir = 'nextSibling')
      {
          $nodes = array();

          do {
              // 1 === XML_ELEMENT_NODE
              if ($node !== $this->getNode(0) && $node->nodeType === 1) {
                  $nodes[] = $node;
              }
          } while ($node = $node->$siblingDir);

          return $nodes;
      }

      /**
       * Creates a \DOMXPath for the document with every resolvable prefix registered.
       *
       * @param \DOMDocument $document
       * @param array        $prefixes
       *
       * @return \DOMXPath
       *
       * @throws \InvalidArgumentException
       */
      private function createDOMXPath(\DOMDocument $document, array $prefixes = array())
      {
          $domxpath = new \DOMXPath($document);

          foreach ($prefixes as $prefix) {
              $namespace = $this->discoverNamespace($domxpath, $prefix);
              if (null !== $namespace) {
                  $domxpath->registerNamespace($prefix, $namespace);
              }
          }

          return $domxpath;
      }

      /**
       * Resolves a prefix to a namespace URI.
       *
       * Manually registered namespaces win; otherwise the document is searched.
       * The default namespace prefix maps to the document's unprefixed namespace.
       *
       * @param \DOMXPath $domxpath
       * @param string    $prefix
       *
       * @return string|null The namespace URI, or null when none could be found
       *
       * @throws \InvalidArgumentException
       */
      private function discoverNamespace(\DOMXPath $domxpath, $prefix)
      {
          if (isset($this->namespaces[$prefix])) {
              return $this->namespaces[$prefix];
          }

          // ask for one namespace, otherwise we'd get a collection with an item for each node
          $namespaces = $domxpath->query(sprintf('(//namespace::*[name()="%s"])[last()]', $this->defaultNamespacePrefix === $prefix ? '' : $prefix));

          if ($node = $namespaces->item(0)) {
              return $node->nodeValue;
          }

          return null;
      }

      /**
       * Finds the namespace prefixes used in an XPath expression.
       *
       * @param string $xpath
       *
       * @return array
       */
      private function findNamespacePrefixes($xpath)
      {
          if (preg_match_all('/(?P<prefix>[a-z_][a-z_0-9\-\.]*+):[^"\/:]/i', $xpath, $matches)) {
              return array_unique($matches['prefix']);
          }

          return array();
      }

      /**
       * Creates a crawler for some subnodes.
       *
       * The new crawler shares this crawler's URI, base href, content kind and document.
       *
       * @param \DOMElement|\DOMElement[]|\DOMNodeList|null $nodes
       *
       * @return static
       */
      private function createSubCrawler($nodes)
      {
          $crawler = new static($nodes, $this->uri, $this->baseHref);
          $crawler->isHtml = $this->isHtml;
          $crawler->document = $this->document;

          return $crawler;
      }

      /**
       * Guards the methods that operate on the first node of the list.
       *
       * @throws \InvalidArgumentException When the current node list is empty
       */
      private function assertNotEmpty()
      {
          if (!$this->nodes) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }
      }

      /**
       * Disables the libxml external entity loader where that is still supported.
       *
       * libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is only
       * called on older versions.
       *
       * @return bool|null The previous loader state, or null when nothing was changed
       */
      private function disableEntityLoader()
      {
          if (\PHP_VERSION_ID >= 80000) {
              return null;
          }

          return libxml_disable_entity_loader(true);
      }

      /**
       * Restores the entity loader state returned by disableEntityLoader().
       *
       * @param bool|null $previous The value returned by disableEntityLoader()
       */
      private function restoreEntityLoader($previous)
      {
          if (null !== $previous) {
              libxml_disable_entity_loader($previous);
          }
      }
  }