PageRenderTime 46ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/src/Spider.php

https://gitlab.com/lalbert/daric
PHP | 395 lines | 180 code | 58 blank | 157 comment | 19 complexity | 1d80c7ca2148995ab7efb51f065d340f MD5 | raw file
  1. <?php
  2. namespace Daric;
  3. use Goutte\Client;
  4. use GuzzleHttp\Client as GuzzleClient;
  5. use Symfony\Component\DomCrawler\Crawler;
  6. use Daric\Extractor\ExtractorInterface;
  /**
   * Spider is not intended to crawl every link of a website, but only to
   * collect a specific list of links from a list of results, possibly
   * paginated.
   *
   * The object is countable and iterable: iterating over it yields the
   * collected links and transparently fetches the next result page (when a
   * next-link extractor is configured) once the links already collected are
   * exhausted.
   *
   * @author lalbert
   */
  class Spider implements \Countable, \Iterator
  {
      /**
       * @var string|null URI of the page to spider (also the base used to resolve relative links)
       */
      protected $uri;

      /**
       * @var string[] Absolute URIs collected so far, without duplicates
       */
      protected $links = [];

      /**
       * @var ExtractorInterface|null Extracts the list of links from a page
       */
      protected $linkExtractor;

      /**
       * @var ExtractorInterface|null Extracts the URI of the next result page
       */
      protected $nextLinkExtractor;

      /**
       * @var Client|null Goutte client, created lazily by getClient()
       */
      protected $client;

      /**
       * @var array|null Guzzle client configuration
       */
      protected $clientConfig;

      /**
       * @var Crawler|null Content of the last page fetched by spide()
       */
      protected $content;

      /**
       * @var int|null Cached value produced by the count-results extractor
       */
      protected $count;

      /**
       * @var ExtractorInterface|null Extracts the total number of results from a page
       */
      protected $countResultsExtractor;

      /**
       * Maximum number of links to collect. -1 means unlimited.
       *
       * @var int
       */
      protected $limit = -1;

      /**
       * Cursor position for \Iterator.
       *
       * @var int
       */
      private $index = 0;

      /**
       * @param string|null $uri URI of the first page to spider
       */
      public function __construct($uri = null)
      {
          if (null !== $uri) {
              $this->setUri($uri);
          }
      }

      /**
       * @param string $uri URI of the page to spider
       */
      public function setUri($uri)
      {
          $this->uri = $uri;
      }

      /**
       * @return string|null
       */
      public function getUri()
      {
          return $this->uri;
      }

      /**
       * Fetch a page and add the links it contains to the collected list.
       *
       * @param string|null $uri Page to fetch; when null the current URI is used
       *
       * @return string[] Every link collected so far (not only those of this page)
       *
       * @throws \InvalidArgumentException When no URI is available, or when the
       *                                   link extractor does not return an array
       * @throws \LogicException           When no link extractor has been set
       */
      public function spide($uri = null)
      {
          if (null !== $uri) {
              $this->setUri($uri);
          }

          if (!$this->getUri()) {
              throw new \InvalidArgumentException('You must set uri before run scrape.');
          }

          $this->content = $this->getClient()->request('GET', $this->getUri());
          $this->extractLinks();

          return $this->links;
      }

      /**
       * Extract the links of the current content and add them to the list.
       *
       * @throws \LogicException           When no link extractor has been set
       * @throws \InvalidArgumentException When the extractor does not return an array
       */
      protected function extractLinks()
      {
          // Fail with an explicit message instead of a "member function on null" error.
          if (!$this->linkExtractor) {
              throw new \LogicException('You must set a link extractor before extracting links.');
          }

          $links = $this->linkExtractor->extract($this->content);
          if (!\is_array($links)) {
              throw new \InvalidArgumentException('linkExtractor must return an array.');
          }

          foreach ($links as $link) {
              $this->addLink($link);
          }
      }

      /**
       * Add a link to the collected list.
       *
       * The link is resolved to an absolute URI first. It is ignored when it is
       * already in the list or when the limit has been reached.
       *
       * @param string $link
       *
       * @return \Daric\Spider
       */
      public function addLink($link)
      {
          if ($this->limit > -1 && \count($this->links) >= $this->limit) {
              return $this;
          }

          $link = $this->prepareLink($link);

          // Strict comparison: loose comparison treats numeric-looking strings
          // such as '1e3' and '1000' as equal.
          if (!\in_array($link, $this->links, true)) {
              $this->links[] = $link;
          }

          return $this;
      }

      /**
       * Resolve a (possibly relative) href against the current URI.
       *
       * @param string $href
       *
       * @return string Absolute URI
       */
      protected function prepareLink($href)
      {
          // The href is embedded in an HTML attribute, so it must be escaped:
          // a raw quote would truncate the value and a raw "&name" sequence
          // could be decoded as a character entity by the HTML parser.
          $escaped = \htmlspecialchars((string) $href, \ENT_QUOTES | \ENT_SUBSTITUTE, 'UTF-8');

          $crawler = new Crawler(
              '<html><body><a href="'.$escaped.'"></a></body></html>',
              $this->getUri()
          );

          return $crawler->filter('a')->link()->getUri();
      }

      /**
       * Get the Goutte client, creating it on first use.
       *
       * The client is built once from getClientConfig() and then reused, so the
       * configuration must be set before the first call.
       *
       * @return Client
       */
      public function getClient()
      {
          if (!$this->client) {
              $this->client = new Client();
              $this->client->setClient(
                  new GuzzleClient($this->getClientConfig())
              );
          }

          return $this->client;
      }

      /**
       * Set GuzzleClient config.
       *
       * Only taken into account if called before the client is created by
       * getClient().
       *
       * @see http://docs.guzzlephp.org/en/latest/request-options.html
       *
       * @param array $config
       *
       * @return \Daric\Spider
       */
      public function setClientConfig(array $config)
      {
          $this->clientConfig = $config;

          return $this;
      }

      /**
       * Retrieve the client configuration. If the configuration is not set (or
       * is empty), the default configuration is stored and returned.
       *
       * @return array
       */
      public function getClientConfig()
      {
          if (!$this->clientConfig) {
              $this->clientConfig = [
                  'allow_redirects' => true,
                  'cookies' => true,
              ];
          }

          return $this->clientConfig;
      }

      /**
       * Set the link extractor.
       * The extractor must return an array of strings, each one being a URI.
       *
       * @param ExtractorInterface $linkExtractor
       *
       * @return \Daric\Spider
       */
      public function setLinkExtractor(ExtractorInterface $linkExtractor)
      {
          $this->linkExtractor = $linkExtractor;

          return $this;
      }

      /**
       * Set the next-page extractor.
       * The extractor must return a URI string, or null when there is no next page.
       *
       * @param ExtractorInterface $nextLinkExtractor
       *
       * @return \Daric\Spider
       */
      public function setNextLinkExtractor(ExtractorInterface $nextLinkExtractor)
      {
          $this->nextLinkExtractor = $nextLinkExtractor;

          return $this;
      }

      /**
       * Implements \Countable::count().
       *
       * Retrieve the number of links to spider. If a count-results extractor is
       * set, its value (cast to int and cached after the first extraction) is
       * returned instead of the number of links collected so far. The first
       * page is fetched if no page has been fetched yet.
       *
       * Example:
       *
       * The spidered page contains a block with result information:
       *
       * <div id="results">
       * Results <span id="result-start">1</span> to <span id="result-end">10</span> from <span id="results-total">100</span>
       * </div>
       *
       * Set the count-results extractor with:
       *
       * new \Daric\Extractor\ChainExtractor([
       *     new \Daric\Extractor\CrawlerSelectorExtractor('#results-total'),
       *     new \Daric\Extractor\CrawlerNodeTextExtractor()
       * ]);
       *
       * @return int
       */
      #[\ReturnTypeWillChange]
      public function count()
      {
          if (!$this->countResultsExtractor) {
              return $this->currentCount();
          }

          if (!$this->content) {
              $this->spide();
          }

          // Compare with null so that a legitimate total of 0 is cached too.
          if (null === $this->count) {
              $this->count = (int) $this->countResultsExtractor->extract($this->content);
          }

          return $this->count;
      }

      /**
       * @param ExtractorInterface $countResultsExtractor
       *
       * @return \Daric\Spider
       */
      public function setCountResultsExtractor(ExtractorInterface $countResultsExtractor)
      {
          $this->countResultsExtractor = $countResultsExtractor;

          return $this;
      }

      /**
       * Tell whether the current page links to a next page.
       *
       * @return bool False when no next-link extractor is set, when no page has
       *              been fetched yet, or when the extractor returns null
       */
      public function hasNextPage()
      {
          if (!$this->nextLinkExtractor || !$this->content) {
              return false;
          }

          return null !== $this->nextLinkExtractor->extract($this->content);
      }

      /**
       * Get the absolute URI of the next page.
       *
       * @return string|null Null when there is no next page (see hasNextPage())
       */
      public function getNextPage()
      {
          if (!$this->nextLinkExtractor || !$this->content) {
              return null;
          }

          $next = $this->nextLinkExtractor->extract($this->content);

          // Resolving a null href would yield the current page URI, not a next page.
          return null === $next ? null : $this->prepareLink($next);
      }

      /**
       * Return the number of links collected so far.
       *
       * @return int
       */
      final public function currentCount()
      {
          return \count($this->links);
      }

      // The #[\ReturnTypeWillChange] attributes below silence the PHP 8.1
      // tentative-return-type deprecations; older PHP versions parse them as
      // comments, so the untyped signatures stay compatible.

      /**
       * Implements Iterator::current()
       * {@inheritdoc}
       *
       * @see Iterator::current()
       *
       * @return string|null The link at the cursor, null when the cursor is out of range
       */
      #[\ReturnTypeWillChange]
      public function current()
      {
          return isset($this->links[$this->index]) ? $this->links[$this->index] : null;
      }

      /**
       * Implements Iterator::next()
       * {@inheritdoc}
       *
       * @see Iterator::next()
       */
      #[\ReturnTypeWillChange]
      public function next()
      {
          ++$this->index;
      }

      /**
       * Implements Iterator::key()
       * {@inheritdoc}
       *
       * @see Iterator::key()
       *
       * @return int
       */
      #[\ReturnTypeWillChange]
      public function key()
      {
          return $this->index;
      }

      /**
       * Implements Iterator::valid()
       * {@inheritdoc}
       *
       * When the collected links are exhausted and the limit is not reached,
       * the next page (if any) is fetched once before answering.
       *
       * @see Iterator::valid()
       *
       * @return bool
       */
      #[\ReturnTypeWillChange]
      public function valid()
      {
          if (isset($this->links[$this->index])) {
              return true;
          }

          if ($this->limit > -1 && \count($this->links) >= $this->limit) {
              return false;
          }

          if ($this->hasNextPage()) {
              $this->spide($this->getNextPage());
          }

          return isset($this->links[$this->index]);
      }

      /**
       * Implements Iterator::rewind()
       * {@inheritdoc}
       *
       * Fetches the first page if no page has been fetched yet.
       *
       * @see Iterator::rewind()
       */
      #[\ReturnTypeWillChange]
      public function rewind()
      {
          $this->index = 0;

          if (!$this->content) {
              $this->spide();
          }
      }

      /**
       * Limit the number of links collected. -1 means unlimited.
       *
       * @param int $limit
       *
       * @return \Daric\Spider
       */
      public function setLimit($limit)
      {
          $this->limit = $limit;

          return $this;
      }
  }