PageRenderTime 45ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/tags/0_9_4/zibo_modules/zibo.spider/src/zibo/library/spider/Crawl.php

https://gitlab.com/BGCX261/zibo-svn-to-git
PHP | 186 lines | 134 code | 51 blank | 1 comment | 33 complexity | 60dce148fe601c109e3a999ef5dba68f MD5 | raw file
Possible License(s): GPL-3.0
  1. <?php
  2. namespace zibo\library\spider;
  3. use zibo\library\String;
  4. use \Exception;
  /**
   * Crawl of a single URL.
   *
   * The URL is parsed on construction into its scheme, host, port, path and
   * query. performCrawl() then issues a HEAD request over a raw socket and,
   * when the response is a 200 with a text content type, downloads the body.
   */
  class Crawl {

      /**
       * The URL as provided to the constructor
       * @var string
       */
      private $url;

      /**
       * Scheme, host and (when not the scheme's default) port of the URL
       * @var string
       */
      private $baseUrl;

      /**
       * Base URL followed by the directory part of the path, always ending
       * with a slash
       * @var string
       */
      private $basePath;

      /**
       * Response of the crawl, null as long as performCrawl() has not run
       * @var HttpResponse|null
       */
      private $response;

      /**
       * Host parsed from the URL
       * @var string
       */
      private $host;

      /**
       * Scheme parsed from the URL, http when the URL has none
       * @var string
       */
      private $scheme;

      /**
       * Port parsed from the URL, 443 for https and 80 otherwise when the
       * URL has none
       * @var integer
       */
      private $port;

      /**
       * Path parsed from the URL, / when the URL has none
       * @var string
       */
      private $path;

      /**
       * Query string of the URL including the leading question mark, empty
       * when the URL has no query
       * @var string
       */
      private $query;

      /**
       * Constructs a new crawl
       * @param string $url URL to crawl
       * @return null
       * @throws Exception when the URL is empty, invalid or has no host
       */
      public function __construct($url) {
          $this->setUrl($url);
      }

      /**
       * Gets the response of this crawl
       * @return HttpResponse|null null when performCrawl() has not run yet
       */
      public function getResponse() {
          return $this->response;
      }

      /**
       * Gets the base URL followed by the directory of the crawled path
       * @return string
       */
      public function getBasePath() {
          return $this->basePath;
      }

      /**
       * Gets the scheme, host and optional port of the crawled URL
       * @return string
       */
      public function getBaseUrl() {
          return $this->baseUrl;
      }

      /**
       * Gets the URL of this crawl
       * @return string
       */
      public function getUrl() {
          return $this->url;
      }

      /**
       * Parses the provided URL and initializes all URL related properties
       * @param string $url
       * @return null
       * @throws Exception when the URL is empty, cannot be parsed or
       * contains no host
       */
      private function setUrl($url) {
          if (String::isEmpty($url)) {
              throw new Exception('Provided URL is empty');
          }

          $urlInformation = @parse_url($url);
          if ($urlInformation === false) {
              throw new Exception('Provided URL is invalid');
          }

          // parse_url omits the host key for URLs like "example.com/foo";
          // check before reading to avoid an undefined index notice
          if (empty($urlInformation['host'])) {
              throw new Exception('Could not parse the host from the provided URL');
          }

          $this->host = $urlInformation['host'];
          $this->response = null;
          $this->url = $url;

          if (isset($urlInformation['scheme'])) {
              $this->scheme = $urlInformation['scheme'];
          } else {
              $this->scheme = 'http';
          }

          $this->baseUrl = $this->scheme . '://' . $this->host;

          // the port is only added to the base URL when it differs from
          // the default port of the scheme
          $defaultPort = $this->scheme == 'https' ? 443 : 80;
          $this->port = isset($urlInformation['port']) ? $urlInformation['port'] : $defaultPort;
          if ($this->port != $defaultPort) {
              $this->baseUrl .= ':' . $this->port;
          }

          $this->basePath = $this->baseUrl;

          if (isset($urlInformation['path'])) {
              $this->path = $urlInformation['path'];

              if (substr($this->path, -1, 1) == '/') {
                  // path is a directory, use it as is
                  $this->basePath .= $this->path;
              } else {
                  // path is a file, strip the file name; a slash at
                  // position 0 means the file lives in the root
                  $position = strrpos($this->path, '/');
                  if ($position) {
                      $this->basePath .= substr($this->path, 0, $position);
                  }

                  $this->basePath .= '/';
              }
          } else {
              $this->path = '/';
              $this->basePath .= '/';
          }

          $this->query = '';
          if (isset($urlInformation['query'])) {
              $this->query = '?' . $urlInformation['query'];
          }
      }

      /**
       * Performs the crawl: requests the headers of the URL and, for a 200
       * response with a content type starting with text/, fetches the
       * content. Redirect responses are stored without fetching content.
       * @return null
       * @throws Exception when the connection fails, no response is
       * received or no content is received
       */
      public function performCrawl() {
          $socket = $this->connect();

          // the socket is only needed for the HEAD request; close it on
          // every path so it does not leak on redirects or errors
          try {
              $response = $this->performHead($socket);
          } catch (Exception $exception) {
              fclose($socket);

              throw $exception;
          }

          fclose($socket);

          $this->response = $response;

          $responseCode = $this->response->getResponseCode();
          if (!$responseCode) {
              throw new Exception('No response received');
          }

          if ($this->response->isRedirect()) {
              return;
          }

          $contentType = $this->response->getHeader('Content-Type');
          if ($responseCode == 200 && String::startsWith($contentType, 'text/')) {
              $this->performGet($this->response);
          }
      }

      /**
       * Opens a socket to the host and port of the URL
       * @param integer $timeoutSeconds timeout in seconds, used for the
       * connection attempt and for reads on the socket
       * @param integer $timeoutMicroseconds microseconds part of the read
       * timeout
       * @return resource the opened socket
       * @throws Exception when the connection could not be made
       */
      private function connect($timeoutSeconds = 15, $timeoutMicroseconds = 0) {
          // secured url?
          if ($this->scheme == 'https') {
              $socket = @fsockopen('ssl://'. $this->host, $this->port, $errorNumber, $errorMessage, $timeoutSeconds);
          } else {
              $socket = @fsockopen($this->host, $this->port, $errorNumber, $errorMessage, $timeoutSeconds);
          }

          if (!$socket) {
              throw new Exception('Could not connect to ' . $this->host);
          }

          stream_set_timeout($socket, $timeoutSeconds, $timeoutMicroseconds);

          return $socket;
      }

      /**
       * Sends a HEAD request for the URL over the provided socket and reads
       * up to 4096 bytes of the response. When nothing is read, the request
       * is resent up to 5 times, sleeping 1 to 5 seconds between attempts.
       * @param resource $socket socket opened by connect()
       * @return HttpResponse response created from the received string
       * @throws Exception when the request could not be written
       */
      private function performHead($socket) {
          $request = "HEAD " . $this->path . $this->query . " HTTP/1.0\r\nHost: " . $this->host . "\r\nConnection: keep-alive\r\n\r\n";

          if (!fputs($socket, $request, strlen($request))) {
              throw new Exception('Could not send the request');
          }

          $responseString = fread($socket, 4096);

          $attempt = 0;
          while (!$responseString && $attempt < 5) {
              sleep($attempt + 1);

              if (!fputs($socket, $request, strlen($request))) {
                  throw new Exception('Could not resend the request');
              }

              $responseString = fread($socket, 4096);

              $attempt++;
          }

          return new HttpResponse($responseString);
      }

      /**
       * Downloads the content of the URL and sets it to the provided
       * response
       * @param HttpResponse $response response to set the content to
       * @return null
       * @throws Exception when the download failed or the content is empty
       */
      private function performGet(HttpResponse $response) {
          $context = stream_context_create(array(
              'http' => array(
                  'timeout' => 15
              )
          ));

          $content = file_get_contents($this->url, false, $context);

          // compare explicitly: a body of "0" is valid content but falsy
          if ($content === false || $content === '') {
              throw new Exception('No content received');
          }

          $response->setContent($content);
      }

  }