PageRenderTime 46ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/gulliver/thirdparty/html2ps_pdf/fetcher.url.class.php

https://bitbucket.org/ferOnti/processmaker
PHP | 460 lines | 276 code | 82 blank | 102 comment | 53 complexity | 0e76707f59162a3446d2454eb4d75f8c MD5 | raw file
  1. <?php
  2. require_once(HTML2PS_DIR.'fetcher._interface.class.php');
  3. define('HTTP_OK',200);
  4. /**
  5. * @TODO send authorization headers only if they have been required by the server;
  6. */
  7. class FetcherUrl extends Fetcher {
  8. var $_connections;
  9. var $protocol;
  10. var $host;
  11. var $port;
  12. var $path;
  13. var $url;
  14. var $headers;
  15. var $content;
  16. var $code;
  17. var $redirects;
  18. // Authorization
  19. var $user;
  20. var $pass;
  21. // ---------------------------------------------
  22. // FetcherURL - PUBLIC methods
  23. // ---------------------------------------------
  24. // "Fetcher" interface implementation
  25. function get_base_url() {
  26. return $this->url;
  27. }
  28. function get_data($data_id) {
  29. $this->redirects = 0;
  30. if ($this->fetch($data_id)) {
  31. if ($this->code != HTTP_OK) {
  32. $_server_response = $this->headers;
  33. $_http_error = $this->code;
  34. $_url = htmlspecialchars($data_id);
  35. ob_start();
  36. include('templates/error._http.tpl');
  37. $this->error_message .= ob_get_contents();
  38. ob_end_clean();
  39. error_log("Cannot open $data_id, HTTP result code is: ".$this->code);
  40. return null;
  41. };
  42. return new FetchedDataURL($this->content,
  43. explode("\r\n",$this->headers),
  44. $this->url);
  45. } elseif ($this->redirects > MAX_REDIRECTS) {
  46. $_server_response = $this->headers;
  47. $_url = htmlspecialchars($data_id);
  48. ob_start();
  49. include('templates/error._redirects.tpl');
  50. $this->error_message .= ob_get_contents();
  51. ob_end_clean();
  52. error_log(sprintf("Cannot open %s, too many redirects",
  53. $data_id));
  54. return null;
  55. } else {
  56. $_server_response = $this->headers;
  57. $_url = htmlspecialchars($data_id);
  58. ob_start();
  59. include('templates/error._connection.tpl');
  60. $this->error_message .= ob_get_contents();
  61. ob_end_clean();
  62. error_log(sprintf("Cannot open %s",
  63. $data_id));
  64. return null;
  65. }
  66. }
  67. function error_message() {
  68. return $this->error_message;
  69. }
  70. // FetcherURL - constructor
  71. function FetcherURL() {
  72. $this->_connections = array();
  73. $this->error_message = "";
  74. $this->redirects = 0;
  75. $this->port = 80;
  76. // Default encoding
  77. // $this->encoding = "iso-8859-1";
  78. $this->user_agent = DEFAULT_USER_AGENT;
  79. }
  80. // ---------------------------------------------
  81. // FetcherURL - PRIVATE methods
  82. // ---------------------------------------------
  83. /**
  84. * Connects to the target host using either HTTP or HTTPS protocol;
  85. * returns handle to connection socked or 'null' in case connection failed.
  86. *
  87. * @access private
  88. * @final
  89. * @return resource
  90. */
  91. function _connect() {
  92. // Connect to the target host
  93. if ($this->protocol == "https") {
  94. return $this->_connect_ssl();
  95. };
  96. $fp = @fsockopen($this->host,$this->port,$errno,$errstr,HTML2PS_CONNECTION_TIMEOUT);
  97. if (!$fp) {
  98. $message = sprintf("Cannot connect to %s:%d - (%d) %s",
  99. $this->host,
  100. $this->port,
  101. $errno,
  102. $errstr);
  103. error_log($message);
  104. $this->error_message = $message;
  105. return null;
  106. };
  107. return $fp;
  108. }
  109. function _connect_ssl() {
  110. /**
  111. * Check if there's SSL support library loaded
  112. *
  113. * Note that in certain situations (e.g. Windows + PHP 4.4.0 + Apache 2 on my development box)
  114. * openssl extension IS present, but fsockopen still complains "No SSL support in this build".
  115. * (probably PHP bug?)
  116. */
  117. if (!extension_loaded('openssl')) {
  118. $message = sprintf("Cannot connect to %s:%d. SSL Extension missing",
  119. $this->host,
  120. $this->port);
  121. error_log($message);
  122. $this->error_message .= $message;
  123. return null;
  124. };
  125. $fp = @fsockopen("ssl://$this->host", $this->port, $errno, $errstr, 5);
  126. if (!$fp) {
  127. $message = sprintf("Cannot connect to %s:%d - (%d) %s<br/>Missing SSL support?",
  128. $this->host,
  129. $this->port,
  130. $errno,
  131. $errstr);
  132. error_log($message);
  133. $this->error_message = $message;
  134. return null;
  135. };
  136. return $fp;
  137. }
  138. function _extract_code($res) {
  139. // Check return code
  140. // Note the return code will always be contained in the response, so
  141. // the we may not check the result of 'preg_match' - it matches always.
  142. //
  143. // A month later: nope, not always.
  144. //
  145. if (preg_match('/\s(\d+)\s/',$res,$matches)) {
  146. $result = $matches[1];
  147. } else {
  148. $result = "200";
  149. };
  150. return $result;
  151. }
  152. function _fix_location($location) {
  153. if (substr($location, 0, 7) == "http://") { return $location; };
  154. if (substr($location, 0, 8) == "https://") { return $location; };
  155. if ($location{0} == "/") {
  156. return $this->protocol."://".$this->host.$location;
  157. };
  158. return $this->protocol."://".$this->host.$this->path.$location;
  159. }
  160. function fetch($url) {
  161. /**
  162. * Handle empty $url value; unfortunaltely, parse_url will treat empty value as valid
  163. * URL, so fetcher will attempt to fetch something from the localhost instead of
  164. * passing control to subsequent user-defined fetchers (which probably will know
  165. * how to handle this).
  166. */
  167. if ($url === "") {
  168. return null;
  169. }
  170. $this->url = $url;
  171. $parts = @parse_url($this->url);
  172. /**
  173. * If an malformed URL have been specified, add a message to the log file and
  174. * continue processing (as such URLs may be found in otherwise good HTML file -
  175. * for example, invalid image or CSS reference)
  176. */
  177. if ($parts == false) {
  178. error_log(sprintf("The URL '%s' could not be parsed", $this->url));
  179. $this->content = "";
  180. $this->code = HTTP_OK;
  181. return true;
  182. };
  183. /**
  184. * Setup default values
  185. */
  186. $this->protocol = 'http';
  187. $this->host = 'localhost';
  188. $this->user = "";
  189. $this->pass = "";
  190. $this->port = 80;
  191. $this->path = "/";
  192. $this->query = "";
  193. if (isset($parts['scheme'])) { $this->protocol = $parts['scheme']; };
  194. if (isset($parts['host'])) { $this->host = $parts['host']; };
  195. if (isset($parts['user'])) { $this->user = $parts['user']; };
  196. if (isset($parts['pass'])) { $this->pass = $parts['pass']; };
  197. if (isset($parts['port'])) { $this->port = $parts['port']; };
  198. if (isset($parts['path'])) { $this->path = $parts['path']; } else { $this->path = "/"; };
  199. if (isset($parts['query'])) { $this->path .= '?'.$parts['query']; };
  200. switch ($this->protocol) {
  201. case 'http':
  202. return $this->fetch_http();
  203. case 'https':
  204. return $this->fetch_https();
  205. case 'file':
  206. $this->host = "";
  207. return $this->fetch_file();
  208. default:
  209. $message = sprintf("Unsupported protocol: %s", $this->protocol);
  210. error_log($message);
  211. $this->error_message .= $message;
  212. return null;
  213. }
  214. }
  215. function fetch_http() {
  216. $res = $this->_head();
  217. if (is_null($res)) { return null; };
  218. $this->code = $this->_extract_code($res);
  219. return $this->_process_code($res);
  220. }
  221. function fetch_https() {
  222. /**
  223. * SSL works via port 443
  224. */
  225. if ($this->protocol == "https" && !isset($parts['port'])) {
  226. $this->port = 443;
  227. }
  228. $res = $this->_head();
  229. if (is_null($res)) { return null; };
  230. $this->code = $this->_extract_code($res);
  231. return $this->_process_code($res);
  232. }
  233. function fetch_file() {
  234. if (PHP_OS == "WINNT") {
  235. $path = substr($this->url, 7);
  236. if ($path{0} == "/") { $path = substr($path, 1); };
  237. } else {
  238. $path = substr($this->url, 7);
  239. };
  240. $normalized_path = realpath(urldecode($path));
  241. if (substr($normalized_path, 0, strlen(FILE_PROTOCOL_RESTRICT)) !== FILE_PROTOCOL_RESTRICT) {
  242. error_log(sprintf("Access denied to file '%s'", $normalized_path));
  243. $this->content = "";
  244. $this->code = HTTP_OK;
  245. return true;
  246. }
  247. $this->content = @file_get_contents($normalized_path);
  248. $this->code = HTTP_OK;
  249. return true;
  250. }
  251. function _get() {
  252. $socket = $this->_connect();
  253. if (is_null($socket)) { return null; };
  254. // Build the HEAD request header (we're saying we're just a browser as some pages don't like non-standard user-agents)
  255. $header = "GET ".$this->path." HTTP/1.1\r\n";
  256. $header .= "Host: ".$this->host."\r\n";
  257. $header .= "Accept: */*\r\n";
  258. $header .= "User-Agent: ".$this->user_agent."\r\n";
  259. $header .= "Connection: keep-alive\r\n";
  260. $header .= "Referer: ".$this->protocol."://".$this->host.$this->path."\r\n";
  261. $header .= $this->_header_basic_authorization();
  262. $header .= "\r\n";
  263. fputs ($socket, $header);
  264. // Get the responce
  265. $res = "";
  266. // The PHP-recommended construction
  267. // while (!feof($fp)) { $res .= fread($fp, 4096); };
  268. // hangs indefinitely on www.searchscout.com, for example.
  269. // seems that they do not close conection on their side or somewhat similar;
  270. // let's assume that there will be no HTML pages greater than 1 Mb
  271. $res = fread($socket, 1024*1024);
  272. // Close connection handle, we do not need it anymore
  273. fclose($socket);
  274. return $res;
  275. }
  276. function _head() {
  277. $socket = $this->_connect();
  278. if (is_null($socket)) { return null; };
  279. // Build the HEAD request header (we're saying we're just a browser as some pages don't like non-standard user-agents)
  280. $header = "HEAD ".$this->path." HTTP/1.1\r\n";
  281. $header .= "Host: ".$this->host."\r\n";
  282. $header .= "Accept: */*\r\n";
  283. $header .= "User-Agent: ".$this->user_agent."\r\n";
  284. $header .= "Connection: keep-alive\r\n";
  285. $header .= "Accept: text/html\r\n";
  286. $header .= "Referer: ".$this->protocol."://".$this->host.$this->path."\r\n";
  287. $header .= $this->_header_basic_authorization();
  288. $header .= "\r\n";
  289. // Send the header
  290. fputs ($socket, $header);
  291. // Get the responce
  292. $res = "";
  293. // The PHP-recommended construction
  294. // while (!feof($fp)) { $res .= fread($fp, 4096); };
  295. // hangs indefinitely on www.searchscout.com, for example.
  296. // seems that they do not close conection on their side or somewhat similar;
  297. // let's assume that there will be no HTML pages greater than 1 Mb
  298. $res = fread($socket, 4096);
  299. // Close connection handle, we do not need it anymore
  300. fclose($socket);
  301. return $res;
  302. }
  303. function _process_code($res, $used_get = false) {
  304. switch ($this->code) {
  305. case '200': // OK
  306. if (preg_match('/(.*?)\r\n\r\n(.*)/s',$res,$matches)) {
  307. $this->headers = $matches[1];
  308. };
  309. /**
  310. * @todo add error processing here
  311. *
  312. * Note: file_get_contents is smart enough to use basic authorization headers provided
  313. * user name / password are given in the URL.
  314. */
  315. $this->content = @file_get_contents($this->url);
  316. return true;
  317. break;
  318. case '301': // Moved Permanently
  319. $this->redirects++;
  320. if ($this->redirects > MAX_REDIRECTS) { return false; };
  321. preg_match('/Location: ([\S]+)/i',$res,$matches);
  322. return $this->fetch($this->_fix_location($matches[1]));
  323. case '302': // Found
  324. $this->redirects++;
  325. if ($this->redirects > MAX_REDIRECTS) { return false; };
  326. preg_match('/Location: ([\S]+)/i',$res,$matches);
  327. error_log('Redirected to:'.$matches[1]);
  328. return $this->fetch($this->_fix_location($matches[1]));
  329. case '400': // Bad request
  330. case '401': // Unauthorized
  331. case '402': // Payment required
  332. case '403': // Forbidden
  333. case '404': // Not found - but should return some html content - error page
  334. case '406': // Not acceptable
  335. if (!preg_match('/(.*?)\r\n\r\n(.*)/s',$res,$matches)) {
  336. error_log("Unrecognized HTTP response");
  337. return false;
  338. };
  339. $this->headers = $matches[1];
  340. $this->content = @file_get_contents($this->url);
  341. return true;
  342. case '405': // Method not allowed; some sites (like MSN.COM) do not like "HEAD" HTTP requests
  343. // Try to get URL information using GET request (if we didn't tried it before)
  344. if (!$used_get) {
  345. $res = $this->_get();
  346. if (is_null($res)) { return null; };
  347. $this->code = $this->_extract_code($res);
  348. return $this->_process_code($res, true);
  349. } else {
  350. if (!preg_match('/(.*?)\r\n\r\n(.*)/s',$res,$matches)) {
  351. error_log("Unrecognized HTTP response");
  352. return false;
  353. };
  354. $this->headers = $matches[1];
  355. $this->content = @file_get_contents($this->url);
  356. return true;
  357. };
  358. default:
  359. error_log("Unrecognized HTTP result code:".$this->code);
  360. return false;
  361. };
  362. }
  363. function _header_basic_authorization() {
  364. if (!is_null($this->user) && $this->user != "") {
  365. return sprintf("Authorization: Basic %s\r\n", base64_encode($this->user.":".$this->pass));
  366. };
  367. }
  368. }
  369. ?>