/gulliver/thirdparty/html2ps_pdf/fetcher.url.class.php
PHP | 460 lines | 276 code | 82 blank | 102 comment | 53 complexity | 0e76707f59162a3446d2454eb4d75f8c MD5 | raw file
- <?php
- require_once(HTML2PS_DIR.'fetcher._interface.class.php');
- define('HTTP_OK',200);
- /**
- * @TODO send authorization headers only if they have been required by the server;
- */
- class FetcherUrl extends Fetcher {
- var $_connections;
- var $protocol;
- var $host;
- var $port;
- var $path;
- var $url;
- var $headers;
- var $content;
- var $code;
- var $redirects;
- // Authorization
- var $user;
- var $pass;
- // ---------------------------------------------
- // FetcherURL - PUBLIC methods
- // ---------------------------------------------
- // "Fetcher" interface implementation
/**
 * "Fetcher" interface: reports the URL this fetcher currently points at.
 *
 * @return string value of $this->url, which fetch() assigns on every call
 */
function get_base_url() {
    $base_url = $this->url;
    return $base_url;
}
/**
 * "Fetcher" interface: retrieves the resource identified by $data_id.
 *
 * On success a FetchedDataURL is built from the fetched content, the response
 * header lines and the final URL. On failure an error template is rendered
 * into $this->error_message, the problem is written to the error log and
 * null is returned. Three failures are told apart: a non-OK HTTP code,
 * too many redirects, and everything else (reported as a connection error).
 *
 * @param string $data_id URL of the resource to fetch
 * @return FetchedDataURL|null
 */
function get_data($data_id) {
    $this->redirects = 0;
    $fetched = $this->fetch($data_id);

    // Happy path: the fetch succeeded and the server answered with HTTP_OK
    if ($fetched && $this->code == HTTP_OK) {
        $header_lines = explode("\r\n", $this->headers);
        return new FetchedDataURL($this->content, $header_lines, $this->url);
    }

    // Classify the failure
    if ($fetched) {
        $failure = 'http';
    } elseif ($this->redirects > MAX_REDIRECTS) {
        $failure = 'redirects';
    } else {
        $failure = 'connection';
    }

    // These underscore-prefixed locals must keep their names: the templates
    // are included into this scope and are expected to read them (template
    // contents are not visible here).
    $_server_response = $this->headers;
    if ($failure == 'http') {
        $_http_error = $this->code;
    }
    $_url = htmlspecialchars($data_id);

    switch ($failure) {
    case 'http':
        $template = 'templates/error._http.tpl';
        break;
    case 'redirects':
        $template = 'templates/error._redirects.tpl';
        break;
    default:
        $template = 'templates/error._connection.tpl';
        break;
    }

    // Render the template into the accumulated error message
    ob_start();
    include($template);
    $this->error_message .= ob_get_clean();

    switch ($failure) {
    case 'http':
        error_log("Cannot open $data_id, HTTP result code is: ".$this->code);
        break;
    case 'redirects':
        error_log(sprintf("Cannot open %s, too many redirects", $data_id));
        break;
    default:
        error_log(sprintf("Cannot open %s", $data_id));
        break;
    }

    return null;
}
/**
 * Returns the error text accumulated by this fetcher so far.
 *
 * @return string value of $this->error_message
 */
function error_message() {
    $message = $this->error_message;
    return $message;
}
- // FetcherURL - constructor
/**
 * Initializes the fetcher: empty connection list, empty error message,
 * zero redirects, port 80 and the default user agent string.
 *
 * Declared as __construct() because a method named after its class is no
 * longer treated as a constructor by PHP 8; without this the fields below
 * would never be initialized there.
 */
function __construct() {
    $this->_connections = array();
    $this->error_message = "";
    $this->redirects = 0;
    $this->port = 80;

    // Default encoding is intentionally left unset
    // (the original kept "$this->encoding = 'iso-8859-1'" commented out).

    $this->user_agent = DEFAULT_USER_AGENT;
}

/**
 * PHP 4 style constructor, kept so that code calling FetcherURL()
 * explicitly keeps working. Simply delegates to __construct().
 */
function FetcherURL() {
    $this->__construct();
}
- // ---------------------------------------------
- // FetcherURL - PRIVATE methods
- // ---------------------------------------------
/**
 * Connects to the target host using either the HTTP or the HTTPS protocol.
 *
 * HTTPS connections are delegated to _connect_ssl(); everything else is
 * opened as a plain socket to $this->host:$this->port with
 * HTML2PS_CONNECTION_TIMEOUT as the connection timeout. On failure the
 * reason is logged and stored in $this->error_message (replacing its
 * previous content).
 *
 * @access private
 * @final
 * @return resource|null handle to the connection socket, or null if the connection failed
 */
function _connect() {
    if ($this->protocol == "https") {
        return $this->_connect_ssl();
    }

    $socket = @fsockopen($this->host, $this->port, $errno, $errstr, HTML2PS_CONNECTION_TIMEOUT);
    if ($socket) {
        return $socket;
    }

    // Connection failed: log and remember the reason
    $message = sprintf("Cannot connect to %s:%d - (%d) %s", $this->host, $this->port, $errno, $errstr);
    error_log($message);
    $this->error_message = $message;

    return null;
}
/**
 * Opens an SSL socket ("ssl://" transport) to $this->host:$this->port.
 *
 * The connection timeout is HTML2PS_CONNECTION_TIMEOUT, the same value
 * _connect() uses for plain sockets; 5 seconds (the former hard-coded
 * value) is used only if that constant is not defined.
 *
 * @access private
 * @return resource|null handle to the connection socket, or null if the
 *                       openssl extension is missing or the connection failed
 */
function _connect_ssl() {
    /**
     * Check if there's an SSL support library loaded.
     *
     * Note from the original author: in certain situations (e.g. Windows +
     * PHP 4.4.0 + Apache 2) the openssl extension IS present, but fsockopen
     * still complains "No SSL support in this build" (probably a PHP bug?).
     */
    if (!extension_loaded('openssl')) {
        $message = sprintf("Cannot connect to %s:%d. SSL Extension missing",
                           $this->host,
                           $this->port);
        error_log($message);
        $this->error_message .= $message;
        return null;
    }

    $timeout = defined('HTML2PS_CONNECTION_TIMEOUT') ? HTML2PS_CONNECTION_TIMEOUT : 5;

    $fp = @fsockopen("ssl://".$this->host, $this->port, $errno, $errstr, $timeout);
    if (!$fp) {
        $message = sprintf("Cannot connect to %s:%d - (%d) %s<br/>Missing SSL support?",
                           $this->host,
                           $this->port,
                           $errno,
                           $errstr);
        error_log($message);
        $this->error_message = $message;
        return null;
    }

    return $fp;
}
/**
 * Extracts the HTTP result code from a raw server response.
 *
 * The code is taken to be the first run of digits that has whitespace on
 * both sides. A response does not always contain one; in that case "200"
 * is assumed.
 *
 * @param string $res raw response text
 * @return string the result code as a string of digits
 */
function _extract_code($res) {
    $found = array();

    if (!preg_match('/\s(\d+)\s/', $res, $found)) {
        // No recognizable code in the response: assume success
        return "200";
    }

    return $found[1];
}
/**
 * Converts the value of a "Location" response header into an absolute URL,
 * using the protocol, host, port and path of the current request as base.
 *
 * Handled forms:
 *  - "scheme://..."  returned unchanged (scheme matched case-insensitively);
 *  - "//host/path"   prefixed with the current protocol;
 *  - "/path"         resolved against the current host (and port, when it
 *                    is not the default one for the protocol);
 *  - "?query"        replaces the query of the current path;
 *  - "relative"      resolved against the directory of the current path,
 *                    i.e. the path without its query string and without
 *                    its last segment;
 *  - ""              resolves to the current URL.
 *
 * Dot segments ("./", "../") are not collapsed here.
 *
 * @param string $location raw Location header value
 * @return string absolute URL
 */
function _fix_location($location) {
    $location = trim((string)$location);

    // Already absolute: nothing to fix
    if (preg_match('#^[a-z][a-z0-9+.\-]*://#i', $location)) {
        return $location;
    }

    // Scheme + authority of the current request; the port is spelled out
    // only when it differs from the protocol default, so that redirects on
    // non-standard ports stay on that port.
    $default_port = ($this->protocol == "https") ? 443 : 80;
    $root = $this->protocol."://".$this->host;
    if (!empty($this->port) && $this->port != $default_port) {
        $root .= ":".$this->port;
    }

    // Empty reference: same document
    if ($location === "") {
        return $root.$this->path;
    }

    // Protocol-relative reference
    if (substr($location, 0, 2) == "//") {
        return $this->protocol.":".$location;
    }

    // Host-relative reference
    if ($location[0] == "/") {
        return $root.$location;
    }

    // $this->path may carry a query string (fetch() appends it); strip it
    // before using the path as a base.
    $base_path = (string)$this->path;
    $query_start = strpos($base_path, "?");
    if ($query_start !== false) {
        $base_path = substr($base_path, 0, $query_start);
    }

    // Query-only reference: same path, new query
    if ($location[0] == "?") {
        return $root.$base_path.$location;
    }

    // Path-relative reference: resolve against the containing directory
    $last_slash = strrpos($base_path, "/");
    $base_dir = ($last_slash === false) ? "/" : substr($base_path, 0, $last_slash + 1);

    return $root.$base_dir.$location;
}
/**
 * Parses $url, stores its components in the object and dispatches to the
 * protocol-specific fetch routine (http, https or file).
 *
 * @param string $url URL to fetch
 * @return bool|null true when a response was stored in $this->content /
 *                   $this->code; false or null when fetching failed
 *                   (null in particular for an empty URL, an unsupported
 *                   protocol or a failed connection)
 */
function fetch($url) {
    /**
     * Handle empty $url value; unfortunately, parse_url will treat an empty
     * value as a valid URL, so the fetcher would attempt to fetch something
     * from localhost instead of passing control to subsequent user-defined
     * fetchers (which probably will know how to handle this).
     * A null URL is treated the same way.
     */
    if ($url === "" || is_null($url)) {
        return null;
    }

    $this->url = $url;
    $parts = @parse_url($this->url);

    /**
     * If a malformed URL has been specified, add a message to the log file
     * and continue processing (as such URLs may be found in an otherwise
     * good HTML file - for example, an invalid image or CSS reference).
     * Headers are cleared too, so that an empty result is not paired with
     * the headers of a previous request.
     */
    if ($parts == false) {
        error_log(sprintf("The URL '%s' could not be parsed", $this->url));
        $this->headers = "";
        $this->content = "";
        $this->code = HTTP_OK;
        return true;
    }

    /**
     * Setup default values
     */
    $this->protocol = 'http';
    $this->host = 'localhost';
    $this->user = "";
    $this->pass = "";
    $this->port = 80;
    $this->path = "/";
    // Not read anywhere in this class (the query is appended to the path
    // below); kept because other code may rely on the property existing.
    $this->query = "";

    // URL schemes are case-insensitive; normalize so that the comparisons
    // below (and in _connect) also accept e.g. "HTTP://..."
    if (isset($parts['scheme'])) { $this->protocol = strtolower($parts['scheme']); }
    if (isset($parts['host']))   { $this->host = $parts['host']; }
    if (isset($parts['user']))   { $this->user = $parts['user']; }
    if (isset($parts['pass']))   { $this->pass = $parts['pass']; }
    if (isset($parts['port']))   { $this->port = $parts['port']; }
    if (isset($parts['path']))   { $this->path = $parts['path']; }
    if (isset($parts['query']))  { $this->path .= '?'.$parts['query']; }

    switch ($this->protocol) {
    case 'http':
        return $this->fetch_http();
    case 'https':
        return $this->fetch_https();
    case 'file':
        $this->host = "";
        return $this->fetch_file();
    default:
        $message = sprintf("Unsupported protocol: %s", $this->protocol);
        error_log($message);
        $this->error_message .= $message;
        return null;
    }
}
/**
 * Fetches the current URL over plain HTTP: issues a HEAD request, extracts
 * the result code from the response and lets _process_code() act on it.
 *
 * @return bool|null null when the request could not be sent, otherwise the
 *                   result of _process_code()
 */
function fetch_http() {
    $response = $this->_head();
    if ($response === null) {
        return null;
    }

    $this->code = $this->_extract_code($response);

    return $this->_process_code($response);
}
/**
 * Fetches the current URL over HTTPS: issues a HEAD request, extracts the
 * result code from the response and lets _process_code() act on it.
 *
 * @return bool|null null when the request could not be sent, otherwise the
 *                   result of _process_code()
 */
function fetch_https() {
    /**
     * SSL works via port 443 unless the URL names a port explicitly.
     * The URL is parsed again here because the components parsed in fetch()
     * are not available in this scope; testing an undefined local instead
     * would overwrite explicitly given ports with 443.
     */
    $parts = @parse_url($this->url);
    if (!is_array($parts) || !isset($parts['port'])) {
        $this->port = 443;
    }

    $res = $this->_head();
    if (is_null($res)) { return null; }

    $this->code = $this->_extract_code($res);
    return $this->_process_code($res);
}
/**
 * Serves a "file://" URL from the local file system.
 *
 * The path is URL-decoded and canonicalized with realpath(); access is
 * granted only when the canonical path starts with FILE_PROTOCOL_RESTRICT.
 * A path that cannot be resolved, or that lies outside the permitted
 * prefix, is logged and yields empty content. In every case the result
 * code is set to HTTP_OK and true is returned, so processing of the
 * surrounding document continues.
 *
 * NOTE(review): the restriction is a plain string-prefix comparison, so a
 * prefix without a trailing directory separator also admits sibling
 * directories sharing that prefix - confirm how FILE_PROTOCOL_RESTRICT is
 * defined.
 *
 * @return bool always true
 */
function fetch_file() {
    // Drop the leading "file://" (7 characters)
    $path = (string)substr($this->url, 7);

    // On Windows "file:///C:/dir/file" leaves "/C:/dir/file": drop the slash
    if (PHP_OS == "WINNT" && $path !== "" && $path[0] == "/") {
        $path = (string)substr($path, 1);
    }

    // realpath("") would resolve to the current directory and realpath()
    // returns false for paths that do not exist; neither names a file.
    $normalized_path = ($path === "") ? false : realpath(urldecode($path));
    if ($normalized_path === false) {
        error_log(sprintf("Cannot resolve file '%s'", $path));
        $this->content = "";
        $this->code = HTTP_OK;
        return true;
    }

    if (substr($normalized_path, 0, strlen(FILE_PROTOCOL_RESTRICT)) !== FILE_PROTOCOL_RESTRICT) {
        error_log(sprintf("Access denied to file '%s'", $normalized_path));
        $this->content = "";
        $this->code = HTTP_OK;
        return true;
    }

    $this->content = @file_get_contents($normalized_path);
    $this->code = HTTP_OK;
    return true;
}
/**
 * Sends a GET request for the current path and returns the raw response.
 *
 * @access private
 * @return string|null the data returned by a single read of up to 1 Mb from
 *                     the socket, or null if the connection could not be
 *                     opened
 */
function _get() {
    $socket = $this->_connect();
    if ($socket === null) {
        return null;
    }

    // Build the GET request (we pretend to be a browser, as some pages
    // don't like non-standard user agents)
    $request = "GET ".$this->path." HTTP/1.1\r\n"
             . "Host: ".$this->host."\r\n"
             . "Accept: */*\r\n"
             . "User-Agent: ".$this->user_agent."\r\n"
             . "Connection: keep-alive\r\n"
             . "Referer: ".$this->protocol."://".$this->host.$this->path."\r\n"
             . $this->_header_basic_authorization()
             . "\r\n";
    fwrite($socket, $request);

    // Read the response with one call instead of the PHP-recommended
    //   while (!feof($fp)) { $res .= fread($fp, 4096); }
    // loop: that loop hangs indefinitely on some hosts (www.searchscout.com,
    // for example), which seem not to close the connection on their side.
    // Let's assume that there will be no HTML pages greater than 1 Mb.
    $response = fread($socket, 1024*1024);

    // The connection is not needed anymore
    fclose($socket);

    return $response;
}
/**
 * Sends a HEAD request for the current path and returns the raw response.
 *
 * @access private
 * @return string|null the data returned by a single read of up to 4096
 *                     bytes from the socket, or null if the connection
 *                     could not be opened
 */
function _head() {
    $socket = $this->_connect();
    if ($socket === null) {
        return null;
    }

    // Build the HEAD request (we pretend to be a browser, as some pages
    // don't like non-standard user agents)
    $request = "HEAD ".$this->path." HTTP/1.1\r\n"
             . "Host: ".$this->host."\r\n"
             . "Accept: */*\r\n"
             . "User-Agent: ".$this->user_agent."\r\n"
             . "Connection: keep-alive\r\n"
             . "Accept: text/html\r\n"
             . "Referer: ".$this->protocol."://".$this->host.$this->path."\r\n"
             . $this->_header_basic_authorization()
             . "\r\n";

    // Send the request
    fwrite($socket, $request);

    // Read the response with one call instead of the PHP-recommended
    //   while (!feof($fp)) { $res .= fread($fp, 4096); }
    // loop: that loop hangs indefinitely on some hosts (www.searchscout.com,
    // for example), which seem not to close the connection on their side.
    $response = fread($socket, 4096);

    // The connection is not needed anymore
    fclose($socket);

    return $response;
}
/**
 * Acts on the HTTP result code stored in $this->code.
 *
 *  - 200: stores the response headers (when they can be split off) and
 *    downloads the content with file_get_contents();
 *  - 301, 302, 303, 307, 308: follows the "Location" header through
 *    fetch(), up to MAX_REDIRECTS times;
 *  - 400, 401, 402, 403, 404, 406: stores headers and content (the server
 *    is expected to return an error page);
 *  - 405: some sites do not like HEAD requests, so the request is repeated
 *    once using GET; if GET was already used, handled like the other 4xx
 *    codes;
 *  - anything else: logged as unrecognized.
 *
 * @access private
 * @param string $res      raw response of the HEAD (or GET) request
 * @param bool   $used_get true when $res already comes from a GET request
 * @return bool|null true when headers/content were stored; false on an
 *                   unrecognized response or code, on too many redirects
 *                   or on a redirect without target; null when the GET
 *                   retry could not connect
 */
function _process_code($res, $used_get = false) {
    switch ($this->code) {
    case '200': // OK
        if (preg_match('/(.*?)\r\n\r\n(.*)/s', $res, $matches)) {
            $this->headers = $matches[1];
        }

        /**
         * @todo add error processing here
         *
         * Note: file_get_contents is smart enough to use basic authorization
         * headers provided user name / password are given in the URL.
         */
        $this->content = @file_get_contents($this->url);
        return true;

    case '301': // Moved Permanently
    case '302': // Found
    case '303': // See Other
    case '307': // Temporary Redirect
    case '308': // Permanent Redirect
        $this->redirects++;
        if ($this->redirects > MAX_REDIRECTS) { return false; }

        // Look for the target in the header section only, and anchor the
        // header name at the start of a line so that "Content-Location:"
        // is not mistaken for "Location:".
        $sections = explode("\r\n\r\n", (string)$res, 2);
        if (!preg_match('/^Location:[ \t]*(\S+)/mi', $sections[0], $matches)) {
            error_log("Redirect response without Location header, HTTP result code is: ".$this->code);
            return false;
        }

        error_log('Redirected to:'.$matches[1]);
        return $this->fetch($this->_fix_location($matches[1]));

    case '400': // Bad request
    case '401': // Unauthorized
    case '402': // Payment required
    case '403': // Forbidden
    case '404': // Not found - but should return some html content - error page
    case '406': // Not acceptable
        return $this->_store_response_and_fetch_body($res);

    case '405': // Method not allowed; some sites (like MSN.COM) do not like "HEAD" HTTP requests
        if ($used_get) {
            return $this->_store_response_and_fetch_body($res);
        }

        // Try to get URL information using a GET request (only once)
        $res = $this->_get();
        if (is_null($res)) { return null; }
        $this->code = $this->_extract_code($res);
        return $this->_process_code($res, true);

    default:
        error_log("Unrecognized HTTP result code:".$this->code);
        return false;
    }
}

/**
 * Helper of _process_code(): splits the headers off the raw response,
 * stores them and downloads the content of the current URL.
 *
 * @access private
 * @param string $res raw response
 * @return bool false when $res does not contain a header/body separator
 */
function _store_response_and_fetch_body($res) {
    if (!preg_match('/(.*?)\r\n\r\n(.*)/s', $res, $matches)) {
        error_log("Unrecognized HTTP response");
        return false;
    }

    $this->headers = $matches[1];
    $this->content = @file_get_contents($this->url);
    return true;
}
/**
 * Builds the "Authorization: Basic ..." request header line for the
 * credentials taken from the URL.
 *
 * parse_url() leaves the user and password components percent-encoded, so
 * they are decoded before being base64-encoded; otherwise credentials
 * containing characters such as "@" or ":" would be sent in their
 * escaped form.
 *
 * @access private
 * @return string the header line terminated by CRLF, or an empty string
 *                when no user name is set
 */
function _header_basic_authorization() {
    if (is_null($this->user) || $this->user == "") {
        return "";
    }

    $credentials = rawurldecode($this->user).":".rawurldecode((string)$this->pass);

    return sprintf("Authorization: Basic %s\r\n", base64_encode($credentials));
}
- }
- ?>