/wp-content/plugins/broken-link-checker/modules/checkers/http.php
PHP | 407 lines | 258 code | 66 blank | 83 comment | 47 complexity | d25910450cb9b7359f8c10812502ca15 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1, AGPL-1.0
- <?php
- /*
- Plugin Name: Basic HTTP
- Description: Check all links that have the HTTP/HTTPS protocol.
- Version: 1.0
- Author: Janis Elsts
- ModuleID: http
- ModuleCategory: checker
- ModuleContext: on-demand
- ModuleLazyInit: true
- ModuleClassName: blcHttpChecker
- ModulePriority: -1
- */
- //TODO: Rewrite sub-classes as transports, not stand-alone checkers
- class blcHttpChecker extends blcChecker {
- /* @var blcChecker */
- var $implementation = null;
-
- function init(){
- parent::init();
-
- if ( function_exists('curl_init') || is_callable('curl_init') ) {
- $this->implementation = new blcCurlHttp(
- $this->module_id,
- $this->cached_header,
- $this->plugin_conf,
- $this->module_manager
- );
- } else {
- //Try to load Snoopy.
- if ( !class_exists('Snoopy') ){
- $snoopy_file = ABSPATH. WPINC . '/class-snoopy.php';
- if (file_exists($snoopy_file) ){
- include $snoopy_file;
- }
- }
-
- //If Snoopy is available, it will be used in place of CURL.
- if ( class_exists('Snoopy') ){
- $this->implementation = new blcSnoopyHttp(
- $this->module_id,
- $this->cached_header,
- $this->plugin_conf,
- $this->module_manager
- );
- }
- }
- }
-
- function can_check($url, $parsed){
- if ( isset($this->implementation) ){
- return $this->implementation->can_check($url, $parsed);
- } else {
- return false;
- }
- }
-
- function check($url, $use_get = false){
- return $this->implementation->check($url, $use_get);
- }
- }
- /**
- * Base class for checkers that deal with HTTP(S) URLs.
- *
- * @package Broken Link Checker
- * @access public
- */
- class blcHttpCheckerBase extends blcChecker {
-
- function clean_url($url){
- $url = html_entity_decode($url);
- $url = preg_replace(
- array(
- '/([\?&]PHPSESSID=\w+)$/i', //remove session ID
- '/(#[^\/]*)$/', //and anchors/fragments
- '/&/', //convert improper HTML entities
- '/([\?&]sid=\w+)$/i' //remove another flavour of session ID
- ),
- array('','','&',''),
- $url
- );
- $url = trim($url);
-
- return $url;
- }
-
- function is_error_code($http_code){
- /*"Good" response codes are anything in the 2XX range (e.g "200 OK") and redirects - the 3XX range.
- HTTP 401 Unauthorized is a special case that is considered OK as well. Other errors - the 4XX range -
- are treated as such. */
- $good_code = ( ($http_code >= 200) && ($http_code < 400) ) || ( $http_code == 401 );
- return !$good_code;
- }
-
- /**
- * This checker only accepts HTTP(s) links.
- *
- * @param string $url
- * @param array|bool $parsed
- * @return bool
- */
- function can_check($url, $parsed){
- if ( !isset($parsed['scheme']) ) return false;
-
- return in_array( strtolower($parsed['scheme']), array('http', 'https') );
- }
-
- /**
- * Takes an URL and replaces spaces and some other non-alphanumeric characters with their urlencoded equivalents.
- *
- * @param string $url
- * @return string
- */
- function urlencodefix($url){
- //TODO: Remove/fix this. Probably not a good idea to "fix" invalid URLs like that.
- return preg_replace_callback(
- '|[^a-z0-9\+\-\/\\#:.,;=?!&%@()$\|*~_]|i',
- create_function('$str','return rawurlencode($str[0]);'),
- $url
- );
- }
-
- }
- class blcCurlHttp extends blcHttpCheckerBase {
-
- var $last_headers = '';
-
- function check($url, $use_get = false){
- $this->last_headers = '';
-
- $url = $this->clean_url($url);
-
- $result = array(
- 'broken' => false,
- );
- $log = '';
-
- //Get the BLC configuration. It's used below to set the right timeout values and such.
- $conf = blc_get_configuration();
-
- //Init curl.
- $ch = curl_init();
- curl_setopt($ch, CURLOPT_URL, $this->urlencodefix($url));
- curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
-
- //Masquerade as Internet explorer
- //$ua = 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)';
- $ua = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)';
- curl_setopt($ch, CURLOPT_USERAGENT, $ua);
-
- //Add a semi-plausible referer header to avoid tripping up some bot traps
- curl_setopt($ch, CURLOPT_REFERER, home_url());
-
- //Redirects don't work when safe mode or open_basedir is enabled.
- if ( !blcUtility::is_safe_mode() && !blcUtility::is_open_basedir() ) {
- curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
- }
- //Set maximum redirects
- curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
-
- //Set the timeout
- curl_setopt($ch, CURLOPT_TIMEOUT, $conf->options['timeout']);
- curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $conf->options['timeout']);
-
- //Set the proxy configuration. The user can provide this in wp-config.php
- if (defined('WP_PROXY_HOST')) {
- curl_setopt($ch, CURLOPT_PROXY, WP_PROXY_HOST);
- }
- if (defined('WP_PROXY_PORT')) {
- curl_setopt($ch, CURLOPT_PROXYPORT, WP_PROXY_PORT);
- }
- if (defined('WP_PROXY_USERNAME')){
- $auth = WP_PROXY_USERNAME;
- if (defined('WP_PROXY_PASSWORD')){
- $auth .= ':' . WP_PROXY_PASSWORD;
- }
- curl_setopt($ch, CURLOPT_PROXYUSERPWD, $auth);
- }
- //Make CURL return a valid result even if it gets a 404 or other error.
- curl_setopt($ch, CURLOPT_FAILONERROR, false);
-
- $nobody = !$use_get; //Whether to send a HEAD request (the default) or a GET request
-
- $parts = @parse_url($url);
- if( $parts['scheme'] == 'https' ){
- curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); //Required to make HTTPS URLs work.
- curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
- $nobody = false; //Can't use HEAD with HTTPS.
- }
-
- if ( $nobody ){
- //If possible, use HEAD requests for speed.
- curl_setopt($ch, CURLOPT_NOBODY, true);
- } else {
- //If we must use GET at least limit the amount of downloaded data.
- curl_setopt($ch, CURLOPT_HTTPHEADER, array('Range: bytes=0-2048')); //2 KB
- }
-
- //Register a callback function which will process the HTTP header(s).
- //It can be called multiple times if the remote server performs a redirect.
- curl_setopt($ch, CURLOPT_HEADERFUNCTION, array(&$this,'read_header'));
- //Execute the request
- $start_time = microtime_float();
- curl_exec($ch);
- $measured_request_duration = microtime_float() - $start_time;
-
- $info = curl_getinfo($ch);
-
- //Store the results
- $result['http_code'] = intval( $info['http_code'] );
- $result['final_url'] = $info['url'];
- $result['request_duration'] = $info['total_time'];
- $result['redirect_count'] = $info['redirect_count'];
-
- //CURL doesn't return a request duration when a timeout happens, so we measure it ourselves.
- //It is useful to see how long the plugin waited for the server to respond before assuming it timed out.
- if( empty($result['request_duration']) ){
- $result['request_duration'] = $measured_request_duration;
- }
-
- //Determine if the link counts as "broken"
- if ( $result['http_code'] == 0 ){
- $result['broken'] = true;
-
- $error_code = curl_errno($ch);
- $log .= sprintf( "%s [Error #%d]\n", curl_error($ch), $error_code );
-
- //We only handle a couple of CURL error codes; most are highly esoteric.
- //libcurl "CURLE_" constants can't be used here because some of them have
- //different names or values in PHP.
- switch( $error_code ) {
- case 6: //CURLE_COULDNT_RESOLVE_HOST
- $result['status_code'] = BLC_LINK_STATUS_WARNING;
- $result['status_text'] = __('Server Not Found', 'broken-link-checker');
- break;
-
- case 28: //CURLE_OPERATION_TIMEDOUT
- $result['timeout'] = true;
- break;
-
- case 7: //CURLE_COULDNT_CONNECT
- //More often than not, this error code indicates that the connection attempt
- //timed out. This heuristic tries to distinguish between connections that fail
- //due to timeouts and those that fail due to other causes.
- if ( $result['request_duration'] >= 0.9*$conf->options['timeout'] ){
- $result['timeout'] = true;
- } else {
- $result['status_code'] = BLC_LINK_STATUS_WARNING;
- $result['status_text'] = __('Connection Failed', 'broken-link-checker');
- }
- break;
-
- default:
- $result['status_code'] = BLC_LINK_STATUS_WARNING;
- $result['status_text'] = __('Unknown Error', 'broken-link-checker');
- }
-
- } else {
- $result['broken'] = $this->is_error_code($result['http_code']);
- }
- curl_close($ch);
-
- if ( $nobody && $result['broken'] ){
- //The site in question might be expecting GET instead of HEAD, so lets retry the request
- //using the GET verb.
- return $this->check($url, true);
-
- //Note : normally a server that doesn't allow HEAD requests on a specific resource *should*
- //return "405 Method Not Allowed". Unfortunately, there are sites that return 404 or
- //another, even more general, error code instead. So just checking for 405 wouldn't be enough.
- }
-
- //When safe_mode or open_basedir is enabled CURL will be forbidden from following redirects,
- //so redirect_count will be 0 for all URLs. As a workaround, set it to 1 when the HTTP
- //response codes indicates a redirect but redirect_count is zero.
- //Note to self : Extracting the Location header might also be helpful.
- if ( ($result['redirect_count'] == 0) && ( in_array( $result['http_code'], array(301, 302, 303, 307) ) ) ){
- $result['redirect_count'] = 1;
- }
-
- //Build the log from HTTP code and headers.
- $log .= '=== ';
- if ( $result['http_code'] ){
- $log .= sprintf( __('HTTP code : %d', 'broken-link-checker'), $result['http_code']);
- } else {
- $log .= __('(No response)', 'broken-link-checker');
- }
- $log .= " ===\n\n";
- $log .= $this->last_headers;
-
- if ( !empty($result['broken']) && !empty($result['timeout']) ) {
- $log .= "\n(" . __("Most likely the connection timed out or the domain doesn't exist.", 'broken-link-checker') . ')';
- }
-
- $result['log'] = $log;
-
- //The hash should contain info about all pieces of data that pertain to determining if the
- //link is working.
- $result['result_hash'] = implode('|', array(
- $result['http_code'],
- !empty($result['broken'])?'broken':'0',
- !empty($result['timeout'])?'timeout':'0',
- md5($result['final_url']),
- ));
-
- return $result;
- }
-
- function read_header($ch, $header){
- $this->last_headers .= $header;
- return strlen($header);
- }
-
- }
- class blcSnoopyHttp extends blcHttpCheckerBase {
-
- function check($url){
- $url = $this->clean_url($url);
- //Note : Snoopy doesn't work too well with HTTPS URLs.
-
- $result = array(
- 'broken' => false,
- 'timeout' => false,
- );
- $log = '';
-
- //Get the timeout setting from the BLC configuration.
- $conf = blc_get_configuration();
- $timeout = $conf->options['timeout'];
-
- $start_time = microtime_float();
-
- //Fetch the URL with Snoopy
- $snoopy = new Snoopy;
- $snoopy->read_timeout = $timeout; //read timeout in seconds
- $snoopy->agent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"; //masquerade as IE 7
- $snoopy->referer = home_url(); //valid referer helps circumvent some hotlink protection schemes
- $snoopy->maxlength = 1024*5; //load up to 5 kilobytes
- $snoopy->fetch( $this->urlencodefix($url) );
-
- $result['request_duration'] = microtime_float() - $start_time;
- $result['http_code'] = $snoopy->status; //HTTP status code
- //Snoopy returns -100 on timeout
- if ( $result['http_code'] == -100 ){
- $result['http_code'] = 0;
- $result['timeout'] = true;
- }
-
- //Build the log
- $log .= '=== ';
- if ( $result['http_code'] ){
- $log .= sprintf( __('HTTP code : %d', 'broken-link-checker'), $result['http_code']);
- } else {
- $log .= __('(No response)', 'broken-link-checker');
- }
- $log .= " ===\n\n";
- if ($snoopy->error)
- $log .= $snoopy->error."\n";
- if ($snoopy->timed_out) {
- $log .= __("Request timed out.", 'broken-link-checker') . "\n";
- $result['timeout'] = true;
- }
- if ( is_array($snoopy->headers) )
- $log .= implode("", $snoopy->headers)."\n"; //those headers already contain newlines
- //Redirected?
- if ( $snoopy->lastredirectaddr ) {
- $result['final_url'] = $snoopy->lastredirectaddr;
- $result['redirect_count'] = $snoopy->_redirectdepth;
- } else {
- $result['final_url'] = $url;
- }
-
- //Determine if the link counts as "broken"
- $result['broken'] = $this->is_error_code($result['http_code']) || $result['timeout'];
-
- $log .= "<em>(" . __('Using Snoopy', 'broken-link-checker') . ")</em>";
- $result['log'] = $log;
-
- //The hash should contain info about all pieces of data that pertain to determining if the
- //link is working.
- $result['result_hash'] = implode('|', array(
- $result['http_code'],
- $result['broken']?'broken':'0',
- $result['timeout']?'timeout':'0',
- md5($result['final_url']),
- ));
-
- return $result;
- }
-
- }
- ?>