PageRenderTime 46ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/wp-content/plugins/broken-link-checker/modules/checkers/http.php

https://bitbucket.org/lgorence/quickpress
PHP | 407 lines | 258 code | 66 blank | 83 comment | 47 complexity | d25910450cb9b7359f8c10812502ca15 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1, AGPL-1.0
  1. <?php
  2. /*
  3. Plugin Name: Basic HTTP
  4. Description: Check all links that have the HTTP/HTTPS protocol.
  5. Version: 1.0
  6. Author: Janis Elsts
  7. ModuleID: http
  8. ModuleCategory: checker
  9. ModuleContext: on-demand
  10. ModuleLazyInit: true
  11. ModuleClassName: blcHttpChecker
  12. ModulePriority: -1
  13. */
  14. //TODO: Rewrite sub-classes as transports, not stand-alone checkers
  class blcHttpChecker extends blcChecker {
      /**
       * The transport-specific checker that performs the actual requests.
       * Stays null when neither CURL nor Snoopy could be set up in init().
       *
       * @var blcChecker
       */
      var $implementation = null;

      /**
       * Choose the underlying HTTP implementation.
       *
       * The CURL-based checker is used whenever curl_init() is usable. Otherwise the
       * Snoopy class is looked up (and loaded from wp-includes if the file exists),
       * and the Snoopy-based checker is used if that class ends up being defined.
       */
      function init(){
          parent::init();

          $curl_available = function_exists('curl_init') || is_callable('curl_init');
          if ( $curl_available ) {
              $this->implementation = new blcCurlHttp(
                  $this->module_id,
                  $this->cached_header,
                  $this->plugin_conf,
                  $this->module_manager
              );
              return;
          }

          //No CURL. Try to make the Snoopy class available instead.
          if ( !class_exists('Snoopy') ){
              $snoopy_file = ABSPATH. WPINC . '/class-snoopy.php';
              if ( file_exists($snoopy_file) ){
                  include $snoopy_file;
              }
          }

          //Without Snoopy there is nothing left to fall back on; leave the implementation unset.
          if ( !class_exists('Snoopy') ){
              return;
          }

          $this->implementation = new blcSnoopyHttp(
              $this->module_id,
              $this->cached_header,
              $this->plugin_conf,
              $this->module_manager
          );
      }

      /**
       * Ask the selected implementation whether it can check the given URL.
       *
       * @param string $url
       * @param array|bool $parsed
       * @return bool Always false when no implementation is available.
       */
      function can_check($url, $parsed){
          if ( !isset($this->implementation) ){
              return false;
          }
          return $this->implementation->can_check($url, $parsed);
      }

      /**
       * Delegate the check to the selected implementation.
       *
       * @param string $url
       * @param bool $use_get Passed through unchanged to the implementation.
       * @return mixed Whatever the implementation's check() returns.
       */
      function check($url, $use_get = false){
          $outcome = $this->implementation->check($url, $use_get);
          return $outcome;
      }
  }
  /**
   * Base class for checkers that deal with HTTP(S) URLs.
   *
   * Provides URL clean-up, HTTP status classification and scheme filtering
   * that the concrete transports in this file share.
   *
   * @package Broken Link Checker
   * @access public
   */
  class blcHttpCheckerBase extends blcChecker {

      /**
       * Prepare a URL for checking.
       *
       * Decodes HTML entities, drops a trailing #fragment, removes a trailing
       * PHPSESSID=... or sid=... query parameter, converts any remaining "&amp;"
       * to "&" and trims surrounding whitespace.
       *
       * @param string $url
       * @return string
       */
      function clean_url($url){
          $url = html_entity_decode($url);
          $url = preg_replace(
              array(
                  //Remove anchors/fragments first. The session ID patterns below are anchored
                  //to the end of the string, so a trailing fragment would otherwise hide them.
                  '/(#[^\/]*)$/',
                  '/([\?&]PHPSESSID=\w+)$/i', //remove session ID
                  '/&amp;/',                  //convert improper HTML entities
                  '/([\?&]sid=\w+)$/i'        //remove another flavour of session ID
              ),
              array('','','&',''),
              $url
          );
          return trim($url);
      }

      /**
       * Decide whether an HTTP status code means the link is broken.
       *
       * Codes in the 2XX and 3XX ranges are "good". HTTP 401 Unauthorized is a special
       * case that is considered OK as well. Everything else is treated as an error.
       *
       * @param int $http_code
       * @return bool True if the code counts as an error.
       */
      function is_error_code($http_code){
          $good_code = ( ($http_code >= 200) && ($http_code < 400) ) || ( $http_code == 401 );
          return !$good_code;
      }

      /**
       * This checker only accepts HTTP(s) links.
       *
       * @param string $url
       * @param array|bool $parsed
       * @return bool
       */
      function can_check($url, $parsed){
          if ( !isset($parsed['scheme']) ) return false;
          return in_array( strtolower($parsed['scheme']), array('http', 'https'), true );
      }

      /**
       * Takes an URL and replaces spaces and some other non-alphanumeric characters with their urlencoded equivalents.
       *
       * @param string $url
       * @return string
       */
      function urlencodefix($url){
          //TODO: Remove/fix this. Probably not a good idea to "fix" invalid URLs like that.
          //A method callback is used instead of create_function(), which no longer exists in PHP 8.
          return preg_replace_callback(
              '|[^a-z0-9\+\-\/\\#:.,;=?!&%@()$\|*~_]|i',
              array($this, 'urlencodefix_callback'),
              $url
          );
      }

      /**
       * preg_replace_callback() handler for urlencodefix().
       *
       * @param array $match Match data; element 0 is the single character to encode.
       * @return string The raw-urlencoded form of the matched character.
       */
      function urlencodefix_callback($match){
          return rawurlencode($match[0]);
      }
  }
  /**
   * HTTP(S) link checker that uses the CURL extension.
   */
  class blcCurlHttp extends blcHttpCheckerBase {
      /**
       * Raw response headers collected by read_header() during the current request.
       *
       * @var string
       */
      var $last_headers = '';

      /**
       * Check a URL with CURL.
       *
       * Sends a HEAD request by default (GET for HTTPS URLs or when $use_get is true).
       * If a HEAD request reports the link as broken, the check is repeated once with GET.
       *
       * @param string $url
       * @param bool $use_get Use GET instead of HEAD.
       * @return array Result with the keys 'broken', 'http_code', 'final_url', 'request_duration',
       *               'redirect_count', 'log' and 'result_hash'; 'timeout', 'status_code' and
       *               'status_text' are added for some connection-level failures.
       */
      function check($url, $use_get = false){
          $this->last_headers = '';

          $url = $this->clean_url($url);

          $result = array(
              'broken' => false,
          );
          $log = '';

          //Get the BLC configuration. It's used below to set the right timeout values and such.
          $conf = blc_get_configuration();

          //Init curl.
          $ch = curl_init();
          curl_setopt($ch, CURLOPT_URL, $this->urlencodefix($url));
          curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

          //Masquerade as Internet explorer
          $ua = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)';
          curl_setopt($ch, CURLOPT_USERAGENT, $ua);

          //Add a semi-plausible referer header to avoid tripping up some bot traps
          curl_setopt($ch, CURLOPT_REFERER, home_url());

          //Redirects don't work when safe mode or open_basedir is enabled.
          if ( !blcUtility::is_safe_mode() && !blcUtility::is_open_basedir() ) {
              curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
          }
          //Set maximum redirects
          curl_setopt($ch, CURLOPT_MAXREDIRS, 10);

          //Set the timeout
          curl_setopt($ch, CURLOPT_TIMEOUT, $conf->options['timeout']);
          curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $conf->options['timeout']);

          //Set the proxy configuration. The user can provide this in wp-config.php
          if (defined('WP_PROXY_HOST')) {
              curl_setopt($ch, CURLOPT_PROXY, WP_PROXY_HOST);
          }
          if (defined('WP_PROXY_PORT')) {
              curl_setopt($ch, CURLOPT_PROXYPORT, WP_PROXY_PORT);
          }
          if (defined('WP_PROXY_USERNAME')){
              $auth = WP_PROXY_USERNAME;
              if (defined('WP_PROXY_PASSWORD')){
                  $auth .= ':' . WP_PROXY_PASSWORD;
              }
              curl_setopt($ch, CURLOPT_PROXYUSERPWD, $auth);
          }

          //Make CURL return a valid result even if it gets a 404 or other error.
          curl_setopt($ch, CURLOPT_FAILONERROR, false);

          $nobody = !$use_get; //Whether to send a HEAD request (the default) or a GET request

          //parse_url() can return false or omit the scheme for malformed input, and schemes are
          //case-insensitive (can_check() lower-cases them too), so test defensively.
          $parts = @parse_url($url);
          $is_https = is_array($parts)
              && isset($parts['scheme'])
              && ( strtolower($parts['scheme']) == 'https' );

          if ( $is_https ){
              //NOTE(review): certificate verification is switched off for every HTTPS link.
              curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); //Required to make HTTPS URLs work.
              curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
              $nobody = false; //Can't use HEAD with HTTPS.
          }

          if ( $nobody ){
              //If possible, use HEAD requests for speed.
              curl_setopt($ch, CURLOPT_NOBODY, true);
          } else {
              //If we must use GET at least limit the amount of downloaded data.
              curl_setopt($ch, CURLOPT_HTTPHEADER, array('Range: bytes=0-2048')); //2 KB
          }

          //Register a callback function which will process the HTTP header(s).
          //It can be called multiple times if the remote server performs a redirect.
          curl_setopt($ch, CURLOPT_HEADERFUNCTION, array($this, 'read_header'));

          //Execute the request
          $start_time = microtime_float();
          curl_exec($ch);
          $measured_request_duration = microtime_float() - $start_time;

          $info = curl_getinfo($ch);

          //Store the results
          $result['http_code'] = intval( $info['http_code'] );
          $result['final_url'] = $info['url'];
          $result['request_duration'] = $info['total_time'];
          $result['redirect_count'] = $info['redirect_count'];

          //CURL doesn't return a request duration when a timeout happens, so we measure it ourselves.
          //It is useful to see how long the plugin waited for the server to respond before assuming it timed out.
          if( empty($result['request_duration']) ){
              $result['request_duration'] = $measured_request_duration;
          }

          //Determine if the link counts as "broken"
          if ( $result['http_code'] == 0 ){
              $result['broken'] = true;

              $error_code = curl_errno($ch);
              $log .= sprintf( "%s [Error #%d]\n", curl_error($ch), $error_code );

              //We only handle a couple of CURL error codes; most are highly esoteric.
              //libcurl "CURLE_" constants can't be used here because some of them have
              //different names or values in PHP.
              switch( $error_code ) {
                  case 6: //CURLE_COULDNT_RESOLVE_HOST
                      $result['status_code'] = BLC_LINK_STATUS_WARNING;
                      $result['status_text'] = __('Server Not Found', 'broken-link-checker');
                      break;

                  case 28: //CURLE_OPERATION_TIMEDOUT
                      $result['timeout'] = true;
                      break;

                  case 7: //CURLE_COULDNT_CONNECT
                      //More often than not, this error code indicates that the connection attempt
                      //timed out. This heuristic tries to distinguish between connections that fail
                      //due to timeouts and those that fail due to other causes.
                      if ( $result['request_duration'] >= 0.9*$conf->options['timeout'] ){
                          $result['timeout'] = true;
                      } else {
                          $result['status_code'] = BLC_LINK_STATUS_WARNING;
                          $result['status_text'] = __('Connection Failed', 'broken-link-checker');
                      }
                      break;

                  default:
                      $result['status_code'] = BLC_LINK_STATUS_WARNING;
                      $result['status_text'] = __('Unknown Error', 'broken-link-checker');
              }
          } else {
              $result['broken'] = $this->is_error_code($result['http_code']);
          }
          curl_close($ch);

          if ( $nobody && $result['broken'] ){
              //The site in question might be expecting GET instead of HEAD, so lets retry the request
              //using the GET verb.
              //Note : normally a server that doesn't allow HEAD requests on a specific resource *should*
              //return "405 Method Not Allowed". Unfortunately, there are sites that return 404 or
              //another, even more general, error code instead. So just checking for 405 wouldn't be enough.
              return $this->check($url, true);
          }

          //When safe_mode or open_basedir is enabled CURL will be forbidden from following redirects,
          //so redirect_count will be 0 for all URLs. As a workaround, set it to 1 when the HTTP
          //response codes indicates a redirect but redirect_count is zero.
          //Note to self : Extracting the Location header might also be helpful.
          $redirect_codes = array(301, 302, 303, 307, 308);
          if ( ($result['redirect_count'] == 0) && in_array( $result['http_code'], $redirect_codes ) ){
              $result['redirect_count'] = 1;
          }

          //Build the log from HTTP code and headers.
          $log .= '=== ';
          if ( $result['http_code'] ){
              $log .= sprintf( __('HTTP code : %d', 'broken-link-checker'), $result['http_code']);
          } else {
              $log .= __('(No response)', 'broken-link-checker');
          }
          $log .= " ===\n\n";
          $log .= $this->last_headers;

          if ( !empty($result['broken']) && !empty($result['timeout']) ) {
              $log .= "\n(" . __("Most likely the connection timed out or the domain doesn't exist.", 'broken-link-checker') . ')';
          }

          $result['log'] = $log;

          //The hash should contain info about all pieces of data that pertain to determining if the
          //link is working.
          $result['result_hash'] = implode('|', array(
              $result['http_code'],
              !empty($result['broken'])?'broken':'0',
              !empty($result['timeout'])?'timeout':'0',
              md5($result['final_url']),
          ));

          return $result;
      }

      /**
       * CURLOPT_HEADERFUNCTION callback: appends each received header line to $last_headers.
       *
       * @param resource $ch The CURL handle (unused).
       * @param string $header One raw header line.
       * @return int Number of bytes handled, which CURL expects to equal the length of $header.
       */
      function read_header($ch, $header){
          $this->last_headers .= $header;
          return strlen($header);
      }
  }
  /**
   * HTTP link checker that uses the Snoopy class. Used when CURL is not available.
   */
  class blcSnoopyHttp extends blcHttpCheckerBase {

      /**
       * Check a URL with Snoopy.
       *
       * @param string $url
       * @param bool $use_get Accepted so the signature matches blcCurlHttp::check() and the call
       *                      made by blcHttpChecker::check(); this implementation does not use it.
       * @return array Result with the keys 'broken', 'timeout', 'http_code', 'final_url',
       *               'redirect_count', 'request_duration', 'log' and 'result_hash'.
       */
      function check($url, $use_get = false){
          $url = $this->clean_url($url);
          //Note : Snoopy doesn't work too well with HTTPS URLs.

          $result = array(
              'broken' => false,
              'timeout' => false,
              'redirect_count' => 0, //always present, like in the CURL-based checker
          );
          $log = '';

          //Get the timeout setting from the BLC configuration.
          $conf = blc_get_configuration();
          $timeout = $conf->options['timeout'];

          $start_time = microtime_float();

          //Fetch the URL with Snoopy
          $snoopy = new Snoopy;
          $snoopy->read_timeout = $timeout; //read timeout in seconds
          $snoopy->agent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"; //masquerade as IE 7
          $snoopy->referer = home_url(); //valid referer helps circumvent some hotlink protection schemes
          $snoopy->maxlength = 1024*5; //load up to 5 kilobytes
          $snoopy->fetch( $this->urlencodefix($url) );

          $result['request_duration'] = microtime_float() - $start_time;

          //HTTP status code, normalized to an integer like in the CURL-based checker.
          $result['http_code'] = intval( $snoopy->status );

          //Snoopy returns -100 on timeout
          if ( $result['http_code'] == -100 ){
              $result['http_code'] = 0;
              $result['timeout'] = true;
          }

          //Build the log
          $log .= '=== ';
          if ( $result['http_code'] ){
              $log .= sprintf( __('HTTP code : %d', 'broken-link-checker'), $result['http_code']);
          } else {
              $log .= __('(No response)', 'broken-link-checker');
          }
          $log .= " ===\n\n";

          if ( $snoopy->error ) {
              $log .= $snoopy->error."\n";
          }
          if ( $snoopy->timed_out ) {
              $log .= __("Request timed out.", 'broken-link-checker') . "\n";
              $result['timeout'] = true;
          }

          if ( is_array($snoopy->headers) ) {
              $log .= implode("", $snoopy->headers)."\n"; //those headers already contain newlines
          }

          //Redirected?
          if ( $snoopy->lastredirectaddr ) {
              $result['final_url'] = $snoopy->lastredirectaddr;
              $result['redirect_count'] = $snoopy->_redirectdepth;
          } else {
              $result['final_url'] = $url;
          }

          //Determine if the link counts as "broken"
          $result['broken'] = $this->is_error_code($result['http_code']) || $result['timeout'];

          $log .= "<em>(" . __('Using Snoopy', 'broken-link-checker') . ")</em>";
          $result['log'] = $log;

          //The hash should contain info about all pieces of data that pertain to determining if the
          //link is working.
          $result['result_hash'] = implode('|', array(
              $result['http_code'],
              $result['broken']?'broken':'0',
              $result['timeout']?'timeout':'0',
              md5($result['final_url']),
          ));

          return $result;
      }
  }
  333. ?>