plaintext-url.php - Regexp for detecting plaintext URLs lif…

/wp-content/plugins/broken-link-checker/modules/extras/plaintext-url.php

https://bitbucket.org/lgorence/quickpress · PHP · 154 lines · 79 code · 20 blank · 55 comment · 16 complexity · d3fbba1ba064f37f63c58ddd9e5587f5 MD5 · raw file

<?php
/*
Plugin Name: Plaintext URLs 
Description: Parse plaintext URLs as links
Version: 1.0
Author: Janis Elsts

ModuleCategory: parser
ModuleClassName: blcPlaintextURL
ModuleContext: on-demand
ModuleLazyInit: true

ModulePriority: 800
*/

/**
 * Parser module that treats bare URLs found in HTML or plaintext content as links.
 *
 * URLs that already sit inside an <a> element or inside a tag attribute are skipped
 * on purpose - other parsers are responsible for those.
 */
class blcPlaintextURL extends blcParser {
	var $supported_formats = array('html', 'plaintext');
	
	/**
	 * Regexp for detecting plaintext URLs, lifted from make_clickable().
	 *
	 * Capture groups used by this class:
	 *   1 - an optional opening parenthesis immediately before the URL;
	 *   2 - the URL itself.
	 * The full match ($match[0]) is therefore group 1 followed by group 2.
	 *
	 * The leading lookbehind requires whitespace, '>' or ']' right before the
	 * match, so a URL at the very start of the subject string is not matched.
	 */
	var $url_regexp = '#(?<=[\s>\]])(\()?([\w]+?://(?:[\w\\x80-\\xff\#$%&~/=?@\[\](+-]|[.,;:](?![\s<]|(\))?([\s]|$))|(?(1)\)(?![\s<.,;:]|$)|\)))+)#is';
	
	//Used by the edit and unlink callbacks
	var $old_url = '';
	var $new_url = '';
	
	/**
	 * Parse a string for plaintext URLs.
	 *
	 * @param string $content The text to parse.
	 * @param string $base_url The base URL. Ignored.
	 * @param string $default_link_text Default link text. Ignored; the matched URL is used as the link text.
	 * @return array An array of new blcLinkInstance objects (empty if nothing was found).
	 */
	function parse($content, $base_url = '', $default_link_text = ''){
		$instances = array();
		
		//Don't want to detect URLs inside links or tag attributes - 
		//there are already other parsers for that.
		
		//Avoid <a href="http://...">http://...</a>
		//"<a" must be followed by whitespace or ">" so that other tags that merely start 
		//with "a" (<abbr>, <address>, <article>, ...) don't open a match and swallow 
		//everything up to the next </a>.
		$content = preg_replace('#<a(?:\s[^>]*)?>.*?</a>#si', '', $content);
		if ( !is_string($content) ){
			//preg_replace() returns null when PCRE fails; there is nothing sensible left to scan.
			return $instances;
		}
		
		//HTML tags are treated as natural boundaries for plaintext URLs 
		//(since we strip tags, we must place another boundary char where they were).
		//The closing tag of [shortcodes] is also treated as a boundary.  
		$content = str_replace(array('<', '>', '[/'), array("\n<", ">\n", "\n[/"), $content);
		//Finally, kill all tags.
		$content = strip_tags($content);
		
		//Find all URLs
		$found = preg_match_all($this->url_regexp, $content, $matches);
		if ( !$found ){
			return $instances;
		}
		
		//Create a new instance for each match. Group 2 is the URL without any leading "(".
		foreach($matches[2] as $match){
			//Do a little bit of validation
			$url = esc_url_raw(trim($match));
			if ( empty($url) ){
				continue;
			}
			
			//Note: filter_var() is no panacea as it accepts many invalid URLs.
			//FILTER_VALIDATE_URL already requires a host; the separate 
			//FILTER_FLAG_HOST_REQUIRED constant was removed in PHP 8.0, so it is not passed.
			if ( function_exists('filter_var') && !filter_var($url, FILTER_VALIDATE_URL) ){
				continue;
			}
			
			//The host must contain a dot. Because position 0 is falsy too, 
			//hosts that begin with a dot are rejected as well.
			$parts = @parse_url($url);
			if ( empty($parts['host']) || !strpos($parts['host'], '.') ){
				continue;
			}
			
			//Create a new link instance.
			$instance = new blcLinkInstance();
			$instance->set_parser($this);
			$instance->raw_url = $match;
			$instance->link_text = $match;
			
			$link_obj = new blcLink($url); //Creates or loads the link
			$instance->set_link($link_obj);
			
			$instances[] = $instance;
		}
		
		return $instances;
	}
	
	/**
	 * Change all occurrences of a given plaintext URL to a new URL.
	 *
	 * @param string $content Look for URLs in this string.
	 * @param string $new_url Change them to this URL.
	 * @param string $old_url The URL to look for.
	 * @param string $old_raw_url The raw, not-normalized URL. Optional. When non-empty it is matched instead of $old_url.
	 *
	 * @return array|WP_Error If successful, the return value will be an associative array with three
	 * keys : 'content' - the modified content, 'raw_url' - the new raw, non-normalized URL used
	 * for the modified links, and 'link_text' - the new link text. Both 'raw_url' and 'link_text'
	 * are set to $new_url. A WP_Error is returned if the regular expression could not be applied.
	 */
	function edit($content, $new_url, $old_url, $old_raw_url = ''){
		$this->new_url = $new_url;
		$this->old_url = empty($old_raw_url) ? $old_url : $old_raw_url;
		
		$modified = preg_replace_callback($this->url_regexp, array($this, 'edit_callback'), $content);
		if ( $modified === null ){
			//A PCRE failure yields null; handing that back as the new content would wipe it.
			return new WP_Error(
				'regexp_error',
				'Could not replace the plaintext URL because the regular expression failed.'
			);
		}
		
		return array(
			'content' => $modified,
			'raw_url' => $new_url,
			'link_text' => $new_url,
		);
	}
	
	/**
	 * preg_replace_callback() handler for edit().
	 *
	 * @param array $match Regexp match: [0] full match, [1] optional "(", [2] the URL.
	 * @return string Replacement text for the full match.
	 */
	function edit_callback($match){
		if ( $match[2] == $this->old_url ){
			//Keep the opening parenthesis (if any): it is part of the match but not of the URL.
			return $match[1] . $this->new_url;
		}
		return $match[0];
	}
	
	/**
	 * Remove all occurrences of a specific plaintext URL.
	 *
	 * @param string $content Look for URLs in this string.
	 * @param string $url The URL to look for.
	 * @param string $raw_url The raw, non-normalized version of the URL to look for. Optional. When non-empty it is matched instead of $url.
	 * @return string Input string with all matching plaintext URLs removed. 
	 */
	function unlink($content, $url, $raw_url = ''){
		$this->old_url = empty($raw_url) ? $url : $raw_url;
		
		$modified = preg_replace_callback($this->url_regexp, array($this, 'unlink_callback'), $content);
		
		//On a PCRE failure (null) leave the content untouched rather than returning null.
		return ($modified === null) ? $content : $modified;
	}
	
	/**
	 * preg_replace_callback() handler for unlink().
	 *
	 * @param array $match Regexp match: [0] full match, [1] optional "(", [2] the URL.
	 * @return string Replacement text for the full match.
	 */
	function unlink_callback($match){
		if ( $match[2] == $this->old_url ){
			//Drop only the URL; an opening parenthesis captured before it stays in place.
			return $match[1];
		}
		return $match[0];
	}
}
?>