inputfilter.php | searchcode

/system/classes/inputfilter.php

https://github.com/HabariMag/habarimag-old
PHP | 487 lines | 321 code | 45 blank | 121 comment | 59 complexity | d35920e60f1266583f042b2629b06998 MD5 | raw file
Possible License(s): Apache-2.0

<?php
/**
 * @package Habari
 *
 */

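/**
 * Input filtering functions.
 *
 * Whitelist-based clean-up of user-supplied markup: NUL bytes and unknown
 * character entities are stripped, and only explicitly allowed elements,
 * attributes and URI schemes are kept as markup.
 */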
class InputFilter
{
	/**
	 * Allowed elements.
	 *
	 * Names are compared in lower case. Elements that are not listed here are
	 * not kept as markup: filter_html_elements() re-emits them as text nodes.
	 */
	private static $whitelist_elements = array(
		// http://www.w3.org/TR/html4/struct/global.html#h-7.5.4
		'div', 'span',
		// http://www.w3.org/TR/html4/struct/links.html#h-12.2
		'a',
		// http://www.w3.org/TR/html4/struct/text.html#h-9.2.1
		'strong', 'em', 'code', 'kbd', 'dfn', 'samp', 'var', 'cite', 'abbr', 'acronym',
		// http://www.w3.org/TR/html4/struct/text.html#h-9.2.2
		'blockquote', 'q',
		// http://www.w3.org/TR/html4/struct/text.html#h-9.2.3
		'sub', 'sup',
		// http://www.w3.org/TR/html4/struct/text.html#h-9.3.1
		'p',
		// http://www.w3.org/TR/html4/struct/text.html#h-9.3.2.1
		'br',
		// http://www.w3.org/TR/html4/struct/text.html#h-9.3.4
		'pre',
		// http://www.w3.org/TR/html4/struct/text.html#h-9.4
		'ins', 'del',
		// http://www.w3.org/TR/html4/struct/lists.html#h-10.2
		'ol', 'ul', 'li',
		// http://www.w3.org/TR/html4/struct/lists.html#h-10.3
		'dl', 'dt', 'dd',
		// http://www.w3.org/TR/html4/present/graphics.html#h-15.2.1 (the non-deprecated ones)
		'tt', 'i', 'b', 'big', 'small',
		// http://www.w3.org/TR/html4/present/graphics.html#h-15.2.1 (deprecated there, still accepted here)
		'u', 's',
		// http://www.w3.org/TR/html4/struct/global.html#h-7.5.5
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
		// http://www.w3.org/TR/html4/struct/global.html#h-7.5.6
		'address',
		// http://www.w3.org/TR/html4/struct/dirlang.html#h-8.2.4
		'bdo',
		// http://www.w3.org/TR/html4/struct/tables.html#h-11.2.1
		'table',
		// http://www.w3.org/TR/html4/struct/tables.html#h-11.2.2
		'caption',
		// http://www.w3.org/TR/html4/struct/tables.html#h-11.2.3
		'thead', 'tfoot', 'tbody',
		// http://www.w3.org/TR/html4/struct/tables.html#h-11.2.4
		'colgroup', 'col',
		// http://www.w3.org/TR/html4/struct/tables.html#h-11.2.5
		'tr',
		// http://www.w3.org/TR/html4/struct/tables.html#h-11.2.6
		'th', 'td',
		// http://www.w3.org/TR/html4/struct/objects.html#h-13.2
		'img',
		// http://www.w3.org/TR/html4/struct/objects.html#h-13.6.1
		'map', 'area',
		// http://www.w3.org/TR/html4/present/graphics.html#h-15.3
		'hr',
		// http://www.w3.org/TR/html4/present/frames.html#h-16.2.1
		'frameset',
		// http://www.w3.org/TR/html4/present/frames.html#h-16.2.2
		'frame',
		// http://www.w3.org/TR/html4/present/frames.html#h-16.4.1
		'noframes',
		// http://www.w3.org/TR/html4/present/frames.html#h-16.5
		'iframe',
	);

	/**
	 * Allowed attributes and values.
	 *
	 * Keyed by lower-case element name ('*' applies to every element), then
	 * by lower-case attribute name. Each value is either an array of the
	 * exact values allowed, or the name of a data type understood by
	 * check_attr_value(): 'uri', 'language-code', 'text' or 'datetime'.
	 */
	private static $whitelist_attributes = array(
		// attributes that are valid for ALL elements (a subset of coreattrs)
		// elements that only take coreattrs don't need to be listed separately
		'*' => array(
			'lang' => 'language-code',
			'xml:lang' => 'language-code', // this is our xhtml support... all of it
			'dir' => array( 'ltr', 'rtl', ),
			'title' => 'text',
		),
		// http://www.w3.org/TR/html4/struct/links.html#h-12.2
		'a' => array( 'href' => 'uri', ),
		// http://www.w3.org/TR/html4/struct/text.html#h-9.4
		'ins' => array( 'cite' => 'uri', 'datetime' => 'datetime', ),
		'del' => array( 'cite' => 'uri', 'datetime' => 'datetime', ),
		// http://www.w3.org/TR/html4/struct/text.html#h-9.2.2
		'blockquote' => array( 'cite' => 'uri', ),
		'q' => array( 'cite' => 'uri', ),
		'img' => array( 'src' => 'uri', 'alt' => 'text' ),
	);

	/**
	 * #EMPTY elements.
	 *
	 * Only consulted for elements that are NOT whitelisted, to decide whether
	 * a matching close tag should be expected.
	 */
	private static $elements_empty = array(
		'img',
	);

	/**
	 * Protocols that are ok for use in URIs (lower case).
	 */
	private static $whitelist_protocols = array(
		'http', 'https', 'ftp', 'mailto', 'irc', 'news', 'nntp', 'callto', 'rtsp', 'mms', 'svn',
	);

	/**
	 * List of all defined named character entities in HTML 4.01 and XHTML.
	 * Entity names are case-sensitive ('Prime' and 'prime' are different).
	 */
	private static $character_entities = array(
		'nbsp', 'iexcl', 'cent', 'pound', 'curren', 'yen', 'brvbar', 'sect', 'uml',
		'copy', 'ordf', 'laquo', 'not', 'shy', 'reg', 'macr', 'deg', 'plusmn',
		'sup2', 'sup3', 'acute', 'micro', 'para', 'middot', 'cedil', 'sup1', 'ordm',
		'raquo', 'frac14', 'frac12', 'frac34', 'iquest', 'Agrave', 'Aacute', 'Acirc',
		'Atilde', 'Auml', 'Aring', 'AElig', 'Ccedil', 'Egrave', 'Eacute', 'Ecirc',
		'Euml', 'Igrave', 'Iacute', 'Icirc', 'Iuml', 'ETH', 'Ntilde', 'Ograve',
		'Oacute', 'Ocirc', 'Otilde', 'Ouml', 'times', 'Oslash', 'Ugrave', 'Uacute',
		'Ucirc', 'Uuml', 'Yacute', 'THORN', 'szlig', 'agrave', 'aacute', 'acirc',
		'atilde', 'auml', 'aring', 'aelig', 'ccedil', 'egrave', 'eacute', 'ecirc',
		'euml', 'igrave', 'iacute', 'icirc', 'iuml', 'eth', 'ntilde', 'ograve',
		'oacute', 'ocirc', 'otilde', 'ouml', 'divide', 'oslash', 'ugrave', 'uacute',
		'ucirc', 'uuml', 'yacute', 'thorn', 'yuml', 'fnof', 'Alpha', 'Beta', 'Gamma',
		'Delta', 'Epsilon', 'Zeta', 'Eta', 'Theta', 'Iota', 'Kappa', 'Lambda', 'Mu',
		'Nu', 'Xi', 'Omicron', 'Pi', 'Rho', 'Sigma', 'Tau', 'Upsilon', 'Phi', 'Chi',
		'Psi', 'Omega', 'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta',
		'theta', 'iota', 'kappa', 'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi', 'rho',
		'sigmaf', 'sigma', 'tau', 'upsilon', 'phi', 'chi', 'psi', 'omega',
		'thetasym', 'upsih', 'piv', 'bull', 'hellip', 'prime', 'Prime', 'oline',
		'frasl', 'weierp', 'image', 'real', 'trade', 'alefsym', 'larr', 'uarr',
		'rarr', 'darr', 'harr', 'crarr', 'lArr', 'uArr', 'rArr', 'dArr', 'hArr',
		'forall', 'part', 'exist', 'empty', 'nabla', 'isin', 'notin', 'ni', 'prod',
		'sum', 'minus', 'lowast', 'radic', 'prop', 'infin', 'ang', 'and', 'or',
		'cap', 'cup', 'int', 'there4', 'sim', 'cong', 'asymp', 'ne', 'equiv', 'le',
		'ge', 'sub', 'sup', 'nsub', 'sube', 'supe', 'oplus', 'otimes', 'perp',
		'sdot', 'lceil', 'rceil', 'lfloor', 'rfloor', 'lang', 'rang', 'loz',
		'spades', 'clubs', 'hearts', 'diams', 'quot', 'amp', 'lt', 'gt', 'OElig',
		'oelig', 'Scaron', 'scaron', 'Yuml', 'circ', 'tilde', 'ensp', 'emsp',
		'thinsp', 'zwnj', 'zwj', 'lrm', 'rlm', 'ndash', 'mdash', 'lsquo', 'rsquo',
		'sbquo', 'ldquo', 'rdquo', 'bdquo', 'dagger', 'Dagger', 'permil', 'lsaquo',
		'rsaquo', 'euro',
	);

	/**
	 * Perform all filtering, return new string.
	 *
	 * The individual filters are re-applied until the string stops changing,
	 * so that something one pass exposes is caught by the next pass.
	 *
	 * @param string $str Input string.
	 * @return string Filtered output string; '' when MultiByte::valid_data() rejects the input.
	 */
	public static function filter( $str )
	{
		if ( !MultiByte::valid_data( $str ) ) {
			return '';
		}

		do {
			$previous = $str;
			$str = self::strip_nulls( $str );
			$str = self::strip_illegal_entities( $str );
			$str = self::filter_html_elements( $str );
			// strict comparison: with != two different numeric-looking strings
			// (e.g. "1e3" and "1000") compare equal and would end the loop early
		} while ( $str !== $previous );

		return $str;
	}

	/**
	 * Remove NUL bytes from a string.
	 *
	 * @param string $str Input string.
	 * @return string The input without any NUL (0x00) bytes.
	 */
	public static function strip_nulls( $str )
	{
		// must be double-quoted: '\0' in single quotes is the two characters
		// backslash and zero, not a NUL byte
		$str = str_replace( "\0", '', $str );

		return $str;
	}

	/**
	 * Callback function for strip_illegal_entities, do not use.
	 *
	 * Valid entity references have the form
	 *   /&named([;<\n\r])/
	 * for named entities, or
	 *   /&#(\d{1,5}|[xX][0-9a-fA-F]{1,4})([;<\n\r])/
	 * for numeric character references (leading zeros are tolerated).
	 *
	 * @access private
	 * @param array $m matches: [1] is the text between '&' and the terminator, [2] is the terminator
	 * @return string The normalised entity followed by the terminator (unless
	 *   the terminator was ';'), or '' if the entity is not valid.
	 */
	public static function _validate_entity( $m )
	{
		$e = trim( $m[1] );
		// a ';' terminator is re-added by the normalisation below; any other
		// terminator (<, newline) has to be put back after the entity
		$r = ( $m[2] === ';' ) ? '' : $m[2];

		if ( $e === '' ) {
			return '';
		}

		if ( $e[0] === '#' ) {
			// Validate the whole reference instead of letting hexdec()/intval()
			// silently skip garbage. The bounded digit count also keeps the
			// value far away from integer overflow.
			if ( !preg_match( '/^#(?:[xX]0*([0-9a-fA-F]{1,4})|0*([0-9]{1,5}))$/D', $e, $num ) ) {
				return '';
			}
			$code = ( $num[1] !== '' ) ? hexdec( $num[1] ) : intval( $num[2], 10 );

			// numeric character references may only have values in the range 0-65535 (16 bit)
			// we strip null, though, just for kicks
			if ( $code < 1 || $code > 65535 ) {
				return '';
			}

			// normalize to decimal form
			return '&#' . $code . ';' . $r;
		}

		// Named entities must be known. This is an exact, case-sensitive
		// lookup: a substring match would turn "sube" into "sub", "notin"
		// into "not", and so on.
		// XXX should we map named entities to their numeric equivalents?
		if ( !in_array( $e, self::$character_entities, true ) ) {
			return '';
		}

		return '&' . $e . ';' . $r;
	}

	/**
	 * Remove entity references that are not valid HTML 4.01 / XHTML entities
	 * and normalise the valid ones (see _validate_entity()).
	 *
	 * Anything between an '&' and the next ';', '<' or line break is treated
	 * as an entity reference.
	 *
	 * @param string $str Input string.
	 * @return string The string with invalid entity references removed.
	 */
	public static function strip_illegal_entities( $str )
	{
		$str = preg_replace_callback( "/&([^;<\n\r]+)([;<\n\r])/", array( __CLASS__, '_validate_entity' ), $str );

		return $str;
	}

	/**
	 * Split a URL into its components.
	 *
	 * This really doesn't belong here. It should also be done much better. This is a nasty, NASTY kludge.
	 *
	 * The scheme is taken from a sanitized copy of the URL (entities decoded,
	 * characters that FILTER_SANITIZE_URL rejects removed) so that an
	 * obfuscated scheme cannot slip past the protocol whitelist. It is
	 * returned in lower case, as schemes are case-insensitive.
	 *
	 * @param string $url The URL to parse.
	 * @return array The keys of PHP's parse_url() (all present, '' when absent) plus:
	 *   'is_relative' (bool) no host and not a pseudo URL,
	 *   'is_pseudo' (bool) scheme is something other than http, https or none,
	 *   'is_error' (bool) the URL could not be parsed,
	 *   'pseudo_args' (string) for pseudo URLs, what parse_url() reported as the path.
	 */
	public static function parse_url( $url )
	{
		$url = (string) $url;

		// result array
		$r = array(
			'scheme' => '',
			'host' => '',
			'port' => '',
			'user' => '',
			'pass' => '',
			'path' => '',
			'query' => '',
			'fragment' => '',
			//
			'is_relative' => false,
			'is_pseudo' => false,
			'is_error' => false,
			//
			'pseudo_args' => '',
		);

		// sanitize the url
		$sanitized = html_entity_decode( $url, ENT_QUOTES, 'UTF-8' );		// make double-sure we've converted all entities
		$sanitized = filter_var( $sanitized, FILTER_SANITIZE_URL );		// strip everything but ascii, essentially

		$sanitized_scheme = parse_url( $sanitized, PHP_URL_SCHEME );

		// Use PHP's parse_url to get the basics
		$parsed = parse_url( $url );
		// false means "seriously malformed"; if either form is unparseable we
		// cannot trust the scheme, so report an error rather than guessing
		if ( $parsed === false || $sanitized_scheme === false ) {
			$r['is_error'] = true;
			return $r;
		}
		$r = array_merge( $r, $parsed );

		// replace the scheme with the one we got from the fully-sanitized string
		// (null when there is none; normalised to a lower-case string)
		$r['scheme'] = strtolower( (string) $sanitized_scheme );

		$r['is_pseudo'] = !in_array( $r['scheme'], array( 'http', 'https', '' ), true );
		$r['is_relative'] = ( $r['host'] === '' && !$r['is_pseudo'] );

		if ( $r['is_pseudo'] ) {
			$r['pseudo_args'] = $r['path'];
			$r['path'] = '';
		}

		return $r;
	}

	/**
	 * Restore a URL separated by a parse_url() call.
	 *
	 * For pseudo URLs only the scheme and 'pseudo_args' are emitted.
	 *
	 * @param $parsed_url array An array as returned by InputFilter::parse_url().
	 *   Missing keys are treated as empty / false.
	 * @return string|false The URL, or false if $parsed_url is not an array.
	 */
	public static function glue_url( $parsed_url )
	{
		if ( ! is_array( $parsed_url ) ) {
			return false;
		}

		// fill in whatever the caller left out so no key access below can fail
		$parsed_url = array_merge(
			array(
				'scheme' => '',
				'host' => '',
				'port' => '',
				'user' => '',
				'pass' => '',
				'path' => '',
				'query' => '',
				'fragment' => '',
				'is_relative' => false,
				'is_pseudo' => false,
				'pseudo_args' => '',
			),
			$parsed_url
		);
		$scheme = (string) $parsed_url['scheme'];

		$res = $scheme;
		if ( $parsed_url['is_pseudo'] || in_array( strtolower( $scheme ), array( 'mailto', 'callto' ), true ) ) {
			$res .= ':';
		}
		elseif ( ! $parsed_url['is_relative'] ) {
			// no scheme but a host: protocol-relative URL (//host/path)
			$res .= ( $scheme === '' ) ? '//' : '://';
		}

		if ( $parsed_url['is_pseudo'] ) {
			return $res . $parsed_url['pseudo_args'];
		}

		// Components are compared against '' rather than tested for
		// truthiness so that a component of "0" is not dropped.

		// user[:pass]@
		if ( (string) $parsed_url['user'] !== '' ) {
			$res .= $parsed_url['user'];
			if ( (string) $parsed_url['pass'] !== '' ) {
				$res .= ':' . $parsed_url['pass'];
			}
			$res .= '@';
		}
		$res .= $parsed_url['host'];
		if ( !empty( $parsed_url['port'] ) ) {
			// the port is only written out when it is not the default port for this scheme
			// (loose comparison on purpose: the port may be an int or a numeric string)
			$is_default_port = array_key_exists( $scheme, Utils::scheme_ports() ) && Utils::scheme_ports( $scheme ) == $parsed_url['port'];
			if ( ! $is_default_port ) {
				$res .= ':' . $parsed_url['port'];
			}
		}
		if ( (string) $parsed_url['path'] !== '' ) {
			$res .= $parsed_url['path'];
		}
		elseif ( ! $parsed_url['is_relative'] ) {
			// Only absolute URLs get a default path. Adding '/' to a relative
			// reference such as "#top" or "?page=2" would point it elsewhere.
			$res .= '/';
		}
		if ( (string) $parsed_url['query'] !== '' ) {
			$res .= '?' . $parsed_url['query'];
		}
		if ( (string) $parsed_url['fragment'] !== '' ) {
			$res .= '#' . $parsed_url['fragment'];
		}

		return $res;
	}

	/**
	 * Check an attribute value against its whitelist entry.
	 *
	 * @param string $k Attribute name (currently unused).
	 * @param mixed $v Attribute value.
	 * @param array|string $type Either the list of exact values allowed, or
	 *   one of the data types 'uri', 'language-code', 'text', 'datetime'.
	 * @return bool Whether the value is acceptable. An unknown data type
	 *   raises an error through Error::raise() and is treated as not acceptable.
	 */
	private static function check_attr_value( $k, $v, $type )
	{
		if ( is_array( $type ) ) {
			// array of allowed values, exact matches only
			return in_array( $v, $type, true );
		}

		// data type
		switch ( $type ) {
			case 'uri':
				// RfC 2396 <http://www.ietf.org/rfc/rfc2396.txt>
				$bits = self::parse_url( $v );
				if ( $bits['is_error'] ) {
					return false;
				}
				return $bits['is_relative'] || in_array( $bits['scheme'], self::$whitelist_protocols, true );
			case 'language-code':
				// RfC 1766 <http://www.ietf.org/rfc/rfc1766.txt>
				//    Language-Tag = Primary-tag *( "-" Subtag )
				//    Primary-tag = 1*8ALPHA
				//    Subtag = 1*8ALPHA
				// (D modifier: '$' must not accept a trailing newline)
				return is_string( $v ) && preg_match( '/^[a-zA-Z]{1,8}(?:-[a-zA-Z]{1,8})*$/D', $v ) === 1;
			case 'text':
				// XXX is this sufficient?
				return is_string( $v );
			case 'datetime':
				// <http://www.w3.org/TR/1998/NOTE-datetime-19980827>
				// <http://www.w3.org/TR/html4/types.html#h-6.11>
				//    YYYY-MM-DDThh:mm:ssTZD
				return is_string( $v ) && preg_match( '/^[0-9]{4}-[0-1][0-9]-[0-3][0-9]T[0-2][0-9]:[0-5][0-9]:[0-5][0-9](?:Z|[\+-][0-2][0-9]:[0-5][0-9])$/D', $v ) === 1;
			default:
				Error::raise( sprintf( _t( 'Unkown attribute type "%s" in %s' ), $type, __CLASS__ ) );
				return false;
		}
	}

	/**
	 * Whether an attribute may stay on an element: it has to be whitelisted
	 * either for all elements ('*') or for this element, and its value has
	 * to pass check_attr_value() for that whitelist entry.
	 *
	 * @param string $element Lower-case element name.
	 * @param string $attr Lower-case attribute name.
	 * @param mixed $value Attribute value.
	 * @return bool
	 */
	private static function is_allowed_attribute( $element, $attr, $value )
	{
		foreach ( array( '*', $element ) as $scope ) {
			if ( isset( self::$whitelist_attributes[$scope][$attr] ) && self::check_attr_value( $attr, $value, self::$whitelist_attributes[$scope][$attr] ) ) {
				return true;
			}
		}

		return false;
	}

	/**
	 * Replace a token by a text token holding the token's string form.
	 *
	 * @param array $node A token as produced by HTMLTokenizer.
	 * @return array The replacement text token.
	 */
	private static function token_as_text( $node )
	{
		return array(
			'type' => HTMLTokenizer::NODE_TYPE_TEXT,
			'name' => '#text',
			'value' => HTMLTokenSet::token_to_string( $node ),
			'attrs' => array(),
		);
	}

	/**
	 * Filter markup down to the whitelisted elements and attributes.
	 *
	 * Elements that are not whitelisted are turned into text tokens,
	 * attributes that are not whitelisted or fail validation are removed,
	 * and processing instructions, comments, CDATA sections and statements
	 * are dropped.
	 *
	 * @todo TODO must build DOM to really properly remove offending elements
	 * @todo TODO properly filter URLs
	 *
	 * @param string $str Input string.
	 * @return string Filtered output string.
	 */
	public static function filter_html_elements( $str )
	{
		$tokenizer = new HTMLTokenizer( $str );

		// tokenize, baby
		$tokens = $tokenizer->parse();

		// filter token stream
		$filtered = new HTMLTokenSet;
		// names of the non-whitelisted elements that are currently open
		$stack = array();
		foreach ( $tokens as $node ) {
			switch ( $node['type'] ) {
				case HTMLTokenizer::NODE_TYPE_TEXT:
					$node['value'] = html_entity_decode( $node['value'], ENT_QUOTES, MultiByte::hab_encoding() );
					break;
				case HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN:
				case HTMLTokenizer::NODE_TYPE_ELEMENT_EMPTY:
					$name = strtolower( $node['name'] );
					// is this element allowed at all?
					if ( ! in_array( $name, self::$whitelist_elements, true ) ) {
						if ( ! in_array( $name, self::$elements_empty, true ) ) {
							array_push( $stack, $node['name'] );
						}
						// convert the node to text
						$node = self::token_as_text( $node );
					}
					else {
						// check attributes: anything that is not in one of the
						// whitelists or fails its check is removed
						foreach ( $node['attrs'] as $k => $v ) {
							if ( ! self::is_allowed_attribute( $name, strtolower( $k ), $v ) ) {
								unset( $node['attrs'][$k] );
							}
						}
					}
					break;
				case HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE:
					$name = strtolower( $node['name'] );
					if ( ! in_array( $name, self::$whitelist_elements, true ) ) {
						$open = array_pop( $stack );
						// array_pop() yields null on an empty stack; never put that back
						if ( $open !== null && strtolower( $open ) !== $name ) {
							// something weird happened (Luke, use the DOM!)
							array_push( $stack, $open );
						}
						// convert the node to text
						$node = self::token_as_text( $node );
					}
					break;
				case HTMLTokenizer::NODE_TYPE_PI:
				case HTMLTokenizer::NODE_TYPE_COMMENT:
				case HTMLTokenizer::NODE_TYPE_CDATA_SECTION:
				case HTMLTokenizer::NODE_TYPE_STATEMENT:
				default:
					$node = null;
					break;
			}

			if ( $node !== null ) {
				$filtered[] = $node;
			}
		}

		// rebuild our output string, dropping elements that ended up empty (<x ...></x>)
		$html = (string) $filtered;
		$collapsed = preg_replace( '#<([^>\s]+)(?:\s+[^>]+)?></\1>#u', '', $html );

		// preg_replace() returns null on failure (e.g. input that is not valid
		// UTF-8); dropping empty elements is cosmetic, so keep the filtered markup
		return ( $collapsed === null ) ? $html : $collapsed;
	}
}

?>