parser.php | searchcode

/includes/parser.php

https://github.com/francinebo/glype
PHP | 1029 lines | 534 code | 220 blank | 275 comment | 60 complexity | 14a5d9c40ca5937821a62de5d2f29b3b MD5 | raw file

<?php
/*******************************************************************
* Glype is copyright and trademark 2007-2012 UpsideOut, Inc. d/b/a Glype
* and/or its licensors, successors and assigners. All rights reserved.
*
* Use of Glype is subject to the terms of the Software License Agreement.
* http://www.glype.com/license.php
*******************************************************************
* This is the parser for the proxy - changes the original 'raw'
* document so that everything (images, links, etc.) is rerouted to
* be downloaded via the proxy script instead of directly.
******************************************************************/

class parser {

	# State of javascript parser - null for parse everything, false
	# for parse all non-standard overrides, or (array) with specifics
	private $jsFlagState;

	# Browsing options (Remove Scripts, etc.)
	private $htmlOptions;

	# Constructor accepts options and saves them in the object.
	#	 $htmlOptions - array of browsing options. The keys read by this class
	#	                are 'stripTitle', 'stripJS', 'stripObjects' and 'encodePage'.
	#	 $jsFlags     - null, false or an array of flag names (see JS() below)
	function __construct($htmlOptions, $jsFlags) {
		$this->jsFlagState = $jsFlags;
		$this->htmlOptions = $htmlOptions;
	}


	/*****************************************************************
	* HTML parsers - main parsing function splits up document into
	* component parts ('normal' HTML, scripts and styles)
	******************************************************************/

	# Parse a complete HTML document.
	#	 $input  - the raw document
	#	 $insert - markup to place straight after the opening <body> tag
	#	           (pass false to skip this step entirely)
	#	 $inject - if true, the output of injectionJS() is placed after <head>
	#	 $footer - markup to place just before the closing </body> tag
	# Returns the rewritten document.
	function HTMLDocument($input, $insert='', $inject=false, $footer='') {

		#
		# Apply parsing that only needs to be done once..
		#

		# Remove titles if option is enabled
		if ( $this->htmlOptions['stripTitle'] ) {
			$input = preg_replace('#<title.*?</title>#is', '', $input, 1);
			$input = preg_replace('#<meta[^>]*name=["\']title["\'][^>]*>#is', '', $input, 1);
		}

		# Remove and record a <base> href
		$input = preg_replace_callback('#<base href\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)[^>]*>#i', 'html_stripBase', $input, 1);

		# Proxy url= values in meta redirects
		$input = preg_replace_callback('#content\s*=\s*(["\\\'])?[0-9]+\s*;\s*url=([\\\'"]|&\#39;)?((?(?<=")[^"]+|(?(?<=\\\')[^\\\']+|[^\\\'" >]+)))(?(2)\\2|)(?(1)\\1|)#i', 'html_metaRefresh', $input, 1);

		# Process forms
		$input = preg_replace_callback('#<form([^>]*)>(.*?)</form>#is', 'html_form', $input);

		# Remove scripts blocks (avoids individual processing below)
		if ( $this->htmlOptions['stripJS'] ) {
			$input = preg_replace('#<script[^>]*>.*?</script>#is', '', $input);
		}


		#
		# Split up the document into its different types and parse them
		#

		# Build up new document into this var
		$new	  = '';
		$offset = 0;

		# Find instances of script or style blocks
		while ( preg_match('#<(s(?:cript|tyle))[^>]*>#i', $input, $match, PREG_OFFSET_CAPTURE, $offset) ) {

			# What type of block is this?
			$block = strtolower($match[1][0]);

			# Start position of content
			$outerStart = $match[0][1];
			$innerStart = $outerStart + strlen($match[0][0]);

			# Find the position of the matching end tag. If there is none, fall
			# back to the next "</" of any kind, and if even that is missing
			# drop the unterminated script and stop splitting.
			$innerEnd = stripos($input, "</$block>", $innerStart);
			if ( $innerEnd === false ) {
				$innerEnd = stripos($input, '</', $innerStart);
				if ( $innerEnd === false ) {
					$input = preg_replace('#<script[^>]*>.*?$#is', '', $input);
					break;
				}
			}

			# Parse everything up till here (including the opening tag of the
			# block) and add to the new document
			$new .= $this->HTML(substr($input, $offset, $innerStart - $offset));

			# Find parsing function
			$parseFunction = $block == 'style' ? 'CSS' : 'JS' ;

			# Add the parsed block
			$new .= $this->$parseFunction(substr($input, $innerStart, $innerEnd - $innerStart));

			# Move offset to new position. The end tag itself is picked up
			# as part of the next HTML chunk.
			$offset = $innerEnd;

		}

		# And add the final chunk (between last script/style block and end of doc)
		$new .= $this->HTML(substr($input, $offset));

		# Replace input with the updated document
		$input = $new;

		# Encode the page
		if ( $this->htmlOptions['encodePage'] ) {
			$input = encodePage($input);
		}

		#
		# Now add our own code bits. These are spliced in by position rather
		# than through a preg_replace() replacement string, so that "$1" or
		# "\1" inside the inserted markup/javascript is kept literally instead
		# of being expanded as a backreference.
		#

		# Insert our mini form after the <body>
		if ( $insert !== false ) {

			# Check for a frameset (explicit boolean: a frameset found at
			# offset 0 must still count as a frameset)
			$hasFrameset = stripos($input, '<frameset') !== false;

			if ( $hasFrameset ) {

				# Flag the frames so only first displays mini-form
				$input = preg_replace_callback('#<frame[^>]+src\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)#i', 'html_flagFrames', $input);

			}

			# Attempt to add after body
			$withForm = $this->spliceAtTag($input, '#<body[^>]*>#i', $insert, true);

			if ( $withForm !== false ) {
				$input = $withForm;
			} elseif ( ! $hasFrameset ) {
				# No <body> and not a frameset: just prepend
				$input = $insert . $input;
			}

		}

		# Insert our javascript library
		if ( $inject ) {

			# Generate javascript to insert
			$inject = injectionJS();

			# Add our proxy javascript after <head>, or prepend if no <head>
			$withLibrary = $this->spliceAtTag($input, '#<head[^>]*>#i', $inject, true);
			$input = $withLibrary === false ? $inject . $input : $withLibrary;

		}

		# Add anything to the footer?
		if ( $footer ) {

			# Add before </body>, or append if no </body>
			$withFooter = $this->spliceAtTag($input, '#</body[^>]*>#i', $footer, false);
			$input = $withFooter === false ? $input . $footer : $withFooter;

		}

		# Return new document
		return $input;

	}

	# Insert $addition next to the first match of $pattern in $document.
	# If $after is true the text goes straight after the matched tag,
	# otherwise straight before it. The addition is inserted verbatim.
	# Returns the new document, or false if the pattern did not match.
	private function spliceAtTag($document, $pattern, $addition, $after) {

		if ( ! preg_match($pattern, $document, $tag, PREG_OFFSET_CAPTURE) ) {
			return false;
		}

		$position = $tag[0][1] + ( $after ? strlen($tag[0][0]) : 0 );

		return substr_replace($document, $addition, $position, 0);

	}

	# Parse HTML sections
	function HTML($input) {

		# Removing objects? Follow spec and display inner content of object tags instead.
		if ( $this->htmlOptions['stripObjects'] ) {

			# Remove all object tags (including those deprecated but still common)
			$input = preg_replace('#<(?>object|applet|param|embed)[^>]*>#i', '', $input, -1, $tmp);

			# Found any? Remove the corresponding end tags (at most as many
			# end tags as opening tags were removed)
			if ( $tmp ) {
				$input = preg_replace('#</(?>object|applet|param|embed)>#i', '', $input, $tmp);
			}

		} else {

			# Parse <param name="movie" value="URL"> tags
			$input = preg_replace_callback('#<param[^>]+value\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)[^>]*>#i', 'html_paramValue', $input);

			# To do: proxy object related URLs

		}

		# Show content within <noscript> tags
		# (preg_ seems to be faster than 2 str_ireplace() calls)
		if ( $this->htmlOptions['stripJS'] ) {
			$input = preg_replace('#</?noscript>#i', '', $input);
		}

		# Parse onX events
		$input = preg_replace_callback('#\b(on(?<!\.on)[a-z]{2,20})\s*=\s*([\\\'"])?((?(2)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(2)\\2|)#i', array($this, 'html_eventJS'), $input);

		# Parse style attributes
		$input = preg_replace_callback('#style\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)#i', array($this, 'html_elementCSS'), $input);

		# Proxy URL attributes - this is the bottleneck but optimized
		# as much as possible (or at least, as much as I can).
		$input = preg_replace_callback('#(?><[A-Z][A-Z0-9]{0,15})(?>\s+[^>\s]+)*?\s*(?>(href|src|background)\s*=(?!\\\\)\s*)(?>([\\\'"])?)((?(2)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^ >]{1,1000}))(?(2)\\2|)#i', 'html_attribute', $input);

		# Return changed input
		return $input;

	}

	# Proxy an onX javascript event.
	# $input is the match array: [1] attribute name, [2] quote char (may be
	# empty), [3] the javascript. The whole attribute is removed when
	# stripping javascript.
	function html_eventJS($input) {
		return $this->htmlOptions['stripJS'] ? '' : $input[1] . '=' . $input[2] . $this->JS($input[3]) . $input[2];
	}

	# Proxy a style="CSS" attribute.
	# $input is the match array: [1] quote char (may be empty), [2] the CSS.
	function html_elementCSS($input) {
		return 'style=' . $input[1] . $this->CSS($input[2]) . $input[1];
	}


	/*****************************************************************
	* CSS parser - main parsing function
	* CSS parsing is complicated by the caching of CSS files. We need
	* to consider (A) cross-domain caching and (B) the unique URLs option.
	*	 A) If possible, use a relative URL so the saved URLs do not explicitly
	*		 point to a single domain.
	*	 B) There is a second set of callback functions with "_unique" suffixed
	*		 and these return the original URL to be reparsed.
	******************************************************************/

	# The URLs depend on the unique and path info settings. The $storeUnique
	# parameter allows us to select the "_unique" callbacks.
	function CSS($input, $storeUnique=false) {

		# What type of parsing is this? Normally we parse any URLs to redirect
		# back through the proxy but not when storing a cache with unique URLs.
		$type = $storeUnique ? '_unique' : '';

		# CSS needs proxying the calls to url(), @import and src=''
		$input = preg_replace_callback('#\burl\s*\(\s*[\\\'"]?([^\\\'"\)]+)[\\\'"]?\s*\)#i', 'css_URL' . $type, $input);
		$input = preg_replace_callback('#@import\s*[\\\'"]([^\\\'"\(\)]+)[\\\'"]#i', 'css_import' . $type, $input);
		$input = preg_replace_callback('#\bsrc\s*=\s*([\\\'"])?([^)\\\'"]+)(?(1)\\1|)#i', 'css_src' . $type, $input);

		# Return changed
		return $input;

	}


	/*****************************************************************
	* Javascript parser - main parsing function
	*
	* The specific parts that need proxying depends on which javascript
	* functions we've been able to override. On first page load, the browser
	* capabilities are tested to see what we can do client-side and the results
	* sent back to us. This allows us to parse only what we have to.
	* If $CONFIG['override_javascript'] is disabled, all commands are parsed
	* server-side. This will use much more CPU!
	*
	* Commands to proxy only if no override at all:
	*	 document.write()
	*	 document.writeln()
	*	 window.open()
	*	 eval()
	*
	* Commands to proxy, regardless of browser capabilities:
	*	 location.replace()
	*	 .innerHTML=
	*
	* Commands to proxy if the extra "watch" flag is set
	* (the browser doesn't support the .watch() method):
	*	 location=
	*	 x.location=
	*	 location.href=
	*
	* Commands to proxy if the extra "setters" flag is set
	* (the browser doesn't support the __defineSetter__() method):
	*	 .src=
	*	 .href=
	*	 .background=
	*	 .action=
	*
	* Commands to proxy if the extra "ajax" flag is set
	* (the browser failed to override the .open() method):
	*	 XMLHttpRequest.open()
	******************************************************************/

	function JS($input) {

		# Stripping?
		if ( $this->htmlOptions['stripJS'] ) {
			return '';
		}

		# Get our flags
		$flags = $this->jsFlagState;

		# Unless we know we don't need to, apply all the browser-specific flags
		if ( ! is_array($this->jsFlagState) ) {
			$flags = array('ajax', 'watch', 'setters');
		}

		# If override is disabled, add a "base" flag
		if ( $this->jsFlagState === null ) {
			$flags[] = 'base';
		}

		# Start parsing!
		$search = array();

		# Create shortcuts to various search patterns:
		#	  "before"	  - matches preceding character (string of single char) [ignoring whitespace]
		#	  "after"	  - matches next character (string of single char) [ignoring whitespace]
		#	  "id"		  - key for identifying the original match (e.g. if we have >1 of the same key)
		$assignmentPattern	= array('before' => '.', 'after' => '=');
		$methodPattern			= array('before' => '.', 'after' => '(');
		$functionPattern		= array('after' => '(');

		# Configure strings to search for, starting with always replaced commands
		$search['innerHTML'][] = $assignmentPattern;
		$search['location'][]  = array('after' => '.', 'id' => 'replace()');
			# ^ This is only for location.replace() - other forms are handled later

		# Look for attribute assignments
		if ( in_array('setters', $flags, true) ) {
			$search['src'][]			= $assignmentPattern;
			$search['href'][]			= $assignmentPattern;
			$search['action'][]		= $assignmentPattern;
			$search['background'][] = $assignmentPattern;
		}

		# Look for location changes
		# location.href will be handled above, location= is handled here
		if ( in_array('watch', $flags, true) ) {
			$search['location'][] = array('after' => '=', 'id' => 'assignment');
		}

		# Look for .open() if either AJAX (XMLHttpRequest.open) or
		# base (window.open) flags are present
		if ( in_array('ajax', $flags, true) || in_array('base', $flags, true) ) {
			$search['open'][] = $methodPattern;
		}

		# Add the basic code if no override
		if ( in_array('base', $flags, true) ) {
			$search['eval'][]		= $functionPattern;
			$search['writeln'][]	  = $methodPattern;
			$search['write'][]	= $methodPattern;
		}

		# Set up starting parameters. $length tracks strlen($input) as
		# replacements are made, so it must grow by exactly the number of
		# characters each replacement adds.
		$offset			= 0;
		$length			= strlen($input);
		$searchStrings = array_keys($search);

		while ( $offset < $length ) {

			# Start off by assuming no more items (i.e. the next position
			# of interest is the end of the document)
			$commandPos = $length;

			# Find the search subject that occurs first after the offset
			foreach ( $searchStrings as $item ) {

				$tmp = strpos($input, $item, $offset);

				if ( $tmp !== false && $tmp < $commandPos ) {
					$commandPos = $tmp;
					$command = $item;
				}

			}

			# No matches found? Finish parsing.
			if ( $commandPos == $length ) {
				break;
			}

			# We've found the main point of interest; now use the
			# search parameters to check the surrounding chars to validate
			# the match.
			$valid = false;

			foreach ( $search[$command] as $pattern ) {

				# Check the preceding chars
				if ( isset($pattern['before']) && str_checkprev($input, $pattern['before'], $commandPos-1) === false ) {
					continue;
				}

				# Check next chars
				if ( isset($pattern['after']) && ( $postCharPos = str_checknext($input, $pattern['after'], $commandPos + strlen($command), false, true) ) === false ) {
					continue;
				}

				# Still here? Match must be OK so generate a match ID
				if ( isset($pattern['id']) ) {
					$valid = $command . $pattern['id'];
				} else {
					$valid = $command;
				}

				break;

			}

			# What we do next depends on which match (if any) we've found...
			switch ( $valid ) {

				# Assignment
				case 'src':
				case 'href':
				case 'background':
				case 'action':
				case 'locationassignment':
				case 'innerHTML':

					# Check our post-char position for = as well (could be equality
					# test rather than assignment, i.e. == )
					if ( ! isset($input[$postCharPos]) || $input[$postCharPos] == '=' ) {
						break;
					}

					# Find the end of this statement
					$endPos = analyze_js($input, $postCharPos);
					$valueLength = $endPos - $postCharPos;

					# Produce replacement command
					$replacement = sprintf('parse%s(%s)', $command=='innerHTML' ? 'HTML' : 'URL', substr($input, $postCharPos, $valueLength));

					# Number of characters the wrapper adds to the document
					$growth = strlen($replacement) - $valueLength;

					# Make the replacement
					$input = substr_replace($input, $replacement, $postCharPos, $valueLength);

					# Adjust total document length and move offset past the replacement
					$length += $growth;
					$offset = $endPos + $growth;

					# Go get next match
					continue 2;


				# Function calls - we don't know for certain if these are in fact members of the
				# appropriate objects (window/XMLHttpRequest for .open(), document for .write() and
				# .writeln) so we won't change anything. Main.js still overrides these functions but
				# does nothing with them by default. We add an extra parameter to tell our override
				# to kick in.
				case 'open':
				case 'write':
				case 'writeln':

					# Find the end position (the closing ")" for the function call)
					$endPos = analyze_js($input, $postCharPos);

					# An empty argument list, e.g. dialog.open(), has nothing to
					# proxy and appending ',"gl"' would produce the syntax error
					# open(,"gl") - leave such calls untouched.
					if ( $endPos == $postCharPos ) {
						break;
					}

					# Insert our additional argument just before that
					$input = substr_replace($input, ',"gl"', $endPos, 0);

					# Adjust the document length
					$length += 5;

					# And move the offset
					$offset = $endPos + 5;

					# Get next match
					continue 2;


				# Eval() is just as easy since we can just wrap the entire thing in parseJS().
				case 'eval':

					# Ensure this is a call to eval(), not anotherfunctionendingineval()
					if ( isset($input[$commandPos-1]) && strpos('abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_$', $input[$commandPos-1]) !== false ) {
						break;
					}

					# Find the end position (the closing ")" for the function call)
					$endPos = analyze_js($input, $postCharPos);
					$valueLength = $endPos - $postCharPos;

					# Generate our replacement
					$replacement = sprintf('parseJS(%s)', substr($input, $postCharPos, $valueLength));
					$growth = strlen($replacement) - $valueLength;

					# Make the replacement
					$input = substr_replace($input, $replacement, $postCharPos, $valueLength);

					# Adjust the document length and move the offset
					$length += $growth;
					$offset = $endPos + $growth;
					continue 2;


				# location.replace() is a tricky one. We have the position of the char
				# after . as $postCharPos and need to ensure we're calling replace(),
				# then parse the entire URL
				case 'locationreplace()':

					# Validate the match
					if ( ! preg_match('#\Greplace\s*\(#', $input, $tmp, 0, $postCharPos) ) {
						break;
					}

					# Move $postCharPos to inside the brackets of .replace()
					$postCharPos += strlen($tmp[0]);

					# Find the end position (the closing ")" for the function call)
					$endPos = analyze_js($input, $postCharPos);
					$valueLength = $endPos - $postCharPos;

					# Generate our replacement
					$replacement = sprintf('parseURL(%s)', substr($input, $postCharPos, $valueLength));
					$growth = strlen($replacement) - $valueLength;

					# Make the replacement
					$input = substr_replace($input, $replacement, $postCharPos, $valueLength);

					# Adjust the document length and move the offset
					$length += $growth;
					$offset = $endPos + $growth;

					continue 2;

			}

			# Still here? A match didn't validate so adjust offset to just after
			# current position
			$offset = $commandPos + 1;

		}

		# Ignore document.domain
		$input = str_replace('document.domain', 'ignore', $input);

		# Return changed
		return $input;

	}

}


/*****************************************************************
* HTML callbacks
******************************************************************/

# Callback for the <base href> pattern: remembers the href (match group 2)
# in the global $base and returns an empty string so the tag is removed.
function html_stripBase($input) {
	$GLOBALS['base'] = $input[2];
	return '';
}

# Proxy the location of a meta refresh.
# $input is the match array from the content="N;url=..." pattern:
#	 [0] whole match, [1] quote around the content value (may be empty),
#	 [2] quote around the URL (may be empty), [3] the URL itself.
# Returns the match with only the URL replaced by its proxied form.
function html_metaRefresh($input) {

	$match	  = $input[0];
	$url		  = $input[3];
	$urlLength = strlen($url);

	# The pattern ends with URL + closing URL quote + closing content quote,
	# so the URL position can be calculated from the end of the match. This
	# avoids touching other text that happens to equal the URL (e.g. the
	# delay in content="5;url=5").
	$urlStart = strlen($match) - strlen($input[2]) - strlen($input[1]) - $urlLength;

	if ( $urlStart >= 0 && substr($match, $urlStart, $urlLength) === $url ) {
		return substr_replace($match, proxyURL($url), $urlStart, $urlLength);
	}

	# Unexpected match layout - fall back to a plain replace
	return str_replace($url, proxyURL($url), $match);
}

# Callback for <param ... value="URL"> tags. Tags that do not mention
# "movie" anywhere are returned untouched; otherwise every occurrence of
# the value (match group 2) in the tag is swapped for its proxied URL.
function html_paramValue($input) {

	list($tag, , $value) = $input;

	return stripos($tag, 'movie') === false
		? $tag
		: str_replace($value, proxyURL($value), $tag);
}

# Callback for <form ...>...</form> blocks. $input[1] holds the attributes
# of the opening tag and $input[2] the form contents.
# The query string is used by the proxy script and GET data needs to be
# encoded anyway, so every GET form is converted to POST and marked with a
# hidden "convertGET" field; the proxy script will forward it properly.
function html_form($input) {

	$attributes = $input[1];
	$contents	= $input[2];
	$hiddenFlag = '';

	if ( ! preg_match('#\bmethod\s*=\s*["\\\']?(get|post)["\\\']?#i', $attributes, $method) ) {

		# No method given and GET is the default, so append a POST method
		$attributes .= ' method="post"';
		$hiddenFlag	 = '<input type="hidden" name="convertGET" value="1">';

	} elseif ( strcasecmp($method[1], 'post') !== 0 ) {

		# Explicit GET: convert to post and flag as a conversion
		$attributes = str_replace($method[0], 'method="post"', $attributes);
		$hiddenFlag = '<input type="hidden" name="convertGET" value="1">';

	}

	# To do: javascript onsubmit event to immediately redirect to the appropriate
	# location using GET data, without an intermediate POST to the proxy script.

	# Proxy the form action
	$attributes = preg_replace_callback('#\baction\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)#i', 'html_formAction', $attributes);

	# Due to register_globals support, PHP converts a number of characters
	# to _ in incoming variable names. The raw post data from php://input
	# gets around this but is not available for multipart forms, so the
	# input names in those forms are encoded instead.
	$isMultipart = stripos($attributes, 'multipart/form-data');

	if ( $isMultipart ) {
		$contents = preg_replace_callback('#name\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)#i', 'html_inputName', $contents);
	}

	# Return updated form
	return '<form' . $attributes . '>' . $hiddenFlag . $contents . '</form>';

}

# Callback for action="URL" in a form tag: rebuilds the attribute with the
# same quote character (match group 1) around the proxied URL (group 2).
function html_formAction($input) {
	$quote = $input[1];
	return sprintf('action=%s%s%s', $quote, proxyURL($input[2]), $quote);
}

# Callback for name="..." attributes inside multipart forms: rebuilds the
# attribute with the same quote character (match group 1) around the
# encoded input name (group 2).
function html_inputName($input) {
	$quote = $input[1];
	return sprintf('name=%s%s%s', $quote, inputEncode($input[2]), $quote);
}

# Proxy URL values in attributes.
# $input is the match array from the href/src/background pattern:
#	 [0] the tag from "<" up to and including the attribute value,
#	 [1] the attribute name, [2] the quote char (may be empty), [3] the URL.
# Returns the match with only that attribute value replaced.
function html_attribute($input) {

	$tag	 = $input[0];
	$quote = $input[2];
	$url	 = $input[3];

	# Is this an iframe?
	$flag = stripos($tag, 'iframe') === 1 ? 'frame' : '';

	# Do not proxy magnet links
	if ( stripos($url, 'magnet:?') === 0 ) {
		return $tag;
	}

	# URL occurred as value of an attribute and should have been htmlspecialchar()ed
	# We need to do the job of the browser and decode before proxying.
	$proxied = htmlspecialchars(proxyURL(htmlspecialchars_decode($url), $flag));

	# The match always ends with URL + closing quote, so replace the URL at
	# that position only. A plain str_replace() would also rewrite earlier
	# attributes (or the tag name) that contain the same text, e.g.
	# <a id="home" href="home">.
	$urlLength = strlen($url);
	$urlStart  = strlen($tag) - strlen($quote) - $urlLength;

	if ( $urlStart >= 0 && substr($tag, $urlStart, $urlLength) === $url ) {
		return substr_replace($tag, $proxied, $urlStart, $urlLength);
	}

	# Unexpected match layout - fall back to a plain replace
	return str_replace($url, $proxied, $tag);
}

# Callback for <frame src="URL"> tags in a frameset. The first frame seen
# (across all calls, tracked with a static variable) is returned unchanged
# so that it alone shows the mini-form; every later frame gets a frame flag
# appended to its URL (match group 2).
# This could be done in the URL attribute callback but that adds extra
# processing when 99% of the time, it won't be needed.
function html_flagFrames($input) {

	static $addFlag;

	if ( isset($addFlag) ) {

		# Not the first frame: add the frame flag in query string or path form
		$suffix = strpos($input[2], '?') ? '&amp;f=frame' : 'fframe/';

		return str_replace($input[2], $input[2] . $suffix, $input[0]);

	}

	# First frame: leave it but remember that we have seen one
	$addFlag = true;

	return $input[0];

}


/*****************************************************************
* CSS callbacks
******************************************************************/

# Callback for CSS url(LOCATION): trims the captured location (match
# group 1) and returns an unquoted url() pointing at the proxied address.
function css_URL($input) {
	$location = trim($input[1]);
	return sprintf('url(%s)', proxyURL($location));
}

# Callback for CSS @import "URL": returns the import with the proxied
# address (from match group 1), always wrapped in double quotes.
function css_import($input) {
	return sprintf('@import "%s"', proxyURL($input[1]));
}

# Callback for CSS src=URL: keeps the original quote character (match
# group 1, possibly empty) and proxies the address (group 2).
function css_src($input) {
	$quote = $input[1];
	return sprintf('src=%s%s%s', $quote, proxyURL($input[2]), $quote);
}

# Callbacks for use with unique URLs and cached CSS
# The <UNIQUE[]URL> acts as a marker for quick and easy processing later

# Unique-URL variant of the CSS url(LOCATION) callback: wraps the absolute
# form of the location (match group 1) in a <UNIQUE[...]URL> marker.
function css_URL_unique($input) {
	$absolute = absoluteURL($input[1],'');
	return sprintf('url(<UNIQUE[%s]URL>)', $absolute);
}

# Unique-URL variant of the CSS @import callback: wraps the absolute form
# of the address (match group 1) in a <UNIQUE[...]URL> marker.
function css_import_unique($input) {
	$absolute = absoluteURL($input[1]);
	return sprintf('@import "<UNIQUE[%s]URL>"', $absolute);
}

# Unique-URL variant of the CSS src= callback: keeps the original quote
# character (match group 1) and wraps the absolute form of the address
# (group 2) in a <UNIQUE[...]URL> marker.
function css_src_unique($input) {
	$quote = $input[1];
	return sprintf('src=%s<UNIQUE[%s]URL>%s', $quote, absoluteURL($input[2]), $quote);
}


/*****************************************************************
* Helper functions
******************************************************************/

# Starting at $offset, skip whitespace (space, tab, CR, LF) in $input and
# look at the first other character found:
#	 - normal mode: return its position if it equals $char, else false
#	 - $inverse:    return its position if it does NOT equal $char, else false
#	 - $pastChar (normal mode only): instead of the position of $char,
#	   return the position just after $char and any whitespace following it
# Returns false if the end of the string is reached first.
function str_checknext($input, $char, $offset, $inverse = false, $pastChar = false) {

	$whitespace = " \t\r\n";
	$limit		= strlen($input);

	for ( $pos = $offset; $pos < $limit; $pos++ ) {

		$current = $input[$pos];

		# Whitespace is ignored
		if ( strpos($whitespace, $current) !== false ) {
			continue;
		}

		$isWanted = ( $current == $char );

		# Inverse mode: anything except $char is a hit
		if ( $inverse ) {
			return $isWanted ? false : $pos;
		}

		# Normal mode: anything except $char is a miss
		if ( ! $isWanted ) {
			return false;
		}

		# Found $char - report its own position...
		if ( ! $pastChar ) {
			return $pos;
		}

		# ...or the position after it and any trailing whitespace
		$next = $pos + 1;
		return $next + strspn($input, $whitespace, $next);

	}

	return false;

}


# Backwards counterpart of str_checknext(): starting at $offset and moving
# towards the start of $input, skip whitespace (space, tab, CR, LF) and
# look at the first other character found:
#	 - normal mode: return its position if it equals $char, else false
#	 - $inverse:    return its position if it does NOT equal $char, else false
# If the start of the string is passed without finding a non-whitespace
# character, $inverse itself is returned (false in normal mode, true in
# inverse mode). Callers should compare the result with === false.
function str_checkprev($input, $char, $offset, $inverse = false) {

	# Include index 0 - the first character of the string is a valid
	# candidate too
	for ( $i = $offset; $i >= 0; --$i ) {

		# Examine char
		switch ( $input[$i] ) {

			# Ignore whitespace
			case ' ':
			case "\t":
			case "\r":
			case "\n":
				break;

			# Found char
			case $char:
				return $inverse ? false : $i;

			# Found non-$char char
			default:
				return $inverse ? $i : false;

		}

	}

	return $inverse;

}


# Analyze javascript and return offset positions.
# Default is to find the end of the statement that starts at $start,
# indicated by:
#	 (1) ; while not in string
#	 (2) newline which, if not there, would create invalid syntax
#	 (3) a closing bracket (object, language construct or function call) for which
#		  no corresponding opening bracket was detected AFTER the passed offset
# If no end is found, the length of $input is used.
# If (int) $argPos is true, we return an array of the start and end position
# for the nth argument, where n = $argPos. The $start position must be just inside
# the parenthesis of the function call we're interested in.
# Otherwise the end position alone is returned.
function analyze_js($input, $start, $argPos = false) {

	# Set chars we're interested in
	$specialChars = ";\n\r\"'+{}()[]";

	# Add , if looking for an argument position
	if ( $argPos ) {
		$specialChars .= ',';
		$currentArg = 1;
	}

	$length = strlen($input);
	$end	  = false;
	$openObjects = $openBrackets = $openArrays = 0;

	# Loop through the input, stopping only at special chars
	for ( $i = $start; $end === false; ++$i ) {

		# Jump to the next special char. The position is tested against the
		# length explicitly (not for truthiness) so that a special char at
		# offset 0 does not abort the scan.
		$i += strcspn($input, $specialChars, $i);

		if ( $i >= $length ) {
			break;
		}

		$char = $input[$i];

		switch ( $char ) {

			# Starting string delimiters
			case '"':
			case "'":

				# An escaped quote does not start a string
				if ( $i > 0 && $input[$i-1] == '\\' ) {
					break;
				}

				# Skip straight to end of string
				# Find the corresponding end delimiter and ensure it's not escaped
				while ( ( $i = strpos($input, $char, $i+1) ) && $input[$i-1] == '\\' );

				# Check for false, in which case we assume the end is the end of the doc
				if ( $i === false ) {
					break 2;
				}

				break;

			# End of operation?
			case ';':
				$end = $i;
				break;

			# New lines
			case "\n":
			case "\r":
				# Newlines are OK if occurring within open brackets, arrays or objects.
				if ( $openObjects || $openBrackets || $openArrays || $argPos ) {
					break;
				}

				# Newlines are also OK if followed by an opening function OR concatenation
				# e.g. someFunc\n(params) or someVar \n + anotherVar
				# $tmp is the position of the last whitespace char of this run,
				# so $tmp+1 is the next non-whitespace char
				$tmp = $i + strspn($input, " \t\r\n", $i+1);

				# And compare to allowed chars
				if ( isset($input[$tmp+1]) && ( $input[$tmp+1] == '(' || $input[$tmp+1] == '+' ) ) {
					$i = $tmp;
					break;
				}

				# Newline not indicated as OK, set the end to here
				$end = $i;
				break;

			# Concatenation
			case '+':
				# Our interest in the + operator is its use in allowing an expression
				# to span multiple lines. If we come across a +, move past all whitespace,
				# including newlines (which would otherwise indicate end of expression).
				$i += strspn($input, " \t\r\n", $i+1);
				break;

			# Opening chars (objects, parenthesis and arrays)
			case '{':
				++$openObjects;
				break;
			case '(':
				++$openBrackets;
				break;
			case '[':
				++$openArrays;
				break;

			# Closing chars - is there a corresponding open char?
			# Yes = reduce stored count. No = end of statement.
			case '}':
				$openObjects	? --$openObjects	 : $end = $i;
				break;
			case ')':
				$openBrackets	? --$openBrackets	 : $end = $i;
				break;
			case ']':
				$openArrays		? --$openArrays	 : $end = $i;
				break;

			# Commas - tell us which argument it is
			# (only reached when $argPos is set, as , is not special otherwise)
			case ',':

				# Ignore commas inside other functions or whatnot
				if ( $openObjects || $openBrackets || $openArrays ) {
					break;
				}

				# End now
				if ( $currentArg == $argPos ) {
					$end = $i;
				}

				# Increase the current argument number
				++$currentArg;

				# If we're not after the first arg, start now?
				if ( $currentArg == $argPos ) {
					$start = $i+1;
				}

				break;

		}

	}

	# End not found? Use end of document
	if ( $end === false ) {
		$end = $length;
	}

	# Return array of start/end
	if ( $argPos ) {
		return array($start, $end);
	}

	# Return end
	return $end;

}

/*****************************************************************
* Page encoding functions
******************************************************************/

# Encode page - splits into HTML/script sections and encodes the HTML
# sections with encodeBlock(), leaving <script>...</script> blocks as they are.
# Everything up to and including the <head> tag is also left unencoded,
# as this seems to help browsers cope!
# Returns the encoded page.
function encodePage($input) {

	# No script blocks is easy - just encode the lot
	if ( ! preg_match_all('#<script.*?</script>#is', $input, $scripts, PREG_OFFSET_CAPTURE) ) {
		return encodeBlock($input);
	}

	# Create starting offset - only start encoding after the <head>
	$offset = preg_match('#<head[^>]*>(.)#is', $input, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[1][1] : 0;
	$new	  = $offset ? substr($input, 0, $offset) : '';

	# Go through all the matches. $offset is always the position of the
	# first character that has not been added to $new yet.
	foreach ( $scripts[0] as $match ) {

		$scriptStart = $match[1];
		$scriptEnd	 = $scriptStart + strlen($match[0]);

		# Script lies completely inside what was already copied verbatim
		if ( $scriptEnd <= $offset ) {
			continue;
		}

		if ( $scriptStart > $offset ) {

			# Encode the whole non-script block in front of this script
			# (up to, but not including, the "<" of the script tag)
			$new .= encodeBlock(substr($input, $offset, $scriptStart - $offset));

			# Add unencoded script to page
			$new .= $match[0];

		} else {

			# Script starts at (or before) the offset - add the part of it
			# that has not been copied yet, unencoded
			$new .= substr($input, $offset, $scriptEnd - $offset);

		}

		# Move offset up
		$offset = $scriptEnd;

	}

	# Add final block (compare against '' so that a remainder of "0" is kept)
	$remainder = substr($input, $offset);

	if ( $remainder !== false && $remainder !== '' ) {
		$new .= encodeBlock($remainder);
	}

	# Return the encoded page
	return $new;

}

# Encode block - applies the actual encoding (or rather "escaping").
# Letters, single quotes, newlines and hyphens are replaced by %XX escapes
# and the result is wrapped in a script that writes the unescape()d text
# back into the page.
# A literal % is escaped as well, otherwise sequences such as %20 that are
# already in the page would be decoded by unescape(). A literal backslash
# is escaped too, as it would otherwise act as an escape character inside
# the javascript string literal.
function encodeBlock($input) {

	static $map = null;

	# Build the escape table once. strtr() replaces in a single pass, so
	# the % signs it produces are never escaped a second time.
	if ( $map === null ) {

		$map = array(
			'%'  => '%25',
			'\\' => '%5c',
			'\'' => '%27',
			"\r" => '%0d',
			"\n" => '%0a',
			'-'  => '%2D'
		);

		foreach ( array_merge(range('a', 'z'), range('A', 'Z')) as $letter ) {
			$map[$letter] = '%' . dechex(ord($letter));
		}

	}

	# Return javascript decoder
	return '<script type="text/javascript">document.write(unescape(\'' . strtr($input, $map) . '\'));</script>';

}