PageRenderTime 162ms CodeModel.GetById 80ms app.highlight 39ms RepoModel.GetById 31ms app.codeStats 1ms

/includes/parser.php

https://github.com/francinebo/glype
PHP | 1029 lines | 534 code | 220 blank | 275 comment | 60 complexity | 14a5d9c40ca5937821a62de5d2f29b3b MD5 | raw file
   1<?php
   2/*******************************************************************
   3* Glype is copyright and trademark 2007-2012 UpsideOut, Inc. d/b/a Glype
   4* and/or its licensors, successors and assigners. All rights reserved.
   5*
   6* Use of Glype is subject to the terms of the Software License Agreement.
   7* http://www.glype.com/license.php
   8*******************************************************************
   9* This is the parser for the proxy - changes the original 'raw'
  10* document so that everything (images, links, etc.) is rerouted to
  11* be downloaded via the proxy script instead of directly.
  12******************************************************************/
  13
  14class parser {
  15
  16	# State of javascript parser - null for parse everything, false
  17	# for parse all non-standard overrides, or (array) with specifics
  18	private $jsFlagState;
  19	
  20	# Browsing options (Remove Scripts, etc.)
  21	private $htmlOptions;
  22
  23	# Constructor accepts options and saves them in the object
  24	function __construct($htmlOptions, $jsFlags) {
  25		$this->jsFlagState = $jsFlags;
  26		$this->htmlOptions = $htmlOptions;
  27	}
  28
  29	
  30	/*****************************************************************
  31	* HTML parsers - main parsing function splits up document into
  32	* component parts ('normal' HTML, scripts and styles)
  33	******************************************************************/
  34	
  35	function HTMLDocument($input, $insert='', $inject=false, $footer='') {
  36
  37		#
  38		# Apply parsing that only needs to be done once..
  39		#
  40
  41		# Remove titles if option is enabled
  42		if ( $this->htmlOptions['stripTitle'] ) {
  43			$input = preg_replace('#<title.*?</title>#is', '', $input, 1);
  44			$input = preg_replace('#<meta[^>]*name=["\']title["\'][^>]*>#is', '', $input, 1);
  45		}
  46
  47		# Remove and record a <base> href
  48		$input = preg_replace_callback('#<base href\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)[^>]*>#i', 'html_stripBase', $input, 1);
  49
  50		# Proxy url= values in meta redirects
  51		$input = preg_replace_callback('#content\s*=\s*(["\\\'])?[0-9]+\s*;\s*url=([\\\'"]|&\#39;)?((?(?<=")[^"]+|(?(?<=\\\')[^\\\']+|[^\\\'" >]+)))(?(2)\\2|)(?(1)\\1|)#i', 'html_metaRefresh', $input, 1);
  52
  53		# Process forms
  54		$input = preg_replace_callback('#<form([^>]*)>(.*?)</form>#is', 'html_form', $input);
  55		
  56		# Remove scripts blocks (avoids individual processing below)
  57		if ( $this->htmlOptions['stripJS'] ) {
  58			$input = preg_replace('#<script[^>]*>.*?</script>#is', '', $input);
  59		}
  60		
  61		
  62		#
  63		# Split up the document into its different types and parse them
  64		#
  65
  66		# Build up new document into this var
  67		$new	  = '';
  68		$offset = 0;
  69
  70		# Find instances of script or style blocks
  71		while ( preg_match('#<(s(?:cript|tyle))[^>]*>#i', $input, $match, PREG_OFFSET_CAPTURE, $offset) ) {
  72
  73			# What type of block is this?
  74			$block = strtolower($match[1][0]);
  75
  76			# Start position of content
  77			$outerStart = $match[0][1];
  78			$innerStart = $outerStart + strlen($match[0][0]);
  79
  80			# Determine type of end tag and find it's position
  81			$endTag	 = "</$block>";
  82			$innerEnd = stripos($input, $endTag, $innerStart);
  83			if ($innerEnd===false) {
  84				$endTag	 = "</";
  85				$innerEnd = stripos($input, $endTag, $innerStart);
  86				if ($innerEnd===false) {
  87					$input = preg_replace('#<script[^>]*>.*?$#is', '', $input);
  88					break;
  89				}
  90			}
  91			$outerEnd = $innerEnd + strlen($endTag);
  92			
  93			# Parse everything up till here and add to the new document
  94			$new .= $this->HTML(substr($input, $offset, $innerStart - $offset));
  95			
  96			# Find parsing function
  97			$parseFunction = $block == 'style' ? 'CSS' : 'JS' ;
  98
  99			# Add the parsed block
 100			$new .= $this->$parseFunction(substr($input, $innerStart, $innerEnd - $innerStart));
 101
 102			# Move offset to new position
 103			$offset = $innerEnd;
 104
 105		}
 106
 107		# And add the final chunk (between last script/style block and end of doc)
 108		$new .= $this->HTML(substr($input, $offset));
 109
 110		# Replace input with the updated document
 111		$input = $new;
 112
 113		# Encode the page
 114		if ( $this->htmlOptions['encodePage'] ) {
 115			$input = encodePage($input);
 116		}
 117
 118		#
 119		# Now add our own code bits
 120		#
 121
 122		# Insert our mini form after the <body>
 123		if ( $insert !== false ) {
 124
 125			# Check for a frameset
 126			if ( ( $useFrames = stripos($input, '<frameset') ) !== false ) {
 127
 128				# Flag the frames so only first displays mini-form
 129				$input = preg_replace_callback('#<frame[^>]+src\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)#i', 'html_flagFrames', $input);
 130
 131			}
 132
 133			# Attempt to add after body
 134			$input = preg_replace('#(<body[^>]*>)#i', '$1' . $insert, $input, 1, $tmp);
 135
 136			# Check it inserted and append (if not a frameset)
 137			if ( ! $tmp && ! $useFrames ) {
 138				$input = $insert . $input;
 139			}
 140
 141		}
 142
 143		# Insert our javascript library
 144		if ( $inject ) {
 145
 146			# Generate javascript to insert
 147			$inject = injectionJS();
 148
 149			# Add our proxy javascript after <head>
 150			$input = preg_replace('#(<head[^>]*>)#i', '$1' . $inject, $input, 1, $tmp);
 151
 152			# If no <head>, just prepend
 153			if ( ! $tmp ) {
 154				$input = $inject . $input;
 155			}
 156
 157		}
 158
 159		# Add anything to the footer?
 160		if ( $footer ) {
 161
 162			$input = preg_replace('#(</body[^>]*>)#i', $footer . '$1', $input, 1, $tmp);
 163
 164			# If no </body>, just append the footer
 165			if ( ! $tmp ){
 166				$input .= $footer;
 167			}
 168
 169		}
 170
 171		# Return new document
 172		return $input;
 173
 174	}
 175
 176	# Parse HTML sections
 177	function HTML($input) {
 178
 179		# Removing objects? Follow spec and display inner content of object tags instead.
 180		if ( $this->htmlOptions['stripObjects'] ) {
 181
 182			# Remove all object tags (including those deprecated but still common)
 183			$input = preg_replace('#<(?>object|applet|param|embed)[^>]*>#i', '', $input, -1, $tmp);
 184
 185			# Found any? Remove the corresponding end tags
 186			if ( $tmp ) {
 187				$input = preg_replace('#</(?>object|applet|param|embed)>#i', '', $input, $tmp);
 188			}
 189
 190		} else {
 191
 192			# Parse <param name="movie" value="URL"> tags
 193			$input = preg_replace_callback('#<param[^>]+value\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)[^>]*>#i', 'html_paramValue', $input);
 194
 195			# To do: proxy object related URLs
 196
 197		}
 198
 199		# Show content within <noscript> tags
 200		# (preg_ seems to be faster than 2 str_ireplace() calls)
 201		if ( $this->htmlOptions['stripJS'] ) {
 202			$input = preg_replace('#</?noscript>#i', '', $input);
 203		}
 204
 205		# Parse onX events
 206		$input = preg_replace_callback('#\b(on(?<!\.on)[a-z]{2,20})\s*=\s*([\\\'"])?((?(2)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(2)\\2|)#i', array(&$this, 'html_eventJS'), $input);
 207
 208		# Parse style attributes
 209		$input = preg_replace_callback('#style\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)#i', array(&$this, 'html_elementCSS'), $input);
 210
 211		# Proxy URL attributes - this is the bottleneck but optimized
 212		# as much as possible (or at least, as much as I can).
 213		$input = preg_replace_callback('#(?><[A-Z][A-Z0-9]{0,15})(?>\s+[^>\s]+)*?\s*(?>(href|src|background)\s*=(?!\\\\)\s*)(?>([\\\'"])?)((?(2)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^ >]{1,1000}))(?(2)\\2|)#i', 'html_attribute', $input);
 214
 215		# Return changed input
 216		return $input;
 217
 218	}
 219
 220	# Proxy an onX javascript event
 221	function html_eventJS($input) {
 222		return $this->htmlOptions['stripJS'] ? '' : $input[1] . '=' . $input[2] . $this->JS($input[3]) . $input[2];
 223	}
 224
 225	# Proxy a style="CSS" attribute
 226	function html_elementCSS($input) {
 227		return 'style=' . $input[1] . $this->CSS($input[2]) . $input[1];
 228	}
 229
 230
 231	/*****************************************************************
 232	* CSS parser - main parsing function
 233	* CSS parsing is a complicated by the caching of CSS files. We need
 234	* to consider (A) cross-domain caching and (B) the unique URLs option.
 235	*	 A) If possible, use a relative URL so the saved URLs do not explictly
 236	*		 point to a single domain.
 237	*	 B) There is a second set of callback functions with "_unique" suffixed
 238	*		 and these return the original URL to be reparesed.
 239	******************************************************************/
 240
 241	# The URLs depend on the unique and path info settings. The type parameter allows
 242	# us to specify the unique callbacks.
 243	function CSS($input, $storeUnique=false) {
 244
 245		# What type of parsing is this? Normally we parse any URLs to redirect
 246		# back through the proxy but not when storing a cache with unique URLs.
 247		$type = $storeUnique ? '_unique' : '';
 248
 249		# CSS needs proxying the calls to url(), @import and src=''
 250		$input = preg_replace_callback('#\burl\s*\(\s*[\\\'"]?([^\\\'"\)]+)[\\\'"]?\s*\)#i', 'css_URL' . $type, $input);
 251		$input = preg_replace_callback('#@import\s*[\\\'"]([^\\\'"\(\)]+)[\\\'"]#i', 'css_import' . $type, $input);
 252		$input = preg_replace_callback('#\bsrc\s*=\s*([\\\'"])?([^)\\\'"]+)(?(1)\\1|)#i', 'css_src' . $type, $input);
 253
 254		# Return changed
 255		return $input;
 256
 257	}
 258
 259
 260	/*****************************************************************
 261	* Javascript parser - main parsing function
 262	*
 263	* The specific parts that need proxying depends on which javascript
 264	* functions we've been able to override. On first page load, the browser
 265	* capabilities are tested to see what we can do client-side and the results
 266	* sent back to us. This allows us to parse only what we have to.
 267	* If $CONFIG['override_javascript'] is disabled, all commands are parsed
 268	* server-side. This will use much more CPU!
 269	*
 270	* Commands to proxy only if no override at all:
 271	*	 document.write()
 272	*	 document.writeln()
 273	*	 window.open()
 274	*	 eval()
 275	*
 276	* Commands to proxy, regardless of browser capabilities:
 277	*	 location.replace()
 278	*	 .innerHTML=
 279	*
 280	* Commands to proxy if the extra "watch" flag is set
 281	* (the browser doesn't support the .watch() method):
 282	*	 location=
 283	*	 x.location=
 284	*	 location.href=
 285	*
 286	* Commands to proxy if the extra "setters" flag is set
 287	* (the browser doesn't support the __defineSetter__() method):
 288	*	 .src=
 289	*	 .href=
 290	*	 .background=
 291	*	 .action=
 292	*
 293	* Commands to proxy if the extra "ajax" flag is set
 294	* (the browser failed to override the .open() method):
 295	*	 XMLHttpRequest.open()
 296	******************************************************************/
 297
 298	function JS($input) {
 299
 300		# Stripping?
 301		if ( $this->htmlOptions['stripJS'] ) {
 302			return '';
 303		}
 304		
 305		# Get our flags
 306		$flags = $this->jsFlagState;
 307
 308		# Unless we know we don't need to, apply all the browser-specific flags
 309		if ( ! is_array($this->jsFlagState) ) {
 310			$flags = array('ajax', 'watch', 'setters');
 311		}
 312		
 313		# If override is disabled, add a "base" flag
 314		if ( $this->jsFlagState === null ) {
 315			$flags[] = 'base';
 316		}
 317
 318		# Start parsing!
 319		$search = array();
 320		
 321		# Create shortcuts to various search patterns:
 322		#	  "before"	  - matches preceeding character (string of single char) [ignoring whitespace]
 323		#	  "after"	  - matches next character (string of single char) [ignoring whitespace]
 324		#	  "id"		  - key for identifying the original match (e.g. if we have >1 of the same key)
 325		$assignmentPattern	= array('before'	  => '.',				  'after' => '='); 
 326		$methodPattern			= array('before'	  => '.',				  'after' => '(');
 327		$functionPattern		= array('after' => '(');
 328
 329		# Configure strings to search for, starting with always replaced commands
 330		$search['innerHTML'][] = $assignmentPattern;
 331		$search['location'][]  = array('after' => '.', 'id' => 'replace()');
 332			# ^ This is only for location.replace() - other forms are handled later
 333
 334		# Look for attribute assignments
 335		if ( in_array('setters', $flags) ) {
 336			$search['src'][]			= $assignmentPattern;
 337			$search['href'][]			= $assignmentPattern;
 338			$search['action'][]		= $assignmentPattern;
 339			$search['background'][] = $assignmentPattern;
 340		}
 341		
 342		# Look for location changes
 343		# location.href will be handled above, location= is handled here
 344		if ( in_array('watch', $flags) ) {
 345			$search['location'][] = array('after' => '=', 'id' => 'assignment');
 346		}
 347
 348		# Look for .open() if either AJAX (XMLHttpRequest.open) or
 349		# base (window.open) flags are present
 350		if ( in_array('ajax', $flags) || in_array('base', $flags) ) {
 351			$search['open'][] = $methodPattern;
 352		}
 353		
 354		# Add the basic code if no override
 355		if ( in_array('base', $flags) ) {
 356			$search['eval'][]		= $functionPattern;
 357			$search['writeln'][]	  = $methodPattern;
 358			$search['write'][]	= $methodPattern;
 359		}
 360
 361		# Set up starting parameters
 362		$offset			= 0;
 363		$length			= strlen($input);
 364		$searchStrings = array_keys($search);
 365
 366		while ( $offset < $length ) {
 367
 368			# Start off by assuming no more items (i.e. the next position
 369			# of interest is the end of the document)
 370			$commandPos = $length;
 371
 372			# Loop through the search subjects
 373			foreach ( $searchStrings as $item ) {
 374
 375				# Any more instances of this?
 376				if ( ( $tmp = strpos($input, $item, $offset) ) === false ) {
 377
 378					# Nope, skip to next item
 379					continue;
 380
 381				}
 382
 383
 384				# Closer to the currently held 'next' position?
 385				if ( $tmp < $commandPos ) {
 386
 387					$commandPos = $tmp;
 388					$command = $item;
 389
 390				}
 391
 392			}
 393
 394			# No matches found? Finish parsing.
 395			if ( $commandPos == $length ) {
 396				break;
 397			}
 398			
 399			# We've found the main point of interest; now use the
 400			# search parameters to check the surrounding chars to validate
 401			# the match.
 402			$valid = false;
 403
 404			foreach ( $search[$command] as $pattern ) {
 405			
 406				# Check the preceeding chars
 407				if ( isset($pattern['before']) && str_checkprev($input, $pattern['before'], $commandPos-1) === false ) {
 408					continue;
 409				}
 410				
 411				# Check next chars
 412				if ( isset($pattern['after']) && ( $postCharPos = str_checknext($input, $pattern['after'], $commandPos + strlen($command), false, true) ) === false ) {
 413					continue;
 414				}
 415				
 416				# Still here? Match must be OK so generate a match ID			 
 417				if ( isset($pattern['id']) ) {
 418					$valid = $command . $pattern['id'];
 419				} else {
 420					$valid = $command;
 421				}
 422				
 423				break;
 424				
 425			}
 426			
 427			# What we do next depends on which match (if any) we've found...
 428			switch ( $valid ) {
 429			
 430				# Assigment
 431				case 'src':
 432				case 'href':
 433				case 'background':
 434				case 'action':
 435				case 'locationassignment':
 436				case 'innerHTML':
 437
 438					# Check our post-char position for = as well (could be equality
 439					# test rather than assignment, i.e. == )
 440					if ( ! isset($input[$postCharPos]) || $input[$postCharPos] == '=' ) {
 441						break;
 442					}
 443
 444					# Find the end of this statement
 445					$endPos = analyze_js($input, $postCharPos);
 446					$valueLength = $endPos - $postCharPos;
 447
 448					# Produce replacement command
 449					$replacement = sprintf('parse%s(%s)', $command=='innerHTML' ? 'HTML' : 'URL', substr($input, $postCharPos, $valueLength));
 450					
 451					# Adjust total document length as appropriate
 452					$length += strlen($replacement);
 453					
 454					# Make the replacement
 455					$input = substr_replace($input, $replacement, $postCharPos, $valueLength);
 456					
 457					# Move offset up to new position
 458					$offset = $endPos + 10;
 459					
 460					# Go get next match
 461					continue 2;
 462					
 463					
 464				# Function calls - we don't know for certain if these are in fact members of the
 465				# appropriate objects (window/XMLHttpRequest for .open(), document for .write() and
 466				# .writeln) so we won't change anything. Main.js still overrides these functions but
 467				# does nothing with them by default. We add an extra parameter to tell our override
 468				# to kick in.
 469				case 'open':
 470				case 'write':
 471				case 'writeln':
 472					
 473					# Find the end position (the closing ")" for the function call)
 474					$endPos = analyze_js($input, $postCharPos);
 475					
 476					# Insert our additional argument just before that
 477					$input = substr_replace($input, ',"gl"', $endPos, 0);
 478					
 479					# Adjust the document length
 480					$length += 5;
 481					
 482					# And move the offset
 483					$offset = $endPos + 5;
 484					
 485					# Get next match
 486					continue 2;
 487				
 488				
 489				# Eval() is a just as easy since we can just wrap the entire thing in parseJS().
 490				case 'eval':
 491				
 492					# Ensure this is a call to eval(), not anotherfunctionendingineval()
 493					if ( isset($input[$commandPos-1]) && strpos('abcdefghijklmnopqrstuvwxyz123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_', $input[$commandPos-1]) !== false ) {
 494						break;
 495					}
 496				
 497					# Find the end position (the closing ")" for the function call)
 498					$endPos = analyze_js($input, $postCharPos);
 499					$valueLength = $endPos - $postCharPos;
 500					
 501					# Generate our replacement
 502					$replacement = sprintf('parseJS(%s)', substr($input, $postCharPos, $valueLength));
 503					
 504					# Make the replacement
 505					$input = substr_replace($input, $replacement, $postCharPos, $valueLength);
 506					
 507					# Adjust the document length
 508					$length += 9;
 509					
 510					# And move the offset
 511					$offset = $endPos + 9;
 512					continue 2;
 513				
 514				
 515				# location.replace() is a tricky one. We have the position of the char
 516				# after . as $postCharPos and need to ensure we're calling replace(), 
 517				# then parse the entire URL
 518				case 'locationreplace()':
 519
 520					# Validate the match
 521					if ( ! preg_match('#\Greplace\s*\(#', $input, $tmp, 0, $postCharPos) ) {
 522						break;
 523					}
 524
 525					# Move $postCharPos to inside the brackets of .replace()
 526					$postCharPos += strlen($tmp[0]);
 527				
 528					# Find the end position (the closing ")" for the function call)
 529					$endPos = analyze_js($input, $postCharPos);
 530					$valueLength = $endPos - $postCharPos;
 531					
 532					# Generate our replacement
 533					$replacement = sprintf('parseURL(%s)', substr($input, $postCharPos, $valueLength));
 534					
 535					# Make the replacement
 536					$input = substr_replace($input, $replacement, $postCharPos, $valueLength);
 537					
 538					# Adjust the document length
 539					$length += 9;
 540					
 541					# And move the offset
 542					$offset = $endPos + 9;
 543					
 544					continue 2;
 545					
 546			}
 547			
 548			# Still here? A match didn't validate so adjust offset to just after
 549			# current position
 550			$offset = $commandPos + 1;
 551
 552		}
 553
 554		# Ignore document.domain
 555		$input = str_replace('document.domain', 'ignore', $input);
 556
 557		# Return changed
 558		return $input;
 559
 560	}
 561
 562}
 563
 564
 565/*****************************************************************
 566* HTML callbacks
 567******************************************************************/
 568
 569# Remove and record the <base> href
 570function html_stripBase($input) {
 571	global $base;
 572	$base = $input[2];
 573	return '';
 574}
 575
 576# Proxy the location of a meta refresh
 577function html_metaRefresh($input) {
 578	return str_replace($input[3], proxyURL($input[3]), $input[0]);
 579}
 580
 581# Proxy URL in <param name="movie" value="URL">
 582function html_paramValue($input) {
 583
 584	# Check for a name="movie" tag
 585	if ( stripos($input[0], 'movie') === false ) {
 586		return $input[0];
 587	}
 588
 589	return str_replace($input[2], proxyURL($input[2]), $input[0]);
 590}
 591
 592# Process forms - the query string is used by the proxy script
 593# and GET data needs to be encoded anyway. We convert all GET
 594# forms to POST and then the proxy script will forward it properly.
 595function html_form($input) {
 596
 597	# Check for a given method
 598	if ( preg_match('#\bmethod\s*=\s*["\\\']?(get|post)["\\\']?#i', $input[1], $tmp) ) {
 599
 600		# Not POST?
 601		if ( strtolower($tmp[1]) != 'post' ) {
 602
 603			# Convert to post and flag as a conversion
 604			$input[1] = str_replace($tmp[0], 'method="post"', $input[1]);
 605			$converted = true;
 606
 607		}
 608
 609	} else {
 610
 611		# Append a POST method (no method given and GET is default)
 612		$input[1] .= ' method="post"';
 613		$converted = true;
 614
 615	}
 616
 617	# Prepare the extra input to insert
 618	$add = empty($converted) ? '' : '<input type="hidden" name="convertGET" value="1">';
 619
 620	# To do: javascript onsubmit event to immediately redirect to the appropriate
 621	# location using GET data, without an intermediate POST to the proxy script.
 622
 623	# Proxy the form action
 624	$input[1] = preg_replace_callback('#\baction\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)#i', 'html_formAction', $input[1]);
 625
 626	# What type of form is this? Due to register_globals support, PHP converts
 627	# a number of characters to _ in incoming variable names. To get around this,
 628	# we can use the raw post data from php://input but this is not available
 629	# for multipart forms. Instead we must encode the input names in these forms.
 630	if ( stripos($input[1], 'multipart/form-data') ) {
 631
 632		$input[2] = preg_replace_callback('#name\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)#i', 'html_inputName', $input[2]);
 633
 634	}
 635
 636	# Return updated form
 637	return '<form' . $input[1] . '>' . $add . $input[2] . '</form>';
 638
 639}
 640
 641# Proxy the action="URL" value in forms
 642function html_formAction($input) {
 643	return 'action=' . $input[1] . proxyURL($input[2]) . $input[1];
 644}
 645
 646# Encode input names
 647function html_inputName($input) {
 648	return 'name=' . $input[1] . inputEncode($input[2]) . $input[1];
 649}
 650
 651# Proxy URL values in attributes
 652function html_attribute($input) {
 653
 654	# Is this an iframe?
 655	$flag = stripos($input[0], 'iframe') === 1 ? 'frame' : '';
 656
 657	# Do not proxy magnet links
 658    if (stripos($input[3], 'magnet:?') === 0) {
 659        return $input[0];
 660    }
 661	
 662	# URL occurred as value of an attribute and should have been htmlspecialchar()ed
 663	# We need to do the job of the browser and decode before proxying.
 664	return str_replace($input[3], htmlspecialchars(proxyURL(htmlspecialchars_decode($input[3]), $flag)), $input[0]);
 665}
 666
 667# Flag frames in a frameset so only the first one shows the mini-form.
 668# This could be done in the above callback but adds extra processing
 669# when 99% of the time, it won't be needed.
 670function html_flagFrames($input) {
 671
 672	static $addFlag;
 673
 674	# If it's the first frame, leave it but set the flag var
 675	if ( ! isset($addFlag) ) {
 676		$addFlag = true;
 677		return $input[0];
 678	}
 679
 680	# Add the frame flag
 681	$newURL = $input[2] . ( strpos($input[2], '?') ? '&amp;f=frame' : 'fframe/');
 682
 683	return str_replace($input[2], $newURL, $input[0]);
 684
 685}
 686
 687
 688/*****************************************************************
 689* CSS callbacks
 690******************************************************************/
 691
 692# Proxy CSS url(LOCATION)
 693function css_URL($input) {
 694	return 'url(' . proxyURL(trim($input[1])) . ')';
 695}
 696
 697# Proxy CSS @import "URL"
 698function css_import($input) {
 699	return '@import "' . proxyURL($input[1]) . '"';
 700}
 701
 702# Proxy CSS src=
 703function css_src($input) {
 704	return 'src=' . $input[1] . proxyURL($input[2]) . $input[1];
 705}
 706
 707# Callbacks for use with unique URLs and cached CSS
 708# The <UNIQUE[]URL> acts as a marker for quick and easy processing later
 709
 710# Unique CSS url(LOCATION)
 711function css_URL_unique($input) {
 712	return 'url(<UNIQUE[' . absoluteURL($input[1],'') . ']URL>)';
 713}
 714
 715# Unique CSS @import "URL"
 716function css_import_unique($input) {
 717	return '@import "<UNIQUE[' . absoluteURL($input[1]) . ']URL>"';
 718}
 719
 720# Unique CSS src=
 721function css_src_unique($input) {
 722	return 'src=' . $input[1] . '<UNIQUE[' . absoluteURL($input[2]) . ']URL>' . $input[1];
 723}
 724
 725
 726/*****************************************************************
 727* Helper functions
 728******************************************************************/
 729
 730# Take a string, and check that the next non-whitespace char is the
 731# passed in char (X). Return false if non-whitespace and non-X char is
 732# found. Otherwise, return the position of X.
 733# If $inverse is true, the next non-whitespace char must NOT be in $char
 734# If $pastChar is true, ignore whitespace after finding X and return
 735# the position of the last post-X whitespace char.
 736function str_checknext($input, $char, $offset, $inverse = false, $pastChar = false) {
 737
 738	for ( $i = $offset, $length = strlen($input); $i < $length; ++$i ) {
 739
 740		# Examine char
 741		switch ( $input[$i] ) {
 742
 743			# Ignore whitespace
 744			case ' ':
 745			case "\t":
 746			case "\r":
 747			case "\n":
 748				break;
 749
 750			# Found the passed char
 751			case $char:
 752			
 753				# $inverse means we do NOT want this char
 754				if ( $inverse ) {
 755					return false;
 756				}
 757				
 758				# Move past this to the next non-whitespace?
 759				if ( $pastChar ) {
 760					++$i;
 761					return $i + strspn($input, " \t\r\n", $i);
 762				}
 763				
 764				# Found desired char, no $pastChar, just return  X offset
 765				return $i;
 766
 767			# Found non-$char non-whitespace
 768			default:
 769			
 770				# This is the desired result if $inverse
 771				if ( $inverse ) {
 772					return $i;
 773				}
 774				
 775				# No $inverse, found a non-$char, return false
 776				return false;
 777
 778		}
 779
 780	}
 781
 782	return false;
 783
 784}
 785
 786
 787# Same as above but go backwards
 788function str_checkprev($input, $char, $offset, $inverse = false) {
 789
 790	for ( $i = $offset; $i > 0; --$i ) {
 791
 792		# Examine char
 793		switch ( $input[$i] ) {
 794
 795			# Ignore whitespace
 796			case ' ':
 797			case "\t":
 798			case "\r":
 799			case "\n":
 800				break;
 801
 802			# Found char
 803			case $char:
 804				return $inverse ? false : $i;
 805
 806			# Found non-$char char
 807			default:
 808				return $inverse ? $i : false;
 809
 810		}
 811
 812	}
 813	
 814	return $inverse;
 815
 816}
 817
 818
 819# Analyze javascript and return offset positions.
 820# Default is to find the end of the statement, indicated by:
 821#	 (1) ; while not in string
 822#	 (2) newline which, if not there, would create invalid syntax
 823#	 (3) a closing bracket (object, language construct or function call) for which
 824#		  no corresponding opening bracket was detected AFTER the passed offset
 825# If (int) $argPos is true, we return an array of the start and end position
 826# for the nth argument, where n = $argPos. The $start position must be just inside
 827# the parenthesis of the function call we're interested in.
 828function analyze_js($input, $start, $argPos = false) {
 829
 830	# Set chars we're interested in
 831	$specialChars = ";\n\r\"'+{}()[]";
 832
 833	# Add , if looking for an argument position
 834	if ( $argPos ) {
 835		$specialChars .= ',';
 836		$currentArg = 1;
 837	}
 838
 839	# Loop through the input, stopping only at special chars
 840	for ( $i = $start, $length = strlen($input), $end = false, $openObjects = $openBrackets = $openArrays = 0;
 841			$end === false && ( $i += strcspn($input, $specialChars, $i) ) && $i < $length && ( $char = $input[$i] );
 842			++$i ) {
 843
 844		switch ( $char ) {
 845
 846			# Starting string delimiters
 847			case '"':
 848			case "'":
 849
 850				if ( $input[$i-1] == '\\' ) { 
 851					break;
 852				}
 853			
 854				# Skip straight to end of string
 855				# Find the corresponding end delimiter and ensure it's not escaped
 856				while ( ( $i = strpos($input, $char, $i+1) ) && $input[$i-1] == '\\' );
 857
 858				# Check for false, in which case we assume the end is the end of the doc
 859				if ( $i === false ) {
 860					break 2;
 861				}
 862
 863				break;
 864
 865			# End of operation?
 866			case ';':
 867				$end = $i;
 868				break;
 869
 870			# New lines
 871			case "\n":
 872			case "\r":
 873				# Newlines are OK if occuring within an open brackets, arrays or objects.
 874				if ( $openObjects || $openBrackets || $openArrays || $argPos ) {
 875					break;
 876				}
 877
 878				# Newlines are also OK if followed by an opening function OR concatenation
 879				# e.g. someFunc\n(params) or someVar \n + anotherVar
 880				# Find next non-whitespace char position
 881				$tmp = $i + strspn($input, " \t\r\n", $i+1);
 882
 883				# And compare to allowed chars
 884				if ( isset($input[$tmp+1]) && ( $input[$tmp+1] == '(' || $input[$tmp+1] == '+' ) ) {
 885					$i = $tmp;
 886					break;
 887				}
 888
 889				# Newline not indicated as OK, set the end to here
 890				$end = $i;
 891				break;
 892
 893			# Concatenation
 894			case '+':
 895				# Our interest in the + operator is it's use in allowing an expression
 896				# to span multiple lines. If we come across a +, move past all whitespace,
 897				# including newlines (which would otherwise indicate end of expression).
 898				$i += strspn($input, " \t\r\n", $i+1);
 899				break;
 900
 901			# Opening chars (objects, parenthesis and arrays)
 902			case '{':
 903				++$openObjects;
 904				break;
 905			case '(':
 906				++$openBrackets;
 907				break;
 908			case '[':
 909				++$openArrays;
 910				break;
 911
 912			# Closing chars - is there a corresponding open char?
 913			# Yes = reduce stored count. No = end of statement.
 914			case '}':
 915				$openObjects	? --$openObjects	 : $end = $i;
 916				break;
 917			case ')':
 918				$openBrackets	? --$openBrackets	 : $end = $i;
 919				break;
 920			case ']':
 921				$openArrays		? --$openArrays	 : $end = $i;
 922				break;
 923
 924			# Commas - tell us which argument it is
 925			case ',':
 926
 927				# Ignore commas inside other functions or whatnot
 928				if ( $openObjects || $openBrackets || $openArrays ) {
 929					break;
 930				}
 931
 932				# End now
 933				if ( $currentArg == $argPos ) {
 934					$end = $i;
 935				}
 936
 937				# Increase the current argument number
 938				++$currentArg;
 939
 940				# If we're not after the first arg, start now?
 941				if ( $currentArg == $argPos ) {
 942					$start = $i+1;
 943				}
 944
 945				break;
 946
 947		}
 948
 949	}
 950
 951	# End not found? Use end of document
 952	if ( $end === false ) {
 953		$end = $length;
 954	}
 955
 956	# Return array of start/end
 957	if ( $argPos ) {
 958		return array($start, $end);
 959	}
 960
 961	# Return end
 962	return $end;
 963
 964}
 965
 966/*****************************************************************
 967* Page encoding functions
 968******************************************************************/
 969
 970# Encode page - splits into HTML/script sections and encodes HTML
 971function encodePage($input) {
 972
 973	# Look for script blocks
 974	if ( preg_match_all('#<script.*?</script>#is', $input, $scripts, PREG_OFFSET_CAPTURE) ) {
 975	
 976		# Create starting offset - only start encoding after the <head>
 977		# as this seems to help browsers cope!
 978		$offset = preg_match('#<head[^>]*>(.)#is', $input, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[1][1] : 0;
 979		$new	  = $offset ? substr($input, 0, $offset) : '';
 980		
 981		# Go through all the matches
 982		foreach ( $scripts[0] as $id => $match ) {
 983
 984			# Determine position of the preceeding non-script block
 985			$end	  = $match[1] ? $match[1]-1 : 0;
 986			$start  = $offset; 
 987			$length = $end - $start;
 988			
 989			# Add encoded block to page if there is one
 990			if ( $length ) 
 991			$new .= encodeBlock(substr($input, $start, $length));
 992			
 993			# Add unencoded script to page
 994			$new .= $match[0];
 995			
 996			# Move offset up
 997			$offset = $match[1] + strlen($match[0]);
 998			
 999		}
1000		
1001		# Add final block
1002		if ( $remainder = substr($input, $offset) ) {
1003			$new .= encodeBlock($remainder);
1004		}
1005		
1006		# Update input with new
1007		$input = $new;
1008		
1009	} else {
1010		# No scripts is easy - just encode the lot
1011		$input = encodeBlock($input);
1012	}
1013
1014	# Return the encoded page
1015	return $input;
1016
1017}
1018
# Encode block - applies the actual encoding (or rather "escaping").
# Returns a <script> that writes the block back into the page with
# document.write(unescape('...')). Letters, the single quote, CR, LF and
# "-" are replaced by %XX escapes. "%" and "\" are escaped too: a literal
# "%" would otherwise be decoded by unescape() (turning "%26" in a link
# into "&") and a "\" would be consumed as a javascript string escape.
function encodeBlock($input) {

	static $map = null;

	if ( $map === null ) {

		# Punctuation that would break, or be altered by, the js string
		$map = array(
			'%'  => '%25',
			'\\' => '%5c',
			'\'' => '%27',
			"\r" => '%0d',
			"\n" => '%0a',
			'-'  => '%2D',
		);

		# All ASCII letters
		foreach ( array_merge(range('a', 'z'), range('A', 'Z')) as $letter ) {
			$map[$letter] = '%' . dechex(ord($letter));
		}

	}

	# strtr() translates in a single pass, so the "%" signs it inserts are
	# never escaped a second time
	return '<script type="text/javascript">document.write(unescape(\'' . strtr($input, $map) . '\'));</script>';

}