PageRenderTime 54ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/includes/parser.php

https://github.com/francinebo/glype
PHP | 1029 lines | 534 code | 220 blank | 275 comment | 60 complexity | 14a5d9c40ca5937821a62de5d2f29b3b MD5 | raw file
  1. <?php
  2. /*******************************************************************
  3. * Glype is copyright and trademark 2007-2012 UpsideOut, Inc. d/b/a Glype
  4. * and/or its licensors, successors and assigners. All rights reserved.
  5. *
  6. * Use of Glype is subject to the terms of the Software License Agreement.
  7. * http://www.glype.com/license.php
  8. *******************************************************************
  9. * This is the parser for the proxy - changes the original 'raw'
  10. * document so that everything (images, links, etc.) is rerouted to
  11. * be downloaded via the proxy script instead of directly.
  12. ******************************************************************/
  # Rewrites a fetched document so that every resource it references is
  # requested through the proxy script instead of directly.
  #
  # The class only orchestrates the work: URL rewriting itself is delegated
  # to callback functions (html_*, css_*) and helpers (str_checknext,
  # str_checkprev, analyze_js, encodePage, injectionJS) defined outside it.
  class parser {

      # State of the javascript parser: null = parse everything, false = parse
      # all non-standard overrides, array = the specific flags to apply.
      private $jsFlagState;

      # Browsing options (stripTitle, stripJS, stripObjects, encodePage).
      private $htmlOptions;

      # Store the browsing options and javascript flags on the object.
      public function __construct($htmlOptions, $jsFlags) {
          $this->jsFlagState = $jsFlags;
          $this->htmlOptions = $htmlOptions;
      }

      # Escape text so preg_replace() inserts it literally: "\" and "$" would
      # otherwise be read as back-references (e.g. "$1", "\0").
      private static function literalReplacement($text) {
          return addcslashes((string) $text, '\\$');
      }

      /*****************************************************************
      * HTML parsers - main parsing function splits up document into
      * component parts ('normal' HTML, scripts and styles)
      ******************************************************************/

      # Parse a complete HTML document.
      #   $input  - the raw document
      #   $insert - HTML placed right after <body> (false = insert nothing)
      #   $inject - when truthy, the output of injectionJS() is placed after <head>
      #   $footer - HTML placed right before </body>
      # Returns the rewritten document.
      public function HTMLDocument($input, $insert = '', $inject = false, $footer = '') {

          #
          # Apply parsing that only needs to be done once
          #

          # Remove titles if option is enabled
          if ( $this->htmlOptions['stripTitle'] ) {
              $input = preg_replace('#<title.*?</title>#is', '', $input, 1);
              $input = preg_replace('#<meta[^>]*name=["\']title["\'][^>]*>#is', '', $input, 1);
          }

          # Remove and record a <base> href
          $input = preg_replace_callback('#<base href\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)[^>]*>#i', 'html_stripBase', $input, 1);

          # Proxy url= values in meta redirects
          $input = preg_replace_callback('#content\s*=\s*(["\\\'])?[0-9]+\s*;\s*url=([\\\'"]|&\#39;)?((?(?<=")[^"]+|(?(?<=\\\')[^\\\']+|[^\\\'" >]+)))(?(2)\\2|)(?(1)\\1|)#i', 'html_metaRefresh', $input, 1);

          # Process forms
          $input = preg_replace_callback('#<form([^>]*)>(.*?)</form>#is', 'html_form', $input);

          # Remove script blocks (avoids individual processing below)
          if ( $this->htmlOptions['stripJS'] ) {
              $input = preg_replace('#<script[^>]*>.*?</script>#is', '', $input);
          }

          #
          # Split up the document into its different types and parse them
          #

          $new    = '';
          $offset = 0;

          # Find instances of script or style blocks
          while ( preg_match('#<(s(?:cript|tyle))[^>]*>#i', $input, $match, PREG_OFFSET_CAPTURE, $offset) ) {

              # What type of block is this?
              $block = strtolower($match[1][0]);

              # Start positions of the opening tag and of its content
              $outerStart = $match[0][1];
              $innerStart = $outerStart + strlen($match[0][0]);

              # Find the matching end tag, falling back to any end tag
              $innerEnd = stripos($input, "</$block>", $innerStart);

              if ( $innerEnd === false ) {
                  $innerEnd = stripos($input, '</', $innerStart);
              }

              if ( $innerEnd === false ) {
                  # Unterminated block with no closing tag after it. Drop an
                  # unterminated script together with everything after it, but
                  # keep the text before it (it is parsed as HTML below).
                  if ( $block == 'script' ) {
                      $input = substr($input, 0, $outerStart);
                  }
                  break;
              }

              # Parse everything up to and including the opening tag as HTML
              $new .= $this->HTML(substr($input, $offset, $innerStart - $offset));

              # Parse the block content with the matching parser
              $parseFunction = $block == 'style' ? 'CSS' : 'JS';
              $new .= $this->$parseFunction(substr($input, $innerStart, $innerEnd - $innerStart));

              # Continue from the end tag (it is emitted with the next HTML chunk)
              $offset = $innerEnd;
          }

          # Add the final chunk (between last script/style block and end of doc)
          $new .= $this->HTML((string) substr($input, $offset));

          $input = $new;

          # Encode the page
          if ( $this->htmlOptions['encodePage'] ) {
              $input = encodePage($input);
          }

          #
          # Now add our own code bits
          #

          # Insert our mini form after the <body>
          if ( $insert !== false ) {

              # Check for a frameset (position 0 is a valid hit, so compare to false)
              $framesetPos = stripos($input, '<frameset');

              if ( $framesetPos !== false ) {
                  # Flag the frames so only the first displays the mini-form
                  $input = preg_replace_callback('#<frame[^>]+src\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)#i', 'html_flagFrames', $input);
              }

              # Attempt to add after body
              $input = preg_replace('#(<body[^>]*>)#i', '$1' . self::literalReplacement($insert), $input, 1, $tmp);

              # Nothing inserted and not a frameset? Prepend instead.
              if ( ! $tmp && $framesetPos === false ) {
                  $input = $insert . $input;
              }
          }

          # Insert our javascript library
          if ( $inject ) {

              # Generate javascript to insert
              $inject = injectionJS();

              # Add our proxy javascript after <head>
              $input = preg_replace('#(<head[^>]*>)#i', '$1' . self::literalReplacement($inject), $input, 1, $tmp);

              # If no <head>, just prepend
              if ( ! $tmp ) {
                  $input = $inject . $input;
              }
          }

          # Add anything to the footer?
          if ( $footer ) {

              $input = preg_replace('#(</body[^>]*>)#i', self::literalReplacement($footer) . '$1', $input, 1, $tmp);

              # If no </body>, just append the footer
              if ( ! $tmp ) {
                  $input .= $footer;
              }
          }

          return $input;
      }

      # Parse a section of plain HTML (no script/style content) and return it.
      public function HTML($input) {

          # Removing objects? Follow spec and display inner content of object tags instead.
          if ( $this->htmlOptions['stripObjects'] ) {

              # Remove all object tags (including those deprecated but still common)
              $input = preg_replace('#<(?>object|applet|param|embed)[^>]*>#i', '', $input, -1, $tmp);

              # Found any? Remove at most that many corresponding end tags
              if ( $tmp ) {
                  $input = preg_replace('#</(?>object|applet|param|embed)>#i', '', $input, $tmp);
              }

          } else {

              # Parse <param name="movie" value="URL"> tags
              $input = preg_replace_callback('#<param[^>]+value\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)[^>]*>#i', 'html_paramValue', $input);

              # To do: proxy object related URLs
          }

          # Show content within <noscript> tags
          if ( $this->htmlOptions['stripJS'] ) {
              $input = preg_replace('#</?noscript>#i', '', $input);
          }

          # Parse onX events
          $input = preg_replace_callback('#\b(on(?<!\.on)[a-z]{2,20})\s*=\s*([\\\'"])?((?(2)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(2)\\2|)#i', array($this, 'html_eventJS'), $input);

          # Parse style attributes
          $input = preg_replace_callback('#style\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)#i', array($this, 'html_elementCSS'), $input);

          # Proxy URL attributes (href, src, background)
          $input = preg_replace_callback('#(?><[A-Z][A-Z0-9]{0,15})(?>\s+[^>\s]+)*?\s*(?>(href|src|background)\s*=(?!\\\\)\s*)(?>([\\\'"])?)((?(2)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^ >]{1,1000}))(?(2)\\2|)#i', 'html_attribute', $input);

          return $input;
      }

      # Callback: proxy an onX javascript event ([1] name, [2] quote, [3] code).
      # The whole attribute is removed when scripts are being stripped.
      public function html_eventJS($input) {
          if ( $this->htmlOptions['stripJS'] ) {
              return '';
          }
          return $input[1] . '=' . $input[2] . $this->JS($input[3]) . $input[2];
      }

      # Callback: proxy a style="CSS" attribute ([1] quote, [2] CSS).
      public function html_elementCSS($input) {
          return 'style=' . $input[1] . $this->CSS($input[2]) . $input[1];
      }

      /*****************************************************************
      * CSS parser - main parsing function
      * CSS parsing is complicated by the caching of CSS files. We need
      * to consider (A) cross-domain caching and (B) the unique URLs option.
      * A) If possible, use a relative URL so the saved URLs do not explicitly
      *    point to a single domain.
      * B) There is a second set of callback functions with "_unique" suffixed
      *    and these return the original URL to be reparsed.
      ******************************************************************/

      # Parse CSS. With $storeUnique the "_unique" callbacks are used, which
      # mark URLs for later processing instead of proxying them now.
      public function CSS($input, $storeUnique = false) {

          $type = $storeUnique ? '_unique' : '';

          # CSS needs proxying the calls to url(), @import and src=''
          $input = preg_replace_callback('#\burl\s*\(\s*[\\\'"]?([^\\\'"\)]+)[\\\'"]?\s*\)#i', 'css_URL' . $type, $input);
          $input = preg_replace_callback('#@import\s*[\\\'"]([^\\\'"\(\)]+)[\\\'"]#i', 'css_import' . $type, $input);
          $input = preg_replace_callback('#\bsrc\s*=\s*([\\\'"])?([^)\\\'"]+)(?(1)\\1|)#i', 'css_src' . $type, $input);

          return $input;
      }

      /*****************************************************************
      * Javascript parser - main parsing function
      *
      * Which commands are rewritten depends on the flags:
      *   always    - location.replace(), .innerHTML=
      *   "setters" - .src=, .href=, .background=, .action=
      *   "watch"   - location=, x.location=, location.href=
      *   "ajax"    - .open()
      *   "base"    - .open(), eval(), .write(), .writeln()
      * "base" is added when the flag state is null; when the flag state is
      * not an array, "ajax", "watch" and "setters" are all applied.
      ******************************************************************/
      public function JS($input) {

          # Stripping?
          if ( $this->htmlOptions['stripJS'] ) {
              return '';
          }

          # Unless we know we don't need to, apply all the browser-specific flags
          $flags = is_array($this->jsFlagState) ? $this->jsFlagState : array('ajax', 'watch', 'setters');

          # If override is disabled, add a "base" flag
          if ( $this->jsFlagState === null ) {
              $flags[] = 'base';
          }

          # Search pattern shortcuts:
          #  "before" - char that must precede the keyword (ignoring whitespace)
          #  "after"  - char that must follow the keyword (ignoring whitespace)
          #  "id"     - suffix identifying the match when a keyword has >1 pattern
          $assignmentPattern = array('before' => '.', 'after' => '=');
          $methodPattern     = array('before' => '.', 'after' => '(');
          $functionPattern   = array('after' => '(');

          # Always replaced commands. The key order matters: on a tie for the
          # nearest keyword the earlier key wins (so "writeln" beats "write").
          $search = array();
          $search['innerHTML'][] = $assignmentPattern;
          $search['location'][]  = array('after' => '.', 'id' => 'replace()');

          # Attribute assignments
          if ( in_array('setters', $flags) ) {
              $search['src'][]        = $assignmentPattern;
              $search['href'][]       = $assignmentPattern;
              $search['action'][]     = $assignmentPattern;
              $search['background'][] = $assignmentPattern;
          }

          # location= (location.href= is caught by the href pattern)
          if ( in_array('watch', $flags) ) {
              $search['location'][] = array('after' => '=', 'id' => 'assignment');
          }

          # .open() for either XMLHttpRequest.open or window.open
          if ( in_array('ajax', $flags) || in_array('base', $flags) ) {
              $search['open'][] = $methodPattern;
          }

          # The basic commands if there is no override
          if ( in_array('base', $flags) ) {
              $search['eval'][]    = $functionPattern;
              $search['writeln'][] = $methodPattern;
              $search['write'][]   = $methodPattern;
          }

          $offset        = 0;
          $length        = strlen($input);
          $searchStrings = array_keys($search);

          while ( $offset < $length ) {

              # Find the nearest keyword at or after the offset
              $commandPos = $length;
              $command    = null;

              foreach ( $searchStrings as $item ) {
                  $tmp = strpos($input, $item, $offset);
                  if ( $tmp !== false && $tmp < $commandPos ) {
                      $commandPos = $tmp;
                      $command    = $item;
                  }
              }

              # No matches found? Finish parsing.
              if ( $command === null ) {
                  break;
              }

              # Validate the match against the surrounding characters
              $valid       = false;
              $postCharPos = false;

              foreach ( $search[$command] as $pattern ) {

                  if ( isset($pattern['before']) && str_checkprev($input, $pattern['before'], $commandPos - 1) === false ) {
                      continue;
                  }

                  if ( isset($pattern['after']) ) {
                      $postCharPos = str_checknext($input, $pattern['after'], $commandPos + strlen($command), false, true);
                      if ( $postCharPos === false ) {
                          continue;
                      }
                  }

                  $valid = isset($pattern['id']) ? $command . $pattern['id'] : $command;
                  break;
              }

              if ( $valid !== false ) {

                  switch ( $valid ) {

                      # Assignment: wrap the assigned value in parseURL()/parseHTML()
                      case 'src':
                      case 'href':
                      case 'background':
                      case 'action':
                      case 'locationassignment':
                      case 'innerHTML':

                          # A second "=" means an equality test, not an assignment
                          if ( ! isset($input[$postCharPos]) || $input[$postCharPos] == '=' ) {
                              break;
                          }

                          # Find the end of this statement
                          $endPos      = analyze_js($input, $postCharPos);
                          $valueLength = $endPos - $postCharPos;

                          $replacement = sprintf('parse%s(%s)', $command == 'innerHTML' ? 'HTML' : 'URL', substr($input, $postCharPos, $valueLength));

                          $input = substr_replace($input, $replacement, $postCharPos, $valueLength);

                          # Re-measure rather than track deltas by hand, then
                          # continue right after the inserted wrapper
                          $length = strlen($input);
                          $offset = $postCharPos + strlen($replacement);
                          continue 2;

                      # Function calls: we cannot know the object these belong to,
                      # so only an extra "gl" argument is appended to the call.
                      case 'open':
                      case 'write':
                      case 'writeln':

                          # Find the closing ")" of the call
                          $endPos = analyze_js($input, $postCharPos);

                          # No arguments at all: appending would produce the
                          # invalid call f(,"gl"), so leave it untouched
                          if ( $endPos == $postCharPos ) {
                              break;
                          }

                          $input  = substr_replace($input, ',"gl"', $endPos, 0);
                          $length = strlen($input);
                          $offset = $endPos + 5;
                          continue 2;

                      # eval(): wrap the whole argument list in parseJS()
                      case 'eval':

                          # Ensure this is eval(), not anotherfunctionendingineval()
                          if ( isset($input[$commandPos - 1]) && preg_match('#[A-Za-z0-9_$]#', $input[$commandPos - 1]) ) {
                              break;
                          }

                          $endPos      = analyze_js($input, $postCharPos);
                          $valueLength = $endPos - $postCharPos;

                          $replacement = sprintf('parseJS(%s)', substr($input, $postCharPos, $valueLength));

                          $input  = substr_replace($input, $replacement, $postCharPos, $valueLength);
                          $length = strlen($input);
                          $offset = $postCharPos + strlen($replacement);
                          continue 2;

                      # location.replace(): $postCharPos is just after the "." so
                      # make sure replace( follows, then wrap its argument list
                      case 'locationreplace()':

                          if ( ! preg_match('#\Greplace\s*\(#', $input, $tmp, 0, $postCharPos) ) {
                              break;
                          }

                          # Move to inside the brackets of .replace()
                          $postCharPos += strlen($tmp[0]);

                          $endPos      = analyze_js($input, $postCharPos);
                          $valueLength = $endPos - $postCharPos;

                          $replacement = sprintf('parseURL(%s)', substr($input, $postCharPos, $valueLength));

                          $input  = substr_replace($input, $replacement, $postCharPos, $valueLength);
                          $length = strlen($input);
                          $offset = $postCharPos + strlen($replacement);
                          continue 2;
                  }
              }

              # Still here? The match didn't validate, so move just past it
              $offset = $commandPos + 1;
          }

          # Ignore document.domain
          $input = str_replace('document.domain', 'ignore', $input);

          return $input;
      }

  }
  418. /*****************************************************************
  419. * HTML callbacks
  420. ******************************************************************/
  421. # Remove and record the <base> href
  # Callback for the <base> tag: stores the href (capture group 2) in the
  # global $base and returns an empty string so the tag is removed.
  function html_stripBase($input) {
      $GLOBALS['base'] = $input[2];

      return '';
  }
  427. # Proxy the location of a meta refresh
  # Callback for meta refresh: proxies the URL of a content="N;url=URL" value.
  #   $input[0] - whole match, $input[1] - outer quote (may be empty),
  #   $input[2] - inner quote around the URL (may be empty), $input[3] - URL
  # Returns the match with only the URL itself replaced.
  function html_metaRefresh($input) {
      $url      = $input[3];
      $proxied  = proxyURL($url);

      # The pattern ends with URL + inner quote + outer quote, so the URL sits
      # at a known distance from the end of the match. Replacing it by position
      # avoids touching other text equal to the URL (e.g. the delay in "5;url=5").
      $urlStart = strlen($input[0]) - strlen($url) - strlen($input[2]) - strlen($input[1]);

      if ( $urlStart >= 0 && substr($input[0], $urlStart, strlen($url)) === $url ) {
          return substr_replace($input[0], $proxied, $urlStart, strlen($url));
      }

      # Unexpected layout: fall back to a plain textual replacement
      return str_replace($url, $proxied, $input[0]);
  }
  431. # Proxy URL in <param name="movie" value="URL">
  # Callback for <param ... value="URL">: proxies the value when the tag
  # mentions "movie" anywhere; any other <param> is returned unchanged.
  #   $input[0] - whole tag, $input[1] - quote (may be empty), $input[2] - value
  function html_paramValue($input) {

      # Check for a name="movie" tag
      if ( stripos($input[0], 'movie') === false ) {
          return $input[0];
      }

      $value   = $input[2];
      $proxied = proxyURL($value);

      # Re-run the same pattern on the tag to learn where the value sits, so
      # only the value is replaced and not equal text elsewhere in the tag
      # (e.g. name="movie" value="movie").
      $pattern = '#^<param[^>]+value\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)[^>]*>#i';

      if ( preg_match($pattern, $input[0], $found, PREG_OFFSET_CAPTURE) && $found[2][0] === $value ) {
          return substr_replace($input[0], $proxied, $found[2][1], strlen($value));
      }

      # Could not locate the value by position: fall back to textual replacement
      return str_replace($value, $proxied, $input[0]);
  }
  439. # Process forms - the query string is used by the proxy script
  440. # and GET data needs to be encoded anyway. We convert all GET
  441. # forms to POST and then the proxy script will forward it properly.
  # Callback for <form>: forces the method to POST, proxies the action and,
  # for multipart forms, encodes the input names.
  #   $input[1] - the attributes of the <form> tag
  #   $input[2] - everything between <form ...> and </form>
  # Forms that were not already POST get a hidden "convertGET" input.
  function html_form($input) {

      $attributes = $input[1];
      $contents   = $input[2];
      $converted  = false;

      if ( ! preg_match('#\bmethod\s*=\s*["\\\']?(get|post)["\\\']?#i', $attributes, $method) ) {

          # No method given and GET is the default, so append a POST method
          $attributes .= ' method="post"';
          $converted   = true;

      } elseif ( strtolower($method[1]) != 'post' ) {

          # Explicit GET: swap it for POST
          $attributes = str_replace($method[0], 'method="post"', $attributes);
          $converted  = true;
      }

      # Flag converted forms with an extra hidden input
      $marker = $converted ? '<input type="hidden" name="convertGET" value="1">' : '';

      # Proxy the form action
      $attributes = preg_replace_callback('#\baction\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)#i', 'html_formAction', $attributes);

      # Multipart forms need their input names encoded
      if ( stripos($attributes, 'multipart/form-data') ) {
          $contents = preg_replace_callback('#name\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)#i', 'html_inputName', $contents);
      }

      return '<form' . $attributes . '>' . $marker . $contents . '</form>';
  }
  472. # Proxy the action="URL" value in forms
  # Callback for a form's action attribute: rebuilds it with the URL
  # ($input[2]) proxied, keeping the original quote character ($input[1]).
  function html_formAction($input) {
      $quote  = $input[1];
      $target = proxyURL($input[2]);

      return 'action=' . $quote . $target . $quote;
  }
  476. # Encode input names
  # Callback for a name attribute inside a multipart form: rebuilds it with
  # the name ($input[2]) passed through inputEncode(), keeping the original
  # quote character ($input[1]).
  function html_inputName($input) {
      $quote   = $input[1];
      $encoded = inputEncode($input[2]);

      return 'name=' . $quote . $encoded . $quote;
  }
  480. # Proxy URL values in attributes
  # Callback for href/src/background attributes: proxies the URL value.
  #   $input[0] - match from "<tag" up to and including the closing quote
  #   $input[2] - quote character (may be empty), $input[3] - URL value
  # Returns the match with only the attribute value replaced.
  function html_attribute($input) {

      # Is this an iframe? (the tag name starts right after "<")
      $flag = stripos($input[0], 'iframe') === 1 ? 'frame' : '';

      $value = $input[3];

      # Do not proxy magnet links
      if ( stripos($value, 'magnet:?') === 0 ) {
          return $input[0];
      }

      # The URL is an attribute value, so it should have been htmlspecialchars()ed.
      # Decode it as a browser would before proxying, then re-encode the result.
      $proxied = htmlspecialchars(proxyURL(htmlspecialchars_decode($value), $flag));

      # The pattern ends with value + quote, so the value sits at a known
      # distance from the end. Replacing by position keeps short values such
      # as href="a" from also rewriting the tag name in <a href="a">.
      $valueStart = strlen($input[0]) - strlen($value) - strlen($input[2]);

      if ( $valueStart >= 0 && substr($input[0], $valueStart, strlen($value)) === $value ) {
          return substr_replace($input[0], $proxied, $valueStart, strlen($value));
      }

      # Unexpected layout: fall back to a plain textual replacement
      return str_replace($value, $proxied, $input[0]);
  }
  492. # Flag frames in a frameset so only the first one shows the mini-form.
  493. # This could be done in the above callback but adds extra processing
  494. # when 99% of the time, it won't be needed.
  # Callback for <frame src="URL"> inside a frameset: leaves the first frame
  # untouched and appends a frame flag to the URL of every later frame.
  #   $input[0] - match ending with the URL and its closing quote
  #   $input[1] - quote character (may be empty), $input[2] - URL
  # The "first frame" state is kept in a static variable, so it lasts for the
  # rest of the request.
  function html_flagFrames($input) {

      static $addFlag;

      # If it's the first frame, leave it but set the flag var
      if ( ! isset($addFlag) ) {
          $addFlag = true;
          return $input[0];
      }

      $url   = $input[2];
      $quote = isset($input[1]) ? $input[1] : '';

      # Add the flag as a query parameter when the URL has a query string
      # (a "?" at offset 0 counts too), otherwise as a path segment
      $newURL = $url . ( strpos($url, '?') !== false ? '&amp;f=frame' : 'fframe/' );

      # The URL sits right before the closing quote at the end of the match;
      # replace it by position so equal text earlier in the tag is untouched
      $urlStart = strlen($input[0]) - strlen($url) - strlen($quote);

      if ( $urlStart >= 0 && substr($input[0], $urlStart, strlen($url)) === $url ) {
          return substr_replace($input[0], $newURL, $urlStart, strlen($url));
      }

      # Unexpected layout: fall back to a plain textual replacement
      return str_replace($url, $newURL, $input[0]);
  }
  506. /*****************************************************************
  507. * CSS callbacks
  508. ******************************************************************/
  509. # Proxy CSS url(LOCATION)
  # Callback for CSS url(LOCATION): proxies the trimmed location ($input[1])
  # and returns it as an unquoted url() value.
  function css_URL($input) {
      $location = trim($input[1]);

      return 'url(' . proxyURL($location) . ')';
  }
  513. # Proxy CSS @import "URL"
  # Callback for CSS @import "URL": proxies the URL ($input[1]) and returns
  # the rule with double quotes.
  function css_import($input) {
      $target = proxyURL($input[1]);

      return '@import "' . $target . '"';
  }
  517. # Proxy CSS src=
  # Callback for CSS src=URL: proxies the URL ($input[2]) and keeps the
  # original quote character ($input[1]).
  function css_src($input) {
      $quote  = $input[1];
      $target = proxyURL($input[2]);

      return 'src=' . $quote . $target . $quote;
  }
  521. # Callbacks for use with unique URLs and cached CSS
  522. # The <UNIQUE[]URL> acts as a marker for quick and easy processing later
  523. # Unique CSS url(LOCATION)
  # Unique-URL variant of the url() callback: instead of proxying, wraps the
  # result of absoluteURL($input[1], '') in a <UNIQUE[...]URL> marker.
  function css_URL_unique($input) {
      $absolute = absoluteURL($input[1], '');

      return 'url(<UNIQUE[' . $absolute . ']URL>)';
  }
  527. # Unique CSS @import "URL"
  # Unique-URL variant of the @import callback: wraps the result of
  # absoluteURL($input[1]) in a <UNIQUE[...]URL> marker.
  function css_import_unique($input) {
      $absolute = absoluteURL($input[1]);

      return '@import "<UNIQUE[' . $absolute . ']URL>"';
  }
  531. # Unique CSS src=
  # Unique-URL variant of the src= callback: wraps the result of
  # absoluteURL($input[2]) in a <UNIQUE[...]URL> marker, keeping the original
  # quote character ($input[1]).
  function css_src_unique($input) {
      $quote    = $input[1];
      $absolute = absoluteURL($input[2]);

      return 'src=' . $quote . '<UNIQUE[' . $absolute . ']URL>' . $quote;
  }
  535. /*****************************************************************
  536. * Helper functions
  537. ******************************************************************/
  538. # Take a string, and check that the next non-whitespace char is the
  539. # passed in char (X). Return false if non-whitespace and non-X char is
  540. # found. Otherwise, return the position of X.
  541. # If $inverse is true, the next non-whitespace char must NOT be in $char
  542. # If $pastChar is true, ignore whitespace after finding X and return
  543. # the position of the last post-X whitespace char.
  # Look at the first non-whitespace character at or after $offset.
  #   - It equals $char:  return its position, or with $pastChar the position
  #     of the first non-whitespace character after it. With $inverse this
  #     case returns false instead.
  #   - It is any other character: return false, or with $inverse its position.
  #   - Only whitespace (or nothing) remains: return false.
  function str_checknext($input, $char, $offset, $inverse = false, $pastChar = false) {

      $whitespace = " \t\r\n";
      $total      = strlen($input);
      $pos        = $offset;

      while ( $pos < $total ) {

          $current = $input[$pos];

          # Skip over whitespace
          if ( strpos($whitespace, $current) !== false ) {
              ++$pos;
              continue;
          }

          # First significant character is not the one we want
          # (loose comparison, as in a switch statement)
          if ( $current != $char ) {
              return $inverse ? $pos : false;
          }

          # It is the wanted character, which $inverse forbids
          if ( $inverse ) {
              return false;
          }

          if ( ! $pastChar ) {
              return $pos;
          }

          # Step over the character and any whitespace following it
          $after = $pos + 1;
          return $after + strspn($input, $whitespace, $after);
      }

      return false;
  }
  579. # Same as above but go backwards
  # Backwards counterpart of str_checknext(): look at the first
  # non-whitespace character at or before $offset.
  #   - It equals $char: return its position (false with $inverse).
  #   - It is any other character: return false (its position with $inverse).
  #   - Nothing but whitespace precedes: return $inverse.
  # A position of 0 is a valid result, so callers must compare the return
  # value with === false.
  function str_checkprev($input, $char, $offset, $inverse = false) {

      # Walk down to and including index 0 (stopping at 1 would never look at
      # the first character of the string)
      for ( $i = $offset; $i >= 0; --$i ) {

          # Positions beyond the end of the string count as a non-matching char
          $current = isset($input[$i]) ? $input[$i] : '';

          # Ignore whitespace
          if ( $current !== '' && strpos(" \t\r\n", $current) !== false ) {
              continue;
          }

          # Found char (loose comparison, as in a switch statement)
          if ( $current == $char ) {
              return $inverse ? false : $i;
          }

          # Found non-$char char
          return $inverse ? $i : false;
      }

      return $inverse;
  }
  600. # Analyze javascript and return offset positions.
  601. # Default is to find the end of the statement, indicated by:
  602. # (1) ; while not in string
  603. # (2) newline which, if not there, would create invalid syntax
  604. # (3) a closing bracket (object, language construct or function call) for which
  605. # no corresponding opening bracket was detected AFTER the passed offset
  606. # If (int) $argPos is true, we return an array of the start and end position
  607. # for the nth argument, where n = $argPos. The $start position must be just inside
  608. # the parenthesis of the function call we're interested in.
  # Scan javascript from $start and return the offset where the statement ends:
  #   (1) a ";" outside a string,
  #   (2) a newline that is not inside brackets/arrays/objects and is not
  #       followed by "(" or "+",
  #   (3) a closing bracket with no matching opening bracket after $start,
  #   or the end of the input when none of these is found.
  # With a truthy (int) $argPos an array($start, $end) for the nth argument is
  # returned instead; $start must then be just inside the call's parenthesis.
  function analyze_js($input, $start, $argPos = false) {

      # Chars we're interested in
      $specialChars = ";\n\r\"'+{}()[]";
      $currentArg   = 1;

      # Add , if looking for an argument position
      if ( $argPos ) {
          $specialChars .= ',';
      }

      $length      = strlen($input);
      $end         = false;
      $openObjects = $openBrackets = $openArrays = 0;
      $i           = $start;

      while ( $end === false && $i < $length ) {

          # Jump straight to the next special char
          $i += strcspn($input, $specialChars, $i);

          if ( $i >= $length ) {
              break;
          }

          $char = $input[$i];

          switch ( $char ) {

              # Starting string delimiters
              case '"':
              case "'":

                  # An escaped quote outside a string: ignore it
                  if ( analyze_js_isEscaped($input, $i) ) {
                      break;
                  }

                  # Find the corresponding end delimiter, skipping escaped ones
                  $close = $i;
                  do {
                      $close = strpos($input, $char, $close + 1);
                  } while ( $close !== false && analyze_js_isEscaped($input, $close) );

                  # Unterminated string: assume the end is the end of the doc
                  if ( $close === false ) {
                      break 2;
                  }

                  $i = $close;
                  break;

              # End of operation
              case ';':
                  $end = $i;
                  break;

              # New lines
              case "\n":
              case "\r":

                  # Newlines are OK within open brackets, arrays or objects
                  # and while looking for an argument
                  if ( $openObjects || $openBrackets || $openArrays || $argPos ) {
                      break;
                  }

                  # Newlines are also OK if followed by an opening bracket or a
                  # concatenation, e.g. someFunc\n(params) or someVar \n + other.
                  # $tmp + 1 is the next non-whitespace char.
                  $tmp = $i + strspn($input, " \t\r\n", $i + 1);

                  if ( isset($input[$tmp + 1]) && ( $input[$tmp + 1] == '(' || $input[$tmp + 1] == '+' ) ) {
                      $i = $tmp;
                      break;
                  }

                  # Newline not indicated as OK, set the end to here
                  $end = $i;
                  break;

              # Concatenation: an expression may continue on the next line after
              # a +, so move past all following whitespace including newlines
              case '+':
                  $i += strspn($input, " \t\r\n", $i + 1);
                  break;

              # Opening chars (objects, parenthesis and arrays)
              case '{':
                  ++$openObjects;
                  break;
              case '(':
                  ++$openBrackets;
                  break;
              case '[':
                  ++$openArrays;
                  break;

              # Closing chars: reduce the open count, or end the statement when
              # there is no corresponding open char
              case '}':
                  if ( $openObjects ) {
                      --$openObjects;
                  } else {
                      $end = $i;
                  }
                  break;
              case ')':
                  if ( $openBrackets ) {
                      --$openBrackets;
                  } else {
                      $end = $i;
                  }
                  break;
              case ']':
                  if ( $openArrays ) {
                      --$openArrays;
                  } else {
                      $end = $i;
                  }
                  break;

              # Commas (only searched for with $argPos) separate arguments
              case ',':

                  # Ignore commas inside other functions or whatnot
                  if ( $openObjects || $openBrackets || $openArrays ) {
                      break;
                  }

                  # This comma closes the wanted argument
                  if ( $currentArg == $argPos ) {
                      $end = $i;
                  }

                  ++$currentArg;

                  # This comma opens the wanted argument
                  if ( $currentArg == $argPos ) {
                      $start = $i + 1;
                  }
                  break;
          }

          ++$i;
      }

      # End not found? Use end of document
      if ( $end === false ) {
          $end = $length;
      }

      # Return array of start/end
      if ( $argPos ) {
          return array($start, $end);
      }

      return $end;
  }

  # True when the char at $pos is preceded by an odd number of backslashes,
  # i.e. the backslash directly before it is not itself escaped.
  function analyze_js_isEscaped($input, $pos) {
      $slashes = 0;

      for ( $p = $pos - 1; $p >= 0 && $input[$p] === '\\'; --$p ) {
          ++$slashes;
      }

      return $slashes % 2 === 1;
  }
  717. /*****************************************************************
  718. * Page encoding functions
  719. ******************************************************************/
  720. # Encode page - splits into HTML/script sections and encodes HTML
  # Encode a page: <script>...</script> blocks are copied through unchanged
  # and every stretch of markup between them is passed to encodeBlock().
  # When a <head> tag exists, everything up to and including it is also left
  # unencoded (encoding only starts after the <head>).
  function encodePage($input) {

      # No scripts is easy - just encode the lot
      if ( ! preg_match_all('#<script.*?</script>#is', $input, $scripts, PREG_OFFSET_CAPTURE) ) {
          return encodeBlock($input);
      }

      # Starting offset: the first char after the <head> tag, if there is one
      $offset = preg_match('#<head[^>]*>(.)#is', $input, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[1][1] : 0;
      $new    = $offset ? substr($input, 0, $offset) : '';

      foreach ( $scripts[0] as $match ) {

          $scriptStart = $match[1];
          $scriptEnd   = $scriptStart + strlen($match[0]);

          # Script lies entirely in the part already copied verbatim
          if ( $scriptEnd <= $offset ) {
              continue;
          }

          # Script began before the offset: copy only its remaining part
          if ( $scriptStart < $offset ) {
              $new   .= substr($input, $offset, $scriptEnd - $offset);
              $offset = $scriptEnd;
              continue;
          }

          # Encode the markup between the offset and this script, up to but
          # excluding the "<" of the script tag
          if ( $scriptStart > $offset ) {
              $new .= encodeBlock(substr($input, $offset, $scriptStart - $offset));
          }

          # Add unencoded script to page and move the offset past it
          $new   .= $match[0];
          $offset = $scriptEnd;
      }

      # Add final block (compare with '' so a remainder of "0" is kept)
      $remainder = (string) substr($input, $offset);

      if ( $remainder !== '' ) {
          $new .= encodeBlock($remainder);
      }

      return $new;
  }
  755. # Encode block - applies the actual encoding (or rather "escaping")
  # Encode block - escapes the input as %XX sequences and returns a script
  # that writes it back out with document.write(unescape('...')).
  # Escaped: ASCII letters, single quote, CR, LF, "-", and also "%" and "\".
  # All other bytes are passed through unchanged.
  function encodeBlock($input) {

      # "%" must be escaped or literal sequences such as %20 in the page would
      # be decoded by unescape(); "\" must be escaped or the javascript string
      # literal would treat it as the start of an escape sequence.
      $map = array(
          '%'  => '%25',
          '\\' => '%5c',
          '\'' => '%27',
          "\r" => '%0d',
          "\n" => '%0a',
          '-'  => '%2D',
      );

      # Letters become %61..%7a and %41..%5a
      foreach ( array_merge(range('a', 'z'), range('A', 'Z')) as $letter ) {
          $map[$letter] = '%' . dechex(ord($letter));
      }

      # strtr() with an array makes a single pass, so the inserted escape
      # sequences are never escaped again
      return '<script type="text/javascript">document.write(unescape(\'' . strtr((string) $input, $map) . '\'));</script>';
  }