PageRenderTime 607ms CodeModel.GetById 31ms RepoModel.GetById 5ms app.codeStats 0ms

/repository/url/locallib.php

https://bitbucket.org/synergylearning/campusconnect
PHP | 756 lines | 419 code | 52 blank | 285 comment | 107 complexity | b866653cd345278a81d942c74e6a9915 MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, LGPL-3.0, GPL-3.0, LGPL-2.1, Apache-2.0, BSD-3-Clause, AGPL-3.0
  1. <?php
  2. /**
  3. * Copyright (c) 2008, David R. Nadeau, NadeauSoftware.com.
  4. * All rights reserved.
  5. *
  6. * Redistribution and use in source and binary forms, with or without
  7. * modification, are permitted provided that the following conditions
  8. * are met:
  9. *
  10. * * Redistributions of source code must retain the above copyright
  11. * notice, this list of conditions and the following disclaimer.
  12. *
  13. * * Redistributions in binary form must reproduce the above
  14. * copyright notice, this list of conditions and the following
  15. * disclaimer in the documentation and/or other materials provided
  16. * with the distribution.
  17. *
  18. * * Neither the names of David R. Nadeau or NadeauSoftware.com, nor
  19. * the names of its contributors may be used to endorse or promote
  20. * products derived from this software without specific prior
  21. * written permission.
  22. *
  23. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  24. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  25. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  26. * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  27. * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  28. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  29. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  30. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  31. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  32. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  33. * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
  34. * OF SUCH DAMAGE.
  35. */
  36. /*
  37. * This is a BSD License approved by the Open Source Initiative (OSI).
  38. * See: http://www.opensource.org/licenses/bsd-license.php
  39. */
  40. defined('MOODLE_INTERNAL') || die();
  /**
   * Resolve a relative URL against an absolute base URL.
   *
   * The base URL is typically the address of a page and the relative URL
   * a link found on that page. The steps follow the reference-resolution
   * ("absolutize") procedure of RFC3986.
   *
   * Parameters:
   *   baseUrl      the absolute base URL (it must have a scheme and a host).
   *
   *   relativeUrl  the URL to resolve; it may already be absolute.
   *
   * Return values:
   *   The resolved absolute URL as a string, or FALSE when either URL
   *   cannot be parsed by split_url() or the base URL lacks a scheme
   *   or a host.
   */
  function url_to_absolute( $baseUrl, $relativeUrl )
  {
      $rel = split_url( $relativeUrl );
      if ( $rel === FALSE ) {
          return FALSE;
      }

      // A reference carrying its own scheme is already absolute; only a
      // rooted path gets its dot segments cleaned before re-assembly.
      if ( !empty( $rel['scheme'] ) ) {
          $rooted = !empty( $rel['path'] ) && $rel['path'][0] == '/';
          if ( $rooted ) {
              $rel['path'] = url_remove_dot_segments( $rel['path'] );
          }
          return join_url( $rel );
      }

      // Everything below borrows from the base, so it has to be absolute.
      $base = split_url( $baseUrl );
      $baseIsAbsolute = $base !== FALSE
          && !empty( $base['scheme'] )
          && !empty( $base['host'] );
      if ( !$baseIsAbsolute ) {
          return FALSE;
      }

      $rel['scheme'] = $base['scheme'];
      if ( empty( $base['path'] ) ) {
          $base['path'] = '';
      }

      // "//host/..." style reference: keep its own authority and path.
      if ( isset( $rel['host'] ) ) {
          if ( !empty( $rel['path'] ) ) {
              $rel['path'] = url_remove_dot_segments( $rel['path'] );
          }
          return join_url( $rel );
      }

      // No authority of its own: drop any leftovers and take the base's.
      unset( $rel['port'], $rel['user'], $rel['pass'] );
      $rel['host'] = $base['host'];
      foreach ( array( 'port', 'user', 'pass' ) as $key ) {
          if ( isset( $base[$key] ) ) {
              $rel[$key] = $base[$key];
          }
      }

      // Without a path the reference points at the base document itself;
      // the base query is inherited only when the reference has none.
      if ( empty( $rel['path'] ) ) {
          if ( !empty( $base['path'] ) ) {
              $rel['path'] = $base['path'];
          }
          if ( !isset( $rel['query'] ) && isset( $base['query'] ) ) {
              $rel['query'] = $base['query'];
          }
          return join_url( $rel );
      }

      // A path that is not rooted is appended to the base path's directory.
      if ( $rel['path'][0] != '/' ) {
          // Presumably yields the part of the base path before its last '/'
          // (strrchr with "before needle" semantics) - confirm in core_text.
          $directory = core_text::strrchr( $base['path'], '/', TRUE );
          if ( $directory === FALSE ) {
              $directory = '';
          }
          $rel['path'] = $directory . '/' . $rel['path'];
      }

      $rel['path'] = url_remove_dot_segments( $rel['path'] );
      return join_url( $rel );
  }
  /**
   * Filter out "." and ".." segments from a URL's path and return
   * the result.
   *
   * This follows the "remove_dot_segments" procedure of RFC3986, with one
   * deliberate extra: runs of consecutive slashes are collapsed into one.
   *
   * The path is handled byte-wise. That is safe for UTF-8 text because the
   * "/" byte never occurs inside a multi-byte sequence.
   *
   * Parameters:
   *   path  the path to filter.
   *
   * Return values:
   *   The filtered path. A leading "/" is preserved. A trailing "/" is
   *   kept when the input ended with "/", and added when the input ended
   *   with a "." or ".." segment (RFC3986: "/a/b/." becomes "/a/b/" and
   *   "/a/b/.." becomes "/a/"). An empty path yields an empty string.
   */
  function url_remove_dot_segments( $path )
  {
      $path = (string) $path;
      if ( $path === '' ) {
          // Nothing to filter; also avoids indexing into an empty string.
          return '';
      }

      $inSegs = explode( '/', $path );
      $outSegs = array( );
      foreach ( $inSegs as $seg )
      {
          if ( $seg === '' || $seg === '.' )
              continue;
          if ( $seg === '..' )
              array_pop( $outSegs );
          else
              $outSegs[] = $seg;
      }

      $outPath = implode( '/', $outSegs );
      if ( $path[0] === '/' ) {
          $outPath = '/' . $outPath;
      }

      // The final input segment decides whether the result names a
      // "directory": an empty segment means the path ended with '/',
      // and a terminal dot segment refers to a directory as well.
      $last = $inSegs[count( $inSegs ) - 1];
      $endsWithSlash = ( $last === '' );
      $endsWithDot = ( $last === '.' || $last === '..' );
      if ( $outPath !== '/' &&
          ( $endsWithSlash || ( $endsWithDot && $outPath !== '' ) ) ) {
          $outPath .= '/';
      }
      return $outPath;
  }
  /**
   * Parse an absolute or relative URL and split it into its components.
   *
   * RFC3986 describes a URI reference as
   *
   *   URI          = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
   *   relative-ref = relative-part [ "?" query ] [ "#" fragment ]
   *   authority    = [ userinfo "@" ] host [ ":" port ]
   *
   * where the hier-part / relative-part is either "//" authority followed
   * by a path, or a path on its own. The components are therefore:
   *
   *   scheme     how to interpret the rest of the URL ("http", "mailto").
   *   authority  optional user name, password, host and port, such as
   *              "user:password@example.com:80". The host may be a name,
   *              an IPv4 address, or a bracketed IPv6 address.
   *   path       the hierarchical path, such as "/scripts/page.php".
   *   query      the text after "?".
   *   fragment   the text after "#".
   *
   * The result is an associative array that contains only the components
   * present in the URL, under these keys:
   *
   *   "scheme"    The scheme, lower-cased.
   *   "host"      The host name, IPv4, or IPv6 address (without
   *               brackets). An empty string when the URL has "//"
   *               but no host.
   *   "port"      The port number.
   *   "user"      The user name.
   *   "pass"      The user password.
   *   "path"      The path.
   *   "query"     The query, without the leading "?".
   *   "fragment"  The fragment, without the leading "#".
   *
   * Parameters:
   *   url     the URL to parse.
   *
   *   decode  an optional boolean flag selecting whether percent-encoded
   *           characters in "user", "pass", "host" (host names only),
   *           "path", "query" and "fragment" are decoded after parsing.
   *           "scheme" and "port" are never decoded. Default = FALSE.
   *
   * Return values:
   *   the associative array of URL parts (empty when the URL has no
   *   recognisable component, e.g. the empty string), or FALSE if the
   *   URL is too malformed to be parsed.
   */
  function split_url( $url, $decode=FALSE)
  {
      // Character sets from RFC3986.
      $xunressub = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
      $xpchar = $xunressub . ':@% ';
      // Scheme from RFC3986. The hyphen is escaped and placed last so that
      // "+-." is not read as a range (which would also admit ",").
      $xscheme = '([a-zA-Z][a-zA-Z\d+.\-]*)';
      // User info (user + password) from RFC3986.
      $xuserinfo = '(([' . $xunressub . '%]*)' .
          '(:([' . $xunressub . ':%]*))?)';
      // IPv4 from RFC3986 (without digit constraints).
      $xipv4 = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';
      // IPv6 from RFC2732 (without digit and grouping constraints).
      $xipv6 = '(\[([a-fA-F\d.:]+)\])';
      // Host name from RFC1035. Technically, must start with a letter.
      // Relax that restriction to better parse URL structure, then
      // leave host name validation to application.
      // The hyphen must be escaped: PCRE2 (PHP 7.3 and later) refuses to
      // compile an unescaped "-" that directly follows "\d" inside a class,
      // which would make every call return FALSE.
      $xhost_name = '([a-zA-Z\d\-.%]+)';
      // Authority from RFC3986. Skip IP future.
      $xhost = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
      $xport = '(\d*)';
      $xauthority = '((' . $xuserinfo . '@)?' . $xhost .
          '?(:' . $xport . ')?)';
      // Path from RFC3986. Blend absolute & relative for efficiency.
      $xslash_seg = '(/[' . $xpchar . ']*)';
      $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
      $xpath_rel = '([' . $xpchar . ']+' . $xslash_seg . '*)';
      $xpath_abs = '(/(' . $xpath_rel . ')?)';
      $xapath = '(' . $xpath_authabs . '|' . $xpath_abs .
          '|' . $xpath_rel . ')';
      // Query and fragment from RFC3986.
      $xqueryfrag = '([' . $xpchar . '/?' . ']*)';
      // URL.
      $xurl = '^(' . $xscheme . ':)?' . $xapath . '?' .
          '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';

      // Split the URL into components.
      if ( !preg_match( '!' . $xurl . '!', $url, $m ) )
          return FALSE;

      // preg_match() omits trailing groups that did not take part in the
      // match. Pad all 30 groups with '' so each can be tested with a
      // strict comparison; unlike empty(), that keeps a component whose
      // text is "0".
      $m += array_fill( 0, 31, '' );

      $parts = array( );
      $hostIsName = FALSE;

      if ( $m[2] !== '' )
          $parts['scheme'] = strtolower( $m[2] );

      // Group 7 is "userinfo@"; the user name itself (group 9) may be empty.
      if ( $m[7] !== '' )
          $parts['user'] = $m[9];
      // Group 10 is ":password", group 11 the password alone.
      if ( $m[10] !== '' )
          $parts['pass'] = $m[11];

      if ( $m[13] !== '' ) {
          // Matched the host-name alternative.
          $parts['host'] = $m[13];
          $hostIsName = TRUE;
      }
      else if ( $m[14] !== '' )
          $parts['host'] = $m[14];        // IPv4 alternative
      else if ( $m[16] !== '' )
          $parts['host'] = $m[16];        // IPv6, brackets stripped
      else if ( $m[5] !== '' )
          $parts['host'] = '';            // "//" present but no host

      // Group 17 is ":port", group 18 the digits alone.
      if ( $m[17] !== '' )
          $parts['port'] = $m[18];

      // One group per path alternative: after an authority, absolute,
      // or relative.
      if ( $m[19] !== '' )
          $parts['path'] = $m[19];
      else if ( $m[21] !== '' )
          $parts['path'] = $m[21];
      else if ( $m[25] !== '' )
          $parts['path'] = $m[25];

      if ( $m[27] !== '' )
          $parts['query'] = $m[28];
      if ( $m[29] !== '' )
          $parts['fragment'] = $m[30];

      if ( !$decode )
          return $parts;

      // Decode percent-encoding. The host is decoded only when it was
      // matched as a host name.
      foreach ( array( 'user', 'pass', 'path', 'query', 'fragment' ) as $key )
      {
          if ( isset( $parts[$key] ) )
              $parts[$key] = rawurldecode( $parts[$key] );
      }
      if ( $hostIsName )
          $parts['host'] = rawurldecode( $parts['host'] );
      return $parts;
  }
  /**
   * Join URL components into a complete URL.
   *
   * This is the counterpart of split_url() and follows the "component
   * recomposition" procedure of RFC3986.
   *
   * The $parts argument is an associative array containing zero or
   * more of the following:
   *
   *   "scheme"    The scheme, such as "http".
   *   "host"      The host name, IPv4, or IPv6 address.
   *   "port"      The port number.
   *   "user"      The user name.
   *   "pass"      The user password.
   *   "path"      The path, such as a file path for "http".
   *   "query"     The query.
   *   "fragment"  The fragment.
   *
   * "port", "user" and "pass" are only used when "host" is present, and
   * "pass" only when "user" is present. A host that looks like an
   * unbracketed IPv6 address is wrapped in "[" and "]". When a host is
   * present and the path does not start with "/", a "/" is inserted
   * between the authority and the path.
   *
   * Parameters:
   *   parts   an associative array of strings containing the
   *           individual parts of a URL.
   *
   *   encode  an optional boolean flag selecting whether to percent-encode
   *           "user", "pass", "host" (unless it looks like an IP address),
   *           "path" (slashes are left alone), "query" and "fragment".
   *           "scheme" and "port" are never encoded. Note that the query
   *           and fragment are encoded as a whole, including any "=" and
   *           "&". Default = FALSE.
   *
   * Return values:
   *   Returns the assembled URL string. The string is an absolute
   *   URL if a scheme is supplied, and a relative URL if not. An
   *   empty string is returned if the $parts array does not contain
   *   any of the needed values.
   */
  function join_url( $parts, $encode=FALSE)
  {
      if ( $encode )
      {
          if ( isset( $parts['user'] ) )
              $parts['user'] = rawurlencode( $parts['user'] );
          if ( isset( $parts['pass'] ) )
              $parts['pass'] = rawurlencode( $parts['pass'] );
          // Leave IP-like hosts (bracketed or bare runs of hex digits,
          // dots and colons) untouched. Both alternatives are anchored
          // at both ends, so a host name that merely ends in such
          // characters is still encoded.
          if ( isset( $parts['host'] ) &&
              !preg_match( '!^(\[[\da-f.:]+\]|[\da-f.:]+)$!ui', $parts['host'] ) )
              $parts['host'] = rawurlencode( $parts['host'] );
          // Encode the path, then restore the segment separators.
          if ( !empty( $parts['path'] ) )
              $parts['path'] = preg_replace( '!%2F!ui', '/',
                  rawurlencode( $parts['path'] ) );
          if ( isset( $parts['query'] ) )
              $parts['query'] = rawurlencode( $parts['query'] );
          if ( isset( $parts['fragment'] ) )
              $parts['fragment'] = rawurlencode( $parts['fragment'] );
      }

      $url = '';
      if ( !empty( $parts['scheme'] ) )
          $url .= $parts['scheme'] . ':';

      if ( isset( $parts['host'] ) )
      {
          $url .= '//';
          if ( isset( $parts['user'] ) )
          {
              $url .= $parts['user'];
              if ( isset( $parts['pass'] ) )
                  $url .= ':' . $parts['pass'];
              $url .= '@';
          }
          if ( preg_match( '!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'] ) )
              $url .= '[' . $parts['host'] . ']'; // IPv6
          else
              $url .= $parts['host'];             // IPv4 or name
          if ( isset( $parts['port'] ) )
              $url .= ':' . $parts['port'];
          // An authority must be separated from a non-rooted path.
          if ( !empty( $parts['path'] ) && $parts['path'][0] != '/' )
              $url .= '/';
      }

      if ( !empty( $parts['path'] ) )
          $url .= $parts['path'];
      if ( isset( $parts['query'] ) )
          $url .= '?' . $parts['query'];
      if ( isset( $parts['fragment'] ) )
          $url .= '#' . $parts['fragment'];
      return $url;
  }
  /**
   * Percent-encode the characters of a URL that are not allowed to
   * appear in one, leaving its structure intact.
   *
   * The whole string is first run through rawurlencode(); the encodings
   * of the RFC3986 reserved characters and of "%" itself are then turned
   * back into the literal characters. Because "%" is restored, sequences
   * that were already encoded stay as they are: "%20" does not become
   * "%2520".
   *
   * Parameters:
   *   url  the url to encode.
   *
   * Return values:
   *   Returns the encoded URL string.
   */
  function encode_url($url) {
      // Literal character => pattern matching its percent-encoded form.
      // "%" comes last so that restoring it cannot create a sequence
      // that one of the earlier patterns would then rewrite.
      $keepLiteral = array(
          ":" => '!%3A!ui',
          "/" => '!%2F!ui',
          "?" => '!%3F!ui',
          "#" => '!%23!ui',
          "[" => '!%5B!ui',
          "]" => '!%5D!ui',
          "@" => '!%40!ui',
          "!" => '!%21!ui',
          "$" => '!%24!ui',
          "&" => '!%26!ui',
          "'" => '!%27!ui',
          "(" => '!%28!ui',
          ")" => '!%29!ui',
          "*" => '!%2A!ui',
          "+" => '!%2B!ui',
          "," => '!%2C!ui',
          ";" => '!%3B!ui',
          "=" => '!%3D!ui',
          "%" => '!%25!ui',
      );
      $encodedForms = array_values($keepLiteral);
      $literals = array_keys($keepLiteral);

      return preg_replace($encodedForms, $literals, rawurlencode($url));
  }
  /**
   * Extract URLs from a web page.
   *
   * URLs are taken from a fixed list of tag/attribute pairs drawn from
   * HTML 2.0, HTML 3.2, HTML 4.01, draft HTML 5.0, common HTML extensions,
   * the draft Forms 2.0 specification, XHTML, and WML 1.3 and 2.0. URLs are
   * also taken from <meta http-equiv> elements, from style attributes and
   * from <style> bodies (the last two via extract_css_urls()).
   *
   * Attribute values may be double-quoted, single-quoted, or bare. The
   * "archive" attribute holds a list and is split into individual URLs:
   * on whitespace for <object>, on commas for <applet>.
   *
   * The page is scanned with regular expressions rather than an HTML
   * parser, so unusual markup may be missed or misread.
   *
   * Parameters:
   *   text  the UTF-8 text to scan
   *
   * Return values:
   *   an associative array where keys are tag names (such as "a" or
   *   "img"; plus "meta" and "style") and values are associative arrays
   *   where keys are attribute names (for "style": the keys returned by
   *   extract_css_urls()) and values are arrays of URLs. An empty array
   *   is returned when nothing is found.
   *
   * See:
   *   http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_web_page
   */
  function extract_html_urls( $text )
  {
      $match_elements = array(
          // HTML
          array('element'=>'a', 'attribute'=>'href'), // 2.0
          array('element'=>'a', 'attribute'=>'urn'), // 2.0
          array('element'=>'base', 'attribute'=>'href'), // 2.0
          array('element'=>'form', 'attribute'=>'action'), // 2.0
          array('element'=>'img', 'attribute'=>'src'), // 2.0
          array('element'=>'link', 'attribute'=>'href'), // 2.0
          array('element'=>'applet', 'attribute'=>'code'), // 3.2
          array('element'=>'applet', 'attribute'=>'codebase'), // 3.2
          array('element'=>'area', 'attribute'=>'href'), // 3.2
          array('element'=>'body', 'attribute'=>'background'), // 3.2
          array('element'=>'img', 'attribute'=>'usemap'), // 3.2
          array('element'=>'input', 'attribute'=>'src'), // 3.2
          array('element'=>'applet', 'attribute'=>'archive'), // 4.01
          array('element'=>'applet', 'attribute'=>'object'), // 4.01
          array('element'=>'blockquote', 'attribute'=>'cite'), // 4.01
          array('element'=>'del', 'attribute'=>'cite'), // 4.01
          array('element'=>'frame', 'attribute'=>'longdesc'), // 4.01
          array('element'=>'frame', 'attribute'=>'src'), // 4.01
          array('element'=>'head', 'attribute'=>'profile'), // 4.01
          array('element'=>'iframe', 'attribute'=>'longdesc'), // 4.01
          array('element'=>'iframe', 'attribute'=>'src'), // 4.01
          array('element'=>'img', 'attribute'=>'longdesc'), // 4.01
          array('element'=>'input', 'attribute'=>'usemap'), // 4.01
          array('element'=>'ins', 'attribute'=>'cite'), // 4.01
          array('element'=>'object', 'attribute'=>'archive'), // 4.01
          array('element'=>'object', 'attribute'=>'classid'), // 4.01
          array('element'=>'object', 'attribute'=>'codebase'), // 4.01
          array('element'=>'object', 'attribute'=>'data'), // 4.01
          array('element'=>'object', 'attribute'=>'usemap'), // 4.01
          array('element'=>'q', 'attribute'=>'cite'), // 4.01
          array('element'=>'script', 'attribute'=>'src'), // 4.01
          array('element'=>'audio', 'attribute'=>'src'), // 5.0
          array('element'=>'command', 'attribute'=>'icon'), // 5.0
          array('element'=>'embed', 'attribute'=>'src'), // 5.0
          array('element'=>'event-source','attribute'=>'src'), // 5.0
          array('element'=>'html', 'attribute'=>'manifest'), // 5.0
          array('element'=>'source', 'attribute'=>'src'), // 5.0
          array('element'=>'video', 'attribute'=>'src'), // 5.0
          array('element'=>'video', 'attribute'=>'poster'), // 5.0
          array('element'=>'bgsound', 'attribute'=>'src'), // Extension
          array('element'=>'body', 'attribute'=>'credits'), // Extension
          array('element'=>'body', 'attribute'=>'instructions'), // Extension
          array('element'=>'body', 'attribute'=>'logo'), // Extension
          array('element'=>'div', 'attribute'=>'href'), // Extension
          array('element'=>'div', 'attribute'=>'src'), // Extension
          array('element'=>'embed', 'attribute'=>'code'), // Extension
          array('element'=>'embed', 'attribute'=>'pluginspage'), // Extension
          array('element'=>'html', 'attribute'=>'background'), // Extension
          array('element'=>'ilayer', 'attribute'=>'src'), // Extension
          array('element'=>'img', 'attribute'=>'dynsrc'), // Extension
          array('element'=>'img', 'attribute'=>'lowsrc'), // Extension
          array('element'=>'input', 'attribute'=>'dynsrc'), // Extension
          array('element'=>'input', 'attribute'=>'lowsrc'), // Extension
          array('element'=>'table', 'attribute'=>'background'), // Extension
          array('element'=>'td', 'attribute'=>'background'), // Extension
          array('element'=>'th', 'attribute'=>'background'), // Extension
          array('element'=>'layer', 'attribute'=>'src'), // Extension
          array('element'=>'xml', 'attribute'=>'src'), // Extension
          array('element'=>'button', 'attribute'=>'action'), // Forms 2.0
          array('element'=>'datalist', 'attribute'=>'data'), // Forms 2.0
          array('element'=>'form', 'attribute'=>'data'), // Forms 2.0
          array('element'=>'input', 'attribute'=>'action'), // Forms 2.0
          array('element'=>'select', 'attribute'=>'data'), // Forms 2.0
          // XHTML
          array('element'=>'html', 'attribute'=>'xmlns'),
          // WML
          array('element'=>'access', 'attribute'=>'path'), // 1.3
          array('element'=>'card', 'attribute'=>'onenterforward'), // 1.3
          array('element'=>'card', 'attribute'=>'onenterbackward'),// 1.3
          array('element'=>'card', 'attribute'=>'ontimer'), // 1.3
          array('element'=>'go', 'attribute'=>'href'), // 1.3
          array('element'=>'option', 'attribute'=>'onpick'), // 1.3
          array('element'=>'template', 'attribute'=>'onenterforward'), // 1.3
          array('element'=>'template', 'attribute'=>'onenterbackward'),// 1.3
          array('element'=>'template', 'attribute'=>'ontimer'), // 1.3
          array('element'=>'wml', 'attribute'=>'xmlns'), // 2.0
      );
      $match_metas = array(
          'content-base',
          'content-location',
          'referer',
          'location',
          'refresh',
      );

      $urls = array( );

      // Collect the text between "<" and ">" of every opening tag.
      if ( !preg_match_all( '/<([a-z][^>]*)>/iu', $text, $matches ) )
          return $urls;
      $elements = $matches[1];

      // Attribute value in one of three forms, captured in groups 1-3:
      // "double-quoted", 'single-quoted', or a bare token.
      $value_pattern = '=(?:"([^"]*)"|\'([^\']*)\'|([^\s]*))';

      // Match elements and attributes
      foreach ( $match_elements as $match_element )
      {
          $name = $match_element['element'];
          $attr = $match_element['attribute'];
          // The attribute name must follow whitespace, so that "src" does
          // not match inside "lowsrc". The "s" flag lets the scan cross
          // line breaks inside a tag.
          $pattern = '/^' . preg_quote( $name, '/' ) . '\s(?:.*\s)?' .
              preg_quote( $attr, '/' ) . $value_pattern . '/isu';

          // "archive" is the only list-valued attribute in the table.
          $split_pattern = NULL;
          if ( $attr == 'archive' )
          {
              if ( $name == 'object' )
                  $split_pattern = '/\s+/u';      // Space-separated URL list
              else
                  $split_pattern = '/\s*,\s*/u';  // Comma-separated URL list
          }

          foreach ( $elements as $element )
          {
              $value = extract_html_attribute_value( $pattern, $element );
              if ( $value === NULL )
                  continue;
              if ( $split_pattern === NULL )
              {
                  $urls[$name][$attr][] = $value; // Single URL
                  continue;
              }
              $list = preg_split( $split_pattern, $value, -1, PREG_SPLIT_NO_EMPTY );
              if ( $list === FALSE )
                  continue;
              foreach ( $list as $item )
                  $urls[$name][$attr][] = $item;
          }
      }

      // Match meta http-equiv elements
      $content_pattern = '/\scontent' . $value_pattern . '/iu';
      // "refresh" content looks like "5; url=http://..."; keep the URL.
      $refresh_pattern = '/\d*;\s*(url=)?(.*)$/iu';
      foreach ( $match_metas as $match_meta )
      {
          $attr_pattern = '/http-equiv=["\']?' . $match_meta . '["\']?/iu';
          foreach ( $elements as $element )
          {
              if ( !preg_match( '/^meta/iu', $element ) ||
                  !preg_match( $attr_pattern, $element ) )
                  continue;
              $value = extract_html_attribute_value( $content_pattern, $element );
              if ( $value === NULL )
                  continue;
              if ( $match_meta != 'refresh' )
                  $urls['meta']['http-equiv'][] = $value;
              else if ( preg_match( $refresh_pattern, $value, $refresh ) )
                  $urls['meta']['http-equiv'][] =
                      isset( $refresh[2] ) ? $refresh[2] : '';
          }
      }

      // Match style attributes
      $urls['style'] = array( );
      $style_pattern = '/\sstyle' . $value_pattern . '/iu';
      foreach ( $elements as $element )
      {
          $value = extract_html_attribute_value( $style_pattern, $element );
          if ( $value === NULL )
              continue;
          $style_urls = extract_css_urls( $value );
          if ( !empty( $style_urls ) )
              $urls['style'] = array_merge_recursive(
                  $urls['style'], $style_urls );
      }

      // Match style bodies
      if ( preg_match_all( '/<style[^>]*>(.*?)<\/style>/siu', $text, $style_bodies ) )
      {
          foreach ( $style_bodies[1] as $style_body )
          {
              $style_urls = extract_css_urls( $style_body );
              if ( !empty( $style_urls ) )
                  $urls['style'] = array_merge_recursive(
                      $urls['style'], $style_urls );
          }
      }
      if ( empty($urls['style']) )
          unset( $urls['style'] );
      return $urls;
  }

  /**
   * Helper for extract_html_urls(): read one attribute value from the
   * text of a tag.
   *
   * Parameters:
   *   pattern  a regular expression whose only capture groups are the
   *            three value forms (double-quoted, single-quoted, bare),
   *            in groups 1 to 3.
   *
   *   element  the text of a tag, without the surrounding "<" and ">".
   *
   * Return values:
   *   The attribute value without its quotes, an empty string when the
   *   attribute is present but empty, or NULL when the pattern does not
   *   match.
   */
  function extract_html_attribute_value( $pattern, $element )
  {
      if ( !preg_match( $pattern, $element, $match ) )
          return NULL;
      // At most one of the three groups holds text; trailing groups that
      // did not take part in the match are absent from $match.
      for ( $group = 1; $group <= 3; $group++ )
      {
          if ( isset( $match[$group] ) && $match[$group] !== '' )
              return $match[$group];
      }
      return '';
  }
  /**
   * Extract URLs from UTF-8 CSS text.
   *
   * URLs within @import statements and url() property functions are
   * extracted and returned in an associative array of arrays. Array keys
   * indicate the use context for the URL:
   *
   *   "import"    URLs from @import "...", @import '...' and
   *               @import url(...). All quoted-string imports are listed
   *               before the url() imports.
   *   "property"  URLs from any other url(...).
   *
   * A key is present only when at least one URL was found for it.
   * Backslash escapes inside a URL are resolved ("\)" becomes ")").
   *
   * A URL is a run of characters other than backslash, quotes, comma,
   * space and parentheses, optionally mixed with backslash escapes.
   *
   * Parameters:
   *   text  the UTF-8 text to scan
   *
   * Return values:
   *   an associative array of arrays of URLs.
   *
   * See:
   *   http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_css_file
   */
  function extract_css_urls( $text )
  {
      $urls = array( );

      // Plain characters or backslash escapes. The possessive quantifiers
      // stop the engine from re-splitting an already scanned URL when the
      // closing quote or ")" is missing; otherwise a single bad url()
      // could exhaust the backtrack limit and make the whole scan fail.
      // Nothing allowed after a URL can also appear inside one, so the
      // accepted matches stay the same.
      $url_pattern = '((?:[^\\\\\'", \(\)]++|\\\\.)*+)';
      $urlfunc_pattern = 'url\(\s*[\'"]?' . $url_pattern . '[\'"]?\s*\)';
      $pattern = '/(' .
          '(@import\s*[\'"]' . $url_pattern . '[\'"])' .
          '|(@import\s*' . $urlfunc_pattern . ')' .
          '|(' . $urlfunc_pattern . ')' . ')/iu';
      if ( !preg_match_all( $pattern, $text, $matches ) )
          return $urls;

      // Capture group holding the URL for each alternative, in order:
      //   3: @import '...' / @import "..."
      //   5: @import url(...)
      //   7: url(...)
      $contexts = array( 3 => 'import', 5 => 'import', 7 => 'property' );
      foreach ( $contexts as $group => $context )
      {
          foreach ( $matches[$group] as $match )
          {
              // Unused alternatives yield ''; a strict test keeps "0".
              if ( $match !== '' )
                  $urls[$context][] =
                      preg_replace( '/\\\\(.)/u', '\\1', $match );
          }
      }
      return $urls;
  }