PageRenderTime 38ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 0ms

/part2/part2admin/common/FCKeditor/editor/filemanager/connectors/php/sanitizer.php

http://part2web.googlecode.com/
PHP | 486 lines | 364 code | 31 blank | 91 comment | 35 complexity | 41ce91f3ea9b1a2dd37db5dc34c33917 MD5 | raw file
  1. <?php
  /**
   * Regular expression matching the character-reference forms handled by
   * Sanitizer::decodeCharReferences. Capture groups:
   *   1 - named entity                                   (&name;)
   *   2 - decimal numeric reference                      (&#123;)
   *   3 - hexadecimal reference, lowercase "x" marker    (&#x1f;)
   *   4 - hexadecimal reference, uppercase "X" marker    (&#X1F;)
   *   5 - a bare ampersand that starts none of the above
   *
   * The pattern uses the /x flag, so the line breaks and indentation inside
   * the literal are ignored by PCRE.
   *
   * The hexadecimal groups accept hex digits only. hexdec() skips characters
   * it cannot interpret, so admitting other letters would let a malformed
   * reference such as "&#x4G1;" be decoded as 0x41.
   */
  define( 'MW_CHAR_REFS_REGEX',
  '/&([A-Za-z0-9\x80-\xff]+);
  |&\#([0-9]+);
  |&\#x([0-9A-Fa-f]+);
  |&\#X([0-9A-Fa-f]+);
  |(&)/x' );
  /**
   * UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER, i.e. the result of
   * codepointToUtf8( UNICODE_REPLACEMENT ).
   */
  define( 'UTF8_REPLACEMENT', "\xef\xbf\xbd");
  /**
   * Character-reference decoding plus a heuristic check for uploaded files
   * that a browser could interpret as HTML or script.
   *
   * Class borrowed from Mediawiki, based on the following files:
   * Sanitizer.php, SpecialUpload.php, UtfNormal.php, UtfNormalUtil.php
   *
   * The methods read the MW_CHAR_REFS_REGEX and UTF8_REPLACEMENT constants,
   * which must be defined before the class is used.
   */
  class Sanitizer {
      /**
       * List of all named character entities defined in HTML 4.01,
       * as a map of entity name => Unicode code point.
       * http://www.w3.org/TR/html4/sgml/entities.html
       */
      var $htmlEntities = array(
          'Aacute' => 193,
          'aacute' => 225,
          'Acirc' => 194,
          'acirc' => 226,
          'acute' => 180,
          'AElig' => 198,
          'aelig' => 230,
          'Agrave' => 192,
          'agrave' => 224,
          'alefsym' => 8501,
          'Alpha' => 913,
          'alpha' => 945,
          'amp' => 38,
          'and' => 8743,
          'ang' => 8736,
          'Aring' => 197,
          'aring' => 229,
          'asymp' => 8776,
          'Atilde' => 195,
          'atilde' => 227,
          'Auml' => 196,
          'auml' => 228,
          'bdquo' => 8222,
          'Beta' => 914,
          'beta' => 946,
          'brvbar' => 166,
          'bull' => 8226,
          'cap' => 8745,
          'Ccedil' => 199,
          'ccedil' => 231,
          'cedil' => 184,
          'cent' => 162,
          'Chi' => 935,
          'chi' => 967,
          'circ' => 710,
          'clubs' => 9827,
          'cong' => 8773,
          'copy' => 169,
          'crarr' => 8629,
          'cup' => 8746,
          'curren' => 164,
          'dagger' => 8224,
          'Dagger' => 8225,
          'darr' => 8595,
          'dArr' => 8659,
          'deg' => 176,
          'Delta' => 916,
          'delta' => 948,
          'diams' => 9830,
          'divide' => 247,
          'Eacute' => 201,
          'eacute' => 233,
          'Ecirc' => 202,
          'ecirc' => 234,
          'Egrave' => 200,
          'egrave' => 232,
          'empty' => 8709,
          'emsp' => 8195,
          'ensp' => 8194,
          'Epsilon' => 917,
          'epsilon' => 949,
          'equiv' => 8801,
          'Eta' => 919,
          'eta' => 951,
          'ETH' => 208,
          'eth' => 240,
          'Euml' => 203,
          'euml' => 235,
          'euro' => 8364,
          'exist' => 8707,
          'fnof' => 402,
          'forall' => 8704,
          'frac12' => 189,
          'frac14' => 188,
          'frac34' => 190,
          'frasl' => 8260,
          'Gamma' => 915,
          'gamma' => 947,
          'ge' => 8805,
          'gt' => 62,
          'harr' => 8596,
          'hArr' => 8660,
          'hearts' => 9829,
          'hellip' => 8230,
          'Iacute' => 205,
          'iacute' => 237,
          'Icirc' => 206,
          'icirc' => 238,
          'iexcl' => 161,
          'Igrave' => 204,
          'igrave' => 236,
          'image' => 8465,
          'infin' => 8734,
          'int' => 8747,
          'Iota' => 921,
          'iota' => 953,
          'iquest' => 191,
          'isin' => 8712,
          'Iuml' => 207,
          'iuml' => 239,
          'Kappa' => 922,
          'kappa' => 954,
          'Lambda' => 923,
          'lambda' => 955,
          'lang' => 9001,
          'laquo' => 171,
          'larr' => 8592,
          'lArr' => 8656,
          'lceil' => 8968,
          'ldquo' => 8220,
          'le' => 8804,
          'lfloor' => 8970,
          'lowast' => 8727,
          'loz' => 9674,
          'lrm' => 8206,
          'lsaquo' => 8249,
          'lsquo' => 8216,
          'lt' => 60,
          'macr' => 175,
          'mdash' => 8212,
          'micro' => 181,
          'middot' => 183,
          'minus' => 8722,
          'Mu' => 924,
          'mu' => 956,
          'nabla' => 8711,
          'nbsp' => 160,
          'ndash' => 8211,
          'ne' => 8800,
          'ni' => 8715,
          'not' => 172,
          'notin' => 8713,
          'nsub' => 8836,
          'Ntilde' => 209,
          'ntilde' => 241,
          'Nu' => 925,
          'nu' => 957,
          'Oacute' => 211,
          'oacute' => 243,
          'Ocirc' => 212,
          'ocirc' => 244,
          'OElig' => 338,
          'oelig' => 339,
          'Ograve' => 210,
          'ograve' => 242,
          'oline' => 8254,
          'Omega' => 937,
          'omega' => 969,
          'Omicron' => 927,
          'omicron' => 959,
          'oplus' => 8853,
          'or' => 8744,
          'ordf' => 170,
          'ordm' => 186,
          'Oslash' => 216,
          'oslash' => 248,
          'Otilde' => 213,
          'otilde' => 245,
          'otimes' => 8855,
          'Ouml' => 214,
          'ouml' => 246,
          'para' => 182,
          'part' => 8706,
          'permil' => 8240,
          'perp' => 8869,
          'Phi' => 934,
          'phi' => 966,
          'Pi' => 928,
          'pi' => 960,
          'piv' => 982,
          'plusmn' => 177,
          'pound' => 163,
          'prime' => 8242,
          'Prime' => 8243,
          'prod' => 8719,
          'prop' => 8733,
          'Psi' => 936,
          'psi' => 968,
          'quot' => 34,
          'radic' => 8730,
          'rang' => 9002,
          'raquo' => 187,
          'rarr' => 8594,
          'rArr' => 8658,
          'rceil' => 8969,
          'rdquo' => 8221,
          'real' => 8476,
          'reg' => 174,
          'rfloor' => 8971,
          'Rho' => 929,
          'rho' => 961,
          'rlm' => 8207,
          'rsaquo' => 8250,
          'rsquo' => 8217,
          'sbquo' => 8218,
          'Scaron' => 352,
          'scaron' => 353,
          'sdot' => 8901,
          'sect' => 167,
          'shy' => 173,
          'Sigma' => 931,
          'sigma' => 963,
          'sigmaf' => 962,
          'sim' => 8764,
          'spades' => 9824,
          'sub' => 8834,
          'sube' => 8838,
          'sum' => 8721,
          'sup' => 8835,
          'sup1' => 185,
          'sup2' => 178,
          'sup3' => 179,
          'supe' => 8839,
          'szlig' => 223,
          'Tau' => 932,
          'tau' => 964,
          'there4' => 8756,
          'Theta' => 920,
          'theta' => 952,
          'thetasym' => 977,
          'thinsp' => 8201,
          'THORN' => 222,
          'thorn' => 254,
          'tilde' => 732,
          'times' => 215,
          'trade' => 8482,
          'Uacute' => 218,
          'uacute' => 250,
          'uarr' => 8593,
          'uArr' => 8657,
          'Ucirc' => 219,
          'ucirc' => 251,
          'Ugrave' => 217,
          'ugrave' => 249,
          'uml' => 168,
          'upsih' => 978,
          'Upsilon' => 933,
          'upsilon' => 965,
          'Uuml' => 220,
          'uuml' => 252,
          'weierp' => 8472,
          'Xi' => 926,
          'xi' => 958,
          'Yacute' => 221,
          'yacute' => 253,
          'yen' => 165,
          'Yuml' => 376,
          'yuml' => 255,
          'Zeta' => 918,
          'zeta' => 950,
          'zwj' => 8205,
          'zwnj' => 8204 );

      /**
       * Return the UTF-8 byte sequence for a given Unicode code point.
       *
       * Values that cannot be encoded (negative, or above U+10FFFF) yield
       * UTF8_REPLACEMENT, so the result is always a string.
       *
       * @param $codepoint Integer:
       * @return String
       * @public
       */
      function codepointToUtf8( $codepoint ) {
          if ( $codepoint < 0 || $codepoint > 0x10ffff ) {
              return UTF8_REPLACEMENT ;
          }
          # One byte: plain ASCII.
          if ( $codepoint < 0x80 ) {
              return chr( $codepoint ) ;
          }
          # Two bytes: 110xxxxx 10xxxxxx
          if ( $codepoint < 0x800 ) {
              return chr( $codepoint >> 6 & 0x3f | 0xc0 ) .
                  chr( $codepoint & 0x3f | 0x80 ) ;
          }
          # Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
          if ( $codepoint < 0x10000 ) {
              return chr( $codepoint >> 12 & 0x0f | 0xe0 ) .
                  chr( $codepoint >> 6 & 0x3f | 0x80 ) .
                  chr( $codepoint & 0x3f | 0x80 ) ;
          }
          # Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
          return chr( $codepoint >> 18 & 0x07 | 0xf0 ) .
              chr( $codepoint >> 12 & 0x3f | 0x80 ) .
              chr( $codepoint >> 6 & 0x3f | 0x80 ) .
              chr( $codepoint & 0x3f | 0x80 ) ;
      }

      /**
       * Decode any character references, numeric or named entities,
       * in the text and return a UTF-8 string.
       *
       * Must be called on an instance: the replacement callback is bound
       * to $this.
       *
       * @param string $text
       * @return string
       * @public
       */
      function decodeCharReferences( $text ) {
          return preg_replace_callback(
              MW_CHAR_REFS_REGEX,
              array( $this, 'decodeCharReferencesCallback' ),
              $text
          ) ;
      }

      /**
       * preg_replace_callback() handler for MW_CHAR_REFS_REGEX.
       *
       * @param array $matches Match groups: 1 = entity name, 2 = decimal
       *                       digits, 3 and 4 = hexadecimal digits
       * @return string Replacement text for the matched reference
       */
      function decodeCharReferencesCallback( $matches ) {
          if ( isset( $matches[1] ) && $matches[1] !== '' ) {
              return $this->decodeEntity( $matches[1] ) ;
          }
          if ( isset( $matches[2] ) && $matches[2] !== '' ) {
              return $this->decodeChar( intval( $matches[2] ) ) ;
          }
          if ( isset( $matches[3] ) && $matches[3] !== '' ) {
              return $this->decodeChar( hexdec( $matches[3] ) ) ;
          }
          if ( isset( $matches[4] ) && $matches[4] !== '' ) {
              return $this->decodeChar( hexdec( $matches[4] ) ) ;
          }
          # Last case should be an ampersand by itself
          return $matches[0] ;
      }

      /**
       * Return UTF-8 string for a codepoint if that is a valid
       * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
       *
       * @param int $codepoint
       * @return string
       */
      function decodeChar( $codepoint ) {
          if ( !$this->validateCodepoint( $codepoint ) ) {
              return UTF8_REPLACEMENT ;
          }
          return $this->codepointToUtf8( $codepoint ) ;
      }

      /**
       * If the named entity is listed in $htmlEntities, return the UTF-8
       * encoding of that character. Otherwise, returns the pseudo-entity
       * source (eg &foo;)
       *
       * @param string $name Entity name without the leading "&" and trailing ";"
       * @return string
       */
      function decodeEntity( $name ) {
          # The table is an instance property: $this->htmlEntities, with no
          # second "$" (that would be a variable-variable lookup).
          if ( isset( $this->htmlEntities[$name] ) ) {
              return $this->codepointToUtf8( $this->htmlEntities[$name] ) ;
          }
          return "&$name;" ;
      }

      /**
       * Returns true if a given Unicode codepoint is a valid character in XML.
       *
       * Accepted: tab, line feed, carriage return, U+0020..U+D7FF,
       * U+E000..U+FFFD and U+10000..U+10FFFF.
       *
       * @param int $codepoint
       * @return bool
       */
      function validateCodepoint( $codepoint ) {
          return ( $codepoint == 0x09 )
              || ( $codepoint == 0x0a )
              || ( $codepoint == 0x0d )
              || ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
              || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
              || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff ) ;
      }

      /**
       * Heuristic for detecting files that *could* contain JavaScript
       * instructions or things that may look like HTML to a browser and are
       * thus potentially harmful. The present implementation will produce
       * false positives in some situations.
       *
       * Only the first 1024 bytes of the file are examined.
       *
       * @param string $file Pathname to the file
       * @return bool true if the file contains something looking like
       *              embedded scripts; false otherwise, including when the
       *              file cannot be opened or yields no data
       */
      function detectScript( $file ) {
          $fp = fopen( $file, 'rb' ) ;
          if ( !$fp ) {
              # Nothing to inspect. Bail out here so the failed handle is
              # never passed on to fread()/fclose().
              return false ;
          }
          # For binary files, just check the first K.
          $chunk = fread( $fp, 1024 ) ;
          fclose( $fp ) ;
          if ( $chunk === false || $chunk === '' ) {
              return false ;
          }

          # Read the byte-order mark from the raw bytes, before case folding.
          $bom = substr( $chunk, 0, 2 ) ;
          $chunk = strtolower( $chunk ) ;

          # Decode from UTF-16 if needed (could be used for obfuscation).
          if ( $bom === "\xfe\xff" ) {
              $enc = "UTF-16BE" ;
          } elseif ( $bom === "\xff\xfe" ) {
              $enc = "UTF-16LE" ;
          } else {
              $enc = null ;
          }
          if ( $enc !== null && function_exists( 'iconv' ) ) {
              # Conversion problems are silenced on purpose: when iconv()
              # gives nothing usable the undecoded chunk is scanned instead.
              $chunk_tmp = @iconv( $enc, "ASCII//IGNORE", $chunk ) ;
              if ( $chunk_tmp ) {
                  $chunk = $chunk_tmp ;
              }
          }
          $chunk = trim( $chunk ) ;

          # Check for HTML doctype. (preg_match() stands in for eregi(),
          # which no longer exists as of PHP 7.)
          if ( preg_match( '/<!DOCTYPE *X?HTML/i', $chunk ) ) {
              return true ;
          }

          /**
           * Internet Explorer for Windows performs some really stupid file type
           * autodetection which can cause it to interpret valid image files as HTML
           * and potentially execute JavaScript, creating a cross-site scripting
           * attack vector.
           *
           * Apple's Safari browser also performs some unsafe file type autodetection
           * which can cause legitimate files to be interpreted as HTML if the
           * web server is not correctly configured to send the right content-type
           * (or if you're really uploading plain text and octet streams!)
           *
           * Any of the tags below appearing in the chunk is treated as a sign
           * that IE (or Safari, when the file is served with a generic
           * content-type) is likely to mistake the file for HTML.
           */
          $tags = array(
              '<body',
              '<head',
              '<html', # also in safari
              '<img',
              '<pre',
              '<script', # also in safari
              '<table',
              '<title'
          ) ;
          foreach ( $tags as $tag ) {
              if ( false !== strpos( $chunk, $tag ) ) {
                  return true ;
              }
          }

          /*
           * Look for javascript.
           */
          # Resolve entity-refs to look at attributes. May be harsh on big files... cache result?
          $chunk = $this->decodeCharReferences( $chunk ) ;
          $scriptPatterns = array(
              # script-types
              '!type\s*=\s*[\'"]?\s*(?:\w*/)?(?:ecma|java)!sim',
              # html-style script-urls
              '!(?:href|src|data)\s*=\s*[\'"]?\s*(?:ecma|java)script:!sim',
              # css-style script-urls
              '!url\s*\(\s*[\'"]?\s*(?:ecma|java)script:!sim'
          ) ;
          foreach ( $scriptPatterns as $pattern ) {
              if ( preg_match( $pattern, $chunk ) ) {
                  return true ;
              }
          }
          return false ;
      }
  }
  455. ?>