/lib/expack/rss/parser.inc.php

http://github.com/unpush/p2-php · PHP · 526 lines · 364 code · 57 blank · 105 comment · 68 complexity · f382889bbf6a788753455c1c7624e915 MD5 · raw file

  1. <?php
  2. /**
  3. * rep2expack - RSS Parser
  4. */
  5. require_once P2EX_LIB_DIR . '/rss/common.inc.php';
  6. require_once 'XML/RSS.php';
// {{{ ImageCache2 integration
  8. if ($GLOBALS['_conf']['expack.rss.with_imgcache'] &&
  9. ((!$GLOBALS['_conf']['ktai'] && $GLOBALS['_conf']['expack.ic2.enabled'] % 2 == 1) ||
  10. ($GLOBALS['_conf']['ktai'] && $GLOBALS['_conf']['expack.ic2.enabled'] >= 2)))
  11. {
  12. if (!class_exists('IC2_Switch', false)) {
  13. require P2EX_LIB_DIR . '/ic2/Switch.php';
  14. }
  15. if (IC2_Switch::get($GLOBALS['_conf']['ktai'])) {
  16. if (!function_exists('rss_get_image')) {
  17. require P2EX_LIB_DIR . '/rss/getimage.inc.php';
  18. }
  19. define('P2_RSS_IMAGECACHE_AVAILABLE', 1);
  20. } else {
  21. define('P2_RSS_IMAGECACHE_AVAILABLE', 0);
  22. }
  23. } else {
  24. define('P2_RSS_IMAGECACHE_AVAILABLE', 0);
  25. }
  26. // }}}
  27. // {{{ p2GetRSS()
  28. /**
  29. * RSS_E[hAp[XʂԂ
  30. */
  31. function p2GetRSS($remotefile, $atom = 0)
  32. {
  33. global $_conf;
  34. $refresh = (!empty($_GET['refresh']) || !empty($_POST['refresh']));
  35. $localpath = rss_get_save_path($remotefile);
  36. if (PEAR::isError($localpath)) {
  37. P2Util::pushInfoHtml('<p>' . $localpath->getMessage() . '</p>');
  38. return $localpath;
  39. }
  40. // ۑpfBNgȂ΂‚
  41. if (!is_dir(dirname($localpath))) {
  42. FileCtl::mkdirFor($localpath);
  43. }
  44. // If-Modified-Since‚Ń_E[hit@CAÂA[ĥƂj
  45. if (!file_exists($localpath) || $refresh ||
  46. filemtime($localpath) < (time() - $_conf['expack.rss.check_interval'] * 60)
  47. ) {
  48. $dl = P2Util::fileDownload($remotefile, $localpath, true, 301);
  49. if ($dl->isSuccess()) {
  50. chmod($localpath, $_conf['expack.rss.setting_perm']);
  51. }
  52. }
  53. // LbVXVȂA_E[hȂRSSp[X
  54. if (file_exists($localpath) && (!isset($dl) || $dl->isSuccess())) {
  55. if ($atom) {
  56. $atom = (isset($dl) && $dl->code == 200) ? 2 : 1;
  57. }
  58. $rss = p2ParseRSS($localpath, $atom);
  59. return $rss;
  60. } else {
  61. return $dl;
  62. }
  63. }
  64. // }}}
  65. // {{{ p2ParseRSS()
  66. /**
  67. * RSSp[X
  68. */
  69. function p2ParseRSS($xmlpath, $atom=0)
  70. {
  71. // $atom^ȂXSLgRSS 1.0ɕϊ
  72. // iϊς݃t@C݂ȂA$atom==2̂ƂɎsj
  73. // XML(Atom)encodingw肳ĂXSLTvZbT
  74. // R[hUTF-8(XSLŎw肵R[h)ɕϊĂ
  75. if ($atom) {
  76. $xslpath = P2EX_LIB_DIR . '/rss/atom03-to-rss10.xsl';
  77. $rsspath = $xmlpath . '.rss';
  78. if (file_exists($rsspath) && $atom != 2) {
  79. // OK
  80. } elseif (extension_loaded('xsl')) {
  81. if (!atom_to_rss($xmlpath, $xslpath, $rsspath)) {
  82. $retval = false;
  83. return $retval;
  84. }
  85. } else {
  86. P2Util::pushInfoHtml('<p>p2 error: AtomtB[hǂނɂPHPXSL@\gKvłB</p>');
  87. $retval = false;
  88. return $retval;
  89. }
  90. } else {
  91. $rsspath = $xmlpath;
  92. }
  93. // GR[fBO𔻒肵AXML_RSSNX̃CX^X𐶐
  94. // 2006-02-01 蓮p~
  95. /*$srcenc = 'UTF-8';
  96. $tgtenc = 'UTF-8';
  97. if ($fp = @fopen($rsspath, 'rb')) {
  98. $content = fgets($fp, 64);
  99. if (preg_match('/<\\?xml version=(["\'])1.0\\1 encoding=(["\'])(.+?)\\2 ?\\?>/', $content, $matches)) {
  100. $srcenc = $matches[3];
  101. }
  102. fclose($fp);
  103. }
  104. $rss = new XML_RSS($rsspath, $srcenc, $tgtenc);*/
  105. $rss = new XML_RSS($rsspath);
  106. if (PEAR::isError($rss)) {
  107. P2Util::pushInfoHtml('<p>p2 error: RSS - ' . $rss->getMessage() . '</p>');
  108. return $rss;
  109. }
  110. // ͑Ώۂ̃^O㏑
  111. $rss->channelTags = array_unique(array_merge($rss->channelTags, array (
  112. 'CATEGORY', 'CLOUD', 'COPYRIGHT', 'DESCRIPTION', 'DOCS', 'GENERATOR', 'IMAGE',
  113. 'ITEMS', 'LANGUAGE', 'LASTBUILDDATE', 'LINK', 'MANAGINGEditor', 'PUBDATE',
  114. 'RATING', 'SKIPDAYS', 'SKIPHOURS', 'TEXTINPUT', 'TITLE', 'TTL', 'WEBMASTER'
  115. )));
  116. $rss->itemTags = array_unique(array_merge($rss->itemTags, array (
  117. 'AUTHOR', 'CATEGORY', 'COMMENTS', 'CONTENT:ENCODED', 'DESCRIPTION',
  118. 'ENCLOSURE', 'GUID', 'LINK', 'PUBDATE', 'SOURCE', 'TITLE'
  119. )));
  120. $rss->imageTags = array_unique(array_merge($rss->imageTags, array (
  121. 'DESCRIPTION', 'HEIGHT', 'LINK', 'TITLE', 'URL', 'WIDTH'
  122. )));
  123. $rss->textinputTags = array_unique(array_merge($rss->textinputTags, array (
  124. 'DESCRIPTION', 'LINK', 'NAME', 'TITLE'
  125. )));
  126. $rss->moduleTags = array_unique(array_merge($rss->moduleTags, array (
  127. 'BLOGCHANNEL:BLOGROLL', 'BLOGCHANNEL:CHANGES', 'BLOGCHANNEL:MYSUBSCRIPTIONS',
  128. 'CC:LICENSE', 'CONTENT:ENCODED', 'DC:CONTRIBUTOR', 'DC:COVERAGE',
  129. 'DC:CREATOR', 'DC:DATE', 'DC:DESCRIPTION', 'DC:FORMAT', 'DC:IDENTIFIER',
  130. 'DC:LANGUAGE', 'DC:PUBDATE', 'DC:PUBLISHER', 'DC:RELATION', 'DC:RIGHTS',
  131. 'DC:SOURCE', 'DC:SUBJECT', 'DC:TITLE', 'DC:TYPE',
  132. 'SY:UPDATEBASE', 'SY:UPDATEFREQUENCY', 'SY:UPDATEPERIOD'
  133. )));
  134. // RSSp[X
  135. $result = $rss->parse();
  136. if (PEAR::isError($result)) {
  137. P2Util::pushInfoHtml('<p>p2 error: RSS - ' . $result->getMessage() . '</p>');
  138. return $result;
  139. }
  140. return $rss;
  141. }
  142. // }}}
  143. // {{{ atom_to_rss()
  144. /**
  145. * Atom 0.3 RSS 1.0 ɕϊiʁj
  146. */
  147. function atom_to_rss($input, $stylesheet, $output)
  148. {
  149. global $_conf;
  150. // ۑpfBNgȂ΂‚
  151. if (!is_dir(dirname($output))) {
  152. FileCtl::mkdirFor($output);
  153. }
  154. // ϊ
  155. if (extension_loaded('xslt')) { // PHP4, Sablotron
  156. $rss_content = atom_to_rss_by_xslt($input, $stylesheet, $output);
  157. } elseif (extension_loaded('xsl')) { // PHP5, LibXSLT
  158. $rss_content = atom_to_rss_by_xsl($input, $stylesheet, $output);
  159. }
  160. // `FbN
  161. if (!$rss_content) {
  162. if (file_exists($output)) {
  163. unlink($output);
  164. }
  165. return FALSE;
  166. }
  167. chmod($output, $_conf['expack.rss.setting_perm']);
  168. // FreeBSD 5.3 Ports textproc/php4-xslt ł̓oÔϊ̍ۂɖOԂ̂ŕ␳
  169. // (php4-xslt-4.3.10_2, expat-1.95.8, libiconv-1.9.2_1, Sablot-1.0.1)
  170. // oÔȂ‹Ȃ牽ςȂEEE͂B
  171. $rss_fix_patterns = array(
  172. '/<(\/)?(RDF|Seq|li)( .+?)?>/u' => '<$1rdf:$2$3>',
  173. '/<(channel|item) about=/u' => '<$1 rdf:about=',
  174. '/<(\/)?(encoded)>/u' => '<$1content:$2>',
  175. '/<(\/)?(creator|subject|date|pubdate)>/u' => '<$1dc:$2>');
  176. $rss_fixed = preg_replace(array_keys($rss_fix_patterns), array_values($rss_fix_patterns), $rss_content);
  177. if (md5($rss_content) != md5($rss_fixed)) {
  178. $fp = @fopen($output, 'wb') or p2die("cannot write. ({$output})");
  179. flock($fp, LOCK_EX);
  180. fwrite($fp, $rss_fixed);
  181. flock($fp, LOCK_UN);
  182. fclose($fp);
  183. }
  184. return TRUE;
  185. }
  186. // }}}
  187. // {{{ atom_to_rss_by_xslt()
  188. /**
  189. * Atom 0.3 RSS 1.0 ɕϊiPHP4, XSLTj
  190. */
  191. function atom_to_rss_by_xslt($input, $stylesheet, $output)
  192. {
  193. $xh = xslt_create();
  194. if (!@xslt_process($xh, $input, $stylesheet, $output)) {
  195. $errmsg = xslt_errno($xh) . ': ' . xslt_error($xh);
  196. P2Util::pushInfoHtml('<p>p2 error: XSLT - AtomRSSɕϊł܂łB(' . $errmsg . ')</p>');
  197. xslt_free($xh);
  198. return FALSE;
  199. }
  200. xslt_free($xh);
  201. return FileCtl::file_read_contents($output);
  202. }
  203. // }}}
  204. // {{{ atom_to_rss_by_xsl()
  205. /**
  206. * Atom 0.3 RSS 1.0 ɕϊiPHP5, DOM & XSLj
  207. */
  208. function atom_to_rss_by_xsl($input, $stylesheet, $output)
  209. {
  210. $xmlDoc = new DomDocument;
  211. if ($xmlDoc->load(realpath($input))) {
  212. $xslDoc = new DomDocument;
  213. $xslDoc->load(realpath($stylesheet));
  214. $proc = new XSLTProcessor;
  215. $proc->importStyleSheet($xslDoc);
  216. $rssDoc = $proc->transformToDoc($xmlDoc);
  217. $rssDoc->save($output);
  218. $rss_content = FileCtl::file_read_contents($output);
  219. } else {
  220. $rss_content = null;
  221. }
  222. if (!$rss_content) {
  223. P2Util::pushInfoHtml('<p>p2 error: XSL - AtomRSSɕϊł܂łB</p>');
  224. return FALSE;
  225. }
  226. return $rss_content;
  227. }
  228. // }}}
  229. // {{{ rss_item_exists()
  230. /**
  231. * RSSitemvfɔCӂ̎qvf邩ǂ`FbN
  232. * vf͖
  233. */
  234. function rss_item_exists($items, $element)
  235. {
  236. foreach ($items as $item) {
  237. if (isset($item[$element]) && strlen(trim($item[$element])) > 0) {
  238. return TRUE;
  239. }
  240. }
  241. return FALSE;
  242. }
  243. // }}}
  244. // {{{ rss_format_date()
  245. /**
  246. * RSS̓t\pɒ
  247. */
  248. function rss_format_date($date)
  249. {
  250. if (preg_match('/(?P<date>(\d\d)?\d\d-\d\d-\d\d)T(?P<time>\d\d:\d\d(:\d\d)?)(?P<zone>([+\-])(\d\d):(\d\d)|Z)?/', $date, $t)) {
  251. $time = $t['date'].' '.$t['time'].' ';
  252. if ($t['zone'] && $t['zone'] != 'Z') {
  253. $time .= $t[6].$t[7].$t[8]; // [+-]HHMM
  254. } else {
  255. $time .= 'GMT';
  256. }
  257. return date('y/m/d H:i:s', strtotime($time));
  258. }
  259. return htmlspecialchars($date, ENT_QUOTES);
  260. }
  261. // }}}
  262. // {{{ rss_desc_converter()
  263. /**
  264. * RSSdescriptionvf\pɒ
  265. */
  266. function rss_desc_converter($description)
  267. {
  268. // HTML^OȂCR+LF/CR/LF<br>+LFɂȂǁAy`
  269. if (!preg_match('/<(\/?[A-Za-z]+[1-6]?)( [^>]+>)?( ?\/)?>/', $description)) {
  270. return preg_replace('/[ \t]*(\r\n?|\n)[ \t]*/', "<br>\n", trim($description));
  271. }
  272. // ‚^Oꗗ
  273. $allowed_tags = '<a><b><i><u><s><strong><em><code><br><h1><h2><h3><h4><h5><h6><p><div><address><blockquote><ol><ul><li><img>';
  274. // scriptvfstylevf͒gƂ܂Ƃ߂ď
  275. $description = preg_replace('/<(script|style)(?: .+?)?>(.+?)?<\/\1>/is', '', $description);
  276. // s‚̃^O
  277. $description = strip_tags($description, $allowed_tags);
  278. // ^Ȏ`FbN
  279. $description = preg_replace_callback('/<(\/?[A-Za-z]+[1-6]?)( [^>]+?)?>/', 'rss_desc_tag_cleaner', $description);
  280. return $description;
  281. }
  282. // }}}
  283. // {{{ rss_desc_tag_cleaner()
  284. /**
  285. * ^OȂǂR[obN֐
  286. */
  287. function rss_desc_tag_cleaner($tag)
  288. {
  289. global $_conf;
  290. $element = strtolower($tag[1]);
  291. $attributes = trim($tag[2]);
  292. $close = trim($tag[3]); // HTML 4.01`ŕ\̂Ŗ
  293. // I^OȂ
  294. if (!$attributes || substr($element, 0, 1) == '/') {
  295. return '<'.$element.'>';
  296. }
  297. $tag = '<'.$element;
  298. if (preg_match_all('/(?:^| )([A-Za-z\-]+)\s*=\s*("[^"]*"|\'[^\']*\'|\w[^ ]*)(?: |$)/', $attributes, $matches, PREG_SET_ORDER)) {
  299. foreach ($matches as $attr) {
  300. $key = strtolower($attr[1]);
  301. $value = $attr[2];
  302. // JavaScriptCxgnhEX^CV[gE^[QbgȂǂ̑͋֎~
  303. if (preg_match('/^(on[a-z]+|style|class|id|target)$/', $key)) {
  304. continue;
  305. }
  306. // l̈p폜
  307. $q = substr($value, 0, 1);
  308. if ($q == "'") {
  309. $value = str_replace('"', '&quot;', substr($value, 1, -1));
  310. } elseif ($q == '"') {
  311. $value = substr($value, 1, -1);
  312. }
  313. // ŕ
  314. switch ($key) {
  315. case 'href':
  316. if ($element != 'a' || preg_match('/^javascript:/i', $value)) {
  317. break; // avfȊOhref֎~
  318. }
  319. if (preg_match('|^[^/:]*/|', $value)) {
  320. $value = rss_url_rel_to_abs($value);
  321. }
  322. return '<a href="'.P2Util::throughIme($value).'"'.$_conf['ext_win_target_at'].'>';
  323. case 'src':
  324. if ($element != 'img' || preg_match('/^javascript:/i', $value)) {
  325. break; // imgvfȊOsrc֎~
  326. }
  327. if (preg_match('|^[^/:]*/|', $value)) {
  328. $value = rss_url_rel_to_abs($value);
  329. }
  330. if (P2_RSS_IMAGECACHE_AVAILABLE) {
  331. $image = rss_get_image($value, $GLOBALS['channel']['title']);
  332. if ($image[3] != P2_IMAGECACHE_OK) {
  333. if ($_conf['ktai']) {
  334. // ځ[摜 - g
  335. switch ($image[3]) {
  336. case P2_IMAGECACHE_ABORN:return '[p2:ځ[摜]';
  337. case P2_IMAGECACHE_BROKEN: return '[p2:]'; //
  338. case P2_IMAGECACHE_LARGE: return '[p2:]'; // ͌ł͖
  339. case P2_IMAGECACHE_VIRUS: return '[p2:EBXx]';
  340. default : return '[p2:unknown error]'; // \
  341. }
  342. } else {
  343. // ځ[摜 - PC
  344. return "<img src=\"{$image[0][0]}\" {$image[0][1]}>";
  345. }
  346. } elseif ($_conf['ktai']) {
  347. // CC\ - gсiPCpTlCTCYj
  348. return "<img src=\"{$image[1][0]}\" {$image[1][1]}>";
  349. } else {
  350. // CC\ - PCitTCYj
  351. return "<img src=\"{$image[0][0]}\" {$image[0][1]}>";
  352. }
  353. }
  354. // C[WLbV̂Ƃ摜͕\Ȃ
  355. break '';
  356. case 'alt':
  357. if ($element == 'img' && !P2_RSS_IMAGECACHE_AVAILABLE) {
  358. return ' [img:'.$value.']'; // 摜altɕ\
  359. }
  360. $tag .= ' ="'.$value.'"';
  361. break;
  362. case 'width':
  363. case 'height':
  364. // Ƃ肠
  365. break;
  366. default:
  367. $tag .= ' ="'.$value.'"';
  368. }
  369. } // endforeach
  370. // vfōŏImF
  371. switch ($element) {
  372. // hrefȂavf
  373. case 'a':
  374. return '<a>';
  375. // altȂimgvf
  376. case 'img':
  377. return '';
  378. }
  379. } // endif
  380. $tag .= '>';
  381. return $tag;
  382. }
  383. // }}}
  384. // {{{ rss_url_rel_to_abs()
  385. /**
  386. * URL URL ɂĕԂ֐
  387. *
  388. * O[oϐQƂƂ RSS URL ^]܂
  389. * ύXKvȉӏ̂Ŏ蔲
  390. */
  391. function rss_url_rel_to_abs($url)
  392. {
  393. // URL p[X
  394. $p = @parse_url($GLOBALS['channel']['link']);
  395. if (!$p || !isset($p['scheme']) || $p['scheme'] != 'http' || !isset($p['host'])) {
  396. return $url;
  397. }
  398. // [g URL 쐬
  399. $top = $p['scheme'] . '://';
  400. if (isset($p['user'])) {
  401. $top .= $p['user'];
  402. if (isset($p['pass'])) {
  403. $top .= '@' . $p['pass'];
  404. }
  405. $top .= ':';
  406. }
  407. $top .= $p['host'];
  408. if (isset($p['port'])) {
  409. $top .= ':' . $p['port'];
  410. }
  411. // ΃pXȂ烋[g URL ƌĕԂ
  412. if (substr($url, 0, 1) == '/') {
  413. return $top . $url;
  414. }
  415. // [g URL ɃXbVt
  416. $top .= '/';
  417. // `l̃pX𕪉
  418. if (isset($p['path'])) {
  419. $paths1 = explode('/', trim($p['path'], '/'));
  420. } else {
  421. $paths1 = array();
  422. }
  423. // URL 𕪉
  424. if ($query = strstr($url, '?')) {
  425. $paths2 = explode('/', substr($url, 0, strlen($query) * -1));
  426. } else {
  427. $paths2 = explode('/', $url);
  428. $query = '';
  429. }
  430. // URL ̃pX΃pXɉ
  431. while (($s = array_shift($paths2)) !== null) {
  432. $r = $s;
  433. switch ($s) {
  434. case '':
  435. case '.':
  436. // pass
  437. break;
  438. case '..':
  439. array_pop($paths1);
  440. break;
  441. default:
  442. array_push($paths1, $s);
  443. }
  444. }
  445. // ΃pXXbVŏIĂƂ̏
  446. if ($r === '') {
  447. array_push($paths1, '');
  448. }
  449. // URL Ԃ
  450. return $top . implode('/', $paths1) . $query;
  451. }
  452. // }}}
  453. /*
  454. * Local Variables:
  455. * mode: php
  456. * coding: cp932
  457. * tab-width: 4
  458. * c-basic-offset: 4
  459. * indent-tabs-mode: nil
  460. * End:
  461. */
  462. // vim: set syn=php fenc=cp932 ai et ts=4 sw=4 sts=4 fdm=marker: