PageRenderTime 54ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/e107_plugins/import/providers/html_import_class.php

https://github.com/CasperGemini/e107
PHP | 574 lines | 470 code | 79 blank | 25 comment | 15 complexity | 348d1c7b71926434d09d63a9dcd45858 MD5 | raw file
Possible License(s): GPL-2.0
  1. <?php
  2. /*
  3. * e107 website system
  4. *
  5. * Copyright (C) 2008-2009 e107 Inc (e107.org)
  6. * Released under the terms and conditions of the
  7. * GNU General Public License (http://www.gnu.org/licenses/gpl.txt)
  8. *
  9. *
  10. *
  11. * $Source: /cvs_backup/e107_0.8/e107_plugins/import/wordpress_import_class.php,v $
  12. * $Revision: 11315 $
  13. * $Date: 2010-02-10 10:18:01 -0800 (Wed, 10 Feb 2010) $
  14. * $Author: secretr $
  15. */
  16. //$import_class_names['html_import'] = 'HTML';
  17. //$import_class_comment['html_import'] = 'Import content from an html website. eg. created with Frontpage, Dreamweaver or Notepad etc. ';
  18. //$import_class_support['html_import'] = array('news','page');
  19. //$import_default_prefix['html_import'] = '';
  20. require_once('import_classes.php');
  21. class html_import extends base_import_class
  22. {
  23. public $title = 'HTML';
  24. public $description = 'Import content from an html website. eg. created with Frontpage, Dreamweaver or Notepad etc. ';
  25. public $supported = array('news','page');
  26. public $mprefix = false;
  27. public $override = true;
  28. var $sourceType = 'rss';
  29. var $feedUrl = null;
  30. var $defaultClass = false;
  31. var $useTidy = true;
  32. var $action = 'preview'; // default action after setup page;
  33. private $localPath = '';
  34. private $content = array();
  35. private $contentArray = array();
  /**
   * Read the submitted site URL and dispatch the requested import step.
   *
   * Stores the value of $_POST['siteUrl'] (with trailing slashes removed) in
   * $this->feedUrl, or FALSE when no usable URL was posted. Then renders the
   * preview when $_POST['preview'] is set, or runs the conversion when
   * $_POST['do_conversion'] is set.
   *
   * @return bool|null FALSE after rendering the preview; otherwise nothing is returned.
   */
  function init()
  {
      // Only a real string can be a URL; an array (siteUrl[]=...) or a missing field is "no URL".
      $siteUrl = '';
      if (isset($_POST['siteUrl']) && is_string($_POST['siteUrl']))
      {
          $siteUrl = rtrim($_POST['siteUrl'], "/");
      }

      // Keep FALSE as the "nothing submitted" marker instead of letting rtrim() turn it into ''.
      // '0' is treated as empty, like any other falsy submission.
      $this->feedUrl = ($siteUrl !== '' && $siteUrl !== '0') ? $siteUrl : false;

      if (!empty($_POST['preview']))
      {
          $this->previewContent();
          return false;
      }

      if (!empty($_POST['do_conversion']))
      {
          $import = $this->sortSelection();
          $this->doConversion($import);
      }
  }
  /**
   * Group the files chosen on the preview form by their import target.
   *
   * The preview form posts one field per file, named 'add__' followed by the
   * file key, whose value is 'news', 'page' or '0' (ignore). Only fields that
   * carry that prefix and one of the two import targets are collected.
   *
   * NOTE(review): PHP rewrites dots and spaces in incoming field names to
   * underscores, so 'add__about.html' presumably arrives here as
   * 'add__about_html' - confirm before relying on the returned names as paths.
   *
   * @return array e.g. array('news' => array(file, ...), 'page' => array(file, ...))
   */
  function sortSelection()
  {
      $prefix = "add__";
      $prefixLength = strlen($prefix);
      $import = array();

      foreach ($_POST as $field => $choice)
      {
          // Strict comparison: arrays and numeric values must never count as a target.
          if ($choice !== 'news' && $choice !== 'page')
          {
              continue;
          }

          // Unrelated form fields that merely hold the value 'news' or 'page' are not selections.
          $field = (string) $field;
          if (strncmp($field, $prefix, $prefixLength) !== 0)
          {
              continue;
          }

          // Strip the prefix only; an 'add__' occurring later in the name belongs to the file key.
          $import[$choice][] = (string) substr($field, $prefixLength);
      }

      return $import;
  }
  /**
   * Conversion step for the selected files.
   *
   * At present this only prints the selection through print_a(); nothing is
   * imported by this method itself.
   *
   * @param array $data selection to convert (init() passes the result of sortSelection())
   * @return void
   */
  function doConversion($data)
  {
      $selection = $data;

      print_a($selection);
  }
  /**
   * Build the setup-form fields for this import provider.
   *
   * Returns a single text input named 'siteUrl'. The field is pre-filled with
   * the previously posted 'siteUrl' value (the same field init() reads),
   * escaped for use inside a single-quoted HTML attribute.
   *
   * @return array list of fields, each with a 'caption' and an 'html' entry
   */
  function config()
  {
      // Re-populate from the field this form actually submits; ignore non-string input.
      $siteUrl = '';
      if (isset($_POST['siteUrl']) && is_string($_POST['siteUrl']))
      {
          $siteUrl = $_POST['siteUrl'];
      }

      // ENT_QUOTES is required because the attribute is delimited with single quotes.
      $safeUrl = htmlspecialchars($siteUrl, ENT_QUOTES, 'UTF-8');

      $var = array();
      $var[0]['caption'] = "Website Home-page URL";
      $var[0]['html'] = "<input class='tbox' type='text' name='siteUrl' size='80' value='".$safeUrl."' maxlength='250' />";

      return $var;
  }
  /**
   * Set up the data set for the specified import task.
   *
   * Resets $this->arrayData, fills it from the parsed source data when such
   * data is available, and records the task in $this->currentTask and the
   * user-copy flag in $this->copyUserInfo.
   *
   * @param string $task       'news', 'page' or 'links'; anything else fails
   * @param bool   $blank_user when TRUE, $this->copyUserInfo is set to FALSE
   * @return bool TRUE on success, FALSE on error
   */
  function setupQuery($task, $blank_user=FALSE)
  {
      $this->arrayData = array();

      $file = $this->feedUrl;

      switch ($task)
      {
          case 'news' :
          case 'page' :
          case 'links' :
              // TODO: the fetch/parse step is not wired in for this provider yet
              // (candidate: $content = $this->getAll();), so there is no parsed
              // data at this point. Defined explicitly so the checks below never
              // read an undefined variable.
              $array = null;

              if ($array === FALSE || $file === FALSE) return FALSE;

              if (is_array($array) && isset($array['channel']['item']) && is_array($array['channel']['item']))
              {
                  foreach ($array['channel']['item'] as $val)
                  {
                      $this->arrayData[] = $val;
                  }
              }

              $this->arrayData = array_reverse($this->arrayData); // most recent last.
              reset($this->arrayData);
          break;

          default :
              return FALSE;
      }

      $this->copyUserInfo = !$blank_user;
      $this->currentTask = $task;
      return TRUE;
  }
  /**
   * Collect the pages linked from a starting page of the source site.
   *
   * Fetches $root through getRawHtml(), extracts its page links with
   * findLinks(), then fetches each linked page and stores its link title and
   * tag-reduced body in $this->content, keyed by the link's relative URL.
   * Only links found on the starting page are followed.
   *
   * @param string $root  file to start from, relative to the site URL ('' = home page)
   * @param int    $limit maximum number of linked pages to fetch in this call
   * @return array $this->content: url => array('title' => ..., 'body' => ...)
   */
  private function getAll($root = '', $limit = 15)
  {
      $index = $this->getRawHtml($root);
      $pages = $this->findLinks($index);

      $fetched = 0;

      foreach ($pages as $url => $page)
      {
          if ($fetched >= $limit)
          {
              break;
          }

          // Fetch the linked page itself. Calling getAll() here would return the
          // content array instead of HTML and recurse without end on sites whose
          // pages link back to each other.
          $html = (string) $this->getRawHtml($url);

          $html = str_replace("\n", "", $html); // strip line-breaks.
          $html = (string) preg_replace("/<title>([^<]*)<\/title>/i", "", $html); // the title is stored separately

          $body = trim(strip_tags($html, "<b><i><u><strong><em><br><img><object><embed><a>"));

          $this->content[$url] = array(
              'title' => str_replace("\n", "", $page['title']),
              'body'  => $body
          );

          $fetched++;
      }

      return $this->content;
  }
  /**
   * Render the preview form listing the pages found on the source site.
   *
   * Each row shows the page title, a short plain-text sample of its body, a
   * link to the locally cached copy and a selector (News / Page / Ignore)
   * named 'add__' followed by the page key. The form posts back with
   * 'do_conversion' and the original 'siteUrl'.
   *
   * Titles, samples and URLs come from the remote site or from the request,
   * so all of them are escaped before being written into the page.
   *
   * @return void output is sent through the e107 renderer
   */
  private function previewContent()
  {
      $frm = e107::getForm();
      $ns = e107::getRender();
      $tp = e107::getParser();

      $content = $this->getAll();

      $siteUrl = htmlspecialchars((string) $this->feedUrl, ENT_QUOTES, 'UTF-8');

      $text = "<form method='post' action='".e_SELF."?import_type=html_import' id='core-import-form'>
      <fieldset id='core-import-select-type'>
      <legend class='e-hideme'>".DBLAN_10."</legend>
      <table class='table adminlist'>
      <colgroup>
      <col style='width:40%' />
      <col />
      <col />
      <col />
      </colgroup>
      <thead>
      <tr>
      <th>".LAN_TITLE."</th>
      <th>Sample</th>
      <th>".LAN_URL."</th>
      <th class='center'>".LAN_OPTIONS."</th>
      </tr>
      </thead>
      <tbody>\n";

      foreach ($content as $key => $data)
      {
          $title = htmlspecialchars((string) $data['title'], ENT_QUOTES, 'UTF-8');

          // The sample is shown as plain text: drop the remaining tags, decode entities
          // so they are not escaped twice, truncate, then escape for output.
          $plain = html_entity_decode(strip_tags((string) $data['body']), ENT_QUOTES, 'UTF-8');
          $sample = htmlspecialchars($tp->text_truncate($plain, 150), ENT_QUOTES, 'UTF-8');

          $href = htmlspecialchars($this->localPath.$key, ENT_QUOTES, 'UTF-8');
          $label = htmlspecialchars((string) $key, ENT_QUOTES, 'UTF-8');

          $text .= "<tr>
          <td>".$title."</td>\n
          <td>".$sample."</td>\n
          <td>
          <a class='e-dialog' href='".$href."'>".$label."</a>
          </td>
          ";

          $text .= "
          <td>
          ".$frm->select('add__'.$key,array('news'=>'News','page'=>'Page','0'=>'Ignore'))."
          </td>
          </tr>";
      }

      $text .= "
      </tbody>
      </table>
      <div class='buttons-bar center'>
      ".$frm->admin_button('do_conversion',LAN_CONTINUE, 'execute').
      $frm->admin_button('back',LAN_CANCEL, 'cancel')."
      <input type='hidden' name='db_import_type' value='html_import' />
      <input type='hidden' name='import_type' value='html_import' />
      <input type='hidden' name='import_source' value='".$this->sourceType."' />
      <input type='hidden' name='import_block_news' value='1' />
      <input type='hidden' name='siteUrl' value='".$siteUrl."' />
      </div>
      </fieldset>
      </form>";

      $ns->tablerender(LAN_PLUGIN_IMPORT_NAME.SEP.$siteUrl,$text);
  }
  /**
   * Fetch one page of the source site and return its HTML.
   *
   * The page is downloaded once into a per-site folder below e_TEMP (folder
   * name: md5 of the site URL) and read from that copy afterwards. When
   * $this->useTidy is set and the tidy extension is available the cached
   * file is cleaned up with tidy; otherwise, or when tidy cannot parse it,
   * the raw file content is returned. Also sets $this->localPath to the
   * per-site folder.
   *
   * @param string $file page path relative to the site URL ('' = home page)
   * @return string the page HTML, or "Couldn't read file" when it cannot be obtained
   */
  private function getRawHtml($file='')
  {
      $file = (string) $file;
      $url = $this->feedUrl."/".$file;

      if ($file == '') { $file = "index.html"; } // just for local file, not url.

      // File names originate from links in remote HTML: never let '..' segments
      // (or a NUL byte) point outside the per-site cache folder.
      $segments = preg_split('#[\\\\/]#', $file);
      if (strpos($file, "\0") !== false || in_array('..', $segments, true))
      {
          return "Couldn't read file";
      }

      $path = md5($this->feedUrl);
      $local_file = $path."/".$file;
      $this->localPath = e_TEMP.$path."/";

      // Pages may live in sub-folders (e.g. 'docs/page.html'), so create the whole chain.
      $localDir = dirname(e_TEMP.$local_file);
      if (!is_dir($localDir))
      {
          mkdir($localDir, 0755, true);
      }

      if (!file_exists(e_TEMP.$local_file))
      {
          e107::getFile()->getRemoteFile($url, $local_file); // downloads to e107_system/.../temp
      }

      if (!file_exists(e_TEMP.$local_file))
      {
          return "Couldn't read file";
      }

      // tidy is an optional extension; fall back to the raw file when it is missing or fails.
      if ($this->useTidy && function_exists('tidy_parse_file'))
      {
          $options = array("output-xhtml" => true, "clean" => true);
          $parsed = tidy_parse_file(e_TEMP.$local_file, $options);

          if (is_object($parsed) && $parsed->value !== null)
          {
              return $parsed->value;
          }
      }

      $html = file_get_contents(e_TEMP.$local_file);
      if (!$html)
      {
          return "Couldn't read file";
      }

      return $html;
  }
  /**
   * Extract the links to other pages of the same site from an HTML document.
   *
   * Every <a href> is made relative by removing the site URL and leading
   * slashes. With $type 'html' (the only supported type) a link is kept when
   * it ends in '.html' or '.htm'. Anchors without href and links that still
   * point to another host or scheme are skipped.
   *
   * @param string $content HTML to scan
   * @param string $type    kind of link to collect; only 'html' yields results
   * @return array relative href => array('title' => link text, 'href' => relative href)
   */
  private function findLinks($content,$type='html')
  {
      $pages = array();

      // loadHTML() rejects empty input, so there is nothing to parse in that case.
      if (!is_string($content) || trim($content) === '')
      {
          return $pages;
      }

      $doc = new DOMDocument();

      // Real-world markup is rarely valid; collect libxml's complaints instead of emitting warnings.
      $previous = libxml_use_internal_errors(true);
      $doc->loadHTML($content);
      libxml_clear_errors();
      libxml_use_internal_errors($previous);

      foreach ($doc->getElementsByTagName('a') as $anchor)
      {
          // Named anchors (<a name='top'>) have no href at all.
          if (!$anchor->hasAttribute('href'))
          {
              continue;
          }

          $href = str_replace((string) $this->feedUrl, "", $anchor->getAttribute('href'));

          // Still absolute ('scheme:' or '//host') after removing the site URL: the
          // link leaves the site and cannot be fetched relative to it.
          if (preg_match('#^([a-z][a-z0-9+.\-]*:|//)#i', $href))
          {
              continue;
          }

          $href = ltrim($href, "/");

          if ($type == 'html' && (substr($href,-5) == '.html' || substr($href,-4) == '.htm'))
          {
              $title = str_replace("\n", "", $anchor->nodeValue);
              $pages[$href] = array('title' => $title, 'href' => $href);
          }
      }

      return $pages;
  }
  232. //------------------------------------
  233. // Internal functions below here
  234. //------------------------------------
  /**
   * Align source data to the e107 user table.
   *
   * This provider has no user mapping: the method body is empty, so $target
   * is left exactly as passed in and nothing is returned.
   *
   * @param array $target default e107 target values for the e107_user table (by reference)
   * @param array $source source record (by reference, unused)
   * @return void
   */
  function copyUserData(&$target, &$source)
  {
      // Intentionally empty.
  }
  /**
   * Align source data with the e107 news table.
   *
   * Uses the record's 'content_encoded' entry as body, falling back to
   * 'description'; remote images in the body are localised via saveImages().
   * When the record has no title, the text before the first "<br />" of the
   * body becomes the title and the remainder (if any) the body.
   *
   * Fills news_title, news_body, news_meta_keywords and news_datestamp; all
   * other news fields keep the defaults already present in $target.
   *
   * @param array $target default e107 target values for the e107_news table (by reference)
   * @param array $source source record; each field is expected as array(0 => value)
   * @return array the updated $target
   */
  function copyNewsData(&$target, &$source)
  {
      $body = $this->process('content_encoded', $source);
      if (!$body)
      {
          $body = $this->process('description', $source);
      }

      $body = $this->saveImages((string) $body, 'news');

      // process() is not guaranteed to hand back an array when the record has no categories.
      $keywords = $this->process('category', $source);
      if (!is_array($keywords))
      {
          $keywords = array();
      }

      if (!empty($source['title'][0]))
      {
          $title = $source['title'][0];
      }
      else
      {
          // No title supplied: promote the first line of the body. array_pad() covers
          // bodies without any "<br />", where explode() yields a single element.
          $parts = array_pad(explode("<br />", $body, 2), 2, '');
          $title = strip_tags($parts[0]);

          if (trim($parts[1]) != '')
          {
              $body = $parts[1];
          }
      }

      // Records without a usable publication date are stamped with the import time
      // instead of storing FALSE.
      $stamp = isset($source['pubDate'][0]) ? strtotime($source['pubDate'][0]) : false;
      if ($stamp === false)
      {
          $stamp = time();
      }

      $target['news_title'] = $title;
      $target['news_body'] = "[html]".$body."[/html]";
      $target['news_meta_keywords'] = implode(",", $keywords);
      $target['news_datestamp'] = $stamp;

      // Not mapped by this provider: news_sef, news_extended, news_meta_description,
      // news_author, news_category, news_allow_comments, news_start, news_end, news_class,
      // news_render_type, news_comment_total, news_summary, news_thumbnail, news_sticky.

      // $this->renderDebug($source,$target); // uncomment to debug

      return $target;
  }
  /**
   * Extract one field from a source record.
   *
   * For 'category' the result is always a list of keyword strings, taken
   * either from entries shaped like array('@value' => ...) or from plain
   * string entries; empty values are skipped. For any other $type the first
   * element of $source[$type] is returned, or '' when it is not set.
   *
   * @param string $type   field name; 'category' is handled specially
   * @param array  $source source record (defaulted so the optional $type no longer
   *                       precedes a required parameter)
   * @return array|mixed list of keywords for 'category', otherwise the field value or ''
   */
  function process($type='description', $source=array())
  {
      if ($type !== 'category')
      {
          return isset($source[$type][0]) ? $source[$type][0] : '';
      }

      $keywords = array();

      // No categories at all: still return a list, callers implode() the result.
      if (!isset($source['category']) || !is_array($source['category']))
      {
          return $keywords;
      }

      // The shape of the first entry decides how the whole list is read.
      $attributeStyle = isset($source['category'][0]) && is_array($source['category'][0]);

      foreach ($source['category'] as $val)
      {
          if ($attributeStyle)
          {
              if (is_array($val) && !empty($val['@value']))
              {
                  $keywords[] = $val['@value'];
              }
          }
          elseif (is_string($val) && $val)
          {
              $keywords[] = $val;
          }
      }

      return $keywords;
  }
  /**
   * Align source data to the e107 page table.
   *
   * Uses the record's 'description' entry as page text (remote images are
   * localised via saveImages()) and fills page_title, page_text and
   * page_datestamp; all other page fields keep the defaults in $target.
   *
   * @param array $target default e107 target values for the e107_page table (by reference)
   * @param array $source source record; each field is expected as array(0 => value)
   * @return array the updated $target
   */
  function copyPageData(&$target, &$source)
  {
      // Missing fields become empty strings rather than undefined-index reads.
      $description = isset($source['description'][0]) ? $source['description'][0] : '';
      $title = isset($source['title'][0]) ? $source['title'][0] : '';

      $body = $this->saveImages($description, 'page');

      // Records without a usable publication date are stamped with the import time
      // instead of storing FALSE.
      $stamp = isset($source['pubDate'][0]) ? strtotime($source['pubDate'][0]) : false;
      if ($stamp === false)
      {
          $stamp = time();
      }

      $target['page_title'] = $title;
      $target['page_text'] = "[html]".$body."[/html]";
      $target['page_datestamp'] = $stamp;

      // Not mapped by this provider: page_id (auto increment), page_sef, page_metakeys,
      // page_metadscr, page_author, page_category, page_comment_flag, page_password.

      // $this->renderDebug($source,$target); // uncomment to debug

      return $target;
  }
  /**
   * Align source data to the e107 links table.
   *
   * Fills link_name from the record's 'title' and link_url from its 'link';
   * all other link fields keep the defaults already present in $target.
   *
   * @param array $target default e107 target values for the e107_links table (by reference)
   * @param array $source source record; each field is expected as array(0 => value)
   * @return array the updated $target
   */
  function copyLinksData(&$target, &$source)
  {
      // Missing fields become empty strings rather than undefined-index reads.
      $target['link_name'] = isset($source['title'][0]) ? $source['title'][0] : '';
      $target['link_url'] = isset($source['link'][0]) ? $source['link'][0] : '';

      // Not mapped by this provider: link_description, link_button, link_category,
      // link_order, link_parent, link_open, link_class, link_sefurl.

      // $this->renderDebug($source,$target); // uncomment to debug

      return $target;
  }
  /**
   * Download remote images referenced in a text body and point the body at the local copies.
   *
   * Every http(s)/www image URL (jpg, jpeg, gif, png, svg) found in $body is
   * stored as 'images/<md5 of site URL>/<file name>' and replaced in the body
   * by the constant-based path produced by the parser's createConstants().
   * When at least one image was found the folder is registered with the
   * media manager under $cat.
   *
   * @param string $body text that may contain remote image URLs
   * @param string $cat  media category passed to the media import
   * @return string the body with remote image links replaced by local ones
   */
  function saveImages($body,$cat='news')
  {
      $med = e107::getMedia();
      $tp = e107::getParser();
      $fl = e107::getFile();

      $body = (string) $body;
      $search = array();
      $replace = array();

      $pattern = "/((https?:\/\/|www)[-a-zA-Z0-9@:%_\+.~#?&\/=]+)\.(jpg|jpeg|gif|png|svg)/i";

      // Nothing to localise: leave the body alone and do not create an empty folder.
      if (!preg_match_all($pattern, $body, $matches) || empty($matches[0]))
      {
          return $body;
      }

      $relPath = 'images/'.md5($this->feedUrl);

      if (!is_dir(e_MEDIA.$relPath))
      {
          // The mode must be an octal integer; the string '0755' would be read as decimal 755.
          mkdir(e_MEDIA.$relPath, 0755, true);
      }

      // The same image is often referenced several times; fetch each one once.
      foreach (array_unique($matches[0]) as $link)
      {
          $filename = basename($link);

          // NOTE(review): the existence check and the rewritten link both assume that
          // getRemoteFile() stores $relPath relative to e_MEDIA - confirm against the file handler.
          if (!file_exists(e_MEDIA.$relPath."/".$filename))
          {
              $fl->getRemoteFile($link, $relPath."/".$filename);
          }

          // Rewrite the link even when the file was already present from an earlier run.
          $search[] = $link;
          $replace[] = $tp->createConstants(e_MEDIA.$relPath."/".$filename,1);
      }

      if (count($search))
      {
          $med->import($cat, e_MEDIA.$relPath);
      }

      return str_replace($search, $replace, $body);
  }
  /**
   * Debug helper: echo the source record and the mapped target side by side.
   *
   * Both arrays are formatted with print_a() in return mode and placed in a
   * two-column table, source on the left and target on the right.
   *
   * @param array $source record as read from the import source
   * @param array $target record as prepared for the e107 table
   * @return void
   */
  function renderDebug($source,$target)
  {
      $sourceDump = print_a($source,TRUE);
      $targetDump = print_a($target,TRUE);

      $html = "
      <div style='width:1000px'>
      <table style='width:100%'>
      <tr>
      <td style='width:500px;padding:10px'>".$sourceDump."</td>
      <td style='border-left:1px solid black;padding:10px'>".$targetDump."</td>
      </tr>
      </table>
      </div>";

      echo $html;
  }
  426. }
  427. ?>