PageRenderTime 49ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/extract.php

https://github.com/mozilla/phpbb3-static
PHP | 390 lines | 318 code | 41 blank | 31 comment | 59 complexity | fc5604ad82f5984b151c2853e39106b0 MD5 | raw file
  1. <?php
  // Project-local includes.
  // NOTE(review): presumably these provide the configuration variables
  // ($forum_url, database settings, ...) and helpers such as log_info() that
  // the rest of this script relies on — confirm against those files.
  require_once 'config.php';
  require_once 'common.php';

  // Strip leading and trailing slashes from the configured forum URL; later
  // code appends paths that start with '/'.
  $forum_url = trim($forum_url, '/');
  // A category is a group of forums.
  //
  // Reads the categories from the database and returns them as an array keyed
  // by category id. Each entry has the shape
  //   array('title' => <category title>, 'forums' => array())
  // where 'forums' starts out empty (it is filled in by
  // get_forums_and_topics()).
  //
  // $phpbb_version  PHPBB2 or PHPBB3; any other value yields an empty array.
  // $db             Object whose query($sql) returns an iterable of rows
  //                 (a PDO connection in this script).
  // $db_prefix      Table name prefix, interpolated into the SQL as is.
  function get_categories($phpbb_version, $db, $db_prefix) {
      // Start from an empty result so that an unrecognised version falls
      // through to an empty category list instead of iterating over an
      // undefined variable.
      $res = array();
      if ($phpbb_version == PHPBB2) {
          $res = $db->query(
              "SELECT cat_id, cat_title FROM {$db_prefix}categories " .
              "ORDER BY cat_order;");
      }
      else if ($phpbb_version == PHPBB3) {
          // phpBB3 has no separate categories table here: the top-level rows
          // of the forums table (parent_id=0) are used as categories.
          //FIXME: fix ordering
          $res = $db->query(
              "SELECT forum_id AS cat_id, forum_name AS cat_title " .
              "FROM {$db_prefix}forums WHERE parent_id=0 ORDER BY left_id;");
      }

      $categories = array();
      foreach ($res as $row) {
          $categories[$row['cat_id']] = array(
              'title' => $row['cat_title'],
              'forums' => array(),
          );
      }
      return $categories;
  }
  // Builds a lookup table describing where each forum sits in the hierarchy.
  //
  // The result is keyed by forum id:
  //   - phpBB2: array('parent_id' => -1, 'cat_id' => <cat_id column>)
  //   - phpBB3: array('parent_id' => <parent_id column>) plus, for every forum
  //     whose parent_id is not 0, 'cat_id' => id of its top-most ancestor.
  //     Top-level rows (parent_id 0) get no 'cat_id' entry.
  // Any other $phpbb_version yields an empty array.
  //
  // $db is an object whose query($sql) returns an iterable of rows;
  // $db_prefix is the table name prefix interpolated into the SQL.
  function get_forums_tree($phpbb_version, $db, $db_prefix) {
      $forums_tree = array();
      if ($phpbb_version == PHPBB2) {
          $res = $db->query("SELECT forum_id, cat_id FROM {$db_prefix}forums;");
          foreach ($res as $row) {
              $forums_tree[$row['forum_id']] = array(
                  'parent_id' => -1,
                  'cat_id' => $row['cat_id'],
              );
          }
      }
      else if ($phpbb_version == PHPBB3) {
          $res = $db->query("SELECT forum_id, parent_id FROM {$db_prefix}forums;");
          foreach ($res as $row) {
              $forums_tree[$row['forum_id']] = array(
                  'parent_id' => $row['parent_id'],
              );
          }
          foreach (array_keys($forums_tree) as $fid) {
              $parent_id = $forums_tree[$fid]['parent_id'];
              if ($parent_id == 0) {
                  continue;
              }
              // Climb towards the root. The last ancestor reached is the
              // category. The walk stops early when a parent row is missing
              // (orphaned forum) or when a forum is met twice (corrupt,
              // cyclic data), so it can neither read an undefined entry nor
              // loop forever.
              $cat_id = $parent_id;
              $visited = array($fid => true);
              while ($parent_id != 0
                      && isset($forums_tree[$parent_id])
                      && !isset($visited[$parent_id])) {
                  $visited[$parent_id] = true;
                  $cat_id = $parent_id;
                  $parent_id = $forums_tree[$parent_id]['parent_id'];
              }
              $forums_tree[$fid]['cat_id'] = $cat_id;
          }
      }
      return $forums_tree;
  }
  // Downloads the rendered forum page that shows post $post_id of topic $tid
  // and stores the HTML of every post found on that page in the DBA cache
  // $dba_id, keyed by the numeric post id. Does nothing when the page cannot
  // be downloaded or is empty.
  function cache_rendered_posts($forum_url, $tid, $post_id, $dba_id) {
      $url = $forum_url . '/viewtopic.php?t=' . $tid . '&p=' . $post_id;
      $html = file_get_contents($url);
      // A zero-length body has nothing to parse (and newer PHP versions raise
      // an error when loadHTML() is given an empty string).
      if ($html === false || $html === '') {
          return;
      }

      $doc = new DOMDocument();
      $caller = new ErrorTrap(array($doc, 'loadHTML'));
      $caller->call($html);
      // We could output these errors if we wanted. They could help
      // debugging HTML parsing issues.
      // if (!$caller->ok()) {
      //     var_dump($caller->errors());
      // }

      $xpath = new DOMXpath($doc);
      $post_divs = $xpath->query(
          "//div[contains(@class, 'post') and contains(@id, 'p')]");
      foreach ($post_divs as $div) {
          $id = $div->getAttribute('id');
          // The expression above also matches unrelated ids that merely
          // contain a "p"; only ids of the form p<digits> identify a post.
          if (!preg_match('/^p\d+$/', $id)) {
              continue;
          }
          $text_node = $xpath->query(".//div[@class='content']", $div)->item(0);
          // No content block found: skip the post. Passing null to saveHTML()
          // would serialise the whole page and cache it as the post text.
          if ($text_node === null) {
              continue;
          }
          $dbm_key = substr($id, 1);
          dba_insert($dbm_key, $doc->saveHTML($text_node), $dba_id);
      }
  }

  // Returns updated $topics.
  //
  // For every topic in $extracted['topics'] this adds a 'posts' entry: a list
  // of arrays with the keys 'username', 'post_text', 'post_time',
  // 'bbcode_uid' and 'post_id', ordered by post time.
  //
  // The post text is the HTML rendered by the live forum (downloaded from the
  // global $forum_url and cached in 'posts.cache.dbm'). When that is not
  // available, the text stored in the database is rendered with the global
  // $bb parser instead.
  //
  // $db is an object whose query($sql) returns an iterable result that also
  // offers fetch() (a PDO connection in this script); $db_prefix is the table
  // name prefix. Throws RuntimeException when the cache cannot be opened.
  function get_posts($phpbb_version, $db, $db_prefix, $extracted) {
      global $forum_url;
      global $bb;

      $res = $db->query(
          'SELECT config_value FROM ' . $db_prefix .
          "config WHERE config_name = 'smilies_path';");
      $smilies_path = $res->fetch()['config_value'];

      // This variable will be returned later.
      $topics = $extracted['topics'];

      // Cache of posts
      $dba_id = dba_open('posts.cache.dbm', 'c');
      if ($dba_id === false) {
          throw new RuntimeException(
              'Could not open the post cache posts.cache.dbm.');
      }

      try {
          // For each previously identified topic, fetch the corresponding
          // posts.
          log_info("Topics:");
          foreach (array_keys($topics) as $tid) {
              // Topic ids are interpolated into SQL below; force an integer.
              $topic_id = (int) $tid;
              if ($phpbb_version == PHPBB2) {
                  $res = $db->query('SELECT p.post_id, p.poster_id, p.post_username, u.username, p.post_time, pt.post_subject, pt.post_text, pt.bbcode_uid FROM '.$db_prefix.'posts p LEFT JOIN '.$db_prefix.'users u ON p.poster_id=u.user_id LEFT JOIN '.$db_prefix.'posts_text pt ON p.post_id=pt.post_id WHERE p.topic_id=' . $topic_id . ' ORDER BY p.post_time ASC');
              }
              else if ($phpbb_version == PHPBB3) {
                  $res = $db->query(
                      'SELECT p.post_id, p.poster_id, p.post_username, ' .
                      'u.username, p.post_time, p.post_subject, p.post_text, ' .
                      'p.bbcode_uid ' .
                      "FROM {$db_prefix}posts p " .
                      "LEFT JOIN {$db_prefix}users u ON p.poster_id=u.user_id " .
                      "WHERE p.topic_id={$topic_id} " .
                      'ORDER BY p.post_time ASC;');
              }
              else {
                  // Unknown version: no posts rather than re-reading a stale
                  // result from a previous query.
                  $res = array();
              }

              $topics[$tid]['posts'] = array();
              foreach ($res as $row) {
                  $post_id = (string) $row['post_id'];

                  // Not cached yet: download the rendered page, which caches
                  // every post it contains. That may or may not include the
                  // one we are after.
                  if (!dba_exists($post_id, $dba_id)) {
                      cache_rendered_posts($forum_url, $tid, $post_id, $dba_id);
                  }

                  if (dba_exists($post_id, $dba_id)) {
                      $post_text = dba_fetch($post_id, $dba_id);
                  } else {
                      error_log("Warning: Could not fetch post id {$row['post_id']}.");
                      // The rendered post is unavailable. Let's try to parse
                      // the representation that we've retrieved from the
                      // database. If there are links in it, they will be
                      // broken, but it's better than nothing.
                      $post_text = $row['post_text'];
                      $post_text = str_replace(':' . $row['bbcode_uid'], '', $post_text);
                      $post_text = preg_replace('/\[(\/?)code:\d*\]/', '[\1code]', $post_text);
                      $post_text = nl2br($bb->qParse($post_text));
                  }

                  // Fix the smilies paths. In a phpBB installation, links to
                  // smilies start from the top level. In the case of the
                  // archive, topics are 3 levels down, when you count slashes.
                  // So if images are in the same place as previously, we need
                  // to go 3 levels up to find them.
                  $post_text = str_replace('src="./' . $smilies_path,
                      'src="../../../' . $smilies_path, $post_text);

                  $topics[$tid]['posts'][] = array(
                      'username' => $row['username'],
                      'post_text' => $post_text,
                      'post_time' => $row['post_time'],
                      'bbcode_uid' => $row['bbcode_uid'],
                      'post_id' => $row['post_id'],
                  );
              }
              log_info(" $tid");
          } // each $topics
          log_info(" done.\n");
      } finally {
          // Release the cache handle even when a query or download throws.
          dba_close($dba_id);
      }
      return $topics;
  }
  // loadHTML is spewing warnings that are of no interest to me, and silencing
  // them is excitingly complicated.
  // Solution copied from:
  // http://stackoverflow.com/questions/1148928/disable-warnings-when-loading-non-well-formed-html-by-domdocument-php
  //
  // Wraps a callback so that PHP errors raised while it runs are collected
  // instead of being reported. Usage:
  //   $trap = new ErrorTrap(array($doc, 'loadHTML'));
  //   $trap->call($html);
  //   if (!$trap->ok()) { var_dump($trap->errors()); }
  class ErrorTrap {
      // The callable invoked by call().
      protected $callback;
      // Collected errors, each as array($errno, $errstr, $errfile, $errline).
      protected $errors = array();

      public function __construct($callback) {
          $this->callback = $callback;
      }

      // Invokes the callback with the arguments given to call() and returns
      // its result. While the callback runs, PHP errors are routed to
      // onError(). Anything the callback throws propagates to the caller.
      public function call() {
          $args = func_get_args();
          set_error_handler(array($this, 'onError'));
          try {
              return call_user_func_array($this->callback, $args);
          } finally {
              // Runs for normal returns, exceptions and Error throwables
              // alike, so the previous handler is always put back.
              restore_error_handler();
          }
      }

      // Error handler callback: records the error and returns nothing, which
      // PHP treats as "handled" (only a return value of false falls through
      // to the standard handler).
      public function onError($errno, $errstr, $errfile, $errline) {
          $this->errors[] = array($errno, $errstr, $errfile, $errline);
      }

      // True when no error has been recorded so far.
      public function ok() {
          return count($this->errors) === 0;
      }

      // All errors recorded so far, in the order they occurred.
      public function errors() {
          return $this->errors;
      }
  }
  // List of topics
  //
  // Reads the forums and the topics from the database and returns
  //   array($categories, $forums, $topics)
  // where
  //   $categories is $extracted['categories'] with the id of every forum
  //     appended to the 'forums' list of its category,
  //   $forums maps forum id => array('title', 'nposts', 'ntopics', 'topics'),
  //     'topics' being the list of topic ids in that forum,
  //   $topics maps topic id => array('fid', 'title', 'time', 'replies',
  //     'author', 'lastmod'), read newest first.
  //
  // Forums listed in the global $filter_forum, and their topics, are skipped.
  // The global $phpbb3_minor_version selects the column names to read; the
  // script stops with die() for a value other than 0, 1 or 2.
  //
  // $db is an object whose query($sql) returns an iterable of rows;
  // $db_prefix is the table name prefix interpolated into the SQL.
  function get_forums_and_topics($phpbb_version, $db, $db_prefix, $extracted) {
      global $filter_forum;
      global $phpbb3_minor_version;

      // Resolve the version-dependent column names once instead of
      // re-checking the minor version for every query and every row.
      if ($phpbb3_minor_version == 0) {
          $posts_col = 'forum_posts';
          $topics_col = 'forum_topics';
          $replies_col = 'topic_replies';
      } elseif ($phpbb3_minor_version == 1 || $phpbb3_minor_version == 2) {
          // NOTE(review): the *_approved counters are stored under the same
          // output keys as the older columns; whether 'replies' then has the
          // same meaning as topic_replies should be confirmed.
          $posts_col = 'forum_posts_approved';
          $topics_col = 'forum_topics_approved';
          $replies_col = 'topic_posts_approved';
      } else {
          die('Unknown PHPBB minor version');
      }

      $topics = array();
      $forums = array();

      // Get details of each forum. An unrecognised $phpbb_version leaves the
      // forum list empty.
      $forum_rows = array();
      if ($phpbb_version == PHPBB2) {
          $forum_rows = $db->query("SELECT forum_id, forum_name, forum_posts, forum_topics FROM {$db_prefix}forums ORDER BY forum_order;");
      }
      else if ($phpbb_version == PHPBB3) {
          //FIXME: fix ordering
          $forum_rows = $db->query("SELECT forum_id, forum_name, {$posts_col}, {$topics_col} FROM {$db_prefix}forums WHERE parent_id<>0 ORDER BY left_id;");
      }

      $categories = $extracted['categories'];
      $forums_tree = $extracted['forums_tree'];
      foreach ($forum_rows as $row) {
          $fid = $row['forum_id'];
          // Loose comparison on purpose: ids read from the database may be
          // strings while the configured filter may hold integers.
          if (in_array($fid, $filter_forum)) {
              continue;
          }
          $cat_id = $forums_tree[$fid]['cat_id'];
          $forums[$fid] = array(
              'title' => $row['forum_name'],
              'nposts' => $row[$posts_col],
              'ntopics' => $row[$topics_col],
              'topics' => array()
          );
          $categories[$cat_id]['forums'][] = $fid;
      }

      // Get topics. Rows with a non-zero topic_moved_id are left out.
      // Append " LIMIT 100" before the semicolon in development for faster
      // runs.
      $topic_rows = $db->query(
          'SELECT t.forum_id, t.topic_id, t.topic_title, t.topic_time, ' .
          "t.{$replies_col}, u.username " .
          "FROM {$db_prefix}topics t " .
          "LEFT JOIN {$db_prefix}users u ON t.topic_poster=u.user_id " .
          'WHERE t.topic_moved_id = 0 ' .
          'ORDER BY t.topic_time DESC;');

      foreach ($topic_rows as $row) {
          $fid = $row['forum_id'];
          if (in_array($fid, $filter_forum)) {
              continue;
          }
          $topics[$row['topic_id']] = array(
              'fid' => $fid,
              'title' => $row['topic_title'],
              'time' => $row['topic_time'],
              'replies' => $row[$replies_col],
              'author' => $row['username'],
              // Topic creation time as an ISO 8601 UTC timestamp.
              'lastmod' => gmdate('Y-m-d\TH:i:s\Z', $row['topic_time']),
          );
          $forums[$fid]['topics'][] = $row['topic_id'];
      }

      return array($categories, $forums, $topics);
  }
  // Encodes $what as pretty-printed JSON and writes it, followed by a
  // newline, to the file $where_to (replacing any previous content).
  //
  // Progress is reported through log_info(). Failures are reported through
  // error_log() and leave the function without raising: when the data cannot
  // be encoded nothing is written at all.
  function save_data_in_json($what, $where_to) {
      // The encoding flags aren't crucial, they are just here because they
      // make it easier for me to review the resulting JSON.
      log_info('Encoding to JSON… ');
      $encoded_data = json_encode($what,
          JSON_PRETTY_PRINT | JSON_HEX_APOS | JSON_HEX_QUOT
          | JSON_HEX_AMP | JSON_UNESCAPED_UNICODE);
      if ($encoded_data === false) {
          error_log('Could not encode data to JSON.');
          return;
      }

      log_info('saving to disk… ');
      // The encoder doesn't add a newline at the end.
      // file_put_contents() reports failure through its return value, so an
      // unwritable destination is handled here instead of passing a failed
      // file handle on to further calls.
      if (file_put_contents($where_to, $encoded_data . "\n") === false) {
          error_log("Could not write JSON data to {$where_to}.");
          return;
      }
      log_info('done.');
  }
  // Main script: connect to the forum database, extract categories, the
  // forum hierarchy, forums, topics and posts, and save everything to
  // forum-data.json.
  try {
      // The connection is opened inside the try block so that connection
      // failures reach the handler below as well. Exception mode makes failed
      // queries throw instead of returning false.
      $db = new PDO(
          'mysql:host=' . $db_host . ';dbname=' . $db_name . ';charset=utf8mb4',
          $db_user, $db_pass,
          array(PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION));

      $extracted = array();
      $extracted['categories'] = get_categories($phpbb_version, $db, $db_prefix);
      $extracted['forums_tree'] = get_forums_tree($phpbb_version, $db, $db_prefix);
      // Each step below consumes what the previous steps stored in
      // $extracted, so the order matters.
      list($extracted['categories'],
           $extracted['forums'],
           $extracted['topics']) = get_forums_and_topics($phpbb_version, $db, $db_prefix, $extracted);
      $extracted['topics'] = get_posts($phpbb_version, $db, $db_prefix, $extracted);
      save_data_in_json($extracted, 'forum-data.json');
  } catch(PDOException $ex) {
      // Print a short notice, then rethrow so the script still fails loudly.
      echo "An Error occured! " . $ex->getMessage();
      throw $ex;
  }