PageRenderTime 59ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 1ms

/classes/CphpBB.php

https://gitlab.com/ezeql/fups
PHP | 449 lines | 307 code | 40 blank | 102 comment | 48 complexity | 637508cd856145a9a644f21672751880 MD5 | raw file
  1. <?php
  2. /*
  3. * FUPS: Forum user-post scraper. An extensible PHP framework for scraping and
  4. * outputting the posts of a specified user from a specified forum/board
  5. * running supported forum software. Can be run as either a web app or a
  6. * commandline script.
  7. *
  8. * Copyright (C) 2013-2015 Laird Shaw.
  9. *
  10. * This program is free software: you can redistribute it and/or modify
  11. * it under the terms of the GNU Affero General Public License as
  12. * published by the Free Software Foundation, either version 3 of the
  13. * License, or (at your option) any later version.
  14. *
  15. * This program is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. * GNU Affero General Public License for more details.
  19. *
  20. * You should have received a copy of the GNU Affero General Public License
  21. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  22. *
  23. */
  24. /* File : classes/CphpBB.php.
  25. * Description: The phpBB forum scraper class, descending from FUPSBase.
  26. */
  27. require_once __DIR__.'/../phpBB-days-and-months-intl.php';
  28. class phpBBFUPS extends FUPSBase {
  29. protected $regexps = null;
  30. protected $old_version = false;
  /**
   * Constructs the phpBB scraper.
   *
   * Unless $do_not_init is true, 'extract_user' is appended to the optional
   * settings before the parent constructor runs (presumably so the parent sees
   * it while initialising - confirm against FUPSBase), and afterwards the
   * per-skin table of regular expressions is assigned to $this->regexps.
   *
   * @param mixed $web_initiated Passed through unchanged to the parent constructor.
   * @param mixed $params        Passed through unchanged to the parent constructor.
   * @param bool  $do_not_init   When true, neither the optional setting nor the regexps are set up here.
   */
  public function __construct($web_initiated, $params, $do_not_init = false) {
    if (!$do_not_init) {
      $this->optional_settings[] = 'extract_user';
    }
    parent::__construct($web_initiated, $params, $do_not_init);
    if (!$do_not_init) {
      $this->regexps = array(
        /* Reference for the keys each skin entry may define:

        'template_skin' => array(
          'sid'                            => a regexp to extract the SID value from the login page
                                              <ucp.php?mode=login>
          'board_title'                    => a regexp to extract the board's title from the login page
                                              <ucp.php?mode=login>
          'login_success'                  => a regexp to match the html of a successful-login page
          'login_required'                 => a regexp to match a phpBB error that login is required to view
                                              member details
          'user_name'                      => a regexp to extract the user's name from the user's profile page
                                              <memberlist.php?mode=viewprofile&u=[user_id]>
          'thread_author'                  => a regexp to extract the thread's author from the thread view page
                                              <viewtopic.php?f=[forumid]&t=[topicid]>
          'search_results_not_found'       => a regexp to detect when a search results page returns no results, i.e. on:
                                              <search.php?st=0&sk=t&sd=d&author_id=[author_id]&start=[start]>
          'search_results_page_data'       => a regexp to be matched on the user's posts search page
                                              <search.php?st=0&sk=t&sd=d&author_id=[author_id]&start=[start]> using
                                              preg_match_all with flags set to PREG_SET_ORDER so that each entry of
                                              $matches ends up with the following matches in the order specified in
                                              search_results_page_data_order.
                                              N.B. Must not match any results matched by any other
                                              search_results_page_data regex - the results of all are combined!
          'search_results_page_data_order' => an array specifying the order in which the following matches occur
                                              in the matches returned by the previous regexp:
                                              array(
                                                'title'   => the match index of the title of post,
                                                'ts'      => the match index of the timestamp of post,
                                                'forum'   => the match index of the title of forum,
                                                'topic'   => the match index of the thread topic,
                                                'forumid' => the match index of the forum id,
                                                'topicid' => the match index of the topic id,
                                                'postid'  => the match index of the post id,
                                              )
          'search_id'                      => a regexp to match the search id (only available in older versions of phpBB)
          'post_contents'                  => a regexp to match post id (first match) and post contents (second match)
                                              on a thread page; it is called with match_all so it will return all
                                              post ids and contents on the page
          'prev_page'                      => a regexp to extract the forumid (first match), topicid (second match) and
                                              start (third match) parameters from the "previous page" url on a thread
                                              view page
          'next_page'                      => a regexp to extract the forumid (first match), topicid (second match) and
                                              start (third match) parameters from the "next page" url on a thread
                                              view page
        ),
        */
        'mobile' => array(
          'board_title' => '#<!DOCTYPE html PUBLIC "-//WAPFORUM//DTD XHTML Mobile.*&bull;[ ]((?:(?!&bull;).)*)</title>#Us',
          'login_success' => '#<table cellspacing="0">\\s*<tr class="row1">\\s*<td align="center"><p class="gen">#Us',
          'login_required' => '#<table cellspacing="0">\\s*<tr class="row2">\\s*<td>#',
          'user_name' => '#<b class="genmed">([^<]*)</b>#',
          'thread_author' => '#<strong class="postauthor"[^>]*>[ ]([^<]*)</strong>#',
          'search_results_not_found' => '#<h2>[^0-9<]*0[^0-9<]*</h2>#',
          # N.B. Must not match any results matched by any other search_results_page_data regex - the results of all are combined!
          'search_results_page_data' => '#<span class="topictitle"><a name="p(\d+?)".*viewforum\.php\?f=(\d+?)[^>]*>([^<]*)</a>.*viewtopic\.php\?f=\d+?&amp;t=(\d+?)[^>]*>([^<]*)</a>.*viewtopic\.php\?[^>]*>([^<]*)</a>.*</b>[ ]([^<]*)</p>#Us',
          'search_results_page_data_order' => array('title' => 6, 'ts' => 7, 'forum' => 3, 'topic' => 5, 'forumid' => 2, 'topicid' => 4, 'postid' => 1),
          'post_contents' => '#<tr class="row1">\\s*<td class="gensmall"><a href="\\./viewtopic\\.php\\?p=(\\d+?).*<tr class="row1">\\s*<td>\\s*<div class="postbody">(.*)</div>\\s*</td>\\s*</tr>\\s*</table>#Us',
        ),
        'prosilver.1' => array(
          'sid' => '/name="sid" value="([^"]*)"/',
          'board_title' => '#<h1>(.*)</h1>#',
          'login_success' => '/<div class="panel" id="message">/',
          'login_required' => '/class="panel"/',
          'user_name' => '#<dl class="left-box details"[^>]*>\\s*<dt>[^<]*</dt>\\s*<dd>\\s*<span>([^<]+)</span>#Us',
          'thread_author' => '#<p class="author">.*memberlist\.php.*>(.+)<#Us',
          'search_results_not_found' => '#<div class="panel" id="message">\\s*<div class="inner"><span class="corners-top"><span></span></span>\\s*<h2>#Us',
          # N.B. Must not match any results matched by any other search_results_page_data regex - the results of all are combined!
          'search_results_page_data' => '#<h3>[^>]*>([^<]*)</a>.*<dl class="postprofile">.*<dd>'.get_posted_on_date_regex().' (.+)</dd>.*<dd>[^:]*: .*>(.+)</a>.*<dd>[^:]*: .*>(.+)</a>.*viewtopic\.php\?f=(\d+?)&amp;t=(\d+?)&amp;p=(\d+?)#Us',
          'search_results_page_data_order' => array('title' => 1, 'ts' => 3, 'forum' => 4, 'topic' => 5, 'forumid' => 6, 'topicid' => 7, 'postid' => 8),
          'post_contents' => '#<div id="p(\d+)".*<div class="content">(.*)</div>[\r\n]+#Us',
          'prev_page' => '#<strong>\\d+</strong>[^<]+<strong>\\d+</strong>.*<a href="\\./viewtopic\\.php\?f=(\\d+)&amp;t=(\\d+)&amp;start=(\\d+?)[^"]*">\\d+</a><span class="page-sep">, </span><strong>\\d+</strong>#Us',
          'next_page' => '#<strong>\\d+</strong><span class="page-sep">, </span><a href="\\./viewtopic\\.php\\?f=(\\d+)&amp;t=(\\d+)&amp;start=(\\d+?)[^"]*">[^<]*</a>#Us',
        ),
        'prosilver.2' => array(
          'search_results_page_data' => '#<h3>[^>]*>([^<]*)</a>.*<dl class="postprofile">(?:(?!</dl>).)*<dd>([^<]+)</dd>.*<dd>[^:]*: .*>(.+)</a>.*<dd>[^:]*: .*>(.+)</a>.*viewtopic\.php\?f=(\d+?)&amp;t=(\d+?)&amp;p=(\d+?)#Us',
          'search_results_page_data_order' => array('title' => 1, 'ts' => 2, 'forum' => 3, 'topic' => 4, 'forumid' => 5, 'topicid' => 6, 'postid' => 7),
        ),
        'prosilver.3' => array(
          'search_results_page_data' => '#<dl class="postprofile">.*<dd[^>]*>([^<]+)</dd>.*<dd>[^:]*: .*>(.+)</a>.*<dd>[^:]*: .*>(.+)</a>.*<h3>.*viewtopic\.php\?f=(\d+?)&amp;t=(\d+?)&amp;p=(\d+?)[^>]*>([^<]+)</a>#Us',
          'search_results_page_data_order' => array('title' => 7, 'ts' => 1, 'forum' => 2, 'topic' => 3, 'forumid' => 4, 'topicid' => 5, 'postid' => 6),
        ),
        'subsilver.2' => array(
          /* 'sid' => ? (not constructed yet), */
          /* 'board_title' => ? (not constructed yet), */
          'login_success' => '#<table class="tablebg" width="100%" cellspacing="1">\\s*<tr>\\s*<th>[^<]*</th>\\s*</tr>\\s*<tr>\\s*<td class="row1" align="center"><br /><p class="gen">#Us',
          /* 'login_required' => ? (not constructed yet), */
          'user_name' => '#<td align="center"><b class="gen">([^<]*)</b></td>#',
          /* 'thread_author' => ? (not constructed yet), */
          'search_results_not_found' => '#<td class="row1" align="center"><br /><p class="gen">[^<]*</p><br /></td>#',
          # N.B. Must not match any results matched by any other search_results_page_data regex - the results of all are combined!
          'search_results_page_data' => '#<tr class="row2">\\s*<td colspan="2" height="25"><p class="topictitle"><a name="p(\\d+)" id="p\\d+"></a>&nbsp;[^:]*: <a href="\\./viewforum\\.php\\?f=(\\d+?)[^"]*">([^<]*)</a> &nbsp; [^:]*: <a href="\\./viewtopic\\.php\\?f=\\d+&amp;t=(\\d+?)[^"]*">([^<]+)</a> </p></td>\\s*</tr>\\s*<tr class="row1">\\s*<td width="150" align="center" valign="middle"><b class="postauthor"><a href="[^"]*">[^<]*</a></b></td>\\s*<td height="25">\\s*<table width="100%" cellspacing="0" cellpadding="0" border="0">\\s*<tr>\\s*<td class="gensmall">\\s*<div style="float: left;">\\s*&nbsp;<b>[^:]*:</b> <a href="[^"]*">([^<]*)</a>\\s*</div>\\s*<div style="float: right;"><b>[^:]*:</b>\\s(.*)&nbsp;</div>#Us',
          'search_results_page_data_order' => array('title' => 6, 'ts' => 7, 'forum' => 3, 'topic' => 5, 'forumid' => 2, 'topicid' => 4, 'postid' => 1),
          /* 'post_contents' => ? (not constructed yet), */
          /* 'prev_page' => same as for prosilver.1 */
          'next_page' => '#<strong>\d+</strong>[^<]+<strong>\d+</strong>.*<a href="\\./viewtopic\\.php\?f=(\\d+)&amp;t=(\d+)&amp;start=(\\d+?)[^"]*">[^<]*</a></b></td>#Us',
        ),
        // Try the above first
        'subsilver.2x' => array(
          // N.B. Must not match any results matched by any other search_results_page_data regex - the results of all are combined!
          'search_results_page_data' => '#<tr class="row2">\\s*<td colspan="2" height="25"><p class="topictitle"><a name="p(\\d+?)" id="p\\d+"></a>&nbsp;[^:]*: <a href="\\./viewforum\\.php\\?f=(\\d+?)[^"]*">([^<]*)</a> &nbsp; [^:]*: <a href="\\./viewtopic\\.php\\?f=\\d+&amp;t=(\\d+?)">([^<]+)</a> </p></td>\\s*</tr>\\s*<tr class="row1">\\s*<td width="150" align="center" valign="middle"><b class="postauthor"><a href="[^"]*">[^<]*</a></b></td>\\s*<td height="25">\\s*<table width="100%" cellspacing="0" cellpadding="0" border="0">\\s*<tr>\\s*<td class="gensmall">\\s*<div style="float: left;">\\s*\\[[^\\]]*\\]\\s*</div>\\s*<div style="float: right;"><b>[^:]*:</b>\\s(.*)&nbsp;</div>#Us',
          'search_results_page_data_order' => array('title' => 7 /* this match is deliberately designed to be an empty one because posts matching this regex don't actually have a title, which is the whole reason this subsilver.2x entry is necessary */, 'ts' => 6, 'forum' => 3, 'topic' => 5, 'forumid' => 2, 'topicid' => 4, 'postid' => 1),
        ),
        'subsilver.1' => array(
          /* 'sid' => ? (not constructed yet), */
          /* 'board_title' => ? (not constructed yet), */
          /* 'login_success' => ? (not constructed yet), */
          /* 'login_required' => ? (not constructed yet), */
          /* 'user_name' => ? (not constructed yet), */
          'thread_author' => '#<b class="postauthor">(.+)</b>#Us',
          /* 'search_results_not_found' => ? (not constructed yet), */
          // N.B. Must not match any results matched by any other search_results_page_data regex - the results of all are combined!
          /* 'search_results_page_data' => ? (not constructed yet), */
          'post_contents' => '#<a name="p(\\d+)">.*<td valign="top">\\s*<table width="100%" cellspacing="5">\\s*<tr>\\s*<td>\\s*?(.*)(\\s*?<br /><br />\\s*<span class="gensmall">.*</span>|)\\n\\s*<br clear="all" /><br />#Us',
          'prev_page' => '#<a href="\\./viewtopic\\.php\\?f=(\\d+)&amp;t=(\\d+)&amp;start=(\\d+)">[^<]+</a>&nbsp;&nbsp;<a href="\\./viewtopic\\.php\\?f=\\d+&amp;t=\\d+[^"]*">\\d+</a><span class="page-sep">,#',
          /* 'next_page' => ? (not constructed yet), */
        ),
        'subsilver.2005' => array(
          'login_success' => '#<a href="privmsg\\.php\\?folder=inbox"#',
          'user_name' => '#alt="[^\\[]*\\[ (.*) \\]"#',
          'search_results_page_data' => '#<span class="topictitle">.*&nbsp;<a href="viewtopic\\.php\\?t=(\\d+?).*class="topictitle">([^<]*)</a></span></td>.*<span class="postdetails">[^:]*:&nbsp;<b><a href="viewforum\\.php\\?f=(\\d+?)[^>]*>([^<]*)</a></b>&nbsp; &nbsp;[^:]*: ([^&]*?)&nbsp;.*viewtopic\\.php\\?p=(\\d+?)[^>]*>([^<]*)</a></b></span>#Us',
          'search_results_page_data_order' => array('topicid' => 1, 'topic' => 2, 'forumid' => 3, 'forum' => 4, 'ts' => 5, 'postid' => 6, 'title' => 7),
          'post_contents' => '#<td class="row1".*<a href="viewtopic\\.php\\?p=(\\d+?).*<tr>\\s*<td colspan="2"><hr /></td>\\s*</tr>\\s*<tr>\\s*<td colspan="2">(.*)</td>\\s*</tr>\\s*</table></td>\\s*</tr>#Us',
        ),
        'subsilver.0' => array(
          'sid' => '#href="\\./index\\.php\\?sid=([^"]*)"#',
          /** @todo Remove English-specific components of this regex ("Log in" and potentially the double-colon). */
          'board_title' => '#<title>(.*) :: Log in</title>#',
          /* 'login_success' => ? (not constructed yet), */
          /* 'login_required' => ? (not constructed yet), */
          /* 'user_name' => ? (not constructed yet), */
          'search_results_not_found' => '#<table border="0" cellpadding="3" cellspacing="1" width="100%" class="forumline" align="center">\\s*<tr>\\s*<th width="150" height="25" class="thCornerL" nowrap="nowrap">Author</th>\\s*<th width="100%" class="thCornerR" nowrap="nowrap">Message</th>\\s*</tr>\\s*<tr>\\s*<td class="catBottom" colspan="2" height="28" align="center">&nbsp; </td>\\s*</tr>\\s*</table>#Us',
          // N.B. Must not match any results matched by any other search_results_page_data regex - the results of all are combined!
          // The author's profile link is matched with u=\d+ rather than one literal user ID, so that
          // results are recognised whichever user's posts are being searched.
          'search_results_page_data' => '#<tr>\\s*<td[^>]*><span class="topictitle"><img src="[^"]+" align="absmiddle" />&nbsp; .*:&nbsp;<a href="viewtopic\\.php\\?t=(\\d+)&amp;highlight=" class="topictitle">([^<]*)</a></span></td>\\s*</tr>\\s*<tr>\\s*<td width="\\d+" align="left" valign="top" class="row1" rowspan="2"><span class="name"><b><a href="profile\\.php\\?mode=viewprofile&amp;u=\\d+">[^<]*</a></b></span><br />\\s*<br />\\s*<span class="postdetails">[^<]*<b>[^<]*</b><br />\\s*[^<]*<b>[^<]*</b></span><br />\\s*</td>\\s*<td width="100%" valign="top" class="row1"><img[^>]*><span class="postdetails">[^<]*<b><a href="viewforum\\.php\\?f=(\\d+)" class="postdetails">([^<]*)</a></b>&nbsp; &nbsp;[^:]*: (.*)&nbsp; &nbsp;[^:]*: <b><a href="viewtopic\\.php\\?p=(\\d+)&amp;highlight=\\#\\d+">([^<]+)</a></b></span></td>\\s*</tr>#Us',
          'search_results_page_data_order' => array('title' => 7, 'ts' => 5, 'forum' => 4, 'topic' => 2, 'forumid' => 3, 'topicid' => 1, 'postid' => 6),
          'search_id' => '#\\?search_id=(\\d+)&#',
          'post_contents' => '#<tr>\\s*<td width="100%"><a href="viewtopic\\.php\\?p=(\\d+?)[^\\#]*\\#\\d+"><img[^>]*></a><span class="postdetails">[^<]*<span class="gen">&nbsp;</span>[^<]*</span></td>\\s*<td valign="top" nowrap="nowrap"><a href="posting\\.php\\?[^"]*"><img[^>]*></a>\\s*</td>\\s*</tr>\\s*<tr>\\s*<td colspan="2"><hr /></td>\\s*</tr>\\s*<tr>\\s*<td colspan="2"><span class="postbody">(.*)</span><span class="gensmall">(<br /><br />|)[^<]*</span></td>\\s*</tr>#Us',
          'thread_author' => '#<b>(.*?)</b></span><br /><span class="postdetails">#',
          // The empty first group stands in for the forum id, which these URLs do not carry.
          'prev_page' => '#()<span class="gensmall"><b>.*?<a href="viewtopic\\.php\\?t=(\\d+?).*start=(\\d+?)[^>]*>[^<]*</a>, <b>#U',
          'next_page' => '#()<span class="gensmall"><b>[^<]+<a href="viewtopic\\.php\\?t=(\\d+).*start=(\\d+)[^"]*">#',
        ),
        'forexfactory' => array(
          'sid' => '#SSIONURL = \'?(s\=)(.*&)|(.*)\';#',
          'board_title' => '#<title>(.*)</title>#',
        ),
      );
    }
  }
  /**
   * Probes the board's login page and, if any login credential was supplied, logs in.
   *
   * The probe of ucp.php?mode=login always runs so that $this->old_version is set
   * whether or not credentials were given: an error on that request is taken to mean
   * an earlier phpBB version that uses login.php instead. When a username or password
   * is present in the settings, the SID is extracted from the login page, the login
   * form is POSTed, and the cURL handle is switched back from POST afterwards.
   */
  protected function check_do_login() {
    # Probe the newer-style login URL first; failure marks this board as an old version.
    $this->set_url($this->settings['base_url'].'/ucp.php?mode=login');
    $redirect = false;
    $html = $this->do_send($redirect, /*$quit_on_error*/false, $err);
    if ($err) {
      $this->old_version = true;
    }

    # Everything below is only relevant when the user supplied credentials.
    $credentials_given = !empty($this->settings['login_user']) || !empty($this->settings['login_password']);
    if (!$credentials_given) {
      return;
    }

    if ($this->old_version) {
      # Earlier versions of phpBB serve the login form from a different URL.
      $this->set_url($this->settings['base_url'].'/login.php');
      $html = $this->do_send();
    }

    if (!$this->skins_preg_match('sid', $html, $matches)) {
      $this->exit_err('Could not find the hidden sid input on the login page. The URL of the searched page is <'.$this->last_url.'>', __FILE__, __METHOD__, __LINE__, $html);
    } else {
      $sid = $matches[1];
      if ($this->dbg) {
        $this->write_err('SID: '.$sid);
      }
    }

    $this->write_status('Attempting to log in.');
    if ($this->dbg) {
      $this->write_err('Attempting to log in.');
    }

    # Build the login form submission.
    $postfields = array(
      'username' => $this->settings['login_user'],
      'password' => $this->settings['login_password'],
      'autologin' => '',
      'viewonline' => '',
      'redirect' => 'index.php',
      'sid' => $sid,
      'login' => 'true',
    );
    $opts = array(
      CURLOPT_POST => true ,
      CURLOPT_POSTFIELDS => $postfields
    );
    $opts_applied = curl_setopt_array($this->ch, $opts);
    if (!$opts_applied) {
      $this->exit_err('Failed to set the following cURL options:'.PHP_EOL.var_export($opts, true), __FILE__, __METHOD__, __LINE__);
    }

    # A successful login either redirects via HTTP or returns a page with a message matching the 'login_success' regex.
    $html = $this->do_send($redirect);
    $logged_in = (!$html && $redirect) || $this->skins_preg_match('login_success', $html, $dummy);
    if (!$logged_in) {
      $this->exit_err('Login was unsuccessful (did not find success message). This could be due to a wrong username/password combination. The URL is <'.$this->last_url.'>', __FILE__, __METHOD__, __LINE__, $html);
    } else if ($this->dbg) {
      $this->write_err('Logged in successfully.');
    }

    # Set cURL method back to GET because this class and especially its ancestor rely on the default method being GET
    $reset_ok = curl_setopt($this->ch, CURLOPT_POST, false);
    if (!$reset_ok) {
      $this->exit_err('Failed to set cURL option CURLOPT_POST back to false.',__FILE__, __METHOD__, __LINE__);
    }
  }
  /**
   * End-of-search-page hook: captures the search id from the first results page.
   *
   * Only while the search counter is still zero and the earliest post has not
   * been found is the 'search_id' regexp tried against the page; on a match its
   * first group is stored in $this->search_id. The parent hook is always called
   * afterwards with the same arguments.
   */
  protected function find_author_posts_via_search_page__end_hook(&$do_inc_progress_level, $html, $found_earliest, $matches) {
    $on_first_page = ($this->post_search_counter === 0);
    if ($on_first_page && !$found_earliest && $this->skins_preg_match('search_id', $html, $search_id_match)) {
      $this->search_id = $search_id_match[1];
    }
    parent::find_author_posts_via_search_page__end_hook($do_inc_progress_level, $html, $found_earliest, $matches);
  }
  243. # For quotes rendered by the subsilver skin
  /**
   * Returns an inline <style> element with rules for the quote and attachment
   * title/content classes.
   *
   * @return string The complete <style> block.
   */
  protected function get_extra_head_lines() {
    $style_block = '<style type="text/css">
.quotetitle, .attachtitle {
margin: 10px 5px 0 5px;
padding: 4px;
font-weight: bold;
}
.quotecontent, .attachcontent {
margin: 0 5px 10px 5px;
padding: 5px;
font-weight: normal;
}
</style>';
    return $style_block;
  }
  /**
   * Returns the homepage URL of the phpBB forum software.
   *
   * @return string
   */
  public static function get_forum_software_homepage() {
    $homepage = 'https://www.phpbb.com/';
    return $homepage;
  }
  /**
   * Returns the user-facing explanation of how to recognise a phpBB forum.
   *
   * @return string
   */
  public static function get_msg_how_to_detect_forum() {
    $msg = 'Typically, phpBB forums can be identified by the presence of the text "Powered by phpBB" in the footer of their forum pages. It is possible, however, that these footer texts have been removed by the administrator of the forum. In this case, the only way to know for sure is to contact your forum administrator.';
    return $msg;
  }
  /**
   * End hook for post retrieval: looks for a missing post on the neighbouring thread pages.
   *
   * Sometimes (this seems to be a phpBB bug) a post does not appear on the thread
   * page it is supposed to, and instead appears on the previous or next page. When
   * $found is false on entry, the previous page is tried first and then, if the post
   * is still missing, the next page. Both page links are extracted from $html.
   *
   * @param mixed  $forumid Not used by this implementation.
   * @param mixed  $topicid Passed on to get_post_contents_from_matches().
   * @param mixed  $postid  ID of the post being looked for.
   * @param string $html    HTML of the thread page on which the post was expected.
   * @param bool   $found   In/out: set from the result of searching the neighbouring pages.
   * @param mixed  $err     When truthy (or in debug mode), a miss on the next page is reported.
   * @param int    $count   Overwritten locally with the number of posts retrieved from a neighbouring page.
   * @param bool   $ret     Out: set to true when the post is found on a neighbouring page.
   */
  protected function get_post_contents__end_hook($forumid, $topicid, $postid, $html, &$found, $err, $count, &$ret) {
    # Remember the page we started from; last_url changes once a neighbouring page is fetched.
    $start_url = $this->last_url;

    # First attempt: the previous page of the thread.
    if (!$found) {
      $this->write_err('Trying to find post ID '.$postid.' on previous page of thread, if that page exists.');
      if ($this->skins_preg_match('prev_page', $html, $prev_link)) {
        $this->set_url($this->get_topic_url($prev_link[1], $prev_link[2], $prev_link[3]));
        $prev_html = $this->do_send();
        if ($this->skins_preg_match_all('post_contents', $prev_html, $prev_posts)) {
          list($found, $count) = $this->get_post_contents_from_matches($prev_posts, $postid, $topicid);
          if ($found) {
            $this->write_err('Success! Retrieved post contents of post ID "'.$postid.'".');
            $ret = true;
            # Do not count the sought post itself among the "other" posts.
            $count--;
          } else {
            $this->write_and_record_err_admin("Warning: post ID '$postid' not found on previous page. The URL of that previous page is <".$this->last_url.'>.', __FILE__, __METHOD__, __LINE__, $prev_html);
          }
          if ($count > 0 && $this->dbg) {
            $this->write_err('Retrieved '.$count.' other posts from the page.');
          }
        } else {
          $this->write_and_record_err_admin('Warning: could not find any post contents on the previous page in the thread. The URL of that previous page in the thread is: '.$this->last_url, __FILE__, __METHOD__, __LINE__, $prev_html);
        }
      } else {
        $this->write_and_record_err_admin('Warning: could not extract the details of the previous thread page from the current page. The URL of the current page is <'.$start_url.'>.', __FILE__, __METHOD__, __LINE__, $html);
      }
    }

    # Second attempt: the next page of the thread.
    if (!$found) {
      $this->write_err('Trying to find post ID '.$postid.' on next page of thread, if that page exists.');
      if ($this->skins_preg_match('next_page', $html, $next_link)) {
        $this->set_url($this->get_topic_url($next_link[1], $next_link[2], $next_link[3]));
        $next_html = $this->do_send();
        if ($this->skins_preg_match_all('post_contents', $next_html, $next_posts)) {
          list($found, $count) = $this->get_post_contents_from_matches($next_posts, $postid, $topicid);
          if ($found) {
            $this->write_err('Success! Retrieved post contents of post ID "'.$postid.'".');
            $ret = true;
            # Do not count the sought post itself among the "other" posts.
            $count--;
          } else if ($err || $this->dbg) {
            $this->write_and_record_err_admin("Warning: post ID '$postid' not found on next page. The URL of that next page is <".$this->last_url.'>.', __FILE__, __METHOD__, __LINE__, $next_html);
          }
          if ($count > 0 && $this->dbg) {
            $this->write_err('Retrieved '.$count.' other posts from the page.');
          }
        } else {
          $this->write_and_record_err_admin('Warning: could not find any post contents on the next page in the thread. The URL of that next page in the thread is: '.$this->last_url, __FILE__, __METHOD__, __LINE__, $next_html);
        }
      } else {
        $this->write_and_record_err_admin('Warning: could not extract the details of the next thread page from the current page. The URL of that page is <'.$start_url.'>.', __FILE__, __METHOD__, __LINE__, $html);
      }
    }
  }
  /**
   * Builds the viewtopic.php URL of a single post.
   *
   * @param mixed $forumid   Forum ID, inserted as the "f" parameter.
   * @param mixed $topicid   Topic ID, inserted as the "t" parameter.
   * @param mixed $postid    Post ID, inserted as the "p" parameter.
   * @param bool  $with_hash When true, a "#p<postid>" fragment is appended.
   * @return string
   */
  protected function get_post_url($forumid, $topicid, $postid, $with_hash = false) {
    $url = $this->settings['base_url'].'/viewtopic.php?f='.$forumid.'&t='.$topicid.'&p='.$postid;
    if ($with_hash) {
      $url .= '#p'.$postid;
    }
    return $url;
  }
  /**
   * Returns the question-and-answer entries for the phpBB scraper.
   *
   * The result starts with 'q_how_know_phpbb', followed by the parent's entries
   * (with 'q_relationship' merged in) in their existing order. Immediately after
   * the entry whose key is 'q_lang', the phpBB-specific entries about login,
   * safety, images and supported skins are inserted.
   *
   * @return array Map of entry ID => array('q' => question, 'a' => answer HTML).
   */
  public static function get_qanda() {
    $qanda = parent::get_qanda();
    $qanda = array_merge($qanda, array(
      'q_relationship' => array(
        'q' => 'Does this script have any relationship with <a href="https://github.com/ProgVal/PHPBB-Extract">the PHPBB-Extract script on GitHub</a>?',
        'a' => 'No, they are separate projects.',
      ),
    ));
    $qanda_new = array(
      'q_how_know_phpbb' => array(
        'q' => 'How can I know if a forum is a phpBB forum?',
        'a' => self::get_msg_how_to_detect_forum(),
      )
    );
    foreach ($qanda as $id => $qa) {
      $qanda_new[$id] = $qa;
      // Strict comparison: with ==, an integer key 0 compares equal to any
      // non-numeric string on PHP < 8 and would trigger the insertion wrongly.
      if ($id === 'q_lang') {
        $qanda_new['q_login_req'] = array(
          'q' => 'Do I need to supply a login username and password?',
          'a' => '<p>Probably not. These are the conditions under which you do:</p>
<ul>
<li>You do not supply a value for the Extract User Username setting, and the phpBB board you\'re retrieving from requires login before it will display member information.</li>
<li>Your local timezone (configured in your board preferences) is different to the board\'s default timezone, and you wish for all dates and times displayed against your posts to be in your local timezone.</li>
<li>You are retrieving posts from a private forum.</li>
</ul>',
        );
        $qanda_new['q_login_details_safe'] = array(
          'q' => 'Is it safe to supply my login username and password?',
          'a' => '<p>You will need to use your judgement here. I have attempted to make it as safe as possible without compromising simplicity. Your username and password, along with all other settings, will be stored in one or two files in a private directory (i.e. not accessible via the web) on my web hosting account for no longer than three days (a scheduled task deletes these files periodically; it runs once a day and deletes files more than two days old). In addition, you will be presented with an option after the script runs, or, if you cancel the script, to delete immediately all files associated with your request. I will never look inside the temporary files containing your username/password.</p>
<p>If this doesn\'t satisfy you, you might consider temporarily changing your password for the script, and then changing it back again once the script has finished.</p>');
        $qanda_new['q_post_contents_safe'] = array(
          'q' => 'Is it safe to retrieve posts from a private forum through this script?',
          'a' => 'Your username and password are as safe as the previous answer describes. The content of your posts (the output file) is slightly less safe in that this output file is publicly accessible - but only to those who know the 32-character random token associated with it, and only until it is deleted either by you after you have saved it, or by the daily scheduled deletion task. As with usernames and passwords, I will never look inside the temporary file containing your posts\' content.',
        );
        $qanda_new['q_images_supported'] = array(
          'q' => 'Are images supported?',
          'a' => 'External images are supported so long as you are online at the time of viewing the output - they are not downloaded, the link is merely retained. Internal images - those uploaded to the forum as attachments - aren\'t supported at all; they occur as relative URLs, which the script does not convert into absolute URLs.',
        );
        $qanda_new['q_which_skins_supported'] = array(
          'q' => 'Which skins are supported?',
          'a' => 'Both the prosilver and subsilver skins are supported. The script probably won\'t work with customised skins, but if you desire support for such a skin (you are getting error messages about regular expressions failing), feel free to <a href="'.FUPS_CONTACT_URL.'">contact me</a>. A workaround is to simply set your skin to either prosilver or subsilver in the user control panel of your phpBB forum whilst you are logged in, and then to supply your login credentials in the settings above, optionally reverting your skin back to whatever it was before in the user control panel after running FUPS.',
        );
      }
    }
    return $qanda_new;
  }
  /**
   * Builds the search.php URL listing the extracted user's posts from the current offset.
   *
   * Newer boards are searched by author ID. For old versions the search is by
   * search_id once one is known ($this->search_id is not null), and by author
   * name otherwise.
   *
   * @return string
   */
  protected function get_search_url() {
    $search_base = $this->settings['base_url'].'/search.php?';

    if (!$this->old_version) {
      return $search_base.'st=0&sk=t&sd=d&author_id='.urlencode($this->settings['extract_user_id']).'&start='.urlencode($this->post_search_counter);
    }

    if ($this->search_id !== null) {
      $criterion = 'search_id='.urlencode($this->search_id);
    } else {
      $criterion = 'search_author='.urlencode($this->settings['extract_user']);
    }
    return $search_base.$criterion.'&start='.urlencode($this->post_search_counter);
  }
  /**
   * Returns the settings definitions for the phpBB scraper.
   *
   * Starts from the parent's settings, inserts the 'extract_user' setting
   * directly after 'extract_user_id', and then adjusts the default and
   * description texts of 'base_url', 'extract_user_id' and 'login_user'.
   *
   * @return array Map of setting key => definition array.
   */
  public function get_settings_array() {
    $settings_arr = parent::get_settings_array();
    $new_settings_arr = array();
    foreach ($settings_arr as $key => $setting) {
      $new_settings_arr[$key] = $setting;
      // Strict comparison: with ==, an integer key 0 compares equal to any
      // non-numeric string on PHP < 8 and would trigger the insertion wrongly.
      if ($key === 'extract_user_id') {
        $new_settings_arr['extract_user'] = array(
          'label' => 'Extract User Username',
          'default' => '' ,
          'description' => 'Set this to the username corresponding to the above ID. Note that it does not and cannot replace the need for the above ID; that ID is required. In contrast, this setting is not required (i.e. it can be left blank) if the script has permission to view member information on the specified phpBB board, in which case the script will extract it automatically from the member information page associated with the above ID: this will fail if the forum requires users to be logged in to view member information and if you do not provide valid login credentials (which can be specified below), in which case you should specify this setting.',
        );
      }
    }
    $new_settings_arr['base_url']['default'] = 'http://www.theabsolute.net/phpBB';
    $new_settings_arr['base_url']['description'] .= ' This is the URL that appears in your browser\'s address bar when you access the forum, only with everything onwards from (and including) the filename of whichever script is being accessed (e.g. /index.php or /viewtopic.php) stripped off. The default URL provided is for the particular phpBB board known as "Genius Forums".';
    $new_settings_arr['extract_user_id']['description'] .= ' You can find a user\'s ID by hovering your cursor over a hyperlink to their name and taking note of the number that appears after "&amp;u=" in the URL in the browser\'s status bar.';
    $new_settings_arr['login_user']['description'] = 'Set this to the username of the user whom you wish to log in as (it\'s fine to set it to the same value as Extract User Username above), or leave it blank if you do not wish FUPS to log in. Logging in is optional but if you log in then the timestamps associated with each post will be according to the timezone specified in that user\'s preferences, rather than the board default. Also, some boards require you to be logged in so that you can view posts. If you don\'t want to log in, then simply leave blank this setting and the next setting.';
    return $new_settings_arr;
  }
  /**
   * Builds the viewtopic.php URL of a thread page.
   *
   * @param mixed $forumid Forum ID, URL-encoded into the "f" parameter.
   * @param mixed $topicid Topic ID, URL-encoded into the "t" parameter.
   * @param mixed $start   Optional offset; when not null it is appended as the "start" parameter.
   * @return string
   */
  protected function get_topic_url($forumid, $topicid, $start = null) {
    $url = $this->settings['base_url'].'/viewtopic.php?f='.urlencode($forumid).'&t='.urlencode($topicid);
    if ($start !== null) {
      $url .= '&start='.urlencode($start);
    }
    return $url;
  }
  /**
   * Builds the memberlist.php profile URL for the configured extract_user_id.
   *
   * @return string
   */
  protected function get_user_page_url() {
    $user_id_param = urlencode($this->settings['extract_user_id']);
    return $this->settings['base_url'].'/memberlist.php?mode=viewprofile&u='.$user_id_param;
  }
  /**
   * Reports whether a named feature is supported by this scraper.
   *
   * Features listed locally (currently only 'login') are answered here; any
   * other feature name is delegated to the parent class.
   *
   * @param mixed $feature Feature name to look up.
   * @return mixed The locally configured value, or whatever the parent returns.
   */
  public function supports_feature($feature) {
    static $features = array(
      'login' => true
    );
    if (isset($features[$feature])) {
      return $features[$feature];
    }
    return parent::supports_feature($feature);
  }
  /**
   * Validates the settings: runs the parent's validation, then requires
   * extract_user_id to pass FILTER_VALIDATE_INT, reporting via exit_err() if not.
   */
  protected function validate_settings() {
    parent::validate_settings();
    $is_integer = filter_var($this->settings['extract_user_id'], FILTER_VALIDATE_INT) !== false;
    if (!$is_integer) {
      $this->exit_err('The value supplied for the extract_user_id setting, "'.$this->settings['extract_user_id'].'", is not an integer, which it is required to be for phpBB boards.', __FILE__, __METHOD__, __LINE__);
    }
  }
  405. }
  406. ?>