PageRenderTime 53ms CodeModel.GetById 11ms RepoModel.GetById 0ms app.codeStats 1ms

/classes/CFUPSBase.php

https://gitlab.com/ezeql/fups
PHP | 1135 lines | 922 code | 118 blank | 95 comment | 171 complexity | 04902102cb6ee8cbbc7c3fa558ecdf42 MD5 | raw file
  1. <?php
  2. /*
  3. * FUPS: Forum user-post scraper. An extensible PHP framework for scraping and
  4. * outputting the posts of a specified user from a specified forum/board
  5. * running supported forum software. Can be run as either a web app or a
  6. * commandline script.
  7. *
  8. * Copyright (C) 2013-2015 Laird Shaw.
  9. *
  10. * This program is free software: you can redistribute it and/or modify
  11. * it under the terms of the GNU Affero General Public License as
  12. * published by the Free Software Foundation, either version 3 of the
  13. * License, or (at your option) any later version.
  14. *
  15. * This program is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. * GNU Affero General Public License for more details.
  19. *
  20. * You should have received a copy of the GNU Affero General Public License
  21. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  22. *
  23. */
  24. /* File : classes/CFUPSBase.php.
  25. * Description: The base class for forum scraping. Cannot be instantiated
  26. * due to abstract methods - only descendant classes for specific
  27. * forums can be instantiated.
  28. */
  29. require_once __DIR__.'/../common.php';
  30. require_once __DIR__.'/../phpBB-days-and-months-intl.php';
  31. abstract class FUPSBase {
  32. # The maximum time in seconds before the script chains a new instance of itself and then exits,
  33. # to avoid timeouts due to exceeding the PHP commandline max_execution_time ini setting.
  34. public $FUPS_CHAIN_DURATION = null;
  35. protected $charset = null;
  36. protected $have_written_to_admin_err_file = false;
  37. protected $required_settings = array('base_url', 'extract_user_id', 'php_timezone');
  38. protected $optional_settings = array('start_from_date', 'non_us_date_format', 'debug');
  39. protected $private_settings = array('login_user', 'login_password');
  40. /* Different skins sometimes output html different enough that
  41. * a different regex is required for each skin to match the values that
  42. * this script searches for: the array below collects all of these
  43. * regexes in one place for easier maintenance, making it especially
  44. * easier to support new skins (this array is only set "for real"
  45. * in the descendant classes).
  46. *
  47. * It is not necessary for a skin to contain an entry for each regex
  48. * type so long as some other skin's entry for that regex matches.
  49. */
  50. protected $regexps = array(
  51. /* 'skin_template' => array(
  52. 'board_title' => a regex to extract the board's title from at least one forum page
  53. (this regex is tried for each page until it succeeds)
  54. 'login_success' => a regex to match the html of a successful-login page
  55. 'login_required' => a regex to match an error message that login is required to view
  56. member details
  57. 'user_name' => a regex to extract the user's name from the user's profile page
  58. 'thread_author' => a regex to extract the thread's author from the thread view page
  59. 'search_results_not_found' => a regex to detect when a search results page returns no results
  60. 'search_results_page_data' => a regex to be matched on the user's posts search page using
  61. preg_match_all with flags set to PREG_SET_ORDER so that each entry of
  62. $matches ends up with the following matches in the order specified in
  63. search_results_page_data_order.
  64. N.B. Must not match any results matched by any other skin's
  65. search_results_page_data regex - the results of all are combined!
  66. 'search_results_page_data_order' => an array specifying the order in which the following matches occur
  67. in the matches returned by the previous regex.
  68. = array(
  69. 'title' => the match index of the title of post,
  70. 'ts' => the match index of the timestamp of post,
  71. 'forum' => the match index of the title of forum,
  72. 'topic' => the match index of the thread topic,
  73. 'forumid' => the match index of the forum id,
  74. 'topicid' => the match index of the topic id,
  75. 'postid' => the match index of the post id,
  76. )
  77. 'post_contents' => a regex to match post id (first match) and post contents (second match)
  78. on a thread page; it is called with match_all so it will return all
  79. post ids and contents on the page
  80. ),
  81. */
  82. );
  83. protected $settings = array();
  84. protected $progress_level = 0;
  85. protected $org_start_time = null;
  86. protected $start_time = null;
  87. protected $web_initiated = null;
  88. protected $token = false;
  89. protected $settings_filename = false;
  90. protected $output_dirname = false;
  91. protected $output_dirname_web = null;
  92. protected $errs_filename = false;
  93. protected $cookie_filename = false;
  94. protected $ch = null;
  95. protected $last_url = null;
  96. protected $search_id = null;
  97. protected $post_search_counter = 0;
  98. protected $posts_not_found = array();
  99. protected $posts_data = array();
  100. protected $total_posts = 0;
  101. protected $current_topic_id = null;
  102. protected $num_posts_retrieved = 0;
  103. protected $num_thread_infos_retrieved = 0;
  104. protected $search_page_num = 0;
  105. protected $dbg = false;
  106. protected $quiet = false;
  107. protected $progress_levels = array(
  108. 0 => 'init_user_post_search',
  109. 1 => 'user_post_search',
  110. 2 => 'topic_post_sort',
  111. 3 => 'posts_retrieval',
  112. 4 => 'extract_per_thread_info',
  113. 5 => 'handle_missing_posts',
  114. 6 => 'write_output',
  115. 7 => 'check_send_non_fatal_err_email',
  116. );
  117. protected $was_chained = false;
  /* Initialises the scraper: resolves the settings/output/error filenames from
   * $params, reads and validates the settings, and creates the output directory.
   *
   * $web_initiated: true when started from the web interface, in which case
   *                 $params must contain a 'token' key; false for the
   *                 commandline, in which case $params must contain
   *                 'settings_filename' and 'output_dirname' keys and may
   *                 contain a 'quiet' key.
   * $params       : see above.
   * $do_not_init  : when true, no initialisation at all is performed.
   */
  public function __construct($web_initiated, $params, $do_not_init = false) {
      if ($do_not_init) {
          return;
      }

      $this->org_start_time = time();
      $this->start_time = $this->org_start_time;
      if ($this->supports_feature('login')) {
          $this->optional_settings = array_merge($this->optional_settings, array('login_user', 'login_password'));
      }
      $this->web_initiated = $web_initiated;
      if ($this->web_initiated) {
          if (!isset($params['token'])) {
              $this->exit_err('Fatal error: $web_initiated was true but $params did not contain a "token" key.', __FILE__, __METHOD__, __LINE__);
          }
          $this->token = $params['token'];
          $this->settings_filename = make_settings_filename($this->token);
          $this->output_dirname = make_output_dirname($this->token);
          $this->errs_filename = make_errs_filename($this->token);
      } else {
          if (!isset($params['settings_filename'])) {
              $this->exit_err('Fatal error: $web_initiated was false but $params did not contain a "settings_filename" key.', __FILE__, __METHOD__, __LINE__);
          }
          $this->settings_filename = $params['settings_filename'];
          if (!isset($params['output_dirname'])) {
              $this->exit_err('Fatal error: $web_initiated was false but $params did not contain a "output_dirname" key.', __FILE__, __METHOD__, __LINE__);
          }
          $this->output_dirname = $params['output_dirname'];
          $len = strlen($this->output_dirname);
          // Make sure user-supplied (commandline interface) output directories end in a slash,
          // because when we generate them (web interface) we make sure they end in a slash,
          // and this way we can rely on them ending in a slash in all contexts. Note that here
          // we assume an empty output directory to refer to the root directory.
          if ($len <= 0 || $this->output_dirname[$len-1] != '/') {
              $this->output_dirname .= '/';
          }
          // 'quiet' is optional: fall back to the property's default rather than
          // reading an undefined array key.
          $this->quiet = isset($params['quiet']) ? $params['quiet'] : false;
      }

      if (FUPS_CHAIN_DURATION == -1) {
          // -1 means "derive from the max_execution_time ini setting".
          $max_execution_time = ini_get('max_execution_time');
          if (is_numeric($max_execution_time) && $max_execution_time > 0) {
              $this->FUPS_CHAIN_DURATION = $max_execution_time * 3/4;
          } else {
              $this->FUPS_CHAIN_DURATION = FUPS_FALLBACK_FUPS_CHAIN_DURATION;
          }
      } else {
          $this->FUPS_CHAIN_DURATION = FUPS_CHAIN_DURATION;
      }

      $this->write_status('Reading settings.');
      $default_settings = $this->get_default_settings();
      $raw_settings = $this->read_settings_raw_s($this->settings_filename);
      // Only settings declared as required or optional are accepted.
      foreach ($raw_settings as $setting => $value) {
          if (in_array($setting, $this->required_settings) || in_array($setting, $this->optional_settings)) {
              $this->settings[$setting] = $value;
          }
      }
      $missing = array_diff($this->required_settings, array_keys($this->settings));
      if ($missing) {
          $this->exit_err("The following settings were missing: ".implode(', ', $missing).'.', __FILE__, __METHOD__, __LINE__);
      }
      foreach ($default_settings as $setting => $default) {
          if (empty($this->settings[$setting])) $this->settings[$setting] = $default;
      }
      date_default_timezone_set($this->settings['php_timezone']); // This timezone only matters when converting the earliest time setting.
      if (!empty($this->settings['start_from_date'])) {
          $this->settings['earliest'] = $this->strtotime_intl($this->settings['start_from_date']);
          if ($this->settings['earliest'] === false) $this->write_err("Error: failed to convert 'start_from_date' ({$this->settings['start_from_date']}) into a UNIX timestamp.");
      }
      $this->dbg = in_array($this->settings['debug'], array('true', '1'));
      if ($this->dbg) {
          $this->write_err('SETTINGS:');
          $this->write_err(var_export($this->settings, true));
      }
      $this->write_status('Finished reading settings.');
      $this->validate_settings();

      // Create output directory, appending .1 or .2 etc if necessary.
      // Do this last so we don't create it if settings validation fails.
      $max_attempts = 10000;
      $appendix = 0;
      // Strip off the trailing slash. The alternatives must be derived from this
      // stripped name too: appending to the slash-terminated name would yield
      // "dir/.1" (a hidden subdirectory of the existing directory), not "dir.1".
      $base_dirname = substr($this->output_dirname, 0, strlen($this->output_dirname) - 1);
      $dirname = $base_dirname;
      while (file_exists($dirname) && $appendix <= $max_attempts) {
          $dirname = $base_dirname.'.'.(++$appendix);
      }
      if ($appendix > $max_attempts) {
          $this->exit_err('Output directory "'.$this->output_dirname.'" already exists. Exceeded maximum attempts ('.$max_attempts.') in finding an alternative that does not exist. Tried "'.$base_dirname.'.1", "'.$base_dirname.'.2", "'.$base_dirname.'.3", etc.', __FILE__, __METHOD__, __LINE__);
      }
      if (!mkdir($dirname, 0775, true)) {
          $this->exit_err('Failed to create output directory "'.$dirname.'".', __FILE__, __METHOD__, __LINE__);
      }
      $this->output_dirname = $dirname.'/';
      if ($this->web_initiated) {
          $this->output_dirname_web = make_output_dirname($this->token, /*$for_web*/true, $appendix == 0 ? '' : $appendix);
      }
  }
  # Called by unserialize(): restarts this process's timer, restores the
  # configured timezone and flags the instance as running in a chained process.
  public function __wakeup() {
      date_default_timezone_set($this->settings['php_timezone']);
      $this->was_chained = true;
      $this->start_time = time();
      $this->write_status('Woke up in chained process.');
  }
  /* Zips the regular files directly inside $dirname into $zip_filename, storing
   * them under a top-level folder named after basename($dirname).
   *
   * $dirname     : directory to archive (with or without a trailing slash).
   * $zip_filename: path of the zip archive to create.
   *
   * Returns true if the archive was written and closed successfully, false
   * otherwise (an error is written via write_err() in that case).
   */
  protected function archive_output($dirname, $zip_filename) {
      if (!class_exists('ZipArchive')) {
          $this->write_err('Unable to create output archive: the "ZipArchive" class does not exist. You can install it using these online instructions: <http://php.net/manual/en/zip.installation.php>.', __FILE__, __METHOD__, __LINE__);
          return false;
      }

      // Open the directory before the archive so that a failure here cannot
      // leave an opened-but-never-closed ZipArchive behind.
      $handle = opendir($dirname);
      if ($handle === false) {
          $this->write_err('Unable to open directory "'.$dirname.'" for reading.', __FILE__, __METHOD__, __LINE__);
          return false;
      }

      $zip = new ZipArchive();
      if ($zip->open($zip_filename, ZipArchive::CREATE) !== true) {
          closedir($handle);
          $this->write_err('Unable to create zip archive "'.$zip_filename.'".', __FILE__, __METHOD__, __LINE__);
          return false;
      }

      $local_dirname = basename($dirname);
      // Do not rely on the caller having supplied a trailing slash.
      $dir_prefix = rtrim($dirname, '/').'/';
      while (($f = readdir($handle)) !== false) {
          if ($f == '.' || $f == '..') continue;
          $path = $dir_prefix.$f;
          // addFile() only handles regular files; skip subdirectories etc.
          if (!is_file($path)) continue;
          if (!$zip->addFile($path, $local_dirname.'/'.$f)) {
              $this->write_err('Unable to add file "'.$path.'" to the zip archive "'.$zip_filename.'".', __FILE__, __METHOD__, __LINE__);
          }
      }
      closedir($handle);

      if (!$zip->close()) {
          $this->write_err('Failed to close the zip archive "'.$zip_filename.'".', __FILE__, __METHOD__, __LINE__);
          return false;
      }
      return true;
  }
  # Converts, in place, a string or a (possibly nested) array of strings from
  # $this->charset to UTF-8. Does nothing when no charset has been detected;
  # a string that iconv() fails to convert is left untouched.
  protected function array_to_utf8(&$arr) {
      if ($this->charset === null) {
          return;
      }
      if (is_string($arr)) {
          $converted = iconv($this->charset, 'UTF-8', $arr);
          if ($converted !== false) {
              $arr = $converted;
          }
          return;
      }
      if (is_array($arr)) {
          foreach ($arr as &$item) {
              $this->array_to_utf8($item);
          }
      }
  }
  /* If this process has been running for longer than $this->FUPS_CHAIN_DURATION
   * seconds, serialises this object to disk, launches a background process
   * to continue the work, and exits. Otherwise does nothing.
   */
  protected function check_do_chain() {
      if (time() - $this->start_time <= $this->FUPS_CHAIN_DURATION) {
          return;
      }

      $serialize_filename = make_serialize_filename($this->web_initiated ? $this->token : $this->settings_filename);
      if ($this->dbg) $this->write_err('Set $serialize_filename to "'.$serialize_filename.'".');
      if (file_put_contents($serialize_filename, serialize($this)) === false) {
          $this->exit_err('file_put_contents returned false.', __FILE__, __METHOD__, __LINE__);
      }

      $args = array(
          'chained' => true,
      );
      if ($this->web_initiated) {
          $args['token'] = $this->token;
      } else {
          $args['settings_filename'] = $this->settings_filename;
          $args['output_dirname'] = $this->output_dirname;
          $args['quiet'] = $this->quiet;
      }

      curl_close($this->ch); // So we save the cookie file to disk for the chained process.

      $cmd = make_php_exec_cmd($args);
      $this->write_status('Chaining next process.');
      if ($this->dbg) $this->write_err('Chaining process: about to run command: '.$cmd);
      if (!try_run_bg_proc($cmd)) {
          // No command output is available in this scope, so none is reported.
          // (The message previously imploded an undefined $output variable,
          // which raises a TypeError on PHP 8 before the error could be recorded.)
          $this->exit_err('Apologies, the server encountered a technical error: it was unable to initiate a chained background process to continue the task of scraping, sorting and finally presenting your posts. The command used was:'.PHP_EOL.PHP_EOL.$cmd.PHP_EOL.PHP_EOL.'You might like to try again.', __FILE__, __METHOD__, __LINE__);
      }
      if ($this->dbg) $this->write_err('Exiting parent chaining process.');
      exit;
  }
  # Intentionally empty in the base class.
  # NOTE(review): presumably overridden by descendants that support login - confirm.
  protected function check_do_login() {
  }
  /* Tries to discover the board's title from $html using the 'board_title'
   * skin regexes, unless a non-empty title is already stored in the settings.
   * On failure the setting is left as null so that the next page is tried.
   */
  protected function check_get_board_title($html) {
      if (!empty($this->settings['board_title'])) {
          return;
      }

      if ($this->skins_preg_match('board_title', $html, $matches) && isset($matches[1])) {
          $this->settings['board_title'] = $matches[1];
          if ($this->dbg) $this->write_err("Site title: {$this->settings['board_title']}");
      } else {
          // Previously $matches[1] was read here even though nothing matched,
          // raising an undefined-index notice and logging an empty "Site title".
          $this->settings['board_title'] = null;
          if ($this->dbg) $this->write_err("Warning: couldn't find the site title. The URL of the searched page is ".$this->last_url, __FILE__, __METHOD__, __LINE__, $html);
      }
  }
  /* Sets $this->charset from the page's Content-Type <meta> tag, if it has not
   * already been set. The match is case-insensitive, tolerates a missing space
   * before "charset=", and accepts both ">" and "/>" tag endings; every tag
   * matched by the previous, stricter pattern is still matched.
   */
  protected function check_get_charset($html) {
      if ($this->charset !== null) {
          return;
      }
      $regex = '#<meta\s+http-equiv\s*=\s*"Content-Type"\s+content\s*=\s*"text/html;\s*charset=([^"]+)"\s*/?>#i';
      if (preg_match($regex, $html, $matches)) {
          $this->charset = $matches[1];
          if ($this->dbg) $this->write_err('Set charset to "'.$this->charset.'".');
      }
  }
  # Discover user's name if extract_user was not present in settings file (NB might need to be logged in to do this).
  # Falls back to '[unknown]', and records an admin error, when the name cannot be found on the user's page.
  protected function check_get_username() {
      if (!empty($this->settings['extract_user'])) {
          return;
      }

      $this->write_status('Attempting to determine username.');
      $this->set_url($this->get_user_page_url());
      $html = $this->do_send();

      if ($this->skins_preg_match('user_name', $html, $matches)) {
          $this->settings['extract_user'] = $matches[1];
          return;
      }

      $login_req = $this->skins_preg_match('login_required', $html, $matches);
      $err_msg = "Error: couldn't find the member name corresponding to specified user ID \"{$this->settings['extract_user_id']}\". ";
      if ($login_req) {
          $err_msg .= 'The board requires that you be logged in to view member names. You can specify a login username and password in the settings on the previous page. If you already did specify them, then this error could be due to a wrong username/password combination. Instead of supplying login details, you can simply supply a value for "Extract User Username".';
      } else {
          $err_msg .= 'The URL of the searched page is <'.$this->last_url.'>.';
      }
      $this->write_and_record_err_admin($err_msg, __FILE__, __METHOD__, __LINE__, $html);
      $this->settings['extract_user'] = '[unknown]';
  }
  /* Performs the request currently configured on $this->ch and returns the
   * response body.
   *
   * $redirect     : pass a variable holding anything other than false to have a
   *                 "Location" redirect reported back through it (the function
   *                 then returns '' without following it); leave as false to
   *                 have redirects followed.
   * $quit_on_error: when true, a failed request terminates via exit_err().
   * $err          : set to false on success, otherwise to the error message.
   *
   * Redirects are followed manually and are capped, so that a redirect loop on
   * the remote server cannot keep this function spinning forever.
   */
  function do_send(&$redirect = false, $quit_on_error = true, &$err = false) {
      static $retry_delays = array(0, 5, 5);
      static $first_so_no_wait = true;
      $max_redirects = 20;
      $num_redirects = 0;
      $html = '';

      if ($first_so_no_wait) {
          $first_so_no_wait = false;
      } else {
          $this->wait_courteously();
      }

      $err = false;
      for ($i = 0; $i < count($retry_delays); $i++) {
          $delay = $retry_delays[$i];
          if ($err) {
              if ($this->dbg) $this->write_err("Retrying after $delay seconds.");
              sleep($delay);
          }
          if ($this->dbg) $this->write_err("In do_send(), retrieving URL <{$this->last_url}>");

          // We emulate CURLOPT_FOLLOWLOCATION by grabbing headers and matching a "Location:"
          // header because some hosts (Hostgator!) currently have a version of cURL older
          // than that in which this bug was fixed: <http://sourceforge.net/p/curl/bugs/1159/>.
          // This bug is activated when following XenForo post URLs when CURLOPT_FOLLOWLOCATION
          // is set.
          $response = curl_exec($this->ch);
          if ($response === false) {
              $err = 'curl_exec returned false. curl_error returns: "'.curl_error($this->ch).'".';
              if ($this->dbg) $this->write_err($err, __FILE__, __METHOD__, __LINE__);
          } else {
              $header_size = curl_getinfo($this->ch, CURLINFO_HEADER_SIZE);
              $headers = substr($response, 0, $header_size);
              $html = substr($response, $header_size);
              $response_code = curl_getinfo($this->ch, CURLINFO_HTTP_CODE);
              if ($response_code == 200) {
                  $err = false;
                  break;
              }
              if (preg_match('/^Location: (.*)$/im', $headers, $matches)) {
                  $url = trim($matches[1]);
                  // Strip from any # onwards - this appears to be buggy either in
                  // certain older versions of cURL or receiving webservers.
                  $tmp = explode('#', $url, 2);
                  $url = $tmp[0];
                  if ($redirect !== false) {
                      $redirect = $url;
                      return '';
                  }
                  if (++$num_redirects > $max_redirects) {
                      $err = 'Exceeded the maximum number of redirects ('.$max_redirects.'); the last redirect was to <'.$url.'>.';
                      if ($this->dbg) $this->write_err($err, __FILE__, __METHOD__, __LINE__);
                      break;
                  }
                  $this->validate_url($url, 'the redirected-to location', true);
                  $this->set_url($url);
                  if ($this->dbg) $this->write_err('In '.__METHOD__.'(): Found a "Location" header; following to <'.$url.'>.');
                  // Following a redirect does not consume a retry attempt.
                  $i--;
                  continue;
              }
              $err = 'Received response other than 200 from server ('.$response_code.') for URL: '.$this->last_url;
              if ($this->dbg) $this->write_err($err, __FILE__, __METHOD__, __LINE__);
          }
          // NOTE(review): this break means a failed request is never actually
          // retried, so the "Retrying after ..." branch above is unreachable.
          // Behaviour is preserved here - confirm whether retries are intended.
          if ($err) break;
      }

      if ($err) {
          if ($quit_on_error) $this->exit_err('Too many errors with request; abandoning page and quitting. Request URL is <'.$this->last_url.'>. Last error was: '.$err, __FILE__, __METHOD__, __LINE__);
      } else {
          $this->check_get_board_title($html);
      }
      return $html;
  }
  # Instance counterpart of exit_err_s(): writes the error via write_err(), then
  # hands over to exit_err_common_s() together with this object's state (token,
  # debug flag, settings string, class name).
  protected function exit_err($msg, $file, $method, $line, $html = false, $send_mail = true) {
      $dbg = $this->dbg;
      $token = false;
      if ($this->web_initiated) {
          $token = $this->token;
      }
      $this->write_err($msg, $file, $method, $line);
      $settings_str = $this->get_settings_str();
      static::exit_err_common_s(
          $msg, $file, $method, $line,
          $this->have_written_to_admin_err_file,
          get_class($this),
          $html, $settings_str, $send_mail, $token, $dbg
      );
  }
  # Static variant of exit_err() for use when no instance is available: writes
  # the error to stderr and then hands over to exit_err_common_s().
  static public function exit_err_s($msg, $file, $method, $line, $html = false, $send_mail = true, $token = false, $dbg = false) {
      $stderr = fopen('php://stderr', 'a');
      static::write_err_s($stderr, $msg, $file, $method, $line);

      // No instance state exists in the static context.
      $have_written_to_admin_err_file = false;
      $classname = null;
      $settings_str = false;
      static::exit_err_common_s($msg, $file, $method, $line, $have_written_to_admin_err_file, $classname, $html, $settings_str, $send_mail, $token, $dbg);
  }
  # Shared tail of exit_err() and exit_err_s(): records the error for the admin,
  # optionally mails it, writes a final status when a token is given, and
  # terminates the process with exit code 1.
  static public function exit_err_common_s($msg, $file, $method, $line, $have_written_to_admin_err_file, $classname = null, $html = false, $settings_str = false, $send_mail = true, $token = false, $dbg = false) {
      $admin_report = static::record_err_admin_s($msg, $file, $method, $line, $have_written_to_admin_err_file, $classname, $html, $settings_str, $token, $dbg);

      if ($send_mail) static::send_err_mail_to_admin_s($admin_report, $token, true);
      if ($token) static::write_status_s('A fatal error occurred. EXITING', $token);

      exit(1);
  }
  # Fetches one page of the user's post search results and records every post
  # found on it in $this->posts_data. Returns the number of posts recorded.
  # Advances $this->progress_level when the search is finished (no results, no
  # matches, or a post older than the 'earliest' setting was reached).
  # Assumes search results are ordered from most recent post to oldest post.
  protected function find_author_posts_via_search_page() {
      if ($this->dbg) $this->write_err('Reached search page with post_search_counter set to '.$this->post_search_counter.'.');

      if (!curl_setopt($this->ch, CURLOPT_POST, false)) {
          $this->write_err('Failed to set cURL option CURLOPT_POST to false.',__FILE__, __METHOD__, __LINE__);
      }
      $this->set_url($this->get_search_url());
      $html = $this->do_send();

      if ($this->skins_preg_match('search_results_not_found', $html, $matches)) {
          if ($this->dbg) $this->write_err('Matched "search_results_not_found" regex; we have finished finding posts.');
          $this->progress_level++;
          return 0;
      }

      $combine = true;
      $have_results = $this->skins_preg_match_all('search_results_page_data', $html, $matches, 'search_results_page_data_order', $combine);
      if (!$have_results) {
          $this->write_and_record_err_admin('Error: couldn\'t find any search result matches on one of the search results pages. The URL of the page is '.$this->last_url, __FILE__, __METHOD__, __LINE__, $html);
          $this->progress_level++;
          return 0;
      }

      $num_posts_found = 0;
      $found_earliest = false;
      foreach ($matches as $match) {
          // Each match carries the positions of its own fields.
          $idx = $match['match_indexes'];
          $forum = $match[$idx['forum']];
          $forumid = $match[$idx['forumid']];
          $topic = $match[$idx['topic']];
          $topicid = null;
          if (isset($idx['topicid'])) {
              $topicid = $match[$idx['topicid']];
          }
          $postid = $match[$idx['postid']];
          $posttitle = '';
          if (isset($idx['title']) && isset($match[$idx['title']])) {
              $posttitle = $match[$idx['title']];
          }
          $ts_raw = $match[$idx['ts']];

          $this->find_author_posts_via_search_page__ts_raw_hook($ts_raw);
          $ts = $this->strtotime_intl($ts_raw);
          if ($ts === false) {
              $err_msg = "Error: strtotime_intl failed for '$ts_raw'.";
              if (!isset($this->settings['non_us_date_format']) && strpos($ts_raw, '/') !== false) {
                  $err_msg .= ' Hint: Perhaps you need to check the "Non-US date format" box on the previous page.';
              }
              $this->write_err($err_msg);
          } else if (!empty($this->settings['earliest']) && $ts < $this->settings['earliest']) {
              $found_earliest = true;
              if ($this->dbg) $this->write_err("Found post earlier than earliest allowed; not searching further: ".$ts_raw." < {$this->settings['start_from_date']}.");
              break;
          }

          $this->find_author_posts_via_search_page__match_hook($match, $forum, $forumid, $topic, $topicid, $postid, $posttitle, $ts_raw, $ts);

          $this->posts_data[$topicid]['forum'] = $forum;
          $this->posts_data[$topicid]['topic'] = $topic;
          $this->posts_data[$topicid]['forumid'] = $forumid;
          $this->posts_data[$topicid]['posts'][$postid] = array(
              'posttitle' => $posttitle,
              'ts' => $ts_raw,
              'timestamp' => $ts,
              'content' => null,
          );
          if ($this->dbg) {
              $this->write_err("Added post: $posttitle ($topic; $ts; $forum; forumid: $forumid; topicid: $topicid; postid: $postid)");
          }
          $num_posts_found++;
      }

      // The end hook may override whether the search phase is considered finished.
      $do_inc_progress_level = $found_earliest;
      $this->find_author_posts_via_search_page__end_hook($do_inc_progress_level, $html, $found_earliest, $matches);
      if ($do_inc_progress_level) {
          $this->progress_level++;
      }
      return $num_posts_found;
  }
  # Called once a search results page has been processed. The base
  # implementation only advances the search counter by the number of matches
  # on the page; $do_inc_progress_level is passed by reference but is not
  # modified here.
  protected function find_author_posts_via_search_page__end_hook(&$do_inc_progress_level, $html, $found_earliest, $matches) {
      $num_matches = count($matches);
      $this->post_search_counter += $num_matches;
  }
  # Called for each search result before it is stored in $this->posts_data.
  # All values except $match are passed by reference; the base implementation
  # changes none of them.
  protected function find_author_posts_via_search_page__match_hook($match, &$forum, &$forumid, &$topic, &$topicid, &$postid, &$posttitle, &$ts_raw, &$ts) {
  }
  # Called with the raw matched timestamp string before it is parsed into a
  # UNIX timestamp. Override this function to e.g. remove extraneous text from
  # the string; the base implementation leaves it unchanged.
  protected function find_author_posts_via_search_page__ts_raw_hook(&$ts_raw) {
  }
  # Looks up a post by ID across all topics in $this->posts_data.
  # Returns array(post, topic, topic ID) for the first match, or false when the
  # post is not present. IDs are compared loosely (==).
  protected function find_post($postid) {
      foreach ($this->posts_data as $topicid => $topic) {
          foreach ($topic['posts'] as $candidate_id => $post) {
              if ($candidate_id != $postid) {
                  continue;
              }
              return array($post, $topic, $topicid);
          }
      }
      return false;
  }
  # Returns a one-line message naming the given FUPS class.
  static protected function get_classname_msg_s($classname) {
      $prefix = 'The active FUPS class is: ';
      return $prefix.$classname;
  }
  # Returns the default values, keyed by setting name, that the constructor
  # applies to settings which are empty after the settings file has been read.
  protected function get_default_settings() {
      $defaults = array();
      $defaults['delay'] = 5;
      $defaults['debug'] = false;
      return $defaults;
  }
  # Returns an empty string in the base class.
  # NOTE(review): presumably extra lines for the <head> of the HTML output,
  # to be overridden by descendants - confirm against the output writers.
  protected function get_extra_head_lines() {
      $extra_lines = '';
      return $extra_lines;
  }
  /* Returns the array of board/user metadata plus all retrieved threads and
   * posts. The array is built on the first call and cached in a static local,
   * so later calls return the same data even if the underlying properties have
   * changed since.
   *
   * Settings that may legitimately be absent (e.g. the optional
   * 'start_from_date') are reported as null instead of being read as
   * undefined array keys.
   */
  protected function get_final_output_array() {
      static $ret = null;
      if ($ret === null) {
          $s = $this->settings;
          $ret = array(
              'board_title' => isset($s['board_title']) ? $s['board_title'] : null,
              'user_name' => isset($s['extract_user']) ? $s['extract_user'] : null,
              'board_base_url' => $s['base_url'],
              'start_from_date' => isset($s['start_from_date']) ? $s['start_from_date'] : null,
              'character_set' => $this->charset,
              'threads_and_posts' => $this->posts_data,
          );
      }
      return $ret;
  }
  # Builds an error string of the form
  # "In <method> in line <line> in file <file>: <msg>", leaving out whichever of
  # $method, $line and $file are falsy (the first part present starts with "In").
  # When all three are falsy, $msg is returned on its own.
  static protected function get_formatted_err($method, $line, $file, $msg) {
      $parts = array();
      if ($method) {
          $parts[] = "In $method";
      }
      if ($line) {
          $parts[] = ($parts ? 'in' : 'In')." line $line";
      }
      if ($file) {
          $parts[] = ($parts ? 'in' : 'In')." file $file";
      }
      if (!$parts) {
          return ''.$msg;
      }
      return implode(' ', $parts).': '.$msg;
  }
  # Placeholder: the returned text itself tells implementers to customise this
  # function in their class descending from FUPSBase.
  static function get_forum_software_homepage() {
      $placeholder = '[YOU NEED TO CUSTOMISE THE static get_forum_software_homepage() function OF YOUR CLASS DESCENDING FROM FUPSBase!]';
      return $placeholder;
  }
  # Placeholder: the returned text itself tells implementers to customise this
  # function in their class descending from FUPSBase.
  static function get_msg_how_to_detect_forum() {
      $placeholder = '[YOU NEED TO CUSTOMISE THE static get_msg_how_to_detect_forum() function OF YOUR CLASS DESCENDING FROM FUPSBase!]';
      return $placeholder;
  }
  # Returns the list of output formats. Each entry holds the appendix for the
  # output filename ('filename_appendix'), the name of a writer method
  # ('method') and a human-readable description ('description').
  protected function get_output_variants() {
      $variants = array();

      # HTML, sorted by thread title and then by post date.
      $variants[] = array(
          'filename_appendix' => '.threadasc.dateasc.html',
          'method' => 'write_output_html_threadasc_dateasc',
          'description' => 'HTML, sorting posts first by ascending thread title (i.e. alphabetical order) then ascending post date (i.e. earliest first)',
      );
      $variants[] = array(
          'filename_appendix' => '.threadasc.datedesc.html',
          'method' => 'write_output_html_threadasc_datedesc',
          'description' => 'HTML, sorting posts first by ascending thread title (i.e. alphabetical order) then descending post date (i.e. latest first)',
      );
      $variants[] = array(
          'filename_appendix' => '.threaddesc.dateasc.html',
          'method' => 'write_output_html_threaddesc_dateasc',
          'description' => 'HTML, sorting posts first by descending thread title (i.e. reverse alphabetical order) then ascending post date (i.e. earliest first)',
      );
      $variants[] = array(
          'filename_appendix' => '.threaddesc.datedesc.html',
          'method' => 'write_output_html_threaddesc_datedesc',
          'description' => 'HTML, sorting posts first by descending thread title (i.e. reverse alphabetical order) then descending post date (i.e. latest first)',
      );

      # HTML, sorted by post date only.
      $variants[] = array(
          'filename_appendix' => '.dateasc.html',
          'method' => 'write_output_html_dateasc',
          'description' => 'HTML, sorting posts by ascending date (i.e. earliest first) regardless of which thread they are in',
      );
      $variants[] = array(
          'filename_appendix' => '.datedesc.html',
          'method' => 'write_output_html_datedesc',
          'description' => 'HTML, sorting posts by descending date (i.e. latest first) regardless of which thread they are in',
      );

      # Machine-readable formats.
      $variants[] = array(
          'filename_appendix' => '.php_serialised',
          'method' => 'write_output_php_serialised',
          'description' => 'Serialised PHP',
      );
      $variants[] = array(
          'filename_appendix' => '.php',
          'method' => 'write_output_php',
          'description' => 'PHP (unserialised array)',
      );
      $variants[] = array(
          'filename_appendix' => '.json',
          'method' => 'write_output_json',
          'description' => 'JSON',
      );

      return $variants;
  }
  # Fetches the page for the given post and stores the contents of every known
  # post found on it (via get_post_contents_from_matches()). Returns true when
  # the requested post's contents were found (the end hook may override this).
  # Posts that are not found are flagged in $this->posts_not_found, and
  # $this->num_posts_retrieved is increased by the number of posts stored.
  protected function get_post_contents($forumid, $topicid, $postid) {
      $ret = false;
      $found = false;
      $err = false;
      $count = 0;

      if (!curl_setopt($this->ch, CURLOPT_POST, false)) {
          $this->write_err('Failed to set cURL option CURLOPT_POST to false.',__FILE__, __METHOD__, __LINE__);
      }
      $this->set_url($this->get_post_url($forumid, $topicid, $postid));
      $html = $this->do_send();
      $this->check_get_charset($html);

      if ($this->skins_preg_match_all('post_contents', $html, $matches)) {
          list($found, $count) = $this->get_post_contents_from_matches($matches, $postid, $topicid);
          if (!$found) {
              $this->write_and_record_err_admin('FAILED to retrieve post contents of post ID "'.$postid.'". The URL of the page is "'.$this->last_url.'"', __FILE__, __METHOD__, __LINE__, $html);
          } else {
              if ($this->dbg) $this->write_err('Retrieved post contents of post ID "'.$postid.'"');
              $ret = true;
              // From here on $count only counts the *other* posts on the page.
              $count--;
          }
          if ($this->dbg && $count > 0) $this->write_err('Retrieved '.$count.' other posts.');
      } else {
          $err = true;
          $this->write_err('Error: Did not find any post IDs or contents on the thread page for post ID '.$postid.'. The URL of the page is "'.$this->last_url.'"', __FILE__, __METHOD__, __LINE__, $html);
      }

      $this->get_post_contents__end_hook($forumid, $topicid, $postid, $html, $found, $err, $count, $ret);

      if (!$found) {
          $this->posts_not_found[$postid] = true;
      }
      $this->num_posts_retrieved += $count + ($found ? 1 : 0);
      return $ret;
  }
  # Called at the end of get_post_contents(), before the not-found flag and the
  # retrieval counter are updated. $found and $ret are passed by reference;
  # the base implementation changes neither.
  protected function get_post_contents__end_hook($forumid, $topicid, $postid, $html, &$found, $err, $count, &$ret) {
  }
  # Stores the contents of every matched post that is already known for the
  # topic ($match[1] is the post ID, $match[2] its contents).
  # Returns array($found, $count): whether $postid was among the stored posts,
  # and how many posts were stored in total.
  protected function get_post_contents_from_matches($matches, $postid, $topicid) {
      $topic_posts =& $this->posts_data[$topicid]['posts'];
      $num_stored = 0;
      $found_requested = false;
      foreach ($matches as $m) {
          $matched_id = $m[1];
          if (!isset($topic_posts[$matched_id])) {
              continue;
          }
          $topic_posts[$matched_id]['content'] = $m[2];
          $num_stored++;
          if ($postid == $matched_id) {
              $found_requested = true;
          }
      }
      return array($found_requested, $num_stored);
  }
  # Must return the URL of the page on which the given post can be read; the
  # result is passed to set_url() by get_post_contents().
  # NOTE(review): $with_hash presumably appends a "#<anchor>" fragment for the
  # post - confirm against the descendant implementations.
  abstract protected function get_post_url($forumid, $topicid, $postid, $with_hash = false);
  # Returns the questions and answers applying to all forum types, keyed by a
  # short identifier; each entry holds a question ('q') and an answer ('a').
  static function get_qanda() {
      $qanda = array();
      $qanda['q_lang'] = array(
          'q' => 'Does the script work with forums using a language other than English?',
          'a' => 'Yes, or at least, it\'s intended to: if you experience problems, please <a href="'.FUPS_CONTACT_URL.'">contact me</a>.',
      );
      $qanda['q_how_long'] = array(
          'q' => 'How long will the process take?',
          'a' => 'It depends on how many posts are to be retrieved, and how many pages they are spread across. You can expect to wait roughly one hour to extract and output 1,000 posts.',
      );
      $qanda['q_why_slow'] = array(
          'q' => 'Why is this script so slow?',
          'a' => 'So as to avoid hammering other people\'s web servers, the script pauses for five seconds between each page retrieval.',
      );
      return $qanda;
  }
  # Must be implemented by descendant classes.
  # NOTE(review): not called within the visible part of this file - judging by its name, it returns
  # the URL used to search the forum for the user's posts; confirm against the callers.
  abstract protected function get_search_url();
  # Returns the definitions of the settings common to all forum types, keyed by setting name and
  # in display order. Each definition has a 'label', a 'default' and a 'description' (which may
  # contain HTML), and optionally a 'style' and/or a 'type'.
  # The login settings are included only if this forum type supports the 'login' feature.
  public function get_settings_array() {
      $settings = array();

      $settings['base_url'] = array(
          'label'       => 'Base forum URL',
          'default'     => '',
          'description' => 'Set this to the base URL of the forum.',
          'style'       => 'min-width: 300px;',
      );
      $settings['extract_user_id'] = array(
          'label'       => 'Extract User ID',
          'default'     => '',
          'description' => 'Set this to the user ID of the user whose posts are to be extracted.',
      );

      $can_login = $this->supports_feature('login');
      if ($can_login) {
          $settings['login_user'] = array(
              'label'       => 'Login User Username',
              'default'     => '',
              'description' => 'Set this to the username of the user whom you wish to log in as, or leave it blank if you do not wish FUPS to log in.',
          );
          $settings['login_password'] = array(
              'label'       => 'Login User Password',
              'default'     => '',
              'description' => 'Set this to the password associated with the Login User Username (or leave it blank if you do not require login).',
              'type'        => 'password',
          );
      }

      $settings['start_from_date'] = array(
          'label'       => 'Start From Date+Time',
          'default'     => '',
          'description' => 'Set this to the datetime of the earliest post to be extracted i.e. only posts of this datetime and later will be extracted. If you do not set this (i.e. if you leave it blank) then all posts will be extracted. This value is parsed with PHP\'s <a href="http://www.php.net/strtotime">strtotime()</a> function, so check that link for details on what it should look like. An example of something that will work is: 2013-04-30 15:30.',
      );
      $settings['php_timezone'] = array(
          'label'       => 'PHP Timezone',
          'default'     => 'Australia/Hobart',
          'description' => 'Set this to the time zone in which the user\'s posts were made. Valid time zone values are listed starting <a href="http://php.net/manual/en/timezones.php">here</a>. This is a required setting, because PHP requires the time zone to be set when using date/time functions, however it only applies when "Start From Date+Time" is set above, in which case the value that you supply for "Start From Date+Time" will be assumed to be in the time zone you supply here, as will the date+times for posts retrieved from the forum. It is safe to leave this value set to the default if you are not supplying a value for the "Start From Date+Time" setting.',
      );
      $settings['non_us_date_format'] = array(
          'label'       => 'Non-US date format',
          'default'     => '',
          'description' => 'Check this box if the forum from which you\'re scraping outputs dates in the non-US ordering dd/mm rather than the US ordering mm/dd. Applies only if day and month are specified by digits and separated by forward slashes.',
          'type'        => 'checkbox',
      );

      return $settings;
  }
  # Prefixes the given (already formatted) settings string with a heading line, for inclusion in
  # error reports.
  static protected function get_settings_msg_s($settings_str) {
      $heading = 'The session\'s settings are:';
      return $heading.PHP_EOL.$settings_str;
  }
  # Formats this session's settings as one "\t<name>=<value>" line per setting, for inclusion in
  # error reports. The values of settings listed in $this->private_settings (e.g. passwords) are
  # replaced by "[redacted]" whenever they have been supplied.
  # Returns the formatted string (empty if there are no settings).
  protected function get_settings_str() {
      $settings_str = '';
      foreach ($this->settings as $k => $v) {
          # A plain truthiness test would treat the value "0" as "not supplied" and leak it,
          # so test explicitly for the values that mean "nothing was supplied".
          $has_value = ($v !== '' && $v !== null && $v !== false);
          # Numeric-looking keys are converted to integers by PHP, so compare as strings, and
          # strictly, to avoid an integer key loosely matching a setting name.
          if ($has_value && in_array((string)$k, $this->private_settings, true)) {
              $v = '[redacted]';
          }
          $settings_str .= "\t$k=$v".PHP_EOL;
      }
      return $settings_str;
  }
  # Returns the URL of the page for the topic (thread) identified by $forumid and $topicid.
  # run() fetches this URL to scrape the name of the thread's author.
  abstract protected function get_topic_url($forumid, $topicid);
  # Must be implemented by descendant classes.
  # NOTE(review): not called within the visible part of this file - judging by its name, it returns
  # the URL of the profile page of the user whose posts are being extracted; confirm against the callers.
  abstract protected function get_user_page_url();
  # Discovers the supported forum types by listing the class files in this file's directory.
  # A class file is any entry named "C<Name>.php" other than this base class's own file; entries
  # that do not follow that naming convention (backup files, dotfiles, subdirectories, etc.) are
  # skipped rather than being turned into bogus forum types.
  # Returns an array mapping the lowercased forum type name to the name as cased in the filename,
  # e.g. "CFoo.php" yields 'foo' => 'Foo'. Returns an empty array if the directory cannot be read.
  static public function get_valid_forum_types() {
      static $ignored_files = array('.', '..', 'CFUPSBase.php');
      $ret = array();
      $class_files = scandir(__DIR__);
      if ($class_files === false) {
          return $ret;
      }
      foreach ($class_files as $class_file) {
          if (in_array($class_file, $ignored_files, true)) {
              continue;
          }
          # Capture the name between the initial "C" and the trailing ".php".
          if (!preg_match('/^C(.+)\.php$/', $class_file, $name_match)) {
              continue;
          }
          $class = $name_match[1];
          $ret[strtolower($class)] = $class;
      }
      return $ret;
  }
  # Per-progress-level hooks. run() calls "hook_after__<name>", where <name> is looked up in
  # $this->progress_levels, once the work of the corresponding progress level has been done.
  # All of the implementations here are no-ops.
  # NOTE(review): presumably intended to be overridden by forum-specific descendant classes - confirm.
  protected function hook_after__init_user_post_search () {} // Run after progress level 0
  protected function hook_after__user_post_search () {} // Run after progress level 1
  protected function hook_after__topic_post_sort () {} // Run after progress level 2
  protected function hook_after__posts_retrieval () {} // Run after progress level 3
  protected function hook_after__extract_per_thread_info() {} // Run after progress level 4
  protected function hook_after__handle_missing_posts () {} // Run after progress level 5
  protected function hook_after__write_output () {} // Run after progress level 6
  protected function hook_after__check_send_non_fatal_err_email() {} // Run after progress level 7
  # Resets the post search counter to zero. Called by run() during progress level 0, before the
  # search for the user's posts begins.
  # NOTE(review): how the counter is advanced and interpreted is not visible here - confirm against
  # the search code.
  protected function init_post_search_counter() {
  $this->post_search_counter = 0;
  }
  # Called by run() during progress level 0, immediately after init_post_search_counter(), i.e.
  # before any search pages are scraped. The implementation here is a no-op.
  # NOTE(review): presumably a place for descendant classes to do forum-specific search setup - confirm.
  protected function init_search_user_posts() {}
  # Reads the given settings file and returns the value of its 'forum_type' setting, or false if
  # the file does not contain that setting.
  static public function read_forum_type_from_settings_file_s($settings_filename) {
      $raw_settings = static::read_settings_raw_s($settings_filename);
      if (!isset($raw_settings['forum_type'])) {
          return false;
      }
      return $raw_settings['forum_type'];
  }
  # Reads a settings file consisting of one "name=value" pair per line, without validating or
  # interpreting the settings in any way.
  # Lines may end in either "\n" or "\r\n" regardless of the platform on which this is run, so that
  # a settings file written on Windows does not yield values with a trailing "\r" elsewhere.
  # Lines without an "=" are ignored; only the first "=" on a line separates name from value, so
  # values may themselves contain "=". If a name occurs more than once, the last occurrence wins.
  # Returns an array mapping setting names to their (string) values; this is empty if the file
  # cannot be read (in which case file_get_contents() will also have raised its usual warning).
  static public function read_settings_raw_s($settings_filename) {
      $ret = array();
      $contents = file_get_contents($settings_filename);
      if ($contents === false) {
          return $ret;
      }
      foreach (preg_split('/\r?\n/', $contents) as $line) {
          $a = explode('=', $line, 2);
          if (count($a) < 2) continue;
          $ret[$a[0]] = $a[1];
      }
      return $ret;
  }
  # Builds the extended (admin) version of an error message and records it.
  #
  # The message consists of the classname and settings preambles (each included only if nothing has
  # yet been written to the admin error file and the respective argument was supplied), followed
  # by the formatted error itself, followed by the HTML of the relevant page if $html was supplied.
  #
  # If $token is set, the message is appended to that token's admin error file, and on success
  # $have_written_to_admin_err_file (taken by reference) is set to true. Otherwise, only the HTML
  # part of the message is written to stderr.
  # If $dbg is set, diagnostics about opening/writing the admin error file are written to stderr.
  #
  # All writes to stderr are skipped if stderr could not be opened, rather than passing false to
  # fwrite()/fclose(), which is a TypeError as of PHP 8.
  #
  # Returns the full admin message, whether or not it could be recorded.
  static protected function record_err_admin_s($msg, $file, $method, $line, &$have_written_to_admin_err_file, $classname = null, $html = false, $settings_str = false, $token = false, $dbg = false) {
      $ferr = fopen('php://stderr', 'a');
      $have_ferr = ($ferr !== false);
      $html_msg = $html !== false ? 'The relevant page\'s HTML is:'.PHP_EOL.PHP_EOL.$html.PHP_EOL.PHP_EOL.PHP_EOL.PHP_EOL.PHP_EOL.PHP_EOL : '';
      $settings_msg = (!$have_written_to_admin_err_file && $settings_str) ? static::get_settings_msg_s($settings_str) : '';
      $classname_msg = (!$have_written_to_admin_err_file && $classname) ? static::get_classname_msg_s($classname).PHP_EOL.PHP_EOL : '';
      $full_admin_msg = $classname_msg.$settings_msg.PHP_EOL.static::get_formatted_err($method, $line, $file, $msg).PHP_EOL.PHP_EOL.$html_msg;
      if ($token) {
          $filename = make_errs_admin_filename($token);
          if ($dbg && $have_ferr) {
              fwrite($ferr, 'Attempting to open "'.$filename.'" for appending.'.PHP_EOL);
          }
          $ferr_adm = fopen($filename, 'a');
          if ($ferr_adm !== false) {
              if (fwrite($ferr_adm, $full_admin_msg) === false) {
                  if ($dbg && $have_ferr) fwrite($ferr, 'Error: failed to fwrite() to '.$filename.'.'.PHP_EOL);
              } else $have_written_to_admin_err_file = true;
              fclose($ferr_adm);
          } else if ($dbg && $have_ferr) fwrite($ferr, 'Error: failed to fopen() '.$filename.' for appending.'.PHP_EOL);
      } else if ($have_ferr) fwrite($ferr, $html_msg);
      if ($have_ferr) fclose($ferr);
      return $full_admin_msg;
  }
  # Runs the scraping process, from whichever progress level this instance is currently at through
  # to completion.
  #
  # After setting up cURL (and logging in, if supported and not resuming), the work proceeds
  # through the following progress levels, each guarded by a check of $this->progress_level so that
  # an instance that is already part-way through starts at the right place:
  #   0: initialise the search for the user's posts
  #   1: scrape the search pages for the user's posts
  #   2: sort topics and posts
  #   3: retrieve the contents of each post
  #   4: retrieve per-thread information (the thread's author)
  #   5: report any posts whose contents were not found
  #   6: write the output
  #   7: (web-initiated runs only) email the admin about any non-fatal errors
  # After each level, the corresponding hook_after__<level name>() method is called.
  #
  # check_do_chain() is called regularly during the long-running levels (1, 3 and 4).
  # NOTE(review): presumably it saves state, chains a new instance of the script and exits once
  # FUPS_CHAIN_DURATION has elapsed, with the new instance resuming here - confirm.
  public function run() {
      # Restrict cURL to HTTP(S) for both requests and redirects (see the option comments below).
      $valid_protocols = (CURLPROTO_HTTP | CURLPROTO_HTTPS);
      $this->cookie_filename = make_cookie_filename($this->web_initiated ? $this->token : $this->settings_filename);
      if ($this->dbg) $this->write_err('Set cookie_filename to "'.$this->cookie_filename.'".');
      if (!$this->was_chained) {
          @unlink($this->cookie_filename); // Ensure that any existing cookie file on commandline reruns doesn't mess with us.
      }
      $this->ch = curl_init();
      if ($this->ch === false) {
          $this->exit_err('Failed to initialise cURL.', __FILE__, __METHOD__, __LINE__);
      }
      $opts = array(
          CURLOPT_USERAGENT => FUPS_USER_AGENT,
          CURLOPT_FOLLOWLOCATION => false, // We emulate this due to a bug - see do_send().
          CURLOPT_RETURNTRANSFER => true,
          CURLOPT_HEADER => true,
          CURLOPT_TIMEOUT => 20,
          CURLOPT_COOKIEJAR => $this->cookie_filename,
          CURLOPT_COOKIEFILE => $this->cookie_filename,
          CURLOPT_PROTOCOLS => $valid_protocols, // Protect against malicious users specifying 'file://...' as base_url setting.
          CURLOPT_REDIR_PROTOCOLS => $valid_protocols, // Protect against malicious users specifying a base_url setting to a server which redirects to 'file://...'.
      );
      if (!curl_setopt_array($this->ch, $opts)) {
          $this->exit_err('Failed to set the following cURL options:'.PHP_EOL.var_export($opts, true), __FILE__, __METHOD__, __LINE__);
      }
      # Login if necessary
      if ($this->supports_feature('login')) {
          if ($this->was_chained) {
              if ($this->dbg) $this->write_err('Not bothering to check whether to log in again, because we\'ve just chained.');
          } else $this->check_do_login();
      }
      # Find all of the user's posts through the search feature
      if ($this->progress_level == 0) {
          if ($this->dbg) $this->write_err('Entered progress level '.$this->progress_level);
          $this->check_get_username();
          $this->search_page_num = 1;
          $this->init_post_search_counter();
          $this->init_search_user_posts();
          # The hook's name is determined before the level is incremented, but the hook is called afterwards.
          $hook_method = 'hook_after__'.$this->progress_levels[$this->progress_level];
          $this->progress_level++;
          $this->$hook_method(); // hook_after__init_user_post_search();
      }
      if ($this->progress_level == 1) {
          if ($this->dbg) $this->write_err('Entered progress level '.$this->progress_level);
          # The progress level is not advanced anywhere in this loop, so the loop ends only when
          # one of the methods called within it changes $this->progress_level.
          # NOTE(review): presumably find_author_posts_via_search_page() does so once there are
          # no further search results - confirm.
          do {
              $this->write_status('Scraping search page for posts starting from page #'.$this->search_page_num.'.');
              $num_posts_found = $this->find_author_posts_via_search_page();
              if ($this->dbg) $this->write_err('Found '.$num_posts_found.' posts.');
              $this->total_posts += $num_posts_found;
              $this->search_page_num++;
              $this->check_do_chain();
          } while ($this->progress_level == 1);
          # The level has already been advanced by this point, hence the "-1" to name this level's hook.
          $hook_method = 'hook_after__'.$this->progress_levels[$this->progress_level-1];
          $this->$hook_method(); // hook_after__user_post_search();
      }
      # Sort topics and posts
      if ($this->progress_level == 2) {
          if ($this->dbg) $this->write_err('Entered progress level '.$this->progress_level);
          $this->write_status('Sorting posts and topics prior to scraping posts\' content.');
          # Sort topics in ascending alphabetical order
          uasort($this->posts_data, 'cmp_topics_topic');
          # Sort posts within each topic into ascending timestamp order
          foreach ($this->posts_data as $topicid => $dummy) {
              $posts =& $this->posts_data[$topicid]['posts'];
              uasort($posts, 'cmp_posts_date');
          }
          # In debug mode, dump the sorted topics and posts to the error output.
          if ($this->dbg) {
              $this->write_err('SORTED POSTS::');
              foreach ($this->posts_data as $topicid => $topic) {
                  $this->write_err("\tTopic: {$topic['topic']}\tTopic ID: $topicid");
                  foreach ($topic['posts'] as $postid => $p) {
                      $newts = strftime('%c', $p['timestamp']);
                      $this->write_err("\t\tTime: $newts ({$p['ts']}); Post ID: $postid");
                  }
              }
          }
          $this->write_status('Finished sorting posts and topics. Now scraping contents of '.$this->total_posts.' posts.');
          $hook_method = 'hook_after__'.$this->progress_levels[$this->progress_level];
          $this->progress_level++;
          $this->$hook_method(); // hook_after__topic_post_sort();
      }
      # Retrieve the contents of all of the user's posts
      if ($this->progress_level == 3) {
          if ($this->dbg) $this->write_err('Entered progress level '.$this->progress_level);
          # If the current topic ID is already set, then we are continuing after having chained.
          $go = is_null($this->current_topic_id);
          foreach ($this->posts_data as $topicid => $dummy) {
              # When continuing, skip topics until the one that was in progress is reached; that
              # topic is itself (re)processed, since only posts still lacking content are fetched.
              if (!$go && $this->current_topic_id == $topicid) $go = true;
              if ($go) {
                  $this->current_topic_id = $topicid;
                  $t =& $this->posts_data[$topicid];
                  $posts =& $t['posts'];
                  # Keep making passes over this topic's posts until a pass completes without
                  # having attempted any retrieval, i.e. until every post either has content or
                  # has been recorded as not found.
                  $done = false;
                  while (!$done) {
                      $done = true;
                      foreach ($posts as $postid => $dummy2) {
                          $p =& $posts[$postid];
                          if ($p['content'] == null && !isset($this->posts_not_found[$postid])) {
                              $this->get_post_contents($t['forumid'], $topicid, $postid);
                              $this->write_status('Retrieved '.$this->num_posts_retrieved.' of '.$this->total_posts.' posts.');
                              $done = false;
                          }
                          $this->check_do_chain();
                      }
                  }
              }
          }
          $this->current_topic_id = null; # Reset this for progress level 4
          $hook_method = 'hook_after__'.$this->progress_levels[$this->progress_level];
          $this->progress_level++;
          $this->$hook_method(); // hook_after__posts_retrieval();
      }
      # Extract per-thread information: thread author and forum
      if ($this->progress_level == 4) {
          if ($this->dbg) $this->write_err('Entered progress level '.$this->progress_level);
          # If the current topic ID is already set, then we are continuing after having chained.
          $go = is_null($this->current_topic_id);
          $total_threads = count($this->posts_data);
          foreach ($this->posts_data as $topicid => $dummy) {
              if (!$go) {
                  # When continuing, the current topic ID is that of the last topic that was fully
                  # processed (it is set below only after processing), so that topic is skipped
                  # and processing resumes with the one following it.
                  if ($this->current_topic_id == $topicid) $go = true;
              } else {
                  $topic =& $this->posts_data[$topicid];
                  $url = $this->get_topic_url($topic['forumid'], $topicid);
                  $this->set_url($url);
                  $html = $this->do_send();
                  if (!$this->skins_preg_match('thread_author', $html, $matches)) {
                      $this->write_and_record_err_admin("Error: couldn't find a match for the author of the thread with topic id '$topicid'. The URL of the page is <".$url.'>.', __FILE__, __METHOD__, __LINE__, $html);
                      # Use a placeholder so that output can still be generated for this topic.
                      $topic['startedby'] = '???';
                  } else {
                      $topic['startedby'] = $matches[1];
                      if ($this->dbg) $this->write_err("Added author of '{$topic['startedby']}' for topic id '$topicid'.");
                      $this->num_thread_infos_retrieved++;
                      $this->write_status('Retrieved author and topic name for '.$this->num_thread_infos_retrieved.' of '.$total_threads.' threads.');
                  }
                  $this->current_topic_id = $topicid;
                  $this->check_do_chain();
              }
          }
          $hook_method = 'hook_after__'.$this->progress_levels[$this->progress_level];
          $this->progress_level++;
          $this->$hook_method(); // hook_after__extract_per_thread_info();
      }
      # Warn about missing posts
      if ($this->progress_level == 5) {
          if ($this->dbg) $this->write_err('Entered progress level '.$this->progress_level);
          if ($this->posts_not_found) {
              $this->write_err(PHP_EOL.PHP_EOL.PHP_EOL."The contents of the following posts were not found::".PHP_EOL.PHP_EOL.PHP_EOL);
              foreach ($this->posts_not_found as $postid => $dummy) {
                  # find_post() yields the post's data, its topic's data and the topic ID, or a false value on failure.
                  $a = $this->find_post($postid);
                  if ($a == false) $this->write_err("\tError: failed to find post with ID '$postid' in internal data.");
                  else {
                      list($p, $t, $topicid) = $a;
                      $this->write_err("\t{$p['posttitle']} ({$t['topic']}; {$p['timestamp']}; {$t['forum']}; forumid: {$t['forumid']}; topicid: $topicid; postid: $postid; ".$this->get_post_url($t['forumid'], $topicid, $postid).')');
                  }
              }
          }
          $hook_method = 'hook_after__'.$this->progress_levels[$this->progress_level];
          $this->progress_level++;
          $this->$hook_method(); // hook_after__handle_missing_posts();
      }
      # Write output
      if ($this->progress_level == 6) {
          if ($this->dbg) $this->write_err('Entered progress level '.$this->progress_level);
          $this->write_status('Writing output.');
          # Write all output variants
          $this->write_output();
          # Signal that we are done
          $this->write_status('DONE');
          $hook_method = 'hook_after__'.$this->progress_levels[$this->progress_level];
          $this->progress_level++;
          $this->$hook_method(); // hook_after__write_output();
      }
      # Potentially send an admin email re non-fatal errors.
      if ($this->progress_level == 7) {
          if ($this->dbg) $this->write_err('Entered progress level '.$this->progress_level);
          if ($this->web_initiated) {
              $errs = file_get_contents(make_errs_filename ($this->token));
              // Disable error messages because if there are no errors then this file
              // won't exist - we want to avoid an error message telling us as much.
              $errs_admin = @file_get_contents(make_errs_admin_filename($this->token));
              # Only send an email if at least one of the two error files has content.
              if ($errs || $errs_admin) {
                  $err_msg = '';
                  if ($errs) {
                      # Cap the amount of the error file that is included in the email.
                      $len = strlen($errs);
                      $trunc_msg = '';
                      if ($len > FUPS_MAX_ERROR_FILE_EMAIL_LENGTH) {
                          $errs = substr($errs, 0, FUPS_MAX_ERROR_FILE_EMAIL_LENGTH);
                          $trunc_msg = ' (truncated from '.number_format($len).' bytes to '.number_format(FUPS_MAX_ERROR_FILE_EMAIL_LENGTH).' bytes)';
                      }
                      // No need to include the settings and classname if admin error info exists too,
                      // because settings and classname are already included each time the admin error
                      // file is appended to.
                      if (!$errs_admin) {
                          $settings_msg = static::get_settings_msg_s(static::get_settings_str());
                          $classname_msg = static::get_classname_msg_s(get_class($this));
                          $err_msg .= $settings_msg.PHP_EOL.PHP_EOL.$classname_msg.PHP_EOL;
                      }
                      $err_msg .= 'The following non-fatal errors were recorded in the error file'.$trunc_msg.':'.PHP_EOL.PHP_EOL.$errs.PHP_EOL;
                  }
                  if ($errs_admin) {
                      if ($errs) $err_msg .= PHP_EOL.PHP_EOL;
                      # Likewise cap the amount of the admin error file that is included.
                      $len = strlen($errs_admin);
                      $trunc_msg = '';
                      if ($len > FUPS_MAX_ADMIN_FILE_EMAIL_LENGTH) {
                          $errs_admin = substr($errs_admin, 0, FUPS_MAX_ADMIN_FILE_EMAIL_LENGTH);
                          $trunc_msg = ' (truncated from '.number_format($len).' bytes to '.number_format(FUPS_MAX_ADMIN_FILE_EMAIL_LENGTH).' bytes)';
                      }
                      $err_msg .= 'The following extended non-fatal error messages were recorded in the admin error file'.$trunc_msg.':'.PHP_EOL.PHP_EOL.$errs_admin.PHP_EOL;
                  }
                  # The final argument marks these errors as non-fatal.
                  static::send_err_mail_to_admin_s($err_msg, $this->token, false);
              }
          }
          $hook_method = 'hook_after__'.$this->progress_levels[$this->progress_level];
          $this->progress_level++;
          $this->$hook_method(); // hook_after__check_send_non_fatal_err_email();
      }
  }
  # Emails an error report to the admin (FUPS_EMAIL_RECIPIENT) as UTF-8 plain text.
  # The body consists of an introductory line, a dump of this process's commandline arguments, and
  # then $full_admin_msg. If $token is set, it is appended to the subject.
  # $is_fatal selects between the "Fatal error" and "Non-fatal error(s)" wordings.
  static protected function send_err_mail_to_admin_s($full_admin_msg, $token = false, $is_fatal = true) {
      global $argv;

      # Both the subject and the body begin with the same description of the error's severity.
      $severity = ($is_fatal ? 'F' : 'Non-f').'atal error'.($is_fatal ? '' : '(s)');

      $subject = $severity.' in FUPS process';
      if ($token) {
          $subject .= ' '.$token;
      }

      $intro = $severity.' occurred in the FUPS process with commandline arguments:'.PHP_EOL.var_export($argv, true).PHP_EOL.PHP_EOL;
      $body = $intro.$full_admin_msg;

      $header_lines = array(
          'From: '.FUPS_EMAIL_SENDER."\r\n",
          "MIME-Version: 1.0\r\n",
          "Content-type: text/plain; charset=UTF-8\r\n",
      );
      $headers = implode('', $header_lines);

      mail(FUPS_EMAIL_RECIPIENT, $subject, $body, $headers);
  }
  # Sets the URL that the cURL handle will request next and, on success, remembers it in
  # $this->last_url. Reports failure to set the URL via exit_err().
  protected function set_url($url) {
      $url_was_set = curl_setopt($this->ch, CURLOPT_URL, $url);
      if ($url_was_set) {
          $this->last_url = $url;
          return;
      }
      $this->exit_err('Failed to set cURL URL: <'.$url.'>.', __FILE__, __METHOD__, __LINE__);
  }
  994. protected function skins_preg_match_base($regexp_id, $text, &$matches, $all = false, $match_indexes_id = false, $combine = false) {
  995. $ret = false;
  996. $matches = array();
  997. foreach ($this->regexps as $skin => $skin_regexps) {
  998. if (!empty($skin_regexps[$regexp_id])) {
  999. $regexp = $skin_regexps[$regexp_id];
  1000. if (
  1001. ($all && preg_match_all($regexp, $text, $matches_tmp, PREG_SET_ORDER))
  1002. ||
  1003. (!$all && preg_match($regexp, $text, $matc