PageRenderTime 27ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/classes/CXenForo.php

https://gitlab.com/ezeql/fups
PHP | 291 lines | 182 code | 33 blank | 76 comment | 32 complexity | d0615b42739d07559f124f6242650dbb MD5 | raw file
  1. <?php
  2. /*
  3. * FUPS: Forum user-post scraper. An extensible PHP framework for scraping and
  4. * outputting the posts of a specified user from a specified forum/board
  5. * running supported forum software. Can be run as either a web app or a
  6. * commandline script.
  7. *
  8. * Copyright (C) 2013-2014 Laird Shaw.
  9. *
  10. * This program is free software: you can redistribute it and/or modify
  11. * it under the terms of the GNU Affero General Public License as
  12. * published by the Free Software Foundation, either version 3 of the
  13. * License, or (at your option) any later version.
  14. *
  15. * This program is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. * GNU Affero General Public License for more details.
  19. *
  20. * You should have received a copy of the GNU Affero General Public License
  21. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  22. *
  23. */
  24. /* File : classes/CXenForo.php.
  25. * Description: The XenForo forum scraper class, descending from FUPSBase.
  26. */
  27. class XenForoFUPS extends FUPSBase {
  /** Trailing run of digits taken from the 'extract_user_id' setting; used to build the member search URL. */
  protected $user_id_num = '';

  /**
   * Map of thread title => thread ID. While the search pages are scraped the values
   * are placeholder counters; they are overwritten with IDs matched on the thread
   * pages once post contents are fetched.
   */
  protected $topic_ids = array();

  /**
   * Scraping regexes, grouped by skin key.
   * NOTE(review): "cwt" presumably stands for CivilWarTalk, the forum named as the
   * default base URL in get_settings_array() - confirm.
   */
  protected $regexps = array(
    'cwt_default' => array(
      // Board title, matched on any forum page.
      'board_title' => '#<div class="boardTitle"><strong>([^<]*)</strong></div>#',
      // User name, matched on the profile page </members/[user_id]/>.
      'user_name' => '#<h1 itemprop="name" class="username"><span class="[^"]*">([^<]*)</span>#',
      // Thread author, matched on the thread view page </threads/[topicid]/>.
      'thread_author' => '#<p id="pageDescription" class="muted ">[^<]*<a href="forums/[^/]*/">[^<]*</a>[^<]*<a href="members/[^/]*/" class="username"[^>]*>([^<]*)</a>#Us',
      // Detects a search results page </search/[searchid]/?page=[pagenum]> that has no results.
      'search_results_not_found' => '#<div class="messageBody">[^<]*</div>#',
      // Matched against the user's posts search page </search/[searchid]/?page=[pagenum]>
      // via preg_match_all with PREG_SET_ORDER; the meaning of each capture group is
      // given by search_results_page_data_order.
      // N.B. Must not match anything that another skin's search_results_page_data
      // regex matches - the results of all of them are combined!
      'search_results_page_data' => '#<div class="listBlock main">\\s*<div class="titleText">\\s*<span class="contentType">[^<]*</span>\\s*<h3 class="title"><a href="([^/]*)/([^/]+)/">(<span[^>]*>[^<]*</span> )?([^<]*)</a></h3>\\s*</div>\\s*<blockquote class="snippet">\\s*<a href="[^/]*/[^/]+/">[^<]*</a>\\s*</blockquote>\\s*<div class="meta">\\s*[^<]*<a href="members/[^/]*/"\\s*class="username"[^>]*>[^<]*</a>,\\s*<span class="DateTime" title="([^"]+)">[^<]*</span>[^<]*<a href="forums/([^/]*)/">([^<]*)</a>#Us',
      // Capture-group index of each field within a search_results_page_data match:
      //   'topic'          => title of the post's topic (thread)
      //   'ts'             => timestamp of the post
      //   'forum'          => title of the forum
      //   'forumid'        => forum id
      //   'postid'         => post id
      //   'postsorthreads' => text which is either "posts" or "threads"
      'search_results_page_data_order' => array('topic' => 4, 'ts' => 5, 'forum' => 7, 'forumid' => 6, 'postid' => 2, 'postsorthreads' => 1),
      // Post id (group 1) and post contents (group 2) on a thread page; run with
      // match_all so that every post on the page is returned.
      'post_contents' => '#<li id="post-(\\d+)".*<article>\\s*<blockquote class="messageText [^"]*">(.*)\\s*?</blockquote>\\s*</article>#Us',
      // Thread id on a thread page; the placeholder is replaced in __construct().
      'thread_id' => '[THIS REGEX IS SET WITHIN __construct()]',
      // Link to older search results; group 1 is the value of its "before" parameter.
      'older_content' => '#<div class="secondaryContent olderMessages">\\s*<a href="search/member\\?user_id=\\d*&amp;before=(\\d+)">#'
    ),
    'cwt_default2' => array(
      // Variant for pages where the inner <span> of the user name heading is missing.
      'user_name' => '#<h1 itemprop="name" class="username">([^<]*)</h1>#',
      // Variant for pages where the DateTime element is an <abbr> rather than a <span>.
      'search_results_page_data' => '#<div class="listBlock main">\\s*<div class="titleText">\\s*<span class="contentType">[^<]*</span>\\s*<h3 class="title"><a href="([^/]*)/([^/]+)/">(<span[^>]*>[^<]*</span> )?([^<]*)</a></h3>\\s*</div>\\s*<blockquote class="snippet">\\s*<a href="[^/]*/[^/]+/">[^<]*</a>\\s*</blockquote>\\s*<div class="meta">\\s*[^<]*<a href="members/[^/]*/"\\s*class="username"[^>]*>[^<]*</a>,\\s*<abbr class="DateTime"[^>]*>([^<]*)</abbr>[^<]*<a href="forums/([^/]*)/">([^<]*)</a>#Us',
      'search_results_page_data_order' => array('topic' => 4, 'ts' => 5, 'forum' => 7, 'forumid' => 6, 'postid' => 2, 'postsorthreads' => 1),
    ),
  );
  /**
   * Registers the XenForo-specific 'thread_url_prefix' setting as required, runs the
   * parent constructor, and then builds the 'thread_id' regex from that setting.
   *
   * @param mixed $web_initiated Passed through unchanged to the parent constructor.
   * @param mixed $params        Passed through unchanged to the parent constructor.
   * @param bool  $do_not_init   When true, neither the required setting nor the
   *                             'thread_id' regex is set up here.
   */
  public function __construct($web_initiated, $params, $do_not_init = false) {
    if (!$do_not_init) {
      $this->required_settings[] = 'thread_url_prefix';
    }
    parent::__construct($web_initiated, $params, $do_not_init);
    if (!$do_not_init) {
      // The prefix is user-supplied text, not a pattern: escape it (including the
      // '#' delimiter) so that characters such as '.' or '#' are matched literally
      // rather than altering or breaking the regex.
      $quoted_prefix = preg_quote($this->settings['thread_url_prefix'], '#');
      $this->regexps['cwt_default']['thread_id'] = '#<a href="'.$quoted_prefix.'([^/]*)/[^"]*" title="[^"]*" class="datePermalink"#';
    }
  }
  /**
   * Per-match hook for the search results scrape. Applies two XenForo workarounds:
   * it converts a thread id into the id of the thread's first post, and it assigns
   * a placeholder topic id.
   *
   * Only $postid and $topicid are written; the other by-reference parameters are
   * left untouched by this method.
   */
  protected function find_author_posts_via_search_page__match_hook($match, &$forum, &$forumid, &$topic, &$topicid, &$postid, &$posttitle, &$ts_raw, &$ts) {
    # Workaround 1: a post which starts a thread shows up in XenForo search results
    # as a "thread" result, in which case $postid actually holds a thread id that
    # has to be swapped for the id of the thread's first post.
    $is_thread_result = false;
    if (isset($match['match_indexes']['postsorthreads'])) {
      $kind_index = $match['match_indexes']['postsorthreads'];
      $is_thread_result = ($match[$kind_index] == 'threads');
    }
    if ($is_thread_result) {
      $url = $this->settings['base_url'].'/'.$this->settings['thread_url_prefix'].$postid; # really a threadid
      $this->set_url($url);
      $html = $this->do_send();
      if ($this->skins_preg_match('post_contents', $html, $matches)) {
        $postid = $matches[1];
      } else {
        $this->write_and_record_err_admin("Error: the regex to detect the first post ID on the thread page at <$url> failed.", __FILE__, __METHOD__, __LINE__, $html);
        $postid = null;
      }
    }

    # Workaround 2: the search page HTML does not contain the topic (thread) id, so
    # each distinct thread title is given a fake incrementing id here. The real ids
    # are resolved later, in get_post_contents__end_hook() and
    # hook_after__posts_retrieval().
    if (!isset($this->topic_ids[$topic])) {
      $assigned_ids = array_values($this->topic_ids);
      $next_id = array_pop($assigned_ids);
      $next_id++;
      $this->topic_ids[$topic] = $next_id;
    }
    $topicid = $this->topic_ids[$topic];
  }
  /**
   * Strips every whole-word "at " from the raw timestamp string, in place.
   * If preg_replace() fails (returns null), $ts_raw is left as it was.
   *
   * @param string $ts_raw Raw timestamp text, modified by reference.
   */
  protected function find_author_posts_via_search_page__ts_raw_hook(&$ts_raw) {
    if ($this->dbg) {
      $this->write_err('Deleting any "at " in time string "'.$ts_raw.'".');
    }
    $stripped = preg_replace('/\\bat /', '', $ts_raw);
    if ($stripped !== null) {
      $ts_raw = $stripped;
    }
  }
  /**
   * End-of-page hook for the search results scrape: decides which search page is
   * fetched next.
   *
   * If the page has no "older messages" link, the page counter simply advances.
   * If it has one, a new search id is requested for the older content: on success
   * the page counter restarts at 1, and on failure (null search id)
   * $do_inc_progress_level is set to true.
   */
  protected function find_author_posts_via_search_page__end_hook(&$do_inc_progress_level, $html, $found_earliest, $matches) {
    if (!$this->skins_preg_match('older_content', $html, $matches)) {
      $this->post_search_counter++;
      return;
    }

    $this->write_status('Attempting to determine next search ID.');
    $this->search_id = $this->get_search_id($matches[1]);
    if (is_null($this->search_id)) {
      $do_inc_progress_level = true;
    } else {
      $this->post_search_counter = 1;
    }
  }
  /**
   * @return string URL of the XenForo homepage.
   */
  public static function get_forum_software_homepage() {
    return 'http://xenforo.com/';
  }
  /**
   * @return string Explanation, for display to users, of how to recognise a XenForo forum.
   */
  public static function get_msg_how_to_detect_forum() {
    $message = 'Typically, XenForo forums can be identified by the presence of the text "Forum software by XenForo" in the footer of their forum pages. It is possible, however, that these footer texts have been removed by the administrator of the forum. In this case, the only way to know for sure is to contact your forum administrator.';
    return $message;
  }
  /**
   * First stage of the deferred thread (topic) id resolution; see also
   * find_author_posts_via_search_page__match_hook() and hook_after__posts_retrieval().
   *
   * When a post was found without error, the thread id is matched in the page HTML
   * and stored in $this->topic_ids under the thread's title, replacing the
   * placeholder id. A failed match is reported through write_and_record_err_admin().
   */
  protected function get_post_contents__end_hook($forumid, $topicid, $postid, $html, &$found, $err, $count, &$ret) {
    if ($err || !$found) {
      return;
    }

    if ($this->skins_preg_match('thread_id', $html, $matches)) {
      $thread_title = $this->posts_data[$topicid]['topic'];
      $this->topic_ids[$thread_title] = $matches[1];
    } else {
      $this->write_and_record_err_admin('Error: could not match the thread_id on the page with URL <'.$this->last_url.'>', __FILE__, __METHOD__, __LINE__, $html);
    }
  }
  /**
   * Builds the URL of a single post. Only $postid is used; the forum id, topic id
   * and $with_hash are ignored here.
   *
   * @return string "<base_url>/posts/<postid>/"
   */
  protected function get_post_url($forumid, $topicid, $postid, $with_hash = false) {
    $base = $this->settings['base_url'];
    return $base.'/posts/'.$postid.'/';
  }
  /**
   * Returns the parent's Q&A entries with XenForo-specific ones added: a "how do I
   * know it is XenForo" entry comes first, and two more (images, supported skins)
   * are inserted directly after the parent's 'q_lang' entry, if there is one.
   *
   * @return array Map of entry id => array('q' => question, 'a' => answer).
   */
  public static function get_qanda() {
    $inherited = parent::get_qanda();

    $merged = array();
    $merged['q_how_know_xenforo'] = array(
      'q' => 'How can I know if a forum is a XenForo forum?',
      'a' => self::get_msg_how_to_detect_forum(),
    );

    foreach ($inherited as $id => $qa) {
      $merged[$id] = $qa;
      if ($id != 'q_lang') {
        continue;
      }
      // The XenForo-specific entries belong straight after the 'q_lang' entry.
      $merged['q_images_supported'] = array(
        'q' => 'Are images supported?',
        'a' => 'Yes, images are supported so long as you are online at the time of viewing the output - they are not downloaded, the link is merely retained.',
      );
      $merged['q_which_skins_supported'] = array(
        'q' => 'Which skins are supported?',
        'a' => 'Whichever skin(s) is/are default for the <a href="http://civilwartalk.com">CivilWarTalk</a> and <a href="http://ecigssa.co.za/">ECIGS SA</a> forums. FUPS\' XenForo scraping functionality was originally developed as a paid job to extract posts from the CivilWarTalk forum; the XenForo software is otherwise unknown to the author of the FUPS software, who has not even registered for an account on CivilWarTalk, nor on any other XenForo forum, and who doesn\'t otherwise have access to the XenForo software, having not purchased it. If you need support for another XenForo skin, feel free to <a href="'.FUPS_CONTACT_URL.'">contact me</a>.',
      );
    }

    return $merged;
  }
  /**
   * Requests the forum's member-posts search for $this->user_id_num and extracts
   * the search id from the "Location:" header of the response.
   *
   * The id is the run of digits at the end of the redirect target, with or without
   * a trailing slash (e.g. ".../search/12345/" or ".../search/12345").
   *
   * @param string|false $before Value for the search's "before" query parameter,
   *                             or false to omit the parameter.
   * @return string|null The search id, or null when the request failed, no
   *                     "Location:" header was found, or the header's value did
   *                     not end in digits.
   */
  protected function get_search_id($before = false) {
    $search_id = null;

    $url = $this->settings['base_url'].'/search/member?user_id='.$this->user_id_num;
    if ($before !== false) {
      $url .= '&before='.$before;
    }
    $this->set_url($url);

    // NOTE(review): the parsing below needs the response to contain the headers of
    // the redirect itself, i.e. the cURL handle must have CURLOPT_HEADER enabled and
    // must not follow the redirect. Those options are not set in this method (code
    // that toggled them around this call was previously commented out here), so
    // presumably the handle is configured elsewhere - confirm.
    $response = curl_exec($this->ch);
    if ($response === false) {
      $this->write_err('curl_exec returned false. curl_error returns: "'.curl_error($this->ch).'".', __FILE__, __METHOD__, __LINE__);
    } else {
      $header_size = curl_getinfo($this->ch, CURLINFO_HEADER_SIZE);
      $headers = substr($response, 0, $header_size);
      if (preg_match('/^Location: (.*)$/im', $headers, $matches)) {
        $location = trim($matches[1]);
        // Anchor on the end of the URL and make the trailing slash optional, so
        // that a redirect target lacking the slash does not lose its last digit.
        if (preg_match('#([0-9]+)/?$#D', $location, $id_match)) {
          $search_id = $id_match[1];
        }
      } else if ($this->dbg) {
        $this->write_err('Failed to detect a "Location:" header.', __FILE__, __METHOD__, __LINE__);
      }
    }

    if ($this->dbg) $this->write_err("Got search ID of '$search_id'.");
    return $search_id;
  }
  /**
   * @return string URL of the current page of the current search:
   *                "<base_url>/search/<search_id>/?page=<post_search_counter>".
   */
  protected function get_search_url() {
    $base = $this->settings['base_url'];
    return "{$base}/search/{$this->search_id}/?page={$this->post_search_counter}";
  }
  /**
   * Extends the parent's settings array for XenForo: sets a default base URL,
   * appends XenForo-specific help text to the 'base_url' and 'extract_user_id'
   * descriptions, and adds the 'thread_url_prefix' setting.
   *
   * @return array The settings array from the parent with the changes above.
   */
  public function get_settings_array() {
    $settings = parent::get_settings_array();

    $base_url_help = ' This is the URL that appears in your browser\'s address bar when you access the forum, only with everything onwards from (and including) the path of whichever script is being accessed (e.g. /threads or /forums) stripped off. The default URL provided is for the particular XenForo board known as "CivilWarTalk".';
    $settings['base_url']['default'] = 'http://civilwartalk.com';
    $settings['base_url']['description'] .= $base_url_help;

    $user_id_help = ' You can find a user\'s ID by hovering your cursor over a hyperlink to their name and taking note of everything that appears between "/members/" and the next "/" (i.e. this will be something like "my-member-name.12345") in the browser\'s status bar.';
    $settings['extract_user_id']['description'] .= $user_id_help;

    $thread_prefix_help = 'Set this to that part of the URL for forum thread (topic) pages between the beginning part of the URL, that which was entered above beside "Base forum URL" but followed by a forward slash, and the end part of the URL, the thread id optionally followed by forward slash and page number. By default, this setting should be "threads/", but the XenForo forum software supports changing this default through <a href="https://xenforo.com/help/route-filters/">route filters</a>, and some XenForo forums have been configured in this way such that this setting ("Thread URL prefix") needs to be empty. An example of how to discern this value (it is emboldened) in a typical thread URL with "Base forum URL" set to "http://civilwartalk.com" is: "http://civilwartalk.com/<b>threads/</b>traveller.84936/page-2". Here, the initial base URL plus forward slash is obvious, the thread id part is "traveller.84936" and the optional-forward-slash-followed-by-page-number part is "/page-2". If route filtering were set up on the CivilWarTalk forum such that this setting should be empty, then that same thread URL would have looked like this: "http://civilwartalk.com/traveller.84936/page-2". If, hypothetically, this "Thread URL prefix" setting were to correctly be "topic/here/", then that same thread URL would have looked like this: "http://civilwartalk.com/topic/here/traveller.84936/page-2".';
    $settings['thread_url_prefix'] = array(
      'label' => 'Thread URL prefix',
      'default' => 'threads/',
      'description' => $thread_prefix_help,
    );

    return $settings;
  }
  /**
   * Builds the URL of a thread page. $forumid is not used.
   *
   * @return string "<base_url>/<thread_url_prefix><topicid>/"
   */
  protected function get_topic_url($forumid, $topicid) {
    $base = $this->settings['base_url'];
    $prefix = $this->settings['thread_url_prefix'];
    return $base.'/'.$prefix.$topicid.'/';
  }
  /**
   * @return string URL of the member page for the 'extract_user_id' setting, with
   *                that setting URL-encoded: "<base_url>/members/<id>/".
   */
  protected function get_user_page_url() {
    $member = urlencode($this->settings['extract_user_id']);
    return $this->settings['base_url'].'/members/'.$member.'/';
  }
  /**
   * Second and final stage of the deferred thread (topic) id resolution; see also
   * find_author_posts_via_search_page__match_hook() and get_post_contents__end_hook().
   *
   * Re-keys $this->posts_data so that each entry is indexed by the id stored in
   * $this->topic_ids for the entry's thread title, instead of by its original key.
   */
  protected function hook_after__posts_retrieval() {
    $rekeyed = array();
    foreach ($this->posts_data as $entry) {
      $resolved_id = $this->topic_ids[$entry['topic']];
      $rekeyed[$resolved_id] = $entry;
    }
    $this->posts_data = $rekeyed;
  }
  /**
   * Resets the search page counter to 1; get_search_url() uses it as the value of
   * the "page" parameter.
   */
  protected function init_post_search_counter() {
    $first_page = 1;
    $this->post_search_counter = $first_page;
  }
  /**
   * Prepares the user-posts search: derives the numeric user id from the
   * 'extract_user_id' setting and requests the initial search id.
   *
   * The numeric id is the run of ASCII digits at the very end of the setting
   * (e.g. "12345" for "my-member-name.12345"), or '' when it does not end in a
   * digit. It is recomputed from scratch on every call, so calling this method
   * more than once cannot accumulate digits onto an earlier value.
   */
  protected function init_search_user_posts() {
    $this->write_status('Attempting to determine search ID.');

    $user_id = (string) $this->settings['extract_user_id'];
    $this->user_id_num = preg_match('/[0-9]+\z/', $user_id, $digits) ? $digits[0] : '';

    $this->search_id = $this->get_search_id();
  }
  /**
   * Reports whether a feature is supported. 'login' is answered here (false);
   * every other feature is delegated to the parent class.
   *
   * @param string $feature Feature name.
   * @return mixed false for 'login', otherwise whatever the parent returns.
   */
  public function supports_feature($feature) {
    static $features = array(
      'login' => false
    );
    if (isset($features[$feature])) {
      return $features[$feature];
    }
    return parent::supports_feature($feature);
  }
  257. }
  258. ?>