PageRenderTime 53ms CodeModel.GetById 27ms RepoModel.GetById 1ms app.codeStats 0ms

/forum/gym_sitemaps/modules/google_xml.php

https://github.com/GreyTeardrop/socionicasys-forum
PHP | 247 lines | 179 code | 2 blank | 66 comment | 37 complexity | 4db86e261fedcc7a083088651cd702bf MD5 | raw file
Possible License(s): AGPL-1.0, LGPL-3.0, MPL-2.0-no-copyleft-exception
  1. <?php
  2. /**
  3. *
  4. * @package phpBB SEO GYM Sitemaps
  5. * @version $Id: google_xml.php 148 2009-11-07 14:50:54Z dcz $
  6. * @copyright (c) 2006 - 2009 www.phpbb-seo.com
  7. * @license http://opensource.org/osi3.0/licenses/lgpl-license.php GNU Lesser General Public License
  8. *
  9. */
  10. // First basic security
  11. if ( !defined('IN_PHPBB') ) {
  12. exit;
  13. }
  14. /**
  15. * google_xml Class
  16. * www.phpBB-SEO.com
  17. * @package phpBB SEO
  18. */
  19. class google_xml {
  20. var $url_settings = array();
  21. var $options = array();
  22. var $module_config = array();
  23. var $outputs = array();
  24. var $xml_files = array();
  25. /**
  26. * constuctor
  27. */
  28. function google_xml(&$gym_master) {
  29. $this->gym_master = &$gym_master;
  30. $this->options = &$this->gym_master->actions;
  31. $this->outputs = &$this->gym_master->output_data;
  32. $this->url_settings = &$this->gym_master->url_config;
  33. $this->module_config = array_merge(
  34. // Global
  35. $this->gym_master->google_config,
  36. // Other stuff required here
  37. array(
  38. 'google_sources' => $this->gym_master->path_config['gym_path'] . 'sources/',
  39. 'google_randomize' => (boolean) $this->gym_master->gym_config['google_xml_randomize'],
  40. 'google_unique' => (boolean) $this->gym_master->gym_config['google_xml_unique'],
  41. 'google_check_robots' => (boolean) $this->gym_master->gym_config['google_xml_check_robots'],
  42. 'google_force_limit' => (boolean) $this->gym_master->gym_config['google_xml_force_limit'],
  43. 'google_force_lastmod' => (boolean) $this->gym_master->gym_config['google_xml_force_lastmod'],
  44. )
  45. );
  46. $this->module_config['xml_parse'] = (boolean) ($this->module_config['google_randomize'] || $this->module_config['google_unique'] || $this->module_config['google_force_limit'] || $this->module_config['google_force_lastmod']|| $this->module_config['google_check_robots']);
  47. // Check cache
  48. $this->gym_master->gym_output->setup_cache(); // Will exit if the cache is sent
  49. // List available files
  50. $this->get_source_list();
  51. // Init url settngs
  52. $this->init_url_settings();
  53. }
  54. /**
  55. * Initialize mod rewrite to handle multiple URL standards.
  56. * Only one 'if' is required after this in THE loop to properly switch
  57. * between the four types (none, advanced, mixed and simple).
  58. * @access private
  59. */
  60. function init_url_settings() {
  61. global $phpbb_seo;
  62. // vars will fell like rain in the code ;)
  63. $this->url_settings['google_xml_delim'] = !empty($phpbb_seo->seo_delim['google_xml']) ? $phpbb_seo->seo_delim['google_xml'] : '-';
  64. $this->url_settings['google_xml_static'] = !empty($phpbb_seo->seo_static['google_xml']) ? $phpbb_seo->seo_static['google_xml'] : 'xml';
  65. $this->url_settings['modrewrite'] = $this->module_config['google_modrewrite'];
  66. if ($this->url_settings['modrewrite']) { // Module links
  67. $this->url_settings['google_xml_tpl'] = $this->module_config['google_url'] . 'xml' . $this->url_settings['google_xml_delim'] . '%1$s.xml' . $this->url_settings['gzip_ext_out'];
  68. } else {
  69. $this->url_settings['google_xml_tpl'] = $this->module_config['google_url'] . $this->url_settings['google_default'] . '?xml=%1$s';
  70. }
  71. return;
  72. }
  73. /**
  74. * sitemap, builds the sitemap
  75. * @access private
  76. */
  77. function sitemap() {
  78. global $cache, $phpEx, $config, $user;
  79. if (!empty($this->xml_files[$this->options['module_sub']])) {
  80. // Check robots.txt ?
  81. if ($this->module_config['google_check_robots']) {
  82. $this->gym_master->obtain_robots_disallows();
  83. }
  84. $sitemap_xml_url = sprintf( $this->url_settings['google_xml_tpl'], $this->options['module_sub'] );
  85. $this->gym_master->seo_kill_dupes($sitemap_xml_url);
  86. $xml_file = $this->xml_files[$this->options['module_sub']];
  87. // Grab data
  88. if (strpos($xml_file, 'http://') !== false) {
  89. @ini_set('user_agent','GYM Sitemaps &amp; RSS / www.phpBB-SEO.com');
  90. // You may want to use a higher value for the timout in case you use slow external sitemaps
  91. @ini_set('default_socket_timeout', 5);
  92. }
  93. if ($xml_data = @file_get_contents($xml_file)) {
  94. if (!empty($http_response_header)) {
  95. $_last_mod = get_date_from_header($http_response_header);
  96. } else {
  97. $_last_mod = (int) @filemtime($xml_file);
  98. }
  99. $this->outputs['last_mod_time'] = $_last_mod > $config['board_startdate'] ? $_last_mod : ($user->time_now - rand(500, 10000));
  100. if (($url_tag_pos = utf8_strpos($xml_data, '<url>')) === false) {
  101. // this basic test failed
  102. // @TODO add loggs about this ?
  103. $this->gym_master->gym_error(404, '', __FILE__, __LINE__);
  104. }
  105. if (!$this->module_config['xml_parse']) {
  106. // use our hown headers
  107. $xml_data = str_replace('</urlset>', '', trim($xml_data) );
  108. // Add to the output variable
  109. $this->outputs['data'] .= substr($xml_data, $url_tag_pos);
  110. // Link count
  111. $this->outputs['url_sofar'] = preg_match_all('`\<loc\>[^<>]+\</loc\>`Ui', $xml_data, $matches);
  112. // free memory
  113. unset($xml_data, $matches);
  114. } else {
  115. $total_matches = preg_match_all('`\<url\>.+\</url\>`Usi', $xml_data, $matches, PREG_SET_ORDER);
  116. // free memory
  117. unset($xml_data);
  118. if (!empty($matches)) {
  119. // Randomize ?
  120. if ($this->module_config['google_randomize']) {
  121. shuffle($matches);
  122. }
  123. // Limit ?
  124. if ($this->module_config['google_url_limit'] > 0 && $this->module_config['google_url_limit'] < $total_matches) {
  125. $matches = array_slice($matches, 0, $this->module_config['google_url_limit']);
  126. }
  127. // Force last mod ?
  128. $_last_mod = $this->module_config['google_force_lastmod'] ? $this->outputs['last_mod_time'] : 0;
  129. // Parse URLs
  130. $dt = rand(0, 3600);
  131. $url_check = array();
  132. foreach ($matches as $key => $data) {
  133. preg_match_all('`\<(loc|lastmod|changefreq|priority)\>([^<>]+)\</\1\>`Ui', $data[0], $url_tags, PREG_SET_ORDER);
  134. $loc = $priority = $changefreq = $lastmod = '';
  135. foreach ($url_tags as $url_tag) {
  136. if (empty($url_tag[1]) || empty($url_tag[2])) {
  137. continue;
  138. }
  139. $url_tag[1] = strtolower($url_tag[1]);
  140. ${$url_tag[1]} = trim($url_tag[2]);
  141. }
  142. if (empty($loc)) {
  143. continue;
  144. }
  145. // Check unique ?
  146. if ($this->module_config['google_unique']) {
  147. if (isset($url_check[$loc])) {
  148. continue;
  149. }
  150. $url_check[$loc] = 1;
  151. }
  152. if ($this->module_config['google_check_robots'] && $this->gym_master->is_robots_disallowed($loc)) {
  153. continue;
  154. }
  155. if ($this->module_config['google_force_lastmod']) {
  156. $last_mod = $_last_mod - $dt;
  157. $priority = $this->gym_master->get_priority($last_mod);
  158. $changefreq = $this->gym_master->get_changefreq($last_mod);
  159. $lastmod = gmdate('Y-m-d\TH:i:s'.'+00:00', $last_mod);
  160. } else {
  161. $lastmod = !empty($lastmod) ? $lastmod : 0;
  162. $priority = !empty($priority) ? $priority : 0;
  163. $changefreq = !empty($changefreq) ? $changefreq : 0;
  164. }
  165. $this->parse_item($loc, $priority, $changefreq, $lastmod);
  166. unset($matches[$key]);
  167. $dt += rand(30, 3600*12);
  168. }
  169. unset($url_check);
  170. } else {
  171. // Clear the cache to make sure the guilty url is not shown in the sitemapIndex
  172. $cache->destroy('_gym_config_google_xml');
  173. $this->gym_master->gym_error(500, '', __FILE__, __LINE__);
  174. }
  175. }
  176. } else {
  177. // Clear the cache to make sure the guilty url is not shown in the sitemapIndex
  178. $cache->destroy('_gym_config_google_xml');
  179. $this->gym_master->gym_error(404, '', __FILE__, __LINE__);
  180. }
  181. } else {
  182. $this->gym_master->gym_error(404, '', __FILE__, __LINE__);
  183. }
  184. return;
  185. }
  186. /**
  187. * sitemapindex, builds the sitemapindex
  188. * @access private
  189. */
  190. function sitemapindex() {
  191. global $config;
  192. // It's global list call, add module sitemaps
  193. // Reset the local counting, since we are cycling through modules
  194. $this->outputs['url_sofar'] = 0;
  195. foreach ($this->xml_files as $xml_action => $source) {
  196. $sitemap_xml_url = sprintf( $this->url_settings['google_xml_tpl'], $xml_action );
  197. $last_mod = (int) @filemtime($xml_file);
  198. $last_mod = ($last_mod > $config['board_startdate'] && !$this->module_config['google_force_lastmod']) ? $last_mod : (time() - rand(500, 10000));
  199. $this->gym_master->parse_sitemap($sitemap_xml_url, $last_mod);
  200. }
  201. // Add the local counting, since we are cycling through modules
  202. $this->outputs['url_sofar_total'] = $this->outputs['url_sofar_total'] + $this->outputs['url_sofar'];
  203. return;
  204. }
  205. /**
  206. * get_source_list, builds the available sitemap list
  207. * @access private
  208. */
  209. function get_source_list() {
  210. global $cache, $phpEx;
  211. if (($this->xml_files = $cache->get('_gym_config_google_xml')) === false) {
  212. $this->xml_files = array();
  213. // Check the eventual external url config
  214. if (file_exists($this->module_config['google_sources'] . "xml_google_external.$phpEx")) {
  215. include($this->module_config['google_sources'] . "xml_google_external.$phpEx");
  216. // Duplicated keys will be overriden bellow
  217. $this->xml_files = array_merge($this->xml_files, $external_setup);
  218. }
  219. $RegEx = '`^google_([a-z0-9_-]+)\.xml$`i';
  220. $xml_dir = @opendir( $this->module_config['google_sources'] );
  221. while( ($xml_file = @readdir($xml_dir)) !== false ) {
  222. if(preg_match($RegEx, $xml_file, $matches)) {
  223. if (!empty($matches[1])) {
  224. $this->xml_files[$matches[1]] = $this->module_config['google_sources'] . 'google_' . $matches[1] . '.xml';
  225. }
  226. }
  227. }
  228. @closedir($xml_dir);
  229. $cache->put('_gym_config_google_xml', $this->xml_files);
  230. }
  231. return;
  232. }
  233. /**
  234. * parse_item() adds the item info to the output
  235. */
  236. function parse_item($url, $priority = 1.0, $changefreq = 'always', $lastmodtime = 0) {
  237. global $config, $user;
  238. $changefreq = isset($this->gym_master->freq_values[$changefreq]) ? sprintf($this->gym_master->style_config['changefreq_tpl'], $changefreq) : '';
  239. $priority = $priority <= 1 && $priority > 0 ? sprintf($this->gym_master->style_config['priority_tpl'], $priority) : '';
  240. $lastmodtime = $lastmodtime > $config['board_startdate'] ? sprintf($this->gym_master->style_config['lastmod_tpl'], $lastmodtime) : '';
  241. $this->gym_master->output_data['data'] .= sprintf($this->gym_master->style_config['Sitemap_tpl'], $url, $lastmodtime, $changefreq, $priority);
  242. $this->gym_master->output_data['url_sofar']++;
  243. }
  244. }
  245. ?>