PageRenderTime 47ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/Phergie/Plugin/Url.php

https://github.com/markizano/phergie
PHP | 692 lines | 356 code | 74 blank | 262 comment | 68 complexity | e6ee682877fd3fa8fcec8c3450ba6803 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. <?php
  2. /**
  3. * Phergie
  4. *
  5. * PHP version 5
  6. *
  7. * LICENSE
  8. *
  9. * This source file is subject to the new BSD license that is bundled
  10. * with this package in the file LICENSE.
  11. * It is also available through the world-wide-web at this URL:
  12. * http://phergie.org/license
  13. *
  14. * @category Phergie
  15. * @package Phergie_Plugin_Url
  16. * @author Phergie Development Team <team@phergie.org>
  17. * @copyright 2008-2010 Phergie Development Team (http://phergie.org)
  18. * @license http://phergie.org/license New BSD License
  19. * @link http://pear.phergie.org/package/Phergie_Plugin_Url
  20. */
  21. /**
  22. * Monitors incoming messages for instances of URLs and responds with messages
  23. * containing relevant information about detected URLs.
  24. *
  25. * Has a utility method accessible via
  26. * $this->getPlugin('Url')->getTitle('http://foo..').
  27. *
  28. * @category Phergie
  29. * @package Phergie_Plugin_Url
  30. * @author Phergie Development Team <team@phergie.org>
  31. * @license http://phergie.org/license New BSD License
  32. * @link http://pear.phergie.org/package/Phergie_Plugin_Url
  33. * @uses Phergie_Plugin_Encoding pear.phergie.org
  34. * @uses Phergie_Plugin_Http pear.phergie.org
  35. * @uses Phergie_Plugin_Tld pear.phergie.org
  36. */
  37. class Phergie_Plugin_Url extends Phergie_Plugin_Abstract
  38. {
  /**
   * Outer template applied to the final response text
   *
   * Can use the variables %message% (the formatted link response or
   * responses) and %nick% (the nick of the user who posted the link)
   *
   * @var string
   */
  protected $baseFormat = '%message%';

  /**
   * Template applied to each detected link
   *
   * Can use the variables %nick%, %title% and %link% in it to display
   * page titles and links
   *
   * @var string
   */
  protected $messageFormat = '[ %link% ] %title%';

  /**
   * Flag indicating whether a single response should be sent for a single
   * message containing multiple links
   *
   * @var bool
   */
  protected $mergeLinks = true;

  /**
   * Max length of the fetched URL title
   *
   * NOTE(review): the value is loaded from the "url.title_length" setting
   * but no method in this class reads it.
   *
   * @var int
   */
  protected $titleLength = 40;

  /**
   * Cache object to store cached URLs to prevent spamming, especially with more
   * than one bot on the same channel. Assigned in onLoad().
   *
   * @var Phergie_Plugin_Cache|null
   */
  protected $cache;

  /**
   * Time in seconds to store the cached entries
   *
   * Setting it to 0 or below disables the cache expiration
   *
   * @var int
   */
  protected $expire = 1800;

  /**
   * Number of entries to keep in the cache at one time per channel
   *
   * Setting it to 0 or below disables the cache limit
   *
   * @var int
   */
  protected $limit = 10;

  /**
   * Flag that determines if the plugin will fall back to using an HTTP
   * stream when a URL using SSL is detected and OpenSSL support isn't
   * available in the PHP installation in use
   *
   * @var bool
   */
  protected $sslFallback = true;

  /**
   * Flag that is set to true by the custom error handler if an HTTP error
   * code has been received
   *
   * NOTE(review): not referenced by any method in this class.
   *
   * @var bool
   */
  protected $errorStatus = false;

  /**
   * Message describing the last error
   *
   * NOTE(review): not referenced by any method in this class; presumably a
   * companion to $errorStatus - confirm before relying on it.
   *
   * @var string|null
   */
  protected $errorMessage = null;

  /**
   * Flag indicating whether or not to display error messages as the title
   * if a link posted encounters an error
   *
   * NOTE(review): the value is loaded from the "url.show_errors" setting
   * but no method in this class reads it.
   *
   * @var bool
   */
  protected $showErrors = true;

  /**
   * Flag indicating whether to detect schemeless URLS (i.e. "example.com")
   *
   * @var bool
   */
  protected $detectSchemeless = false;

  /**
   * Shortener object, created in onLoad() from the "url.shortener" setting
   *
   * @var Phergie_Plugin_Url_Shorten_Abstract
   */
  protected $shortener;

  /**
   * Array of renderers, keyed by spl_object_hash() of each renderer
   *
   * @var array
   */
  protected $renderers = array();
  /**
   * Checks for dependencies, creates the configured URL shortener and
   * copies the supported "url.*" settings onto this instance.
   *
   * @return void
   */
  public function onLoad()
  {
      $plugins = $this->plugins;
      $plugins->getPlugin('Encoding');
      $plugins->getPlugin('Http');
      $plugins->getPlugin('Tld');
      $plugins->getPlugin('Cache');

      // make the shortener configurable
      $shortener = $this->getConfig('url.shortener', 'Isgd');
      $shortener = "Phergie_Plugin_Url_Shorten_{$shortener}";

      // Validate the class BEFORE instantiating it: a misconfigured name
      // would otherwise be a fatal "class not found" error, and a class of
      // the wrong ancestry would have its constructor run before rejection.
      if (!class_exists($shortener)
          || !is_subclass_of($shortener, 'Phergie_Plugin_Url_Shorten_Abstract')
      ) {
          $this->fail(
              "Declared shortener class {$shortener} is not of proper ancestry"
          );
      }
      $this->shortener = new $shortener($this->plugins->getPlugin('Http'));

      // load config (a bit ugly, but focusing on porting):
      // maps the "url.<key>" setting name to the property that receives it
      $settings = array(
          'detect_schemeless' => 'detectSchemeless',
          'base_format' => 'baseFormat',
          'message_format' => 'messageFormat',
          'merge_links' => 'mergeLinks',
          'title_length' => 'titleLength',
          'show_errors' => 'showErrors',
          'expire' => 'expire',
      );
      foreach ($settings as $config => $local) {
          if (isset($this->config["url.{$config}"])) {
              $this->$local = $this->config["url.{$config}"];
          }
      }

      $this->cache = $plugins->cache;
  }
  /**
   * Event hook for incoming messages (PRIVMSG).
   *
   * All of the work (URL detection, shortening, title lookup and the
   * response) is delegated to handleMsg().
   *
   * @return void
   */
  public function onPrivmsg()
  {
      $this->handleMsg();
  }
  /**
   * Event hook for incoming actions (ACTION).
   *
   * Actions are processed exactly like regular messages: everything is
   * delegated to handleMsg().
   *
   * @return void
   */
  public function onAction()
  {
      $this->handleMsg();
  }
  /**
   * Handles message events and responds with url titles.
   *
   * For every URL found in the event text: registered renderers get the
   * first chance to handle it; otherwise the URL is shortened, checked
   * against the per-channel cache, its title is fetched and a response is
   * built from $messageFormat. The responses are then wrapped in
   * $baseFormat and sent to the event source, either merged into a single
   * message or one message per link depending on $mergeLinks.
   *
   * @return void
   */
  protected function handleMsg()
  {
      $event = $this->getEvent();
      $source = $event->getSource();
      $user = $event->getNick();

      $responses = array();
      foreach ($this->findUrls($event->getArgument(1)) as $parsed) {
          $url = $parsed['glued'];

          // allow out-of-class renderers to handle this URL
          foreach ($this->renderers as $renderer) {
              if ($renderer->renderUrl($parsed) === true) {
                  // renderers should return true if they've fully
                  // rendered the passed URL (they're responsible
                  // for their own output)
                  $this->debug('Handled by renderer: ' . get_class($renderer));
                  continue 2;
              }
          }

          // Convert url
          $shortenedUrl = $this->shortener->shorten($url);
          if (!$shortenedUrl) {
              $this->debug('Invalid Url: Unable to shorten. (' . $url . ')');
              $shortenedUrl = $url;
          }

          // Prevent spamfest
          if ($this->checkUrlCache($url, $shortenedUrl)) {
              $this->debug('Invalid Url: URL is in the cache. (' . $url . ')');
              continue;
          }

          $title = $this->getTitle($url);
          if (!empty($title)) {
              // strtr() substitutes all placeholders in a single pass, so
              // a page title that happens to contain "%link%" or "%nick%"
              // is not expanded a second time.
              $responses[] = strtr(
                  $this->messageFormat,
                  array(
                      '%title%' => $title,
                      '%link%' => $shortenedUrl,
                      '%nick%' => $user,
                  )
              );
          }

          // Update cache
          $this->updateUrlCache($url, $shortenedUrl);
      }

      if (count($responses) == 0) {
          return;
      }

      // With merging enabled all responses share one message; otherwise
      // every response is sent as a message of its own.
      $messages = $this->mergeLinks
          ? array(implode('; ', $responses))
          : $responses;

      foreach ($messages as $message) {
          $this->doPrivmsg(
              $source,
              strtr(
                  $this->baseFormat,
                  array(
                      '%message%' => $message,
                      '%nick%' => $user,
                  )
              )
          );
      }
  }
  /**
   * Detect URLs in a given string.
   *
   * Each candidate matched by the detection pattern is rejected when it is
   * preceded by "@", "/" or "\" (email address or directory path), cannot
   * be parsed, is an invalid IPv4 address, has a TLD unknown to the Tld
   * plugin, needs HTTPS without OpenSSL (unless the SSL fallback is on, in
   * which case the scheme is downgraded to http), or uses a scheme other
   * than http/https.
   *
   * @param string $message the string to detect urls in
   *
   * @return array the array of urls found; each entry holds the parsed URL
   *               components plus a 'glued' key with the rebuilt URL
   */
  public function findUrls($message)
  {
      // The "x" modifier makes the whitespace inside the pattern insignificant.
      $schemePrefix = $this->detectSchemeless ? '' : 'https?://';
      $rejectPrefix = $this->detectSchemeless ? '(?<!http:/|https:/)[@/\\\]' : '';
      $pattern = '#' . $schemePrefix
          . '(?:([0-9]{1,3}(?:\.[0-9]{1,3}){3})(?![^/]) | ('
          . $rejectPrefix
          . ')?(?:(?:[a-z0-9_-]+\.?)+\.[a-z0-9]{1,6}))[^\s]*#xis';

      $found = array();
      if (!preg_match_all($pattern, $message, $matches, PREG_SET_ORDER)) {
          return $found;
      }

      foreach ($matches as $match) {
          $url = trim(rtrim($match[0], ', ].?!;'));
          // Group 1 holds a dotted-quad host; trailing unmatched groups are
          // absent from the match array, hence empty() instead of direct access.
          $isIp = !empty($match[1]);

          // Check to see if the URL was from an email address, is a directory, etc
          if (!empty($match[2])) {
              $this->debug('Invalid Url: URL is either an email or a directory path. (' . $url . ')');
              continue;
          }

          // Parse the given URL
          $parsed = $this->parseUrl($url);
          if (!$parsed) {
              $this->debug('Invalid Url: Could not parse the URL. (' . $url . ')');
              continue;
          }

          if ($isIp) {
              // Check to see if the given IP/Host is valid
              if (!$this->checkValidIP($match[1])) {
                  $this->debug('Invalid Url: ' . $match[1] . ' is not a valid IP address. (' . $url . ')');
                  continue;
              }
          } else {
              // Get the TLD from the host
              $dot = strrpos($parsed['host'], '.');
              if ($dot !== false) {
                  $parsed['tld'] = substr($parsed['host'], ($dot + 1));
              } else {
                  $parsed['tld'] = '';
              }

              // Check to see if the URL has a valid TLD
              if ($this->plugins->tld->getTld($parsed['tld']) === false) {
                  $this->debug('Invalid Url: ' . $parsed['tld'] . ' is not a supported TLD. (' . $url . ')');
                  continue;
              }
          }

          // Check to see if the URL is to a secured site or not and handle it accordingly
          if ($parsed['scheme'] == 'https' && !extension_loaded('openssl')) {
              if (!$this->sslFallback) {
                  $this->debug('Invalid Url: HTTPS is an invalid scheme, OpenSSL isn\'t available. (' . $url . ')');
                  continue;
              }
              $parsed['scheme'] = 'http';
          }

          if ($parsed['scheme'] != 'http' && $parsed['scheme'] != 'https') {
              $this->debug('Invalid Url: ' . $parsed['scheme'] . ' is not a supported scheme. (' . $url . ')');
              continue;
          }

          $glued = $this->glueUrl($parsed);
          $found[] = $parsed + array('glued' => $glued);
      }

      return $found;
  }
  /**
   * Checks a given URL (+shortened) against the cache to verify if they were
   * previously posted on the channel.
   *
   * The lookup is per channel AND per URL: both the checksum of the URL and
   * the checksum of its shortened form must be listed for the event source.
   * When expiration is enabled ($expire > 0) the entry must also have been
   * recorded within the last $expire seconds; when it is disabled
   * ($expire <= 0) being listed is sufficient.
   *
   * @param string $url          The URL to check against
   * @param string $shortenedUrl The shortened URL to check against
   *
   * @return bool true if the URL was already posted on the channel
   */
  protected function checkUrlCache($url, $shortenedUrl)
  {
      $source = $this->getEvent()->getSource();

      /**
       * Transform the URL (+shortened) into a HEX CRC32 checksum to prevent potential problems
       * and minimize the size of the cache for less cache bloat.
       */
      $urlKey = $this->getUrlChecksum($url);
      $shortKey = $this->getUrlChecksum($shortenedUrl);

      // NOTE(review): fetch() is assumed to yield a non-array value when the
      // key is missing or expired - confirm against the Cache plugin. isset()
      // below copes with any non-array value either way.
      $urlCache = $this->cache->fetch('urlCache');
      $shortCache = $this->cache->fetch('shortCache');

      $expire = $this->expire;
      $this->debug("Cache expire: {$expire}");

      // Not listed for this channel: definitely not a repeat.
      if (!isset($urlCache[$source][$urlKey], $shortCache[$source][$shortKey])) {
          return false;
      }

      // Expiration disabled: simply being listed is enough.
      if ($expire <= 0) {
          return true;
      }

      // Entries hold the time() at which the URL was recorded. The stored
      // cache as a whole is refreshed on every update, so individual
      // entries can outlive $expire and must be aged individually.
      $recordedAt = max(
          (int) $urlCache[$source][$urlKey],
          (int) $shortCache[$source][$shortKey]
      );

      return (time() - $recordedAt) <= $expire;
  }
  /**
   * Updates the cache and adds the given URL (+shortened) to the cache. It
   * also handles cleaning the cache of old entries as well.
   *
   * The previously stored entries are loaded first, so that URLs recorded
   * earlier (for this or any other channel) are preserved and the
   * per-channel $limit can actually take effect.
   *
   * @param string $url          The URL to add to the cache
   * @param string $shortenedUrl The shortened to add to the cache
   *
   * @return void
   */
  protected function updateUrlCache($url, $shortenedUrl)
  {
      $source = $this->getEvent()->getSource();
      $time = time();

      /**
       * Transform the URL (+shortened) into a HEX CRC32 checksum to prevent potential problems
       * and minimize the size of the cache for less cache bloat.
       */
      $entries = array(
          'urlCache' => $this->getUrlChecksum($url),
          'shortCache' => $this->getUrlChecksum($shortenedUrl),
      );

      foreach ($entries as $cacheKey => $checksum) {
          // NOTE(review): fetch() is assumed to yield a non-array value when
          // nothing is stored under the key - confirm against the Cache plugin.
          $cache = $this->cache->fetch($cacheKey);
          if (!is_array($cache)) {
              $cache = array();
          }
          if (!isset($cache[$source]) || !is_array($cache[$source])) {
              $cache[$source] = array();
          }

          $cache[$source][$checksum] = $time;

          // Remove the oldest entries that surpass the limit, if enabled.
          // Entries are removed by key: array_shift() would renumber
          // checksums made only of digits, which PHP stores as integer keys.
          if ($this->limit > 0) {
              asort($cache[$source], SORT_NUMERIC);
              while (count($cache[$source]) > $this->limit) {
                  reset($cache[$source]);
                  unset($cache[$source][key($cache[$source])]);
              }
          }

          $this->cache->store($cacheKey, $cache, $this->expire);
      }
  }
  /**
   * Converts a string to its transliterated form via the Encoding plugin
   * and, when a positive maximum length is given, cuts the result down to
   * that length, appending "..." if anything was cut off.
   *
   * @param string $str  String to decode
   * @param int    $trim Maximum string length, optional
   *
   * @return string
   */
  protected function decode($str, $trim = null)
  {
      $text = $this->plugins->encoding->transliterate($str);

      // No (positive) limit requested: return the text untouched.
      if (!($trim > 0)) {
          return $text;
      }

      $ellipsis = '';
      if (strlen($text) > $trim) {
          $ellipsis = '...';
      }

      return substr($text, 0, $trim) . $ellipsis;
  }
  /**
   * Builds a short key for a URL: the URL is reduced to its base form
   * (see glueUrl() with $base = true), URL-decoded, lowercased,
   * transliterated, stripped of whitespace and finally hashed with CRC32.
   *
   * @param string $url url to checksum
   *
   * @return string the hex checksum of the cleaned url
   */
  protected function getUrlChecksum($url)
  {
      $baseUrl = $this->glueUrl($url, true);
      $normalized = strtolower(urldecode($baseUrl));
      $ascii = $this->plugins->encoding->transliterate($normalized);
      $compact = preg_replace('#\s#', '', $ascii);

      return dechex(crc32($compact));
  }
  /**
   * Parses a given URI and processes the output to remove redundant
   * or missing values.
   *
   * Leading spaces, slashes, "@" and backslashes are stripped, "http://" is
   * prepended when the string does not start with a scheme, the scheme is
   * lowercased, and a URL that yields a path but no host has the first
   * path segment promoted to the host.
   *
   * @param string|array $url the url to parse; an array is assumed to be
   *                          already parsed and is returned unchanged
   *
   * @return array|false the url components, or false if the URL is too
   *                     malformed for parse_url() to handle
   */
  protected function parseUrl($url)
  {
      if (is_array($url)) {
          return $url;
      }

      $url = trim(ltrim($url, ' /@\\'));
      if (!preg_match('&^(?:([a-z][-+.a-z0-9]*):)&xis', $url)) {
          $url = 'http://' . $url;
      }

      $parsed = parse_url($url);
      if (!is_array($parsed)) {
          // parse_url() returns false on seriously malformed URLs. Report
          // that instead of writing into the false value, which would hide
          // the failure from callers behind a host-less array.
          return false;
      }

      if (!isset($parsed['scheme'])) {
          $parsed['scheme'] = 'http';
      }
      $parsed['scheme'] = strtolower($parsed['scheme']);

      if (isset($parsed['path']) && !isset($parsed['host'])) {
          $host = $parsed['path'];
          $path = '';
          if (strpos($parsed['path'], '/') !== false) {
              // A slash is present, so explode() always yields two parts.
              list($host, $path) = explode('/', $parsed['path'], 2);
          }
          $parsed['host'] = $host;
          $parsed['path'] = $path;
      }

      return $parsed;
  }
  /**
   * Rebuilds a URI string from its components.
   *
   * In the default mode the result contains scheme, credentials, host,
   * non-default port, path, query and fragment. In base mode the scheme,
   * credentials and fragment are left out, a leading "www." is removed from
   * the host and a bare "/" path is dropped, giving a form in which
   * equivalent URLs compare equal more often.
   *
   * @param string|array $uri  uri to rebuild (string or parsed components)
   * @param bool         $base set to true to only return the base components
   *
   * @return string the rebuilt uri; the input is returned unchanged if it
   *                cannot be turned into an array of components
   */
  protected function glueUrl($uri, $base = false)
  {
      $parts = is_array($uri) ? $uri : $this->parseUrl($uri);
      if (!is_array($parts)) {
          return $uri;
      }

      $glued = '';

      if (!$base) {
          if (!empty($parts['scheme'])) {
              $glued .= $parts['scheme'] . ':';
              // mailto: takes no "//" after the scheme
              if (strtolower($parts['scheme']) != 'mailto') {
                  $glued .= '//';
              }
          }
          if (!empty($parts['user'])) {
              $glued .= $parts['user'];
              if (!empty($parts['pass'])) {
                  $glued .= ':' . $parts['pass'];
              }
              $glued .= '@';
          }
      }

      $host = '';
      if (!empty($parts['host'])) {
          $host = $parts['host'];
          if ($base) {
              $host = trim($host);
              if (substr($host, 0, 4) == 'www.') {
                  $host = substr($host, 4);
              }
          }
      }
      if (!empty($host)) {
          $glued .= $host;
      }

      // The port is only written when it is not the default of the scheme.
      if (!empty($parts['port'])) {
          $isDefaultPort = ($parts['scheme'] == 'http' && $parts['port'] == 80)
              || ($parts['scheme'] == 'https' && $parts['port'] == 443);
          if (!$isDefaultPort) {
              $glued .= ':' . $parts['port'];
          }
      }

      if (!empty($parts['path']) && (!$base || $parts['path'] != '/')) {
          if (substr($parts['path'], 0, 1) == '/') {
              $glued .= $parts['path'];
          } else {
              $glued .= '/' . $parts['path'];
          }
      }

      if (!empty($parts['query'])) {
          $glued .= '?' . $parts['query'];
      }

      if (!$base && !empty($parts['fragment'])) {
          $glued .= '#' . $parts['fragment'];
      }

      return $glued;
  }
  /**
   * Tells whether a string is an IPv4 address in canonical dotted-quad
   * notation, by converting it to its integer form and back and requiring
   * the round trip to reproduce the input exactly.
   *
   * @param string $ip the ip to validate
   *
   * @return bool
   */
  protected function checkValidIP($ip)
  {
      $numeric = ip2long($ip);
      $roundTrip = long2ip($numeric);

      return $roundTrip === $ip;
  }
  /**
   * Returns the title of the given page
   *
   * A HEAD request is issued first (falling back to GET when the server
   * answers 405) to learn the content type. For HTML/XHTML documents the
   * page is fetched and the contents of its <title> element are returned
   * with runs of whitespace collapsed; for any other content type the
   * content type itself is returned. When no title can be determined the
   * result is the HTTP status text for error responses and "No Title"
   * otherwise.
   *
   * @param string $url url to the page
   *
   * @return string title
   */
  public function getTitle($url)
  {
      $http = $this->plugins->getPlugin('Http');
      $options = array(
          'timeout' => 3.5,
          'user_agent' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080201 Firefox/2.0.0.12'
      );

      $response = $http->head($url, array(), $options);
      $fetched = false;
      if ($response->getCode() == 405) { // [Head] request method not allowed
          $response = $http->get($url, array(), $options);
          $fetched = true;
      }

      // Always defined, even when an HTML page carries no <title> element.
      $title = '';

      $header = $response->getHeaders('Content-Type');
      $contentType = is_string($header) ? $header : '';

      // The "+" of "xhtml+xml" must be escaped to be matched literally, and
      // media types are case-insensitive.
      $isHtml = preg_match(
          '#^(text/x?html|application/xhtml\+xml)(?:;.*)?$#i',
          $contentType
      );

      if (!$isHtml) {
          $title = $contentType;
      } else {
          // Reuse the body when the 405 fallback above already fetched it.
          if (!$fetched) {
              $response = $http->get($url, array(), $options);
          }
          $content = $response->getContent();
          if (preg_match('#<title[^>]*>(.*?)</title>#is', $content, $match)) {
              $title = preg_replace('/[\s\v]+/', ' ', trim($match[1]));
          }
      }

      $encoding = $this->plugins->getPlugin('Encoding');
      $title = $encoding->decodeEntities($title);

      if (empty($title)) {
          if ($response->isError()) {
              $title = $response->getCodeAsString();
          } else {
              $title = 'No Title';
          }
      }

      return $title;
  }
  /**
   * Writes a diagnostic line, prefixed with "(DEBUG:Url)", to standard
   * output.
   *
   * @param string $msg the message to output
   *
   * @return void
   */
  protected function debug($msg)
  {
      echo "(DEBUG:Url) {$msg}\n";
  }
  /**
   * Registers a renderer that gets the chance to handle detected URLs
   * before the default title lookup. Renderers are keyed by object hash,
   * so registering the same object twice keeps a single entry.
   *
   * @param object $obj the renderer to add
   *
   * @return void
   */
  public function registerRenderer($obj)
  {
      $hash = spl_object_hash($obj);
      $this->renderers[$hash] = $obj;
  }
  /**
   * Rewrites outgoing events before they are dispatched: when the
   * "url.shortenOutput" setting is enabled, every URL detected in the text
   * of a queued PRIVMSG, ACTION or NOTICE event is replaced by its
   * shortened form.
   *
   * @return void
   */
  public function preDispatch()
  {
      if (!$this->getConfig('url.shortenOutput', false)) {
          return;
      }

      // Event types that carry text in argument 1
      $textTypes = array(
          Phergie_Event_Request::TYPE_PRIVMSG,
          Phergie_Event_Request::TYPE_ACTION,
          Phergie_Event_Request::TYPE_NOTICE,
      );

      $events = $this->events->getEvents();
      foreach ($events as $event) {
          if (!in_array($event->getType(), $textTypes)) {
              continue;
          }

          $text = $event->getArgument(1);
          foreach ($this->findUrls($text) as $parsed) {
              $url = $parsed['glued'];

              // shorten url
              $shortenedUrl = $this->shortener->shorten($url);
              if ($shortenedUrl) {
                  $text = str_replace($url, $shortenedUrl, $text);
              } else {
                  // Nothing to substitute: the URL stays as it is.
                  $this->debug(
                      'Invalid Url: Unable to shorten. (' . $url . ')'
                  );
              }
          }
          $event->setArgument(1, $text);
      }
  }
  618. }