PageRenderTime 44ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/Twitter/Extractor.php

https://github.com/ngnpope/twitter-text-php
PHP | 234 lines | 93 code | 18 blank | 123 comment | 5 complexity | 26d0400cef0b1e233d558f21fc459f27 MD5 | raw file
Possible License(s): Apache-2.0
  1. <?php
  2. /**
  3. * @author Mike Cochrane <mikec@mikenz.geek.nz>
  4. * @author Nick Pope <nick@nickpope.me.uk>
  5. * @copyright Copyright © 2010, Mike Cochrane, Nick Pope
  6. * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
  7. * @package Twitter
  8. */
  9. require_once 'Regex.php';
  10. /**
  11. * Twitter Extractor Class
  12. *
  13. * Parses tweets and extracts URLs, usernames, username/list pairs and
  14. * hashtags.
  15. *
  16. * Originally written by {@link http://github.com/mikenz Mike Cochrane}, this
  17. * is based on code by {@link http://github.com/mzsanford Matt Sanford} and
  18. * heavily modified by {@link http://github.com/ngnpope Nick Pope}.
  19. *
  20. * @author Mike Cochrane <mikec@mikenz.geek.nz>
  21. * @author Nick Pope <nick@nickpope.me.uk>
  22. * @copyright Copyright © 2010, Mike Cochrane, Nick Pope
  23. * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
  24. * @package Twitter
  25. */
  class Twitter_Extractor extends Twitter_Regex {

    /**
     * Provides fluent method chaining.
     *
     * @param  string  $tweet  The tweet to be converted.
     *
     * @see  __construct()
     *
     * @return  Twitter_Extractor
     */
    public static function create($tweet) {
      return new self($tweet);
    }

    /**
     * Reads in a tweet to be parsed and extracts elements from it.
     *
     * Extracts various parts of a tweet including URLs, usernames, hashtags...
     * The tweet is handed straight to the parent constructor.
     *
     * @param  string  $tweet  The tweet to extract.
     */
    public function __construct($tweet) {
      parent::__construct($tweet);
    }

    /**
     * Extracts all parts of a tweet and returns an associative array containing
     * the extracted elements.
     *
     * The array has the keys 'hashtags', 'urls', 'mentions', 'replyto',
     * 'hashtags_with_indices', 'urls_with_indices' and 'mentions_with_indices'.
     * Cashtags are not part of this array; use extractCashtags() or
     * extractCashtagsWithIndices() for those.
     *
     * @return  array  The elements in the tweet.
     */
    public function extract() {
      return array(
        'hashtags' => $this->extractHashtags(),
        'urls' => $this->extractURLs(),
        'mentions' => $this->extractMentionedUsernames(),
        'replyto' => $this->extractRepliedUsernames(),
        'hashtags_with_indices' => $this->extractHashtagsWithIndices(),
        'urls_with_indices' => $this->extractURLsWithIndices(),
        'mentions_with_indices' => $this->extractMentionedUsernamesWithIndices(),
      );
    }

    /**
     * Extracts all the hashtags from the tweet.
     *
     * @return  array  The hashtag elements in the tweet (capture group 3 of
     *                 the 'valid_hashtag' pattern).
     */
    public function extractHashtags() {
      preg_match_all(self::$patterns['valid_hashtag'], $this->tweet, $matches);
      return $matches[3];
    }

    /**
     * Extracts all the cashtags from the tweet.
     *
     * @return  array  The cashtag elements in the tweet (capture group 3 of
     *                 the 'valid_cashtag' pattern).
     */
    public function extractCashtags() {
      preg_match_all(self::$patterns['valid_cashtag'], $this->tweet, $matches);
      return $matches[3];
    }

    /**
     * Extracts all the URLs from the tweet.
     *
     * @return  array  The URL elements in the tweet (capture group 2 of the
     *                 'valid_url' pattern). An empty string is returned if
     *                 that group is missing from the match result.
     */
    public function extractURLs() {
      preg_match_all(self::$patterns['valid_url'], $this->tweet, $matches);
      # FIXME: Handle extraction of protocol-less domains and t.co short URLs.
      # https://github.com/twitter/twitter-text-rb/commit/adb6e693b6d003819d615d19219c22d07f114a63
      # https://github.com/twitter/twitter-text-rb/commit/05de2c11a729f93d7680a6d4c12bff6d5ba4c164
      # Groups are: 0 = all, 1 = before, 2 = url, 3 = protocol, 4 = domain,
      # 5 = port, 6 = path, 7 = query.  Only the URL itself is returned.
      return isset($matches[2]) ? $matches[2] : '';
    }

    /**
     * Extract all the usernames from the tweet.
     *
     * A mention is an occurrence of a username anywhere in a tweet.
     *
     * @return  array  The usernames elements in the tweet.
     */
    public function extractMentionedUsernames() {
      preg_match_all(self::$patterns['valid_mentions_or_lists'], $this->tweet, $matches);
      # Groups are: 0 = all, 1 = before, 2 = at, 3 = username, 4 = after,
      # 5 = outer.  Only the last three are needed here.
      list(, , , $username, $after, $outer) = array_pad($matches, 6, '');
      $usernames = array();
      $total = count($username);
      for ($i = 0; $i < $total; $i++) {
        # Skip the candidate if the text following it matches the
        # 'end_mention_match' pattern.
        if (preg_match(self::$patterns['end_mention_match'], $outer[$i])) continue;
        # If $after is not empty, there is an invalid character.
        if (!empty($after[$i])) continue;
        $usernames[] = $username[$i];
      }
      return $usernames;
    }

    /**
     * Extract the username replied to from the tweet.
     *
     * A reply is an occurrence of a username at the beginning of a tweet.
     *
     * @return  string  The username replied to in the tweet (capture group 1
     *                  of the 'valid_reply' pattern), or an empty string if
     *                  the tweet is not a reply.
     */
    public function extractRepliedUsernames() {
      preg_match(self::$patterns['valid_reply'], $this->tweet, $matches);
      return isset($matches[1]) ? $matches[1] : '';
    }

    /**
     * Extracts all the hashtags and the indices they occur at from the tweet.
     *
     * @return  array  One entry per hashtag, each an array with the keys
     *                 'hashtag' and 'indices' (start and end index).
     */
    public function extractHashtagsWithIndices() {
      preg_match_all(self::$patterns['valid_hashtag'], $this->tweet, $matches, PREG_OFFSET_CAPTURE);
      $results = &$matches[3];
      # A tweak of 1 widens the range by one character beyond the captured text.
      self::fixMultiByteIndices($this->tweet, $matches, $results, array('hashtag'), 1);
      return $results;
    }

    /**
     * Extracts all the cashtags and the indices they occur at from the tweet.
     *
     * @return  array  One entry per cashtag, each an array with the keys
     *                 'cashtag' and 'indices' (start and end index).
     */
    public function extractCashtagsWithIndices() {
      preg_match_all(self::$patterns['valid_cashtag'], $this->tweet, $matches, PREG_OFFSET_CAPTURE);
      $results = &$matches[3];
      # A tweak of 1 widens the range by one character beyond the captured text.
      self::fixMultiByteIndices($this->tweet, $matches, $results, array('cashtag'), 1);
      return $results;
    }

    /**
     * Extracts all the URLs and the indices they occur at from the tweet.
     *
     * @return  array  One entry per URL, each an array with the keys 'url'
     *                 and 'indices' (start and end index).
     */
    public function extractURLsWithIndices() {
      preg_match_all(self::$patterns['valid_url'], $this->tweet, $matches, PREG_OFFSET_CAPTURE);
      $results = &$matches[2];
      # No tweak: the range covers exactly the captured URL.
      self::fixMultiByteIndices($this->tweet, $matches, $results, array('url'), 0);
      # FIXME: Handle extraction of protocol-less domains.
      # https://github.com/twitter/twitter-text-rb/commit/adb6e693b6d003819d615d19219c22d07f114a63
      return $results;
    }

    /**
     * Extracts all the usernames and the indices they occur at from the tweet.
     *
     * NOTE(review): unlike extractMentionedUsernames(), no candidates are
     * filtered out here, so the two methods may return a different number of
     * entries -- confirm whether that is intended.
     *
     * @return  array  One entry per username, each an array with the keys
     *                 'screen_name' and 'indices' (start and end index).
     */
    public function extractMentionedUsernamesWithIndices() {
      preg_match_all(self::$patterns['valid_mentions_or_lists'], $this->tweet, $matches, PREG_OFFSET_CAPTURE);
      $results = &$matches[3];
      # A tweak of 1 widens the range by one character beyond the captured text.
      self::fixMultiByteIndices($this->tweet, $matches, $results, array('screen_name'), 1);
      return $results;
    }

    /**
     * Extracts all the usernames, together with the text of capture group 4
     * (reported as the list slug), and the indices they occur at from the
     * tweet.
     *
     * @return  array  One entry per match, each an array with the keys
     *                 'screen_name', 'list_slug' and 'indices' (start and end
     *                 index).
     */
    public function extractMentionedUsernamesOrListsWithIndices() {
      preg_match_all(self::$patterns['valid_mentions_or_lists'], $this->tweet, $matches, PREG_OFFSET_CAPTURE);
      $results = array();
      $total = count($matches[3]);
      for ($i = 0; $i < $total; $i++) {
        # Shape expected by fixMultiByteIndices(): one value per key followed
        # by the byte offset, which ends up in the 'indices' slot.
        $results[] = array($matches[3][$i][0], $matches[4][$i][0], $matches[3][$i][1]);
      }
      self::fixMultiByteIndices($this->tweet, $matches, $results, array('screen_name', 'list_slug'), 1);
      return $results;
    }

    /**
     * Processes an array of matches and fixes up the offsets to support
     * multibyte strings.  This needs to be done due to the state of unicode
     * support in PHP.
     *
     * Each entry of $results is rewritten in place into an associative array
     * keyed by $keys plus a trailing 'indices' entry holding the start and end
     * index in characters.  The start index is the character position just
     * after capture group 1 of the same match; the end index is the start
     * plus the combined character length of the keyed values plus $tweak.
     *
     * Character lengths are always measured as UTF-8 rather than with the
     * process-wide mbstring internal encoding, so the result does not depend
     * on the runtime configuration.  This assumes the tweet is UTF-8 encoded.
     *
     * @param  string  $tweet    The tweet being matched.
     * @param  array   $matches  The matches from the regular expression match
     *                           (obtained with PREG_OFFSET_CAPTURE).
     * @param  array   $results  The extracted results from the matches.
     * @param  array   $keys     The list of array keys to be added.
     * @param  int     $tweak    An amount to adjust the end index by.
     */
    protected static function fixMultiByteIndices(&$tweet, &$matches, &$results, $keys, $tweak = 1) {
      $total = count($results);
      $keysWithIndices = array_merge($keys, array('indices'));
      for ($i = 0; $i < $total; $i++) {
        # Add the array keys:
        $results[$i] = array_combine($keysWithIndices, $results[$i]);
        # Fix for PREG_OFFSET_CAPTURE returning byte offsets:
        $start = mb_strlen(substr($tweet, 0, $matches[1][$i][1]), 'UTF-8');
        $start += mb_strlen($matches[1][$i][0], 'UTF-8');
        # Determine the multibyte length of the matched string:
        $length = 0;
        foreach ($keys as $key) {
          $length += mb_strlen($results[$i][$key], 'UTF-8');
        }
        # Ensure that the indices array contains the start and end index:
        $results[$i]['indices'] = array($start, $start + $length + $tweak);
      }
    }

  }
  215. ################################################################################
  216. # vim:et:ft=php:nowrap:sts=2:sw=2:ts=2