Extractor.php | searchcode

/lib/Twitter/Extractor.php

https://github.com/ngnpope/twitter-text-php
PHP | 234 lines | 93 code | 18 blank | 123 comment | 5 complexity | 26d0400cef0b1e233d558f21fc459f27 MD5 | raw file
Possible License(s): Apache-2.0

<?php
/**
 * @author     Mike Cochrane <mikec@mikenz.geek.nz>
 * @author     Nick Pope <nick@nickpope.me.uk>
 * @copyright  Copyright © 2010, Mike Cochrane, Nick Pope
 * @license    http://www.apache.org/licenses/LICENSE-2.0  Apache License v2.0
 * @package    Twitter
 */

require_once 'Regex.php';

/**
 * Twitter Extractor Class
 *
 * Parses tweets and extracts URLs, usernames, username/list pairs,
 * hashtags and cashtags.
 *
 * Originally written by {@link http://github.com/mikenz Mike Cochrane}, this
 * is based on code by {@link http://github.com/mzsanford Matt Sanford} and
 * heavily modified by {@link http://github.com/ngnpope Nick Pope}.
 *
 * @author     Mike Cochrane <mikec@mikenz.geek.nz>
 * @author     Nick Pope <nick@nickpope.me.uk>
 * @copyright  Copyright © 2010, Mike Cochrane, Nick Pope
 * @license    http://www.apache.org/licenses/LICENSE-2.0  Apache License v2.0
 * @package    Twitter
 */
class Twitter_Extractor extends Twitter_Regex {

  /**
   * Provides fluent method chaining.
   *
   * @param  string  $tweet  The tweet to be parsed.
   *
   * @see  __construct()
   *
   * @return  Twitter_Extractor
   */
  public static function create($tweet) {
    return new self($tweet);
  }

  /**
   * Reads in a tweet to be parsed and extracts elements from it.
   *
   * Extracts various parts of a tweet including URLs, usernames, hashtags...
   * The tweet is handed straight to the parent constructor; the extraction
   * methods below read it back through $this->tweet.
   *
   * @param  string  $tweet  The tweet to extract.
   */
  public function __construct($tweet) {
    parent::__construct($tweet);
  }

  /**
   * Extracts all parts of a tweet and returns an associative array containing
   * the extracted elements.
   *
   * The array has the keys 'hashtags', 'urls', 'mentions', 'replyto',
   * 'hashtags_with_indices', 'urls_with_indices' and
   * 'mentions_with_indices'.  Cashtags are not part of this aggregate; call
   * extractCashtags() or extractCashtagsWithIndices() directly for those.
   *
   * @return  array  The elements in the tweet.
   */
  public function extract() {
    return array(
      'hashtags' => $this->extractHashtags(),
      'urls'     => $this->extractURLs(),
      'mentions' => $this->extractMentionedUsernames(),
      'replyto'  => $this->extractRepliedUsernames(),
      'hashtags_with_indices' => $this->extractHashtagsWithIndices(),
      'urls_with_indices'     => $this->extractURLsWithIndices(),
      'mentions_with_indices' => $this->extractMentionedUsernamesWithIndices(),
    );
  }

  /**
   * Extracts all the hashtags from the tweet.
   *
   * Returns the third capture group of the 'valid_hashtag' pattern.
   *
   * @return  array  The hashtag elements in the tweet.
   */
  public function extractHashtags() {
    preg_match_all(self::$patterns['valid_hashtag'], $this->tweet, $matches);
    return $matches[3];
  }

  /**
   * Extracts all the cashtags from the tweet.
   *
   * Returns the third capture group of the 'valid_cashtag' pattern.
   *
   * @return  array  The cashtag elements in the tweet.
   */
  public function extractCashtags() {
    preg_match_all(self::$patterns['valid_cashtag'], $this->tweet, $matches);
    return $matches[3];
  }

  /**
   * Extracts all the URLs from the tweet.
   *
   * Returns the second capture group of the 'valid_url' pattern.
   *
   * @return  array  The URL elements in the tweet.
   */
  public function extractURLs() {
    preg_match_all(self::$patterns['valid_url'], $this->tweet, $matches);
    # FIXME: Handle extraction of protocol-less domains and t.co short URLs.
    # https://github.com/twitter/twitter-text-rb/commit/adb6e693b6d003819d615d19219c22d07f114a63
    # https://github.com/twitter/twitter-text-rb/commit/05de2c11a729f93d7680a6d4c12bff6d5ba4c164
    # Only the URL group is needed; keep the historical '' fallback for a
    # pattern that exposes fewer groups.
    return isset($matches[2]) ? $matches[2] : '';
  }

  /**
   * Extract all the usernames from the tweet.
   *
   * A mention is an occurrence of a username anywhere in a tweet.  A match of
   * 'valid_mentions_or_lists' is skipped when its sixth group matches
   * 'end_mention_match' or when its fourth group is non-empty.
   *
   * @return  array  The usernames elements in the tweet.
   */
  public function extractMentionedUsernames() {
    preg_match_all(self::$patterns['valid_mentions_or_lists'], $this->tweet, $matches);
    # Groups 0-2 (whole match, prefix, at-sign) are not needed here.
    list(, , , $username, $after, $outer) = array_pad($matches, 6, '');
    $usernames = array();
    $total = count($username);
    for ($i = 0; $i < $total; $i++) {
      # Skip a username that is followed by text which invalidates the mention.
      if (preg_match(self::$patterns['end_mention_match'], $outer[$i])) continue;
      # If $after is not empty, there is an invalid character.
      if (!empty($after[$i])) continue;
      $usernames[] = $username[$i];
    }
    return $usernames;
  }

  /**
   * Extract the username replied to from the tweet.
   *
   * A reply is an occurrence of a username at the beginning of a tweet.
   *
   * @return  string  The first capture group of 'valid_reply', or an empty
   *                  string when the tweet does not match.
   */
  public function extractRepliedUsernames() {
    preg_match(self::$patterns['valid_reply'], $this->tweet, $matches);
    return isset($matches[1]) ? $matches[1] : '';
  }

  /**
   * Extracts all the hashtags and the indices they occur at from the tweet.
   *
   * @return  array  One array per hashtag with the keys 'hashtag' and
   *                 'indices' (start and end character index).
   */
  public function extractHashtagsWithIndices() {
    preg_match_all(self::$patterns['valid_hashtag'], $this->tweet, $matches, PREG_OFFSET_CAPTURE);
    $results = &$matches[3];
    # Tweak of 1 widens the end index by one character beyond the tag text.
    self::fixMultiByteIndices($this->tweet, $matches, $results, array('hashtag'), 1);
    return $results;
  }

  /**
   * Extracts all the cashtags and the indices they occur at from the tweet.
   *
   * @return  array  One array per cashtag with the keys 'cashtag' and
   *                 'indices' (start and end character index).
   */
  public function extractCashtagsWithIndices() {
    preg_match_all(self::$patterns['valid_cashtag'], $this->tweet, $matches, PREG_OFFSET_CAPTURE);
    $results = &$matches[3];
    # Tweak of 1 widens the end index by one character beyond the tag text.
    self::fixMultiByteIndices($this->tweet, $matches, $results, array('cashtag'), 1);
    return $results;
  }

  /**
   * Extracts all the URLs and the indices they occur at from the tweet.
   *
   * @return  array  One array per URL with the keys 'url' and 'indices'
   *                 (start and end character index).
   */
  public function extractURLsWithIndices() {
    preg_match_all(self::$patterns['valid_url'], $this->tweet, $matches, PREG_OFFSET_CAPTURE);
    $results = &$matches[2];
    # No tweak: the end index is the start plus the length of the URL itself.
    self::fixMultiByteIndices($this->tweet, $matches, $results, array('url'), 0);
    # FIXME: Handle extraction of protocol-less domains.
    # https://github.com/twitter/twitter-text-rb/commit/adb6e693b6d003819d615d19219c22d07f114a63
    return $results;
  }

  /**
   * Extracts all the usernames and the indices they occur at from the tweet.
   *
   * NOTE(review): unlike extractMentionedUsernames(), no matches are filtered
   * out here; every match of 'valid_mentions_or_lists' is returned.
   *
   * @return  array  One array per username with the keys 'screen_name' and
   *                 'indices' (start and end character index).
   */
  public function extractMentionedUsernamesWithIndices() {
    preg_match_all(self::$patterns['valid_mentions_or_lists'], $this->tweet, $matches, PREG_OFFSET_CAPTURE);
    $results = &$matches[3];
    # Tweak of 1 widens the end index by one character beyond the name.
    self::fixMultiByteIndices($this->tweet, $matches, $results, array('screen_name'), 1);
    return $results;
  }

  /**
   * Extracts all the usernames, together with the list slug captured next to
   * each one, and the indices they occur at from the tweet.
   *
   * @return  array  One array per match with the keys 'screen_name',
   *                 'list_slug' and 'indices' (start and end character
   *                 index).
   */
  public function extractMentionedUsernamesOrListsWithIndices() {
    preg_match_all(self::$patterns['valid_mentions_or_lists'], $this->tweet, $matches, PREG_OFFSET_CAPTURE);
    $results = array();
    $total = count($matches[3]);
    for ($i = 0; $i < $total; $i++) {
      # Positional triple: screen name text, list slug text, byte offset of
      # the screen name.  The keys are attached by fixMultiByteIndices().
      $results[] = array($matches[3][$i][0], $matches[4][$i][0], $matches[3][$i][1]);
    }
    self::fixMultiByteIndices($this->tweet, $matches, $results, array('screen_name', 'list_slug'), 1);
    return $results;
  }

  /**
   * Processes an array of matches and fixes up the offsets to support
   * multibyte strings.  This needs to be done due to the state of unicode
   * support in PHP.
   *
   * Each entry of $results is rewritten in place: its positional values are
   * keyed by $keys followed by 'indices', and 'indices' is then replaced by
   * array($start, $end) measured in characters.  $tweet and $matches are
   * only read.
   *
   * Lengths are always measured as UTF-8 rather than with the process-wide
   * mb_internal_encoding() setting, so the indices do not change with global
   * configuration.
   *
   * @param  string  $tweet    The tweet being matched.
   * @param  array   $matches  The matches from the regular expression match
   *                           (captured with PREG_OFFSET_CAPTURE).
   * @param  array   $results  The extracted results from the matches.
   * @param  array   $keys     The list of array keys to be added.
   * @param  int     $tweak    An amount to adjust the end index by.
   */
  protected static function fixMultiByteIndices(&$tweet, &$matches, &$results, $keys, $tweak = 1) {
    $encoding = 'UTF-8';
    $labels = array_merge($keys, array('indices'));
    $total = count($results);
    for ($i = 0; $i < $total; $i++) {
      # Add the array keys:
      $results[$i] = array_combine($labels, $results[$i]);
      # Fix for PREG_OFFSET_CAPTURE returning byte offsets: count the
      # characters before the first group, then skip over that group.
      $start = mb_strlen(substr($tweet, 0, $matches[1][$i][1]), $encoding);
      $start += mb_strlen($matches[1][$i][0], $encoding);
      # Determine the multibyte length of the matched string:
      $length = 0;
      foreach ($keys as $key) {
        $length += mb_strlen($results[$i][$key], $encoding);
      }
      # Ensure that the indices array contains the start and end index:
      $results[$i]['indices'] = array($start, $start + $length + $tweak);
    }
  }

}

################################################################################
# vim:et:ft=php:nowrap:sts=2:sw=2:ts=2