Extractor.php - The Twitter_Extractor class extracts various entities (hashtags, URLs, @mentions and the reply-to screen name) from tweet text.

/common/Extractor.php

http://netputweets.googlecode.com/ · PHP · 101 lines · 80 code · 4 blank · 17 comment · 0 complexity · 1c45e2589798860bab74bc0e5d01d7fc MD5 · raw file


<?php

/*

From http://github.com/mzsanford/twitter-text-php/blob/master/src/Twitter/Extractor.php

This file is

Copyright 2010 Mike Cochrane

 

Licensed under the Apache License, Version 2.0 (the "License"); you may not

use this file except in compliance with the License. You may obtain a copy of

the License at

 

http://www.apache.org/licenses/LICENSE-2.0

 

Unless required by applicable law or agreed to in writing, software

distributed under the License is distributed on an "AS IS" BASIS, WITHOUT

WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the

License for the specific language governing permissions and limitations under

the License. 

*/



/**
 * Extracts hashtags, URLs, @mentions and the reply-to screen name from the
 * text of a tweet.
 *
 * All patterns run in PCRE byte mode (no "u" modifier), so they never fail on
 * input that is not valid UTF-8. Non-ASCII characters the patterns care about
 * are spelled out as their UTF-8 byte sequences.
 */
class Twitter_Extractor {

    /**
     * PCRE fragment matching one at-sign: ASCII "@" or the UTF-8 encoding of
     * FULLWIDTH COMMERCIAL AT (U+FF20).
     */
    const AT_SIGN = '(?:@|\xEF\xBC\xA0)';

    /**
     * PCRE fragment matching one hash sign: ASCII "#" or the UTF-8 encoding
     * of FULLWIDTH NUMBER SIGN (U+FF03).
     */
    const HASH_SIGN = '(?:#|\xEF\xBC\x83)';

    /**
     * Run every extractor on the tweet.
     *
     * @param  String text of the tweet
     * @return Array with keys 'hashtags', 'urls', 'mentions' (each an array
     *         of strings) and 'replyto' (a string, '' when there is none)
     */
    public function extractAll($tweet) {
        return array(
            'hashtags' => $this->extractHashtags($tweet),
            'urls'     => $this->extractURLS($tweet),
            'mentions' => $this->extractMentionedScreennames($tweet),
            'replyto'  => $this->extractReplyScreenname($tweet),
        );
    }

    /**
     * Extract #hashtag references from Tweet text.
     *
     * A hashtag is one or more hash signs that are at the start of the text
     * or preceded by something other than an ASCII letter, digit, "&" or "/",
     * followed by a tag that contains at least one ASCII letter or underscore.
     *
     * @param  String text of the tweet from which to extract hashtags
     * @return Array of hashtags (without the leading hash sign)
     */
    public function extractHashtags($tweet) {
        // UTF-8 encodings of U+00C0-U+00D6, U+00D8-U+00F6 and U+00F8-U+00FF.
        // NOTE(review): the character range in the previous pattern was
        // encoding-damaged; these ranges are taken from upstream twitter-text's
        // Latin accent set - confirm against the intended upstream revision.
        $latinAccents = '\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]';

        $pattern = '~(^|[^0-9A-Za-z&/]+)'                     // $1 preceding text
                 . '(' . self::HASH_SIGN . '+)'               // $2 hash sign(s)
                 . '([0-9A-Za-z_]*[A-Za-z_]+'                 // $3 tag: needs a letter or "_"
                 . '(?:[A-Za-z0-9_]|' . $latinAccents . ')*)~';

        $matches = array();
        preg_match_all($pattern, $tweet, $matches);

        // $matches[3] is missing only if the match itself failed.
        return isset($matches[3]) ? $matches[3] : array();
    }

    /**
     * Extract URLs from Tweet text.
     *
     * A URL must start with "http://", "https://" or "www." and must not be
     * directly preceded by one of / " ' ! = (a preceding ":" is allowed).
     *
     * @param  String text of the tweet from which to extract URLs
     * @return Array of URLs exactly as they appear in the text
     */
    public function extractURLS($tweet) {
        $URL_VALID_PRECEEDING_CHARS = "(?:[^/\"':!=]|^|\\:)";
        $URL_VALID_DOMAIN = "(?:[\\.-]|[^\\p{P}\\s])+\\.[a-z]{2,}(?::[0-9]+)?";
        $URL_VALID_URL_PATH_CHARS = "[a-z0-9!\\*'\\(\\);:&=\\+\\$/%#\\[\\]\\-_\\.,~@]";
        // Valid end-of-path characters (so /foo. does not gobble the period).
        //   1. Allow ) for Wikipedia URLs.
        //   2. Allow =# and / for empty URL parameters and other URL-join artifacts
        $URL_VALID_URL_PATH_ENDING_CHARS = "[a-z0-9\\)=#/]";
        $URL_VALID_URL_QUERY_CHARS = "[a-z0-9!\\*'\\(\\);:&=\\+\\$/%#\\[\\]\\-_\\.,~]";
        $URL_VALID_URL_QUERY_ENDING_CHARS = "[a-z0-9_&=#]";

        // "$" is the regex delimiter here; the "\$" inside the character
        // classes above is an escaped literal dollar sign.
        $pattern = '$(' .                                               //  $1 total match
        "(" . $URL_VALID_PRECEEDING_CHARS . ")" .                       //  $2 Preceding character
        "(" .                                                           //  $3 URL
          "(https?://|www\\.)" .                                        //  $4 Protocol or beginning
          "(" . $URL_VALID_DOMAIN . ")" .                               //  $5 Domain(s) and optional port number
          "(/" . $URL_VALID_URL_PATH_CHARS . "*" .                      //  $6 URL Path
                 $URL_VALID_URL_PATH_ENDING_CHARS . "?)?" .
          "(\\?" . $URL_VALID_URL_QUERY_CHARS . "*" .                   //  $7 Query String
                  $URL_VALID_URL_QUERY_ENDING_CHARS . ")?" .
        ")" .
        ')$i';

        $matches = array();
        preg_match_all($pattern, $tweet, $matches);

        // $matches[3] is missing only if the match itself failed.
        return isset($matches[3]) ? $matches[3] : array();
    }

    /**
     * Extract @username references from Tweet text. A mention is an occurrence of @username anywhere in a Tweet.
     *
     * The at-sign must be at the start of the text or preceded by a character
     * other than an ASCII letter, digit or underscore. A name that is
     * immediately followed by another at-sign is not reported.
     *
     * @param  String text of the tweet from which to extract usernames
     * @return Array of usernames referenced (without the leading @ sign)
     */
    public function extractMentionedScreennames($tweet) {
        // The "s" modifier lets "." match a newline. Without it the lookahead
        // fails in front of a mid-text newline and the engine backtracks,
        // dropping the last character of the name.
        $pattern = '/(^|[^a-zA-Z0-9_])' . self::AT_SIGN               // $1 preceding character
                 . '([a-zA-Z0-9_]{1,20})'                             // $2 screen name
                 . '(?=(' . self::AT_SIGN . '|.|$))/s';               // $3 what follows the name

        $matches = array();
        preg_match_all($pattern, $tweet, $matches);
        if (!isset($matches[2], $matches[3])) {
            return array();
        }

        $usernames = array();
        foreach ($matches[2] as $i => $name) {
            $next = $matches[3][$i];
            // Skip names directly followed by an ASCII or fullwidth at-sign.
            if ($next !== '@' && $next !== "\xEF\xBC\xA0") {
                $usernames[] = $name;
            }
        }
        return $usernames;
    }

    /**
     * Extract the screen name a Tweet replies to: an @username at the very
     * start of the text, optionally preceded by whitespace.
     *
     * @param  String text of the tweet
     * @return String the username (without the leading @ sign), or '' if the
     *         text does not start with a mention
     */
    public function extractReplyScreenname($tweet) {
        $whitespace = '(?:'
            // U+0009-U+000D and U+0020, plus the lone bytes 0x85 and 0xA0
            // (kept so single-byte encoded input is still recognised).
            . '[\x09-\x0D\x20\x85\xA0]'
            . '|\xC2[\x85\xA0]'                     // U+0085 <control-0085>, U+00A0 NO-BREAK SPACE
            . '|\xE1\x9A\x80'                       // U+1680 OGHAM SPACE MARK
            . '|\xE1\xA0\x8E'                       // U+180E MONGOLIAN VOWEL SEPARATOR
            . '|\xE2\x80[\x80-\x8A\xA8\xA9\xAF]'    // U+2000-U+200A EN QUAD..HAIR SPACE,
                                                    // U+2028 LINE SEPARATOR,
                                                    // U+2029 PARAGRAPH SEPARATOR,
                                                    // U+202F NARROW NO-BREAK SPACE
            . '|\xE2\x81\x9F'                       // U+205F MEDIUM MATHEMATICAL SPACE
            . '|\xE3\x80\x80'                       // U+3000 IDEOGRAPHIC SPACE
            . ')';

        $pattern = '/^' . $whitespace . '*' . self::AT_SIGN . '([a-zA-Z0-9_]{1,20})/';

        $matches = array();
        preg_match($pattern, $tweet, $matches);
        return isset($matches[1]) ? $matches[1] : '';
    }
}