PageRenderTime 42ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/common/Extractor.php

http://netputweets.googlecode.com/
PHP | 101 lines | 80 code | 4 blank | 17 comment | 0 complexity | 1c45e2589798860bab74bc0e5d01d7fc MD5 | raw file
  1. <?php
  2. /*
  3. From http://github.com/mzsanford/twitter-text-php/blob/master/src/Twitter/Extractor.php
  4. This file is
  5. Copyright 2010 Mike Cochrane
  6. Licensed under the Apache License, Version 2.0 (the "License"); you may not
  7. use this file except in compliance with the License. You may obtain a copy of
  8. the License at
  9. http://www.apache.org/licenses/LICENSE-2.0
  10. Unless required by applicable law or agreed to in writing, software
  11. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  12. WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  13. License for the specific language governing permissions and limitations under
  14. the License.
  15. */
  16. class Twitter_Extractor {
  17. public function extractAll($tweet) {
  18. return array(
  19. 'hashtags' => $this->extractHashtags($tweet),
  20. 'urls' => $this->extractURLS($tweet),
  21. 'mentions' => $this->extractMentionedScreennames($tweet),
  22. 'replyto' => $this->extractReplyScreenname($tweet)
  23. );
  24. }
  25. public function extractHashtags($tweet) {
  26. preg_match_all('$(^|[^0-9A-Z&/]+)([#?]+)([0-9A-Z_]*[A-Z_]+[a-z0-9_ќР-жи-іј-џ]*)$i', $tweet, $matches);
  27. return $matches[3];
  28. }
  29. public function extractURLS($tweet) {
  30. $URL_VALID_PRECEEDING_CHARS = "(?:[^/\"':!=]|^|\\:)";
  31. $URL_VALID_DOMAIN = "(?:[\\.-]|[^\\p{P}\\s])+\\.[a-z]{2,}(?::[0-9]+)?";
  32. $URL_VALID_URL_PATH_CHARS = "[a-z0-9!\\*'\\(\\);:&=\\+\\$/%#\\[\\]\\-_\\.,~@]";
  33. // Valid end-of-path chracters (so /foo. does not gobble the period).
  34. // 1. Allow ) for Wikipedia URLs.
  35. // 2. Allow =&# for empty URL parameters and other URL-join artifacts
  36. $URL_VALID_URL_PATH_ENDING_CHARS = "[a-z0-9\\)=#/]";
  37. $URL_VALID_URL_QUERY_CHARS = "[a-z0-9!\\*'\\(\\);:&=\\+\\$/%#\\[\\]\\-_\\.,~]";
  38. $URL_VALID_URL_QUERY_ENDING_CHARS = "[a-z0-9_&=#]";
  39. $VALID_URL_PATTERN_STRING = '$(' . // $1 total match
  40. "(" . $URL_VALID_PRECEEDING_CHARS . ")" . // $2 Preceeding chracter
  41. "(" . // $3 URL
  42. "(https?://|www\\.)" . // $4 Protocol or beginning
  43. "(" . $URL_VALID_DOMAIN . ")" . // $5 Domain(s) and optional port number
  44. "(/" . $URL_VALID_URL_PATH_CHARS . "*" . // $6 URL Path
  45. $URL_VALID_URL_PATH_ENDING_CHARS . "?)?" .
  46. "(\\?" . $URL_VALID_URL_QUERY_CHARS . "*" . // $7 Query String
  47. $URL_VALID_URL_QUERY_ENDING_CHARS . ")?" .
  48. ")" .
  49. ')$i';
  50. preg_match_all($VALID_URL_PATTERN_STRING, $tweet, $matches);
  51. return $matches[3];
  52. }
  53. /**
  54. * Extract @username references from Tweet text. A mention is an occurance of @username anywhere in a Tweet.
  55. *
  56. * @param String text of the tweet from which to extract usernames
  57. * @return Array of usernames referenced (without the leading @ sign)
  58. */
  59. public function extractMentionedScreennames($tweet) {
  60. preg_match_all('/(^|[^a-zA-Z0-9_])[@?]([a-zA-Z0-9_]{1,20})(?=(.|$))/', $tweet, $matches);
  61. $usernames = array();
  62. for ($i = 0; $i < sizeof($matches[2]); $i += 1) {
  63. if (! preg_match('/^[@?]/', $matches[3][$i])) {
  64. array_push($usernames, $matches[2][$i]);
  65. }
  66. }
  67. return $usernames;
  68. }
  69. public function extractReplyScreenname($tweet) {
  70. /* Single byte whitespace characters */
  71. $whitespace = '[';
  72. $whitespace .= "\x09-\x0D"; # 0x0009-0x000D White_Space # Cc [5] <control-0009>..<control-000D>
  73. $whitespace .= "\x20"; # 0x0020 White_Space # Zs SPACE
  74. $whitespace .= "\x85"; # 0x0085 White_Space # Cc <control-0085>
  75. $whitespace .= "\xA0"; # 0x00A0 White_Space # Zs NO-BREAK SPACE
  76. $whitespace .= "]|";
  77. /* Mutli byte whitespace characters */
  78. $whitespace .= "\xe1\x9a\x80|"; # 0x1680White_Space # Zs OGHAM SPACE MARK
  79. $whitespace .= "\xe1\xa0\x8e|"; # 0x180E White_Space # Zs MONGOLIAN VOWEL SEPARATOR
  80. $whitespace .= "\xe2\x80[\x80-\x8a,\xa8,\xa9,\xaf\xdf]|"; # 0x2000-0x200A White_Space # Zs [11] EN QUAD..HAIR SPACE
  81. # 0x2028 White_Space # Zl LINE SEPARATOR
  82. # 0x2029 White_Space # Zp PARAGRAPH SEPARATOR
  83. # 0x202F White_Space # Zs NARROW NO-BREAK SPACE
  84. # 0x205F White_Space # Zs MEDIUM MATHEMATICAL SPACE
  85. $whitespace .= "\xe3\x80\x80"; #0x3000 White_Space # Zs IDEOGRAPHIC SPACE
  86. preg_match('/^(' . $whitespace . ')*[@?]([a-zA-Z0-9_]{1,20})/', $tweet, $matches);
  87. return isset($matches[2]) ? $matches[2] : '';
  88. }
  89. }