
/Lib/site-packages/nltk/tokenize/casual.py

https://gitlab.com/pierreEffiScience/TwitterClustering
# coding: utf-8
#
# Natural Language Toolkit: Twitter Tokenizer
#
# Copyright (C) 2001-2016 NLTK Project
# Author: Christopher Potts <cgpotts@stanford.edu>
#         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
#         Pierpaolo Pantone <> (modifications)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#

"""
Twitter-aware tokenizer, designed to be flexible and easy to adapt to new
domains and tasks. The basic logic is this:

1. The tuple REGEXPS defines a list of regular expression strings.

2. The REGEXPS strings are put, in order, into a compiled regular
   expression object called WORD_RE.

3. The tokenization is done by WORD_RE.findall(s), where s is the
   user-supplied string, inside the tokenize() method of the class
   TweetTokenizer.

4. When instantiating TweetTokenizer objects, there are three options:
   preserve_case, reduce_len and strip_handles. preserve_case defaults
   to True; if it is set to False, the tokenizer downcases everything
   except emoticons.
"""
######################################################################

from __future__ import unicode_literals
import re

from nltk.compat import htmlentitydefs, int2byte, unichr

######################################################################
# The following strings are components in the regular expression
# that is used for tokenizing. It's important that phone_number
# appears early in the final regex (since it can contain whitespace).
# It also could matter that tags comes after emoticons, due to the
# possibility of having text like
#
#     <:| and some text >:)
#
# Most importantly, the final element should always be last, since it
# does a last ditch whitespace-based tokenization of whatever is left.

# ToDo: Update with http://en.wikipedia.org/wiki/List_of_emoticons ?

# This particular element is used in a couple ways, so we define it
# with a name:
EMOTICONS = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      <3                         # heart
    )"""
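# Illustrative check (not part of the original module): compiled on its own
# with re.VERBOSE, this pattern is expected to pick out emoticon tokens, e.g.
#     re.compile(EMOTICONS, re.VERBOSE).findall("great talk :-) <3")
# should yield roughly [':-)', '<3'].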
# URL pattern due to John Gruber, modified by Tom Winzig. See
# https://gist.github.com/winzig/8894715

URLS = r"""                     # Capture 1: entire matched URL
  (?:
    https?:                             # URL protocol and colon
    (?:
      /{1,3}                            # 1-3 slashes
      |                                 #   or
      [a-z0-9%]                         # Single letter or digit or '%'
                                        # (Trying not to match e.g. "URI::Escape")
    )
    |                                   #   or
    # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:                                   # One or more:
    [^\s()<>{}\[\]]+                    # Run of non-space, non-()<>{}[]
    |                                   #   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\)  # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)                         # balanced parens, non-recursive: (...)
  )+
  (?:                                   # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\)  # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)                         # balanced parens, non-recursive: (...)
    |                                   #   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]      # not a space or one of these punct chars
  )
  |                                     # OR, the following to match naked domains:
  (?:
    (?<!@)                              # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)                               # not succeeded by a @,
                                        # avoid matching "foo.na" in "foo.na@example.com"
  )
"""
# The components of the tokenizer:
REGEXPS = (
    URLS,
    # Phone numbers:
    r"""
    (?:
      (?:            # (international)
        \+?[01]
        [\-\s.]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [\-\s.\)]*
      )?
      \d{3}          # exchange
      [\-\s.]*
      \d{4}          # base
    )"""
    ,
    # ASCII Emoticons
    EMOTICONS
    ,
    # HTML tags:
    r"""<[^>\s]+>"""
    ,
    # ASCII Arrows
    r"""[\-]+>|<[\-]+"""
    ,
    # Twitter username:
    r"""(?:@[\w_]+)"""
    ,
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
    ,
    # email addresses
    r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]"""
    ,
    # Remaining word types:
    r"""
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """
    )
######################################################################
# This is the core tokenizing regex:

WORD_RE = re.compile(r"""(%s)""" % "|".join(REGEXPS), re.VERBOSE | re.I
                     | re.UNICODE)
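# Illustrative check (not part of the original module): WORD_RE is expected to
# keep phone numbers, hashtags and emoticons as single tokens, e.g.
#     WORD_RE.findall("Call 555-123-4567 #nltk :-)")
# should yield roughly ['Call', '555-123-4567', '#nltk', ':-)'].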
# WORD_RE performs poorly on these patterns:
HANG_RE = re.compile(r'([^a-zA-Z0-9])\1{3,}')

# The emoticon string gets its own regex so that we can preserve case for
# them as needed:
EMOTICON_RE = re.compile(EMOTICONS, re.VERBOSE | re.I | re.UNICODE)

# These are for regularizing HTML entities to Unicode:
ENT_RE = re.compile(r'&(#?(x?))([^&;\s]+);')
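# For a numeric reference such as '&#x2603;', group(1) is '#x', group(2) is
# 'x' and group(3) is '2603'; for a named entity such as '&pound;', group(1)
# and group(2) are empty and group(3) is 'pound'. _convert_entity below relies
# on this layout.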
######################################################################
# Functions for converting html entities
######################################################################
def _str_to_unicode(text, encoding=None, errors='strict'):
    if encoding is None:
        encoding = 'utf-8'
    if isinstance(text, bytes):
        return text.decode(encoding, errors)
    return text
def _replace_html_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    """
    Remove entities from text by converting them to their
    corresponding unicode character.

    :param text: a unicode string or a byte string encoded in the given
    `encoding` (which defaults to 'utf-8').

    :param list keep: list of entity names which should not be replaced.
    This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    :param bool remove_illegal: If `True`, entities that can't be converted are
    removed. Otherwise, entities that can't be converted are kept "as is".

    :returns: A unicode string with the entities removed.

    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

        >>> from nltk.tokenize.casual import _replace_html_entities
        >>> _replace_html_entities(b'Price: &pound;100')
        'Price: \\xa3100'
        >>> print(_replace_html_entities(b'Price: &pound;100'))
        Price: £100
        >>>
    """

    def _convert_entity(match):
        entity_body = match.group(3)
        if match.group(1):
            try:
                if match.group(2):
                    number = int(entity_body, 16)
                else:
                    number = int(entity_body, 10)
                # Numeric character references in the 80-9F range are typically
                # interpreted by browsers as representing the characters mapped
                # to bytes 80-9F in the Windows-1252 encoding. For more info
                # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
                if 0x80 <= number <= 0x9f:
                    return int2byte(number).decode('cp1252')
            except ValueError:
                number = None
        else:
            if entity_body in keep:
                return match.group(0)
            else:
                number = htmlentitydefs.name2codepoint.get(entity_body)
        if number is not None:
            try:
                return unichr(number)
            except ValueError:
                pass

        return "" if remove_illegal else match.group(0)

    return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))
######################################################################

class TweetTokenizer:
    r"""
    Tokenizer for tweets.

        >>> from nltk.tokenize import TweetTokenizer
        >>> tknzr = TweetTokenizer()
        >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
        >>> tknzr.tokenize(s0)
        ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']

    Examples using `strip_handles` and `reduce_len` parameters:

        >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
        >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
        >>> tknzr.tokenize(s1)
        [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
    """
    def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False):
        self.preserve_case = preserve_case
        self.reduce_len = reduce_len
        self.strip_handles = strip_handles

    def tokenize(self, text):
        """
        :param text: str
        :rtype: list(str)
        :return: a tokenized list of strings; if `preserve_case=False`, all
        tokens except emoticons are downcased.
        """
        # Fix HTML character entities:
        text = _replace_html_entities(text)
        # Remove username handles
        if self.strip_handles:
            text = remove_handles(text)
        # Normalize word lengthening
        if self.reduce_len:
            text = reduce_lengthening(text)
        # Shorten problematic sequences of characters
        safe_text = HANG_RE.sub(r'\1\1\1', text)
        # Tokenize:
        words = WORD_RE.findall(safe_text)
        # Possibly alter the case, but avoid changing emoticons like :D into :d:
        if not self.preserve_case:
            words = list(map((lambda x: x if EMOTICON_RE.search(x) else
                              x.lower()), words))
        return words
######################################################################
# Normalization Functions
######################################################################

def reduce_lengthening(text):
    """
    Replace repeated character sequences of length 3 or greater with sequences
    of length 3.
    """
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1\1", text)
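# Illustrative example (not part of the original module):
#     reduce_lengthening("waaaaayyyy")  ->  'waaayyy'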
def remove_handles(text):
    """
    Remove Twitter username handles from text.
    """
    pattern = re.compile(r"(^|(?<=[^\w.-]))@[A-Za-z_]+\w+")
    return pattern.sub('', text)
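# Illustrative example (not part of the original module):
#     remove_handles("@remy: This is too much")  ->  ': This is too much'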
######################################################################
# Tokenization Function
######################################################################

def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=False):
    """
    Convenience function for wrapping the tokenizer.
    """
    return TweetTokenizer(preserve_case=preserve_case, reduce_len=reduce_len,
                          strip_handles=strip_handles).tokenize(text)

###############################################################################
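# A minimal usage sketch (not part of the original module), exercising the
# convenience wrapper when the file is run directly:
if __name__ == "__main__":
    demo = "@user :-) check http://nltk.org/ it is waaaaayyyy cool"
    # Expected (roughly): [':-)', 'check', 'http://nltk.org/', 'it', 'is', 'waaayyy', 'cool']
    print(casual_tokenize(demo, reduce_len=True, strip_handles=True))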