
/src/edu/cmu/cs/lti/ark/tweetnlp/twokenize.scala

https://github.com/Pet3ris/ark-tweet-nlp
package edu.cmu.cs.lti.ark.tweetnlp // comment this out to run as script, scala is weird

import scala.collection.JavaConversions._

/*
 TweetMotif is licensed under the Apache License 2.0:
 http://www.apache.org/licenses/LICENSE-2.0.html
 Copyright Brendan O'Connor, Michel Krieger, and David Ahn, 2009-2010.
*/

/*
 twokenize.scala -- a little Twitter tokenizer,
 tested for English and some other European languages.

   Twokenize.tokenize("@hellocalyclops =))=))=)) Oh well.")
   => ["@hellocalyclops", "=))", "=))", "=))", "Oh", "well", "."]

 Invoking main() takes tweet texts on stdin and outputs space-separated tokenizations.

 Code History

 * Original version in TweetMotif in Python (2009-2010, github.com/brendano/tweetmotif)
   having two forks:
   - (2011) Scala port and improvements by David Snyder (dsnyder@cs.utexas.edu)
     and Jason Baldridge (jasonbaldridge@gmail.com)
     https://bitbucket.org/jasonbaldridge/twokenize/
   - (2011) Modifications for POS tagging by Kevin Gimpel (kgimpel@cs.cmu.edu)
     and Daniel Mills (dpmills@cs.cmu.edu)
 * Merge to Scala by Brendan O'Connor, for ARK TweetNLP package (2011-06)

 Original paper:

   TweetMotif: Exploratory Search and Topic Summarization for Twitter.
   Brendan O'Connor, Michel Krieger, and David Ahn.
   ICWSM-2010 (demo track)
   http://brenocon.com/oconnor_krieger_ahn.icwsm2010.tweetmotif.pdf

 ---

 Scala port of Brendan O'Connor's twokenize.py

 This is not a direct port, as some changes were made in the aim of
 simplicity.
 - David Snyder (dsnyder@cs.utexas.edu)
   April 2011

 Modified to a more functional style, fixed a few bugs, and made the output
 more like twokenize.py's. Added abbreviations. Tweaked some regexes to
 produce better tokens.
 - Jason Baldridge (jasonbaldridge@gmail.com)
   June 2011
*/
/**
 * TODO
 * - byte offsets should be added here. can easily re-align
 *   since the only munged characters are whitespace (hopefully)
 */

import scala.util.matching.Regex
import collection.JavaConversions._
object Twokenize {

  val Contractions = """(?i)(\w+)(n't|'ve|'ll|'d|'re|'s|'m)$""".r
  val Whitespace = """\s+""".r

  val punctChars = """['“\".?!,:;]"""
  val punctSeq = punctChars + """+"""
  val entity = """&(amp|lt|gt|quot);"""
  // URLs

  // David: I give the Larry David eye to this whole URL regex
  // (http://www.youtube.com/watch?v=2SmoBvg-etU) There are
  // TODO potentially better options, see:
  // http://daringfireball.net/2010/07/improved_regex_for_matching_urls
  // http://mathiasbynens.be/demo/url-regex

  val urlStart1 = """(https?://|www\.)"""
  val commonTLDs = """(com|co\.uk|org|net|info|ca|ly|mp|edu|gov)"""
  val urlStart2 = """[A-Za-z0-9\.-]+?\.""" + commonTLDs + """(?=[/ \W])"""
  val urlBody = """[^ \t\r\n<>]*?"""
  val urlExtraCrapBeforeEnd = "(" + punctChars + "|" + entity + ")+?"
  val urlEnd = """(\.\.+|[<>]|\s|$)"""
  val url = """\b(""" + urlStart1 + "|" + urlStart2 + ")" + urlBody + "(?=(" + urlExtraCrapBeforeEnd + ")?" + urlEnd + ")"
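  // Illustrative strings this pattern is meant to catch (not exhaustive):
  //   http://t.co/abc123   www.example.com   example.com/page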
  // Numeric
  val timeLike = """\d+:\d+"""
  val numNum = """\d+\.\d+"""
  val numberWithCommas = """(\d+,)+?\d{3}""" + """(?=([^,]|$))"""

  // Abbreviations
  val boundaryNotDot = """($|\s|[“\"?!,:;]|""" + entity + ")"
  val aa1 = """([A-Za-z]\.){2,}(?=""" + boundaryNotDot + ")"
  val aa2 = """[^A-Za-z]([A-Za-z]\.){1,}[A-Za-z](?=""" + boundaryNotDot + ")"
  val standardAbbreviations = """\b([Mm]r|[Mm]rs|[Mm]s|[Dd]r|[Ss]r|[Jj]r|[Rr]ep|[Ss]en|[Ss]t)\."""
  val arbitraryAbbrev = "(" + aa1 + "|" + aa2 + "|" + standardAbbreviations + ")"

  val separators = "(--+|―)"
  val decorations = """[♫]+"""
  val thingsThatSplitWords = """[^\s\.,]"""
  val embeddedApostrophe = thingsThatSplitWords + """+'""" + thingsThatSplitWords + """+"""
  // Emoticons
  val normalEyes = "(?iu)[:=]"
  val wink = "[;]"
  val noseArea = "(|o|O|-|[^a-zA-Z0-9 ])"
  val happyMouths = """[D\)\]]+"""
  val sadMouths = """[\(\[]+"""
  val tongue = "[pP]"
  val otherMouths = """[doO/\\]+""" // remove forward slash if http://'s aren't cleaned

  // mouth repetition examples:
  // @aliciakeys Put it in a love song :-))
  // @hellocalyclops =))=))=)) Oh well

  def OR(parts: String*) = {
    "(" + parts.toList.mkString("|") + ")"
  }

  val emoticon = OR(
    // Standard version  :) :( :] :D :P
    OR(normalEyes, wink) + noseArea + OR(tongue, otherMouths, sadMouths, happyMouths),

    // reversed version (: D:  use positive lookbehind to remove "(word):"
    // because eyes on the right side is more ambiguous with the standard usage of : ;
    """(?<=( |^))""" + OR(sadMouths, happyMouths, otherMouths) + noseArea + OR(normalEyes, wink)

    // TODO japanese-style emoticons
    // TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this
  )
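  // Note: the empty first alternative in noseArea lets nose-less faces like ":)" and "=))"
  // match as well as ":-)" (illustrative, not exhaustive).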
  def allowEntities(pat: String) = {
    // so we can write patterns with < and > and let them match escaped html too
    pat.replace("<", "(<|&lt;)").replace(">", "(>|&gt;)")
  }

  val Hearts = allowEntities("""(<+/?3+)""")
  val Arrows = allowEntities("""(<*[-=]*>+|<+[-=]*>*)""")
  // BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes
  //   "hello (#hashtag)" ==> "hello (#hashtag )"   WRONG
  //   "hello (#hashtag)" ==> "hello ( #hashtag )"  RIGHT
  //   "hello (@person)"  ==> "hello (@person )"    WRONG
  //   "hello (@person)"  ==> "hello ( @person )"   RIGHT
  // ... Some sort of weird interaction with edgepunct I guess, because edgepunct
  // has poor content-symbol detection.

  val Hashtag = """#[a-zA-Z0-9_]+""" // also gets #1 #40 which probably aren't hashtags .. but good as tokens
  val AtMention = """@[a-zA-Z0-9_]+"""

  // I was worried this would conflict with at-mentions
  // but seems ok in sample of 5800: 7 changes all email fixes
  // http://www.regular-expressions.info/email.html
  val Bound = """(\W|^|$)"""
  val Email = "(?<=" + Bound + """)[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}(?=""" + Bound + ")"
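  // Illustrative match (not exhaustive): in "mail me: jane.doe@example.com",
  // the address "jane.doe@example.com" is protected as a single token.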
  // We will be tokenizing using these regexps as delimiters
  // Additionally, these things are "protected", meaning they shouldn't be further split themselves.
  val Protected = new Regex(
    OR(
      Hearts,
      Arrows,
      emoticon,
      url,
      Email,
      entity,
      timeLike,
      numNum,
      numberWithCommas,
      punctSeq,
      arbitraryAbbrev,
      separators,
      decorations,
      embeddedApostrophe,
      Hashtag,
      AtMention
    ))
  // Edge punctuation
  // Want: 'foo' => ' foo '
  // While also:  don't => don't
  // the first is considered "edge punctuation".
  // the second is word-internal punctuation -- don't want to mess with it.
  // BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days.
  // I remember it causing lots of trouble in the past as well. Would be good to revisit or eliminate.

  // Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
  val edgePunctChars = """'"«»{}\(\)\[\]\*"""
  val edgePunct = "[" + edgePunctChars + "]"
  val notEdgePunct = "[a-zA-Z0-9]" // content characters
  val offEdge = """(^|$|:|;|\s)""" // colon here gets "(hello):" ==> "( hello ):"
  val EdgePunctLeft = new Regex(offEdge + "(" + edgePunct + "+)(" + notEdgePunct + ")")
  val EdgePunctRight = new Regex("(" + notEdgePunct + ")(" + edgePunct + "+)" + offEdge)

  def splitEdgePunct(input: String) = {
    var s = input
    s = EdgePunctLeft.replaceAllIn(s, "$1$2 $3")
    s = EdgePunctRight.replaceAllIn(s, "$1 $2$3")
    s
  }
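  // Illustrative behavior (not exhaustive): splitEdgePunct("'foo' don't") gives "' foo ' don't" --
  // the quotes around foo are spaced off as edge punctuation, while the word-internal
  // apostrophe in don't is left alone.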
  // The main work of tokenizing a tweet.
  def simpleTokenize(text: String) = {

    // Do the no-brainers first
    val splitPunctText = splitEdgePunct(text)
    val textLength = splitPunctText.length

    // Find the matches for subsequences that should be protected,
    // e.g. URLs, 1.0, U.N.K.L.E., 12:53
    val matches = Protected.findAllIn(splitPunctText).matchData.toList

    // The spans of the "bads" should not be split.
    val badSpans = matches map (mat => Tuple2(mat.start, mat.end))

    // Create a list of indices to create the "goods", which can be
    // split. We are taking "bad" spans like
    //   List((2,5), (8,10))
    // to create
    //   List(0, 2, 5, 8, 10, 12)
    // where, e.g., "12" here would be the textLength
    val indices = (0 :: badSpans.foldRight(List[Int]())((x, y) => x._1 :: x._2 :: y)) ::: List(textLength)

    // Group the indices and map them to their respective portion of the string
    val goods = indices.grouped(2).map { x => splitPunctText.slice(x(0), x(1)) }.toList

    // The 'good' strings are safe to be further tokenized by whitespace
    val splitGoods = goods map { str => str.trim.split(" ").toList }

    // Storing as List[List[String]] to make zip easier later on
    val bads = badSpans map { case (start, end) => List(splitPunctText.slice(start, end)) }

    // Reinterpolate the 'good' and 'bad' Lists, ensuring that
    // additional tokens from the last good item get included
    val zippedStr =
      (if (splitGoods.length == bads.length)
        splitGoods.zip(bads) map { pair => pair._1 ++ pair._2 }
      else
        (splitGoods.zip(bads) map { pair => pair._1 ++ pair._2 }) ::: List(splitGoods.last)
      ).flatten

    // Split based on special patterns (like contractions) and drop any empty tokens
    zippedStr.map(splitToken(_)).flatten.filter(_.length > 0)
  }
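  // Illustrative trace (not exhaustive): for "hi :) bye" the emoticon ":)" is protected as
  // span (3,5), so indices is List(0, 3, 5, 9); interleaving the whitespace-split "goods"
  // with the protected "bads" yields List("hi", ":)", "bye").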
  198. // "foo bar" => "foo bar"
  199. def squeezeWhitespace (input: String) = Whitespace.replaceAllIn(input," ").trim
  200. // Final pass tokenization based on special patterns
  201. def splitToken (token: String) = {
  202. token match {
  203. // BTO: our POS tagger wants "ur" and "you're" to both be one token.
  204. // Uncomment to get "you 're"
  205. // case Contractions(stem, contr) => List(stem.trim, contr.trim)
  206. case token => List(token.trim)
  207. }
  208. }
  // Apply method allows it to be used as Twokenize(line) in Scala.
  def apply(text: String): List[String] = simpleTokenize(squeezeWhitespace(text))

  // More normal name for @apply@
  def tokenize(text: String): List[String] = apply(text)

  // Very slight normalization for AFTER tokenization.
  // The tokenization regexes are written to work on non-normalized text.
  // (to make byte offsets easier to compute)
  // Hm: 2+ repeated character normalization here?
  // No, that's more linguistic, should be further down the pipeline
  def normalizeText(text: String) = {
    text.replaceAll("&lt;", "<").replaceAll("&gt;", ">").replaceAll("&amp;", "&")
  }

  def tokenizeForTagger(text: String): List[String] = {
    tokenize(text).map(normalizeText)
  }

  def tokenizeForTagger_J(text: String): java.util.List[String] = {
    tokenizeForTagger(text).toSeq
  }
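  // The JavaConversions import above implicitly converts the Scala Seq to a java.util.List,
  // so from Java this can be called roughly as (illustrative):
  //   java.util.List<String> toks = Twokenize.tokenizeForTagger_J("ok :) http://t.co/abc");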
  // Main method
  def main(args: Array[String]) = {
    // force stdin/stdout interpretation as UTF-8
    // and ignore the stupid JVM default settings (MacRoman? wtf??)
    Console.setOut(new java.io.PrintStream(System.out, true, "UTF8"))
    io.Source.fromInputStream(System.in, "UTF-8").getLines foreach { line =>
      // mkString is safe on blank lines (reduceLeft would throw on an empty token list)
      println(tokenizeForTagger(line).mkString(" "))
    }
  }
}
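
A minimal usage sketch (not part of the file above; TwokenizeDemo is just an illustrative name, and the expected first line of output is the example from the header comment):

object TwokenizeDemo {
  def main(args: Array[String]): Unit = {
    // Should print: @hellocalyclops =)) =)) =)) Oh well .
    println(Twokenize.tokenize("@hellocalyclops =))=))=)) Oh well.").mkString(" "))

    // tokenizeForTagger additionally unescapes HTML entities after tokenizing,
    // so "&lt;3" comes back as the single token "<3".
    println(Twokenize.tokenizeForTagger("i &lt;3 u").mkString(" "))
  }
}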