
/KeyGraph/src/dataset/twitter/TwitterDataLoader.java

package dataset.twitter;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.StringReader;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.StringTokenizer;

import topicDetection.Constants;
import topicDetection.DataLoader;
import topicDetection.Document;
import topicDetection.Keyword;
import topicDetection.Porter;

import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.process.*;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;

/** Loads tweets from tab-separated dataset files into Document objects for topic detection. */
public class TwitterDataLoader extends DataLoader {

    public TwitterDataLoader(Constants constants) {
        super(constants);
    }
    /** Loads tweets in the two-column (id, text) website export format. */
    public void fetchTweets4website(String datasetFile, HashMap<String, Document> docs, HashSet<String> stopwords, HashMap<String, Double> DF, Porter porter,
            boolean removeDuplicates) throws Exception {
        String line = null;
        BufferedReader in = new BufferedReader(new FileReader(datasetFile));
        int i = 0;
        StringDuplicate sd = new StringDuplicate();
        while ((line = in.readLine()) != null) {
            i++;
            if (i % 1000 == 0)
                System.out.println(i);
            String[] tokens = line.split("\t");
            Document d = new Document(Integer.parseInt(tokens[0]) + "");
            String tweet = tokens[1];
            fetchTweetContent(tweet, stopwords, porter, d);
            // note: duplicate detection is driven by constants.REMOVE_DUPLICATES, not the removeDuplicates parameter
            if (constants.REMOVE_DUPLICATES)
                d.isDuplicate = sd.isDuplicate(tweet);
            if (d.keywords.size() >= constants.DOC_KEYWORDS_SIZE_MIN) {
                docs.put(d.id, d);
                // update document frequency for each keyword of the accepted tweet
                for (Keyword k : d.keywords.values()) {
                    if (DF.containsKey(k.baseForm))
                        DF.put(k.baseForm, DF.get(k.baseForm) + 1);
                    else
                        DF.put(k.baseForm, 1.0);
                }
            }
        }
        in.close();
    }
    /** Loads up to 150,000 tweets; tokens[3] is used as the document id, tokens[4] as the tweet text. */
    public void fetchTweets(String datasetFile, HashMap<String, Document> docs, HashSet<String> stopwords, HashMap<String, Double> DF, Porter porter)
            throws Exception {
        String line = null;
        BufferedReader in = new BufferedReader(new FileReader(datasetFile));
        int i = 0;
        while ((line = in.readLine()) != null)
            try {
                i++;
                if (i % 1000 == 0)
                    System.out.println(i);
                if (i == 150000)
                    break;
                String[] tokens = line.split("\t");
                // Document d = new Document(i + "");
                Document d = new Document(tokens[3]);
                String tweet = tokens[4];
                // String[] datetokens = tokens[1].split(" ");
                // String date = datetokens[1] + " " + datetokens[2] + ", " + datetokens[5] + " " + datetokens[3];
                // d.publishDate = new Timestamp(Timestamp.parse(date));
                d.id = d.id + "\t" + tokens[2];
                d.setBody(tweet);
                // if (tweet.indexOf("RT @") != -1 || tweet.indexOf("rt @") != -1 || tweet.indexOf("RT@") != -1 || tweet.indexOf("rt@") != -1)
                //     continue;
                fetchTweetContent(tweet, stopwords, porter, d);
                if (d.keywords.size() >= constants.DOC_KEYWORDS_SIZE_MIN) {
                    docs.put(d.id, d);
                    // update document frequency for each keyword of the accepted tweet
                    for (Keyword k : d.keywords.values()) {
                        if (DF.containsKey(k.baseForm))
                            DF.put(k.baseForm, DF.get(k.baseForm) + 1);
                        else
                            DF.put(k.baseForm, 1.0);
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
                System.out.println(i + ": \"" + line + "\"");
            }
        in.close();
    }
    /**
     * Loads tweets whose publish date (tokens[1], in JDBC timestamp format) falls in [start, end).
     * Assumes the file is sorted by date, since reading stops at the first tweet at or after end.
     */
    public void fetchTweets(String datasetFile, HashMap<String, Document> docs, HashSet<String> stopwords, HashMap<String, Double> DF, Porter porter,
            Timestamp start, Timestamp end) throws Exception {
        String line = null;
        BufferedReader in = new BufferedReader(new FileReader(datasetFile));
        int i = 0;
        while ((line = in.readLine()) != null)
            try {
                i++;
                if (i % 10000 == 0)
                    System.out.println(i + "->" + docs.size());
                String[] tokens = line.split("\t");
                // Document d = new Document(i + "");
                Document d = new Document(tokens[0]);
                String tweet = tokens[4];
                String date = tokens[1];
                // String[] datetokens = tokens[1].split(" ");
                // String date = datetokens[1] + " " + datetokens[2] + ", " + datetokens[5] + " " + datetokens[3];
                d.publishDate = Timestamp.valueOf(date);
                if (d.publishDate.compareTo(start) < 0)
                    continue;
                if (d.publishDate.compareTo(end) > -1)
                    break; // past the end of the window; later lines are assumed to be newer
                // d.id = d.id + "\t" + tokens[2];
                d.setBody(tweet);
                // if (tweet.indexOf("RT @") != -1 || tweet.indexOf("rt @") != -1 || tweet.indexOf("RT@") != -1 || tweet.indexOf("rt@") != -1)
                //     continue;
                fetchTweetContent(tweet, stopwords, porter, d);
                if (d.keywords.size() >= constants.DOC_KEYWORDS_SIZE_MIN) {
                    docs.put(d.id, d);
                    for (Keyword k : d.keywords.values()) {
                        if (DF.containsKey(k.baseForm))
                            DF.put(k.baseForm, DF.get(k.baseForm) + 1);
                        else
                            DF.put(k.baseForm, 1.0);
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
                System.out.println(i + ": \"" + line + "\"");
            }
        in.close();
    }
    /** Loads tweets in the given date range whose text matches the query (see containsQuery). */
    public void fetchTweetsByQuery(String datasetFile, HashMap<String, Document> docs, HashSet<String> stopwords, HashMap<String, Double> DF, Porter porter,
            String query, Timestamp startDate, Timestamp endDate) throws Exception {
        String line = null;
        BufferedReader in = new BufferedReader(new FileReader(datasetFile));
        int i = 0;
        while ((line = in.readLine()) != null)
            try {
                i++;
                if (i % 100000 == 0)
                    System.out.println(i);
                Document d = new Document(i + "");
                String[] tokens = line.split("\t");
                if (tokens.length < 3)
                    continue;
                String tweet = tokens[2];
                // rebuild a "MMM dd, yyyy HH:mm:ss" date from the space-separated timestamp in tokens[0]
                String[] datetokens = tokens[0].split(" ");
                String date = datetokens[1] + " " + datetokens[2] + ", " + datetokens[5] + " " + datetokens[3];
                d.publishDate = new Timestamp(Timestamp.parse(date));
                d.id = d.id + "\t" + tokens[1];
                if (startDate.before(d.publishDate) && endDate.after(d.publishDate) && containsQuery(tweet, query)) {
                    fetchTweetContent(tweet, stopwords, porter, d);
                    if (d.keywords.size() >= constants.DOC_KEYWORDS_SIZE_MIN) {
                        docs.put(d.id, d);
                        for (Keyword k : d.keywords.values()) {
                            if (DF.containsKey(k.baseForm))
                                DF.put(k.baseForm, DF.get(k.baseForm) + 1);
                            else
                                DF.put(k.baseForm, 1.0);
                        }
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        in.close();
    }
    /** Loads tweets in the given date range that were posted by one of the given user ids (tokens[1]). */
    public void fetchTweetsByUsers(String datasetFile, HashMap<String, Document> docs, HashSet<String> stopwords, HashMap<String, Double> DF, Porter porter,
            HashSet<String> users, Timestamp startDate, Timestamp endDate) throws Exception {
        String line = null;
        BufferedReader in = new BufferedReader(new FileReader(datasetFile));
        int i = 0;
        while ((line = in.readLine()) != null) {
            i++;
            if (i % 100000 == 0)
                System.out.println(i + "->" + docs.size());
            Document d = new Document(i + "");
            String[] tokens = line.split("\t");
            if (tokens.length < 3)
                continue;
            String tweet = tokens[2];
            String userId = tokens[1];
            // rebuild a "MMM dd, yyyy HH:mm:ss" date from the space-separated timestamp in tokens[0]
            String[] datetokens = tokens[0].split(" ");
            String date = datetokens[1] + " " + datetokens[2] + ", " + datetokens[5] + " " + datetokens[3];
            d.publishDate = new Timestamp(Timestamp.parse(date));
            d.id = d.id + " " + userId;
            if (startDate.before(d.publishDate) && endDate.after(d.publishDate) && users.contains(userId)) {
                fetchTweetContent(tweet, stopwords, porter, d);
                if (d.keywords.size() >= constants.DOC_KEYWORDS_SIZE_MIN) {
                    docs.put(d.id, d);
                    for (Keyword k : d.keywords.values()) {
                        if (DF.containsKey(k.baseForm))
                            DF.put(k.baseForm, DF.get(k.baseForm) + 1);
                        else
                            DF.put(k.baseForm, 1.0);
                    }
                }
            }
        }
        in.close();
    }
    /** Returns the ids of users who posted at least one tweet matching the query inside the given date range. */
    public HashSet<String> getUsers(String datasetFile, String query, Timestamp startDate, Timestamp endDate) throws Exception {
        HashSet<String> users = new HashSet<String>();
        String line = null;
        BufferedReader in = new BufferedReader(new FileReader(datasetFile));
        int i = 0;
        while ((line = in.readLine()) != null) {
            i++;
            if (i % 100000 == 0)
                System.out.println(i);
            String[] tokens = line.split("\t");
            if (tokens.length < 3)
                continue;
            String tweet = tokens[2];
            String[] datetokens = tokens[0].split(" ");
            String date = datetokens[1] + " " + datetokens[2] + ", " + datetokens[5] + " " + datetokens[3];
            Timestamp publishDate = new Timestamp(Timestamp.parse(date));
            if (startDate.before(publishDate) && endDate.after(publishDate) && containsQuery(tweet, query))
                users.add(tokens[1]);
        }
        in.close();
        return users;
    }
    /**
     * Query syntax: '+' separates required groups, a space separates alternatives within a group,
     * and '_' stands for a space inside a phrase. The text matches if every required group has at
     * least one alternative occurring as a case-insensitive substring.
     */
    public static boolean containsQuery(String text, String query) {
        text = text.toLowerCase();
        query = query.toLowerCase();
        String[] mustTokens = query.split("\\+");
        for (String mustToken : mustTokens) {
            boolean includes = false;
            String[] mightTokens = mustToken.split(" ");
            for (String mightToken : mightTokens)
                if (text.indexOf(mightToken.trim().replaceAll("_", " ")) != -1) {
                    includes = true;
                    break;
                }
            if (!includes)
                return false;
        }
        return true;
    }
    /** Extracts keywords from a tweet (noun phrases and/or plain text terms), then drops @-mentions. */
    public void fetchTweetContent(String content, HashSet<String> stopwords, Porter porter, Document d) {
        if (constants.KEYWORDS_2_ENABLE)
            loadDocumentKeyFile(getNounPhrases(content), stopwords, porter, d, constants.KEYWORDS_2_WEIGHT);
        if (constants.TEXT_ENABLE)
            fetchTweetText(content, stopwords, porter, d, constants.TEXT_WEIGHT);
        // remove keywords that contain user mentions
        ArrayList<String> toRemove = new ArrayList<String>();
        for (Keyword k : d.keywords.values())
            if (k.word.contains("@"))
                toRemove.add(k.baseForm);
        for (String base : toRemove)
            d.keywords.remove(base);
    }
    /** Tokenizes the tweet text, strips URLs, filters stopwords and short tokens, stems, and accumulates weighted term frequencies. */
    public void fetchTweetText(String content, HashSet<String> stopwords, Porter porter, Document d, double boostRate) {
        // strip URLs before tokenizing
        content = content.replaceAll("[hH][tT][tT][pP][s]?:[\\\\/][\\\\/][^ ]*\\b", " ");
        StringTokenizer st = new StringTokenizer(content, "!?|\"' -_0123456789.,;#$&%/\\*()<>\t");
        d.setBody(content);
        while (st.hasMoreTokens()) {
            String word = st.nextToken();
            String token = word.toLowerCase();
            double tf = 1 * boostRate;
            String base = "";
            if (token.indexOf("?") == -1 && token.length() > 2 && !stopwords.contains(token))
                base = porter.stripAffixes(token);
            if (base.length() > 2) {
                if (!d.keywords.containsKey(base))
                    d.keywords.put(base, new Keyword(base, word, tf, 1, 0));
                else
                    d.keywords.get(base).tf += tf;
            }
        }
    }
    /** Returns word bigrams from the text, skipping single-character tokens. */
    public static ArrayList<String> getBigrams(String content) {
        ArrayList<String> res = new ArrayList<String>();
        StringTokenizer st = new StringTokenizer(content, "!?|\"' -_@0123456789.,;#$&%/\\*()<>\t");
        String prev = null;
        while (st.hasMoreTokens()) {
            String term = st.nextToken();
            if (term.length() > 1) {
                if (prev != null)
                    res.add(prev + " " + term);
                prev = term;
            }
        }
        return res;
    }
    // Stanford parser and tokenizer are created lazily and shared across calls
    static LexicalizedParser lp = null;
    static TokenizerFactory tf = null;
    // TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");

    /** Splits the text into sentences, parses each with the Stanford parser, and collects its innermost noun phrases. */
    public static ArrayList<String> getNounPhrases(String inputContent) {
        if (tf == null)
            tf = PTBTokenizer.factory(false, new WordTokenFactory());
        if (lp == null)
            lp = new LexicalizedParser("englishPCFG.ser.gz");
        ArrayList<String> res = new ArrayList<String>();
        // strip URLs before parsing
        String content = inputContent.replaceAll("[hH][tT][tT][pP][s]?:[\\\\/][\\\\/][^ ]*\\b", " ");
        try {
            StringTokenizer sentences = new StringTokenizer(content, "[!\\?;.]");
            while (sentences.hasMoreElements()) {
                String sentence = sentences.nextToken();
                if (sentence.trim().length() == 0)
                    continue;
                // System.out.println("ORIGINAL:" + sentence);
                List tokens = tf.getTokenizer(new StringReader(sentence)).tokenize();
                lp.parse(tokens); // parse the tokens
                Tree t = lp.getBestParse(); // get the best parse tree
                res.addAll(getNounPhrases(t));
                // System.out.println(res.size());
                // System.out.println("\nPROCESSED:\n\n"); tp.printTree(t); // print tree
            }
        } catch (Exception e) {
            System.err.println("ERROR: " + e.getMessage());
        }
        return res;
    }
    /** Recursively collects the innermost noun phrases (NP nodes with no NP descendants) of a parse tree. */
    public static ArrayList<String> getNounPhrases(Tree t) {
        ArrayList<String> res = new ArrayList<String>();
        if (!t.isLeaf())
            for (Tree child : t.getChildrenAsList()) {
                ArrayList<String> childRes = getNounPhrases(child);
                if (childRes.size() > 0)
                    res.addAll(childRes);
            }
        // only treat this node as a noun phrase if no NP was found deeper in the tree
        if (res.size() == 0)
            if (t.label().value().equals("NP")) {
                String text = "";
                for (Tree tt : t.getLeaves())
                    text += " " + tt.value();
                res.add(text.trim());
            }
        return res;
    }
}
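
For reference, a minimal sketch of how this loader might be driven, assuming the Constants and Porter classes from the topicDetection package are default-constructible; the file name, date window, and empty stopword set below are illustrative placeholders, not taken from the repository.

import java.sql.Timestamp;
import java.util.HashMap;
import java.util.HashSet;

import dataset.twitter.TwitterDataLoader;
import topicDetection.Constants;
import topicDetection.Document;
import topicDetection.Porter;

public class TwitterLoadExample {
    public static void main(String[] args) throws Exception {
        Constants constants = new Constants();              // assumed default-constructible
        Porter porter = new Porter();                       // assumed default-constructible stemmer
        HashMap<String, Document> docs = new HashMap<String, Document>();
        HashMap<String, Double> df = new HashMap<String, Double>();
        HashSet<String> stopwords = new HashSet<String>();  // normally filled from a stopword list

        TwitterDataLoader loader = new TwitterDataLoader(constants);
        // load tweets published in a one-day window (hypothetical file path and dates)
        loader.fetchTweets("tweets.tsv", docs, stopwords, df, porter,
                Timestamp.valueOf("2011-01-01 00:00:00"),
                Timestamp.valueOf("2011-01-02 00:00:00"));
        System.out.println("loaded " + docs.size() + " documents, " + df.size() + " distinct keywords");
    }
}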