
/KeyGraph/src/dataset/twitter/MainTwitter.java

package dataset.twitter;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.PrintStream;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;

import topicDetection.Constants;
import topicDetection.Document;
import topicDetection.DocumentAnalyze;
import topicDetection.DocumentCluster;
import topicDetection.GraphAnalyze;
import topicDetection.Porter;
import topicDetection.Utils;

// Referenced classes of package dataset.twitter:
// TwitterUtil

public class MainTwitter {

    public static String host = null;
    public static Constants constants;

    public MainTwitter() {
    }
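
    // Entry point: reads conf/TwitterConstants.txt and dispatches to one of the
    // run modes described in the usage string below (run, run4website, runQuery,
    // runQueryStream).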
    public static void main(String args[]) throws Exception {
        // String in="RT @localnatives: A friend of ours found this inside an elliot smith book in a bookstore in Brooklyn http://plixi.com/p/50287064";
        // System.out.println(in.replaceAll("[hH][tT][tT][pP][s]?:[\\\\/][\\\\/][^ ]*\\b", " "));
        // System.exit(0);
        constants = new Constants(new DataInputStream(new FileInputStream("conf/TwitterConstants.txt")));
        // createDataset("/fs/clip-clip-proj/SMSite2010/BB(r)_v2/Data1/Ego_dataset_1/Data/TweetsUserContent.txt",
        // createDataset("/fs/clip-clip-proj/SMSite2010/BB(r)_v2/Data1/Ego_dataset_2/Data/protocol2_backgroundTweets.txt",
        // "local+natives", Timestamp.valueOf("2000-01-01 12:01:01"),
        // Timestamp.valueOf("2020-01-01 12:01:01"),
        // "localNatives.tweets1.txt");
        // runSlidingWindow("usdebt2011_firstWeek.txt",
        // "usdebt2011_firstWeek.txt_events");
        // System.exit(0);
        // args=new String[]{"run4website",
        // "summerSearch-sxsw.txt","summerSearch-sxsw.txt.out"};
        String usage = "Usage:\n\t1. run inputDatasetFile outputFile\n" + "\t2. run4website inputDatasetFile outputFile\n"
                + "\t3. runQuery inputDatasetFile query startDate(yyyy-mm-dd) endDate(yyyy-mm-dd) outputFile\n"
                + "\t4. runQueryStream inputDatasetFile query startDate(yyyy-mm-dd) endDate(yyyy-mm-dd) outputFile\n";
        try {
            if (args[0].equals("run4website")) {
                run4website(args[1], args[2]);
            } else if (args[0].equals("run")) {
                run(args[1], args[2]);
            } else if (args[0].equals("runQuery")) {
                // args[5] is the output file (see usage); the original passed args[4] (the end date) here.
                runQuery(args[1], args[2].replaceAll("\"", " ").trim(), Timestamp.valueOf(args[3] + " 12:00:00"), Timestamp.valueOf(args[4] + " 12:00:00"),
                        args[5]);
            } else if (args[0].equals("runQueryStream")) {
                runQueryStreamInFiles(args[1], args[2].replaceAll("\"", " ").trim(), Timestamp.valueOf(args[3] + " 12:00:00"),
                        Timestamp.valueOf(args[4] + " 12:00:00"), args[5]);
            } else
                System.out.println(usage);
        } catch (Exception e) {
            System.out.println(usage);
            e.printStackTrace();
        }
    }
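
    // Filters a raw tweet dump down to the lines whose tweet text matches the query
    // and whose timestamp falls inside [startDate, endDate], writing matching lines
    // to outputFile. Each input line is expected to have at least three tab-separated
    // fields, with the date in the first field and the tweet text in the third.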
    public static void createDataset(String inputFile, String query, Timestamp startDate, Timestamp endDate, String outputFile) throws Exception {
        String line = null;
        BufferedReader in = new BufferedReader(new FileReader(inputFile));
        BufferedWriter out = new BufferedWriter(new FileWriter(outputFile));
        int i = 0;
        while ((line = in.readLine()) != null)
            try {
                i++;
                if (i % 100000 == 0)
                    System.out.println(i);
                String[] tokens = line.split("\t");
                if (tokens.length < 3)
                    continue;
                String tweet = tokens[2];
                String[] datetokens = tokens[0].split(" ");
                String date = datetokens[1] + " " + datetokens[2] + ", " + datetokens[5] + " " + datetokens[3];
                Timestamp publishDate = new Timestamp(Timestamp.parse(date));
                if (startDate.before(publishDate) && endDate.after(publishDate) && TwitterDataLoader.containsQuery(tweet, query)) {
                    out.write(line + "\n");
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        in.close();
        out.close();
    }
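
    // Loads all tweets from inputFile, clusters them with KeyGraph, and writes the
    // clusters in the website output format.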
    public static void run4website(String inputFile, String outputFile) throws Exception {
        // PrintStream out = new PrintStream(new File(outputFile));
        HashSet<String> stopwords = Utils.importStopwords();
        Porter porter = new Porter();
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();
        (new TwitterDataLoader(constants)).fetchTweets4website(inputFile, docs, stopwords, DF, porter, constants.REMOVE_DUPLICATES);
        System.out.println("#docs: " + docs.size());
        ArrayList<DocumentCluster> clusters = (new DocumentAnalyze(constants)).clusterbyKeyGraph(docs, DF);
        DocumentAnalyze.printClustersForTheWebsite(clusters, outputFile);
        // out.close();
    }
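
    // Convenience wrapper: cluster the whole dataset with no query or date filter.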
    public static void run(String inputFile, String outputFile) throws Exception {
        runQuery(inputFile, null, null, null, outputFile);
    }
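
    // Loads tweets (optionally filtered by query and date range), clusters them with
    // KeyGraph, and serializes the resulting clusters to outputFile.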
    public static void runQuery(String inputFile, String query, Timestamp startDate, Timestamp endDate, String outputFile) throws Exception {
        PrintStream out = new PrintStream(new File(outputFile));
        HashSet<String> stopwords = Utils.importStopwords();
        Porter porter = new Porter();
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();
        if (query == null)
            (new TwitterDataLoader(constants)).fetchTweets(inputFile, docs, stopwords, DF, porter);
        else
            (new TwitterDataLoader(constants)).fetchTweetsByQuery(inputFile, docs, stopwords, DF, porter, query, startDate, endDate);
        System.out.println("#docs: " + docs.size());
        ArrayList<DocumentCluster> clusters = (new DocumentAnalyze(constants)).clusterbyKeyGraph(docs, DF);
        // DocumentAnalyze.printClusters(clusters, out, true);
        DocumentCluster.serializeAll(clusters, out, true);
        out.close();
    }
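
    // Clusters tweets over a sliding time window (hard-coded start/end dates, 5-day
    // shift, 10-day window), merging clusters that share more than 3 documents across
    // consecutive windows, then prints the merged event stream for the website.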
    public static void runSlidingWindow(String inputFile, String outputFile) throws Exception {
        Timestamp start = Timestamp.valueOf("2011-06-25 18:00:00");
        Timestamp end = Timestamp.valueOf("2011-07-29 11:00:00");
        int windowShiftSize = 5;
        HashSet<String> stopwords = Utils.importStopwords();
        Porter porter = new Porter();
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();
        Timestamp windowStartTime = start;
        Timestamp windowEndTime = new Timestamp(windowStartTime.getTime() + windowShiftSize * 2 * 24 * 60 * 60 * 1000);
        (new TwitterDataLoader(constants)).fetchTweets(inputFile, docs, stopwords, DF, porter, windowStartTime, windowEndTime);
        DocumentAnalyze documentAnalyze = new DocumentAnalyze(constants);
        ArrayList<DocumentCluster> lastEvents = documentAnalyze.clusterbyKeyGraph(docs, DF);
        PrintStream out = new PrintStream(new File(outputFile + ".tmp"));
        for (Document d : docs.values()) {
            d.publishDate = null;
            d.setBody(null);
            d.setTitle(null);
        }
        while (windowEndTime.before(end)) {
            DF = new HashMap<String, Double>();
            docs = new HashMap<String, Document>();
            out.println();
            windowStartTime = new Timestamp(windowStartTime.getTime() + windowShiftSize * 24 * 60 * 60 * 1000);
            windowEndTime = new Timestamp(windowEndTime.getTime() + windowShiftSize * 24 * 60 * 60 * 1000);
            System.out.println(windowStartTime + "\t" + windowEndTime);
            (new TwitterDataLoader(constants)).fetchTweets(inputFile, docs, stopwords, DF, porter, windowStartTime, windowEndTime);
            System.out.println(docs.size() + " docs are loaded!");
            ArrayList<DocumentCluster> events = documentAnalyze.clusterbyKeyGraph(docs, DF);
            for (Document d : docs.values()) {
                d.publishDate = null;
                d.setBody(null);
                d.setTitle(null);
            }
            // ----------- merge events -----------
            for (DocumentCluster dc1 : lastEvents) {
                boolean isMerged = false;
                for (DocumentCluster dc2 : events)
                    if (documentAnalyze.intersectDocs(dc1.docs, dc2.docs) > 3) {
                        isMerged = true;
                        dc2.docs.putAll(dc1.docs);
                        // merge the previous window's key graph into the current cluster
                        // (the original merged dc2.keyGraph into itself, which was a no-op)
                        dc2.keyGraph.putAll(dc1.keyGraph);
                    }
                if (!isMerged) {
                    // eventsStream.add(dc1);
                    // --- Save and Print
                    // DocumentAnalyze.printCluster(dc1, out, true);
                    dc1.serialize(out, true);
                    dc1.docs = null;
                }
            }
            lastEvents = events;
            events = null;
        }
        for (DocumentCluster dc1 : lastEvents) {
            // eventsStream.add(dc1);
            // --- Save and Print
            out.println();
            // DocumentAnalyze.printCluster(dc1, out, true);
            dc1.serialize(out, true);
            dc1.docs = null;
        }
        out.close();
        ArrayList<DocumentCluster> allEvents = DocumentCluster.deserializeAll(outputFile + ".tmp", true);
        DocumentAnalyze.printClustersForTheWebsite(allEvents, outputFile);
        System.out.println("done");
    }
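
    // Streaming variant of runQuery: slides a 60-day window (shifted by half its size)
    // over [startDate, endDate], clustering the tweets of the users returned by
    // TwitterDataLoader.getUsers for the query, and merging clusters that share more
    // than 3 documents across consecutive windows.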
    public static void runQueryStream(String inputFile, String query, Timestamp startDate, Timestamp endDate, String outputFile) throws Exception {
        PrintStream out = new PrintStream(new File(outputFile));
        long windowSize = (long) 2 * 30 * 24 * 60 * 60 * 1000;
        long windowShiftSize = windowSize / 2;
        Timestamp start = startDate;
        Timestamp end = new Timestamp(start.getTime() + windowSize);
        HashSet<String> stopwords = Utils.importStopwords();
        Porter porter = new Porter();
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();
        HashSet<String> users = new TwitterDataLoader(constants).getUsers(inputFile, query, startDate, endDate);
        System.out.println(users.size() + " users");
        DocumentAnalyze documentAnalyzer = new DocumentAnalyze(constants);
        ArrayList<DocumentCluster> lastEvents = documentAnalyzer.clusterbyKeyGraph(docs, DF);
        start = new Timestamp(start.getTime() - windowShiftSize);
        while (start.before(endDate)) {
            start = new Timestamp(start.getTime() + windowShiftSize);
            end = new Timestamp(start.getTime() + windowSize);
            DF = new HashMap<String, Double>();
            docs = new HashMap<String, Document>();
            out.println();
            out.println("[" + start + " , " + end + "]");
            System.out.println("[" + start + " , " + end + "]");
            (new TwitterDataLoader(constants)).fetchTweetsByUsers(inputFile, docs, stopwords, DF, porter, users, start, end);
            out.println(docs.size() + " docs are loaded!");
            System.out.println("#docs: " + docs.size());
            ArrayList<DocumentCluster> events = documentAnalyzer.clusterbyKeyGraph(docs, DF);
            // ----------- merge events -----------
            for (DocumentCluster dc1 : lastEvents) {
                boolean isMerged = false;
                for (DocumentCluster dc2 : events)
                    if (documentAnalyzer.intersectDocs(dc1.docs, dc2.docs) > 3) {
                        isMerged = true;
                        dc2.docs.putAll(dc1.docs);
                        // merge the previous window's key graph into the current cluster
                        // (the original merged dc2.keyGraph into itself, which was a no-op)
                        dc2.keyGraph.putAll(dc1.keyGraph);
                    }
                if (!isMerged) {
                    // eventsStream.add(dc1);
                    // --- Save and Print
                    // DocumentAnalyze.printCluster(dc1, out);
                    dc1.serialize(out, true);
                    dc1.docs = null;
                }
            }
            lastEvents = events;
            events = null;
        }
        for (DocumentCluster dc1 : lastEvents) {
            // eventsStream.add(dc1);
            // --- Save and Print
            out.println();
            // DocumentAnalyze.printCluster(dc1, out);
            dc1.serialize(out, true);
            dc1.docs = null;
        }
        out.close();
        System.out.println("done");
    }
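
    // File-based variant of runQueryStream: first clusters each 6-day window into its
    // own chunk file under tweetEvents/<query>/, then merges the chunks into a single
    // event stream written to outputFile.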
    public static void runQueryStreamInFiles(String inputFile, String query, Timestamp startDate, Timestamp endDate, String outputFile) throws Exception {
        long windowSize = (long) 6 * 24 * 60 * 60 * 1000;
        String dirname = "tweetEvents/" + query.replaceAll("[+_]", "") + "/";
        // mkdirs() also creates the parent tweetEvents/ directory if it does not exist yet
        new File(dirname).mkdirs();
        findEventChunks(inputFile, query, startDate, endDate, dirname, windowSize);
        findEventStreamFromChunks(dirname, query, startDate, endDate, outputFile, windowSize);
    }
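
    // Pass 1: for each sliding window, cluster the tweets of the matching users and
    // serialize that window's clusters to a chunk file named
    // <windowStart>_<windowEnd>.txt in outputDir.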
    public static void findEventChunks(String inputDir, String query, Timestamp startDate, Timestamp endDate, String outputDir, long windowSize)
            throws Exception {
        long windowShiftSize = windowSize / 2;
        Timestamp start = startDate;
        Timestamp end = new Timestamp(start.getTime() + windowSize);
        HashSet<String> stopwords = Utils.importStopwords();
        Porter porter = new Porter();
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();
        HashSet<String> users = new TwitterDataLoader(constants).getUsers(inputDir, query, startDate, endDate);
        System.out.println(users.size() + " users");
        DocumentAnalyze documentAnalyzer = new DocumentAnalyze(constants);
        start = new Timestamp(start.getTime() - windowShiftSize);
        while (start.before(endDate)) {
            start = new Timestamp(start.getTime() + windowShiftSize);
            end = new Timestamp(start.getTime() + windowSize);
            PrintStream eout = new PrintStream(new File(outputDir + "/" + start.toString().split(" ")[0] + "_" + end.toString().split(" ")[0] + ".txt"));
            DF = new HashMap<String, Double>();
            docs = new HashMap<String, Document>();
            eout.println();
            eout.println("[" + start + " , " + end + "]");
            System.out.println("[" + start + " , " + end + "]");
            (new TwitterDataLoader(constants)).fetchTweetsByUsers(inputDir, docs, stopwords, DF, porter, users, start, end);
            eout.println(docs.size() + " docs are loaded!");
            System.out.println("#docs: " + docs.size());
            ArrayList<DocumentCluster> events = documentAnalyzer.clusterbyKeyGraph(docs, DF);
            for (DocumentCluster dc1 : events) {
                // eventsStream.add(dc1);
                // --- Save and Print
                // DocumentAnalyze.printCluster(dc1, eout, true);
                dc1.serialize(eout, true);
                dc1.docs = null;
            }
            eout.close();
        }
    }
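
    // Pass 2: walks the per-window chunk files in order, merging clusters whose
    // document overlap exceeds 20% of the smaller cluster, and writes the surviving
    // query-related events to outputFile.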
    public static void findEventStreamFromChunks(String inputDir, String query, Timestamp startDate, Timestamp endDate, String outputFile, long windowSize)
            throws Exception {
        PrintStream out = new PrintStream(new File(outputFile));
        long windowShiftSize = windowSize / 2;
        DocumentAnalyze documentAnalyzer = new DocumentAnalyze(constants);
        System.out.println("[" + startDate + " , " + endDate + "]");
        Timestamp start = startDate;
        Timestamp end = new Timestamp(start.getTime() + windowSize);
        String ein = inputDir + "/" + start.toString().split(" ")[0] + "_" + end.toString().split(" ")[0] + ".txt";
        ArrayList<DocumentCluster> lastingEvents = DocumentCluster.deserializeAll(ein, true);
        while (start.before(endDate)) {
            start = new Timestamp(start.getTime() + windowShiftSize);
            end = new Timestamp(start.getTime() + windowSize);
            // keep the path construction consistent with the first chunk above
            ein = inputDir + "/" + start.toString().split(" ")[0] + "_" + end.toString().split(" ")[0] + ".txt";
            System.out.println("[" + start + " , " + end + "]");
            ArrayList<DocumentCluster> events = DocumentCluster.deserializeAll(ein, true);
            // ----------- merge events -----------
            for (DocumentCluster dc1 : lastingEvents) {
                boolean isMerged = false;
                for (DocumentCluster dc2 : events) {
                    if (dc1.docs.size() > 0 && dc2.docs.size() > 0)
                        if (documentAnalyzer.intersectDocs(dc1.docs, dc2.docs) * 1.0 / Math.min(dc1.docs.size(), dc2.docs.size()) > .2) {
                            isMerged = true;
                            dc2.docs.putAll(dc1.docs);
                            dc2.keyGraph = GraphAnalyze.mergeKeyGraphs(dc1.keyGraph, dc2.keyGraph);
                        }
                }
                if (!isMerged) {
                    // --- Save and Print
                    if (dc1.docs.size() > 20 && isRelated(dc1, query))
                        // DocumentAnalyze.printCluster(dc1, out, true);
                        dc1.serialize(out, true);
                    dc1.docs = null;
                }
            }
            lastingEvents = events;
            events = null;
        }
        for (DocumentCluster dc1 : lastingEvents)
            if (isRelated(dc1, query)) {
                // --- Save and Print
                out.println();
                // DocumentAnalyze.printCluster(dc1, out, true);
                dc1.serialize(out, true);
                dc1.docs = null;
            }
        out.close();
        System.out.println("done");
    }
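
    // True if at least one document in the cluster matches the query
    // (via TwitterDataLoader.containsQuery).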
    public static boolean isRelated(DocumentCluster dc, String query) {
        for (Document d : dc.docs.values())
            if (TwitterDataLoader.containsQuery(d.getBody(), query))
                return true;
        return false;
    }
}