
/KeyGraph/src/dataset/twitter/MainTwitter.java

package dataset.twitter;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.PrintStream;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;

import topicDetection.Constants;
import topicDetection.Document;
import topicDetection.DocumentAnalyze;
import topicDetection.DocumentCluster;
import topicDetection.GraphAnalyze;
import topicDetection.Porter;
import topicDetection.Utils;

// Referenced classes of package dataset.twitter:
// TwitterUtil

public class MainTwitter {

    public static String host = null;
    public static Constants constants;

    public MainTwitter() {
    }
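
    // Entry point: reads conf/TwitterConstants.txt and dispatches to one of the
    // run modes described in the usage string below (run, run4website, runQuery,
    // runQueryStream).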
    public static void main(String args[]) throws Exception {
        // String in="RT @localnatives: A friend of ours found this inside an elliot smith book in a bookstore in Brooklyn http://plixi.com/p/50287064";
        // System.out.println(in.replaceAll("[hH][tT][tT][pP][s]?:[\\\\/][\\\\/][^ ]*\\b", " "));
        // System.exit(0);
        constants = new Constants(new DataInputStream(new FileInputStream("conf/TwitterConstants.txt")));
        // createDataset("/fs/clip-clip-proj/SMSite2010/BB(r)_v2/Data1/Ego_dataset_1/Data/TweetsUserContent.txt",
        // createDataset("/fs/clip-clip-proj/SMSite2010/BB(r)_v2/Data1/Ego_dataset_2/Data/protocol2_backgroundTweets.txt",
        // "local+natives", Timestamp.valueOf("2000-01-01 12:01:01"),
        // Timestamp.valueOf("2020-01-01 12:01:01"),
        // "localNatives.tweets1.txt");
        // runSlidingWindow("usdebt2011_firstWeek.txt",
        // "usdebt2011_firstWeek.txt_events");
        // System.exit(0);
        // args=new String[]{"run4website",
        // "summerSearch-sxsw.txt","summerSearch-sxsw.txt.out"};
        String usage = "Usage:\n\t1. run inputDatasetFile outputFile\n" + "\t2. run4website inputDatasetFile outputFile\n"
                + "\t3. runQuery inputDatasetFile query startDate(yyyy-mm-dd) endDate(yyyy-mm-dd) outputFile\n"
                + "\t4. runQueryStream inputDatasetFile query startDate(yyyy-mm-dd) endDate(yyyy-mm-dd) outputFile\n";
        try {
            if (args[0].equals("run4website")) {
                run4website(args[1], args[2]);
            } else if (args[0].equals("run")) {
                run(args[1], args[2]);
            } else if (args[0].equals("runQuery")) {
                // args[5] is the output file (see usage); the original passed args[4] (the end date) here.
                runQuery(args[1], args[2].replaceAll("\"", " ").trim(), Timestamp.valueOf(args[3] + " 12:00:00"), Timestamp.valueOf(args[4] + " 12:00:00"),
                        args[5]);
            } else if (args[0].equals("runQueryStream")) {
                runQueryStreamInFiles(args[1], args[2].replaceAll("\"", " ").trim(), Timestamp.valueOf(args[3] + " 12:00:00"),
                        Timestamp.valueOf(args[4] + " 12:00:00"), args[5]);
            } else
                System.out.println(usage);
        } catch (Exception e) {
            System.out.println(usage);
            e.printStackTrace();
        }
    }
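
    // Filters a raw tweet dump down to the lines whose tweet text matches the query
    // and whose timestamp falls inside [startDate, endDate], writing matching lines
    // to outputFile. Each input line is expected to have at least three tab-separated
    // fields, with the date in the first field and the tweet text in the third.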
    public static void createDataset(String inputFile, String query, Timestamp startDate, Timestamp endDate, String outputFile) throws Exception {
        String line = null;
        BufferedReader in = new BufferedReader(new FileReader(inputFile));
        BufferedWriter out = new BufferedWriter(new FileWriter(outputFile));
        int i = 0;
        while ((line = in.readLine()) != null)
            try {
                i++;
                if (i % 100000 == 0)
                    System.out.println(i);
                String[] tokens = line.split("\t");
                if (tokens.length < 3)
                    continue;
                String tweet = tokens[2];
                String[] datetokens = tokens[0].split(" ");
                String date = datetokens[1] + " " + datetokens[2] + ", " + datetokens[5] + " " + datetokens[3];
                Timestamp publishDate = new Timestamp(Timestamp.parse(date));
                if (startDate.before(publishDate) && endDate.after(publishDate) && TwitterDataLoader.containsQuery(tweet, query)) {
                    out.write(line + "\n");
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        in.close();
        out.close();
    }
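
    // Loads all tweets from inputFile, clusters them with KeyGraph, and writes the
    // clusters in the website output format.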
    public static void run4website(String inputFile, String outputFile) throws Exception {
        // PrintStream out = new PrintStream(new File(outputFile));
        HashSet<String> stopwords = Utils.importStopwords();
        Porter porter = new Porter();
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();
        (new TwitterDataLoader(constants)).fetchTweets4website(inputFile, docs, stopwords, DF, porter, constants.REMOVE_DUPLICATES);
        System.out.println("#docs: " + docs.size());
        ArrayList<DocumentCluster> clusters = (new DocumentAnalyze(constants)).clusterbyKeyGraph(docs, DF);
        DocumentAnalyze.printClustersForTheWebsite(clusters, outputFile);
        // out.close();
    }
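
    // Convenience wrapper: cluster the whole dataset with no query or date filter.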
    public static void run(String inputFile, String outputFile) throws Exception {
        runQuery(inputFile, null, null, null, outputFile);
    }
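
    // Loads tweets (optionally filtered by query and date range), clusters them with
    // KeyGraph, and serializes the resulting clusters to outputFile.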
    public static void runQuery(String inputFile, String query, Timestamp startDate, Timestamp endDate, String outputFile) throws Exception {
        PrintStream out = new PrintStream(new File(outputFile));
        HashSet<String> stopwords = Utils.importStopwords();
        Porter porter = new Porter();
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();
        if (query == null)
            (new TwitterDataLoader(constants)).fetchTweets(inputFile, docs, stopwords, DF, porter);
        else
            (new TwitterDataLoader(constants)).fetchTweetsByQuery(inputFile, docs, stopwords, DF, porter, query, startDate, endDate);
        System.out.println("#docs: " + docs.size());
        ArrayList<DocumentCluster> clusters = (new DocumentAnalyze(constants)).clusterbyKeyGraph(docs, DF);
        // DocumentAnalyze.printClusters(clusters, out, true);
        DocumentCluster.serializeAll(clusters, out, true);
        out.close();
    }
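
    // Clusters tweets over a sliding time window (hard-coded start/end dates, 5-day
    // shift, 10-day window), merging clusters that share more than 3 documents across
    // consecutive windows, then prints the merged event stream for the website.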
    public static void runSlidingWindow(String inputFile, String outputFile) throws Exception {
        Timestamp start = Timestamp.valueOf("2011-06-25 18:00:00");
        Timestamp end = Timestamp.valueOf("2011-07-29 11:00:00");
        int windowShiftSize = 5;
        HashSet<String> stopwords = Utils.importStopwords();
        Porter porter = new Porter();
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();
        Timestamp windowStartTime = start;
        Timestamp windowEndTime = new Timestamp(windowStartTime.getTime() + windowShiftSize * 2 * 24 * 60 * 60 * 1000);
        (new TwitterDataLoader(constants)).fetchTweets(inputFile, docs, stopwords, DF, porter, windowStartTime, windowEndTime);
        DocumentAnalyze documentAnalyze = new DocumentAnalyze(constants);
        ArrayList<DocumentCluster> lastEvents = documentAnalyze.clusterbyKeyGraph(docs, DF);
        PrintStream out = new PrintStream(new File(outputFile + ".tmp"));
        for (Document d : docs.values()) {
            d.publishDate = null;
            d.setBody(null);
            d.setTitle(null);
        }
        while (windowEndTime.before(end)) {
            DF = new HashMap<String, Double>();
            docs = new HashMap<String, Document>();
            out.println();
            windowStartTime = new Timestamp(windowStartTime.getTime() + windowShiftSize * 24 * 60 * 60 * 1000);
            windowEndTime = new Timestamp(windowEndTime.getTime() + windowShiftSize * 24 * 60 * 60 * 1000);
            System.out.println(windowStartTime + "\t" + windowEndTime);
            (new TwitterDataLoader(constants)).fetchTweets(inputFile, docs, stopwords, DF, porter, windowStartTime, windowEndTime);
            System.out.println(docs.size() + " docs are loaded!");
            ArrayList<DocumentCluster> events = documentAnalyze.clusterbyKeyGraph(docs, DF);
            for (Document d : docs.values()) {
                d.publishDate = null;
                d.setBody(null);
                d.setTitle(null);
            }
            // ----------- merge events -----------
            for (DocumentCluster dc1 : lastEvents) {
                boolean isMerged = false;
                for (DocumentCluster dc2 : events)
                    if (documentAnalyze.intersectDocs(dc1.docs, dc2.docs) > 3) {
                        isMerged = true;
                        dc2.docs.putAll(dc1.docs);
                        // merge the previous window's key graph into the current cluster
                        // (the original merged dc2.keyGraph into itself, which was a no-op)
                        dc2.keyGraph.putAll(dc1.keyGraph);
                    }
                if (!isMerged) {
                    // eventsStream.add(dc1);
                    // --- Save and Print
                    // DocumentAnalyze.printCluster(dc1, out, true);
                    dc1.serialize(out, true);
                    dc1.docs = null;
                }
            }
            lastEvents = events;
            events = null;
        }
        for (DocumentCluster dc1 : lastEvents) {
            // eventsStream.add(dc1);
            // --- Save and Print
            out.println();
            // DocumentAnalyze.printCluster(dc1, out, true);
            dc1.serialize(out, true);
            dc1.docs = null;
        }
        out.close();
        ArrayList<DocumentCluster> allEvents = DocumentCluster.deserializeAll(outputFile + ".tmp", true);
        DocumentAnalyze.printClustersForTheWebsite(allEvents, outputFile);
        System.out.println("done");
    }
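
    // Streaming variant of runQuery: slides a 60-day window (shifted by half its size)
    // over [startDate, endDate], clustering the tweets of the users returned by
    // TwitterDataLoader.getUsers for the query, and merging clusters that share more
    // than 3 documents across consecutive windows.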
    public static void runQueryStream(String inputFile, String query, Timestamp startDate, Timestamp endDate, String outputFile) throws Exception {
        PrintStream out = new PrintStream(new File(outputFile));
        long windowSize = (long) 2 * 30 * 24 * 60 * 60 * 1000;
        long windowShiftSize = windowSize / 2;
        Timestamp start = startDate;
        Timestamp end = new Timestamp(start.getTime() + windowSize);
        HashSet<String> stopwords = Utils.importStopwords();
        Porter porter = new Porter();
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();
        HashSet<String> users = new TwitterDataLoader(constants).getUsers(inputFile, query, startDate, endDate);
        System.out.println(users.size() + " users");
        DocumentAnalyze documentAnalyzer = new DocumentAnalyze(constants);
        ArrayList<DocumentCluster> lastEvents = documentAnalyzer.clusterbyKeyGraph(docs, DF);
        start = new Timestamp(start.getTime() - windowShiftSize);
        while (start.before(endDate)) {
            start = new Timestamp(start.getTime() + windowShiftSize);
            end = new Timestamp(start.getTime() + windowSize);
            DF = new HashMap<String, Double>();
            docs = new HashMap<String, Document>();
            out.println();
            out.println("[" + start + " , " + end + "]");
            System.out.println("[" + start + " , " + end + "]");
            (new TwitterDataLoader(constants)).fetchTweetsByUsers(inputFile, docs, stopwords, DF, porter, users, start, end);
            out.println(docs.size() + " docs are loaded!");
            System.out.println("#docs: " + docs.size());
            ArrayList<DocumentCluster> events = documentAnalyzer.clusterbyKeyGraph(docs, DF);
            // ----------- merge events -----------
            for (DocumentCluster dc1 : lastEvents) {
                boolean isMerged = false;
                for (DocumentCluster dc2 : events)
                    if (documentAnalyzer.intersectDocs(dc1.docs, dc2.docs) > 3) {
                        isMerged = true;
                        dc2.docs.putAll(dc1.docs);
                        // merge the previous window's key graph into the current cluster
                        // (the original merged dc2.keyGraph into itself, which was a no-op)
                        dc2.keyGraph.putAll(dc1.keyGraph);
                    }
                if (!isMerged) {
                    // eventsStream.add(dc1);
                    // --- Save and Print
                    // DocumentAnalyze.printCluster(dc1, out);
                    dc1.serialize(out, true);
                    dc1.docs = null;
                }
            }
            lastEvents = events;
            events = null;
        }
        for (DocumentCluster dc1 : lastEvents) {
            // eventsStream.add(dc1);
            // --- Save and Print
            out.println();
            // DocumentAnalyze.printCluster(dc1, out);
            dc1.serialize(out, true);
            dc1.docs = null;
        }
        out.close();
        System.out.println("done");
    }
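
    // File-based variant of runQueryStream: first clusters each 6-day window into its
    // own chunk file under tweetEvents/<query>/, then merges the chunks into a single
    // event stream written to outputFile.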
    public static void runQueryStreamInFiles(String inputFile, String query, Timestamp startDate, Timestamp endDate, String outputFile) throws Exception {
        long windowSize = (long) 6 * 24 * 60 * 60 * 1000;
        String dirname = "tweetEvents/" + query.replaceAll("[+_]", "") + "/";
        // mkdirs() also creates the parent tweetEvents/ directory if it does not exist yet
        new File(dirname).mkdirs();
        findEventChunks(inputFile, query, startDate, endDate, dirname, windowSize);
        findEventStreamFromChunks(dirname, query, startDate, endDate, outputFile, windowSize);
    }
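
    // Pass 1: for each sliding window, cluster the tweets of the matching users and
    // serialize that window's clusters to a chunk file named
    // <windowStart>_<windowEnd>.txt in outputDir.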
    public static void findEventChunks(String inputDir, String query, Timestamp startDate, Timestamp endDate, String outputDir, long windowSize)
            throws Exception {
        long windowShiftSize = windowSize / 2;
        Timestamp start = startDate;
        Timestamp end = new Timestamp(start.getTime() + windowSize);
        HashSet<String> stopwords = Utils.importStopwords();
        Porter porter = new Porter();
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();
        HashSet<String> users = new TwitterDataLoader(constants).getUsers(inputDir, query, startDate, endDate);
        System.out.println(users.size() + " users");
        DocumentAnalyze documentAnalyzer = new DocumentAnalyze(constants);
        start = new Timestamp(start.getTime() - windowShiftSize);
        while (start.before(endDate)) {
            start = new Timestamp(start.getTime() + windowShiftSize);
            end = new Timestamp(start.getTime() + windowSize);
            PrintStream eout = new PrintStream(new File(outputDir + "/" + start.toString().split(" ")[0] + "_" + end.toString().split(" ")[0] + ".txt"));
            DF = new HashMap<String, Double>();
            docs = new HashMap<String, Document>();
            eout.println();
            eout.println("[" + start + " , " + end + "]");
            System.out.println("[" + start + " , " + end + "]");
            (new TwitterDataLoader(constants)).fetchTweetsByUsers(inputDir, docs, stopwords, DF, porter, users, start, end);
            eout.println(docs.size() + " docs are loaded!");
            System.out.println("#docs: " + docs.size());
            ArrayList<DocumentCluster> events = documentAnalyzer.clusterbyKeyGraph(docs, DF);
            for (DocumentCluster dc1 : events) {
                // eventsStream.add(dc1);
                // --- Save and Print
                // DocumentAnalyze.printCluster(dc1, eout, true);
                dc1.serialize(eout, true);
                dc1.docs = null;
            }
            eout.close();
        }
    }
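
    // Pass 2: walks the per-window chunk files in order, merging clusters whose
    // document overlap exceeds 20% of the smaller cluster, and writes the surviving
    // query-related events to outputFile.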
    public static void findEventStreamFromChunks(String inputDir, String query, Timestamp startDate, Timestamp endDate, String outputFile, long windowSize)
            throws Exception {
        PrintStream out = new PrintStream(new File(outputFile));
        long windowShiftSize = windowSize / 2;
        DocumentAnalyze documentAnalyzer = new DocumentAnalyze(constants);
        System.out.println("[" + startDate + " , " + endDate + "]");
        Timestamp start = startDate;
        Timestamp end = new Timestamp(start.getTime() + windowSize);
        String ein = inputDir + "/" + start.toString().split(" ")[0] + "_" + end.toString().split(" ")[0] + ".txt";
        ArrayList<DocumentCluster> lastingEvents = DocumentCluster.deserializeAll(ein, true);
        while (start.before(endDate)) {
            start = new Timestamp(start.getTime() + windowShiftSize);
            end = new Timestamp(start.getTime() + windowSize);
            // keep the path construction consistent with the first chunk above
            ein = inputDir + "/" + start.toString().split(" ")[0] + "_" + end.toString().split(" ")[0] + ".txt";
            System.out.println("[" + start + " , " + end + "]");
            ArrayList<DocumentCluster> events = DocumentCluster.deserializeAll(ein, true);
            // ----------- merge events -----------
            for (DocumentCluster dc1 : lastingEvents) {
                boolean isMerged = false;
                for (DocumentCluster dc2 : events) {
                    if (dc1.docs.size() > 0 && dc2.docs.size() > 0)
                        if (documentAnalyzer.intersectDocs(dc1.docs, dc2.docs) * 1.0 / Math.min(dc1.docs.size(), dc2.docs.size()) > .2) {
                            isMerged = true;
                            dc2.docs.putAll(dc1.docs);
                            dc2.keyGraph = GraphAnalyze.mergeKeyGraphs(dc1.keyGraph, dc2.keyGraph);
                        }
                }
                if (!isMerged) {
                    // --- Save and Print
                    if (dc1.docs.size() > 20 && isRelated(dc1, query))
                        // DocumentAnalyze.printCluster(dc1, out, true);
                        dc1.serialize(out, true);
                    dc1.docs = null;
                }
            }
            lastingEvents = events;
            events = null;
        }
        for (DocumentCluster dc1 : lastingEvents)
            if (isRelated(dc1, query)) {
                // --- Save and Print
                out.println();
                // DocumentAnalyze.printCluster(dc1, out, true);
                dc1.serialize(out, true);
                dc1.docs = null;
            }
        out.close();
        System.out.println("done");
    }
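
    // True if at least one document in the cluster matches the query
    // (via TwitterDataLoader.containsQuery).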
    public static boolean isRelated(DocumentCluster dc, String query) {
        for (Document d : dc.docs.values())
            if (TwitterDataLoader.containsQuery(d.getBody(), query))
                return true;
        return false;
    }
}