/KeyGraph/src/dataset/twitter/MainTwitter.java
package dataset.twitter;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.PrintStream;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;

import topicDetection.Constants;
import topicDetection.Document;
import topicDetection.DocumentAnalyze;
import topicDetection.DocumentCluster;
import topicDetection.GraphAnalyze;
import topicDetection.Porter;
import topicDetection.Utils;

// Referenced classes of package dataset.twitter:
//     TwitterDataLoader

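/**
 * Command-line entry point for KeyGraph-based event detection over Twitter
 * datasets: it loads tweets with TwitterDataLoader, clusters them with
 * DocumentAnalyze.clusterbyKeyGraph, and writes the resulting document
 * clusters (events), either in a single pass or over sliding time windows.
 */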
public class MainTwitter {

    public static String host = null;
    public static Constants constants;

    public MainTwitter() {
    }

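    /**
     * Dispatches to one of the run modes based on args[0] and prints the usage
     * string when the arguments do not match any mode. Configuration is read
     * from conf/TwitterConstants.txt before dispatching.
     *
     * Illustrative invocations (file names are placeholders, not shipped data):
     *   java dataset.twitter.MainTwitter run tweets.txt events.out
     *   java dataset.twitter.MainTwitter runQuery tweets.txt "local natives" 2011-01-01 2011-02-01 events.out
     */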
    public static void main(String args[]) throws Exception {
        // String in="RT @localnatives: A friend of ours found this inside an elliot smith book in a bookstore in Brooklyn http://plixi.com/p/50287064";
        // System.out.println(in.replaceAll("[hH][tT][tT][pP][s]?:[\\\\/][\\\\/][^ ]*\\b", " "));
        // System.exit(0);
        constants = new Constants(new DataInputStream(new FileInputStream("conf/TwitterConstants.txt")));
        // createDataset("/fs/clip-clip-proj/SMSite2010/BB(r)_v2/Data1/Ego_dataset_1/Data/TweetsUserContent.txt",
        // //
        // createDataset("/fs/clip-clip-proj/SMSite2010/BB(r)_v2/Data1/Ego_dataset_2/Data/protocol2_backgroundTweets.txt",
        // "local+natives", Timestamp.valueOf("2000-01-01 12:01:01"),
        // Timestamp.valueOf("2020-01-01 12:01:01"),
        // "localNatives.tweets1.txt");

        // runSlidingWindow("usdebt2011_firstWeek.txt",
        // "usdebt2011_firstWeek.txt_events");
        // System.exit(0);
        // args=new String[]{"run4website",
        // "summerSearch-sxsw.txt","summerSearch-sxsw.txt.out"};
        String usage = "Usage:\n\t1. run inputDatasetFile outputFile\n" + "\t2. run4website inputDatasetFile outputFile\n"
                + "\t3. runQuery inputDatasetFile query startDate(yyyy-mm-dd) endDate(yyyy-mm-dd) outputFile\n"
                + "\t4. runQueryStream inputDatasetFile query startDate(yyyy-mm-dd) endDate(yyyy-mm-dd) outputFile\n";
        try {

            if (args[0].equals("run4website")) {
                run4website(args[1], args[2]);
            } else if (args[0].equals("run")) {
                run(args[1], args[2]);
            } else if (args[0].equals("runQuery")) {
                // args[5] is the output file; the dates in args[3] and args[4] are expanded to noon timestamps.
                runQuery(args[1], args[2].replaceAll("\"", " ").trim(), Timestamp.valueOf(args[3] + " 12:00:00"), Timestamp.valueOf(args[4] + " 12:00:00"),
                        args[5]);
            } else if (args[0].equals("runQueryStream")) {
                runQueryStreamInFiles(args[1], args[2].replaceAll("\"", " ").trim(), Timestamp.valueOf(args[3] + " 12:00:00"),
                        Timestamp.valueOf(args[4] + " 12:00:00"), args[5]);
            } else
                System.out.println(usage);
        } catch (Exception e) {
            System.out.println(usage);
            e.printStackTrace();
        }
    }

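    /**
     * Filters a raw tab-separated tweet file down to the lines whose tweet text
     * (column 2) matches the query and whose timestamp (column 0) falls between
     * startDate and endDate, writing the matching lines to outputFile.
     */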
    public static void createDataset(String inputFile, String query, Timestamp startDate, Timestamp endDate, String outputFile) throws Exception {
        String line = null;
        BufferedReader in = new BufferedReader(new FileReader(inputFile));
        BufferedWriter out = new BufferedWriter(new FileWriter(outputFile));
        int i = 0;
        while ((line = in.readLine()) != null)
            try {
                i++;
                if (i % 100000 == 0)
                    System.out.println(i);
                String[] tokens = line.split("\t");
                if (tokens.length < 3)
                    continue;
                String tweet = tokens[2];
                // Rebuild a parsable date string ("Mon dd, yyyy HH:mm:ss") from the space-separated timestamp fields in column 0.
                String[] datetokens = tokens[0].split(" ");
                String date = datetokens[1] + " " + datetokens[2] + ", " + datetokens[5] + " " + datetokens[3];
                Timestamp publishDate = new Timestamp(Timestamp.parse(date));
                if (startDate.before(publishDate) && endDate.after(publishDate) && TwitterDataLoader.containsQuery(tweet, query)) {
                    out.write(line + "\n");
                }

            } catch (Exception e) {
                e.printStackTrace();
            }
        in.close();
        out.close();

    }

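    /**
     * Loads tweets from inputFile (optionally removing duplicates per
     * constants.REMOVE_DUPLICATES), clusters them with KeyGraph, and writes the
     * clusters in the website output format via printClustersForTheWebsite.
     */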
    public static void run4website(String inputFile, String outputFile) throws Exception {
        // PrintStream out = new PrintStream(new File(outputFile));
        HashSet<String> stopwords = Utils.importStopwords();
        Porter porter = new Porter();
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();

        (new TwitterDataLoader(constants)).fetchTweets4website(inputFile, docs, stopwords, DF, porter, constants.REMOVE_DUPLICATES);

        System.out.println("#docs: " + docs.size());
        ArrayList<DocumentCluster> clusters = (new DocumentAnalyze(constants)).clusterbyKeyGraph(docs, DF);
        DocumentAnalyze.printClustersForTheWebsite(clusters, outputFile);
        // out.close();
    }

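    /**
     * Convenience wrapper: runs runQuery over the whole dataset with no query
     * and no date restriction.
     */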
    public static void run(String inputFile, String outputFile) throws Exception {
        runQuery(inputFile, null, null, null, outputFile);
    }

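    /**
     * Loads tweets (all of them when query is null, otherwise only those
     * matching the query between startDate and endDate), clusters them with
     * KeyGraph, and serializes the resulting clusters to outputFile.
     */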
    public static void runQuery(String inputFile, String query, Timestamp startDate, Timestamp endDate, String outputFile) throws Exception {
        PrintStream out = new PrintStream(new File(outputFile));
        HashSet<String> stopwords = Utils.importStopwords();
        Porter porter = new Porter();
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();
        if (query == null)
            (new TwitterDataLoader(constants)).fetchTweets(inputFile, docs, stopwords, DF, porter);
        else
            (new TwitterDataLoader(constants)).fetchTweetsByQuery(inputFile, docs, stopwords, DF, porter, query, startDate, endDate);

        System.out.println("#docs: " + docs.size());
        ArrayList<DocumentCluster> clusters = (new DocumentAnalyze(constants)).clusterbyKeyGraph(docs, DF);
        // DocumentAnalyze.printClusters(clusters, out, true);
        DocumentCluster.serializeAll(clusters, out, true);
        out.close();
    }

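    /**
     * Detects events over a hard-coded time range (2011-06-25 to 2011-07-29)
     * with a sliding window: each window is clustered independently, a cluster
     * that shares more than 3 documents with a cluster from the previous window
     * is merged into it, and clusters that stop growing are flushed to a
     * temporary file. The accumulated events are then rendered with
     * printClustersForTheWebsite.
     */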
    public static void runSlidingWindow(String inputFile, String outputFile) throws Exception {
        Timestamp start = Timestamp.valueOf("2011-06-25 18:00:00");
        Timestamp end = Timestamp.valueOf("2011-07-29 11:00:00");
        int windowShiftSize = 5;

        HashSet<String> stopwords = Utils.importStopwords();
        Porter porter = new Porter();

        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();
        Timestamp windowStartTime = start;
        Timestamp windowEndTime = new Timestamp(windowStartTime.getTime() + windowShiftSize * 2 * 24 * 60 * 60 * 1000);
        (new TwitterDataLoader(constants)).fetchTweets(inputFile, docs, stopwords, DF, porter, windowStartTime, windowEndTime);
        DocumentAnalyze documentAnalyze = new DocumentAnalyze(constants);
        ArrayList<DocumentCluster> lastEvents = documentAnalyze.clusterbyKeyGraph(docs, DF);

        PrintStream out = new PrintStream(new File(outputFile + ".tmp"));
        // Drop bulky per-document fields; only the document ids are needed when merging windows.
        for (Document d : docs.values()) {
            d.publishDate = null;
            d.setBody(null);
            d.setTitle(null);
        }

        while (windowEndTime.before(end)) {
            DF = new HashMap<String, Double>();
            docs = new HashMap<String, Document>();
            out.println();
            windowStartTime = new Timestamp(windowStartTime.getTime() + windowShiftSize * 24 * 60 * 60 * 1000);
            windowEndTime = new Timestamp(windowEndTime.getTime() + windowShiftSize * 24 * 60 * 60 * 1000);
            System.out.println(windowStartTime + "\t" + windowEndTime);
            (new TwitterDataLoader(constants)).fetchTweets(inputFile, docs, stopwords, DF, porter, windowStartTime, windowEndTime);
            System.out.println(docs.size() + " docs are loaded!");
            ArrayList<DocumentCluster> events = documentAnalyze.clusterbyKeyGraph(docs, DF);
            for (Document d : docs.values()) {
                d.publishDate = null;
                d.setBody(null);
                d.setTitle(null);
            }
            // ----------- merge events -----------
            for (DocumentCluster dc1 : lastEvents) {
                boolean isMerged = false;
                for (DocumentCluster dc2 : events)
                    if (documentAnalyze.intersectDocs(dc1.docs, dc2.docs) > 3) {
                        isMerged = true;
                        dc2.docs.putAll(dc1.docs);
                        dc2.keyGraph.putAll(dc1.keyGraph);
                    }
                if (!isMerged) {
                    // eventsStream.add(dc1);
                    // --- Save and Print
                    // DocumentAnalyze.printCluster(dc1, out, true);
                    dc1.serialize(out, true);
                    dc1.docs = null;
                }
            }
            lastEvents = events;
            events = null;
        }
        for (DocumentCluster dc1 : lastEvents) {
            // eventsStream.add(dc1);
            // --- Save and Print
            out.println();
            // DocumentAnalyze.printCluster(dc1, out, true);
            dc1.serialize(out, true);

            dc1.docs = null;
        }

        out.close();
        ArrayList<DocumentCluster> allEvents = DocumentCluster.deserializeAll(outputFile + ".tmp", true);
        DocumentAnalyze.printClustersForTheWebsite(allEvents, outputFile);
        System.out.println("done");
    }

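    /**
     * Streaming variant of runQuery: collects the users who posted matching
     * tweets, then slides a two-month window (shifted by one month) over their
     * tweets, clustering each window and merging clusters that share more than
     * 3 documents with the previous window before serializing finished events.
     */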
    public static void runQueryStream(String inputFile, String query, Timestamp startDate, Timestamp endDate, String outputFile) throws Exception {

        PrintStream out = new PrintStream(new File(outputFile));
        long windowSize = (long) 2 * 30 * 24 * 60 * 60 * 1000;
        long windowShiftSize = windowSize / 2;

        Timestamp start = startDate;
        Timestamp end = new Timestamp(start.getTime() + windowSize);

        HashSet<String> stopwords = Utils.importStopwords();
        Porter porter = new Porter();
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();

        HashSet<String> users = new TwitterDataLoader(constants).getUsers(inputFile, query, startDate, endDate);
        System.out.println(users.size() + " users");

        DocumentAnalyze documentAnalyzer = new DocumentAnalyze(constants);
        ArrayList<DocumentCluster> lastEvents = documentAnalyzer.clusterbyKeyGraph(docs, DF);

        start = new Timestamp(start.getTime() - windowShiftSize);
        while (start.before(endDate)) {
            start = new Timestamp(start.getTime() + windowShiftSize);
            end = new Timestamp(start.getTime() + windowSize);

            DF = new HashMap<String, Double>();
            docs = new HashMap<String, Document>();
            out.println();
            out.println("[" + start + " , " + end + "]");
            System.out.println("[" + start + " , " + end + "]");

            (new TwitterDataLoader(constants)).fetchTweetsByUsers(inputFile, docs, stopwords, DF, porter, users, start, end);
            out.println(docs.size() + " docs are loaded!");
            System.out.println("#docs: " + docs.size());
            ArrayList<DocumentCluster> events = documentAnalyzer.clusterbyKeyGraph(docs, DF);

            // ----------- merge events -----------
            for (DocumentCluster dc1 : lastEvents) {
                boolean isMerged = false;
                for (DocumentCluster dc2 : events)
                    if (documentAnalyzer.intersectDocs(dc1.docs, dc2.docs) > 3) {
                        isMerged = true;
                        dc2.docs.putAll(dc1.docs);
                        dc2.keyGraph.putAll(dc1.keyGraph);
                    }
                if (!isMerged) {
                    // eventsStream.add(dc1);
                    // --- Save and Print
                    // DocumentAnalyze.printCluster(dc1, out);
                    dc1.serialize(out, true);
                    dc1.docs = null;
                }
            }
            lastEvents = events;
            events = null;
        }
        for (DocumentCluster dc1 : lastEvents) {
            // eventsStream.add(dc1);
            // --- Save and Print
            out.println();
            // DocumentAnalyze.printCluster(dc1, out);
            dc1.serialize(out, true);
            dc1.docs = null;
        }
        out.close();
        System.out.println("done");
    }

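    /**
     * File-based streaming pipeline: writes per-window event chunks under
     * tweetEvents/&lt;query&gt;/ (with '+' and '_' stripped from the query) via
     * findEventChunks, then stitches the chunks into one event stream with
     * findEventStreamFromChunks.
     */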
    public static void runQueryStreamInFiles(String inputFile, String query, Timestamp startDate, Timestamp endDate, String outputFile) throws Exception {
        long windowSize = (long) 6 * 24 * 60 * 60 * 1000;
        String dirname = "tweetEvents/" + query.replaceAll("[+_]", "") + "/";
        // mkdirs() so the parent "tweetEvents" directory is also created if it does not exist yet.
        new File(dirname).mkdirs();
        findEventChunks(inputFile, query, startDate, endDate, dirname, windowSize);
        findEventStreamFromChunks(dirname, query, startDate, endDate, outputFile, windowSize);
    }

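    /**
     * First pass of the file-based stream: for each 6-day window (shifted by
     * 3 days) over the tweets of the users who matched the query, clusters the
     * window and serializes its clusters to outputDir/&lt;windowStart&gt;_&lt;windowEnd&gt;.txt.
     */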
    public static void findEventChunks(String inputDir, String query, Timestamp startDate, Timestamp endDate, String outputDir, long windowSize)
            throws Exception {
        long windowShiftSize = windowSize / 2;

        Timestamp start = startDate;
        Timestamp end = new Timestamp(start.getTime() + windowSize);

        HashSet<String> stopwords = Utils.importStopwords();
        Porter porter = new Porter();
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();

        HashSet<String> users = new TwitterDataLoader(constants).getUsers(inputDir, query, startDate, endDate);
        System.out.println(users.size() + " users");

        DocumentAnalyze documentAnalyzer = new DocumentAnalyze(constants);

        start = new Timestamp(start.getTime() - windowShiftSize);
        while (start.before(endDate)) {

            start = new Timestamp(start.getTime() + windowShiftSize);
            end = new Timestamp(start.getTime() + windowSize);

            PrintStream eout = new PrintStream(new File(outputDir + "/" + start.toString().split(" ")[0] + "_" + end.toString().split(" ")[0] + ".txt"));
            DF = new HashMap<String, Double>();
            docs = new HashMap<String, Document>();
            eout.println();
            eout.println("[" + start + " , " + end + "]");
            System.out.println("[" + start + " , " + end + "]");

            (new TwitterDataLoader(constants)).fetchTweetsByUsers(inputDir, docs, stopwords, DF, porter, users, start, end);
            eout.println(docs.size() + " docs are loaded!");
            System.out.println("#docs: " + docs.size());
            ArrayList<DocumentCluster> events = documentAnalyzer.clusterbyKeyGraph(docs, DF);

            for (DocumentCluster dc1 : events) {
                // eventsStream.add(dc1);
                // --- Save and Print

                // DocumentAnalyze.printCluster(dc1, eout, true);
                dc1.serialize(eout, true);
                dc1.docs = null;
            }
            eout.close();
        }

    }

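    /**
     * Second pass of the file-based stream: reads the per-window cluster files
     * written by findEventChunks, merges clusters across consecutive windows
     * when they share more than 20% of the smaller cluster's documents, and
     * writes a finished cluster to outputFile once it has more than 20
     * documents and still mentions the query (clusters remaining at the end
     * only need to mention the query).
     */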
    public static void findEventStreamFromChunks(String inputDir, String query, Timestamp startDate, Timestamp endDate, String outputFile, long windowSize)
            throws Exception {
        PrintStream out = new PrintStream(new File(outputFile));
        long windowShiftSize = windowSize / 2;
        DocumentAnalyze documentAnalyzer = new DocumentAnalyze(constants);

        System.out.println("[" + startDate + " , " + endDate + "]");
        Timestamp start = startDate;
        Timestamp end = new Timestamp(start.getTime() + windowSize);
        String ein = inputDir + "/" + start.toString().split(" ")[0] + "_" + end.toString().split(" ")[0] + ".txt";
        ArrayList<DocumentCluster> lastingEvents = DocumentCluster.deserializeAll(ein, true);
        while (start.before(endDate)) {
            start = new Timestamp(start.getTime() + windowShiftSize);
            end = new Timestamp(start.getTime() + windowSize);

            ein = inputDir + "/" + start.toString().split(" ")[0] + "_" + end.toString().split(" ")[0] + ".txt";
            System.out.println("[" + start + " , " + end + "]");

            ArrayList<DocumentCluster> events = DocumentCluster.deserializeAll(ein, true);

            // ----------- merge events -----------
            for (DocumentCluster dc1 : lastingEvents) {
                boolean isMerged = false;
                for (DocumentCluster dc2 : events) {
                    if (dc1.docs.size() > 0 && dc2.docs.size() > 0)
                        if (documentAnalyzer.intersectDocs(dc1.docs, dc2.docs) * 1.0 / Math.min(dc1.docs.size(), dc2.docs.size()) > .2) {
                            isMerged = true;
                            dc2.docs.putAll(dc1.docs);
                            dc2.keyGraph = GraphAnalyze.mergeKeyGraphs(dc1.keyGraph, dc2.keyGraph);
                        }
                }
                if (!isMerged) {
                    // --- Save and Print
                    if (dc1.docs.size() > 20 && isRelated(dc1, query))
                        // DocumentAnalyze.printCluster(dc1, out, true);
                        dc1.serialize(out, true);
                    dc1.docs = null;
                }
            }
            lastingEvents = events;
            events = null;
        }
        for (DocumentCluster dc1 : lastingEvents)
            if (isRelated(dc1, query)) {
                // --- Save and Print
                out.println();
                // DocumentAnalyze.printCluster(dc1, out, true);
                dc1.serialize(out, true);
                dc1.docs = null;
            }
        out.close();
        System.out.println("done");
    }

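    /**
     * Returns true if at least one document in the cluster still contains the
     * query text (as judged by TwitterDataLoader.containsQuery).
     */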
    public static boolean isRelated(DocumentCluster dc, String query) {
        for (Document d : dc.docs.values())
            if (TwitterDataLoader.containsQuery(d.getBody(), query))
                return true;
        return false;
    }

}