/KeyGraph/src/dataset/twitter/TwitterDataLoader.java
package dataset.twitter;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.StringReader;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.StringTokenizer;

import topicDetection.Constants;
import topicDetection.DataLoader;
import topicDetection.Document;
import topicDetection.Keyword;
import topicDetection.Porter;

import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.WordTokenFactory;
import edu.stanford.nlp.trees.Tree;
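
/**
 * Loads tweets from tab-separated dataset files into {@link Document} objects
 * and accumulates per-keyword document frequencies (DF). Variants filter by
 * time window, query terms, or author set; tweets with fewer than
 * DOC_KEYWORDS_SIZE_MIN keywords are discarded.
 */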
public class TwitterDataLoader extends DataLoader {

	public TwitterDataLoader(Constants constants) {
		super(constants);
	}
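
	/**
	 * Adds one occurrence of each of d's keywords to the document-frequency map.
	 * Shared by all fetch methods below.
	 */
	private static void incrementDF(Document d, HashMap<String, Double> DF) {
		for (Keyword k : d.keywords.values())
			if (DF.containsKey(k.baseForm))
				DF.put(k.baseForm, DF.get(k.baseForm) + 1);
			else
				DF.put(k.baseForm, 1.0);
	}

	/**
	 * Loads tweets for the website demo: one tweet per tab-separated line,
	 * with the numeric tweet id in column 0 and the text in column 1.
	 */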
	public void fetchTweets4website(String datasetFile, HashMap<String, Document> docs, HashSet<String> stopwords, HashMap<String, Double> DF, Porter porter,
			boolean removeDuplicates) throws Exception {
		String line = null;
		BufferedReader in = new BufferedReader(new FileReader(datasetFile));
		int i = 0;
		StringDuplicate sd = new StringDuplicate();
		while ((line = in.readLine()) != null) {
			i++;
			if (i % 1000 == 0)
				System.out.println(i);
			String[] tokens = line.split("\t");
			Document d = new Document(Integer.parseInt(tokens[0]) + "");
			String tweet = tokens[1];
			fetchTweetContent(tweet, stopwords, porter, d);
			if (removeDuplicates)
				// mark near-duplicate tweets so they can be filtered downstream
				d.isDuplicate = sd.isDuplicate(tweet);
			if (d.keywords.size() >= constants.DOC_KEYWORDS_SIZE_MIN) {
				docs.put(d.id, d);
				incrementDF(d, DF);
			}
		}
		in.close();
	}
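
	/**
	 * Loads at most 150,000 tweets. Each tab-separated line is expected to carry
	 * the tweet id in column 3 and the text in column 4; column 2 (presumably a
	 * user id) is appended to the document id. Malformed lines are logged and skipped.
	 */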
	public void fetchTweets(String datasetFile, HashMap<String, Document> docs, HashSet<String> stopwords, HashMap<String, Double> DF, Porter porter)
			throws Exception {
		String line = null;
		BufferedReader in = new BufferedReader(new FileReader(datasetFile));
		int i = 0;
		while ((line = in.readLine()) != null)
			try {
				i++;
				if (i % 1000 == 0)
					System.out.println(i);
				if (i == 150000) // hard cap on the number of lines read
					break;
				String[] tokens = line.split("\t");
				// Document d = new Document(i + "");
				Document d = new Document(tokens[3]);
				String tweet = tokens[4];
				// String[] datetokens = tokens[1].split(" ");
				// String date = datetokens[1] + " " + datetokens[2] + ", " + datetokens[5] + " " + datetokens[3];
				// d.publishDate = new Timestamp(Timestamp.parse(date));
				d.id = d.id + "\t" + tokens[2];
				d.setBody(tweet);
				// optional retweet filter:
				// if (tweet.indexOf("RT @") != -1 || tweet.indexOf("rt @") != -1 || tweet.indexOf("RT@") != -1 || tweet.indexOf("rt@") != -1)
				//	continue;
				fetchTweetContent(tweet, stopwords, porter, d);
				if (d.keywords.size() >= constants.DOC_KEYWORDS_SIZE_MIN) {
					docs.put(d.id, d);
					incrementDF(d, DF);
				}
			} catch (Exception e) {
				e.printStackTrace();
				System.out.println(i + ": \"" + line + "\"");
			}
		in.close();
	}
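
	/**
	 * Loads tweets whose publish date falls in [start, end). Assumes the file is
	 * sorted by date: reading stops at the first tweet at or after {@code end}.
	 * Column 0 holds the tweet id, column 1 a "yyyy-mm-dd hh:mm:ss" timestamp,
	 * and column 4 the text.
	 */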
	public void fetchTweets(String datasetFile, HashMap<String, Document> docs, HashSet<String> stopwords, HashMap<String, Double> DF, Porter porter,
			Timestamp start, Timestamp end) throws Exception {
		String line = null;
		BufferedReader in = new BufferedReader(new FileReader(datasetFile));
		int i = 0;
		while ((line = in.readLine()) != null)
			try {
				i++;
				if (i % 10000 == 0)
					System.out.println(i + "->" + docs.size());
				String[] tokens = line.split("\t");
				Document d = new Document(tokens[0]);
				String tweet = tokens[4];
				String date = tokens[1];
				d.publishDate = Timestamp.valueOf(date);
				if (d.publishDate.compareTo(start) < 0)
					continue;
				if (d.publishDate.compareTo(end) >= 0) // input is date-sorted, so stop here
					break;
				// d.id = d.id + "\t" + tokens[2];
				d.setBody(tweet);
				fetchTweetContent(tweet, stopwords, porter, d);
				if (d.keywords.size() >= constants.DOC_KEYWORDS_SIZE_MIN) {
					docs.put(d.id, d);
					incrementDF(d, DF);
				}
			} catch (Exception e) {
				e.printStackTrace();
				System.out.println(i + ": \"" + line + "\"");
			}
		in.close();
	}
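
	/**
	 * Loads tweets matching a query within (startDate, endDate). Column 0 holds a
	 * Twitter-style date (e.g. "Mon Apr 26 00:00:00 +0000 2010"), column 1 the
	 * user id, and column 2 the text. See {@link #containsQuery(String, String)}
	 * for the query syntax.
	 */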
	public void fetchTweetsByQuery(String datasetFile, HashMap<String, Document> docs, HashSet<String> stopwords, HashMap<String, Double> DF, Porter porter,
			String query, Timestamp startDate, Timestamp endDate) throws Exception {
		String line = null;
		BufferedReader in = new BufferedReader(new FileReader(datasetFile));
		int i = 0;
		while ((line = in.readLine()) != null)
			try {
				i++;
				if (i % 100000 == 0)
					System.out.println(i);
				Document d = new Document(i + "");
				String[] tokens = line.split("\t");
				if (tokens.length < 3)
					continue;
				String tweet = tokens[2];
				// reassemble the date as "MMM d, yyyy HH:mm:ss" for Date-style parsing
				String[] datetokens = tokens[0].split(" ");
				String date = datetokens[1] + " " + datetokens[2] + ", " + datetokens[5] + " " + datetokens[3];
				d.publishDate = new Timestamp(Timestamp.parse(date));
				d.id = d.id + "\t" + tokens[1];
				if (startDate.before(d.publishDate) && endDate.after(d.publishDate) && containsQuery(tweet, query)) {
					fetchTweetContent(tweet, stopwords, porter, d);
					if (d.keywords.size() >= constants.DOC_KEYWORDS_SIZE_MIN) {
						docs.put(d.id, d);
						incrementDF(d, DF);
					}
				}
			} catch (Exception e) {
				e.printStackTrace();
			}
		in.close();
	}
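
	/**
	 * Loads tweets posted by any of the given users within (startDate, endDate).
	 * Uses the same column layout as {@link #fetchTweetsByQuery}.
	 */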
	public void fetchTweetsByUsers(String datasetFile, HashMap<String, Document> docs, HashSet<String> stopwords, HashMap<String, Double> DF, Porter porter,
			HashSet<String> users, Timestamp startDate, Timestamp endDate) throws Exception {
		String line = null;
		BufferedReader in = new BufferedReader(new FileReader(datasetFile));
		int i = 0;
		while ((line = in.readLine()) != null) {
			i++;
			if (i % 100000 == 0)
				System.out.println(i + "->" + docs.size());
			Document d = new Document(i + "");
			String[] tokens = line.split("\t");
			if (tokens.length < 3)
				continue;
			String tweet = tokens[2];
			String userId = tokens[1];
			String[] datetokens = tokens[0].split(" ");
			String date = datetokens[1] + " " + datetokens[2] + ", " + datetokens[5] + " " + datetokens[3];
			d.publishDate = new Timestamp(Timestamp.parse(date));
			d.id = d.id + " " + userId;
			if (startDate.before(d.publishDate) && endDate.after(d.publishDate) && users.contains(userId)) {
				fetchTweetContent(tweet, stopwords, porter, d);
				if (d.keywords.size() >= constants.DOC_KEYWORDS_SIZE_MIN) {
					docs.put(d.id, d);
					incrementDF(d, DF);
				}
			}
		}
		in.close();
	}
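
	/**
	 * First pass for user-based collection: returns the ids of users who posted
	 * at least one tweet matching the query within (startDate, endDate).
	 */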
	public HashSet<String> getUsers(String datasetFile, String query, Timestamp startDate, Timestamp endDate) throws Exception {
		HashSet<String> users = new HashSet<String>();
		String line = null;
		BufferedReader in = new BufferedReader(new FileReader(datasetFile));
		int i = 0;
		while ((line = in.readLine()) != null) {
			i++;
			if (i % 100000 == 0)
				System.out.println(i);
			String[] tokens = line.split("\t");
			if (tokens.length < 3)
				continue;
			String tweet = tokens[2];
			String[] datetokens = tokens[0].split(" ");
			String date = datetokens[1] + " " + datetokens[2] + ", " + datetokens[5] + " " + datetokens[3];
			Timestamp publishDate = new Timestamp(Timestamp.parse(date));
			if (startDate.before(publishDate) && endDate.after(publishDate) && containsQuery(tweet, query))
				users.add(tokens[1]);
		}
		in.close();
		return users;
	}
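
	/**
	 * Case-insensitive boolean match: '+' separates required groups, spaces
	 * separate alternatives within a group, and '_' stands for a space inside a
	 * phrase. E.g. "world_cup+goal score" requires the text to contain
	 * "world cup" and at least one of "goal" or "score".
	 */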
	public static boolean containsQuery(String text, String query) {
		text = text.toLowerCase();
		query = query.toLowerCase();
		String[] mustTokens = query.split("\\+");
		for (String mustToken : mustTokens) {
			boolean includes = false;
			String[] mightTokens = mustToken.split(" ");
			for (String mightToken : mightTokens)
				if (text.indexOf(mightToken.trim().replaceAll("_", " ")) != -1) {
					includes = true;
					break;
				}
			if (!includes)
				return false;
		}
		return true;
	}
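
	/**
	 * Extracts keywords from a tweet: noun phrases from the Stanford parser
	 * (when KEYWORDS_2_ENABLE is set) and/or stemmed unigrams from the raw text
	 * (when TEXT_ENABLE is set). Keywords containing '@' (user mentions) are
	 * removed afterwards.
	 */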
	public void fetchTweetContent(String content, HashSet<String> stopwords, Porter porter, Document d) {
		if (constants.KEYWORDS_2_ENABLE)
			loadDocumentKeyFile(getNounPhrases(content), stopwords, porter, d, constants.KEYWORDS_2_WEIGHT);
		if (constants.TEXT_ENABLE)
			fetchTweetText(content, stopwords, porter, d, constants.TEXT_WEIGHT);
		// drop keywords that contain user mentions
		ArrayList<String> toRemove = new ArrayList<String>();
		for (Keyword k : d.keywords.values())
			if (k.word.contains("@"))
				toRemove.add(k.baseForm);
		for (String base : toRemove)
			d.keywords.remove(base);
	}
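
	/**
	 * Tokenizes the tweet text (after stripping URLs), stems each token longer
	 * than two characters that is not a stopword, and adds it to the document's
	 * keywords with term frequency scaled by {@code BoostRate}.
	 */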
	public void fetchTweetText(String content, HashSet<String> stopwords, Porter porter, Document d, double BoostRate) {
		// strip http(s):// URLs (tolerating backslashes) before tokenizing
		content = content.replaceAll("[hH][tT][tT][pP][s]?:[\\\\/][\\\\/][^ ]*\\b", " ");
		StringTokenizer st = new StringTokenizer(content, "!?|\"' -_0123456789.,;#$&%/\\*()<>\t");
		d.setBody(content);
		while (st.hasMoreTokens()) {
			String word = st.nextToken();
			String token = word.toLowerCase();
			double tf = 1 * BoostRate;
			String base = "";
			if (token.indexOf("?") == -1 && token.length() > 2 && !stopwords.contains(token))
				base = porter.stripAffixes(token);
			if (base.length() > 2)
				if (!d.keywords.containsKey(base))
					d.keywords.put(base, new Keyword(base, word, tf, 1, 0));
				else
					d.keywords.get(base).tf += tf;
		}
	}
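
	/**
	 * Returns all adjacent word pairs ("bigrams") in the text, ignoring digits,
	 * punctuation, and single-character tokens.
	 */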
	public static ArrayList<String> getBigrams(String content) {
		ArrayList<String> res = new ArrayList<String>();
		StringTokenizer st = new StringTokenizer(content, "!?|\"' -_@0123456789.,;#$&%/\\*()<>\t");
		String prev = null;
		while (st.hasMoreTokens()) {
			String term = st.nextToken();
			if (term.length() > 1) {
				if (prev != null)
					res.add(prev + " " + term);
				prev = term;
			}
		}
		return res;
	}
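
	// Stanford parser state, created lazily on first use (not thread-safe).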
	static LexicalizedParser lp = null;
	static TokenizerFactory tf = null;
	// TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
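
	/**
	 * Splits the text into rough sentences, parses each with the Stanford
	 * lexicalized parser (the "englishPCFG.ser.gz" model must be available on
	 * the working path), and collects the lowest noun phrases of each parse tree.
	 */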
	public static ArrayList<String> getNounPhrases(String inputContent) {
		if (tf == null)
			tf = PTBTokenizer.factory(false, new WordTokenFactory());
		if (lp == null)
			lp = new LexicalizedParser("englishPCFG.ser.gz");
		ArrayList<String> res = new ArrayList<String>();
		String content = inputContent.replaceAll("[hH][tT][tT][pP][s]?:[\\\\/][\\\\/][^ ]*\\b", " ");
		try {
			// note: StringTokenizer takes a set of delimiter characters, not a regex
			StringTokenizer sentences = new StringTokenizer(content, "!?;.");
			while (sentences.hasMoreElements()) {
				String sentence = sentences.nextToken();
				if (sentence.trim().length() == 0)
					continue;
				List tokens = tf.getTokenizer(new StringReader(sentence)).tokenize();
				lp.parse(tokens); // parse the token sequence
				Tree t = lp.getBestParse(); // best parse tree for the sentence
				res.addAll(getNounPhrases(t));
				// tp.printTree(t); // print the parse tree
			}
		} catch (Exception e) {
			System.err.println("ERROR: " + e.getMessage());
		}
		return res;
	}
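
	/**
	 * Recursively collects the lowest (innermost) NP nodes in a parse tree: a
	 * node's own label is only considered when no NP was found in its subtrees.
	 */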
	public static ArrayList<String> getNounPhrases(Tree t) {
		ArrayList<String> res = new ArrayList<String>();
		if (!t.isLeaf())
			for (Tree child : t.getChildrenAsList()) {
				ArrayList<String> childRes = getNounPhrases(child);
				if (childRes.size() > 0)
					res.addAll(childRes);
			}
		if (res.size() == 0 && t.label().value().equals("NP")) {
			StringBuilder text = new StringBuilder();
			for (Tree tt : t.getLeaves())
				text.append(" ").append(tt.value());
			res.add(text.toString().trim());
		}
		return res;
	}
}