/KeyGraph/src/topicDetection/DocumentAnalyze.java
package topicDetection;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.PrintStream;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;

public class DocumentAnalyze {
    Constants constants;

    public DocumentAnalyze(Constants cons) {
        constants = cons;
    }

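    /**
     * Builds document clusters from the output of an external LDA run.
     * The expected line format of "results-<model>.txt" is inferred from the
     * parsing below: whitespace-separated columns where column 1 is the
     * document path, column 2 is the cluster id, and column 3 is a score
     * that is echoed for debugging. For mallet models, topic keywords are
     * additionally read from "results-<model>.topickeys.txt" (tab-separated:
     * topic id, weight, space-separated keywords).
     */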
    public ArrayList<DocumentCluster> clusterByLDA(HashMap<String, Document> docs, HashMap<String, Double> DF, String model) {
        ArrayList<DocumentCluster> documentClusters = new ArrayList<DocumentCluster>();
        try {

            HashMap<Integer, DocumentCluster> clusters = new HashMap<Integer, DocumentCluster>();

            BufferedReader in = new BufferedReader(new FileReader("results-" + model + ".txt"));
            in.readLine();
            String line = null;
            while ((line = in.readLine()) != null) {
                String[] tokens = line.split(" ");
                String docid = tokens[1];
                if (docid.lastIndexOf("/") != -1)
                    docid = docid.substring(docid.lastIndexOf("/") + 1);
                docid = docid.substring(0, docid.lastIndexOf("."));
                Integer clusterid = Integer.parseInt(tokens[2]);
                Document d = docs.get(docid);
                if (d == null) {
                    d = new Document(docid);
                    docs.put(d.id, d);
                }
                // System.out.println(docid + "\t" + d + "\t" + clusterid);
                System.out.print(tokens[3] + "\t");
                if (!clusters.containsKey(clusterid))
                    clusters.put(clusterid, new DocumentCluster());
                clusters.get(clusterid).docs.put(d.id, d);
            }
            in.close();
            System.out.println();
            if (model.startsWith("mallet")) {

                in = new BufferedReader(new FileReader("results-" + model + ".topickeys.txt"));
                // in.readLine();
                line = null;
                while ((line = in.readLine()) != null) {
                    String[] tokens = line.split("\t");
                    DocumentCluster dc = clusters.get(Integer.parseInt(tokens[0]));
                    dc.keyGraph = new HashMap<String, Node>();
                    for (String keyword : tokens[2].split(" "))
                        dc.keyGraph.put(keyword, new Node(new Keyword(keyword, keyword, 0, 0, 0)));
                }
                in.close();
            }
            for (Integer clusterid : clusters.keySet())
                documentClusters.add(clusters.get(clusterid));

        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println("Document Clusters (final) :::::::::" + documentClusters.size());

        return documentClusters;
    }

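    /**
     * End-to-end KeyGraph clustering: build the keyword co-occurrence graph,
     * compute per-document TF-IDF vector lengths, extract keyword communities,
     * and convert each community into a document cluster.
     */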
    public ArrayList<DocumentCluster> clusterbyKeyGraph(HashMap<String, Document> docs, HashMap<String, Double> DF) {
        GraphAnalyze g = new GraphAnalyze(constants);
        g.buildGraph(docs, DF, constants.REMOVE_DUPLICATES);
        // g.printGraph(g.graphNodes);
        ComputeDocumentVectorSize(docs, DF, g.graphNodes);
        ArrayList<HashMap<String, Node>> communities = g.extractCommunities(g.graphNodes);
        System.out.println("Keyword communities: " + communities.size());
        return extractClustersFromKeyCommunity(docs, communities, DF, docs.size(), g.graphNodes);
    }

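    /**
     * Precomputes each document's vector length for cosine similarity: the L2
     * norm of its TF-IDF vector, restricted to keywords that appear as nodes
     * in the keyword graph, i.e.
     * |d| = sqrt(sum over keywords k of (tf(k) * idf(k))^2).
     */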
    public void ComputeDocumentVectorSize(HashMap<String, Document> docs, HashMap<String, Double> DF, HashMap<String, Node> graphNodes) {
        // ArrayList<String> toRemove=new ArrayList<String>();
        for (Document d : docs.values()) {
            d.vectorSize = 0;
            for (Keyword k : d.keywords.values())
                if (graphNodes.containsKey(k.baseForm))
                    d.vectorSize += Math.pow(TFIDF(k.tf, idf(DF.get(k.baseForm), docs.size())), 2);
            d.vectorSize = Math.sqrt(d.vectorSize);

            // if(d.vectorSize==0)
            // toRemove.add(d.id);
        }

        // for(String id: toRemove)
        // docs.remove(id);
    }

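    /**
     * Plain TF-IDF product with a zero guard. Worked example: with tf = 3,
     * corpus size 16 and document frequency 2, idf = log2(16 / 2) = 3, so
     * TFIDF(3, 3) = 9.
     */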
    public double TFIDF(double tf, double idf) {
        if (tf == 0 || idf == 0)
            return 0;
        return tf * idf;
    }

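    /**
     * Turns each keyword community into a document cluster: a document joins
     * every cluster whose keyword community it is sufficiently similar to
     * (cosine similarity above DOC_SIM2KEYGRAPH_MIN); clusters smaller than
     * TOPIC_MIN_SIZE are dropped, overlapping clusters are merged, and
     * optionally each remaining document is reduced to its single best
     * cluster by hardClustering.
     */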
    public ArrayList<DocumentCluster> extractClustersFromKeyCommunity(HashMap<String, Document> docs, ArrayList<HashMap<String, Node>> communities,
            HashMap<String, Double> DF, int docSize, HashMap<String, Node> graphNodes) {

        ArrayList<DocumentCluster> documentClusters = new ArrayList<DocumentCluster>();
        for (HashMap<String, Node> c : communities) {
            DocumentCluster dc = new DocumentCluster();
            // -- find related documents -----------
            dc.keyGraph = c;
            for (Node n : c.values())
                for (Document d : n.keyword.documents.values())
                    if (!dc.docs.containsKey(d.id)) {
                        double cosineSimilarity = cosineSimilarity(c, d, DF, docSize);
                        if (cosineSimilarity > constants.DOC_SIM2KEYGRAPH_MIN) {
                            dc.docs.put(d.id, d);
                            dc.similarities.put(d.id, cosineSimilarity);
                        }
                    }

            // -- filter clusters -------------
            // dc.variance = variance(dc, DF, docSize, graphNodes);
            // if (dc.centroid.vectorSize == 0 || dc.variance <=
            // constants.CLUSTER_VAR_MAX)
            { // braces left over from the commented-out filter condition above
                ArrayList<String> toRemove = new ArrayList<String>();
                // System.out.println("\n****** Community #" +
                // documentClusters.size());
                // printKeywords(dc);
                // for (Document d : dc.docs.values()) {
                // if (cosineSimilarity(dc.centroid, d, DF, docSize) <
                // constants.DOC_SIM2CENTROID_MIN)
                // toRemove.add(d.id);
                // // else
                // // System.out.println(d.topic + ": " + d.id);
                // }
                // -- time based filtering -----------
                // if (dc.docs.size() > 0){
                // DocumentCluster[] dcs = filterTimeBased(dc, toRemove);
                // // if(dcs[0].docs.size() >= constants.TOPIC_MIN_SIZE)
                // // documentClusters.add(dcs[0]);
                // // if(dcs[1].docs.size() >= constants.TOPIC_MIN_SIZE)
                // // documentClusters.add(dcs[1]);
                // }
                if (dc.docs.size() - toRemove.size() >= constants.TOPIC_MIN_SIZE) {
                    documentClusters.add(dc);
                    for (String id : toRemove) {
                        dc.docs.remove(id);
                        dc.similarities.remove(id);
                    }
                }

            }
        }

        System.out.println("Keyword Communities :::::::::" + communities.size());
        System.out.println("Document Clusters (initial) :::::::::" + documentClusters.size());
        // printClusters(documentClusters);
        mergeSimilarClusters(documentClusters);
        // printClusters(documentClusters);

        if (constants.HARD_CLUSTERING)
            hardClustering(docs, DF, docSize, documentClusters);

        System.out.println("Document Clusters (final) :::::::::" + documentClusters.size());

        return documentClusters;
    }

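    /**
     * Variant of the above that assigns each document only to its single most
     * similar community up front. Note that it reads dc.similarities, which
     * nothing in this method populates; the commented-out cosineSimilarity
     * call suggests the intended source of the scores.
     */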
    public ArrayList<DocumentCluster> extractClustersFromKeyCommunity2(HashMap<String, Document> docs, ArrayList<HashMap<String, Node>> communities,
            HashMap<String, Double> DF, int docSize, HashMap<String, Node> graphNodes) {

        ArrayList<DocumentCluster> tmpdocumentClusters = new ArrayList<DocumentCluster>();
        ArrayList<DocumentCluster> documentClusters = new ArrayList<DocumentCluster>();

        for (HashMap<String, Node> c : communities) {
            DocumentCluster dc = new DocumentCluster();
            dc.keyGraph = c;
            tmpdocumentClusters.add(dc);
        }

        for (Document d : docs.values()) {
            double maxSim = 0;
            DocumentCluster maxDC = null;
            for (DocumentCluster dc : tmpdocumentClusters) {
                // double sim = cosineSimilarity(dc.keyGraph, d, DF, docSize);
                Double sim = dc.similarities.get(d.id); // null when no score has been stored for this document
                if (sim != null && sim > maxSim) {
                    System.out.println("sim:: " + sim);
                    maxSim = sim;
                    maxDC = dc;
                }
            }
            if (maxSim > constants.DOC_SIM2KEYGRAPH_MIN)
                maxDC.docs.put(d.id, d);
        }

        for (DocumentCluster dc : tmpdocumentClusters)
            // -- filter clusters -------------
            // dc.variance = variance(dc, DF, docSize, graphNodes);
            // if (dc.centroid.vectorSize == 0 || dc.variance <=
            // constants.CLUSTER_VAR_MAX)
            { // braces left over from the commented-out filter condition above
                ArrayList<String> toRemove = new ArrayList<String>();
                // System.out.println("\n****** Community #" +
                // documentClusters.size());
                // printKeywords(dc);
                // for (Document d : dc.docs.values()) {
                // if (cosineSimilarity(dc.centroid, d, DF, docSize) <
                // constants.DOC_SIM2CENTROID_MIN)
                // toRemove.add(d.id);
                // // else
                // // System.out.println(d.topic + ": " + d.id);
                // }
                // -- time based filtering -----------
                // if (dc.docs.size() > 0){
                // DocumentCluster[] dcs = filterTimeBased(dc, toRemove);
                // // if(dcs[0].docs.size() >= constants.TOPIC_MIN_SIZE)
                // // documentClusters.add(dcs[0]);
                // // if(dcs[1].docs.size() >= constants.TOPIC_MIN_SIZE)
                // // documentClusters.add(dcs[1]);
                // }
                if (dc.docs.size() - toRemove.size() >= constants.TOPIC_MIN_SIZE) {
                    documentClusters.add(dc);
                    for (String id : toRemove) {
                        dc.docs.remove(id);
                        dc.similarities.remove(id);
                    }
                }
            }

        System.out.println("Keyword Communities :::::::::" + communities.size());
        System.out.println("Document Clusters (initial) :::::::::" + documentClusters.size());
        // printClusters(documentClusters);
        mergeSimilarClusters(documentClusters);
        // printClusters(documentClusters);

        if (constants.HARD_CLUSTERING)
            hardClustering(docs, DF, docSize, documentClusters);

        System.out.println("Document Clusters (final) :::::::::" + documentClusters.size());

        return documentClusters;
    }

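    /**
     * Time-based outlier filtering: up to five passes recompute the mean
     * publish time of the retained documents and drop any document more than
     * 15 days from that mean. The dropped documents are split into two new
     * clusters, those published before the mean time and those after, both
     * sharing the original keygraph.
     */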
    public DocumentCluster[] filterTimeBased(DocumentCluster dc, ArrayList<String> toRemove) {
        long time = 0;
        HashMap<String, Document> docs = dc.docs;
        for (int i = 0; i < 5 && docs.size() > 0; i++) {
            time = 0;
            toRemove.clear();
            for (Document d : docs.values())
                time += d.publishDate.getTime();
            time /= docs.size();
            docs = new HashMap<String, Document>();
            for (Document d : dc.docs.values())
                if (Math.abs(d.publishDate.getTime() - time) > ((long) 15) * 24 * 60 * 60 * 1000) // 15 days in milliseconds
                    toRemove.add(d.id);
                else
                    docs.put(d.id, d);
        }

        DocumentCluster[] dcs = new DocumentCluster[] { new DocumentCluster(), new DocumentCluster() };
        dcs[0].keyGraph = dc.keyGraph;
        dcs[1].keyGraph = dc.keyGraph;
        for (String id : toRemove) {
            Document doc = dc.docs.get(id);
            if (doc.publishDate.after(new Timestamp(time)))
                dcs[1].docs.put(id, doc);
            else
                dcs[0].docs.put(id, doc);
        }
        return dcs;
    }

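    /**
     * Hard clustering pass: every document left unassigned after the soft
     * phase is attached to the cluster holding its highest stored similarity,
     * provided that similarity clears a relaxed threshold
     * (DOC_SIM2KEYGRAPH_MIN / 3.5); otherwise it is counted as off-topic.
     */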
    private void hardClustering(HashMap<String, Document> docs, HashMap<String, Double> DF, int docSize, ArrayList<DocumentCluster> documentClusters) {
        int offTopicCount = 0;
        for (Document d : docs.values()) {
            boolean isAssigned = false;
            for (DocumentCluster dc : documentClusters)
                if (dc.docs.containsKey(d.id)) {
                    isAssigned = true;
                    break;
                }
            if (!isAssigned) {
                double max_sim = 0;
                DocumentCluster bestDC = null;
                for (DocumentCluster dc : documentClusters)
                    // if (cosineSimilarity(dc.keyGraph, d, DF, docSize) >
                    // max_sim) {
                    if (dc.similarities.containsKey(d.id) && dc.similarities.get(d.id) > max_sim) {
                        max_sim = dc.similarities.get(d.id); // use the stored score so the comparison above stays consistent
                        bestDC = dc;
                    }
                if (max_sim > constants.DOC_SIM2KEYGRAPH_MIN / 3.5)
                    bestDC.docs.put(d.id, d);
                else
                    offTopicCount++;
            }
        }
        System.out.println("Off topic documents: " + offTopicCount + " out of " + docs.size());
    }

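    /**
     * Serializes clusters in a simple line-oriented format: one block per
     * cluster with KEYWORDS, DOCUMENTS, KEYGRAPH_NODES and KEYGRAPH_EDGES
     * records; commas, apostrophes and quotes inside keywords are replaced
     * with spaces so they cannot break the comma-separated fields.
     */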
    public static void printTopics(Collection<DocumentCluster> clusters, PrintStream out) {
        for (DocumentCluster dc : clusters) {
            out.print("KEYWORDS:\t");
            printKeywords(dc, out);
            out.print("\nDOCUMENTS:\t");
            for (Document d : dc.docs.values())
                out.print(d.id + ",");
            out.print("\nKEYGRAPH_NODES:\t");
            for (Node n : dc.keyGraph.values())
                out.print(n.id + ":" + n.keyword.baseForm + ":" + n.keyword.getWord().replaceAll("[,'\"]", " ") + ",");
            out.println("\nKEYGRAPH_EDGES:\t");
            for (Node n : dc.keyGraph.values()) {
                for (Edge e : n.edges.values())
                    if (e.n1.equals(n))
                        out.print(e.n1.id + ":" + e.n1.keyword.baseForm + "-" + e.n2.id + ":" + e.n2.keyword.baseForm + ",");
            }
            out.println("\n");
            // out.println("~" + dc.docs.size() / 10 + "0: " + dc.docs.size() +
            // " docs");

        }

    }

    // public static void printClusters(Collection<DocumentCluster> clusters,
    // PrintStream out) {
    // // printClusters(clusters, out, false);
    // for (DocumentCluster dc : clusters)
    // dc.serialize(out);
    // }

    // public static void printClusters(Collection<DocumentCluster> clusters,
    // PrintStream out, boolean printDocContent) {
    // for (DocumentCluster dc : clusters) {
    // printCluster(dc, out, printDocContent);
    // }
    // }

    // public static void printCluster(DocumentCluster dc, PrintStream out) {
    // printCluster(dc, out, false);
    // }

    // public static void printCluster(DocumentCluster dc, PrintStream out,
    // boolean printDocContent) {
    //
    // out.println("\n****** Community #" + dc.id);
    // printKeywords(dc, out);
    // printKeyGraph(dc.keyGraph, out);
    // out.println("~" + dc.docs.size() / 10 + "0: " + dc.docs.size() +
    // " docs");
    // for (Document d : dc.docs.values())
    // // out.println(d.topics + ": " + d.publishDate + " " + d.id);
    // if (printDocContent)
    // out.println(d.publishDate + "\t" + d.id + "\t" + d.getBody());
    // else
    // out.println(d.publishDate + "\t" + d.id);
    // }

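    /**
     * Writes three tab-then-comma-separated files for the web front end:
     * <outputFileName>.event_document (cluster id, then docId:flag pairs,
     * where the flag is 0 for duplicates and 1 otherwise),
     * .event_keyGraph_nodes (id:baseForm:word triples) and
     * .event_keyGraph_edges (node-to-node edge pairs).
     */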
    public static void printClustersForTheWebsite(Collection<DocumentCluster> clusters, String outputFileName) throws Exception {

        PrintStream out = new PrintStream(outputFileName + ".event_document");
        for (DocumentCluster dc : clusters) {
            out.print(dc.id + "\t");
            // out.println("~" + dc.docs.size() / 10 + "0: " + dc.docs.size() +
            // " docs");
            for (Document d : dc.docs.values())
                out.print(d.id + ":" + (d.isDuplicate ? 0 : 1) + ",");
            out.println();
        }
        out.close();
        out = new PrintStream(outputFileName + ".event_keyGraph_nodes");
        for (DocumentCluster dc : clusters) {
            out.print(dc.id + "\t");
            for (Node n : dc.keyGraph.values())
                out.print(n.id + ":" + n.keyword.baseForm + ":" + n.keyword.getWord().replaceAll("[,'\"]", " ") + ",");
            out.println();
        }
        out.close();
        out = new PrintStream(outputFileName + ".event_keyGraph_edges");
        for (DocumentCluster dc : clusters) {
            out.print(dc.id + "\t");
            for (Node n : dc.keyGraph.values()) {
                for (Edge e : n.edges.values())
                    if (e.n1.equals(n))
                        out.print(e.n1.id + ":" + e.n1.keyword.baseForm + "-" + e.n2.id + ":" + e.n2.keyword.baseForm + ",");
            }
            out.println();
        }
        out.close();

        // printKeywords(dc, out);
        // printKeyGraph(dc.keyGraph, out);

    }

    public static void printKeywords(DocumentCluster dc, PrintStream out) {
        for (Node n : dc.keyGraph.values())
            out.print(n.keyword.getWord().replaceAll("[,'\"]", " ") + ",");
        // out.println();
    }

    public static void printKeyGraph(HashMap<String, Node> keyGraph, PrintStream out) {
        for (Node n : keyGraph.values())
            out.print(n.id + ":" + n.keyword.baseForm + ":" + n.keyword.getWord().replaceAll("[,'\"]", " ") + ",");
        out.println();
        for (Node n : keyGraph.values()) {
            for (Edge e : n.edges.values())
                if (e.n1.equals(n))
                    out.print(e.n1.id + ":" + e.n1.keyword.baseForm + "-" + e.n2.id + ":" + e.n2.keyword.baseForm + ",");
        }
        out.println();
    }

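    /**
     * Agglomerative merge: two clusters are merged when
     * |docs1 ∩ docs2| / min(|docs1|, |docs2|) >= CLUSTER_INTERSECT_MIN.
     * Each seed cluster keeps absorbing matches until a full pass over the
     * remaining clusters produces no change.
     */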
    private void mergeSimilarClusters(ArrayList<DocumentCluster> documentClusters) {
        ArrayList<DocumentCluster> topics = new ArrayList<DocumentCluster>();
        while (documentClusters.size() > 0) {
            DocumentCluster dc1 = documentClusters.remove(0);
            ArrayList<DocumentCluster> toRemove = new ArrayList<DocumentCluster>();
            boolean isChanged = false;
            do {
                isChanged = false;
                for (DocumentCluster dc2 : documentClusters) {
                    double intersect = intersectDocs(dc1.docs, dc2.docs);
                    if (intersect / Math.min(dc1.docs.size(), dc2.docs.size()) >= constants.CLUSTER_INTERSECT_MIN) {
                        mergeClusters(dc1, dc2);
                        isChanged = true;
                        toRemove.add(dc2);
                    }
                }
                documentClusters.removeAll(toRemove);
            } while (isChanged);
            topics.add(dc1);
        }
        documentClusters.addAll(topics);
    }

    public int intersectDocs(HashMap<String, Document> dc1, HashMap<String, Document> dc2) {
        int intersect = 0;
        for (String key : dc1.keySet())
            if (dc2.containsKey(key))
                intersect++;
        return intersect;
    }

    public void mergeClusters(DocumentCluster dc1, DocumentCluster dc2) {
        for (Document d : dc2.docs.values())
            if (!dc1.docs.containsKey(d.id)) {
                dc1.docs.put(d.id, d);
                dc1.similarities.put(d.id, dc2.similarities.get(d.id));
            } else if (dc1.similarities.get(d.id) < dc2.similarities.get(d.id))
                dc1.similarities.put(d.id, dc2.similarities.get(d.id));
        dc1.keyGraph.putAll(dc2.keyGraph);
    }

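    /**
     * Standard cosine similarity between two documents over their shared
     * keywords: the sum of tfidf(k, d1) * tfidf(k, d2) divided by the two
     * precomputed vector lengths (see ComputeDocumentVectorSize).
     */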
    public double cosineSimilarity(Document d1, Document d2, HashMap<String, Double> DF, int docSize) {
        double sim = 0;
        for (Keyword k1 : d1.keywords.values()) {
            if (d2.keywords.containsKey(k1.baseForm)) {
                Double df = DF.get(k1.baseForm);
                double tf1 = k1.tf;
                double tf2 = d2.keywords.get(k1.baseForm).tf;
                sim += TFIDF(tf1, idf(df, docSize)) * TFIDF(tf2, idf(df, docSize));
            }
        }
        if (d1.vectorSize == 0 || d2.vectorSize == 0) // avoid NaN for empty vectors
            return 0;
        return sim / d1.vectorSize / d2.vectorSize;
    }

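    /**
     * Cosine similarity between a keyword community and a document. Each
     * node's term frequency is taken as the mean over its edges of
     * max(cp1, cp2), the larger of the edge's two conditional co-occurrence
     * probabilities. Communities sharing fewer than three keywords with the
     * document score 0.
     */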
    public double cosineSimilarity(HashMap<String, Node> community, Document d2, HashMap<String, Double> DF, int docSize) {
        double sim = 0;
        double vectorSize1 = 0;
        int numberOfKeywordsInCommon = 0;
        for (Node n : community.values()) {
            double nTF = 0;
            for (Edge e : n.edges.values())
                // nTF += e.df;
                nTF += Math.max(e.cp1, e.cp2);
            // nkeywordtf += (n.equals(e.n2)) ? e.cp1 : e.cp2;
            n.keyword.tf = nTF / n.edges.size();
            vectorSize1 += Math.pow(TFIDF(n.keyword.tf, idf(DF.get(n.keyword.baseForm), docSize)), 2);

            if (d2.keywords.containsKey(n.keyword.baseForm)) {
                numberOfKeywordsInCommon++;
                sim += TFIDF(n.keyword.tf, idf(DF.get(n.keyword.baseForm), docSize))
                        * TFIDF(d2.keywords.get(n.keyword.baseForm).tf, idf(DF.get(n.keyword.baseForm), docSize));
            }
        }
        vectorSize1 = Math.sqrt(vectorSize1);
        if (numberOfKeywordsInCommon > 2)
            return sim / vectorSize1 / d2.vectorSize;
        else
            return 0;
    }

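    /**
     * Mean squared cosine distance of the cluster's documents from its
     * centroid; computes and caches the centroid on first use.
     */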
    public double variance(DocumentCluster dc, HashMap<String, Double> DF, int docSize, HashMap<String, Node> graphNodes) {
        double var = 0;
        if (dc.centroid == null)
            dc.centroid = centroid(dc.docs, DF, graphNodes);
        for (Document d : dc.docs.values()) {
            double diff = 1 - cosineSimilarity(dc.centroid, d, DF, docSize);
            var += diff * diff;
        }
        return var / dc.docs.size();
    }

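    /**
     * Builds a pseudo-document whose keyword frequencies are the average over
     * the cluster's documents. Note that the vector length below uses the raw
     * DF value as the idf term, unlike the idf(df, N) weighting used
     * elsewhere in this class; the commented-out alternatives suggest this
     * weighting was still being experimented with.
     */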
    public Document centroid(HashMap<String, Document> docs, HashMap<String, Double> DF, HashMap<String, Node> graphNodes) {
        Document centroid = new Document("-1");
        for (Document d : docs.values())
            for (Keyword k : d.keywords.values()) {
                // if (graphNodes.containsKey(k.baseForm))
                if (centroid.keywords.containsKey(k.baseForm)) {
                    Keyword kk = centroid.keywords.get(k.baseForm);
                    kk.tf += k.tf;
                    kk.df++;
                } else
                    centroid.keywords.put(k.baseForm, new Keyword(k.baseForm, k.getWord(), k.tf, k.df, 0));
            }
        for (Keyword k : centroid.keywords.values())
            if (idf(k.df, docs.size()) != 0) { // alternative: DF.get(k.baseForm) > Constants.KEYWORD_DF_MIN
                // if (idf(DF.get(k.baseForm), 2) != 0) {
                k.tf /= docs.size();
                centroid.vectorSize += Math.pow(TFIDF(k.tf, DF.get(k.baseForm)), 2);
            } else
                k.tf = 0;
        centroid.vectorSize = Math.sqrt(centroid.vectorSize);
        return centroid;
    }

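    /**
     * Base-2 inverse document frequency: idf(df, N) = log2(N / df).
     * For example, idf(2, 16) = log2(8) = 3.
     */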
    public double idf(double df, int size) {
        // if (df < constants.SIMILARITY_KEYWORD_DF_MIN || df >
        // constants.NODE_DF_MAX * size)
        // return 0;
        return Math.log(size / df) / Math.log(2);
    }
}