
/KeyGraph/src/topicDetection/DocumentAnalyze.java

package topicDetection;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.PrintStream;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
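/**
 * Clusters documents into topic-level groups, either from the output of an
 * external LDA run (clusterByLDA) or from communities extracted out of the
 * keyword co-occurrence graph (clusterbyKeyGraph). (Editor's summary,
 * inferred from the code below.)
 */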
public class DocumentAnalyze {

    Constants constants;

    public DocumentAnalyze(Constants cons) {
        constants = cons;
    }
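    /**
     * Reads cluster assignments produced by an external topic model. Judging
     * from the parsing below (an assumption, not documented by the author),
     * each line of "results-&lt;model&gt;.txt" after a one-line header looks like
     *
     *   &lt;ignored&gt; &lt;path/docid.ext&gt; &lt;clusterid&gt; &lt;fourth column, echoed to stdout&gt; ...
     *
     * and, for mallet models, "results-&lt;model&gt;.topickeys.txt" is tab-separated:
     *
     *   &lt;topicid&gt;TAB&lt;weight&gt;TAB&lt;space-separated keywords&gt;
     */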
    public ArrayList<DocumentCluster> clusterByLDA(HashMap<String, Document> docs, HashMap<String, Double> DF, String model) {
        ArrayList<DocumentCluster> documentClusters = new ArrayList<DocumentCluster>();
        try {
            HashMap<Integer, DocumentCluster> clusters = new HashMap<Integer, DocumentCluster>();
            BufferedReader in = new BufferedReader(new FileReader("results-" + model + ".txt"));
            in.readLine(); // skip the header line
            String line = null;
            while ((line = in.readLine()) != null) {
                String[] tokens = line.split(" ");
                // strip the directory prefix and file extension to get the document id
                String docid = tokens[1];
                if (docid.lastIndexOf("/") != -1)
                    docid = docid.substring(docid.lastIndexOf("/") + 1);
                docid = docid.substring(0, docid.lastIndexOf("."));
                Integer clusterid = Integer.parseInt(tokens[2]);
                Document d = docs.get(docid);
                if (d == null) {
                    d = new Document(docid);
                    docs.put(d.id, d);
                }
                // System.out.println(docid + "\t" + d + "\t" + clusterid);
                System.out.print(tokens[3] + "\t");
                if (!clusters.containsKey(clusterid))
                    clusters.put(clusterid, new DocumentCluster());
                clusters.get(clusterid).docs.put(d.id, d);
            }
            in.close();
            System.out.println();
            if (model.startsWith("mallet")) {
                // mallet also emits the top keywords per topic; use them as the key graph
                in = new BufferedReader(new FileReader("results-" + model + ".topickeys.txt"));
                // in.readLine();
                line = null;
                while ((line = in.readLine()) != null) {
                    String[] tokens = line.split("\t");
                    DocumentCluster dc = clusters.get(Integer.parseInt(tokens[0]));
                    dc.keyGraph = new HashMap<String, Node>();
                    for (String keyword : tokens[2].split(" "))
                        dc.keyGraph.put(keyword, new Node(new Keyword(keyword, keyword, 0, 0, 0)));
                }
                in.close();
            }
            for (Integer clusterid : clusters.keySet())
                documentClusters.add(clusters.get(clusterid));
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println("Document Clusters (final) :::::::::" + documentClusters.size());
        return documentClusters;
    }
    public ArrayList<DocumentCluster> clusterbyKeyGraph(HashMap<String, Document> docs, HashMap<String, Double> DF) {
        // build the keyword co-occurrence graph, extract keyword communities,
        // then attach documents to the communities they are most similar to
        GraphAnalyze g = new GraphAnalyze(constants);
        g.buildGraph(docs, DF, constants.REMOVE_DUPLICATES);
        // g.printGraph(g.graphNodes);
        ComputeDocumentVectorSize(docs, DF, g.graphNodes);
        ArrayList<HashMap<String, Node>> communities = g.extractCommunities(g.graphNodes);
        System.out.println(communities.size());
        return extractClustersFromKeyCommunity(docs, communities, DF, docs.size(), g.graphNodes);
    }
    public void ComputeDocumentVectorSize(HashMap<String, Document> docs, HashMap<String, Double> DF, HashMap<String, Node> graphNodes) {
        // precompute each document's L2 norm over the tf-idf weights of the
        // keywords that survived into the graph
        // ArrayList<String> toRemove=new ArrayList<String>();
        for (Document d : docs.values()) {
            d.vectorSize = 0;
            for (Keyword k : d.keywords.values())
                if (graphNodes.containsKey(k.baseForm))
                    d.vectorSize += Math.pow(TFIDF(k.tf, idf(DF.get(k.baseForm), docs.size())), 2);
            d.vectorSize = Math.sqrt(d.vectorSize);
            // if(d.vectorSize==0)
            // toRemove.add(d.id);
        }
        // for(String id: toRemove)
        // docs.remove(id);
    }
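    /**
     * Plain tf-idf product. With the idf() at the bottom of this class,
     * idf(df, N) = log2(N / df), so e.g. a term with tf = 3 appearing in 2 of
     * 16 documents scores TFIDF(3, idf(2, 16)) = 3 * log2(8) = 9. (Worked
     * example added by the editor.)
     */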
    public double TFIDF(double tf, double idf) {
        if (tf == 0 || idf == 0)
            return 0;
        return tf * idf;
    }
    public ArrayList<DocumentCluster> extractClustersFromKeyCommunity(HashMap<String, Document> docs, ArrayList<HashMap<String, Node>> communities,
            HashMap<String, Double> DF, int docSize, HashMap<String, Node> graphNodes) {
        ArrayList<DocumentCluster> documentClusters = new ArrayList<DocumentCluster>();
        for (HashMap<String, Node> c : communities) {
            DocumentCluster dc = new DocumentCluster();
            // -- find related documents -----------
            dc.keyGraph = c;
            for (Node n : c.values())
                for (Document d : n.keyword.documents.values())
                    if (!dc.docs.containsKey(d.id)) {
                        double cosineSimilarity = cosineSimilarity(c, d, DF, docSize);
                        if (cosineSimilarity > constants.DOC_SIM2KEYGRAPH_MIN) {
                            dc.docs.put(d.id, d);
                            dc.similarities.put(d.id, cosineSimilarity);
                        }
                    }
            // -- filter clusters -------------
            // dc.variance = variance(dc, DF, docSize, graphNodes);
            // if (dc.centroid.vectorSize == 0 || dc.variance <= constants.CLUSTER_VAR_MAX)
            { // bare block kept from the commented-out filter above
                ArrayList<String> toRemove = new ArrayList<String>();
                // System.out.println("\n****** Community #" + documentClusters.size());
                // printKeywords(dc);
                // for (Document d : dc.docs.values()) {
                //     if (cosineSimilarity(dc.centroid, d, DF, docSize) < constants.DOC_SIM2CENTROID_MIN)
                //         toRemove.add(d.id);
                //     // else
                //     //     System.out.println(d.topic + ": " + d.id);
                // }
                // -- time based filtering -----------
                // if (dc.docs.size() > 0){
                //     DocumentCluster[] dcs = filterTimeBased(dc, toRemove);
                //     // if(dcs[0].docs.size() >= constants.TOPIC_MIN_SIZE)
                //     //     documentClusters.add(dcs[0]);
                //     // if(dcs[1].docs.size() >= constants.TOPIC_MIN_SIZE)
                //     //     documentClusters.add(dcs[1]);
                // }
                if (dc.docs.size() - toRemove.size() >= constants.TOPIC_MIN_SIZE) {
                    documentClusters.add(dc);
                    for (String id : toRemove) {
                        dc.docs.remove(id);
                        dc.similarities.remove(id);
                    }
                }
            }
        }
        System.out.println("Keyword Communities :::::::::" + communities.size());
        System.out.println("Document Clusters (initial) :::::::::" + documentClusters.size());
        // printClusters(documentClusters);
        mergeSimilarClusters(documentClusters);
        // printClusters(documentClusters);
        if (constants.HARD_CLUSTERING)
            hardClustering(docs, DF, docSize, documentClusters);
        System.out.println("Document Clusters (final) :::::::::" + documentClusters.size());
        return documentClusters;
    }
    public ArrayList<DocumentCluster> extractClustersFromKeyCommunity2(HashMap<String, Document> docs, ArrayList<HashMap<String, Node>> communities,
            HashMap<String, Double> DF, int docSize, HashMap<String, Node> graphNodes) {
        ArrayList<DocumentCluster> tmpdocumentClusters = new ArrayList<DocumentCluster>();
        ArrayList<DocumentCluster> documentClusters = new ArrayList<DocumentCluster>();
        for (HashMap<String, Node> c : communities) {
            DocumentCluster dc = new DocumentCluster();
            dc.keyGraph = c;
            tmpdocumentClusters.add(dc);
        }
        // assign each document to its single most similar community
        for (Document d : docs.values()) {
            double maxSim = 0;
            DocumentCluster maxDC = null;
            for (DocumentCluster dc : tmpdocumentClusters) {
                // dc.similarities is still empty at this point, so compute the
                // similarity directly; the original read dc.similarities.get(d.id),
                // which would throw a NullPointerException
                double sim = cosineSimilarity(dc.keyGraph, d, DF, docSize);
                // System.out.println("siiiim:: " + sim);
                if (sim > maxSim) {
                    maxSim = sim;
                    maxDC = dc;
                }
            }
            if (maxSim > constants.DOC_SIM2KEYGRAPH_MIN) {
                maxDC.docs.put(d.id, d);
                maxDC.similarities.put(d.id, maxSim); // record it for merging later
            }
        }
        for (DocumentCluster dc : tmpdocumentClusters)
            // -- filter clusters -------------
            // dc.variance = variance(dc, DF, docSize, graphNodes);
            // if (dc.centroid.vectorSize == 0 || dc.variance <= constants.CLUSTER_VAR_MAX)
            { // bare block kept from the commented-out filter above
                ArrayList<String> toRemove = new ArrayList<String>();
                // System.out.println("\n****** Community #" + documentClusters.size());
                // printKeywords(dc);
                // for (Document d : dc.docs.values()) {
                //     if (cosineSimilarity(dc.centroid, d, DF, docSize) < constants.DOC_SIM2CENTROID_MIN)
                //         toRemove.add(d.id);
                //     // else
                //     //     System.out.println(d.topic + ": " + d.id);
                // }
                // -- time based filtering -----------
                // if (dc.docs.size() > 0){
                //     DocumentCluster[] dcs = filterTimeBased(dc, toRemove);
                //     // if(dcs[0].docs.size() >= constants.TOPIC_MIN_SIZE)
                //     //     documentClusters.add(dcs[0]);
                //     // if(dcs[1].docs.size() >= constants.TOPIC_MIN_SIZE)
                //     //     documentClusters.add(dcs[1]);
                // }
                if (dc.docs.size() - toRemove.size() >= constants.TOPIC_MIN_SIZE) {
                    documentClusters.add(dc);
                    for (String id : toRemove) {
                        dc.docs.remove(id);
                        dc.similarities.remove(id);
                    }
                }
            }
        System.out.println("Keyword Communities :::::::::" + communities.size());
        System.out.println("Document Clusters (initial) :::::::::" + documentClusters.size());
        // printClusters(documentClusters);
        mergeSimilarClusters(documentClusters);
        // printClusters(documentClusters);
        if (constants.HARD_CLUSTERING)
            hardClustering(docs, DF, docSize, documentClusters);
        System.out.println("Document Clusters (final) :::::::::" + documentClusters.size());
        return documentClusters;
    }
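    /**
     * Iteratively recenters the cluster on the mean publish time (up to 5
     * rounds), marking documents more than 15 days from the mean for removal,
     * then splits the removed documents into two candidate clusters: one
     * before and one after the mean. (Editor's description of the loop below.)
     */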
    public DocumentCluster[] filterTimeBased(DocumentCluster dc, ArrayList<String> toRemove) {
        long time = 0;
        HashMap<String, Document> docs = dc.docs;
        for (int i = 0; i < 5 && docs.size() > 0; i++) {
            time = 0;
            toRemove.clear();
            for (Document d : docs.values())
                time += d.publishDate.getTime();
            time /= docs.size(); // mean publish time of the surviving documents
            docs = new HashMap<String, Document>();
            for (Document d : dc.docs.values())
                if (Math.abs(d.publishDate.getTime() - time) > ((long) 15) * 24 * 60 * 60 * 1000) // 15 days in ms
                    toRemove.add(d.id);
                else
                    docs.put(d.id, d);
        }
        DocumentCluster[] dcs = new DocumentCluster[] { new DocumentCluster(), new DocumentCluster() };
        dcs[0].keyGraph = dc.keyGraph;
        dcs[1].keyGraph = dc.keyGraph;
        for (String id : toRemove) {
            Document doc = dc.docs.get(id);
            if (doc.publishDate.after(new Timestamp(time)))
                dcs[1].docs.put(id, doc);
            else
                dcs[0].docs.put(id, doc);
        }
        return dcs;
    }
    private void hardClustering(HashMap<String, Document> docs, HashMap<String, Double> DF, int docSize, ArrayList<DocumentCluster> documentClusters) {
        int ii = 0;
        for (Document d : docs.values()) {
            boolean isAssigned = false;
            for (DocumentCluster dc : documentClusters)
                if (dc.docs.containsKey(d.id)) {
                    isAssigned = true;
                    break;
                }
            if (!isAssigned) {
                double max_sim = 0;
                DocumentCluster bestDC = null;
                for (DocumentCluster dc : documentClusters)
                    // if (cosineSimilarity(dc.keyGraph, d, DF, docSize) > max_sim) {
                    if (dc.similarities.containsKey(d.id) && dc.similarities.get(d.id) > max_sim) {
                        // use the stored similarity, so the comparison and the
                        // assignment stay consistent (the original recomputed
                        // cosineSimilarity(dc.keyGraph, d, DF, docSize) here)
                        max_sim = dc.similarities.get(d.id);
                        bestDC = dc;
                    }
                if (max_sim > constants.DOC_SIM2KEYGRAPH_MIN / 3.5)
                    bestDC.docs.put(d.id, d);
                else
                    ii++;
            }
        }
        System.out.println("Off topic documents:" + ii + " out of " + docs.size());
    }
    public static void printTopics(Collection<DocumentCluster> clusters, PrintStream out) {
        for (DocumentCluster dc : clusters) {
            out.print("KEYWORDS:\t");
            printKeywords(dc, out);
            out.print("\nDOCUMENTS:\t");
            for (Document d : dc.docs.values())
                out.print(d.id + ",");
            out.print("\nKEYGRAPH_NODES:\t");
            for (Node n : dc.keyGraph.values())
                out.print(n.id + ":" + n.keyword.baseForm + ":" + n.keyword.getWord().replaceAll("[,'\"]", " ") + ",");
            out.println("\nKEYGRAPH_EDGES:\t");
            for (Node n : dc.keyGraph.values()) {
                for (Edge e : n.edges.values())
                    if (e.n1.equals(n)) // print each edge once, from its n1 endpoint
                        out.print(e.n1.id + ":" + e.n1.keyword.baseForm + "-" + e.n2.id + ":" + e.n2.keyword.baseForm + ",");
            }
            out.println("\n");
            // out.println("~" + dc.docs.size() / 10 + "0: " + dc.docs.size() + " docs");
        }
    }
    // public static void printClusters(Collection<DocumentCluster> clusters, PrintStream out) {
    //     // printClusters(clusters, out, false);
    //     for (DocumentCluster dc : clusters)
    //         dc.serialize(out);
    // }

    // public static void printClusters(Collection<DocumentCluster> clusters, PrintStream out, boolean printDocContent) {
    //     for (DocumentCluster dc : clusters) {
    //         printCluster(dc, out, printDocContent);
    //     }
    // }

    // public static void printCluster(DocumentCluster dc, PrintStream out) {
    //     printCluster(dc, out, false);
    // }

    // public static void printCluster(DocumentCluster dc, PrintStream out, boolean printDocContent) {
    //     out.println("\n****** Community #" + dc.id);
    //     printKeywords(dc, out);
    //     printKeyGraph(dc.keyGraph, out);
    //     out.println("~" + dc.docs.size() / 10 + "0: " + dc.docs.size() + " docs");
    //     for (Document d : dc.docs.values())
    //         // out.println(d.topics + ": " + d.publishDate + " " + d.id);
    //         if (printDocContent)
    //             out.println(d.publishDate + "\t" + d.id + "\t" + d.getBody());
    //         else
    //             out.println(d.publishDate + "\t" + d.id);
    // }
    public static void printClustersForTheWebsite(Collection<DocumentCluster> clusters, String outputFileName) throws Exception {
        // one line per cluster: clusterid<TAB>docid:flag,... where flag 0 marks duplicates
        PrintStream out = new PrintStream(outputFileName + ".event_document");
        for (DocumentCluster dc : clusters) {
            out.print(dc.id + "\t");
            // out.println("~" + dc.docs.size() / 10 + "0: " + dc.docs.size() + " docs");
            for (Document d : dc.docs.values())
                out.print(d.id + ":" + (d.isDuplicate ? 0 : 1) + ",");
            out.println();
        }
        out.close();
        out = new PrintStream(outputFileName + ".event_keyGraph_nodes");
        for (DocumentCluster dc : clusters) {
            out.print(dc.id + "\t");
            for (Node n : dc.keyGraph.values())
                out.print(n.id + ":" + n.keyword.baseForm + ":" + n.keyword.getWord().replaceAll("[,'\"]", " ") + ",");
            out.println();
        }
        out.close();
        out = new PrintStream(outputFileName + ".event_keyGraph_edges");
        for (DocumentCluster dc : clusters) {
            out.print(dc.id + "\t");
            for (Node n : dc.keyGraph.values()) {
                for (Edge e : n.edges.values())
                    if (e.n1.equals(n))
                        out.print(e.n1.id + ":" + e.n1.keyword.baseForm + "-" + e.n2.id + ":" + e.n2.keyword.baseForm + ",");
            }
            out.println();
        }
        out.close();
        // printKeywords(dc, out);
        // printKeyGraph(dc.keyGraph, out);
    }
    public static void printKeywords(DocumentCluster dc, PrintStream out) {
        for (Node n : dc.keyGraph.values())
            out.print(n.keyword.getWord().replaceAll("[,'\"]", " ") + ",");
        // out.println();
    }
    public static void printKeyGraph(HashMap<String, Node> keyGraph, PrintStream out) {
        for (Node n : keyGraph.values())
            out.print(n.id + ":" + n.keyword.baseForm + ":" + n.keyword.getWord().replaceAll("[,'\"]", " ") + ",");
        out.println();
        for (Node n : keyGraph.values()) {
            for (Edge e : n.edges.values())
                if (e.n1.equals(n))
                    out.print(e.n1.id + ":" + e.n1.keyword.baseForm + "-" + e.n2.id + ":" + e.n2.keyword.baseForm + ",");
        }
        out.println();
    }
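    /**
     * Repeatedly merges clusters whose document overlap, relative to the
     * smaller cluster, reaches CLUSTER_INTERSECT_MIN. For example, if that
     * threshold were 0.5, a 10-document cluster sharing 4 documents with a
     * 6-document cluster would merge (4 / min(10, 6) = 0.67), while sharing
     * only 2 would not (0.33). (Worked example added by the editor; the actual
     * threshold value lives in Constants.)
     */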
    private void mergeSimilarClusters(ArrayList<DocumentCluster> documentClusters) {
        ArrayList<DocumentCluster> topics = new ArrayList<DocumentCluster>();
        while (documentClusters.size() > 0) {
            DocumentCluster dc1 = documentClusters.remove(0);
            ArrayList<DocumentCluster> toRemove = new ArrayList<DocumentCluster>();
            boolean isChanged = false;
            do {
                isChanged = false;
                for (DocumentCluster dc2 : documentClusters) {
                    double intersect = intersectDocs(dc1.docs, dc2.docs);
                    if (intersect / Math.min(dc1.docs.size(), dc2.docs.size()) >= constants.CLUSTER_INTERSECT_MIN) {
                        mergeClusters(dc1, dc2);
                        isChanged = true;
                        toRemove.add(dc2);
                    }
                }
                documentClusters.removeAll(toRemove);
            } while (isChanged);
            topics.add(dc1);
        }
        documentClusters.addAll(topics);
    }
    public int intersectDocs(HashMap<String, Document> dc1, HashMap<String, Document> dc2) {
        int intersect = 0;
        for (String key : dc1.keySet())
            if (dc2.containsKey(key))
                intersect++;
        return intersect;
    }
    public void mergeClusters(DocumentCluster dc1, DocumentCluster dc2) {
        for (Document d : dc2.docs.values())
            if (!dc1.docs.containsKey(d.id)) {
                dc1.docs.put(d.id, d);
                dc1.similarities.put(d.id, dc2.similarities.get(d.id));
            } else if (dc1.similarities.get(d.id) < dc2.similarities.get(d.id))
                dc1.similarities.put(d.id, dc2.similarities.get(d.id));
        dc1.keyGraph.putAll(dc2.keyGraph);
    }
    public double cosineSimilarity(Document d1, Document d2, HashMap<String, Double> DF, int docSize) {
        double sim = 0;
        for (Keyword k1 : d1.keywords.values()) {
            if (d2.keywords.containsKey(k1.baseForm)) {
                Double df = DF.get(k1.baseForm);
                double tf1 = k1.tf;
                double tf2 = d2.keywords.get(k1.baseForm).tf;
                sim += TFIDF(tf1, idf(df, docSize)) * TFIDF(tf2, idf(df, docSize));
            }
        }
        return sim / d1.vectorSize / d2.vectorSize;
    }
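    /**
     * Cosine similarity between a keyword community and a document. Each
     * node's tf is estimated from its edges as the mean of max(cp1, cp2)
     * (apparently the two conditional co-occurrence probabilities of the
     * edge's endpoints), and the similarity is only trusted when the community
     * and the document share more than two keywords. (Editor's reading of the
     * code below.)
     */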
    public double cosineSimilarity(HashMap<String, Node> community, Document d2, HashMap<String, Double> DF, int docSize) {
        double sim = 0;
        double vectorSize1 = 0;
        int numberOfKeywordsInCommon = 0;
        for (Node n : community.values()) {
            double nTF = 0;
            for (Edge e : n.edges.values())
                // nTF += e.df;
                nTF += Math.max(e.cp1, e.cp2);
            // nkeywordtf += (n.equals(e.n2)) ? e.cp1 : e.cp2;
            n.keyword.tf = nTF / n.edges.size();
            vectorSize1 += Math.pow(TFIDF(n.keyword.tf, idf(DF.get(n.keyword.baseForm), docSize)), 2);
            if (d2.keywords.containsKey(n.keyword.baseForm)) {
                numberOfKeywordsInCommon++;
                sim += TFIDF(n.keyword.tf, idf(DF.get(n.keyword.baseForm), docSize))
                        * TFIDF(d2.keywords.get(n.keyword.baseForm).tf, idf(DF.get(n.keyword.baseForm), docSize));
            }
        }
        vectorSize1 = Math.sqrt(vectorSize1);
        if (numberOfKeywordsInCommon > 2)
            return sim / vectorSize1 / d2.vectorSize;
        else
            return 0;
    }
    public double variance(DocumentCluster dc, HashMap<String, Double> DF, int docSize, HashMap<String, Node> graphNodes) {
        double var = 0;
        if (dc.centroid == null)
            dc.centroid = centroid(dc.docs, DF, graphNodes);
        for (Document d : dc.docs.values()) {
            double diff = 1 - cosineSimilarity(dc.centroid, d, DF, docSize);
            var += diff * diff;
        }
        return var / dc.docs.size();
    }
    public Document centroid(HashMap<String, Document> docs, HashMap<String, Double> DF, HashMap<String, Node> graphNodes) {
        Document centroid = new Document("-1");
        for (Document d : docs.values())
            for (Keyword k : d.keywords.values()) {
                // if (graphNodes.containsKey(k.baseForm))
                if (centroid.keywords.containsKey(k.baseForm)) {
                    Keyword kk = centroid.keywords.get(k.baseForm);
                    kk.tf += k.tf;
                    kk.df++;
                } else
                    centroid.keywords.put(k.baseForm, new Keyword(k.baseForm, k.getWord(), k.tf, k.df, 0));
            }
        for (Keyword k : centroid.keywords.values())
            if (idf(k.df, docs.size()) != 0) {
                // alternative conditions tried here: idf(DF.get(k.baseForm), 2) != 0,
                // or DF.get(k.baseForm) > Constants.KEYWORD_DF_MIN
                k.tf /= docs.size();
                // note: weights the centroid by the raw DF value, not by idf()
                centroid.vectorSize += Math.pow(TFIDF(k.tf, DF.get(k.baseForm)), 2);
            } else
                k.tf = 0;
        centroid.vectorSize = Math.sqrt(centroid.vectorSize);
        return centroid;
    }
    public double idf(double df, int size) {
        // if (df < constants.SIMILARITY_KEYWORD_DF_MIN || df > constants.NODE_DF_MAX * size)
        //     return 0;
        return Math.log(size / df) / Math.log(2); // log2(size / df)
    }
}
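
// Usage sketch (editor's illustration, not part of the original file; assumes
// the documents and document frequencies have been loaded elsewhere in this
// package):
//
//   Constants constants = new Constants();
//   DocumentAnalyze analyzer = new DocumentAnalyze(constants);
//   HashMap<String, Document> docs = ...;   // id -> Document, keywords filled in
//   HashMap<String, Double> DF = ...;       // keyword baseForm -> document frequency
//   ArrayList<DocumentCluster> topics = analyzer.clusterbyKeyGraph(docs, DF);
//   DocumentAnalyze.printTopics(topics, System.out);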