
/KeyGraph/src/dataset/news/NewsMain.java

package dataset.news;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.PrintStream;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.StringTokenizer;

import topicDetection.Constants;
import topicDetection.Document;
import topicDetection.DocumentAnalyze;
import topicDetection.DocumentCluster;
import topicDetection.Edge;
import topicDetection.EvaluationAnalyze;
import topicDetection.GraphAnalyze;
import topicDetection.Keyword;
import topicDetection.Node;
import topicDetection.Porter;
import topicDetection.Topic;
import topicDetection.Utils;
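
/**
 * Entry point and experiment driver for KeyGraph topic detection on the TDT news
 * corpus: loads documents and their annotated topics, builds keyword co-occurrence
 * graphs, clusters documents into events, and evaluates the resulting clusters.
 */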
public class NewsMain {

    public static Constants constants;

    public static void main(String[] args) throws Exception {
        constants = new Constants("conf/NewsConstants.txt");
        // retrieveOriginakKeywords();
        // prepareDataforLDADave();
        // prepareDataforLDAMallet();
        // System.exit(0);
        // refindNEs();
        // new TDTUtil(constants).correctXMLChars("/fs/clip-clip-proj/GeoNets/EntityTDT/hassan/tkn_sgm",
        //         "/fs/clip-clip-proj/GeoNets/EntityTDT/hassan/tkn_sgm_corrected");
        // TDTUtil.extraxtText_mmtkn_sgm("/fs/clip-tdt/data/TDT4/tdt4_aem_v1_0/tkn_sgm",
        //         "/fs/clip-clip-proj/GeoNets/EntityTDT/hassan/tkn_sgm/");
        // TDTUtil.extraxtText_mmtkn_sgm("/fs/clip-tdt/data/TDT4/tdt4_aem_v1_0/mttkn_sgm",
        //         "/fs/clip-clip-proj/GeoNets/EntityTDT/hassan/mttkn_sgm/");
        // generateTDTTopicFiles();
        // generateDateFiles();
        // ideaTest();
        // runFindEventsStream();
        run();
        // keygraphGrow();
        // refindNEs();
        // test();
        // runParameterEstimation();
    }
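
    /**
     * Reads the TDT 2002/2003 on-topic relevance judgment files and writes one file
     * per topic id under ./data/topic_new/, each listing the document numbers
     * labeled as on-topic for that topic.
     */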
    private static void generateTDTTopicFiles() throws Exception {
        HashMap<String, HashSet<String>> topics = new HashMap<String, HashSet<String>>();
        loadTopicFile(topics, "/fs/clip-tdt/data/TDT4/2003topic_annotations/2002/tdt2002_topic_rel.v2_1", true);
        loadTopicFile(topics, "/fs/clip-tdt/data/TDT4/2003topic_annotations/2003/tdt2003_topic_rel.v2_0", true);
        for (String topicid : topics.keySet()) {
            BufferedWriter out = new BufferedWriter(new FileWriter("./data/topic_new/" + topicid + ".txt"));
            for (String docno : topics.get(topicid))
                out.write(docno + "\n");
            out.close();
        }
        System.out.println(topics.size() + ":" + topics);
    }
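
    /**
     * Collects the named entities listed in the keyword files, then rescans each
     * plain-text document for occurrences of those entities and writes a
     * "entity==count" file per document under data/key_NE_new/.
     */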
    public static void refindNEs() throws Exception {
        HashSet<String> nes = new HashSet<String>();
        String[] keyfiles = new File(constants.DATA_KEYWORDS_1_PATH).list();
        BufferedReader in = null;
        for (String filename : keyfiles) {
            in = new BufferedReader(new FileReader(constants.DATA_KEYWORDS_1_PATH + filename));
            String line = null;
            while ((line = in.readLine()) != null && line.length() > 2) {
                int index = line.lastIndexOf("==");
                // String token = line.substring(0, index).trim();
                String word = line.substring(0, index).trim();
                String[] tokens = word.split(" and ");
                for (String token : tokens)
                    if (token.length() > 2)
                        nes.add(token.toLowerCase());
            }
            in.close();
        }
        int i = 0;
        String outdir = "data/key_NE_new/";
        String textdir = "/fs/clip-clip-proj/GeoNets/EntityTDT/hassan/output/text/";
        for (String filename : keyfiles) {
            in = new BufferedReader(new FileReader(textdir + filename));
            String line = null;
            String content = "";
            while ((line = in.readLine()) != null)
                content += line.toLowerCase() + " ";
            // content = " " + content.replaceAll("[.,:/\\-_]", " ") + " ";
            HashMap<String, Integer> keys = new HashMap<String, Integer>();
            for (String ne : nes) {
                // if (content.indexOf(ne) != -1)
                if (content.indexOf(" " + ne + " ") != -1)
                    if (keys.containsKey(ne))
                        keys.put(ne, keys.get(ne) + 1);
                    else
                        keys.put(ne, 1);
            }
            BufferedWriter out = new BufferedWriter(new FileWriter(outdir + filename));
            for (String key : keys.keySet())
                out.write(key + "==" + keys.get(key) + "\n");
            out.close();
            i++;
            if (i % 100 == 0)
                System.out.println(i + "/" + keyfiles.length);
        }
    }
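
    /**
     * Parses a TDT topic relevance file and adds each ONTOPIC docno to the set for
     * its topicid. When filterNonEnglish is true, documents whose fileid ends in
     * MAN, ARB, or TWN are skipped.
     */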
    public static void loadTopicFile(HashMap<String, HashSet<String>> topics, String topicFile, boolean filterNonEnglish) throws Exception {
        BufferedReader in = new BufferedReader(new FileReader(topicFile));
        String line = null;
        while ((line = in.readLine()) != null)
            if (line.startsWith("<ONTOPIC")) {
                String topicid = line.substring(line.indexOf("topicid="));
                topicid = topicid.substring(topicid.indexOf('=') + 1, topicid.indexOf(' '));
                String docno = line.substring(line.indexOf("docno="));
                docno = docno.substring(docno.indexOf('=') + 1, docno.indexOf(' '));
                String fileid = line.substring(line.indexOf("fileid="));
                fileid = fileid.substring(fileid.indexOf('=') + 1, fileid.indexOf(' '));
                if (!filterNonEnglish || !(fileid.endsWith("MAN") || fileid.endsWith("ARB") || fileid.endsWith("TWN")))
                    if (topics.containsKey(topicid))
                        topics.get(topicid).add(docno);
                    else {
                        HashSet<String> docs = new HashSet<String>();
                        docs.add(docno);
                        topics.put(topicid, docs);
                    }
            }
        in.close();
    }
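
    /**
     * Oracle-style experiment: builds the keyword graph, assigns each graph node to
     * the ground-truth topics whose documents contain it, removes nodes shared by
     * more than one topic, and evaluates the clusters extracted from the remaining
     * per-topic keyword communities.
     */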
    public static void ideaTest() {
        HashSet<String> stopwords = Utils.importStopwords();
        // Document.generateDataFiles(5, .2);
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();
        ArrayList<Topic> events = new ArrayList<Topic>();
        new NewsDataLoader(constants).loadDocumentKeyFilesByTopics(events, docs, stopwords, DF);
        GraphAnalyze g = new GraphAnalyze(constants);
        g.buildGraph(docs, DF, constants.REMOVE_DUPLICATES);
        HashMap<Node, HashSet<Topic>> nodeEvents = new HashMap<Node, HashSet<Topic>>();
        HashMap<Topic, HashMap<String, Node>> eventNodes = new HashMap<Topic, HashMap<String, Node>>();
        for (Node n : g.graphNodes.values())
            nodeEvents.put(n, new HashSet<Topic>());
        for (Topic e : events)
            eventNodes.put(e, new HashMap<String, Node>());
        for (Topic e : events) {
            HashSet<Node> nodes = new HashSet<Node>();
            for (Document d : e.docs.values())
                for (Keyword k : d.keywords.values()) {
                    Node n = g.graphNodes.get(k.baseForm);
                    if (n != null)
                        if (nodes.add(n)) {
                            nodeEvents.get(n).add(e);
                            eventNodes.get(e).put(n.keyword.baseForm, n);
                        }
                }
        }
        for (Node n : nodeEvents.keySet())
            if (nodeEvents.get(n).size() > 1)
                for (HashMap<String, Node> nodes : eventNodes.values())
                    nodes.remove(n.keyword.baseForm);
        // for (Node n : g.graphNodes.values()) {
        //     System.out.print("##" + n.keyword.baseForm + " : ");
        //     for (Event e : nodeEvents.get(n))
        //         System.out.print(e.id + ", ");
        //     System.out.println();
        // }
        ArrayList<HashMap<String, Node>> communities = new ArrayList<HashMap<String, Node>>();
        for (HashMap<String, Node> community : eventNodes.values())
            // communities.add(getBiggestConnectedComponnent(community));
            communities.add(community);
        EvaluationAnalyze.evaluate(events, new DocumentAnalyze(constants).extractClustersFromKeyCommunity(docs, communities, DF, docs.size(), g.graphNodes));
        System.out.println("done");
    }
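
    /**
     * Returns the largest connected component of the given keyword community and
     * prints its size relative to the original community size.
     */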
    public static HashMap<String, Node> getBiggestConnectedComponnent(HashMap<String, Node> community) {
        int originalsize = community.size();
        int biggestSize = -1;
        HashMap<String, Node> biggestCommunity = new HashMap<String, Node>();
        ArrayList<HashMap<String, Node>> ccs = new GraphAnalyze(constants).findConnectedComponentsFromSubset(community);
        for (HashMap<String, Node> cc : ccs)
            if (cc.size() > biggestSize) {
                biggestSize = cc.size();
                biggestCommunity = cc;
            }
        System.out.println("new/original:::::" + biggestSize + "/" + originalsize);
        return biggestCommunity;
    }
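
    /**
     * Edge density of a keyword community: the number of ordered node pairs joined
     * by an edge divided by size * (size - 1).
     */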
    public static double isDense(HashMap<String, Node> community) {
        double count = 0;
        for (Node node : community.values())
            for (Node n : community.values())
                if (n.edges.containsKey(Edge.getId(node, n)))
                    count++;
        return count / community.size() / (community.size() - 1);
    }
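
    /**
     * Streaming event detection over a sliding time window (40 days, shifted by 20):
     * clusters each window's documents with the key graph, merges a previous
     * cluster into a new one when they share more than three documents, flushes
     * clusters that were not merged to eventStream.txt, and finally evaluates the
     * accumulated event stream against the annotated topics.
     */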
    public static void runFindEventsStream() throws Exception {
        // String[] inputs = new String[] { "2000-10", "2000-11", "2000-12", "2001-01" };
        Timestamp start = Timestamp.valueOf("2000-10-01 12:00:00");
        Timestamp end = Timestamp.valueOf("2001-01-31 23:59:59");
        int windowSize = 40;
        int windowShiftSize = windowSize / 2;
        HashSet<String> stopwords = Utils.importStopwords();
        ArrayList<DocumentCluster> eventStream = new ArrayList<DocumentCluster>();
        PrintStream out = new PrintStream(new File("eventStream.txt"));
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();
        long DayToMiliSeconds = (long) 24 * 60 * 60 * 1000;
        ArrayList<Topic> topics = new ArrayList<Topic>();
        System.out.println("[" + start + ", " + new Timestamp(start.getTime() + windowSize * DayToMiliSeconds) + "]");
        new NewsDataLoader(constants).loadDocumentKeyFilesByTopics(topics, docs, stopwords, DF, start,
                new Timestamp(start.getTime() + windowSize * DayToMiliSeconds));
        DocumentAnalyze documentAnalyzer = new DocumentAnalyze(constants);
        ArrayList<DocumentCluster> lastEvents = documentAnalyzer.clusterbyKeyGraph(docs, DF);
        for (Timestamp startDay = new Timestamp(start.getTime() + windowShiftSize * DayToMiliSeconds);
                startDay.before(new Date(end.getTime() - windowSize * DayToMiliSeconds));
                startDay.setTime(startDay.getTime() + windowShiftSize * DayToMiliSeconds)) {
            DF = new HashMap<String, Double>();
            docs = new HashMap<String, Document>();
            topics = new ArrayList<Topic>();
            out.println();
            System.out.println("[" + startDay + ", " + new Timestamp(startDay.getTime() + windowSize * DayToMiliSeconds) + "]");
            new NewsDataLoader(constants).loadDocumentKeyFilesByTopics(topics, docs, stopwords, DF, startDay,
                    new Timestamp(startDay.getTime() + windowSize * DayToMiliSeconds));
            out.println(docs.size() + " docs are loaded!");
            ArrayList<DocumentCluster> events = documentAnalyzer.clusterbyKeyGraph(docs, DF);
            // ----------- merge events -----------
            for (DocumentCluster dc1 : lastEvents) {
                boolean isMerged = false;
                for (DocumentCluster dc2 : events)
                    if (documentAnalyzer.intersectDocs(dc1.docs, dc2.docs) > 3) {
                        isMerged = true;
                        dc2.docs.putAll(dc1.docs);
                        // merge the previous cluster's key graph into the new cluster
                        dc2.keyGraph.putAll(dc1.keyGraph);
                    }
                if (!isMerged) {
                    eventStream.add(dc1);
                    // --- Save and Print
                    // documentAnalyzer.printCluster(dc1, out, false);
                    dc1.serialize(out, false);
                    // dc1.docs = null;
                }
            }
            lastEvents = events;
            events = null;
        }
        for (DocumentCluster dc1 : lastEvents) {
            eventStream.add(dc1);
            // --- Save and Print
            out.println();
            // documentAnalyzer.printCluster(dc1, out, false);
            dc1.serialize(out, false);
            // dc1.docs = null;
        }
        System.out.println("done");
        topics = new ArrayList<Topic>();
        new NewsDataLoader(constants).loadDocumentKeyFilesByTopics(topics, docs, stopwords, DF);
        EvaluationAnalyze.evaluate(topics, eventStream);
    }
    public static String[] slice(String[] input, int start, int end) {
        String[] out = new String[end - start + 1];
        for (int i = 0; i < out.length; i++)
            out[i] = input[start + i];
        return out;
    }
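
    /**
     * Batch experiment: loads all documents and their ground-truth topics, clusters
     * them with the key graph, evaluates the clusters against the topics, and
     * serializes the clusters to events.txt.
     */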
    public static void run() throws Exception {
        HashSet<String> stopwords = Utils.importStopwords();
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();
        ArrayList<Topic> events = new ArrayList<Topic>();
        new NewsDataLoader(constants).loadDocumentKeyFilesByTopics(events, docs, stopwords, DF);
        System.out.println(docs.size() + " docs are loaded!");
        ArrayList<DocumentCluster> clusters = new DocumentAnalyze(constants).clusterbyKeyGraph(docs, DF);
        // ArrayList<DocumentCluster> clusters = new DocumentAnalyze(constants).clusterByLDA(docs, DF, "tdt-mallet");
        // for (Event e : events) {
        //     ArrayList<String> toRemove = new ArrayList<String>();
        //     for (String docid : e.docs.keySet())
        //         if (!docs.containsKey(docid))
        //             toRemove.add(docid);
        //     for (String id : toRemove)
        //         e.docs.remove(id);
        // }
        EvaluationAnalyze.evaluate(events, clusters);
        // DocumentAnalyze.printClusters(clusters, new PrintStream(new File("events.txt")));
        DocumentCluster.serializeAll(clusters, new PrintStream(new File("events.txt")), false);
        System.out.println("done");
    }
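
    /**
     * Writes the corpus in a sparse "keywordCount index:termFrequency ..." format
     * (tdt.dave.data.txt) plus a document id index (tdt.dave.index.txt) for an
     * external LDA implementation.
     */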
    public static void prepareDataforLDADave() throws Exception {
        HashSet<String> stopwords = Utils.importStopwords();
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();
        ArrayList<Topic> events = new ArrayList<Topic>();
        new NewsDataLoader(constants).loadDocumentKeyFilesByTopics(events, docs, stopwords, DF);
        System.out.println(docs.size() + " docs are loaded!");
        HashMap<String, Integer> indexes = new HashMap<String, Integer>();
        int i = 0;
        for (String key : DF.keySet())
            indexes.put(key, ++i);
        i = 0;
        BufferedWriter dataFile = new BufferedWriter(new FileWriter("tdt.dave.data.txt"));
        BufferedWriter indexFile = new BufferedWriter(new FileWriter("tdt.dave.index.txt"));
        for (Document d : docs.values()) {
            indexFile.write((++i) + " " + d.id + "\n");
            dataFile.write("" + d.keywords.size());
            for (Keyword k : d.keywords.values())
                dataFile.write(" " + indexes.get(k.baseForm) + ":" + (int) (k.tf));
            dataFile.write("\n");
        }
        dataFile.close();
        indexFile.close();
    }
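
    /**
     * Writes one plain-text file per document, repeating each keyword tf times, as
     * input for MALLET's importer.
     */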
    public static void prepareDataforLDAMallet() throws Exception {
        HashSet<String> stopwords = Utils.importStopwords();
        HashMap<String, Double> DF = new HashMap<String, Double>();
        HashMap<String, Document> docs = new HashMap<String, Document>();
        ArrayList<Topic> events = new ArrayList<Topic>();
        new NewsDataLoader(constants).loadDocumentKeyFilesByTopics(events, docs, stopwords, DF);
        System.out.println(docs.size() + " docs are loaded!");
        // BufferedWriter dataFile = new BufferedWriter(new FileWriter("tdt.mallet.data.txt"));
        for (Document d : docs.values()) {
            BufferedWriter dataFile = new BufferedWriter(new FileWriter("/nfshomes/sayyadi/Desktop/GeoNets/hassan/source_codes/mallet-2.0.5/tdtdata_new/"
                    + d.id + ".txt"));
            // dataFile.write(d.id + ".txt eng");
            for (Keyword k : d.keywords.values())
                for (int z = 0; z < k.tf; z++)
                    dataFile.write(" " + k.baseForm);
            dataFile.write("\n");
            dataFile.close();
        }
        // dataFile.close();
    }
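
    /**
     * Sweeps EDGE_CORRELATION_MIN from 0.05 to 0.5 and appends the evaluation
     * scores for each setting to parameters.txt.
     */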
    public static void robustness() throws Exception {
        HashSet<String> stopwords = Utils.importStopwords();
        DataOutputStream out = new DataOutputStream(new FileOutputStream("parameters.txt"));
        for (double i = .05; i <= .5; i += .05) {
            constants.EDGE_CORRELATION_MIN = i;
            HashMap<String, Double> DF = new HashMap<String, Double>();
            HashMap<String, Document> docs = new HashMap<String, Document>();
            ArrayList<Topic> events = new ArrayList<Topic>();
            new NewsDataLoader(constants).loadDocumentKeyFilesByTopics(events, docs, stopwords, DF);
            try {
                double[] result = EvaluationAnalyze.evaluate(events, new DocumentAnalyze(constants).clusterbyKeyGraph(docs, DF));
                out.writeBytes(i + "\t" + result[0] + "\t" + result[1] + "\t" + result[2] + "\r\r\n");
                out.flush();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        out.close();
        System.out.println("done");
    }
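
    /**
     * Grid search over the main clustering thresholds (document-to-keygraph and
     * document-to-centroid similarity, edge correlation, edge and node document
     * frequency, cluster intersection), writing each parameter combination and its
     * evaluation scores to parameters.txt.
     */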
    public static void runParameterEstimation() throws Exception {
        HashSet<String> stopwords = Utils.importStopwords();
        DataOutputStream out = new DataOutputStream(new FileOutputStream("parameters.txt"));
        for (double i = .1; i < .5; i += .05)
            for (double j = .001; j < .03; j += .005)
                for (double k = 0.25; k > .1; k -= .03)
                    for (double l = .25; l <= .25; l += .05)
                        for (int m = 2; m < 6; m++)
                            for (int n = 3; n < 8; n++) {
                                constants.DOC_SIM2KEYGRAPH_MIN = i;
                                constants.DOC_SIM2CENTROID_MIN = j;
                                constants.EDGE_CORRELATION_MIN = k;
                                constants.EDGE_DF_MIN = m;
                                constants.NODE_DF_MIN = n;
                                constants.CLUSTER_INTERSECT_MIN = l;
                                HashMap<String, Double> DF = new HashMap<String, Double>();
                                HashMap<String, Document> docs = new HashMap<String, Document>();
                                ArrayList<Topic> events = new ArrayList<Topic>();
                                new NewsDataLoader(constants).loadDocumentKeyFilesByTopics(events, docs, stopwords, DF);
                                try {
                                    double[] result = EvaluationAnalyze.evaluate(events, new DocumentAnalyze(constants).clusterbyKeyGraph(docs, DF));
                                    out.writeBytes(constants.DOC_SIM2KEYGRAPH_MIN + "\t" + constants.DOC_SIM2CENTROID_MIN + "\t"
                                            + constants.EDGE_CORRELATION_MIN + "\t" + constants.EDGE_DF_MIN + "\t" + constants.NODE_DF_MIN + "\t"
                                            + constants.CLUSTER_INTERSECT_MIN + "\t" + result[0] + "\t" + result[1] + "\t" + result[2] + "\r\r\n");
                                    out.flush();
                                } catch (Exception e) {
                                    e.printStackTrace();
                                }
                            }
        out.close();
        System.out.println("done");
    }
    // public static void generateDateFiles() {
    //     DateFormat df = new SimpleDateFormat("yyyyMMdd.HHmm");
    //     try {
    //         File f = new File(Constants.DATA_TOPIC_PATH);
    //         File[] files = f.listFiles();
    //         HashMap<String, HashSet<String>> dates = new HashMap<String, HashSet<String>>();
    //         for (File ff : files) {
    //             DataInputStream reader = new DataInputStream(new FileInputStream(ff));
    //             String line = null;
    //             while ((line = reader.readLine()) != null && line.trim().length() > 0) {
    //                 Timestamp publishdate = new Timestamp(df.parse(line.substring(3, line.lastIndexOf('.'))).getTime());
    //                 String key = publishdate.toString().substring(0, 7);
    //                 if (dates.get(key) == null)
    //                     dates.put(key, new HashSet<String>());
    //                 dates.get(key).add(line);
    //             }
    //         }
    //         for (String key : dates.keySet()) {
    //             BufferedWriter out = new BufferedWriter(new FileWriter("data/date/" + key));
    //             for (String file : dates.get(key))
    //                 out.write(file + "\n");
    //             out.close();
    //         }
    //     } catch (Exception e) {
    //         e.printStackTrace();
    //     }
    // }
    //
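
    /**
     * Collects every document id listed under ./data/topic_new/ and writes a shell
     * script (test.txt) of "cp $1/<id>.txt $2/" commands for copying those files.
     */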
    public static void test() throws Exception {
        // HashMap<String, Document> docs = new HashMap<String, Document>();
        BufferedReader in = null;
        String out = "$2/";
        HashSet<String> ids = new HashSet<String>();
        File f = new File("./data/topic_new/");
        File[] files = f.listFiles();
        for (int i = 0; i < files.length; i++) {
            in = new BufferedReader(new FileReader(files[i]));
            String line = null;
            while ((line = in.readLine()) != null)
                ids.add(line);
        }
        BufferedWriter writer = new BufferedWriter(new FileWriter("test.txt"));
        for (String id : ids)
            writer.write("cp $1/" + id + ".txt " + out + "\n");
        writer.close();
    }
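
    /**
     * Streams the raw text of the documents listed in alltopics.txt, tokenizes and
     * stems each one, accumulates document frequencies, and every 1000 documents
     * prints how many terms fall in the mid-range DF band counted by sizeof().
     */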
    public static void keygraphGrow2() throws Exception {
        HashSet<String> stopwords = Utils.importStopwords();
        Porter porter = new Porter();
        HashMap<String, Double> DF = new HashMap<String, Double>();
        DataInputStream topicin = new DataInputStream(new FileInputStream("/fs/clip-clip-proj/GeoNets/EntityTDT/hassan/output/allfiletopics/alltopics.txt"));
        String fileName = null;
        int i = 0;
        while ((fileName = topicin.readLine()) != null) {
            try {
                DataInputStream in = new DataInputStream(new FileInputStream(constants.DATA_TEXT_PATH + fileName + ".txt"));
                String content = "";
                String line = null;
                HashSet<String> keywords = new HashSet<String>();
                while ((line = in.readLine()) != null)
                    content += line;
                StringTokenizer st = new StringTokenizer(content, "!?|\"' -_@0123456789.,;#$&%/\\*()<>\t");
                while (st.hasMoreTokens()) {
                    String word = st.nextToken();
                    String token = word.toLowerCase();
                    String base = "";
                    if ((token.indexOf("?") == -1 && token.length() > 2 && !stopwords.contains(token)))
                        base = porter.stripAffixes(token);
                    if (base.length() > 2)
                        keywords.add(base);
                }
                i++;
                in.close();
                for (String word : keywords)
                    if (!DF.containsKey(word))
                        DF.put(word, 1.);
                    else
                        DF.put(word, DF.get(word) + 1);
            } catch (Exception e) {
                e.printStackTrace();
            }
            if (i % 1000 == 0)
                System.out.println(i + "\t" + sizeof(DF, i));
        }
    }
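
    /**
     * Counts the terms whose document frequency is at least size/100 and less than
     * size/4.
     */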
    public static int sizeof(HashMap<String, Double> DF, double size) {
        int count = 0;
        for (double df : DF.values())
            if (df < size / 4 && df >= size / 100)
                count++;
        return count;
    }
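
    /**
     * Measures how the keyword graph grows with corpus size: for increasing sample
     * percentages it reloads the data, rebuilds the graph, and appends the document
     * and graph-node counts to grow.txt (after rewriting its previous contents).
     */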
    public static void keygraphGrow() throws Exception {
        DataInputStream in = new DataInputStream(new FileInputStream("grow.txt"));
        String line = null;
        String content = "";
        while ((line = in.readLine()) != null)
            content += line + "\n";
        in.close();
        DataOutputStream out = new DataOutputStream(new FileOutputStream("grow.txt"));
        out.writeBytes(content);
        ArrayList<Integer> docscount = new ArrayList<Integer>();
        ArrayList<Integer> wordscount = new ArrayList<Integer>();
        for (double perc = .01; perc <= .3; perc += .02) {
            NewsDataLoader.percent = perc;
            HashSet<String> stopwords = Utils.importStopwords();
            HashMap<String, Double> DF = new HashMap<String, Double>();
            HashMap<String, Document> docs = new HashMap<String, Document>();
            ArrayList<Topic> events = new ArrayList<Topic>();
            new NewsDataLoader(constants).loadDocumentKeyFilesByTopics(events, docs, stopwords, DF);
            System.out.println(docs.size() + " docs are loaded!");
            GraphAnalyze keyGraph = new GraphAnalyze(constants);
            keyGraph.buildGraph(docs, DF, constants.REMOVE_DUPLICATES);
            docscount.add(docs.size());
            wordscount.add(keyGraph.graphNodes.size());
            out.writeBytes(docs.size() + "\t" + keyGraph.graphNodes.size() + "\n");
            out.flush();
        }
        System.out.println("*****");
        for (Integer s : docscount)
            System.out.println(s);
        System.out.println("*****");
        for (Integer s : wordscount)
            System.out.println(s);
        out.close();
    }
}