PageRenderTime 45ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/src/edu/psu/chemxseer/structure/preprocess/RandomChoseDBGraph.java

https://github.com/Santa827/Chemxseer_subSearch
Java | 396 lines | 301 code | 17 blank | 78 comment | 32 complexity | a8ec43c64da158b27360fcd26f9ffe28 MD5 | raw file
  1. package edu.psu.chemxseer.structure.preprocess;
  2. import java.io.BufferedReader;
  3. import java.io.BufferedWriter;
  4. import java.io.FileReader;
  5. import java.io.FileWriter;
  6. import java.io.IOException;
  7. import java.text.ParseException;
  8. import java.text.SimpleDateFormat;
  9. import java.util.Arrays;
  10. import java.util.Date;
  11. import java.util.Random;
  12. import de.parmol.graph.Graph;
  13. import de.parmol.parsers.GraphParser;
  14. import edu.psu.chemxseer.structure.subsearch.FGindex.EdgeIndex;
  15. import edu.psu.chemxseer.structure.subsearch.Impl.GraphDatabase_OnDisk;
  16. import edu.psu.chemxseer.structure.subsearch.Interfaces.GraphDatabase;
  17. public class RandomChoseDBGraph {
  18. public static void saveGDB(Graph[] graphs, GraphParser gParser, String fileName) throws IOException{
  19. // First Step: filter out graphs that are not in between the boundary
  20. BufferedWriter dbWriter = new BufferedWriter(new FileWriter(fileName));
  21. int count = 0;
  22. float edgeNum = 0, nodeNum =0;
  23. for(int i = 0; i< graphs.length; i++){
  24. Graph theGraph = graphs[i];
  25. dbWriter.write(count++ + " => " + gParser.serialize(theGraph)+"\n");
  26. edgeNum += theGraph.getEdgeCount();
  27. nodeNum += theGraph.getNodeCount();
  28. }
  29. dbWriter.close();
  30. // Intrigue java garbage collector
  31. Runtime r = Runtime.getRuntime();
  32. r.gc();
  33. // Write the meta information of the new file
  34. BufferedWriter metaWriter = new BufferedWriter(new FileWriter(fileName + "_Meta"));
  35. // 1. Processing Date
  36. SimpleDateFormat bartDateFormat = new SimpleDateFormat("EEEE-MMMM-dd-yyyy");
  37. Date date = new Date();
  38. metaWriter.write(bartDateFormat.format(date));
  39. metaWriter.newLine();
  40. // 2. Number of graphs in this file
  41. metaWriter.write("Number of Graphs:" + count);
  42. metaWriter.newLine();
  43. metaWriter.write("Average EdgeNum: " + (float)(edgeNum)/count + ", Average NodeNum: " +
  44. (float)(nodeNum)/count);
  45. // Close meta data file
  46. try {
  47. metaWriter.close();
  48. } catch (IOException e) {
  49. e.printStackTrace();
  50. }
  51. }
  52. /**
  53. * Given a graph database gDB, first prune distinct edges with the edgeIndex
  54. * Then randomly selected chooseN graphs
  55. * @param gDB
  56. * @param chooseN
  57. * @param pruneDistinctEdges
  58. * @param edgeIndex
  59. * @return
  60. * @throws IOException
  61. * @throws ParseException
  62. */
  63. public static Graph[] randomlyChooseDBGraph(GraphDatabase_OnDisk gDB, int chooseN,
  64. boolean pruneDistinctEdges, EdgeIndex edgeIndex) throws IOException, ParseException {
  65. if(pruneDistinctEdges == false)
  66. return randomlyChooseDBGraph(gDB, chooseN);
  67. // First Step: filter out graphs that are not in between the boundary
  68. String tempFileName = gDB.getDBFileName() + "_temp";
  69. BufferedWriter tempDBWriter = new BufferedWriter(new FileWriter(tempFileName));
  70. int count = 0;
  71. float edgeNum = 0, nodeNum =0;
  72. for(int i = 0; i< gDB.getTotalNum(); i++){
  73. Graph theGraph = gDB.findGraph(i);
  74. if(edgeIndex.containInfrequentEdges(theGraph))
  75. continue;
  76. else{
  77. tempDBWriter.write(count++ + " => " + gDB.findGraphString(i)+"\n");
  78. edgeNum += theGraph.getEdgeCount();
  79. nodeNum += theGraph.getNodeCount();
  80. }
  81. }
  82. tempDBWriter.close();
  83. // Intrigue java garbage collector
  84. Runtime r = Runtime.getRuntime();
  85. r.gc();
  86. // Write the meta information of the new file
  87. BufferedWriter metaWriter = new BufferedWriter(new FileWriter(tempFileName + "_Meta"));
  88. // 1. Processing Date
  89. SimpleDateFormat bartDateFormat = new SimpleDateFormat("EEEE-MMMM-dd-yyyy");
  90. Date date = new Date();
  91. metaWriter.write(bartDateFormat.format(date));
  92. metaWriter.newLine();
  93. // 2. Number of graphs in this file
  94. metaWriter.write("Number of Graphs:" + count);
  95. metaWriter.newLine();
  96. metaWriter.write("Average EdgeNum: " + (float)(edgeNum)/count + ", Average NodeNum: " +
  97. (float)(nodeNum)/count);
  98. // Close meta data file
  99. try {
  100. metaWriter.close();
  101. } catch (IOException e) {
  102. e.printStackTrace();
  103. }
  104. return randomlyChooseDBGraph(new GraphDatabase_OnDisk(tempFileName, gDB.getParser()), chooseN);
  105. }
  106. /**
  107. * Given a graph database gDB, randomly chooseN graphs, then return those graphs
  108. * @param gDB
  109. * @param chooseN
  110. * @return
  111. * @throws ParseException
  112. * @throws IOException
  113. */
  114. public static Graph[] randomlyChooseDBGraph(GraphDatabase gDB, int chooseN) throws ParseException, IOException{
  115. // First get the number of graphs in this database, which can be found in the metadata file
  116. int m = gDB.getTotalNum();
  117. //System.out.println("Choose " + chooseN + " from " + m);
  118. if(m < chooseN){
  119. System.out.println("There is not need to choseN, the database is smaller than chooseN");
  120. return null;
  121. }
  122. // Then randomly select n from m: the first chooseN indexes are sorted as stored in indexes
  123. int[] indexes = new int[m];
  124. for(int i = 0; i< m; i++)
  125. indexes[i] = i;
  126. Random rd = new Random();
  127. int j = 0;
  128. int swapTemp = 0;;
  129. for(int i = 0; i< chooseN; i++){
  130. j = (int) (rd.nextFloat() * (m-i))+i;
  131. swapTemp = indexes[i];
  132. indexes[i]=indexes[j];
  133. indexes[j] = swapTemp;
  134. }
  135. Arrays.sort(indexes, 0, chooseN);
  136. // Read those db graphs and save them into the new file
  137. Graph[] results = new Graph[chooseN];
  138. for(int i = 0; i< chooseN; i++){
  139. int gID = indexes[i];
  140. results[i] = gDB.findGraph(gID);
  141. }
  142. // Intrigue java garbage collector
  143. Runtime r = Runtime.getRuntime();
  144. r.gc();
  145. return results;
  146. }
  147. /**
  148. * Given a graph database gDB, randomly choose N graphs, and then store them in the chosen database
  149. * @param gDB
  150. * @param chooseN
  151. * @param chosenDBName
  152. * @return
  153. * @throws IOException
  154. */
  155. public static void randomlyChooseDBGraph(GraphDatabase gDB, int chooseN, String chosenDBName) throws IOException{
  156. // First get the number of graphs in this database, which can be found in the metadata file
  157. int m = gDB.getTotalNum();
  158. System.out.println("Choose " + chooseN + " from " + m);
  159. if(m < chooseN){
  160. System.out.println("There is not need to choseN, the database is smaller than chooseN");
  161. return;
  162. }
  163. // Then randomly select n from m: the first chooseN indexes are sorted as stored in indexes
  164. int[] indexes = new int[m];
  165. for(int i = 0; i< m; i++)
  166. indexes[i] = i;
  167. Random rd = new Random();
  168. int j = 0;
  169. int swapTemp = 0;;
  170. for(int i = 0; i< chooseN; i++){
  171. j = (int) (rd.nextFloat() * (m-i))+i;
  172. swapTemp = indexes[i];
  173. indexes[i]=indexes[j];
  174. indexes[j] = swapTemp;
  175. }
  176. Arrays.sort(indexes, 0, chooseN);
  177. BufferedWriter chosenDBWriter = new BufferedWriter(new FileWriter(chosenDBName));
  178. String spliter = " => ";
  179. // Read those db graphs and save them into the new file
  180. for(int i = 0; i< chooseN; i++){
  181. int gID = indexes[i];
  182. String gString = gDB.findGraphString(gID);
  183. chosenDBWriter.write(i + spliter +gString);
  184. chosenDBWriter.newLine();
  185. }
  186. chosenDBWriter.close();
  187. // Intrigue java garbage collector
  188. Runtime r = Runtime.getRuntime();
  189. r.gc();
  190. BufferedWriter metaWriter = new BufferedWriter(new FileWriter(chosenDBName+ "_Meta"));
  191. // 1. Processing Date
  192. SimpleDateFormat bartDateFormat = new SimpleDateFormat("EEEE-MMMM-dd-yyyy");
  193. Date date = new Date();
  194. metaWriter.write(bartDateFormat.format(date));
  195. metaWriter.newLine();
  196. // 2. Number of graphs in this file
  197. metaWriter.write("Number of Graphs:" + chooseN);
  198. metaWriter.newLine();
  199. metaWriter.write("Average EdgeNum: " + 0 + ", Average NodeNum: " +
  200. 0);
  201. // Close meta data file
  202. try {
  203. metaWriter.close();
  204. } catch (IOException e) {
  205. e.printStackTrace();
  206. }
  207. }
  208. public static void randomlySplitDBGraph(GraphDatabase gDB, int chooseN, String chosenDBName,
  209. String leftDBName) throws IOException{
  210. // First get the number of graphs in this database, which can be found in the metadata file
  211. int m = gDB.getTotalNum();
  212. System.out.println("Choose " + chooseN + " from " + m);
  213. if(m < chooseN){
  214. System.out.println("There is not need to choseN, the database is smaller than chooseN");
  215. return;
  216. }
  217. // Then randomly select n from m: the first chooseN indexes are sorted as stored in indexes
  218. int[] indexes = new int[m];
  219. for(int i = 0; i< m; i++)
  220. indexes[i] = i;
  221. Random rd = new Random();
  222. int j = 0;
  223. int swapTemp = 0;;
  224. for(int i = 0; i< chooseN; i++){
  225. j = (int) (rd.nextFloat() * (m-i))+i;
  226. swapTemp = indexes[i];
  227. indexes[i]=indexes[j];
  228. indexes[j] = swapTemp;
  229. }
  230. Arrays.sort(indexes, 0, chooseN);
  231. Arrays.sort(indexes, chooseN, m);
  232. BufferedWriter chosenDBWriter = new BufferedWriter(new FileWriter(chosenDBName));
  233. String spliter = " => ";
  234. // Read those db graphs and save them into the new file
  235. for(int i = 0; i< chooseN; i++){
  236. int gID = indexes[i];
  237. String gString = gDB.findGraphString(gID);
  238. chosenDBWriter.write(i + spliter +gString);
  239. chosenDBWriter.newLine();
  240. }
  241. chosenDBWriter.close();
  242. // Read those db graphs and save them into the new file
  243. BufferedWriter leftDBWriter = new BufferedWriter(new FileWriter(leftDBName));
  244. for(int i = chooseN; i< m; i++){
  245. int gID = indexes[i];
  246. String gString = gDB.findGraphString(gID);
  247. leftDBWriter.write((i-chooseN) + spliter +gString);
  248. leftDBWriter.newLine();
  249. }
  250. leftDBWriter.close();
  251. // Intrigue java garbage collector
  252. Runtime r = Runtime.getRuntime();
  253. r.gc();
  254. BufferedWriter metaWriter = new BufferedWriter(new FileWriter(chosenDBName+ "_Meta"));
  255. BufferedWriter metaWriter2 = new BufferedWriter(new FileWriter(leftDBName + "_Meta"));
  256. // 1. Processing Date
  257. SimpleDateFormat bartDateFormat = new SimpleDateFormat("EEEE-MMMM-dd-yyyy");
  258. Date date = new Date();
  259. metaWriter.write(bartDateFormat.format(date));
  260. metaWriter.newLine();
  261. metaWriter2.write(bartDateFormat.format(date));
  262. metaWriter2.newLine();
  263. // 2. Number of graphs in this file
  264. metaWriter.write("Number of Graphs:" + chooseN);
  265. metaWriter.newLine();
  266. metaWriter.write("Average EdgeNum: " + 0 + ", Average NodeNum: " +
  267. 0);
  268. metaWriter2.write("Number of Graphs:" + (m-chooseN));
  269. metaWriter2.newLine();
  270. metaWriter2.write("Average EdgeNum: " + 0 + ", Average NodeNum: " +
  271. 0);
  272. // Close meta data file
  273. try {
  274. metaWriter.close();
  275. metaWriter2.close();
  276. } catch (IOException e) {
  277. e.printStackTrace();
  278. }
  279. }
  280. /**
  281. *
  282. * @param smilesDBFile
  283. * @param chooseN
  284. * @param chosenDBFile
  285. * @throws ParseException
  286. * @throws IOException
  287. */
  288. public static void randomlyChooseTwoDBGraph(String smilesDBFile, int chooseN,
  289. String chosenDBFile) throws ParseException, IOException{
  290. // First get the number of graphs in this database, which can be found in the metadata file
  291. BufferedReader metaFile = new BufferedReader(new FileReader(smilesDBFile + "_Meta"));
  292. metaFile.readLine();
  293. String[] temp = metaFile.readLine().split(":");
  294. int m = Integer.parseInt(temp[1]);
  295. metaFile.close();
  296. int edgeNum = 0, nodeNum = 0;
  297. // Then randomly select n from m: the first chooseN indexes are sorted as stored in indexes
  298. int[] indexes = new int[m];
  299. for(int i = 0; i< m; i++)
  300. indexes[i] = i;
  301. Random rd = new Random();
  302. int j = 0;
  303. int swapTemp = 0;;
  304. for(int i = 0; i< 2*chooseN; i++){
  305. j = (int) (rd.nextFloat() * (m-i))+i;
  306. swapTemp = indexes[i];
  307. indexes[i]=indexes[j];
  308. indexes[j] = swapTemp;
  309. }
  310. Arrays.sort(indexes, 0, 2*chooseN);
  311. // Read those db graphs and save them into the new file
  312. BufferedReader fullDBReader = new BufferedReader(new FileReader(smilesDBFile));
  313. BufferedWriter chosenDBWriter1 = new BufferedWriter(new FileWriter(chosenDBFile+"_1"));
  314. BufferedWriter chosenDBWriter2 = new BufferedWriter(new FileWriter(chosenDBFile+"_2"));
  315. int fileLineIndex = 0;
  316. int i = 0;
  317. String aLine = null;
  318. String spliter = " => ";
  319. while((aLine = fullDBReader.readLine())!=null && i < 2* chooseN){
  320. if(fileLineIndex < indexes[i]){
  321. fileLineIndex++;
  322. continue; // keep on reading
  323. }
  324. else if(fileLineIndex == indexes[i] && i < chooseN){
  325. //index=> orignalIndex =>smiles
  326. String gString = aLine.split(spliter)[1];
  327. Graph g = MyFactory.getSmilesParser().parse(gString, MyFactory.getGraphFactory());
  328. edgeNum += g.getEdgeCount();
  329. nodeNum += g.getNodeCount();
  330. chosenDBWriter1.write(i + spliter +gString);
  331. chosenDBWriter1.newLine();
  332. i++;
  333. fileLineIndex++;
  334. }
  335. else if(fileLineIndex == indexes[i] && i >= chooseN){
  336. //index=> orignalIndex =>smiles
  337. String gString = aLine.split(spliter)[1];
  338. Graph g = MyFactory.getSmilesParser().parse(gString, MyFactory.getGraphFactory());
  339. edgeNum += g.getEdgeCount();
  340. nodeNum += g.getNodeCount();
  341. chosenDBWriter2.write(i + spliter +gString);
  342. chosenDBWriter2.newLine();
  343. i++;
  344. fileLineIndex++;
  345. }
  346. else if(fileLineIndex > indexes[i])
  347. System.out.println("Exception: Processor: randomlyChooseDBGraph");
  348. }
  349. // Close out File
  350. try {
  351. chosenDBWriter1.close();
  352. chosenDBWriter2.close();
  353. fullDBReader.close();
  354. } catch (IOException e) {
  355. e.printStackTrace();
  356. }
  357. // Intrigue java garbage collector
  358. Runtime r = Runtime.getRuntime();
  359. r.gc();
  360. // Write the meta information of the new file
  361. BufferedWriter metaWriter = new BufferedWriter(new FileWriter(chosenDBFile+"_1" + "_Meta"));
  362. // 1. Processing Date
  363. SimpleDateFormat bartDateFormat = new SimpleDateFormat("EEEE-MMMM-dd-yyyy");
  364. Date date = new Date();
  365. metaWriter.write(bartDateFormat.format(date));
  366. metaWriter.newLine();
  367. // 2. Number of graphs in this file
  368. metaWriter.write("Number of Graphs:" + i);
  369. metaWriter.newLine();
  370. metaWriter.write("Average EdgeNum: " + (float)(edgeNum)/i + ", Average NodeNum: " +
  371. (float)(nodeNum)/i);
  372. // Close meta data file
  373. try {
  374. metaWriter.close();
  375. } catch (IOException e) {
  376. e.printStackTrace();
  377. }
  378. }
  379. }