PageRenderTime 6287ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/LuceneImplementation/src/com/selman/ReadMainFiles.java

https://github.com/selmantayyar/LuceneImplementation
Java | 160 lines | 115 code | 30 blank | 15 comment | 10 complexity | 73d1046bc856b0735907975ac4b800e0 MD5 | raw file
  1. package com.selman;
  2. import org.apache.poi.hwpf.HWPFDocument;
  3. import org.apache.poi.hwpf.extractor.WordExtractor;
  4. import org.apache.poi.poifs.filesystem.*;
  5. import java.io.*;
  6. import java.util.ArrayList;
  7. import java.util.List;
  8. import java.util.Properties;
  9. import java.util.regex.Matcher;
  10. import java.util.regex.Pattern;
  11. public class ReadMainFiles {
  12. public static List<Reference> readreferencesFile() throws IOException {
  13. String filesname = "C:\\Users\\stayyar\\Downloads\\references_for_selman.doc";
  14. POIFSFileSystem fs = null;
  15. String[] paragraphs =null;
  16. List<Reference> refObjectList = new ArrayList<Reference>();
  17. try {
  18. fs = new POIFSFileSystem(new FileInputStream(filesname));
  19. HWPFDocument doc = new HWPFDocument(fs);
  20. WordExtractor we = new WordExtractor(doc);
  21. paragraphs = we.getParagraphText();
  22. System.out.println("Word Document has " + paragraphs.length
  23. + " paragraphs");
  24. Pattern regex = Pattern.compile("[0-9]{4}");//find the date
  25. for (int i = 0; i < paragraphs.length; i++) {
  26. paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", "");
  27. System.out.println("RAW PARAGRAPH: "+paragraphs[i]);
  28. Matcher regexMatcher = regex.matcher(paragraphs[i]);
  29. if (regexMatcher.find())
  30. {
  31. String date=regexMatcher.group();
  32. String authors=paragraphs[i].substring(0, paragraphs[i].indexOf(date));
  33. Reference ref=new Reference();
  34. ref.setAuthor(authors);
  35. ref.setYear(date);
  36. ref.setDetail(paragraphs[i].toString());
  37. refObjectList.add(ref);
  38. System.out.println("Ref List,authors and date: "+authors+" ==" +date);
  39. }
  40. }
  41. } catch (Exception e) {
  42. e.printStackTrace();
  43. }
  44. return refObjectList;
  45. }
  46. public static List<Reference> readThesisExtractReferences() throws Exception {
  47. String filesname = readProperty("INPUT_FILE_NAME");
  48. POIFSFileSystem fs = null;
  49. List<String> references = new ArrayList<String>();
  50. List<Reference> refObjectList = new ArrayList<Reference>();
  51. Pattern regex = Pattern.compile("[({\\[].*?[\\]})]");//get the values between paranthesis.
  52. int k=0;
  53. try {
  54. fs = new POIFSFileSystem(new FileInputStream(filesname));
  55. HWPFDocument doc = new HWPFDocument(fs);
  56. WordExtractor we = new WordExtractor(doc);
  57. String[] paragraphs = we.getParagraphText();
  58. System.out.println("Thesis Document has " + paragraphs.length
  59. + " paragraphs");
  60. for (int i = 0; i < paragraphs.length; i++) {
  61. paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", "");
  62. Matcher regexMatcher = regex.matcher(paragraphs[i]);
  63. while (regexMatcher.find())
  64. { k++;
  65. //remove paranthesis
  66. String ref=regexMatcher.group().substring(1,regexMatcher.group().length()-1);
  67. //remove 'et al.' part to facilitate searching. there are lots of 'et al.' in a thesis.
  68. String newRef=ref.replace("et al.","");
  69. if(newRef.matches(".*[0-9]{4}.*"))//no number means no reference. because in each reference there should be a 4 digit date.
  70. references.add(newRef);
  71. }
  72. }
  73. } catch (Exception e) {
  74. e.printStackTrace();
  75. }
  76. System.out.println(+k +" references found");
  77. for (String string : references) {
  78. System.out.println("RAW REFERENCE: "+string);
  79. try {
  80. String []splitty=string.split(";");//refs are seperated by ';'
  81. for (int i = 0; i < splitty.length; i++) {
  82. String[] a=splitty[i].split(",");//eauthor and date in each ref seperated by ","
  83. if(a.length==2){
  84. Reference ref=new Reference();
  85. ref.setAuthor(a[0]);
  86. ref.setYear(a[1]);
  87. refObjectList.add(ref);
  88. System.out.println("reference: "+ref.toString());
  89. }
  90. else if (a.length==1) {//if comma is missed,we got the last 4 characters of a ref as date and the rest as author
  91. String year=splitty[i].substring(splitty[i].length()-5, splitty[i].length()) ;
  92. String author=splitty[i].substring(0, splitty[i].length()-5);
  93. Reference ref=new Reference();
  94. ref.setAuthor(author);
  95. ref.setYear(year);
  96. refObjectList.add(ref);
  97. System.out.println("reference irregular: "+ref.toString());
  98. }
  99. else{
  100. System.out.println("problematic entry: "+string);
  101. }
  102. }
  103. // if(a.length>2)
  104. // {
  105. // for (int i = 0; i < a.length; i++) {
  106. // System.out.println("authors and year: "+a[i]);
  107. // }
  108. // }
  109. } catch (Exception e) {
  110. // TODO Auto-generated catch block
  111. System.out.println("XXXXXXXXXXno semi-column splitting for "+string);
  112. }
  113. }
  114. return refObjectList;
  115. }
  116. public static String readProperty(String property) throws Exception, IOException{
  117. Properties prop = new Properties();
  118. prop.load(ReadMainFiles.class.getClassLoader().getResourceAsStream("config.properties"));
  119. return prop.getProperty(property);
  120. }
  121. /**
  122. * @param args
  123. * @throws IOException
  124. */
  125. public static void main(String[] args) throws IOException {
  126. // TODO Auto-generated method stub
  127. readreferencesFile();
  128. //readThesisExtractReferences();
  129. }
  130. }