PageRenderTime 46ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/live/src/main/java/org/dbpedia/extraction/live/helper/LiveConfigReader.java

https://gitlab.com/varunkothamachu/extraction-framework
Java | 275 lines | 159 code | 58 blank | 58 comment | 19 complexity | 82d9b8f27be45ff5867c445a19eb60eb MD5 | raw file
  1. package org.dbpedia.extraction.live.helper;
  2. import org.slf4j.Logger;
  3. // import org.apache.xerces.parsers.DOMParser;
  4. import org.dbpedia.extraction.live.core.Constants;
  5. import org.dbpedia.extraction.mappings.ArticleCategoriesExtractor;
  6. import org.dbpedia.extraction.mappings.SkosCategoriesExtractor;
  7. import org.dbpedia.extraction.util.Language;
  8. import org.dbpedia.extraction.wikiparser.Namespace;
  9. import org.dbpedia.extraction.wikiparser.impl.wikipedia.Namespaces;
  10. import org.slf4j.LoggerFactory;
  11. import org.w3c.dom.Document;
  12. import org.w3c.dom.Element;
  13. import org.w3c.dom.NodeList;
  14. import javax.xml.parsers.DocumentBuilder;
  15. import javax.xml.parsers.DocumentBuilderFactory;
  16. import java.io.File;
  17. import java.util.ArrayList;
  18. import java.util.HashMap;
  19. import java.util.List;
  20. import java.util.Map;
  21. /**
  22. * Created by IntelliJ IDEA.
  23. * User: mabrouk
  24. * Date: Jul 30, 2010
  25. * Time: 3:56:55 PM
  26. * This class reads the configuration file of the live extraction.
  27. */
  28. public class LiveConfigReader {
  29. private static Logger logger = LoggerFactory.getLogger(LiveConfigReader.class);
  30. // private static DOMParser parser = new DOMParser();
  31. private static final String liveConfigFile = "./live.xml";
  32. private static DocumentBuilderFactory dbFactory;
  33. private static DocumentBuilder dBuilder;
  34. private static Document doc;
  35. //Tag names that are use in live.config file
  36. private static final String EXTRACTOR_TAGNAME = "extractor";
  37. private static final String LANUAGE_TAGNAME = "language";
  38. private static final String UPDATE_ONTOLGY_AND_MAPPINGS_PERIOD_TAGNAME = "updateOntologyAndMappingsPeriod";
  39. private static final String NAME_ATTRIBUTENAME = "name";
  40. private static final String EXTRACTOR_STATUS_ATTRIBUTENAME = "status";
  41. private static final String MATCH_PATTERN_TAGNAME = "matchPattern";
  42. private static final String MATCH_PATTERN_TYPE_ATTRIBUTENAME = "type";
  43. private static final String PEXACT_ATTRIBUTENAME = "pexact";
  44. private static final String SUBJECT_TAGNAME = "s";
  45. private static final String PREDICATE_TAGNAME = "p";
  46. private static final String OBJECT_TAGNAME = "o";
  47. private static final String NOTICE_TAGNAME = "notice";
  48. public static Map<Language,Map<String, ExtractorSpecification>> extractors = null;
  49. public static Map<Language, List<Class>> extractorClasses = null;
  50. //Initialize the static members
  51. static{
  52. try{
  53. dbFactory = DocumentBuilderFactory.newInstance();
  54. dBuilder = dbFactory.newDocumentBuilder();
  55. doc = dBuilder.parse(new File(liveConfigFile));
  56. readExtractors();
  57. /** Ontology source */
  58. // JavaConversions.asEnumeration(WikiTitle.Namespace());
  59. // Source ontologySource = WikiSource.fromNamespaces(Set(WikiTitle.Namespace().OntologyClass, WikiTitle.Namespace.OntologyProperty),
  60. // new URL("http://mappings.dbpedia.org/api.php"), Language.Default() );
  61. //
  62. // /** Mappings source */
  63. // Source mappingsSource = WikiSource.fromNamespaces(Set(WikiTitle.Namespace.Mapping),
  64. // new URL("http://mappings.dbpedia.org/api.php"), Language.Default() );
  65. }
  66. catch(Exception exp){
  67. logger.error(exp.getMessage(), exp);
  68. }
  69. }
  70. /**
  71. * Reads each langauge along with its set of extractors
  72. */
  73. private static void readExtractors(){
  74. NodeList languageNodes = doc.getElementsByTagName(LANUAGE_TAGNAME);
  75. //iterate and build the required list of extractors
  76. extractors = new HashMap<Language,Map<String,ExtractorSpecification>>();
  77. extractorClasses = new HashMap<Language,List<Class>>();
  78. for(int i=0; i<languageNodes.getLength(); i++){
  79. Element elemLanguage = (Element)languageNodes.item(i);
  80. String languageName = elemLanguage.getAttribute(NAME_ATTRIBUTENAME);
  81. Language language = Language.apply(languageName);
  82. readLanguageExtractors(elemLanguage, language);
  83. }
  84. }
  85. /**
  86. * Gets the list of extractors specified in the config file along with the status of each extractor
  87. * @param elemLanguageExtractors The XML element containing the extractors of a language
  88. * @param lang The language code
  89. * */
  90. private static void readLanguageExtractors(Element elemLanguageExtractors, Language lang){
  91. try{
  92. NodeList extractorNodes = elemLanguageExtractors.getElementsByTagName(EXTRACTOR_TAGNAME);
  93. Map<String, ExtractorSpecification> langExtractors = new HashMap<String, ExtractorSpecification>(20);
  94. ArrayList<Class> langExtractorClasses = new ArrayList<Class>(20);
  95. //iterate and build the required list of extractors
  96. for(int i=0; i<extractorNodes.getLength(); i++){
  97. MatchPattern extractorSpecificPattern = null;
  98. Element elemExtractor = (Element)extractorNodes.item(i);
  99. String extractorID = elemExtractor.getAttribute(NAME_ATTRIBUTENAME);
  100. ExtractorStatus status = ExtractorStatus.valueOf(elemExtractor.getAttribute(EXTRACTOR_STATUS_ATTRIBUTENAME));
  101. langExtractorClasses.add((Class)(ClassLoader.getSystemClassLoader().loadClass(extractorID)));
  102. //Those types of extractors need special type of handling as we must call the function _addGenerics for
  103. //them
  104. if((extractorID.equals(SkosCategoriesExtractor.class.toString())) ||
  105. (extractorID.equals(ArticleCategoriesExtractor.class.toString())))
  106. extractorSpecificPattern = _addGenerics(lang, extractorID);
  107. ArrayList<MatchPattern> patternsList = _getExtractorMatchPatterns(elemExtractor);
  108. if(extractorSpecificPattern != null)
  109. patternsList.add(extractorSpecificPattern);
  110. //Construct the extractor specification object and adds it to the extractors list
  111. langExtractors.put(extractorID, new ExtractorSpecification(extractorID, status,
  112. patternsList, _getExtractorNotices(elemExtractor)));
  113. extractors.put(lang, langExtractors);
  114. extractorClasses.put(lang, langExtractorClasses);
  115. }
  116. // LiveExtractionConfigLoader.convertExtractorListToScalaList(extractorClasses);
  117. System.out.println(extractors);
  118. }
  119. catch(Exception exp){
  120. logger.error(exp.getMessage(), exp);
  121. }
  122. }
  123. /**
  124. * Loads the generic match patterns for some extractors e.g. SkosCategoriesExtractor, because those extractors
  125. * need a specific pattern for language specific category
  126. * @param lang The required language
  127. * @param extractorID The ID of the required extractor
  128. * @return The match pattern suitable for the passed extractor
  129. */
  130. private static MatchPattern _addGenerics(Language lang, String extractorID) {
  131. MatchPattern pattern = null;
  132. if(extractorID.equals(SkosCategoriesExtractor.class.toString())){
  133. pattern = new MatchPattern(MatchType.STARTSWITH, "", Constants.SKOS_BROADER,
  134. Namespaces.names(lang).get(Namespace.Category()).toString(), true);
  135. }
  136. else if(extractorID.equals(ArticleCategoriesExtractor.class.toString())){
  137. pattern = new MatchPattern(MatchType.STARTSWITH, "", Constants.SKOS_SUBJECT,
  138. Namespaces.names(lang).get(Namespace.Category()).toString() , true);
  139. }
  140. return pattern;
  141. }
  142. /**
  143. * Constructs a list of match patterns associated with the passed extractor
  144. * @param extractorElem XML element containing the full specification of the extractor
  145. * @return A list of patterns of the extractor
  146. */
  147. private static ArrayList<MatchPattern> _getExtractorMatchPatterns(Element extractorElem){
  148. ArrayList<MatchPattern> patterns = new ArrayList<MatchPattern>();
  149. NodeList patternNodes = extractorElem.getElementsByTagName(MATCH_PATTERN_TAGNAME);
  150. try{
  151. for(int i=0; i<patternNodes.getLength(); i++){
  152. Element elemPattern = (Element)patternNodes.item(i);
  153. MatchType type = MatchType.valueOf(elemPattern.getAttribute(MATCH_PATTERN_TYPE_ATTRIBUTENAME));
  154. boolean pexact = Boolean.parseBoolean(elemPattern.getAttribute(PEXACT_ATTRIBUTENAME));
  155. String subject = elemPattern.getElementsByTagName(SUBJECT_TAGNAME).item(0).getTextContent();
  156. //Since we are using name like RDFS_LABEL in the live.xml file, then we should use the reflection
  157. //to get its actual string value from the Constants class
  158. try{
  159. if(Constants.class.getField(subject).get(Constants.class) != null)
  160. subject = Constants.class.getField(subject).get(Constants.class).toString();
  161. }
  162. catch(Exception exp){}
  163. String predicate = elemPattern.getElementsByTagName(PREDICATE_TAGNAME).item(0).getTextContent();
  164. try{
  165. if(Constants.class.getField(predicate).get(Constants.class) != null)
  166. predicate = Constants.class.getField(predicate).get(Constants.class).toString();
  167. }
  168. catch(Exception exp){}
  169. String object = elemPattern.getElementsByTagName(OBJECT_TAGNAME).item(0).getTextContent();
  170. try{
  171. if(Constants.class.getField(object).get(Constants.class) != null)
  172. object = Constants.class.getField(object).get(Constants.class).toString();
  173. }
  174. catch(Exception exp){}
  175. patterns.add(new MatchPattern(type, subject, predicate, object, pexact));
  176. }
  177. }
  178. catch(Exception exp){
  179. }
  180. return patterns.size()>0? patterns : null;
  181. }
  182. /**
  183. * Constructs a list of notices associated with the passed extractor
  184. * @param extractorElem extractorElem XML element containing the full specification of the extractor
  185. * @return A list of norices of the extractor
  186. */
  187. private static ArrayList<String> _getExtractorNotices(Element extractorElem){
  188. ArrayList<String> notices = new ArrayList<String>();
  189. NodeList extractorNotices = extractorElem.getElementsByTagName(NOTICE_TAGNAME);
  190. for(int i=0; i<extractorNotices.getLength(); i++){
  191. Element noticeElem = (Element)extractorNotices.item(i);
  192. notices.add(noticeElem.getTextContent());
  193. }
  194. return notices.size()>0? notices : null;
  195. }
  196. /**
  197. * Returns the extractors with the passed status
  198. * @param lang The required language for which the extractors should be returned
  199. * @param requiredStatus The status of the extractors
  200. * @return A list containing the extractors of the passed status
  201. */
  202. public static List<Class> getExtractors(Language lang, ExtractorStatus requiredStatus){
  203. List<Class> extractorsList = extractorClasses.get(lang);
  204. Map<String, ExtractorSpecification> specs = extractors.get(lang);
  205. for(Object value : specs.values()){
  206. ExtractorSpecification spec = (ExtractorSpecification) value;
  207. if(spec.status != requiredStatus){
  208. try{
  209. extractorsList.remove(Class.forName(spec.extractorID));
  210. }
  211. catch(Exception exp){
  212. }
  213. }
  214. }
  215. return extractorsList;
  216. }
  217. }