LiveConfigReader.java - Tag names that are used in live.conf…

/live/src/main/java/org/dbpedia/extraction/live/helper/LiveConfigReader.java

https://gitlab.com/varunkothamachu/extraction-framework · Java · 275 lines · 159 code · 58 blank · 58 comment · 19 complexity · 82d9b8f27be45ff5867c445a19eb60eb MD5 · raw file


package org.dbpedia.extraction.live.helper;

import org.slf4j.Logger;
// import org.apache.xerces.parsers.DOMParser;
import org.dbpedia.extraction.live.core.Constants;
import org.dbpedia.extraction.mappings.ArticleCategoriesExtractor;
import org.dbpedia.extraction.mappings.SkosCategoriesExtractor;
import org.dbpedia.extraction.util.Language;
import org.dbpedia.extraction.wikiparser.Namespace;
import org.dbpedia.extraction.wikiparser.impl.wikipedia.Namespaces;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;


/**
 * Created by IntelliJ IDEA.
 * User: mabrouk
 * Date: Jul 30, 2010
 * Time: 3:56:55 PM
 * This class reads the configuration file of the live extraction.
 */
public class LiveConfigReader {

    private static final Logger logger = LoggerFactory.getLogger(LiveConfigReader.class);
    private static final String liveConfigFile = "./live.xml";

    private static DocumentBuilderFactory dbFactory;
    private static DocumentBuilder dBuilder;
    private static Document doc;

    //Tag names that are used in the live.xml configuration file
    private static final String EXTRACTOR_TAGNAME = "extractor";
    private static final String LANGUAGE_TAGNAME = "language";
    private static final String UPDATE_ONTOLGY_AND_MAPPINGS_PERIOD_TAGNAME = "updateOntologyAndMappingsPeriod";

    private static final String NAME_ATTRIBUTENAME = "name";
    private static final String EXTRACTOR_STATUS_ATTRIBUTENAME = "status";

    private static final String MATCH_PATTERN_TAGNAME = "matchPattern";
    private static final String MATCH_PATTERN_TYPE_ATTRIBUTENAME = "type";
    private static final String PEXACT_ATTRIBUTENAME = "pexact";

    private static final String SUBJECT_TAGNAME = "s";
    private static final String PREDICATE_TAGNAME = "p";
    private static final String OBJECT_TAGNAME = "o";
    private static final String NOTICE_TAGNAME = "notice";

    /**
     * Extractor specifications per language, keyed by the extractor's class name.
     * Stays {@code null} if the configuration file could not be loaded.
     */
    public static Map<Language,Map<String, ExtractorSpecification>>  extractors = null;

    /**
     * Loaded extractor classes per language, in the order they appear in the configuration file.
     * Stays {@code null} if the configuration file could not be loaded.
     */
    public static Map<Language, List<Class>> extractorClasses = null;

    //Initialize the static members; a failure is logged and leaves the maps unset or partially filled
    static{
        try{
            dbFactory = DocumentBuilderFactory.newInstance();
            dBuilder = dbFactory.newDocumentBuilder();
            doc = dBuilder.parse(new File(liveConfigFile));
            readExtractors();
        }
        catch(Exception exp){
            logger.error(exp.getMessage(), exp);
        }
    }

    /**
     * Reads each language along with its set of extractors
     */
    private static void readExtractors(){
        NodeList languageNodes = doc.getElementsByTagName(LANGUAGE_TAGNAME);
        extractors = new HashMap<Language,Map<String,ExtractorSpecification>>();
        extractorClasses = new HashMap<Language,List<Class>>();

        for(int i=0; i<languageNodes.getLength(); i++){
            Element elemLanguage = (Element)languageNodes.item(i);
            String languageName = elemLanguage.getAttribute(NAME_ATTRIBUTENAME);
            Language language = Language.apply(languageName);
            readLanguageExtractors(elemLanguage, language);
        }
    }

    /**
     * Gets the list of extractors specified in the config file along with the status of each extractor.
     * If an extractor entry cannot be processed, the error is logged, the remaining entries of that
     * language are skipped and the extractors read up to that point stay registered.
     * @param   elemLanguageExtractors  The XML element containing the extractors of a language
     * @param   lang    The language code
     * */
    private static void readLanguageExtractors(Element elemLanguageExtractors, Language lang){
        Map<String, ExtractorSpecification> langExtractors = new HashMap<String, ExtractorSpecification>(20);
        List<Class> langExtractorClasses = new ArrayList<Class>(20);

        try{
            NodeList extractorNodes = elemLanguageExtractors.getElementsByTagName(EXTRACTOR_TAGNAME);

            //iterate and build the required list of extractors
            for(int i=0; i<extractorNodes.getLength(); i++){
                Element elemExtractor = (Element)extractorNodes.item(i);
                String extractorID = elemExtractor.getAttribute(NAME_ATTRIBUTENAME);
                ExtractorStatus status = ExtractorStatus.valueOf(elemExtractor.getAttribute(EXTRACTOR_STATUS_ATTRIBUTENAME));

                langExtractorClasses.add(ClassLoader.getSystemClassLoader().loadClass(extractorID));

                ArrayList<MatchPattern> patternsList = _getExtractorMatchPatterns(elemExtractor);

                //Some extractors need an additional language specific pattern; null for all others
                MatchPattern extractorSpecificPattern = _addGenerics(lang, extractorID);
                if(extractorSpecificPattern != null){
                    //_getExtractorMatchPatterns returns null when the config declares no patterns
                    if(patternsList == null){
                        patternsList = new ArrayList<MatchPattern>();
                    }
                    patternsList.add(extractorSpecificPattern);
                }

                //Construct the extractor specification object and add it to the extractors list
                langExtractors.put(extractorID, new ExtractorSpecification(extractorID, status,
                        patternsList, _getExtractorNotices(elemExtractor)));
            }
        }
        catch(Exception exp){
            logger.error(exp.getMessage(), exp);
        }

        //A language is only registered once at least one of its extractors was read successfully
        if(!langExtractors.isEmpty()){
            extractors.put(lang, langExtractors);
            extractorClasses.put(lang, langExtractorClasses);
        }
        logger.debug("Extractors loaded so far: {}", extractors);
    }

    /**
     * Loads the generic match patterns for some extractors e.g. SkosCategoriesExtractor, because those extractors
     * need a specific pattern for language specific category
     * @param lang  The required language
     * @param extractorID   The ID (class name) of the required extractor
     * @return  The match pattern suitable for the passed extractor, or null if the extractor needs none
     */
    private static MatchPattern _addGenerics(Language lang, String extractorID) {
        //The ID is a class name as accepted by ClassLoader.loadClass, so it must be compared against
        //Class.getName(); Class.toString() carries a "class " prefix and would never match.
        //NOTE(review): if Namespaces.names(lang) is a Scala Map, get(...) yields an Option and toString()
        //would render as "Some(...)" - confirm the resulting object prefix is the intended one.
        MatchPattern pattern = null;
        if(SkosCategoriesExtractor.class.getName().equals(extractorID)){
            pattern = new MatchPattern(MatchType.STARTSWITH, "", Constants.SKOS_BROADER,
                    Namespaces.names(lang).get(Namespace.Category()).toString(), true);
        }
        else if(ArticleCategoriesExtractor.class.getName().equals(extractorID)){
            pattern = new MatchPattern(MatchType.STARTSWITH, "", Constants.SKOS_SUBJECT,
                    Namespaces.names(lang).get(Namespace.Category()).toString(), true);
        }

        return pattern;
    }

    /**
     * Constructs a list of match patterns associated with the passed extractor.
     * Parsing stops at the first malformed pattern element; the problem is logged and the patterns
     * read up to that point are returned.
     * @param extractorElem XML element containing the full specification of the extractor
     * @return  A list of patterns of the extractor, or null if there are none
     */
    private static ArrayList<MatchPattern> _getExtractorMatchPatterns(Element extractorElem){
        ArrayList<MatchPattern> patterns = new ArrayList<MatchPattern>();
        NodeList patternNodes = extractorElem.getElementsByTagName(MATCH_PATTERN_TAGNAME);
        try{
            for(int i=0; i<patternNodes.getLength(); i++){
                Element elemPattern = (Element)patternNodes.item(i);

                MatchType type = MatchType.valueOf(elemPattern.getAttribute(MATCH_PATTERN_TYPE_ATTRIBUTENAME));
                boolean pexact = Boolean.parseBoolean(elemPattern.getAttribute(PEXACT_ATTRIBUTENAME));

                //Names like RDFS_LABEL may be used in the live.xml file, so each part is resolved
                //against the Constants class before it is used
                String subject = _resolveConstant(
                        elemPattern.getElementsByTagName(SUBJECT_TAGNAME).item(0).getTextContent());
                String predicate = _resolveConstant(
                        elemPattern.getElementsByTagName(PREDICATE_TAGNAME).item(0).getTextContent());
                String object = _resolveConstant(
                        elemPattern.getElementsByTagName(OBJECT_TAGNAME).item(0).getTextContent());

                patterns.add(new MatchPattern(type, subject, predicate, object, pexact));
            }
        }
        catch(Exception exp){
            //Keep the patterns parsed so far, but do not hide the broken configuration entry
            logger.warn("Malformed " + MATCH_PATTERN_TAGNAME + " element in " + liveConfigFile
                    + ", remaining patterns of this extractor are ignored", exp);
        }

        return patterns.size()>0? patterns : null;
    }

    /**
     * Resolves a value from the config file that may be the name of a public field of the Constants class.
     * @param value The raw text taken from the config file
     * @return  The string form of the Constants field with that name, or the passed value unchanged if
     *          no such field exists, it cannot be read, or it holds null
     */
    private static String _resolveConstant(String value){
        try{
            Object constant = Constants.class.getField(value).get(Constants.class);
            if(constant != null){
                return constant.toString();
            }
        }
        catch(Exception ignored){
            //Expected whenever the value is a literal rather than a Constants field name
        }
        return value;
    }

    /**
     * Constructs a list of notices associated with the passed extractor
     * @param extractorElem XML element containing the full specification of the extractor
     * @return  A list of notices of the extractor, or null if there are none
     */
    private static ArrayList<String> _getExtractorNotices(Element extractorElem){
        ArrayList<String> notices = new ArrayList<String>();
        NodeList extractorNotices = extractorElem.getElementsByTagName(NOTICE_TAGNAME);

        for(int i=0; i<extractorNotices.getLength(); i++){
            Element noticeElem = (Element)extractorNotices.item(i);
            notices.add(noticeElem.getTextContent());
        }

        return notices.size()>0? notices : null;
    }

    /**
     * Returns the extractors with the passed status.
     * The result is a new list on every call, so the loaded configuration is never modified and
     * repeated calls with different statuses give consistent answers.
     * @param lang  The required language for which the extractors should be returned
     * @param   requiredStatus  The status of the extractors
     * @return  A list containing the extractor classes of the passed status; empty if the language is
     *          not configured or the configuration could not be loaded
     */
    public static List<Class> getExtractors(Language lang, ExtractorStatus requiredStatus){
        List<Class> matching = new ArrayList<Class>();
        if(extractorClasses == null){
            return matching;
        }

        List<Class> configuredClasses = extractorClasses.get(lang);
        if(configuredClasses == null){
            return matching;
        }

        Map<String, ExtractorSpecification> specs = (extractors == null) ? null : extractors.get(lang);
        for(Class extractorClass : configuredClasses){
            //Specifications are keyed by the class name the extractor was loaded with
            ExtractorSpecification spec = (specs == null) ? null : specs.get(extractorClass.getName());
            //A class without a specification cannot be excluded by status, so it is kept
            if(spec == null || spec.status == requiredStatus){
                matching.add(extractorClass);
            }
        }
        return matching;
    }

}

Tech Fingerprint

Alerts (18)

'public' Maintainability Info: Public non-final fields violate encapsulation. Prefer making fields private and providing public getter/setter methods if access is needed.
62 64
'DocumentBuilderFactory.newInstance()' Security Warning: Default XML parser configurations may be vulnerable to XML External Entity (XXE) attacks. Explicitly disable external entity processing using features like FEATURE_SECURE_PROCESSING, setExpandEntityReferences(false), setSupportDTD(false), etc., unless external entities are explicitly required and validated.
70
'List' Raw collection type used. Specify generic type arguments (e.g., List<String>, Map<Integer, Client>) for type safety and clarity. Avoid raw types unless interacting with legacy code.
93 115 187 240
'=' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
116 117
'System.out.println(' Use a logging framework (e.g., SLF4J, Log4j) for better control and configurability
148
'ArrayList<' Maintainability Info: Method parameters and return types should generally use interface types (e.g., List<T>, Set<T>, Map<T, K>) instead of concrete implementation types (e.g., ArrayList<T>, HashMap<T, K>). This improves flexibility and hides implementation details.
185 238
'catch(Exception' Catching generic 'Exception' can hide specific runtime issues. Catch more specific exception types whenever possible. Ensure caught exceptions are logged or handled appropriately, not just swallowed.
203 211 219 224 267
'catch' Correctness Info: Empty catch block detected. Swallowing exceptions without logging or handling can hide errors and make debugging difficult.
267