SQLite.py | searchcode

/rdfextras/store/SQLite.py

https://code.google.com/p/rdfextras/ · Python · 572 lines · 484 code · 26 blank · 62 comment · 73 complexity · affd69e3a8a19a517494ad47ef3e521b MD5 · raw file

from __future__ import generators
try:
    from sqlite3 import dbapi2
except ImportError:
    try:
        from pysqlite2 import dbapi2
    except ImportError:
        import warnings
        warnings.warn("pysqlite2 is not installed")
        __test__=False
import re, os
from rdflib.graph import QuotedGraph
from rdflib.graph import RDF
from rdflib.store import CORRUPTED_STORE
from rdflib.store import NO_STORE
from rdflib.store import VALID_STORE
from rdflib.term import Literal
from rdfextras.utils.termutils import escape_quotes
from rdfextras.store.REGEXMatching import REGEXTerm
from rdfextras.store.REGEXMatching import PYTHON_REGEX
from rdfextras.store.AbstractSQLStore import AbstractSQLStore, Graph
from rdfextras.store.AbstractSQLStore import extractTriple, unionSELECT
from rdfextras.store.AbstractSQLStore import ASSERTED_NON_TYPE_PARTITION
from rdfextras.store.AbstractSQLStore import ASSERTED_TYPE_PARTITION
from rdfextras.store.AbstractSQLStore import ASSERTED_LITERAL_PARTITION
from rdfextras.store.AbstractSQLStore import QUOTED_PARTITION
from rdfextras.store.AbstractSQLStore import table_name_prefixes
from rdfextras.store.AbstractSQLStore import TRIPLE_SELECT_NO_ORDER

# NOTE(review): presumably the "match anything" wildcard for triple patterns;
# it is not referenced by the code visible in this section - confirm against
# callers before changing or removing it.
Any = None

def regexp(expr, item):
    """
    User-defined REGEXP function registered on the SQLite connection.

    expr is the regular expression pattern and item is the value being
    tested.  Returns True when the pattern matches at the start of item
    (re.match semantics) and False otherwise.

    A None item - which is how the sqlite3 module hands a SQL NULL to a
    user-defined function - never matches.  Without this guard re.match
    raised a TypeError and the whole query failed.
    """
    if item is None:
        return False
    r = re.compile(expr)
    return r.match(item) is not None

class SQLite(AbstractSQLStore):
    """
    SQLite store formula-aware implementation.  It stores its triples in the 
    following partitions:
    
    - Asserted non rdf:type statements
    - Asserted rdf:type statements (in a table which models Class membership)
        The motivation for this partition is primarily query speed and 
        scalability as most graphs will always have more rdf:type statements 
        than others
    - All Quoted statements
    
    In addition it persists namespace mappings in a separate table
    """
    context_aware = True
    formula_aware = True
    transaction_aware = True
    regex_matching = PYTHON_REGEX
    autocommit_default = False
    _Store__node_pickler = None
    
    def open(self, db_path, create=True):
        """
        Opens the store specified by the configuration string. If
        create is True a store will be created if it does not already
        exist. If create is False and a store does not already exist
        an exception is raised. An exception is also raised if a store
        exists, but there is insufficient permissions to open the
        store.
        """
        if create:
            db = dbapi2.connect(db_path)
            c = db.cursor()
            # Only create tables if they don't already exist.  If the first
            # exists, assume they all do.
            try:
                c.execute(CREATE_ASSERTED_STATEMENTS_TABLE % self._internedId)
            except dbapi2.OperationalError, e:
                # Raise any error aside from existing table.
                if (str(e) != 'table %s_asserted_statements already exists' 
                        % self._internedId):
                    raise dbapi2.OperationalError, e
            else:
                c.execute(CREATE_ASSERTED_TYPE_STATEMENTS_TABLE %
                        self._internedId)
                c.execute(CREATE_QUOTED_STATEMENTS_TABLE % self._internedId)
                c.execute(CREATE_NS_BINDS_TABLE % self._internedId)
                c.execute(CREATE_LITERAL_STATEMENTS_TABLE % self._internedId)
                for tblName, indices in [
                    (
                        "%s_asserted_statements",
                        [
                            ("%s_A_termComb_index",('termComb',)),
                            ("%s_A_s_index",('subject',)),
                            ("%s_A_p_index",('predicate',)),
                            ("%s_A_o_index",('object',)),
                            ("%s_A_c_index",('context',)),
                        ],
                    ),
                    (
                        "%s_type_statements",
                        [
                            ("%s_T_termComb_index",('termComb',)),
                            ("%s_member_index",('member',)),
                            ("%s_klass_index",('klass',)),
                            ("%s_c_index",('context',)),
                        ],
                    ),
                    (
                        "%s_literal_statements",
                        [
                            ("%s_L_termComb_index",('termComb',)),
                            ("%s_L_s_index",('subject',)),
                            ("%s_L_p_index",('predicate',)),
                            ("%s_L_c_index",('context',)),
                        ],
                    ),
                    (
                        "%s_quoted_statements",
                        [
                            ("%s_Q_termComb_index",('termComb',)),
                            ("%s_Q_s_index",('subject',)),
                            ("%s_Q_p_index",('predicate',)),
                            ("%s_Q_o_index",('object',)),
                            ("%s_Q_c_index",('context',)),
                        ],
                    ),
                    (
                        "%s_namespace_binds",
                        [
                            ("%s_uri_index",('uri',)),
                        ],
                    )]:
                    for indexName, columns in indices:
                        c.execute("CREATE INDEX %s on %s (%s)" %
                                (indexName % self._internedId,
                                tblName % self._internedId,
                                ','.join(columns)))
                c.close()
                db.commit()
                db.close()
        
        self._db = dbapi2.connect(db_path)
        self._db.create_function("regexp", 2, regexp)
        
        #if os.path.exists(db_path):
        #    c = self._db.cursor()
        #    c.execute("SELECT * FROM sqlite_master WHERE type='table'")
        #    tbls = [rt[1] for rt in c.fetchall()]
        #    c.close()
        #    for tn in [tbl%(self._internedId) for tbl in table_name_prefixes]:
        #        if tn not in tbls:
        #            # The database exists, but one of the partitions doesn't 
        #            # exist
        #            return 0
        #    # Everything is there (the database and the partitions)
        #    return 1
        ## The database doesn't exist - nothing is there
        #return -1

        # Alcides fix
        if os.path.exists(db_path):
            c = self._db.cursor()
            c.execute("SELECT * FROM sqlite_master WHERE type='table'")
            tbls = [rt[1] for rt in c.fetchall()]
            c.close()
            
            missing = 0
            for tn in [tbl%(self._internedId) for tbl in table_name_prefixes]:
                if tn not in tbls:
                    missing +=1
        
            if missing == len(table_name_prefixes):
                return NO_STORE
            elif missing > 0:
                return CORRUPTED_STORE
            else:
                return VALID_STORE
                        
        # The database doesn't exist - nothing is there
        return NO_STORE

    
    def destroy(self, db_path):
        """
        Drops this store's tables from the SQLite database at db_path and
        then deletes the database file.

        Each table named by table_name_prefixes (formatted with the store's
        interned id) is dropped in turn.  A table that cannot be dropped,
        for example because it does not exist, is reported on stdout and
        skipped.  Finally the file at db_path is removed with os.remove,
        so an OSError is raised if that path is not a removable file.
        """
        db = dbapi2.connect(db_path)
        # try/finally so the connection is closed even if a statement fails.
        try:
            c = db.cursor()
            for tblsuffix in table_name_prefixes:
                tblName = tblsuffix % (self._internedId)
                try:
                    c.execute('DROP table %s' % tblName)
                except dbapi2.Error:
                    # Only database errors are treated as "could not drop";
                    # a bare except also hid KeyboardInterrupt and bugs.
                    print("unable to drop table: %s" % tblName)

            # Note, this only removes the associated tables for the closed
            # world universe given by the identifier.
            print("Destroyed Close World Universe %s ( in SQLite database %s)"
                            % (self.identifier,db_path))
            db.commit()
            c.close()
        finally:
            db.close()
        # NOTE(review): removing the file discards every table in it, not
        # just this store's, which contradicts the note above - confirm
        # that this is the intended behaviour.
        os.remove(db_path)
    
    def EscapeQuotes(self, qstr):
        """
        Returns qstr passed through escape_quotes from
        rdfextras.utils.termutils.
        """
        return escape_quotes(qstr)

    # This is overridden in order to leave unicode terms as is instead of 
    # converting them to ascii (which is the default behavior)
    def normalizeTerm(self, term):
        """
        Returns the form of term that is used when building queries.

        - QuotedGraph / Graph instances are replaced by their identifier
        - Literals are passed through EscapeQuotes
        - everything else (None, lists, REGEXTerms, other terms) is
          returned unchanged
        """
        if isinstance(term, (QuotedGraph, Graph)):
            return term.identifier
        if isinstance(term, Literal):
            return self.EscapeQuotes(term)
        return term
    
    # "Where Clause" utility Functions
    # The predicate and object clause builders are modified in order to 
    # optimize subjects and objects utility functions which can take lists 
    # as their last argument (object,predicate - respectively)
    def buildSubjClause(self,subject,tableName):
        """
        Builds the "where" sub-clause that constrains the subject column.

        subject may be a REGEXTerm, a list of terms (the sub-clauses are
        joined with 'or'), a QuotedGraph / Graph (matched through its
        identifier) or any other term, including None.  If tableName is
        given the column is qualified with it.

        Returns a (clause, parameters) tuple in which clause carries one
        "%s" placeholder per entry of parameters.  For a None subject the
        clause is False and parameters is [None].
        """
        column = tableName and '%s.subject' % tableName or 'subject'
        # "%" binds tighter than "+", so only the column is filled in here
        # and the first "%s" survives as the parameter placeholder.
        regexClause = " REGEXP (%s,"+" %s)" % column
        equalsClause = "%s=" % column + "%s"

        if isinstance(subject,REGEXTerm):
            return regexClause, [subject]
        elif isinstance(subject,list):
            clauseStrings = []
            paramStrings = []
            for s in subject:
                if isinstance(s,REGEXTerm):
                    # Exactly one placeholder per parameter.  A stray
                    # trailing " %s" used to be appended here, leaving the
                    # clause with more placeholders than parameters.
                    clauseStrings.append(regexClause)
                    paramStrings.append(self.normalizeTerm(s))
                elif isinstance(s,(QuotedGraph,Graph)):
                    clauseStrings.append(equalsClause)
                    paramStrings.append(self.normalizeTerm(s.identifier))
                else:
                    clauseStrings.append(equalsClause)
                    paramStrings.append(self.normalizeTerm(s))
            return '('+ ' or '.join(clauseStrings) + ')', paramStrings
        elif isinstance(subject,(QuotedGraph,Graph)):
            return equalsClause, [self.normalizeTerm(subject.identifier)]
        else:
            # The tuple shape is kept even when there is nothing to match.
            return subject is not None and equalsClause, [subject]
    
    # Capable of taking a list of predicates as well (in which case sub 
    # clauses are joined with 'OR')
    def buildPredClause(self,predicate,tableName):
        """
        Builds the "where" sub-clause that constrains the predicate column.

        predicate may be a REGEXTerm, a list of terms (the sub-clauses are
        joined with 'or') or any other term, including None.  If tableName
        is given the column is qualified with it.

        Returns a (clause, parameters) tuple.  For a None predicate the
        clause is False and parameters is [None].
        """
        if tableName:
            column = '%s.predicate' % tableName
        else:
            column = 'predicate'
        # "%" binds tighter than "+", so only the column is filled in here
        # and the first "%s" survives as the parameter placeholder.
        regexClause = " REGEXP (%s,"+" %s)" % column
        equalsClause = "%s=" % column + "%s"

        if isinstance(predicate, REGEXTerm):
            return regexClause, [predicate]

        if isinstance(predicate, list):
            clauseStrings = []
            paramStrings = []
            for p in predicate:
                if isinstance(p, REGEXTerm):
                    clauseStrings.append(regexClause)
                else:
                    clauseStrings.append(equalsClause)
                paramStrings.append(self.normalizeTerm(p))
            return '('+ ' or '.join(clauseStrings) + ')', paramStrings

        if predicate is None:
            return False, [predicate]
        return equalsClause, [predicate]
    
    def buildObjClause(self,obj,tableName):
        """
        Builds the "where" sub-clause that constrains the object column.

        obj may be a REGEXTerm, a list of terms (in which case sub-clauses
        are joined with 'or'), a QuotedGraph / Graph (matched through its
        identifier) or any other term, including None.  If tableName is
        given the column is qualified with it.

        Returns a (clause, parameters) tuple.  For a None obj the clause is
        False and parameters is [None].
        """
        if tableName:
            column = '%s.object' % tableName
        else:
            column = 'object'
        # "%" binds tighter than "+", so only the column is filled in here
        # and the first "%s" survives as the parameter placeholder.
        regexClause = " REGEXP (%s,"+" %s)" % column
        equalsClause = "%s=" % column + "%s"

        if isinstance(obj, REGEXTerm):
            return regexClause, [obj]

        if isinstance(obj, list):
            clauseStrings = []
            paramStrings = []
            for o in obj:
                if isinstance(o, REGEXTerm):
                    clauseStrings.append(regexClause)
                    paramStrings.append(self.normalizeTerm(o))
                    continue
                clauseStrings.append(equalsClause)
                if isinstance(o, (QuotedGraph, Graph)):
                    paramStrings.append(self.normalizeTerm(o.identifier))
                else:
                    paramStrings.append(self.normalizeTerm(o))
            return '('+ ' or '.join(clauseStrings) + ')', paramStrings

        if isinstance(obj, (QuotedGraph, Graph)):
            return equalsClause, [self.normalizeTerm(obj.identifier)]

        if obj is None:
            return False, [obj]
        return equalsClause, [obj]
    
    def buildContextClause(self,context,tableName):
        """
        Builds the "where" sub-clause that constrains the context column.

        A context that is not None is replaced by its normalized identifier
        (falling back to the context itself if that value is falsy).  If
        tableName is given the column is qualified with it.

        Returns a (clause, parameters) tuple.  For a None context the
        clause is False and parameters is [None].
        """
        if context is not None:
            context = self.normalizeTerm(context.identifier) or context

        if tableName:
            column = '%s.context' % tableName
        else:
            column = 'context'

        if isinstance(context, REGEXTerm):
            # "%" binds tighter than "+": the first "%s" stays behind as
            # the parameter placeholder.
            return " REGEXP (%s,"+" %s)" % column, [context]
        if context is None:
            return False, [context]
        return "%s=" % column + "%s", [context]
    
    def buildTypeMemberClause(self,subject,tableName):
        """
        Builds the "where" sub-clause that constrains the member column of
        the rdf:type table.

        subject may be a REGEXTerm, a list of terms (the sub-clauses are
        joined with 'or') or any other term.  Only the REGEXTerm branch
        copes with an empty tableName; the other branches always qualify
        the column with it.

        Returns a (clause, parameters) tuple.  For a falsy subject the
        clause is the subject itself and parameters is [subject].
        """
        if isinstance(subject, REGEXTerm):
            if tableName:
                column = '%s.member' % tableName
            else:
                column = 'member'
            # "%" binds tighter than "+": the first "%s" stays behind as
            # the parameter placeholder.
            return " REGEXP (%s,"+" %s)" % column, [subject]

        if isinstance(subject, list):
            clauseStrings = []
            paramStrings = []
            for s in subject:
                clauseStrings.append("%s.member="%tableName+"%s")
                if isinstance(s, (QuotedGraph, Graph)):
                    s = s.identifier
                paramStrings.append(self.normalizeTerm(s))
            return '('+ ' or '.join(clauseStrings) + ')', paramStrings

        if not subject:
            return subject, [subject]
        return u"%s.member = "%(tableName)+"%s", [subject]
    
    def buildTypeClassClause(self,obj,tableName):
        """
        Builds the "where" sub-clause that constrains the klass column of
        the rdf:type table.

        obj may be a REGEXTerm, a list of terms (the sub-clauses are joined
        with 'or') or any other term, including None.  Only the REGEXTerm
        branch copes with an empty tableName; the other branches always
        qualify the column with it.

        Returns a (clause, parameters) tuple.  For a None obj the clause is
        False and parameters is [None].
        """
        if isinstance(obj, REGEXTerm):
            if tableName:
                column = '%s.klass' % tableName
            else:
                column = 'klass'
            # "%" binds tighter than "+": the first "%s" stays behind as
            # the parameter placeholder.
            return " REGEXP (%s,"+" %s)" % column, [obj]

        if isinstance(obj, list):
            clauseStrings = []
            paramStrings = []
            for o in obj:
                clauseStrings.append("%s.klass=" % tableName + "%s")
                if isinstance(o, (QuotedGraph, Graph)):
                    o = o.identifier
                paramStrings.append(self.normalizeTerm(o))
            return '('+ ' or '.join(clauseStrings) + ')', paramStrings

        if obj is None:
            return False, [obj]
        return "%s.klass = " % tableName + "%s", [obj]
    
    def triples(self, (subject, predicate, obj), context=None):
        """
        A generator over all the triples matching pattern. Pattern can
        be any objects for comparing against nodes in the store, for
        example, RegExLiteral, Date? DateRange?

        The first argument is a (subject, predicate, obj) tuple, unpacked
        in the signature (Python 2 only syntax).  context, if not None,
        additionally causes the quoted statements table to be searched.

        Yields ((s, p, o), contexts) pairs, one per distinct triple in the
        result set, where contexts is a generator over the graph objects
        built from the rows that produced that triple.
        
        quoted table:                <id>_quoted_statements
        asserted rdf:type table:     <id>_type_statements
        asserted non rdf:type table: <id>_asserted_statements
        asserted literal table:      <id>_literal_statements
        
        triple columns: subject,predicate,object,context,
                        termComb,objLanguage,objDatatype

        class membership columns: member,klass,context termComb
        
        FIXME:  These union all selects *may* be further optimized by joins
        
        """
        quoted_table = "%s_quoted_statements" % self._internedId
        asserted_table = "%s_asserted_statements" % self._internedId
        asserted_type_table = "%s_type_statements" % self._internedId
        literal_table = "%s_literal_statements" % self._internedId
        c = self._db.cursor()
        
        # Query parameters are accumulated in the same order in which the
        # selects are appended below.
        # NOTE(review): presumably so that placeholders and parameters line
        # up in the final union query - confirm against unionSELECT and
        # executeSQL.
        parameters = []
        
        # Each entry of selects is a
        # (table name, table alias, where clause, partition marker) tuple.
        if predicate == RDF.type:
            # select from asserted rdf:type partition and 
            # quoted table (if a context is specified)
            clauseString,params = self.buildClause(
                'typeTable',subject,RDF.type, obj,context,True)
            parameters.extend(params)
            selects = [
                (
                  asserted_type_table,
                  'typeTable',
                  clauseString,
                  ASSERTED_TYPE_PARTITION
                ),
            ]
        
        # "and" binds tighter than "or": this branch is taken for a
        # REGEXTerm predicate that matches rdf:type, or for any falsy
        # predicate (e.g. None).
        elif isinstance(predicate,REGEXTerm) \
                and predicate.compiledExpr.match(RDF.type) \
                or not predicate:
            # Select from quoted partition (if context is specified), 
            # literal partition if (obj is Literal or None) and 
            # asserted non rdf:type partition (if obj is URIRef or None)
            selects = []
            if not self.STRONGLY_TYPED_TERMS or \
                    isinstance(obj,Literal) \
                    or not obj \
                    or (self.STRONGLY_TYPED_TERMS \
                        and isinstance(obj,REGEXTerm)):
                clauseString,params = self.buildClause(
                        'literal',subject,predicate,obj,context)
                parameters.extend(params)
                selects.append((
                  literal_table,
                  'literal',
                  clauseString,
                  ASSERTED_LITERAL_PARTITION
                ))
            if not isinstance(obj,Literal) \
                and not (isinstance(obj,REGEXTerm) \
                and self.STRONGLY_TYPED_TERMS) \
                or not obj:
                clauseString,params = self.buildClause(
                    'asserted',subject,predicate,obj,context)
                parameters.extend(params)
                selects.append((
                  asserted_table,
                  'asserted',
                  clauseString,
                  ASSERTED_NON_TYPE_PARTITION
                ))
            
            # The rdf:type partition is always searched in this branch,
            # with the predicate fixed to RDF.type.
            clauseString,params = self.buildClause(
                    'typeTable',subject,RDF.type,obj,context,True)
            parameters.extend(params)
            selects.append(
                (
                  asserted_type_table,
                  'typeTable',
                  clauseString,
                  ASSERTED_TYPE_PARTITION
                )
            )
        
        elif predicate:
            # select from asserted non rdf:type partition (optionally), 
            # quoted partition (if context is specified), and literal 
            # partition (optionally)
            selects = []
            if not self.STRONGLY_TYPED_TERMS \
                    or isinstance(obj,Literal) \
                    or not obj \
                    or (self.STRONGLY_TYPED_TERMS \
                        and isinstance(obj,REGEXTerm)):
                clauseString,params = self.buildClause(
                    'literal',subject,predicate,obj,context)
                parameters.extend(params)
                selects.append((
                  literal_table,
                  'literal',
                  clauseString,
                  ASSERTED_LITERAL_PARTITION
                ))
            if not isinstance(obj,Literal) \
                    and not (isinstance(obj,REGEXTerm) \
                    and self.STRONGLY_TYPED_TERMS) \
                    or not obj:
                clauseString,params = self.buildClause(
                        'asserted',subject,predicate,obj,context)
                parameters.extend(params)
                selects.append((
                  asserted_table,
                  'asserted',
                  clauseString,
                  ASSERTED_NON_TYPE_PARTITION
                ))
        
        # The quoted partition is searched whenever a context is given,
        # whichever branch above was taken.
        if context is not None:
            clauseString,params = self.buildClause(
                'quoted',subject,predicate, obj,context)
            parameters.extend(params)
            selects.append(
                (
                  quoted_table,
                  'quoted',
                  clauseString,
                  QUOTED_PARTITION
                )
            )
        
        q = self._normalizeSQLCmd(unionSELECT(
                selects,selectType=TRIPLE_SELECT_NO_ORDER))
        self.executeSQL(c,q,parameters)
        # NOTE: SQLite does not support ORDER BY terms that aren't 
        # integers, so the entire result set must be iterated
        # in order to be able to return a generator of contexts
        # Maps each (s, p, o) to the list of context graphs it occurs in.
        tripleCoverage = {}
        result = c.fetchall()
        c.close()
        for rt in result:
            # Fix by Alcides Fonseca
            # https://github.com/slok/rdflib/commit/e05827b080772e785290b270da63dce64addfc7c#diff-0
            # Column values equal to the string u"NULL" are replaced by
            # None before the row is handed to extractTriple.
            tmp = []
            for i,r in enumerate(rt):
                if r == u"NULL":
                    tmp.append(None)
                else:
                    tmp.append(r)
            rt = tuple(tmp)
            s,p,o,(graphKlass,idKlass,graphId) = extractTriple(rt,self,context)
            contexts = tripleCoverage.get((s,p,o),[])
            contexts.append(graphKlass(self,idKlass(graphId)))
            tripleCoverage[(s,p,o)] = contexts

        # One result per distinct triple; the generator expression has its
        # own scope, so its "c" does not clash with the cursor above.
        for (s,p,o),contexts in tripleCoverage.items():
            yield (s,p,o),(c for c in contexts)
    


# DDL templates used by SQLite.open().  Each one has a single "%s" slot that
# is filled with the store's interned id to form the table name.

# Asserted statements whose predicate is not rdf:type.
CREATE_ASSERTED_STATEMENTS_TABLE = """
CREATE TABLE %s_asserted_statements (
    subject       text not NULL,
    predicate     text not NULL,
    object        text not NULL,
    context       text not NULL,
    termComb      tinyint unsigned not NULL)"""

# Asserted rdf:type statements, stored as (member, klass) pairs.
CREATE_ASSERTED_TYPE_STATEMENTS_TABLE = """
CREATE TABLE %s_type_statements (
    member        text not NULL,
    klass         text not NULL,
    context       text not NULL,
    termComb      tinyint unsigned not NULL)"""

# Statements kept in the literal partition; object may be NULL and the
# language/datatype of the object are kept in their own columns.
CREATE_LITERAL_STATEMENTS_TABLE = """
CREATE TABLE %s_literal_statements (
    subject       text not NULL,
    predicate     text not NULL,
    object        text,
    context       text not NULL,
    termComb      tinyint unsigned not NULL,
    objLanguage   varchar(3),
    objDatatype   text)"""

# Quoted statements; same column layout as the literal table.
CREATE_QUOTED_STATEMENTS_TABLE = """
CREATE TABLE %s_quoted_statements (
    subject       text not NULL,
    predicate     text not NULL,
    object        text,
    context       text not NULL,
    termComb      tinyint unsigned not NULL,
    objLanguage   varchar(3),
    objDatatype   text)"""

# Namespace prefix -> uri bindings, keyed by prefix.
CREATE_NS_BINDS_TABLE = """
CREATE TABLE %s_namespace_binds (
    prefix        varchar(20) UNIQUE not NULL,
    uri           text,
    PRIMARY KEY (prefix))"""
Tech Fingerprint

Alerts (49)

'open(' Use 'with open()' to ensure Files are properly closed
60
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
192
'print(' Use logging module for better control and configurability
193 198
'def' Ensure functions have docstrings for documentation
205 210 224 262 320 333 351
'isinstance(' Overuse may indicate design issues; consider polymorphism
211 213 215 225 228 232 238 251 263 267 271 291 294 298 302 311 324 334 338 343 352 356 361 411 419 422 432 464 467 477
Complexity hotspot; lines 255 to 258 (total complexity: 5)
255 256 257 258
Complexity hotspot; lines 281 to 284 (total complexity: 5)
281 282 283 284