/src/db.py
- # -----------------------------------------------------------------------------
- # db.py - db abstraction module
- # -----------------------------------------------------------------------------
- # Copyright 2006-2012 Dirk Meyer, Jason Tackaberry
- #
- # Please see the file AUTHORS for a complete list of authors.
- #
- # This library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License version
- # 2.1 as published by the Free Software Foundation.
- #
- # This library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- # Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public
- # License along with this library; if not, write to the Free Software
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- # 02110-1301 USA
- #
- # -----------------------------------------------------------------------------
- from __future__ import absolute_import
- __all__ = [
- 'Database', 'QExpr', 'DatabaseError', 'DatabaseReadOnlyError',
- 'split_path', 'ATTR_SIMPLE', 'ATTR_SEARCHABLE', 'ATTR_IGNORE_CASE',
- 'ATTR_INDEXED', 'ATTR_INDEXED_IGNORE_CASE', 'ATTR_INVERTED_INDEX',
- 'RAW_TYPE'
- ]
- # python imports
- import sys
- import os
- import time
- import re
- import logging
- import math
- import cPickle
- import copy_reg
- import _weakref
- import threading
- try:
- # Try a system install of pysqlite
- from pysqlite2 import dbapi2 as sqlite
- except ImportError:
- # Python 2.6 provides sqlite natively, so try that next.
- from sqlite3 import dbapi2 as sqlite
- # kaa base imports
- from .utils import property
- from .strutils import py3_str, BYTES_TYPE, UNICODE_TYPE
- from .timer import WeakOneShotTimer
- from . import main
- if sqlite.version < '2.1.0':
- raise ImportError('pysqlite 2.1.0 or higher required')
- if sqlite.sqlite_version < '3.3.1':
- raise ImportError('sqlite 3.3.1 or higher required')
- # get logging object
- log = logging.getLogger('kaa.base.db')
- SCHEMA_VERSION = 0.2
- SCHEMA_VERSION_COMPATIBLE = 0.2
- CREATE_SCHEMA = """
- CREATE TABLE meta (
- attr TEXT UNIQUE,
- value TEXT
- );
- INSERT INTO meta VALUES('version', %s);
- CREATE TABLE types (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- name TEXT UNIQUE,
- attrs_pickle BLOB,
- idx_pickle BLOB
- );
- CREATE TABLE inverted_indexes (
- name TEXT,
- attr TEXT,
- value TEXT
- );
- CREATE UNIQUE INDEX inverted_indexes_idx on inverted_indexes (name, attr);
- """
- CREATE_IVTIDX_TEMPLATE = """
- CREATE TABLE ivtidx_%IDXNAME%_terms (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- term TEXT,
- count INTEGER
- );
- CREATE UNIQUE INDEX ivtidx_%IDXNAME%_terms_idx on ivtidx_%IDXNAME%_terms (term);
- CREATE TABLE ivtidx_%IDXNAME%_terms_map (
- rank INTEGER,
- term_id INTEGER,
- object_type INTEGER,
- object_id INTEGER,
- frequency FLOAT
- );
- CREATE INDEX ivtidx_%IDXNAME%_terms_map_idx ON ivtidx_%IDXNAME%_terms_map (term_id, rank, object_type, object_id);
- CREATE INDEX ivtidx_%IDXNAME%_terms_map_object_idx ON ivtidx_%IDXNAME%_terms_map (object_id, object_type, term_id);
- CREATE TRIGGER ivtidx_%IDXNAME%_delete_terms_map DELETE ON ivtidx_%IDXNAME%_terms_map
- BEGIN
- UPDATE ivtidx_%IDXNAME%_terms SET count=MAX(0, count-1) WHERE id=old.term_id;
- END;
- """
- ATTR_SIMPLE = 0x01
- ATTR_SEARCHABLE = 0x02 # Is a SQL column, not a pickled field
- ATTR_INDEXED = 0x04 # Will have an SQL index
- ATTR_IGNORE_CASE = 0x08 # Store in db as lowercase for searches.
- ATTR_INVERTED_INDEX = 0x10 # Attribute associated with an inverted idx
- ATTR_INDEXED_IGNORE_CASE = ATTR_INDEXED | ATTR_IGNORE_CASE
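- # A sketch of how these flags compose when registering attributes (the
- # attribute names here are hypothetical, not part of this module):
- #
- #   title = (unicode, ATTR_SEARCHABLE | ATTR_INDEXED_IGNORE_CASE)
- #   keywords = (list, ATTR_SIMPLE | ATTR_INVERTED_INDEX, 'keywords')
- #
- # ATTR_SEARCHABLE attributes become SQL columns; ATTR_SIMPLE attributes are
- # stored in the row's pickle.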
- # These are special attributes for querying. Attributes with
- # these names cannot be registered.
- RESERVED_ATTRIBUTES = ('id', 'parent', 'object', 'type', 'limit', 'attrs', 'distinct', 'orattrs')
- STOP_WORDS = (
- "about", "and", "are", "but", "com", "for", "from", "how", "not",
- "some", "that", "the", "this", "was", "what", "when", "where", "who",
- "will", "with", "the", "www", "http", "org", "of", "on"
- )
- class PyObjectRow(object):
- """
- ObjectRows are dictionary-like objects that represent an object in
- the database. They are used by pysqlite instead of tuples or indexes.
- ObjectRows support on-demand unpickling of the internally stored pickle
- which contains ATTR_SIMPLE attributes.
- This is the native Python implementation of ObjectRow. There is a
- faster C implementation in the _objectrow extension. This
- implementation is still designed to be efficient -- the C version is
- only about 60-70% faster.
- """
- # A dict containing per-query data: [refcount, idxmap, typemap, pickle_idx]
- # This is constructed once for each query, and each row returned in the
- # query references the same data. Each ObjectRow instance increments the
- # refcount when initialized and decrements it when the object is deleted.
- # Once it reaches 0, the entry is removed from the dict.
- queries = {}
- # Use __slots__ as a minor optimization to improve object creation time.
- __slots__ = ('_description', '_object_types', '_type_name', '_row', '_pickle',
- '_idxmap', '_typemap', '_keys')
- def __init__(self, cursor, row, pickle_dict=None):
- # The following is done per row per query, so it should be as light as
- # possible.
- if pickle_dict:
- # Created outside pysqlite, e.g. from Database.add()
- self._pickle = pickle_dict
- self._idxmap = None
- return
- if isinstance(cursor, tuple):
- self._description, self._object_types = cursor
- else:
- self._description = cursor.description
- self._object_types = cursor._db()._object_types
- self._row = row
- # _pickle: False == no pickle present; None == pickle present but
- # empty; if a dict, is the unpickled dictionary; else it's a byte
- # string containing the pickled data.
- self._pickle = False
- self._type_name = row[0]
- try:
- attrs = self._object_types[self._type_name][1]
- except KeyError:
- raise ValueError("Object type '%s' not defined." % self._type_name)
- query_key = id(self._description)
- if query_key in PyObjectRow.queries:
- query_info = PyObjectRow.queries[query_key]
- # Increase refcount to the query info
- query_info[0] += 1
- self._idxmap, self._typemap, pickle_idx = query_info[1:]
- if pickle_idx != -1:
- self._pickle = self._row[pickle_idx]
- return
- # Everything below this is done once per query, not per row, so
- # performance isn't quite as critical.
- idxmap = {} # attr_name -> (idx, pickled, named_ivtidx, type, flags)
- pickle_idx = -1
- for i in range(2, len(self._description)):
- attr_name = self._description[i][0]
- idxmap[attr_name] = i
- if attr_name == 'pickle':
- pickle_idx = i
- self._pickle = self._row[i]
- for attr_name, (attr_type, flags, ivtidx, split) in attrs.items():
- idx = idxmap.get(attr_name, -1)
- pickled = flags & ATTR_SIMPLE or (flags & ATTR_INDEXED_IGNORE_CASE == ATTR_INDEXED_IGNORE_CASE)
- idxmap[attr_name] = idx, pickled, attr_name == ivtidx, attr_type, flags
- # Construct dict mapping type id -> type name. Essentially an
- # inversion of _object_types
- typemap = dict((v[0], k) for k, v in self._object_types.items())
- self._idxmap = idxmap
- self._typemap = typemap
- PyObjectRow.queries[query_key] = [1, idxmap, typemap, pickle_idx]
- def __del__(self):
- if self._idxmap is None:
- # From Database.add(), pickle only, no pysqlite row.
- return
- query_key = id(self._description)
- query_info = PyObjectRow.queries[query_key]
- query_info[0] -= 1
- if query_info[0] == 0:
- # Refcount for this query info is 0, so remove from global queries
- # dict.
- del PyObjectRow.queries[query_key]
- def __getitem__(self, key):
- if self._idxmap is None:
- # From Database.add(), work strictly from pickle
- return self._pickle[key]
- elif key == 'type':
- return self._type_name
- elif key == 'parent':
- type_idx = self._idxmap.get('parent_type', [-1])[0]
- id_idx = self._idxmap.get('parent_id', [-1])[0]
- if type_idx == -1 or id_idx == -1:
- raise KeyError('Parent attribute not available')
- type_id = self._row[type_idx]
- return self._typemap.get(type_id, type_id), self._row[id_idx]
- elif key == '_row':
- return self._row
- elif isinstance(key, int):
- return self._row[key]
- attr = self._idxmap[key]
- attr_idx = attr[0]
- is_indexed_ignore_case = (attr[4] & ATTR_INDEXED_IGNORE_CASE == ATTR_INDEXED_IGNORE_CASE)
- if attr_idx == -1:
- # Attribute is not in the sql row
- if attr[1] and self._pickle is None:
- # Pickle is empty, which means this attribute was never
- # assigned a value. Return a default (an empty list if the
- # attribute is named after an inverted index, otherwise None).
- return [] if attr[2] else None
- elif not attr[1] or self._pickle is False:
- # The requested attribute is not in the sqlite row, and neither is the pickle.
- raise KeyError("ObjectRow does not have enough data to provide '%s'" % key)
- if not attr[1]:
- value = self._row[attr_idx]
- elif attr_idx >= 0 and not self._pickle and is_indexed_ignore_case:
- # Attribute is ATTR_INDEXED_IGNORE_CASE which means the
- # authoritative source is in the pickle, but we don't have it. So just
- # return what we have.
- value = self._row[attr_idx]
- else:
- if self._pickle and not isinstance(self._pickle, dict):
- # We need to check the pickle but it's not unpickled, so do so now.
- self._pickle = dbunpickle(self._pickle)
- if is_indexed_ignore_case:
- key = '__' + key
- if key not in self._pickle:
- return [] if attr[2] else None
- else:
- value = self._pickle[key]
- if sys.hexversion < 0x03000000 and (attr[3] == str or attr[3] == buffer):
- # Python 2's pysqlite returns BLOBs as buffers. If the attribute
- # type is string or buffer (RAW_TYPE on Python 2), convert to string.
- return str(value)
- else:
- return value
- def keys(self):
- if not self._idxmap:
- # Ad hoc ObjectRow, proxy to pickle dict.
- return self._pickle.keys()
- if not hasattr(self, '_keys'):
- self._keys = ['type']
- for name, attr in self._idxmap.items():
- if (attr[0] >= 0 or (attr[1] and self._pickle is not False)) and name != 'pickle':
- self._keys.append(name)
- if 'parent_type' in self._idxmap and 'parent_id' in self._idxmap:
- self._keys.append('parent')
- return self._keys
- def values(self):
- if not self._idxmap:
- # Ad hoc ObjectRow, proxy to pickle dict.
- return self._pickle.values()
- return [self[k] for k in self.keys()]
- def items(self):
- return zip(self.keys(), self.values())
- def get(self, key, default=None):
- try:
- return self[key]
- except KeyError:
- return default
- def has_key(self, key):
- return key in self.keys()
- def __iter__(self):
- return iter(self.keys())
- def __contains__(self, key):
- if key == 'type' or (key == 'parent' and 'parent_id' in self._idxmap):
- return True
- else:
- return key in self._idxmap
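- # Example of dict-like access on a query result (a sketch; assumes a 'msg'
- # type with a 'subject' attribute has been registered):
- #
- #   row = db.query(type='msg')[0]
- #   row['subject']                     # ATTR_SIMPLE values unpickle on demand
- #   row.get('nonexistent', 'default')  # -> 'default'
- #   row['parent']                      # -> (parent_type_name, parent_id)
- #   dict(row.items())                  # materialize as a plain dict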
- if sys.hexversion >= 0x03000000:
- RAW_TYPE = bytes
- PICKLE_PROTOCOL = 3
- class Proto2Unpickler(pickle._Unpickler):
- """
- In spite of the promise that pickles will be compatible between Python
- releases, Python 3 does the wrong thing with non-unicode strings in
- pickles (BINSTRING pickle opcode). It will try to convert them to
- unicode strings, which is wrong when the intention was to store binary
- data in a Python 2 str.
- This class implements a custom unpickler that will load BINSTRING as
- bytes objects. An exception is made for dictionaries, where BINSTRING
- keys are converted to unicode strings.
- Additionally, it maps the unicode and buffer types to corresponding
- Python 3 types.
- """
- dispatch = pickle._Unpickler.dispatch.copy()
- def load_binstring(self):
- len = pickle.mloads(bytes('i', 'ascii') + self.read(4))
- self.append(bytes(self.read(len)))
- dispatch[pickle.BINSTRING[0]] = load_binstring
- def load_short_binstring(self):
- len = ord(self.read(1))
- self.append(bytes(self.read(len)))
- dispatch[pickle.SHORT_BINSTRING[0]] = load_short_binstring
- def load_setitems(self):
- super(Proto2Unpickler, self).load_setitems()
- d = self.stack[-1]
- for k, v in d.items():
- if type(k) == bytes:
- sk = str(k, self.encoding, self.errors)
- if sk not in d:
- d[sk] = v
- del d[k]
- dispatch[pickle.SETITEMS[0]] = load_setitems
- def find_class(self, module, name):
- if module == '__builtin__':
- if name == 'unicode':
- return str
- elif name == 'buffer':
- return bytes
- return super(Proto2Unpickler, self).find_class(module, name)
- def dbunpickle(s):
- if s[1] == 0x02:
- import io
- #import pickletools
- #pickletools.dis(io.BytesIO(bytes(s)))
- return Proto2Unpickler(io.BytesIO(bytes(s))).load()
- else:
- return pickle.loads(bytes(s))
- def dbpickle(value):
- return bytes(pickle.dumps(value, 3))
- # Need to be able to unpickle pickled buffers from Python 2.
- def _unpickle_buffer(s):
- return bytes(s)
- else:
- RAW_TYPE = buffer
- PICKLE_PROTOCOL = 2
- def dbunpickle(s):
- return cPickle.loads(str(s))
- def dbpickle(value):
- return buffer(cPickle.dumps(value, 2))
- # Python2 can't pickle buffer types, so register a handler for that.
- def _pickle_buffer(b):
- return _unpickle_buffer, (str(b),)
- def _unpickle_buffer(s):
- return str(s)
- copy_reg.pickle(buffer, _pickle_buffer, _unpickle_buffer)
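- # On either Python version, dbpickle()/dbunpickle() round-trip values
- # through the database's blob type, e.g.:
- #
- #   blob = dbpickle({'rating': 5, 'tags': ['a', 'b']})
- #   assert dbunpickle(blob) == {'rating': 5, 'tags': ['a', 'b']}
- #
- # On Python 3, dbunpickle() additionally falls back to Proto2Unpickler for
- # protocol-2 pickles written by Python 2.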
- try:
- from . import _objectrow
- except ImportError:
- # Use the python-native ObjectRow
- ObjectRow = PyObjectRow
- else:
- # Use the faster C-based ObjectRow
- ObjectRow = _objectrow.ObjectRow
- # Expose the custom unpickler to the _objectrow module so it can deal with
- # pickles stored inside DB rows.
- _objectrow.dbunpickle = dbunpickle
- # Register a handler for pickling ObjectRow objects.
- def _pickle_ObjectRow(o):
- if o._description:
- return _unpickle_ObjectRow, ((o._description, o._object_types), o._row)
- else:
- return _unpickle_ObjectRow, (None, None, dict(o.items()))
- def _unpickle_ObjectRow(*args):
- return ObjectRow(*args)
- copy_reg.pickle(ObjectRow, _pickle_ObjectRow, _unpickle_ObjectRow)
- PATH_SPLIT = re.compile(ur'(\d+)|[_\W]', re.U | re.X)
- def split_path(s):
- """
- Convenience split function for inverted index attributes. Useful for
- attributes that contain filenames. Splits the given string s into
- component parts (directories, filename), discarding the extension and all
- but the last two directories. What's remaining is split into words and the
- result is returned.
- """
- dirname, filename = os.path.split(s)
- fname_noext, ext = os.path.splitext(filename)
- for part in dirname.strip('/').split(os.path.sep)[2:][-2:] + [fname_noext]:
- for match in PATH_SPLIT.split(part):
- if match:
- yield match
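- # For example:
- #
- #   list(split_path('/home/user/music/Artist/Album/01 Track.mp3'))
- #   -> ['Artist', 'Album', '01', 'Track']
- #
- # Only the last two directories and the extension-stripped filename survive,
- # and runs of digits are yielded as separate terms.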
- # FIXME: this is flawed. We could use placeholders instead, by taking an
- # n-tuple, expanding the single ? to (?, ?, ..., ?), and then extending the
- # query params list with the given tuple/list value.
- def _list_to_printable(value):
- """
- Takes a list of mixed types and outputs a unicode string. For
- example, given the list [42, 'foo', None, "foo's string"], this
- returns the string:
- (42,'foo',NULL,'foo''s string')
- Single quotes are escaped as ''. This is suitable for use in SQL
- queries.
- """
- fixed_items = []
- for item in value:
- if isinstance(item, (int, long, float)):
- fixed_items.append(str(item))
- elif item is None:
- fixed_items.append("NULL")
- elif isinstance(item, UNICODE_TYPE):
- fixed_items.append("'%s'" % item.replace("'", "''"))
- elif isinstance(item, BYTES_TYPE):
- fixed_items.append("'%s'" % py3_str(item.replace("'", "''")))
- else:
- raise Exception, "Unsupported type '%s' given to _list_to_printable" % type(item)
- return '(' + ','.join(fixed_items) + ')'
- class DatabaseError(Exception):
- pass
- class DatabaseReadOnlyError(Exception):
- pass
- class QExpr(object):
- """
- Flexible query expressions for use with :meth:`kaa.db.Database.query()`
- """
- def __init__(self, operator, operand):
- """
- :param operator: ``=``, ``!=``, ``<``, ``<=``, ``>``, ``>=``, ``in``,
- ``not in``, ``range``, ``like``, or ``regexp``
- :type operator: str
- :param operand: the rvalue of the expression; any scalar values as part of
- the operand must be the same type as the attribute being
- evaluated
- Except for ``in``, ``not in``, and ``range``, the operand must be the
- type of the registered attribute being evaluated (e.g. unicode, int,
- etc.).
- The operand for ``in`` and ``not in`` is a list or tuple of the attribute
- type, to test inclusion in the given set.
- The ``range`` operator accepts a 2-tuple specifying min and max values
- for the attribute. The Python expression age=QExpr('range', (20, 30))
- translates to ``age >= 20 AND age <= 30``.
- """
- operator = operator.lower()
- assert(operator in ('=', '!=', '<', '<=', '>', '>=', 'in', 'not in', 'range', 'like', 'regexp'))
- if operator in ('in', 'not in', 'range'):
- assert(isinstance(operand, (list, tuple)))
- if operator == 'range':
- assert(len(operand) == 2)
- self._operator = operator
- self._operand = operand
- def as_sql(self, var):
- if self._operator == "range":
- a, b = self._operand
- return "%s >= ? AND %s <= ?" % (var, var), (a, b)
- elif self._operator in ("in", "not in"):
- return "%s %s %s" % (var, self._operator.upper(),
- _list_to_printable(self._operand)), ()
- else:
- return "%s %s ?" % (var, self._operator.upper()), \
- (self._operand,)
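- # Example translations (the column name 'age' is arbitrary):
- #
- #   QExpr('range', (20, 30)).as_sql('age')  -> ('age >= ? AND age <= ?', (20, 30))
- #   QExpr('in', (18, 21)).as_sql('age')     -> ('age IN (18,21)', ())
- #   QExpr('>=', 20).as_sql('age')           -> ('age >= ?', (20,))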
- class RegexpCache(object):
- def __init__(self):
- self.last_item = None
- self.last_expr = None # raw expression string last compiled
- self.last_re = None # compiled pattern for last_expr
- def __call__(self, expr, item):
- if item is None:
- return 0
- if self.last_item == item and self.last_item is not None and self.last_expr == expr:
- return self.last_result
- if self.last_expr != expr:
- # Keep the raw string for comparison: a compiled pattern never
- # compares equal to the incoming expression string, which would
- # defeat the cache and recompile on every call.
- self.last_expr = expr
- self.last_re = re.compile(unicode(expr), re.U)
- self.last_item = item
- # FIXME: bad conversion to unicode!
- self.last_result = self.last_re.match(unicode(item)) is not None
- return self.last_result
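- # An instance of this class is registered as the SQLite REGEXP function in
- # _open_db(). The last compiled expression is cached across calls:
- #
- #   cache = RegexpCache()
- #   cache(u'^foo', u'foobar')   # -> True (compiles and caches '^foo')
- #   cache(u'^foo', u'barfoo')   # -> False (reuses the cached pattern)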
- class Database(object):
- def __init__(self, dbfile):
- """
- Open a database, creating one if it doesn't already exist.
- :param dbfile: path to the database file
- :type dbfile: str
- SQLite is used to provide the underlying database.
- """
- super(Database, self).__init__()
- # _object_types dict is keyed on type name, where value is a 3-
- # tuple (id, attrs, idx), where:
- # - id is a unique numeric database id for the type,
- # - attrs is a dict containing registered attributes for the type,
- # keyed on attribute name, where value is a 4-tuple of (type, flags,
- # ivtidx, split), where type is a python datatype, flags is a bitmask of
- # ATTR_*, ivtidx is the name of the associated inverted index (used
- # if flags has ATTR_INVERTED_INDEX, otherwise None), and split is the
- # function or regexp used to split terms for the inverted index (or None)
- # - idx is a list of n-tuples, where each n-tuple defines two or more
- # (non-ATTR_SIMPLE) attributes on which to create a multi-column
- # sql index.
- self._object_types = {}
- # _inverted_indexes dict is keyed on index name, where value is
- # a dict keyed on:
- # - min: minimum length of terms
- # - max: maximum length of terms
- # - ignore: list of terms to ignore
- # - split: function or regular expression used to split string ATTR_INVERTED_INDEX
- # attributes.
- self._inverted_indexes = {}
- # True when there are uncommitted changes
- self._dirty = False
- # True when modifications are not allowed to the database, which
- # is the case when Python 3 is opening a database created by Python 2
- # and upgrade_to_py3() has not been called.
- self._readonly = False
- self._dbfile = os.path.realpath(dbfile)
- self._lock = threading.RLock()
- self._lazy_commit_timer = WeakOneShotTimer(self.commit)
- self._lazy_commit_interval = None
- self._open_db()
- def _open_db(self):
- self._db = sqlite.connect(self._dbfile, check_same_thread=False)
- # Create the function "regexp" for the REGEXP operator of SQLite
- self._db.create_function("regexp", 2, RegexpCache())
- self._cursor = self._db.cursor()
- class Cursor(sqlite.Cursor):
- _db = _weakref.ref(self)
- self._db.row_factory = ObjectRow
- # Queries done through this cursor will use the ObjectRow row factory.
- self._qcursor = self._db.cursor(Cursor)
- for cursor in self._cursor, self._qcursor:
- cursor.execute("PRAGMA synchronous=OFF")
- cursor.execute("PRAGMA temp_store=MEMORY")
- cursor.execute("PRAGMA cache_size=50000")
- cursor.execute("PRAGMA page_size=8192")
- if not self._check_table_exists("meta"):
- self._db.executescript(CREATE_SCHEMA % SCHEMA_VERSION)
- row = self._db_query_row("SELECT value FROM meta WHERE attr='version'")
- if float(row[0]) < SCHEMA_VERSION_COMPATIBLE:
- raise DatabaseError("Database '%s' has schema version %s; required %s" % \
- (self._dbfile, row[0], SCHEMA_VERSION_COMPATIBLE))
- self._load_inverted_indexes()
- self._load_object_types()
- def _set_dirty(self):
- if self._lazy_commit_interval is not None:
- self._lazy_commit_timer.start(self._lazy_commit_interval)
- if self._dirty:
- return
- self._dirty = True
- main.signals['exit'].connect(self.commit)
- def _db_query(self, statement, args = (), cursor = None, many = False):
- t0=time.time()
- with self._lock:
- if not cursor:
- cursor = self._cursor
- if many:
- cursor.executemany(statement, args)
- else:
- cursor.execute(statement, args)
- rows = cursor.fetchall()
- t1=time.time()
- #print "QUERY [%.06f%s]: %s" % (t1-t0, ('', ' (many)')[many], statement), args
- return rows
- def _db_query_row(self, statement, args = (), cursor = None):
- rows = self._db_query(statement, args, cursor)
- if len(rows) == 0:
- return None
- return rows[0]
- def _to_obj_tuple(self, obj, numeric=False):
- """
- Returns a normalized object reference as a 2-tuple (type, id).
- :param obj: an ObjectRow, or 2-tuple (type, id)
- :param numeric: if True, coerce type name to a type id
- Raises ValueError if obj is not valid.
- """
- if isinstance(obj, ObjectRow):
- object_type, object_id = obj['type'], obj['id']
- else:
- try:
- object_type, object_id = obj
- if not isinstance(object_type, (int, basestring)) or not isinstance(object_id, (int, long, QExpr)):
- raise TypeError
- except TypeError:
- raise ValueError('Object reference must be either ObjectRow, or (type, id), got %s' % obj)
- if numeric:
- object_type = self._get_type_id(object_type)
- return object_type, object_id
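- # For illustration (a hypothetical 'msg' type):
- #
- #   db._to_obj_tuple(('msg', 42))                -> ('msg', 42)
- #   db._to_obj_tuple(row)                        -> ('msg', 42)  # row is an ObjectRow
- #   db._to_obj_tuple(('msg', 42), numeric=True)  -> (type_id_of_msg, 42)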
- def _check_table_exists(self, table):
- res = self._db_query_row("SELECT name FROM sqlite_master where " \
- "name=? and type='table'", (table,))
- return res != None
- def _register_check_indexes(self, indexes, attrs):
- for cols in indexes:
- if not isinstance(cols, (list, tuple)):
- raise ValueError, "Single column index specified ('%s') where multi-column index expected." % cols
- for col in cols:
- errstr = "Multi-column index (%s) contains" % ",".join(cols)
- if col not in attrs:
- raise ValueError, "%s unknown attribute '%s'" % (errstr, col)
- if not attrs[col][1]:
- raise ValueError, "%s ATTR_SIMPLE attribute '%s'" % (errstr, col)
- def _register_create_multi_indexes(self, indexes, table_name):
- for cols in indexes:
- self._db_query("CREATE INDEX %s_%s_idx ON %s (%s)" % \
- (table_name, "_".join(cols), table_name, ",".join(cols)))
- def register_object_type_attrs(self, type_name, indexes = [], **attrs):
- """
- Register one or more object attributes and/or multi-column indexes for
- the given type name.
- This function modifies the database as needed to accommodate new
- indexes and attributes, either by creating the object's tables (in the
- case of a new object type) or by altering the object's tables to add
- new columns or indexes.
- This method is idempotent: if the attributes and indexes specified have
- not changed from previous invocations, no changes will be made to the
- database. Moreover, newly registered attributes will not affect
- previously registered attributes. This allows, for example, a plugin
- to extend an existing object type created by the core application
- without interfering with it.
- :param type_name: the name of object type the registered attributes or
- indexes apply to.
- :type type_name: str
- :param indexes: a list of tuples where each tuple contains 2 or more
- registered :attr:`~kaa.db.ATTR_SEARCHABLE` attributes
- for which a composite index will be created in the
- underlying database. This is useful for speeding
- up queries involving these attributes combined.
- :type indexes: list of tuples of strings
- :param attrs: keyword arguments defining the attributes to be
- registered. The keyword defining the attribute name
- cannot conflict with any of the names in
- :attr:`~kaa.db.RESERVED_ATTRIBUTES`. See below for a
- more complete specification of the value.
- :type attrs: 2, 3, or 4-tuple
- Previously registered attributes may be updated in limited ways (e.g.
- by adding an index to the attribute). If the change requested is
- not supported, a ValueError will be raised.
- .. note:: Currently, indexes and attributes can only be added, not
- removed. That is, once an attribute or index is added, it lives
- forever.
- Object attributes, which are supplied as keyword arguments, are either
- *searchable* or *simple*. *Searchable* attributes occupy a column in
- the underlying database table and so queries can be performed on these
- attributes, but their types are more restricted. *Simple* attributes
- can be any type that can be pickled, but can't be searched.
- The attribute kwarg value is a tuple of 2 to 4 items in length and in
- the form (attr_type, flags, ivtidx, split).
- * attr_type: the type of the object attribute. For simple attributes
- (:attr:`~kaa.db.ATTR_SIMPLE` in *flags*), this can be any picklable
- type; for searchable attributes (:attr:`~kaa.db.ATTR_SEARCHABLE`
- in *flags*), this must be either *int*, *float*, *str*, *unicode*,
- *bytes*, or *bool*. (On Python 2.5, you can use ``kaa.db.RAW_TYPE``
- instead of *bytes*.)
- * flags: a bitmap of :ref:`attribute flags <attrflags>`
- * ivtidx: name of a previously registered inverted index used for
- this attribute. Only needed if flags contains
- :attr:`~kaa.db.ATTR_INVERTED_INDEX`
- * split: function or regular expression used to split string-based
- values for this attribute into separate terms for indexing. If
- this isn't defined, then the default split strategy for the
- inverted index will be used.
- Apart from not being allowed to conflict with one of the reserved
- names, there is a special case for attribute names: when they have
- the same name as a previously registered inverted index. These
- attributes must be :attr:`~kaa.db.ATTR_SIMPLE`, and of type *list*.
- Terms explicitly associated with the attribute are persisted with the
- object, but when accessed, all terms for all attributes for that
- inverted index will be contained in the list, not just those explicitly
- associated with the same-named attribute.
- The following example shows what an application that indexes email might
- do::
- from kaa.db import *
- from datetime import datetime
- db = Database('email.db')
- db.register_inverted_index('keywords', min=3, max=30)
- db.register_object_type_attrs('msg',
- # Create a composite index on sender and recipient, because
- # (let's suppose) we do a lot of searches for specific
- # senders emailing specific recipients.
- [('sender', 'recipient')],
- # A simple attribute can be anything that's picklable, which datetime is.
- date = (datetime, ATTR_SIMPLE),
- # Sender and recipient names need to be ATTR_SEARCHABLE since
- # they're part of a composite index.
- sender = (unicode, ATTR_SEARCHABLE),
- recipient = (unicode, ATTR_SEARCHABLE),
- # Subject is searchable (standard SQL-based substring matches),
- # but is also indexed as part of the keywords inverted
- # index for fast term-based searching.
- subject = (unicode, ATTR_SEARCHABLE | ATTR_INVERTED_INDEX, 'keywords'),
- # Special case where an attribute name is the same as a registered
- # inverted index. This lets us index on, for example, the message body
- # without actually storing the message inside the database.
- keywords = (list, ATTR_SIMPLE | ATTR_INVERTED_INDEX, 'keywords')
- )
- """
- if len(indexes) == len(attrs) == 0:
- raise ValueError, "Must specify indexes or attributes for object type"
- table_name = "objects_%s" % type_name
- # First pass over the attributes kwargs, sanity-checking provided values.
- for attr_name, attr_defn in attrs.items():
- # We allow the attribute definition to be a 2- to 4-tuple (the last two
- # items are optional), so pad the tuple with None if a 2- or 3-tuple was specified.
- attrs[attr_name] = attr_defn = tuple(attr_defn) + (None,) * (4-len(attr_defn))
- if len(attr_defn) != 4:
- raise ValueError, "Definition for attribute '%s' is not a 2- to 4-tuple." % attr_name
- # Verify the attribute flags contain either ATTR_SEARCHABLE or ATTR_SIMPLE;
- # it can't contain both as that doesn't make sense.
- if attr_defn[1] & (ATTR_SIMPLE | ATTR_SEARCHABLE) not in (ATTR_SIMPLE, ATTR_SEARCHABLE):
- raise ValueError, "Flags for attribute '%s' must contain exactly one " \
- "of ATTR_SIMPLE or ATTR_SEARCHABLE" % attr_name
- # Attribute name can't conflict with reserved names.
- if attr_name in RESERVED_ATTRIBUTES:
- raise ValueError, "Attribute name '%s' is reserved." % attr_name
- elif attr_name in self._inverted_indexes:
- if not attr_defn[1] & ATTR_INVERTED_INDEX or attr_defn[2] != attr_name:
- # Attributes can be named after inverted indexes, but only if
- # ATTR_INVERTED_INDEX is specified and the attribute name is the
- # same as its ivtidx name.
- raise ValueError, "Attribute '%s' conflicts with inverted index of same name, " \
- "but ATTR_INVERTED_INDEX not specified in flags." % attr_name
- if attr_defn[1] & ATTR_INVERTED_INDEX:
- # Attributes with ATTR_INVERTED_INDEX can only be certain types.
- if attr_defn[0] not in (str, unicode, tuple, list, set):
- raise TypeError, "Type for attribute '%s' must be string, unicode, list, tuple, or set " \
- "because it is ATTR_INVERTED_INDEX" % attr_name
- # Make sure inverted index name is valid.
- if attr_defn[2] is None:
- raise ValueError, "Attribute '%s' flags specify inverted index, " \
- "but no inverted index name supplied." % attr_name
- elif attr_defn[2] not in self._inverted_indexes:
- raise ValueError, "Attribute '%s' specifies undefined interverted index '%s'" % \
- (attr_name, attr_defn[2])
- # Compile split regexp if it was given.
- if attr_defn[3] is not None and not callable(attr_defn[3]):
- attrs[attr_name] = attr_defn[:3] + (re.compile(attr_defn[3]),)
- if type_name in self._object_types:
- # This type already exists. Compare given attributes with
- # existing attributes for this type to see what needs to be done
- # (if anything).
- cur_type_id, cur_type_attrs, cur_type_idx = self._object_types[type_name]
- new_attrs = {}
- table_needs_rebuild = False
- changed = False
- for attr_name, attr_defn in attrs.items():
- attr_type, attr_flags, attr_ivtidx, attr_split = attr_defn
- # TODO: converting an attribute from SIMPLE to SEARCHABLE or vice
- # versa isn't supported yet. Raise exception here to prevent
- # potential data loss.
- if attr_name in cur_type_attrs and attr_flags & (ATTR_SEARCHABLE | ATTR_SIMPLE) != \
- cur_type_attrs[attr_name][1] & (ATTR_SEARCHABLE | ATTR_SIMPLE):
- raise ValueError, "Unsupported attempt to convert attribute '%s' " \
- "between ATTR_SIMPLE and ATTR_SEARCHABLE" % attr_name
- if attr_name not in cur_type_attrs or cur_type_attrs[attr_name] != attr_defn:
- # There is a new attribute specified for this type, or an
- # existing one has changed.
- new_attrs[attr_name] = attr_defn
- changed = True
- if attr_flags & ATTR_SEARCHABLE:
- # New attribute isn't simple, needs to alter table.
- table_needs_rebuild = True
- elif attr_flags & ATTR_INVERTED_INDEX:
- # TODO: there is no need to rebuild the table when adding/modifying
- # an ATTR_SIMPLE | ATTR_INVERTED_INDEX attribute, we just need to
- # recreate the delete trigger (and remove any rows from the
- # inverted index's map for this object type if we're removing
- # an association with that ivtidx). For now we will force a
- # rebuild since I'm too lazy to implement the proper way.
- table_needs_rebuild = True
- if attr_name in cur_type_attrs and not cur_type_attrs[attr_name][1] & ATTR_INVERTED_INDEX:
- # FIXME: if we add an inverted index to an existing attribute, we'd
- # need to reparse that attribute in all rows to populate the inverted
- # map. Right now just log a warning.
- log.warning("Adding inverted index '%s' to existing attribute '%s' not fully " \
- "implemented; index may be out of sync.", attr_ivtidx, attr_name)
- if not changed:
- return
- if self._readonly:
- raise DatabaseReadOnlyError('upgrade_to_py3() must be called before database can be modified')
- # Update the attr list to merge both existing and new attributes.
- attrs = cur_type_attrs.copy()
- attrs.update(new_attrs)
- new_indexes = set(indexes).difference(cur_type_idx)
- indexes = set(indexes).union(cur_type_idx)
- self._register_check_indexes(indexes, attrs)
- if not table_needs_rebuild:
- # Only simple (i.e. pickled only) attributes are being added,
- # or only new indexes are added, so we don't need to rebuild the
- # table.
- if len(new_attrs):
- self._db_query("UPDATE types SET attrs_pickle=? WHERE id=?", (dbpickle(attrs), cur_type_id))
- if len(new_indexes):
- self._register_create_multi_indexes(new_indexes, table_name)
- self._db_query("UPDATE types SET idx_pickle=? WHERE id=?", (dbpickle(indexes), cur_type_id))
- self.commit()
- self._load_object_types()
- return
- # We need to update the database now ...
- else:
- # New type definition. Populate attrs with required internal
- # attributes so they get created with the table.
- new_attrs = cur_type_id = None
- # Merge standard attributes with user attributes for this new type.
- attrs.update({
- 'id': (int, ATTR_SEARCHABLE, None, None),
- 'parent_type': (int, ATTR_SEARCHABLE, None, None),
- 'parent_id': (int, ATTR_SEARCHABLE, None, None),
- 'pickle': (RAW_TYPE, ATTR_SEARCHABLE, None, None)
- })
- self._register_check_indexes(indexes, attrs)
- create_stmt = 'CREATE TABLE %s_tmp (' % table_name
- # Iterate through type attributes and append to SQL create statement.
- sql_types = {int: 'INTEGER', long: 'INTEGER', float: 'FLOAT', RAW_TYPE: 'BLOB',
- UNICODE_TYPE: 'TEXT', BYTES_TYPE: 'BLOB', bool: 'INTEGER', basestring: 'TEXT'}
- for attr_name, (attr_type, attr_flags, attr_ivtidx, attr_split) in attrs.items():
- if attr_flags & ATTR_SEARCHABLE:
- # Attribute needs to be a column in the table, not a pickled value.
- if attr_type not in sql_types:
- raise ValueError, "Type '%s' not supported" % str(attr_type)
- create_stmt += '%s %s' % (attr_name, sql_types[attr_type])
- if attr_name == 'id':
- # Special case, these are auto-incrementing primary keys
- create_stmt += ' PRIMARY KEY AUTOINCREMENT'
- create_stmt += ','
- create_stmt = create_stmt.rstrip(',') + ')'
- self._db_query(create_stmt)
- # Add this type to the types table, including the attributes
- # dictionary.
- self._db_query('INSERT OR REPLACE INTO types VALUES(?, ?, ?, ?)',
- (cur_type_id, type_name, dbpickle(attrs), dbpickle(indexes)))
- # Sync self._object_types with the object type definition we just
- # stored to the db.
- self._load_object_types()
- if new_attrs:
- # Migrate rows from old table to new temporary one. Here we copy only
- # ATTR_SEARCHABLE columns that exist in both old and new definitions.
- columns = filter(lambda x: cur_type_attrs[x][1] & ATTR_SEARCHABLE and \
- x in attrs and attrs[x][1] & ATTR_SEARCHABLE, cur_type_attrs.keys())
- columns = ','.join(columns)
- self._db_query('INSERT INTO %s_tmp (%s) SELECT %s FROM %s' % \
- (table_name, columns, columns, table_name))
- # Delete old table.
- self._db_query('DROP TABLE %s' % table_name)
- # Rename temporary table.
- self._db_query('ALTER TABLE %s_tmp RENAME TO %s' % (table_name, table_name))
- # Increase the objectcount for new inverted indexes, and create a
- # trigger that reduces the objectcount for each applicable inverted
- # index when a row is deleted.
- inverted_indexes = self._get_type_inverted_indexes(type_name)
- if inverted_indexes:
- n_rows = self._db_query_row('SELECT COUNT(*) FROM %s' % table_name)[0]
- sql = 'CREATE TRIGGER delete_object_%s DELETE ON %s BEGIN ' % (type_name, table_name)
- for idx_name in inverted_indexes:
- sql += "UPDATE inverted_indexes SET value=MAX(0, value-1) WHERE name='%s' AND attr='objectcount';" % idx_name
- # Add to objectcount (both in db and cached value)
- self._db_query("UPDATE inverted_indexes SET value=value+? WHERE name=? and attr='objectcount'",
- (n_rows, idx_name))
- self._inverted_indexes[idx_name]['objectcount'] += n_rows
- sql += 'END'
- self._db_query(sql)
- # Create index for locating all objects under a given parent.
- self._db_query("CREATE INDEX %s_parent_idx on %s (parent_id, "\
- "parent_type)" % (table_name, table_name))
- # If any of these attributes need to be indexed, create the index
- # for that column.
- for attr_name, (attr_type, attr_flags, attr_ivtidx, attr_split) in attrs.items():
- if attr_flags & ATTR_INDEXED:
- self._db_query("CREATE INDEX %s_%s_idx ON %s (%s)" % \
- (table_name, attr_name, table_name, attr_name))
- # Create multi-column indexes; indexes value has already been verified.
- self._register_create_multi_indexes(indexes, table_name)
- self.commit()
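- # A short usage sketch (the 'video' type and its attributes are
- # hypothetical). The second call is a no-op because the definition is
- # unchanged; the third only merges in the new ATTR_SIMPLE attribute,
- # without rebuilding the table:
- #
- #   db.register_object_type_attrs('video', title=(unicode, ATTR_SEARCHABLE))
- #   db.register_object_type_attrs('video', title=(unicode, ATTR_SEARCHABLE))
- #   db.register_object_type_attrs('video',
- #       title=(unicode, ATTR_SEARCHABLE),
- #       rating=(int, ATTR_SIMPLE))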
- def register_inverted_index(self, name, min = None, max = None, split = None, ignore = None):
- """
- Registers a new inverted index with the database.
- An inverted index maps arbitrary terms to objects and allows you to
- query based on one or more terms. If the inverted index already exists
- with the given parameters, no action is performed.
- :param name: the name of the inverted index; must be alphanumeric.
- :type name: str
- :param min: the minimum length of terms to index; terms smaller
- than this will be ignored. If None (default), there
- is no minimum size.
- :type min: int
- :param max: the maximum length of terms to index; terms larger than
- this will be ignored. If None (default), there is no
- maximum size.
- :type max: int
- :param split: used to parse string-based attributes using this inverted
- index into individual terms. In the case of regexps, the
- split method will be called. (If a string is specified,
- it will be compiled into a regexp first.) If *split* is
- a callable, it will receive a string of text and must return
- a sequence, and each item in the sequence will be indexed
- as an individual term. If split is not specified, the
- default is to split words at non-alphanumeric/underscore/digit
- boundaries.
- :type split: callable, regexp (SRE_Pattern) object, or str
- :param ignore: a list of terms that will not be indexed (so-called
- *stop words*). If specified, each indexed term for this
- inverted index will first be checked against this list.
- If it exists, the term is discarded.
- For example::
- from kaa.db import *
- db = Database('test.db')
- db.register_inverted_index('tags')
- db.register_inverted_index('keywords', min=3, max=30, ignore=STOP_WORDS)
- """
- # Verify specified name doesn't already exist as some object attribute.
- for object_name, object_type in self._object_types.items():
- if name in object_type[1] and name != object_type[1][name][2]:
- raise ValueError, "Inverted index name '%s' conflicts with registered attribute in object '%s'" % \
- (name, object_name)
- if split is None:
- # Default split regexp is to split words on
- # alphanumeric/digits/underscore boundaries.
- split = re.compile(u"(\d+)|[_\W]", re.U)
- elif isinstance(split, basestring):
- split = re.compile(py3_str(split), re.U)
- if name not in self._inverted_indexes and not self._readonly:
- self._db_query('INSERT INTO inverted_indexes VALUES(?, "objectcount", 0)', (name,))
- # Create the tables needed by the inverted index.
- with self._lock:
- self._db.executescript(CREATE_IVTIDX_TEMPLATE.replace('%IDXNAME%', name))
- elif name in self._inverted_indexes:
- defn = self._inverted_indexes[name]
- if min == defn['min'] and max == defn['max'] and split == defn['split'] and \
- ignore == defn['ignore']:
- # Definition unchanged, nothing to do.
- return
- if self._readonly:
- raise DatabaseReadOnlyError('upgrade_to_py3() must be called before database can be modified')
- defn = {
- 'min': min,
- 'max': max,
- 'split': split,
- 'ignore': ignore,
- }
- self._db_query("INSERT OR REPLACE INTO inverted_indexes VALUES(?, 'definition', ?)",
- (name, dbpickle(defn)))
- defn['objectcount'] = 0
- self._inverted_indexes[name] = defn
- self.commit()
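- # For example, a comma/semicolon-delimited tags attribute could use a
- # custom split expression (a string is compiled to a regexp, and its
- # split() method produces the terms):
- #
- #   db.register_inverted_index('tags', split=u'[,;]\s*')
- #
- # Re-registering with unchanged parameters is a no-op.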
- def _load_inverted_indexes(self):
- for name, attr, value in self._db_query("SELECT * from inverted_indexes"):
- if name not in self._inverted_indexes:
- self._inverted_indexes[name] = {}
- if attr == 'objectcount':
- self._inverted_indexes[name][attr] = int(value)
- elif attr == 'definition':
- self._inverted_indexes[name].update(dbunpickle(value))
- def _load_object_types(self):
- is_pickle_proto_2 = False
- for id, name, attrs, idx in self._db_query("SELECT * from types"):
- if attrs[1] == 0x02 or idx[1] == 0x02:
- is_pickle_proto_2 = True
- self._object_types[name] = id, dbunpickle(attrs), dbunpickle(idx)
- if sys.hexversion >= 0x03000000 and is_pickle_proto_2:
- self._readonly = True
- log.warning('kaa.db databases created by Python 2 are read-only until upgrade_to_py3() is called')
- def _get_type_inverted_indexes(self, type_name):
- if type_name not in self._object_types:
- return []
- indexed_attrs = set()
- type_attrs = self._object_types[type_name][1]
- for name, (attr_type, flags, attr_ivtidx, attr_split) in type_attrs.items():
- if flags & ATTR_INVERTED_INDEX:
- indexed_attrs.add(attr_ivtidx)
- return list(indexed_attrs)
- def _get_type_attrs(self, type_name):
- return self._object_types[type_name][1]
- def _get_type_id(self, type_name):
- return self._object_types[type_name][0]
- def _make_query_from_attrs(self, query_type, attrs, type_name):
- type_attrs = self._get_type_attrs(type_name)
- # True if an attribute from …
(File truncated; the remaining lines of db.py are omitted.)