/src/db.py
- # -----------------------------------------------------------------------------
- # db.py - db abstraction module
- # -----------------------------------------------------------------------------
- # Copyright 2006-2012 Dirk Meyer, Jason Tackaberry
- #
- # Please see the file AUTHORS for a complete list of authors.
- #
- # This library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License version
- # 2.1 as published by the Free Software Foundation.
- #
- # This library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- # Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public
- # License along with this library; if not, write to the Free Software
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- # 02110-1301 USA
- #
- # -----------------------------------------------------------------------------
- from __future__ import absolute_import
- __all__ = [
- 'Database', 'QExpr', 'DatabaseError', 'DatabaseReadOnlyError',
- 'split_path', 'ATTR_SIMPLE', 'ATTR_SEARCHABLE', 'ATTR_IGNORE_CASE',
- 'ATTR_INDEXED', 'ATTR_INDEXED_IGNORE_CASE', 'ATTR_INVERTED_INDEX',
- 'RAW_TYPE'
- ]
- # python imports
- import sys
- import os
- import time
- import re
- import logging
- import math
- import cPickle
- import copy_reg
- import _weakref
- import threading
- try:
- # Try a system install of pysqlite
- from pysqlite2 import dbapi2 as sqlite
- except ImportError:
- # Python 2.6 provides sqlite natively, so try that next.
- from sqlite3 import dbapi2 as sqlite
- # kaa base imports
- from .utils import property
- from .strutils import py3_str, BYTES_TYPE, UNICODE_TYPE
- from .timer import WeakOneShotTimer
- from . import main
- if sqlite.version < '2.1.0':
- raise ImportError('pysqlite 2.1.0 or higher required')
- if sqlite.sqlite_version < '3.3.1':
- raise ImportError('sqlite 3.3.1 or higher required')
- # get logging object
- log = logging.getLogger('kaa.base.db')
- SCHEMA_VERSION = 0.2
- SCHEMA_VERSION_COMPATIBLE = 0.2
- CREATE_SCHEMA = """
- CREATE TABLE meta (
- attr TEXT UNIQUE,
- value TEXT
- );
- INSERT INTO meta VALUES('version', %s);
- CREATE TABLE types (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- name TEXT UNIQUE,
- attrs_pickle BLOB,
- idx_pickle BLOB
- );
- CREATE TABLE inverted_indexes (
- name TEXT,
- attr TEXT,
- value TEXT
- );
- CREATE UNIQUE INDEX inverted_indexes_idx on inverted_indexes (name, attr);
- """
- CREATE_IVTIDX_TEMPLATE = """
- CREATE TABLE ivtidx_%IDXNAME%_terms (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- term TEXT,
- count INTEGER
- );
- CREATE UNIQUE INDEX ivtidx_%IDXNAME%_terms_idx on ivtidx_%IDXNAME%_terms (term);
- CREATE TABLE ivtidx_%IDXNAME%_terms_map (
- rank INTEGER,
- term_id INTEGER,
- object_type INTEGER,
- object_id INTEGER,
- frequency FLOAT
- );
- CREATE INDEX ivtidx_%IDXNAME%_terms_map_idx ON ivtidx_%IDXNAME%_terms_map (term_id, rank, object_type, object_id);
- CREATE INDEX ivtidx_%IDXNAME%_terms_map_object_idx ON ivtidx_%IDXNAME%_terms_map (object_id, object_type, term_id);
- CREATE TRIGGER ivtidx_%IDXNAME%_delete_terms_map DELETE ON ivtidx_%IDXNAME%_terms_map
- BEGIN
- UPDATE ivtidx_%IDXNAME%_terms SET count=MAX(0, count-1) WHERE id=old.term_id;
- END;
- """
- ATTR_SIMPLE = 0x01
- ATTR_SEARCHABLE = 0x02 # Is a SQL column, not a pickled field
- ATTR_INDEXED = 0x04 # Will have an SQL index
- ATTR_IGNORE_CASE = 0x08 # Store in db as lowercase for searches.
- ATTR_INVERTED_INDEX = 0x10 # Attribute associated with an inverted idx
- ATTR_INDEXED_IGNORE_CASE = ATTR_INDEXED | ATTR_IGNORE_CASE
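- # A sketch of how these flags compose when registering attributes (the
- # attribute names here are hypothetical, not part of this module):
- #
- #   title = (unicode, ATTR_SEARCHABLE | ATTR_INDEXED_IGNORE_CASE)
- #   keywords = (list, ATTR_SIMPLE | ATTR_INVERTED_INDEX, 'keywords')
- #
- # ATTR_SEARCHABLE attributes become SQL columns; ATTR_SIMPLE attributes are
- # stored in the row's pickle.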
- # These are special attributes for querying. Attributes with
- # these names cannot be registered.
- RESERVED_ATTRIBUTES = ('id', 'parent', 'object', 'type', 'limit', 'attrs', 'distinct', 'orattrs')
- STOP_WORDS = (
- "about", "and", "are", "but", "com", "for", "from", "how", "not",
- "some", "that", "the", "this", "was", "what", "when", "where", "who",
- "will", "with", "the", "www", "http", "org", "of", "on"
- )
- class PyObjectRow(object):
- """
- ObjectRows are dictionary-like objects that represent an object in
- the database. They are used by pysqlite instead of tuples or indexes.
- ObjectRows support on-demand unpickling of the internally stored pickle
- which contains ATTR_SIMPLE attributes.
- This is the native Python implementation of ObjectRow. There is a
- faster C implementation in the _objectrow extension. This
- implementation is still designed to be efficient -- the C version is
- only about 60-70% faster.
- """
- # A dict containing per-query data: [refcount, idxmap, typemap, pickle_idx]
- # This is constructed once for each query, and each row returned in the
- # query references the same data. Each ObjectRow instance increments the
- # refcount when initialized and decrements it when the object is deleted.
- # Once it reaches 0, the entry is removed from the dict.
- queries = {}
- # Use __slots__ as a minor optimization to improve object creation time.
- __slots__ = ('_description', '_object_types', '_type_name', '_row', '_pickle',
- '_idxmap', '_typemap', '_keys')
- def __init__(self, cursor, row, pickle_dict=None):
- # The following is done per row per query, so it should be as light as
- # possible.
- if pickle_dict:
- # Created outside pysqlite, e.g. from Database.add()
- self._pickle = pickle_dict
- self._idxmap = None
- return
- if isinstance(cursor, tuple):
- self._description, self._object_types = cursor
- else:
- self._description = cursor.description
- self._object_types = cursor._db()._object_types
- self._row = row
- # _pickle: False == no pickle present; None == pickle present but
- # empty; if a dict, is the unpickled dictionary; else it's a byte
- # string containing the pickled data.
- self._pickle = False
- self._type_name = row[0]
- try:
- attrs = self._object_types[self._type_name][1]
- except KeyError:
- raise ValueError("Object type '%s' not defined." % self._type_name)
- query_key = id(self._description)
- if query_key in PyObjectRow.queries:
- query_info = PyObjectRow.queries[query_key]
- # Increase refcount to the query info
- query_info[0] += 1
- self._idxmap, self._typemap, pickle_idx = query_info[1:]
- if pickle_idx != -1:
- self._pickle = self._row[pickle_idx]
- return
- # Everything below this is done once per query, not per row, so
- # performance isn't quite as critical.
- idxmap = {} # attr_name -> (idx, pickled, named_ivtidx, type, flags)
- pickle_idx = -1
- for i in range(2, len(self._description)):
- attr_name = self._description[i][0]
- idxmap[attr_name] = i
- if attr_name == 'pickle':
- pickle_idx = i
- self._pickle = self._row[i]
- for attr_name, (attr_type, flags, ivtidx, split) in attrs.items():
- idx = idxmap.get(attr_name, -1)
- pickled = flags & ATTR_SIMPLE or (flags & ATTR_INDEXED_IGNORE_CASE == ATTR_INDEXED_IGNORE_CASE)
- idxmap[attr_name] = idx, pickled, attr_name == ivtidx, attr_type, flags
- # Construct dict mapping type id -> type name. Essentially an
- # inversion of _object_types
- typemap = dict((v[0], k) for k, v in self._object_types.items())
- self._idxmap = idxmap
- self._typemap = typemap
- PyObjectRow.queries[query_key] = [1, idxmap, typemap, pickle_idx]
- def __del__(self):
- if self._idxmap is None:
- # From Database.add(), pickle only, no pysqlite row.
- return
- query_key = id(self._description)
- query_info = PyObjectRow.queries[query_key]
- query_info[0] -= 1
- if query_info[0] == 0:
- # Refcount for this query info is 0, so remove from global queries
- # dict.
- del PyObjectRow.queries[query_key]
- def __getitem__(self, key):
- if self._idxmap is None:
- # From Database.add(), work strictly from pickle
- return self._pickle[key]
- elif key == 'type':
- return self._type_name
- elif key == 'parent':
- type_idx = self._idxmap.get('parent_type', [-1])[0]
- id_idx = self._idxmap.get('parent_id', [-1])[0]
- if type_idx == -1 or id_idx == -1:
- raise KeyError('Parent attribute not available')
- type_id = self._row[type_idx]
- return self._typemap.get(type_id, type_id), self._row[id_idx]
- elif key == '_row':
- return self._row
- elif isinstance(key, int):
- return self._row[key]
- attr = self._idxmap[key]
- attr_idx = attr[0]
- is_indexed_ignore_case = (attr[4] & ATTR_INDEXED_IGNORE_CASE == ATTR_INDEXED_IGNORE_CASE)
- if attr_idx == -1:
- # Attribute is not in the sql row
- if attr[1] and self._pickle is None:
- # Pickle is empty, which means this attribute was never
- # assigned a value. Return a default (an empty list if the
- # attribute is named after an inverted index, otherwise None).
- return [] if attr[2] else None
- elif not attr[1] or self._pickle is False:
- # The requested attribute is not in the sqlite row, and neither is the pickle.
- raise KeyError("ObjectRow does not have enough data to provide '%s'" % key)
- if not attr[1]:
- value = self._row[attr_idx]
- elif attr_idx >= 0 and not self._pickle and is_indexed_ignore_case:
- # Attribute is ATTR_INDEXED_IGNORE_CASE which means the
- # authoritative source is in the pickle, but we don't have it. So just
- # return what we have.
- value = self._row[attr_idx]
- else:
- if self._pickle and not isinstance(self._pickle, dict):
- # We need to check the pickle but it's not unpickled, so do so now.
- self._pickle = dbunpickle(self._pickle)
- if is_indexed_ignore_case:
- key = '__' + key
- if key not in self._pickle:
- return [] if attr[2] else None
- else:
- value = self._pickle[key]
- if sys.hexversion < 0x03000000 and (attr[3] == str or attr[3] == buffer):
- # Python 2's pysqlite returns BLOBs as buffers. If the attribute
- # type is string or buffer (RAW_TYPE on Python 2), convert to string.
- return str(value)
- else:
- return value
- def keys(self):
- if not self._idxmap:
- # Ad hoc ObjectRow, proxy to pickle dict.
- return self._pickle.keys()
- if not hasattr(self, '_keys'):
- self._keys = ['type']
- for name, attr in self._idxmap.items():
- if (attr[0] >= 0 or (attr[1] and self._pickle is not False)) and name != 'pickle':
- self._keys.append(name)
- if 'parent_type' in self._idxmap and 'parent_id' in self._idxmap:
- self._keys.append('parent')
- return self._keys
- def values(self):
- if not self._idxmap:
- # Ad hoc ObjectRow, proxy to pickle dict.
- return self._pickle.values()
- return [self[k] for k in self.keys()]
- def items(self):
- return zip(self.keys(), self.values())
- def get(self, key, default=None):
- try:
- return self[key]
- except KeyError:
- return default
- def has_key(self, key):
- return key in self.keys()
- def __iter__(self):
- return iter(self.keys())
- def __contains__(self, key):
- if key == 'type' or (key == 'parent' and 'parent_id' in self._idxmap):
- return True
- else:
- return key in self._idxmap
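- # Example of dict-like access on a query result (a sketch; assumes a 'msg'
- # type with a 'subject' attribute has been registered):
- #
- #   row = db.query(type='msg')[0]
- #   row['subject']                     # ATTR_SIMPLE values unpickle on demand
- #   row.get('nonexistent', 'default')  # -> 'default'
- #   row['parent']                      # -> (parent_type_name, parent_id)
- #   dict(row.items())                  # materialize as a plain dict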
- if sys.hexversion >= 0x03000000:
- RAW_TYPE = bytes
- PICKLE_PROTOCOL = 3
- class Proto2Unpickler(pickle._Unpickler):
- """
- In spite of the promise that pickles will be compatible between Python
- releases, Python 3 does the wrong thing with non-unicode strings in
- pickles (BINSTRING pickle opcode). It will try to convert them to
- unicode strings, which is wrong when the intention was to store binary
- data in a Python 2 str.
- This class implements a custom unpickler that will load BINSTRING as
- bytes objects. An exception is made for dictionaries, where BINSTRING
- keys are converted to unicode strings.
- Additionally, it maps the unicode and buffer types to corresponding
- Python 3 types.
- """
- dispatch = pickle._Unpickler.dispatch.copy()
- def load_binstring(self):
- len = pickle.mloads(bytes('i', 'ascii') + self.read(4))
- self.append(bytes(self.read(len)))
- dispatch[pickle.BINSTRING[0]] = load_binstring
- def load_short_binstring(self):
- len = ord(self.read(1))
- self.append(bytes(self.read(len)))
- dispatch[pickle.SHORT_BINSTRING[0]] = load_short_binstring
- def load_setitems(self):
- super(Proto2Unpickler, self).load_setitems()
- d = self.stack[-1]
- for k, v in d.items():
- if type(k) == bytes:
- sk = str(k, self.encoding, self.errors)
- if sk not in d:
- d[sk] = v
- del d[k]
- dispatch[pickle.SETITEMS[0]] = load_setitems
- def find_class(self, module, name):
- if module == '__builtin__':
- if name == 'unicode':
- return str
- elif name == 'buffer':
- return bytes
- return super(Proto2Unpickler, self).find_class(module, name)
- def dbunpickle(s):
- if s[1] == 0x02:
- import io
- #import pickletools
- #pickletools.dis(io.BytesIO(bytes(s)))
- return Proto2Unpickler(io.BytesIO(bytes(s))).load()
- else:
- return pickle.loads(bytes(s))
- def dbpickle(value):
- return bytes(pickle.dumps(value, 3))
- # Need to be able to unpickle pickled buffers from Python 2.
- def _unpickle_buffer(s):
- return bytes(s)
- else:
- RAW_TYPE = buffer
- PICKLE_PROTOCOL = 2
- def dbunpickle(s):
- return cPickle.loads(str(s))
- def dbpickle(value):
- return buffer(cPickle.dumps(value, 2))
- # Python2 can't pickle buffer types, so register a handler for that.
- def _pickle_buffer(b):
- return _unpickle_buffer, (str(b),)
- def _unpickle_buffer(s):
- return str(s)
- copy_reg.pickle(buffer, _pickle_buffer, _unpickle_buffer)
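- # On either Python version, dbpickle()/dbunpickle() round-trip values
- # through the database's blob type, e.g.:
- #
- #   blob = dbpickle({'rating': 5, 'tags': ['a', 'b']})
- #   assert dbunpickle(blob) == {'rating': 5, 'tags': ['a', 'b']}
- #
- # On Python 3, dbunpickle() additionally falls back to Proto2Unpickler for
- # protocol-2 pickles written by Python 2.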
- try:
- from . import _objectrow
- except ImportError:
- # Use the python-native ObjectRow
- ObjectRow = PyObjectRow
- else:
- # Use the faster C-based ObjectRow
- ObjectRow = _objectrow.ObjectRow
- # Expose the custom unpickler to the _objectrow module so it can deal with
- # pickles stored inside DB rows.
- _objectrow.dbunpickle = dbunpickle
- # Register a handler for pickling ObjectRow objects.
- def _pickle_ObjectRow(o):
- if o._description:
- return _unpickle_ObjectRow, ((o._description, o._object_types), o._row)
- else:
- return _unpickle_ObjectRow, (None, None, dict(o.items()))
- def _unpickle_ObjectRow(*args):
- return ObjectRow(*args)
- copy_reg.pickle(ObjectRow, _pickle_ObjectRow, _unpickle_ObjectRow)
- PATH_SPLIT = re.compile(ur'(\d+)|[_\W]', re.U | re.X)
- def split_path(s):
- """
- Convenience split function for inverted index attributes. Useful for
- attributes that contain filenames. Splits the given string s into
- component parts (directories, filename), discarding the extension and all
- but the last two directories. What's remaining is split into words and the
- result is returned.
- """
- dirname, filename = os.path.split(s)
- fname_noext, ext = os.path.splitext(filename)
- for part in dirname.strip('/').split(os.path.sep)[2:][-2:] + [fname_noext]:
- for match in PATH_SPLIT.split(part):
- if match:
- yield match
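- # For example:
- #
- #   list(split_path('/home/user/music/Artist/Album/01 Track.mp3'))
- #   -> ['Artist', 'Album', '01', 'Track']
- #
- # Only the last two directories and the extension-stripped filename survive,
- # and runs of digits are yielded as separate terms.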
- # FIXME: this is flawed. We could use placeholders instead, by taking an
- # n-tuple, expanding the single ? to (?, ?, ..., ?), and then extending the
- # query params list with the given tuple/list value.
- def _list_to_printable(value):
- """
- Takes a list of mixed types and outputs a unicode string. For
- example, given the list [42, 'foo', None, "foo's string"], this
- returns the string:
- (42,'foo',NULL,'foo''s string')
- Single quotes are escaped as ''. This is suitable for use in SQL
- queries.
- """
- fixed_items = []
- for item in value:
- if isinstance(item, (int, long, float)):
- fixed_items.append(str(item))
- elif item is None:
- fixed_items.append("NULL")
- elif isinstance(item, UNICODE_TYPE):
- fixed_items.append("'%s'" % item.replace("'", "''"))
- elif isinstance(item, BYTES_TYPE):
- fixed_items.append("'%s'" % py3_str(item.replace("'", "''")))
- else:
- raise Exception, "Unsupported type '%s' given to _list_to_printable" % type(item)
- return '(' + ','.join(fixed_items) + ')'
- class DatabaseError(Exception):
- pass
- class DatabaseReadOnlyError(Exception):
- pass
- class QExpr(object):
- """
- Flexible query expressions for use with :meth:`kaa.db.Database.query()`
- """
- def __init__(self, operator, operand):
- """
- :param operator: ``=``, ``!=``, ``<``, ``<=``, ``>``, ``>=``, ``in``,
- ``not in``, ``range``, ``like``, or ``regexp``
- :type operator: str
- :param operand: the rvalue of the expression; any scalar values as part of
- the operand must be the same type as the attribute being
- evaluated
- Except for ``in``, ``not in``, and ``range``, the operand must be the
- type of the registered attribute being evaluated (e.g. unicode, int,
- etc.).
- The operand for ``in`` and ``not in`` is a list or tuple of the attribute
- type, to test inclusion in the given set.
- The ``range`` operator accepts a 2-tuple specifying min and max values
- for the attribute. The Python expression age=QExpr('range', (20, 30))
- translates to ``age >= 20 AND age <= 30``.
- """
- operator = operator.lower()
- assert(operator in ('=', '!=', '<', '<=', '>', '>=', 'in', 'not in', 'range', 'like', 'regexp'))
- if operator in ('in', 'not in', 'range'):
- assert(isinstance(operand, (list, tuple)))
- if operator == 'range':
- assert(len(operand) == 2)
- self._operator = operator
- self._operand = operand
- def as_sql(self, var):
- if self._operator == "range":
- a, b = self._operand
- return "%s >= ? AND %s <= ?" % (var, var), (a, b)
- elif self._operator in ("in", "not in"):
- return "%s %s %s" % (var, self._operator.upper(),
- _list_to_printable(self._operand)), ()
- else:
- return "%s %s ?" % (var, self._operator.upper()), \
- (self._operand,)
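- # Example translations (the column name 'age' is arbitrary):
- #
- #   QExpr('range', (20, 30)).as_sql('age')  -> ('age >= ? AND age <= ?', (20, 30))
- #   QExpr('in', (18, 21)).as_sql('age')     -> ('age IN (18,21)', ())
- #   QExpr('>=', 20).as_sql('age')           -> ('age >= ?', (20,))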
- class RegexpCache(object):
- def __init__(self):
- self.last_item = None
- self.last_expr = None # raw expression string last compiled
- self.last_re = None # compiled pattern for last_expr
- def __call__(self, expr, item):
- if item is None:
- return 0
- if self.last_item == item and self.last_item is not None and self.last_expr == expr:
- return self.last_result
- if self.last_expr != expr:
- # Keep the raw string for comparison: a compiled pattern never
- # compares equal to the incoming expression string, which would
- # defeat the cache and recompile on every call.
- self.last_expr = expr
- self.last_re = re.compile(unicode(expr), re.U)
- self.last_item = item
- # FIXME: bad conversion to unicode!
- self.last_result = self.last_re.match(unicode(item)) is not None
- return self.last_result
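- # An instance of this class is registered as the SQLite REGEXP function in
- # _open_db(). The last compiled expression is cached across calls:
- #
- #   cache = RegexpCache()
- #   cache(u'^foo', u'foobar')   # -> True (compiles and caches '^foo')
- #   cache(u'^foo', u'barfoo')   # -> False (reuses the cached pattern)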
- class Database(object):
- def __init__(self, dbfile):
- """
- Open a database, creating one if it doesn't already exist.
- :param dbfile: path to the database file
- :type dbfile: str
- SQLite is used to provide the underlying database.
- """
- super(Database, self).__init__()
- # _object_types dict is keyed on type name, where value is a 3-
- # tuple (id, attrs, idx), where:
- # - id is a unique numeric database id for the type,
- # - attrs is a dict containing registered attributes for the type,
- # keyed on attribute name, where value is a 4-tuple of (type, flags,
- # ivtidx, split), where type is a python datatype, flags is a bitmask of
- # ATTR_*, ivtidx is the name of the associated inverted index (used
- # if flags has ATTR_INVERTED_INDEX, otherwise None), and split is the
- # function or regexp used to split terms for the inverted index (or None)
- # - idx is a list of n-tuples, where each n-tuple defines two or more
- # (non-ATTR_SIMPLE) attributes on which to create a multi-column
- # sql index.
- self._object_types = {}
- # _inverted_indexes dict is keyed on index name, where value is
- # a dict keyed on:
- # - min: minimum length of terms
- # - max: maximum length of terms
- # - ignore: list of terms to ignore
- # - split: function or regular expression used to split string ATTR_INVERTED_INDEX
- # attributes.
- self._inverted_indexes = {}
- # True when there are uncommitted changes
- self._dirty = False
- # True when modifications are not allowed to the database, which
- # is the case when Python 3 is opening a database created by Python 2
- # and upgrade_to_py3() has not been called.
- self._readonly = False
- self._dbfile = os.path.realpath(dbfile)
- self._lock = threading.RLock()
- self._lazy_commit_timer = WeakOneShotTimer(self.commit)
- self._lazy_commit_interval = None
- self._open_db()
- def _open_db(self):
- self._db = sqlite.connect(self._dbfile, check_same_thread=False)
- # Create the function "regexp" for the REGEXP operator of SQLite
- self._db.create_function("regexp", 2, RegexpCache())
- self._cursor = self._db.cursor()
- class Cursor(sqlite.Cursor):
- _db = _weakref.ref(self)
- self._db.row_factory = ObjectRow
- # Queries done through this cursor will use the ObjectRow row factory.
- self._qcursor = self._db.cursor(Cursor)
- for cursor in self._cursor, self._qcursor:
- cursor.execute("PRAGMA synchronous=OFF")
- cursor.execute("PRAGMA temp_store=MEMORY")
- cursor.execute("PRAGMA cache_size=50000")
- cursor.execute("PRAGMA page_size=8192")
- if not self._check_table_exists("meta"):
- self._db.executescript(CREATE_SCHEMA % SCHEMA_VERSION)
- row = self._db_query_row("SELECT value FROM meta WHERE attr='version'")
- if float(row[0]) < SCHEMA_VERSION_COMPATIBLE:
- raise DatabaseError("Database '%s' has schema version %s; required %s" % \
- (self._dbfile, row[0], SCHEMA_VERSION_COMPATIBLE))
- self._load_inverted_indexes()
- self._load_object_types()
- def _set_dirty(self):
- if self._lazy_commit_interval is not None:
- self._lazy_commit_timer.start(self._lazy_commit_interval)
- if self._dirty:
- return
- self._dirty = True
- main.signals['exit'].connect(self.commit)
- def _db_query(self, statement, args = (), cursor = None, many = False):
- t0=time.time()
- with self._lock:
- if not cursor:
- cursor = self._cursor
- if many:
- cursor.executemany(statement, args)
- else:
- cursor.execute(statement, args)
- rows = cursor.fetchall()
- t1=time.time()
- #print "QUERY [%.06f%s]: %s" % (t1-t0, ('', ' (many)')[many], statement), args
- return rows
- def _db_query_row(self, statement, args = (), cursor = None):
- rows = self._db_query(statement, args, cursor)
- if len(rows) == 0:
- return None
- return rows[0]
- def _to_obj_tuple(self, obj, numeric=False):
- """
- Returns a normalized object reference as a 2-tuple (type, id).
- :param obj: an ObjectRow, or 2-tuple (type, id)
- :param numeric: if True, coerce type name to a type id
- Raises ValueError if obj is not valid.
- """
- if isinstance(obj, ObjectRow):
- object_type, object_id = obj['type'], obj['id']
- else:
- try:
- object_type, object_id = obj
- if not isinstance(object_type, (int, basestring)) or not isinstance(object_id, (int, long, QExpr)):
- raise TypeError
- except TypeError:
- raise ValueError('Object reference must be either ObjectRow, or (type, id), got %s' % obj)
- if numeric:
- object_type = self._get_type_id(object_type)
- return object_type, object_id
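- # For illustration (a hypothetical 'msg' type):
- #
- #   db._to_obj_tuple(('msg', 42))                -> ('msg', 42)
- #   db._to_obj_tuple(row)                        -> ('msg', 42)  # row is an ObjectRow
- #   db._to_obj_tuple(('msg', 42), numeric=True)  -> (type_id_of_msg, 42)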
- def _check_table_exists(self, table):
- res = self._db_query_row("SELECT name FROM sqlite_master where " \
- "name=? and type='table'", (table,))
- return res != None
- def _register_check_indexes(self, indexes, attrs):
- for cols in indexes:
- if not isinstance(cols, (list, tuple)):
- raise ValueError, "Single column index specified ('%s') where multi-column index expected." % cols
- for col in cols:
- errstr = "Multi-column index (%s) contains" % ",".join(cols)
- if col not in attrs:
- raise ValueError, "%s unknown attribute '%s'" % (errstr, col)
- if not attrs[col][1]:
- raise ValueError, "%s ATTR_SIMPLE attribute '%s'" % (errstr, col)
- def _register_create_multi_indexes(self, indexes, table_name):
- for cols in indexes:
- self._db_query("CREATE INDEX %s_%s_idx ON %s (%s)" % \
- (table_name, "_".join(cols), table_name, ",".join(cols)))
- def register_object_type_attrs(self, type_name, indexes = [], **attrs):
- """
- Register one or more object attributes and/or multi-column indexes for
- the given type name.
- This function modifies the database as needed to accommodate new
- indexes and attributes, either by creating the object's tables (in the
- case of a new object type) or by altering the object's tables to add
- new columns or indexes.
- This method is idempotent: if the attributes and indexes specified have
- not changed from previous invocations, no changes will be made to the
- database. Moreover, newly registered attributes will not affect
- previously registered attributes. This allows, for example, a plugin
- to extend an existing object type created by the core application
- without interfering with it.
- :param type_name: the name of object type the registered attributes or
- indexes apply to.
- :type type_name: str
- :param indexes: a list of tuples where each tuple contains 2 or more
- registered :attr:`~kaa.db.ATTR_SEARCHABLE` attributes
- for which a composite index will be created in the
- underlying database. This is useful for speeding
- up queries involving these attributes combined.
- :type indexes: list of tuples of strings
- :param attrs: keyword arguments defining the attributes to be
- registered. The keyword defining the attribute name
- cannot conflict with any of the names in
- :attr:`~kaa.db.RESERVED_ATTRIBUTES`. See below for a
- more complete specification of the value.
- :type attrs: 2, 3, or 4-tuple
- Previously registered attributes may be updated in limited ways (e.g.
- by adding an index to the attribute). If the change requested is
- not supported, a ValueError will be raised.
- .. note:: Currently, indexes and attributes can only be added, not
- removed. That is, once an attribute or index is added, it lives
- forever.
- Object attributes, which are supplied as keyword arguments, are either
- *searchable* or *simple*. *Searchable* attributes occupy a column in
- the underlying database table and so queries can be performed on these
- attributes, but their types are more restricted. *Simple* attributes
- can be any type that can be pickled, but can't be searched.
- The attribute kwarg value is a tuple of 2 to 4 items in length and in
- the form (attr_type, flags, ivtidx, split).
- * attr_type: the type of the object attribute. For simple attributes
- (:attr:`~kaa.db.ATTR_SIMPLE` in *flags*), this can be any picklable
- type; for searchable attributes (:attr:`~kaa.db.ATTR_SEARCHABLE`
- in *flags*), this must be either *int*, *float*, *str*, *unicode*,
- *bytes*, or *bool*. (On Python 2.5, you can use ``kaa.db.RAW_TYPE``
- instead of *bytes*.)
- * flags: a bitmap of :ref:`attribute flags <attrflags>`
- * ivtidx: name of a previously registered inverted index used for
- this attribute. Only needed if flags contains
- :attr:`~kaa.db.ATTR_INVERTED_INDEX`
- * split: function or regular expression used to split string-based
- values for this attribute into separate terms for indexing. If
- this isn't defined, then the default split strategy for the
- inverted index will be used.
- Apart from not being allowed to conflict with one of the reserved
- names, there is a special case for attribute names: when they have
- the same name as a previously registered inverted index. These
- attributes must be :attr:`~kaa.db.ATTR_SIMPLE`, and of type *list*.
- Terms explicitly associated with the attribute are persisted with the
- object, but when accessed, all terms for all attributes for that
- inverted index will be contained in the list, not just those explicitly
- associated with the same-named attribute.
- The following example shows what an application that indexes email might
- do::
- from kaa.db import *
- from datetime import datetime
- db = Database('email.db')
- db.register_inverted_index('keywords', min=3, max=30)
- db.register_object_type_attrs('msg',
- # Create a composite index on sender and recipient, because
- # (let's suppose) we do a lot of searches for specific
- # senders emailing specific recipients.
- [('sender', 'recipient')],
- # A simple attribute can be anything that's picklable, which datetime is.
- date = (datetime, ATTR_SIMPLE),
- # Sender and recipient names need to be ATTR_SEARCHABLE since
- # they're part of a composite index.
- sender = (unicode, ATTR_SEARCHABLE),
- recipient = (unicode, ATTR_SEARCHABLE),
- # Subject is searchable (standard SQL-based substring matches),
- # but is also indexed as part of the keywords inverted
- # index for fast term-based searching.
- subject = (unicode, ATTR_SEARCHABLE | ATTR_INVERTED_INDEX, 'keywords'),
- # Special case where an attribute name is the same as a registered
- # inverted index. This lets us index on, for example, the message body
- # without actually storing the message inside the database.
- keywords = (list, ATTR_SIMPLE | ATTR_INVERTED_INDEX, 'keywords')
- )
- """
- if len(indexes) == len(attrs) == 0:
- raise ValueError, "Must specify indexes or attributes for object type"
- table_name = "objects_%s" % type_name
- # First pass over the attributes kwargs, sanity-checking provided values.
- for attr_name, attr_defn in attrs.items():
- # We allow the attribute definition to be a 2- to 4-tuple (the last two
- # items are optional), so pad the tuple with None if a 2- or 3-tuple was specified.
- attrs[attr_name] = attr_defn = tuple(attr_defn) + (None,) * (4-len(attr_defn))
- if len(attr_defn) != 4:
- raise ValueError, "Definition for attribute '%s' is not a 2- to 4-tuple." % attr_name
- # Verify the attribute flags contain either ATTR_SEARCHABLE or ATTR_SIMPLE;
- # it can't contain both as that doesn't make sense.
- if attr_defn[1] & (ATTR_SIMPLE | ATTR_SEARCHABLE) not in (ATTR_SIMPLE, ATTR_SEARCHABLE):
- raise ValueError, "Flags for attribute '%s' must contain exactly one " \
- "of ATTR_SIMPLE or ATTR_SEARCHABLE" % attr_name
- # Attribute name can't conflict with reserved names.
- if attr_name in RESERVED_ATTRIBUTES:
- raise ValueError, "Attribute name '%s' is reserved." % attr_name
- elif attr_name in self._inverted_indexes:
- if not attr_defn[1] & ATTR_INVERTED_INDEX or attr_defn[2] != attr_name:
- # Attributes can be named after inverted indexes, but only if
- # ATTR_INVERTED_INDEX is specified and the attribute name is the
- # same as its ivtidx name.
- raise ValueError, "Attribute '%s' conflicts with inverted index of same name, " \
- "but ATTR_INVERTED_INDEX not specified in flags." % attr_name
- if attr_defn[1] & ATTR_INVERTED_INDEX:
- # Attributes with ATTR_INVERTED_INDEX can only be certain types.
- if attr_defn[0] not in (str, unicode, tuple, list, set):
- raise TypeError, "Type for attribute '%s' must be string, unicode, list, tuple, or set " \
- "because it is ATTR_INVERTED_INDEX" % attr_name
- # Make sure inverted index name is valid.
- if attr_defn[2] is None:
- raise ValueError, "Attribute '%s' flags specify inverted index, " \
- "but no inverted index name supplied." % attr_name
- elif attr_defn[2] not in self._inverted_indexes:
- raise ValueError, "Attribute '%s' specifies undefined interverted index '%s'" % \
- (attr_name, attr_defn[2])
- # Compile split regexp if it was given.
- if attr_defn[3] is not None and not callable(attr_defn[3]):
- attrs[attr_name] = attr_defn[:3] + (re.compile(attr_defn[3]),)
- if type_name in self._object_types:
- # This type already exists. Compare given attributes with
- # existing attributes for this type to see what needs to be done
- # (if anything).
- cur_type_id, cur_type_attrs, cur_type_idx = self._object_types[type_name]
- new_attrs = {}
- table_needs_rebuild = False
- changed = False
- for attr_name, attr_defn in attrs.items():
- attr_type, attr_flags, attr_ivtidx, attr_split = attr_defn
- # TODO: converting an attribute from SIMPLE to SEARCHABLE or vice
- # versa isn't supported yet. Raise exception here to prevent
- # potential data loss.
- if attr_name in cur_type_attrs and attr_flags & (ATTR_SEARCHABLE | ATTR_SIMPLE) != \
- cur_type_attrs[attr_name][1] & (ATTR_SEARCHABLE | ATTR_SIMPLE):
- raise ValueError, "Unsupported attempt to convert attribute '%s' " \
- "between ATTR_SIMPLE and ATTR_SEARCHABLE" % attr_name
- if attr_name not in cur_type_attrs or cur_type_attrs[attr_name] != attr_defn:
- # There is a new attribute specified for this type, or an
- # existing one has changed.
- new_attrs[attr_name] = attr_defn
- changed = True
- if attr_flags & ATTR_SEARCHABLE:
- # New attribute isn't simple, needs to alter table.
- table_needs_rebuild = True
- elif attr_flags & ATTR_INVERTED_INDEX:
- # TODO: there is no need to rebuild the table when adding/modifying
- # an ATTR_SIMPLE | ATTR_INVERTED_INDEX attribute, we just need to
- # recreate the delete trigger (and remove any rows from the
- # inverted index's map for this object type if we're removing
- # an association with that ivtidx). For now we will force a
- # rebuild since I'm too lazy to implement the proper way.
- table_needs_rebuild = True
- if attr_name in cur_type_attrs and not cur_type_attrs[attr_name][1] & ATTR_INVERTED_INDEX:
- # FIXME: if we add an inverted index to an existing attribute, we'd
- # need to reparse that attribute in all rows to populate the inverted
- # map. Right now just log a warning.
- log.warning("Adding inverted index '%s' to existing attribute '%s' not fully " \
- "implemented; index may be out of sync.", attr_ivtidx, attr_name)
- if not changed:
- return
- if self._readonly:
- raise DatabaseReadOnlyError('upgrade_to_py3() must be called before database can be modified')
- # Update the attr list to merge both existing and new attributes.
- attrs = cur_type_attrs.copy()
- attrs.update(new_attrs)
- new_indexes = set(indexes).difference(cur_type_idx)
- indexes = set(indexes).union(cur_type_idx)
- self._register_check_indexes(indexes, attrs)
- if not table_needs_rebuild:
- # Only simple (i.e. pickled only) attributes are being added,
- # or only new indexes are added, so we don't need to rebuild the
- # table.
- if len(new_attrs):
- self._db_query("UPDATE types SET attrs_pickle=? WHERE id=?", (dbpickle(attrs), cur_type_id))
- if len(new_indexes):
- self._register_create_multi_indexes(new_indexes, table_name)
- self._db_query("UPDATE types SET idx_pickle=? WHERE id=?", (dbpickle(indexes), cur_type_id))
- self.commit()
- self._load_object_types()
- return
- # We need to update the database now ...
- else:
- # New type definition. Populate attrs with required internal
- # attributes so they get created with the table.
- new_attrs = cur_type_id = None
- # Merge standard attributes with user attributes for this new type.
- attrs.update({
- 'id': (int, ATTR_SEARCHABLE, None, None),
- 'parent_type': (int, ATTR_SEARCHABLE, None, None),
- 'parent_id': (int, ATTR_SEARCHABLE, None, None),
- 'pickle': (RAW_TYPE, ATTR_SEARCHABLE, None, None)
- })
- self._register_check_indexes(indexes, attrs)
- create_stmt = 'CREATE TABLE %s_tmp (' % table_name
- # Iterate through type attributes and append to SQL create statement.
- sql_types = {int: 'INTEGER', long: 'INTEGER', float: 'FLOAT', RAW_TYPE: 'BLOB',
- UNICODE_TYPE: 'TEXT', BYTES_TYPE: 'BLOB', bool: 'INTEGER', basestring: 'TEXT'}
- for attr_name, (attr_type, attr_flags, attr_ivtidx, attr_split) in attrs.items():
- if attr_flags & ATTR_SEARCHABLE:
- # Attribute needs to be a column in the table, not a pickled value.
- if attr_type not in sql_types:
- raise ValueError, "Type '%s' not supported" % str(attr_type)
- create_stmt += '%s %s' % (attr_name, sql_types[attr_type])
- if attr_name == 'id':
- # Special case, these are auto-incrementing primary keys
- create_stmt += ' PRIMARY KEY AUTOINCREMENT'
- create_stmt += ','
- create_stmt = create_stmt.rstrip(',') + ')'
- self._db_query(create_stmt)
- # Add this type to the types table, including the attributes
- # dictionary.
- self._db_query('INSERT OR REPLACE INTO types VALUES(?, ?, ?, ?)',
- (cur_type_id, type_name, dbpickle(attrs), dbpickle(indexes)))
- # Sync self._object_types with the object type definition we just
- # stored to the db.
- self._load_object_types()
- if new_attrs:
- # Migrate rows from old table to new temporary one. Here we copy only
- # ATTR_SEARCHABLE columns that exist in both old and new definitions.
- columns = filter(lambda x: cur_type_attrs[x][1] & ATTR_SEARCHABLE and \
- x in attrs and attrs[x][1] & ATTR_SEARCHABLE, cur_type_attrs.keys())
- columns = ','.join(columns)
- self._db_query('INSERT INTO %s_tmp (%s) SELECT %s FROM %s' % \
- (table_name, columns, columns, table_name))
- # Delete old table.
- self._db_query('DROP TABLE %s' % table_name)
- # Rename temporary table.
- self._db_query('ALTER TABLE %s_tmp RENAME TO %s' % (table_name, table_name))
- # Increase the objectcount for new inverted indexes, and create a
- # trigger that reduces the objectcount for each applicable inverted
- # index when a row is deleted.
- inverted_indexes = self._get_type_inverted_indexes(type_name)
- if inverted_indexes:
- n_rows = self._db_query_row('SELECT COUNT(*) FROM %s' % table_name)[0]
- sql = 'CREATE TRIGGER delete_object_%s DELETE ON %s BEGIN ' % (type_name, table_name)
- for idx_name in inverted_indexes:
- sql += "UPDATE inverted_indexes SET value=MAX(0, value-1) WHERE name='%s' AND attr='objectcount';" % idx_name
- # Add to objectcount (both in db and cached value)
- self._db_query("UPDATE inverted_indexes SET value=value+? WHERE name=? and attr='objectcount'",
- (n_rows, idx_name))
- self._inverted_indexes[idx_name]['objectcount'] += n_rows
- sql += 'END'
- self._db_query(sql)
- # Create index for locating all objects under a given parent.
- self._db_query("CREATE INDEX %s_parent_idx on %s (parent_id, "\
- "parent_type)" % (table_name, table_name))
- # If any of these attributes need to be indexed, create the index
- # for that column.
- for attr_name, (attr_type, attr_flags, attr_ivtidx, attr_split) in attrs.items():
- if attr_flags & ATTR_INDEXED:
- self._db_query("CREATE INDEX %s_%s_idx ON %s (%s)" % \
- (table_name, attr_name, table_name, attr_name))
- # Create multi-column indexes; indexes value has already been verified.
- self._register_create_multi_indexes(indexes, table_name)
- self.commit()
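- # A short usage sketch (the 'video' type and its attributes are
- # hypothetical). The second call is a no-op because the definition is
- # unchanged; the third only merges in the new ATTR_SIMPLE attribute,
- # without rebuilding the table:
- #
- #   db.register_object_type_attrs('video', title=(unicode, ATTR_SEARCHABLE))
- #   db.register_object_type_attrs('video', title=(unicode, ATTR_SEARCHABLE))
- #   db.register_object_type_attrs('video',
- #       title=(unicode, ATTR_SEARCHABLE),
- #       rating=(int, ATTR_SIMPLE))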
- def register_inverted_index(self, name, min = None, max = None, split = None, ignore = None):
- """
- Registers a new inverted index with the database.
- An inverted index maps arbitrary terms to objects and allows you to
- query based on one or more terms. If the inverted index already exists
- with the given parameters, no action is performed.
- :param name: the name of the inverted index; must be alphanumeric.
- :type name: str
- :param min: the minimum length of terms to index; terms smaller
- than this will be ignored. If None (default), there
- is no minimum size.
- :type min: int
- :param max: the maximum length of terms to index; terms larger than
- this will be ignored. If None (default), there is no
- maximum size.
- :type max: int
- :param split: used to parse string-based attributes using this inverted
- index into individual terms. In the case of regexps, the
- split method will be called. (If a string is specified,
- it will be compiled into a regexp first.) If *split* is
- a callable, it will receive a string of text and must return
- a sequence, and each item in the sequence will be indexed
- as an individual term. If split is not specified, the
- default is to split words at non-alphanumeric/underscore/digit
- boundaries.
- :type split: callable, regexp (SRE_Pattern) object, or str
- :param ignore: a list of terms that will not be indexed (so-called
- *stop words*). If specified, each indexed term for this
- inverted index will first be checked against this list.
- If it exists, the term is discarded.
- For example::
- from kaa.db import *
- db = Database('test.db')
- db.register_inverted_index('tags')
- db.register_inverted_index('keywords', min=3, max=30, ignore=STOP_WORDS)
- """
- # Verify specified name doesn't already exist as some object attribute.
- for object_name, object_type in self._object_types.items():
- if name in object_type[1] and name != object_type[1][name][2]:
- raise ValueError, "Inverted index name '%s' conflicts with registered attribute in object '%s'" % \
- (name, object_name)
- if split is None:
- # Default split regexp is to split words on
- # alphanumeric/digits/underscore boundaries.
- split = re.compile(u"(\d+)|[_\W]", re.U)
- elif isinstance(split, basestring):
- split = re.compile(py3_str(split), re.U)
- if name not in self._inverted_indexes and not self._readonly:
- self._db_query('INSERT INTO inverted_indexes VALUES(?, "objectcount", 0)', (name,))
- # Create the tables needed by the inverted index.
- with self._lock:
- self._db.executescript(CREATE_IVTIDX_TEMPLATE.replace('%IDXNAME%', name))
- elif name in self._inverted_indexes:
- defn = self._inverted_indexes[name]
- if min == defn['min'] and max == defn['max'] and split == defn['split'] and \
- ignore == defn['ignore']:
- # Definition unchanged, nothing to do.
- return
- if self._readonly:
- raise DatabaseReadOnlyError('upgrade_to_py3() must be called before database can be modified')
- defn = {
- 'min': min,
- 'max': max,
- 'split': split,
- 'ignore': ignore,
- }
- self._db_query("INSERT OR REPLACE INTO inverted_indexes VALUES(?, 'definition', ?)",
- (name, dbpickle(defn)))
- defn['objectcount'] = 0
- self._inverted_indexes[name] = defn
- self.commit()
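- # For example, a comma/semicolon-delimited tags attribute could use a
- # custom split expression (a string is compiled to a regexp, and its
- # split() method produces the terms):
- #
- #   db.register_inverted_index('tags', split=u'[,;]\s*')
- #
- # Re-registering with unchanged parameters is a no-op.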
- def _load_inverted_indexes(self):
- for name, attr, value in self._db_query("SELECT * from inverted_indexes"):
- if name not in self._inverted_indexes:
- self._inverted_indexes[name] = {}
- if attr == 'objectcount':
- self._inverted_indexes[name][attr] = int(value)
- elif attr == 'definition':
- self._inverted_indexes[name].update(dbunpickle(value))
- def _load_object_types(self):
- is_pickle_proto_2 = False
- for id, name, attrs, idx in self._db_query("SELECT * from types"):
- if attrs[1] == 0x02 or idx[1] == 0x02:
- is_pickle_proto_2 = True
- self._object_types[name] = id, dbunpickle(attrs), dbunpickle(idx)
- if sys.hexversion >= 0x03000000 and is_pickle_proto_2:
- self._readonly = True
- log.warning('kaa.db databases created by Python 2 are read-only until upgrade_to_py3() is called')
- def _get_type_inverted_indexes(self, type_name):
- if type_name not in self._object_types:
- return []
- indexed_attrs = set()
- type_attrs = self._object_types[type_name][1]
- for name, (attr_type, flags, attr_ivtidx, attr_split) in type_attrs.items():
- if flags & ATTR_INVERTED_INDEX:
- indexed_attrs.add(attr_ivtidx)
- return list(indexed_attrs)
- def _get_type_attrs(self, type_name):
- return self._object_types[type_name][1]
- def _get_type_id(self, type_name):
- return self._object_types[type_name][0]
- def _make_query_from_attrs(self, query_type, attrs, type_name):
- type_attrs = self._get_type_attrs(type_name)
- # True if an attribute from …
(File truncated; the remaining lines of db.py are omitted.)