
/src/db.py

https://github.com/Dischi/kaa-base
Python | 2571 lines | 2424 code | 23 blank | 124 comment
Possible License(s): LGPL-2.1

This large file has been truncated; only the first portion is shown below.

# -----------------------------------------------------------------------------
# db.py - db abstraction module
# -----------------------------------------------------------------------------
# Copyright 2006-2012 Dirk Meyer, Jason Tackaberry
#
# Please see the file AUTHORS for a complete list of authors.
#
# This library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License version
# 2.1 as published by the Free Software Foundation.
#
# This library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 USA
#
# -----------------------------------------------------------------------------
from __future__ import absolute_import
__all__ = [
    'Database', 'QExpr', 'DatabaseError', 'DatabaseReadOnlyError',
    'split_path', 'ATTR_SIMPLE', 'ATTR_SEARCHABLE', 'ATTR_IGNORE_CASE',
    'ATTR_INDEXED', 'ATTR_INDEXED_IGNORE_CASE', 'ATTR_INVERTED_INDEX',
    'RAW_TYPE'
]
# python imports
import sys
import os
import time
import re
import logging
import math
import cPickle
import copy_reg
import _weakref
import threading

try:
    # Try a system install of pysqlite
    from pysqlite2 import dbapi2 as sqlite
except ImportError:
    # Python 2.6 provides sqlite natively, so try that next.
    from sqlite3 import dbapi2 as sqlite

# kaa base imports
from .utils import property
from .strutils import py3_str, BYTES_TYPE, UNICODE_TYPE
from .timer import WeakOneShotTimer
from . import main

if sqlite.version < '2.1.0':
    raise ImportError('pysqlite 2.1.0 or higher required')
if sqlite.sqlite_version < '3.3.1':
    raise ImportError('sqlite 3.3.1 or higher required')

# get logging object
log = logging.getLogger('kaa.base.db')
SCHEMA_VERSION = 0.2
SCHEMA_VERSION_COMPATIBLE = 0.2

CREATE_SCHEMA = """
    CREATE TABLE meta (
        attr TEXT UNIQUE,
        value TEXT
    );
    INSERT INTO meta VALUES('version', %s);

    CREATE TABLE types (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT UNIQUE,
        attrs_pickle BLOB,
        idx_pickle BLOB
    );

    CREATE TABLE inverted_indexes (
        name TEXT,
        attr TEXT,
        value TEXT
    );

    CREATE UNIQUE INDEX inverted_indexes_idx on inverted_indexes (name, attr);
"""

CREATE_IVTIDX_TEMPLATE = """
    CREATE TABLE ivtidx_%IDXNAME%_terms (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        term TEXT,
        count INTEGER
    );
    CREATE UNIQUE INDEX ivtidx_%IDXNAME%_terms_idx on ivtidx_%IDXNAME%_terms (term);

    CREATE TABLE ivtidx_%IDXNAME%_terms_map (
        rank INTEGER,
        term_id INTEGER,
        object_type INTEGER,
        object_id INTEGER,
        frequency FLOAT
    );
    CREATE INDEX ivtidx_%IDXNAME%_terms_map_idx ON ivtidx_%IDXNAME%_terms_map (term_id, rank, object_type, object_id);
    CREATE INDEX ivtidx_%IDXNAME%_terms_map_object_idx ON ivtidx_%IDXNAME%_terms_map (object_id, object_type, term_id);

    CREATE TRIGGER ivtidx_%IDXNAME%_delete_terms_map DELETE ON ivtidx_%IDXNAME%_terms_map
    BEGIN
        UPDATE ivtidx_%IDXNAME%_terms SET count=MAX(0, count-1) WHERE id=old.term_id;
    END;
"""

ATTR_SIMPLE              = 0x01
ATTR_SEARCHABLE          = 0x02   # Is a SQL column, not a pickled field
ATTR_INDEXED             = 0x04   # Will have an SQL index
ATTR_IGNORE_CASE         = 0x08   # Store in db as lowercase for searches.
ATTR_INVERTED_INDEX      = 0x10   # Attribute associated with an inverted idx
ATTR_INDEXED_IGNORE_CASE = ATTR_INDEXED | ATTR_IGNORE_CASE

# These are special attributes for querying.  Attributes with
# these names cannot be registered.
RESERVED_ATTRIBUTES = ('id', 'parent', 'object', 'type', 'limit', 'attrs', 'distinct', 'orattrs')

STOP_WORDS = (
    "about", "and", "are", "but", "com", "for", "from", "how", "not",
    "some", "that", "the", "this", "was", "what", "when", "where", "who",
    "will", "with", "the", "www", "http", "org", "of", "on"
)

class PyObjectRow(object):
    """
    ObjectRows are dictionary-like objects that represent an object in
    the database.  They are used by pysqlite instead of tuples or indexes.

    ObjectRows support on-demand unpickling of the internally stored pickle
    which contains ATTR_SIMPLE attributes.

    This is the native Python implementation of ObjectRow.  There is a
    faster C implementation in the _objectrow extension.  This
    implementation is still designed to be efficient -- the C version is
    only about 60-70% faster.
    """
    # A dict containing per-query data: [refcount, idxmap, typemap, pickle_idx]
    # This is constructed once for each query, and each row returned in the
    # query references the same data.  Each ObjectRow instance adds to the
    # refcount once initialized, and is decremented when the object is deleted.
    # Once it reaches 0, the entry is removed from the dict.
    queries = {}

    # Use __slots__ as a minor optimization to improve object creation time.
    __slots__ = ('_description', '_object_types', '_type_name', '_row', '_pickle',
                 '_idxmap', '_typemap', '_keys')
    def __init__(self, cursor, row, pickle_dict=None):
        # The following is done per row per query, so it should be as light as
        # possible.
        if pickle_dict:
            # Created outside pysqlite, e.g. from Database.add()
            self._pickle = pickle_dict
            self._idxmap = None
            return

        if isinstance(cursor, tuple):
            self._description, self._object_types = cursor
        else:
            self._description = cursor.description
            self._object_types = cursor._db()._object_types

        self._row = row
        # _pickle: False == no pickle present; None == pickle present but
        # empty; if a dict, is the unpickled dictionary; else it's a byte
        # string containing the pickled data.
        self._pickle = False
        self._type_name = row[0]
        try:
            attrs = self._object_types[self._type_name][1]
        except KeyError:
            raise ValueError("Object type '%s' not defined." % self._type_name)

        query_key = id(self._description)
        if query_key in PyObjectRow.queries:
            query_info = PyObjectRow.queries[query_key]
            # Increase refcount to the query info
            query_info[0] += 1
            self._idxmap, self._typemap, pickle_idx = query_info[1:]
            if pickle_idx != -1:
                self._pickle = self._row[pickle_idx]
            return

        # Everything below this is done once per query, not per row, so
        # performance isn't quite as critical.
        idxmap = {}  # attr_name -> (idx, pickled, named_ivtdx, type, flags)
        pickle_idx = -1
        for i in range(2, len(self._description)):
            attr_name = self._description[i][0]
            idxmap[attr_name] = i
            if attr_name == 'pickle':
                pickle_idx = i
                self._pickle = self._row[i]

        for attr_name, (attr_type, flags, ivtidx, split) in attrs.items():
            idx = idxmap.get(attr_name, -1)
            pickled = flags & ATTR_SIMPLE or (flags & ATTR_INDEXED_IGNORE_CASE == ATTR_INDEXED_IGNORE_CASE)
            idxmap[attr_name] = idx, pickled, attr_name == ivtidx, attr_type, flags

        # Construct dict mapping type id -> type name.  Essentially an
        # inversion of _object_types
        typemap = dict((v[0], k) for k, v in self._object_types.items())

        self._idxmap = idxmap
        self._typemap = typemap
        PyObjectRow.queries[query_key] = [1, idxmap, typemap, pickle_idx]

    def __del__(self):
        if self._idxmap is None:
            # From Database.add(), pickle only, no pysqlite row.
            return
        query_key = id(self._description)
        query_info = PyObjectRow.queries[query_key]
        query_info[0] -= 1
        if query_info[0] == 0:
            # Refcount for this query info is 0, so remove from global queries
            # dict.
            del PyObjectRow.queries[query_key]
    def __getitem__(self, key):
        if self._idxmap is None:
            # From Database.add(), work strictly from pickle
            return self._pickle[key]
        elif key == 'type':
            return self._type_name
        elif key == 'parent':
            type_idx = self._idxmap.get('parent_type', [-1])[0]
            id_idx = self._idxmap.get('parent_id', [-1])[0]
            if type_idx == -1 or id_idx == -1:
                raise KeyError('Parent attribute not available')
            type_id = self._row[type_idx]
            return self._typemap.get(type_id, type_id), self._row[id_idx]
        elif key == '_row':
            return self._row
        elif isinstance(key, int):
            return self._row[key]

        attr = self._idxmap[key]
        attr_idx = attr[0]
        is_indexed_ignore_case = (attr[4] & ATTR_INDEXED_IGNORE_CASE == ATTR_INDEXED_IGNORE_CASE)
        if attr_idx == -1:
            # Attribute is not in the sql row
            if attr[1] and self._pickle is None:
                # Pickle is empty, which means this attribute was never
                # assigned a value.  Return a default (an empty list if the
                # attribute is named after an inverted index, otherwise None).
                return [] if attr[2] else None
            elif not attr[1] or self._pickle is False:
                # The requested attribute is not in the sqlite row, and neither is the pickle.
                raise KeyError("ObjectRow does not have enough data to provide '%s'" % key)

        if not attr[1]:
            value = self._row[attr_idx]
        elif attr_idx >= 0 and not self._pickle and is_indexed_ignore_case:
            # Attribute is ATTR_INDEXED_IGNORE_CASE which means the
            # authoritative source is in the pickle, but we don't have it.  So
            # just return what we have.
            value = self._row[attr_idx]
        else:
            if self._pickle and not isinstance(self._pickle, dict):
                # We need to check the pickle but it's not unpickled, so do so now.
                self._pickle = dbunpickle(self._pickle)
            if is_indexed_ignore_case:
                key = '__' + key
            if key not in self._pickle:
                return [] if attr[2] else None
            else:
                value = self._pickle[key]

        if sys.hexversion < 0x03000000 and (attr[3] == str or attr[3] == buffer):
            # Python 2's pysqlite returns BLOBs as buffers.  If the attribute
            # type is string or buffer (RAW_TYPE on Python 2), convert to string.
            return str(value)
        else:
            return value
    def keys(self):
        if not self._idxmap:
            # Ad hoc ObjectRow, proxy to pickle dict.
            return self._pickle.keys()
        if not hasattr(self, '_keys'):
            self._keys = ['type']
            for name, attr in self._idxmap.items():
                if (attr[0] >= 0 or (attr[1] and self._pickle is not False)) and name != 'pickle':
                    self._keys.append(name)
            if 'parent_type' in self._idxmap and 'parent_id' in self._idxmap:
                self._keys.append('parent')
        return self._keys

    def values(self):
        if not self._idxmap:
            # Ad hoc ObjectRow, proxy to pickle dict.
            return self._pickle.values()
        return [self[k] for k in self.keys()]

    def items(self):
        return zip(self.keys(), self.values())

    def get(self, key, default=None):
        try:
            return self[key]
        except KeyError:
            return default

    def has_key(self, key):
        return key in self.keys()

    def __iter__(self):
        return iter(self.keys())

    def __contains__(self, key):
        if key == 'type' or (key == 'parent' and 'parent_id' in self._idxmap):
            return True
        else:
            return key in self._idxmap
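
# Illustrative sketch of the dict-like ObjectRow interface above, using a
# pickle-only row as produced by Database.add().  The attribute names here
# are hypothetical.
def _example_objectrow_usage():
    row = PyObjectRow(None, None, {'title': u'Example', 'rating': 5})
    assert row['title'] == u'Example'
    # Missing keys fall back to get()'s default, just like a dict.
    assert row.get('missing') is None
    return list(row.keys())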

if sys.hexversion >= 0x03000000:
    RAW_TYPE = bytes
    PICKLE_PROTOCOL = 3

    class Proto2Unpickler(pickle._Unpickler):
        """
        In spite of the promise that pickles will be compatible between Python
        releases, Python 3 does the wrong thing with non-unicode strings in
        pickles (BINSTRING pickle opcode).  It will try to convert them to
        unicode strings, which is wrong when the intention was to store binary
        data in a Python 2 str.

        This class implements a custom unpickler that will load BINSTRING as
        bytes objects.  An exception is made for dictionaries, where BINSTRING
        keys are converted to unicode strings.

        Additionally, it maps the unicode and buffer types to corresponding
        Python 3 types.
        """
        dispatch = pickle._Unpickler.dispatch.copy()

        def load_binstring(self):
            len = pickle.mloads(bytes('i', 'ascii') + self.read(4))
            self.append(bytes(self.read(len)))
        dispatch[pickle.BINSTRING[0]] = load_binstring

        def load_short_binstring(self):
            len = ord(self.read(1))
            self.append(bytes(self.read(len)))
        dispatch[pickle.SHORT_BINSTRING[0]] = load_short_binstring

        def load_setitems(self):
            super(Proto2Unpickler, self).load_setitems()
            d = self.stack[-1]
            # Iterate over a snapshot of the items, since the dict is
            # mutated inside the loop.
            for k, v in list(d.items()):
                if type(k) == bytes:
                    sk = str(k, self.encoding, self.errors)
                    if sk not in d:
                        d[sk] = v
                        del d[k]
        dispatch[pickle.SETITEMS[0]] = load_setitems

        def find_class(self, module, name):
            if module == '__builtin__':
                if name == 'unicode':
                    return str
                elif name == 'buffer':
                    return bytes
            return super(Proto2Unpickler, self).find_class(module, name)

    def dbunpickle(s):
        if s[1] == 0x02:
            import io
            #import pickletools
            #pickletools.dis(io.BytesIO(bytes(s)))
            return Proto2Unpickler(io.BytesIO(bytes(s))).load()
        else:
            return pickle.loads(bytes(s))

    def dbpickle(value):
        return bytes(pickle.dumps(value, 3))

    # Need to be able to unpickle pickled buffers from Python 2.
    def _unpickle_buffer(s):
        return bytes(s)
else:
    RAW_TYPE = buffer
    PICKLE_PROTOCOL = 2

    def dbunpickle(s):
        return cPickle.loads(str(s))

    def dbpickle(value):
        return buffer(cPickle.dumps(value, 2))

    # Python2 can't pickle buffer types, so register a handler for that.
    def _pickle_buffer(b):
        return _unpickle_buffer, (str(b),)

    def _unpickle_buffer(s):
        return str(s)

    copy_reg.pickle(buffer, _pickle_buffer, _unpickle_buffer)
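
# Illustrative sketch: dbpickle() and dbunpickle() round-trip a value through
# the BLOB representation used for the 'pickle' column, whichever branch
# above was selected at import time.
def _example_pickle_roundtrip(value=None):
    blob = dbpickle({'key': value})
    return dbunpickle(blob) == {'key': value}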

try:
    from . import _objectrow
except ImportError:
    # Use the python-native ObjectRow
    ObjectRow = PyObjectRow
else:
    # Use the faster C-based ObjectRow
    ObjectRow = _objectrow.ObjectRow
    # Expose the custom unpickler to the _objectrow module so it can deal with
    # pickles stored inside DB rows.
    _objectrow.dbunpickle = dbunpickle

# Register a handler for pickling ObjectRow objects.
def _pickle_ObjectRow(o):
    if o._description:
        return _unpickle_ObjectRow, ((o._description, o._object_types), o._row)
    else:
        return _unpickle_ObjectRow, (None, None, dict(o.items()))

def _unpickle_ObjectRow(*args):
    return ObjectRow(*args)

copy_reg.pickle(ObjectRow, _pickle_ObjectRow, _unpickle_ObjectRow)

PATH_SPLIT = re.compile(ur'(\d+)|[_\W]', re.U | re.X)

def split_path(s):
    """
    Convenience split function for inverted index attributes.  Useful for
    attributes that contain filenames.  Splits the given string s into
    component parts (directories, filename), discarding the extension and all
    but the last two directories.  What's remaining is split into words and
    the result is returned.
    """
    dirname, filename = os.path.split(s)
    fname_noext, ext = os.path.splitext(filename)
    for part in dirname.strip('/').split(os.path.sep)[2:][-2:] + [fname_noext]:
        for match in PATH_SPLIT.split(part):
            if match:
                yield match
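
# Illustrative sketch: for a hypothetical media path (assuming a POSIX
# os.path.sep), split_path() keeps the last two directories plus the
# extensionless filename, splitting each on digit runs and non-word
# characters:
#
#   list(split_path('/data/media/tv/Some.Show/s01e02.avi'))
#   -> ['tv', 'Some', 'Show', 's', '01', 'e', '02']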

# FIXME: this is flawed.  Can use placeholders by taking a n-tuple and
# replacing ? to (?, ?, ..., n) and then extend the query params list with the
# given tuple/list value.
def _list_to_printable(value):
    """
    Takes a list of mixed types and outputs a unicode string.  For example,
    given the list [42, 'foo', None, "foo's' string"], this returns the
    string:

        (42, 'foo', NULL, 'foo''s'' string')

    Single quotes are escaped as ''.  This is suitable for use in SQL
    queries.
    """
    fixed_items = []
    for item in value:
        if isinstance(item, (int, long, float)):
            fixed_items.append(str(item))
        elif item is None:
            fixed_items.append("NULL")
        elif isinstance(item, UNICODE_TYPE):
            fixed_items.append("'%s'" % item.replace("'", "''"))
        elif isinstance(item, BYTES_TYPE):
            fixed_items.append("'%s'" % py3_str(item.replace("'", "''")))
        else:
            raise Exception, "Unsupported type '%s' given to _list_to_printable" % type(item)

    return '(' + ','.join(fixed_items) + ')'
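
# Illustrative sketch of the escaping behavior:
#
#   _list_to_printable([42, u'foo', None, u"it's"])
#   -> "(42,'foo',NULL,'it''s')"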

class DatabaseError(Exception):
    pass

class DatabaseReadOnlyError(Exception):
    pass

class QExpr(object):
    """
    Flexible query expressions for use with :meth:`kaa.db.Database.query()`
    """
    def __init__(self, operator, operand):
        """
        :param operator: ``=``, ``!=``, ``<``, ``<=``, ``>``, ``>=``, ``in``,
                         ``not in``, ``range``, ``like``, or ``regexp``
        :type operator: str
        :param operand: the rvalue of the expression; any scalar values as part of
                        the operand must be the same type as the attribute being
                        evaluated

        Except for ``in``, ``not in``, and ``range``, the operand must be the
        type of the registered attribute being evaluated (e.g. unicode, int,
        etc.).

        The operand for ``in`` and ``not in`` is a list or tuple of the
        attribute type, to test inclusion in the given set.

        The ``range`` operator accepts a 2-tuple specifying min and max values
        for the attribute.  The Python expression age=QExpr('range', (20, 30))
        translates to ``age >= 20 AND age <= 30``.
        """
        operator = operator.lower()
        assert(operator in ('=', '!=', '<', '<=', '>', '>=', 'in', 'not in', 'range', 'like', 'regexp'))
        if operator in ('in', 'not in', 'range'):
            assert(isinstance(operand, (list, tuple)))
            if operator == 'range':
                assert(len(operand) == 2)
        self._operator = operator
        self._operand = operand

    def as_sql(self, var):
        if self._operator == "range":
            a, b = self._operand
            return "%s >= ? AND %s <= ?" % (var, var), (a, b)
        elif self._operator in ("in", "not in"):
            return "%s %s %s" % (var, self._operator.upper(),
                                 _list_to_printable(self._operand)), ()
        else:
            return "%s %s ?" % (var, self._operator.upper()), \
                   (self._operand,)
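
# Illustrative sketch: how QExpr renders to parameterized SQL fragments (the
# column name 'age' is hypothetical):
#
#   QExpr('range', (20, 30)).as_sql('age')  -> ('age >= ? AND age <= ?', (20, 30))
#   QExpr('in', (18, 21)).as_sql('age')     -> ('age IN (18,21)', ())
#   QExpr('>', 21).as_sql('age')            -> ('age > ?', (21,))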

class RegexpCache(object):
    def __init__(self):
        self.last_item = None
        self.last_expr = None
        # Compiled form of last_expr, kept separately so the raw expression
        # string can still be compared against incoming queries.
        self.last_re = None

    def __call__(self, expr, item):
        if item is None:
            return 0

        if self.last_item == item and self.last_item is not None and self.last_expr == expr:
            return self.last_result

        if self.last_expr != expr:
            self.last_expr = expr
            self.last_re = re.compile(unicode(expr), re.U)

        self.last_item = item
        # FIXME: bad conversion to unicode!
        self.last_result = self.last_re.match(unicode(item)) is not None
        return self.last_result

class Database(object):
    def __init__(self, dbfile):
        """
        Open a database, creating one if it doesn't already exist.

        :param dbfile: path to the database file
        :type dbfile: str

        SQLite is used to provide the underlying database.
        """
        super(Database, self).__init__()

        # _object_types dict is keyed on type name, where value is a 3-
        # tuple (id, attrs, idx), where:
        #   - id is a unique numeric database id for the type,
        #   - attrs is a dict containing registered attributes for the type,
        #     keyed on attribute name, where value is a 4-tuple of (type,
        #     flags, ivtidx, split), where type is a python datatype, flags is
        #     a bitmask of ATTR_*, ivtidx is the name of the associated
        #     inverted index (used if flags has ATTR_INVERTED_INDEX, otherwise
        #     None), and split is the per-attribute split function/regexp (or
        #     None),
        #   - idx is a list of n-tuples, where each n-tuple defines two or more
        #     (non-ATTR_SIMPLE) attributes on which to create a multi-column
        #     sql index.
        self._object_types = {}

        # _inverted_indexes dict is keyed on index name, where value is
        # a dict keyed on:
        #   - min: minimum length of terms
        #   - max: maximum length of terms
        #   - ignore: list of terms to ignore
        #   - split: function or regular expression used to split string
        #     ATTR_INVERTED_INDEX attributes.
        self._inverted_indexes = {}

        # True when there are uncommitted changes
        self._dirty = False
        # True when modifications are not allowed to the database, which
        # is the case when Python 3 is opening a database created by Python 2
        # and upgrade_to_py3() has not been called.
        self._readonly = False
        self._dbfile = os.path.realpath(dbfile)
        self._lock = threading.RLock()
        self._lazy_commit_timer = WeakOneShotTimer(self.commit)
        self._lazy_commit_interval = None
        self._open_db()
    def _open_db(self):
        self._db = sqlite.connect(self._dbfile, check_same_thread=False)
        # Create the function "regexp" for the REGEXP operator of SQLite
        self._db.create_function("regexp", 2, RegexpCache())
        self._cursor = self._db.cursor()

        class Cursor(sqlite.Cursor):
            _db = _weakref.ref(self)
        self._db.row_factory = ObjectRow
        # Queries done through this cursor will use the ObjectRow row factory.
        self._qcursor = self._db.cursor(Cursor)

        for cursor in self._cursor, self._qcursor:
            cursor.execute("PRAGMA synchronous=OFF")
            cursor.execute("PRAGMA temp_store=MEMORY")
            cursor.execute("PRAGMA cache_size=50000")
            cursor.execute("PRAGMA page_size=8192")

        if not self._check_table_exists("meta"):
            self._db.executescript(CREATE_SCHEMA % SCHEMA_VERSION)

        row = self._db_query_row("SELECT value FROM meta WHERE attr='version'")
        if float(row[0]) < SCHEMA_VERSION_COMPATIBLE:
            raise DatabaseError("Database '%s' has schema version %s; version %s is required" % \
                                (self._dbfile, row[0], SCHEMA_VERSION_COMPATIBLE))

        self._load_inverted_indexes()
        self._load_object_types()
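
    # Illustrative sketch: the "regexp" function registered in _open_db() is
    # what backs QExpr('regexp', ...), so a (hypothetical) query like
    #   db.query(type='msg', subject=QExpr('regexp', u'urgent|asap'))
    # is evaluated by RegexpCache via SQLite's REGEXP operator.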

    def _set_dirty(self):
        if self._lazy_commit_interval is not None:
            self._lazy_commit_timer.start(self._lazy_commit_interval)
        if self._dirty:
            return
        self._dirty = True
        main.signals['exit'].connect(self.commit)
    def _db_query(self, statement, args=(), cursor=None, many=False):
        t0 = time.time()
        with self._lock:
            if not cursor:
                cursor = self._cursor
            if many:
                cursor.executemany(statement, args)
            else:
                cursor.execute(statement, args)
            rows = cursor.fetchall()
        t1 = time.time()
        #print "QUERY [%.06f%s]: %s" % (t1-t0, ('', ' (many)')[many], statement), args
        return rows

    def _db_query_row(self, statement, args=(), cursor=None):
        rows = self._db_query(statement, args, cursor)
        if len(rows) == 0:
            return None
        return rows[0]
    def _to_obj_tuple(self, obj, numeric=False):
        """
        Returns a normalized object reference as a 2-tuple (type, id).

        :param obj: an ObjectRow, or 2-tuple (type, id)
        :param numeric: if True, coerce type name to a type id

        Raises ValueError if obj is not valid.
        """
        if isinstance(obj, ObjectRow):
            object_type, object_id = obj['type'], obj['id']
        else:
            try:
                object_type, object_id = obj
                if not isinstance(object_type, (int, basestring)) or not isinstance(object_id, (int, long, QExpr)):
                    raise TypeError
            except TypeError:
                raise ValueError('Object reference must be either ObjectRow, or (type, id), got %s' % obj)

        if numeric:
            object_type = self._get_type_id(object_type)

        return object_type, object_id
    def _check_table_exists(self, table):
        res = self._db_query_row("SELECT name FROM sqlite_master WHERE " \
                                 "name=? AND type='table'", (table,))
        return res is not None

    def _register_check_indexes(self, indexes, attrs):
        for cols in indexes:
            if not isinstance(cols, (list, tuple)):
                raise ValueError, "Single column index specified ('%s') where multi-column index expected." % cols
            for col in cols:
                errstr = "Multi-column index (%s) contains" % ",".join(cols)
                if col not in attrs:
                    raise ValueError, "%s unknown attribute '%s'" % (errstr, col)
                # Indexed columns must be searchable; ATTR_SIMPLE attributes
                # live in the pickle and can't be indexed.
                if not attrs[col][1] & ATTR_SEARCHABLE:
                    raise ValueError, "%s ATTR_SIMPLE attribute '%s'" % (errstr, col)

    def _register_create_multi_indexes(self, indexes, table_name):
        for cols in indexes:
            self._db_query("CREATE INDEX %s_%s_idx ON %s (%s)" % \
                           (table_name, "_".join(cols), table_name, ",".join(cols)))
    def register_object_type_attrs(self, type_name, indexes=[], **attrs):
        """
        Register one or more object attributes and/or multi-column indexes for
        the given type name.

        This function modifies the database as needed to accommodate new
        indexes and attributes, either by creating the object's tables (in the
        case of a new object type) or by altering the object's tables to add
        new columns or indexes.

        This method is idempotent: if the attributes and indexes specified
        have not changed from previous invocations, no changes will be made to
        the database.  Moreover, newly registered attributes will not affect
        previously registered attributes.  This allows, for example, a plugin
        to extend an existing object type created by the core application
        without interfering with it.

        :param type_name: the name of the object type the registered
                          attributes or indexes apply to.
        :type type_name: str
        :param indexes: a list of tuples where each tuple contains 2 or more
                        registered :attr:`~kaa.db.ATTR_SEARCHABLE` attributes
                        for which a composite index will be created in the
                        underlying database.  This is useful for speeding
                        up queries involving these attributes combined.
        :type indexes: list of tuples of strings
        :param attrs: keyword arguments defining the attributes to be
                      registered.  The keyword defining the attribute name
                      cannot conflict with any of the names in
                      :attr:`~kaa.db.RESERVED_ATTRIBUTES`.  See below for a
                      more complete specification of the value.
        :type attrs: 2, 3, or 4-tuple

        Previously registered attributes may be updated in limited ways (e.g.
        by adding an index to the attribute).  If the change requested is
        not supported, a ValueError will be raised.

        .. note:: Currently, indexes and attributes can only be added, not
           removed.  That is, once an attribute or index is added, it lives
           forever.

        Object attributes, which are supplied as keyword arguments, are either
        *searchable* or *simple*.  *Searchable* attributes occupy a column in
        the underlying database table and so queries can be performed on these
        attributes, but their types are more restricted.  *Simple* attributes
        can be any type that can be pickled, but can't be searched.

        The attribute kwarg value is a tuple of 2 to 4 items in length and in
        the form (attr_type, flags, ivtidx, split).

            * attr_type: the type of the object attribute.  For simple
              attributes (:attr:`~kaa.db.ATTR_SIMPLE` in *flags*), this can be
              any picklable type; for searchable attributes
              (:attr:`~kaa.db.ATTR_SEARCHABLE` in *flags*), this must be
              either *int*, *float*, *str*, *unicode*, *bytes*, or *bool*.
              (On Python 2.5, you can use ``kaa.db.RAW_TYPE`` instead of
              *bytes*.)
            * flags: a bitmap of :ref:`attribute flags <attrflags>`
            * ivtidx: name of a previously registered inverted index used for
              this attribute.  Only needed if flags contains
              :attr:`~kaa.db.ATTR_INVERTED_INDEX`
            * split: function or regular expression used to split string-based
              values for this attribute into separate terms for indexing.  If
              this isn't defined, then the default split strategy for the
              inverted index will be used.

        Apart from not being allowed to conflict with one of the reserved
        names, there is a special case for attribute names: when they have
        the same name as a previously registered inverted index.  These
        attributes must be :attr:`~kaa.db.ATTR_SIMPLE`, and of type *list*.
        Terms explicitly associated with the attribute are persisted with the
        object, but when accessed, all terms for all attributes for that
        inverted index will be contained in the list, not just those
        explicitly associated with the same-named attribute.

        The following example shows what an application that indexes email
        might do::

            from kaa.db import *
            from datetime import datetime
            db = Database('email.db')
            db.register_inverted_index('keywords', min=3, max=30)
            db.register_object_type_attrs('msg',
                # Create a composite index on sender and recipient, because
                # (let's suppose) we do a lot of searches for specific
                # senders emailing specific recipients.
                [('sender', 'recipient')],
                # A simple attribute can be anything that's picklable, which datetime is.
                date = (datetime, ATTR_SIMPLE),
                # Sender and recipient names need to be ATTR_SEARCHABLE since
                # they're part of a composite index.
                sender = (unicode, ATTR_SEARCHABLE),
                recipient = (unicode, ATTR_SEARCHABLE),
                # Subject is searchable (standard SQL-based substring matches),
                # but is also indexed as part of the keywords inverted
                # index for fast term-based searching.
                subject = (unicode, ATTR_SEARCHABLE | ATTR_INVERTED_INDEX, 'keywords'),
                # Special case where an attribute name is the same as a registered
                # inverted index.  This lets us index on, for example, the message body
                # without actually storing the message inside the database.
                keywords = (list, ATTR_SIMPLE | ATTR_INVERTED_INDEX, 'keywords')
            )
        """
        if len(indexes) == len(attrs) == 0:
            raise ValueError, "Must specify indexes or attributes for object type"

        table_name = "objects_%s" % type_name

        # First pass over the attributes kwargs, sanity-checking provided values.
        for attr_name, attr_defn in attrs.items():
            # We allow the attribute definition to be either a 2- to 4-tuple (the
            # last two are optional), so pad the tuple with None if a 2- or
            # 3-tuple was specified.
            attrs[attr_name] = attr_defn = tuple(attr_defn) + (None,) * (4 - len(attr_defn))
            if len(attr_defn) != 4:
                raise ValueError, "Definition for attribute '%s' is not a 2- to 4-tuple." % attr_name

            # Verify the attribute flags contain either ATTR_SEARCHABLE or ATTR_SIMPLE;
            # it can't contain both as that doesn't make sense.
            if attr_defn[1] & (ATTR_SIMPLE | ATTR_SEARCHABLE) not in (ATTR_SIMPLE, ATTR_SEARCHABLE):
                raise ValueError, "Flags for attribute '%s' must contain exactly one " \
                                  "of ATTR_SIMPLE or ATTR_SEARCHABLE" % attr_name

            # Attribute name can't conflict with reserved names.
            if attr_name in RESERVED_ATTRIBUTES:
                raise ValueError, "Attribute name '%s' is reserved." % attr_name
            elif attr_name in self._inverted_indexes:
                if not attr_defn[1] & ATTR_INVERTED_INDEX or attr_defn[2] != attr_name:
                    # Attributes can be named after inverted indexes, but only if
                    # ATTR_INVERTED_INDEX is specified and the attribute name is the
                    # same as its ivtidx name.
                    raise ValueError, "Attribute '%s' conflicts with inverted index of same name, " \
                                      "but ATTR_INVERTED_INDEX not specified in flags." % attr_name

            if attr_defn[1] & ATTR_INVERTED_INDEX:
                # Attributes with ATTR_INVERTED_INDEX can only be certain types.
                if attr_defn[0] not in (str, unicode, tuple, list, set):
                    raise TypeError, "Type for attribute '%s' must be string, unicode, list, tuple, or set " \
                                     "because it is ATTR_INVERTED_INDEX" % attr_name

                # Make sure the inverted index name is valid.
                if attr_defn[2] is None:
                    raise ValueError, "Attribute '%s' flags specify inverted index, " \
                                      "but no inverted index name supplied." % attr_name
                elif attr_defn[2] not in self._inverted_indexes:
                    raise ValueError, "Attribute '%s' specifies undefined inverted index '%s'" % \
                                      (attr_name, attr_defn[2])

            # Compile the split regexp if it was given.
            if attr_defn[3] is not None and not callable(attr_defn[3]):
                attrs[attr_name] = attr_defn[:3] + (re.compile(attr_defn[3]),)
        if type_name in self._object_types:
            # This type already exists.  Compare given attributes with
            # existing attributes for this type to see what needs to be done
            # (if anything).
            cur_type_id, cur_type_attrs, cur_type_idx = self._object_types[type_name]
            new_attrs = {}
            table_needs_rebuild = False
            changed = False
            for attr_name, attr_defn in attrs.items():
                attr_type, attr_flags, attr_ivtidx, attr_split = attr_defn
                # TODO: converting an attribute from SIMPLE to SEARCHABLE or vice
                # versa isn't supported yet.  Raise exception here to prevent
                # potential data loss.
                if attr_name in cur_type_attrs and attr_flags & (ATTR_SEARCHABLE | ATTR_SIMPLE) != \
                   cur_type_attrs[attr_name][1] & (ATTR_SEARCHABLE | ATTR_SIMPLE):
                    raise ValueError, "Unsupported attempt to convert attribute '%s' " \
                                      "between ATTR_SIMPLE and ATTR_SEARCHABLE" % attr_name

                if attr_name not in cur_type_attrs or cur_type_attrs[attr_name] != attr_defn:
                    # There is a new attribute specified for this type, or an
                    # existing one has changed.
                    new_attrs[attr_name] = attr_defn
                    changed = True
                    if attr_flags & ATTR_SEARCHABLE:
                        # New attribute isn't simple, needs to alter table.
                        table_needs_rebuild = True
                    elif attr_flags & ATTR_INVERTED_INDEX:
                        # TODO: there is no need to rebuild the table when adding/modifying
                        # an ATTR_SIMPLE | ATTR_INVERTED_INDEX attribute, we just need to
                        # recreate the delete trigger (and remove any rows from the
                        # inverted index's map for this object type if we're removing
                        # an association with that ivtidx).  For now we will force a
                        # rebuild since I'm too lazy to implement the proper way.
                        table_needs_rebuild = True

                        if attr_name in cur_type_attrs and not cur_type_attrs[attr_name][1] & ATTR_INVERTED_INDEX:
                            # FIXME: if we add an inverted index to an existing attribute, we'd
                            # need to reparse that attribute in all rows to populate the inverted
                            # map.  Right now just log a warning.
                            log.warning("Adding inverted index '%s' to existing attribute '%s' not fully "
                                        "implemented; index may be out of sync.", attr_ivtidx, attr_name)

            if not changed:
                return

            if self._readonly:
                raise DatabaseReadOnlyError('upgrade_to_py3() must be called before database can be modified')

            # Update the attr list to merge both existing and new attributes.
            attrs = cur_type_attrs.copy()
            attrs.update(new_attrs)
            new_indexes = set(indexes).difference(cur_type_idx)
            indexes = set(indexes).union(cur_type_idx)
            self._register_check_indexes(indexes, attrs)

            if not table_needs_rebuild:
                # Only simple (i.e. pickled only) attributes are being added,
                # or only new indexes are added, so we don't need to rebuild the
                # table.
                if len(new_attrs):
                    self._db_query("UPDATE types SET attrs_pickle=? WHERE id=?", (dbpickle(attrs), cur_type_id))
                if len(new_indexes):
                    self._register_create_multi_indexes(new_indexes, table_name)
                    self._db_query("UPDATE types SET idx_pickle=? WHERE id=?", (dbpickle(indexes), cur_type_id))
                self.commit()
                self._load_object_types()
                return

            # We need to update the database now ...
        else:
            # New type definition.  Populate attrs with required internal
            # attributes so they get created with the table.
            new_attrs = cur_type_id = None
            # Merge standard attributes with user attributes for this new type.
            attrs.update({
                'id': (int, ATTR_SEARCHABLE, None, None),
                'parent_type': (int, ATTR_SEARCHABLE, None, None),
                'parent_id': (int, ATTR_SEARCHABLE, None, None),
                'pickle': (RAW_TYPE, ATTR_SEARCHABLE, None, None)
            })
            self._register_check_indexes(indexes, attrs)

        create_stmt = 'CREATE TABLE %s_tmp (' % table_name

        # Iterate through type attributes and append to SQL create statement.
        sql_types = {int: 'INTEGER', long: 'INTEGER', float: 'FLOAT', RAW_TYPE: 'BLOB',
                     UNICODE_TYPE: 'TEXT', BYTES_TYPE: 'BLOB', bool: 'INTEGER', basestring: 'TEXT'}
        for attr_name, (attr_type, attr_flags, attr_ivtidx, attr_split) in attrs.items():
            if attr_flags & ATTR_SEARCHABLE:
                # Attribute needs to be a column in the table, not a pickled value.
                if attr_type not in sql_types:
                    raise ValueError, "Type '%s' not supported" % str(attr_type)
                create_stmt += '%s %s' % (attr_name, sql_types[attr_type])
                if attr_name == 'id':
                    # Special case, these are auto-incrementing primary keys
                    create_stmt += ' PRIMARY KEY AUTOINCREMENT'
                create_stmt += ','
        create_stmt = create_stmt.rstrip(',') + ')'
        self._db_query(create_stmt)

        # Add this type to the types table, including the attributes
        # dictionary.
        self._db_query('INSERT OR REPLACE INTO types VALUES(?, ?, ?, ?)',
                       (cur_type_id, type_name, dbpickle(attrs), dbpickle(indexes)))

        # Sync self._object_types with the object type definition we just
        # stored to the db.
        self._load_object_types()

        if new_attrs:
            # Migrate rows from old table to new temporary one.  Here we copy only
            # ATTR_SEARCHABLE columns that exist in both old and new definitions.
            columns = filter(lambda x: cur_type_attrs[x][1] & ATTR_SEARCHABLE and \
                             x in attrs and attrs[x][1] & ATTR_SEARCHABLE, cur_type_attrs.keys())
            columns = ','.join(columns)
            self._db_query('INSERT INTO %s_tmp (%s) SELECT %s FROM %s' % \
                           (table_name, columns, columns, table_name))
            # Delete old table.
            self._db_query('DROP TABLE %s' % table_name)

        # Rename temporary table.
        self._db_query('ALTER TABLE %s_tmp RENAME TO %s' % (table_name, table_name))

        # Increase the objectcount for new inverted indexes, and create a
        # trigger that reduces the objectcount for each applicable inverted
        # index when a row is deleted.
        inverted_indexes = self._get_type_inverted_indexes(type_name)
        if inverted_indexes:
            n_rows = self._db_query_row('SELECT COUNT(*) FROM %s' % table_name)[0]
            sql = 'CREATE TRIGGER delete_object_%s DELETE ON %s BEGIN ' % (type_name, table_name)
            for idx_name in inverted_indexes:
                sql += "UPDATE inverted_indexes SET value=MAX(0, value-1) WHERE name='%s' AND attr='objectcount';" % idx_name
                # Add to objectcount (both in db and cached value)
                self._db_query("UPDATE inverted_indexes SET value=value+? WHERE name=? and attr='objectcount'",
                               (n_rows, idx_name))
                self._inverted_indexes[idx_name]['objectcount'] += n_rows
            sql += 'END'
            self._db_query(sql)

        # Create index for locating all objects under a given parent.
        self._db_query("CREATE INDEX %s_parent_idx on %s (parent_id, "
                       "parent_type)" % (table_name, table_name))

        # If any of these attributes need to be indexed, create the index
        # for that column.
        for attr_name, (attr_type, attr_flags, attr_ivtidx, attr_split) in attrs.items():
            if attr_flags & ATTR_INDEXED:
                self._db_query("CREATE INDEX %s_%s_idx ON %s (%s)" % \
                               (table_name, attr_name, table_name, attr_name))

        # Create multi-column indexes; indexes value has already been verified.
        self._register_create_multi_indexes(indexes, table_name)
        self.commit()
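
    # Illustrative sketch: registration is idempotent, so a plugin can extend
    # an existing type with a new pickled attribute without rebuilding the
    # table (hypothetical names, building on the docstring example above):
    #   db.register_object_type_attrs('msg', mailbox=(unicode, ATTR_SIMPLE))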

    def register_inverted_index(self, name, min=None, max=None, split=None, ignore=None):
        """
        Registers a new inverted index with the database.

        An inverted index maps arbitrary terms to objects and allows you to
        query based on one or more terms.  If the inverted index already
        exists with the given parameters, no action is performed.

        :param name: the name of the inverted index; must be alphanumeric.
        :type name: str
        :param min: the minimum length of terms to index; terms smaller
                    than this will be ignored.  If None (default), there
                    is no minimum size.
        :type min: int
        :param max: the maximum length of terms to index; terms larger than
                    this will be ignored.  If None (default), there is no
                    maximum size.
        :type max: int
        :param split: used to parse string-based attributes using this
                      inverted index into individual terms.  In the case of
                      regexps, the split method will be called.  (If a string
                      is specified, it will be compiled into a regexp first.)
                      If *split* is a callable, it will receive a string of
                      text and must return a sequence, and each item in the
                      sequence will be indexed as an individual term.  If
                      split is not specified, the default is to split words at
                      non-alphanumeric/underscore/digit boundaries.
        :type split: callable, regexp (SRE_Pattern) object, or str
        :param ignore: a list of terms that will not be indexed (so-called
                       *stop words*).  If specified, each indexed term for
                       this inverted index will first be checked against this
                       list.  If it exists, the term is discarded.

        For example::

            from kaa.db import *
            db = Database('test.db')
            db.register_inverted_index('tags')
            db.register_inverted_index('keywords', min=3, max=30, ignore=STOP_WORDS)
        """
        # Verify specified name doesn't already exist as some object attribute.
        for object_name, object_type in self._object_types.items():
            if name in object_type[1] and name != object_type[1][name][2]:
                raise ValueError, "Inverted index name '%s' conflicts with registered attribute in object '%s'" % \
                                  (name, object_name)

        if split is None:
            # Default split regexp is to split words on
            # alphanumeric/digits/underscore boundaries.
            split = re.compile(u"(\d+)|[_\W]", re.U)
        elif isinstance(split, basestring):
            split = re.compile(py3_str(split), re.U)

        if name not in self._inverted_indexes and not self._readonly:
            self._db_query('INSERT INTO inverted_indexes VALUES(?, "objectcount", 0)', (name,))
            # Create the tables needed by the inverted index.
            with self._lock:
                self._db.executescript(CREATE_IVTIDX_TEMPLATE.replace('%IDXNAME%', name))
        elif name in self._inverted_indexes:
            defn = self._inverted_indexes[name]
            if min == defn['min'] and max == defn['max'] and split == defn['split'] and \
               ignore == defn['ignore']:
                # Definition unchanged, nothing to do.
                return

        if self._readonly:
            raise DatabaseReadOnlyError('upgrade_to_py3() must be called before database can be modified')

        defn = {
            'min': min,
            'max': max,
            'split': split,
            'ignore': ignore,
        }

        self._db_query("INSERT OR REPLACE INTO inverted_indexes VALUES(?, 'definition', ?)",
                       (name, dbpickle(defn)))

        defn['objectcount'] = 0
        self._inverted_indexes[name] = defn
        self.commit()
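
    # Illustrative sketch: 'split' accepts any callable mapping a string to a
    # sequence of terms, so the split_path() helper above can back a
    # (hypothetical) filename index:
    #   db.register_inverted_index('paths', split=split_path)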

    def _load_inverted_indexes(self):
        for name, attr, value in self._db_query("SELECT * from inverted_indexes"):
            if name not in self._inverted_indexes:
                self._inverted_indexes[name] = {}
            if attr == 'objectcount':
                self._inverted_indexes[name][attr] = int(value)
            elif attr == 'definition':
                self._inverted_indexes[name].update(dbunpickle(value))

    def _load_object_types(self):
        is_pickle_proto_2 = False
        for id, name, attrs, idx in self._db_query("SELECT * from types"):
            if attrs[1] == 0x02 or idx[1] == 0x02:
                is_pickle_proto_2 = True
            self._object_types[name] = id, dbunpickle(attrs), dbunpickle(idx)

        if sys.hexversion >= 0x03000000 and is_pickle_proto_2:
            self._readonly = True
            log.warning('kaa.db databases created by Python 2 are read-only until upgrade_to_py3() is called')

    def _get_type_inverted_indexes(self, type_name):
        if type_name not in self._object_types:
            return []
        indexed_attrs = set()
        type_attrs = self._object_types[type_name][1]
        for name, (attr_type, flags, attr_ivtidx, attr_split) in type_attrs.items():
            if flags & ATTR_INVERTED_INDEX:
                indexed_attrs.add(attr_ivtidx)
        return list(indexed_attrs)

    def _get_type_attrs(self, type_name):
        return self._object_types[type_name][1]

    def _get_type_id(self, type_name):
        return self._object_types[type_name][0]

    def _make_query_from_attrs(self, query_type, attrs, type_name):
        type_attrs = self._get_type_attrs(type_name)

        # True if an attribute from

[File truncated here; the remaining lines are not shown.]