PageRenderTime 75ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/src/db.py

https://github.com/Dischi/kaa-base
Python | 2571 lines | 2424 code | 23 blank | 124 comment | 51 complexity | 380f4e5e41fb43919618ebe8d2db9bbb MD5 | raw file
Possible License(s): LGPL-2.1
  1. # -----------------------------------------------------------------------------
  2. # db.py - db abstraction module
  3. # -----------------------------------------------------------------------------
  4. # Copyright 2006-2012 Dirk Meyer, Jason Tackaberry
  5. #
  6. # Please see the file AUTHORS for a complete list of authors.
  7. #
  8. # This library is free software; you can redistribute it and/or modify
  9. # it under the terms of the GNU Lesser General Public License version
  10. # 2.1 as published by the Free Software Foundation.
  11. #
  12. # This library is distributed in the hope that it will be useful, but
  13. # WITHOUT ANY WARRANTY; without even the implied warranty of
  14. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. # Lesser General Public License for more details.
  16. #
  17. # You should have received a copy of the GNU Lesser General Public
  18. # License along with this library; if not, write to the Free Software
  19. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  20. # 02110-1301 USA
  21. #
  22. # -----------------------------------------------------------------------------
  23. from __future__ import absolute_import
  24. __all__ = [
  25. 'Database', 'QExpr', 'DatabaseError', 'DatabaseReadOnlyError',
  26. 'split_path', 'ATTR_SIMPLE', 'ATTR_SEARCHABLE', 'ATTR_IGNORE_CASE',
  27. 'ATTR_INDEXED', 'ATTR_INDEXED_IGNORE_CASE', 'ATTR_INVERTED_INDEX',
  28. 'RAW_TYPE'
  29. ]
  30. # python imports
  31. import sys
  32. import os
  33. import time
  34. import re
  35. import logging
  36. import math
  37. import cPickle
  38. import copy_reg
  39. import _weakref
  40. import threading
  41. try:
  42. # Try a system install of pysqlite
  43. from pysqlite2 import dbapi2 as sqlite
  44. except ImportError:
  45. # Python 2.6 provides sqlite natively, so try that next.
  46. from sqlite3 import dbapi2 as sqlite
  47. # kaa base imports
  48. from .utils import property
  49. from .strutils import py3_str, BYTES_TYPE, UNICODE_TYPE
  50. from .timer import WeakOneShotTimer
  51. from . import main
# Refuse to run against driver/library versions older than the features
# this module relies on.
# NOTE(review): these are lexicographic string comparisons, which misorder
# versions like '3.10' vs '3.3' -- confirm acceptable for these thresholds.
if sqlite.version < '2.1.0':
    raise ImportError('pysqlite 2.1.0 or higher required')
if sqlite.sqlite_version < '3.3.1':
    raise ImportError('sqlite 3.3.1 or higher required')

# get logging object
log = logging.getLogger('kaa.base.db')

# Schema version written to new databases, and the oldest on-disk schema
# version this module can still open (checked in Database._open_db).
SCHEMA_VERSION = 0.2
SCHEMA_VERSION_COMPATIBLE = 0.2

# Bootstrap DDL for a brand-new database; %s is filled with SCHEMA_VERSION.
CREATE_SCHEMA = """
CREATE TABLE meta (
attr TEXT UNIQUE,
value TEXT
);
INSERT INTO meta VALUES('version', %s);
CREATE TABLE types (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT UNIQUE,
attrs_pickle BLOB,
idx_pickle BLOB
);
CREATE TABLE inverted_indexes (
name TEXT,
attr TEXT,
value TEXT
);
CREATE UNIQUE INDEX inverted_indexes_idx on inverted_indexes (name, attr);
"""

# DDL template for the tables backing one inverted index; %IDXNAME% is
# substituted with the index name when the index is registered.  The DELETE
# trigger keeps the per-term reference counts in sync with the map table.
CREATE_IVTIDX_TEMPLATE = """
CREATE TABLE ivtidx_%IDXNAME%_terms (
id INTEGER PRIMARY KEY AUTOINCREMENT,
term TEXT,
count INTEGER
);
CREATE UNIQUE INDEX ivtidx_%IDXNAME%_terms_idx on ivtidx_%IDXNAME%_terms (term);
CREATE TABLE ivtidx_%IDXNAME%_terms_map (
rank INTEGER,
term_id INTEGER,
object_type INTEGER,
object_id INTEGER,
frequency FLOAT
);
CREATE INDEX ivtidx_%IDXNAME%_terms_map_idx ON ivtidx_%IDXNAME%_terms_map (term_id, rank, object_type, object_id);
CREATE INDEX ivtidx_%IDXNAME%_terms_map_object_idx ON ivtidx_%IDXNAME%_terms_map (object_id, object_type, term_id);
CREATE TRIGGER ivtidx_%IDXNAME%_delete_terms_map DELETE ON ivtidx_%IDXNAME%_terms_map
BEGIN
UPDATE ivtidx_%IDXNAME%_terms SET count=MAX(0, count-1) WHERE id=old.term_id;
END;
"""

# Attribute flags: bitmask values combined in attribute definitions.
ATTR_SIMPLE = 0x01
ATTR_SEARCHABLE = 0x02      # Is a SQL column, not a pickled field
ATTR_INDEXED = 0x04         # Will have an SQL index
ATTR_IGNORE_CASE = 0x08     # Store in db as lowercase for searches.
ATTR_INVERTED_INDEX = 0x10  # Attribute associated with an inverted idx
ATTR_INDEXED_IGNORE_CASE = ATTR_INDEXED | ATTR_IGNORE_CASE

# These are special attributes for querying. Attributes with
# these names cannot be registered.
RESERVED_ATTRIBUTES = ('id', 'parent', 'object', 'type', 'limit', 'attrs', 'distinct', 'orattrs')

# Terms skipped by default when populating inverted indexes.
STOP_WORDS = (
    "about", "and", "are", "but", "com", "for", "from", "how", "not",
    "some", "that", "the", "this", "was", "what", "when", "where", "who",
    "will", "with", "the", "www", "http", "org", "of", "on"
)
class PyObjectRow(object):
    """
    ObjectRows are dictionary-like objects that represent an object in
    the database. They are used by pysqlite instead of tuples or indexes.

    ObjectRows support on-demand unpickling of the internally stored pickle
    which contains ATTR_SIMPLE attributes.

    This is the native Python implementation of ObjectRow. There is a
    faster C implementation in the _objectrow extension. This
    implementation is still designed to be efficient -- the C version is
    only about 60-70% faster.
    """
    # A dict containing per-query data: [refcount, idxmap, typemap, pickle_idx]
    # This is constructed once for each query, and each row returned in the
    # query references the same data. Each ObjectRow instance adds to the
    # refcount once initialized, and is decremented when the object is deleted.
    # Once it reaches 0, the entry is removed from the dict.
    queries = {}

    # Use __slots__ as a minor optimization to improve object creation time.
    __slots__ = ('_description', '_object_types', '_type_name', '_row', '_pickle',
                 '_idxmap', '_typemap', '_keys')

    def __init__(self, cursor, row, pickle_dict=None):
        """
        :param cursor: the pysqlite cursor this row came from, or a tuple of
                       (description, object_types) when reconstructed from a
                       pickle (see _pickle_ObjectRow)
        :param row: the raw sqlite result row; row[0] is the object type name
        :param pickle_dict: when given, this is an ad hoc row (e.g. from
                            Database.add()) that simply proxies the dict
        """
        # The following is done per row per query, so it should be as light as
        # possible.
        if pickle_dict:
            # Created outside pysqlite, e.g. from Database.add()
            self._pickle = pickle_dict
            self._idxmap = None
            return

        if isinstance(cursor, tuple):
            # Unpickled row: description and object types supplied directly.
            self._description, self._object_types = cursor
        else:
            self._description = cursor.description
            # cursor._db is a weakref back to the Database (set up by
            # Database._open_db's Cursor subclass).
            self._object_types = cursor._db()._object_types

        self._row = row
        # _pickle: False == no pickle present; None == pickle present but
        # empty; if a dict, is the unpickled dictionary; else it's a byte
        # string containing the pickled data.
        self._pickle = False
        self._type_name = row[0]
        try:
            attrs = self._object_types[self._type_name][1]
        except KeyError:
            raise ValueError("Object type '%s' not defined." % self._type_name)

        # The description object's identity distinguishes queries; reuse the
        # shared per-query maps if this query has been seen already.
        query_key = id(self._description)
        if query_key in PyObjectRow.queries:
            query_info = PyObjectRow.queries[query_key]
            # Increase refcount to the query info
            query_info[0] += 1
            self._idxmap, self._typemap, pickle_idx = query_info[1:]
            if pickle_idx != -1:
                self._pickle = self._row[pickle_idx]
            return

        # Everything below this is done once per query, not per row, so
        # performance isn't quite as critical.
        idxmap = {}  # attr_name -> (idx, pickled, named_ivtdx, type, flags)
        pickle_idx = -1
        # Columns 0 and 1 are skipped (column 0 is the type name; column 1 is
        # presumably the id -- confirm against the query builder).
        for i in range(2, len(self._description)):
            attr_name = self._description[i][0]
            idxmap[attr_name] = i
            if attr_name == 'pickle':
                pickle_idx = i
                self._pickle = self._row[i]

        for attr_name, (attr_type, flags, ivtidx, split) in attrs.items():
            idx = idxmap.get(attr_name, -1)
            # An attribute's authoritative value lives in the pickle when it
            # is ATTR_SIMPLE, or ATTR_INDEXED_IGNORE_CASE (where the sql
            # column only holds a case-folded copy; see __getitem__).
            pickled = flags & ATTR_SIMPLE or (flags & ATTR_INDEXED_IGNORE_CASE == ATTR_INDEXED_IGNORE_CASE)
            idxmap[attr_name] = idx, pickled, attr_name == ivtidx, attr_type, flags

        # Construct dict mapping type id -> type name. Essentially an
        # inversion of _object_types
        typemap = dict((v[0], k) for k, v in self._object_types.items())

        self._idxmap = idxmap
        self._typemap = typemap
        PyObjectRow.queries[query_key] = [1, idxmap, typemap, pickle_idx]


    def __del__(self):
        # Drop this row's reference on the shared per-query data.
        if self._idxmap is None:
            # From Database.add(), pickle only, no pysqlite row.
            return
        query_key = id(self._description)
        query_info = PyObjectRow.queries[query_key]
        query_info[0] -= 1
        if query_info[0] == 0:
            # Refcount for this query info is 0, so remove from global queries
            # dict.
            del PyObjectRow.queries[query_key]


    def __getitem__(self, key):
        """
        Return the value for attribute *key*.  Special keys: 'type' (object
        type name), 'parent' (a (type_name, id) tuple), '_row' (the raw
        sqlite row); an integer key indexes the raw row directly.  Pickled
        attributes trigger on-demand unpickling on first access.
        """
        if self._idxmap is None:
            # From Database.add(), work strictly from pickle
            return self._pickle[key]
        elif key == 'type':
            return self._type_name
        elif key == 'parent':
            type_idx = self._idxmap.get('parent_type', [-1])[0]
            id_idx = self._idxmap.get('parent_id', [-1])[0]
            if type_idx == -1 or id_idx == -1:
                raise KeyError('Parent attribute not available')
            type_id = self._row[type_idx]
            # Fall back to the numeric type id if it isn't a known type.
            return self._typemap.get(type_id, type_id), self._row[id_idx]
        elif key == '_row':
            return self._row
        elif isinstance(key, int):
            return self._row[key]

        attr = self._idxmap[key]
        attr_idx = attr[0]
        is_indexed_ignore_case = (attr[4] & ATTR_INDEXED_IGNORE_CASE == ATTR_INDEXED_IGNORE_CASE)
        if attr_idx == -1:
            # Attribute is not in the sql row
            if attr[1] and self._pickle is None:
                # Pickle is empty, which means this attribute was never
                # assigned a value. Return a default (empty list if attribute
                # is named after an inverted index
                return [] if attr[2] else None
            elif not attr[1] or self._pickle is False:
                # The requested attribute is not in the sqlite row, and neither is the pickle.
                raise KeyError("ObjectRow does not have enough data to provide '%s'" % key)

        if not attr[1]:
            # Not a pickled attribute; take it straight from the sql row.
            value = self._row[attr_idx]
        elif attr_idx >= 0 and not self._pickle and is_indexed_ignore_case:
            # Attribute is ATTR_INDEXED_IGNORE_CASE which means the
            # authoritative source is in the pickle, but we don't have it. So just
            # return what we have.
            value = self._row[attr_idx]
        else:
            if self._pickle and not isinstance(self._pickle, dict):
                # We need to check the pickle but it's not unpickled, so do so now.
                self._pickle = dbunpickle(self._pickle)
            if is_indexed_ignore_case:
                # Original-case value is stored in the pickle under a
                # '__'-prefixed key.
                key = '__' + key
            if key not in self._pickle:
                return [] if attr[2] else None
            else:
                value = self._pickle[key]

        if sys.hexversion < 0x03000000 and (attr[3] == str or attr[3] == buffer):
            # Python 2's pysqlite returns BLOBs as buffers. If the attribute
            # type is string or buffer (RAW_TYPE on Python 2), convert to string.
            return str(value)
        else:
            return value


    def keys(self):
        """Return the list of attribute names available on this row."""
        if not self._idxmap:
            # Ad hoc ObjectRow, proxy to pickle dict.
            return self._pickle.keys()

        if not hasattr(self, '_keys'):
            # Lazily compute and cache the key list: every sql column plus
            # any pickled attribute when the pickle column was fetched.
            self._keys = ['type']
            for name, attr in self._idxmap.items():
                if (attr[0] >= 0 or (attr[1] and self._pickle is not False)) and name != 'pickle':
                    self._keys.append(name)
            if 'parent_type' in self._idxmap and 'parent_id' in self._idxmap:
                self._keys.append('parent')
        return self._keys


    def values(self):
        """Return attribute values in the same order as keys()."""
        if not self._idxmap:
            # Ad hoc ObjectRow, proxy to pickle dict.
            return self._pickle.values()
        return [self[k] for k in self.keys()]


    def items(self):
        """Return (key, value) pairs for all available attributes."""
        return zip(self.keys(), self.values())


    def get(self, key, default=None):
        """Return self[key], or *default* when the attribute is unavailable."""
        try:
            return self[key]
        except KeyError:
            return default


    def has_key(self, key):
        # Python 2 style membership test.
        return key in self.keys()


    def __iter__(self):
        return iter(self.keys())


    def __contains__(self, key):
        if key == 'type' or (key == 'parent' and 'parent_id' in self._idxmap):
            return True
        else:
            # NOTE(review): for ad hoc rows _idxmap is None, so this would
            # raise TypeError -- confirm 'in' is never used on such rows.
            return key in self._idxmap
if sys.hexversion >= 0x03000000:
    # Python 3 branch.
    # NOTE(review): this branch references the 'pickle' module, but only
    # cPickle is imported at the top of the file -- verify 'pickle' is in
    # scope when running under Python 3.
    RAW_TYPE = bytes
    PICKLE_PROTOCOL = 3

    class Proto2Unpickler(pickle._Unpickler):
        """
        In spite of the promise that pickles will be compatible between Python
        releases, Python 3 does the wrong thing with non-unicode strings in
        pickles (BINSTRING pickle opcode). It will try to convert them to
        unicode strings, which is wrong when the intention was to store binary
        data in a Python 2 str.

        This class implements a custom unpickler that will load BINSTRING as
        bytes objects. An exception is made for dictionaries, where BINSTRING
        keys are converted to unicode strings.

        Additionally, it maps the unicode and buffer types to corresponding
        Python 3 types.
        """
        dispatch = pickle._Unpickler.dispatch.copy()

        def load_binstring(self):
            # Decode the 4-byte length that follows the BINSTRING opcode and
            # push the payload as bytes rather than a decoded str.
            len = pickle.mloads(bytes('i', 'ascii') + self.read(4))
            self.append(bytes(self.read(len)))
        dispatch[pickle.BINSTRING[0]] = load_binstring

        def load_short_binstring(self):
            # SHORT_BINSTRING uses a single length byte.
            len = ord(self.read(1))
            self.append(bytes(self.read(len)))
        dispatch[pickle.SHORT_BINSTRING[0]] = load_short_binstring

        def load_setitems(self):
            # After the stock SETITEMS, rewrite bytes dict keys as str keys
            # (an existing str key wins over the converted one).
            super(Proto2Unpickler, self).load_setitems()
            d = self.stack[-1]
            for k, v in d.items():
                if type(k) == bytes:
                    sk = str(k, self.encoding, self.errors)
                    if sk not in d:
                        d[sk] = v
                    del d[k]
        dispatch[pickle.SETITEMS[0]] = load_setitems

        def find_class(self, module, name):
            # Map Python 2 builtins to their Python 3 equivalents.
            if module == '__builtin__':
                if name == 'unicode':
                    return str
                elif name == 'buffer':
                    return bytes
            return super(Proto2Unpickler, self).find_class(module, name)

    def dbunpickle(s):
        """Unpickle a BLOB fetched from the database, routing protocol-2
        (Python 2) pickles through Proto2Unpickler."""
        if s[1] == 0x02:
            # Protocol >= 2 pickles start with the PROTO opcode followed by
            # the version byte, so s[1] == 2 means a Python 2 era pickle.
            import io
            #import pickletools
            #pickletools.dis(io.BytesIO(bytes(s)))
            return Proto2Unpickler(io.BytesIO(bytes(s))).load()
        else:
            return pickle.loads(bytes(s))

    def dbpickle(value):
        """Pickle value for storage as a database BLOB."""
        return bytes(pickle.dumps(value, 3))

    # Need to be able to unpickle pickled buffers from Python 2.
    def _unpickle_buffer(s):
        return bytes(s)

else:
    # Python 2 branch.
    RAW_TYPE = buffer
    PICKLE_PROTOCOL = 2

    def dbunpickle(s):
        """Unpickle a BLOB fetched from the database."""
        return cPickle.loads(str(s))

    def dbpickle(value):
        """Pickle value for storage as a database BLOB."""
        return buffer(cPickle.dumps(value, 2))

    # Python2 can't pickle buffer types, so register a handler for that.
    def _pickle_buffer(b):
        return _unpickle_buffer, (str(b),)

    def _unpickle_buffer(s):
        return str(s)

    copy_reg.pickle(buffer, _pickle_buffer, _unpickle_buffer)
try:
    from . import _objectrow
except ImportError:
    # Use the python-native ObjectRow
    ObjectRow = PyObjectRow
else:
    # Use the faster C-based ObjectRow
    ObjectRow = _objectrow.ObjectRow
    # Expose the custom unpickler to the _objectrow module so it can deal with
    # pickles stored inside DB rows.
    _objectrow.dbunpickle = dbunpickle


# Register a handler for pickling ObjectRow objects.
def _pickle_ObjectRow(o):
    # Query-backed rows are pickled as ((description, object_types), row);
    # ad hoc rows (no description) are reduced to a plain attribute dict.
    if o._description:
        return _unpickle_ObjectRow, ((o._description, o._object_types), o._row)
    else:
        return _unpickle_ObjectRow, (None, None, dict(o.items()))

def _unpickle_ObjectRow(*args):
    return ObjectRow(*args)

copy_reg.pickle(ObjectRow, _pickle_ObjectRow, _unpickle_ObjectRow)
# Splits on runs of digits (captured) and on underscores/non-word characters,
# unicode-aware.  NOTE(review): the ur'' literal is Python 2 only syntax.
PATH_SPLIT = re.compile(ur'(\d+)|[_\W]', re.U | re.X)

def split_path(s):
    """
    Convenience split function for inverted index attributes. Useful for
    attributes that contain filenames. Splits the given string s into
    component parts (directories, filename), discarding the extension and all
    but the last two directories. What's remaining is split into words and the
    result is returned (as a generator).
    """
    dirname, filename = os.path.split(s)
    fname_noext, ext = os.path.splitext(filename)
    # Keep at most the last two directory components -- after dropping the
    # first two path levels -- plus the extensionless filename.
    for part in dirname.strip('/').split(os.path.sep)[2:][-2:] + [fname_noext]:
        for match in PATH_SPLIT.split(part):
            if match:
                yield match
# FIXME: this is flawed. Can use placeholders by taking a n-tuple and
# replacing ? to (?, ?, ..., n) and then extend the query params list with the
# given tuple/list value.
def _list_to_printable(value):
    """
    Takes a list of mixed types and outputs a unicode string. For
    example, given a list [42, 'foo', None, "foo's' string"], this returns
    the string:

        (42, 'foo', NULL, 'foo''s'' string')

    Single quotes are escaped as ''. This is suitable for use in SQL
    queries.

    :raises Exception: if an item is not int/long/float, None, unicode,
                       or bytes.
    """
    fixed_items = []
    for item in value:
        if isinstance(item, (int, long, float)):
            fixed_items.append(str(item))
        elif item == None:
            fixed_items.append("NULL")
        elif isinstance(item, UNICODE_TYPE):
            # Escape embedded single quotes by doubling them.
            fixed_items.append("'%s'" % item.replace("'", "''"))
        elif isinstance(item, BYTES_TYPE):
            fixed_items.append("'%s'" % py3_str(item.replace("'", "''")))
        else:
            raise Exception, "Unsupported type '%s' given to _list_to_printable" % type(item)

    return '(' + ','.join(fixed_items) + ')'
  411. class DatabaseError(Exception):
  412. pass
  413. class DatabaseReadOnlyError(Exception):
  414. pass
  415. class QExpr(object):
  416. """
  417. Flexible query expressions for use with :meth:`kaa.db.Database.query()`
  418. """
  419. def __init__(self, operator, operand):
  420. """
  421. :param operator: ``=``, ``!=``, ``<``, ``<=``, ``>``, ``>=``, ``in``,
  422. ``not in``, ``range``, ``like``, or ``regexp``
  423. :type operator: str
  424. :param operand: the rvalue of the expression; any scalar values as part of
  425. the operand must be the same type as the attribute being
  426. evaluated
  427. Except for ``in``, ``not in``, and ``range``, the operand must be the
  428. type of the registered attribute being evaluated (e.g. unicode, int,
  429. etc.).
  430. The operand for ``in`` and ``not in`` are lists or tuples of the attribute
  431. type, to test inclusion in the given set.
  432. The ``range`` operator accepts a 2-tuple specifying min and max values
  433. for the attribute. The Python expression age=QExpr('range', (20, 30))
  434. translates to ``age >= 20 AND age <= 30``.
  435. """
  436. operator = operator.lower()
  437. assert(operator in ('=', '!=', '<', '<=', '>', '>=', 'in', 'not in', 'range', 'like', 'regexp'))
  438. if operator in ('in', 'not in', 'range'):
  439. assert(isinstance(operand, (list, tuple)))
  440. if operator == 'range':
  441. assert(len(operand) == 2)
  442. self._operator = operator
  443. self._operand = operand
  444. def as_sql(self, var):
  445. if self._operator == "range":
  446. a, b = self._operand
  447. return "%s >= ? AND %s <= ?" % (var, var), (a, b)
  448. elif self._operator in ("in", "not in"):
  449. return "%s %s %s" % (var, self._operator.upper(),
  450. _list_to_printable(self._operand)), ()
  451. else:
  452. return "%s %s ?" % (var, self._operator.upper()), \
  453. (self._operand,)
  454. class RegexpCache(object):
  455. def __init__(self):
  456. self.last_item = None
  457. self.last_expr = None
  458. def __call__(self, expr, item):
  459. if item is None:
  460. return 0
  461. if self.last_item == item and self.last_item is not None and self.last_expr == expr:
  462. return self.last_result
  463. if self.last_expr != expr:
  464. self.last_expr = re.compile(unicode(expr), re.U)
  465. self.last_item = item
  466. # FIXME: bad conversion to unicode!
  467. self.last_result = self.last_expr.match(unicode(item)) is not None
  468. return self.last_result
  469. class Database(object):
    def __init__(self, dbfile):
        """
        Open a database, creating one if it doesn't already exist.

        :param dbfile: path to the database file
        :type dbfile: str

        SQLite is used to provide the underlying database.
        """
        super(Database, self).__init__()
        # _object_types dict is keyed on type name, where value is a 3-
        # tuple (id, attrs, idx), where:
        #   - id is a unique numeric database id for the type,
        #   - attrs is a dict containing registered attributes for the type,
        #     keyed on attribute name, where value is a 3-tuple of (type, flags,
        #     ivtidx), where type is a python datatype, flags is a bitmask of
        #     ATTR_*, ivtidx is the name of the associated inverted index (used
        #     if flags has ATTR_INVERTED_INDEX, otherwise None)
        #   - idx is a list of n-tuples, where each n-tuple defines two or more
        #     (non-ATTR_SIMPLE) attributes on which to create a multi-column
        #     sql index.
        self._object_types = {}
        # _inverted_indexes dict is keyed on index name, where value is
        # a dict keyed on:
        #   - min: minimum length of terms
        #   - max: maximum length of terms
        #   - ignore: list of terms to ignore
        #   - split: function or regular expression used to split string
        #     ATTR_INVERTED_INDEX attributes.
        self._inverted_indexes = {}
        # True when there are uncommitted changes
        self._dirty = False
        # True when modifications are not allowed to the database, which
        # is the case when Python 3 is opening a database created by Python 2
        # and upgrade_to_py3() has not been called.
        self._readonly = False
        # Normalize to an absolute, symlink-resolved path.
        self._dbfile = os.path.realpath(dbfile)
        # Serializes access to the sqlite connection across threads
        # (the connection is opened with check_same_thread=False).
        self._lock = threading.RLock()
        # Weak timer so a pending lazy commit doesn't keep this object alive.
        self._lazy_commit_timer = WeakOneShotTimer(self.commit)
        # Seconds between a modification and its automatic commit; None
        # disables lazy commits (see _set_dirty()).
        self._lazy_commit_interval = None
        self._open_db()
    def _open_db(self):
        """
        Connect to the SQLite file and initialize cursors, pragmas and schema.

        Creates the schema for an empty database, raises DatabaseError when
        the on-disk schema is older than SCHEMA_VERSION_COMPATIBLE, and
        loads registered inverted indexes and object types.
        """
        # check_same_thread=False: cross-thread access is serialized by
        # self._lock instead.
        self._db = sqlite.connect(self._dbfile, check_same_thread=False)

        # Create the function "regexp" for the REGEXP operator of SQLite
        self._db.create_function("regexp", 2, RegexpCache())

        self._cursor = self._db.cursor()

        # Cursor subclass carrying a weakref back to this Database; ObjectRow
        # uses it to reach _object_types without creating a reference cycle.
        class Cursor(sqlite.Cursor):
            _db = _weakref.ref(self)
        self._db.row_factory = ObjectRow
        # Queries done through this cursor will use the ObjectRow row factory.
        self._qcursor = self._db.cursor(Cursor)

        # Performance-oriented pragmas (synchronous=OFF trades durability
        # for speed).
        for cursor in self._cursor, self._qcursor:
            cursor.execute("PRAGMA synchronous=OFF")
            cursor.execute("PRAGMA temp_store=MEMORY")
            cursor.execute("PRAGMA cache_size=50000")
            cursor.execute("PRAGMA page_size=8192")

        if not self._check_table_exists("meta"):
            # Fresh database: create the initial schema.
            self._db.executescript(CREATE_SCHEMA % SCHEMA_VERSION)

        row = self._db_query_row("SELECT value FROM meta WHERE attr='version'")
        if float(row[0]) < SCHEMA_VERSION_COMPATIBLE:
            raise DatabaseError("Database '%s' has schema version %s; required %s" % \
                                (self._dbfile, row[0], SCHEMA_VERSION_COMPATIBLE))

        self._load_inverted_indexes()
        self._load_object_types()
  532. def _set_dirty(self):
  533. if self._lazy_commit_interval is not None:
  534. self._lazy_commit_timer.start(self._lazy_commit_interval)
  535. if self._dirty:
  536. return
  537. self._dirty = True
  538. main.signals['exit'].connect(self.commit)
  539. def _db_query(self, statement, args = (), cursor = None, many = False):
  540. t0=time.time()
  541. with self._lock:
  542. if not cursor:
  543. cursor = self._cursor
  544. if many:
  545. cursor.executemany(statement, args)
  546. else:
  547. cursor.execute(statement, args)
  548. rows = cursor.fetchall()
  549. t1=time.time()
  550. #print "QUERY [%.06f%s]: %s" % (t1-t0, ('', ' (many)')[many], statement), args
  551. return rows
  552. def _db_query_row(self, statement, args = (), cursor = None):
  553. rows = self._db_query(statement, args, cursor)
  554. if len(rows) == 0:
  555. return None
  556. return rows[0]
    def _to_obj_tuple(self, obj, numeric=False):
        """
        Returns a normalized object reference as a 2-tuple (type, id).

        :param obj: an ObjectRow, or 2-tuple (type, id)
        :param numeric: if True, coerce type name to a type id

        Raises ValueError if obj is not valid.
        """
        if isinstance(obj, ObjectRow):
            object_type, object_id = obj['type'], obj['id']
        else:
            try:
                object_type, object_id = obj
                # The id portion may also be a QExpr (query expression).
                if not isinstance(object_type, (int, basestring)) or not isinstance(object_id, (int, long, QExpr)):
                    raise TypeError
            except TypeError:
                # Covers both a non-unpackable obj and the explicit raise
                # above for wrong element types.
                raise ValueError('Object reference must be either ObjectRow, or (type, id), got %s' % obj)
        if numeric:
            object_type = self._get_type_id(object_type)
        return object_type, object_id
  576. def _check_table_exists(self, table):
  577. res = self._db_query_row("SELECT name FROM sqlite_master where " \
  578. "name=? and type='table'", (table,))
  579. return res != None
    def _register_check_indexes(self, indexes, attrs):
        """
        Validate multi-column index definitions against registered attributes.

        :param indexes: list of tuples of attribute names, one tuple per
                        composite index
        :param attrs: attribute definitions keyed on name (value is a tuple
                      whose second element is the ATTR_* flags bitmask)
        :raises ValueError: if an index is not a list/tuple, references an
                            unknown attribute, or references an attribute
                            that cannot be part of an SQL index
        """
        for cols in indexes:
            if not isinstance(cols, (list, tuple)):
                raise ValueError, "Single column index specified ('%s') where multi-column index expected." % cols
            for col in cols:
                errstr = "Multi-column index (%s) contains" % ",".join(cols)
                if col not in attrs:
                    raise ValueError, "%s unknown attribute '%s'" % (errstr, col)
                if not attrs[col][1]:
                    # NOTE(review): the message implies ATTR_SIMPLE attributes
                    # are rejected, but 'not attrs[col][1]' only rejects a
                    # zero/empty flags value -- confirm the intended check.
                    raise ValueError, "%s ATTR_SIMPLE attribute '%s'" % (errstr, col)
  590. def _register_create_multi_indexes(self, indexes, table_name):
  591. for cols in indexes:
  592. self._db_query("CREATE INDEX %s_%s_idx ON %s (%s)" % \
  593. (table_name, "_".join(cols), table_name, ",".join(cols)))
  594. def register_object_type_attrs(self, type_name, indexes = [], **attrs):
  595. """
  596. Register one or more object attributes and/or multi-column indexes for
  597. the given type name.
  598. This function modifies the database as needed to accommodate new
  599. indexes and attributes, either by creating the object's tables (in the
  600. case of a new object type) or by altering the object's tables to add
  601. new columns or indexes.
  602. This method is idempotent: if the attributes and indexes specified have
  603. not changed from previous invocations, no changes will be made to the
  604. database. Moreover, newly registered attributes will not affect
  605. previously registered attributes. This allows, for example, a plugin
  606. to extend an existing object type created by the core application
  607. without interfering with it.
  608. :param type_name: the name of object type the registered attributes or
  609. indexes apply to.
  610. :type type_name: str
  611. :param indexes: a list of tuples where each tuple contains 2 or more
  612. registered :attr:`~kaa.db.ATTR_SEARCHABLE` attributes
  613. for which a composite index will be created in the
  614. underlying database. This is useful for speeding
  615. up queries involving these attributes combined.
  616. :type indexes: list of tuples of strings
  617. :param attrs: keyword arguments defining the attributes to be
  618. registered. The keyword defining the attribute name
  619. cannot conflict with any of the names in
  620. :attr:`~kaa.db.RESERVED_ATTRIBUTES`. See below for a
  621. more complete specification of the value.
  622. :type attrs: 2, 3, or 4-tuple
  623. Previously registered attributes may be updated in limited ways (e.g.
  624. by adding an index to the attribute). If the change requested is
  625. not supported, a ValueError will be raised.
  626. .. note:: Currently, indexes and attributes can only be added, not
  627. removed. That is, once an attribute or index is added, it lives
  628. forever.
  629. Object attributes, which are supplied as keyword arguments, are either
  630. *searchable* or *simple*. *Searchable* attributes occupy a column in
  631. the underlying database table and so queries can be performed on these
  632. attributes, but their types are more restricted. *Simple* attributes
  633. can be any type that can be pickled, but can't be searched.
  634. The attribute kwarg value is a tuple of 2 to 4 items in length and in
  635. the form (attr_type, flags, ivtidx, split).
  636. * attr_type: the type of the object attribute. For simple attributes
  637. (:attr:`~kaa.db.ATTR_SIMPLE` in *flags*), this can be any picklable
  638. type; for searchable attributes (:attr:`~kaa.db.ATTR_SEARCHABLE`
  639. in *flags*), this must be either *int*, *float*, *str*, *unicode*,
  640. *bytes*, or *bool*. (On Python 2.5, you can use ``kaa.db.RAW_TYPE``
  641. instead of *bytes*.)
  642. * flags: a bitmap of :ref:`attribute flags <attrflags>`
  643. * ivtidx: name of a previously registered inverted index used for
  644. this attribute. Only needed if flags contains
  645. :attr:`~kaa.db.ATTR_INVERTED_INDEX`
  646. * split: function or regular expression used to split string-based
  647. values for this attribute into separate terms for indexing. If
  648. this isn't defined, then the default split strategy for the
  649. inverted index wil be used.
  650. Apart from not being allowed to conflict with one of the reserved
  651. names, there is a special case for attribute names: when they have
  652. the same name as a previously registered inverted index. These
  653. attributes must be :attr:`~kaa.db.ATTR_SIMPLE`, and of type *list*.
  654. Terms explicitly associated with the attribute are persisted with the
  655. object, but when accessed, all terms for all attributes for that
  656. inverted index will be contained in the list, not just those explicitly
  657. associated with the same-named attribute.
  658. The following example shows what an application that indexes email might
  659. do::
  660. from kaa.db import *
  661. from datetime import datetime
  662. db = Database('email.db')
  663. db.register_inverted_index('keywords', min=3, max=30)
  664. db.register_object_type_attrs('msg',
  665. # Create a composite index on sender and recipient, because
  666. # (let's suppose) it's we do a lot of searches for specific
  667. # senders emailing specific recipients.
  668. [('sender', 'recipient')],
  669. # Simple attribute can be anything that's picklable, which datetime is.
  670. date = (datetime, ATTR_SIMPLE),
  671. # Sender and recipient names need to be ATTR_SEARCHABLE since
  672. # they're part of a composite index.
  673. sender = (unicode, ATTR_SEARCHABLE),
  674. recipient = (unicode, ATTR_SEARCHABLE),
  675. # Subject is searchable (standard SQL-based substring matches),
  676. # but also being indexed as part of the keywords inverted
  677. # index for fast term-based searching.
  678. subject = (unicode, ATTR_SEARCHABLE | ATTR_INVERTED_INDEX, 'keywords'),
  679. # Special case where an attribute name is the same as a registered
  680. # inverted index. This lets us index on, for example, the message body
  681. # without actually storing the message inside the database.
  682. keywords = (list, ATTR_SIMPLE | ATTR_INVERTED_INDEX, 'keywords')
  683. )
  684. """
  685. if len(indexes) == len(attrs) == 0:
  686. raise ValueError, "Must specify indexes or attributes for object type"
  687. table_name = "objects_%s" % type_name
  688. # First pass over the attributes kwargs, sanity-checking provided values.
  689. for attr_name, attr_defn in attrs.items():
  690. # We allow attribute definition to be either a 2- to 4-tuple (last two
  691. # are optional), so pad the tuple with None if a 2- or 3-tuple was specified.
  692. attrs[attr_name] = attr_defn = tuple(attr_defn) + (None,) * (4-len(attr_defn))
  693. if len(attr_defn) != 4:
  694. raise ValueError, "Definition for attribute '%s' is not a 2- to 4-tuple." % attr_name
  695. # Verify the attribute flags contain either ATTR_SEARCHABLE or ATTR_SIMPLE;
  696. # it can't contain both as that doesn't make sense.
  697. if attr_defn[1] & (ATTR_SIMPLE | ATTR_SEARCHABLE) not in (ATTR_SIMPLE, ATTR_SEARCHABLE):
  698. raise ValueError, "Flags for attribute '%s' must contain exactly one " \
  699. "of ATTR_SIMPLE or ATTR_SEARCHABLE" % attr_name
  700. # Attribute name can't conflict with reserved names.
  701. if attr_name in RESERVED_ATTRIBUTES:
  702. raise ValueError, "Attribute name '%s' is reserved." % attr_name
  703. elif attr_name in self._inverted_indexes:
  704. if not attr_defn[1] & ATTR_INVERTED_INDEX or attr_defn[2] != attr_name:
  705. # Attributes can be named after inverted indexes, but only if
  706. # ATTR_INVERTED_INDEX is specified and the attribute name is the
  707. # same as its ivtidx name.
  708. raise ValueError, "Attribute '%s' conflicts with inverted index of same name, " \
  709. "but ATTR_INVERTED_INDEX not specified in flags." % attr_name
  710. if attr_defn[1] & ATTR_INVERTED_INDEX:
  711. # Attributes with ATTR_INVERTED_INDEX can only be certain types.
  712. if attr_defn[0] not in (str, unicode, tuple, list, set):
  713. raise TypeError, "Type for attribute '%s' must be string, unicode, list, tuple, or set " \
  714. "because it is ATTR_INVERTED_INDEX" % attr_name
  715. # Make sure inverted index name is valid.
  716. if attr_defn[2] is None:
  717. raise ValueError, "Attribute '%s' flags specify inverted index, " \
  718. "but no inverted index name supplied." % attr_name
  719. elif attr_defn[2] not in self._inverted_indexes:
  720. raise ValueError, "Attribute '%s' specifies undefined interverted index '%s'" % \
  721. (attr_name, attr_defn[2])
  722. # Compile split regexp if it was given.
  723. if attr_defn[3] is not None and not callable(attr_defn[3]):
  724. attrs[attr_name] = attr_defn[:3] + (re.compile(attr_defn[3]),)
  725. if type_name in self._object_types:
  726. # This type already exists. Compare given attributes with
  727. # existing attributes for this type to see what needs to be done
  728. # (if anything).
  729. cur_type_id, cur_type_attrs, cur_type_idx = self._object_types[type_name]
  730. new_attrs = {}
  731. table_needs_rebuild = False
  732. changed = False
  733. for attr_name, attr_defn in attrs.items():
  734. attr_type, attr_flags, attr_ivtidx, attr_split = attr_defn
  735. # TODO: converting an attribute from SIMPLE to SEARCHABLE or vice
  736. # versa isn't supported yet. Raise exception here to prevent
  737. # potential data loss.
  738. if attr_name in cur_type_attrs and attr_flags & (ATTR_SEARCHABLE | ATTR_SIMPLE) != \
  739. cur_type_attrs[attr_name][1] & (ATTR_SEARCHABLE | ATTR_SIMPLE):
  740. raise ValueError, "Unsupported attempt to convert attribute '%s' " \
  741. "between ATTR_SIMPLE and ATTR_SEARCHABLE" % attr_name
  742. if attr_name not in cur_type_attrs or cur_type_attrs[attr_name] != attr_defn:
  743. # There is a new attribute specified for this type, or an
  744. # existing one has changed.
  745. new_attrs[attr_name] = attr_defn
  746. changed = True
  747. if attr_flags & ATTR_SEARCHABLE:
  748. # New attribute isn't simple, needs to alter table.
  749. table_needs_rebuild = True
  750. elif attr_flags & ATTR_INVERTED_INDEX:
  751. # TODO: there is no need to rebuild the table when adding/modifying
  752. # an ATTR_SIMPLE | ATTR_INVERTED_INDEX attribute, we just need to
  753. # recreate the delete trigger (and remove any rows from the
  754. # inverted index's map for this object type if we're removing
  755. # an association with that ivtidx). For now we will force a
  756. # rebuild since I'm too lazy to implement the proper way.
  757. table_needs_rebuild = True
  758. if attr_name in cur_type_attrs and not cur_type_attrs[attr_name][1] & ATTR_INVERTED_INDEX:
  759. # FIXME: if we add an inverted index to an existing attribute, we'd
  760. # need to reparse that attribute in all rows to populate the inverted
  761. # map. Right now just log a warning.
  762. log.warning("Adding inverted index '%s' to existing attribute '%s' not fully " \
  763. "implemented; index may be out of sync.", attr_ivtidx, attr_name)
  764. if not changed:
  765. return
  766. if self._readonly:
  767. raise DatabaseReadOnlyError('upgrade_to_py3() must be called before database can be modified')
  768. # Update the attr list to merge both existing and new attributes.
  769. attrs = cur_type_attrs.copy()
  770. attrs.update(new_attrs)
  771. new_indexes = set(indexes).difference(cur_type_idx)
  772. indexes = set(indexes).union(cur_type_idx)
  773. self._register_check_indexes(indexes, attrs)
  774. if not table_needs_rebuild:
  775. # Only simple (i.e. pickled only) attributes are being added,
  776. # or only new indexes are added, so we don't need to rebuild the
  777. # table.
  778. if len(new_attrs):
  779. self._db_query("UPDATE types SET attrs_pickle=? WHERE id=?", (dbpickle(attrs), cur_type_id))
  780. if len(new_indexes):
  781. self._register_create_multi_indexes(new_indexes, table_name)
  782. self._db_query("UPDATE types SET idx_pickle=? WHERE id=?", (dbpickle(indexes), cur_type_id))
  783. self.commit()
  784. self._load_object_types()
  785. return
  786. # We need to update the database now ...
  787. else:
  788. # New type definition. Populate attrs with required internal
  789. # attributes so they get created with the table.
  790. new_attrs = cur_type_id = None
  791. # Merge standard attributes with user attributes for this new type.
  792. attrs.update({
  793. 'id': (int, ATTR_SEARCHABLE, None, None),
  794. 'parent_type': (int, ATTR_SEARCHABLE, None, None),
  795. 'parent_id': (int, ATTR_SEARCHABLE, None, None),
  796. 'pickle': (RAW_TYPE, ATTR_SEARCHABLE, None, None)
  797. })
  798. self._register_check_indexes(indexes, attrs)
  799. create_stmt = 'CREATE TABLE %s_tmp (' % table_name
  800. # Iterate through type attributes and append to SQL create statement.
  801. sql_types = {int: 'INTEGER', long: 'INTEGER', float: 'FLOAT', RAW_TYPE: 'BLOB',
  802. UNICODE_TYPE: 'TEXT', BYTES_TYPE: 'BLOB', bool: 'INTEGER', basestring: 'TEXT'}
  803. for attr_name, (attr_type, attr_flags, attr_ivtidx, attr_split) in attrs.items():
  804. if attr_flags & ATTR_SEARCHABLE:
  805. # Attribute needs to be a column in the table, not a pickled value.
  806. if attr_type not in sql_types:
  807. raise ValueError, "Type '%s' not supported" % str(attr_type)
  808. create_stmt += '%s %s' % (attr_name, sql_types[attr_type])
  809. if attr_name == 'id':
  810. # Special case, these are auto-incrementing primary keys
  811. create_stmt += ' PRIMARY KEY AUTOINCREMENT'
  812. create_stmt += ','
  813. create_stmt = create_stmt.rstrip(',') + ')'
  814. self._db_query(create_stmt)
  815. # Add this type to the types table, including the attributes
  816. # dictionary.
  817. self._db_query('INSERT OR REPLACE INTO types VALUES(?, ?, ?, ?)',
  818. (cur_type_id, type_name, dbpickle(attrs), dbpickle(indexes)))
  819. # Sync self._object_types with the object type definition we just
  820. # stored to the db.
  821. self._load_object_types()
  822. if new_attrs:
  823. # Migrate rows from old table to new temporary one. Here we copy only
  824. # ATTR_SEARCHABLE columns that exist in both old and new definitions.
  825. columns = filter(lambda x: cur_type_attrs[x][1] & ATTR_SEARCHABLE and \
  826. x in attrs and attrs[x][1] & ATTR_SEARCHABLE, cur_type_attrs.keys())
  827. columns = ','.join(columns)
  828. self._db_query('INSERT INTO %s_tmp (%s) SELECT %s FROM %s' % \
  829. (table_name, columns, columns, table_name))
  830. # Delete old table.
  831. self._db_query('DROP TABLE %s' % table_name)
  832. # Rename temporary table.
  833. self._db_query('ALTER TABLE %s_tmp RENAME TO %s' % (table_name, table_name))
  834. # Increase the objectcount for new inverted indexes, and create a
  835. # trigger that reduces the objectcount for each applicable inverted
  836. # index when a row is deleted.
  837. inverted_indexes = self._get_type_inverted_indexes(type_name)
  838. if inverted_indexes:
  839. n_rows = self._db_query_row('SELECT COUNT(*) FROM %s' % table_name)[0]
  840. sql = 'CREATE TRIGGER delete_object_%s DELETE ON %s BEGIN ' % (type_name, table_name)
  841. for idx_name in inverted_indexes:
  842. sql += "UPDATE inverted_indexes SET value=MAX(0, value-1) WHERE name='%s' AND attr='objectcount';" % idx_name
  843. # Add to objectcount (both in db and cached value)
  844. self._db_query("UPDATE inverted_indexes SET value=value+? WHERE name=? and attr='objectcount'",
  845. (n_rows, idx_name))
  846. self._inverted_indexes[idx_name]['objectcount'] += n_rows
  847. sql += 'END'
  848. self._db_query(sql)
  849. # Create index for locating all objects under a given parent.
  850. self._db_query("CREATE INDEX %s_parent_idx on %s (parent_id, "\
  851. "parent_type)" % (table_name, table_name))
  852. # If any of these attributes need to be indexed, create the index
  853. # for that column.
  854. for attr_name, (attr_type, attr_flags, attr_ivtidx, attr_split) in attrs.items():
  855. if attr_flags & ATTR_INDEXED:
  856. self._db_query("CREATE INDEX %s_%s_idx ON %s (%s)" % \
  857. (table_name, attr_name, table_name, attr_name))
  858. # Create multi-column indexes; indexes value has already been verified.
  859. self._register_create_multi_indexes(indexes, table_name)
  860. self.commit()
  861. def register_inverted_index(self, name, min = None, max = None, split = None, ignore = None):
  862. """
  863. Registers a new inverted index with the database.
  864. An inverted index maps arbitrary terms to objects and allows you to
  865. query based on one or more terms. If the inverted index already exists
  866. with the given parameters, no action is performed.
  867. :param name: the name of the inverted index; must be alphanumeric.
  868. :type name: str
  869. :param min: the minimum length of terms to index; terms smaller
  870. than this will be ignored. If None (default), there
  871. is no minimum size.
  872. :type min: int
  873. :param max: the maximum length of terms to index; terms larger than
  874. this will be ignored. If None (default), there is no
  875. maximum size.
  876. :type max: int
  877. :param split: used to parse string-based attributes using this inverted
  878. index into individual terms. In the case of regexps, the
  879. split method will be called. (If a string is specified,
  880. it will be compiled into a regexp first.) If *split* is
  881. a callable, it will receive a string of text and must return
  882. a sequence, and each item in the sequence will be indexed
  883. as an individual term. If split is not specified, the
  884. default is to split words at non-alphanumeric/underscore/digit
  885. boundaries.
  886. :type split: callable, regexp (SRE_Pattern) object, or str
  887. :param ignore: a list of terms that will not be indexed (so-called
  888. *stop words*). If specified, each indexed term for this
  889. inverted index will first be checked against this list.
  890. If it exists, the term is discarded.
  891. For example::
  892. from kaa.db import *
  893. db = Database('test.db')
  894. db.register_inverted_index('tags')
  895. db.register_inverted_index('keywords', min=3, max=30, ignore=STOP_WORDS)
  896. """
  897. # Verify specified name doesn't already exist as some object attribute.
  898. for object_name, object_type in self._object_types.items():
  899. if name in object_type[1] and name != object_type[1][name][2]:
  900. raise ValueError, "Inverted index name '%s' conflicts with registered attribute in object '%s'" % \
  901. (name, object_name)
  902. if split is None:
  903. # Default split regexp is to split words on
  904. # alphanumeric/digits/underscore boundaries.
  905. split = re.compile(u"(\d+)|[_\W]", re.U)
  906. elif isinstance(split, basestring):
  907. split = re.compile(py3_str(split), re.U)
  908. if name not in self._inverted_indexes and not self._readonly:
  909. self._db_query('INSERT INTO inverted_indexes VALUES(?, "objectcount", 0)', (name,))
  910. # Create the tables needed by the inverted index.
  911. with self._lock:
  912. self._db.executescript(CREATE_IVTIDX_TEMPLATE.replace('%IDXNAME%', name))
  913. elif name in self._inverted_indexes:
  914. defn = self._inverted_indexes[name]
  915. if min == defn['min'] and max == defn['max'] and split == defn['split'] and \
  916. ignore == defn['ignore']:
  917. # Definition unchanged, nothing to do.
  918. return
  919. if self._readonly:
  920. raise DatabaseReadOnlyError('upgrade_to_py3() must be called before database can be modified')
  921. defn = {
  922. 'min': min,
  923. 'max': max,
  924. 'split': split,
  925. 'ignore': ignore,
  926. }
  927. self._db_query("INSERT OR REPLACE INTO inverted_indexes VALUES(?, 'definition', ?)",
  928. (name, dbpickle(defn)))
  929. defn['objectcount'] = 0
  930. self._inverted_indexes[name] = defn
  931. self.commit()
  932. def _load_inverted_indexes(self):
  933. for name, attr, value in self._db_query("SELECT * from inverted_indexes"):
  934. if name not in self._inverted_indexes:
  935. self._inverted_indexes[name] = {}
  936. if attr == 'objectcount':
  937. self._inverted_indexes[name][attr] = int(value)
  938. elif attr == 'definition':
  939. self._inverted_indexes[name].update(dbunpickle(value))
  940. def _load_object_types(self):
  941. is_pickle_proto_2 = False
  942. for id, name, attrs, idx in self._db_query("SELECT * from types"):
  943. if attrs[1] == 0x02 or idx[1] == 0x02:
  944. is_pickle_proto_2 = True
  945. self._object_types[name] = id, dbunpickle(attrs), dbunpickle(idx)
  946. if sys.hexversion >= 0x03000000 and is_pickle_proto_2:
  947. self._readonly = True
  948. log.warning('kaa.db databases created by Python 2 are read-only until upgrade_to_py3() is called')
  949. def _get_type_inverted_indexes(self, type_name):
  950. if type_name not in self._object_types:
  951. return []
  952. indexed_attrs = set()
  953. type_attrs = self._object_types[type_name][1]
  954. for name, (attr_type, flags, attr_ivtidx, attr_split) in type_attrs.items():
  955. if flags & ATTR_INVERTED_INDEX:
  956. indexed_attrs.add(attr_ivtidx)
  957. return list(indexed_attrs)
  958. def _get_type_attrs(self, type_name):
  959. return self._object_types[type_name][1]
  960. def _get_type_id(self, type_name):
  961. return self._object_types[type_name][0]
    def _make_query_from_attrs(self, query_type, attrs, type_name):
        """
        Build an INSERT or UPDATE SQL statement, plus its value list, for the
        given object type from a dict of attribute values.

        :param query_type: 'add' generates an INSERT; any other value (in
                           practice 'update') generates an UPDATE keyed on
                           attrs['id'].
        :param attrs: attribute name -> value mapping.  NOTE: mutated in
                      place -- None-valued entries are removed.
        :param type_name: name of a registered object type.
        :returns: a (sql, values) 2-tuple suitable for _db_query()
        :raises ValueError: if attrs references a name that is neither a
                            registered attribute nor an inverted index
        :raises TypeError: if a searchable attribute value doesn't match its
                           registered type
        """
        type_attrs = self._get_type_attrs(type_name)
        # True if an attribute from the pickle was removed and we must force an update.
        pickle_attr_removed = False
        columns = []
        values = []
        placeholders = []

        # First pass: validate names and strip None values.  (Relies on
        # Python 2 dict.keys() returning a list snapshot, making deletion
        # during iteration safe.)
        for key in attrs.keys():
            if key not in type_attrs:
                if key in self._inverted_indexes:
                    # Name refers to an inverted index that isn't a registered
                    # attribute; it's handled in the pickle pass below, so
                    # it's not an error here.
                    continue
                raise ValueError("Reference to undefined attribute '%s' for type '%s'" % (key, type_name))
            if attrs[key] == None:
                # Remove all None attributes (even ATTR_SEARCHABLE), otherwise we will
                # raise a TypeError later, since NoneType isn't the right type.
                if type_attrs[key][1] & ATTR_SIMPLE:
                    # Attribute removed from a pickle, be sure we update the pickle in
                    # the db if this is an 'update' query_type.
                    pickle_attr_removed = True
                del attrs[key]

        # attrs_copy accumulates everything destined for the pickle column;
        # searchable attributes are removed from it as they're consumed below.
        attrs_copy = attrs.copy()
        for name, (attr_type, flags, attr_ivtidx, attr_split) in type_attrs.items():
            if flags & ATTR_SEARCHABLE and name in attrs:
                columns.append(name)
                placeholders.append("?")
                value = attrs[name]
                # Coercion for numeric types
                if isinstance(value, (int, long, float)) and attr_type in (int, long, float):
                    value = attr_type(value)
                elif isinstance(value, basestring) and \
                     flags & ATTR_INDEXED_IGNORE_CASE == ATTR_INDEXED_IGNORE_CASE:
                    # If the attribute is ATTR_INDEXED_IGNORE_CASE and it's a string
                    # then we store it as lowercase in the table column, while
                    # keeping the original (unchanged case) value in the pickle.
                    # This allows us to do case-insensitive searches on indexed
                    # columns and still benefit from the index.
                    attrs_copy["__" + name] = value
                    value = value.lower()

                if not isinstance(value, attr_type):
                    raise TypeError("Type mismatch in query for %s: '%s' (%s) is not a %s" % \
                                    (name, str(value), str(type(value)), str(attr_type)))
                if attr_type == BYTES_TYPE:
                    # For Python 2, convert non-unicode strings to buffers.  (For Python 3,
                    # BYTES_TYPE == RAW_TYPE so this is a no-op.)
                    value = RAW_TYPE(value)
                values.append(value)
                del attrs_copy[name]

        if len(attrs_copy) > 0 or (pickle_attr_removed and query_type == 'update'):
            # From the remaining attributes, remove those named after inverted
            # indexes that aren't explicitly registered as attributes in the
            # object type.  Here is where we keep our cached copy of inverted
            # index terms.
            for attr in attrs_copy.keys():
                if attr in self._inverted_indexes and attr not in type_attrs:
                    del attrs_copy[attr]

            # What's left gets put into the pickle.
            columns.append("pickle")
            values.append(dbpickle(attrs_copy))
            placeholders.append("?")

        table_name = "objects_" + type_name

        if query_type == "add":
            if columns:
                columns = ",".join(columns)
                placeholders = ",".join(placeholders)
                q = "INSERT INTO %s (%s) VALUES(%s)" % (table_name, columns, placeholders)
            else:
                # Insert empty row (need to specify at least one column to make valid
                # SQL, so just specify id of null, which will assume next value in
                # sequence.
                q = 'INSERT INTO %s (id) VALUES(null)' % table_name
        else:
            q = "UPDATE %s SET " % table_name
            for col, ph in zip(columns, placeholders):
                q += "%s=%s," % (col, ph)
            # Trim off last comma
            q = q.rstrip(",")
            q += " WHERE id=?"
            values.append(attrs["id"])

        return q, values
  1041. def delete(self, obj):
  1042. """
  1043. Delete the specified object.
  1044. :param obj: the object to delete
  1045. :type obj: :class:`ObjectRow` or (object_type, object_id)
  1046. """
  1047. # TODO: support recursive delete (delete all decendents)
  1048. object_type, object_id = self._to_obj_tuple(obj)
  1049. return self._delete_multiple_objects({object_type: (object_id,)})
  1050. def reparent(self, obj, parent):
  1051. """
  1052. Change the parent of an object.
  1053. :param obj: the object to reparent
  1054. :type obj: :class:`ObjectRow`, or (type, id)
  1055. :param parent: the new parent of the object
  1056. :type parent: :class:`ObjectRow`, or (type, id)
  1057. This is a convenience method to improve code readability, and is
  1058. equivalent to::
  1059. database.update(obj, parent=parent)
  1060. """
  1061. return self.update(obj, parent=parent)
  1062. def retype(self, obj, new_type):
  1063. """
  1064. Convert the object to a new type.
  1065. :param obj: the object to be converted to the new type
  1066. :type obj: :class:`ObjectRow`, or (type, id)
  1067. :param new_type: the type to convert the object to
  1068. :type newtype: str
  1069. :returns: an :class:`ObjectRow`, converted to the new type with the new id
  1070. Any attribute that has not also been registered with ``new_type``
  1071. (and with the same name) will be removed. Because the object is
  1072. effectively changing ids, all of its existing children will be
  1073. reparented to the new id.
  1074. """
  1075. if new_type not in self._object_types:
  1076. raise ValueError('Parent type %s not registered in database' % new_type)
  1077. # Reload and force pickled attributes into the dict.
  1078. try:
  1079. attrs = dict(self.get(obj))
  1080. except TypeError:
  1081. raise ValueError('Object (%s, %s) is not found in database' % (obj['type'], obj['id']))
  1082. parent = attrs.get('parent')
  1083. # Remove all attributes that aren't also in the destination type. Also
  1084. # remove type, id, and parent attrs, which get regenerated when we add().
  1085. for attr_name in attrs.keys():
  1086. # TODO: check src and dst attr types and try to coerce, and if
  1087. # not possible, raise an exception.
  1088. if attr_name not in self._object_types[new_type][1] or attr_name in ('type', 'id', 'parent'):
  1089. del attrs[attr_name]
  1090. new_obj = self.add(new_type, parent, **attrs)
  1091. # Reparent all current children to the new id.
  1092. for child in self.query(parent=obj):
  1093. # TODO: if this raises, delete new_obj (to rollback) and reraise.
  1094. self.reparent(child, new_obj)
  1095. self.delete(obj)
  1096. return new_obj
  1097. def delete_by_query(self, **attrs):
  1098. """
  1099. Delete all objects returned by the given query.
  1100. :param attrs: see :meth:`~kaa.db.Database.query` for details.
  1101. :returns: the number of objects deleted
  1102. :rtype: int
  1103. """
  1104. attrs["attrs"] = ["id"]
  1105. results = self.query(**attrs)
  1106. if len(results) == 0:
  1107. return 0
  1108. results_by_type = {}
  1109. for o in results:
  1110. if o["type"] not in results_by_type:
  1111. results_by_type[o["type"]] = []
  1112. results_by_type[o["type"]].append(o["id"])
  1113. return self._delete_multiple_objects(results_by_type)
    def _delete_multiple_objects(self, objects):
        """
        Delete multiple objects and, recursively, all of their descendants.

        :param objects: maps object type name -> sequence of object ids to
                        delete for that type.
        :returns: total number of rows deleted (including descendants)
        :raises DatabaseReadOnlyError: if the database is in read-only mode
        """
        if self._readonly:
            raise DatabaseReadOnlyError('upgrade_to_py3() must be called before database can be modified')
        child_objects = {}
        count = 0
        for object_type, object_ids in objects.items():
            # Purge inverted index terms for the doomed objects first.
            ivtidxes = self._get_type_inverted_indexes(object_type)
            self._delete_multiple_objects_inverted_index_terms({object_type: (ivtidxes, object_ids)})
            object_type_id = self._get_type_id(object_type)
            if len(object_ids) == 0:
                continue

            object_ids_str = _list_to_printable(object_ids)
            self._db_query("DELETE FROM objects_%s WHERE id IN %s" % \
                           (object_type, object_ids_str))
            # rowcount reflects the DELETE just executed on the shared cursor.
            count += self._cursor.rowcount

            # Record all children of this object so we can later delete them.
            # Children may be of any registered type, so scan every table.
            for tp_name, (tp_id, tp_attrs, tp_idx) in self._object_types.items():
                children_ids = self._db_query("SELECT id FROM objects_%s WHERE parent_id IN %s AND parent_type=?" % \
                                              (tp_name, object_ids_str), (object_type_id,))
                if len(children_ids):
                    child_objects[tp_name] = [x[0] for x in children_ids]

        if len(child_objects):
            # If there are any child objects of the objects we just deleted,
            # delete those now (recurses until no descendants remain).
            count += self._delete_multiple_objects(child_objects)

        if count:
            self._set_dirty()
        return count
    def add(self, object_type, parent=None, **attrs):
        """
        Add an object to the database.

        :param object_type: the name of the object type previously created by
                            :meth:`~kaa.db.Database.register_object_type_attrs`.
        :type object_type: str
        :param parent: specifies the parent of this object, if any; does not have
                       to be an object of the same type.
        :type parent: :class:`ObjectRow` or 2-tuple (object_type, object_id)
        :param attrs: keyword arguments specifying the attribute (which must
                      have been registered) values.  Registered attributes that
                      are not explicitly specified here will default to None.
        :returns: :class:`ObjectRow` representing the added object
        :raises DatabaseReadOnlyError: if the database is in read-only mode

        For example::

            import os
            from kaa.db import *
            db = Database('test.db')
            db.register_object_type_attrs('directory',
                name = (str, ATTR_SEARCHABLE),
                mtime = (float, ATTR_SIMPLE)
            )
            root = db.add('directory', name='/', mtime=os.stat('/').st_mtime)
            db.add('directory', parent=root, name='etc', mtime=os.stat('/etc').st_mtime)
            db.add('directory', parent=root, name='var', mtime=os.stat('/var').st_mtime)
        """
        if self._readonly:
            raise DatabaseReadOnlyError('upgrade_to_py3() must be called before database can be modified')
        type_attrs = self._get_type_attrs(object_type)
        if parent:
            attrs['parent_type'], attrs['parent_id'] = self._to_obj_tuple(parent, numeric=True)

        # Increment objectcount for the applicable inverted indexes.
        inverted_indexes = self._get_type_inverted_indexes(object_type)
        if inverted_indexes:
            self._db_query("UPDATE inverted_indexes SET value=value+1 WHERE attr='objectcount' AND name IN %s" % \
                           _list_to_printable(inverted_indexes))

        # Process inverted index maps for this row
        ivtidx_terms = []
        for ivtidx in inverted_indexes:
            # Sync cached objectcount with the DB (that we just updated above)
            self._inverted_indexes[ivtidx]['objectcount'] += 1
            terms_list = []
            split = self._inverted_indexes[ivtidx]['split']
            for name, (attr_type, flags, attr_ivtidx, attr_split) in type_attrs.items():
                if attr_ivtidx == ivtidx and name in attrs:
                    # Per-attribute split overrides the index's default split.
                    terms_list.append((attrs[name], 1.0, attr_split or split, ivtidx))

            if ivtidx in attrs and ivtidx not in type_attrs:
                # Attribute named after an inverted index is given in kwargs,
                # but that ivtidx is not a registered attribute (which would be
                # handled in the for loop just above).
                terms_list.append((attrs[ivtidx], 1.0, split, ivtidx))

            terms = self._score_terms(terms_list)
            if terms:
                ivtidx_terms.append((ivtidx, terms))
                # If there are no terms for this ivtidx, we don't bother storing
                # an empty list in the pickle.
                if ivtidx in type_attrs:
                    # Registered attribute named after ivtidx; store ivtidx
                    # terms in object.
                    attrs[ivtidx] = terms.keys()

        query, values = self._make_query_from_attrs("add", attrs, object_type)
        self._db_query(query, values)

        # Add id given by db, as well as object type.
        attrs['id'] = self._cursor.lastrowid
        attrs['type'] = unicode(object_type)
        attrs['parent'] = self._to_obj_tuple(parent) if parent else (None, None)

        for ivtidx, terms in ivtidx_terms:
            self._add_object_inverted_index_terms((object_type, attrs['id']), ivtidx, terms)

        # Populate dictionary with keys for this object type not specified in kwargs.
        attrs.update(dict.fromkeys([k for k in type_attrs if k not in attrs.keys() + ['pickle']]))

        self._set_dirty()
        return ObjectRow(None, None, attrs)
  1213. def get(self, obj):
  1214. """
  1215. Fetch the given object from the database.
  1216. :param obj: a 2-tuple (type, id) representing the object.
  1217. :returns: :class:`ObjectRow`
  1218. obj may also be an :class:`ObjectRow`, however that usage is less
  1219. likely to be useful, because an :class:`ObjectRow` already contains all
  1220. information about the object. One common use-case is to reload a
  1221. possibly changed object from disk.
  1222. This method is essentially shorthand for::
  1223. database.query(object=(object_type, object_id))[0]
  1224. """
  1225. obj = self._to_obj_tuple(obj)
  1226. rows = self.query(object=obj)
  1227. if rows:
  1228. return rows[0]
  1229. def update(self, obj, parent=None, **attrs):
  1230. """
  1231. Update attributes for an existing object in the database.
  1232. :param obj: the object whose attributes are being modified
  1233. :type obj: :class:`ObjectRow` or 2-tuple (object_type, object_id)
  1234. :param parent: if specified, the object is reparented to the given
  1235. parent object, otherwise the parent remains the
  1236. same as when the object was added with
  1237. :meth:`~kaa.db.Database.add`.
  1238. :type parent: :class:`ObjectRow` or 2-tuple (object_type, object_id)
  1239. :param attrs: keyword arguments specifying the attribute (which must
  1240. have been registered) values. Registered attributes that
  1241. are not explicitly specified here will preserve their
  1242. original values (except for special attributes named
  1243. after inverted index; see warning below).
  1244. Continuing from the example in :meth:`~kaa.db.Database.add`, consider::
  1245. >>> d = db.add('directory', parent=root, name='foo')
  1246. >>> db.update(d, name='bar')
  1247. >>> d = db.get(d) # Reload changes
  1248. >>> d['name']
  1249. 'bar'
  1250. .. warning::
  1251. When updating an attribute associated with an inverted index, all
  1252. terms for that inverted index in the object need to be rescored.
  1253. For special attributes with the same name as inverted indexes, it's
  1254. the caller's responsibility to ensure terms are passed back during
  1255. update.
  1256. In the email example from :meth:`register_object_type_attrs`, if the
  1257. subject attribute is updated by itself, any previously indexed terms
  1258. passed to the keywords attribute (the message body) would be
  1259. discarded after the update. If updating the subject, the caller
  1260. would be required to pass the message body in the keywords attribute
  1261. again, in order to preserve those terms.
  1262. If none of the attributes being updated are associated with an
  1263. inverted index that also has a same-named special attribute then
  1264. this warning doesn't apply as the inverted index does not need
  1265. to be updated.
  1266. """
  1267. if self._readonly:
  1268. raise DatabaseReadOnlyError('upgrade_to_py3() must be called before database can be modified')
  1269. object_type, object_id = self._to_obj_tuple(obj)
  1270. type_attrs = self._get_type_attrs(object_type)
  1271. get_pickle = False
  1272. # Determine which inverted indexes need to be regenerated for this
  1273. # object. Builds a dictionary of ivtidxes with a dirty flag and
  1274. # a list of sql columns needed for reindexing.
  1275. ivtidx_columns = {}
  1276. for name, (attr_type, flags, attr_ivtidx, attr_split) in type_attrs.items():
  1277. if flags & ATTR_INVERTED_INDEX:
  1278. if attr_ivtidx not in ivtidx_columns:
  1279. ivtidx_columns[attr_ivtidx] = [ False, [] ]
  1280. if flags & ATTR_SEARCHABLE:
  1281. ivtidx_columns[attr_ivtidx][1].append(name)
  1282. if flags & (ATTR_SIMPLE | ATTR_IGNORE_CASE):
  1283. get_pickle = True
  1284. if name in attrs:
  1285. ivtidx_columns[attr_ivtidx][0] = True
  1286. # If the updated attribute is stored in the pickle (either a simple attr
  1287. # or an case-insensitive indexed attr in which __foo is in the pickle)
  1288. # then we must first retrieve the pickle for this object from the db.
  1289. if (flags & ATTR_SIMPLE or flags & ATTR_INDEXED_IGNORE_CASE == ATTR_INDEXED_IGNORE_CASE) and \
  1290. name in attrs:
  1291. get_pickle = True
  1292. # TODO: if ObjectRow is supplied, don't need to fetch columns
  1293. # that are available in the ObjectRow. (Of course this assumes
  1294. # the object wasn't changed via elsewhere during the life of the
  1295. # ObjectRow object, so maybe we don't want to do that.)
  1296. reqd_columns = ['pickle'] if get_pickle else []
  1297. for dirty, searchable_attrs in ivtidx_columns.values():
  1298. if dirty:
  1299. reqd_columns.extend(searchable_attrs)
  1300. if reqd_columns:
  1301. q = 'SELECT %s FROM objects_%s WHERE id=?' % (','.join(reqd_columns), object_type)
  1302. row = self._db_query_row(q, (object_id,))
  1303. if not row:
  1304. raise ValueError, "Can't update unknown object (%s, %d)" % (object_type, object_id)
  1305. if reqd_columns[0] == 'pickle' and row[0]:
  1306. # One of the attrs we're updating is in the pickle, so we
  1307. # have fetched it; now convert it to a dict.
  1308. row_attrs = dbunpickle(row[0])
  1309. for key, value in row_attrs.items():
  1310. # Rename all __foo to foo for ATTR_IGNORE_CASE columns
  1311. if key.startswith('__') and type_attrs[key[2:]][1] & ATTR_IGNORE_CASE:
  1312. row_attrs[key[2:]] = value
  1313. del row_attrs[key]
  1314. # Update stored pickle data with new ATTR_SIMPLE attribute values
  1315. row_attrs.update(attrs)
  1316. attrs = row_attrs
  1317. if parent:
  1318. attrs['parent_type'], attrs['parent_id'] = self._to_obj_tuple(parent, numeric=True)
  1319. attrs['id'] = object_id
  1320. # Make copy of attrs for later query, since we're now about to mess with it.
  1321. orig_attrs = attrs.copy()
  1322. # Merge the ivtidx columns we grabbed above into attrs dict.
  1323. for n, name in enumerate(reqd_columns):
  1324. if name not in attrs and name != 'pickle':
  1325. attrs[name] = row[n]
  1326. for ivtidx, (dirty, searchable_attrs) in ivtidx_columns.items():
  1327. if not dirty:
  1328. # No attribute for this ivtidx changed.
  1329. continue
  1330. split = self._inverted_indexes[ivtidx]['split']
  1331. # Remove existing indexed words for this object.
  1332. self._delete_object_inverted_index_terms((object_type, object_id), ivtidx)
  1333. # TODO: code duplication from add()
  1334. # Need to reindex all columns in this object using this ivtidx.
  1335. terms_list = []
  1336. for name, (attr_type, flags, attr_ivtidx, attr_split) in type_attrs.items():
  1337. if attr_ivtidx == ivtidx and name in attrs:
  1338. if attr_type == BYTES_TYPE and isinstance(attrs[name], RAW_TYPE):
  1339. # We store string objects in the db as buffers, in
  1340. # order to prevent any unicode issues. So we need
  1341. # to convert the buffer we got from the db back to
  1342. # a string before parsing the attribute into terms.
  1343. attrs[name] = BYTES_TYPE(attrs[name])
  1344. terms_list.append((attrs[name], 1.0, attr_split or split, ivtidx))
  1345. if ivtidx in attrs and ivtidx not in type_attrs:
  1346. # Attribute named after an inverted index is given, but
  1347. # that ivtidx is not a named attribute (which would be handled
  1348. # in the for loop just above).
  1349. terms_list.append((attrs[ivtidx], 1.0, split, ivtidx))
  1350. terms = self._score_terms(terms_list)
  1351. self._add_object_inverted_index_terms((object_type, object_id), ivtidx, terms)
  1352. if ivtidx in type_attrs:
  1353. # Registered attribute named after ivtidx; store ivtidx
  1354. # terms in object.
  1355. if not terms and ivtidx in orig_attrs:
  1356. # Update removed all terms for this ivtidx, remove from pickle.
  1357. orig_attrs[ivtidx] = None
  1358. elif terms:
  1359. # There are terms for this ivtidx, store in pickle.
  1360. orig_attrs[ivtidx] = terms.keys()
  1361. query, values = self._make_query_from_attrs("update", orig_attrs, object_type)
  1362. self._db_query(query, values)
  1363. self._set_dirty()
  1364. # TODO: if an objectrow was given, return an updated objectrow
def commit(self):
    """
    Explicitly commit any changes made to the database.

    .. note:: Any uncommitted changes will automatically be committed at
       program exit.
    """
    # Once committed there is nothing pending, so the exit-time commit hook
    # is no longer needed; it gets reconnected by _set_dirty() on the next
    # modification.
    main.signals['exit'].disconnect(self.commit)
    # Clear the dirty flag before taking the lock; a subsequent modification
    # will set it again and re-register the exit handler.
    self._dirty = False
    # Serialize against other threads using the underlying sqlite handle.
    with self._lock:
        self._db.commit()
def query(self, **attrs):
    """
    Query the database for objects matching all of the given keyword
    attributes.

    Keyword arguments can be any previously registered :attr:`~kaa.db.ATTR_SEARCHABLE`
    object attribute for any object type, or the name of a registered
    inverted index.  There are some special keyword arguments:

    :param parent: require all matched objects to have the given object
                   (or objects) as their immediate parent ancestor.  If
                   parent is a list or tuple, then they specify a list
                   of possible parents, any of which would do.
    :type parent: :class:`ObjectRow`, 2-tuple (object_type, object_id), 2-tuple
                  (object_type, :class:`~kaa.db.QExpr`), or a list of those
    :param object: match only a specific object. Not usually very useful,
                   but could be used to test if the given object matches
                   terms from an inverted index.
    :type object: :class:`ObjectRow` or 2-tuple (object_type, object_id)
    :param type: only search items of this object type; if None (or not
                 specified) then all types are searched
    :type type: str
    :param limit: return only this number of results; if None (or not
                  specified), all matches are returned.
    :type limit: int
    :param attrs: a list of attribute names to be returned; if not specified,
                  all attributes registered with the object type are available
                  in the result.  Only specifying the attributes required
                  can help performance moderately, but generally it isn't
                  required except with *distinct* below.
    :type attrs: list of str
    :param distinct: if True, ensures that each object in the result set is
                     unique with respect to the attributes specified in the
                     *attrs* parameter.  When distinct is True, *attrs* is
                     required and none of the attributes specified may be
                     simple.
    :param orattrs: attribute names that will be ORed in the query; by default,
                    all attributes are ANDed.
    :type orattrs: list
    :raises: ValueError if the query is invalid (e.g. attempting to query
             on a simple attribute)
    :returns: a list of :class:`ObjectRow` objects

    When any of the attributes are inverted indexes, the result list is
    sorted according to a score.  The score is based upon the frequency
    of the matched terms relative to the entire database.

    .. note:: If you know which type of object you're interested in, you
       should specify the *type* as it will help improve performance by
       reducing the scope of the search, especially for inverted indexes.

       Another significant factor in performance is whether or not a *limit*
       is specified.  Query time generally scales linearly with respect to
       the number of rows found, but in the case of searches on inverted
       indexes, specifying a limit can drastically reduce search time, but
       does not affect scoring.

    Values supplied to attributes (other than inverted indexes) require
    exact matches.  To search based on an expression, such as inequality,
    ranges, substrings, set inclusion, etc. require the use of a
    :class:`~kaa.db.QExpr` object.

    Expanding on the example provided in
    :meth:`~kaa.db.Database.register_object_type_attrs`::

        >>> db.add('msg', sender=u'Stewie Griffin', subject=u'Blast!',
                   keywords='When the world is mine, your death shall be quick and painless.')
        >>> # Exact match based on sender
        >>> db.query(sender=u'Stewie Griffin')
        [<kaa.db.ObjectRow object at 0x7f652b251030>]
        >>> # Keyword search requires all keywords
        >>> db.query(keywords=['death', 'blast'])
        [<kaa.db.ObjectRow object at 0x7f652c3d1f90>]
        >>> # This doesn't work, since it does an exact match ...
        >>> db.query(sender=u'Stewie')
        []
        >>> # ... but we can use QExpr to do a substring/pattern match.
        >>> db.query(sender=QExpr('like', u'Stewie%'))
        [<kaa.db.ObjectRow object at 0x7f652c3d1f90>]
        >>> # How about a regexp search.
        >>> db.query(sender=QExpr('regexp', ur'.*\\bGriffin'))
        [<kaa.db.ObjectRow object at 0x7f652b255030>]
    """
    query_info = {}
    parents = []
    query_type = "ALL"
    results = []
    query_info["columns"] = {}
    query_info["attrs"] = {}

    # 'object' is sugar for an exact (type, id) match.
    if "object" in attrs:
        attrs['type'], attrs['id'] = self._to_obj_tuple(attrs['object'])
        del attrs['object']

    # First pass: run all inverted index searches and intersect their
    # results, multiplying scores for objects present in every index.
    ivtidx_results = ivtidx_results_by_type = None
    for ivtidx in self._inverted_indexes:
        # TODO: Possible optimization: do ivtidx search after the query
        # below only on types that have results iff all queried columns are
        # indexed.
        # TODO: could be smarter about the order in which we do ivtidx
        # searches (do least populated first)
        if ivtidx in attrs:
            # If search criteria other than this inverted index are specified,
            # we can't enforce a limit on the search, otherwise we
            # might miss intersections.
            if len(set(attrs).difference(('type', 'limit', ivtidx))) > 0:
                limit = None
            else:
                limit = attrs.get('limit')
            r = self._query_inverted_index(ivtidx, attrs[ivtidx], limit, attrs.get('type'))
            if ivtidx_results is None:
                ivtidx_results = r
            else:
                # Intersect with results from the previous inverted index;
                # objects missing from either are dropped.
                for o in ivtidx_results.keys():
                    if o not in r:
                        del ivtidx_results[o]
                    else:
                        ivtidx_results[o] *= r[o]
            if not ivtidx_results:
                # No matches, so we're done.
                return []
            del attrs[ivtidx]

    # Group surviving (type, id) hits by type id so each per-type SQL
    # query below can constrain on "id IN (...)".
    if ivtidx_results:
        ivtidx_results_by_type = {}
        for tp, id in ivtidx_results.keys():
            if tp not in ivtidx_results_by_type:
                ivtidx_results_by_type[tp] = []
            ivtidx_results_by_type[tp].append(id)

    if attrs.get('type') is not None:
        if attrs["type"] not in self._object_types:
            raise ValueError, "Unknown object type '%s'" % attrs["type"]
        type_list = [(attrs["type"], self._object_types[attrs["type"]])]
    else:
        type_list = self._object_types.items()

    if attrs.get('parent') is not None:
        # ("type", id_or_QExpr) or (("type1", id_or_QExpr), ("type2", id_or_QExpr), ...)
        if isinstance(attrs['parent'], ObjectRow) or \
           (isinstance(attrs['parent'], (list, tuple)) and \
            not isinstance(attrs['parent'][0], (list, tuple))):
            # (type, parent) -> ((type, parent),)
            attrs['parent'] = (attrs['parent'],)
        for parent_obj in attrs['parent']:
            parent_type_id, parent_id = self._to_obj_tuple(parent_obj, numeric=True)
            if not isinstance(parent_id, QExpr):
                parent_id = QExpr("=", parent_id)
            parents.append((parent_type_id, parent_id))

    if attrs.get('limit') is not None:
        result_limit = attrs["limit"]
    else:
        result_limit = None

    if attrs.get('attrs') is not None:
        requested_columns = attrs["attrs"]
    else:
        requested_columns = None

    if attrs.get('distinct') is not None:
        if attrs["distinct"]:
            if not requested_columns:
                raise ValueError, "Distinct query specified, but no attrs kwarg given."
            query_type = "DISTINCT"

    if attrs.get('orattrs') is not None:
        orattrs = set(attrs['orattrs'])
    else:
        orattrs = ()

    # Remove all special keywords
    for attr in ('parent', 'object', 'type', 'limit', 'attrs', 'distinct', 'orattrs'):
        attrs.pop(attr, None)

    # Second pass: one SQL query per candidate object type.
    for type_name, (type_id, type_attrs, type_idx) in type_list:
        if ivtidx_results and type_id not in ivtidx_results_by_type:
            # If we've done a ivtidx search, don't bother querying
            # object types for which there were no hits.
            continue

        # Select only sql columns (i.e. attrs that aren't ATTR_SIMPLE).
        all_columns = [ x for x in type_attrs if type_attrs[x][1] & ATTR_SEARCHABLE ]
        if requested_columns:
            columns = requested_columns[:]
            # Ensure that all the requested columns exist for this type
            missing = tuple(set(columns).difference(type_attrs.keys()))
            if missing:
                raise ValueError, "One or more requested attributes %s are not available for type '%s'" % \
                                  (str(missing), type_name)
            # If any of the requested attributes are ATTR_SIMPLE or
            # ATTR_INDEXED_IGNORE_CASE then we need the pickle.
            pickled = [ x for x in columns if type_attrs[x][1] & (ATTR_SIMPLE | ATTR_INDEXED_IGNORE_CASE) in
                        (ATTR_SIMPLE, ATTR_INDEXED_IGNORE_CASE)]
            if pickled:
                # One or more attributes from pickle are requested in attrs list,
                # so we need to grab the pickle column.
                if 'pickle' not in columns:
                    columns.append('pickle')
                # Remove the list of pickled attributes so we don't
                # request them as sql columns.
                columns = list(set(columns).difference(pickled))
        else:
            columns = all_columns

        # Now construct a query based on the supplied attributes for this
        # object type.

        # If any of the attribute names aren't valid for this type, then we
        # don't bother matching, since this an AND query and there won't be
        # any matches.
        missing = set(attrs).difference(all_columns)
        if missing:
            # Raise exception if user attempts to search on a simple attr.
            simple = [ x for x in missing if x in type_attrs and type_attrs[x][1] & ATTR_SIMPLE ]
            if simple:
                raise ValueError, "Querying on non-searchable attribute '%s'" % simple[0]
            continue

        q, qor = [], []
        query_values, qor_values = [], []
        q.append("SELECT %s '%s',%d,id,%s FROM objects_%s" % \
            (query_type, type_name, type_id, ",".join(columns), type_name))

        if ivtidx_results != None:
            q.append("WHERE")
            q.append("id IN %s" % _list_to_printable(ivtidx_results_by_type[type_id]))

        if len(parents):
            q.append(("WHERE", "AND")["WHERE" in q])
            expr = []
            for parent_type, parent_id in parents:
                sql, values = parent_id.as_sql("parent_id")
                expr.append("(parent_type=? AND %s)" % sql)
                query_values += (parent_type,) + values
            q.append("(%s)" % " OR ".join(expr))

        for attr, value in attrs.items():
            is_or_attr = attr in orattrs
            attr_type, attr_flags = type_attrs[attr][:2]
            if not isinstance(value, QExpr):
                value = QExpr("=", value)

            # Coerce between numeric types; also coerce a string of digits into a numeric
            # type.
            if attr_type in (int, long, float) and (isinstance(value._operand, (int, long, float)) or \
               isinstance(value._operand, basestring) and value._operand.isdigit()):
                value._operand = attr_type(value._operand)

            # Verify expression operand type is correct for this attribute.
            if value._operator not in ("range", "in", "not in") and \
               not isinstance(value._operand, attr_type):
                raise TypeError, "Type mismatch in query: '%s' (%s) is not a %s" % \
                                 (str(value._operand), str(type(value._operand)), str(attr_type))

            # Queries on ATTR_IGNORE_CASE string columns are case-insensitive.
            if isinstance(value._operand, basestring) and type_attrs[attr][1] & ATTR_IGNORE_CASE:
                value._operand = value._operand.lower()
                if not (type_attrs[attr][1] & ATTR_INDEXED):
                    # If this column is ATTR_INDEXED then we already ensure
                    # the values are stored in lowercase in the db, so we
                    # don't want to get sql to lower() the column because
                    # it's needless, and more importantly, we won't be able
                    # to use any indices on the column.
                    attr = 'lower(%s)' % attr

            if isinstance(value._operand, BYTES_TYPE):
                # For Python 2, convert non-unicode strings to buffers.  (For Python 3,
                # BYTES_TYPE == RAW_TYPE so this is a no-op.)
                value._operand = RAW_TYPE(value._operand)

            sql, values = value.as_sql(attr)
            if is_or_attr:
                qor.append(sql)
                qor_values.extend(values)
            else:
                q.append('AND' if 'WHERE' in q else 'WHERE')
                q.append(sql)
                query_values.extend(values)

        if qor:
            q.append('AND' if 'WHERE' in q else 'WHERE')
            q.append('(%s)' % ' OR '.join(qor))
        if query_type == 'DISTINCT':
            q.append(' GROUP BY %s' % ','.join(requested_columns))
        if result_limit != None:
            q.append(" LIMIT %d" % result_limit)

        q = " ".join(q)
        rows = self._db_query(q, query_values + qor_values, cursor=self._qcursor)

        if result_limit != None:
            # NOTE(review): the "+ 1" here lets the combined result list
            # exceed result_limit by one row when multiple types are
            # queried -- looks like an off-by-one; confirm intended.
            results.extend(rows[:result_limit - len(results) + 1])
        else:
            results.extend(rows)

        query_info["columns"][type_name] = ["type"] + columns
        query_info["attrs"][type_name] = type_attrs

        if result_limit != None and len(rows) == result_limit:
            # No need to try the other types, we're done.
            break

    # If ivtidx search was done, sort results based on score (highest
    # score first).
    # NOTE(review): sort() is ascending on score, which puts the LOWEST
    # score first, contradicting the comment above -- confirm whether
    # reverse=True (or a negated key) was intended.
    if ivtidx_results:
        results.sort(key=lambda r: ivtidx_results[(r[1], r[2])])

    return results
  1646. def query_one(self, **attrs):
  1647. """
  1648. Like :meth:`~kaa.db.Database.query` but returns a single object only.
  1649. This is a convenience method, and query_one(...) is equivalent to::
  1650. results = db.query(...)
  1651. if results:
  1652. obj = results[0]
  1653. else:
  1654. obj = None
  1655. limit=1 is implied by this query.
  1656. """
  1657. results = self.query(limit=1, **attrs)
  1658. return results[0] if results else None
  1659. def _score_terms(self, terms_list):
  1660. """
  1661. Scores the terms given in terms_list, which is a list of tuples (terms,
  1662. coeff, split, ivtidx), where terms is the string or sequence of
  1663. terms to be scored, coeff is the weight to give each term in this part
  1664. (1.0 is normal), split is the function or regular expression used to
  1665. split terms (only used if a string is given for terms), and ivtidx is
  1666. the name of inverted index we're scoring for.
  1667. Terms are either unicode objects or strings, or sequences of unicode or
  1668. string objects. In the case of strings, they are passed through
  1669. py3_str() to try to decode them intelligently.
  1670. Each term T is given the score:
  1671. sqrt( (T coeff * T count) / total term count )
  1672. Counts are relative to the given object, not all objects in the
  1673. database.
  1674. Returns a dict of term->score. Terms (the keys) are converted to
  1675. unicode objects, but their case is preserved as given (if term is
  1676. given more than once, the case of the first occurence is used), and
  1677. score (the values) are calculated as described above.
  1678. """
  1679. terms_scores = {}
  1680. total_terms = 0
  1681. for terms, coeff, split, ivtidx in terms_list:
  1682. if not terms:
  1683. continue
  1684. # Swap ivtidx name for inverted index definition dict
  1685. ivtidx = self._inverted_indexes[ivtidx]
  1686. if not isinstance(terms, (basestring, list, tuple)):
  1687. raise ValueError, "Invalid type (%s) for ATTR_INVERTED_INDEX attribute. " \
  1688. "Only sequence, unicode or str allowed." % str(type(terms))
  1689. if isinstance(terms, (list, tuple)):
  1690. terms = [py3_str(term) for term in terms]
  1691. parsed = terms
  1692. else:
  1693. terms = py3_str(terms)
  1694. if callable(split):
  1695. parsed = list(split(terms))
  1696. else:
  1697. parsed = split.split(terms)
  1698. for term in parsed:
  1699. if not term or (ivtidx['max'] and len(term) > ivtidx['max']) or \
  1700. (ivtidx['min'] and len(term) < ivtidx['min']):
  1701. continue
  1702. lower_term = term.lower()
  1703. if ivtidx['ignore'] and lower_term in ivtidx['ignore']:
  1704. continue
  1705. if lower_term not in terms_scores:
  1706. terms_scores[lower_term] = [term, coeff]
  1707. else:
  1708. terms_scores[lower_term][1] += coeff
  1709. total_terms += 1
  1710. # Score based on term frequency in document. (Add weight for
  1711. # non-dictionary terms? Or longer terms?)
  1712. for lower_term, score in terms_scores.items():
  1713. terms_scores[lower_term][1] = math.sqrt(terms_scores[lower_term][1] / total_terms)
  1714. return dict(terms_scores.values())
  1715. def _delete_object_inverted_index_terms(self, (object_type, object_id), ivtidx):
  1716. """
  1717. Removes all indexed terms under the specified inverted index for the
  1718. given object. This function must be called when an object is removed
  1719. from the database, or when an ATTR_INVERTED_INDEX attribute of an
  1720. object is being updated (and therefore that inverted index must be
  1721. re-indexed).
  1722. """
  1723. self._delete_multiple_objects_inverted_index_terms({object_type: ((ivtidx,), (object_id,))})
  1724. def _delete_multiple_objects_inverted_index_terms(self, objects):
  1725. """
  1726. objects = dict type_name -> (ivtidx tuple, ids tuple)
  1727. """
  1728. for type_name, (ivtidxes, object_ids) in objects.items():
  1729. # Resolve object type name to id
  1730. type_id = self._get_type_id(type_name)
  1731. for ivtidx in ivtidxes:
  1732. # Remove all terms for the inverted index associated with this
  1733. # object. A trigger will decrement the count column in the
  1734. # terms table for all term_id that get affected.
  1735. self._db_query("DELETE FROM ivtidx_%s_terms_map WHERE object_type=? AND object_id IN %s" % \
  1736. (ivtidx, _list_to_printable(object_ids)), (type_id,))
  1737. self._inverted_indexes[ivtidx]['objectcount'] -= len(object_ids)
  1738. def _add_object_inverted_index_terms(self, (object_type, object_id), ivtidx, terms):
  1739. """
  1740. Adds the dictionary of terms (as computed by _score_terms()) to the
  1741. specified inverted index database for the given object.
  1742. """
  1743. if not terms:
  1744. return
  1745. # Resolve object type name to id
  1746. object_type = self._get_type_id(object_type)
  1747. # Holds any of the given terms that already exist in the database
  1748. # with their id and count.
  1749. db_terms_count = {}
  1750. terms_list = _list_to_printable([ t.lower() for t in terms.keys() ])
  1751. q = "SELECT id,term,count FROM ivtidx_%s_terms WHERE term IN %s" % (ivtidx, terms_list)
  1752. rows = self._db_query(q)
  1753. for row in rows:
  1754. db_terms_count[row[1]] = row[0], row[2]
  1755. # For executemany queries later.
  1756. update_list, map_list = [], []
  1757. for term, score in terms.items():
  1758. term = term.lower()
  1759. if term not in db_terms_count:
  1760. # New term, so insert it now.
  1761. self._db_query('INSERT OR REPLACE INTO ivtidx_%s_terms VALUES(NULL, ?, 1)' % ivtidx, (term,))
  1762. db_id, db_count = self._cursor.lastrowid, 1
  1763. db_terms_count[term] = db_id, db_count
  1764. else:
  1765. db_id, db_count = db_terms_count[term]
  1766. update_list.append((db_count + 1, db_id))
  1767. map_list.append((int(score*10), db_id, object_type, object_id, score))
  1768. self._db_query('UPDATE ivtidx_%s_terms SET count=? WHERE id=?' % ivtidx, update_list, many = True)
  1769. self._db_query('INSERT INTO ivtidx_%s_terms_map VALUES(?, ?, ?, ?, ?)' % ivtidx, map_list, many = True)
def _query_inverted_index(self, ivtidx, terms, limit = 100, object_type = None):
    """
    Queries the inverted index ivtidx for the terms supplied in the terms
    argument.  If terms is a string, it is parsed into individual terms
    based on the split for the given ivtidx.  The terms argument may
    also be a list or tuple, in which case no parsing is done.

    The search algorithm tries to optimize for the common case.  When
    terms are scored (_score_terms()), each term is assigned a score that
    is stored in the database (as a float) and also as an integer in the
    range 0-10, called rank.  (So a term with score 0.35 has a rank 3.)

    Multiple passes are made over the terms map table for the given ivtidx,
    first starting at the highest rank fetching a certain number of rows,
    and progressively drilling down to lower ranks, trying to find enough
    results to fill our limit that intersects on all supplied terms.  If
    our limit isn't met and all ranks have been searched but there are
    still more possible matches (because we use LIMIT on the SQL
    statement), we expand the LIMIT (currently by an order of 10) and try
    again, specifying an OFFSET in the query.

    The worst case scenario is given two search terms, each term matches
    50% of all rows but there is only one intersecting row.  (Or, more
    generally, given N terms, each term matches (1/N)*100 percent rows with
    only 1 row intersection between all N terms.)  This could be improved
    by avoiding the OFFSET/LIMIT technique as described above, but that
    approach provides a big performance win in more common cases.  This
    case can be mitigated by caching common term combinations, but it is
    an extremely difficult problem to solve.

    object_type specifies an type name to search (for example we can
    search type "image" with keywords "2005 vacation"), or if object_type
    is None (default), then all types are searched.

    This function returns a dictionary (object_type, object_id) -> score
    which match the query.

    .. note:: several early-exit paths below return [] rather than the
       documented {}; callers only test the result for truthiness, so this
       appears harmless, but confirm before relying on the return type.
    """
    t0 = time.time()
    # Fetch number of files the inverted index applies to.  (Used in score
    # calculations.)
    objectcount = self._inverted_indexes[ivtidx]['objectcount']

    # Normalize terms to a list of lowercased unicode strings.
    if not isinstance(terms, (list, tuple)):
        split = self._inverted_indexes[ivtidx]['split']
        if callable(split):
            terms = [term for term in split(py3_str(terms).lower()) if term]
        else:
            terms = [term for term in split.split(py3_str(terms).lower()) if term]
    else:
        terms = [ py3_str(x).lower() for x in terms ]

    # Remove terms that aren't indexed (words less than minimum length
    # or and terms in the ignore list for this ivtidx).
    if self._inverted_indexes[ivtidx]['min']:
        terms = [ x for x in terms if len(x) >= self._inverted_indexes[ivtidx]['min'] ]
    if self._inverted_indexes[ivtidx]['ignore']:
        terms = [ x for x in terms if x not in self._inverted_indexes[ivtidx]['ignore'] ]

    terms_list = _list_to_printable(terms)
    nterms = len(terms)

    if nterms == 0:
        return []

    # Find term ids and order by least popular to most popular.
    rows = self._db_query('SELECT id,term,count FROM ivtidx_%s_terms WHERE ' \
                          'term IN %s ORDER BY count' % (ivtidx, terms_list))
    save = map(lambda x: x.lower(), terms)
    terms = {}
    ids = []
    for row in rows:
        if row[2] == 0:
            return []

        # Give terms weight according to their order
        order_weight = 1 + len(save) - list(save).index(row[1])
        # NOTE(review): objectcount / row[2] is integer division under
        # Python 2, so idf_t is coarsely quantized -- presumably
        # deliberate (it is dominated by order_weight); confirm.
        terms[row[0]] = {
            'term': row[1],
            'count': row[2],
            'idf_t': math.log(objectcount / row[2] + 1) + order_weight,
            'ids': {}
        }
        ids.append(row[0])

    # Not all the terms we requested are in the database, so we return
    # 0 results.
    if len(ids) < nterms:
        return []

    if object_type:
        # Resolve object type name to id
        object_type = self._get_type_id(object_type)

    # Per-term accumulators: results[id] maps (object_type, object_id) ->
    # weighted frequency; state[id] tracks per-rank pagination progress.
    results, state = {}, {}
    for id in ids:
        results[id] = {}
        state[id] = {
            'offset': [0]*11,
            'more': [True]*11,
            'count': 0,
            'done': False
        }

    all_results = {}
    if limit == None:
        limit = objectcount

    if limit <= 0 or objectcount <= 0:
        return {}

    sql_limit = min(limit*3, 200)
    finished = False
    nqueries = 0

    # Keep a dict keyed on object_id that we can use to narrow queries
    # once we have a full list of all objects that match a given term.
    id_constraints = None
    t1 = time.time()
    while not finished:
        # Drill down from the highest rank bucket (10) to the lowest (0).
        for rank in range(10, -1, -1):
            for id in ids:
                if not state[id]['more'][rank] or state[id]['done']:
                    # If there's no more results at this rank, or we know
                    # we've already seen all the results for this term, we
                    # don't bother with the query.
                    continue

                q = 'SELECT object_type,object_id,frequency FROM ivtidx_%s_terms_map ' % ivtidx + \
                    'WHERE term_id=? AND rank=? %s %%s LIMIT ? OFFSET ?'

                if object_type == None:
                    q %= ''
                    v = [id, rank, sql_limit, state[id]["offset"][rank]]
                else:
                    q %= 'AND object_type=?'
                    v = [id, rank, object_type, sql_limit, state[id]["offset"][rank]]

                if id_constraints:
                    # We know about all objects that match one or more of the other
                    # search terms, so we add the constraint that all rows for this
                    # term match the others as well.  Effectively we push the logic
                    # to generate the intersection into the db.
                    # XXX: This can't benefit from the index if object_type
                    # is not specified.
                    q %= ' AND object_id IN %s' % _list_to_printable(tuple(id_constraints))
                    # But since we're specifying a list of ids to search for with this
                    # term, we can't use limit/offset, since the constraints might be
                    # different since the last iteration.
                    v[-2:] = [-1, 0]
                else:
                    q %= ''

                rows = self._db_query(q, v)
                nqueries += 1
                state[id]['more'][rank] = len(rows) == sql_limit
                state[id]['count'] += len(rows)

                for row in rows:
                    results[id][row[0], row[1]] = row[2] * terms[id]['idf_t']
                    terms[id]['ids'][row[1]] = 1

                if state[id]['count'] >= terms[id]['count'] or \
                   (id_constraints and len(rows) == len(id_constraints)):
                    # If we've now retrieved all objects for this term, or if
                    # all the results we just got now intersect with our
                    # constraints set, we're done this term and don't bother
                    # querying it at other ranks.
                    #print "Done term '%s' at rank %d" % (terms[id]['term'], rank)
                    state[id]['done'] = True
                    if id_constraints is not None:
                        id_constraints = id_constraints.intersection(terms[id]['ids'])
                    else:
                        id_constraints = set(terms[id]['ids'])
            #
            # end loop over terms

            # Sum the weighted scores of objects that appear in every
            # term's result set so far.
            for r in reduce(lambda a, b: set(a).intersection(b), results.values()):
                all_results[r] = 0
                for id in ids:
                    if r in results[id]:
                        all_results[r] += results[id][r]

            # If we have enough results already, no sense in querying the
            # next rank.
            if limit > 0 and len(all_results) > limit*2:
                finished = True
                #print "Breaking at rank:", rank
                break
        #
        # end loop over ranks

        if finished:
            break

        finished = True
        for index in range(len(ids)):
            id = ids[index]
            if index > 0:
                last_id = ids[index-1]
                a = results[last_id]
                b = results[id]
                intersect = set(a).intersection(b)

                if len(intersect) == 0:
                    # Is there any more at any rank?
                    a_more = b_more = False
                    for rank in range(11):
                        a_more = a_more or state[last_id]['more'][rank]
                        b_more = b_more or state[id]['more'][rank]

                    if not a_more and not b_more:
                        # There's no intersection between these two search
                        # terms and neither have more at any rank, so we
                        # can stop the whole query.
                        finished = True
                        break

            # There's still hope of a match.  Go through this term and
            # see if more exists at any rank, increasing offset and
            # unsetting finished flag so we iterate again.
            for rank in range(10, -1, -1):
                if state[id]['more'][rank] and not state[id]['done']:
                    state[id]['offset'][rank] += sql_limit
                    finished = False

        # If we haven't found enough results after this pass, grow our
        # limit so that we expand our search scope.  (XXX: this value may
        # need empirical tweaking.)
        sql_limit *= 10

    # end loop while not finished

    log.info('%d results, did %d subqueries, %.04f seconds (%.04f overhead)',
             len(all_results), nqueries, time.time()-t0, t1-t0)
    return all_results
  1971. def get_inverted_index_terms(self, ivtidx, associated = None, prefix = None):
  1972. """
  1973. Obtain terms used by objects for an inverted index.
  1974. :param ivtidx: the name of an inverted index previously registered with
  1975. :meth:`~kaa.db.Database.register_inverted_index`.
  1976. :type ivtidx: str
  1977. :param associated: specifies a list of terms, and only those terms which are
  1978. mapped to objects *in addition to* the supplied associated
  1979. terms will be returned. If None, all terms for the inverted
  1980. index are returned.
  1981. :type associated: list of str or unicode
  1982. :param prefix: only terms that begin with the specified prefix are returned.
  1983. This is useful for auto-completion while a user is typing a
  1984. query.
  1985. :type prefix: str or unicode
  1986. :returns: a list of 2-tuples, where each tuple is (*term*, *count*). If
  1987. *associated* is not given, *count* is the total number of objects
  1988. that term is mapped to. Otherwise, *count* reflects the number
  1989. of objects which have that term plus all the given associated
  1990. terms. The list is sorted with the highest counts appearing first.
  1991. For example, given an otherwise empty database, if you have an object
  1992. with terms ['vacation', 'hawaii'] and two other object with terms
  1993. ['vacation', 'spain'] and the associated list passed is ['vacation'],
  1994. the return value will be [('spain', 2), ('hawaii', 1)].
  1995. """
  1996. if ivtidx not in self._inverted_indexes:
  1997. raise ValueError, "'%s' is not a registered inverted index." % ivtidx
  1998. if prefix:
  1999. where_clause = 'WHERE terms.term >= ? AND terms.term <= ?'
  2000. where_values = (prefix, prefix + 'z')
  2001. else:
  2002. where_clause = ''
  2003. where_values = ()
  2004. if not associated:
  2005. return self._db_query('''SELECT term, count
  2006. FROM ivtidx_%s_terms AS terms
  2007. %s
  2008. ORDER BY count DESC''' % (ivtidx, where_clause), where_values)
  2009. rows = self._db_query('SELECT id FROM ivtidx_%s_terms WHERE term IN %s ORDER BY count' % \
  2010. (ivtidx, _list_to_printable(associated)))
  2011. term_ids = [ x[0] for x in rows ]
  2012. if len(term_ids) < len(associated):
  2013. return []
  2014. query = '''SELECT term, COUNT(*) AS total
  2015. FROM ivtidx_%s_terms_map AS t0''' % ivtidx
  2016. for n, term_id in enumerate(term_ids):
  2017. query += ''' JOIN ivtidx_%s_terms_map t%d
  2018. ON t%d.object_type = t%d.object_type AND
  2019. t%d.object_id = t%d.object_id AND
  2020. t%d.term_id = %d''' % \
  2021. (ivtidx, n + 1, n, n + 1, n, n + 1, n + 1, term_id)
  2022. query += ''' JOIN ivtidx_%s_terms AS terms
  2023. ON t0.term_id = terms.id AND
  2024. t0.term_id NOT IN %s
  2025. %s
  2026. GROUP BY t0.term_id
  2027. ORDER BY total DESC ''' % \
  2028. (ivtidx, _list_to_printable(term_ids), where_clause)
  2029. return self._db_query(query, where_values)
  2030. def get_db_info(self):
  2031. """
  2032. Return information about the database.
  2033. :returns: a dict
  2034. The returned dictionary has the following keys:
  2035. * count: dict of object types holding their counts
  2036. * total: total number of objects in the database
  2037. * types: a dict keyed on object type which contains:
  2038. * attrs: a dictionary of registered attributes for this type
  2039. * idx: a list of composite indexes for this type
  2040. * termcounts: a dict of the number of indexed terms for each
  2041. inverted index
  2042. * file: full path to the database file
  2043. """
  2044. total = 0
  2045. info = {
  2046. 'count': {},
  2047. 'types': {}
  2048. }
  2049. for name in self._object_types:
  2050. id, attrs, idx = self._object_types[name]
  2051. info['types'][name] = {
  2052. 'attrs': attrs,
  2053. 'idx': idx
  2054. }
  2055. row = self._db_query_row('SELECT COUNT(*) FROM objects_%s' % name)
  2056. info['count'][name] = row[0]
  2057. total += row[0]
  2058. info['total'] = total
  2059. info['termcounts'] = {}
  2060. for ivtidx in self._inverted_indexes:
  2061. row = self._db_query_row('SELECT COUNT(*) FROM ivtidx_%s_terms' % ivtidx)
  2062. info['termcounts'][ivtidx] = int(row[0])
  2063. info['file'] = self._dbfile
  2064. return info
  2065. def set_metadata(self, key, value):
  2066. """
  2067. Associate simple key/value pairs with the database.
  2068. :param key: the key name for the metadata; it is required that key
  2069. is prefixed with ``appname::`` in order to avoid namespace
  2070. collisions.
  2071. :type key: str or unicode
  2072. :param value: the value to associate with the given key
  2073. :type value: str or unicode
  2074. """
  2075. if '::' not in key:
  2076. raise ValueError('Invalid key %s; must be prefixed with "appname::"' % key)
  2077. self._db_query('DELETE FROM meta WHERE attr=?', (key,))
  2078. self._db_query('INSERT INTO meta VALUES (?, ?)', (key, value))
  2079. self._set_dirty()
  2080. def get_metadata(self, key, default=None):
  2081. """
  2082. Fetch metadata previously set by :meth:`~kaa.db.Database.set_metadata`.
  2083. :param key: the key name for the metadata, prefixed with ``appname::``.
  2084. :type key: str
  2085. :param default: value to return if key is not found
  2086. :returns: unicode string containing the value for this key, or
  2087. the ``default`` parameter if the key was not found.
  2088. """
  2089. row = self._db_query_row('SELECT value FROM meta WHERE attr=?', (key,))
  2090. if row:
  2091. return row[0]
  2092. return default
  2093. def vacuum(self):
  2094. """
  2095. Cleans up the database, removing unused inverted index terms.
  2096. This also calls VACUUM on the underlying sqlite database, which
  2097. rebuilds the database to reclaim unused space and reduces
  2098. fragmentation.
  2099. Applications should call this periodically, however this operation
  2100. can be expensive for large databases so it should be done during
  2101. an extended idle period.
  2102. """
  2103. # We need to do this eventually, but there's no index on count, so
  2104. # this could potentially be slow. It doesn't hurt to leave rows
  2105. # with count=0, so this could be done intermittently.
  2106. for ivtidx in self._inverted_indexes:
  2107. self._db_query('DELETE FROM ivtidx_%s_terms WHERE count=0' % ivtidx)
  2108. self._db_query("VACUUM")
    @property
    def filename(self):
        """
        Full path to the database file.

        Read-only property; returns the internal ``_dbfile`` attribute.
        """
        return self._dbfile
  2115. @property
  2116. def lazy_commit(self):
  2117. """
  2118. The interval after which any changes made to the database will be
  2119. automatically committed, or None to require explicit commiting.
  2120. (Default is None.)
  2121. The timer is restarted upon each change to the database, so prolonged
  2122. updates may still benefit from explicit periodic commits.
  2123. """
  2124. return self._lazy_commit_interval
  2125. @lazy_commit.setter
  2126. def lazy_commit(self, value):
  2127. self._lazy_commit_interval = float(value)
  2128. if value is None:
  2129. self._lazy_commit_timer.stop()
  2130. elif self._dirty:
  2131. self._lazy_commit_timer.start(self._lazy_commit_interval)
    @property
    def readonly(self):
        """
        Whether the database is in read-only mode.

        Read-only property reflecting the internal ``_readonly`` flag;
        presumably set when the database is opened -- see the constructor
        and ``DatabaseReadOnlyError``.
        """
        return self._readonly
    def upgrade_to_py3(self):
        """
        Upgrade the database for use under Python 3.

        Not implemented in this version; calling it always raises
        NotImplementedError.  (The name suggests an on-disk format
        conversion -- confirm against a py3-capable release.)
        """
        raise NotImplementedError