# /rdfextras/store/FOPLRelationalModel/MySQLMassLoader.py
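"""
Bulk loader for the FOPL relational model stores.

This module dumps RDF statements to delimited text files and then bulk
loads them into a MySQL or PostgreSQL store (via ``LOAD DATA LOCAL
INFILE`` and ``COPY ... FROM STDIN`` respectively), which is intended to
be much faster than row-at-a-time INSERTs for large graphs. It can also
be run as a script; see ``main()`` at the bottom of the file.
"""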
import sys, re, os
import datetime

import rdflib
from rdflib import BNode
from rdflib import RDF
from rdflib.store import Store
from rdflib.graph import ConjunctiveGraph
from rdfextras.utils.termutils import OBJECT
from rdfextras.store.MySQL import SQL, MySQL, PostgreSQL
from rdfextras.store.FOPLRelationalModel.QuadSlot import genQuadSlots
from rdfextras.store.FOPLRelationalModel.QuadSlot import normalizeNode
from Ft.Lib import Uri

import logging
log = logging.getLogger(__name__)

Any = None

VALUES_EXPR = re.compile(r'.*VALUES (\(.*\))')
TABLE_NAME_EXPR = re.compile(r'INSERT INTO (\S*)\s+VALUES')

ROW_DELIMITER = '\n'
COL_DELIMITER = '|'

DENORMALIZED_INDEX_TARGETS = [
    'subject', 'subject_term', 'predicate', 'predicate_term',
    'object', 'object_term', 'context', 'context_term']
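
# `make_delimited` renders one row of a delimited dump file: None becomes
# a bare NULL, everything else is stringified and double-quoted. For
# example:
#
#   make_delimited(['abc', None, 42])  ->  '"abc"|NULL|"42"'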
def make_delimited(_list):
    return COL_DELIMITER.join(
        ['NULL' if i is None else '"%s"' % i for i in _list])

class LoadError(Exception):
    pass

class DllError(Exception):
    pass
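
# `DllNode` is a node in an intrusive doubly-linked list; `RecentSet`
# (below) threads its entries onto these nodes so it can maintain
# most-recently-used order with O(1) moves and removals.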
class DllNode(object):
    def __init__(self, stuff):
        self.prev = None
        self.next = None
        self.stuff = stuff

    def insert_before(self, following):
        prev = following.prev

        if prev is not None:
            prev.next = self

        self.prev = prev
        self.next = following

        following.prev = self

    def insert_after(self, preceding):
        next = preceding.next

        if next is not None:
            next.prev = self

        self.prev = preceding
        self.next = next

        preceding.next = self

    def remove(self):
        if self.prev is not None:
            self.prev.next = self.next
        if self.next is not None:
            self.next.prev = self.prev
        self.prev = None
        self.next = None
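
# `RecentSet` is a bounded, LRU-style membership cache: `check(item)`
# returns True if `item` has been seen recently, and otherwise records it,
# evicting the least recently used entry once `size` is exceeded. A small
# usage sketch (the capacity of 2 is illustrative only):
#
#   recent = RecentSet(2)
#   recent.check('a')   # False: first sighting
#   recent.check('a')   # True:  'a' is still cached
#   recent.check('b')   # False
#   recent.check('c')   # False: evicts 'a', the least recently used
#   recent.check('a')   # False: 'a' was evicted, so it looks new again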
class RecentSet(object):
    def __init__(self, size):
        self.size = size
        self.store = {}
        '''map from objects to their node in the doubly-linked list'''
        self.head = None
        '''first item in the doubly-linked list'''
        self.tail = None
        '''last item in the doubly-linked list'''

    def check(self, item):
        if item in self.store:
            node = self.store[item]
            if self.head is not node:
                if self.tail is node:
                    self.tail = self.tail.prev
                node.remove()
                node.insert_before(self.head)
                self.head = node

            return True

        else:
            node = DllNode(item)

            if self.head is None:
                assert self.tail is None
                self.head = node
                self.tail = node
            else:
                node.insert_before(self.head)
                self.head = node

            self.store[item] = node

            if len(self.store) > self.size:
                drop = self.tail
                #print >> sys.stderr, "Dropping", drop.stuff
                self.tail = drop.prev
                drop.remove()
                del self.store[drop.stuff]

            return False
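
# `Loader` is the database-independent core of the mass loader. Rather
# than issuing INSERT statements as statements arrive, `addN` appends
# rows to per-table delimited files; the backend subclasses then hand
# those files to the database's native bulk-load machinery. The
# `triples` and `lexical` tables defined below are temporary,
# denormalized staging tables from which the final partitioned tables
# are populated.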
class Loader(SQL):
    # Thought: just have one table corresponding to each ending table (for
    # both tables below)
    TRIPLES_SQL_TEMPLATE = '''
    create table triples (
        subject bigint unsigned,
        subject_term enum('U','B','F','V'),
        predicate bigint unsigned,
        predicate_term enum('U','V'),
        object bigint unsigned,
        object_term enum('U','B','F','V','L'),
        data_type bigint unsigned default NULL,
        language varchar(3) default NULL,
        context bigint unsigned,
        context_term enum('U','B','F')
    )'''

    LEXICAL_SQL_TEMPLATE = '''
    create table lexical (
        id bigint unsigned,
        term_type enum('U','B','F','V','L'),
        lexical text
    )'''

    loadStatement = """LOAD DATA LOCAL INFILE '%s' IGNORE INTO TABLE
        %s FIELDS TERMINATED BY '|' ENCLOSED BY '"'
        ESCAPED BY '\\\\'"""
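
    # For illustration, `makeLoadStatement` renders the template above
    # into something like the following (`foo` is a hypothetical table):
    #
    #   LOAD DATA LOCAL INFILE 'delimited_dumps/foo.csv.mysql' IGNORE
    #   INTO TABLE foo FIELDS TERMINATED BY '|' ENCLOSED BY '"'
    #   ESCAPED BY '\\'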

    def __init__(self, delimited_directory='delimited_dumps',
                 reuseExistingFiles=False):
        self.init_storage(delimited_directory, reuseExistingFiles)

        for table in self.tables:
            table.delimited_file = open(
                self.delimited_filename(table.get_name()), self.mode)

        self.recent = RecentSet(100000)
        self.recent_hits = 0
        self.recent_misses = 0

    def init_storage(self, delimited_directory, reuseExistingFiles):
        self.log_statement('init_storage')
        self.mode = 'a'
        if not reuseExistingFiles:
            self.mode = 'w'
            try:
                os.mkdir(delimited_directory)
            except OSError:
                raise LoadError("Can't create `%s': it may already exist." %
                                delimited_directory)

        self.delimited_directory = delimited_directory

    def delimited_filename(self, name, extension='.csv'):
        return os.path.join(self.delimited_directory, name + extension)

    def open(self, configuration=None, create=False):
        super(Loader, self).open(configuration, create)

    def add(self, triple, context=None, quoted=False):
        (subject, predicate, obj) = triple
        self.addN([(subject, predicate, obj, context)])
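
    # Each incoming quad fans out to several delimited files: one row in
    # the partition table chosen by `get_table`, one `idHash` row per
    # identifier not recently written (the RecentSet suppresses local
    # duplicates; LOAD DATA's IGNORE clause catches the rest), and one
    # `valueHash` row per literal.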
    def addN(self, quads):
        for s, p, o, c in quads:
            assert c is not None, \
                "Context associated with %s %s %s is None!" % (s, p, o)
            qSlots = genQuadSlots([s, p, o, c.identifier],
                                  self.useSignedInts)

            table = self.get_table((s, p, o))
            table.delimited_file.write(make_delimited(
                table.makeRowComponents(qSlots)) + ROW_DELIMITER)

            for row in table.listIdentifiers(qSlots):
                if not self.recent.check((row[2], row[1])):
                    self.idHash.delimited_file.write(make_delimited(
                        row) + ROW_DELIMITER)
                    self.recent_misses += 1
                else:
                    self.recent_hits += 1

            for row in table.listLiterals(qSlots):
                self.valueHash.delimited_file.write(
                    make_delimited(row) + ROW_DELIMITER)

    def dumpRDF(self, suffix):
        for table in self.tables:
            table.delimited_file.close()
        print 'Recent hits: %s' % self.recent_hits
        print 'Recent misses: %s' % self.recent_misses

    def makeLoadStatement(self, fileName, tableName):
        return self.loadStatement % (fileName, tableName)

    def init_workspace(self):
        pass

    def close_workspace(self):
        pass

    def load_temporary_tables(self, cursor):
        self.log_statement('load_temporary_tables')
        if self.triplesFileName is None or self.lexicalFileName is None:
            return

        if self.triplesFile and not self.triplesFile.closed:
            self.triplesFile.close()

        cursor.execute(self.TRIPLES_SQL_TEMPLATE)
        cursor.execute(self.makeLoadStatement(self.triplesFileName, 'triples'))

        if self.lexicalFile and not self.lexicalFile.closed:
            self.lexicalFile.close()

        cursor.execute(self.LEXICAL_SQL_TEMPLATE)
        cursor.execute(self.makeLoadStatement(self.lexicalFileName, 'lexical'))

    def indexTriplesTable(self, cursor, columns=()):
        self.log_statement('indexTriplesTable')
        for column in columns:
            cursor.execute('CREATE INDEX triples_%s ON triples (%s)' %
                           ((column,) * 2))
        cursor.execute('CREATE INDEX triples_all ON triples (%s)' %
                       (', '.join(columns),))

    def indexLexicalTable(self, cursor):
        self.log_statement('indexLexicalTable')
        cursor.execute('CREATE INDEX lexical_id ON lexical (id)')
        cursor.execute('CREATE INDEX lexical_term_type ON lexical (term_type)')
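
    # The load* methods below populate the final partitioned tables from
    # the temporary `triples` and `lexical` staging tables. For example,
    # loadAssociativeBox executes roughly the following (with the actual
    # table name and the hashed integer id of rdf:type interpolated):
    #
    #   insert into <aboxAssertions>
    #   select distinct subject as member, ...
    #   from triples
    #   where predicate = <id of rdf:type> and predicate_term = 'U'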
    def loadAssociativeBox(self, indexFirst=False):
        self.log_statement('loadAssociativeBox')
        cursor = self._db.cursor()
        cursor.execute("""
            insert into %s
            select distinct subject as member, subject_term as member_term,
                   object as class, object_term as class_term,
                   context, context_term
            from triples where predicate = %s and predicate_term = 'U'""" %
            (self.aboxAssertions,
             str(normalizeNode(RDF.type, self.useSignedInts))))

    def loadLiteralProperties(self, indexFirst=False):
        self.log_statement('loadLiteralProperties')
        cursor = self._db.cursor()
        cursor.execute("""
            insert into %s
            select distinct subject, subject_term, predicate, predicate_term,
                   object, context, context_term, data_type, language
            from triples where object_term = 'L'
            """ % (self.literalProperties,))

    def loadRelations(self, indexFirst=False):
        self.log_statement('loadRelations')
        cursor = self._db.cursor()
        cursor.execute("""
            insert into %s
            select distinct subject, subject_term, predicate, predicate_term,
                   object, object_term, context, context_term
            from triples where predicate != %s and object_term != 'L'""" %
            (self.binaryRelations,
             str(normalizeNode(RDF.type, self.useSignedInts))))

    def loadLiterals(self, indexFirst=False):
        self.log_statement('loadLiterals')
        cursor = self._db.cursor()
        cursor.execute("""
            insert into %s select distinct id, lexical from lexical where
            term_type = 'L'""" % (self.valueHash,))

    def loadIdentifiers(self, indexFirst=False):
        self.log_statement('loadIdentifiers')
        cursor = self._db.cursor()
        cursor.execute("""
            insert into %s select distinct id, term_type,
            lexical from lexical where
            term_type != 'L'""" % (self.idHash,))

    def remove(self, triple_pattern, context):
        pass

    # Transactional interfaces
    def commit(self):
        """ """
        pass

    def rollback(self):
        """ """
        pass

    def bind(self, prefix, namespace):
        """ """
        pass

    def prefix(self, namespace):
        """ """
        pass

    def namespace(self, prefix):
        """ """
        pass

    def namespaces(self):
        pass
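
# `MySQLLoader` loads each partition table directly from its own
# delimited file with LOAD DATA LOCAL INFILE, wrapping each load in
# ALTER TABLE ... DISABLE/ENABLE KEYS so that MyISAM rebuilds its
# indices in bulk afterwards instead of maintaining them row by row.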
class MySQLLoader(Loader, MySQL):
    def __init__(
            self, identifier=None, configuration=None,
            delimited_directory='delimited_dumps',
            reuseExistingFiles=False):
        MySQL.__init__(self, identifier, configuration, debug=True,
                       engine="ENGINE=MyISAM")
        Loader.__init__(self, delimited_directory, reuseExistingFiles)

    def delimited_filename(self, name):
        return Loader.delimited_filename(self, name, '.csv.mysql')

    def init_workspace(self):
        self.applyIndices()

    def _loadTable(self, table, indexFirst=False):
        cursor = self._db.cursor()

        sql = 'ALTER TABLE %s DISABLE KEYS' % (table.get_name(),)
        self.log_statement(sql)
        cursor.execute(sql)

        sql = self.makeLoadStatement(
            self.delimited_filename(table.get_name()), table.get_name())
        self.log_statement(sql)
        cursor.execute(sql)

        sql = 'ALTER TABLE %s ENABLE KEYS' % (table.get_name(),)
        self.log_statement(sql)
        cursor.execute(sql)
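
    # Note: each load* override below returns immediately after
    # _loadTable; the direct-SQL implementations that follow the early
    # `return` statements are intentionally unreachable and appear to be
    # kept only for reference.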
    def loadAssociativeBox(self, indexFirst=False):
        self._loadTable(self.aboxAssertions)
        return

        cursor = self._db.cursor()
        sql = """
        insert into %s
        select subject as member, subject_term as member_term,
               object as class, object_term as class_term,
               context, context_term
        from triples
        where predicate = %s and predicate_term = 'U'
        group by subject, object, context
        """ % (self.aboxAssertions,
               str(normalizeNode(RDF.type, self.useSignedInts)))
        log.debug(sql)
        cursor.execute(sql)

    def loadLiteralProperties(self, indexFirst=False):
        self._loadTable(self.literalProperties)
        return

        cursor = self._db.cursor()
        sql = """
        insert into %s
        select subject, subject_term, predicate, predicate_term,
               object, context, context_term, data_type, language
        from triples
        where object_term = 'L'
        group by subject, predicate, object, context
        """ % (self.literalProperties,)
        log.debug(sql)
        cursor.execute(sql)

    def loadRelations(self, indexFirst=False):
        self._loadTable(self.binaryRelations)
        return

        cursor = self._db.cursor()
        sql = """
        insert into %s
        select subject, subject_term, predicate, predicate_term,
               object, object_term, context, context_term
        from triples
        where predicate != %s and object_term != 'L'
        group by subject, predicate, object, context
        """ % (self.binaryRelations,
               str(normalizeNode(RDF.type, self.useSignedInts)))
        log.debug(sql)
        cursor.execute(sql)

    def loadLiterals(self, indexFirst=False):
        self._loadTable(self.valueHash)
        return

        cursor = self._db.cursor()
        sql = """
        insert into %s select id, lexical from lexical
        where term_type = 'L' group by id
        """ % (self.valueHash,)
        log.debug(sql)
        cursor.execute(sql)

    def loadIdentifiers(self, indexFirst=False):
        self._loadTable(self.idHash)
        return

        cursor = self._db.cursor()
        sql = """
        insert into %s select id, term_type, lexical from lexical
        where term_type != 'L' group by id
        """ % (self.idHash,)
        log.debug(sql)
        cursor.execute(sql)
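
# `PostgreSQLLoader` takes a different route from the MySQL loader: it
# writes all statements to a single denormalized `triples` dump plus a
# `lexical` dump, bulk loads both with COPY ... FROM STDIN, indexes
# them, and only then populates the final partitioned tables (using the
# base Loader's load* methods, which it does not override).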
class PostgreSQLLoader(Loader, PostgreSQL):

    TRIPLES_SQL_TEMPLATE = '''
    create table triples (
        subject bigint,
        subject_term char,
        predicate bigint,
        predicate_term char,
        object bigint,
        object_term char,
        data_type bigint default NULL,
        language varchar(3) default NULL,
        context bigint,
        context_term char
    )'''

    LEXICAL_SQL_TEMPLATE = '''
    create table lexical (
        id bigint,
        term_type char,
        lexical text
    )'''

    loadStatement = """COPY %s FROM STDIN WITH DELIMITER '|'
        NULL AS 'NULL' CSV ESCAPE E'\\\\'"""

    def __init__(
            self, identifier=None, configuration=None,
            delimited_directory='delimited_dumps',
            reuseExistingFiles=False):
        PostgreSQL.__init__(self, identifier, configuration, debug=True)
        self.init_storage(delimited_directory, reuseExistingFiles)

        self.triplesFileName = self.delimited_filename('triples')
        self.triplesFile = open(self.triplesFileName, self.mode)
        self.lexicalFileName = self.delimited_filename('lexical')
        self.lexicalFile = open(self.lexicalFileName, self.mode)

        self.recent = RecentSet(100000)
        self.recent_hits = 0
        self.recent_misses = 0

    def delimited_filename(self, name):
        return Loader.delimited_filename(self, name, '.csv.postgresql')

    def makeLoadStatement(self, fileName, tableName):
        return self.loadStatement % (tableName,)
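
    # The `import pg` below binds the classic PyGreSQL module as a class
    # attribute, which is why the method reaches it as `self.pg` (a bare
    # `pg` would not be visible from inside the method body). PyGreSQL's
    # low-level connection exposes the COPY protocol directly: `query`
    # issues the COPY statement, `putline` streams the rows, and
    # `endcopy` (after the '\.' terminator) completes the transfer.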
    try:
        import pg

        def _copy_from_file(self, table, filename):
            f = open(filename, 'r')
            conn = self.pg.connect(
                dbname=self.config['db'], host=self.config['host'],
                port=int(self.config['port']), user=self.config['user'],
                passwd=self.config['password'])
            conn.query(self.makeLoadStatement(filename, table))
            for _line in f:
                conn.putline(_line)
            conn.putline('\\.\n')
            conn.endcopy()
            f.close()

    except ImportError:
        def _copy_from_file(self, table, filename):
            raise NotImplementedError(
                'We need the PyGreSQL module to bulk load into '
                'PostgreSQL databases.')

    def load_temporary_tables(self, cursor):
        self.log_statement('load_temporary_tables')
        if self.triplesFileName is None or self.lexicalFileName is None:
            return

        if self.triplesFile and not self.triplesFile.closed:
            self.triplesFile.close()

        cursor.execute(self.TRIPLES_SQL_TEMPLATE)
        cursor.execute('COMMIT')
        self._copy_from_file('triples', self.triplesFileName)

        if self.lexicalFile and not self.lexicalFile.closed:
            self.lexicalFile.close()

        cursor.execute(self.LEXICAL_SQL_TEMPLATE)
        cursor.execute('COMMIT')
        self._copy_from_file('lexical', self.lexicalFileName)

    def init_workspace(self):
        cursor = self._db.cursor()
        self.load_temporary_tables(cursor)
        self.indexTriplesTable(cursor, ['subject', 'predicate', 'object',
                                        'context'])
        self.indexLexicalTable(cursor)

    def close_workspace(self):
        cursor = self._db.cursor()
        cursor.execute('DROP TABLE triples')
        cursor.execute('DROP TABLE lexical')
        self.applyIndices()
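
    # The addN override below writes one denormalized `triples` row per
    # quad: an (id, term type) pair for each of the four slots, with the
    # object slot additionally carrying its datatype id and language tag
    # (matching the 10-column TRIPLES_SQL_TEMPLATE above). Lexical forms
    # go to the `lexical` dump, deduplicated through the RecentSet.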
    def addN(self, quads):
        for s, p, o, c in quads:
            assert c is not None, \
                "Context associated with %s %s %s is None!" % (s, p, o)
            qSlots = genQuadSlots([s, p, o, c.identifier],
                                  self.useSignedInts)

            # print 'Writing data...', qSlots
            # Add to the denormalized delimited file:
            beginning = True
            for item in qSlots:
                parts = [item.md5Int, item.termType]
                if not self.recent.check((item.termType, item.term)):
                    self.lexicalFile.write(make_delimited(
                        parts + [item.normalizeTerm()]) + ROW_DELIMITER)
                    self.recent_misses += 1
                else:
                    self.recent_hits += 1

                if item.position == OBJECT:
                    dqs = item.getDatatypeQuadSlot()
                    if dqs is not None:
                        parts.append(dqs.md5Int)
                        if not self.recent.check((dqs.termType, dqs.term)):
                            self.lexicalFile.write(make_delimited(
                                [dqs.md5Int, 'U', dqs.normalizeTerm()]) +
                                ROW_DELIMITER)
                            self.recent_misses += 1
                        else:
                            self.recent_hits += 1
                    else:
                        parts.append(None)

                    if item.termType == 'L':
                        parts.append(item.term.language)
                    else:
                        parts.append(None)

                if not beginning:
                    self.triplesFile.write(COL_DELIMITER)
                beginning = False
                self.triplesFile.write(make_delimited(parts))
            self.triplesFile.write(ROW_DELIMITER)

    def dumpRDF(self, suffix):
        self.triplesFile.close()
        self.lexicalFile.close()
        print 'Recent hits: %s' % self.recent_hits
        print 'Recent misses: %s' % self.recent_misses
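
# `timing` is a benchmark harness comparing index-before-load against
# index-after-load for each backend; the actual load calls are currently
# commented out (see the @@FIXME markers), so as written it only measures
# table creation and index application.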
def timing(config, tableid, dumpfile):
    plugins = ['MySQL', 'PostgreSQL']
    index = 0
    runs = {}
    for plugin in plugins:
        # @@FIXME: incorrect code
        # if options.delete:
        #     store = rdflib.plugin.get(plugin, Store)(tableid)
        #     store.open(config)
        #     cursor = store._db.cursor()
        #     #cursor.execute('DROP TABLE denormalized')
        #     store.destroy(config)
        #     store.close()

        start = datetime.datetime.now()
        print start
        store = rdflib.plugin.get(plugin, Store)(tableid)

        runid = 'run' + str(index)
        runs[runid] = [plugin, 'no denormalized indices',
                       'index store before load']
        store.create(config, False)
        store.applyIndices()
        store.applyForeignKeys()

        cursor = store._db.cursor()

        # @@FIXME: unused (incorrect) code ...
        # hashFieldType = store.hashFieldType
        # load_temporary_tables(cursor)

        # loadLiterals(store)
        # loadIdentifiers(store)
        # loadAssociativeBox(store)
        # loadLiteralProperties(store)
        # loadRelations(store)

        stop = datetime.datetime.now()
        print stop
        runs[runid].append(str(stop - start))
        print '\t'.join(runs[runid])

        cursor.execute('DROP TABLE triples')
        cursor.execute('DROP TABLE lexical')
        store.destroy(config)
        index = index + 1

        #start = time.clock()
        start = datetime.datetime.now()
        store = rdflib.plugin.get(plugin, Store)(tableid)

        runid = 'run' + str(index)
        runs[runid] = [plugin, 'no denormalized indices',
                       'index store after load']
        store.create(config, False)

        cursor = store._db.cursor()

        # @@FIXME: unused (incorrect) code...
        # hashFieldType = store.hashFieldType
        # load_temporary_tables(cursor)
        # loadLiterals(store)
        # loadIdentifiers(store)
        # loadAssociativeBox(store)
        # loadLiteralProperties(store)
        # loadRelations(store)

        store.applyIndices()
        store.applyForeignKeys()

        stop = datetime.datetime.now()
        runs[runid].append(str(stop - start))
        print '\t'.join(runs[runid])

        cursor.execute('DROP TABLE triples')
        cursor.execute('DROP TABLE lexical')
        store.destroy(config)
        index = index + 1

PLUGIN_MAP = {
    'MySQL': MySQLLoader,
    'PostgreSQL': PostgreSQLLoader,
}
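
# A hypothetical invocation, for illustration (the exact connection
# string format is whatever the underlying rdfextras SQL stores accept;
# comma-separated key=value pairs are assumed here):
#
#   python MySQLMassLoader.py \
#       -c 'user=rdf,password=secret,db=rdfstore,host=localhost' \
#       -i mystore --nt dump.nt MySQL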
def main():
    from optparse import OptionParser
    usage = '''usage: %prog [options] <DB Type>'''
    op = OptionParser(usage=usage)
    op.add_option('-c', '--connection', help='Database connection string')
    op.add_option('-i', '--id', help='Database table set identifier')
    op.add_option('--delimited',
        help='Directory in which to store delimited files')
    op.add_option('-r', '--reuse', action='store_true',
        help='Reuse existing delimited files instead of creating new ones')

    op.add_option('-d', '--delete', action='store_true',
        help='Delete old repository before starting')

    op.add_option('--name', dest='graphName',
        help='The name of the graph to parse the RDF serialization(s) into')

    op.add_option('-x', '--xml', action='append',
        help='Append to the list of RDF/XML documents to parse')
    op.add_option('-t', '--trix', action='append',
        help='Append to the list of TriX documents to parse')
    op.add_option('-n', '--n3', action='append',
        help='Append to the list of N3 documents to parse')
    op.add_option('--nt', action='append',
        help='Append to the list of NT documents to parse')
    op.add_option('-a', '--rdfa', action='append',
        help='Append to the list of RDFa documents to parse')

    op.add_option('--directory',
        help='Load all children of this directory into the repository')
    op.add_option('--format', type='choice',
        choices=['xml', 'n3', 'nt', 'rdfa', 'trix'],
        help='Format of files found when using `--directory`')

    op.add_option('--catalog',
        help='Catalog to use to resolve local URIs to identify graphs')

    op.add_option('--timing', action='store_true',
        help='Run timing tests')

    op.set_defaults(connection=None, delimited='delimited_dumps',
                    reuse=False, id=None,
                    xml=[], trix=[], n3=[], nt=[], rdfa=[],
                    graphName=BNode())
    (options, args) = op.parse_args()

    if options.delimited is not None:
        options.delimited = os.path.abspath(options.delimited)

    if not options.id:
        op.error('You need to provide a table set identifier')

    try:
        store = PLUGIN_MAP[args[0]](identifier=options.id,
                                    configuration=options.connection,
                                    delimited_directory=options.delimited,
                                    reuseExistingFiles=options.reuse)
    except (IndexError, KeyError):
        op.error('You need to provide a database type (MySQL or PostgreSQL).')

    store.open()

    factGraph = ConjunctiveGraph(store, identifier=options.graphName)
    if not options.reuse:
        if options.directory:
            if not options.format:
                op.error('You need to provide the format with `--format`\n'
                         'when loading from a directory')
            dirparts = os.walk(options.directory).next()
            for entry in dirparts[2]:
                graphRef = os.path.join(dirparts[0], entry)
                factGraph.parse(graphRef, publicID=Uri.OsPathToUri(graphRef),
                                format=options.format)

        for graphRef in options.xml:
            factGraph.parse(graphRef, publicID=Uri.OsPathToUri(graphRef),
                            format='xml')
        for graphRef in options.trix:
            factGraph.parse(graphRef, publicID=Uri.OsPathToUri(graphRef),
                            format='trix')
        for graphRef in options.n3:
            factGraph.parse(graphRef, publicID=Uri.OsPathToUri(graphRef),
                            format='n3')
        for graphRef in options.nt:
            factGraph.parse(graphRef, publicID=Uri.OsPathToUri(graphRef),
                            format='nt')
        for graphRef in options.rdfa:
            factGraph.parse(graphRef, publicID=Uri.OsPathToUri(graphRef),
                            format='rdfa')
        store.dumpRDF('solo')
        store.close()

    if options.connection:
        if options.timing:
            # @@FIXME: incorrect code
            # timing(config, tableid, dumpfile)
            pass
        else:
            if options.delete:
                log.debug(store.open())
                # @@FIXME: incorrect code
                # cursor = store._db.cursor()
                store.destroy()
                store.close()

            store.create(populate=False)
            store.init_workspace()

            store.loadLiterals()
            store.loadIdentifiers()
            store.loadAssociativeBox()
            store.loadLiteralProperties()
            store.loadRelations()

            store.close_workspace()
            #store.applyForeignKeys()

if __name__ == '__main__':
    main()