PageRenderTime 611ms CodeModel.GetById 161ms app.highlight 213ms RepoModel.GetById 230ms app.codeStats 0ms

/python/engine/PinYin/PYSQLiteDB.py

http://scim-python.googlecode.com/
Python | 543 lines | 515 code | 3 blank | 25 comment | 3 complexity | 4a373450ec2c75aed18ac5b82be67dc3 MD5 | raw file
  1# -*- coding: utf-8 -*-
  2# vim: set noet ts=4:
  3#
  4# scim-python
  5#
  6# Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
  7#
  8#
  9# This library is free software; you can redistribute it and/or
 10# modify it under the terms of the GNU Lesser General Public
 11# License as published by the Free Software Foundation; either
 12# version 2 of the License, or (at your option) any later version.
 13#
 14# This library is distributed in the hope that it will be useful,
 15# but WITHOUT ANY WARRANTY; without even the implied warranty of
 16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 17# GNU Lesser General Public License for more details.
 18#
 19# You should have received a copy of the GNU Lesser General Public
 20# License along with this program; if not, write to the
 21# Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 22# Boston, MA  02111-1307  USA
 23#
 24# $Id: $
 25#
 26import os
 27import os.path as path
 28import time
 29import sys
 30import sqlite3 as sqlite
 31import re
 32import uuid
 33import traceback
 34import PYUtil
 35import PYDict
 36import PYParser
 37
 38(YLEN, Y0, Y1, Y2, Y3, YX, S0, S1, S2, S3, PHRASE, FREQ, USER_FREQ) = range (0, 13)
 39
 40FLUSH_PERIOD = 60 * 5 # 5 minute
 41
 42class PYSQLiteDB:
 43	"""phrase database that contains all phrases and phrases' pinyin"""
 44	def __init__ (self, name = "py.db", user_db = None, filename = None):
 45	
 46		# init last flush time
 47		self._last_flush_time = None
 48	
 49		if filename:
 50			self.db = sqlite.connect (filename)
 51			self.parser = PYParser.PinYinParser ()
 52			return
 53
 54		name = os.path.join (os.path.dirname (__file__), name)
 55
 56		# open system phrase database
 57		self.db = sqlite.connect (name)
 58
 59		self.parser = PYParser.PinYinParser ()
 60		# self.db.execute ("PRAGMA locking_mode = EXCLUSIVE;")
 61		self.db.execute ("PRAGMA synchronous = NORMAL;")
 62		self.db.execute ("PRAGMA temp_store = MEMORY;")
 63		
 64		if user_db != None:
 65			home_path = os.getenv ("HOME")
 66			pinyin_path = path.join (home_path, ".scim", "scim-python", "pinyin")
 67			user_db = path.join (pinyin_path, user_db)
 68			if not path.isdir (pinyin_path):
 69				os.makedirs (pinyin_path)
 70
 71			try:
 72				desc = get_database_desc (user_db)
 73				if desc == None or desc["id"] != "0.1":
 74					new_name = "%s.%d" %(user_db, os.getpid())
 75					print >> sys.stderr, "Can not support the user db. We will rename it to %s" % new_name
 76					os.rename (user_db, new_name)
 77			except:
 78				pass
 79		else:
 80			user_db = ":memory:"
 81		
 82		
 83		# open user phrase database
 84		try:
 85			self.db.execute ("ATTACH DATABASE \"%s\" AS user_db;" % user_db)
 86		except:
 87			print >> sys.stderr, "The user database was damaged. We will recreate it!"
 88			os.rename (user_db, "%s.%d" % (user_db, os.getpid ()))
 89			self.db.execute ("ATTACH DATABASE \"%s\" AS user_db;" % user_db)
 90
 91		
 92		# try create all tables in user database
 93		self.create_tables ("user_db")
 94		self.create_indexes ("user_db")
 95		self.generate_userdb_desc ()
 96
 97		self.select_cache = Cache ()
 98
 99	def create_tables (self, database = "main"):
100		"""create all phrases tables that contain all phrases"""
101		
102		try:
103			self.db.executescript ("PRAGMA default_cache_size = 5000;")
104			self.flush (True)
105		except:
106			pass
107
108		# create pinyin table
109		sqlstring = "CREATE TABLE IF NOT EXISTS %s.py_pinyin (pinyin TEXT PREMARY KEY);" % database
110		self.db.execute (sqlstring)
111		
112		# create pinyin table
113		sqlstring = "CREATE TABLE IF NOT EXISTS %s.py_shengmu (shengmu TEXT PREMARY KEY);" % database
114		self.db.execute (sqlstring)
115
116		# create phrase table
117		sqlstring = """CREATE TABLE IF NOT EXISTS %(database)s.py_phrase (
118							ylen INTEGER, 
119							y0 INTEGER, y1 INTEGER, y2 INTEGER, y3 INTEGER, yx TEXT,
120							s0 INTEGER, s1 INTEGER, s2 INTEGER, s3 INTEGER,
121							phrase TEXT,
122							freq INTEGER, user_freq INTEGER);"""
123		
124		self.db.executescript (sqlstring % { "database":database })
125		self.flush (True)
126
127	def generate_userdb_desc (self):
128		try:
129			sqlstring = "CREATE TABLE user_db.desc (name PRIMARY KEY, value);"
130			self.db.executescript (sqlstring)
131			sqlstring = "INSERT INTO user_db.desc  VALUES (?, ?);"
132			self.db.execute (sqlstring, ("version", "0.1"))
133			self.db.execute (sqlstring, ("id", str (uuid.uuid4 ())))
134			sqlstring = "INSERT INTO user_db.desc  VALUES (?, DATETIME(\"now\", \"localtime\"));"
135			self.db.execute (sqlstring, ("create-time", ))
136			self.flush (True)
137		except:
138			print "desc table has been created."
139
140	def create_indexes (self, database = "main"):
141		# create indexes
142		sqlstring = """
143				CREATE INDEX IF NOT EXISTS %(database)s.py_phrase_index_1 ON
144					py_phrase (y0, y1, y2, y3);
145				CREATE INDEX IF NOT EXISTS %(database)s.py_phrase_index_2 ON
146					py_phrase (ylen, y0, y1, y2, y3);
147				CREATE INDEX IF NOT EXISTS %(database)s.py_phrase_index_3 ON
148					py_phrase (ylen, s0, s1, s2, s3);
149				CREATE INDEX IF NOT EXISTS %(database)s.py_phrase_index_4 ON
150					py_phrase (s0, s1, s2, s2, s3);
151				CREATE INDEX IF NOT EXISTS %(database)s.py_phrase_index_5 ON
152					py_phrase (phrase);
153				"""
154		self.db.executescript (sqlstring % { "database" : database })
155		self.flush (True)
156
157	def optimize_database (self):
158		sqlstring = """
159				CREATE TABLE tmp AS SELECT * FROM py_phrase;
160				DELETE FROM py_phrase;
161				INSERT INTO py_phrase SELECT * FROM tmp ORDER BY ylen, y0, y1, y2, y3, yx, phrase;
162				DROP TABLE tmp;
163				"""
164		self.db.executescript (sqlstring)
165		self.db.executescript ("VACUUM;")
166
167	def init_pinyin_table (self):
168		"""create table pinyin that contains all pinyin"""
169		sqlstring = "INSERT INTO py_pinyin (pinyin) VALUES (?)"
170		for py in PYDict.PINYIN_DICT.keys ():
171			self.db.execute (sqlstring, (unicode (py),))
172		self.flush (True)
173	
174	def init_shengmu_table (self):
175		"""create table shengmu that contains all shengmu of pinyin"""
176		sqlstring = "INSERT INTO py_shengmu (shengmu) VALUES (?)"
177		for shengmu in PYDict.SHENGMU_DICT.keys ():
178			self.db.execute (sqlstring, (unicode (shengmu),))
179		self.flush (True)
180
181	def add_phrases (self, phrases, database = "main"):
182		""" add phrases to database, phrases is a iterable object
183		Like: [(phrase, pinyin, freq), (phrase, pinyin, freq), ...]
184		"""
185		sqlstring = """INSERT INTO %s.py_phrase (ylen, y0, y1, y2, y3, yx, s0, s1, s2, s3, phrase, freq, user_freq)
186			VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
187		line = 1
188		for  phrase, pinyin, freq in phrases:
189			try:
190				py = self.parser.parse (pinyin)
191				if len (py) != len (phrase):
192					error_message = u"%s %s: Can not parse pinyin." % (phrase, pinyin)
193					raise Exception (error_message.encode ("utf8"))
194				record = [None, None, None, None, None, None, None, None, None, None, None, None, None]
195				record [YLEN] = len (py)
196				i = 0
197				for p in py[:4]:
198					record[Y0 + i] = p.get_pinyin_id ()
199					record[S0 + i] = p.get_sheng_mu_id ()
200					i += 1
201				if len (py) > 4:
202					record[YX] = "'".join (map (str, py[4:]))
203				record[PHRASE] = phrase
204				record[FREQ] = freq
205				record[USER_FREQ] = None
206				self.db.execute (sqlstring % database, record)
207			except Exception, e:
208				print line, ":", phrase.encode ("utf-8"), pinyin, freq
209				import traceback
210				traceback.print_exc ()
211				# print e
212			line += 1
213			
214		self.flush (True)
215
216	def get_pinyin_table (self):
217		"""get pinyin table"""
218		try:
219			return self._pinyin_table
220		except:
221			pass
222
223		sql = "SELECT phrase, y0, s0 FROM py_phrase WHERE ylen = 1"
224		
225		pinyin_table = {}
226		result = self.db.execute (sql % i)
227		for phrase, y0, s0 in result:
228			if phrase not in pinyin_table:
229				pinyin_table [phrase] = []
230			pinyin_table [phrase].append ((y0, s0))
231		self._pinyin_table = pinyin_table
232		
233		return pinyin_table
234
235	def select_words_by_pinyin_list (self, pys, mohu = [False] * 8):
236		"""select words from database by list that contains PYUtil.PinYinWord objects"""
237		
238		pinyin_string = u"'".join (map (str, pys))
239		result = self.select_cache [pinyin_string]
240		if result != None:
241			return result
242		
243		tables_union = """( SELECT * FROM main.py_phrase WHERE %(conditions)s UNION ALL
244		SELECT * FROM user_db.py_phrase WHERE %(conditions)s )"""
245		
246		if mohu[0]:
247			sql_conditions = ["+ylen = %d" % len (pys) ]
248		else:
249			sql_conditions = ["ylen = %d" % len (pys) ]
250			
251
252		i = 0
253		if mohu[0] == False:
254			for py in pys[:4]:
255				if py.is_valid_pinyin ():
256					sql_conditions.append ("y%d = %d" % (i, py.get_pinyin_id ()))
257				else:
258					sql_conditions.append ("s%d = %d" % (i, py.get_sheng_mu_id ()))
259				i += 1
260		elif mohu[0] == True:
261			for py in pys[:4]:
262				if py.is_valid_pinyin ():
263					shengmu = py.get_shengmu ()
264					yunmu = py.get_pinyin ()[len (shengmu):]
265					shengmu_list = [shengmu]
266					yunmu_list = [yunmu]
267
268					if mohu[1] and (shengmu == "s" or shengmu == "sh"):
269						shengmu_list = PYDict.MOHU_SHENGMU["s"]
270					if mohu[2] and (shengmu == "c" or shengmu == "ch"):
271						shengmu_list = PYDict.MOHU_SHENGMU["c"]
272					if mohu[3] and (shengmu == "z" or shengmu == "zh"):
273						shengmu_list = PYDict.MOHU_SHENGMU["z"]
274					if mohu[4] and (shengmu == "l" or shengmu == "n"):
275						shengmu_list = PYDict.MOHU_SHENGMU["l"]
276
277					if mohu[5] and (yunmu == "in" or yunmu == "ing"):
278						yunmu_list = PYDict.MOHU_YUNMU["in"]
279					if mohu[6] and (yunmu == "en" or yunmu == "eng"):
280						yunmu_list = PYDict.MOHU_YUNMU["en"]
281					if mohu[7] and (yunmu == "an" or yunmu == "ang"):
282						yunmu_list = PYDict.MOHU_YUNMU["an"]
283						
284					pinyin_ids = []
285					for s in shengmu_list:
286						for y in yunmu_list:
287							pinyin = s + y
288							if pinyin in PYDict.PINYIN_DICT:
289								pinyin_ids.append (str (PYDict.PINYIN_DICT[pinyin]))
290					if len (pinyin_ids) > 1:
291						sql_conditions.append ("y%d in (%s)" % (i, ",".join (pinyin_ids)))
292					else:
293						sql_conditions.append ("y%d == %s" % (i, pinyin_ids[0]))
294						
295				else:
296					shengmu = py.get_shengmu ()
297					if shengmu in PYDict.MOHU_SHENGMU:
298						shengmu_ids = []
299						for s in PYDict.MOHU_SHENGMU[shengmu]:
300							shengmu_ids.append (str (PYDict.SHENGMU_DICT[s]))
301						sql_conditions.append ("s%d in (%s)" % (i, ",".join (shengmu_ids)))
302					else:
303						sql_conditions.append ("s%d = %d" % (i, py.get_sheng_mu_id ()))
304				i += 1
305		
306		if pys[4:]:
307			pp = lambda (x): x.get_pattern (mohu[0])
308			pattern = "'".join (map (pp, pys[4:]))
309			sql_conditions.append ("yx LIKE \"" + pattern + "\"")
310		
311
312		tables_union = tables_union % { "conditions" : " AND ".join (sql_conditions) }
313		sql_prefix = "SELECT * FROM " + tables_union
314
315		sql_string = sql_prefix + " GROUP BY phrase ORDER BY user_freq DESC, freq DESC;"
316
317		result = list (self.db.execute (sql_string).fetchall ())
318		
319		self.select_cache [pinyin_string] = result
320
321		return result
322	
323	def select_words_by_pinyin_string (self, string, mohu = [False] * 8):
324		"""select words from the database by pinyin string"""
325
326		pys = self.parser.parse (string)
327		result = self.select_words_by_pinyin_list (pys, mohu)
328		return result
329
330	def commit_phrases (self, records):
331		"""this function adjusts frequence of phrase in user database."""
332
333		for record in records:
334			if record [USER_FREQ] != None:
335				sql = "UPDATE user_db.py_phrase SET user_freq = user_freq + 1 WHERE ylen = ? AND y0 = ? AND phrase = ?;"
336				self.db.execute (sql, (record[YLEN], record[Y0], record[PHRASE]))
337			else:
338				sql = """INSERT INTO user_db.py_phrase (ylen, y0, y1, y2, y3, yx, s0, s1, s2, s3, phrase, freq, user_freq)
339					VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1);"""
340				self.db.execute (sql, record[:12])
341		self.flush ()
342		self.select_cache.clear ()
343
344	def commit_phrase (self, record):
345		self.commit_phrases ([record])
346
347	def commit_char (self, char, pinyin_id, shengmu_id):
348		sql = "SELECT * FROM main.py_phrase WHERE ylen = 1 AND y0 = ? AND phrase = ?"
349		records = self.db.execute (sql, (pinyin_id, char)).fetchall ()
350		self.commit_phrases (records)
351
352	def new_phrase (self, phrases, freq = None, user_freq = 1):
353		"""this function create a new phrase from a phrase list and put it into user database."""
354		pinyin_ids = []
355		shengmu_ids = []
356		phrase = u""
357		phrase_length = 0
358		for p in phrases:
359			if phrase_length + p[YLEN] > 8:
360				break
361			phrase += p[PHRASE]
362			phrase_length += p[YLEN]
363			if p[YLEN] > 4:
364				ys = p[YX].split ("'")
365			for i in range (0, p[YLEN]):
366				if i < 4:
367					pinyin_ids.append (p[Y0 + i])
368					shengmu_ids.append (p[S0 + i])
369				else:
370					w = PYUtil.PinYinWord (ys[i - 4])
371					pinyin_ids.append (w.get_pinyin_id ())
372					shengmu_ids.append (w.get_sheng_mu_id ())
373		
374		sql = """INSERT INTO user_db.py_phrase
375			(ylen, y0, y1, y2, y3, yx, s0, s1, s2, s3, phrase, freq, user_freq) 
376			VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
377
378		values = [phrase_length, None, None, None, None, None, None, None, None, None, phrase, freq, user_freq]
379
380		if phrase_length <=4:
381			values[1: 1 + phrase_length] = pinyin_ids [:phrase_length]
382			values[6: 6 + phrase_length] = shengmu_ids [:phrase_length]
383		else:
384			values[1: 5] = pinyin_ids [:4]
385			values[6: 10] = shengmu_ids [:4]
386			get_pinyin = lambda id: PYDict.ID_PINYIN_DICT[id]
387			values[5] = "'".join (map (get_pinyin, pinyin_ids[4:]))
388
389		self.db.execute (sql, values)
390
391		self.flush ()
392		self.select_cache.clear ()
393
394	def remove_phrase (self, record):
395		"Remove phrase from user database"
396
397		sql = "DELETE FROM user_db.py_phrase WHERE ylen = ? AND y0 = ? AND phrase = ?"
398
399		self.db.execute (sql, (record[YLEN], record[Y0], record[PHRASE]))
400		
401		self.flush ()
402		self.select_cache.clear ()
403
404	def flush (self, force = False):
405		if self._last_flush_time == None:
406			self._last_flush_time = time.time ()
407		if force or time.time() - self._last_flush_time >= FLUSH_PERIOD:
408			self.db.commit ()
409			self._last_flush_time = time.time ()
410
411	
412class Cache:
413	"""cache object for cache history queries"""
414
415	class Slot:
416		"""Slot item of cache object"""
417		def __init__ (self):
418			self.time = -1
419			self.value = None
420			self.index = None
421	
422	def __init__ (self, max_slot = 1024):
423		self._max_slot = max_slot
424		self.clear ()
425
426	def clear (self):
427		self._slots = []
428		self._dict = {}
429		self._time = 0
430
431	def __delitem__ (self, index):
432		if not self._dict.has_key (index):
433			return None
434		del self._dict [index]
435
436	def __getitem__ (self, index):
437		"""return a vale associate with the giving index. It cache does not has this index,
438		it will retrun None."""
439		if not self._dict.has_key (index):
440			return None
441		
442		slot = self._dict [index]
443		self._time += 1
444		slot.time = self._time
445		return slot.value
446
447	def __setitem__ (self, index, value):
448		if self._dict.has_key (index):
449			slot = self._dict[index]
450		else:
451			slot = self.get_slot ()
452		self._time += 1
453		slot.value = value
454		slot.time = self._time
455		slot.index = index
456		self._dict[index] = slot
457
458	def get_slot (self):
459		"""get_slot will return a empty slot. If there is not any empty slot, 
460		it will find the oldest slot and return it."""
461		if len (self._slots) < self._max_slot:
462			slot = Cache.Slot ()
463			self._slots.append (slot)
464		else:
465			self._slots.sort (lambda x,y : x.time - y.time)		
466			slot = self._slots[0]
467			del self._dict[slot.index]
468		return slot
469
470def get_database_desc (db_file):
471	if not path.exists (db_file):
472		raise Exception ("%s does not exist!" % dbname)
473	try:
474		desc = {}
475		db = sqlite.connect (db_file)
476		for row in db.executescript ("SELECT * FROM desc;"):
477			desc [row[0]] = row[1]
478		return desc
479	except:
480		return None
481
482def test_select_words ():
483	import time
484
485	db = PYSQLiteDB ()
486	while True:
487		l = raw_input ().strip ()
488		if not l:
489			break
490		t = time.time ()
491		res = db.select_words_by_pinyin_string (l)
492		
493		t = time.time () - t
494		
495		i = 0
496		for p in res:
497			print "%s = %s %s " % (i, str (p), p[PHRASE].encode ("utf8"))
498			i += 1
499		print "OK t =", t, " count =", len (res)
500		while True:
501			try:
502				commit = int (raw_input ("commit = ").strip ())
503				db.commit_phrase (res[commit])
504			except KeyboardInterrupt, e:
505				print "Exit"
506				sys.exit (0)
507			except:
508				print "Input is invalidate"
509				continue
510			break
511
512	
513
514def test_case (string):
515	db = PYSQLiteDB ()
516	parser = PYParser.PinYinParser ()
517	pys = parser.parse (string)
518
519	result = u""
520
521	while len (result) != len (pys):
522		pps = pys[len (result):]
523		for x in range (len (pps), 0, -1):
524			candidates = db.select_words_by_pinyin_list (pps[:x])
525			if candidates:
526				result += candidates[0][PHRASE]
527				break
528	print "'".join (map (str, pys))
529	print result
530
531def test ():
532	test_case ("gaodangfangdichankaifashangdedongtianjiuyaolaile")
533	test_case ("huanyingshiyongwokaifadezhinengpinyinshurufa")
534	test_case ("beijingshirenminzhengfujuedingzaitongzhouqujianlizhengfuxingzhengjidi")
535	test_case ("woguojuminshoumingqiwangtigaodaoqishisansui")
536	test_case ("wgjmshmqwtgdqshss")
537	test_case ("xjinyhuiyouyongme")
538	
539if __name__ == "__main__":
540	import timeit
541	t = timeit.Timer ("PYSQLiteDB.test ()","import PYSQLiteDB")
542	print t.repeat (3,1)
543