PageRenderTime 43ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 1ms

/python/engine/XingMa/XMSQLiteDB.py

http://scim-python.googlecode.com/
Python | 914 lines | 872 code | 6 blank | 36 comment | 5 complexity | f7a859d700e8564df830669c483322d7 MD5 | raw file
  1. # -*- coding: utf-8 -*-
  2. # vim: set noet ts=4:
  3. #
  4. # scim-python
  5. #
  6. # Copyright (c) 2008-2008 Yu Yuwei <acevery@gmail.com>
  7. #
  8. #
  9. # This library is free software; you can redistribute it and/or
  10. # modify it under the terms of the GNU Lesser General Public
  11. # License as published by the Free Software Foundation; either
  12. # version 2 of the License, or (at your option) any later version.
  13. #
  14. # This library is distributed in the hope that it will be useful,
  15. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. # GNU Lesser General Public License for more details.
  18. #
  19. # You should have received a copy of the GNU Lesser General Public
  20. # License along with this program; if not, write to the
  21. # Free Software Foundation, Inc., 59 Temple Place, Suite 330,
  22. # Boston, MA 02111-1307 USA
  23. #
  24. # $Id: $
  25. #
  26. import os
  27. import os.path as path
  28. import sqlite3
  29. import XMDict
  30. import uuid
  31. import time
  32. import re
  # Matches one phrase-construction rule: the letter 'c', then 'e' or 'a'
  # (group 1), a single digit (group 2), a ':' and the remaining text (group 3).
  patt_r = re.compile(r'c([ea])(\d):(.*)')
  # Matches one rule component: the letter 'p', an optionally negative digit
  # (group 1) and a second digit (group 2).
  patt_p = re.compile(r'p(-{0,1}\d)(\d)')
  35. # First, define the numeric column indexes we will use :)
  36. #(MLEN, CLEN, M0, M1, M2, M3, M4, PHRASE, FREQ, USER_FREQ) = range (0,10)
  37. class XMSQLiteDB:
  38. '''Phrase database for XingMa'''
  def __init__(self, name = 'xm.db', user_db = None, filename = None ):
      '''Open the phrase database and attach the working databases.

      name     -- path of the system phrase database; used when filename
                  is not given.
      user_db  -- file name of the per-user database, kept below
                  $HOME/.scim/scim-python/xingma. None keeps the user
                  phrases in an in-memory database.
      filename -- when given, only this database is opened: no user_db and
                  no mudb database is attached.
      '''
      # sys is needed for the stderr diagnostics below, but it is not among
      # the module-level imports, so bring it into scope here.
      import sys
      import traceback
      # Parse / Deparse convert between key characters and integer key ids,
      # which keeps the SQL lookups on integer columns.
      self.parse = XMDict.Parse
      self.deparse = XMDict.Deparse
      # The goucima cache has to exist before cache_goucima() may fill it;
      # resetting it afterwards would throw the cached rows away again.
      self._goucima = {}
      if filename:
          self.db = sqlite3.connect( filename )
      else:
          # Read the file through once (presumably to warm the OS file cache).
          # This is done in Python instead of a shell command so that unusual
          # characters in the path can never be interpreted as shell syntax.
          try:
              warmup = open(name, 'rb')
              try:
                  while warmup.read(1 << 20):
                      pass
              finally:
                  warmup.close()
          except (IOError, OSError):
              pass
          # open system phrase db
          self.db = sqlite3.connect( name )
      try:
          self.db.execute( 'PRAGMA page_size = 4096; ' )
          # increase the cache size to speedup sqlite enquiry
          self.db.execute( 'PRAGMA cache_size = 40000; ' )
          self.db.execute( 'PRAGMA temp_store = MEMORY; ' )
          self.db.execute( 'PRAGMA synchronous = OFF; ' )
      except sqlite3.Error:
          print('encountering error when init db')
      # create IME property table
      self.db.executescript( 'CREATE TABLE IF NOT EXISTS main.ime (attr TEXT, val TEXT);' )
      # make sure we have values in ime table.
      if not self.db.execute('SELECT * FROM main.ime;').fetchall():
          ime_keys = {'name':'',
                  'name.zh_cn':'',
                  'name.zh_hk':'',
                  'name.zh_tw':'',
                  'author':'somebody',
                  'uuid':'%s' % uuid.uuid4(),
                  'serial_number':'%s' % time.strftime('%Y%m%d'),
                  'icon':'/usr/share/scim/icons/scim-python.png',
                  'credit':'GPL',
                  'languages':'zh',
                  'valid_input_chars':'abcdefghijklmnopqrstuvwxyz',
                  'max_key_length':'4',
                  # 'commit_keys':'space',
                  # 'forward_keys':'Return',
                  # 'select_keys':'1,2,3,4,5,6,7,8,9,0',
                  # 'page_up_keys':'Page_Up,minus',
                  # 'page_down_keys':'Page_Down,equal',
                  'status_prompt':'CN',
                  'def_full_width_punct':'TRUE',
                  'def_full_width_letter':'FALSE',
                  'user_can_define_phrase':'FALSE',
                  'pinyin_mode':'FALSE',
                  'dynamic_adjust':'FALSE',
                  'no_check_chars':u'',
                  'rules':''}
          #'rules':'ce2:p11+p12+p21+p22;ce3:p11+p21+p22+p31;ca4:p11+p21+p31+p41'}
          # initial attribute values, which should be updated from mabiao
          sqlstr = 'INSERT INTO main.ime (attr,val) VALUES (?,?);'
          for _name, _val in ime_keys.items():
              self.db.execute( sqlstr, (_name, _val) )
      # share variables in this class:
      self._mlen = int ( self.get_ime_property ("max_key_length") )
      # column names of the phrases table, in order
      self._pt_index = ['mlen','clen']
      self._pt_index += ['m%d' % i for i in range(self._mlen)]
      self._pt_index += ['phrase','freq','user_freq']
      flag = self.get_ime_property('user_can_define_phrase')
      if flag:
          self.user_can_define_phrase = (flag.lower() == u'true')
          if self.user_can_define_phrase:
              # cache goucima to memory to speedup gouci
              try:
                  self.cache_goucima()
              except sqlite3.Error:
                  # best effort: get_gcm_id() falls back to querying the table
                  self._goucima = {}
      else:
          print('Could not find "user_can_define_phrase" entry from database, is it a outdated database?')
          self.user_can_define_phrase = False
      flag = self.get_ime_property('dynamic_adjust')
      if flag:
          self.dynamic_adjust = (flag.lower() == u'true')
      else:
          print('Could not find "dynamic_adjust" entry from database, is it a outdated database?')
          self.dynamic_adjust = False
      self._no_check_chars = self.get_no_check_chars()
      self.rules = self.get_rules ()
      if filename:
          return
      # user database:
      if user_db is not None:
          home_path = os.getenv ("HOME") or path.expanduser ("~")
          xingma_path = path.join (home_path, ".scim", "scim-python", "xingma")
          user_db = path.join (xingma_path, user_db)
          if not path.isdir (xingma_path):
              os.makedirs (xingma_path)
          try:
              desc = self.get_database_desc (user_db)
              if desc is None:
                  self.init_user_db (user_db)
              elif desc["version"] != "0.1":
                  # unknown layout: move the old file aside and start afresh
                  new_name = "%s.%d" % (user_db, os.getpid())
                  sys.stderr.write ("Can not support the user db. We will rename it to %s\n" % new_name)
                  os.rename (user_db, new_name)
                  self.init_user_db (user_db)
          except Exception:
              traceback.print_exc()
      else:
          user_db = ":memory:"
      # open user phrase database
      try:
          self.db.execute ('ATTACH DATABASE "%s" AS user_db;' % user_db)
      except sqlite3.Error:
          sys.stderr.write ("The user database was damaged. We will recreate it!\n")
          # only a real file can be moved aside (":memory:" has no file)
          if path.exists (user_db):
              os.rename (user_db, "%s.%d" % (user_db, os.getpid ()))
          self.init_user_db (user_db)
          self.db.execute ('ATTACH DATABASE "%s" AS user_db;' % user_db)
      # try create all tables in user database
      self.create_tables ("user_db")
      self.create_indexes ("user_db")
      self.generate_userdb_desc ()
      # attach mudb for working process
      mudb = ":memory:"
      self.db.execute ('ATTACH DATABASE "%s" AS mudb;' % mudb )
      self.create_tables ("mudb")
  168. def update_phrase (self, entry, database='user_db'):
  169. '''update phrase freqs'''
  170. #print entry
  171. _con = [ entry[-1] ] + list(entry[0:2+entry[0]]) + [entry[-3]]
  172. #print _con
  173. _condition = u''.join( map(lambda x: 'AND m%d = ? ' % x, range(entry[0]) ) )
  174. #print _condition
  175. sqlstr = 'UPDATE %s.phrases SET user_freq = ? WHERE mlen = ? AND clen = ? %s AND phrase = ?;' % (database, _condition)
  176. #print sqlstr
  177. self.db.execute ( sqlstr , _con )
  178. # because we may update different db, we'd better commit every time.
  179. self.db.commit()
  180. def sync_usrdb (self):
  181. # we need to update the user_db
  182. #print 'sync userdb'
  183. mudata = self.db.execute ('SELECT * FROM mudb.phrases;').fetchall()
  184. data_u = filter ( lambda x: x[-2] in [1,-3], mudata)
  185. data_a = filter ( lambda x: x[-2]==2, mudata)
  186. data_n = filter ( lambda x: x[-2]==-2, mudata)
  187. #print data_a
  188. data_a = map (lambda x: (u''.join ( map(self.deparse, x[2:2+x[0]])),x[-3],0,x[-1] ), data_a)
  189. data_n = map (lambda x: (u''.join ( map(self.deparse, x[2:2+x[0]])),x[-3],-1,x[-1] ), data_n)
  190. #print data_u
  191. map (self.update_phrase, data_u)
  192. #print self.db.execute('select * from user_db.phrases;').fetchall()
  193. map (self.u_add_phrase,data_a)
  194. map (self.u_add_phrase,data_n)
  195. def create_tables (self, database):
  196. '''Create tables that contain all phrase'''
  197. try:
  198. self.db.execute( 'PRAGMA cache_size = 20000; ' )
  199. # increase the cache size to speedup sqlite enquiry
  200. except:
  201. pass
  202. if database == 'main':
  203. # create ikeys table
  204. sqlstr = 'CREATE TABLE IF NOT EXISTS %s.ikeys (ikey TEXT PRIMARY KEY, id INTEGER);' % database
  205. self.db.execute ( sqlstr )
  206. # create goucima table, this table is used in construct new phrases
  207. sqlstr = 'CREATE TABLE IF NOT EXISTS %s.goucima (zi TEXT PRIMARY KEY' % database
  208. #for i in range(self._mlen):
  209. # sqlstr += ', g%d INTEGER' % i
  210. sqlstr += ''.join(map (lambda x: ', g%d INTEGER' % x, range(self._mlen)) )
  211. sqlstr += ');'
  212. self.db.execute ( sqlstr )
  213. # create pinyin table, this table is used in search single character for user handly
  214. sqlstr = 'CREATE TABLE IF NOT EXISTS %s.pinyin ( plen INTEGER, ' % database
  215. #for i in range(6):
  216. # sqlstr += 'p%d INTEGER, ' % i
  217. sqlstr += ''.join( map (lambda x: 'p%d INTEGER, ' % x, range(6) ) )
  218. sqlstr += 'zi TEXT, freq INTEGER);'
  219. self.db.execute ( sqlstr )
  220. # create phrase table (mabiao)
  221. sqlstr = 'CREATE TABLE IF NOT EXISTS %s.phrases ( mlen INTEGER, clen INTEGER, ' % database
  222. #for i in range(self._mlen):
  223. # sqlstr += 'm%d INTEGER, ' % i
  224. sqlstr += ''.join ( map (lambda x: 'm%d INTEGER, ' % x, range(self._mlen)) )
  225. sqlstr += 'phrase TEXT, freq INTEGER, user_freq INTEGER);'
  226. self.db.execute ( sqlstr )
  227. self.db.commit()
  228. def update_ime (self, attrs):
  229. '''Update attributes in ime table, attrs is a iterable object
  230. Like [(attr,val), (attr,val), ...]
  231. '''
  232. sqlstr = 'UPDATE main.ime SET val = ? WHERE attr = ?;'
  233. for attr,val in attrs:
  234. _sqlstr = 'SELECT * from main.ime WHERE attr = ?'
  235. res = self.db.execute( _sqlstr, (attr,) ).fetchall()
  236. if res:
  237. self.db.execute(sqlstr,(val,attr))
  238. else:
  239. #print '"',attr,'"'," didn't in ime property now!"
  240. pass
  241. # we need to update some self variables now.
  242. self._mlen = int (self.get_ime_property ('max_key_length' ))
  243. self._pt_index = ['mlen','clen']
  244. for i in range(self._mlen):
  245. self._pt_index.append ('m%d' %i)
  246. self._pt_index += ['phrase','freq','user_freq']
  247. self.user_can_define_phrase = self.get_ime_property('user_can_define_phrase')
  248. if self.user_can_define_phrase:
  249. if self.user_can_define_phrase.lower() == u'true' :
  250. self.user_can_define_phrase = True
  251. else:
  252. self.user_can_define_phrase = False
  253. else:
  254. print 'Could not find "user_can_define_phrase" entry from database, is it a outdated database?'
  255. self.user_can_define_phrase = False
  256. self.rules = self.get_rules ()
  257. self.db.commit()
  258. def get_rules (self):
  259. '''Get phrase construct rules'''
  260. rules={'above':4}
  261. if self.user_can_define_phrase:
  262. try:
  263. _rules = self.get_ime_property ('rules')
  264. if _rules:
  265. _rules = _rules.strip().split(';')
  266. for rule in _rules:
  267. res = patt_r.match (rule)
  268. if res:
  269. cms = []
  270. if res.group(1) == 'a':
  271. rules['above'] = int(res.group(2))
  272. _cms = res.group(3).split('+')
  273. if len(_cms) > int(self.get_ime_property('max_key_length')):
  274. print 'rule: "%s" over max key length' % rule
  275. break
  276. for _cm in _cms:
  277. cm_res = patt_p.match(_cm)
  278. cms.append(( int(cm_res.group(1)),int(cm_res.group(2)) ))
  279. rules[int(res.group(2))]=cms
  280. else:
  281. print 'not a legal rule: "%s"' % rule
  282. except Exception:
  283. import traceback
  284. traceback.print_exc ()
  285. return rules
  286. else:
  287. return ""
  288. def get_no_check_chars (self):
  289. '''Get the characters which engine should not change freq'''
  290. _chars= self.get_ime_property('no_check_chars')
  291. try:
  292. _chars = _chars.decode('utf-8')
  293. except:
  294. pass
  295. return _chars
  296. def add_phrases (self, phrases, database = 'main'):
  297. '''Add phrases to database, phrases is a iterable object
  298. Like: [(xingma, phrase, freq ,user_freq), (xingma, phrase, freq, user_freq), ...]
  299. '''
  300. if database == 'main':
  301. map (self.add_phrase, phrases)
  302. else:
  303. map (self.add_phrase, phrases, [database]*len(phrases),[False]*len(phrases) )
  304. self.db.commit()
  305. def add_new_phrases (self, nphrases, database='main'):
  306. '''Add new phrases into db, new phrases is a object
  307. of [(phrase,freq), (phrase,freq),...]'''
  308. n_phrases=[]
  309. for _ph, _freq in nphrases:
  310. try:
  311. _xm = self.parse_phrase_to_xm (_ph)
  312. if not self.check_phrase_internal (_ph, _xm, database):
  313. # we don't have this phrase
  314. n_phrases.append ( (_xm, _ph, _freq, 0) )
  315. except:
  316. print '\"%s\" would not been added' % _ph
  317. if n_phrases:
  318. self.add_phrases ( n_phrases, database )
  319. def u_add_phrase (self,nphrase):
  320. '''Add a phrase to userdb'''
  321. self.add_phrase (nphrase,database='user_db')
  322. def add_phrase (self, aphrase, database = 'main', commit=True):
  323. '''Add phrase to database, phrase is a object of
  324. (xingma, phrase, freq ,user_freq)
  325. '''
  326. sqlstr = 'INSERT INTO %s.phrases ( mlen, clen, '
  327. sql_suffix = 'VALUES ( ?, ?, '
  328. mmlen = range(self._mlen)
  329. sqlstr += ''.join ( map(lambda x: 'm%d, ' %x , mmlen) )
  330. sql_suffix += ''.join ( map (lambda x: '?, ' , mmlen) )
  331. sqlstr += 'phrase, freq, user_freq) '
  332. sql_suffix += '?, ?, ? );'
  333. sqlstr += sql_suffix
  334. try:
  335. xingma,phrase,freq,user_freq = aphrase
  336. except:
  337. xingma,phrase,freq = aphrase
  338. user_freq = 0
  339. try:
  340. xm = self.parse(xingma)
  341. if len(xm) != len(xingma):
  342. print 'In %s %s: we parse xingma fail' % (phrase, xingma )
  343. return
  344. record = [None, None, None, None, None]
  345. map( lambda x: record.append(None), range(self._mlen))
  346. record [0] = len (xingma)
  347. record [1] = len (phrase)
  348. record [2: 2+len(xingma)] = map (lambda x: xm[x].get_key_id(), range(0,len(xingma)))
  349. record [2+self._mlen] = phrase
  350. record [2+self._mlen+1] = freq
  351. record [2+self._mlen+2] = user_freq
  352. self.db.execute (sqlstr % database, record)
  353. if commit:
  354. self.db.commit()
  355. except Exception:
  356. import traceback
  357. def add_goucima (self, gcms):
  358. '''Add goucima into database, gcms is iterable object
  359. Like gcms = [(zi,goucima),(zi,goucima), ...]
  360. '''
  361. count = 1
  362. for zi,gcm in gcms:
  363. _con = ''
  364. _val = ''
  365. _len = min ( len(gcm),self._mlen)
  366. for i in range( _len ):
  367. _con += ', g%d' % i
  368. _val += ', ?'
  369. sqlstr = '''INSERT INTO main.goucima ( zi %s )
  370. VALUES ( ? %s );''' % (_con, _val)
  371. try:
  372. gc = self.parse(gcm)
  373. if len(gc) != len(gcm):
  374. error_m = u'%s %s: Can not parse goucima' % (zi, gcm )
  375. raise Exception ( error_m.encode ('utf8') )
  376. record = [zi]
  377. for i in range(_len):
  378. record.append( gc[i].get_key_id())
  379. self.db.execute (sqlstr , record)
  380. except Exception:
  381. import traceback
  382. traceback.print_exc()
  383. count += 1
  384. self.db.commit()
  385. def add_pinyin (self, pinyins, database = 'main'):
  386. '''Add pinyin to database, pinyins is a iterable object
  387. Like: [(zi,pinyin, freq), (zi, pinyin, freq), ...]
  388. '''
  389. sqlstr = 'INSERT INTO %s.pinyin ( plen, '
  390. sql_suffix = 'VALUES ( ?, '
  391. for i in range(6):
  392. sqlstr += 'p%d, ' % i
  393. sql_suffix += '?, '
  394. sqlstr += 'zi, freq ) '
  395. sql_suffix += '?, ? );'
  396. sqlstr += sql_suffix
  397. count = 1
  398. for pinyin,zi,freq in pinyins:
  399. try:
  400. py = self.parse(pinyin)
  401. if len(py) != len(pinyin):
  402. error_m = u'%s %s: Can not parse pinyin' % (zi, pinyin )
  403. raise Exception ( error_m.encode ('utf8') )
  404. record = [None, None, None, None, None, None, None, None, None]
  405. record [0] = len (pinyin)
  406. for i in range(0,len(pinyin)):
  407. record [ 1+i ] = py[i].get_key_id()
  408. record [-2] = zi
  409. record [-1] = freq
  410. self.db.execute (sqlstr % database, record)
  411. except Exception:
  412. print count, ': ', zi.encode('utf8'), ' ', pinyin
  413. import traceback
  414. traceback.print_exc()
  415. count += 1
  416. self.db.commit()
  417. def optimize_database (self, database='main'):
  418. sqlstr = '''
  419. CREATE TABLE tmp AS SELECT * FROM %(database)s.phrases;
  420. DELETE FROM %(database)s.phrases;
  421. INSERT INTO %(database)s.phrases SELECT * FROM tmp ORDER BY %(xmstr)s mlen ASC, freq DESC;
  422. DROP TABLE tmp;
  423. CREATE TABLE tmp AS SELECT * FROM %(database)s.goucima;
  424. DELETE FROM %(database)s.goucima;
  425. INSERT INTO %(database)s.goucima SELECT * FROM tmp ORDER BY zi,g0,g1;
  426. DROP TABLE tmp;
  427. CREATE TABLE tmp AS SELECT * FROM %(database)s.pinyin;
  428. DELETE FROM %(database)s.pinyin;
  429. INSERT INTO %(database)s.pinyin SELECT * FROM tmp ORDER BY p0,p1,p2,p3,p4,p5,plen ASC;
  430. DROP TABLE tmp;
  431. '''
  432. xmstr = ''
  433. for i in range(self._mlen):
  434. xmstr +='m%d, ' % i
  435. self.db.executescript (sqlstr % {'database':database,'xmstr':xmstr })
  436. self.db.executescript ("VACUUM;")
  437. self.db.commit()
  438. def drop_indexes(self, database):
  439. '''Drop the index in database to reduce it's size'''
  440. sqlstr = '''
  441. DROP INDEX IF EXISTS %(database)s.goucima_index_z;
  442. DROP INDEX IF EXISTS %(database)s.pinyin_index_i;
  443. DROP INDEX IF EXISTS %(database)s.phrases_index_p;
  444. DROP INDEX IF EXISTS %(database)s.phrases_index_i;
  445. VACUUM;
  446. ''' % { 'database':database }
  447. self.db.executescript (sqlstr)
  448. self.db.commit()
  449. def create_indexes(self, database):
  450. sqlstr = '''
  451. DROP INDEX IF EXISTS %(database)s.goucima_index_z;
  452. CREATE INDEX IF NOT EXISTS %(database)s.goucima_index_z ON goucima (zi);
  453. DROP INDEX IF EXISTS %(database)s.pinyin_index_i;
  454. CREATE INDEX IF NOT EXISTS %(database)s.pinyin_index_i ON pinyin (p0,p1,p2,p3,p4,p5,plen ASC, freq DESC);
  455. VACUUM;
  456. ''' % { 'database':database }
  457. sqlstr_t = '''
  458. DROP INDEX IF EXISTS %(database)s.phrases_index_p;
  459. CREATE INDEX IF NOT EXISTS %(database)s.phrases_index_p ON phrases (%(xmstr)s mlen ASC, freq DESC);
  460. DROP INDEX IF EXISTS %(database)s.phrases_index_i;
  461. CREATE INDEX IF NOT EXISTS %(database)s.phrases_index_i ON phrases (phrase, mlen ASC);
  462. '''
  463. xmstr = ''
  464. for i in range(self._mlen):
  465. xmstr +='m%d,' % i
  466. if database == 'main':
  467. sqlstr = sqlstr_t % {'database':database,'xmstr':xmstr } + sqlstr
  468. else:
  469. sqlstr = sqlstr_t % {'database':database,'xmstr':xmstr }
  470. self.db.executescript (sqlstr)
  471. self.db.commit()
  def compare (self,x,y):
      '''Three-way comparison of two phrase rows, for sorting candidates.

      Rows are ordered by their first item ascending, then by the last
      item descending, then by the second-to-last item descending.
      Returns a negative number, zero or a positive number.
      '''
      def three_way (a, b):
          # same result as the cmp() builtin, which Python 3 no longer has
          return (a > b) - (a < b)
      return three_way (x[0], y[0]) or -three_way (x[-1], y[-1]) or -three_way (x[-2], y[-2])
  474. def select_words( self, xms, onechar=False ):
  475. '''
  476. Get phrases from database by XingMa_Key objects
  477. ( which should be equal or less than the max key length)
  478. This method is called in XingMa by passing UserInput held data
  479. Return result[:]
  480. '''
  481. # firstly, we make sure the len we used is equal or less than the max key length
  482. _len = min( len(xms),self._mlen )
  483. _condition = ''
  484. _condition += ''.join ( map (lambda x: 'AND m%d = ? ' %x, range(_len) ) )
  485. if onechar:
  486. # for some users really like to select only single characters
  487. _condition += 'AND clen=1 '
  488. # you can increase the x in _len + x to include more result, but in the most case, we only need one more key result, so we don't need the extra overhead :)
  489. # we start search for 1 key more, if nothing, then 2 key more and so on
  490. # this is the max len we need to add into the select cause.
  491. w_len = self._mlen - _len +1
  492. # we start from 2, because it is < in the sqlite select, which need 1 more.
  493. x_len = 2
  494. while x_len <= w_len + 1:
  495. sqlstr = '''SELECT * FROM (SELECT * FROM main.phrases WHERE mlen < %(mk)d %(condition)s
  496. UNION ALL
  497. SELECT * FROM user_db.phrases WHERE mlen < %(mk)d %(condition)s
  498. UNION ALL
  499. SELECT * FROM mudb.phrases WHERE mlen < %(mk)d %(condition)s )
  500. ORDER BY mlen ASC, user_freq DESC, freq DESC;''' % { 'mk':_len+x_len, 'condition':_condition}
  501. # we have redefine the __int__(self) in class XMDict.XingMa_key to return the key id, so we can use map to got key id :)
  502. _xms = map(int,xms[:_len])
  503. _xms += _xms + _xms
  504. result = self.db.execute(sqlstr, _xms).fetchall()
  505. #self.db.commit()
  506. # if we find word, we stop this while,
  507. if len(result) >0:
  508. break
  509. x_len += 1
  510. # here in order to get high speed, I use complicated map
  511. # to subtitute for
  512. sysdb={}
  513. usrdb={}
  514. mudb={}
  515. _cand = []
  516. #searchres = map ( lambda res: res[-2] and [ True, [(res[:-2],[res[:-1],res[-1:]])] ]\
  517. # or [ False, [(res[:-2] , [res[:-1],res[-1:]])] ] \
  518. # , result )
  519. searchres = map ( lambda res: [ int(res[-2]), int(res[-1]), [(res[:-2],[res[:-1],res[-1:]])] ], result)
  520. # for sysdb
  521. reslist=filter( lambda x: not x[1], searchres )
  522. map (lambda x: sysdb.update(x[2]), reslist)
  523. # for usrdb
  524. reslist=filter( lambda x: ( x[0] in [0,-1] ) and x[1], searchres )
  525. map (lambda x: usrdb.update(x[2]), reslist)
  526. # for mudb
  527. reslist=filter( lambda x: ( x[0] not in [0,-1] ) and x[1], searchres )
  528. map (lambda x: mudb.update(x[2]), reslist)
  529. # first process mudb
  530. searchres = map ( lambda key: mudb[key][0] + mudb[key][1], mudb )
  531. #print searchres
  532. map (_cand.append, searchres)
  533. # now process usrdb and sysdb
  534. searchres = map ( lambda key: (not mudb.has_key(key)) and usrdb[key][0] + usrdb[key][1]\
  535. or None , usrdb )
  536. searchres = filter(lambda x: bool(x), searchres )
  537. #print searchres
  538. map (_cand.append, searchres)
  539. searchres = map ( lambda key: ((not mudb.has_key(key)) and (not usrdb.has_key(key)) )and sysdb[key][0] + sysdb[key][1]\
  540. or None, sysdb )
  541. searchres = filter (lambda x: bool(x), searchres)
  542. map (_cand.append, searchres)
  543. #for key in usrdb:
  544. # if not sysdb.has_key (key):
  545. # _cand.append( usrdb[key][0] + usrdb[key][1] )
  546. # else:
  547. # _cand.append( sysdb[key][0] + usrdb[key][1] )
  548. #for key in sysdb:
  549. # if not usrdb.has_key (key):
  550. # _cand.append( sysdb[key][0] + sysdb[key][1] )
  551. _cand.sort(cmp=self.compare)
  552. return _cand[:]
  553. def select_zi( self, xms ):
  554. '''
  555. Get zi from database by XingMa_Key objects
  556. ( which should be equal or less than 6)
  557. This method is called in XingMa by passing UserInput held data
  558. Return result[:]
  559. '''
  560. # firstly, we make sure the len we used is equal or less than the max pinyin length 6
  561. _len = min( len(xms), 6 )
  562. _condition = ''
  563. #for i in range(_len):
  564. # _condition += 'AND p%d = ? ' % i
  565. _condition += ''.join ( map (lambda x: 'AND p%d = ? ' %x, range(_len)) )
  566. # you can increase the x in _len + x to include more result, but in the most case, we only need one more key result, so we don't need the extra overhead :)
  567. sqlstr = '''SELECT * FROM main.pinyin WHERE plen < %(mk)d %(condition)s
  568. ORDER BY plen ASC, freq DESC;''' % { 'mk':_len+2, 'condition':_condition}
  569. # we have redefine the __int__(self) in class XMDict.XingMa_key to return the key id, so we can use map to got key id :)
  570. _xms = map(int,xms[:_len])
  571. result = self.db.execute(sqlstr, _xms).fetchall()
  572. #self.db.commit()
  573. return result[:]
  574. def get_ime_property( self, attr ):
  575. '''get IME property from database, attr is the string of property,
  576. which should be str.lower() :)
  577. '''
  578. sqlstr = 'SELECT val FROM main.ime WHERE attr = ?'
  579. _result = self.db.execute( sqlstr, (attr,)).fetchall()
  580. #self.db.commit()
  581. if _result:
  582. return _result[0][0]
  583. else:
  584. return None
  585. def get_phrase_table_index (self):
  586. '''get a list of phrase table columns name'''
  587. return self._pt_index[:]
  588. def generate_userdb_desc (self):
  589. try:
  590. sqlstring = 'CREATE TABLE IF NOT EXISTS user_db.desc (name PRIMARY KEY, value);'
  591. self.db.executescript (sqlstring)
  592. sqlstring = 'INSERT OR IGNORE INTO user_db.desc VALUES (?, ?);'
  593. self.db.execute (sqlstring, ('version', '0.1'))
  594. sqlstring = 'INSERT OR IGNORE INTO user_db.desc VALUES (?, DATETIME("now", "localtime"));'
  595. self.db.execute (sqlstring, ("create-time", ))
  596. self.db.commit ()
  597. except:
  598. import traceback
  599. traceback.print_exc ()
  600. def init_user_db (self,db_file):
  601. if not path.exists (db_file):
  602. db = sqlite3.connect (db_file)
  603. db.execute('PRAGMA page_size = 4096;')
  604. db.execute( 'PRAGMA cache_size = 20000;' )
  605. db.execute( 'PRAGMA temp_store = MEMORY; ' )
  606. db.execute( 'PRAGMA synchronous = OFF; ' )
  607. db.commit()
  608. def get_database_desc (self, db_file):
  609. if not path.exists (db_file):
  610. return None
  611. try:
  612. db = sqlite3.connect (db_file)
  613. db.execute('PRAGMA page_size = 4096;')
  614. db.execute( 'PRAGMA cache_size = 20000;' )
  615. db.execute( 'PRAGMA temp_store = MEMORY; ' )
  616. db.execute( 'PRAGMA synchronous = OFF; ' )
  617. desc = {}
  618. for row in db.execute ("SELECT * FROM desc;").fetchall():
  619. desc [row[0]] = row[1]
  620. self.db.commit()
  621. return desc
  622. except:
  623. return None
  624. def cache_goucima (self):
  625. self._goucima = {}
  626. goucima = self.db.execute('SELECT * FROM main.goucima;').fetchall()
  627. map(lambda x: self._goucima.update({x[0]:x[1:]}), goucima)
  628. def get_gcm_id (self, zi):
  629. '''Get goucima of given character'''
  630. if self._goucima:
  631. # we already cache the goucima
  632. if not isinstance(zi,unicode):
  633. zi = zi.decode('utf-8')
  634. try:
  635. gcds = self._goucima[zi]
  636. return gcds
  637. except:
  638. pass
  639. sqlstr = 'SELECT %s FROM main.goucima WHERE zi =?;' % ','.join( map (lambda x: 'g%d' % x, range(self._mlen) ) )
  640. return self.db.execute(sqlstr,(zi,)).fetchall()[0]
  641. def parse_phrase (self, phrase):
  642. '''Parse phrase to get its XingMa code'''
  643. # first we make sure that we are parsing unicode string
  644. try:
  645. phrase = unicode(phrase)
  646. except:
  647. phrase = phrase.decode('utf8')
  648. p_len = len(phrase)
  649. xmlist = []
  650. if p_len < 2:
  651. # phrase should not be shorter than 2
  652. return []
  653. try:
  654. if p_len >= self.rules['above']:
  655. rule = self.rules[ self.rules['above'] ]
  656. elif p_len in self.rules:
  657. rule = self.rules[p_len]
  658. else:
  659. raise Exception ('unsupport len of phrase')
  660. if len(rule) > self._mlen:
  661. raise Exception ('fault rule: %s' % rule)
  662. #for (zi,ma) in rule:
  663. # if zi > 0:
  664. # zi -= 1
  665. # gcm = self.get_gcm_id (phrase[zi])
  666. # xmlist.append(gcm[ma-1])
  667. xmlist = map (lambda x: self.get_gcm_id ( phrase[x[0]-1] )[ x[1]-1 ], rule )
  668. return [len( xmlist)] + [p_len] + xmlist[:] + [phrase]
  669. except Exception:
  670. import traceback
  671. traceback.print_exc ()
  672. def parse_phrase_to_xm (self,phrase):
  673. '''Get the XingMa encoding of the phrase in string form'''
  674. xmres = self.parse_phrase (phrase) [2:-1]
  675. xms= u''.join ( map(self.deparse, xmres) )
  676. return xms
  677. def check_phrase (self,phrase,xmkey=None,database='main'):
  678. # if IME didn't support user define phrase,
  679. # we divide user input phrase into characters,
  680. # and then check its frequence
  681. if type(phrase) != type(u''):
  682. phrase = phrase.decode('utf8')
  683. if self.user_can_define_phrase:
  684. self.check_phrase_internal (phrase, xmkey,database)
  685. else:
  686. map(self.check_phrase_internal, phrase)
  687. def check_phrase_internal (self,phrase,xmkey=None,database='main'):
  688. '''Check word freq and user_freq
  689. '''
  690. if type(phrase) != type(u''):
  691. phrase = phrase.decode('utf8')
  692. try:
  693. if phrase in self._no_check_chars:
  694. # if the phrase is a single char, and in no_check_chars, we skip it.
  695. return
  696. except:
  697. print 'you are using old format of database, please regenerate your database.'
  698. if len(phrase) >=2:
  699. wordattr = self.parse_phrase ( phrase )
  700. _len = len (wordattr) -3
  701. if xmkey == None:
  702. sqlstr = '''SELECT * FROM (SELECT * FROM main.phrases WHERE phrase = ?
  703. UNION ALL SELECT * FROM user_db.phrases WHERE phrase = ?
  704. UNION ALL SELECT * FROM mudb.phrases WHERE phrase = ?)
  705. ORDER BY user_freq DESC, freq DESC
  706. '''
  707. result = self.db.execute(sqlstr, (phrase,phrase,phrase)).fetchall()
  708. else:
  709. # we are using this to check whether the tab-key and phrase is in db
  710. xms = self.parse (xmkey)
  711. xmkids = tuple( map(int,xms) )
  712. condition = ' and '.join( map(lambda x: 'm%d = ?' % x, range( len(xms) )) )
  713. sqlstr = '''SELECT * FROM %(database)s.phrases WHERE phrase = ? and %(cond)s;''' % {'database':database, 'cond':condition}
  714. result = self.db.execute(sqlstr, (phrase,)+xmkids ).fetchall()
  715. return bool(result)
  716. sysdb = {}
  717. usrdb = {}
  718. mudb = {}
  719. searchres = map ( lambda res: [ int(res[-2]), int(res[-1]), [(res[:-2],[res[:-1],res[-1]])] ], result)
  720. # for sysdb
  721. reslist=filter( lambda x: not x[1], searchres )
  722. map (lambda x: sysdb.update(x[2]), reslist)
  723. # for usrdb
  724. reslist=filter( lambda x: ( x[0] in [0,-1] ) and x[1], searchres )
  725. map (lambda x: usrdb.update(x[2]), reslist)
  726. # for mudb
  727. reslist=filter( lambda x: (x[0] not in [0,-1]) and x[1], searchres )
  728. map (lambda x: mudb.update(x[2]), reslist)
  729. xmkey = ''
  730. if len(phrase) >=2:
  731. xmkey = u''.join ( map(self.deparse,wordattr[2:2+_len]) )
  732. #for k in wordattr[2:2+_len]:
  733. # xmkey += self.deparse (k)
  734. sqlstr = 'UPDATE mudb.phrases SET user_freq = ? WHERE mlen = ? AND clen = ? %s AND phrase = ?;'
  735. try:
  736. if len(phrase) == 1:
  737. if not self.dynamic_adjust:
  738. # we should change the frequency of words
  739. return
  740. # this is a character
  741. # we remove the keys contained in mudb from usrdb
  742. keyout = filter (lambda k: mudb.has_key(k), usrdb.keys() )
  743. map (usrdb.pop, keyout)
  744. # we remove the keys contained in mudb and usrdb from sysdb
  745. keyout = filter (lambda k: mudb.has_key(k) or usrdb.has_key(k) , sysdb.keys() )
  746. map (sysdb.pop, keyout)
  747. # first mudb
  748. map (lambda res: self.db.execute ( sqlstr % ''.join( map(lambda x: 'AND m%d = ? ' % x, range(res[0])) ) , [ mudb[res][1] + 1 ] + list( res[:2+res[0]]) + list (res[2+self._mlen:]) ) , mudb.keys())
  749. self.db.commit()
  750. # -----original for loop of above map:
  751. #for res in mudb.keys ():
  752. # _con = [ mudb[res][1] + 1 ] + list( res[:2+res[0]]) + list (res[2+self._mlen:])
  753. # _condition = ''.join( map(lambda x: 'AND m%d = ? ' % x, range(res[0])) )
  754. # self.db.execute ( sqlstr % _condition, _con )
  755. # then usrdb
  756. map ( lambda res: self.add_phrase ( (''.join ( map(self.deparse,res[2:2+int(res[0])] ) ),phrase, 1,usrdb[res][1]+1 ), database = 'mudb') , usrdb.keys() )
  757. # -----original for loop of above map:
  758. #for res in usrdb.keys ():
  759. # #if mudb.has_key (res):
  760. # # continue
  761. # xmkey = ''.join ( map(self.deparse,res[2:2+int(res[0])] ) )
  762. # # here we use freq 1 to denote the phrase needed update in user_db
  763. # self.add_phrase ((xmkey,phrase,1,usrdb[res][1]+1 ), database = 'mudb')
  764. # last sysdb
  765. map ( lambda res: self.add_phrase ( ( ''.join ( map(self.deparse,res[2:2+int(res[0])]) ),phrase,2,1 ), database = 'mudb'), sysdb.keys() )
  766. # -----original for loop of above map:
  767. #for res in sysdb.keys ():
  768. # xmkey = ''.join ( map(self.deparse,res[2:2+int(res[0])]) )
  769. # # here we use freq 2 to denote the word needed addition to user_db
  770. # self.add_phrase ((xmkey,phrase,2,1), database = 'mudb')
  771. else:
  772. # this is a phrase
  773. if len (result) == 0 and self.user_can_define_phrase:
  774. # this is a new phrase, we add it into user_db
  775. self.add_phrase ( (xmkey,phrase,-2,1), database = 'mudb')
  776. elif len (result) > 0:
  777. if not self.dynamic_adjust:
  778. # we should change the frequency of words
  779. return
  780. # we remove the keys contained in mudb from usrdb
  781. keyout = filter (lambda k: mudb.has_key(k), usrdb.keys() )
  782. map (usrdb.pop, keyout)
  783. # we remove the keys contained in mudb and usrdb from sysdb
  784. keyout = filter (lambda k: mudb.has_key(k) or usrdb.has_key(k) , sysdb.keys() )
  785. map (sysdb.pop, keyout)
  786. # first we process mudb
  787. # the original for loop can be found above in 'len==1'
  788. map (lambda res: self.db.execute ( sqlstr % ''.join( map(lambda x: 'AND m%d = ? ' % x, range(res[0])) ) , [ mudb[res][1] + 1 ] + list( res[:2+res[0]]) + list (res[2+self._mlen:]) ) , mudb.keys())
  789. self.db.commit()
  790. # then usrdb
  791. map ( lambda res: self.add_phrase ( (''.join ( map(self.deparse,res[2:2+int(res[0])] ) ),phrase,(-3 if usrdb[res][0][-1] == -1 else 1),usrdb[res][1]+1 ), database = 'mudb') , usrdb.keys() )
  792. #print self.db.execute('select * from mudb.phrases;').fetchall()
  793. # last sysdb
  794. map ( lambda res: self.add_phrase ( ( ''.join ( map(self.deparse,res[2:2+int(res[0])]) ),phrase,2,1 ), database = 'mudb'), sysdb.keys() )
  795. else:
  796. # we come to here when the ime dosen't support user phrase define
  797. pass
  798. #self.db.commit()
  799. except:
  800. import traceback
  801. traceback.print_exc ()
  def find_zi_code (self,zi):
      '''Return every key code stored for zi in main.phrases.

      zi may be a UTF-8 encoded byte string or an already decoded
      (unicode) string.  The rows are fetched ordered by ascending mlen,
      so the codes come back in that order.  Each code is built by
      passing the first mlen code columns of a row (column 0 is read as
      the code length, the code elements start at column 2) through
      self.deparse and joining the pieces.

      Returns a list of unicode strings; the list is empty when zi is
      not in the table.  If rebuilding a code fails, the traceback is
      printed and the codes collected so far are returned.
      '''
      # Only byte strings need decoding.  Calling decode() on text that
      # is already unicode fails for non-ASCII characters on Python 2
      # and does not exist at all on Python 3.
      if isinstance (zi, bytes):
          zi = zi.decode ('utf8')
      sqlstr = 'SELECT * FROM main.phrases WHERE phrase = ? ORDER BY mlen ASC;'
      rows = self.db.execute (sqlstr, (zi,)).fetchall ()
      codes = []
      try:
          for row in rows:
              mlen = int (row[0])
              pieces = [ self.deparse (row[2+i]) for i in range (mlen) ]
              codes.append ( u''.join (pieces) )
      except Exception:
          # best effort: report the problem but still hand back what we have
          import traceback
          traceback.print_exc ()
      return codes
  def remove_phrase (self,phrase,database='user_db'):
      '''Delete the row described by phrase from database and from mudb.

      phrase is one row as returned by "SELECT *" on a phrases table,
      laid out like (mlen,clen,m0,m1,m2,m3,phrase,freq,user_freq).
      The trailing freq and user_freq fields are ignored; the remaining
      fields identify the row to delete.

      database names the database whose phrases table is cleaned first
      (default 'user_db'); afterwards the same row is removed from
      mudb.  A DELETE is only issued when a SELECT finds the row, and
      every DELETE is committed immediately.
      '''
      fields = list (phrase[:-2])
      # one "m<i> = ?" test per code element; fields[0] is mlen in the layout above
      key_tests = ''.join ( [ 'AND m%d = ? ' % idx for idx in range (fields[0]) ] )
      # None entries have no placeholder in the statements below, so drop them
      params = [ value for value in fields if value is not None ]
      select_tpl = 'SELECT * FROM %(database)s.phrases WHERE mlen = ? and clen = ? %(condition)s AND phrase = ? ;'
      delete_tpl = 'DELETE FROM %(database)s.phrases WHERE mlen = ? AND clen =? %(condition)s AND phrase = ? ;'
      # same check-then-delete pass for the requested database, then for mudb
      for target in (database, 'mudb'):
          names = { 'database':target, 'condition':key_tests }
          found = self.db.execute (select_tpl % names, params).fetchall ()
          if found:
              self.db.execute (delete_tpl % names, params)
              self.db.commit ()