PageRenderTime 614ms CodeModel.GetById 41ms RepoModel.GetById 0ms app.codeStats 0ms

/client/vocabcompiler.py

https://gitlab.com/leiftomas/jasper-client
Python | 563 lines | 451 code | 46 blank | 66 comment | 55 complexity | e3bbe2ddbf5f6f6d722a70ae86c470f0 MD5 | raw file
  1. # -*- coding: utf-8-*-
  2. """
  3. Iterates over all the WORDS variables in the modules and creates a
  4. vocabulary for the respective stt_engine if needed.
  5. """
  6. import os
  7. import tempfile
  8. import logging
  9. import hashlib
  10. import subprocess
  11. import tarfile
  12. import re
  13. import contextlib
  14. import shutil
  15. from abc import ABCMeta, abstractmethod, abstractproperty
  16. import yaml
  17. import brain
  18. import jasperpath
  19. from g2p import PhonetisaurusG2P
  20. try:
  21. import cmuclmtk
  22. except ImportError:
  23. logging.getLogger(__name__).error("Error importing CMUCLMTK module. " +
  24. "PocketsphinxVocabulary will not work " +
  25. "correctly.", exc_info=True)
  26. class AbstractVocabulary(object):
  27. """
  28. Abstract base class for Vocabulary classes.
  29. Please note that subclasses have to implement the compile_vocabulary()
  30. method and set a string as the PATH_PREFIX class attribute.
  31. """
  32. __metaclass__ = ABCMeta
  33. @classmethod
  34. def phrases_to_revision(cls, phrases):
  35. """
  36. Calculates a revision from phrases by using the SHA1 hash function.
  37. Arguments:
  38. phrases -- a list of phrases
  39. Returns:
  40. A revision string for given phrases.
  41. """
  42. sorted_phrases = sorted(phrases)
  43. joined_phrases = '\n'.join(sorted_phrases)
  44. sha1 = hashlib.sha1()
  45. sha1.update(joined_phrases)
  46. return sha1.hexdigest()
  47. def __init__(self, name='default', path='.'):
  48. """
  49. Initializes a new Vocabulary instance.
  50. Optional Arguments:
  51. name -- (optional) the name of the vocabulary (Default: 'default')
  52. path -- (optional) the path in which the vocabulary exists or will
  53. be created (Default: '.')
  54. """
  55. self.name = name
  56. self.path = os.path.abspath(os.path.join(path, self.PATH_PREFIX, name))
  57. self._logger = logging.getLogger(__name__)
  58. @property
  59. def revision_file(self):
  60. """
  61. Returns:
  62. The path of the the revision file as string
  63. """
  64. return os.path.join(self.path, 'revision')
  65. @abstractproperty
  66. def is_compiled(self):
  67. """
  68. Checks if the vocabulary is compiled by checking if the revision file
  69. is readable. This method should be overridden by subclasses to check
  70. for class-specific additional files, too.
  71. Returns:
  72. True if the dictionary is compiled, else False
  73. """
  74. return os.access(self.revision_file, os.R_OK)
  75. @property
  76. def compiled_revision(self):
  77. """
  78. Reads the compiled revision from the revision file.
  79. Returns:
  80. the revision of this vocabulary (i.e. the string
  81. inside the revision file), or None if is_compiled
  82. if False
  83. """
  84. if not self.is_compiled:
  85. return None
  86. with open(self.revision_file, 'r') as f:
  87. revision = f.read().strip()
  88. self._logger.debug("compiled_revision is '%s'", revision)
  89. return revision
  90. def matches_phrases(self, phrases):
  91. """
  92. Convenience method to check if this vocabulary exactly contains the
  93. phrases passed to this method.
  94. Arguments:
  95. phrases -- a list of phrases
  96. Returns:
  97. True if phrases exactly matches the phrases inside this
  98. vocabulary.
  99. """
  100. return (self.compiled_revision == self.phrases_to_revision(phrases))
  101. def compile(self, phrases, force=False):
  102. """
  103. Compiles this vocabulary. If the force argument is True, compilation
  104. will be forced regardless of necessity (which means that the
  105. preliminary check if the current revision already equals the
  106. revision after compilation will be skipped).
  107. This method is not meant to be overridden by subclasses - use the
  108. _compile_vocabulary()-method instead.
  109. Arguments:
  110. phrases -- a list of phrases that this vocabulary will contain
  111. force -- (optional) forces compilation (Default: False)
  112. Returns:
  113. The revision of the compiled vocabulary
  114. """
  115. revision = self.phrases_to_revision(phrases)
  116. if not force and self.compiled_revision == revision:
  117. self._logger.debug('Compilation not neccessary, compiled ' +
  118. 'version matches phrases.')
  119. return revision
  120. if not os.path.exists(self.path):
  121. self._logger.debug("Vocabulary dir '%s' does not exist, " +
  122. "creating...", self.path)
  123. try:
  124. os.makedirs(self.path)
  125. except OSError:
  126. self._logger.error("Couldn't create vocabulary dir '%s'",
  127. self.path, exc_info=True)
  128. raise
  129. try:
  130. with open(self.revision_file, 'w') as f:
  131. f.write(revision)
  132. except (OSError, IOError):
  133. self._logger.error("Couldn't write revision file in '%s'",
  134. self.revision_file, exc_info=True)
  135. raise
  136. else:
  137. self._logger.info('Starting compilation...')
  138. try:
  139. self._compile_vocabulary(phrases)
  140. except Exception as e:
  141. self._logger.error("Fatal compilation Error occured, " +
  142. "cleaning up...", exc_info=True)
  143. try:
  144. os.remove(self.revision_file)
  145. except OSError:
  146. pass
  147. raise e
  148. else:
  149. self._logger.info('Compilation done.')
  150. return revision
  151. @abstractmethod
  152. def _compile_vocabulary(self, phrases):
  153. """
  154. Abstract method that should be overridden in subclasses with custom
  155. compilation code.
  156. Arguments:
  157. phrases -- a list of phrases that this vocabulary will contain
  158. """
  159. class DummyVocabulary(AbstractVocabulary):
  160. PATH_PREFIX = 'dummy-vocabulary'
  161. @property
  162. def is_compiled(self):
  163. """
  164. Checks if the vocabulary is compiled by checking if the revision
  165. file is readable.
  166. Returns:
  167. True if this vocabulary has been compiled, else False
  168. """
  169. return super(self.__class__, self).is_compiled
  170. def _compile_vocabulary(self, phrases):
  171. """
  172. Does nothing (because this is a dummy class for testing purposes).
  173. """
  174. pass
  175. class PocketsphinxVocabulary(AbstractVocabulary):
  176. PATH_PREFIX = 'pocketsphinx-vocabulary'
  177. @property
  178. def languagemodel_file(self):
  179. """
  180. Returns:
  181. The path of the the pocketsphinx languagemodel file as string
  182. """
  183. return os.path.join(self.path, 'languagemodel')
  184. @property
  185. def dictionary_file(self):
  186. """
  187. Returns:
  188. The path of the pocketsphinx dictionary file as string
  189. """
  190. return os.path.join(self.path, 'dictionary')
  191. @property
  192. def is_compiled(self):
  193. """
  194. Checks if the vocabulary is compiled by checking if the revision,
  195. languagemodel and dictionary files are readable.
  196. Returns:
  197. True if this vocabulary has been compiled, else False
  198. """
  199. return (super(self.__class__, self).is_compiled and
  200. os.access(self.languagemodel_file, os.R_OK) and
  201. os.access(self.dictionary_file, os.R_OK))
  202. @property
  203. def decoder_kwargs(self):
  204. """
  205. Convenience property to use this Vocabulary with the __init__() method
  206. of the pocketsphinx.Decoder class.
  207. Returns:
  208. A dict containing kwargs for the pocketsphinx.Decoder.__init__()
  209. method.
  210. Example:
  211. decoder = pocketsphinx.Decoder(**vocab_instance.decoder_kwargs,
  212. hmm='/path/to/hmm')
  213. """
  214. return {'lm': self.languagemodel_file, 'dict': self.dictionary_file}
  215. def _compile_vocabulary(self, phrases):
  216. """
  217. Compiles the vocabulary to the Pocketsphinx format by creating a
  218. languagemodel and a dictionary.
  219. Arguments:
  220. phrases -- a list of phrases that this vocabulary will contain
  221. """
  222. text = " ".join([("<s> %s </s>" % phrase) for phrase in phrases])
  223. self._logger.debug('Compiling languagemodel...')
  224. vocabulary = self._compile_languagemodel(text, self.languagemodel_file)
  225. self._logger.debug('Starting dictionary...')
  226. self._compile_dictionary(vocabulary, self.dictionary_file)
  227. def _compile_languagemodel(self, text, output_file):
  228. """
  229. Compiles the languagemodel from a text.
  230. Arguments:
  231. text -- the text the languagemodel will be generated from
  232. output_file -- the path of the file this languagemodel will
  233. be written to
  234. Returns:
  235. A list of all unique words this vocabulary contains.
  236. """
  237. with tempfile.NamedTemporaryFile(suffix='.vocab', delete=False) as f:
  238. vocab_file = f.name
  239. # Create vocab file from text
  240. self._logger.debug("Creating vocab file: '%s'", vocab_file)
  241. cmuclmtk.text2vocab(text, vocab_file)
  242. # Create language model from text
  243. self._logger.debug("Creating languagemodel file: '%s'", output_file)
  244. cmuclmtk.text2lm(text, output_file, vocab_file=vocab_file)
  245. # Get words from vocab file
  246. self._logger.debug("Getting words from vocab file and removing it " +
  247. "afterwards...")
  248. words = []
  249. with open(vocab_file, 'r') as f:
  250. for line in f:
  251. line = line.strip()
  252. if not line.startswith('#') and line not in ('<s>', '</s>'):
  253. words.append(line)
  254. os.remove(vocab_file)
  255. return words
  256. def _compile_dictionary(self, words, output_file):
  257. """
  258. Compiles the dictionary from a list of words.
  259. Arguments:
  260. words -- a list of all unique words this vocabulary contains
  261. output_file -- the path of the file this dictionary will
  262. be written to
  263. """
  264. # create the dictionary
  265. self._logger.debug("Getting phonemes for %d words...", len(words))
  266. g2pconverter = PhonetisaurusG2P(**PhonetisaurusG2P.get_config())
  267. phonemes = g2pconverter.translate(words)
  268. self._logger.debug("Creating dict file: '%s'", output_file)
  269. with open(output_file, "w") as f:
  270. for word, pronounciations in phonemes.items():
  271. for i, pronounciation in enumerate(pronounciations, start=1):
  272. if i == 1:
  273. line = "%s\t%s\n" % (word, pronounciation)
  274. else:
  275. line = "%s(%d)\t%s\n" % (word, i, pronounciation)
  276. f.write(line)
  277. class JuliusVocabulary(AbstractVocabulary):
  278. class VoxForgeLexicon(object):
  279. def __init__(self, fname, membername=None):
  280. self._dict = {}
  281. self.parse(fname, membername)
  282. @contextlib.contextmanager
  283. def open_dict(self, fname, membername=None):
  284. if tarfile.is_tarfile(fname):
  285. if not membername:
  286. raise ValueError('archive membername not set!')
  287. tf = tarfile.open(fname)
  288. f = tf.extractfile(membername)
  289. yield f
  290. f.close()
  291. tf.close()
  292. else:
  293. with open(fname) as f:
  294. yield f
  295. def parse(self, fname, membername=None):
  296. pattern = re.compile(r'\[(.+)\]\W(.+)')
  297. with self.open_dict(fname, membername=membername) as f:
  298. for line in f:
  299. matchobj = pattern.search(line)
  300. if matchobj:
  301. word, phoneme = [x.strip() for x in matchobj.groups()]
  302. if word in self._dict:
  303. self._dict[word].append(phoneme)
  304. else:
  305. self._dict[word] = [phoneme]
  306. def translate_word(self, word):
  307. if word in self._dict:
  308. return self._dict[word]
  309. else:
  310. return []
  311. PATH_PREFIX = 'julius-vocabulary'
  312. @property
  313. def dfa_file(self):
  314. """
  315. Returns:
  316. The path of the the julius dfa file as string
  317. """
  318. return os.path.join(self.path, 'dfa')
  319. @property
  320. def dict_file(self):
  321. """
  322. Returns:
  323. The path of the the julius dict file as string
  324. """
  325. return os.path.join(self.path, 'dict')
  326. @property
  327. def is_compiled(self):
  328. return (super(self.__class__, self).is_compiled and
  329. os.access(self.dfa_file, os.R_OK) and
  330. os.access(self.dict_file, os.R_OK))
  331. def _get_grammar(self, phrases):
  332. return {'S': [['NS_B', 'WORD_LOOP', 'NS_E']],
  333. 'WORD_LOOP': [['WORD_LOOP', 'WORD'], ['WORD']]}
  334. def _get_word_defs(self, lexicon, phrases):
  335. word_defs = {'NS_B': [('<s>', 'sil')],
  336. 'NS_E': [('</s>', 'sil')],
  337. 'WORD': []}
  338. words = []
  339. for phrase in phrases:
  340. if ' ' in phrase:
  341. for word in phrase.split(' '):
  342. words.append(word)
  343. else:
  344. words.append(phrase)
  345. for word in words:
  346. for phoneme in lexicon.translate_word(word):
  347. word_defs['WORD'].append((word, phoneme))
  348. return word_defs
  349. def _compile_vocabulary(self, phrases):
  350. prefix = 'jasper'
  351. tmpdir = tempfile.mkdtemp()
  352. lexicon_file = jasperpath.data('julius-stt', 'VoxForge.tgz')
  353. lexicon_archive_member = 'VoxForge/VoxForgeDict'
  354. profile_path = jasperpath.config('profile.yml')
  355. if os.path.exists(profile_path):
  356. with open(profile_path, 'r') as f:
  357. profile = yaml.safe_load(f)
  358. if 'julius' in profile:
  359. if 'lexicon' in profile['julius']:
  360. lexicon_file = profile['julius']['lexicon']
  361. if 'lexicon_archive_member' in profile['julius']:
  362. lexicon_archive_member = \
  363. profile['julius']['lexicon_archive_member']
  364. lexicon = JuliusVocabulary.VoxForgeLexicon(lexicon_file,
  365. lexicon_archive_member)
  366. # Create grammar file
  367. tmp_grammar_file = os.path.join(tmpdir,
  368. os.extsep.join([prefix, 'grammar']))
  369. with open(tmp_grammar_file, 'w') as f:
  370. grammar = self._get_grammar(phrases)
  371. for definition in grammar.pop('S'):
  372. f.write("%s: %s\n" % ('S', ' '.join(definition)))
  373. for name, definitions in grammar.items():
  374. for definition in definitions:
  375. f.write("%s: %s\n" % (name, ' '.join(definition)))
  376. # Create voca file
  377. tmp_voca_file = os.path.join(tmpdir, os.extsep.join([prefix, 'voca']))
  378. with open(tmp_voca_file, 'w') as f:
  379. for category, words in self._get_word_defs(lexicon,
  380. phrases).items():
  381. f.write("%% %s\n" % category)
  382. for word, phoneme in words:
  383. f.write("%s\t\t\t%s\n" % (word, phoneme))
  384. # mkdfa.pl
  385. olddir = os.getcwd()
  386. os.chdir(tmpdir)
  387. cmd = ['mkdfa.pl', str(prefix)]
  388. with tempfile.SpooledTemporaryFile() as out_f:
  389. subprocess.call(cmd, stdout=out_f, stderr=out_f)
  390. out_f.seek(0)
  391. for line in out_f.read().splitlines():
  392. line = line.strip()
  393. if line:
  394. self._logger.debug(line)
  395. os.chdir(olddir)
  396. tmp_dfa_file = os.path.join(tmpdir, os.extsep.join([prefix, 'dfa']))
  397. tmp_dict_file = os.path.join(tmpdir, os.extsep.join([prefix, 'dict']))
  398. shutil.move(tmp_dfa_file, self.dfa_file)
  399. shutil.move(tmp_dict_file, self.dict_file)
  400. shutil.rmtree(tmpdir)
  401. def get_phrases_from_module(module):
  402. """
  403. Gets phrases from a module.
  404. Arguments:
  405. module -- a module reference
  406. Returns:
  407. The list of phrases in this module.
  408. """
  409. return module.WORDS if hasattr(module, 'WORDS') else []
  410. def get_keyword_phrases():
  411. """
  412. Gets the keyword phrases from the keywords file in the jasper data dir.
  413. Returns:
  414. A list of keyword phrases.
  415. """
  416. phrases = []
  417. with open(jasperpath.data('keyword_phrases'), mode="r") as f:
  418. for line in f:
  419. phrase = line.strip()
  420. if phrase:
  421. phrases.append(phrase)
  422. return phrases
  423. def get_all_phrases():
  424. """
  425. Gets phrases from all modules.
  426. Returns:
  427. A list of phrases in all modules plus additional phrases passed to this
  428. function.
  429. """
  430. phrases = []
  431. modules = brain.Brain.get_modules()
  432. for module in modules:
  433. phrases.extend(get_phrases_from_module(module))
  434. return sorted(list(set(phrases)))
  435. if __name__ == '__main__':
  436. import argparse
  437. parser = argparse.ArgumentParser(description='Vocabcompiler Demo')
  438. parser.add_argument('--base-dir', action='store',
  439. help='the directory in which the vocabulary will be ' +
  440. 'compiled.')
  441. parser.add_argument('--debug', action='store_true',
  442. help='show debug messages')
  443. args = parser.parse_args()
  444. logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
  445. base_dir = args.base_dir if args.base_dir else tempfile.mkdtemp()
  446. phrases = get_all_phrases()
  447. print("Module phrases: %r" % phrases)
  448. for subclass in AbstractVocabulary.__subclasses__():
  449. if hasattr(subclass, 'PATH_PREFIX'):
  450. vocab = subclass(path=base_dir)
  451. print("Vocabulary in: %s" % vocab.path)
  452. print("Revision file: %s" % vocab.revision_file)
  453. print("Compiled revision: %s" % vocab.compiled_revision)
  454. print("Is compiled: %r" % vocab.is_compiled)
  455. print("Matches phrases: %r" % vocab.matches_phrases(phrases))
  456. if not vocab.is_compiled or not vocab.matches_phrases(phrases):
  457. print("Compiling...")
  458. vocab.compile(phrases)
  459. print("")
  460. print("Vocabulary in: %s" % vocab.path)
  461. print("Revision file: %s" % vocab.revision_file)
  462. print("Compiled revision: %s" % vocab.compiled_revision)
  463. print("Is compiled: %r" % vocab.is_compiled)
  464. print("Matches phrases: %r" % vocab.matches_phrases(phrases))
  465. print("")
  466. if not args.base_dir:
  467. print("Removing temporary directory '%s'..." % base_dir)
  468. shutil.rmtree(base_dir)