/egs/gale_mandarin/s5/local/gale_format_data.sh

https://github.com/tramphero/kaldi · Shell · 64 lines · 34 code · 17 blank · 13 comment · 11 complexity · 6248cc52cc1774f627e877ff5c48e024 MD5 · raw file

  #!/bin/bash
  # Copyright 2014 QCRI (author: Ahmed Ali)
  # Apache 2.0
  #
  # Prepares the final data and language directories for the recipe:
  #   1. copies data/local/{dev,train} to data/{dev,train};
  #   2. copies data/lang to data/lang_test and compiles the ARPA LM at
  #      data/local/lm/3gram-mincount/lm_unpruned.gz into data/lang_test/G.fst;
  #   3. runs sanity checks on G.fst, L_disambig.fst and their composition.
  #
  # All paths are relative, so run this from the directory that contains
  # path.sh and data/. Takes no arguments. Exits non-zero if path.sh or the
  # ARPA LM is missing, if any unguarded command fails, or if G.fst cannot be
  # determinized; the remaining checks only print a message.

  # NOTE(review): path.sh presumably puts the OpenFst/Kaldi binaries used
  # below (arpa2fst, fstprint, fsttablecompose, ...) on PATH -- confirm.
  if [ -f path.sh ]; then
    . ./path.sh
  else
    echo "missing path.sh" >&2
    exit 1
  fi

  # Strict mode is enabled only after path.sh has been sourced, as before.
  set -e -o pipefail
  set -x

  for dir in dev train; do
    cp -prT "data/local/$dir" "data/$dir"
  done

  export LC_ALL=C

  arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
  lang_test=data/lang_test

  if [ ! -f "$arpa_lm" ]; then
    echo "No such file $arpa_lm" >&2
    exit 1
  fi

  # Start from a clean copy of data/lang. -f makes a missing directory (first
  # run) a non-error, while a genuine removal failure still aborts under
  # `set -e`; if it were ignored, the cp below would nest the copy as
  # data/lang_test/lang and leave stale files in place.
  rm -rf "$lang_test"
  cp -r data/lang "$lang_test"

  # Compile the ARPA LM into G.fst using the word symbol table of lang_test.
  gunzip -c "$arpa_lm" |
    arpa2fst '--disambig-symbol=#0' \
      --read-symbol-table="$lang_test/words.txt" - "$lang_test/G.fst"

  echo "Checking how stochastic G is (the first of these numbers should be small):"
  # Informational only: a non-zero exit status must not abort the script.
  fstisstochastic "$lang_test/G.fst" || true

  ## Check lexicon.
  ## just have a look and make sure it seems sane.
  echo "First few lines of lexicon FST:"
  # `head` may close the pipe before fstprint finishes, which pipefail would
  # report as a failure; the listing is informational, so ignore the status.
  fstprint --isymbols="$lang_test/phones.txt" \
    --osymbols="$lang_test/words.txt" "$lang_test/L.fst" | head || true

  echo Performing further checks

  # Checking that G.fst is determinizable (fatal if it is not).
  if ! fstdeterminize "$lang_test/G.fst" /dev/null; then
    echo "Error determinizing G." >&2
    exit 1
  fi

  # Checking that L_disambig.fst is determinizable (reported, not fatal).
  fstdeterminize "$lang_test/L_disambig.fst" /dev/null ||
    echo "Error determinizing L." >&2

  # Checking that disambiguated lexicon times G is determinizable.
  # Note: we do this with fstdeterminizestar not fstdeterminize, as
  # fstdeterminize was taking forever (presumably relates to a bug
  # in this version of OpenFst that makes determinization slow for
  # some case).
  fsttablecompose "$lang_test/L_disambig.fst" "$lang_test/G.fst" |
    fstdeterminizestar >/dev/null || echo Error >&2

  # Checking that LG is stochastic (reported, not fatal).
  fsttablecompose "$lang_test/L_disambig.fst" "$lang_test/G.fst" |
    fstisstochastic || echo "LG is not stochastic" >&2

  echo gale_format_data succeeded.