/egs/spanish_dimex100/s5/local/data_prep.sh

https://github.com/kaldi-asr/kaldi · Shell · 286 lines · 140 code · 59 blank · 87 comment · 22 complexity · 73cba059e384bc49dab794b4e8e3a468 MD5 · raw file

  1. #!/usr/bin/env bash
  2. ## Only run this file from the example root directory
  3. ## $ ./local/data_prep.sh
  4. mkdir -p "data/train" "data/test" "data/local"
  5. source ./path.sh
  6. # Dimex100 unziped corpus root directory
  7. CORPUS_DIR="$1"
  8. # Corpus data
  9. #
  10. # Number of Different speakers: 100
  11. # Speakers common utterances: 10
  12. # Speakers individual utterances: 50
  13. #
  14. # Training/testing split
  15. #
  16. # Common utterances for training: 10 (100%)
  17. # Individual utterances for training: 40 (80%)
  18. # Individual utterances for testing: 10 (20%)
  19. N_SPEAKERS=100
  20. N_COMMON_UTTERANCES=10
  21. N_INDIVIDUAL_UTTERANCES=50
  22. N_INDIVIDUAL_UTTERANCES_TRAINING=40
  23. N_INDIVIDUAL_UTTERANCES_TESTING=10
  24. # speakerId-utteranceId-[c|i]
  25. # c = speaker common utterances (10)
  26. # i = speaker individual utterances (50)
  27. #
  28. # e.g.:
  29. # s001-01-c
  30. # ...
  31. # s001-10-c
  32. # ...
  33. # s001-01-i
  34. # ...
  35. # s001-50-i
  36. ## 80-20 train-test split
  37. ## Only individual utterances are used in testing
  38. # 10/10 common utterances go into training
  39. # 40/50 individual utterances go into training
  40. # 10/50 individual utterances go into testing
  41. function make_speaker_id
  42. {
  43. printf "s%03d" "$1"
  44. }
  45. function make_sentence_id
  46. {
  47. printf "%02d" "$1"
  48. }
  49. #####################################
  50. # Convert wave audio to 16-bit, 16kHz
  51. #####################################
  52. function convert_to_16khz
  53. {
  54. for i in $(seq 1 $N_SPEAKERS); do
  55. speaker_id=$(make_speaker_id $i)
  56. mkdir -p "$CORPUS_DIR/$speaker_id/audio_16k/comunes"
  57. mkdir -p "$CORPUS_DIR/$speaker_id/audio_16k/individuales"
  58. # Common utterances
  59. for j in $(seq 1 $N_COMMON_UTTERANCES); do
  60. sentence_id=$(make_sentence_id $j)
  61. old_wav_file="$CORPUS_DIR/$speaker_id/audio_editado/comunes/$speaker_id$sentence_id.wav"
  62. new_wav_file="$CORPUS_DIR/$speaker_id/audio_16k/comunes/$speaker_id$sentence_id.wav"
  63. sox "$old_wav_file" -r 16k "$new_wav_file"
  64. done
  65. # Individual utterances
  66. for k in $(seq 1 $N_INDIVIDUAL_UTTERANCES_TRAINING); do
  67. sentence_id=$(make_sentence_id $k)
  68. old_wav_file="$CORPUS_DIR/$speaker_id/audio_editado/individuales/$speaker_id$sentence_id.wav"
  69. new_wav_file="$CORPUS_DIR/$speaker_id/audio_16k/individuales/$speaker_id$sentence_id.wav"
  70. sox "$old_wav_file" -r 16k "$new_wav_file"
  71. done
  72. done
  73. }
  74. if [[ ! -d "$CORPUS_DIR/s001/audio_16k" ]]; then
  75. echo
  76. echo Converting audio from 44.1kHz to 16kHz
  77. echo
  78. convert_to_16khz
  79. fi
  80. #################
  81. # data/train/text
  82. # data/test/text
  83. #################
  84. # speakerId-utteranceId-[c|i]
  85. # c = speaker common utterances (10)
  86. # i = speaker individual utterances (50)
  87. #
  88. # e.g.:
  89. # s001-01-c
  90. # ...
  91. # s001-10-c
  92. # ...
  93. # s001-01-i
  94. # ...
  95. # s001-50-i
  96. ## 80-20 train-test split
  97. ## Only individual utterances are used in testing
  98. # 10/10 common utterances go into training
  99. # 40/50 individual utterances go into training
  100. # 10/50 individual utterances go into testing
  101. function clean
  102. {
  103. echo "$1" \
  104. | tr -d '\r' \
  105. | tr '[:upper:]' '[:lower:]' \
  106. | sed \
  107. -e 's/á/a/g' -e 's/é/e/g' -e 's/í/i/g' -e 's/ó/o/g' -e 's/ú/u/g' \
  108. -e 's/Á/a/g' -e 's/É/e/g' -e 's/Í/i/g' -e 's/Ó/o/g' -e 's/Ú/u/g' \
  109. -e 's/ñ/n/g' -e 's/Ñ/n/g' -e 's/ü/u/g' -e 's/Ü/u/g' \
  110. | tr -d -c "a-zA-Z0-9 \r\n"
  111. # | tr -d -c "_,.;:\-?¿!'\"()" \
  112. }
  113. ### Generate data/train/text
  114. for i in $(seq 1 $N_SPEAKERS); do
  115. speaker_id=$(make_speaker_id $i)
  116. # Common utterances
  117. for j in $(seq 1 $N_COMMON_UTTERANCES); do
  118. sentence_id=$(make_sentence_id $j)
  119. utterance_id="$speaker_id-$sentence_id-c"
  120. trans_file="$CORPUS_DIR/$speaker_id/texto/comunes/$speaker_id$sentence_id.txt"
  121. iconv -f iso-8859-1 -t utf-8 "$trans_file" > "$trans_file.utf8"
  122. if [ -f "$trans_file.utf8" ]; then
  123. transcription=$(cat "$trans_file.utf8")
  124. transcription=$(clean "$transcription")
  125. echo "$utterance_id $transcription" >> "data/train/text"
  126. fi
  127. done
  128. # Individual utterances
  129. for k in $(seq 1 $N_INDIVIDUAL_UTTERANCES_TRAINING); do
  130. sentence_id=$(make_sentence_id $k)
  131. utterance_id="$speaker_id-$sentence_id-i"
  132. trans_file="$CORPUS_DIR/$speaker_id/texto/individuales/$speaker_id$sentence_id.txt"
  133. iconv -f iso-8859-1 -t utf-8 "$trans_file" > "$trans_file.utf8"
  134. if [ -f "$trans_file.utf8" ]; then
  135. transcription=$(cat "$trans_file.utf8")
  136. transcription=$(clean "$transcription")
  137. echo "$utterance_id $transcription" >> "data/train/text"
  138. fi
  139. done
  140. done
  141. ### Generate data/test/text
  142. for i in $(seq 1 $N_SPEAKERS); do
  143. speaker_id=$(make_speaker_id $i)
  144. # Individual utterances
  145. for k in $(seq $N_INDIVIDUAL_UTTERANCES_TRAINING $N_INDIVIDUAL_UTTERANCES); do
  146. sentence_id=$(make_sentence_id $k)
  147. utterance_id="$speaker_id-$sentence_id-i"
  148. trans_file="$CORPUS_DIR/$speaker_id/texto/individuales/$speaker_id$sentence_id.txt"
  149. iconv -f iso-8859-1 -t utf-8 "$trans_file" > "$trans_file.utf8"
  150. if [ -f "$trans_file.utf8" ]; then
  151. transcription=$(cat "$trans_file.utf8")
  152. transcription=$(clean "$transcription")
  153. echo "$utterance_id $transcription" >> "data/test/text"
  154. fi
  155. done
  156. done
  157. ####################
  158. # data/train/wav.scp
  159. # data/test/wav.scp
  160. ####################
  161. ### Generate data/train/wav.scp
  162. for i in $(seq 1 $N_SPEAKERS); do
  163. speaker_id=$(make_speaker_id $i)
  164. # Common utterances
  165. for j in $(seq 1 $N_COMMON_UTTERANCES); do
  166. sentence_id=$(make_sentence_id $j)
  167. utterance_id="$speaker_id-$sentence_id-c"
  168. wav_file="$CORPUS_DIR/$speaker_id/audio_16k/comunes/$speaker_id$sentence_id.wav"
  169. if [ -f "$wav_file" ]; then
  170. echo "$utterance_id $wav_file" >> "data/train/wav.scp"
  171. fi
  172. done
  173. # Individual utterances
  174. for k in $(seq 1 $N_INDIVIDUAL_UTTERANCES_TRAINING); do
  175. sentence_id=$(make_sentence_id $k)
  176. utterance_id="$speaker_id-$sentence_id-i"
  177. wav_file="$CORPUS_DIR/$speaker_id/audio_16k/individuales/$speaker_id$sentence_id.wav"
  178. if [ -f "$wav_file" ]; then
  179. echo "$utterance_id $wav_file" >> "data/train/wav.scp"
  180. fi
  181. done
  182. done
  183. ### Generate data/test/wav.scp
  184. for i in $(seq 1 $N_SPEAKERS); do
  185. speaker_id=$(make_speaker_id $i)
  186. # Individual utterances
  187. for k in $(seq $N_INDIVIDUAL_UTTERANCES_TRAINING $N_INDIVIDUAL_UTTERANCES); do
  188. sentence_id=$(make_sentence_id $k)
  189. utterance_id="$speaker_id-$sentence_id-i"
  190. wav_file="$CORPUS_DIR/$speaker_id/audio_16k/individuales/$speaker_id$sentence_id.wav"
  191. if [ -f "$wav_file" ]; then
  192. echo "$utterance_id $wav_file" >> "data/test/wav.scp"
  193. fi
  194. done
  195. done
  196. ####################
  197. # data/train/utt2spk
  198. # data/test/utt2spk
  199. ####################
  200. # Take IDs from 'text' file to avoid including missing data's IDs
  201. ### Generate data/train/utt2spk
  202. utterance_ids=$(cat "data/train/text" | cut -d' ' -f1)
  203. while read -r utterance_id; do
  204. speaker_id=$(echo "$utterance_id" | cut -d'-' -f1)
  205. echo "$utterance_id $speaker_id" >> "data/train/utt2spk"
  206. done <<< "$utterance_ids"
  207. ### Generate data/test/utt2spk
  208. utterance_ids=$(cat "data/test/text" | cut -d' ' -f1)
  209. while read -r utterance_id; do
  210. speaker_id=$(echo "$utterance_id" | cut -d'-' -f1)
  211. echo "$utterance_id $speaker_id" >> "data/test/utt2spk"
  212. done <<< "$utterance_ids"
  213. ############
  214. # Sort files
  215. ############
  216. LC_ALL=C sort -o "data/train/text" "data/train/text"
  217. LC_ALL=C sort -o "data/test/text" "data/test/text"
  218. LC_ALL=C sort -o "data/train/wav.scp" "data/train/wav.scp"
  219. LC_ALL=C sort -o "data/test/wav.scp" "data/test/wav.scp"
  220. LC_ALL=C sort -o "data/train/utt2spk" "data/train/utt2spk"
  221. LC_ALL=C sort -o "data/test/utt2spk" "data/test/utt2spk"
  222. ####################
  223. # data/train/spk2utt
  224. # data/test/spk2utt
  225. ####################
  226. utils/utt2spk_to_spk2utt.pl "data/train/utt2spk" > "data/train/spk2utt"
  227. utils/utt2spk_to_spk2utt.pl "data/test/utt2spk" > "data/test/spk2utt"