data_prep.sh - Dimex100 unziped corpus root directory Corpu…

/egs/spanish_dimex100/s5/local/data_prep.sh

https://github.com/kaldi-asr/kaldi · Shell · 286 lines · 140 code · 59 blank · 87 comment · 22 complexity · 73cba059e384bc49dab794b4e8e3a468 MD5 · raw file

#!/usr/bin/env bash

## Only run this file from the example root directory
##      $ ./local/data_prep.sh

mkdir -p "data/train" "data/test" "data/local"

source ./path.sh

# Dimex100 unziped corpus root directory
CORPUS_DIR="$1"

# Corpus data
#
#   Number of Different speakers:   100
#   Speakers common utterances:     10
#   Speakers individual utterances: 50
#
# Training/testing split
#
#   Common utterances for training:     10 (100%)
#   Individual utterances for training: 40 (80%)
#   Individual utterances for testing:  10 (20%)
N_SPEAKERS=100
N_COMMON_UTTERANCES=10
N_INDIVIDUAL_UTTERANCES=50
N_INDIVIDUAL_UTTERANCES_TRAINING=40
N_INDIVIDUAL_UTTERANCES_TESTING=10

# speakerId-utteranceId-[c|i]
#   c = speaker common utterances (10)
#   i = speaker individual utterances (50)
#
#   e.g.:
#       s001-01-c
#       ...
#       s001-10-c
#       ...
#       s001-01-i
#       ...
#       s001-50-i

## 80-20 train-test split
## Only individual utterances are used in testing
#    10/10 common utterances go into training
#    40/50 individual utterances go into training
#    10/50 individual utterances go into testing

function make_speaker_id
{
    printf "s%03d" "$1"
}

function make_sentence_id
{
    printf "%02d" "$1"
}

#####################################
# Convert wave audio to 16-bit, 16kHz
#####################################

function convert_to_16khz
{
    for i in $(seq 1 $N_SPEAKERS); do
        speaker_id=$(make_speaker_id $i)

        mkdir -p "$CORPUS_DIR/$speaker_id/audio_16k/comunes"
        mkdir -p "$CORPUS_DIR/$speaker_id/audio_16k/individuales"

        # Common utterances
        for j in $(seq 1 $N_COMMON_UTTERANCES); do
            sentence_id=$(make_sentence_id $j)
            old_wav_file="$CORPUS_DIR/$speaker_id/audio_editado/comunes/$speaker_id$sentence_id.wav"
            new_wav_file="$CORPUS_DIR/$speaker_id/audio_16k/comunes/$speaker_id$sentence_id.wav"
            sox "$old_wav_file" -r 16k "$new_wav_file"
        done

        # Individual utterances
        for k in $(seq 1 $N_INDIVIDUAL_UTTERANCES_TRAINING); do
            sentence_id=$(make_sentence_id $k)
            old_wav_file="$CORPUS_DIR/$speaker_id/audio_editado/individuales/$speaker_id$sentence_id.wav"
            new_wav_file="$CORPUS_DIR/$speaker_id/audio_16k/individuales/$speaker_id$sentence_id.wav"
            sox "$old_wav_file" -r 16k "$new_wav_file"
        done
    done
}

if [[ ! -d "$CORPUS_DIR/s001/audio_16k" ]]; then
    echo
    echo Converting audio from 44.1kHz to 16kHz
    echo
    convert_to_16khz
fi



#################
# data/train/text
# data/test/text
#################

# speakerId-utteranceId-[c|i]
#   c = speaker common utterances (10)
#   i = speaker individual utterances (50)
#
#   e.g.:
#       s001-01-c
#       ...
#       s001-10-c
#       ...
#       s001-01-i
#       ...
#       s001-50-i

## 80-20 train-test split
## Only individual utterances are used in testing
#    10/10 common utterances go into training
#    40/50 individual utterances go into training
#    10/50 individual utterances go into testing



function clean
{
    echo "$1" \
        | tr -d '\r' \
        | tr '[:upper:]' '[:lower:]' \
        | sed \
            -e 's/á/a/g' -e 's/é/e/g' -e 's/í/i/g' -e 's/ó/o/g' -e 's/ú/u/g' \
            -e 's/Á/a/g' -e 's/É/e/g' -e 's/Í/i/g' -e 's/Ó/o/g' -e 's/Ú/u/g' \
            -e 's/ñ/n/g' -e 's/Ñ/n/g' -e 's/ü/u/g' -e 's/Ü/u/g' \
        | tr -d -c "a-zA-Z0-9 \r\n"
        # | tr -d -c "_,.;:\-?¿!'\"()" \
}

### Generate data/train/text
for i in $(seq 1 $N_SPEAKERS); do
    speaker_id=$(make_speaker_id $i)

    # Common utterances
    for j in $(seq 1 $N_COMMON_UTTERANCES); do
        sentence_id=$(make_sentence_id $j)
        utterance_id="$speaker_id-$sentence_id-c"
        trans_file="$CORPUS_DIR/$speaker_id/texto/comunes/$speaker_id$sentence_id.txt"
        iconv -f iso-8859-1 -t utf-8 "$trans_file" > "$trans_file.utf8"
        if [ -f "$trans_file.utf8" ]; then
            transcription=$(cat "$trans_file.utf8")
            transcription=$(clean "$transcription")
            echo "$utterance_id $transcription" >> "data/train/text"
        fi
    done

    # Individual utterances
    for k in $(seq 1 $N_INDIVIDUAL_UTTERANCES_TRAINING); do
        sentence_id=$(make_sentence_id $k)
        utterance_id="$speaker_id-$sentence_id-i"
        trans_file="$CORPUS_DIR/$speaker_id/texto/individuales/$speaker_id$sentence_id.txt"
        iconv -f iso-8859-1 -t utf-8 "$trans_file" > "$trans_file.utf8"
        if [ -f "$trans_file.utf8" ]; then
            transcription=$(cat "$trans_file.utf8")
            transcription=$(clean "$transcription")
            echo "$utterance_id $transcription" >> "data/train/text"
        fi
    done

done


### Generate data/test/text
for i in $(seq 1 $N_SPEAKERS); do
    speaker_id=$(make_speaker_id $i)

    # Individual utterances
    for k in $(seq $N_INDIVIDUAL_UTTERANCES_TRAINING $N_INDIVIDUAL_UTTERANCES); do
        sentence_id=$(make_sentence_id $k)
        utterance_id="$speaker_id-$sentence_id-i"
        trans_file="$CORPUS_DIR/$speaker_id/texto/individuales/$speaker_id$sentence_id.txt"
        iconv -f iso-8859-1 -t utf-8 "$trans_file" > "$trans_file.utf8"
        if [ -f "$trans_file.utf8" ]; then
            transcription=$(cat "$trans_file.utf8")
            transcription=$(clean "$transcription")
            echo "$utterance_id $transcription" >> "data/test/text"
        fi
    done

done




####################
# data/train/wav.scp
# data/test/wav.scp
####################


### Generate data/train/wav.scp
for i in $(seq 1 $N_SPEAKERS); do
    speaker_id=$(make_speaker_id $i)

    # Common utterances
    for j in $(seq 1 $N_COMMON_UTTERANCES); do
        sentence_id=$(make_sentence_id $j)
        utterance_id="$speaker_id-$sentence_id-c"
        wav_file="$CORPUS_DIR/$speaker_id/audio_16k/comunes/$speaker_id$sentence_id.wav"
        if [ -f "$wav_file" ]; then
            echo "$utterance_id $wav_file" >> "data/train/wav.scp"
        fi
    done

    # Individual utterances
    for k in $(seq 1 $N_INDIVIDUAL_UTTERANCES_TRAINING); do
        sentence_id=$(make_sentence_id $k)
        utterance_id="$speaker_id-$sentence_id-i"
        wav_file="$CORPUS_DIR/$speaker_id/audio_16k/individuales/$speaker_id$sentence_id.wav"
        if [ -f "$wav_file" ]; then
            echo "$utterance_id $wav_file" >> "data/train/wav.scp"
        fi
    done

done


### Generate data/test/wav.scp
for i in $(seq 1 $N_SPEAKERS); do
    speaker_id=$(make_speaker_id $i)

    # Individual utterances
    for k in $(seq $N_INDIVIDUAL_UTTERANCES_TRAINING $N_INDIVIDUAL_UTTERANCES); do
        sentence_id=$(make_sentence_id $k)
        utterance_id="$speaker_id-$sentence_id-i"
        wav_file="$CORPUS_DIR/$speaker_id/audio_16k/individuales/$speaker_id$sentence_id.wav"
        if [ -f "$wav_file" ]; then
            echo "$utterance_id $wav_file" >> "data/test/wav.scp"
        fi
    done

done




####################
# data/train/utt2spk
# data/test/utt2spk
####################

# Take IDs from 'text' file to avoid including missing data's IDs

### Generate data/train/utt2spk
utterance_ids=$(cat "data/train/text" | cut -d' ' -f1)

while read -r utterance_id; do
    speaker_id=$(echo "$utterance_id" | cut -d'-' -f1)
    echo "$utterance_id $speaker_id" >> "data/train/utt2spk"
done <<< "$utterance_ids"


### Generate data/test/utt2spk
utterance_ids=$(cat "data/test/text" | cut -d' ' -f1)

while read -r utterance_id; do
    speaker_id=$(echo "$utterance_id" | cut -d'-' -f1)
    echo "$utterance_id $speaker_id" >> "data/test/utt2spk"
done <<< "$utterance_ids"


############
# Sort files
############

LC_ALL=C sort -o "data/train/text" "data/train/text"
LC_ALL=C sort -o "data/test/text" "data/test/text"
LC_ALL=C sort -o "data/train/wav.scp" "data/train/wav.scp"
LC_ALL=C sort -o "data/test/wav.scp" "data/test/wav.scp"
LC_ALL=C sort -o "data/train/utt2spk" "data/train/utt2spk"
LC_ALL=C sort -o "data/test/utt2spk" "data/test/utt2spk"


####################
# data/train/spk2utt
# data/test/spk2utt
####################
utils/utt2spk_to_spk2utt.pl "data/train/utt2spk" > "data/train/spk2utt"
utils/utt2spk_to_spk2utt.pl "data/test/utt2spk" > "data/test/spk2utt"