OmniVoice/examples/run_eval.sh at master · ModelsLab/OmniVoice · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
#!/bin/bash

# Evaluate OmniVoice models on TTS benchmarks.

# Stage 1: Download the test sets and evaluation models.
# Stage 2: LibriSpeech-PC
# Stage 3: seedtts_en
# Stage 4: seedtts_zh
# Stage 5: minimax
# Stage 6: fleurs

# Abort on the first failing command, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# Specify the stages to run by setting the `stage` and `stop_stage` variables.
# Every stage N with stage <= N <= stop_stage is executed.
stage=1
stop_stage=6

# Available GPUs for evaluation. Adjust this according to your setup.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"

# Specify the checkpoint to evaluate.
CHECKPOINT=k2-fsa/OmniVoice
emilia_checkpoint=false

# Alternative configuration for the OmniVoice-Emilia checkpoint.
# NOTE(review): the commented-out CHECKPOINT below is identical to the default
# above; presumably it should name the Emilia checkpoint -- confirm before use.
# CHECKPOINT=k2-fsa/OmniVoice
# emilia_checkpoint=true

# Options appended to every `omnivoice.cli.infer_batch` call.
# This is deliberately a plain string rather than an array: the stages expand
# it unquoted and rely on word splitting, so no value may contain whitespace.
infer_options="--preprocess_prompt False --postprocess_output False --batch_duration 600"
if [ "${emilia_checkpoint}" = true ]; then
    # For the OmniVoice-Emilia checkpoint, denoise is set to False and lang_id
    # to None, as the model is trained without prompt denoising or language id.
    infer_options+=" --denoise False --lang_id None"
fi
infer_options+=" --audio_chunk_threshold 1000"

# Put the repository root (the parent of this script's directory) on
# PYTHONPATH. The assignment is kept separate from `export` so that a failing
# `cd` aborts the script under `set -e` instead of being masked by the exit
# status of `export`. The `:+` expansion appends the previous PYTHONPATH only
# when it is non-empty, which avoids leaving a trailing ":" (an empty entry).
repo_root="$(cd "$(dirname "$0")/.." && pwd)"
export PYTHONPATH="${repo_root}${PYTHONPATH:+:${PYTHONPATH}}"

# Download locations, relative to the current working directory. The trailing
# slashes are part of the values the stages receive and are kept as they were.
download_dir="download"
TTS_EVAL_MODEL_DIR="${download_dir}/tts_eval_models/"
TTS_EVAL_DATA_DIR="${download_dir}/tts_eval_datasets/"

# Print the path of the test.jsonl manifest that belongs to a test name.
# Globals:   TTS_EVAL_DATA_DIR (read)
# Arguments: $1 - one of librispeech_pc, seedtts_en, seedtts_zh, minimax, fleurs
# Outputs:   "${TTS_EVAL_DATA_DIR}/<manifest>" on stdout, or an empty line
#            when the test name is not recognised
# Returns:   0 for a known test name, 1 otherwise
get_test_list() {
    local manifest
    case "$1" in
        librispeech_pc)
            manifest="librispeech_pc_test_clean.jsonl"
            ;;
        seedtts_en)
            manifest="seedtts_test_en.jsonl"
            ;;
        seedtts_zh)
            manifest="seedtts_test_zh.jsonl"
            ;;
        minimax)
            manifest="minimax_multilingual_24.jsonl"
            ;;
        fleurs)
            manifest="fleurs_multilingual_102.jsonl"
            ;;
        *)
            echo ""
            return 1
            ;;
    esac
    echo "${TTS_EVAL_DATA_DIR}/${manifest}"
}

# ============================================================
# Stage 1: Prepare the test sets and evaluation models
# ============================================================

# Stage 1 fetches the benchmark manifests, the audio archives (which are
# unpacked in place) and the evaluation models from the Hugging Face Hub.
# All expansions are quoted so that a download directory containing spaces or
# glob characters is passed to each command as a single argument.
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "Stage 1: Download test sets and evaluation models"

    hf_repo=k2-fsa/TTS_eval_datasets
    mkdir -p "${TTS_EVAL_DATA_DIR}/"

    # Manifests: downloaded as-is into the dataset directory.
    for file in \
        librispeech_pc_test_clean.jsonl \
        librispeech_pc_test_clean_transcript.jsonl \
        seedtts_test_en.jsonl \
        seedtts_test_zh.jsonl \
        minimax_multilingual_24.jsonl \
        fleurs_multilingual_102.jsonl; do
        echo "Downloading ${file}..."
        huggingface-cli download \
                --repo-type dataset \
                --local-dir "${TTS_EVAL_DATA_DIR}/" \
                "${hf_repo}" \
                "${file}"
    done

    # Archives: downloaded, then extracted into the same dataset directory.
    for file in \
        librispeech_pc_testset.tar.gz \
        seedtts_testset.tar.gz \
        minimax_multilingual_24.tar.gz \
        fleurs_multilingual_102.tar.gz; do
        echo "Downloading ${file}..."
        huggingface-cli download \
                --repo-type dataset \
                --local-dir "${TTS_EVAL_DATA_DIR}/" \
                "${hf_repo}" \
                "${file}"

        echo "Extracting ${file}..."
        tar -xzf "${TTS_EVAL_DATA_DIR}/${file}" -C "${TTS_EVAL_DATA_DIR}/"
    done

    # Evaluation models: the whole repository is downloaded (no file filter).
    echo "Download all evaluation models"
    hf_repo=k2-fsa/TTS_eval_models
    mkdir -p "${TTS_EVAL_MODEL_DIR}"
    huggingface-cli download \
        --local-dir "${TTS_EVAL_MODEL_DIR}" \
        "${hf_repo}"
fi

# ============================================================
# Stage 2: Evaluation on LibriSpeech-PC
# ============================================================


# Stage 2 runs batch inference on LibriSpeech-PC and then the speaker
# similarity, WER and MOS evaluation modules on the generated audio.
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    echo "Stage 2: Evaluation on LibriSpeech-PC"

    # Inference results go to out_dir; every evaluation log is written next to
    # it as "<out_dir>.<metric>.log".
    out_dir="results/librispeech_pc"
    manifest="$(get_test_list librispeech_pc)"
    # The WER module is given the transcript manifest instead of the
    # inference manifest.
    transcripts="${TTS_EVAL_DATA_DIR}/librispeech_pc_test_clean_transcript.jsonl"

    # ${infer_options} is intentionally left unquoted so that it splits into
    # separate command-line arguments.
    python -m omnivoice.cli.infer_batch \
        --model "${CHECKPOINT}" --test_list "${manifest}" \
        --res_dir "${out_dir}" ${infer_options}

    python -m omnivoice.eval.speaker_similarity.sim \
        --wav-path "${out_dir}" --test-list "${manifest}" \
        --decode-path "${out_dir}.sim.log" --model-dir "${TTS_EVAL_MODEL_DIR}"

    python -m omnivoice.eval.wer.hubert \
        --wav-path "${out_dir}" --test-list "${transcripts}" \
        --decode-path "${out_dir}.wer.log" --model-dir "${TTS_EVAL_MODEL_DIR}"

    python -m omnivoice.eval.mos.utmos \
        --wav-path "${out_dir}" --test-list "${manifest}" \
        --decode-path "${out_dir}.mos.log" --model-dir "${TTS_EVAL_MODEL_DIR}"
fi


# ============================================================
# Stage 3: Evaluation on Seed-TTS en
# ============================================================

# Stage 3 runs batch inference on the Seed-TTS English set and then the
# speaker similarity, WER (with --lang en) and MOS evaluation modules.
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    echo "Stage 3: Evaluation on Seed-TTS en"

    # Inference results go to out_dir; every evaluation log is written next to
    # it as "<out_dir>.<metric>.log".
    out_dir="results/seedtts_en"
    manifest="$(get_test_list seedtts_en)"

    # ${infer_options} is intentionally left unquoted so that it splits into
    # separate command-line arguments.
    python -m omnivoice.cli.infer_batch \
        --model "${CHECKPOINT}" --test_list "${manifest}" \
        --res_dir "${out_dir}" ${infer_options}

    python -m omnivoice.eval.speaker_similarity.sim \
        --wav-path "${out_dir}" --test-list "${manifest}" \
        --decode-path "${out_dir}.sim.log" --model-dir "${TTS_EVAL_MODEL_DIR}"

    python -m omnivoice.eval.wer.seedtts \
        --wav-path "${out_dir}" --test-list "${manifest}" \
        --decode-path "${out_dir}.wer.log" --model-dir "${TTS_EVAL_MODEL_DIR}" \
        --lang en

    python -m omnivoice.eval.mos.utmos \
        --wav-path "${out_dir}" --test-list "${manifest}" \
        --decode-path "${out_dir}.mos.log" --model-dir "${TTS_EVAL_MODEL_DIR}"
fi


# ============================================================
# Stage 4: Evaluation on Seed-TTS zh
# ============================================================

# Stage 4 runs batch inference on the Seed-TTS Chinese set and then the
# speaker similarity, WER (with --lang zh) and MOS evaluation modules.
if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
    echo "Stage 4: Evaluation on Seed-TTS zh"

    # Inference results go to out_dir; every evaluation log is written next to
    # it as "<out_dir>.<metric>.log".
    out_dir="results/seedtts_zh"
    manifest="$(get_test_list seedtts_zh)"

    # ${infer_options} is intentionally left unquoted so that it splits into
    # separate command-line arguments.
    python -m omnivoice.cli.infer_batch \
        --model "${CHECKPOINT}" --test_list "${manifest}" \
        --res_dir "${out_dir}" ${infer_options}

    python -m omnivoice.eval.speaker_similarity.sim \
        --wav-path "${out_dir}" --test-list "${manifest}" \
        --decode-path "${out_dir}.sim.log" --model-dir "${TTS_EVAL_MODEL_DIR}"

    python -m omnivoice.eval.wer.seedtts \
        --wav-path "${out_dir}" --test-list "${manifest}" \
        --decode-path "${out_dir}.wer.log" --model-dir "${TTS_EVAL_MODEL_DIR}" \
        --lang zh

    python -m omnivoice.eval.mos.utmos \
        --wav-path "${out_dir}" --test-list "${manifest}" \
        --decode-path "${out_dir}.mos.log" --model-dir "${TTS_EVAL_MODEL_DIR}"
fi


# ============================================================
# Stage 5: Evaluation on MiniMax multilingual
# ============================================================

# Stage 5 runs batch inference on the MiniMax multilingual set and then the
# speaker similarity and WER evaluation modules (no MOS module is run here).
if [ "${stage}" -le 5 ] && [ "${stop_stage}" -ge 5 ]; then
    echo "Stage 5: Evaluation on MiniMax multilingual"

    # Inference results go to out_dir; every evaluation log is written next to
    # it as "<out_dir>.<metric>.log".
    out_dir="results/minimax"
    manifest="$(get_test_list minimax)"

    # ${infer_options} is intentionally left unquoted so that it splits into
    # separate command-line arguments.
    python -m omnivoice.cli.infer_batch \
        --model "${CHECKPOINT}" --test_list "${manifest}" \
        --res_dir "${out_dir}" ${infer_options}

    python -m omnivoice.eval.speaker_similarity.sim \
        --wav-path "${out_dir}" --test-list "${manifest}" \
        --decode-path "${out_dir}.sim.log" --model-dir "${TTS_EVAL_MODEL_DIR}"

    python -m omnivoice.eval.wer.minimax \
        --wav-path "${out_dir}" --test-list "${manifest}" \
        --decode-path "${out_dir}.wer.log" --model-dir "${TTS_EVAL_MODEL_DIR}"
fi


# ============================================================
# Stage 6: Evaluation on FLEURS multilingual
# ============================================================

# Stage 6 runs batch inference on the FLEURS multilingual set, then the
# speaker similarity module and the FLEURS WER script.
if [ "${stage}" -le 6 ] && [ "${stop_stage}" -ge 6 ]; then
    echo "Stage 6: Evaluation on FLEURS multilingual"
    wav_path="results/fleurs"
    test_jsonl="$(get_test_list fleurs)"

    # ${infer_options} is intentionally left unquoted so that it splits into
    # separate command-line arguments.
    python -m omnivoice.cli.infer_batch \
        --model "${CHECKPOINT}" \
        --test_list "${test_jsonl}" \
        --res_dir "${wav_path}"  ${infer_options}


    python -m omnivoice.eval.speaker_similarity.sim \
        --wav-path "${wav_path}" \
        --test-list "${test_jsonl}" \
        --decode-path "${wav_path}.sim.log" \
        --model-dir "${TTS_EVAL_MODEL_DIR}"

    # Evaluation on FLEURS requires omnilingual-asr, which has dependencies that
    # conflict with other packages (at least the transformers package) in our project.

    # To evaluate on FLEURS, we suggest that users set up a separate virtual
    # environment in which to install omnilingual-asr. Installation instructions
    # can be found at https://github.com/facebookresearch/omnilingual-asr

    # Locate fleurs.py relative to this script (its parent directory is treated
    # as the repository root, as is done for PYTHONPATH) rather than relative to
    # the caller's working directory, so the stage also works when the script
    # is started from outside its own directory. The path is quoted so that
    # directories containing spaces are handled.
    fleurs_script="$(cd "$(dirname "$0")/.." && pwd)/omnivoice/eval/wer/fleurs.py"

    python "${fleurs_script}" \
        --wav-path "${wav_path}" \
        --test-list "${test_jsonl}" \
        --decode-path "${wav_path}.wer.log" \
        --model-card omniASR_LLM_Unlimited_7B_v2 \
        --chunk-size 100 \
        --batch-size 50
fi