File size: 3,991 Bytes
c962c9a
 
 
 
08d1744
3e0d7e1
08d1744
 
c962c9a
392fff0
 
 
3e0d7e1
 
c2c3684
 
 
 
6045b6b
a6f1a9c
 
 
 
e228397
a6f1a9c
 
 
e228397
4c5bfad
b04ebb9
3e0d7e1
b04ebb9
3e0d7e1
02d8bcc
c962c9a
08d1744
5369878
f47a772
 
 
 
 
 
c962c9a
f47a772
c962c9a
f47a772
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08d1744
 
f47a772
 
 
 
 
 
 
 
 
08d1744
 
 
f47a772
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import gradio as gr
from TTS.api import TTS

# Init TTS
# Multilingual YourTTS model — text_to_speech below passes it a reference
# speaker_wav for voice cloning.
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=False)
# Dedicated single-language models for Chinese, German and Spanish;
# text_to_speech calls these without a reference wav (no cloning).
zh_tts = TTS(model_name="tts_models/zh-CN/baker/tacotron2-DDC-GST", progress_bar=False, gpu=False)
de_tts = TTS(model_name="tts_models/de/thorsten/vits", gpu=False)
es_tts = TTS(model_name="tts_models/es/mai/tacotron2-DDC", progress_bar=False, gpu=False)

def text_to_speech(text: str, speaker_wav, speaker_wav_file, language: str):
    """Synthesize `text` into "output.wav" and return the file path.

    Args:
        text: Text to synthesize.
        speaker_wav: Filepath of a microphone recording used as the voice
            to clone (multilingual model only).
        speaker_wav_file: Filepath of an uploaded recording; used as a
            fallback when `speaker_wav` is empty.
        language: Language code. "zh-CN", "de" and "es" are routed to
            dedicated single-language models (no cloning); any other code
            goes through the multilingual YourTTS model.

    Returns:
        The path of the generated WAV file ("output.wav").
    """
    # Prefer the microphone recording; fall back to the uploaded file.
    if speaker_wav_file and not speaker_wav:
        speaker_wav = speaker_wav_file
    file_path = "output.wav"
    # Dedicated models — these ignore any reference audio (no cloning).
    dedicated = {"zh-CN": zh_tts, "de": de_tts, "es": es_tts}
    model = dedicated.get(language)
    if model is not None:
        model.tts_to_file(text, file_path=file_path)
    elif speaker_wav is not None:
        # Multilingual model, cloning the voice from the reference audio.
        tts.tts_to_file(text, speaker_wav=speaker_wav, language=language, file_path=file_path)
    else:
        # No reference audio: use the model's first built-in speaker.
        tts.tts_to_file(text, speaker=tts.speakers[0], language=language, file_path=file_path)
    return file_path



# Title of the demo application.
title = "Voice-Cloning-Demo"

def toggle(choice):
    """Show the audio widget matching `choice` and hide/clear the other.

    Returns a pair of Gradio updates for (mic input, file input): the mic
    widget is visible when `choice` is "mic", the file widget otherwise.
    Both widgets have their value reset to None either way.
    """
    use_mic = choice == "mic"
    mic_update = gr.update(visible=use_mic, value=None)
    file_update = gr.update(visible=not use_mic, value=None)
    return mic_update, file_update

def handle_language_change(choice):
    """Toggle the voice-cloning inputs based on the selected language.

    Returns visibility updates for (upload-method radio, mic input, file
    input): all hidden when `choice` is a language served by a dedicated
    model without cloning support ("zh-CN", "de", "es"), all visible
    otherwise.
    """
    # Membership test replaces the original `== or == or ==` chain.
    supports_cloning = choice not in ("zh-CN", "de", "es")
    update = gr.update(visible=supports_cloning)
    return update, gr.update(visible=supports_cloning), gr.update(visible=supports_cloning)

# Caution note rendered under the language selector in the UI below.
# NOTE(review): "warming_text" looks like a typo for "warning_text"; renaming
# it would require updating its use in the Blocks UI, so it is left as-is.
warming_text = """Please note that Chinese, German, and Spanish are currently not supported for voice cloning."""

# Build the Gradio UI: text + reference-voice inputs on the left,
# synthesized audio output on the right.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Input the text", value="", max_lines=3)
            radio = gr.Radio(["mic", "file"], value="mic",
                             label="How would you like to upload your audio?")
            audio_input_mic = gr.Audio(label="Voice to clone", source="microphone", type="filepath", visible=True)
            audio_input_file = gr.Audio(label="Voice to clone", type="filepath", visible=False)
            lan_input = gr.Radio(label="Language", choices=["en", "zh-CN", "fr-fr", "pt-br", "de", "es"], value="en")
            # BUG FIX: was `gradio.Markdown(...)` — the module is imported as
            # `gr`, so the original raised NameError at startup.
            gr.Markdown(warming_text)

            btn = gr.Button("Submit")
        with gr.Column():
            audio_output = gr.Audio(label="Output")

    btn.click(text_to_speech, inputs=[text_input, audio_input_mic,
              audio_input_file, lan_input], outputs=audio_output)
    # Swap the mic/file widgets when the upload method changes.
    radio.change(toggle, radio, [audio_input_mic, audio_input_file])
    # Hide the cloning inputs for languages without cloning support.
    lan_input.change(handle_language_change, lan_input, [radio, audio_input_mic, audio_input_file])

demo.launch(enable_queue=True)