"""Gradio demo: synthesize speech from text with a Hugging Face VITS model."""

import gradio as gr
import torch
from transformers import AutoTokenizer, VitsModel

# Load TTS model and tokenizer.
# NOTE: the previous checkpoint, "facebook/wav2vec2-base-960h", is a
# speech-recognition (speech -> text) model and cannot generate audio.
# "facebook/mms-tts-eng" is an English text-to-speech (VITS) checkpoint.
model_name = "facebook/mms-tts-eng"
tts_model = VitsModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tts_model.eval()  # inference only: disable dropout


def text_to_speech(text):
    """Convert *text* to a speech waveform.

    Args:
        text: The sentence(s) to synthesize.

    Returns:
        A ``(sample_rate, waveform)`` tuple, the format Gradio's audio output
        component accepts, where ``waveform`` is a 1-D float NumPy array.
        Returns ``None`` (no audio) when ``text`` is empty or whitespace.
    """
    # The model cannot synthesize from an empty token sequence; show no audio
    # instead of surfacing an exception in the UI.
    if not text or not text.strip():
        return None

    inputs = tokenizer(text, return_tensors="pt")

    # Gradients are not needed for inference; skipping them saves memory.
    with torch.no_grad():
        waveform = tts_model(**inputs).waveform

    # ``waveform`` is (batch, samples); take the single item in the batch.
    audio = waveform[0].cpu().numpy()
    return tts_model.config.sampling_rate, audio


iface = gr.Interface(fn=text_to_speech, inputs="text", outputs="audio")

if __name__ == "__main__":
    # Only start the web server when run as a script, so importing this
    # module (e.g. to reuse ``text_to_speech``) has no side effects.
    iface.launch()