thinhlpg commited on
Commit
773c7bd
1 Parent(s): 03e5637
Files changed (4) hide show
  1. .gitignore +142 -0
  2. README.md +5 -3
  3. app.py +676 -0
  4. requirements.txt +17 -0
.gitignore ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ vixtts-demo.code-workspace
2
+ output.wav
3
+ model/
4
+
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+
10
+ # C extensions
11
+ *.so
12
+
13
+ # Distribution / packaging
14
+ .Python
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ *.egg-info/
28
+ .installed.cfg
29
+ *.egg
30
+
31
+ # PyInstaller
32
+ # Usually these files are written by a python script from a template
33
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
34
+ *.manifest
35
+ *.spec
36
+
37
+ # Installer logs
38
+ pip-log.txt
39
+ pip-delete-this-directory.txt
40
+
41
+ # Unit test / coverage reports
42
+ htmlcov/
43
+ .tox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+
53
+ # Translations
54
+ *.mo
55
+ *.pot
56
+
57
+ # Django stuff:
58
+ *.log
59
+ local_settings.py
60
+ db.sqlite3
61
+ db.sqlite3-journal
62
+
63
+ # Flask stuff:
64
+ instance/
65
+ .webassets-cache
66
+
67
+ # Scrapy stuff:
68
+ .scrapy
69
+
70
+ # Sphinx documentation
71
+ docs/_build/
72
+
73
+ # PyBuilder
74
+ target/
75
+
76
+ # Jupyter Notebook
77
+ .ipynb_checkpoints
78
+
79
+ # IPython
80
+ profile_default/
81
+ ipython_config.py
82
+
83
+ # pyenv
84
+ .python-version
85
+
86
+ # pipenv
87
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
88
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
89
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
90
+ # install all needed dependencies.
91
+ #Pipfile.lock
92
+
93
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
94
+ __pypackages__/
95
+
96
+ # Celery stuff
97
+ celerybeat-schedule
98
+ celerybeat.pid
99
+
100
+ # SageMath parsed files
101
+ *.sage.py
102
+
103
+ # Environments
104
+ .env
105
+ .venv
106
+ env/
107
+ venv/
108
+ ENV/
109
+ env.bak/
110
+ venv.bak/
111
+
112
+ # Spyder project settings
113
+ .spyderproject
114
+ .spyproject
115
+
116
+ # Rope project settings
117
+ .ropeproject
118
+
119
+ # mkdocs documentation
120
+ /site
121
+
122
+ # mypy
123
+ .mypy_cache/
124
+ .dmypy.json
125
+ dmypy.json
126
+
127
+ # Pyre type checker
128
+ .pyre/
129
+
130
+ # pytype static type analyzer
131
+ .pytype/
132
+
133
+ # Cython debug symbols
134
+ cython_debug/
135
+
136
+ # C-extensions generated by Cython
137
+ *.c
138
+ *.cpp
139
+ *.pyx
140
+ *.pxd
141
+ *.pyd
142
+ *.so
README.md CHANGED
@@ -1,13 +1,15 @@
1
  ---
2
- title: Vixtts Demo
3
  emoji: 🚀
4
  colorFrom: indigo
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: 4.29.0
8
  app_file: app.py
9
  pinned: false
10
  license: other
 
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: viXTTS Demo
3
  emoji: 🚀
4
  colorFrom: indigo
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 3.48.0
8
  app_file: app.py
9
  pinned: false
10
  license: other
11
+ models:
12
+ - capleaf/viXTTS
13
  ---
14
 
15
+ Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>
app.py ADDED
@@ -0,0 +1,676 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import time
import uuid

import torch
import torchaudio

# download for mecab
# os.system("python -m unidic download")

# By using XTTS you agree to CPML license https://coqui.ai/cpml
os.environ["COQUI_TOS_AGREED"] = "1"

import csv
import datetime
import re
from io import StringIO

import gradio as gr

# langid is used to detect language for longer text
# Most users expect text to be their own language, there is checkbox to disable it
import langid
from huggingface_hub import hf_hub_download, snapshot_download
from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from underthesea import sent_tokenize
from unidecode import unidecode
from vinorm import TTSnorm

HF_TOKEN = os.environ.get("HF_TOKEN")

from huggingface_hub import HfApi

# will use api to restart space on an unrecoverable error
api = HfApi(token=HF_TOKEN)
repo_id = "coqui/xtts"

# This will trigger downloading model
print("Downloading if not downloaded Coqui XTTS V2")
checkpoint_dir = "model/"
# NOTE(review): this overwrites the Space repo id assigned above, so
# api.restart_space(repo_id=...) in predict() will target the *model* repo
# ("capleaf/viXTTS"), not the running Space — confirm this is intended.
repo_id = "capleaf/viXTTS"
use_deepspeed = False

os.makedirs(checkpoint_dir, exist_ok=True)

# Download model weights only when one of the required files is missing, so a
# warm restart of the Space skips the (large) snapshot download.
required_files = ["model.pth", "config.json", "vocab.json", "speakers_xtts.pth"]
files_in_dir = os.listdir(checkpoint_dir)
if not all(file in files_in_dir for file in required_files):
    snapshot_download(
        repo_id=repo_id,
        repo_type="model",
        local_dir=checkpoint_dir,
    )
    # speakers_xtts.pth is not part of the viXTTS snapshot; fetch it from the
    # upstream Coqui XTTS-v2 repo.
    hf_hub_download(
        repo_id="coqui/XTTS-v2",
        filename="speakers_xtts.pth",
        local_dir=checkpoint_dir,
    )

xtts_config = os.path.join(checkpoint_dir, "config.json")
config = XttsConfig()
config.load_json(xtts_config)
MODEL = Xtts.init_from_config(config)
MODEL.load_checkpoint(
    config, checkpoint_dir=checkpoint_dir, use_deepspeed=use_deepspeed
)
if torch.cuda.is_available():
    MODEL.cuda()

supported_languages = config.languages
if "vi" not in supported_languages:
    supported_languages.append("vi")

# Flags recording an unrecoverable CUDA device-side assert. predict() reads and
# assigns these via `global`; without this initialization the first error path
# raised UnboundLocalError instead of restarting the Space.
DEVICE_ASSERT_DETECTED = 0
DEVICE_ASSERT_PROMPT = None
DEVICE_ASSERT_LANG = None
def predict(
    prompt,
    language,
    audio_file_pth,
    mic_file_path,
    use_mic,
    voice_cleanup,
    no_lang_auto_detect,
    agree,
):
    """Synthesize `prompt` in `language`, cloning the voice of the reference audio.

    Args:
        prompt: Text to synthesize (2-200 characters for this demo).
        language: Target language code selected in the UI (must be in
            `supported_languages`).
        audio_file_pth: Path to the uploaded reference wav.
        mic_file_path: Path to the recorded microphone wav (may be None).
        use_mic: If True, use `mic_file_path` instead of `audio_file_pth`.
        voice_cleanup: UI flag; currently only logged on errors, the ffmpeg
            filter chain built below is not applied to the reference audio.
        no_lang_auto_detect: If True, skip the langid consistency check.
        agree: Terms-of-service checkbox state.

    Returns:
        (waveform video, "output.wav", metrics text, reference wav path) on
        success, or (None, None, None, None) after a gr.Warning on any
        validation or inference failure.
    """
    # These module-level flags are assigned on a fatal CUDA device-side assert;
    # without the `global` declaration the assignments made them locals and the
    # earlier read raised UnboundLocalError.
    global DEVICE_ASSERT_DETECTED, DEVICE_ASSERT_PROMPT, DEVICE_ASSERT_LANG

    if not agree:
        gr.Warning("Please accept the Terms & Condition!")
        return (None, None, None, None)

    if language not in supported_languages:
        gr.Warning(
            f"Language you put {language} in is not in our Supported Languages, please choose from dropdown"
        )
        return (None, None, None, None)

    language_predicted = langid.classify(prompt)[
        0
    ].strip()  # strip need as there is space at end!

    # tts expects chinese as zh-cn
    if language_predicted == "zh":
        # we use zh-cn
        language_predicted = "zh-cn"

    print(f"Detected language:{language_predicted}, Chosen language:{language}")

    # Only trust auto-detection on text longer than 15 characters — short
    # snippets are too ambiguous. The checkbox lets users opt out entirely.
    if len(prompt) > 15:
        if language_predicted != language and not no_lang_auto_detect:
            # Please duplicate and remove this check if you really want this
            # Or auto-detector fails to identify language (which it can on pretty short text or mixed text)
            gr.Warning(
                f"It looks like your text isn’t the language you chose , if you’re sure the text is the same language you chose, please check disable language auto-detection checkbox"
            )
            return (None, None, None, None)

    if use_mic:
        if mic_file_path is None:
            gr.Warning(
                "Please record your voice with Microphone, or uncheck Use Microphone to use reference audios"
            )
            return (None, None, None, None)
        speaker_wav = mic_file_path
    else:
        speaker_wav = audio_file_pth

    # Fast (not perfect) ffmpeg filter chain for noisy microphone input.
    # NOTE(review): these strings are built but never applied to speaker_wav in
    # this version — kept for parity with the upstream Coqui demo.
    lowpassfilter = denoise = trim = loudness = True
    lowpass_highpass = "lowpass=8000,highpass=75," if lowpassfilter else ""
    trim_silence = (
        "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
        if trim
        else ""
    )

    if len(prompt) < 2:
        gr.Warning("Please give a longer prompt text")
        return (None, None, None, None)
    if len(prompt) > 200:
        gr.Warning(
            "Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage"
        )
        return (None, None, None, None)

    try:
        metrics_text = ""
        t_latent = time.time()

        # Speaker conditioning: embed the reference audio once per request.
        try:
            (
                gpt_cond_latent,
                speaker_embedding,
            ) = MODEL.get_conditioning_latents(
                audio_path=speaker_wav,
                gpt_cond_len=30,
                gpt_cond_chunk_len=4,
                max_ref_length=60,
            )
        except Exception as e:
            print("Speaker encoding error", str(e))
            gr.Warning(
                "It appears something wrong with reference, did you unmute your microphone?"
            )
            return (None, None, None, None)

        latent_calculation_time = time.time() - t_latent
        # metrics_text=f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"

        # temporary comma fix: double sentence-final punctuation so XTTS pauses
        prompt = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)

        # Direct (non-streaming) inference.
        print("I: Generating new audio...")
        t0 = time.time()
        out = MODEL.inference(
            prompt,
            language,
            gpt_cond_latent,
            speaker_embedding,
            repetition_penalty=5.0,
            temperature=0.75,
        )
        inference_time = time.time() - t0
        print(
            f"I: Time to generate audio: {round(inference_time*1000)} milliseconds"
        )
        metrics_text += (
            f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
        )
        # RTF relative to the model's 24 kHz output rate.
        real_time_factor = (time.time() - t0) / out["wav"].shape[-1] * 24000
        print(f"Real-time factor (RTF): {real_time_factor}")
        metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
        torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)

    except RuntimeError as e:
        if "device-side assert" in str(e):
            # Cannot recover from a CUDA device-side assert; must restart.
            print(
                f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}",
                flush=True,
            )
            gr.Warning("Unhandled Exception encounter, please retry in a minute")
            print("Cuda device-assert Runtime encountered need restart")
            if not DEVICE_ASSERT_DETECTED:
                DEVICE_ASSERT_DETECTED = 1
                DEVICE_ASSERT_PROMPT = prompt
                DEVICE_ASSERT_LANG = language

            # Just before restarting, save what caused the issue so it can be
            # handled in future. Uploading error data only happens for
            # unrecoverable errors.
            error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
            error_data = [
                error_time,
                prompt,
                language,
                audio_file_pth,
                mic_file_path,
                use_mic,
                voice_cleanup,
                no_lang_auto_detect,
                agree,
            ]
            error_data = [str(item) if type(item) != str else item for item in error_data]
            print(error_data)
            print(speaker_wav)
            write_io = StringIO()
            csv.writer(write_io).writerows([error_data])
            csv_upload = write_io.getvalue().encode()

            filename = error_time + "_" + str(uuid.uuid4()) + ".csv"
            print("Writing error csv")
            error_api = HfApi()
            error_api.upload_file(
                path_or_fileobj=csv_upload,
                path_in_repo=filename,
                repo_id="coqui/xtts-flagged-dataset",
                repo_type="dataset",
            )

            # Also upload the reference audio that triggered the failure.
            print("Writing error reference audio")
            speaker_filename = (
                error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
            )
            error_api = HfApi()
            error_api.upload_file(
                path_or_fileobj=speaker_wav,
                path_in_repo=speaker_filename,
                repo_id="coqui/xtts-flagged-dataset",
                repo_type="dataset",
            )

            # HF Space specific: this error is unrecoverable, restart the Space
            # unless it is already (re)building.
            space = api.get_space_runtime(repo_id=repo_id)
            if space.stage != "BUILDING":
                api.restart_space(repo_id=repo_id)
            else:
                print("TRIED TO RESTART but space is building")

        else:
            if "Failed to decode" in str(e):
                print("Speaker encoding error", str(e))
                gr.Warning(
                    "It appears something wrong with reference, did you unmute your microphone?"
                )
            else:
                print("RuntimeError: non device-side assert error:", str(e))
                gr.Warning("Something unexpected happened please retry again.")
        return (None, None, None, None)

    return (
        gr.make_waveform(
            audio="output.wav",
        ),
        "output.wav",
        metrics_text,
        speaker_wav,
    )
title = "viXTTS Demo"

description = """

<br/>

This demo is currently running **XTTS v2.0.3** <a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a multilingual text-to-speech and voice-cloning model. This demo features zero-shot voice cloning, however, you can fine-tune XTTS for better results. Leave a star 🌟 on Github <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>, where our open-source inference and training code lives.

<br/>

Supported languages: Arabic: ar, Brazilian Portuguese: pt , Mandarin Chinese: zh-cn, Czech: cs, Dutch: nl, English: en, French: fr, German: de, Italian: it, Polish: pl, Russian: ru, Spanish: es, Turkish: tr, Japanese: ja, Korean: ko, Hungarian: hu, Hindi: hi

<br/>
"""


article = """

"""

# Each row matches predict()'s positional inputs:
# [prompt, language, reference wav, mic wav, use_mic, voice_cleanup,
#  no_lang_auto_detect, agree]
examples = [
    [
        "Once when I was six years old I saw a magnificent picture",
        "en",
        "examples/female.wav",
        None,
        False,
        False,
        False,
        True,
    ],
    [
        "Lorsque j'avais six ans j'ai vu, une fois, une magnifique image",
        "fr",
        "examples/male.wav",
        None,
        False,
        False,
        False,
        True,
    ],
    [
        "Als ich sechs war, sah ich einmal ein wunderbares Bild",
        "de",
        "examples/female.wav",
        None,
        False,
        False,
        False,
        True,
    ],
    [
        "Cuando tenía seis años, vi una vez una imagen magnífica",
        "es",
        "examples/male.wav",
        None,
        False,
        False,
        False,
        True,
    ],
    [
        "Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica",
        "pt",
        "examples/female.wav",
        None,
        False,
        False,
        False,
        True,
    ],
    [
        "Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek",
        "pl",
        "examples/male.wav",
        None,
        False,
        False,
        False,
        True,
    ],
    [
        "Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno",
        "it",
        "examples/female.wav",
        None,
        False,
        False,
        False,
        True,
    ],
    [
        "Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm",
        "tr",
        "examples/female.wav",
        None,
        False,
        False,
        False,
        True,
    ],
    [
        "Когда мне было шесть лет, я увидел однажды удивительную картинку",
        "ru",
        "examples/female.wav",
        None,
        False,
        False,
        False,
        True,
    ],
    [
        "Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat",
        "nl",
        "examples/male.wav",
        None,
        False,
        False,
        False,
        True,
    ],
    [
        "Když mi bylo šest let, viděl jsem jednou nádherný obrázek",
        "cs",
        "examples/female.wav",
        None,
        False,
        False,
        False,
        True,
    ],
    [
        "当我还只有六岁的时候, 看到了一副精彩的插画",
        "zh-cn",
        "examples/female.wav",
        None,
        False,
        False,
        False,
        True,
    ],
    [
        "かつて 六歳のとき、素晴らしい絵を見ました",
        "ja",
        "examples/female.wav",
        None,
        False,
        True,
        False,
        True,
    ],
    [
        "한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.",
        "ko",
        "examples/female.wav",
        None,
        False,
        True,
        False,
        True,
    ],
    [
        "Egyszer hat éves koromban láttam egy csodálatos képet",
        "hu",
        "examples/male.wav",
        None,
        False,
        True,
        False,
        True,
    ],
]
# Gradio UI: inputs mirror predict()'s parameter order; outputs are the
# waveform video, synthesized audio, metrics text, and the reference audio used.
with gr.Blocks(analytics_enabled=False) as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                """
                😳 Burh
                """
            )
        with gr.Column():
            # placeholder to align the image
            pass

    with gr.Row():
        with gr.Column():
            gr.Markdown(description)

    with gr.Row():
        with gr.Column():
            input_text_gr = gr.Textbox(
                label="Text Prompt",
                info="One or two sentences at a time is better. Up to 200 text characters.",
                value="Hi there, I'm your new voice clone. Try your best to upload quality audio.",
            )
            language_gr = gr.Dropdown(
                label="Language",
                info="Select an output language for the synthesised speech",
                choices=[
                    "vi",
                    "en",
                    "es",
                    "fr",
                    "de",
                    "it",
                    "pt",
                    "pl",
                    "tr",
                    "ru",
                    "nl",
                    "cs",
                    "ar",
                    "zh-cn",
                    "ja",
                    "ko",
                    "hu",
                    "hi",
                ],
                max_choices=1,
                value="vi",
            )
            ref_gr = gr.Audio(
                label="Reference Audio",
                info="Click on the ✎ button to upload your own target speaker audio",
                type="filepath",
                value="examples/female.wav",
            )
            mic_gr = gr.Audio(
                source="microphone",
                type="filepath",
                info="Use your microphone to record audio",
                label="Use Microphone for Reference",
            )
            use_mic_gr = gr.Checkbox(
                label="Use Microphone",
                value=False,
                info="Notice: Microphone input may not work properly under traffic",
            )
            clean_ref_gr = gr.Checkbox(
                label="Cleanup Reference Voice",
                value=False,
                info="This check can improve output if your microphone or reference voice is noisy",
            )
            auto_det_lang_gr = gr.Checkbox(
                label="Do not use language auto-detect",
                value=False,
                info="Check to disable language auto-detection",
            )
            tos_gr = gr.Checkbox(
                label="Agree",
                value=False,
                info="I agree to the terms of the CPML: https://coqui.ai/cpml",
            )

            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)

        with gr.Column():
            video_gr = gr.Video(label="Waveform Visual")
            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
            out_text_gr = gr.Text(label="Metrics")
            ref_audio_gr = gr.Audio(label="Reference Audio Used")

    with gr.Row():
        gr.Examples(
            examples,
            label="Examples",
            inputs=[
                input_text_gr,
                language_gr,
                ref_gr,
                mic_gr,
                use_mic_gr,
                clean_ref_gr,
                auto_det_lang_gr,
                tos_gr,
            ],
            outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr],
            fn=predict,
            cache_examples=False,
        )

    tts_button.click(
        predict,
        [
            input_text_gr,
            language_gr,
            ref_gr,
            mic_gr,
            use_mic_gr,
            clean_ref_gr,
            auto_det_lang_gr,
            tos_gr,
        ],
        outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr],
    )

demo.queue()
demo.launch(debug=True, show_api=True)
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Preinstall requirements from TTS
2
+ TTS @ git+https://github.com/thinhlpg/TTS.git@ff217b3f27b294de194cc59c5119d1e08b06413c
3
+ pydantic==1.10.13
4
+ python-multipart==0.0.6
5
+ typing-extensions>=4.8.0
6
+ cutlet
7
+ mecab-python3==1.0.6
8
+ unidic-lite==1.0.8
9
+ unidic==1.1.0
10
+ langid
11
+ deepspeed
12
+ pydub
13
+ gradio==3.48.0
14
+
15
+ # Vietnamese 101
16
+ vinorm==2.0.7
17
+ underthesea==6.8.0