thinhlpg committed on
Commit
c837795
1 Parent(s): 5d8cb3b

chore: clean up unnecessary stuff

Browse files
Files changed (1) hide show
  1. app.py +130 -301
app.py CHANGED
@@ -8,8 +8,6 @@ import torchaudio
8
  # download for mecab
9
  os.system("python -m unidic download")
10
 
11
- # By using XTTS you agree to CPML license https://coqui.ai/cpml
12
- os.environ["COQUI_TOS_AGREED"] = "1"
13
 
14
  import csv
15
  import datetime
@@ -35,7 +33,6 @@ from huggingface_hub import HfApi
35
 
36
  # will use api to restart space on an unrecoverable error
37
  api = HfApi(token=HF_TOKEN)
38
- repo_id = "coqui/xtts"
39
 
40
  # This will trigger downloading model
41
  print("Downloading if not downloaded Coqui XTTS V2")
@@ -78,301 +75,158 @@ def predict(
78
  prompt,
79
  language,
80
  audio_file_pth,
81
- mic_file_path,
82
- use_mic,
83
  voice_cleanup,
84
- no_lang_auto_detect,
85
- agree,
86
  ):
87
- if agree == True:
88
- if language not in supported_languages:
89
- gr.Warning(
90
- f"Language you put {language} in is not in is not in our Supported Languages, please choose from dropdown"
91
- )
92
-
93
- return (
94
- None,
95
- None,
96
- None,
97
- None,
98
- )
99
-
100
- language_predicted = langid.classify(prompt)[
101
- 0
102
- ].strip() # strip need as there is space at end!
103
-
104
- # tts expects chinese as zh-cn
105
- if language_predicted == "zh":
106
- # we use zh-cn
107
- language_predicted = "zh-cn"
108
-
109
- print(f"Detected language:{language_predicted}, Chosen language:{language}")
110
-
111
- # After text character length 15 trigger language detection
112
- if len(prompt) > 15:
113
- # allow any language for short text as some may be common
114
- # If user unchecks language autodetection it will not trigger
115
- # You may remove this completely for own use
116
- if language_predicted != language and not no_lang_auto_detect:
117
- # Please duplicate and remove this check if you really want this
118
- # Or auto-detector fails to identify language (which it can on pretty short text or mixed text)
119
- gr.Warning(
120
- f"It looks like your text isn’t the language you chose , if you’re sure the text is the same language you chose, please check disable language auto-detection checkbox"
121
- )
122
-
123
- return (
124
- None,
125
- None,
126
- None,
127
- None,
128
- )
129
-
130
- if use_mic == True:
131
- if mic_file_path is not None:
132
- speaker_wav = mic_file_path
133
- else:
134
- gr.Warning(
135
- "Please record your voice with Microphone, or uncheck Use Microphone to use reference audios"
136
- )
137
- return (
138
- None,
139
- None,
140
- None,
141
- None,
142
- )
143
-
144
- else:
145
- speaker_wav = audio_file_pth
146
-
147
- # Filtering for microphone input, as it has BG noise, maybe silence in beginning and end
148
- # This is fast filtering not perfect
149
 
150
- # Apply all on demand
151
- lowpassfilter = denoise = trim = loudness = True
 
 
 
 
152
 
153
- if lowpassfilter:
154
- lowpass_highpass = "lowpass=8000,highpass=75,"
155
- else:
156
- lowpass_highpass = ""
157
 
158
- if trim:
159
- # better to remove silence in beginning and end for microphone
160
- trim_silence = "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
161
- else:
162
- trim_silence = ""
163
 
164
- speaker_wav = speaker_wav
 
 
 
 
165
 
166
- if len(prompt) < 2:
167
- gr.Warning("Please give a longer prompt text")
168
- return (
169
- None,
170
- None,
171
- None,
172
- None,
173
- )
174
- if len(prompt) > 200:
175
- gr.Warning(
176
- "Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage"
177
- )
178
- return (
179
- None,
180
- None,
181
- None,
182
- None,
183
- )
184
 
185
  try:
186
- metrics_text = ""
187
- t_latent = time.time()
188
-
189
- # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
190
- try:
191
- (
192
- gpt_cond_latent,
193
- speaker_embedding,
194
- ) = MODEL.get_conditioning_latents(
195
- audio_path=speaker_wav,
196
- gpt_cond_len=30,
197
- gpt_cond_chunk_len=4,
198
- max_ref_length=60,
199
- )
200
- except Exception as e:
201
- print("Speaker encoding error", str(e))
202
- gr.Warning(
203
- "It appears something wrong with reference, did you unmute your microphone?"
204
- )
205
- return (
206
- None,
207
- None,
208
- None,
209
- None,
210
- )
211
-
212
- latent_calculation_time = time.time() - t_latent
213
- # metrics_text=f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
214
-
215
- # temporary comma fix
216
- prompt = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)
217
-
218
- wav_chunks = []
219
- ## Direct mode
220
-
221
- print("I: Generating new audio...")
222
- t0 = time.time()
223
- out = MODEL.inference(
224
- prompt,
225
- language,
226
  gpt_cond_latent,
227
  speaker_embedding,
228
- repetition_penalty=5.0,
229
- temperature=0.75,
 
 
 
230
  )
231
- inference_time = time.time() - t0
232
- print(
233
- f"I: Time to generate audio: {round(inference_time*1000)} milliseconds"
 
 
234
  )
235
- metrics_text += (
236
- f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  )
238
- real_time_factor = (time.time() - t0) / out["wav"].shape[-1] * 24000
239
- print(f"Real-time factor (RTF): {real_time_factor}")
240
- metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
241
- torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
242
-
243
- """
244
- print("I: Generating new audio in streaming mode...")
245
- t0 = time.time()
246
- chunks = model.inference_stream(
 
 
 
247
  prompt,
248
  language,
249
- gpt_cond_latent,
250
- speaker_embedding,
251
- repetition_penalty=7.0,
252
- temperature=0.85,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  )
254
 
255
- first_chunk = True
256
- for i, chunk in enumerate(chunks):
257
- if first_chunk:
258
- first_chunk_time = time.time() - t0
259
- metrics_text += f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
260
- first_chunk = False
261
- wav_chunks.append(chunk)
262
- print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
263
- inference_time = time.time() - t0
264
- print(
265
- f"I: Time to generate audio: {round(inference_time*1000)} milliseconds"
266
  )
267
- #metrics_text += (
268
- # f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
269
- #)
270
-
271
- wav = torch.cat(wav_chunks, dim=0)
272
- print(wav.shape)
273
- real_time_factor = (time.time() - t0) / wav.shape[0] * 24000
274
- print(f"Real-time factor (RTF): {real_time_factor}")
275
- metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
276
-
277
- torchaudio.save("output.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
278
- """
279
-
280
- except RuntimeError as e:
281
- if "device-side assert" in str(e):
282
- # cannot do anything on a CUDA device-side error, need to restart
283
- print(
284
- f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}",
285
- flush=True,
286
- )
287
- gr.Warning("Unhandled Exception encounter, please retry in a minute")
288
- print("Cuda device-assert Runtime encountered need restart")
289
- if not DEVICE_ASSERT_DETECTED:
290
- DEVICE_ASSERT_DETECTED = 1
291
- DEVICE_ASSERT_PROMPT = prompt
292
- DEVICE_ASSERT_LANG = language
293
-
294
- # just before restarting save what caused the issue so we can handle it in future
295
- # Uploading error data only happens for an unrecoverable error
296
- error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
297
- error_data = [
298
- error_time,
299
- prompt,
300
- language,
301
- audio_file_pth,
302
- mic_file_path,
303
- use_mic,
304
- voice_cleanup,
305
- no_lang_auto_detect,
306
- agree,
307
- ]
308
- error_data = [str(e) if type(e) != str else e for e in error_data]
309
- print(error_data)
310
- print(speaker_wav)
311
- write_io = StringIO()
312
- csv.writer(write_io).writerows([error_data])
313
- csv_upload = write_io.getvalue().encode()
314
-
315
- filename = error_time + "_" + str(uuid.uuid4()) + ".csv"
316
- print("Writing error csv")
317
- error_api = HfApi()
318
- error_api.upload_file(
319
- path_or_fileobj=csv_upload,
320
- path_in_repo=filename,
321
- repo_id="coqui/xtts-flagged-dataset",
322
- repo_type="dataset",
323
- )
324
-
325
- # speaker_wav
326
- print("Writing error reference audio")
327
- speaker_filename = (
328
- error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
329
- )
330
- error_api = HfApi()
331
- error_api.upload_file(
332
- path_or_fileobj=speaker_wav,
333
- path_in_repo=speaker_filename,
334
- repo_id="coqui/xtts-flagged-dataset",
335
- repo_type="dataset",
336
- )
337
-
338
- # HF Space specific.. This error is unrecoverable need to restart space
339
- space = api.get_space_runtime(repo_id=repo_id)
340
- if space.stage != "BUILDING":
341
- api.restart_space(repo_id=repo_id)
342
- else:
343
- print("TRIED TO RESTART but space is building")
344
 
 
 
 
 
345
  else:
346
- if "Failed to decode" in str(e):
347
- print("Speaker encoding error", str(e))
348
- gr.Warning(
349
- "It appears something wrong with reference, did you unmute your microphone?"
350
- )
351
- else:
352
- print("RuntimeError: non device-side assert error:", str(e))
353
- gr.Warning("Something unexpected happened please retry again.")
354
- return (
355
- None,
356
- None,
357
- None,
358
- None,
359
  )
360
- return (
361
- gr.make_waveform(
362
- audio="output.wav",
363
- ),
364
- "output.wav",
365
- metrics_text,
366
- speaker_wav,
367
- )
368
- else:
369
- gr.Warning("Please accept the Terms & Condition!")
370
- return (
371
- None,
372
- None,
373
- None,
374
- None,
375
- )
 
376
 
377
 
378
  title = "viXTTS Demo"
@@ -456,27 +310,6 @@ with gr.Blocks(analytics_enabled=False) as demo:
456
  info="Use your microphone to record audio",
457
  label="Use Microphone for Reference",
458
  )
459
- use_mic_gr = gr.Checkbox(
460
- label="Use Microphone",
461
- value=False,
462
- info="Notice: Microphone input may not work properly under traffic",
463
- )
464
- clean_ref_gr = gr.Checkbox(
465
- label="Cleanup Reference Voice",
466
- value=False,
467
- info="This check can improve output if your microphone or reference voice is noisy",
468
- )
469
- auto_det_lang_gr = gr.Checkbox(
470
- label="Do not use language auto-detect",
471
- value=False,
472
- info="Check to disable language auto-detection",
473
- )
474
- tos_gr = gr.Checkbox(
475
- label="Agree",
476
- value=False,
477
- info="I agree to the terms of the CPML: https://coqui.ai/cpml",
478
- )
479
-
480
  tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
481
 
482
  with gr.Column():
@@ -492,10 +325,6 @@ with gr.Blocks(analytics_enabled=False) as demo:
492
  language_gr,
493
  ref_gr,
494
  mic_gr,
495
- use_mic_gr,
496
- clean_ref_gr,
497
- auto_det_lang_gr,
498
- tos_gr,
499
  ],
500
  outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr],
501
  )
 
8
  # download for mecab
9
  os.system("python -m unidic download")
10
 
 
 
11
 
12
  import csv
13
  import datetime
 
33
 
34
  # will use api to restart space on an unrecoverable error
35
  api = HfApi(token=HF_TOKEN)
 
36
 
37
  # This will trigger downloading model
38
  print("Downloading if not downloaded Coqui XTTS V2")
 
75
  prompt,
76
  language,
77
  audio_file_pth,
 
 
78
  voice_cleanup,
 
 
79
  ):
80
+ if language not in supported_languages:
81
+ gr.Warning(
82
+ f"Language you put {language} in is not in is not in our Supported Languages, please choose from dropdown"
83
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
+ return (
86
+ None,
87
+ None,
88
+ None,
89
+ None,
90
+ )
91
 
92
+ speaker_wav = audio_file_pth
 
 
 
93
 
94
+ if len(prompt) < 2:
95
+ gr.Warning("Please give a longer prompt text")
96
+ return (None, None, None, None)
 
 
97
 
98
+ if len(prompt) > 200:
99
+ gr.Warning(
100
+ "Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage"
101
+ )
102
+ return (None, None, None, None)
103
 
104
+ try:
105
+ metrics_text = ""
106
+ t_latent = time.time()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  try:
109
+ (
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  gpt_cond_latent,
111
  speaker_embedding,
112
+ ) = MODEL.get_conditioning_latents(
113
+ audio_path=speaker_wav,
114
+ gpt_cond_len=30,
115
+ gpt_cond_chunk_len=4,
116
+ max_ref_length=60,
117
  )
118
+
119
+ except Exception as e:
120
+ print("Speaker encoding error", str(e))
121
+ gr.Warning(
122
+ "It appears something wrong with reference, did you unmute your microphone?"
123
  )
124
+ return (None, None, None, None)
125
+
126
+ prompt = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)
127
+
128
+ print("I: Generating new audio...")
129
+ t0 = time.time()
130
+ out = MODEL.inference(
131
+ prompt,
132
+ language,
133
+ gpt_cond_latent,
134
+ speaker_embedding,
135
+ repetition_penalty=5.0,
136
+ temperature=0.75,
137
+ )
138
+ inference_time = time.time() - t0
139
+ print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
140
+ metrics_text += (
141
+ f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
142
+ )
143
+ real_time_factor = (time.time() - t0) / out["wav"].shape[-1] * 24000
144
+ print(f"Real-time factor (RTF): {real_time_factor}")
145
+ metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
146
+ torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
147
+
148
+ except RuntimeError as e:
149
+ if "device-side assert" in str(e):
150
+ # cannot do anything on a CUDA device-side error, need to restart
151
+ print(
152
+ f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}",
153
+ flush=True,
154
  )
155
+ gr.Warning("Unhandled Exception encounter, please retry in a minute")
156
+ print("Cuda device-assert Runtime encountered need restart")
157
+ if not DEVICE_ASSERT_DETECTED:
158
+ DEVICE_ASSERT_DETECTED = 1
159
+ DEVICE_ASSERT_PROMPT = prompt
160
+ DEVICE_ASSERT_LANG = language
161
+
162
+ # just before restarting save what caused the issue so we can handle it in future
163
+ # Uploading error data only happens for an unrecoverable error
164
+ error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
165
+ error_data = [
166
+ error_time,
167
  prompt,
168
  language,
169
+ audio_file_pth,
170
+ voice_cleanup,
171
+ ]
172
+ error_data = [str(e) if type(e) != str else e for e in error_data]
173
+ print(error_data)
174
+ print(speaker_wav)
175
+ write_io = StringIO()
176
+ csv.writer(write_io).writerows([error_data])
177
+ csv_upload = write_io.getvalue().encode()
178
+
179
+ filename = error_time + "_" + str(uuid.uuid4()) + ".csv"
180
+ print("Writing error csv")
181
+ error_api = HfApi()
182
+ error_api.upload_file(
183
+ path_or_fileobj=csv_upload,
184
+ path_in_repo=filename,
185
+ repo_id="coqui/xtts-flagged-dataset",
186
+ repo_type="dataset",
187
  )
188
 
189
+ # speaker_wav
190
+ print("Writing error reference audio")
191
+ speaker_filename = error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
192
+ error_api = HfApi()
193
+ error_api.upload_file(
194
+ path_or_fileobj=speaker_wav,
195
+ path_in_repo=speaker_filename,
196
+ repo_id="coqui/xtts-flagged-dataset",
197
+ repo_type="dataset",
 
 
198
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
+ # HF Space specific.. This error is unrecoverable need to restart space
201
+ space = api.get_space_runtime(repo_id=repo_id)
202
+ if space.stage != "BUILDING":
203
+ api.restart_space(repo_id=repo_id)
204
  else:
205
+ print("TRIED TO RESTART but space is building")
206
+
207
+ else:
208
+ if "Failed to decode" in str(e):
209
+ print("Speaker encoding error", str(e))
210
+ gr.Warning(
211
+ "It appears something wrong with reference, did you unmute your microphone?"
 
 
 
 
 
 
212
  )
213
+ else:
214
+ print("RuntimeError: non device-side assert error:", str(e))
215
+ gr.Warning("Something unexpected happened please retry again.")
216
+ return (
217
+ None,
218
+ None,
219
+ None,
220
+ None,
221
+ )
222
+ return (
223
+ gr.make_waveform(
224
+ audio="output.wav",
225
+ ),
226
+ "output.wav",
227
+ metrics_text,
228
+ speaker_wav,
229
+ )
230
 
231
 
232
  title = "viXTTS Demo"
 
310
  info="Use your microphone to record audio",
311
  label="Use Microphone for Reference",
312
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
  tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
314
 
315
  with gr.Column():
 
325
  language_gr,
326
  ref_gr,
327
  mic_gr,
 
 
 
 
328
  ],
329
  outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr],
330
  )