thinhlpg committed on
Commit
f58d262
1 Parent(s): 376b5d9

feat: add vietnamese normalize

Browse files
Files changed (2) hide show
  1. .gitignore +1 -0
  2. app.py +55 -34
.gitignore CHANGED
@@ -1,6 +1,7 @@
1
  vixtts-demo.code-workspace
2
  output.wav
3
  model/
 
4
 
5
  # Byte-compiled / optimized / DLL files
6
  __pycache__/
 
1
  vixtts-demo.code-workspace
2
  output.wav
3
  model/
4
+ test_api.ipynb
5
 
6
  # Byte-compiled / optimized / DLL files
7
  __pycache__/
app.py CHANGED
@@ -1,14 +1,3 @@
1
- import os
2
- import time
3
- import uuid
4
-
5
- import torch
6
- import torchaudio
7
-
8
- # download for mecab
9
- os.system("python -m unidic download")
10
-
11
-
12
  import csv
13
  import datetime
14
  import os
@@ -68,36 +57,55 @@ if not "vi" in supported_languages:
68
  supported_languages.append("vi")
69
 
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  def predict(
72
  prompt,
73
  language,
74
  audio_file_pth,
75
- voice_cleanup,
76
  ):
77
  if language not in supported_languages:
78
- gr.Warning(
79
  f"Language you put {language} in is not in is not in our Supported Languages, please choose from dropdown"
80
  )
81
 
82
- return (
83
- None,
84
- None,
85
- None,
86
- None,
87
- )
88
 
89
  speaker_wav = audio_file_pth
90
 
91
  if len(prompt) < 2:
92
- gr.Warning("Please give a longer prompt text")
93
- return (None, None, None, None)
94
-
95
- if len(prompt) > 200:
96
- gr.Warning(
97
- "Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage"
98
- )
99
- return (None, None, None, None)
100
-
101
  try:
102
  metrics_text = ""
103
  t_latent = time.time()
@@ -115,13 +123,16 @@ def predict(
115
 
116
  except Exception as e:
117
  print("Speaker encoding error", str(e))
118
- gr.Warning(
119
  "It appears something wrong with reference, did you unmute your microphone?"
120
  )
121
- return (None, None, None, None)
122
 
123
  prompt = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)
124
 
 
 
 
125
  print("I: Generating new audio...")
126
  t0 = time.time()
127
  out = MODEL.inference(
@@ -131,6 +142,7 @@ def predict(
131
  speaker_embedding,
132
  repetition_penalty=5.0,
133
  temperature=0.75,
 
134
  )
135
  inference_time = time.time() - t0
136
  print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
@@ -140,6 +152,11 @@ def predict(
140
  real_time_factor = (time.time() - t0) / out["wav"].shape[-1] * 24000
141
  print(f"Real-time factor (RTF): {real_time_factor}")
142
  metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
 
 
 
 
 
143
  torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
144
 
145
  except RuntimeError as e:
@@ -158,7 +175,6 @@ def predict(
158
  prompt,
159
  language,
160
  audio_file_pth,
161
- voice_cleanup,
162
  ]
163
  error_data = [str(e) if type(e) != str else e for e in error_data]
164
  print(error_data)
@@ -198,8 +214,8 @@ def predict(
198
  else:
199
  if "Failed to decode" in str(e):
200
  print("Speaker encoding error", str(e))
201
- gr.Warning(
202
- "It appears something wrong with reference, did you unmute your microphone?"
203
  )
204
  else:
205
  print("RuntimeError: non device-side assert error:", str(e))
@@ -230,7 +246,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
230
  input_text_gr = gr.Textbox(
231
  label="Text Prompt",
232
  info="One or two sentences at a time is better. Up to 200 text characters.",
233
- value="Hi there, I'm your new voice clone. Try your best to upload quality audio.",
234
  )
235
  language_gr = gr.Dropdown(
236
  label="Language",
@@ -258,6 +274,11 @@ with gr.Blocks(analytics_enabled=False) as demo:
258
  max_choices=1,
259
  value="vi",
260
  )
 
 
 
 
 
261
  ref_gr = gr.Audio(
262
  label="Reference Audio",
263
  info="Click on the ✎ button to upload your own target speaker audio",
 
 
 
 
 
 
 
 
 
 
 
 
1
  import csv
2
  import datetime
3
  import os
 
57
  supported_languages.append("vi")
58
 
59
 
60
def normalize_vietnamese_text(text):
    """Normalize Vietnamese text and tidy it up before speech synthesis.

    The text is first passed through ``TTSnorm`` (called with
    ``unknown=False, lower=False, rule=True``), then a fixed sequence of
    punctuation clean-ups is applied, and finally the acronym "AI" / "A.I"
    is spelled out as "Ây Ai".

    Args:
        text: The raw prompt string.

    Returns:
        The normalized string.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import re

    normalized = TTSnorm(text, unknown=False, lower=False, rule=True)

    # Order matters: each replacement runs on the result of the previous one.
    # The first four collapse duplicated / stray sentence punctuation
    # (presumably left behind by the normalizer or by upstream punctuation
    # doubling -- TODO confirm), the last two drop quote characters.
    cleanups = (
        ("..", "."),
        ("!.", "!"),
        ("?.", "?"),
        (" .", "."),
        (" ,", ","),
        ('"', ""),
        ("'", ""),
    )
    for old, new in cleanups:
        normalized = normalized.replace(old, new)

    # Spell out the acronym only when it stands alone as a token. A plain
    # substring replace would also rewrite the letters "AI" inside other
    # upper-case words (e.g. "HAI", "MAI"), since case is preserved above.
    normalized = re.sub(r"\bA\.?I\b", "Ây Ai", normalized)
    return normalized
74
+
75
+
76
def calculate_keep_len(text, lang):
    """Estimate how much of the generated waveform to keep for short text.

    Simple hack for short sentences: the result is used by the caller as a
    slice end (``wav[:keep_len]``), so it is a count of leading samples to
    keep, or ``-1`` when no estimate is made.

    Args:
        text: The prompt that was synthesized.
        lang: Language code of the prompt.

    Returns:
        ``-1`` for "ja" / "zh-cn", for text with no words, and for text with
        ten or more whitespace-separated words. Otherwise a per-word budget
        (15000 for fewer than 5 words, 13000 for 5-9 words) times the word
        count, plus 2000 for every ".", "!", "?" or "," in the text.
    """
    # No word-based estimate for these languages (presumably because their
    # text is not whitespace-delimited -- TODO confirm).
    if lang in ("ja", "zh-cn"):
        return -1

    word_count = len(text.split())
    if word_count == 0:
        # Empty / whitespace-only text would otherwise yield a budget of 0,
        # which makes the caller's slice discard the whole clip.
        return -1

    num_punct = sum(text.count(mark) for mark in ".!?,")

    if word_count < 5:
        per_word = 15000
    elif word_count < 10:
        per_word = 13000
    else:
        # Long enough that no truncation heuristic is applied.
        return -1

    return per_word * word_count + 2000 * num_punct
89
+
90
+
91
  def predict(
92
  prompt,
93
  language,
94
  audio_file_pth,
95
+ normalize_text=True,
96
  ):
97
  if language not in supported_languages:
98
+ metrics_text = gr.Warning(
99
  f"Language you put {language} in is not in is not in our Supported Languages, please choose from dropdown"
100
  )
101
 
102
+ return (None, metrics_text)
 
 
 
 
 
103
 
104
  speaker_wav = audio_file_pth
105
 
106
  if len(prompt) < 2:
107
+ metrics_text = gr.Warning("Please give a longer prompt text")
108
+ return (None, metrics_text)
 
 
 
 
 
 
 
109
  try:
110
  metrics_text = ""
111
  t_latent = time.time()
 
123
 
124
  except Exception as e:
125
  print("Speaker encoding error", str(e))
126
+ metrics_text = gr.Warning(
127
  "It appears something wrong with reference, did you unmute your microphone?"
128
  )
129
+ return (None, metrics_text)
130
 
131
  prompt = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)
132
 
133
+ if normalize_text and language == "vi":
134
+ prompt = normalize_vietnamese_text(prompt)
135
+
136
  print("I: Generating new audio...")
137
  t0 = time.time()
138
  out = MODEL.inference(
 
142
  speaker_embedding,
143
  repetition_penalty=5.0,
144
  temperature=0.75,
145
+ enable_text_splitting=True,
146
  )
147
  inference_time = time.time() - t0
148
  print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
 
152
  real_time_factor = (time.time() - t0) / out["wav"].shape[-1] * 24000
153
  print(f"Real-time factor (RTF): {real_time_factor}")
154
  metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
155
+
156
+ # Temporary hack for short sentences
157
+ keep_len = calculate_keep_len(prompt, language)
158
+ out["wav"] = out["wav"][:keep_len]
159
+
160
  torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
161
 
162
  except RuntimeError as e:
 
175
  prompt,
176
  language,
177
  audio_file_pth,
 
178
  ]
179
  error_data = [str(e) if type(e) != str else e for e in error_data]
180
  print(error_data)
 
214
  else:
215
  if "Failed to decode" in str(e):
216
  print("Speaker encoding error", str(e))
217
+ metrics_text = gr.Warning(
218
+ metrics_text="It appears something wrong with reference, did you unmute your microphone?"
219
  )
220
  else:
221
  print("RuntimeError: non device-side assert error:", str(e))
 
246
  input_text_gr = gr.Textbox(
247
  label="Text Prompt",
248
  info="One or two sentences at a time is better. Up to 200 text characters.",
249
+ value="Xin chào, tôi một hình chuyển đổi văn bản thành giọng nói tiếng Việt",
250
  )
251
  language_gr = gr.Dropdown(
252
  label="Language",
 
274
  max_choices=1,
275
  value="vi",
276
  )
277
# Checkbox that lets the user toggle Vietnamese text normalization.
normalize_text = gr.Checkbox(
    label="Normalize Vietnamese Text",
    info="Normalize Vietnamese Text",
    # Gradio components take their initial state through `value`; the
    # former `default=` keyword does not set it, so the box was not
    # pre-checked as intended.
    value=True,
)
282
  ref_gr = gr.Audio(
283
  label="Reference Audio",
284
  info="Click on the ✎ button to upload your own target speaker audio",