Vineel Pratap committed
Commit f138a14
1 Parent(s): a7567f9

resampling fix

Files changed (6):
  1. app.py +2 -1
  2. requirements.txt +2 -2
  3. utils/lm.py +9 -11
  4. utils/norm_config.py +8 -9
  5. utils/text_norm.py +15 -6
  6. zeroshot.py +9 -8
app.py CHANGED
@@ -53,7 +53,8 @@ with gr.Blocks(css="style.css") as demo:
             )
         with gr.Column():
             autolm = gr.Checkbox(
-                label="Automatically create Unigram LM from text data", value=True
+                label="Automatically create Unigram LM from text data",
+                value=True,
             )
             btn = gr.Button("Submit", elem_id="submit")
 
requirements.txt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f328d8dae24238aaabf770e035eefc60f84e7bdd844cb04787e4b049b85e0e22
-size 171
+oid sha256:41b8b278a5c4d2fc182c7893bcc683ad261ab0612cea1da58aaed1b358fd9649
+size 164
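Note: requirements.txt is tracked with Git LFS in this repo, so the diff above only shows the updated pointer file (new oid and size); the actual dependency lines that changed are not visible in this view.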
utils/lm.py CHANGED
@@ -1,14 +1,15 @@
-# Creates unigram LM following KenLM
-import math
+# Creates unigram LM following KenLM
+import math
 import shutil, tempfile
 
+
 def calculate_log_probabilities(word_counts, num_sentences, n_smoothing=0.01):
     """
     Calculate log probabilities for each word in the corpus,
     including a special <unk> token for unknown words.
     """
-    total_words = sum(word_counts.values())
-    total_words += 2 * num_sentences  # add counts for <s> and </s>
+    total_words = sum(word_counts.values())
+    total_words += 2 * num_sentences  # add counts for <s> and </s>
     # Adjust total for <unk>
     total_words_with_unk = total_words + 1  # Adding 1 for <unk>
     total_words_with_unk = total_words_with_unk + total_words_with_unk * n_smoothing
@@ -25,6 +26,7 @@ def calculate_log_probabilities(word_counts, num_sentences, n_smoothing=0.01):
     # Convert to log probabilities
     return {word: math.log10(prob) for word, prob in probabilities.items()}
 
+
 def maybe_generate_pseudo_bigram_arpa(arpa_fpath):
     with open(arpa_fpath, "r") as file:
         lines = file.readlines()
@@ -46,6 +48,7 @@ def maybe_generate_pseudo_bigram_arpa(arpa_fpath):
 
             file.write(line)
 
+
 def save_log_probabilities(log_probabilities, file_path):
     with open(file_path, "w") as file:
         file.write(f"\data\\")
@@ -59,13 +62,8 @@ def save_log_probabilities(log_probabilities, file_path):
             file.write(f"{log_prob}\t{word}\n")
         file.write(f"\n")
         file.write(f"\end\\")
-
+
+
 def create_unigram_lm(word_counts, num_sentences, file_path, n_smoothing=0.01):
     log_probs = calculate_log_probabilities(word_counts, num_sentences, n_smoothing)
     save_log_probabilities(log_probs, file_path)
-
-
-
-
-
-
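For context, a rough usage sketch of these helpers (illustration only, not part of the commit; the toy word counts, the output path, and the assumption that the module is importable as utils.lm are made up):

    # Illustration only: build a unigram ARPA LM from toy word counts, then add
    # a dummy 2-gram section so a KenLM-based decoder will accept the file.
    from collections import Counter
    from utils.lm import create_unigram_lm, maybe_generate_pseudo_bigram_arpa

    word_counts = Counter({"hello": 3, "world": 2, "asr": 1})  # toy counts
    num_sentences = 3  # used to account for <s> and </s>

    create_unigram_lm(word_counts, num_sentences, "unigram_lm.arpa")
    maybe_generate_pseudo_bigram_arpa("unigram_lm.arpa")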
utils/norm_config.py CHANGED
@@ -42,7 +42,7 @@ inverted_question_mark = r"\u00BF"
 
 
 # Hindi
-hindi_danda = u"\u0964"
+hindi_danda = "\u0964"
 
 # Egyptian Arabic
 # arabic_percent = r"\u066A"
@@ -175,7 +175,7 @@ nominal_digit_shapes = r"\u206f"
 with open(f"{os.path.dirname(__file__)}/punctuations.lst", "r") as punc_f:
     punc_list = punc_f.readlines()
 
-punct_pattern = r""
+punct_pattern = r""
 for punc in punc_list:
     # the first character in the tab separated line is the punc to be removed
     punct_pattern += re.escape(punc.split("\t")[0])
@@ -213,7 +213,6 @@ shared_punc_list = (
     + arabic_question_mark
     + chinese_punc
     + punct_pattern
-
 )
 
 shared_mappping = {
@@ -242,11 +241,11 @@ norm_config = {
         "mapping": shared_mappping,
         "digit_set": shared_digits,
         "unicode_norm": "NFKC",
-        "rm_diacritics" : False,
+        "rm_diacritics": False,
     }
 }
 
-#=============== Mongolian ===============#
+# =============== Mongolian ===============#
 
 norm_config["mon"] = norm_config["*"].copy()
 # add soft hyphen to punc list to match with fleurs
@@ -254,23 +253,23 @@ norm_config["mon"]["del_set"] += r"\u00AD"
 
 norm_config["khk"] = norm_config["mon"].copy()
 
-#=============== Hebrew ===============#
+# =============== Hebrew ===============#
 
 norm_config["heb"] = norm_config["*"].copy()
 # add "HEBREW POINT" symbols to match with fleurs
 norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"
 
-#=============== Thai ===============#
+# =============== Thai ===============#
 
 norm_config["tha"] = norm_config["*"].copy()
 # add "Zero width joiner" symbols to match with fleurs
 norm_config["tha"]["punc_set"] += r"\u200D"
 
-#=============== Arabic ===============#
+# =============== Arabic ===============#
 norm_config["ara"] = norm_config["*"].copy()
 norm_config["ara"]["mapping"]["ٱ"] = "ا"
 norm_config["arb"] = norm_config["ara"].copy()
 
-#=============== Javanese ===============#
+# =============== Javanese ===============#
 norm_config["jav"] = norm_config["*"].copy()
 norm_config["jav"]["rm_diacritics"] = True
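A small aside on the .copy() pattern above (existing behavior, not something this commit changes): dict.copy() is shallow, so nested values such as the shared "mapping" dict are the same object in every derived language config. A minimal illustration with made-up keys:

    # Illustration only: shallow copies share nested dicts.
    base = {"mapping": {"x": "y"}, "unicode_norm": "NFKC"}
    derived = base.copy()           # derived["mapping"] is the same dict object
    derived["mapping"]["a"] = "b"
    print(base["mapping"])          # {'x': 'y', 'a': 'b'} -- base sees the change too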
utils/text_norm.py CHANGED
@@ -5,7 +5,9 @@ import unicodedata
 from utils.norm_config import norm_config
 
 
-def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False):
+def text_normalize(
+    text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False
+):
 
     """Given a text, normalize it by changing to lower case, removing punctuations, removing words that only contain digits and removing extra spaces
 
@@ -15,17 +17,23 @@ def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_
        remove_numbers : Boolean flag to specify if words containing only digits should be removed
 
     Returns:
-       normalized_text : the string after all normalization
+       normalized_text : the string after all normalization
 
     """
 
    config = norm_config.get(iso_code, norm_config["*"])
 
-    for field in ["lower_case", "punc_set","del_set", "mapping", "digit_set", "unicode_norm"]:
+    for field in [
+        "lower_case",
+        "punc_set",
+        "del_set",
+        "mapping",
+        "digit_set",
+        "unicode_norm",
+    ]:
         if field not in config:
             config[field] = norm_config["*"][field]
 
-
     text = unicodedata.normalize(config["unicode_norm"], text)
 
     # Convert to lower case
@@ -34,7 +42,7 @@ def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_
         text = text.lower()
 
     # brackets
-
+
     # always text inside brackets with numbers in them. Usually corresponds to "(Sam 23:17)"
     text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
     if remove_brackets:
@@ -84,9 +92,10 @@ def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_
 
     if config["rm_diacritics"]:
         from unidecode import unidecode
+
         normalized_text = unidecode(normalized_text)
 
     # Remove extra spaces
     normalized_text = re.sub(r"\s+", " ", normalized_text).strip()
 
-    return normalized_text
+    return normalized_text
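A quick usage sketch of the reformatted function (illustration only; the sample sentence, the ISO code, and the assumption that the module is importable as utils.text_norm are made up):

    # Illustration only: normalize a toy English sentence.
    from utils.text_norm import text_normalize

    print(text_normalize("Hello, World! (Sam 23:17) 123", "eng"))
    # expected roughly: "hello world" -- lower-cased, punctuation stripped,
    # the bracketed verse reference and the digit-only word removed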
zeroshot.py CHANGED
@@ -34,7 +34,7 @@ class MY_LOG:
     def __init__(self):
         self.text = "[START]"
 
-    def add(self, new_log, new_line= True):
+    def add(self, new_log, new_line=True):
         self.text = self.text + ("\n" if new_line else " ") + new_log
         self.text = self.text.strip()
         return self.text
@@ -127,7 +127,9 @@ def process(
         audio_samples = (audio_samples / 32768.0).astype(float)
 
         if sr != ASR_SAMPLING_RATE:
-            audio_samples = librosa.resample(audio_samples, sr, ASR_SAMPLING_RATE)
+            audio_samples = librosa.resample(
+                audio_samples, orig_sr=sr, target_sr=ASR_SAMPLING_RATE
+            )
     else:
         # file upload
         assert isinstance(audio_data, str)
@@ -179,15 +181,14 @@ def process(
        # print(k, v)
     yield transcription, logs.add(f"Leixcon size: {len(lexicon)}")
 
-    # Input could be sentences OR list of words. Check if atleast one word has a count > 1 to diffentiate
-    tmp_file = tempfile.NamedTemporaryFile()  # could be used for LM
+    # Input could be sentences OR list of words. Check if atleast one word has a count > 1 to diffentiate
+    tmp_file = tempfile.NamedTemporaryFile()  # could be used for LM
     if autolm and any([cnt > 2 for cnt in word_counts.values()]):
         yield transcription, logs.add(f"Creating unigram LM...", False)
-        lm_path = tmp_file.name
+        lm_path = tmp_file.name
         create_unigram_lm(word_counts, num_sentences, lm_path)
         yield transcription, logs.add(f"OK")
 
-
     if lm_path is None:
         yield transcription, logs.add(f"Filtering lexicon....")
         lexicon = filter_lexicon(lexicon, word_counts)
@@ -195,8 +196,8 @@ def process(
                 f"Ok. Leixcon size after filtering: {len(lexicon)}"
             )
     else:
-        # kenlm throws an error if unigram LM is being used
-        # HACK: generate a bigram LM from unigram LM and a dummy bigram to trick it
+        # kenlm throws an error if unigram LM is being used
+        # HACK: generate a bigram LM from unigram LM and a dummy bigram to trick it
         maybe_generate_pseudo_bigram_arpa(lm_path)
 
     # for k, v in lexicon.items():
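The resample change above is the fix referenced in the commit message: recent librosa releases (0.10+) accept orig_sr and target_sr only as keyword arguments, so the old positional call fails with a TypeError. A minimal sketch of the difference (dummy silent audio and made-up rates, illustration only):

    # Illustration only: keyword arguments work on current librosa,
    # while the old positional form is rejected on librosa >= 0.10.
    import numpy as np
    import librosa

    sr, target_sr = 44100, 16000
    y = np.zeros(sr, dtype=float)  # one second of silence as dummy audio

    # librosa.resample(y, sr, target_sr)  # old positional style, TypeError on librosa >= 0.10
    y_16k = librosa.resample(y, orig_sr=sr, target_sr=target_sr)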