Vineel Pratap committed on
Commit 6f27821
1 Parent(s): 78e8beb
app.py CHANGED
@@ -51,18 +51,22 @@ with gr.Blocks(css="style.css") as demo:
                 interactive=False,
                 label="Language Model Score",
             )
+        with gr.Column():
+            autolm = gr.Checkbox(
+                label="Automatically create Unigram LM from text data", value=True
+            )
     btn = gr.Button("Submit", elem_id="submit")
 
     @gr.on(
-        inputs=[wscore_usedefault, lmscore_usedefault, lm_file],
+        inputs=[wscore_usedefault, lmscore_usedefault, lm_file, autolm],
         outputs=[wscore, lmscore],
     )
-    def update_slider(ws, ls, lm):
+    def update_slider(ws, ls, lm, alm):
 
         ws_slider = gr.Slider(
             minimum=-10.0,
             maximum=10.0,
-            value=LM_SCORE_DEFAULT if lm is not None else 0,
+            value=LM_SCORE_DEFAULT if (lm is not None or alm) else 0,
             step=0.1,
             interactive=not ws,
             label="Word Insertion Score",
@@ -71,7 +75,7 @@ with gr.Blocks(css="style.css") as demo:
             minimum=-10.0,
             maximum=10.0,
             value=WORD_SCORE_DEFAULT_IF_NOLM
-            if lm is None
+            if (lm is None and not alm)
             else WORD_SCORE_DEFAULT_IF_LM,
             step=0.1,
             interactive=not ls,
@@ -97,6 +101,7 @@ with gr.Blocks(css="style.css") as demo:
             lmscore,
             wscore_usedefault,
             lmscore_usedefault,
+            autolm,
             reference,
         ],
         outputs=[text, logs],
@@ -118,7 +123,7 @@ with gr.Blocks(css="style.css") as demo:
         ],
         [
             "upload/english/english.mp3",
-            "upload/english/cv8_top10k_words.txt",
+            "upload/english/gutenberg_27045.txt",
             " This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import",
         ],
     ],
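Note: the app.py hunks above add an autolm checkbox and thread it through the @gr.on slider-update handler, so the LM-score default also applies when no LM file is uploaded but one will be auto-built from the text data. A minimal standalone sketch of that pattern (assuming Gradio 4.x; the component wiring and LM_SCORE_DEFAULT value here are illustrative placeholders, not the app's real ones):

import gradio as gr

LM_SCORE_DEFAULT = 1.5  # hypothetical placeholder; the real default lives in app.py

with gr.Blocks() as demo:
    lm_file = gr.File(label="Language Model (optional)")
    autolm = gr.Checkbox(label="Automatically create Unigram LM from text data", value=True)
    lmscore = gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, label="Language Model Score")

    # With no explicit triggers, @gr.on fires when any listed input changes.
    @gr.on(inputs=[lm_file, autolm], outputs=[lmscore])
    def update_slider(lm, alm):
        # A default LM score applies if an LM file was uploaded OR one will be auto-built.
        return gr.Slider(
            minimum=-10.0,
            maximum=10.0,
            value=LM_SCORE_DEFAULT if (lm is not None or alm) else 0,
            step=0.1,
            label="Language Model Score",
        )

demo.launch()

Returning a freshly constructed gr.Slider from the handler is how the app updates an existing slider's value and interactivity in place.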
upload/english/gutenberg_27045.txt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6cb4e9c754924333e37dde766098f862ddd079c81009c77454f377c96b9ac19
+size 84138
{normalization β†’ utils}/README.txt RENAMED
File without changes
{normalization β†’ utils}/__init__.py RENAMED
File without changes
utils/lm.py ADDED
@@ -0,0 +1,71 @@
+# Creates unigram LM following KenLM
+import math
+import shutil, tempfile
+
+def calculate_log_probabilities(word_counts, num_sentences, n_smoothing=0.01):
+    """
+    Calculate log probabilities for each word in the corpus,
+    including a special <unk> token for unknown words.
+    """
+    total_words = sum(word_counts.values())
+    total_words += 2 * num_sentences  # add counts for <s> and </s>
+    # Adjust total for <unk>
+    total_words_with_unk = total_words + 1  # Adding 1 for <unk>
+    total_words_with_unk = total_words_with_unk + total_words_with_unk * n_smoothing
+
+    # Calculate probabilities, adjust for <unk>
+    probabilities = {
+        word: ((count + n_smoothing) / total_words_with_unk)
+        for word, count in word_counts.items()
+    }
+    probabilities["<unk>"] = 1 / total_words_with_unk
+    probabilities["<s>"] = (num_sentences + n_smoothing) / total_words_with_unk
+    probabilities["</s>"] = (num_sentences + n_smoothing) / total_words_with_unk
+
+    # Convert to log probabilities
+    return {word: math.log10(prob) for word, prob in probabilities.items()}
+
+def maybe_generate_pseudo_bigram_arpa(arpa_fpath):
+    with open(arpa_fpath, "r") as file:
+        lines = file.readlines()
+
+    # if ngram order >= 2, do not modify
+    if any(["2-grams:" in l for l in lines]):
+        return
+
+    with open(arpa_fpath, "w") as file:
+        for line in lines:
+            if line.strip().startswith("ngram 1="):
+                file.write(line)
+                file.write("ngram 2=1\n")  # Add the new ngram line
+                continue
+
+            if line.strip() == "\\end\\":
+                file.write("\\2-grams:\n")
+                file.write("-9.9999999\t</s> <s>\n\n")
+
+            file.write(line)
+
+def save_log_probabilities(log_probabilities, file_path):
+    with open(file_path, "w") as file:
+        file.write(f"\\data\\")
+        file.write(f"\n")
+        file.write(f"ngram 1={len(log_probabilities)}\n\n")
+        file.write(f"\\1-grams:")
+        file.write(f"\n")
+        for word, log_prob in log_probabilities.items():
+            if word == "<s>":
+                log_prob = 0
+            file.write(f"{log_prob}\t{word}\n")
+        file.write(f"\n")
+        file.write(f"\\end\\")
+
+def create_unigram_lm(word_counts, num_sentences, file_path, n_smoothing=0.01):
+    log_probs = calculate_log_probabilities(word_counts, num_sentences, n_smoothing)
+    save_log_probabilities(log_probs, file_path)
+
+
+
+
+
+
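To make the ARPA file produced by utils/lm.py concrete, here is a small usage sketch (run from the repo root; the toy corpus and the toy.arpa output path are made up for illustration):

from collections import Counter

from utils.lm import create_unigram_lm

corpus = ["the cat sat", "the dog sat"]
word_counts = Counter(w for line in corpus for w in line.split())

create_unigram_lm(word_counts, num_sentences=len(corpus), file_path="toy.arpa")
print(open("toy.arpa").read())
# \data\
# ngram 1=7            <- the, cat, sat, dog plus <unk>, <s>, </s>
#
# \1-grams:
# -0.7424...	the     <- log10((2 + 0.01) / 11.11); 11.11 = (6 words + 2*2 sentence markers + 1 for <unk>) * 1.01
# ...
# 0	<s>              <- save_log_probabilities forces <s> to log probability 0
#
# \end\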
{normalization β†’ utils}/norm_config.py RENAMED
File without changes
{normalization β†’ utils}/punctuations.lst RENAMED
File without changes
{normalization β†’ utils}/text_norm.py RENAMED
@@ -2,7 +2,7 @@ import json
 import re
 import unicodedata
 
-from normalization.norm_config import norm_config
+from utils.norm_config import norm_config
 
 
 def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False):
zeroshot.py CHANGED
@@ -9,7 +9,8 @@ import numpy as np
 from transformers import Wav2Vec2ForCTC, AutoProcessor
 from huggingface_hub import hf_hub_download
 from torchaudio.models.decoder import ctc_decoder
-from normalization.text_norm import text_normalize
+from utils.text_norm import text_normalize
+from utils.lm import create_unigram_lm, maybe_generate_pseudo_bigram_arpa
 
 uroman_dir = "uroman"
 assert os.path.exists(uroman_dir)
@@ -33,8 +34,8 @@ class MY_LOG:
     def __init__(self):
         self.text = "[START]"
 
-    def add(self, new_log):
-        self.text = self.text + "\n" + new_log
+    def add(self, new_log, new_line=True):
+        self.text = self.text + ("\n" if new_line else " ") + new_log
         self.text = self.text.strip()
         return self.text
 
@@ -92,15 +93,17 @@ def filter_lexicon(lexicon, word_counts):
 
 def load_words(filepath):
     words = {}
+    num_sentences = 0
     with open(filepath) as f:
         for line in f:
             line = line.strip().lower()
+            num_sentences += 1
             line = text_normalize(line, iso_code="xxx")
             # ignore invalid words.
             for w in line.split():
                 words.setdefault(w, 0)
                 words[w] += 1
-    return words
+    return words, num_sentences
 
 
 def process(
@@ -111,6 +114,7 @@ def process(
     lmscore=None,
     wscore_usedefault=True,
     lmscore_usedefault=True,
+    autolm=True,
     reference=None,
 ):
     transcription, logs = "", MY_LOG()
@@ -154,13 +158,13 @@ def process(
     # Setup lexicon and decoder
     yield transcription, logs.add(f"Loading words....")
     try:
-        word_counts = load_words(words_file)
+        word_counts, num_sentences = load_words(words_file)
     except Exception as e:
         yield f"ERROR: Loading words failed '{str(e)}'", logs.text
         return
 
     yield transcription, logs.add(
-        f"Loaded {len(word_counts)} words.\nPreparing lexicon...."
+        f"Loaded {len(word_counts)} words from {num_sentences} lines.\nPreparing lexicon...."
     )
 
     try:
@@ -168,15 +172,35 @@ def process(
     except Exception as e:
         yield f"ERROR: Creating lexicon failed '{str(e)}'", logs.text
         return
-
+    # for k, v in lexicon.items():
+    #     if len(v) < 5:
+    #         print(k, v)
     yield transcription, logs.add(f"Leixcon size: {len(lexicon)}")
 
+    # Input could be sentences OR a list of words. Check if at least one word has a count > 1 to differentiate.
+    tmp_file = tempfile.NamedTemporaryFile()  # could be used for LM
+    if autolm and any([cnt > 2 for cnt in word_counts.values()]):
+        yield transcription, logs.add(f"Creating unigram LM...", False)
+        lm_path = tmp_file.name
+        create_unigram_lm(word_counts, num_sentences, lm_path)
+        yield transcription, logs.add(f"OK")
+
+
     if lm_path is None:
         yield transcription, logs.add(f"Filtering lexicon....")
         lexicon = filter_lexicon(lexicon, word_counts)
         yield transcription, logs.add(
             f"Ok. Leixcon size after filtering: {len(lexicon)}"
         )
+    else:
+        # kenlm throws an error if a unigram LM is being used
+        # HACK: generate a bigram LM from the unigram LM plus a dummy bigram to trick it
+        maybe_generate_pseudo_bigram_arpa(lm_path)
+
+    # for k, v in lexicon.items():
+    #     if len(v) < 5:
+    #         print(k, v)
+
     # print(lexicon["the"], lexicon["\"(t)he"])
     with tempfile.NamedTemporaryFile() as lexicon_file:
         if lm_path is not None and not lm_path.strip():
  if lm_path is not None and not lm_path.strip():