txya900619 committed on
Commit
e18ca68
1 Parent(s): bfffc8e

feat: init upload

Files changed (3)
  1. app.py +115 -0
  2. configs/models.yaml +9 -0
  3. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,115 @@
+ import gradio as gr
+ import torch
+ from omegaconf import OmegaConf
+ from transformers import pipeline
+
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+
+ def load_pipe(model_id: str):
+     return pipeline(
+         "automatic-speech-recognition",
+         model=model_id,
+         max_new_tokens=128,
+         chunk_length_s=30,
+         torch_dtype=torch_dtype,
+         device=device,
+     )
+
+ OmegaConf.register_new_resolver("load_pipe", load_pipe)
+
+ models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))
+
+ def automatic_speech_recognition(model_id: str, dialect_id: str, audio_file: str):
+     model = models_config[model_id]["model"]
+     print(model)
+     generate_kwargs = {
+         "task": "transcribe",
+         "language": "Chinese",
+         "prompt_ids": torch.from_numpy(model.tokenizer.get_prompt_ids(dialect_id)).to(
+             device
+         ),
+     }
+     return model(audio_file, generate_kwargs=generate_kwargs)["text"].replace(f" {dialect_id}", "")
+
+
+ def when_model_selected(model_id: str):
+     model_config = models_config[model_id]
+
+     dialect_drop_down_choices = [
+         (k, v) for k, v in model_config["dialect_mapping"].items()
+     ]
+
+     return gr.update(
+         choices=dialect_drop_down_choices,
+         value=dialect_drop_down_choices[0][1],
+     )
+
+
+ demo = gr.Blocks(
+     title="臺灣客語語音辨識系統",
+     css="@import url(https://tauhu.tw/tauhu-oo.css);",
+     theme=gr.themes.Default(
+         font=(
+             "tauhu-oo",
+             gr.themes.GoogleFont("Source Sans Pro"),
+             "ui-sans-serif",
+             "system-ui",
+             "sans-serif",
+         )
+     ),
+ )
+
+ with demo:
+     default_model_id = list(models_config.keys())[0]
+     model_drop_down = gr.Dropdown(
+         models_config.keys(),
+         value=default_model_id,
+         label="模型",
+     )
+
+     dialect_drop_down = gr.Dropdown(
+         choices=[
+             (k, v)
+             for k, v in models_config[default_model_id]["dialect_mapping"].items()
+         ],
+         value=list(models_config[default_model_id]["dialect_mapping"].values())[0],
+         label="腔調",
+     )
+
+     model_drop_down.input(
+         when_model_selected,
+         inputs=[model_drop_down],
+         outputs=[dialect_drop_down],
+     )
+
+     gr.Markdown(
+         """
+         # 臺灣客語語音辨識系統
+         ### Taiwanese Hakka Speech Recognition System
+         ### 研發
+         - **[李鴻欣 Hung-Shin Lee](mailto:[email protected])(諾思資訊 North Co., Ltd.)**
+         - **[陳力瑋 Li-Wei Chen](mailto:[email protected])(諾思資訊 North Co., Ltd.)**
+         """
+     )
+     gr.Interface(
+         automatic_speech_recognition,
+         inputs=[
+             model_drop_down,
+             dialect_drop_down,
+             gr.Audio(
+                 label="上傳或錄音",
+                 type="filepath",
+                 waveform_options=gr.WaveformOptions(
+                     sample_rate=16000,
+                 ),
+             ),
+         ],
+         outputs=[
+             gr.Text(interactive=False, label="客語漢字"),
+         ],
+         allow_flagging="auto",
+     )
+
+ demo.launch()
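For reference, a minimal sketch of the dialect-prompting path that `automatic_speech_recognition` follows, without the Gradio UI: the checkpoint name comes from `configs/models.yaml`, the dialect tag is turned into Whisper prompt IDs via `get_prompt_ids`, and the tag echoed into the decoded text is stripped afterwards. `sample.wav` is a hypothetical local recording, not part of this commit.

```python
# Sketch only: reproduces what app.py's automatic_speech_recognition does for one file.
import torch
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="formospeech/whisper-large-v3-taiwanese-hakka",  # from configs/models.yaml
    max_new_tokens=128,
    chunk_length_s=30,
)

dialect_id = "htia_sixian"  # 四縣, per the dialect_mapping table
prompt_ids = torch.from_numpy(asr.tokenizer.get_prompt_ids(dialect_id))

result = asr(
    "sample.wav",  # hypothetical input file
    generate_kwargs={
        "task": "transcribe",
        "language": "Chinese",
        "prompt_ids": prompt_ids,
    },
)
# Whisper echoes the prompt back into the decoded text, so the dialect tag is removed.
print(result["text"].replace(f" {dialect_id}", ""))
```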
configs/models.yaml ADDED
@@ -0,0 +1,9 @@
+ whisper-large-v3:
+   model: ${load_pipe:formospeech/whisper-large-v3-taiwanese-hakka}
+   dialect_mapping:
+     四縣: htia_sixian
+     海陸: htia_hailu
+     大埔: htia_dapu
+     饒平: htia_raoping
+     詔安: htia_zhaoan
+     南四縣: htia_nansixian
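The `model:` value above is not a plain string: `${load_pipe:...}` is an OmegaConf interpolation that calls the `load_pipe` resolver registered in `app.py`, so resolving the config instantiates the ASR pipeline and stores it under the model key. A small sketch of the mechanism, using a stand-in resolver so nothing is downloaded:

```python
# Stand-in resolver to illustrate ${load_pipe:...}; app.py registers the real
# load_pipe, which builds a transformers ASR pipeline for the given model id.
from omegaconf import OmegaConf

OmegaConf.register_new_resolver("load_pipe", lambda model_id: f"<pipeline for {model_id}>")

cfg = OmegaConf.create(
    "whisper-large-v3:\n"
    "  model: ${load_pipe:formospeech/whisper-large-v3-taiwanese-hakka}\n"
)
models = OmegaConf.to_object(cfg)  # interpolations are resolved here
print(models["whisper-large-v3"]["model"])
# -> <pipeline for formospeech/whisper-large-v3-taiwanese-hakka>
```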
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ omegaconf
+ torch
+ transformers