codewithdark committed on
Commit
f12e846
1 Parent(s): f74f938

Upload 4 files

Browse files
Files changed (4) hide show
  1. demo.py +67 -0
  2. poetry.lock +0 -0
  3. pyproject.toml +22 -0
  4. requirements.txt +6 -0
demo.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import sounddevice as sd
3
+ import numpy as np
4
+ import torch
5
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
6
+ import soundfile as sf # Using soundfile for audio file handling
7
+ import librosa
8
+
9
+ # Load model
10
@st.cache_resource
def load_model():
    """Load the speech-to-text processor and model from the Hugging Face Hub.

    Decorated with ``st.cache_resource`` so the returned objects are cached
    by Streamlit instead of being reloaded each time the script runs.

    Returns:
        A ``(processor, model)`` tuple for the
        ``codewithdark/WhisperLiveSubs`` checkpoint.
    """
    checkpoint = "codewithdark/WhisperLiveSubs"
    return (
        AutoProcessor.from_pretrained(checkpoint),
        AutoModelForSpeechSeq2Seq.from_pretrained(checkpoint),
    )
15
+
16
# Load the model once at startup. Everything below needs `processor` and
# `model`, so on failure we report the problem and halt this script run
# instead of continuing and hitting a NameError on the first transcription.
try:
    processor, model = load_model()
except ConnectionError:
    st.error("Error loading model: Check your Internet Connection")
    st.stop()
except Exception:
    st.error("Error loading model: Please try again")
    st.stop()
22
+
23
+ # Function to transcribe audio
24
def transcribe_audio(audio, sample_rate):
    """Transcribe an audio signal to text with the module-level model.

    Args:
        audio: Audio samples; converted to a NumPy array before processing.
        sample_rate: Sampling rate of ``audio``, forwarded to the processor.

    Returns:
        The first decoded transcription, with special tokens removed.
    """
    waveform = np.array(audio)  # accept lists or other array-likes
    encoded = processor(waveform, sampling_rate=sample_rate, return_tensors="pt")
    token_ids = model.generate(encoded.input_features)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]
31
+
32
+ # Streamlit app
33
st.title("Speech-to-Text Transcription")

# File upload
uploaded_file = st.file_uploader("Choose an audio file", type=["wav", "mp3"])
if uploaded_file is not None:
    try:
        # Read the audio file. soundfile returns mono audio as a 1-D array
        # and multi-channel audio as a 2-D (frames, channels) array.
        audio_data, sample_rate = sf.read(uploaded_file)

        # Downmix to mono BEFORE resampling: librosa.resample operates on the
        # last axis by default, so a (frames, channels) array would be
        # resampled across channels instead of along time.
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)

        # Resample if necessary
        target_sample_rate = 16000
        if sample_rate != target_sample_rate:
            audio_data = librosa.resample(
                audio_data, orig_sr=sample_rate, target_sr=target_sample_rate
            )

        # NOTE(review): the player format is fixed to WAV although MP3
        # uploads are accepted too; confirm playback works for MP3 files.
        st.audio(uploaded_file, format="audio/wav")
        transcription = transcribe_audio(audio_data, target_sample_rate)
        st.write("Transcription:", transcription)
    except Exception as e:
        # UI boundary: show the failure to the user rather than crashing.
        st.error(f"Error processing the file: {e}")
56
+
57
+ # Real-time voice input
58
if st.button("Start Recording"):
    duration = 15  # Record for 15 seconds
    sample_rate = 16000
    try:
        st.write("Recording...")
        recording = sd.rec(
            int(duration * sample_rate), samplerate=sample_rate, channels=1
        )
        sd.wait()  # Block until the recording has finished
        st.write("Recording finished!")
        # Recorded with channels=1; flatten to the 1-D array passed on
        # to transcribe_audio.
        audio_data = recording.flatten()
        transcription = transcribe_audio(audio_data, sample_rate)
        st.write("Transcription:", transcription)
    except Exception as e:
        # Same handling as the file-upload path: report the failure
        # (e.g. no usable input device) instead of crashing the app.
        st.error(f"Error recording audio: {e}")
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "WhisperLiveSubs"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["Dark Coder <[email protected]>"]
6
+ license = "MIT"
7
+ readme = "README.md"
8
+
9
[tool.poetry.dependencies]
python = "^3.10"
streamlit = "^1.38.0"
sounddevice = "^0.5.0"
numpy = "^2.1.1"
scipy = "^1.14.1"
torch = "^2.4.1"
transformers = "^4.44.2"
soundfile = "^0.12.1"
# demo.py imports librosa for resampling, so it must be declared here.
# NOTE(review): the version bound is a suggestion; confirm it resolves
# against the numpy constraint above and regenerate poetry.lock.
librosa = "^0.10.2"
18
+
19
+
20
+ [build-system]
21
+ requires = ["poetry-core"]
22
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
streamlit==1.38.0
sounddevice==0.4.7
numpy==1.25.2
torch==2.0.1
transformers==4.31.0
scipy==1.12.0
# demo.py also imports soundfile and librosa; without them the app fails at
# import time. NOTE(review): pins are suggestions; confirm against the
# environment the app is deployed to.
soundfile==0.12.1
librosa==0.10.1