import joblib
import librosa
import numpy as np
import torch
import gradio as gr
from transformers import AutoFeatureExtractor, WavLMModel
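
# Thin wrapper pairing a Hugging Face AutoFeatureExtractor with its model:
# it preprocesses raw audio, runs the model on CPU or GPU, and returns the
# last hidden state (shape: batch x time x hidden_size).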
class HuggingFaceFeatureExtractor:
    def __init__(self, model_class, name):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.feature_extractor = AutoFeatureExtractor.from_pretrained(name)
        self.model = model_class.from_pretrained(name)
        self.model.eval()
        self.model.to(self.device)

    def __call__(self, audio, sr):
        inputs = self.feature_extractor(
            audio,
            sampling_rate=sr,
            return_tensors="pt",
            padding=True,
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state
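
# Lazy factories: each entry constructs its extractor only when invoked, so the
# four checkpoints (base WavLM plus three deepfake-detection fine-tunes) are
# not all loaded at import time.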
FEATURE_EXTRACTORS = {
    "wavlm-base": lambda: HuggingFaceFeatureExtractor(WavLMModel, "microsoft/wavlm-base"),
    "wavLM-V1": lambda: HuggingFaceFeatureExtractor(WavLMModel, "DavidCombei/wavLM-base-Deepfake_V1"),
    "wavLM-V2": lambda: HuggingFaceFeatureExtractor(WavLMModel, "DavidCombei/wavLM-base-Deepfake_V2"),
    "wavLM-V3": lambda: HuggingFaceFeatureExtractor(WavLMModel, "DavidCombei/wavLM-base-Deepfake_V3"),
}
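
# Per-extractor classifiers plus the stacking meta-model, loaded from disk with
# joblib (assumed to be scikit-learn-style estimators exposing predict_proba).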
model1 = joblib.load('model1.joblib')
model2 = joblib.load('model2.joblib')
model3 = joblib.load('model3.joblib')
model4 = joblib.load('model4.joblib')
final_model = joblib.load('final_model.joblib')
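
# Full inference pipeline: embed the uploaded clip with all four WavLM
# variants, score each embedding with its matching classifier, then fuse the
# four probabilities with the final stacking model.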
def process_audio(file_audio):
    audio, sr = librosa.load(file_audio, sr=16000)  # Resample to 16 kHz
    if audio.ndim > 1:
        audio = audio[0]  # Keep only the first channel if multi-channel
    extractor_1 = FEATURE_EXTRACTORS["wavlm-base"]()
    extractor_2 = FEATURE_EXTRACTORS["wavLM-V1"]()
    extractor_3 = FEATURE_EXTRACTORS["wavLM-V2"]()
    extractor_4 = FEATURE_EXTRACTORS["wavLM-V3"]()
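    # Embed the clip with each extractor: mean-pool the last hidden state over
    # the time axis so every model yields one fixed-size vector per clip.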
    eval1 = torch.mean(extractor_1(audio, sr), dim=1).cpu().numpy().reshape(1, -1)
    eval2 = torch.mean(extractor_2(audio, sr), dim=1).cpu().numpy().reshape(1, -1)
    eval3 = torch.mean(extractor_3(audio, sr), dim=1).cpu().numpy().reshape(1, -1)
    eval4 = torch.mean(extractor_4(audio, sr), dim=1).cpu().numpy().reshape(1, -1)
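    # Column 1 of each predict_proba output is read as the "real" probability,
    # matching the threshold logic below; the four scores stacked side by side
    # form the meta-model's input.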
    eval_prob1 = model1.predict_proba(eval1)[:, 1].reshape(-1, 1)
    eval_prob2 = model2.predict_proba(eval2)[:, 1].reshape(-1, 1)
    eval_prob3 = model3.predict_proba(eval3)[:, 1].reshape(-1, 1)
    eval_prob4 = model4.predict_proba(eval4)[:, 1].reshape(-1, 1)
    eval_combined_probs = np.hstack((eval_prob1, eval_prob2, eval_prob3, eval_prob4))
    final_prob = final_model.predict_proba(eval_combined_probs)[:, 1]
    # final_prob is a length-1 array; index it to compare as a scalar
    if final_prob[0] < 0.5:
        return f"Fake with a confidence of: {100 - final_prob[0] * 100:.2f}%"
    else:
        return f"Real with a confidence of: {final_prob[0] * 100:.2f}%"
interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Audio Deepfake Detection",
    description="Upload an audio file to detect whether it is fake or real.",
)
interface.launch(share=True)