import os
import json
import math
import torch
import torchaudio
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import commons
import utils
from data_utils import UnitAudioLoader, UnitAudioCollate
from models import SynthesizerTrn
import gradio
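# Soft-VC HuBERT content encoder (bshall/hubert): extracts "soft" speech units
# from 16 kHz mono audio, which the synthesizer below then revoices.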
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft")
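
# Hyperparameters and trained generator for the multi-speaker SOVITS Overwatch 2 model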
hps = utils.get_hparams_from_file("configs/sovits_ow2.json")
net_g = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model)
_ = net_g.eval()
_ = utils.load_checkpoint("logs/ow2/G_195000.pth", net_g, None)
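
# Convert an input recording (microphone or uploaded file) into the selected
# hero's voice and return (sample_rate, waveform) for the Gradio Audio output.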
def infer(md, mic_audio, audio, speaker_id, pitch_shift, length_scale, noise_scale=.667, noise_scale_w=0.8):
    source = None
    sr = None

    if mic_audio:
        # Microphone input arrives as (sample_rate, numpy array); reshape to (channels, samples)
        sr, source = mic_audio
        source = torch.Tensor(source)
        if source.dim() == 1:
            source = source.unsqueeze(1)
        source = source.T
    if audio:
        # An uploaded file takes precedence over the microphone recording
        source, sr = torchaudio.load(audio)

    # Optional pitch shift of the input; slow, so skip it entirely when no shift is requested
    if int(pitch_shift) != 0:
        source = torchaudio.functional.pitch_shift(source, sr, int(pitch_shift))  # , n_fft=256

    # Resample to the 16 kHz HuBERT expects, downmix to mono, and add a batch dimension
    source = torchaudio.functional.resample(source, sr, 16000)
    source = torch.mean(source, dim=0).unsqueeze(0)
    source = source.unsqueeze(0)
    with torch.inference_mode():
        # Extract speech units
        unit = hubert.units(source)
        unit_lengths = torch.LongTensor([unit.size(1)])

        # for multi-speaker inference
        sid = torch.LongTensor([speaker_id])

        # Synthesize audio
        audio_out = net_g.infer(
            unit, unit_lengths, sid,
            noise_scale=noise_scale, noise_scale_w=noise_scale_w,
            length_scale=length_scale)[0][0, 0].data.float().numpy()

    return (22050, audio_out)  # synthesizer output sample rate
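
# Gradio UI: description, microphone/file inputs, target-voice picker, and synthesis controls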
demo = gradio.Interface(
    fn=infer,
    inputs=[
        gradio.Markdown(
            """
# SOVITS Any-to-Many VC | Overwatch 2

Upload any voice recording and turn it into a mangled approximation of any* Overwatch 2 Hero!

For a higher quality single-speaker model, check out my [soft-vc-widowmaker](https://huggingface.co/spaces/cjayic/soft-vc-widowmaker) space!

SOVITS doesn't really appear to adjust pitch toward the target speaker, so it helps to have your input voice at a pitch similar to the target voice.

I added a pitch shift option to preprocess the input voice, but it's slow and sometimes outright broken; use at your own risk.

( * up to Kiriko and without Bastion. Please forgive. )
"""),
gradio.Audio(label="Record Input Audio", source="microphone"),
gradio.Audio(label="Upload Input Audio", type="filepath"),
gradio.Dropdown(label="Target Voice", choices=["Ana", "Ashe", "Baptiste", "Brigitte", "Cassidy", "Doomfist", "D.Va", "Echo", "Genji", "Hanzo", "Junker Queen", "Junkrat", "Kiriko", "Lúcio", "Mei", "Mercy", "Moira", "Orisa", "Pharah", "Reaper", "Reinhardt", "Roadhog", "Sigma", "Sojourn", "Soldier_ 76", "Sombra", "Symmetra", "Torbjörn", "Tracer", "Widowmaker", "Winston", "Zarya", "Zenyatta"], type="index", value="Ana"),
gradio.Slider(label="Pitch Shift Input (+12 = up one octave, ⚠️ broken AF ⚠️)", minimum=-12.0, maximum=12.0, value=0, step=1),
gradio.Slider(label="Length Factor (higher = slower speech)", minimum=0.1, maximum=2.0, value=1.0),
gradio.Slider(label="Noise Scale (higher = more expressive and erratic)", minimum=0.0, maximum=2.0, value=.667),
gradio.Slider(label="Noise Scale W (higher = more variation in cadence)", minimum=0.0, maximum=2.0, value=.8)
],
outputs=[gradio.Audio(label="Audio as Target Voice")],
)
#demo.launch(share=True)
demo.launch(server_name="0.0.0.0")
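
# A minimal sketch of calling infer() directly, bypassing the Gradio UI
# (hypothetical path and values; speaker_id 29 is "Widowmaker" in the dropdown above):
# sr, wav = infer(None, None, "input.wav", speaker_id=29, pitch_shift=0, length_scale=1.0)
# torchaudio.save("widowmaker.wav", torch.from_numpy(wav).unsqueeze(0), sr)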