import os
import json
import math

import torch
import torchaudio
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import UnitAudioLoader, UnitAudioCollate
from models import SynthesizerTrn

import gradio

# Soft speech-unit encoder (HuBERT-Soft) used as the content encoder.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft")

# Load hyperparameters and the trained multi-speaker synthesizer checkpoint.
hps = utils.get_hparams_from_file("configs/sovits_ow2.json")
net_g = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model)
_ = net_g.eval()
_ = utils.load_checkpoint("logs/ow2/G_195000.pth", net_g, None)


def infer(audio, speaker_id, pitch_shift, length_scale, noise_scale=.667, noise_scale_w=0.8):
    fname = audio
    source, sr = torchaudio.load(fname)

    # Pitch-shift at the original sample rate, then resample to the 16 kHz
    # rate expected by HuBERT and mix down to a single mono channel.
    source = torchaudio.functional.pitch_shift(source, sr, int(pitch_shift))  # , n_fft=256)
    source = torchaudio.functional.resample(source, sr, 16000)
    source = torch.mean(source, dim=0).unsqueeze(0)
    source = source.unsqueeze(0)

    with torch.inference_mode():
        # Extract speech units
        unit = hubert.units(source)
        unit_lengths = torch.LongTensor([unit.size(1)])

        # Speaker id for multi-speaker inference
        sid = torch.LongTensor([speaker_id])

        # Synthesize audio
        audio_out = net_g.infer(
            unit,
            unit_lengths,
            sid,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale)[0][0, 0].data.float().numpy()

    return (22050, audio_out)


demo = gradio.Interface(
    fn=infer,
    inputs=[
        gradio.Audio(label="Input Audio", type="filepath"),
        gradio.Dropdown(
            label="Target Voice",
            choices=["Ana", "Ashe", "Baptiste", "Brigitte", "Cassidy", "Doomfist", "D.Va",
                     "Echo", "Genji", "Hanzo", "Junker Queen", "Junkrat", "Kiriko", "Lúcio",
                     "Mei", "Mercy", "Moira", "Orisa", "Pharah", "Reaper", "Reinhardt",
                     "Roadhog", "Sigma", "Sojourn", "Soldier_ 76", "Sombra", "Symmetra",
                     "Torbjörn", "Tracer", "Widowmaker", "Winston", "Zarya", "Zenyatta"],
            type="index",
            value="Ana"),
        gradio.Slider(label="Pitch Shift Input (+12 = up one octave)", minimum=-12.0, maximum=12.0, value=0, step=1),
        gradio.Slider(label="Length Factor", minimum=0.1, maximum=2.0, value=1.0),
        gradio.Slider(label="Noise Scale (higher = more expressive and erratic)", minimum=0.0, maximum=2.0, value=.667),
        gradio.Slider(label="Noise Scale W (higher = more variation in cadence)", minimum=0.0, maximum=2.0, value=.8),
    ],
    outputs=[gradio.Audio(label="Audio as Target Voice")],
)

# demo.launch(share=True)
demo.launch(server_name="0.0.0.0")