Spaces:

lenML
/

ChatTTS-Forge

Running on Zero

File size: 3,993 Bytes

01e655b
 
 
d2b7e94
 
bed01bd
 
d2b7e94
84cfd61
 
 
bed01bd
d5b3cd8
 
 
 
 
 
84cfd61
 
 
01e655b
bed01bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae79826
 
 
 
 
01e655b
 
 
 
 
 
 
 
 
 
bed01bd
 
 
01e655b
bed01bd
01e655b
bed01bd
01e655b
bed01bd
 
 
 
01e655b
bed01bd
 
 
 
01e655b
 
 
bed01bd
 
 
 
 
 
01e655b
bed01bd
 
 
 
 
 
01e655b
bed01bd
 
01e655b
 
 
1df74c6
 
 
 
 
01e655b
 
 
 
 
 
 
 
 
 
 
 
 
bed01bd
 
 
 
 
 
 
 
 
 
 
01e655b
 
 
 
 
 
 
 
 
bed01bd
 
 
01e655b
 
bed01bd

import sys
from io import BytesIO

import numpy as np
import soundfile as sf
from pydub import AudioSegment, effects
import pyrubberband as pyrb

# Largest value representable by a signed 16-bit PCM sample (32767).
INT16_MAX = np.iinfo(np.int16).max


def audio_to_int16(audio_data: np.ndarray) -> np.ndarray:
    """Convert floating-point audio to 16-bit integer PCM samples.

    Floating-point input (any width) is treated as normalized audio: it is
    clipped to ``[-1.0, 1.0]``, scaled by ``INT16_MAX`` and cast to
    ``np.int16``. Input of any other dtype is returned unchanged (the very
    same array object, no copy).

    Args:
        audio_data: Audio samples of any shape.

    Returns:
        An ``np.int16`` array for floating-point input, otherwise
        ``audio_data`` itself.
    """
    # ``np.issubdtype`` covers every float width without naming
    # ``np.float128``, which does not exist on all platforms and would raise
    # AttributeError as soon as the check reached it.
    if np.issubdtype(audio_data.dtype, np.floating):
        # Out-of-range samples would overflow on the int16 cast; saturate
        # them instead.
        clipped = np.clip(audio_data, -1.0, 1.0)
        # float16 cannot represent 32767 exactly (it rounds to 32768, which
        # overflows int16), so do the scaling in at least float32.
        work_dtype = np.result_type(audio_data.dtype, np.float32)
        scaled = clipped.astype(work_dtype, copy=False) * INT16_MAX
        audio_data = scaled.astype(np.int16)
    return audio_data


def pydub_to_np(audio: AudioSegment) -> tuple[int, np.ndarray]:
    """
    Convert a pydub audio segment into a float32 NumPy array.

    The raw samples are divided by ``2 ** (8 * sample_width - 1)``, which
    presumably maps signed integer PCM into the range [-1.0, 1.0).
    Single-channel audio yields a 1-D array; any other channel count yields
    an array of shape [frames, channels].

    Returns the tuple (frame_rate, samples) -- the rate comes first.
    """
    samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
    if audio.channels != 1:
        # Interleaved samples -> one row per frame, one column per channel.
        samples = samples.reshape((-1, audio.channels))

    full_scale = 1 << (8 * audio.sample_width - 1)
    return audio.frame_rate, samples / full_scale


def audiosegment_to_librosawav(audiosegment: AudioSegment) -> np.ndarray:
    """
    Convert a pydub audio segment into a flat (1-D) float32 NumPy array.

    Each channel is extracted separately, the channels are stacked frame by
    frame, and every value is divided by the maximum of the integer type
    reported by the sample arrays' typecode. The result is then flattened, so
    multi-channel audio comes back interleaved in a single dimension rather
    than as [frames, channels].
    """
    per_channel = [
        mono.get_array_of_samples() for mono in audiosegment.split_to_mono()
    ]

    # Shape (channels, frames) -> (frames, channels), as float32.
    frames = np.transpose(np.array(per_channel)).astype(np.float32)
    frames /= np.iinfo(per_channel[0].typecode).max

    return frames.reshape(-1)


def ndarray_to_segment(
    ndarray: np.ndarray, frame_rate: int, sample_width: int = None, channels: int = None
) -> AudioSegment:
    """
    Build a pydub audio segment from a NumPy sample array.

    The array is written as a 16-bit PCM WAV into an in-memory buffer with
    soundfile and loaded back with pydub. The resulting segment then has its
    frame rate, sample width and channel count set; ``sample_width`` and
    ``channels`` default to whatever the loaded segment already reports.
    """
    wav_bytes = BytesIO()
    sf.write(wav_bytes, ndarray, frame_rate, format="wav", subtype="PCM_16")
    wav_bytes.seek(0)  # rewind so pydub reads from the start of the WAV
    segment: AudioSegment = AudioSegment.from_wav(wav_bytes)

    target_width = segment.sample_width if sample_width is None else sample_width
    target_channels = segment.channels if channels is None else channels

    with_rate = segment.set_frame_rate(frame_rate)
    with_width = with_rate.set_sample_width(target_width)
    return with_width.set_channels(target_channels)


def apply_prosody_to_audio_segment(
    audio_segment: AudioSegment,
    rate: float = 1,
    volume: float = 0,
    pitch: float = 0,
    sr: int = 24000,
) -> AudioSegment:
    """
    Apply rate / volume / pitch changes to a pydub audio segment.

    Args:
        audio_segment: Input audio.
        rate: Time-stretch factor forwarded to apply_prosody_to_audio_data
            (1 leaves the tempo unchanged).
        volume: Gain factor forwarded to apply_prosody_to_audio_data
            (0 leaves the volume unchanged).
        pitch: Pitch shift forwarded to apply_prosody_to_audio_data
            (0 leaves the pitch unchanged).
        sr: Sample rate used for processing and for the returned segment.

    Returns:
        A new segment at ``sr`` with the input's sample width and channel
        count.
    """
    # The samples are processed and written out as ``sr`` Hz audio, so they
    # must really be at that rate; otherwise the result plays at the wrong
    # speed and pitch.
    if audio_segment.frame_rate != sr:
        audio_segment = audio_segment.set_frame_rate(sr)

    audio_data = audiosegment_to_librosawav(audio_segment)
    if audio_segment.channels > 1:
        # audiosegment_to_librosawav returns interleaved frames as one flat
        # array. Restore the (frames, channels) layout so the audio is not
        # processed and written back as a single mono stream.
        audio_data = audio_data.reshape(-1, audio_segment.channels)

    audio_data = apply_prosody_to_audio_data(audio_data, rate, volume, pitch, sr)

    return ndarray_to_segment(
        audio_data, sr, audio_segment.sample_width, audio_segment.channels
    )


def apply_prosody_to_audio_data(
    audio_data: np.ndarray,
    rate: float = 1,
    volume: float = 0,
    pitch: float = 0,
    sr: int = 24000,
) -> np.ndarray:
    """
    Apply rate / volume / pitch changes to raw audio samples.

    The steps run in a fixed order and each is skipped at its neutral value:
      1. ``rate != 1``   -> pyrubberband time stretch by ``rate``.
      2. ``volume != 0`` -> samples multiplied by ``volume``. Note that 0
         means "leave unchanged", not silence.
      3. ``pitch != 0``  -> pyrubberband pitch shift with ``n_steps=pitch``
         (presumably semitones -- confirm against pyrubberband).

    With all defaults the input array is returned as-is.
    """
    stretched = (
        audio_data if rate == 1 else pyrb.time_stretch(audio_data, sr=sr, rate=rate)
    )
    scaled = stretched if volume == 0 else stretched * volume

    if pitch == 0:
        return scaled
    return pyrb.pitch_shift(scaled, sr=sr, n_steps=pitch)


def apply_normalize(
    audio_data: np.ndarray,
    headroom: float = 1,
    sr: int = 24000,
):
    """
    Normalize raw audio samples with pydub's ``effects.normalize``.

    The samples are wrapped in a pydub segment at ``sr``, normalized with the
    given ``headroom`` (presumably in dB -- confirm against pydub), and
    converted back with pydub_to_np.

    Returns whatever pydub_to_np returns: the tuple (frame_rate, samples).
    """
    normalized = effects.normalize(
        seg=ndarray_to_segment(audio_data, sr), headroom=headroom
    )
    return pydub_to_np(normalized)


if __name__ == "__main__":
    # Demo: read an MP3 given on the command line and write one WAV per
    # time-stretch factor and one per pitch-shift value next to it.
    if len(sys.argv) < 2:
        # Fail with a usage hint instead of a bare IndexError.
        sys.exit(f"usage: {sys.argv[0]} <input.mp3>")
    input_file = sys.argv[1]

    time_stretch_factors = [0.5, 0.75, 1.5, 1.0]
    pitch_shift_factors = [-12, -5, 0, 5, 12]

    input_sound = AudioSegment.from_mp3(input_file)
    # Process at the file's own sample rate; the function's 24000 Hz default
    # would mislabel audio recorded at any other rate.
    input_sr = input_sound.frame_rate

    for time_factor in time_stretch_factors:
        output_wav = f"{input_file}_time_{time_factor}.wav"
        output_sound = apply_prosody_to_audio_segment(
            input_sound, rate=time_factor, sr=input_sr
        )
        output_sound.export(output_wav, format="wav")

    for pitch_factor in pitch_shift_factors:
        output_wav = f"{input_file}_pitch_{pitch_factor}.wav"
        output_sound = apply_prosody_to_audio_segment(
            input_sound, pitch=pitch_factor, sr=input_sr
        )
        output_sound.export(output_wav, format="wav")