File size: 7,859 Bytes
0c98d02
398c97e
0c98d02
05e653a
f4a11e0
85d7512
f4a11e0
85d7512
c4e12c1
 
48fb8e1
f4a11e0
 
48fb8e1
1c42a58
48fb8e1
85d7512
1c42a58
05e653a
1c42a58
 
 
 
 
 
 
51aa8a2
 
 
b5dc05a
05e653a
c380881
c2149f9
18b0529
 
c4e12c1
18b0529
1c42a58
 
c8d083e
504a7bd
9150f85
1c42a58
 
 
f4a11e0
b0e04c2
 
 
 
 
 
 
 
 
d6b0b08
b0e04c2
 
 
02b84d5
b0e04c2
 
 
02b84d5
95bce55
b0e04c2
c0f3731
b0e04c2
 
fba55a6
b0e04c2
02b84d5
b0e04c2
02b84d5
b0e04c2
 
 
 
 
 
 
fba55a6
b0e04c2
 
fba55a6
9ca1653
fba55a6
 
d6b0b08
 
fba55a6
 
b0e04c2
 
 
 
 
 
 
 
 
 
 
 
 
bebe771
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0e04c2
bebe771
 
b0e04c2
bebe771
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0e04c2
 
 
 
 
 
 
 
 
 
c4e12c1
1c42a58
05e653a
 
 
 
c4e12c1
b0e04c2
 
c4e12c1
 
 
 
 
 
 
 
 
b0e04c2
 
1c42a58
c4e12c1
b0e04c2
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import gradio as gr
import torch

from spectro import wav_bytes_from_spectrogram_image

from diffusers import StableDiffusionPipeline
from diffusers import StableDiffusionImg2ImgPipeline

from share_btn import community_icon_html, loading_icon_html, share_js

# Use the GPU when one is available; otherwise fall back to the CPU so the
# module can still be loaded (much more slowly) on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is only requested on the GPU; float16 inference is poorly
# supported on CPU, so full precision is used there.
dtype = torch.float16 if device == "cuda" else torch.float32
MODEL_ID = "riffusion/riffusion-model-v1"

# Text-to-image pipeline, called by classic() to produce a spectrogram image.
pipe = StableDiffusionPipeline.from_pretrained(MODEL_ID, torch_dtype=dtype)
pipe = pipe.to(device)
# Image-to-image pipeline built from the same checkpoint, called by
# style_transfer() to re-render an existing spectrogram.
pipe2 = StableDiffusionImg2ImgPipeline.from_pretrained(MODEL_ID, torch_dtype=dtype)
pipe2 = pipe2.to(device)

# Remote Space loaded as a callable; style_transfer() passes it the uploaded
# audio file path to obtain a spectrogram.
spectro_from_wav = gr.Interface.load("spaces/fffiloni/audio-to-spectrogram")

def predict(prompt, negative_prompt, audio_input, duration):
    """Route a request to text-to-music or audio style transfer.

    Args:
        prompt: Text describing the music to generate.
        negative_prompt: Text describing what the result should avoid.
        audio_input: Value of the audio upload component, or ``None`` when
            nothing was uploaded.
        duration: Requested clip length in seconds; only used when no audio
            was uploaded.

    Returns:
        Whatever ``classic`` or ``style_transfer`` returns: the values for
        the spectrogram image, the audio player and the three share widgets.
    """
    # Identity check: None is a singleton, and `==` could be overridden by
    # whatever object the audio component hands over.
    if audio_input is None:
        return classic(prompt, negative_prompt, duration)
    return style_transfer(prompt, negative_prompt, audio_input)

def classic(prompt, negative_prompt, duration):
    """Generate a spectrogram from text and render it to a wav file.

    Args:
        prompt: Text describing the music to generate.
        negative_prompt: Text describing what the result should avoid.
        duration: Requested clip length in seconds. Five seconds maps to a
            512 px wide spectrogram and every additional whole second adds
            128 px; fractional seconds are truncated.

    Returns:
        A 5-tuple: the generated spectrogram image, the path of the written
        wav file, and three ``gr.update(visible=True)`` objects that reveal
        the share widgets.
    """
    # One formula covers every duration: at 5 seconds the extra term is zero,
    # so no special case is needed.
    extra_seconds = int(duration) - 5
    width = 512 + extra_seconds * 128

    spec = pipe(
        prompt,
        negative_prompt=negative_prompt,
        height=512,
        width=width,
    ).images[0]

    # The helper's first element is written out via getbuffer().
    wav = wav_bytes_from_spectrogram_image(spec)
    # NOTE(review): the output path is fixed, so every request overwrites the
    # same file — confirm this is acceptable for concurrent users.
    with open("output.wav", "wb") as f:
        f.write(wav[0].getbuffer())

    return (
        spec,
        'output.wav',
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(visible=True),
    )

def style_transfer(prompt, negative_prompt, audio_input):
    """Re-render uploaded audio in the style described by the prompt.

    The audio is converted to a spectrogram, passed through the
    image-to-image pipeline, and the resulting spectrogram is converted back
    to a wav file.

    Args:
        prompt: Text describing the target musical style.
        negative_prompt: Text describing what the result should avoid.
        audio_input: The uploaded audio, as provided by the audio component.

    Returns:
        A 5-tuple: the new spectrogram image, the path of the written wav
        file, and three ``gr.update(visible=True)`` objects that reveal the
        share widgets.
    """
    # NOTE(review): spectro_from_wav is a remotely loaded Space; the type it
    # returns (image object vs. file path) is not visible here — confirm it is
    # something the img2img pipeline accepts as `image`.
    spec = spectro_from_wav(audio_input)

    images = pipe2(
        prompt=prompt,
        negative_prompt=negative_prompt,  # previously accepted but ignored
        image=spec,
        strength=0.5,
        guidance_scale=7,
    ).images
    new_spectro = images[0]

    wav = wav_bytes_from_spectrogram_image(new_spectro)
    # NOTE(review): the output path is fixed, so every request overwrites the
    # same file — confirm this is acceptable for concurrent users.
    with open("output.wav", "wb") as f:
        f.write(wav[0].getbuffer())

    # Return the single image (not the whole list) so the image output gets
    # the same kind of value that classic() returns.
    return (
        new_spectro,
        'output.wav',
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(visible=True),
    )

# HTML header (page heading plus a one-line description) rendered at the top
# of the first column via gr.HTML(title).
title = """
    <div style="text-align: center; max-width: 500px; margin: 0 auto;">
        <div
        style="
            display: inline-flex;
            align-items: center;
            gap: 0.8rem;
            font-size: 1.75rem;
            margin-bottom: 10px;
            line-height: 1em;
        "
        >
        <h1 style="font-weight: 600; margin-bottom: 7px;">
            Riffusion real-time music generation
        </h1>
        </div>
        <p style="margin-bottom: 10px;font-size: 94%;font-weight: 100;line-height: 1.5em;">
        Describe a musical prompt, generate music by getting a spectrogram image & sound.
        </p>
    </div>
"""

article = """
    <p style="font-size: 0.8em;line-height: 1.2em;border: 1px solid #374151;border-radius: 8px;padding: 20px;">
    About the model: Riffusion is a latent text-to-image diffusion model capable of generating spectrogram images given any text input. These spectrograms can be converted into audio clips.
    <br />—
    <br />The Riffusion model was created by fine-tuning the Stable-Diffusion-v1-5 checkpoint.
    <br />—
    <br />The model is intended for research purposes only. Possible research areas and tasks include 
    generation of artworks, audio, and use in creative processes, applications in educational or creative tools, research on generative models.

    </p>
    <div class="footer">
        <p>
        <a href="https://huggingface.co/riffusion/riffusion-model-v1" target="_blank">Riffusion model</a> by Seth Forsgren and Hayk Martiros - 
        Demo by 🤗 <a href="https://twitter.com/fffiloni" target="_blank">Sylvain Filoni</a>
        </p>
    </div>

    <p style="text-align: center;font-size: 94%">
        Do you need faster results ? You can skip the queue by duplicating this space: 
        <span style="display: flex;align-items: center;justify-content: center;height: 30px;">
            <a href="https://huggingface.co/fffiloni/spectrogram-to-music?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a>       
            <a href="https://colab.research.google.com/drive/1FhH3HlN8Ps_Pr9OR6Qcfbfz7utDvICl0?usp=sharing" target="_blank"><img src="https://colab.research.google.com/assets/colab-badge.svg" /></a>
        </span>
    </p>
"""

# Custom stylesheet handed to gr.Blocks(css=css). It constrains the two
# layout columns (#col-container, #col-container-2), styles the .footer used
# in the article HTML, defines the spin animation, and styles the share
# button group (#share-btn-container, #share-btn).
# NOTE(review): no component in this file sets elem_id="record_btn", so the
# div#record_btn rules look unused here — confirm before removing.
css = '''
    #col-container, #col-container-2 {max-width: 510px; margin-left: auto; margin-right: auto;}
    a {text-decoration-line: underline; font-weight: 600;}
    div#record_btn > .mt-6 {
        margin-top: 0!important;
    }
    div#record_btn > .mt-6 button {
        width: 100%;
        height: 40px;
    }
    .footer {
        margin-bottom: 45px;
        margin-top: 10px;
        text-align: center;
        border-bottom: 1px solid #e5e5e5;
    }
    .footer>p {
        font-size: .8rem;
        display: inline-block;
        padding: 0 10px;
        transform: translateY(10px);
        background: white;
    }
    .dark .footer {
        border-color: #303030;
    }
    .dark .footer>p {
        background: #0b0f19;
    }
    .animate-spin {
        animation: spin 1s linear infinite;
    }
    @keyframes spin {
        from {
            transform: rotate(0deg);
        }
        to {
            transform: rotate(360deg);
        }
    }
    #share-btn-container {
        display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
    }
    #share-btn {
        all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;right:0;
    }
    #share-btn * {
        all: unset;
    }
    #share-btn-container div:nth-child(-n+2){
        width: auto !important;
        min-height: 0px !important;
    }
    #share-btn-container .wrap {
        display: none !important;
    }

'''
 


with gr.Blocks(css=css) as demo:

    # First column: page header, the generation controls and the submit button.
    with gr.Column(elem_id="col-container"):
        gr.HTML(title)

        prompt_input = gr.Textbox(
            label="Musical prompt",
            placeholder="a cat diva singing in a New York jazz club",
            elem_id="prompt-in",
        )
        # Optional upload; when left empty predict() takes the text-only path.
        audio_input = gr.Audio(type="filepath", source="upload")
        with gr.Row():
            negative_prompt = gr.Textbox(label="Negative prompt")
            duration_input = gr.Slider(
                minimum=5,
                maximum=10,
                step=1,
                value=8,
                label="Duration in seconds",
                elem_id="duration-slider",
            )

        send_btn = gr.Button(value="Get a new spectrogram ! ", elem_id="submit-btn")

    # Second column: generated results, share controls and the about text.
    with gr.Column(elem_id="col-container-2"):
        spectrogram_output = gr.Image(
            label="spectrogram image result",
            elem_id="img-out",
        )
        sound_output = gr.Audio(
            label="spectrogram sound",
            type='filepath',
            elem_id="music-out",
        )

        # Share widgets are created hidden; the gr.update(visible=True) values
        # returned by the generation functions reveal them.
        with gr.Group(elem_id="share-btn-container"):
            community_icon = gr.HTML(community_icon_html, visible=False)
            loading_icon = gr.HTML(loading_icon_html, visible=False)
            share_button = gr.Button(
                "Share to community",
                elem_id="share-btn",
                visible=False,
            )

        gr.HTML(article)

    # Order matches predict()'s parameters and its 5-tuple return value.
    predict_inputs = [prompt_input, negative_prompt, audio_input, duration_input]
    predict_outputs = [
        spectrogram_output,
        sound_output,
        share_button,
        community_icon,
        loading_icon,
    ]
    send_btn.click(predict, inputs=predict_inputs, outputs=predict_outputs)
    # No Python callback: sharing is handled entirely by the share_js snippet.
    share_button.click(fn=None, inputs=[], outputs=[], _js=share_js)

# Enable the request queue (at most 250 waiting requests), then start the app.
demo.queue(max_size=250)
demo.launch(debug=True)