File size: 9,380 Bytes
0c98d02
398c97e
0c98d02
96bc128
e5ca74b
05e653a
f4a11e0
85d7512
f4a11e0
85d7512
c4e12c1
 
48fb8e1
f4a11e0
 
48fb8e1
1c42a58
48fb8e1
85d7512
1c42a58
05e653a
1c42a58
 
 
 
 
 
 
51aa8a2
 
 
b5dc05a
05e653a
c380881
c2149f9
18b0529
 
c4e12c1
18b0529
1c42a58
 
c8d083e
96bc128
fef57d1
 
f5a7a36
fef57d1
63e6915
1f944a7
 
48c20ac
9150f85
1c42a58
 
9186464
f4a11e0
e5b27eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0e04c2
 
 
 
 
 
 
 
 
d6b0b08
b0e04c2
 
 
02b84d5
b0e04c2
 
 
02b84d5
95bce55
b0e04c2
c0f3731
b0e04c2
 
fba55a6
b0e04c2
02b84d5
b0e04c2
02b84d5
b0e04c2
 
 
 
fba55a6
9ca1653
fba55a6
 
d6b0b08
 
fba55a6
 
6179052
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0e04c2
 
 
 
 
 
 
 
 
 
 
 
 
bebe771
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0e04c2
bebe771
 
b0e04c2
bebe771
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0e04c2
 
 
 
5c1706d
b0e04c2
 
 
 
 
c4e12c1
8057d43
05e653a
 
 
 
c4e12c1
b0e04c2
 
c4e12c1
 
 
 
 
 
 
 
 
b0e04c2
 
1c42a58
c4e12c1
b0e04c2
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import gradio as gr
import torch

from PIL import Image
import numpy as np
from spectro import wav_bytes_from_spectrogram_image

from diffusers import StableDiffusionPipeline
from diffusers import StableDiffusionImg2ImgPipeline

from share_btn import community_icon_html, loading_icon_html, share_js

# Prefer the GPU, but fall back to the CPU so that importing this module does
# not crash on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is only requested on the GPU; on the CPU the default float32
# is used. NOTE(review): assumes fp16 inference is not usable on CPU for this
# model - confirm against the installed torch/diffusers versions.
_torch_dtype = torch.float16 if device == "cuda" else torch.float32
MODEL_ID = "riffusion/riffusion-model-v1"

# Text-to-image pipeline used by `classic` (prompt -> spectrogram image).
pipe = StableDiffusionPipeline.from_pretrained(MODEL_ID, torch_dtype=_torch_dtype)
pipe = pipe.to(device)
# Image-to-image pipeline used by `style_transfer`. It is built from the
# components already loaded for `pipe`, so the checkpoint is not loaded (and
# held in device memory) a second time.
pipe2 = StableDiffusionImg2ImgPipeline(**pipe.components)
pipe2 = pipe2.to(device)

# Remote Space called with an audio file path; `style_transfer` opens its
# result with `Image.open`, so it presumably returns an image file path.
spectro_from_wav = gr.Interface.load("spaces/fffiloni/audio-to-spectrogram")

def predict(prompt, negative_prompt, audio_input, duration):
    """Dispatch a request to text-to-music or to audio style transfer.

    Args:
        prompt: Text prompt forwarded to the selected generator.
        negative_prompt: Negative prompt forwarded to the selected generator.
        audio_input: Value of the audio component; ``None`` selects the
            text-only path, anything else selects style transfer.
        duration: Requested duration; only forwarded on the text-only path.

    Returns:
        Whatever ``classic`` or ``style_transfer`` returns.
    """
    # Identity comparison: ``== None`` would go through the operand's __eq__.
    if audio_input is None:
        return classic(prompt, negative_prompt, duration)
    return style_transfer(prompt, negative_prompt, audio_input)

def classic(prompt, negative_prompt, duration):
    """Generate a spectrogram image from text and write its audio to disk.

    The image is 512 px tall. Its width is 512 px for a duration of 5 and
    grows by 128 px for every additional whole unit of ``duration``.

    Returns:
        A 5-tuple: the generated image, the string ``'output.wav'`` and three
        ``gr.update(visible=True)`` values.
    """
    # A duration of 5 contributes zero extra columns, so one formula covers
    # every case.
    extra_units = int(duration) - 5
    image_width = 512 + (extra_units * 128)

    generated = pipe(
        prompt,
        negative_prompt=negative_prompt,
        height=512,
        width=image_width,
    )
    spectrogram = generated.images[0]
    print(spectrogram)

    # Only the first element of the converter's result is written out.
    audio = wav_bytes_from_spectrogram_image(spectrogram)
    with open("output.wav", "wb") as wav_file:
        wav_file.write(audio[0].getbuffer())

    reveal = [gr.update(visible=True) for _ in range(3)]
    return (spectrogram, 'output.wav', *reveal)

def style_transfer(prompt, negative_prompt, audio_input):
    """Re-style an audio clip with the image-to-image pipeline.

    The audio is converted to a spectrogram image by ``spectro_from_wav``,
    passed through ``image_from_spectrogram`` and then used as the initial
    image for ``pipe2`` (strength 0.5, guidance scale 7).

    Args:
        prompt: Text prompt guiding the diffusion.
        negative_prompt: Negative prompt guiding the diffusion.
        audio_input: Audio value handed to ``spectro_from_wav``.

    Returns:
        A 5-tuple: the new spectrogram image, the string ``'output.wav'`` and
        three ``gr.update(visible=True)`` values.
    """
    spec = spectro_from_wav(audio_input)
    print(spec)

    # Read the pixels inside the context manager so the underlying file
    # handle is released instead of leaking on every request.
    with Image.open(spec) as source_image:
        # Post-process the loaded image before using it as the init image.
        init_image = image_from_spectrogram(source_image, 1)

    # The negative prompt is forwarded here as well, so it is honoured on this
    # path exactly as it is in `classic`.
    new_spectro = pipe2(
        prompt=prompt,
        negative_prompt=negative_prompt,
        image=init_image,
        strength=0.5,
        guidance_scale=7,
    ).images

    # Only the first element of the converter's result is written out.
    wav = wav_bytes_from_spectrogram_image(new_spectro[0])
    with open("output.wav", "wb") as f:
        f.write(wav[0].getbuffer())
    return new_spectro[0], 'output.wav', gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)

def image_from_spectrogram(
    spectrogram: np.ndarray, max_volume: float = 50, power_for_image: float = 0.25
) -> Image.Image:
    """
    Render a spectrogram magnitude array as an RGB PIL image.

    The values are raised to ``power_for_image``, multiplied by 255, divided
    by ``max_volume``, subtracted from 255 and cast to ``uint8``. The image
    built from them is flipped top-to-bottom and converted to RGB.
    """
    # Power curve followed by the rescale toward the 0-255 range.
    scaled = np.power(spectrogram, power_for_image) * 255 / max_volume

    # Invert the scaled values.
    inverted = 255 - scaled

    # No clipping is applied before the uint8 cast.
    picture = Image.fromarray(inverted.astype(np.uint8))

    # Flip vertically, then expand to three channels.
    return picture.transpose(Image.FLIP_TOP_BOTTOM).convert("RGB")

# HTML header (heading plus one-line description) rendered at the top of the
# first column via gr.HTML(title).
title = """
    <div style="text-align: center; max-width: 500px; margin: 0 auto;">
        <div
        style="
            display: inline-flex;
            align-items: center;
            gap: 0.8rem;
            font-size: 1.75rem;
            margin-bottom: 10px;
            line-height: 1em;
        "
        >
        <h1 style="font-weight: 600; margin-bottom: 7px;">
            Riffusion real-time music generation
        </h1>
        </div>
        <p style="margin-bottom: 10px;font-size: 94%;font-weight: 100;line-height: 1.5em;">
        Describe a musical prompt, generate music by getting a spectrogram image & sound.
        </p>
    </div>
"""

# HTML footer rendered below the outputs via gr.HTML(article): model notes,
# "duplicate this Space" / Colab badges, credits and a related-Space link.
# The em dashes and the hugging-face emoji are stored as real Unicode
# characters (they had been corrupted into mis-decoded byte sequences), and
# the duplicate link uses the /spaces/ URL form like the other Space link.
article = """
    <p style="font-size: 0.8em;line-height: 1.2em;border: 1px solid #374151;border-radius: 8px;padding: 20px;">
    About the model: Riffusion is a latent text-to-image diffusion model capable of generating spectrogram images given any text input. These spectrograms can be converted into audio clips.
    <br />—
    <br />The Riffusion model was created by fine-tuning the Stable-Diffusion-v1-5 checkpoint.
    <br />—
    <br />The model is intended for research purposes only. Possible research areas and tasks include 
    generation of artworks, audio, and use in creative processes, applications in educational or creative tools, research on generative models.

    </p>

    <p style="text-align: center;font-size: 94%">
        Do you need faster results ? You can skip the queue by duplicating this space: 
        <span style="display: flex;align-items: center;justify-content: center;height: 30px;">
            <a href="https://huggingface.co/spaces/fffiloni/spectrogram-to-music?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a>       
            <a href="https://colab.research.google.com/drive/1FhH3HlN8Ps_Pr9OR6Qcfbfz7utDvICl0?usp=sharing" target="_blank"><img src="https://colab.research.google.com/assets/colab-badge.svg" /></a>
        </span>
    </p>
    
    <div class="footer">
        <p>
        <a href="https://huggingface.co/riffusion/riffusion-model-v1" target="_blank">Riffusion model</a> by Seth Forsgren and Hayk Martiros - 
        Demo by 🤗 <a href="https://twitter.com/fffiloni" target="_blank">Sylvain Filoni</a>
        </p>
    </div>

    <div id="may-like-container" style="display: flex;justify-content: center;flex-direction: column;align-items: center;">
        <p style="font-size: 0.8em;margin-bottom: 4px;">You may also like: </p>
        <div id="may-like" style="display:flex; align-items:center; justify-content: center;height:20px;">
            <svg height="20" width="158" style="margin-left:4px">       
                <a href="https://huggingface.co/spaces/fffiloni/img-to-music" target="_blank">
                    <image href="https://img.shields.io/badge/🤗 Spaces-Image to Music-blue" src="https://img.shields.io/badge/🤗 Spaces-Image to Music-blue.png" height="20"/>
                </a>
    </svg>
        </div>
    </div>

"""

# Inline stylesheet for the page: column widths, link styling, the footer,
# the spinner animation and the share-button container.
# NOTE(review): the gr.Blocks(...) call in this file passes css="style.css"
# rather than this variable, so this string looks unused - confirm whether it
# is still needed or should be passed to gr.Blocks instead.
css = '''
    #col-container, #col-container-2 {max-width: 510px; margin-left: auto; margin-right: auto;}
    a {text-decoration-line: underline; font-weight: 600;}
    div#record_btn > .mt-6 {
        margin-top: 0!important;
    }
    div#record_btn > .mt-6 button {
        width: 100%;
        height: 40px;
    }
    .footer {
        margin-bottom: 45px;
        margin-top: 10px;
        text-align: center;
        border-bottom: 1px solid #e5e5e5;
    }
    .footer>p {
        font-size: .8rem;
        display: inline-block;
        padding: 0 10px;
        transform: translateY(10px);
        background: white;
    }
    .dark .footer {
        border-color: #303030;
    }
    .dark .footer>p {
        background: #0b0f19;
    }
    .animate-spin {
        animation: spin 1s linear infinite;
    }
    @keyframes spin {
        from {
            transform: rotate(0deg);
        }
        to {
            transform: rotate(360deg);
        }
    }
    #share-btn-container {
        display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
    }
    #share-btn {
        all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;right:0;
    }
    #share-btn * {
        all: unset;
    }
    #share-btn-container div:nth-child(-n+2){
        width: auto !important;
        min-height: 0px !important;
    }
    #share-btn-container .wrap {
        display: none !important;
    }

'''
 


# Page layout: an input column, an output column, then the event wiring.
with gr.Blocks(css="style.css") as demo:

    # Input column: header, prompt, (hidden) audio upload, options, submit.
    with gr.Column(elem_id="col-container"):
        gr.HTML(title)

        prompt_input = gr.Textbox(
            placeholder="a cat diva singing in a New York jazz club",
            label="Musical prompt",
            elem_id="prompt-in",
        )
        # Created hidden, so `predict` receives None for it unless it is
        # made visible and filled in.
        audio_input = gr.Audio(
            source="upload",
            type="filepath",
            visible=False,
        )
        with gr.Row():
            negative_prompt = gr.Textbox(label="Negative prompt")
            duration_input = gr.Slider(
                label="Duration in seconds",
                minimum=5,
                maximum=10,
                step=1,
                value=8,
                elem_id="duration-slider",
            )

        send_btn = gr.Button(
            value="Get a new spectrogram ! ",
            elem_id="submit-btn",
        )

    # Output column: results, share widgets (hidden until a result exists)
    # and the footer HTML.
    with gr.Column(elem_id="col-container-2"):
        spectrogram_output = gr.Image(
            label="spectrogram image result",
            elem_id="img-out",
        )
        sound_output = gr.Audio(
            type='filepath',
            label="spectrogram sound",
            elem_id="music-out",
        )

        with gr.Group(elem_id="share-btn-container"):
            community_icon = gr.HTML(community_icon_html, visible=False)
            loading_icon = gr.HTML(loading_icon_html, visible=False)
            share_button = gr.Button(
                "Share to community",
                elem_id="share-btn",
                visible=False,
            )

        gr.HTML(article)

    # The order of both lists matches predict's parameters and its 5-tuple
    # return value respectively.
    predict_inputs = [prompt_input, negative_prompt, audio_input, duration_input]
    predict_outputs = [
        spectrogram_output,
        sound_output,
        share_button,
        community_icon,
        loading_icon,
    ]
    send_btn.click(predict, inputs=predict_inputs, outputs=predict_outputs)

    # Client-side only: runs the share script, no Python callback.
    share_button.click(None, [], [], _js=share_js)

demo.queue(max_size=250).launch(debug=True)