File size: 4,264 Bytes
372395e
caed802
b19e4a9
f694503
04d2706
 
caed802
372395e
caed802
372395e
41b5a1b
6cc068b
eb1af87
b6e8417
cb934a1
6cc068b
41b5a1b
cb934a1
 
36b4db6
41b5a1b
b5357a4
6cc068b
 
 
372395e
79b4496
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48c215d
79b4496
 
 
 
 
 
 
 
 
 
 
e7c2915
 
 
 
 
 
 
 
 
 
 
 
12bd467
 
 
 
 
 
e7c2915
 
 
79b4496
 
372395e
 
 
79b4496
372395e
 
6cc068b
 
 
 
8e6038a
f694503
372395e
570b690
79b4496
53f5458
 
 
 
 
79b4496
fdef21e
6cc068b
bc6b39c
372395e
a63d987
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import gradio as gr
import os
import time
from moviepy.editor import *
from share_btn import community_icon_html, loading_icon_html, share_js

# Hugging Face access token, read from the environment; used to authenticate
# against the audio-generation Space below.
token = os.environ.get('HF_TOKEN')
# Remote Gradio Spaces loaded as callable clients:
# `caption`   — image -> text caption (GPT2-based captioner)
# `audio_gen` — text  -> generated audio (AudioLDM)
caption = gr.Blocks.load(name="spaces/SRDdev/Image-Caption")
audio_gen = gr.Blocks.load(name="spaces/fffiloni/audioldm-text-to-audio-generation-clone", api_key=token)

# Default placeholder text for the optional manual-caption textbox.
ph_message="If you're not happy with sound result, you can manually describe the scene depicted in your image :)"
def infer(image_input, manual_caption, duration_in, seed):
    """Generate a sound effect for an image.

    Parameters
    ----------
    image_input : str
        Filepath of the uploaded image (from the `gr.Image(type="filepath")` input).
    manual_caption : str
        Optional user-provided scene description; when empty, a caption is
        generated automatically from the image.
    duration_in : int | float
        Desired audio duration in seconds (from the duration slider).
    seed : int | float
        Random seed forwarded to the audio generator.

    Returns
    -------
    tuple
        (caption text, generated audio, textbox placeholder update,
        share-button group visibility update).
    """
    print(duration_in)
    # No manual description given -> ask the remote captioning Space for one.
    if not manual_caption:
        cap = caption(image_input, fn_index=0)
        print("gpt2 caption: '" + cap + "' • ")
        # Fixed typo: placeholder previously read "GP2 Caption".
        ph_update = "GPT2 Caption: " + cap
    else:
        cap = manual_caption
        # Fixed typo: log previously read "manual captiony".
        print("manual caption: " + cap)
        ph_update = ""

    # 2.5 = guidance scale, 3 = number of candidate waveforms — values expected
    # by the AudioLDM Space's fn_index=0 endpoint (TODO confirm against that Space).
    sound = audio_gen(cap, duration_in, 2.5, seed, 3, fn_index=0)

    # sound[1] is the playable audio component of the Space's response.
    return cap, sound[1], gr.Textbox.update(placeholder=f"{ph_update}{ph_message}"), gr.Group.update(visible=True)

title = """
    <div style="text-align: center; max-width: 700px; margin: 0 auto;">
        <div
        style="
            display: inline-flex;
            align-items: center;
            gap: 0.8rem;
            font-size: 1.75rem;
        "
        >
        <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
            Image to Sound Effect
        </h1>
        </div>
        <p style="margin-bottom: 10px; font-size: 94%">
        Convert an image to a corresponding sound effect generated through GPT2 Image Captioning & AudioLDM
        </p>
    </div>
"""

article = """
    
    <div class="footer">
        <p>
         
        Follow <a href="https://twitter.com/fffiloni" target="_blank">Sylvain Filoni</a> for future updates πŸ€—
        </p>
    </div>

    <div id="may-like-container" style="display: flex;justify-content: center;flex-direction: column;align-items: center;margin-bottom: 30px;">
        <p>You may also like: </p>
        
        <div id="may-like-content" style="display:flex;flex-wrap: wrap;align-items:center;height:20px;">
            
            <svg height="20" width="208" style="margin-left:4px;margin-bottom: 6px;">       
                 <a href="https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation" target="_blank">
                    <image href="https://img.shields.io/badge/πŸ€— Spaces-AudioLDM_Text_to_Audio-blue" src="https://img.shields.io/badge/πŸ€— Spaces-AudioLDM_Text_to_Audio-blue.png" height="20"/>
                 </a>
            </svg>

            <svg height="20" width="122" style="margin-left:4px;margin-bottom: 6px;">       
                 <a href="https://huggingface.co/spaces/fffiloni/spectrogram-to-music" target="_blank">
                    <image href="https://img.shields.io/badge/πŸ€— Spaces-Riffusion-blue" src="https://img.shields.io/badge/πŸ€— Spaces-Riffusion-blue.png" height="20"/>
                 </a>
            </svg>
            
        </div>
    </div>
"""

with gr.Blocks(css="style.css") as demo:
    with gr.Column(elem_id="col-container"):
        
        gr.HTML(title)
    
        input_img = gr.Image(type="filepath", elem_id="input-img")
        manual_cap = gr.Textbox(label="Manual Image description (optional)", placeholder=ph_message)
        with gr.Row():
            duration_in = gr.Slider(minimum=5, maximum=10, step=5, value=5, label="Duration")
            seed_in = gr.Number(label="Seed", value=45)
        caption_output = gr.Textbox(label="Caption", visible=False, elem_id="text-caption")
        sound_output = gr.Audio(label="Result", elem_id="sound-output")
        
        generate = gr.Button("Generate SFX from Image")

        with gr.Group(elem_id="share-btn-container", visible=False) as share_group:
            community_icon = gr.HTML(community_icon_html)
            loading_icon = gr.HTML(loading_icon_html)
            share_button = gr.Button("Share to community", elem_id="share-btn")

        gr.HTML(article)
        
    generate.click(infer, inputs=[input_img, manual_cap, duration_in, seed_in], outputs=[caption_output, sound_output, manual_cap, share_group], api_name="i2fx")
    share_button.click(None, [], [], _js=share_js)

demo.queue(max_size=32).launch(debug=True)