# Spaces: TencentARC / MotionCtrl_SVD
#
# Running on A10G
#
# File size: 39,915 Bytes

import argparse
import os
import tempfile

import gradio as gr
import numpy as np
import torch
from glob import glob
from torchvision.transforms import CenterCrop, Compose, Resize

from gradio_utils.camera_utils import CAMERA_MOTION_MODE, process_camera, create_relative

from gradio_utils.utils import vis_camera
from gradio_utils.motionctrl_cmcm_gradio import build_model, motionctrl_sample

# Set KMP_DUPLICATE_LIB_OK so that a duplicated OpenMP runtime does not abort the process.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# Identifier of the hosting Space; empty string when the variable is not set.
SPACE_ID = os.getenv('SPACE_ID', '')


#### Description ####
# Page heading (HTML), rendered through gr.Markdown in main().
title = r"""<h1 align="center">MotionCtrl: A Unified and Flexible Motion Controller for Video Generation</h1>"""
# Sub-heading (HTML), rendered through gr.Markdown in main().
subtitle = r"""<h2 align="center">Deployed on SVD Generation</h2>"""
# Centered row of links to the paper, project page, code, showcases and tutorial.
important_link = r"""
<div align='center'>
<a href='https://wzhouxiff.github.io/projects/MotionCtrl/assets/paper/MotionCtrl.pdf'>[Paper]</a>
&ensp; <a href='https://wzhouxiff.github.io/projects/MotionCtrl/'>[Project Page]</a>
&ensp; <a href='https://github.com/TencentARC/MotionCtrl'>[Code]</a>
&ensp; <a href='https://github.com/TencentARC/MotionCtrl/blob/svd/doc/showcase_svd.md'>[Showcases]</a>
&ensp; <a href='https://github.com/TencentARC/MotionCtrl/blob/svd/doc/tutorial.md'>[Tutorial]</a>
</div>
"""

# Introductory blurb (HTML + markdown), rendered through gr.Markdown in main().
# Fixes user-visible typos ("controlling", "available") and moves the final
# period in front of the trailing <br> so it no longer starts a new line.
description = r"""
<b>Official Gradio demo</b> for <a href='https://github.com/TencentARC/MotionCtrl' target='_blank'><b>MotionCtrl: A Unified and Flexible Motion Controller for Video Generation</b></a>.<br>
🔥 MotionCtrl is capable of independently and flexibly controlling the camera motion and object motion of a generated video, with only a unified model.<br>
🤗 Try to control the motion of the generated videos yourself!<br>
❗❗❗ Please note **ONLY** Camera Motion Control in the current version of **MotionCtrl** deployed on **SVD** is available.<br>
❗❗❗ <a href='https://github.com/TencentARC/MotionCtrl/blob/svd/doc/showcase_svd.md' target='_blank'>Showcases</a> and 
<a href='https://github.com/TencentARC/MotionCtrl/blob/svd/doc/tutorial.md' target='_blank'>Tutorial</a> can be found 
<a href='https://github.com/TencentARC/MotionCtrl/blob/svd/doc/tutorial.md' target='_blank'>here</a>.<br>
"""
# <div>
# <img src="https://raw.githubusercontent.com/TencentARC/MotionCtrl/main/assets/svd/00_ibzz5-dxv2h.gif", width="300">
# <img src="https://raw.githubusercontent.com/TencentARC/MotionCtrl/main/assets/svd/01_5guvn-0x6v2.gif", width="300">
# <img src="https://raw.githubusercontent.com/TencentARC/MotionCtrl/main/assets/svd/12_sn7bz-0hcaf.gif", width="300">
# <img src="https://raw.githubusercontent.com/TencentARC/MotionCtrl/main/assets/svd/13_3lyco-4ru8j.gif", width="300">
# </div>
# Footer markdown: star request with a GitHub-stars badge, BibTeX citation and
# contact line.
# NOTE(review): the badge URL is split across two lines inside the literal, and
# the contact address reads "[email protected]", which looks like an
# obfuscation placeholder rather than a real address — confirm against the repo.
article = r"""
If MotionCtrl is helpful, please help to ⭐ the <a href='https://github.com/TencentARC/MotionCtrl' target='_blank'>Github Repo</a>. Thanks! 
[![GitHub Stars](https://img.shields.io/github/stars/TencentARC%2FMotionCtrl
)](https://github.com/TencentARC/MotionCtrl)

---

📝 **Citation**
<br>
If our work is useful for your research, please consider citing:
```bibtex
@inproceedings{wang2023motionctrl,
  title={MotionCtrl: A Unified and Flexible Motion Controller for Video Generation},
  author={Wang, Zhouxia and Yuan, Ziyang and Wang, Xintao and Chen, Tianshui and Xia, Menghan and Luo, Ping and Shan, Ying},
  booktitle={arXiv preprint arXiv:2312.03641},
  year={2023}
}
```

📧 **Contact**
<br>
If you have any questions, please feel free to reach me out at <b>[email protected]</b>.

"""
# Custom stylesheet for the demo: container width, button colours, fixed
# 300px image/video heights and centering helpers.
# NOTE(review): in the visible part of main() the app is created with
# gr.Blocks() and no css argument, so this string may not be applied — confirm.
css = """
.gradio-container {width: 85% !important}
.gr-monochrome-group {border-radius: 5px !important; border: revert-layer !important; border-width: 2px !important; color: black !important;}
span.svelte-s1r2yt {font-size: 17px !important; font-weight: bold !important; color: #d30f2f !important;}
button {border-radius: 8px !important;}
.add_button {background-color: #4CAF50 !important;}
.remove_button {background-color: #f44336 !important;}
.clear_button {background-color: gray !important;}
.mask_button_group {gap: 10px !important;}
.video {height: 300px !important;}
.image {height: 300px !important;}
.video .wrap.svelte-lcpz3o {display: flex !important; align-items: center !important; justify-content: center !important;}
.video .wrap.svelte-lcpz3o > :first-child {height: 100% !important;}
.margin_center {width: 50% !important; margin: auto !important;}
.jc_center {justify-content: center !important;}
"""


# Unit translation directions in world-to-camera (W2C) coordinates.
# NOTE(review): the original note described both z rows as "camera moves
# forward" while labelling them zoom out / zoom in — confirm the +z meaning.
T_base = [
    [1., 0., 0.],    # +x: camera moves left
    [-1., 0., 0.],   # -x: camera moves right
    [0., 1., 0.],    # +y: camera moves up
    [0., -1., 0.],   # -y: camera moves down
    [0., 0., 1.],    # +z: zoom out
    [0., 0., -1.],   # -z: zoom in
]
radius = 1
n = 16  # divisor used when building the demo trajectory below
look_at = np.array([[0.], [0.], [0.8]])  # 3x1 column vector

T_list = []
base_R = np.eye(3)  # identity rotation
res = []            # 3x4 [R|T] matrices of the demo trajectory
res_forsave = []
T_range = 1.8



# Build a 16-pose demo trajectory: identity rotation with the z translation
# growing linearly from 0 in steps of 2/n, then render it as the initial plot.
for i in range(16):
    R = base_R[:, :3]
    T = np.array([0., 0., 1.]).reshape(3, 1) * (i / n) * 2
    RT = np.hstack((R, T))  # 3x4 [R|T]
    res.append(RT)

fig = vis_camera(res)
    
# MODE = ["camera motion control", "object motion control", "camera + object motion control"]
# Top-level control modes.
MODE = [
    "control camera poses",
    "control object trajectory",
    "control both camera and object motion",
]
# Labels of the two resize buttons; process_input_image compares against them.
RESIZE_MODE = [
    'Center Crop To 576x1024',
    'Keep original spatial ratio',
]
# Labels of the three customized camera-pose modes.
DIY_MODE = [
    'Customized Mode 1: First A then B',
    'Customized Mode 2: Both A and B',
    'Customized Mode 3: RAW Camera Poses',
]

## load default model
num_frames = 14
num_steps = 25
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device {device}")

config = "configs/inference/config_motionctrl_cmcm.yaml"
ckpt = 'checkpoints/motionctrl_svd.ckpt'
if not os.path.exists(ckpt):
    # Fetch the checkpoint without shelling out: no dependency on wget/mv, no
    # oddly named intermediate file, and a failed download raises here instead
    # of being ignored and surfacing later as a confusing load error.
    os.makedirs(os.path.dirname(ckpt), exist_ok=True)
    torch.hub.download_url_to_file(
        'https://huggingface.co/TencentARC/MotionCtrl/resolve/main/motionctrl_svd.ckpt?download=true',
        ckpt,
    )
model = build_model(config, ckpt, device, num_frames, num_steps)
# Default output resolution; process_input_image overwrites these globals.
width, height = 1024, 576

traj_list = []
# Shared camera selection state, mutated by the callbacks below:
#   motion  - list of selected basic motions (see add_camera_motion)
#   mode    - how two motions are combined (see change_camera_mode)
#   speed   - motion speed value (see change_camera_speed)
#   complex - selected provided pose, or None (see add_complex_camera_motion)
camera_dict = dict(
    motion=[],
    mode="Customized Mode 1: First A then B",  # "First A then B", "Both A and B", "Custom"
    speed=1.0,
    complex=None,
)

def fn_vis_camera(camera_args):
    """Plot the configured camera trajectory and reveal the generation widgets.

    Args:
        camera_args: Value of the camera-args textbox, forwarded to
            ``process_camera`` together with the module-level ``camera_dict``.

    Returns:
        A tuple of the camera figure followed by eight ``gr.update`` objects
        that all set ``visible=True``; the seventh also clears its value.
    """
    poses = process_camera(camera_dict, camera_args, num_frames=num_frames, width=width, height=height)  # [t, 3, 4]

    # Scale the plot by the largest absolute translation component, never below 1.0.
    largest_translation = np.max(np.abs(poses[:, :, -1]))
    plot_scale = max(1.0, largest_translation / 1.9)

    camera_fig = vis_camera(create_relative(poses), rescale_T=plot_scale)

    return (
        camera_fig,
        *(gr.update(visible=True) for _ in range(6)),
        gr.update(visible=True, value=None),  # generated video: show and clear
        gr.update(visible=True),
    )

def display_camera_info(camera_dict, camera_mode=None):
    """Summarise the camera selection as a one-line string.

    Args:
        camera_dict: Mapping with the keys ``complex``, ``speed``, ``motion``
            and ``mode``.
        camera_mode: Currently selected camera mode; the ``mode`` entry is
            only reported when this equals ``CAMERA_MOTION_MODE[2]``.

    Returns:
        ``"complex : ... speed : ... "`` when a complex pose is selected,
        otherwise ``"motion : ... speed : ... "`` optionally followed by the mode.
    """
    if camera_dict['complex'] is not None:
        return f"complex : {camera_dict['complex']}. " + f"speed : {camera_dict['speed']}. "

    parts = [
        f"motion : {list(camera_dict['motion'])}. ",
        f"speed : {camera_dict['speed']}. ",
    ]
    if camera_mode == CAMERA_MOTION_MODE[2]:
        parts.append(f"mode : {camera_dict['mode']}. ")
    return "".join(parts)

def add_camera_motion(camera_motion, camera_mode):
    """Record a basic camera motion in the shared ``camera_dict``.

    Any selected complex pose is cleared first. In the customized mode
    (``CAMERA_MOTION_MODE[2]``) up to two motions accumulate; otherwise, or
    once two are stored, the list is replaced by the new motion alone.

    Returns:
        The summary string from ``display_camera_info``.
    """
    global camera_dict
    if camera_dict['complex'] is not None:
        camera_dict['complex'] = None

    is_custom_mode = camera_mode == CAMERA_MOTION_MODE[2]
    if is_custom_mode and len(camera_dict['motion']) < 2:
        camera_dict['motion'].append(camera_motion)
    else:
        camera_dict['motion'] = [camera_motion]

    summary = display_camera_info(camera_dict, camera_mode)
    return summary

def add_complex_camera_motion(camera_motion):
    """Select a provided complex camera pose and return the updated summary."""
    global camera_dict
    camera_dict.update(complex=camera_motion)
    summary = display_camera_info(camera_dict)
    return summary

def input_raw_camera_pose(combine_type, camera_mode):
    """Switch to raw camera-pose input.

    Stores ``combine_type`` as the combination mode and returns the UI
    updates: an editable textbox pre-filled with 14 example pose rows of 12
    numbers each, eight hidden widgets, then two visible ones.
    ``camera_mode`` is accepted but not used.
    """
    global camera_dict
    camera_dict['mode'] = combine_type

    # Example rows: identity rotation, last entry decreasing by 0.225 per row.
    example_poses = (
        '1 0 0 0 0 1 0 0 0 0 1 0\n'
        '1 0 0 0 0 1 0 0 0 0 1 -0.225\n'
        '1 0 0 0 0 1 0 0 0 0 1 -0.45\n'
        '1 0 0 0 0 1 0 0 0 0 1 -0.675\n'
        '1 0 0 0 0 1 0 0 0 0 1 -0.9\n'
        '1 0 0 0 0 1 0 0 0 0 1 -1.125\n'
        '1 0 0 0 0 1 0 0 0 0 1 -1.35\n'
        '1 0 0 0 0 1 0 0 0 0 1 -1.575\n'
        '1 0 0 0 0 1 0 0 0 0 1 -1.8\n'
        '1 0 0 0 0 1 0 0 0 0 1 -2.025\n'
        '1 0 0 0 0 1 0 0 0 0 1 -2.25\n'
        '1 0 0 0 0 1 0 0 0 0 1 -2.475\n'
        '1 0 0 0 0 1 0 0 0 0 1 -2.7\n'
        '1 0 0 0 0 1 0 0 0 0 1 -2.925\n'
    )

    return (
        gr.update(value=example_poses, max_lines=16, interactive=True),
        # Eight direction/rotation buttons: hidden.
        *(gr.update(visible=False) for _ in range(8)),
        # Speed slider and raw-pose instructions: shown.
        gr.update(visible=True),
        gr.update(visible=True),
    )

def change_camera_mode(combine_type, camera_mode):
    """Store the combination mode and show the basic-motion controls.

    Returns:
        A tuple of the camera summary string, nine ``gr.update(visible=True)``
        objects (eight direction/rotation buttons plus the speed slider) and a
        final ``gr.update(visible=False)`` hiding the raw-pose instructions.
    """
    global camera_dict
    camera_dict['mode'] = combine_type

    summary = display_camera_info(camera_dict, camera_mode)
    return (
        summary,
        *(gr.update(visible=True) for _ in range(9)),
        gr.update(visible=False),
    )

def change_camera_speed(camera_speed):
    """Store the motion speed in ``camera_dict`` and return the updated summary."""
    global camera_dict
    camera_dict.update(speed=camera_speed)
    summary = display_camera_info(camera_dict)
    return summary

def reset_camera():
    """Rebind the global ``camera_dict`` to fresh defaults and return its summary."""
    global camera_dict
    # A new dict is bound rather than clearing the old one in place.
    camera_dict = dict(
        motion=[],
        mode="Customized Mode 1: First A then B",
        speed=1.0,
        complex=None,
    )
    return display_camera_info(camera_dict)


def visualized_camera_poses(step2_camera_motion):
    """Reset the camera state and show the controls of the chosen camera mode.

    Args:
        step2_camera_motion: Selected camera mode. ``CAMERA_MOTION_MODE[0]``
            shows the basic controls, ``CAMERA_MOTION_MODE[1]`` the provided
            complex poses, and any other value the customized controls.

    Returns:
        A tuple of 39 ``gr.update`` objects, in the order the callback's
        outputs expect. The generation (step 3) widgets are always hidden.
    """
    reset_camera()

    if step2_camera_motion == CAMERA_MOTION_MODE[0]:
        show_basic, show_complex, show_custom = True, False, False
    elif step2_camera_motion == CAMERA_MOTION_MODE[1]:
        show_basic, show_complex, show_custom = False, True, False
    else:  # step2_camera_motion = CAMERA_MOTION_MODE[2]
        show_basic, show_complex, show_custom = False, False, True

    def toggle(flag, count=1):
        # `count` independent visibility updates sharing the same flag.
        return [gr.update(visible=flag) for _ in range(count)]

    updates = []
    updates += toggle(show_basic, 2)      # basic heading + description
    updates += toggle(show_custom, 2)     # custom heading + run status
    updates += toggle(show_complex, 2)    # complex heading + description
    updates += toggle(show_basic, 8)      # U, D, L, R, I, O, ACW, CW
    updates += toggle(show_custom, 3)     # combine1, combine2, combine3
    updates += toggle(False)              # combine3 description
    updates += toggle(not show_custom)    # speed slider
    updates += toggle(show_complex, 8)    # Pose_1 .. Pose_8
    updates.append(gr.update(visible=True, value=None))  # camera args: show and clear
    updates += toggle(True, 2)            # camera reset + visualize buttons
    updates.append(gr.update(visible=True, value=None))  # camera plot: show and clear
    updates += toggle(False, 8)           # generate-video widgets
    return tuple(updates)


def process_input_image(input_image, resize_mode):
    """Resize the uploaded image and reveal the camera-mode selection step.

    Args:
        input_image: Uploaded image exposing ``.size`` as ``(width, height)``,
            or ``None`` when nothing has been uploaded yet.
        resize_mode: ``RESIZE_MODE[0]`` scales the image to cover 576x1024 and
            center-crops it. Any other value keeps the aspect ratio, fixing
            the height to 576 for portrait inputs and the width to 1024
            otherwise.

    Returns:
        A tuple of 44 ``gr.update`` objects: the processed image, four
        visible step-2 widgets, and 39 hidden widgets (the camera-args textbox
        and the camera plot are also cleared).

    Raises:
        gr.Error: If no image has been uploaded. Previously this crashed with
            an ``AttributeError`` on ``None.size``.

    Side effects:
        Overwrites the module-level ``width`` and ``height``, which the
        camera-processing callbacks read.
    """
    global width, height

    if input_image is None:
        raise gr.Error("Please upload an image before selecting a resize mode.")

    src_w, src_h = input_image.size
    if resize_mode == RESIZE_MODE[0]:
        height, width = 576, 1024
        h_ratio = src_h / height
        w_ratio = src_w / width

        # Scale so the image covers the target in both dimensions; the max()
        # guards against int() truncating one pixel below the target.
        if h_ratio > w_ratio:
            scaled_h = max(int(src_h / w_ratio), height)
            input_image = Resize((scaled_h, width))(input_image)
        else:
            scaled_w = max(int(src_w / h_ratio), width)
            input_image = Resize((height, scaled_w))(input_image)

        input_image = CenterCrop((height, width))(input_image)
    else:
        if src_h > src_w:
            height = 576
            width = int(src_w * height / src_h)
        else:
            width = 1024
            height = int(src_h * width / src_w)

        input_image = Resize((height, width))(input_image)

    def hidden(count):
        # `count` independent updates that hide a widget.
        return [gr.update(visible=False) for _ in range(count)]

    return (
        gr.update(visible=True, value=input_image, height=height, width=width),
        # Step 2 heading, description, mode radio and proceed button.
        *(gr.update(visible=True) for _ in range(4)),
        # Basic/custom/complex headings (6), direction buttons (8),
        # combine buttons + description (4), speed (1), Pose_1..Pose_8 (8).
        *hidden(27),
        gr.update(visible=False, value=None),  # camera args: hide and clear
        *hidden(2),                            # camera reset + visualize buttons
        gr.update(visible=False, value=None),  # camera plot: hide and clear
        *hidden(8),                            # generate-video widgets
    )

def model_run(input_image, fps_id, seed, n_samples, camera_args):
    """Generate a video for the processed image and the selected camera poses.

    The poses come from ``process_camera`` using the module-level
    ``camera_dict`` and are flattened to 12 values per row before being
    passed to ``motionctrl_sample``.

    Returns:
        Whatever ``motionctrl_sample`` returns (used as the video path).
    """
    poses = process_camera(camera_dict, camera_args, num_frames=num_frames, width=width, height=height)
    flat_poses = poses.reshape(-1, 12)

    sample_kwargs = dict(
        model=model,
        image=input_image,
        RT=flat_poses,
        num_frames=num_frames,
        fps_id=fps_id,
        decoding_t=1,
        seed=seed,
        sample_num=n_samples,
        device=device,
    )
    return motionctrl_sample(**sample_kwargs)

def main(args):
    demo = gr.Blocks()
    with demo:

        gr.Markdown(title)
        gr.Markdown(subtitle)
        gr.Markdown(important_link)
        gr.Markdown(description)

        with gr.Column():
            
            # step 0: Some useful tricks
            gr.Markdown("## Step 0/3: Some Useful Tricks", show_label=False)
            gr.HighlightedText(value=[("",""), (f"1. If the motion control is not obvious, try to increase the `Motion Speed`. \
                                                \n 2. If the generated videos are distored severely, try to descrease the `Motion Speed` \
                                                or increase `FPS`.", "Normal")],
                                color_map={"Normal": "green", "Error": "red", "Clear clicks": "gray", "Add mask": "green", "Remove mask": "red"}, visible=True)

            # step 2: input an image
            step2_title = gr.Markdown("---\n## Step 1/3: Input an Image", show_label=False, visible=True)
            step2_dec = gr.Markdown(f"\n 1. Upload an Image by `Drag` or Click `Upload Image`; \
                                    \n 2. Click `{RESIZE_MODE[0]}` or `{RESIZE_MODE[1]}` to select the image resize mode. \
                                    You will get a processed image and go into the next step. \
                                    \n - `{RESIZE_MODE[0]}`: Our MotionCtrl is train on image with spatial size 576x1024. Choose `{RESIZE_MODE[0]}` can get better generated video. \
                                    \n - `{RESIZE_MODE[1]}`: Choose `{RESIZE_MODE[1]}` if you want to generate video with the same spatial ratio as the input image.", 
                                    show_label=False, visible=True)
                                    
            with gr.Row(equal_height=True):
                with gr.Column(scale=2):
                    input_image = gr.Image(type="pil", interactive=True, elem_id="input_image", elem_classes='image', visible=True)
                    # process_input_image_button = gr.Button(value="Process Input Image", visible=False)
                    with gr.Row():
                        center_crop_botton = gr.Button(value=RESIZE_MODE[0], visible=True)
                        keep_spatial_raition_botton = gr.Button(value=RESIZE_MODE[1], visible=True)
                with gr.Column(scale=2):
                    process_image = gr.Image(type="pil", interactive=False, elem_id="process_image", elem_classes='image', visible=False)
            # step2_proceed_button = gr.Button(value="Proceed", visible=False)

            
            # step3 - camera motion control
            step2_camera_motion = gr.Markdown("---\n## Step 2/3: Select the camera poses", show_label=False, visible=False)
            step2_camera_motion_des = gr.Markdown(f"\n - {CAMERA_MOTION_MODE[0]}: Including 8 basic camera poses, such as pan up, pan down, zoom in, and zoom out. \
                                                    \n - {CAMERA_MOTION_MODE[1]}: Complex camera poses extracted from the real videos. \
                                                    \n - {CAMERA_MOTION_MODE[2]}: You can customize complex camera poses yourself by combining or fusing two of the eight basic camera poses or input RAW RT matrix. \
                                                    \n - Click `Proceed` to go into next step", 
                                                  show_label=False, visible=False)
            camera_mode = gr.Radio(choices=CAMERA_MOTION_MODE, value=CAMERA_MOTION_MODE[0], label="Camera Motion Control Mode", interactive=True, visible=False)
            camera_info = gr.Button(value="Proceed", visible=False)

            with gr.Row():
                with gr.Column():
                    # step3.1 - camera motion control - basic
                    basic_camera_motion = gr.Markdown("---\n### Basic Camera Poses", show_label=False, visible=False)
                    basic_camera_motion_des = gr.Markdown(f"\n 1. Click one of the basic camera poses, such as `Pan Up`; \
                                                            \n 2. Slide the `Motion speed` to get a speed value. The large the value, the fast the camera motion; \
                                                            \n 3. Click `Visualize Camera and Proceed` to visualize the camera poses and go proceed; \
                                                            \n 4. Click `Reset Camera` to reset the camera poses (If needed). ",
                                                        show_label=False, visible=False)
                    
                    
                    # step3.2 - camera motion control - provided complex
                    complex_camera_motion = gr.Markdown("---\n### Provided Complex Camera Poses", show_label=False, visible=False)
                    complex_camera_motion_des = gr.Markdown(f"\n 1. Click one of the complex camera poses, such as `Pose_1`; \
                                                            \n 2. Click `Visualize Camera and Proceed` to visualize the camera poses and go proceed; \
                                                            \n 3. Click `Reset Camera` to reset the camera poses (If needed). ",
                                                        show_label=False, visible=False)

                    # step3.3 - camera motion control - custom
                    custom_camera_motion = gr.Markdown(f"---\n### {CAMERA_MOTION_MODE[2]}", show_label=False, visible=False)
                    custom_run_status = gr.Markdown(f"\n 1. Click `{DIY_MODE[0]}`, `{DIY_MODE[1]}`, or `{DIY_MODE[2]}` \
                                                    \n - `Customized Mode 1: First A then B`: For example, click `Pan Up` and `Pan Left`, the camera will first `Pan Up` and then `Pan Left`; \
                                                    \n - `Customized Mode 2: Both A and B`: For example, click `Pan Up` and `Pan Left`, the camera will move towards the upper left corner; \
                                                    \n - `{DIY_MODE[2]}`: Input the RAW RT matrix yourselves. \
                                                    \n 2. Slide the `Motion speed` to get a speed value. The large the value, the fast the camera motion; \
                                                    \n 3. Click `Visualize Camera and Proceed` to visualize the camera poses and go proceed; \
                                                    \n 4. Click `Reset Camera` to reset the camera poses (If needed). ",
                                                        show_label=False, visible=False)

                    gr.HighlightedText(value=[("",""), ("1. Select two of the basic camera poses; 2. Select Customized Mode 1 OR Customized Mode 2. 3. Visualized Camera to show the customized camera poses", "Normal")],
                                                        color_map={"Normal": "green", "Error": "red", "Clear clicks": "gray", "Add mask": "green", "Remove mask": "red"}, visible=False)
                    
                    with gr.Row():
                        combine1 = gr.Button(value=DIY_MODE[0], visible=False)
                        combine2 = gr.Button(value=DIY_MODE[1], visible=False)
                        combine3 = gr.Button(value=DIY_MODE[2], visible=False)
                    with gr.Row():
                        combine3_des = gr.Markdown(f"---\n#### Input your camera pose in the following textbox. \
                                                A total of 14 lines and each line contains 12 float number, indicated \
                                                the RT matrix in the shape of 1x12. \
                                                The example is RT matrix of ZOOM IN.", show_label=False, visible=False)

                    with gr.Row():
                        U = gr.Button(value="Pan Up", visible=False)
                        D = gr.Button(value="Pan Down", visible=False)
                        L = gr.Button(value="Pan Left", visible=False)
                        R = gr.Button(value="Pan Right", visible=False)
                    with gr.Row():
                        I = gr.Button(value="Zoom In", visible=False)
                        O = gr.Button(value="Zoom Out", visible=False)
                        ACW = gr.Button(value="ACW", visible=False)
                        CW = gr.Button(value="CW", visible=False)

                    with gr.Row():    
                        speed = gr.Slider(minimum=0, maximum=8, step=0.2, label="Motion Speed", value=1.0, visible=False)

                    with gr.Row():
                        Pose_1 = gr.Button(value="Pose_1", visible=False)
                        Pose_2 = gr.Button(value="Pose_2", visible=False)
                        Pose_3 = gr.Button(value="Pose_3", visible=False)
                        Pose_4 = gr.Button(value="Pose_4", visible=False)
                    with gr.Row():
                        Pose_5 = gr.Button(value="Pose_5", visible=False)
                        Pose_6 = gr.Button(value="Pose_6", visible=False)
                        Pose_7 = gr.Button(value="Pose_7", visible=False)
                        Pose_8 = gr.Button(value="Pose_8", visible=False)
                
                    with gr.Row():
                        camera_args = gr.Textbox(value="Camera Type", label="Camera Type", visible=False)
                    with gr.Row():
                        camera_vis= gr.Button(value="Visualize Camera and Proceed", visible=False)
                        camera_reset = gr.Button(value="Reset Camera", visible=False)
                with gr.Column():
                    vis_camera = gr.Plot(fig, label='Camera Poses', visible=False)

            
            # step4 - Generate videos
            with gr.Row():
                with gr.Column():
                    step3_prompt_generate = gr.Markdown("---\n## Step 3/3: Generate videos", show_label=False, visible=False)
                    generation_dec = gr.Markdown(f"\n 1. Set `FPS`.; \
                                                    \n 2. Set `n_samples`; \
                                                    \n 3. Set `seed`; \
                                                    \n 4. Click `Start generation !` to generate videos; ", visible=False)
                    # prompt = gr.Textbox(value="a dog sitting on grass", label="Prompt", interactive=True, visible=False)
                    prompt = gr.Slider(minimum=5, maximum=30, step=1, label="FPS", value=10, visible=False)
                    n_samples = gr.Number(value=1, precision=0, interactive=True, label="n_samples", visible=False)
                    seed = gr.Number(value=1234, precision=0, interactive=True, label="Seed", visible=False)
                    start = gr.Button(value="Start generation !", visible=False)
                with gr.Column():
                    gen_video = gr.Video(value=None, label="Generate Video", visible=False)
                    repeat_highlight=gr.HighlightedText(value=[("",""), (f"1. If the motion control is not obvious, try to increase the `Motion Speed`. \
                                                \n 2. If the generated videos are distored severely, try to descrease the `Motion Speed` \
                                                or increase `FPS`.", "Normal")],
                                color_map={"Normal": "green", "Error": "red", "Clear clicks": "gray", "Add mask": "green", "Remove mask": "red"}, visible=False)

        center_crop_botton.click(
            fn=process_input_image, 
            inputs=[input_image, center_crop_botton], 
            outputs=[
                process_image,
                step2_camera_motion, 
                step2_camera_motion_des,
                camera_mode, 
                camera_info,
                basic_camera_motion,
                basic_camera_motion_des,
                custom_camera_motion,
                custom_run_status,
                complex_camera_motion,
                complex_camera_motion_des,
                U, D, L, R, 
                I, O, ACW, CW, 
                combine1, combine2, combine3, combine3_des,
                speed, 
                Pose_1, Pose_2, Pose_3, Pose_4, 
                Pose_5, Pose_6, Pose_7, Pose_8,
                camera_args, 
                camera_reset, camera_vis,
                vis_camera,

                step3_prompt_generate, 
                generation_dec, 
                prompt, 
                n_samples, 
                seed, start, gen_video, repeat_highlight])

        keep_spatial_raition_botton.click(
            fn=process_input_image, 
            inputs=[input_image, keep_spatial_raition_botton], 
            outputs=[
                process_image,
                step2_camera_motion, 
                step2_camera_motion_des,
                camera_mode, 
                camera_info,
                basic_camera_motion,
                basic_camera_motion_des,
                custom_camera_motion,
                custom_run_status,
                complex_camera_motion,
                complex_camera_motion_des,
                U, D, L, R, 
                I, O, ACW, CW, 
                combine1, combine2, combine3, combine3_des,
                speed, 
                Pose_1, Pose_2, Pose_3, Pose_4, 
                Pose_5, Pose_6, Pose_7, Pose_8,
                camera_args, 
                camera_reset, camera_vis,
                vis_camera,

                step3_prompt_generate, 
                generation_dec, 
                prompt, 
                n_samples, 
                seed, start, gen_video, repeat_highlight])
        

        camera_info.click(
            fn=visualized_camera_poses,
            inputs=[camera_mode],
            outputs=[basic_camera_motion,
                     basic_camera_motion_des,
                     custom_camera_motion,
                     custom_run_status,
                     complex_camera_motion,
                     complex_camera_motion_des,
                     U, D, L, R, 
                     I, O, ACW, CW, 
                     combine1, combine2, combine3, combine3_des,
                     speed, 
                     Pose_1, Pose_2, Pose_3, Pose_4, 
                     Pose_5, Pose_6, Pose_7, Pose_8,
                     camera_args, 
                     camera_reset, camera_vis,
                     vis_camera,
                     step3_prompt_generate, generation_dec, prompt, n_samples, seed, start, gen_video, repeat_highlight],
        )


        U.click(fn=add_camera_motion, inputs=[U, camera_mode], outputs=camera_args)
        D.click(fn=add_camera_motion, inputs=[D, camera_mode], outputs=camera_args)
        L.click(fn=add_camera_motion, inputs=[L, camera_mode], outputs=camera_args)
        R.click(fn=add_camera_motion, inputs=[R, camera_mode], outputs=camera_args)
        I.click(fn=add_camera_motion, inputs=[I, camera_mode], outputs=camera_args)
        O.click(fn=add_camera_motion, inputs=[O, camera_mode], outputs=camera_args)
        ACW.click(fn=add_camera_motion, inputs=[ACW, camera_mode], outputs=camera_args)
        CW.click(fn=add_camera_motion, inputs=[CW, camera_mode], outputs=camera_args)
        speed.change(fn=change_camera_speed, inputs=speed, outputs=camera_args)
        camera_reset.click(fn=reset_camera, inputs=None, outputs=[camera_args])

        combine1.click(fn=change_camera_mode, 
                       inputs=[combine1, camera_mode], 
                       outputs=[camera_args,
                                U, D, L, R, 
                                I, O, ACW, CW, speed,
                                combine3_des])
        combine2.click(fn=change_camera_mode, 
                       inputs=[combine2, camera_mode], 
                       outputs=[camera_args,
                                U, D, L, R, 
                                I, O, ACW, CW, 
                                speed,
                                combine3_des])
        combine3.click(fn=input_raw_camera_pose, 
                       inputs=[combine3, camera_mode], 
                       outputs=[camera_args,
                                U, D, L, R, 
                                I, O, ACW, CW, 
                                speed, 
                                combine3_des])

        camera_vis.click(fn=fn_vis_camera, inputs=[camera_args], 
                         outputs=[vis_camera, 
                                  step3_prompt_generate, 
                                  generation_dec,
                                  prompt, 
                                  n_samples, 
                                  seed, 
                                  start, 
                                  gen_video,
                                  repeat_highlight])

        Pose_1.click(fn=add_complex_camera_motion, inputs=Pose_1, outputs=camera_args)
        Pose_2.click(fn=add_complex_camera_motion, inputs=Pose_2, outputs=camera_args)
        Pose_3.click(fn=add_complex_camera_motion, inputs=Pose_3, outputs=camera_args)
        Pose_4.click(fn=add_complex_camera_motion, inputs=Pose_4, outputs=camera_args)
        Pose_5.click(fn=add_complex_camera_motion, inputs=Pose_5, outputs=camera_args)
        Pose_6.click(fn=add_complex_camera_motion, inputs=Pose_6, outputs=camera_args)
        Pose_7.click(fn=add_complex_camera_motion, inputs=Pose_7, outputs=camera_args)
        Pose_8.click(fn=add_complex_camera_motion, inputs=Pose_8, outputs=camera_args)


        start.click(fn=model_run, 
                    inputs=[process_image, prompt, seed, n_samples, camera_args], 
                    outputs=gen_video)

        # set example
        gr.Markdown("## Examples")
        examples = glob(os.path.join(os.path.dirname(__file__), "./assets/demo/images", "*.png"))
        gr.Examples(
            examples=examples,
            inputs=[input_image],
            examples_per_page=15

        )

        gr.Markdown(article)

    # demo.launch(server_name='0.0.0.0', share=False, server_port=args['server_port'])
    # demo.queue(concurrency_count=1, max_size=10)
    # demo.launch()
    demo.queue(max_size=10).launch(**args)


if __name__ == "__main__":
    # Script entry point: parse the launch-related command-line options,
    # translate them into a keyword dictionary and hand that to main().
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--listen',
        type=str,
        # Listen on all interfaces when the SPACE_ID environment variable is
        # present; otherwise stay on the loopback address.
        default='0.0.0.0' if 'SPACE_ID' in os.environ else '127.0.0.1',
        help='IP to listen on for connections to Gradio',
    )
    parser.add_argument(
        '--username', type=str, default='', help='Username for authentication'
    )
    parser.add_argument(
        '--password', type=str, default='', help='Password for authentication'
    )
    parser.add_argument(
        '--server_port',
        type=int,
        default=0,
        help='Port to run the server listener on',
    )
    parser.add_argument(
        '--inbrowser', action='store_true', help='Open in browser'
    )
    parser.add_argument(
        '--share', action='store_true', help='Share the gradio UI'
    )

    args = parser.parse_args()

    # Fail closed on half-specified credentials. Previously, passing only one
    # of --username/--password silently started the server with no
    # authentication at all, which is easy to miss on a publicly bound host.
    if bool(args.username) != bool(args.password):
        parser.error('--username and --password must be provided together')

    # Only options the user actually set are forwarded; everything else is
    # left to the defaults of whatever main() launches with these kwargs.
    launch_kwargs = {'server_name': args.listen}

    if args.username and args.password:
        launch_kwargs['auth'] = (args.username, args.password)
    if args.server_port:
        # 0 is the "not specified" default, so the key is omitted in that case.
        launch_kwargs['server_port'] = args.server_port
    if args.inbrowser:
        launch_kwargs['inbrowser'] = args.inbrowser
    if args.share:
        launch_kwargs['share'] = args.share

    main(launch_kwargs)