diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..67c392142e7d1596ee7c4062563de05d3d458c58
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,39 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..4d9e940d059eeb5bf101253178c53eb2534c4e29
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.vscode/
+.DS_Store
+*.gif
+*.png
+*.jpg
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..bb5e074bb76e3e44ffe9399b76c44b2d69f67e11
--- /dev/null
+++ b/README.md
@@ -0,0 +1,13 @@
+---
+title: MotionCtrl SVD
+emoji: 📉
+colorFrom: yellow
+colorTo: indigo
+sdk: gradio
+sdk_version: 3.37.0
+app_file: app.py
+pinned: false
+license: apache-2.0
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..d47677afedbae03d4245b5f501db07ba1f11308b
--- /dev/null
+++ b/app.py
@@ -0,0 +1,861 @@
+import argparse
+import os
+import tempfile
+
+import gradio as gr
+import numpy as np
+import torch
+from glob import glob
+from torchvision.transforms import CenterCrop, Compose, Resize
+
+from gradio_utils.camera_utils import CAMERA_MOTION_MODE, process_camera, create_relative
+
+from gradio_utils.utils import vis_camera
+from gradio_utils.motionctrl_cmcm_gradio import build_model, motionctrl_sample
+
+# NOTE(review): presumably set so that a duplicated OpenMP runtime does not abort the process - confirm it is still needed.
+os.environ['KMP_DUPLICATE_LIB_OK']='True'
+# Value of the SPACE_ID environment variable; an empty string when it is not set.
+SPACE_ID = os.environ.get('SPACE_ID', '')
+
+
+#### Description ####
+title = r"""<h1 align="center">MotionCtrl: A Unified and Flexible Motion Controller for Video Generation</h1>"""
+subtitle = r"""<h2 align="center">Deployed on SVD Generation</h2>"""
+
+description = r"""
+<b>Official Gradio demo</b> for <a href='https://github.com/TencentARC/MotionCtrl' target='_blank'><b>MotionCtrl: A Unified and Flexible Motion Controller for Video Generation</b></a>.<br>
+🔥 MotionCtrl is capable of independently and flexibly controling the camera motion and object motion of a generated video, with only a unified model.<br>
+🤗 Try to control the motion of the generated videos yourself!<br>
+❗❗❗ Please note **ONLY** Camera Motion Control in the current version of **MotionCtrl** deployed on **SVD** is avaliable.<br>
+"""
+# Disabled block of preview GIFs (kept as a comment, not rendered):
+# <div>
+# <img src="https://raw.githubusercontent.com/TencentARC/MotionCtrl/main/assets/svd/00_ibzz5-dxv2h.gif", width="300">
+# <img src="https://raw.githubusercontent.com/TencentARC/MotionCtrl/main/assets/svd/01_5guvn-0x6v2.gif", width="300">
+# <img src="https://raw.githubusercontent.com/TencentARC/MotionCtrl/main/assets/svd/12_sn7bz-0hcaf.gif", width="300">
+# <img src="https://raw.githubusercontent.com/TencentARC/MotionCtrl/main/assets/svd/13_3lyco-4ru8j.gif", width="300">
+# </div>
+# Markdown with the repository link, the BibTeX citation and the contact address.
+# NOTE(review): where this text is rendered is not visible in this part of the file.
+article = r"""
+If MotionCtrl is helpful, please help to ⭐ the <a href='https://github.com/TencentARC/MotionCtrl' target='_blank'>Github Repo</a>. Thanks! 
+[![GitHub Stars](https://img.shields.io/github/stars/TencentARC%2FMotionCtrl
+)](https://github.com/TencentARC/MotionCtrl)
+
+---
+
+📝 **Citation**
+<br>
+If our work is useful for your research, please consider citing:
+```bibtex
+@inproceedings{wang2023motionctrl,
+  title={MotionCtrl: A Unified and Flexible Motion Controller for Video Generation},
+  author={Wang, Zhouxia and Yuan, Ziyang and Wang, Xintao and Chen, Tianshui and Xia, Menghan and Luo, Ping and Shan, Yin},
+  booktitle={arXiv preprint arXiv:2312.03641},
+  year={2023}
+}
+```
+
+📧 **Contact**
+<br>
+If you have any questions, please feel free to reach me out at <b>wzhoux@connect.hku.hk</b>.
+
+"""
+# Custom CSS rules for the page (container width, button colours, fixed
+# 300px height for elements with the `video` / `image` classes, centering helpers).
+# NOTE(review): the visible gr.Blocks() call in main() is made without css=css,
+# so confirm whether these rules are actually applied.
+css = """
+.gradio-container {width: 85% !important}
+.gr-monochrome-group {border-radius: 5px !important; border: revert-layer !important; border-width: 2px !important; color: black !important;}
+span.svelte-s1r2yt {font-size: 17px !important; font-weight: bold !important; color: #d30f2f !important;}
+button {border-radius: 8px !important;}
+.add_button {background-color: #4CAF50 !important;}
+.remove_button {background-color: #f44336 !important;}
+.clear_button {background-color: gray !important;}
+.mask_button_group {gap: 10px !important;}
+.video {height: 300px !important;}
+.image {height: 300px !important;}
+.video .wrap.svelte-lcpz3o {display: flex !important; align-items: center !important; justify-content: center !important;}
+.video .wrap.svelte-lcpz3o > :first-child {height: 100% !important;}
+.margin_center {width: 50% !important; margin: auto !important;}
+.jc_center {justify-content: center !important;}
+"""
+
+
+# Unit translation directions, annotated in world-to-camera (W2C) terms.
+# NOTE(review): T_base, radius, look_at, T_list, res_forsave and T_range are not
+# referenced in the visible part of this file - confirm whether they are still needed.
+T_base = [
+            [1.,0.,0.],             ## W2C positive x: camera towards the left  (left)
+            [-1.,0.,0.],            ## W2C negative x: camera towards the right (right)
+            [0., 1., 0.],           ## W2C positive y: camera towards the top   (up)
+            [0.,-1.,0.],            ## W2C negative y: camera towards the bottom (down)
+            [0.,0.,1.],             ## W2C positive z: zoom out (original note said "camera moves forward" on both z rows - TODO confirm)
+            [0.,0.,-1.],            ## W2C negative z: camera moves forward, zoom in
+        ]   
+radius = 1
+n = 16
+# step = 
+look_at = np.array([0, 0, 0.8]).reshape(3,1)
+# look_at = np.array([0, 0, 0.2]).reshape(3,1)
+
+T_list = []
+# Identity rotation shared by every placeholder pose below.
+base_R = np.array([[1., 0., 0.],
+                [0., 1., 0.],
+                [0., 0., 1.]])
+res = [] 
+res_forsave = []
+T_range = 1.8
+
+
+
+# Build 16 placeholder poses: identity rotation plus a translation along z that
+# grows linearly with the index (i/n * 2). Each pose is a 3x4 [R|T] matrix.
+for i in range(0, 16):
+    # theta = (1)*np.pi*i/n
+
+    R = base_R[:,:3]
+    T = np.array([0.,0.,1.]).reshape(3,1) * (i/n)*2
+    RT = np.concatenate([R,T], axis=1)
+    res.append(RT)
+    
+# Initial figure; passed to gr.Plot in main() as the starting plot value.
+fig = vis_camera(res)
+    
+# MODE = ["camera motion control", "object motion control", "camera + object motion control"]
+# NOTE(review): MODE is not referenced in the visible part of this file - confirm it is still used.
+MODE = ["control camera poses", "control object trajectory", "control both camera and object motion"]
+# Labels of the two resize buttons; process_input_image() compares against RESIZE_MODE[0].
+RESIZE_MODE = ['Center Crop To 576x1024', 'Keep original spatial ratio']
+# Labels of the three customized-mode buttons (combine1 / combine2 / combine3 in main()).
+DIY_MODE = ['Customized Mode 1: First A then B', 
+            'Customized Mode 2: Both A and B', 
+            'Customized Mode 3: RAW Camera Poses']
+
+## load default model
+num_frames = 14
+num_steps = 25
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device {device}")
+
+config = "configs/inference/config_motionctrl_cmcm.yaml"
+ckpt='checkpoints/motionctrl_svd.ckpt'
+if not os.path.exists(ckpt):
+    os.system(f'wget https://huggingface.co/TencentARC/MotionCtrl/resolve/main/motionctrl_svd.ckpt?download=true -P .')
+    os.system(f'mkdir checkpoints')
+    os.system(f'mv motionctrl_svd.ckpt?download=true {ckpt}')
+model = build_model(config, ckpt, device, num_frames, num_steps)
+width, height = 1024, 576 
+
+# NOTE(review): traj_list is not referenced in the visible part of this file.
+traj_list = [] 
+# Module-level camera selection, read and mutated by the callbacks below
+# (shared by every caller of this module).
+camera_dict = {
+                "motion":[],      # selected basic motions; add_camera_motion() keeps at most two
+                "mode": "Customized Mode 1: First A then B",  # "First A then B", "Both A and B", "Custom"
+                "speed": 1.0,     # set by change_camera_speed()
+                "complex": None   # preset complex pose; set by add_complex_camera_motion()
+                }   
+
+def fn_vis_camera(camera_args):
+    global camera_dict, num_frames, width, height
+    RT = process_camera(camera_dict, camera_args, num_frames=num_frames, width=width, height=height) # [t, 3, 4]
+
+    rescale_T = 1.0
+    rescale_T = max(rescale_T, np.max(np.abs(RT[:,:,-1])) / 1.9)
+
+    fig = vis_camera(create_relative(RT), rescale_T=rescale_T)
+
+    vis_step3_prompt_generate = True
+    vis_generation_dec = True
+    vis_prompt = True
+    vis_num_samples = True
+    vis_seed = True
+    vis_start = True
+    vis_gen_video = True
+    vis_repeat_highlight = True
+
+    return fig, \
+            gr.update(visible=vis_step3_prompt_generate), \
+            gr.update(visible=vis_generation_dec), \
+            gr.update(visible=vis_prompt), \
+            gr.update(visible=vis_num_samples), \
+            gr.update(visible=vis_seed), \
+            gr.update(visible=vis_start), \
+            gr.update(visible=vis_gen_video, value=None), \
+            gr.update(visible=vis_repeat_highlight)
+
+def display_camera_info(camera_dict, camera_mode=None):
+    if camera_dict['complex'] is not None:
+        res = f"complex : {camera_dict['complex']}. "
+        res += f"speed : {camera_dict['speed']}. "
+    else:
+        res = ""
+        res += f"motion : {[_ for _ in camera_dict['motion']]}. "
+        res += f"speed : {camera_dict['speed']}. "
+        if camera_mode == CAMERA_MOTION_MODE[2]:
+            res += f"mode : {camera_dict['mode']}. "
+    return res
+
+def add_camera_motion(camera_motion, camera_mode):  
+    global camera_dict
+    if camera_dict['complex'] is not None:
+        camera_dict['complex'] = None
+    if camera_mode == CAMERA_MOTION_MODE[2] and len(camera_dict['motion']) <2:
+        camera_dict['motion'].append(camera_motion)
+    else:
+        camera_dict['motion']=[camera_motion]
+    
+    return display_camera_info(camera_dict, camera_mode)
+
+def add_complex_camera_motion(camera_motion):
+    global camera_dict
+    camera_dict['complex']=camera_motion
+    return display_camera_info(camera_dict)
+
+def input_raw_camera_pose(combine_type, camera_mode):
+    global camera_dict
+    camera_dict['mode'] = combine_type
+
+    vis_U = False
+    vis_D = False
+    vis_L = False
+    vis_R = False
+    vis_I = False
+    vis_O = False
+    vis_ACW = False
+    vis_CW = False
+    vis_speed = True
+    vis_combine3_des = True
+
+    return gr.update(value='1 0 0 0 0 1 0 0 0 0 1 0\n1 0 0 0 0 1 0 0 0 0 1 -0.225\n1 0 0 0 0 1 0 0 0 0 1 -0.45\n1 0 0 0 0 1 0 0 0 0 1 -0.675\n1 0 0 0 0 1 0 0 0 0 1 -0.9\n1 0 0 0 0 1 0 0 0 0 1 -1.125\n1 0 0 0 0 1 0 0 0 0 1 -1.35\n1 0 0 0 0 1 0 0 0 0 1 -1.575\n1 0 0 0 0 1 0 0 0 0 1 -1.8\n1 0 0 0 0 1 0 0 0 0 1 -2.025\n1 0 0 0 0 1 0 0 0 0 1 -2.25\n1 0 0 0 0 1 0 0 0 0 1 -2.475\n1 0 0 0 0 1 0 0 0 0 1 -2.7\n1 0 0 0 0 1 0 0 0 0 1 -2.925\n', max_lines=16, interactive=True), \
+            gr.update(visible=vis_U), \
+            gr.update(visible=vis_D), \
+            gr.update(visible=vis_L),\
+            gr.update(visible=vis_R), \
+            gr.update(visible=vis_I), \
+            gr.update(visible=vis_O), \
+            gr.update(visible=vis_ACW), \
+            gr.update(visible=vis_CW), \
+            gr.update(visible=vis_speed), \
+            gr.update(visible=vis_combine3_des)
+
+def change_camera_mode(combine_type, camera_mode):
+    global camera_dict
+    camera_dict['mode'] = combine_type
+
+    vis_U = True
+    vis_D = True
+    vis_L = True
+    vis_R = True
+    vis_I = True
+    vis_O = True
+    vis_ACW = True
+    vis_CW = True
+    vis_speed = True
+    vis_combine3_des = False
+
+    return display_camera_info(camera_dict, camera_mode), \
+            gr.update(visible=vis_U), \
+            gr.update(visible=vis_D), \
+            gr.update(visible=vis_L),\
+            gr.update(visible=vis_R), \
+            gr.update(visible=vis_I), \
+            gr.update(visible=vis_O), \
+            gr.update(visible=vis_ACW), \
+            gr.update(visible=vis_CW), \
+            gr.update(visible=vis_speed), \
+            gr.update(visible=vis_combine3_des)
+
+def change_camera_speed(camera_speed):
+    global camera_dict
+    camera_dict['speed'] = camera_speed
+    return display_camera_info(camera_dict)
+
+def reset_camera():
+    global camera_dict
+    camera_dict = {
+                    "motion":[],
+                    "mode": "Customized Mode 1: First A then B",
+                    "speed": 1.0,
+                    "complex": None
+                    }   
+    return display_camera_info(camera_dict)
+
+
+def visualized_camera_poses(step2_camera_motion):
+    reset_camera()
+
+    # generate video
+    vis_step3_prompt_generate = False
+    vis_generation_dec = False
+    vis_prompt = False
+    vis_num_samples = False
+    vis_seed = False
+    vis_start = False
+    vis_gen_video = False
+    vis_repeat_highlight = False
+
+    if step2_camera_motion == CAMERA_MOTION_MODE[0]:
+        vis_basic_camera_motion = True
+        vis_basic_camera_motion_des = True
+        vis_custom_camera_motion = False
+        vis_custom_run_status = False
+        vis_complex_camera_motion = False
+        vis_complex_camera_motion_des = False
+        vis_U = True
+        vis_D = True
+        vis_L = True
+        vis_R = True
+        vis_I = True
+        vis_O = True
+        vis_ACW = True
+        vis_CW = True
+        vis_combine1 = False
+        vis_combine2 = False
+        vis_combine3 = False
+        vis_combine3_des = False
+        vis_speed = True
+
+        vis_Pose_1, vis_Pose_2, vis_Pose_3, vis_Pose_4 = False, False, False, False
+        vis_Pose_5, vis_Pose_6, vis_Pose_7, vis_Pose_8 = False, False, False, False
+
+    elif step2_camera_motion == CAMERA_MOTION_MODE[1]:
+        vis_basic_camera_motion = False
+        vis_basic_camera_motion_des = False
+        vis_custom_camera_motion = False
+        vis_custom_run_status = False
+        vis_complex_camera_motion = True
+        vis_complex_camera_motion_des = True
+        vis_U = False
+        vis_D = False
+        vis_L = False
+        vis_R = False
+        vis_I = False
+        vis_O = False
+        vis_ACW = False
+        vis_CW = False
+        vis_combine1 = False
+        vis_combine2 = False
+        vis_combine3 = False
+        vis_combine3_des = False
+        vis_speed = True
+
+        vis_Pose_1, vis_Pose_2, vis_Pose_3, vis_Pose_4 = True, True, True, True
+        vis_Pose_5, vis_Pose_6, vis_Pose_7, vis_Pose_8 = True, True, True, True
+
+    else: # step2_camera_motion = CAMERA_MOTION_MODE[2]:
+        vis_basic_camera_motion = False
+        vis_basic_camera_motion_des = False
+        vis_custom_camera_motion = True
+        vis_custom_run_status = True
+        vis_complex_camera_motion = False
+        vis_complex_camera_motion_des = False
+        vis_U = False
+        vis_D = False
+        vis_L = False
+        vis_R = False
+        vis_I = False
+        vis_O = False
+        vis_ACW = False
+        vis_CW = False
+        vis_combine1 = True
+        vis_combine2 = True
+        vis_combine3 = True
+        vis_combine3_des = False
+        vis_speed = False
+
+        vis_Pose_1, vis_Pose_2, vis_Pose_3, vis_Pose_4 = False, False, False, False
+        vis_Pose_5, vis_Pose_6, vis_Pose_7, vis_Pose_8 = False, False, False, False
+
+    vis_camera_args = True
+    vis_camera_reset = True
+    vis_camera_vis = True
+    vis_vis_camera = True
+
+    return gr.update(visible=vis_basic_camera_motion), \
+            gr.update(visible=vis_basic_camera_motion_des), \
+            gr.update(visible=vis_custom_camera_motion), \
+            gr.update(visible=vis_custom_run_status), \
+            gr.update(visible=vis_complex_camera_motion), \
+            gr.update(visible=vis_complex_camera_motion_des), \
+            gr.update(visible=vis_U), gr.update(visible=vis_D), gr.update(visible=vis_L), gr.update(visible=vis_R), \
+            gr.update(visible=vis_I), gr.update(visible=vis_O), gr.update(visible=vis_ACW), gr.update(visible=vis_CW), \
+            gr.update(visible=vis_combine1), gr.update(visible=vis_combine2), gr.update(visible=vis_combine3), \
+            gr.update(visible=vis_combine3_des), \
+            gr.update(visible=vis_speed), \
+            gr.update(visible=vis_Pose_1), gr.update(visible=vis_Pose_2), gr.update(visible=vis_Pose_3), gr.update(visible=vis_Pose_4), \
+            gr.update(visible=vis_Pose_5), gr.update(visible=vis_Pose_6), gr.update(visible=vis_Pose_7), gr.update(visible=vis_Pose_8), \
+            gr.update(visible=vis_camera_args, value=None), \
+            gr.update(visible=vis_camera_reset), gr.update(visible=vis_camera_vis), \
+            gr.update(visible=vis_vis_camera, value=None), \
+            gr.update(visible=vis_step3_prompt_generate), \
+            gr.update(visible=vis_generation_dec), \
+            gr.update(visible=vis_prompt), \
+            gr.update(visible=vis_num_samples), \
+            gr.update(visible=vis_seed), \
+            gr.update(visible=vis_start), \
+            gr.update(visible=vis_gen_video), \
+            gr.update(visible=vis_repeat_highlight)
+
+
+def process_input_image(input_image, resize_mode):
+    global width, height
+    if resize_mode == RESIZE_MODE[0]:
+        height = 576
+        width = 1024
+        w, h = input_image.size
+        h_ratio = h / height
+        w_ratio = w / width
+
+        if h_ratio > w_ratio:
+            h = int(h / w_ratio)
+            if h < height:
+                h = height
+            input_image = Resize((h, width))(input_image)
+            
+        else:
+            w = int(w / h_ratio)
+            if w < width:
+                w = width
+            input_image = Resize((height, w))(input_image)
+
+        transformer = Compose([
+            # Resize(width),
+            CenterCrop((height, width)),
+        ])
+
+        input_image = transformer(input_image)
+    else:
+        w, h = input_image.size
+        if h > w:
+            height = 576
+            width = int(w * height / h)
+        else:
+            width = 1024
+            height = int(h * width / w)
+
+        input_image = Resize((height, width))(input_image)
+        # print(f'input_image size: {input_image.size}')
+
+    vis_step2_camera_motion = True
+    vis_step2_camera_motion_des = True
+    vis_camera_mode = True
+    vis_camera_info = True
+
+    ####
+    # camera motion control
+    vis_basic_camera_motion = False
+    vis_basic_camera_motion_des = False
+    vis_custom_camera_motion = False
+    vis_custom_run_status = False
+    vis_complex_camera_motion = False
+    vis_complex_camera_motion_des = False
+    vis_U = False
+    vis_D = False
+    vis_L = False
+    vis_R = False
+    vis_I = False
+    vis_O = False
+    vis_ACW = False
+    vis_CW = False
+    vis_combine1 = False
+    vis_combine2 = False
+    vis_combine3 = False
+    vis_combine3_des = False
+    vis_speed = False
+
+    vis_Pose_1, vis_Pose_2, vis_Pose_3, vis_Pose_4 = False, False, False, False
+    vis_Pose_5, vis_Pose_6, vis_Pose_7, vis_Pose_8 = False, False, False, False
+
+    vis_camera_args = False
+    vis_camera_reset = False
+    vis_camera_vis = False
+    vis_vis_camera = False
+
+    # generate video
+    vis_step3_prompt_generate = False
+    vis_generation_dec = False
+    vis_prompt = False
+    vis_num_samples = False
+    vis_seed = False
+    vis_start = False
+    vis_gen_video = False
+    vis_repeat_highlight = False
+    
+    return gr.update(visible=True, value=input_image, height=height, width=width), \
+            gr.update(visible=vis_step2_camera_motion), \
+            gr.update(visible=vis_step2_camera_motion_des), \
+            gr.update(visible=vis_camera_mode), \
+            gr.update(visible=vis_camera_info), \
+            gr.update(visible=vis_basic_camera_motion), \
+            gr.update(visible=vis_basic_camera_motion_des), \
+            gr.update(visible=vis_custom_camera_motion), \
+            gr.update(visible=vis_custom_run_status), \
+            gr.update(visible=vis_complex_camera_motion), \
+            gr.update(visible=vis_complex_camera_motion_des), \
+            gr.update(visible=vis_U), gr.update(visible=vis_D), gr.update(visible=vis_L), gr.update(visible=vis_R), \
+            gr.update(visible=vis_I), gr.update(visible=vis_O), gr.update(visible=vis_ACW), gr.update(visible=vis_CW), \
+            gr.update(visible=vis_combine1), gr.update(visible=vis_combine2), gr.update(visible=vis_combine3), \
+            gr.update(visible=vis_combine3_des), \
+            gr.update(visible=vis_speed), \
+            gr.update(visible=vis_Pose_1), gr.update(visible=vis_Pose_2), gr.update(visible=vis_Pose_3), gr.update(visible=vis_Pose_4), \
+            gr.update(visible=vis_Pose_5), gr.update(visible=vis_Pose_6), gr.update(visible=vis_Pose_7), gr.update(visible=vis_Pose_8), \
+            gr.update(visible=vis_camera_args, value=None), \
+            gr.update(visible=vis_camera_reset), gr.update(visible=vis_camera_vis), \
+            gr.update(visible=vis_vis_camera, value=None), \
+            gr.update(visible=vis_step3_prompt_generate), \
+            gr.update(visible=vis_generation_dec), \
+            gr.update(visible=vis_prompt), \
+            gr.update(visible=vis_num_samples), \
+            gr.update(visible=vis_seed), \
+            gr.update(visible=vis_start), \
+            gr.update(visible=vis_gen_video), \
+            gr.update(visible=vis_repeat_highlight)
+
+def model_run(input_image, fps_id, seed, n_samples, camera_args):
+    global model, device, camera_dict, num_frames, num_steps, width, height
+    RT = process_camera(camera_dict, camera_args, num_frames=num_frames, width=width, height=height).reshape(-1,12)
+
+    video_path = motionctrl_sample(
+        model=model,
+        image=input_image,
+        RT=RT,
+        num_frames=num_frames,
+        fps_id=fps_id,
+        decoding_t=1,
+        seed=seed,
+        sample_num=n_samples,
+        device=device
+    )
+
+    return video_path
+
+def main(args):
+    demo = gr.Blocks()
+    with demo:
+
+        gr.Markdown(title)
+        gr.Markdown(subtitle)
+        gr.Markdown(description)
+
+        with gr.Column():
+            
+            # step 0: Some useful tricks
+            gr.Markdown("## Step 0/3: Some Useful Tricks", show_label=False)
+            gr.HighlightedText(value=[("",""), (f"1. If the motion control is not obvious, try to increase the `Motion Speed`. \
+                                                \n 2. If the generated videos are distored severely, try to descrease the `Motion Speed` \
+                                                or increase `FPS`.", "Normal")],
+                                color_map={"Normal": "green", "Error": "red", "Clear clicks": "gray", "Add mask": "green", "Remove mask": "red"}, visible=True)
+
+            # step 2: input an image
+            step2_title = gr.Markdown("---\n## Step 1/3: Input an Image", show_label=False, visible=True)
+            step2_dec = gr.Markdown(f"\n 1. Upload an Image by `Drag` or Click `Upload Image`; \
+                                    \n 2. Click `{RESIZE_MODE[0]}` or `{RESIZE_MODE[1]}` to select the image resize mode. \
+                                    You will get a processed image and go into the next step. \
+                                    \n - `{RESIZE_MODE[0]}`: Our MotionCtrl is train on image with spatial size 576x1024. Choose `{RESIZE_MODE[0]}` can get better generated video. \
+                                    \n - `{RESIZE_MODE[1]}`: Choose `{RESIZE_MODE[1]}` if you want to generate video with the same spatial ratio as the input image.", 
+                                    show_label=False, visible=True)
+                                    
+            with gr.Row(equal_height=True):
+                with gr.Column(scale=2):
+                    input_image = gr.Image(type="pil", interactive=True, elem_id="input_image", elem_classes='image', visible=True)
+                    # process_input_image_button = gr.Button(value="Process Input Image", visible=False)
+                    with gr.Row():
+                        center_crop_botton = gr.Button(value=RESIZE_MODE[0], visible=True)
+                        keep_spatial_raition_botton = gr.Button(value=RESIZE_MODE[1], visible=True)
+                with gr.Column(scale=2):
+                    process_image = gr.Image(type="pil", interactive=False, elem_id="process_image", elem_classes='image', visible=False)
+            # step2_proceed_button = gr.Button(value="Proceed", visible=False)
+
+            
+            # step3 - camera motion control
+            step2_camera_motion = gr.Markdown("---\n## Step 2/3: Select the camera poses", show_label=False, visible=False)
+            step2_camera_motion_des = gr.Markdown(f"\n - {CAMERA_MOTION_MODE[0]}: Including 8 basic camera poses, such as pan up, pan down, zoom in, and zoom out. \
+                                                    \n - {CAMERA_MOTION_MODE[1]}: Complex camera poses extracted from the real videos. \
+                                                    \n - {CAMERA_MOTION_MODE[2]}: You can customize complex camera poses yourself by combining or fusing two of the eight basic camera poses or input RAW RT matrix. \
+                                                    \n - Click `Proceed` to go into next step", 
+                                                  show_label=False, visible=False)
+            camera_mode = gr.Radio(choices=CAMERA_MOTION_MODE, value=CAMERA_MOTION_MODE[0], label="Camera Motion Control Mode", interactive=True, visible=False)
+            camera_info = gr.Button(value="Proceed", visible=False)
+
+            with gr.Row():
+                with gr.Column():
+                    # step3.1 - camera motion control - basic
+                    basic_camera_motion = gr.Markdown("---\n### Basic Camera Poses", show_label=False, visible=False)
+                    basic_camera_motion_des = gr.Markdown(f"\n 1. Click one of the basic camera poses, such as `Pan Up`; \
+                                                            \n 2. Slide the `Motion speed` to get a speed value. The large the value, the fast the camera motion; \
+                                                            \n 3. Click `Visualize Camera and Proceed` to visualize the camera poses and go proceed; \
+                                                            \n 4. Click `Reset Camera` to reset the camera poses (If needed). ",
+                                                        show_label=False, visible=False)
+                    
+                    
+                    # step3.2 - camera motion control - provided complex
+                    complex_camera_motion = gr.Markdown("---\n### Provided Complex Camera Poses", show_label=False, visible=False)
+                    complex_camera_motion_des = gr.Markdown(f"\n 1. Click one of the complex camera poses, such as `Pose_1`; \
+                                                            \n 2. Click `Visualize Camera and Proceed` to visualize the camera poses and go proceed; \
+                                                            \n 3. Click `Reset Camera` to reset the camera poses (If needed). ",
+                                                        show_label=False, visible=False)
+
+                    # step3.3 - camera motion control - custom
+                    custom_camera_motion = gr.Markdown(f"---\n### {CAMERA_MOTION_MODE[2]}", show_label=False, visible=False)
+                    custom_run_status = gr.Markdown(f"\n 1. Click `{DIY_MODE[0]}`, `{DIY_MODE[1]}`, or `{DIY_MODE[2]}` \
+                                                    \n - `Customized Mode 1: First A then B`: For example, click `Pan Up` and `Pan Left`, the camera will first `Pan Up` and then `Pan Left`; \
+                                                    \n - `Customized Mode 2: Both A and B`: For example, click `Pan Up` and `Pan Left`, the camera will move towards the upper left corner; \
+                                                    \n - `{DIY_MODE[2]}`: Input the RAW RT matrix yourselves. \
+                                                    \n 2. Slide the `Motion speed` to get a speed value. The large the value, the fast the camera motion; \
+                                                    \n 3. Click `Visualize Camera and Proceed` to visualize the camera poses and go proceed; \
+                                                    \n 4. Click `Reset Camera` to reset the camera poses (If needed). ",
+                                                        show_label=False, visible=False)
+
+                    gr.HighlightedText(value=[("",""), ("1. Select two of the basic camera poses; 2. Select Customized Mode 1 OR Customized Mode 2. 3. Visualized Camera to show the customized camera poses", "Normal")],
+                                                        color_map={"Normal": "green", "Error": "red", "Clear clicks": "gray", "Add mask": "green", "Remove mask": "red"}, visible=False)
+                    
+                    with gr.Row():
+                        combine1 = gr.Button(value=DIY_MODE[0], visible=False)
+                        combine2 = gr.Button(value=DIY_MODE[1], visible=False)
+                        combine3 = gr.Button(value=DIY_MODE[2], visible=False)
+                    with gr.Row():
+                        combine3_des = gr.Markdown(f"---\n#### Input your camera pose in the following textbox. \
+                                                A total of 14 lines and each line contains 12 float number, indicated \
+                                                the RT matrix in the shape of 1x12. \
+                                                The example is RT matrix of ZOOM IN.", show_label=False, visible=False)
+
+                    with gr.Row():
+                        U = gr.Button(value="Pan Up", visible=False)
+                        D = gr.Button(value="Pan Down", visible=False)
+                        L = gr.Button(value="Pan Left", visible=False)
+                        R = gr.Button(value="Pan Right", visible=False)
+                    with gr.Row():
+                        I = gr.Button(value="Zoom In", visible=False)
+                        O = gr.Button(value="Zoom Out", visible=False)
+                        ACW = gr.Button(value="ACW", visible=False)
+                        CW = gr.Button(value="CW", visible=False)
+
+                    with gr.Row():    
+                        speed = gr.Slider(minimum=0, maximum=8, step=0.2, label="Motion Speed", value=1.0, visible=False)
+
+                    with gr.Row():
+                        Pose_1 = gr.Button(value="Pose_1", visible=False)
+                        Pose_2 = gr.Button(value="Pose_2", visible=False)
+                        Pose_3 = gr.Button(value="Pose_3", visible=False)
+                        Pose_4 = gr.Button(value="Pose_4", visible=False)
+                    with gr.Row():
+                        Pose_5 = gr.Button(value="Pose_5", visible=False)
+                        Pose_6 = gr.Button(value="Pose_6", visible=False)
+                        Pose_7 = gr.Button(value="Pose_7", visible=False)
+                        Pose_8 = gr.Button(value="Pose_8", visible=False)
+                
+                    with gr.Row():
+                        camera_args = gr.Textbox(value="Camera Type", label="Camera Type", visible=False)
+                    with gr.Row():
+                        camera_vis= gr.Button(value="Visualize Camera and Proceed", visible=False)
+                        camera_reset = gr.Button(value="Reset Camera", visible=False)
+                with gr.Column():
+                    vis_camera = gr.Plot(fig, label='Camera Poses', visible=False)
+
+            
+            # step4 - Generate videos
+            with gr.Row():
+                with gr.Column():
+                    step3_prompt_generate = gr.Markdown("---\n## Step 3/3: Generate videos", show_label=False, visible=False)
+                    generation_dec = gr.Markdown(f"\n 1. Set `FPS`.; \
+                                                    \n 2. Set `n_samples`; \
+                                                    \n 3. Set `seed`; \
+                                                    \n 4. Click `Start generation !` to generate videos; ", visible=False)
+                    # prompt = gr.Textbox(value="a dog sitting on grass", label="Prompt", interactive=True, visible=False)
+                    prompt = gr.Slider(minimum=5, maximum=30, step=1, label="FPS", value=10, visible=False)
+                    n_samples = gr.Number(value=2, precision=0, interactive=True, label="n_samples", visible=False)
+                    seed = gr.Number(value=1234, precision=0, interactive=True, label="Seed", visible=False)
+                    start = gr.Button(value="Start generation !", visible=False)
+                with gr.Column():
+                    gen_video = gr.Video(value=None, label="Generate Video", visible=False)
+                    repeat_highlight=gr.HighlightedText(value=[("",""), (f"1. If the motion control is not obvious, try to increase the `Motion Speed`. \
+                                                \n 2. If the generated videos are distored severely, try to descrease the `Motion Speed` \
+                                                or increase `FPS`.", "Normal")],
+                                color_map={"Normal": "green", "Error": "red", "Clear clicks": "gray", "Add mask": "green", "Remove mask": "red"}, visible=False)
+
+        center_crop_botton.click(
+            fn=process_input_image, 
+            inputs=[input_image, center_crop_botton], 
+            outputs=[
+                process_image,
+                step2_camera_motion, 
+                step2_camera_motion_des,
+                camera_mode, 
+                camera_info,
+                basic_camera_motion,
+                basic_camera_motion_des,
+                custom_camera_motion,
+                custom_run_status,
+                complex_camera_motion,
+                complex_camera_motion_des,
+                U, D, L, R, 
+                I, O, ACW, CW, 
+                combine1, combine2, combine3, combine3_des,
+                speed, 
+                Pose_1, Pose_2, Pose_3, Pose_4, 
+                Pose_5, Pose_6, Pose_7, Pose_8,
+                camera_args, 
+                camera_reset, camera_vis,
+                vis_camera,
+
+                step3_prompt_generate, 
+                generation_dec, 
+                prompt, 
+                n_samples, 
+                seed, start, gen_video, repeat_highlight])
+
+        keep_spatial_raition_botton.click(
+            fn=process_input_image, 
+            inputs=[input_image, keep_spatial_raition_botton], 
+            outputs=[
+                process_image,
+                step2_camera_motion, 
+                step2_camera_motion_des,
+                camera_mode, 
+                camera_info,
+                basic_camera_motion,
+                basic_camera_motion_des,
+                custom_camera_motion,
+                custom_run_status,
+                complex_camera_motion,
+                complex_camera_motion_des,
+                U, D, L, R, 
+                I, O, ACW, CW, 
+                combine1, combine2, combine3, combine3_des,
+                speed, 
+                Pose_1, Pose_2, Pose_3, Pose_4, 
+                Pose_5, Pose_6, Pose_7, Pose_8,
+                camera_args, 
+                camera_reset, camera_vis,
+                vis_camera,
+
+                step3_prompt_generate, 
+                generation_dec, 
+                prompt, 
+                n_samples, 
+                seed, start, gen_video, repeat_highlight])
+        
+
+        camera_info.click(
+            fn=visualized_camera_poses,
+            inputs=[camera_mode],
+            outputs=[basic_camera_motion,
+                     basic_camera_motion_des,
+                     custom_camera_motion,
+                     custom_run_status,
+                     complex_camera_motion,
+                     complex_camera_motion_des,
+                     U, D, L, R, 
+                     I, O, ACW, CW, 
+                     combine1, combine2, combine3, combine3_des,
+                     speed, 
+                     Pose_1, Pose_2, Pose_3, Pose_4, 
+                     Pose_5, Pose_6, Pose_7, Pose_8,
+                     camera_args, 
+                     camera_reset, camera_vis,
+                     vis_camera,
+                     step3_prompt_generate, generation_dec, prompt, n_samples, seed, start, gen_video, repeat_highlight],
+        )
+
+
+        U.click(fn=add_camera_motion, inputs=[U, camera_mode], outputs=camera_args)
+        D.click(fn=add_camera_motion, inputs=[D, camera_mode], outputs=camera_args)
+        L.click(fn=add_camera_motion, inputs=[L, camera_mode], outputs=camera_args)
+        R.click(fn=add_camera_motion, inputs=[R, camera_mode], outputs=camera_args)
+        I.click(fn=add_camera_motion, inputs=[I, camera_mode], outputs=camera_args)
+        O.click(fn=add_camera_motion, inputs=[O, camera_mode], outputs=camera_args)
+        ACW.click(fn=add_camera_motion, inputs=[ACW, camera_mode], outputs=camera_args)
+        CW.click(fn=add_camera_motion, inputs=[CW, camera_mode], outputs=camera_args)
+        speed.change(fn=change_camera_speed, inputs=speed, outputs=camera_args)
+        camera_reset.click(fn=reset_camera, inputs=None, outputs=[camera_args])
+
+        combine1.click(fn=change_camera_mode, 
+                       inputs=[combine1, camera_mode], 
+                       outputs=[camera_args,
+                                U, D, L, R, 
+                                I, O, ACW, CW, speed,
+                                combine3_des])
+        combine2.click(fn=change_camera_mode, 
+                       inputs=[combine2, camera_mode], 
+                       outputs=[camera_args,
+                                U, D, L, R, 
+                                I, O, ACW, CW, 
+                                speed,
+                                combine3_des])
+        combine3.click(fn=input_raw_camera_pose, 
+                       inputs=[combine3, camera_mode], 
+                       outputs=[camera_args,
+                                U, D, L, R, 
+                                I, O, ACW, CW, 
+                                speed, 
+                                combine3_des])
+
+        camera_vis.click(fn=fn_vis_camera, inputs=[camera_args], 
+                         outputs=[vis_camera, 
+                                  step3_prompt_generate, 
+                                  generation_dec,
+                                  prompt, 
+                                  n_samples, 
+                                  seed, 
+                                  start, 
+                                  gen_video,
+                                  repeat_highlight])
+
+        Pose_1.click(fn=add_complex_camera_motion, inputs=Pose_1, outputs=camera_args)
+        Pose_2.click(fn=add_complex_camera_motion, inputs=Pose_2, outputs=camera_args)
+        Pose_3.click(fn=add_complex_camera_motion, inputs=Pose_3, outputs=camera_args)
+        Pose_4.click(fn=add_complex_camera_motion, inputs=Pose_4, outputs=camera_args)
+        Pose_5.click(fn=add_complex_camera_motion, inputs=Pose_5, outputs=camera_args)
+        Pose_6.click(fn=add_complex_camera_motion, inputs=Pose_6, outputs=camera_args)
+        Pose_7.click(fn=add_complex_camera_motion, inputs=Pose_7, outputs=camera_args)
+        Pose_8.click(fn=add_complex_camera_motion, inputs=Pose_8, outputs=camera_args)
+
+
+        start.click(fn=model_run, 
+                    inputs=[process_image, prompt, seed, n_samples, camera_args], 
+                    outputs=gen_video)
+
+        # set example
+        gr.Markdown("## Examples")
+        examples = glob(os.path.join(os.path.dirname(__file__), "./assets/demo/images", "*.png"))
+        gr.Examples(
+            examples=examples,
+            inputs=[input_image],
+        )
+
+        gr.Markdown(article)
+
+    # demo.launch(server_name='0.0.0.0', share=False, server_port=args['server_port'])
+    # demo.queue(concurrency_count=1, max_size=10)
+    # demo.launch()
+    demo.queue(max_size=10).launch(**args)
+
+
+if __name__=="__main__":
+    parser = argparse.ArgumentParser()
+    # parser.add_argument("--port", type=int, default=12345)
+
+    parser.add_argument(
+        '--listen',
+        type=str,
+        default='0.0.0.0' if 'SPACE_ID' in os.environ else '127.0.0.1',
+        help='IP to listen on for connections to Gradio',
+    )
+    parser.add_argument(
+        '--username', type=str, default='', help='Username for authentication'
+    )
+    parser.add_argument(
+        '--password', type=str, default='', help='Password for authentication'
+    )
+    parser.add_argument(
+        '--server_port',
+        type=int,
+        default=0,
+        help='Port to run the server listener on',
+    )
+    parser.add_argument(
+        '--inbrowser', action='store_true', help='Open in browser'
+    )
+    parser.add_argument(
+        '--share', action='store_true', help='Share the gradio UI'
+    )
+
+    args = parser.parse_args()
+
+    launch_kwargs = {}
+    launch_kwargs['server_name'] = args.listen
+
+    if args.username and args.password:
+        launch_kwargs['auth'] = (args.username, args.password)
+    if args.server_port:
+        launch_kwargs['server_port'] = args.server_port
+    if args.inbrowser:
+        launch_kwargs['inbrowser'] = args.inbrowser
+    if args.share:
+        launch_kwargs['share'] = args.share
+
+    main(launch_kwargs)
diff --git a/configs/inference/config_motionctrl_cmcm.yaml b/configs/inference/config_motionctrl_cmcm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ae993fdf4ad21e2b36006def3030fce1e9901435
--- /dev/null
+++ b/configs/inference/config_motionctrl_cmcm.yaml
@@ -0,0 +1,169 @@
+model:  # settings for the model class named in `target`; the nested *_config entries each name their own `target` class
+  base_learning_rate: 3.0e-5  # NOTE(review): training-style hyperparameter; presumably unused at inference - confirm
+  target: sgm.motionctrl.camera_motion_control.CameraMotionControl
+  params:
+    ckpt_path: /group/30098/zhouxiawang/env/share/weights/svd/stable-video-diffusion-img2vid/svd.safetensors  # NOTE(review): machine-specific absolute path; presumably must point at a local svd.safetensors - confirm how it interacts with --ckpt in motionctrl_run.sh
+    scale_factor: 0.18215
+    input_key: video
+    no_cond_log: true
+    en_and_decode_n_samples_a_time: 1  # same value as `--decoding_t 1` in motionctrl_run.sh; NOTE(review): confirm which one takes effect
+    use_ema: false
+    disable_first_stage_autocast: true
+
+    denoiser_config:
+      target: sgm.modules.diffusionmodules.denoiser.Denoiser
+      params:
+        scaling_config:
+          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
+
+    network_config:
+      target: sgm.modules.diffusionmodules.video_model.VideoUNet
+      params:
+        num_frames: 14  # same frame count as `--frames 14` in motionctrl_run.sh and guider_config.num_frames in this file
+        adm_in_channels: 768  # 768 = 3 x 256, matching the three ConcatTimestepEmbedderND outdims in conditioner_config - presumably their concatenation; confirm
+        num_classes: sequential
+        use_checkpoint: false
+        in_channels: 8  # NOTE(review): presumably 4 latent channels + 4 conditioning-frame latent channels (z_channels is 4) - confirm
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [4, 2, 1]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4, 4]
+        num_head_channels: 64
+        use_linear_in_transformer: true
+        transformer_depth: 1
+        context_dim: 1024
+        spatial_transformer_attn_type: softmax-xformers  # NOTE(review): presumably requires the xformers package to be installed - confirm
+        extra_ff_mix_layer: true
+        use_spatial_context: true
+        merge_strategy: learned_with_images
+        video_kernel_size: [3, 1, 1]
+
+    conditioner_config:
+      target: sgm.modules.GeneralConditioner
+      params:
+        emb_models:  # five embedders, one per input_key: cond_frames_without_noise, fps_id, motion_bucket_id, cond_frames, cond_aug
+        - is_trainable: false
+          input_key: cond_frames_without_noise
+          ucg_rate: 0.1  # NOTE(review): presumably a training-time conditioning dropout rate - confirm it is ignored at inference
+          target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
+          params:
+            n_cond_frames: 1
+            n_copies: 1
+            open_clip_embedding_config:
+              target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
+              params:
+                freeze: true  # the `version` override below is commented out, so the embedder's own default is used
+                # version: "/apdcephfs_cq3/share_1290939/vg_zoo/dependencies/OpenCLIP-ViT-H-14-laion2B-s32B-b79K/blobs/9a78ef8e8c73fd0df621682e7a8e8eb36c6916cb3c16b291a082ecd52ab79cc4"
+
+        - input_key: fps_id
+          is_trainable: false
+          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+          params:
+            outdim: 256
+
+        - input_key: motion_bucket_id  # NOTE(review): presumably fed from `--motion 127` in motionctrl_run.sh - confirm
+          is_trainable: false
+          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+          params:
+            outdim: 256
+
+        - input_key: cond_frames
+          is_trainable: false
+          ucg_rate: 0.1
+          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
+          params:
+            disable_encoder_autocast: true
+            n_cond_frames: 1
+            n_copies: 1
+            is_ae: true
+            encoder_config:
+              target: sgm.models.autoencoder.AutoencoderKLModeOnly
+              params:
+                embed_dim: 4
+                monitor: val/rec_loss
+                ddconfig:  # same values as first_stage_config's encoder params, except attn_type (vanilla-xformers here, vanilla there)
+                  attn_type: vanilla-xformers
+                  double_z: true
+                  z_channels: 4
+                  resolution: 256
+                  in_channels: 3
+                  out_ch: 3
+                  ch: 128
+                  ch_mult: [1, 2, 4, 4]
+                  num_res_blocks: 2
+                  attn_resolutions: []
+                  dropout: 0.0
+                lossconfig:
+                  target: torch.nn.Identity
+
+        - input_key: cond_aug  # NOTE(review): presumably fed from `--cond_aug` in motionctrl_run.sh - confirm
+          is_trainable: false
+          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+          params:
+            outdim: 256
+
+    first_stage_config:
+      target: sgm.models.autoencoder.AutoencodingEngine
+      params:
+        loss_config:
+          target: torch.nn.Identity  # no real loss module is configured for the first stage in this file
+        regularizer_config:
+          target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
+        encoder_config:  # encoder and decoder below share identical params; the decoder additionally sets video_kernel_size
+          target: sgm.modules.diffusionmodules.model.Encoder
+          params:
+            attn_type: vanilla
+            double_z: true
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [1, 2, 4, 4]
+            num_res_blocks: 2
+            attn_resolutions: []
+            dropout: 0.0
+        decoder_config:
+          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
+          params:
+            attn_type: vanilla
+            double_z: true
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [1, 2, 4, 4]
+            num_res_blocks: 2
+            attn_resolutions: []
+            dropout: 0.0
+            video_kernel_size: [3, 1, 1]  # same kernel size as network_config.video_kernel_size
+
+    # loss_fn_config:
+    #   target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
+    #   params:
+    #     batch2model_keys: ['RT']  
+    #     loss_weighting_config:
+    #       target: sgm.modules.diffusionmodules.loss_weighting.VWeighting
+    #     sigma_sampler_config:
+    #       target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
+    #       params:
+    #         p_mean: 1.0
+    #         p_std: 1.6
+
+    sampler_config:
+      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
+      params:
+        num_steps: 25  # same value as `--ddim_steps 25` in motionctrl_run.sh; NOTE(review): confirm which one takes effect
+        discretization_config:
+          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
+          params:
+            sigma_max: 700.0
+
+        guider_config:
+          target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
+          params:
+            num_frames: 14  # equals network_config.num_frames; NOTE(review): presumably the two must stay in sync - confirm
+            max_scale: 2.5  # NOTE(review): presumably the guidance scale varies between min_scale and max_scale across frames - confirm
+            min_scale: 1.0
diff --git a/configs/inference/motionctrl_run.sh b/configs/inference/motionctrl_run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2e7ee6c7811076d4cf5ee7dd3d459dc2775080cd
--- /dev/null
+++ b/configs/inference/motionctrl_run.sh
@@ -0,0 +1,36 @@
+
+
+ckpt='checkpoints/motionctrl_svd.ckpt'
+config='configs/inference/config_motionctrl_cmcm.yaml'
+
+height=576
+width=1024
+cond_aug=0.02
+
+fps=10
+
+image_input='examples/basic/eduardo-gorghetto-5auIBbcoRNw-unsplash.jpg'
+
+res_dir="outputs/motionctrl_svd"
+if [ ! -d $res_dir ]; then
+    mkdir -p $res_dir
+fi
+
+CUDA_VISIBLE_DEVICES=7 python main/inference/motionctrl_cmcm.py \
+--seed 12345 \
+--ckpt $ckpt \
+--config $config \
+--savedir $res_dir \
+--savefps 10 \
+--ddim_steps 25 \
+--frames 14 \
+--input $image_input \
+--fps $fps \
+--motion 127 \
+--cond_aug $cond_aug \
+--decoding_t 1 --resize \
+--height $height --width $width \
+--sample_num 2 \
+--transform \
+--pose_dir 'examples/camera_poses' \
+--speed 2.0 \
\ No newline at end of file
diff --git a/examples/camera_poses/test.txt b/examples/camera_poses/test.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7bb0cd257e2112c3b97b7da31749a06d1cd95434
--- /dev/null
+++ b/examples/camera_poses/test.txt
@@ -0,0 +1,42 @@
+gr.update(value=f'1.0 -4.493872218791495e-10 5.58983348497577e-09 1.9967236752904682e-09 \
+                     -4.493872218791495e-10 1.0 -6.144247333139674e-10 1.0815730533408896e-09 \
+                     5.58983348497577e-09 -6.144247333139674e-10 1.0 -7.984015226725205e-09 \n\
+                     0.9982863664627075 -0.0024742060340940952 0.05846544727683067 -0.024547122418880463 \
+                     0.002410230925306678 0.9999964237213135 0.0011647245846688747 -0.003784072818234563 \
+                     -0.05846811458468437 -0.0010218139505013824 0.9982887506484985 -0.09103696048259735 \n\
+                     0.9933298230171204 -0.006303737405687571 0.11513543128967285 -0.053876250982284546 \
+                     0.00586089538410306 0.9999741315841675 0.004184383898973465 -0.006566310301423073 \
+                    -0.115158811211586 -0.0034816779661923647 0.9933409690856934 -0.18525512516498566\n \
+                     0.9849286675453186 -0.013619760051369667 0.17242403328418732 -0.08322551101446152 \
+                     0.01256392989307642 0.9998950958251953 0.0072133541107177734 -0.004579910542815924 \
+                     -0.17250417172908783 -0.004938316997140646 0.9849964380264282 -0.28701746463775635 \n\
+                    0.9731453657150269 -0.022617166861891747 0.2290775030851364 -0.11655563861131668 \
+                     0.02060025744140148 0.9997251629829407 0.011192308738827705 -0.0017426757840439677\
+                     -0.2292676568031311 -0.006172688212245703 0.9733438491821289 -0.37736839056015015\n \
+                    0.9582399725914001 -0.03294993191957474 0.2840607464313507 -0.15743066370487213 \
+                     0.030182993039488792 0.9994447827339172 0.014113469049334526 -0.002769832033663988 \
+                     -0.28436803817749023 -0.004950287751853466 0.9587023854255676 -0.46959081292152405 \n \
+                    0.940129816532135 -0.03991429880261421 0.3384712040424347 -0.22889098525047302 \
+                     0.03725311905145645 0.9992027282714844 0.01435780432075262 -0.0028311305213719606 \
+                     -0.3387744128704071 -0.0008890923927538097 0.9408671855926514 -0.5631460547447205\n \
+                     0.9222924709320068 -0.044258520007133484 0.38395029306411743 -0.2986142039299011 \
+                     0.04110203683376312 0.9990199208259583 0.01642671786248684 0.0013055746676400304 \
+                     -0.38430097699165344 0.000630900904070586 0.9232076406478882 -0.6414245367050171\n \
+                     0.9061535000801086 -0.04851173609495163 0.4201577305793762 -0.3483412563800812 \
+                     0.04521748423576355 0.9988185167312622 0.017803886905312538 0.0010280977003276348 \
+                     -0.4205249547958374 0.0028654206544160843 0.907276451587677 -0.7144853472709656\n \
+                     0.8919307589530945 -0.05171844735741615 0.4492044746875763 -0.37905213236808777 \
+                     0.04818608984351158 0.9986518621444702 0.019300933927297592 0.00036871168413199484 \
+                     -0.44959715008735657 0.004430312197655439 0.8932204246520996 -0.7976372241973877\n \
+                     0.8792291879653931 -0.05425972864031792 0.47329893708229065 -0.39671003818511963 \
+                     0.05076585337519646 0.998507022857666 0.02016463316977024 0.001104982104152441 \
+                     -0.4736863970756531 0.00629808846861124 0.8806710243225098 -0.8874085545539856\n \
+                     0.8659296035766602 -0.0567130371928215 0.49694016575813293 -0.4097800552845001 \
+                     0.05366959795355797 0.9983500838279724 0.020415671169757843 0.0009228077251464128 \
+                     -0.497278094291687 0.008992047980427742 0.8675445914268494 -0.9762357473373413\n \
+                     0.8503361940383911 -0.055699657648801804 0.5232837200164795 -0.44268566370010376 \
+                     0.054582174867391586 0.9983546733856201 0.01757136546075344 0.005412018392235041 \
+                     -0.5234014391899109 0.013620397076010704 0.8519773483276367 -1.069865107536316 \n \
+                     0.836037814617157 -0.05214058235287666 0.5461887717247009 -0.4671085774898529 \
+                    0.05177384987473488 0.9985294938087463 0.01607322134077549 0.008980141952633858 \
+                      -0.5462236404418945 0.014840473420917988 0.8375079035758972 -1.1569048166275024\n', interactive=True), \
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_018f7907401f2fef.json b/examples/camera_poses/test_camera_018f7907401f2fef.json
new file mode 100644
index 0000000000000000000000000000000000000000..c9cc8fa2c3ca1200ca75b2e3a4a381cf75d8f157
--- /dev/null
+++ b/examples/camera_poses/test_camera_018f7907401f2fef.json
@@ -0,0 +1 @@
+[[1.0, -4.493872218791495e-10, 5.58983348497577e-09, 1.9967236752904682e-09, -4.493872218791495e-10, 1.0, -6.144247333139674e-10, 1.0815730533408896e-09, 5.58983348497577e-09, -6.144247333139674e-10, 1.0, -7.984015226725205e-09], [0.9982863664627075, -0.0024742060340940952, 0.05846544727683067, -0.024547122418880463, 0.002410230925306678, 0.9999964237213135, 0.0011647245846688747, -0.003784072818234563, -0.05846811458468437, -0.0010218139505013824, 0.9982887506484985, -0.09103696048259735], [0.9933298230171204, -0.006303737405687571, 0.11513543128967285, -0.053876250982284546, 0.00586089538410306, 0.9999741315841675, 0.004184383898973465, -0.006566310301423073, -0.115158811211586, -0.0034816779661923647, 0.9933409690856934, -0.18525512516498566], [0.9849286675453186, -0.013619760051369667, 0.17242403328418732, -0.08322551101446152, 0.01256392989307642, 0.9998950958251953, 0.0072133541107177734, -0.004579910542815924, -0.17250417172908783, -0.004938316997140646, 0.9849964380264282, -0.28701746463775635], [0.9731453657150269, -0.022617166861891747, 0.2290775030851364, -0.11655563861131668, 0.02060025744140148, 0.9997251629829407, 0.011192308738827705, -0.0017426757840439677, -0.2292676568031311, -0.006172688212245703, 0.9733438491821289, -0.37736839056015015], [0.9582399725914001, -0.03294993191957474, 0.2840607464313507, -0.15743066370487213, 0.030182993039488792, 0.9994447827339172, 0.014113469049334526, -0.002769832033663988, -0.28436803817749023, -0.004950287751853466, 0.9587023854255676, -0.46959081292152405], [0.940129816532135, -0.03991429880261421, 0.3384712040424347, -0.22889098525047302, 0.03725311905145645, 0.9992027282714844, 0.01435780432075262, -0.0028311305213719606, -0.3387744128704071, -0.0008890923927538097, 0.9408671855926514, -0.5631460547447205], [0.9222924709320068, -0.044258520007133484, 0.38395029306411743, -0.2986142039299011, 0.04110203683376312, 0.9990199208259583, 0.01642671786248684, 0.0013055746676400304, -0.38430097699165344, 
0.000630900904070586, 0.9232076406478882, -0.6414245367050171], [0.9061535000801086, -0.04851173609495163, 0.4201577305793762, -0.3483412563800812, 0.04521748423576355, 0.9988185167312622, 0.017803886905312538, 0.0010280977003276348, -0.4205249547958374, 0.0028654206544160843, 0.907276451587677, -0.7144853472709656], [0.8919307589530945, -0.05171844735741615, 0.4492044746875763, -0.37905213236808777, 0.04818608984351158, 0.9986518621444702, 0.019300933927297592, 0.00036871168413199484, -0.44959715008735657, 0.004430312197655439, 0.8932204246520996, -0.7976372241973877], [0.8792291879653931, -0.05425972864031792, 0.47329893708229065, -0.39671003818511963, 0.05076585337519646, 0.998507022857666, 0.02016463316977024, 0.001104982104152441, -0.4736863970756531, 0.00629808846861124, 0.8806710243225098, -0.8874085545539856], [0.8659296035766602, -0.0567130371928215, 0.49694016575813293, -0.4097800552845001, 0.05366959795355797, 0.9983500838279724, 0.020415671169757843, 0.0009228077251464128, -0.497278094291687, 0.008992047980427742, 0.8675445914268494, -0.9762357473373413], [0.8503361940383911, -0.055699657648801804, 0.5232837200164795, -0.44268566370010376, 0.054582174867391586, 0.9983546733856201, 0.01757136546075344, 0.005412018392235041, -0.5234014391899109, 0.013620397076010704, 0.8519773483276367, -1.069865107536316], [0.836037814617157, -0.05214058235287666, 0.5461887717247009, -0.4671085774898529, 0.05177384987473488, 0.9985294938087463, 0.01607322134077549, 0.008980141952633858, -0.5462236404418945, 0.014840473420917988, 0.8375079035758972, -1.1569048166275024], [0.82603919506073, -0.04987695440649986, 0.5614013671875, -0.4677649438381195, 0.05124447122216225, 0.9985973834991455, 0.013318539597094059, 0.012170637026429176, -0.5612781643867493, 0.017767081037163734, 0.8274364471435547, -1.2651430368423462], [0.8179472088813782, -0.0496118925511837, 0.573150098323822, -0.45822662115097046, 0.052784956991672516, 0.9985441565513611, 0.011104168370366096, 
0.018991567194461823, -0.5728666186332703, 0.0211710836738348, 0.8193751573562622, -1.3895009756088257]]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_088b93f15ca8745d.json b/examples/camera_poses/test_camera_088b93f15ca8745d.json
new file mode 100644
index 0000000000000000000000000000000000000000..eeabf8894882e430e672f1cfeb5003ca419da7dc
--- /dev/null
+++ b/examples/camera_poses/test_camera_088b93f15ca8745d.json
@@ -0,0 +1 @@
+[[0.9999999403953552, 3.8618797049139175e-10, -1.3441345814158012e-08, 1.3928219289027766e-07, 3.8618797049139175e-10, 1.0, -4.134579345560496e-10, -6.074658998045379e-09, -1.3441345814158012e-08, -4.134579345560496e-10, 1.0, 7.038884319854333e-08], [0.9994913339614868, 0.003077245783060789, -0.031741149723529816, 0.08338673412799835, -0.0030815028585493565, 0.999995231628418, -8.520588744431734e-05, 0.006532138213515282, 0.0317407064139843, 0.00018297435599379241, 0.9994961619377136, -0.02256060019135475], [0.9979938268661499, 0.0051255361177027225, -0.06310292333364487, 0.18344485759735107, -0.005117486696690321, 0.9999868869781494, 0.00028916727751493454, 0.018134046345949173, 0.06310353428125381, 3.434090831433423e-05, 0.9980069994926453, -0.030579563230276108], [0.9954646825790405, 0.00820203311741352, -0.0947771891951561, 0.29663264751434326, -0.00811922550201416, 0.9999662041664124, 0.0012593322899192572, 0.02404301054775715, 0.09478426724672318, -0.0004841022891923785, 0.9954977035522461, -0.02678978443145752], [0.9913660883903503, 0.012001598253846169, -0.13057230412960052, 0.4076530337333679, -0.011968829669058323, 0.999927818775177, 0.0010357286082580686, 0.024977533146739006, 0.1305752843618393, 0.0005360084469430149, 0.9914382100105286, -0.010779343545436859], [0.985666811466217, 0.017323914915323257, -0.16781197488307953, 0.509911060333252, -0.017399737611413002, 0.9998481273651123, 0.0010186078725382686, 0.023117201402783394, 0.16780413687229156, 0.0019158748909831047, 0.9858185052871704, 0.018053216859698296], [0.9784473180770874, 0.022585421800613403, -0.20525763928890228, 0.5957884192466736, -0.022850200533866882, 0.9997382760047913, 0.0010805513011291623, 0.020451901480555534, 0.2052282989025116, 0.003632916137576103, 0.9787073731422424, 0.03460140898823738], [0.9711515307426453, 0.026846906170248985, -0.23694702982902527, 0.6832671165466309, -0.02745947800576687, 0.999622642993927, 0.0007151798927225173, 0.012211678549647331, 
0.23687675595283508, 0.005811895243823528, 0.971522331237793, 0.03236595541238785], [0.9641746878623962, 0.030338184908032417, -0.26352745294570923, 0.7764986157417297, -0.031404945999383926, 0.9995067715644836, 0.0001645474840188399, 0.0011497576488181949, 0.26340243220329285, 0.008117412216961384, 0.964651882648468, 0.022656364366412163], [0.9573631882667542, 0.0335896760225296, -0.2869274914264679, 0.8815275430679321, -0.03532479330897331, 0.9993755221366882, -0.0008711823611520231, -0.003618708113208413, 0.2867189943790436, 0.010969695635139942, 0.9579519033432007, 0.005283573176711798], [0.9507063627243042, 0.036557890474796295, -0.3079299330711365, 0.9931321740150452, -0.03846294432878494, 0.9992600679397583, -0.00011733790597645566, 0.0018704120302572846, 0.30769774317741394, 0.01195544097572565, 0.9514090418815613, -0.035360634326934814], [0.9448517560958862, 0.039408694952726364, -0.3251185715198517, 1.1025006771087646, -0.041503626853227615, 0.9991382360458374, 0.0004919985658489168, 0.007425118237733841, 0.32485777139663696, 0.013028733432292938, 0.9456731081008911, -0.09869624674320221], [0.940796971321106, 0.04081147164106369, -0.33650481700897217, 1.1961394548416138, -0.0429220013320446, 0.9990777373313904, 0.0011677180882543325, 0.019955899566411972, 0.336242139339447, 0.013344875536859035, 0.9416810274124146, -0.16835527122020721], [0.9376427531242371, 0.04111124947667122, -0.3451607823371887, 1.2392503023147583, -0.043144747614860535, 0.9990671873092651, 0.0017920633545145392, 0.03982722759246826, 0.34491249918937683, 0.013211555778980255, 0.938541829586029, -0.24618202447891235], [0.9353355765342712, 0.04122937470674515, -0.3513509929180145, 1.285768747329712, -0.043183211237192154, 0.9990646243095398, 0.0022769556380808353, 0.06841164082288742, 0.3511161506175995, 0.01304274145513773, 0.936241090297699, -0.3213619291782379], [0.9342393279075623, 0.041213057935237885, -0.3542574644088745, 1.3363462686538696, -0.04236872121691704, 
0.9990919232368469, 0.0044970144517719746, 0.08925694227218628, 0.35412102937698364, 0.010808154009282589, 0.9351370930671692, -0.40201041102409363]]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_1424acd0007d40b5.json b/examples/camera_poses/test_camera_1424acd0007d40b5.json
new file mode 100644
index 0000000000000000000000000000000000000000..02b37fd5a70f4db9b156257145734dc8f6b1499e
--- /dev/null
+++ b/examples/camera_poses/test_camera_1424acd0007d40b5.json
@@ -0,0 +1 @@
+[[1.0, 9.44418099280142e-10, 3.889182664806867e-08, 6.214055492392845e-09, 9.44418099280142e-10, 1.0, -1.0644604121756718e-11, -7.621465680784922e-10, 3.889182664806867e-08, -1.0644604121756718e-11, 1.0, -2.7145965475483536e-08], [0.9873979091644287, -0.007892023772001266, 0.15806053578853607, 0.4749181270599365, 0.008024877868592739, 0.9999678134918213, -0.00020230526570230722, 0.1585356593132019, -0.15805381536483765, 0.0014681711327284575, 0.9874294400215149, -0.2091633826494217], [0.9708925485610962, -0.011486345902085304, 0.23923994600772858, 0.8120080828666687, 0.012198254466056824, 0.9999244809150696, -0.0014952132478356361, 0.2486257702112198, -0.23920467495918274, 0.004370000213384628, 0.9709593057632446, -0.5957822799682617], [0.9619541168212891, -0.013188007287681103, 0.2728927433490753, 1.1486873626708984, 0.014017474837601185, 0.9999011754989624, -0.001090032048523426, 0.3114692270755768, -0.2728513777256012, 0.0048738280311226845, 0.962043821811676, -1.0323039293289185], [0.9586812257766724, -0.013936692848801613, 0.284140944480896, 1.5948307514190674, 0.014867136254906654, 0.9998888373374939, -0.0011181083973497152, 0.36000898480415344, -0.2840937674045563, 0.0052962712943553925, 0.958781898021698, -1.4377187490463257], [0.9583359360694885, -0.011928150430321693, 0.28539448976516724, 2.002793788909912, 0.014221147634088993, 0.9998810887336731, -0.0059633455239236355, 0.35464340448379517, -0.28528934717178345, 0.009773525409400463, 0.9583916068077087, -1.8953297138214111], [0.9584393501281738, -0.010862396098673344, 0.28508952260017395, 2.3351645469665527, 0.012857729569077492, 0.9999041557312012, -0.005128204356878996, 0.38934090733528137, -0.28500640392303467, 0.008580676279962063, 0.9584871530532837, -2.4214961528778076], [0.9587277173995972, -0.009760312736034393, 0.28415825963020325, 2.6858017444610596, 0.012186127714812756, 0.9999027848243713, -0.0067702098749578, 0.4173329174518585, -0.28406453132629395, 0.009953574277460575, 
0.9587535262107849, -3.030754327774048], [0.9589635729789734, -0.0070899901911616325, 0.2834406793117523, 2.8917219638824463, 0.010482418350875378, 0.9998904466629028, -0.01045384630560875, 0.4001043438911438, -0.28333547711372375, 0.012996001169085503, 0.9589328169822693, -3.6960957050323486], [0.9590328931808472, -0.005921780597418547, 0.2832328677177429, 3.034579038619995, 0.00947173498570919, 0.9998928308486938, -0.01116593275219202, 0.4193899631500244, -0.2831363379955292, 0.013391205109655857, 0.9589861631393433, -4.384733200073242], [0.9593284726142883, -0.004661113955080509, 0.28225383162498474, 3.2042288780212402, 0.0076906089670956135, 0.9999240636825562, -0.009626304730772972, 0.4752484858036041, -0.28218749165534973, 0.011405492201447487, 0.9592914581298828, -5.098723411560059], [0.9591755867004395, -0.0035665074829012156, 0.2827887237071991, 3.263953924179077, 0.0062992447055876255, 0.9999418258666992, -0.008754877373576164, 0.4543868899345398, -0.282740980386734, 0.010178821161389351, 0.95914226770401, -5.7807512283325195], [0.9591742753982544, -0.003413048107177019, 0.2827949523925781, 3.3116462230682373, 0.003146615345031023, 0.9999940991401672, 0.0013963348465040326, 0.4299861788749695, -0.28279799222946167, -0.0004494813329074532, 0.9591793417930603, -6.478931903839111], [0.9585762619972229, -0.002857929328456521, 0.28482162952423096, 3.4120190143585205, -0.008201238699257374, 0.9992581605911255, 0.0376281812787056, 0.21596357226371765, -0.2847178280353546, -0.03840537369251251, 0.957841694355011, -7.178638935089111], [0.9572952389717102, -0.002719884505495429, 0.28909942507743835, 3.4662365913391113, -0.030634215101599693, 0.9933721423149109, 0.11078491061925888, -0.29767531156539917, -0.2874845862388611, -0.11491020023822784, 0.9508671164512634, -7.794575214385986], [0.9545961618423462, -0.005338957067579031, 0.29785510897636414, 3.5083835124969482, -0.06037351116538048, 0.9756243824958801, 0.2109788954257965, -1.0968165397644043, 
-0.29172107577323914, -0.21938219666481018, 0.9310050010681152, -8.306528091430664]]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_D.json b/examples/camera_poses/test_camera_D.json
new file mode 100644
index 0000000000000000000000000000000000000000..0fa6462cba65992ec66f014933ca7c3e1ac9f9a8
--- /dev/null
+++ b/examples/camera_poses/test_camera_D.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        -0.2,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        -0.28750000000000003,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        -0.37500000000000006,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        -0.4625000000000001,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        -0.55,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        -0.6375000000000002,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        -0.7250000000000001,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        -0.8125000000000002,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        -0.9000000000000001,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        -0.9875000000000003,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        -1.0750000000000002,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        -1.1625000000000003,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        -1.2500000000000002,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        -1.3375000000000001,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        -1.4250000000000003,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        -1.5125000000000004,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_I.json b/examples/camera_poses/test_camera_I.json
new file mode 100644
index 0000000000000000000000000000000000000000..a44cc11f8bad536d2feb86a56f3e45441f4d12b9
--- /dev/null
+++ b/examples/camera_poses/test_camera_I.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.2
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.28750000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.37500000000000006
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.4625000000000001
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.55
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.6375000000000002
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.7250000000000001
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.8125000000000002
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.9000000000000001
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.9875000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -1.0750000000000002
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -1.1625000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -1.2500000000000002
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -1.3375000000000001
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -1.4250000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -1.5125000000000004
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_I_0.2x.json b/examples/camera_poses/test_camera_I_0.2x.json
new file mode 100644
index 0000000000000000000000000000000000000000..c4c72597b15c09450618be858523e77f5c16d2b2
--- /dev/null
+++ b/examples/camera_poses/test_camera_I_0.2x.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.022500000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.045000000000000005
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.0675
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.09000000000000001
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.11250000000000002
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.135
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.15750000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.18000000000000002
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.2025
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.22500000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.24750000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.27
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.29250000000000004
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.31500000000000006
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.3375
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_I_0.4x.json b/examples/camera_poses/test_camera_I_0.4x.json
new file mode 100644
index 0000000000000000000000000000000000000000..2359910f99fc32b0d54cfda17ad864afcadbf5c7
--- /dev/null
+++ b/examples/camera_poses/test_camera_I_0.4x.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.045000000000000005
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.09000000000000001
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.135
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.18000000000000002
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.22500000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.27
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.31500000000000006
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.36000000000000004
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.405
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.45000000000000007
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.49500000000000005
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.54
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.5850000000000001
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.6300000000000001
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.675
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_I_1.0x.json b/examples/camera_poses/test_camera_I_1.0x.json
new file mode 100644
index 0000000000000000000000000000000000000000..46e3f75ee8ea4f0b204f27690dbccb61e814f7cd
--- /dev/null
+++ b/examples/camera_poses/test_camera_I_1.0x.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.1125
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.225
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.3375
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.45
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.5625
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.675
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.7875
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.9
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -1.0125
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -1.125
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -1.2375
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -1.35
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -1.4625000000000001
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -1.575
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -1.6875
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_I_2.0x.json b/examples/camera_poses/test_camera_I_2.0x.json
new file mode 100644
index 0000000000000000000000000000000000000000..81207f6ef53151f1e01c771d0e13599d0183968b
--- /dev/null
+++ b/examples/camera_poses/test_camera_I_2.0x.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.225
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.45
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.675
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -0.9
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -1.125
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -1.35
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -1.575
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -1.8
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -2.025
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -2.25
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -2.475
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -2.7
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -2.9250000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -3.15
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        -3.375
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_L.json b/examples/camera_poses/test_camera_L.json
new file mode 100644
index 0000000000000000000000000000000000000000..992b371f2e14ec52574f6777c78c89ecd23119b4
--- /dev/null
+++ b/examples/camera_poses/test_camera_L.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.2,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.28750000000000003,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.37500000000000006,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.4625000000000001,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.55,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.6375000000000002,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.7250000000000001,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.8125000000000002,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.9000000000000001,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.9875000000000003,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        1.0750000000000002,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        1.1625000000000003,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        1.2500000000000002,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        1.3375000000000001,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        1.4250000000000003,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        1.5125000000000004,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_O.json b/examples/camera_poses/test_camera_O.json
new file mode 100644
index 0000000000000000000000000000000000000000..b0bacc0cbf544e1fc4ee96047bd18e6cc8da0e9c
--- /dev/null
+++ b/examples/camera_poses/test_camera_O.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.2
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.28750000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.37500000000000006
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.4625000000000001
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.55
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.6375000000000002
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.7250000000000001
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.8125000000000002
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.9000000000000001
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.9875000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.0750000000000002
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.1625000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.2500000000000002
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.3375000000000001
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.4250000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.5125000000000004
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_O_0.2x.json b/examples/camera_poses/test_camera_O_0.2x.json
new file mode 100644
index 0000000000000000000000000000000000000000..689535fb82ffc79c2ef0123a42a9b3c1b2dca5a3
--- /dev/null
+++ b/examples/camera_poses/test_camera_O_0.2x.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.022500000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.045000000000000005
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0675
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.09000000000000001
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.11250000000000002
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.135
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.15750000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.18000000000000002
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.2025
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.22500000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.24750000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.27
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.29250000000000004
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.31500000000000006
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.3375
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_O_0.4x.json b/examples/camera_poses/test_camera_O_0.4x.json
new file mode 100644
index 0000000000000000000000000000000000000000..01b6ede93af2e2daecbc325d029046195bed7ac0
--- /dev/null
+++ b/examples/camera_poses/test_camera_O_0.4x.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.045000000000000005
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.09000000000000001
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.135
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.18000000000000002
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.22500000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.27
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.31500000000000006
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.36000000000000004
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.405
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.45000000000000007
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.49500000000000005
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.54
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.5850000000000001
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.6300000000000001
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.675
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_O_1.0x.json b/examples/camera_poses/test_camera_O_1.0x.json
new file mode 100644
index 0000000000000000000000000000000000000000..4b91d2bf8993e20c20b0d9dc17870e4854aeef9a
--- /dev/null
+++ b/examples/camera_poses/test_camera_O_1.0x.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.1125
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.225
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.3375
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.45
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.5625
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.675
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.7875
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.9
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.0125
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.125
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.2375
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.35
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.4625000000000001
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.575
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.6875
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_O_2.0x.json b/examples/camera_poses/test_camera_O_2.0x.json
new file mode 100644
index 0000000000000000000000000000000000000000..5481cc64af88c55ccb3ba8e7ac897ad96fdd88c3
--- /dev/null
+++ b/examples/camera_poses/test_camera_O_2.0x.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.225
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.45
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.675
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.9
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.125
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.35
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.575
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.8
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        2.025
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        2.25
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        2.475
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        2.7
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        2.9250000000000003
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        3.15
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        3.375
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_R.json b/examples/camera_poses/test_camera_R.json
new file mode 100644
index 0000000000000000000000000000000000000000..93be7cff1ce3dc0f4663e6394a6bda23c78bba35
--- /dev/null
+++ b/examples/camera_poses/test_camera_R.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        -0.2,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        -0.28750000000000003,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        -0.37500000000000006,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        -0.4625000000000001,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        -0.55,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        -0.6375000000000002,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        -0.7250000000000001,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        -0.8125000000000002,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        -0.9000000000000001,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        -0.9875000000000003,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        -1.0750000000000002,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        -1.1625000000000003,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        -1.2500000000000002,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        -1.3375000000000001,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        -1.4250000000000003,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        -1.5125000000000004,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_Round-RI-120.json b/examples/camera_poses/test_camera_Round-RI-120.json
new file mode 100644
index 0000000000000000000000000000000000000000..1798456ed380de19d1711035fea74d57c33ebb4e
--- /dev/null
+++ b/examples/camera_poses/test_camera_Round-RI-120.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.8
+    ],
+    [
+        0.9914448613738104,
+        0.0,
+        0.13052619222005157,
+        0.13052619222005157,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.13052619222005157,
+        0.0,
+        0.9914448613738104,
+        1.7914448613738103
+    ],
+    [
+        0.9659258262890683,
+        0.0,
+        0.25881904510252074,
+        0.25881904510252074,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.25881904510252074,
+        0.0,
+        0.9659258262890683,
+        1.7659258262890685
+    ],
+    [
+        0.9238795325112867,
+        0.0,
+        0.3826834323650898,
+        0.3826834323650898,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.3826834323650898,
+        0.0,
+        0.9238795325112867,
+        1.7238795325112868
+    ],
+    [
+        0.8660254037844387,
+        0.0,
+        0.49999999999999994,
+        0.49999999999999994,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.49999999999999994,
+        0.0,
+        0.8660254037844387,
+        1.6660254037844386
+    ],
+    [
+        0.7933533402912353,
+        0.0,
+        0.6087614290087205,
+        0.6087614290087205,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.6087614290087205,
+        0.0,
+        0.7933533402912353,
+        1.5933533402912352
+    ],
+    [
+        0.7071067811865476,
+        0.0,
+        0.7071067811865476,
+        0.7071067811865476,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.7071067811865476,
+        0.0,
+        0.7071067811865476,
+        1.5071067811865477
+    ],
+    [
+        0.6087614290087207,
+        0.0,
+        0.7933533402912352,
+        0.7933533402912352,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.7933533402912352,
+        0.0,
+        0.6087614290087207,
+        1.4087614290087207
+    ],
+    [
+        0.5000000000000001,
+        0.0,
+        0.8660254037844386,
+        0.8660254037844386,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.8660254037844386,
+        0.0,
+        0.5000000000000001,
+        1.3000000000000003
+    ],
+    [
+        0.38268343236508984,
+        0.0,
+        0.9238795325112867,
+        0.9238795325112867,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9238795325112867,
+        0.0,
+        0.38268343236508984,
+        1.1826834323650899
+    ],
+    [
+        0.25881904510252096,
+        0.0,
+        0.9659258262890682,
+        0.9659258262890682,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9659258262890682,
+        0.0,
+        0.25881904510252096,
+        1.058819045102521
+    ],
+    [
+        0.1305261922200517,
+        0.0,
+        0.9914448613738104,
+        0.9914448613738104,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9914448613738104,
+        0.0,
+        0.1305261922200517,
+        0.9305261922200517
+    ],
+    [
+        6.123233995736766e-17,
+        0.0,
+        1.0,
+        1.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -1.0,
+        0.0,
+        6.123233995736766e-17,
+        0.8000000000000002
+    ],
+    [
+        -0.13052619222005138,
+        0.0,
+        0.9914448613738105,
+        0.9914448613738105,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9914448613738105,
+        0.0,
+        -0.13052619222005138,
+        0.6694738077799487
+    ],
+    [
+        -0.25881904510252063,
+        0.0,
+        0.9659258262890683,
+        0.9659258262890683,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9659258262890683,
+        0.0,
+        -0.25881904510252063,
+        0.5411809548974794
+    ],
+    [
+        -0.3826834323650895,
+        0.0,
+        0.9238795325112868,
+        0.9238795325112868,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9238795325112868,
+        0.0,
+        -0.3826834323650895,
+        0.41731656763491054
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_Round-RI.json b/examples/camera_poses/test_camera_Round-RI.json
new file mode 100644
index 0000000000000000000000000000000000000000..0c2c7a00d262c3f3363fa2c8a613a29377b14c25
--- /dev/null
+++ b/examples/camera_poses/test_camera_Round-RI.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.7000000000000002
+    ],
+    [
+        0.9807852804032304,
+        0.0,
+        0.19509032201612825,
+        0.17558128981451543,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.19509032201612825,
+        0.0,
+        0.9807852804032304,
+        1.6827067523629076
+    ],
+    [
+        0.9238795325112867,
+        0.0,
+        0.3826834323650898,
+        0.3444150891285808,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.3826834323650898,
+        0.0,
+        0.9238795325112867,
+        1.631491579260158
+    ],
+    [
+        0.8314696123025452,
+        0.0,
+        0.5555702330196022,
+        0.500013209717642,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.5555702330196022,
+        0.0,
+        0.8314696123025452,
+        1.5483226510722907
+    ],
+    [
+        0.7071067811865476,
+        0.0,
+        0.7071067811865476,
+        0.6363961030678928,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.7071067811865476,
+        0.0,
+        0.7071067811865476,
+        1.436396103067893
+    ],
+    [
+        0.5555702330196023,
+        0.0,
+        0.8314696123025452,
+        0.7483226510722907,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.8314696123025452,
+        0.0,
+        0.5555702330196023,
+        1.3000132097176422
+    ],
+    [
+        0.38268343236508984,
+        0.0,
+        0.9238795325112867,
+        0.831491579260158,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9238795325112867,
+        0.0,
+        0.38268343236508984,
+        1.144415089128581
+    ],
+    [
+        0.19509032201612833,
+        0.0,
+        0.9807852804032304,
+        0.8827067523629074,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9807852804032304,
+        0.0,
+        0.19509032201612833,
+        0.9755812898145155
+    ],
+    [
+        6.123233995736766e-17,
+        0.0,
+        1.0,
+        0.9,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -1.0,
+        0.0,
+        6.123233995736766e-17,
+        0.8
+    ],
+    [
+        -0.1950903220161282,
+        0.0,
+        0.9807852804032304,
+        0.8827067523629074,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9807852804032304,
+        0.0,
+        -0.1950903220161282,
+        0.6244187101854847
+    ],
+    [
+        -0.3826834323650897,
+        0.0,
+        0.9238795325112867,
+        0.831491579260158,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9238795325112867,
+        0.0,
+        -0.3826834323650897,
+        0.4555849108714193
+    ],
+    [
+        -0.555570233019602,
+        0.0,
+        0.8314696123025453,
+        0.7483226510722908,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.8314696123025453,
+        0.0,
+        -0.555570233019602,
+        0.2999867902823583
+    ],
+    [
+        -0.7071067811865475,
+        0.0,
+        0.7071067811865476,
+        0.6363961030678928,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.7071067811865476,
+        0.0,
+        -0.7071067811865475,
+        0.1636038969321073
+    ],
+    [
+        -0.8314696123025453,
+        0.0,
+        0.5555702330196022,
+        0.500013209717642,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.5555702330196022,
+        0.0,
+        -0.8314696123025453,
+        0.051677348927709255
+    ],
+    [
+        -0.9238795325112867,
+        0.0,
+        0.3826834323650899,
+        0.34441508912858093,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.3826834323650899,
+        0.0,
+        -0.9238795325112867,
+        -0.031491579260158
+    ],
+    [
+        -0.9807852804032304,
+        0.0,
+        0.1950903220161286,
+        0.17558128981451576,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.1950903220161286,
+        0.0,
+        -0.9807852804032304,
+        -0.08270675236290737
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_Round-RI_90.json b/examples/camera_poses/test_camera_Round-RI_90.json
new file mode 100644
index 0000000000000000000000000000000000000000..a817879cc2b5650fbde1dbb20262086b7a7e0ad8
--- /dev/null
+++ b/examples/camera_poses/test_camera_Round-RI_90.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        1.7000000000000002
+    ],
+    [
+        0.9951847266721969,
+        0.0,
+        0.0980171403295606,
+        0.08821542629660455,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.0980171403295606,
+        0.0,
+        0.9951847266721969,
+        1.6956662540049772
+    ],
+    [
+        0.9807852804032304,
+        0.0,
+        0.19509032201612825,
+        0.17558128981451543,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.19509032201612825,
+        0.0,
+        0.9807852804032304,
+        1.6827067523629076
+    ],
+    [
+        0.9569403357322088,
+        0.0,
+        0.29028467725446233,
+        0.2612562095290161,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.29028467725446233,
+        0.0,
+        0.9569403357322088,
+        1.661246302158988
+    ],
+    [
+        0.9238795325112867,
+        0.0,
+        0.3826834323650898,
+        0.3444150891285808,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.3826834323650898,
+        0.0,
+        0.9238795325112867,
+        1.631491579260158
+    ],
+    [
+        0.881921264348355,
+        0.0,
+        0.47139673682599764,
+        0.4242570631433979,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.47139673682599764,
+        0.0,
+        0.881921264348355,
+        1.5937291379135194
+    ],
+    [
+        0.8314696123025452,
+        0.0,
+        0.5555702330196022,
+        0.500013209717642,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.5555702330196022,
+        0.0,
+        0.8314696123025452,
+        1.5483226510722907
+    ],
+    [
+        0.773010453362737,
+        0.0,
+        0.6343932841636455,
+        0.5709539557472809,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.6343932841636455,
+        0.0,
+        0.773010453362737,
+        1.4957094080264635
+    ],
+    [
+        0.7071067811865476,
+        0.0,
+        0.7071067811865476,
+        0.6363961030678928,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.7071067811865476,
+        0.0,
+        0.7071067811865476,
+        1.436396103067893
+    ],
+    [
+        0.6343932841636455,
+        0.0,
+        0.7730104533627369,
+        0.6957094080264632,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.7730104533627369,
+        0.0,
+        0.6343932841636455,
+        1.370953955747281
+    ],
+    [
+        0.5555702330196023,
+        0.0,
+        0.8314696123025452,
+        0.7483226510722907,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.8314696123025452,
+        0.0,
+        0.5555702330196023,
+        1.3000132097176422
+    ],
+    [
+        0.4713967368259978,
+        0.0,
+        0.8819212643483549,
+        0.7937291379135195,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.8819212643483549,
+        0.0,
+        0.4713967368259978,
+        1.2242570631433982
+    ],
+    [
+        0.38268343236508984,
+        0.0,
+        0.9238795325112867,
+        0.831491579260158,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9238795325112867,
+        0.0,
+        0.38268343236508984,
+        1.144415089128581
+    ],
+    [
+        0.29028467725446233,
+        0.0,
+        0.9569403357322089,
+        0.861246302158988,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9569403357322089,
+        0.0,
+        0.29028467725446233,
+        1.0612562095290161
+    ],
+    [
+        0.19509032201612833,
+        0.0,
+        0.9807852804032304,
+        0.8827067523629074,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9807852804032304,
+        0.0,
+        0.19509032201612833,
+        0.9755812898145155
+    ],
+    [
+        0.09801714032956077,
+        0.0,
+        0.9951847266721968,
+        0.8956662540049771,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9951847266721968,
+        0.0,
+        0.09801714032956077,
+        0.8882154262966048
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_Round-ZoomIn.json b/examples/camera_poses/test_camera_Round-ZoomIn.json
new file mode 100644
index 0000000000000000000000000000000000000000..85eccf2144a24125ba418c778ff7fb783a71f050
--- /dev/null
+++ b/examples/camera_poses/test_camera_Round-ZoomIn.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.81
+    ],
+    [
+        0.9807852804032304,
+        0.0,
+        0.19509032201612825,
+        0.0019509032201612826,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.19509032201612825,
+        0.0,
+        0.9807852804032304,
+        0.8098078528040323
+    ],
+    [
+        0.9238795325112867,
+        0.0,
+        0.3826834323650898,
+        0.003826834323650898,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.3826834323650898,
+        0.0,
+        0.9238795325112867,
+        0.8092387953251129
+    ],
+    [
+        0.8314696123025452,
+        0.0,
+        0.5555702330196022,
+        0.005555702330196022,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.5555702330196022,
+        0.0,
+        0.8314696123025452,
+        0.8083146961230255
+    ],
+    [
+        0.7071067811865476,
+        0.0,
+        0.7071067811865476,
+        0.007071067811865476,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.7071067811865476,
+        0.0,
+        0.7071067811865476,
+        0.8070710678118656
+    ],
+    [
+        0.5555702330196023,
+        0.0,
+        0.8314696123025452,
+        0.008314696123025453,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.8314696123025452,
+        0.0,
+        0.5555702330196023,
+        0.805555702330196
+    ],
+    [
+        0.38268343236508984,
+        0.0,
+        0.9238795325112867,
+        0.009238795325112868,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9238795325112867,
+        0.0,
+        0.38268343236508984,
+        0.803826834323651
+    ],
+    [
+        0.19509032201612833,
+        0.0,
+        0.9807852804032304,
+        0.009807852804032305,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9807852804032304,
+        0.0,
+        0.19509032201612833,
+        0.8019509032201614
+    ],
+    [
+        0.19509032201612833,
+        0.0,
+        0.9807852804032304,
+        0.009807852804032305,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9807852804032304,
+        0.0,
+        0.19509032201612833,
+        0.8019509032201614
+    ],
+    [
+        0.19509032201612833,
+        0.0,
+        0.9807852804032304,
+        0.009807852804032305,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9807852804032304,
+        0.0,
+        0.19509032201612833,
+        0.7019509032201614
+    ],
+    [
+        0.19509032201612833,
+        0.0,
+        0.9807852804032304,
+        0.009807852804032305,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9807852804032304,
+        0.0,
+        0.19509032201612833,
+        0.6019509032201614
+    ],
+    [
+        0.19509032201612833,
+        0.0,
+        0.9807852804032304,
+        0.009807852804032305,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9807852804032304,
+        0.0,
+        0.19509032201612833,
+        0.5019509032201613
+    ],
+    [
+        0.19509032201612833,
+        0.0,
+        0.9807852804032304,
+        0.009807852804032305,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9807852804032304,
+        0.0,
+        0.19509032201612833,
+        0.4019509032201613
+    ],
+    [
+        0.19509032201612833,
+        0.0,
+        0.9807852804032304,
+        0.009807852804032305,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9807852804032304,
+        0.0,
+        0.19509032201612833,
+        0.3019509032201613
+    ],
+    [
+        0.19509032201612833,
+        0.0,
+        0.9807852804032304,
+        0.009807852804032305,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9807852804032304,
+        0.0,
+        0.19509032201612833,
+        0.2019509032201613
+    ],
+    [
+        0.19509032201612833,
+        0.0,
+        0.9807852804032304,
+        0.009807852804032305,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        -0.9807852804032304,
+        0.0,
+        0.19509032201612833,
+        0.10195090322016129
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_SPIN-ACW-60.json b/examples/camera_poses/test_camera_SPIN-ACW-60.json
new file mode 100644
index 0000000000000000000000000000000000000000..647ab13af36b35c8a47ae2a6edc3154b3418b114
--- /dev/null
+++ b/examples/camera_poses/test_camera_SPIN-ACW-60.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.9978589232386035,
+        -0.06540312923014306,
+        0.0,
+        0.0,
+        0.06540312923014306,
+        0.9978589232386035,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.9914448613738104,
+        -0.13052619222005157,
+        0.0,
+        0.0,
+        0.13052619222005157,
+        0.9914448613738104,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.9807852804032304,
+        -0.19509032201612825,
+        0.0,
+        0.0,
+        0.19509032201612825,
+        0.9807852804032304,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.9659258262890683,
+        -0.25881904510252074,
+        0.0,
+        0.0,
+        0.25881904510252074,
+        0.9659258262890683,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.9469301294951057,
+        -0.32143946530316153,
+        0.0,
+        0.0,
+        0.32143946530316153,
+        0.9469301294951057,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.9238795325112867,
+        -0.3826834323650898,
+        0.0,
+        0.0,
+        0.3826834323650898,
+        0.9238795325112867,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.8968727415326884,
+        -0.44228869021900125,
+        0.0,
+        0.0,
+        0.44228869021900125,
+        0.8968727415326884,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.8660254037844387,
+        -0.49999999999999994,
+        0.0,
+        0.0,
+        0.49999999999999994,
+        0.8660254037844387,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.8314696123025452,
+        -0.5555702330196022,
+        0.0,
+        0.0,
+        0.5555702330196022,
+        0.8314696123025452,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.7933533402912353,
+        -0.6087614290087205,
+        0.0,
+        0.0,
+        0.6087614290087205,
+        0.7933533402912353,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.7518398074789774,
+        -0.6593458151000688,
+        0.0,
+        0.0,
+        0.6593458151000688,
+        0.7518398074789774,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.7071067811865476,
+        -0.7071067811865476,
+        0.0,
+        0.0,
+        0.7071067811865476,
+        0.7071067811865476,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.659345815100069,
+        -0.7518398074789773,
+        0.0,
+        0.0,
+        0.7518398074789773,
+        0.659345815100069,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.6087614290087207,
+        -0.7933533402912352,
+        0.0,
+        0.0,
+        0.7933533402912352,
+        0.6087614290087207,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.5555702330196024,
+        -0.8314696123025451,
+        0.0,
+        0.0,
+        0.8314696123025451,
+        0.5555702330196024,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_SPIN-CW-60.json b/examples/camera_poses/test_camera_SPIN-CW-60.json
new file mode 100644
index 0000000000000000000000000000000000000000..bab5e326a69c7ff99d562281bcf6873af0ff242b
--- /dev/null
+++ b/examples/camera_poses/test_camera_SPIN-CW-60.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.9978589232386035,
+        0.06540312923014306,
+        0.0,
+        0.0,
+        -0.06540312923014306,
+        0.9978589232386035,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.9914448613738104,
+        0.13052619222005157,
+        0.0,
+        0.0,
+        -0.13052619222005157,
+        0.9914448613738104,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.9807852804032304,
+        0.19509032201612825,
+        0.0,
+        0.0,
+        -0.19509032201612825,
+        0.9807852804032304,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.9659258262890683,
+        0.25881904510252074,
+        0.0,
+        0.0,
+        -0.25881904510252074,
+        0.9659258262890683,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.9469301294951057,
+        0.32143946530316153,
+        0.0,
+        0.0,
+        -0.32143946530316153,
+        0.9469301294951057,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.9238795325112867,
+        0.3826834323650898,
+        0.0,
+        0.0,
+        -0.3826834323650898,
+        0.9238795325112867,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.8968727415326884,
+        0.44228869021900125,
+        0.0,
+        0.0,
+        -0.44228869021900125,
+        0.8968727415326884,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.8660254037844387,
+        0.49999999999999994,
+        0.0,
+        0.0,
+        -0.49999999999999994,
+        0.8660254037844387,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.8314696123025452,
+        0.5555702330196022,
+        0.0,
+        0.0,
+        -0.5555702330196022,
+        0.8314696123025452,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.7933533402912353,
+        0.6087614290087205,
+        0.0,
+        0.0,
+        -0.6087614290087205,
+        0.7933533402912353,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.7518398074789774,
+        0.6593458151000688,
+        0.0,
+        0.0,
+        -0.6593458151000688,
+        0.7518398074789774,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.7071067811865476,
+        0.7071067811865476,
+        0.0,
+        0.0,
+        -0.7071067811865476,
+        0.7071067811865476,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.659345815100069,
+        0.7518398074789773,
+        0.0,
+        0.0,
+        -0.7518398074789773,
+        0.659345815100069,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.6087614290087207,
+        0.7933533402912352,
+        0.0,
+        0.0,
+        -0.7933533402912352,
+        0.6087614290087207,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        0.5555702330196024,
+        0.8314696123025451,
+        0.0,
+        0.0,
+        -0.8314696123025451,
+        0.5555702330196024,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_U.json b/examples/camera_poses/test_camera_U.json
new file mode 100644
index 0000000000000000000000000000000000000000..f63aabf1e10d52d9770727e75c659bf1ae2f3d16
--- /dev/null
+++ b/examples/camera_poses/test_camera_U.json
@@ -0,0 +1,226 @@
+[
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.2,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.28750000000000003,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.37500000000000006,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.4625000000000001,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.55,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.6375000000000002,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.7250000000000001,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.8125000000000002,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.9000000000000001,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        0.9875000000000003,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        1.0750000000000002,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        1.1625000000000003,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        1.2500000000000002,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        1.3375000000000001,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        1.4250000000000003,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ],
+    [
+        1.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        1.5125000000000004,
+        0.0,
+        0.0,
+        1.0,
+        0.0
+    ]
+]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_b133a504fc90a2d1.json b/examples/camera_poses/test_camera_b133a504fc90a2d1.json
new file mode 100644
index 0000000000000000000000000000000000000000..b9df09dde068f9316c0db51ead1ad6cdd3dc5628
--- /dev/null
+++ b/examples/camera_poses/test_camera_b133a504fc90a2d1.json
@@ -0,0 +1 @@
+[[0.9999999403953552, -3.2563843288535566e-10, 8.624932434919685e-10, -5.431840754965833e-09, -3.2563843288535566e-10, 1.0, -7.078895802870022e-10, -2.678919752696629e-09, 8.624932434919685e-10, -7.078895802870022e-10, 1.0, 7.303774474110014e-09], [0.9999998807907104, 0.00026603759033605456, -0.0003414043167140335, -0.030107486993074417, -0.0002659278397914022, 0.9999999403953552, 0.00032293255208060145, 0.011978899128735065, 0.0003414914826862514, -0.0003228419227525592, 0.9999999403953552, -0.07893969118595123], [0.9999994039535522, -0.0006866907933726907, -0.0008186722989194095, -0.050785429775714874, 0.0006877407431602478, 0.9999989867210388, 0.0012827541213482618, 0.023323407396674156, 0.0008177922572940588, -0.0012833180371671915, 0.9999988675117493, -0.1486724615097046], [0.999999463558197, -0.0005364732351154089, -0.0008647883078083396, -0.0673225075006485, 0.0005379383219406009, 0.9999984502792358, 0.0016958877677097917, 0.03169107437133789, 0.0008638793369755149, -0.0016963522648438811, 0.9999982118606567, -0.21477271616458893], [0.9999995231628418, -0.0005853328038938344, 0.000731398060452193, -0.08539554476737976, 0.0005843221442773938, 0.9999988675117493, 0.0013807439245283604, 0.037947509437799454, -0.0007322035962715745, -0.0013803173787891865, 0.9999987483024597, -0.2858566641807556], [0.9999935030937195, -0.0007727851625531912, 0.00352613371796906, -0.10591986775398254, 0.0007684561423957348, 0.9999989867210388, 0.0012290359009057283, 0.0426139310002327, -0.003527079476043582, -0.0012263187672942877, 0.999993085861206, -0.3645589053630829], [0.9999805688858032, -0.0012497249990701675, 0.006103655323386192, -0.12257411330938339, 0.0012419418198987842, 0.9999984502792358, 0.001278668874874711, 0.05143251642584801, -0.006105243694037199, -0.0012710647424682975, 0.9999805688858032, -0.44212815165519714], [0.9999697804450989, -0.001364423311315477, 0.0076533216051757336, -0.13916294276714325, 0.0013530527940019965, 0.9999980330467224, 
0.0014907552395015955, 0.06076823174953461, -0.0076553407125175, -0.0014803558588027954, 0.9999696016311646, -0.5131306648254395], [0.9999656081199646, -0.001309191924519837, 0.00818517804145813, -0.1577269434928894, 0.0012902054004371166, 0.9999964833259583, 0.0023244714830070734, 0.07221967726945877, -0.008188189007341862, -0.0023138325195759535, 0.9999638199806213, -0.5706046223640442], [0.9999715089797974, -0.000632039678748697, 0.0075194560922682285, -0.1842648684978485, 0.0006101227481849492, 0.9999955892562866, 0.002916603581979871, 0.0780157595872879, -0.0075212628580629826, -0.0029119334649294615, 0.9999675154685974, -0.6207911372184753], [0.999971330165863, 9.749359742272645e-05, 0.007570990361273289, -0.2200128734111786, -0.00012177121971035376, 0.9999948740005493, 0.0032062893733382225, 0.07083319127559662, -0.007570638321340084, -0.003207120578736067, 0.9999662041664124, -0.671477735042572], [0.9999842047691345, 0.0011085873702540994, 0.005505275446921587, -0.265299528837204, -0.0011242963373661041, 0.9999953508377075, 0.002851187251508236, 0.06430258601903915, -0.00550208892673254, -0.002857332583516836, 0.9999808073043823, -0.7294802069664001], [0.9999891519546509, 0.0030521126464009285, -0.003507567336782813, -0.3109421133995056, -0.0030410285107791424, 0.9999904632568359, 0.003161099273711443, 0.06503432989120483, 0.003517185803502798, -0.0031503995414823294, 0.999988853931427, -0.7917969226837158], [0.9995473623275757, 0.00647346256300807, -0.029379382729530334, -0.3455933630466461, -0.006376372650265694, 0.9999739527702332, 0.0033971993252635, 0.06981948018074036, 0.02940060943365097, -0.003208327107131481, 0.9995625615119934, -0.8474093675613403], [0.9966378808021545, 0.011493867263197899, -0.08112204819917679, -0.3555213510990143, -0.011167873628437519, 0.9999276399612427, 0.004471173509955406, 0.0734858587384224, 0.08116757124662399, -0.00355018163099885, 0.9966941475868225, -0.9062771201133728], [0.9889613389968872, 0.01762101612985134, 
-0.14712226390838623, -0.33937495946884155, -0.01693679392337799, 0.999839186668396, 0.0059022014029324055, 0.0776127353310585, 0.14720259606838226, -0.0033452697098255157, 0.9891006946563721, -0.9548948407173157]]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_d9642c8efc01481d.json b/examples/camera_poses/test_camera_d9642c8efc01481d.json
new file mode 100644
index 0000000000000000000000000000000000000000..50f031204bcc0bbde6758ad34f091e87b072ca45
--- /dev/null
+++ b/examples/camera_poses/test_camera_d9642c8efc01481d.json
@@ -0,0 +1 @@
+[[1.0, -1.4532828274127496e-09, 3.928266045782891e-10, -8.700192566379883e-09, -1.4532828274127496e-09, 1.0, -1.7456260048565042e-10, 9.261455491405002e-11, 3.928266045782891e-10, -1.7456260048565042e-10, 1.0, -1.5881864712241622e-08], [0.9850640296936035, -0.04082970321178436, 0.16727784276008606, 0.20439539849758148, 0.03502606600522995, 0.9986825585365295, 0.03750050067901611, -0.15185177326202393, -0.16858860850334167, -0.03108130767941475, 0.9851963520050049, 0.10121102631092072], [0.9168016910552979, -0.09482394903898239, 0.387921541929245, 0.44365406036376953, 0.06679922342300415, 0.9941277503967285, 0.08513444662094116, -0.26796984672546387, -0.39371633529663086, -0.0521385557949543, 0.9177521467208862, 0.06753730028867722], [0.7643709182739258, -0.15802493691444397, 0.6251122951507568, 0.6479341983795166, 0.09576083719730377, 0.9865723252296448, 0.13230620324611664, -0.33551496267318726, -0.6376261115074158, -0.04126973822712898, 0.7692397236824036, -0.07632352411746979], [0.5624101758003235, -0.2065712958574295, 0.8006392121315002, 0.8095709085464478, 0.10458854585886002, 0.9782856106758118, 0.17893710732460022, -0.3449772298336029, -0.8202170729637146, -0.016898371279239655, 0.5718027949333191, -0.267223596572876], [0.3860713243484497, -0.23765023052692413, 0.8913311958312988, 1.017128586769104, 0.10842984914779663, 0.9712379574775696, 0.21198998391628265, -0.34939509630203247, -0.9160742163658142, 0.014803661964833736, 0.4007355272769928, -0.5046621561050415], [0.25270596146583557, -0.2560441493988037, 0.9330493807792664, 1.2611535787582397, 0.09987718611955643, 0.9661006331443787, 0.23806333541870117, -0.36809343099594116, -0.9623742699623108, 0.03303031623363495, 0.26971235871315, -0.8164812922477722], [0.12896282970905304, -0.26742202043533325, 0.9549105167388916, 1.4084848165512085, 0.09037449955940247, 0.9621138572692871, 0.25723403692245483, -0.3597045838832855, -0.9875227212905884, 0.05312592536211014, 0.14824506640434265, -1.1886308193206787], 
[-0.037818778306245804, -0.2752484083175659, 0.9606289863586426, 1.3755683898925781, 0.08146493136882782, 0.957267701625824, 0.277492493391037, -0.353816956281662, -0.9959584474563599, 0.08875200152397156, -0.013779602944850922, -1.6891316175460815], [-0.1970304548740387, -0.2752428948879242, 0.9409677982330322, 1.1786020994186401, 0.07280438393354416, 0.9530242681503296, 0.2940141260623932, -0.36850038170814514, -0.9776904582977295, 0.12643630802631378, -0.16773590445518494, -2.261430025100708], [-0.3677733540534973, -0.26668044924736023, 0.8908559679985046, 0.8089978098869324, 0.06243090331554413, 0.9487544894218445, 0.30978602170944214, -0.37012383341789246, -0.9278174042701721, 0.16954797506332397, -0.33227747678756714, -2.8139760494232178], [-0.5150132775306702, -0.2527574300765991, 0.8190696835517883, 0.4462648332118988, 0.050335872918367386, 0.9449706673622131, 0.32325950264930725, -0.381427526473999, -0.8557030558586121, 0.20771150290966034, -0.473949670791626, -3.2886314392089844], [-0.6352092623710632, -0.23657281696796417, 0.7352160215377808, 0.0826139897108078, 0.036419764161109924, 0.9416990280151367, 0.33447936177253723, -0.3719184398651123, -0.771480917930603, 0.23924078047275543, -0.5895600318908691, -3.677316904067993], [-0.7218965888023376, -0.21918001770973206, 0.6563730239868164, -0.2607730031013489, 0.024064550176262856, 0.9399895071983337, 0.340353786945343, -0.359468549489975, -0.6915825009346008, 0.26149556040763855, -0.6733006834983826, -3.9900803565979004], [-0.7831082344055176, -0.2019210308790207, 0.5881916880607605, -0.609000563621521, 0.015634668990969658, 0.9391284584999084, 0.3432103097438812, -0.35386255383491516, -0.621688961982727, 0.2779669761657715, -0.732282280921936, -4.316582202911377], [-0.8348898887634277, -0.1848600059747696, 0.5184455513954163, -0.9689992070198059, 0.007868430577218533, 0.93780916929245, 0.3470619320869446, -0.3402447998523712, -0.5503608584403992, 0.2938378155231476, -0.7815127968788147, 
-4.561704635620117]]
\ No newline at end of file
diff --git a/examples/camera_poses/test_camera_d971457c81bca597.json b/examples/camera_poses/test_camera_d971457c81bca597.json
new file mode 100644
index 0000000000000000000000000000000000000000..3bd9bf5dd6993fd3b7fe6d678595060378a3c997
--- /dev/null
+++ b/examples/camera_poses/test_camera_d971457c81bca597.json
@@ -0,0 +1 @@
+[[1.0, -1.9455232563858615e-11, -8.611562019034125e-10, -1.0473515388298438e-09, -1.9455232563858615e-11, 1.0, 8.674055917978762e-10, 1.8375034827045056e-09, -8.611562019034125e-10, 8.674055917978762e-10, 1.0, 8.45981773522908e-09], [0.9973401427268982, -0.0032562706619501114, 0.07281485944986343, -0.07197967171669006, 0.00329946493729949, 0.9999944567680359, -0.00047293686657212675, -0.0565447174012661, -0.07281292229890823, 0.0007119301008060575, 0.9973453879356384, -0.3188611567020416], [0.9906007051467896, -0.0066957552917301655, 0.13662146031856537, -0.15415722131729126, 0.006810691673308611, 0.9999766945838928, -0.00037385127507150173, -0.10014614462852478, -0.1366157978773117, 0.0013008243404328823, 0.9906232953071594, -0.6077051758766174], [0.9827210903167725, -0.009423106908798218, 0.18485280871391296, -0.23053960502147675, 0.009786692447960377, 0.9999515414237976, -0.0010545575059950352, -0.11272216588258743, -0.18483391404151917, 0.002845433307811618, 0.9827656745910645, -0.8897562623023987], [0.9756309986114502, -0.011318936944007874, 0.2191258817911148, -0.2929331362247467, 0.01199379749596119, 0.9999265670776367, -0.0017497432418167591, -0.10348402708768845, -0.21908999979496002, 0.004335256293416023, 0.9756950736045837, -1.1368639469146729], [0.9685831069946289, -0.012191502377390862, 0.24839124083518982, -0.379209041595459, 0.013217172585427761, 0.9999096393585205, -0.0024619612377136946, -0.12078691273927689, -0.24833877384662628, 0.005667644087225199, 0.9686566591262817, -1.3666095733642578], [0.9615083932876587, -0.012519675306975842, 0.2744903266429901, -0.5242342352867126, 0.013806014321744442, 0.9999009370803833, -0.0027547888457775116, -0.13015854358673096, -0.2744286358356476, 0.006438371259719133, 0.9615859389305115, -1.5762972831726074], [0.9528889656066895, -0.014083069749176502, 0.3029923439025879, -0.6702165007591248, 0.015383570455014706, 0.9998798370361328, -0.0019058430334553123, -0.14860114455223083, -0.3029291331768036, 
0.00647716224193573, 0.9529911279678345, -1.7173497676849365], [0.9457509517669678, -0.015159872360527515, 0.3245387375354767, -0.8001917600631714, 0.016729505732655525, 0.9998579621315002, -0.0020466779824346304, -0.15521110594272614, -0.32446160912513733, 0.007365020923316479, 0.9458702206611633, -1.8571592569351196], [0.9417706727981567, -0.015540024265646935, 0.33589670062065125, -0.9244012236595154, 0.017394419759511948, 0.9998455047607422, -0.0025124624371528625, -0.1416105479001999, -0.3358057737350464, 0.008208892308175564, 0.9418954849243164, -1.9969842433929443], [0.9381415247917175, -0.016401933506131172, 0.34586337208747864, -1.0128529071807861, 0.01851990446448326, 0.9998245239257812, -0.0028197101783007383, -0.11503250896930695, -0.3457564413547516, 0.009050644934177399, 0.9382806420326233, -2.080850124359131], [0.9357938766479492, -0.016873901709914207, 0.3521435558795929, -1.1008079051971436, 0.01865064539015293, 0.9998247027397156, -0.0016533475136384368, -0.048780668526887894, -0.3520539104938507, 0.008114897646009922, 0.9359445571899414, -2.088500499725342], [0.9334627985954285, -0.016492612659931183, 0.3582947850227356, -1.2417148351669312, 0.017931735143065453, 0.9998390078544617, -0.0006939812446944416, 0.01625901833176613, -0.35822567343711853, 0.007072653621435165, 0.9336082339286804, -2.0250792503356934], [0.9281281232833862, -0.016213543713092804, 0.3719078302383423, -1.4365860223770142, 0.01746317930519581, 0.9998475313186646, 8.084020009846427e-06, 0.060201361775398254, -0.37185123562812805, 0.006487190257757902, 0.9282696843147278, -1.8963005542755127], [0.9220678806304932, -0.017285069450736046, 0.3866421580314636, -1.604330062866211, 0.01880805380642414, 0.9998230934143066, -0.00015593231364618987, 0.0799601748585701, -0.38657107949256897, 0.007415766827762127, 0.9222298264503479, -1.7388361692428589], [0.9185494184494019, -0.018356993794441223, 0.3948797583580017, -1.7530856132507324, 0.019847355782985687, 0.9998030066490173, 
0.0003104731731582433, 0.10000917315483093, -0.3948076665401459, 0.007552135270088911, 0.9187328219413757, -1.5337872505187988]]
\ No newline at end of file
diff --git a/gradio_utils/__pycache__/camera_utils.cpython-310.pyc b/gradio_utils/__pycache__/camera_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f5db453cfc93215e681384f6404d951171921fb
Binary files /dev/null and b/gradio_utils/__pycache__/camera_utils.cpython-310.pyc differ
diff --git a/gradio_utils/__pycache__/flow_utils.cpython-310.pyc b/gradio_utils/__pycache__/flow_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49cc8b2dabfbc88d2d79ba34a23b8d9e34f098c6
Binary files /dev/null and b/gradio_utils/__pycache__/flow_utils.cpython-310.pyc differ
diff --git a/gradio_utils/__pycache__/motionctrl_cmcm_gradio.cpython-310.pyc b/gradio_utils/__pycache__/motionctrl_cmcm_gradio.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c6692f926f1a1c0ab32574072ea3c60fd15aefb
Binary files /dev/null and b/gradio_utils/__pycache__/motionctrl_cmcm_gradio.cpython-310.pyc differ
diff --git a/gradio_utils/__pycache__/traj_utils.cpython-310.pyc b/gradio_utils/__pycache__/traj_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65e19d6353c812c3c7c048a33b27b06043fc2823
Binary files /dev/null and b/gradio_utils/__pycache__/traj_utils.cpython-310.pyc differ
diff --git a/gradio_utils/__pycache__/utils.cpython-310.pyc b/gradio_utils/__pycache__/utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5765edbc743f657238644e12c70b32f55c56eaa9
Binary files /dev/null and b/gradio_utils/__pycache__/utils.cpython-310.pyc differ
diff --git a/gradio_utils/camera_utils.py b/gradio_utils/camera_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff3cef2f497f261e0e30e196b3812bb76fc87ce2
--- /dev/null
+++ b/gradio_utils/camera_utils.py
@@ -0,0 +1,157 @@
+import copy
+# import plotly.express as px
+# import plotly.graph_objects as go
+import json
+
+import numpy as np
+
+# Names of the available camera-motion input modes.
+CAMERA_MOTION_MODE = ["Basic Camera Poses", "Provided Complex Camera Poses", "Custom Camera Poses"]
+
+# Basic camera motions. Every named motion holds a per-axis "angle" vector and
+# a translation vector "T"; get_camera_motion() multiplies them by
+# "base_angle" and "base_T_norm" (plus the speed and the frame fraction).
+CAMERA = {
+    # Global scale applied to every translation vector T.
+    "base_T_norm": 1.5,
+    # Global scale applied to every angle vector (passed on to np.cos/np.sin).
+    "base_angle": np.pi/3,
+
+    "Pan Up": {     "angle":[0., 0., 0.],   "T":[0., 1., 0.]},
+    "Pan Down": {   "angle":[0., 0., 0.],   "T":[0.,-1.,0.]},
+    "Pan Left": {   "angle":[0., 0., 0.],   "T":[3.,0.,0.]},
+    "Pan Right": {  "angle":[0., 0., 0.],   "T": [-3.,0.,0.]},
+    "Zoom In": {    "angle":[0., 0., 0.],   "T": [0.,0.,-4.]},
+    "Zoom Out": {   "angle":[0., 0., 0.],   "T": [0.,0.,4.]},
+    "ACW": {        "angle": [0., 0., 1.],  "T":[0., 0., 0.]},
+    "CW": {         "angle": [0., 0., -1.], "T":[0., 0., 0.]},
+}
+
+# Pose name -> JSON file opened by process_camera() when
+# camera_dict['complex'] is set. The paths are relative, so they resolve
+# against the current working directory.
+COMPLEX_CAMERA = {
+    "Pose_1": "examples/camera_poses/test_camera_1424acd0007d40b5.json",
+    "Pose_2": "examples/camera_poses/test_camera_d971457c81bca597.json",
+    "Pose_3": "examples/camera_poses/test_camera_Round-ZoomIn.json",
+    "Pose_4": "examples/camera_poses/test_camera_Round-RI_90.json",
+    "Pose_5": "examples/camera_poses/test_camera_Round-RI-120.json",
+    "Pose_6": "examples/camera_poses/test_camera_018f7907401f2fef.json",
+    "Pose_7": "examples/camera_poses/test_camera_088b93f15ca8745d.json",
+    "Pose_8": "examples/camera_poses/test_camera_b133a504fc90a2d1.json",
+}
+
+
+
+def compute_R_form_rad_angle(angles):
+    """Build a 3x3 rotation matrix from three per-axis angles in radians.
+
+    Args:
+        angles: Sequence ``(theta_x, theta_y, theta_z)`` of rotation angles
+            about the x, y and z axes.
+
+    Returns:
+        ndarray of shape (3, 3) equal to ``Rz . (Ry . Rx)``.
+    """
+    ang_x, ang_y, ang_z = angles
+    cos_x, sin_x = np.cos(ang_x), np.sin(ang_x)
+    cos_y, sin_y = np.cos(ang_y), np.sin(ang_y)
+    cos_z, sin_z = np.cos(ang_z), np.sin(ang_z)
+
+    rot_x = np.array([[1, 0, 0],
+                      [0, cos_x, -sin_x],
+                      [0, sin_x, cos_x]])
+    rot_y = np.array([[cos_y, 0, sin_y],
+                      [0, 1, 0],
+                      [-sin_y, 0, cos_y]])
+    rot_z = np.array([[cos_z, -sin_z, 0],
+                      [sin_z, cos_z, 0],
+                      [0, 0, 1]])
+
+    # Rotation part of the camera extrinsics: x is applied first, then y, then z.
+    return np.dot(rot_z, np.dot(rot_y, rot_x))
+
+def get_camera_motion(angle, T, speed, n=16):
+    """Generate ``n`` camera poses that ramp up linearly from rest.
+
+    Frame ``i`` uses the fraction ``i/n`` of the full motion, so frame 0 has
+    zero angle and translation, and the last frame stops one step short of
+    the full motion.
+
+    Args:
+        angle: ndarray of 3 per-axis angle factors, scaled by
+            ``CAMERA["base_angle"]``.
+        T: ndarray with 3 elements giving the translation direction, scaled
+            by ``CAMERA["base_T_norm"]``.
+        speed: Scalar multiplier applied to both rotation and translation.
+        n: Number of poses to generate.
+
+    Returns:
+        ndarray of shape (n, 3, 4) holding one ``[R|T]`` matrix per frame.
+    """
+    def _pose_at(step):
+        progress = step/n
+        frame_angle = progress*speed*(CAMERA["base_angle"])*angle
+        rotation = compute_R_form_rad_angle(frame_angle)
+        translation = progress*speed*(CAMERA["base_T_norm"])*(T.reshape(3,1))
+        return np.concatenate([rotation, translation], axis=1)
+
+    return np.stack([_pose_at(step) for step in range(n)])
+    
+def create_relative(RT_list, K_1=4.7, dataset="syn"):
+    """Re-express a sequence of ``[R|T]`` poses relative to its first pose.
+
+    For the first pose ``[R0|T0]`` every pose ``[R|T]`` becomes
+    ``[R . R0^T | T - (R . R0^T) . T0]``, so the first result has an identity
+    rotation and a zero translation.
+
+    The input poses are left untouched; each result is a fresh array.
+
+    Args:
+        RT_list: Sequence of (3, 4) ndarrays.
+        K_1: Unused; kept for interface compatibility.
+        dataset: Unused; kept for interface compatibility.
+
+    Returns:
+        list of (3, 4) ndarrays, one per input pose.
+    """
+    reference = np.array(RT_list[0])
+    R_inv = reference[:,:3].T
+    T = reference[:,-1]
+
+    relative = []
+    for pose in RT_list:
+        # Work on a private copy so the caller's array is not overwritten.
+        pose = np.array(pose)
+        pose[:,:3] = np.dot(pose[:,:3], R_inv)
+        # Uses the rotation that was just made relative.
+        pose[:,-1] = pose[:,-1] - np.dot(pose[:,:3], T)
+        relative.append(pose)
+
+    return relative
+    
+def combine_camera_motion(RT_0, RT_1):
+    """Append motion ``RT_1`` after ``RT_0``, continuing from RT_0's last pose.
+
+    With ``[R|T]`` the last pose of ``RT_0``, every pose ``[R1|T1]`` of
+    ``RT_1`` becomes ``[R1 . R | T1 + (R1 . R . R^T) . T]``.
+
+    Neither input is modified; the shifted poses are computed on copies.
+
+    Args:
+        RT_0: ndarray of shape (n0, 3, 4), the first motion.
+        RT_1: ndarray (or sequence) of (3, 4) poses, the second motion.
+
+    Returns:
+        ndarray of shape (n0 + n1, 3, 4).
+    """
+    last_pose = np.array(RT_0[-1])
+    R = last_pose[:,:3]
+    R_inv = R.T
+    T = last_pose[:,-1]
+
+    shifted = []
+    for pose in RT_1:
+        # Private copy: the caller's RT_1 must stay as it was passed in.
+        pose = np.array(pose)
+        pose[:,:3] = np.dot(pose[:,:3], R)
+        pose[:,-1] = pose[:,-1] + np.dot(np.dot(pose[:,:3], R_inv), T)
+        shifted.append(pose)
+
+    return np.concatenate([RT_0, np.stack(shifted)], axis=0)
+
+def process_camera(camera_dict, camera_args, num_frames=16, width=256, height=256):
+    """Turn a camera-control request into per-frame ``[R|T]`` matrices.
+
+    Sources are tried in this order:
+      1. raw poses typed by the user (mode 'Customized Mode 3: RAW Camera Poses'),
+      2. a provided pose file (``camera_dict['complex']`` is not None),
+      3. zero, one or two basic motions from ``CAMERA``.
+
+    Args:
+        camera_dict: Mapping with keys ``'speed'``, ``'motion'`` (list of
+            ``CAMERA`` motion names) and ``'mode'``; ``'complex'`` (a
+            ``COMPLEX_CAMERA`` key or None) is read unless raw mode is active.
+        camera_args: Whitespace-separated floats, 12 per frame; only read in
+            raw mode.
+        num_frames: Number of frames to produce.
+        width: Unused; kept for interface compatibility.
+        height: Unused; kept for interface compatibility.
+
+    Returns:
+        ndarray of shape (frames, 3, 4).
+
+    Raises:
+        AssertionError: In raw mode, if the number of values is not
+            ``num_frames * 12``.
+        ValueError: If the motions/mode combination is not supported.
+    """
+    speed = camera_dict['speed']
+    motion_list = camera_dict['motion']
+    mode = camera_dict['mode']
+
+    if mode == 'Customized Mode 3: RAW Camera Poses':
+        RT = camera_args.strip().split()
+        assert(len(RT) == num_frames*12), "The number of camera poses should be equal to the number of frames"
+        RT = np.array([float(x) for x in RT]).reshape(-1, 3, 4)
+        # Per-axis rescaling of the translation column.
+        RT[:, :, -1] = RT[:, :, -1] * np.array([1.5, 1, 1.3]) * speed
+        return RT
+
+    if camera_dict['complex'] is not None:
+        with open(COMPLEX_CAMERA[camera_dict['complex']]) as f:
+            RT = json.load(f) # [16, 12]
+        if num_frames < len(RT):
+            # Keep the centred window of num_frames poses.
+            half = (len(RT) - num_frames) // 2
+            RT = RT[half:half+num_frames]
+        RT = np.array(RT).reshape(-1, 3, 4)
+        RT[:, :, -1] = RT[:, :, -1] * np.array([1.5, 1, 1.3]) * speed
+        return RT
+
+    def _basic_motion(name):
+        # (angle, T) arrays of one named basic motion.
+        return np.array(CAMERA[name]["angle"]), np.array(CAMERA[name]["T"])
+
+    if len(motion_list) == 0:
+        # No motion selected: a static camera.
+        return get_camera_motion(np.array([0,0,0]), np.array([0,0,0]), speed, num_frames)
+
+    if len(motion_list) == 1:
+        angle, T = _basic_motion(motion_list[0])
+        return get_camera_motion(angle, T, speed, num_frames)
+
+    if len(motion_list) == 2 and mode == "Customized Mode 1: First A then B":
+        # First half of the frames follows A, the remainder follows B.
+        half_num_frames = num_frames//2
+        angle, T = _basic_motion(motion_list[0])
+        RT_0 = get_camera_motion(angle, T, speed, half_num_frames)
+        angle, T = _basic_motion(motion_list[1])
+        RT_1 = get_camera_motion(angle, T, speed, num_frames-half_num_frames)
+        return combine_camera_motion(RT_0, RT_1)
+
+    if len(motion_list) == 2 and mode == "Customized Mode 2: Both A and B":
+        angle_a, T_a = _basic_motion(motion_list[0])
+        angle_b, T_b = _basic_motion(motion_list[1])
+        return get_camera_motion(angle_a + angle_b, T_a + T_b, speed, num_frames)
+
+    # Previously this case fell through to `return RT` with RT never assigned.
+    raise ValueError(
+        f"Unsupported camera request: {len(motion_list)} motion(s) with mode {mode!r}"
+    )
+
diff --git a/gradio_utils/flow_utils.py b/gradio_utils/flow_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a86b38a3f555650fe6dd0f6d504bd6d5bbbf933
--- /dev/null
+++ b/gradio_utils/flow_utils.py
@@ -0,0 +1,69 @@
+import numpy as np
+
+
+def sigma_matrix2(sig_x, sig_y, theta):
+    """Return the 2x2 covariance matrix of a rotated bivariate Gaussian.
+
+    Args:
+        sig_x (float): Standard deviation along the first principal axis.
+        sig_y (float): Standard deviation along the second principal axis.
+        theta (float): Rotation angle in radians.
+
+    Returns:
+        ndarray: ``U . diag(sig_x**2, sig_y**2) . U^T`` with ``U`` the 2D
+            rotation by ``theta``.
+    """
+    cos_t, sin_t = np.cos(theta), np.sin(theta)
+    variances = np.array([[sig_x**2, 0], [0, sig_y**2]])
+    rotation = np.array([[cos_t, -sin_t], [sin_t, cos_t]])
+    return np.dot(rotation, np.dot(variances, rotation.T))
+
+
+def mesh_grid(kernel_size):
+    """Build a square coordinate grid centred at zero.
+
+    Args:
+        kernel_size (int): Number of samples per side.
+
+    Returns:
+        xy (ndarray): (kernel_size, kernel_size, 2), the (x, y) pair per cell.
+        xx (ndarray): (kernel_size, kernel_size), x coordinate per cell.
+        yy (ndarray): (kernel_size, kernel_size), y coordinate per cell.
+    """
+    coords = np.arange(-kernel_size // 2 + 1., kernel_size // 2 + 1.)
+    xx, yy = np.meshgrid(coords, coords)
+    # Pair the two coordinate planes along a new trailing axis.
+    xy = np.stack((xx, yy), axis=-1)
+    return xy, xx, yy
+
+
+def pdf2(sigma_matrix, grid):
+    """Evaluate an un-normalized bivariate Gaussian on a coordinate grid.
+
+    Args:
+        sigma_matrix (ndarray): Covariance matrix with the shape (2, 2).
+        grid (ndarray): Coordinates with the shape (K, K, 2), e.g. the first
+            value returned by :func:`mesh_grid`.
+
+    Returns:
+        kernel (ndarray): ``exp(-0.5 * x^T . inv(sigma) . x)`` for every grid
+            point ``x``; not normalized.
+    """
+    precision = np.linalg.inv(sigma_matrix)
+    # Quadratic form x^T . precision . x, reduced over the coordinate axis.
+    quad_form = np.sum(np.dot(grid, precision) * grid, 2)
+    return np.exp(-0.5 * quad_form)
+
+def bivariate_Gaussian(kernel_size, sig_x, sig_y, theta, grid=None, isotropic=True):
+    """Create a normalized isotropic or anisotropic bivariate Gaussian kernel.
+
+    In isotropic mode only `sig_x` is used; `sig_y` and `theta` are ignored.
+
+    Args:
+        kernel_size (int): Side length of the kernel; only used when `grid`
+            is None.
+        sig_x (float): Standard deviation along x.
+        sig_y (float): Standard deviation along y (anisotropic mode only).
+        theta (float): Rotation in radians (anisotropic mode only).
+        grid (ndarray, optional): Coordinates with the shape (K, K, 2), as
+            produced by :func:`mesh_grid`. Default: None
+        isotropic (bool): Selects the isotropic covariance when True.
+
+    Returns:
+        kernel (ndarray): Kernel whose entries sum to 1.
+    """
+    if grid is None:
+        grid = mesh_grid(kernel_size)[0]
+
+    if isotropic:
+        covariance = np.array([[sig_x**2, 0], [0, sig_x**2]])
+    else:
+        covariance = sigma_matrix2(sig_x, sig_y, theta)
+
+    unnormalized = pdf2(covariance, grid)
+    return unnormalized / np.sum(unnormalized)
diff --git a/gradio_utils/motionctrl_cmcm_gradio.py b/gradio_utils/motionctrl_cmcm_gradio.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ca1cc47162f30f111df9408e7a1e778e4ebd050
--- /dev/null
+++ b/gradio_utils/motionctrl_cmcm_gradio.py
@@ -0,0 +1,276 @@
+import argparse
+import datetime
+import json
+import math
+import os
+import sys
+import time
+from glob import glob
+from pathlib import Path
+from typing import Optional
+
+import cv2
+import numpy as np
+import torch
+import torchvision
+from einops import rearrange, repeat
+from fire import Fire
+from omegaconf import OmegaConf
+from PIL import Image
+from torchvision.transforms import CenterCrop, Compose, Resize, ToTensor
+import tempfile
+
+sys.path.insert(1, os.path.join(sys.path[0], '..'))
+from sgm.util import default, instantiate_from_config
+
+
+
+def to_relative_RT2(org_pose, keyframe_idx=0, keyframe_zero=False):
+    """Express every ``[R|T]`` pose relative to the pose at `keyframe_idx`.
+
+    Args:
+        org_pose: ndarray reshapeable to [t, 3, 4].
+        keyframe_idx: Index of the reference pose.
+        keyframe_zero: When True, the reference row of the result is
+            replaced by zeros.
+
+    Returns:
+        ndarray of shape [t, 12]; row ``i`` is the flattened
+        ``[R_i . R_key^T | T_i - (R_i . R_key^T) . T_key]``.
+    """
+    poses = org_pose.reshape(-1, 3, 4) # [t, 3, 4]
+    rotations = poses[:, :, :3]        # [t, 3, 3]
+    translations = poses[:, :, 3:]     # [t, 3, 1]
+
+    # Keep the leading axis (length 1) so the key pose broadcasts over t.
+    key_R_inv = rotations[keyframe_idx: keyframe_idx+1].transpose(0, 2, 1) # [1, 3, 3]
+    key_T = translations[keyframe_idx: keyframe_idx+1]                     # [1, 3, 1]
+
+    rel_R = rotations @ key_R_inv          # [t, 3, 3]
+    rel_T = translations - rel_R @ key_T   # [t, 3, 1]
+
+    rel = np.concatenate([rel_R, rel_T], axis=-1).reshape(-1, 12) # [t, 12]
+
+    if keyframe_zero:
+        rel[keyframe_idx] = np.zeros_like(rel[keyframe_idx])
+
+    return rel
+
+def build_model(config, ckpt, device, num_frames, num_steps):
+    """Load the model through :func:`load_model` and return it in eval mode.
+
+    Each of `config`, `num_frames` and `num_steps` is passed through
+    ``default`` together with a fallback value
+    ("configs/inference/config_motionctrl_cmcm.yaml", 14 and 25).
+
+    Args:
+        config: Path of the model config.
+        ckpt: Checkpoint path.
+        device: Device the model is moved to.
+        num_frames: Number of video frames.
+        num_steps: Number of sampler steps.
+
+    Returns:
+        The instantiated model.
+    """
+    frames = default(num_frames, 14)
+    steps = default(num_steps, 25)
+    config_path = default(config, "configs/inference/config_motionctrl_cmcm.yaml")
+
+    print(f"Loading model from {ckpt}")
+    # The second return value of load_model is not used here.
+    model, _ = load_model(config_path, ckpt, device, frames, steps)
+
+    model.eval()
+
+    return model
+
+def motionctrl_sample(
+    model,
+    image: Image = None,  # Conditioning image; its .size and .resize() are used
+    RT: np.ndarray = None,
+    num_frames: Optional[int] = None,
+    fps_id: int = 6,
+    motion_bucket_id: int = 127,
+    cond_aug: float = 0.02,
+    seed: int = 23,
+    decoding_t: int = 1,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
+    save_fps: int = 10,
+    sample_num: int = 1,
+    device: str = "cuda",
+):
+    """Sample video(s) conditioned on `image` and the camera poses `RT`.
+
+    The poses are made relative to the first frame, `sample_num` videos are
+    sampled from the model, and the result is written as one mp4 file. If
+    you run out of VRAM, try decreasing `decoding_t`.
+
+    Args:
+        model: Model providing ``conditioner``, ``denoiser``, ``model``,
+            ``sampler`` and ``decode_first_stage``.
+        image: Conditioning image. If a side is not a multiple of 64, the
+            image is resized down to the nearest multiple.
+        RT: Camera poses, reshapeable to [t, 3, 4].
+        num_frames: Number of frames to sample.
+        fps_id: Conditioning value; a warning is printed outside [5, 30].
+        motion_bucket_id: Conditioning value; a warning is printed above 255.
+        cond_aug: Scale of the Gaussian noise added to the conditioning frame.
+        seed: Seed passed to ``torch.manual_seed``.
+        decoding_t: Assigned to ``model.en_and_decode_n_samples_a_time``.
+        save_fps: Frame rate of the written video.
+        sample_num: Number of videos sampled; they are written side by side.
+        device: Device string, also passed to ``torch.autocast``.
+
+    Returns:
+        str: Path of the written ``.mp4`` file. The file is not removed
+        automatically.
+    """
+
+    torch.manual_seed(seed)
+
+    w, h = image.size
+
+    RT = to_relative_RT2(RT) # [t, 12]
+    RT = torch.tensor(RT).float().to(device) # [t, 12]
+    # Batch of 2 -- presumably the conditional and unconditional branches
+    # (image_only_indicator below uses the same batch size); TODO confirm.
+    RT = RT.unsqueeze(0).repeat(2,1,1)
+
+    if h % 64 != 0 or w % 64 != 0:
+        width, height = map(lambda x: x - x % 64, (w, h))
+        image = image.resize((width, height))
+        print(
+            f"WARNING: Your image is of size {h}x{w} which is not divisible by 64. We are resizing to {height}x{width}!"
+        )
+
+    image = ToTensor()(image)
+    # Rescale from [0, 1] to [-1, 1].
+    image = image * 2.0 - 1.0
+
+    image = image.unsqueeze(0).to(device)
+    H, W = image.shape[2:]
+    assert image.shape[1] == 3
+    F = 8
+    C = 4
+    # Shape of the sampled latent: 4 channels at 1/8 of the image resolution.
+    shape = (num_frames, C, H // F, W // F)
+
+    if motion_bucket_id > 255:
+        print(
+            "WARNING: High motion bucket! This may lead to suboptimal performance."
+        )
+
+    if fps_id < 5:
+        print("WARNING: Small fps value! This may lead to suboptimal performance.")
+
+    if fps_id > 30:
+        print("WARNING: Large fps value! This may lead to suboptimal performance.")
+
+    value_dict = {}
+    value_dict["motion_bucket_id"] = motion_bucket_id
+    value_dict["fps_id"] = fps_id
+    value_dict["cond_aug"] = cond_aug
+    value_dict["cond_frames_without_noise"] = image
+    value_dict["cond_frames"] = image + cond_aug * torch.randn_like(image)
+
+    with torch.no_grad():
+        with torch.autocast(device):
+            batch, batch_uc = get_batch(
+                get_unique_embedder_keys_from_conditioner(model.conditioner),
+                value_dict,
+                [1, num_frames],
+                T=num_frames,
+                device=device,
+            )
+            c, uc = model.conditioner.get_unconditional_conditioning(
+                batch,
+                batch_uc=batch_uc,
+                force_uc_zero_embeddings=[
+                    "cond_frames",
+                    "cond_frames_without_noise",
+                ],
+            )
+
+            # Repeat the per-image conditioning once per frame, then fold the
+            # frame axis into the batch axis.
+            for k in ["crossattn", "concat"]:
+                uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
+                uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
+                c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
+                c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
+
+            additional_model_inputs = {}
+            additional_model_inputs["image_only_indicator"] = torch.zeros(
+                2, num_frames
+            ).to(device)
+            additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
+            additional_model_inputs["RT"] = RT.clone()
+
+            def denoiser(input, sigma, c):
+                return model.denoiser(
+                    model.model, input, sigma, c, **additional_model_inputs
+                )
+
+            results = []
+            for _ in range(sample_num):
+                randn = torch.randn(shape, device=device)
+                samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
+                model.en_and_decode_n_samples_a_time = decoding_t
+                samples_x = model.decode_first_stage(samples_z)
+                # Map from [-1, 1] back to [0, 1].
+                samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0) # [1*t, c, h, w]
+                results.append(samples)
+
+            samples = torch.stack(results, dim=0) # [sample_num, t, c, h, w]
+            samples = samples.data.cpu()
+
+            # Reserve the output path explicitly. delete=False keeps the file
+            # after the handle is closed, so the name is not released (and
+            # possibly reused) before the video is written.
+            with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_file:
+                video_path = tmp_file.name
+            save_results(samples, video_path, fps=save_fps)
+
+    return video_path
+
+def save_results(resutls, filename, fps=10):
+    """Write sampled videos to `filename`, one grid image per frame.
+
+    Args:
+        resutls: Tensor [sample_num, t, c, h, w] with values already in [0, 1].
+        filename: Output video path.
+        fps: Frame rate of the written video.
+    """
+    per_frame = resutls.permute(1, 0, 2, 3, 4) # [t, sample_num, c, h, w]
+    samples_per_row = int(per_frame.shape[1])
+
+    # nrow equals the number of samples, so each grid is a single row.
+    grids = []
+    for frame_batch in per_frame:
+        grids.append(torchvision.utils.make_grid(frame_batch, nrow=samples_per_row))
+    video = torch.stack(grids, dim=0) # stacked along the temporal dim
+
+    # Scale to 8-bit and move channels last: [t, H, W, 3].
+    video = (video * 255).to(torch.uint8).permute(0, 2, 3, 1)
+    torchvision.io.write_video(filename, video, fps=fps, video_codec='h264', options={'crf': '10'})
+
+def get_unique_embedder_keys_from_conditioner(conditioner):
+    """Return the distinct ``input_key`` values of ``conditioner.embedders``.
+
+    The result is a list built from a set, so its order is unspecified.
+    """
+    unique_keys = {embedder.input_key for embedder in conditioner.embedders}
+    return list(unique_keys)
+
+
+def get_batch(keys, value_dict, N, T, device):
+    """Assemble the conditioning batch and its unconditional counterpart.
+
+    Args:
+        keys: Names of the entries to build.
+        value_dict: Source values, looked up by key.
+        N: Sequence of batch dimensions. Scalar entries are tiled
+            ``prod(N)`` times; frame entries are tiled ``N[0]`` times.
+        T: If not None, stored as ``batch["num_video_frames"]``.
+        device: Device the scalar tensors are moved to.
+
+    Returns:
+        (batch, batch_uc): ``batch_uc`` holds a clone of every tensor-valued
+        entry of ``batch``.
+    """
+    def _tiled_scalar(name):
+        # 1-element tensor repeated once per item of the flattened batch.
+        return (
+            torch.tensor([value_dict[name]])
+            .to(device)
+            .repeat(int(math.prod(N)))
+        )
+
+    batch = {}
+    for key in keys:
+        if key in ("fps_id", "motion_bucket_id"):
+            batch[key] = _tiled_scalar(key)
+        elif key == "cond_aug":
+            batch[key] = repeat(
+                torch.tensor([value_dict["cond_aug"]]).to(device),
+                "1 -> b",
+                b=math.prod(N),
+            )
+        elif key in ("cond_frames", "cond_frames_without_noise"):
+            batch[key] = repeat(value_dict[key], "1 ... -> b ...", b=N[0])
+        else:
+            # Any other key is passed through unchanged.
+            batch[key] = value_dict[key]
+
+    if T is not None:
+        batch["num_video_frames"] = T
+
+    # Non-tensor entries (e.g. num_video_frames) are not copied over.
+    batch_uc = {
+        name: torch.clone(value)
+        for name, value in batch.items()
+        if isinstance(value, torch.Tensor)
+    }
+    return batch, batch_uc
+
+
+def load_model(
+    config: str,
+    ckpt: str,
+    device: str,
+    num_frames: int,
+    num_steps: int,
+):
+    """Instantiate the model described by `config` and move it to `device`.
+
+    The loaded config is patched with the checkpoint path, the sampler step
+    count and the guider frame count before instantiation.
+
+    Returns:
+        (model, None): the model in eval mode; the second slot is a
+        placeholder that is always None.
+    """
+    cfg = OmegaConf.load(config)
+    params = cfg.model.params
+    params.ckpt_path = ckpt
+
+    if device == "cuda":
+        first_embedder = params.conditioner_config.params.emb_models[0]
+        first_embedder.params.open_clip_embedding_config.params.init_device = device
+
+    sampler_params = params.sampler_config.params
+    sampler_params.num_steps = num_steps
+    sampler_params.guider_config.params.num_frames = num_frames
+
+    model = instantiate_from_config(cfg.model)
+    model = model.to(device).eval()
+
+    # No data filter is configured (the DeepFloydDataFiltering call is disabled).
+    return model, None
+
diff --git a/gradio_utils/traj_utils.py b/gradio_utils/traj_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb0513c915a675c89347c1e461b5b8c461003155
--- /dev/null
+++ b/gradio_utils/traj_utils.py
@@ -0,0 +1,104 @@
+import cv2
+import numpy as np
+
+from gradio_utils.flow_utils import bivariate_Gaussian
+
+# Names of the available object-motion input modes.
+OBJECT_MOTION_MODE = ["Provided Trajectory", "Custom Trajectory"]
+
+# Trajectory name -> text file parsed by read_points() (one "x,y" pair per
+# line). The paths are relative, so they resolve against the current working
+# directory.
+# NOTE(review): several names differ from their file names ("horizon_1" ->
+# horizon_2.txt, "swaying_*" -> shake*/shaking* files) -- confirm intended.
+PROVIDED_TRAJS = {
+    "horizon_1": "examples/trajectories/horizon_2.txt",
+    "swaying_1": "examples/trajectories/shake_1.txt",
+    "swaying_2": "examples/trajectories/shake_2.txt",
+    "swaying_3": "examples/trajectories/shaking_10.txt",
+    "curve_1": "examples/trajectories/curve_1.txt",
+    "curve_2": "examples/trajectories/curve_2.txt",
+    "curve_3": "examples/trajectories/curve_3.txt",
+    "curve_4": "examples/trajectories/curve_4.txt",
+}
+
+
+def read_points(file, video_len=16, reverse=False):
+    """Read a trajectory file and subsample it to at most `video_len` points.
+
+    Each non-blank line of the file must be ``x,y`` with integer values.
+    Blank lines (for example a trailing empty line) are skipped.
+
+    Args:
+        file: Path of the trajectory text file.
+        video_len: Maximum number of points returned.
+        reverse: When True, the points are reversed before subsampling.
+
+    Returns:
+        list of ``(x, y)`` integer tuples, at most `video_len` long.
+
+    Raises:
+        ValueError: If a non-blank line is not two comma-separated integers.
+    """
+    points = []
+    with open(file, 'r') as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                # Nothing to parse; a blank line used to raise ValueError.
+                continue
+            x, y = line.split(',')
+            points.append((int(x), int(y)))
+    if reverse:
+        points = points[::-1]
+
+    if len(points) > video_len:
+        # Keep every skip-th point, then trim to the requested length.
+        skip = len(points) // video_len
+        points = points[::skip]
+    points = points[:video_len]
+
+    return points
+
+def get_provided_traj(traj_name):
+    """Load a provided trajectory and scale it by 1024/256.
+
+    Args:
+        traj_name: Key of ``PROVIDED_TRAJS``.
+
+    Returns:
+        list of ``[x, y]`` integer pairs.
+    """
+    base_points = read_points(PROVIDED_TRAJS[traj_name])
+    # Rescale the coordinates from a 256 range to a 1024 range.
+    return [[int(1024*x/256), int(1024*y/256)] for x, y in base_points]
+
+# 99x99 isotropic Gaussian kernel (sig_x = 10), built once at import time;
+# get_flow() filters every frame of the sparse flow field with it.
+blur_kernel = bivariate_Gaussian(99, 10, 10, 0, grid=None, isotropic=True)
+
+def process_points(points):
+    """Resample a trajectory to exactly 16 points.
+
+    * Fewer than 2 points: 16 copies of ``[512, 512]`` (independent lists).
+    * 16 or more points: every ``len(points)//16``-th point, truncated to 15,
+      followed by the last input point.
+    * Otherwise: linearly interpolated points are inserted between
+      neighbours, with the earlier segments receiving the extra points when
+      the split is uneven.
+
+    Args:
+        points: list of ``[x, y]`` pairs.
+
+    Returns:
+        list of 16 points.
+    """
+    frames = 16
+
+    if len(points) < 2:
+        # Build a fresh inner list per frame; `[[512, 512]] * 16` would hand
+        # out 16 references to one shared list.
+        return [[512, 512] for _ in range(frames)]
+
+    if len(points) >= frames:
+        skip = len(points)//frames
+        return points[::skip][:frames - 1] + points[-1:]
+
+    segments = len(points) - 1
+    base, extra = divmod(frames - len(points), segments)
+
+    res = []
+    for i in range(segments):
+        # The first `extra` segments take one additional inserted point.
+        count = base + 1 if i < extra else base
+        x0,y0 = points[i]
+        x1,y1 = points[i+1]
+        delta_x = x1 - x0
+        delta_y = y1 - y0
+
+        insert_points = []
+        for j in range(count):
+            x = x0 + (j+1)/(count+1)*delta_x
+            y = y0 + (j+1)/(count+1)*delta_y
+            insert_points.append([int(x), int(y)])
+
+        res += points[i:i+1] + insert_points
+    res += points[-1:]
+    return res
+
+def get_flow(points, video_len=16):
+    """Rasterize a point trajectory into a blurred per-frame flow field.
+
+    For every frame ``i >= 1`` the displacement from point ``i-1`` to point
+    ``i`` is written at the pixel of point ``i-1``, and the frame is then
+    filtered with ``blur_kernel``. Frame 0 stays all zeros.
+
+    Args:
+        points: Sequence of at least `video_len` ``(x, y)`` integer pixel
+            coordinates; they are used as indices into a 256x256 grid.
+        video_len: Number of frames.
+
+    Returns:
+        float32 ndarray of shape (video_len, 256, 256, 2); the last axis is
+        (dx, dy).
+    """
+    flow = np.zeros((video_len, 256, 256, 2), dtype=np.float32)
+    for frame in range(1, video_len):
+        prev_pt = points[frame-1]
+        next_pt = points[frame]
+        row, col = prev_pt[1], prev_pt[0]
+        flow[frame, row, col, 0] = next_pt[0] - prev_pt[0]
+        flow[frame, row, col, 1] = next_pt[1] - prev_pt[1]
+        # Spread the single non-zero pixel with the Gaussian kernel.
+        flow[frame] = cv2.filter2D(flow[frame], -1, blur_kernel)
+
+    return flow
+
+
+def process_traj(points, device='cpu'):
+    """Convert a trajectory given in a 1024 coordinate range into a flow field.
+
+    Args:
+        points: list of ``[x, y]`` pairs.
+        device: Unused; kept for interface compatibility.
+
+    Returns:
+        The ndarray produced by :func:`get_flow` for the resampled points.
+    """
+    xy_range = 1024
+    resampled = process_points(points)
+    # Scale the coordinates down to the 256x256 grid used by get_flow.
+    scaled = [[int(256*x/xy_range), int(256*y/xy_range)] for x, y in resampled]
+    return get_flow(scaled)
\ No newline at end of file
diff --git a/gradio_utils/utils.py b/gradio_utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d783d0a7ed9791e7ff2c4f002282f8061e00348
--- /dev/null
+++ b/gradio_utils/utils.py
@@ -0,0 +1,175 @@
+import numpy as np
+import plotly.express as px
+import plotly.graph_objects as go
+
+def vis_camera(RT_list, rescale_T=1):
+    """Draw one wireframe camera cone per pose in a plotly 3D figure.
+
+    Args:
+        RT_list: Sequence of (3, 4) ``[R|T]`` ndarrays.
+        rescale_T: Divisor applied to every translation before drawing.
+
+    Returns:
+        The plotly figure.
+    """
+    scene_bounds = 2
+    fov_deg = 50.0
+    # Vertex index pairs joined by a line (vertex 0 is the camera centre).
+    wire_edges = [(0, 1), (0, 2), (0, 3), (1, 2), (2, 3), (3, 1), (3, 4)]
+
+    def _axis():
+        # Identical settings for the x, y and z axes.
+        return dict(
+            range=[-scene_bounds, scene_bounds],
+            showticklabels=True,
+            visible=True,
+        )
+
+    n = len(RT_list)
+    cone_list = [
+        (
+            calc_cam_cone_pts_3d(RT[:,:3], RT[:,-1]/rescale_T, fov_deg),
+            (i*1/n, "green"),
+            f"view_{i}",
+        )
+        for i, RT in enumerate(RT_list)
+    ]
+
+    fig = go.Figure()
+    for cone, clr, legend in cone_list:
+        for edge_idx, (start, end) in enumerate(wire_edges):
+            fig.add_trace(go.Scatter3d(
+                x=[cone[start, 0], cone[end, 0]],
+                y=[cone[start, 1], cone[end, 1]],
+                z=[cone[start, 2], cone[end, 2]],
+                mode='lines',
+                line=dict(color=clr, width=3),
+                # Only the first edge of each cone gets a legend entry.
+                name=legend, showlegend=(edge_idx == 0)))
+
+    fig.update_layout(
+        height=500,
+        autosize=True,
+        margin=go.layout.Margin(l=0, r=0, b=0, t=0),
+        showlegend=True,
+        legend=dict(
+            yanchor='bottom',
+            y=0.01,
+            xanchor='right',
+            x=0.99,
+        ),
+        scene=dict(
+            aspectmode='manual',
+            aspectratio=dict(x=1, y=1, z=1.0),
+            camera=dict(
+                center=dict(x=0.0, y=0.0, z=0.0),
+                up=dict(x=0.0, y=-1.0, z=0.0),
+                eye=dict(x=scene_bounds/2, y=-scene_bounds/2, z=-scene_bounds/2),
+            ),
+            xaxis=_axis(),
+            yaxis=_axis(),
+            zaxis=_axis(),
+        ))
+    return fig
+
+
+def calc_cam_cone_pts_3d(R_W2C, T_W2C, fov_deg, scale=0.1, set_canonical=False, first_frame_RT=None):
+    """Compute the 5 vertices of a camera wireframe.
+
+    Every vertex ``p`` is mapped with ``inv(R_W2C) . (p - T_W2C)``.
+
+    Args:
+        R_W2C: (3, 3) rotation matrix.
+        T_W2C: Translation vector with 3 elements.
+        fov_deg: Field of view in degrees; sets the width of the wireframe.
+        scale: Size multiplier of the four corner points.
+        set_canonical: When True, the first vertex starts from the
+            translation of `first_frame_RT` instead of the origin.
+        first_frame_RT: (3, 4) pose, only read when `set_canonical` is True.
+
+    Returns:
+        ndarray of shape (5, 3): row 0 is the camera centre, rows 1-4 are
+        the corner points.
+    """
+    half_tan = np.tan(np.deg2rad(fov_deg) / 2.0)
+    R_W2C_inv = np.linalg.inv(R_W2C)
+
+    # Camera pose center:
+    if set_canonical:
+        start = np.zeros_like(T_W2C)
+        start = np.dot(first_frame_RT[:,:3], start) + first_frame_RT[:,-1]
+        start = start - T_W2C
+    else:
+        start = np.zeros_like(T_W2C) - T_W2C
+    vertices = [np.dot(R_W2C_inv, start)]
+
+    # Corner points at depth 1, before scaling.
+    corner_dirs = [
+        [half_tan, 0.5*half_tan, 1.0],
+        [-half_tan, 0.5*half_tan, 1.0],
+        [0, -0.25*half_tan, 1.0],
+        [0, -0.5*half_tan, 1.0],
+    ]
+    for direction in corner_dirs:
+        corner = np.array(direction) * scale
+        vertices.append(np.dot(R_W2C_inv, corner - T_W2C))
+
+    # One coordinate list per axis, then transpose to one row per vertex.
+    xs, ys, zs = ([vertex[axis] for vertex in vertices] for axis in range(3))
+    return np.array([xs, ys, zs]).T
+
+
+
+    # T_base = [
+    #             [1.,0.,0.],             ## W2C +x direction: camera moves left
+    #             [-1.,0.,0.],            ## W2C -x direction: camera moves right
+    #             [0., 1., 0.],           ## W2C +y direction: camera moves up
+    #             [0.,-1.,0.],            ## W2C -y direction: camera moves down
+    #             [0.,0.,1.],             ## W2C +z direction: zoom out
+    #             [0.,0.,-1.],            ## W2C -z direction: camera moves forward (zoom in)
+    #         ]   
+    # radius = 1
+    # n = 16
+    # # step = 
+    # look_at = np.array([0, 0, 0.8]).reshape(3,1)
+    # # look_at = np.array([0, 0, 0.2]).reshape(3,1)
+
+    # T_list = []
+    # base_R = np.array([[1., 0., 0.],
+    #                 [0., 1., 0.],
+    #                 [0., 0., 1.]])
+    # res = [] 
+    # res_forsave = []
+    # T_range = 1.8
+
+
+
+    # for i in range(0, 16):
+    #     # theta = (1)*np.pi*i/n
+
+    #     R = base_R[:,:3]
+    #     T = np.array([0.,0.,1.]).reshape(3,1) * (i/n)*2
+    #     RT = np.concatenate([R,T], axis=1)
+    #     res.append(RT)
+        
+    # fig = vis_camera(res)
+    
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e03c1c5b617b5efc9590a6815fcbb01202026f9
--- /dev/null
+++ b/main.py
@@ -0,0 +1,943 @@
+import argparse
+import datetime
+import glob
+import inspect
+import os
+import sys
+from inspect import Parameter
+from typing import Union
+
+import numpy as np
+import pytorch_lightning as pl
+import torch
+import torchvision
+import wandb
+from matplotlib import pyplot as plt
+from natsort import natsorted
+from omegaconf import OmegaConf
+from packaging import version
+from PIL import Image
+from pytorch_lightning import seed_everything
+from pytorch_lightning.callbacks import Callback
+from pytorch_lightning.loggers import WandbLogger
+from pytorch_lightning.trainer import Trainer
+from pytorch_lightning.utilities import rank_zero_only
+
+from sgm.util import exists, instantiate_from_config, isheatmap
+
+# Switch for the multi-node workarounds in this file. When True:
+#   * SetupCallback.on_fit_start sleeps 5 s on rank 0 before saving the configs,
+#   * non-zero ranks do not move the log directory to "child_runs",
+#   * the main script prints host / device details when a RuntimeError escapes.
+MULTINODE_HACKS = True
+
+
+def default_trainer_args():
+    """Return the default value of every ``Trainer.__init__`` keyword.
+
+    The result maps parameter name -> default value and is used both to
+    register one ``--<name>`` command-line option per Trainer argument and to
+    detect which of those options the user actually changed.
+
+    ``self`` and any parameter that has no default are left out.  (The
+    previous filter compared the ``Parameter`` object itself with
+    ``Parameter.empty``, which is never equal, so nothing was filtered.)
+    """
+    signature = inspect.signature(Trainer.__init__)
+    return {
+        name: param.default
+        for name, param in signature.parameters.items()
+        if name != "self" and param.default is not Parameter.empty
+    }
+
+
+def get_parser(**parser_kwargs):
+    """Build the command-line parser of the training script.
+
+    Args:
+        **parser_kwargs: forwarded unchanged to ``argparse.ArgumentParser``.
+
+    Returns:
+        An ``argparse.ArgumentParser`` that knows the script's own options
+        plus one ``--<name>`` option for every entry returned by
+        ``default_trainer_args()`` (defaulting to the Trainer's own default).
+    """
+
+    def str2bool(v):
+        """Convert a yes/no style token to ``bool``; real bools pass through."""
+        if isinstance(v, bool):
+            return v
+        if v.lower() in ("yes", "true", "t", "y", "1"):
+            return True
+        elif v.lower() in ("no", "false", "f", "n", "0"):
+            return False
+        else:
+            raise argparse.ArgumentTypeError("Boolean value expected.")
+
+    parser = argparse.ArgumentParser(**parser_kwargs)
+    # NOTE(review): with nargs="?" a bare `-n` / `-r` (no value) stores the
+    # const, i.e. the bool True rather than a string -- callers that do string
+    # operations on opt.name / opt.resume should always be given a value.
+    parser.add_argument(
+        "-n",
+        "--name",
+        type=str,
+        const=True,
+        default="",
+        nargs="?",
+        help="postfix for logdir",
+    )
+    parser.add_argument(
+        "--no_date",
+        type=str2bool,
+        nargs="?",
+        const=True,
+        default=False,
+        help="if True, skip date generation for logdir and only use naming via opt.base or opt.name (+ opt.postfix, optionally)",
+    )
+    parser.add_argument(
+        "-r",
+        "--resume",
+        type=str,
+        const=True,
+        default="",
+        nargs="?",
+        help="resume from logdir or checkpoint in logdir",
+    )
+    parser.add_argument(
+        "-b",
+        "--base",
+        nargs="*",
+        metavar="base_config.yaml",
+        help="paths to base configs. Loaded from left-to-right. "
+        "Parameters can be overwritten or added with command-line options of the form `--key value`.",
+        default=list(),
+    )
+    parser.add_argument(
+        "-t",
+        "--train",
+        type=str2bool,
+        const=True,
+        default=True,
+        nargs="?",
+        help="train",
+    )
+    parser.add_argument(
+        "--no-test",
+        type=str2bool,
+        const=True,
+        default=False,
+        nargs="?",
+        help="disable test",
+    )
+    parser.add_argument(
+        "-p", "--project", help="name of new or path to existing project"
+    )
+    parser.add_argument(
+        "-d",
+        "--debug",
+        type=str2bool,
+        nargs="?",
+        const=True,
+        default=False,
+        help="enable post-mortem debugging",
+    )
+    parser.add_argument(
+        "-s",
+        "--seed",
+        type=int,
+        default=23,
+        help="seed for seed_everything",
+    )
+    parser.add_argument(
+        "-f",
+        "--postfix",
+        type=str,
+        default="",
+        help="post-postfix for default name",
+    )
+    parser.add_argument(
+        "--projectname",
+        type=str,
+        default="stablediffusion",
+    )
+    parser.add_argument(
+        "-l",
+        "--logdir",
+        type=str,
+        default="logs",
+        help="directory for logging dat shit",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        type=str2bool,
+        nargs="?",
+        const=True,
+        default=False,
+        help="scale base-lr by ngpu * batch_size * n_accumulate",
+    )
+    parser.add_argument(
+        "--legacy_naming",
+        type=str2bool,
+        nargs="?",
+        const=True,
+        default=False,
+        help="name run based on config file name if true, else by whole path",
+    )
+    parser.add_argument(
+        "--enable_tf32",
+        type=str2bool,
+        nargs="?",
+        const=True,
+        default=False,
+        help="enables the TensorFloat32 format both for matmuls and cuDNN for pytorch 1.12",
+    )
+    parser.add_argument(
+        "--startup",
+        type=str,
+        default=None,
+        help="Startuptime from distributed script",
+    )
+    parser.add_argument(
+        "--wandb",
+        type=str2bool,
+        nargs="?",
+        const=True,
+        default=False,  # TODO: later default to True
+        help="log to wandb",
+    )
+    parser.add_argument(
+        "--no_base_name",
+        type=str2bool,
+        nargs="?",
+        const=True,
+        default=False,
+        # The help text used to be a copy of the --wandb one.
+        help="if True, do not derive the run name from the first base config (the name part is left empty)",
+    )
+    # This option is only registered explicitly on PyTorch >= 2.0.
+    if version.parse(torch.__version__) >= version.parse("2.0.0"):
+        parser.add_argument(
+            "--resume_from_checkpoint",
+            type=str,
+            default=None,
+            help="single checkpoint file to resume from",
+        )
+    # Mirror every Trainer keyword as `--<keyword>` with the Trainer's default.
+    default_args = default_trainer_args()
+    for key in default_args:
+        parser.add_argument("--" + key, default=default_args[key])
+    return parser
+
+
+def get_checkpoint_name(logdir):
+    """Pick the checkpoint to resume from and the name for emergency saves.
+
+    Looks for ``last**.ckpt`` files in ``<logdir>/checkpoints``.  When several
+    exist, the one with the newest modification time is chosen, its path is
+    written to ``<logdir>/most_recent_ckpt.txt`` and the number after ``-v``
+    in its file name is used as version (1 when it cannot be parsed).  With a
+    single match the version is 1.
+
+    Args:
+        logdir: run directory that contains a ``checkpoints`` sub-directory.
+
+    Returns:
+        ``(ckpt, melk_ckpt_name)``: path of the chosen checkpoint and the file
+        name ``last-v<version>.ckpt``.
+
+    Raises:
+        FileNotFoundError: if no matching checkpoint exists (previously this
+            surfaced as a bare ``IndexError``).
+    """
+    pattern = os.path.join(logdir, "checkpoints", "last**.ckpt")
+    candidates = natsorted(glob.glob(pattern))
+    print('available "last" checkpoints:')
+    print(candidates)
+    if not candidates:
+        raise FileNotFoundError(f'no checkpoint matching "{pattern}" found')
+
+    if len(candidates) > 1:
+        print("got most recent checkpoint")
+        ckpt = sorted(candidates, key=lambda x: os.path.getmtime(x))[-1]
+        print(f"Most recent ckpt is {ckpt}")
+        with open(os.path.join(logdir, "most_recent_ckpt.txt"), "w") as f:
+            f.write(ckpt + "\n")
+        try:
+            # e.g. "last-v3.ckpt" -> 3; a plain "last.ckpt" has no number
+            version = int(os.path.basename(ckpt).split("-v")[-1].split(".")[0])
+        except ValueError as e:
+            print("version confusion but not bad")
+            print(e)
+            version = 1
+        # version = last_version + 1
+    else:
+        # in this case, we only have one "last.ckpt"
+        ckpt = candidates[0]
+        version = 1
+    melk_ckpt_name = f"last-v{version}.ckpt"
+    print(f"Current melk ckpt name: {melk_ckpt_name}")
+    return ckpt, melk_ckpt_name
+
+
+class SetupCallback(Callback):
+    """Callback that prepares the run directory and rescues checkpoints.
+
+    On rank 0 it creates the log / checkpoint / config directories when
+    fitting starts and stores the project and lightning configs there.  When
+    an exception is raised outside debug mode, rank 0 saves a checkpoint.
+    """
+
+    def __init__(
+        self,
+        resume,
+        now,
+        logdir,
+        ckptdir,
+        cfgdir,
+        config,
+        lightning_config,
+        debug,
+        ckpt_name=None,
+    ):
+        super().__init__()
+        # run identification
+        self.resume, self.now = resume, now
+        # directories
+        self.logdir, self.ckptdir, self.cfgdir = logdir, ckptdir, cfgdir
+        # configs written to disk in on_fit_start
+        self.config, self.lightning_config = config, lightning_config
+        self.debug = debug
+        # file name for the emergency checkpoint; None means "last.ckpt"
+        self.ckpt_name = ckpt_name
+
+    def on_exception(self, trainer: pl.Trainer, pl_module, exception):
+        """Save an emergency checkpoint on rank 0 unless debugging."""
+        if self.debug or trainer.global_rank != 0:
+            return
+        print("Summoning checkpoint.")
+        filename = "last.ckpt" if self.ckpt_name is None else self.ckpt_name
+        trainer.save_checkpoint(os.path.join(self.ckptdir, filename))
+
+    def on_fit_start(self, trainer, pl_module):
+        """Create directories and save configs (rank 0); tidy up elsewhere."""
+        if trainer.global_rank != 0:
+            self._move_stray_logdir()
+            return
+
+        # Create logdirs and save configs
+        for directory in (self.logdir, self.ckptdir, self.cfgdir):
+            os.makedirs(directory, exist_ok=True)
+
+        wants_trainstep_ckpts = (
+            "callbacks" in self.lightning_config
+            and "metrics_over_trainsteps_checkpoint"
+            in self.lightning_config["callbacks"]
+        )
+        if wants_trainstep_ckpts:
+            os.makedirs(
+                os.path.join(self.ckptdir, "trainstep_checkpoints"),
+                exist_ok=True,
+            )
+
+        print("Project config")
+        print(OmegaConf.to_yaml(self.config))
+        if MULTINODE_HACKS:
+            import time
+
+            time.sleep(5)
+        project_cfg_path = os.path.join(
+            self.cfgdir, "{}-project.yaml".format(self.now)
+        )
+        OmegaConf.save(self.config, project_cfg_path)
+
+        print("Lightning config")
+        print(OmegaConf.to_yaml(self.lightning_config))
+        lightning_cfg_path = os.path.join(
+            self.cfgdir, "{}-lightning.yaml".format(self.now)
+        )
+        OmegaConf.save(
+            OmegaConf.create({"lightning": self.lightning_config}),
+            lightning_cfg_path,
+        )
+
+    def _move_stray_logdir(self):
+        """Move a log directory found on a non-zero rank into ``child_runs``."""
+        # ModelCheckpoint callback created log directory --- remove it
+        if MULTINODE_HACKS or self.resume or not os.path.exists(self.logdir):
+            return
+        parent, run_name = os.path.split(self.logdir)
+        target = os.path.join(parent, "child_runs", run_name)
+        os.makedirs(os.path.split(target)[0], exist_ok=True)
+        try:
+            os.rename(self.logdir, target)
+        except FileNotFoundError:
+            pass
+
+
+class ImageLogger(Callback):
+    """Callback that periodically saves the images a module produces.
+
+    Whenever logging is due, ``pl_module.log_images(batch, split=...)`` is
+    called (if the module provides it) and every returned entry is written as
+    a PNG below ``<logger.save_dir>/images/<split>``.  With a ``WandbLogger``
+    the non-heatmap images are additionally sent to wandb.
+
+    Args:
+        batch_frequency: log every ``batch_frequency`` steps.
+        max_images: maximum number of images kept per entry; 0 disables logging.
+        clamp: clamp non-heatmap tensors to [-1, 1] before saving.
+        increase_log_steps: also log at the powers of two up to
+            ``batch_frequency``.
+        rescale: map image grids from [-1, 1] to [0, 1] before saving.
+        disabled: skip logging at the end of train / validation batches.
+        log_on_batch_idx: use the batch index instead of the global step for
+            the frequency check.
+        log_first_step: allow logging at step 0.
+        log_images_kwargs: extra keyword arguments for ``log_images``.
+        log_before_first_step: log once before the first training step.
+        enable_autocast: value of ``enabled`` for the autocast context that
+            wraps ``log_images``.
+    """
+
+    def __init__(
+        self,
+        batch_frequency,
+        max_images,
+        clamp=True,
+        increase_log_steps=True,
+        rescale=True,
+        disabled=False,
+        log_on_batch_idx=False,
+        log_first_step=False,
+        log_images_kwargs=None,
+        log_before_first_step=False,
+        enable_autocast=True,
+    ):
+        super().__init__()
+        self.enable_autocast = enable_autocast
+        self.rescale = rescale
+        self.batch_freq = batch_frequency
+        self.max_images = max_images
+        # warm-up schedule: 1, 2, 4, ... up to the largest power of two
+        # that does not exceed batch_freq
+        self.log_steps = [2**n for n in range(int(np.log2(self.batch_freq)) + 1)]
+        if not increase_log_steps:
+            self.log_steps = [self.batch_freq]
+        self.clamp = clamp
+        self.disabled = disabled
+        self.log_on_batch_idx = log_on_batch_idx
+        self.log_images_kwargs = log_images_kwargs if log_images_kwargs else {}
+        self.log_first_step = log_first_step
+        self.log_before_first_step = log_before_first_step
+
+    @rank_zero_only
+    def log_local(
+        self,
+        save_dir,
+        split,
+        images,
+        global_step,
+        current_epoch,
+        batch_idx,
+        pl_module: Union[None, pl.LightningModule] = None,
+    ):
+        """Write every entry of ``images`` to disk (and to wandb if requested).
+
+        ``pl_module`` is only passed when its logger is a ``WandbLogger``; in
+        that case non-heatmap images are also logged through that logger.
+        """
+        root = os.path.join(save_dir, "images", split)
+        for k in images:
+            filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format(
+                k, global_step, current_epoch, batch_idx
+            )
+            path = os.path.join(root, filename)
+            # create the directory of the file itself so that keys containing
+            # a path separator work for heatmaps as well as for images
+            os.makedirs(os.path.split(path)[0], exist_ok=True)
+
+            if isheatmap(images[k]):
+                fig, ax = plt.subplots()
+                mappable = ax.matshow(
+                    images[k].cpu().numpy(), cmap="hot", interpolation="lanczos"
+                )
+                plt.colorbar(mappable)
+                plt.axis("off")
+                plt.savefig(path)
+                plt.close(fig)
+                # TODO: support wandb
+            else:
+                grid = torchvision.utils.make_grid(images[k], nrow=4)
+                if self.rescale:
+                    grid = (grid + 1.0) / 2.0  # -1,1 -> 0,1; c,h,w
+                # c,h,w -> h,w,c (a single trailing channel is dropped)
+                grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1)
+                grid = grid.numpy()
+                grid = (grid * 255).astype(np.uint8)
+                img = Image.fromarray(grid)
+                img.save(path)
+                if exists(pl_module):
+                    assert isinstance(
+                        pl_module.logger, WandbLogger
+                    ), "logger_log_image only supports WandbLogger currently"
+                    pl_module.logger.log_image(
+                        key=f"{split}/{k}",
+                        images=[
+                            img,
+                        ],
+                        step=pl_module.global_step,
+                    )
+
+    @rank_zero_only
+    def log_img(self, pl_module, batch, batch_idx, split="train"):
+        """Run ``pl_module.log_images`` and store the result if logging is due."""
+        check_idx = batch_idx if self.log_on_batch_idx else pl_module.global_step
+        # check_frequency must stay first: it consumes the warm-up schedule
+        due = (
+            self.check_frequency(check_idx)
+            and hasattr(pl_module, "log_images")
+            and callable(pl_module.log_images)
+            and self.max_images > 0
+        )
+        if not due:
+            return
+
+        is_train = pl_module.training
+        if is_train:
+            pl_module.eval()
+        try:
+            gpu_autocast_kwargs = {
+                "enabled": self.enable_autocast,  # torch.is_autocast_enabled(),
+                "dtype": torch.get_autocast_gpu_dtype(),
+                "cache_enabled": torch.is_autocast_cache_enabled(),
+            }
+            with torch.no_grad(), torch.cuda.amp.autocast(**gpu_autocast_kwargs):
+                images = pl_module.log_images(
+                    batch, split=split, **self.log_images_kwargs
+                )
+
+            for k in images:
+                N = min(images[k].shape[0], self.max_images)
+                if not isheatmap(images[k]):
+                    images[k] = images[k][:N]
+                if isinstance(images[k], torch.Tensor):
+                    images[k] = images[k].detach().float().cpu()
+                    if self.clamp and not isheatmap(images[k]):
+                        images[k] = torch.clamp(images[k], -1.0, 1.0)
+
+            self.log_local(
+                pl_module.logger.save_dir,
+                split,
+                images,
+                pl_module.global_step,
+                pl_module.current_epoch,
+                batch_idx,
+                pl_module=pl_module
+                if isinstance(pl_module.logger, WandbLogger)
+                else None,
+            )
+        finally:
+            # restore training mode even when sampling or saving failed
+            if is_train:
+                pl_module.train()
+
+    def check_frequency(self, check_idx):
+        """Return True when ``check_idx`` is a step at which to log.
+
+        A step qualifies if it is a multiple of ``batch_freq`` or one of the
+        remaining warm-up steps, and it is either positive or
+        ``log_first_step`` is set.  Each hit drops one warm-up step.
+        """
+        hit = (check_idx % self.batch_freq) == 0 or (check_idx in self.log_steps)
+        if hit and (check_idx > 0 or self.log_first_step):
+            # the schedule may already be used up; that is not an error
+            if self.log_steps:
+                self.log_steps.pop(0)
+            return True
+        return False
+
+    @rank_zero_only
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
+        if not self.disabled and (pl_module.global_step > 0 or self.log_first_step):
+            self.log_img(pl_module, batch, batch_idx, split="train")
+
+    @rank_zero_only
+    def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
+        if self.log_before_first_step and pl_module.global_step == 0:
+            print(f"{self.__class__.__name__}: logging before training")
+            self.log_img(pl_module, batch, batch_idx, split="train")
+
+    @rank_zero_only
+    def on_validation_batch_end(
+        self, trainer, pl_module, outputs, batch, batch_idx, *args, **kwargs
+    ):
+        if not self.disabled and pl_module.global_step > 0:
+            self.log_img(pl_module, batch, batch_idx, split="val")
+        if (
+            getattr(pl_module, "calibrate_grad_norm", False)
+            and batch_idx % 25 == 0
+            and batch_idx > 0
+        ):
+            # This class defines no log_gradients itself; only call it when a
+            # subclass provides one instead of failing with AttributeError.
+            log_gradients = getattr(self, "log_gradients", None)
+            if callable(log_gradients):
+                log_gradients(trainer, pl_module, batch_idx=batch_idx)
+
+
+@rank_zero_only
+def init_wandb(save_dir, opt, config, group_name, name_str):
+    """Start the wandb run for this process.
+
+    Creates ``save_dir``, exports it as ``WANDB_DIR`` and calls
+    ``wandb.init``: in offline mode with only project and group when
+    ``opt.debug`` is set, otherwise with the config, run name, group and
+    ``./sgm`` as code directory.
+    """
+    print(f"setting WANDB_DIR to {save_dir}")
+    os.makedirs(save_dir, exist_ok=True)
+    os.environ["WANDB_DIR"] = save_dir
+
+    if opt.debug:
+        wandb.init(project=opt.projectname, mode="offline", group=group_name)
+        return
+
+    run_kwargs = {
+        "project": opt.projectname,
+        "config": config,
+        "settings": wandb.Settings(code_dir="./sgm"),
+        "group": group_name,
+        "name": name_str,
+    }
+    wandb.init(**run_kwargs)
+
+
+if __name__ == "__main__":
+    # Training entry point: parse the command line, derive the run directory,
+    # merge the configs, build model / logger / callbacks / trainer / data,
+    # then fit and (optionally) test.
+    #
+    # custom parser to specify config files, train, test and debug mode,
+    # postfix, resume.
+    # `--key value` arguments are interpreted as arguments to the trainer.
+    # `nested.key=value` arguments are interpreted as config parameters.
+    # configs are merged from left-to-right followed by command line parameters.
+
+    # model:
+    #   base_learning_rate: float
+    #   target: path to lightning module
+    #   params:
+    #       key: value
+    # data:
+    #   target: main.DataModuleFromConfig
+    #   params:
+    #      batch_size: int
+    #      wrap: bool
+    #      train:
+    #          target: path to train dataset
+    #          params:
+    #              key: value
+    #      validation:
+    #          target: path to validation dataset
+    #          params:
+    #              key: value
+    #      test:
+    #          target: path to test dataset
+    #          params:
+    #              key: value
+    # lightning: (optional, has sane defaults and can be specified on cmdline)
+    #   trainer:
+    #       additional arguments to trainer
+    #   logger:
+    #       logger to instantiate
+    #   modelcheckpoint:
+    #       modelcheckpoint to instantiate
+    #   callbacks:
+    #       callback1:
+    #           target: importpath
+    #           params:
+    #               key: value
+
+    now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
+
+    # add cwd for convenience and to make classes in this file available when
+    # running as `python main.py`
+    # (in particular `main.DataModuleFromConfig`)
+    sys.path.append(os.getcwd())
+
+    parser = get_parser()
+
+    opt, unknown = parser.parse_known_args()
+
+    if opt.name and opt.resume:
+        raise ValueError(
+            "-n/--name and -r/--resume cannot be specified both. "
+            "If you want to resume training in a new log folder, "
+            "use -n/--name in combination with --resume_from_checkpoint"
+        )
+    melk_ckpt_name = None
+    name = None
+    if opt.resume:
+        if not os.path.exists(opt.resume):
+            raise ValueError("Cannot find {}".format(opt.resume))
+        if os.path.isfile(opt.resume):
+            # a checkpoint file: <logdir>/checkpoints/<file>.ckpt
+            paths = opt.resume.split("/")
+            # idx = len(paths)-paths[::-1].index("logs")+1
+            # logdir = "/".join(paths[:idx])
+            logdir = "/".join(paths[:-2])
+            ckpt = opt.resume
+            _, melk_ckpt_name = get_checkpoint_name(logdir)
+        else:
+            assert os.path.isdir(opt.resume), opt.resume
+            logdir = opt.resume.rstrip("/")
+            ckpt, melk_ckpt_name = get_checkpoint_name(logdir)
+
+        print("#" * 100)
+        print(f'Resuming from checkpoint "{ckpt}"')
+        print("#" * 100)
+
+        opt.resume_from_checkpoint = ckpt
+        # configs stored in the run directory come first, CLI configs override
+        base_configs = sorted(glob.glob(os.path.join(logdir, "configs/*.yaml")))
+        opt.base = base_configs + opt.base
+        _tmp = logdir.split("/")
+        nowname = _tmp[-1]
+    else:
+        if opt.name:
+            name = "_" + opt.name
+        elif opt.base:
+            if opt.no_base_name:
+                name = ""
+            else:
+                if opt.legacy_naming:
+                    cfg_fname = os.path.split(opt.base[0])[-1]
+                    cfg_name = os.path.splitext(cfg_fname)[0]
+                else:
+                    assert "configs" in os.path.split(opt.base[0])[0], os.path.split(
+                        opt.base[0]
+                    )[0]
+                    cfg_path = os.path.split(opt.base[0])[0].split(os.sep)[
+                        os.path.split(opt.base[0])[0].split(os.sep).index("configs")
+                        + 1 :
+                    ]  # cut away the first one (we assert all configs are in "configs")
+                    cfg_name = os.path.splitext(os.path.split(opt.base[0])[-1])[0]
+                    cfg_name = "-".join(cfg_path) + f"-{cfg_name}"
+                name = "_" + cfg_name
+        else:
+            name = ""
+        if not opt.no_date:
+            nowname = now + name + opt.postfix
+        else:
+            nowname = name + opt.postfix
+            if nowname.startswith("_"):
+                nowname = nowname[1:]
+        logdir = os.path.join(opt.logdir, nowname)
+        print(f"LOGDIR: {logdir}")
+
+    ckptdir = os.path.join(logdir, "checkpoints")
+    cfgdir = os.path.join(logdir, "configs")
+    seed_everything(opt.seed, workers=True)
+
+    # move before model init, in case a torch.compile(...) is called somewhere
+    if opt.enable_tf32:
+        # pt_version = version.parse(torch.__version__)
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.allow_tf32 = True
+        print(f"Enabling TF32 for PyTorch {torch.__version__}")
+    else:
+        print(f"Using default TF32 settings for PyTorch {torch.__version__}:")
+        print(
+            f"torch.backends.cuda.matmul.allow_tf32={torch.backends.cuda.matmul.allow_tf32}"
+        )
+        print(f"torch.backends.cudnn.allow_tf32={torch.backends.cudnn.allow_tf32}")
+
+    # The except / finally clauses below look at `trainer`; bind it up front so
+    # that a failure before the Trainer exists is reported as itself and not as
+    # a NameError.
+    trainer = None
+
+    try:
+        # init and save configs
+        configs = [OmegaConf.load(cfg) for cfg in opt.base]
+        cli = OmegaConf.from_dotlist(unknown)
+        config = OmegaConf.merge(*configs, cli)
+        lightning_config = config.pop("lightning", OmegaConf.create())
+        # merge trainer cli with config
+        trainer_config = lightning_config.get("trainer", OmegaConf.create())
+
+        # default to gpu
+        trainer_config["accelerator"] = "gpu"
+        # every Trainer option that differs from its default on the command
+        # line overrides the config value
+        standard_args = default_trainer_args()
+        for k in standard_args:
+            if getattr(opt, k) != standard_args[k]:
+                trainer_config[k] = getattr(opt, k)
+
+        # the option is only registered on PyTorch >= 2.0 or when resuming
+        ckpt_resume_path = getattr(opt, "resume_from_checkpoint", None)
+
+        if "devices" not in trainer_config and trainer_config["accelerator"] != "gpu":
+            del trainer_config["accelerator"]
+            cpu = True
+        else:
+            gpuinfo = trainer_config["devices"]
+            print(f"Running on GPUs {gpuinfo}")
+            cpu = False
+        trainer_opt = argparse.Namespace(**trainer_config)
+        lightning_config.trainer = trainer_config
+
+        # model
+        model = instantiate_from_config(config.model)
+
+        # trainer and callbacks
+        trainer_kwargs = dict()
+
+        # default logger configs
+        default_logger_cfgs = {
+            "wandb": {
+                "target": "pytorch_lightning.loggers.WandbLogger",
+                "params": {
+                    "name": nowname,
+                    # "save_dir": logdir,
+                    "offline": opt.debug,
+                    "id": nowname,
+                    "project": opt.projectname,
+                    "log_model": False,
+                    # "dir": logdir,
+                },
+            },
+            "csv": {
+                "target": "pytorch_lightning.loggers.CSVLogger",
+                "params": {
+                    "name": "testtube",  # hack for sbord fanatics
+                    "save_dir": logdir,
+                },
+            },
+        }
+        default_logger_cfg = default_logger_cfgs["wandb" if opt.wandb else "csv"]
+        if opt.wandb:
+            # TODO change once leaving "swiffer" config directory
+            try:
+                group_name = nowname.split(now)[-1].split("-")[1]
+            except IndexError:
+                # run name has no "-" separated part after the date
+                group_name = nowname
+            default_logger_cfg["params"]["group"] = group_name
+            init_wandb(
+                os.path.join(os.getcwd(), logdir),
+                opt=opt,
+                group_name=group_name,
+                config=config,
+                name_str=nowname,
+            )
+        if "logger" in lightning_config:
+            logger_cfg = lightning_config.logger
+        else:
+            logger_cfg = OmegaConf.create()
+        logger_cfg = OmegaConf.merge(default_logger_cfg, logger_cfg)
+        trainer_kwargs["logger"] = instantiate_from_config(logger_cfg)
+
+        # modelcheckpoint - use TrainResult/EvalResult(checkpoint_on=metric) to
+        # specify which metric is used to determine best models
+        default_modelckpt_cfg = {
+            "target": "pytorch_lightning.callbacks.ModelCheckpoint",
+            "params": {
+                "dirpath": ckptdir,
+                "filename": "{epoch:06}",
+                "verbose": True,
+                "save_last": True,
+            },
+        }
+        if hasattr(model, "monitor"):
+            print(f"Monitoring {model.monitor} as checkpoint metric.")
+            default_modelckpt_cfg["params"]["monitor"] = model.monitor
+            default_modelckpt_cfg["params"]["save_top_k"] = 3
+
+        if "modelcheckpoint" in lightning_config:
+            modelckpt_cfg = lightning_config.modelcheckpoint
+        else:
+            modelckpt_cfg = OmegaConf.create()
+        modelckpt_cfg = OmegaConf.merge(default_modelckpt_cfg, modelckpt_cfg)
+        print(f"Merged modelckpt-cfg: \n{modelckpt_cfg}")
+
+        # https://pytorch-lightning.readthedocs.io/en/stable/extensions/strategy.html
+        # default to ddp if not further specified
+        default_strategy_config = {"target": "pytorch_lightning.strategies.DDPStrategy"}
+
+        if "strategy" in lightning_config:
+            strategy_cfg = lightning_config.strategy
+        else:
+            strategy_cfg = OmegaConf.create()
+            default_strategy_config["params"] = {
+                "find_unused_parameters": False,
+                # "static_graph": True,
+                # "ddp_comm_hook": default.fp16_compress_hook  # TODO: experiment with this, also for DDPSharded
+            }
+        strategy_cfg = OmegaConf.merge(default_strategy_config, strategy_cfg)
+        print(
+            f"strategy config: \n ++++++++++++++ \n {strategy_cfg} \n ++++++++++++++ "
+        )
+        trainer_kwargs["strategy"] = instantiate_from_config(strategy_cfg)
+
+        # add callback which sets up log directory
+        default_callbacks_cfg = {
+            "setup_callback": {
+                "target": "main.SetupCallback",
+                "params": {
+                    "resume": opt.resume,
+                    "now": now,
+                    "logdir": logdir,
+                    "ckptdir": ckptdir,
+                    "cfgdir": cfgdir,
+                    "config": config,
+                    "lightning_config": lightning_config,
+                    "debug": opt.debug,
+                    "ckpt_name": melk_ckpt_name,
+                },
+            },
+            "image_logger": {
+                "target": "main.ImageLogger",
+                "params": {"batch_frequency": 1000, "max_images": 4, "clamp": True},
+            },
+            "learning_rate_logger": {
+                "target": "pytorch_lightning.callbacks.LearningRateMonitor",
+                "params": {
+                    "logging_interval": "step",
+                    # "log_momentum": True
+                },
+            },
+        }
+        if version.parse(pl.__version__) >= version.parse("1.4.0"):
+            default_callbacks_cfg.update({"checkpoint_callback": modelckpt_cfg})
+
+        if "callbacks" in lightning_config:
+            callbacks_cfg = lightning_config.callbacks
+        else:
+            callbacks_cfg = OmegaConf.create()
+
+        if "metrics_over_trainsteps_checkpoint" in callbacks_cfg:
+            print(
+                "Caution: Saving checkpoints every n train steps without deleting. This might require some free space."
+            )
+            default_metrics_over_trainsteps_ckpt_dict = {
+                "metrics_over_trainsteps_checkpoint": {
+                    "target": "pytorch_lightning.callbacks.ModelCheckpoint",
+                    "params": {
+                        "dirpath": os.path.join(ckptdir, "trainstep_checkpoints"),
+                        "filename": "{epoch:06}-{step:09}",
+                        "verbose": True,
+                        "save_top_k": -1,
+                        "every_n_train_steps": 10000,
+                        "save_weights_only": True,
+                    },
+                }
+            }
+            default_callbacks_cfg.update(default_metrics_over_trainsteps_ckpt_dict)
+
+        callbacks_cfg = OmegaConf.merge(default_callbacks_cfg, callbacks_cfg)
+        # the ignore-keys callback only makes sense when a checkpoint is loaded
+        if "ignore_keys_callback" in callbacks_cfg and ckpt_resume_path is not None:
+            callbacks_cfg.ignore_keys_callback.params["ckpt_path"] = ckpt_resume_path
+        elif "ignore_keys_callback" in callbacks_cfg:
+            del callbacks_cfg["ignore_keys_callback"]
+
+        trainer_kwargs["callbacks"] = [
+            instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg
+        ]
+        if "plugins" not in trainer_kwargs:
+            trainer_kwargs["plugins"] = list()
+
+        # cmd line trainer args (which are in trainer_opt) have always priority over config-trainer-args (which are in trainer_kwargs)
+        trainer_opt = vars(trainer_opt)
+        trainer_kwargs = {
+            key: val for key, val in trainer_kwargs.items() if key not in trainer_opt
+        }
+        trainer = Trainer(**trainer_opt, **trainer_kwargs)
+
+        trainer.logdir = logdir  ###
+
+        # data
+        data = instantiate_from_config(config.data)
+        # NOTE according to https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
+        # calling these ourselves should not be necessary but it is.
+        # lightning still takes care of proper multiprocessing though
+        data.prepare_data()
+        # data.setup()
+        print("#### Data #####")
+        try:
+            for k in data.datasets:
+                print(
+                    f"{k}, {data.datasets[k].__class__.__name__}, {len(data.datasets[k])}"
+                )
+        except Exception:
+            # purely informational output; never abort the run because of it
+            print("datasets not yet initialized.")
+
+        # configure learning rate
+        if "batch_size" in config.data.params:
+            bs, base_lr = config.data.params.batch_size, config.model.base_learning_rate
+        else:
+            bs, base_lr = (
+                config.data.params.train.loader.batch_size,
+                config.model.base_learning_rate,
+            )
+        if not cpu:
+            # devices is expected as a comma separated string such as "0,1,"
+            ngpu = len(lightning_config.trainer.devices.strip(",").split(","))
+        else:
+            ngpu = 1
+        if "accumulate_grad_batches" in lightning_config.trainer:
+            accumulate_grad_batches = lightning_config.trainer.accumulate_grad_batches
+        else:
+            accumulate_grad_batches = 1
+        print(f"accumulate_grad_batches = {accumulate_grad_batches}")
+        lightning_config.trainer.accumulate_grad_batches = accumulate_grad_batches
+        if opt.scale_lr:
+            model.learning_rate = accumulate_grad_batches * ngpu * bs * base_lr
+            print(
+                "Setting learning rate to {:.2e} = {} (accumulate_grad_batches) * {} (num_gpus) * {} (batchsize) * {:.2e} (base_lr)".format(
+                    model.learning_rate, accumulate_grad_batches, ngpu, bs, base_lr
+                )
+            )
+        else:
+            model.learning_rate = base_lr
+            print("++++ NOT USING LR SCALING ++++")
+            print(f"Setting learning rate to {model.learning_rate:.2e}")
+
+        # allow checkpointing via USR1
+        def melk(*args, **kwargs):
+            # run all checkpoint hooks
+            if trainer.global_rank == 0:
+                print("Summoning checkpoint.")
+                if melk_ckpt_name is None:
+                    ckpt_path = os.path.join(ckptdir, "last.ckpt")
+                else:
+                    ckpt_path = os.path.join(ckptdir, melk_ckpt_name)
+                trainer.save_checkpoint(ckpt_path)
+
+        def divein(*args, **kwargs):
+            if trainer.global_rank == 0:
+                import pudb
+
+                pudb.set_trace()
+
+        import signal
+
+        signal.signal(signal.SIGUSR1, melk)
+        signal.signal(signal.SIGUSR2, divein)
+
+        # run
+        if opt.train:
+            try:
+                trainer.fit(model, data, ckpt_path=ckpt_resume_path)
+            except Exception:
+                if not opt.debug:
+                    melk()
+                raise
+        if not opt.no_test and not trainer.interrupted:
+            trainer.test(model, data)
+    except RuntimeError as err:
+        if MULTINODE_HACKS:
+            import socket
+
+            device = os.environ.get("CUDA_VISIBLE_DEVICES", "?")
+            hostname = socket.gethostname()
+            ts = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
+            # The instance id is a nice-to-have: a missing `requests` package
+            # or an unreachable metadata endpoint must neither hang the
+            # process nor replace the error that is being reported.
+            try:
+                import requests
+
+                instance_id = requests.get(
+                    "http://169.254.169.254/latest/meta-data/instance-id",
+                    timeout=2,
+                ).text
+            except Exception:
+                instance_id = "?"
+            print(
+                f"ERROR at {ts} on {hostname}/{instance_id} (CUDA_VISIBLE_DEVICES={device}): {type(err).__name__}: {err}",
+                flush=True,
+            )
+        raise
+    except Exception:
+        # NOTE(review): without a trainer the rank is unknown; the process is
+        # treated as rank 0 so that early failures can still be debugged.
+        if opt.debug and (trainer is None or trainer.global_rank == 0):
+            try:
+                import pudb as debugger
+            except ImportError:
+                import pdb as debugger
+            debugger.post_mortem()
+        raise
+    finally:
+        # move newly created debug project to debug_runs
+        is_rank_zero = trainer is None or trainer.global_rank == 0
+        if opt.debug and not opt.resume and is_rank_zero and os.path.exists(logdir):
+            dst, name = os.path.split(logdir)
+            dst = os.path.join(dst, "debug_runs", name)
+            os.makedirs(os.path.split(dst)[0], exist_ok=True)
+            os.rename(logdir, dst)
+
+        if opt.wandb:
+            wandb.finish()
+        # if trainer.global_rank == 0:
+        #    print(trainer.profiler.summary())
diff --git a/main/inference/motionctrl_cmcm.py b/main/inference/motionctrl_cmcm.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cd22751452d0d10524b3b02446edb852284be94
--- /dev/null
+++ b/main/inference/motionctrl_cmcm.py
@@ -0,0 +1,416 @@
+import argparse
+import datetime
+import json
+import math
+import os
+import sys
+import time
+from glob import glob
+from pathlib import Path
+from typing import Optional
+
+import cv2
+import numpy as np
+import torch
+import torchvision
+from einops import rearrange, repeat
+from fire import Fire
+from omegaconf import OmegaConf
+from PIL import Image
+from torchvision.transforms import CenterCrop, Compose, Resize, ToTensor
+
+sys.path.insert(1, os.path.join(sys.path[0], '..', '..'))
+from sgm.util import default, instantiate_from_config
+
+camera_poses = [  # trajectory names; get_RT opens f'{pose_dir}/{name}.json' for each entry
+    'test_camera_L',
+    'test_camera_D',
+    'test_camera_I',
+    'test_camera_O',
+    'test_camera_R',
+    'test_camera_U',
+    'test_camera_Round-ZoomIn',
+    'test_camera_Round-RI_90',
+]
+
+def to_relative_RT2(org_pose, keyframe_idx=0, keyframe_zero=False):
+    """Re-express camera poses relative to the pose at ``keyframe_idx``.
+
+    Every pose is read as a 3x4 ``[R | T]`` block; the result is a ``[t, 12]`` array
+    whose key-frame row is replaced by zeros when ``keyframe_zero`` is set.
+    """
+    poses = org_pose.reshape(-1, 3, 4)  # [t, 3, 4]
+    num_poses = poses.shape[0]
+    rot, trans = poses[:, :, :3], poses[:, :, 3:]  # [t, 3, 3], [t, 3, 1]
+    key = slice(keyframe_idx, keyframe_idx + 1)
+    # Tile the key frame so that it lines up with every frame of the sequence.
+    key_rot = rot[key].repeat(num_poses, axis=0)  # [t, 3, 3]
+    key_trans = trans[key].repeat(num_poses, axis=0)  # [t, 3, 1]
+    # The transpose stands in for the inverse (exact only for orthonormal rotations).
+    rel_rot = rot @ key_rot.transpose(0, 2, 1)  # [t, 3, 3]
+    rel_trans = trans - rel_rot @ key_trans  # [t, 3, 1]
+    relative = np.concatenate([rel_rot, rel_trans], axis=-1).reshape(-1, 12)  # [t, 12]
+    if keyframe_zero:
+        relative[keyframe_idx] = np.zeros_like(relative[keyframe_idx])
+    return relative
+
+def get_RT(pose_dir='', video_frames=14, frame_stride=1, speed=1.0, **kwargs):
+    """Load every trajectory named in ``camera_poses`` as relative camera poses.
+
+    Each ``{pose_dir}/{name}.json`` is subsampled with ``frame_stride``, its
+    translation column is scaled by ``[3, 1, 4] * speed`` and the poses are made
+    relative to the first kept frame via ``to_relative_RT2``. Extra ``kwargs`` are ignored.
+
+    Returns:
+        ``(data_list, pose_name)``: one float tensor of flattened ``[t, 12]`` poses per
+        trajectory, and the names with the ``test_camera_`` prefix stripped.
+    """
+    data_list = []
+    pose_name = []
+
+    for cur_pose in camera_poses:
+        pose_name.append(cur_pose.replace('test_camera_', ''))
+        with open(f'{pose_dir}/{cur_pose}.json', 'r') as f:
+            # Force float so the in-place rescale below cannot truncate integer JSON.
+            pose = np.array(json.load(f), dtype=np.float64)  # [t, 12]
+
+        # Largest stride <= frame_stride that still leaves video_frames poses. It is a
+        # per-file local floored at 1: the old shared counter leaked a reduced stride
+        # into later files and could hit 0, which makes ``pose[::0]`` raise ValueError.
+        stride = max(1, min(frame_stride, pose.shape[0] // max(video_frames, 1)))
+        pose = pose[::stride]
+        if video_frames < 16:
+            # Keep the centred window; clamp so a short trajectory is not cut to its tail.
+            half = max((pose.shape[0] - video_frames) // 2, 0)
+            pose = pose[half:half+video_frames]
+        pose = pose.reshape(-1, 3, 4)  # [t, 3, 4]
+        # Per-axis rescale of the translation column.
+        pose[:, :, -1] = pose[:, :, -1] * np.array([3, 1, 4]) * speed
+        pose = to_relative_RT2(pose)
+        data_list.append(torch.tensor(pose).float())  # [t, 12]
+
+    # The trajectories are returned as a list, not stacked into one tensor.
+    return data_list, pose_name
+
+def sample(
+    input_path: str = "examples/camera_poses",  # Can either be image file or folder with image files
+    ckpt: str = "checkpoints/motionctrl_svd.ckpt",
+    config: str = None,
+    num_frames: Optional[int] = None,
+    num_steps: Optional[int] = None,
+    version: str = "svd",
+    fps_id: int = 6,
+    motion_bucket_id: int = 127,
+    cond_aug: float = 0.02,
+    seed: int = 23,
+    decoding_t: int = 1,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
+    device: str = "cuda",
+    output_folder: Optional[str] = None,
+    save_fps: int = 10,
+    resize: Optional[bool] = False,
+    pose_dir: str = '',
+    sample_num: int = 1,
+    height: int = 576,
+    width: int = 1024,
+    transform: Optional[bool] = False,
+    save_images: Optional[bool] = False,
+    speed: float = 1.0,
+):
+    """
+    Generate one video per input image and camera pose.
+
+    `input_path` is an image file or a folder of images; the camera trajectories are
+    the JSON files in `pose_dir` (see `get_RT`). Each image/pose pair is sampled
+    `sample_num` times and written to `output_folder` as `<image>_<pose>.mp4`, plus
+    per-frame PNGs when `save_images` is set. `transform` (resize + centre crop) and
+    `resize` bring the image to `width` x `height`; afterwards both sides are rounded
+    down to a multiple of 64. If you run out of VRAM, try decreasing `decoding_t`.
+
+    Raises:
+        ValueError: if `input_path` is not an image file or a folder containing images.
+    """
+    assert (version == "svd"), "Only SVD is supported for now."
+    num_frames = default(num_frames, 14)
+    num_steps = default(num_steps, 25)
+    output_folder = default(output_folder, "outputs/motionctrl_svd/")
+    model_config = default(config, "configs/inference/config_motionctrl_cmcm.yaml")
+
+    # load_model also returns a data-filter slot, which is not used here.
+    model, _ = load_model(
+        model_config,
+        ckpt,
+        device,
+        num_frames,
+        num_steps,
+    )
+    torch.manual_seed(seed)
+
+    path = Path(input_path)
+    image_suffixes = [".jpg", ".jpeg", ".png"]
+    if path.is_file():
+        if path.suffix.lower() in image_suffixes:  # case-insensitive, like the folder branch
+            all_img_paths = [input_path]
+        else:
+            raise ValueError("Path is not valid image file.")
+    elif path.is_dir():
+        all_img_paths = sorted(
+            f for f in path.iterdir() if f.is_file() and f.suffix.lower() in image_suffixes
+        )
+        if not all_img_paths:
+            raise ValueError("Folder does not contain any images.")
+    else:
+        raise ValueError(f"Input path is neither a file nor a folder: {input_path}")
+
+    if transform:
+        spatial_transform = Compose([
+            Resize(size=width),
+            CenterCrop(size=(height, width)),
+        ])
+
+    # get camera poses
+    RTs, pose_name = get_RT(pose_dir=pose_dir, video_frames=num_frames, frame_stride=1, speed=speed)
+
+    print(f'loaded {len(all_img_paths)} images.')
+    os.makedirs(output_folder, exist_ok=True)
+    for no, input_img_path in enumerate(all_img_paths):
+        # Outputs are named after the image file without its extension.
+        filename = os.path.splitext(os.path.basename(input_img_path))[0]
+        print(f'-sample {no+1}: {filename} ...')
+
+        for cur_pose_name, RT in zip(pose_name, RTs):
+            print(f'--pose: {cur_pose_name} ...')
+            # Two copies of the trajectory, matching the leading 2 of image_only_indicator
+            # (presumably the conditional/unconditional guidance halves -- TODO confirm).
+            RT = RT.unsqueeze(0).repeat(2,1,1).to(device)
+
+            with Image.open(input_img_path) as image:
+                # Convert every non-RGB mode (RGBA, grayscale, palette, ...) for the 3-channel assert.
+                if image.mode != "RGB":
+                    image = image.convert("RGB")
+                if transform:
+                    image = spatial_transform(image)
+                if resize:
+                    image = image.resize((width, height))
+                w, h = image.size
+
+                if h % 64 != 0 or w % 64 != 0:
+                    # Locals, so the `width`/`height` arguments are not overwritten.
+                    new_w, new_h = map(lambda x: x - x % 64, (w, h))
+                    image = image.resize((new_w, new_h))
+                    print(
+                        f"WARNING: Your image is of size {h}x{w} which is not divisible by 64. We are resizing to {new_h}x{new_w}!"
+                    )
+
+                image = ToTensor()(image)
+                image = image * 2.0 - 1.0
+
+            image = image.unsqueeze(0).to(device)
+            H, W = image.shape[2:]
+            assert image.shape[1] == 3
+            F = 8
+            C = 4
+            shape = (num_frames, C, H // F, W // F)
+            if (H, W) != (576, 1024):
+                print(
+                    "WARNING: The conditioning frame you provided is not 576x1024. This leads to suboptimal performance as model was only trained on 576x1024. Consider increasing `cond_aug`."
+                )
+            if motion_bucket_id > 255:
+                print(
+                    "WARNING: High motion bucket! This may lead to suboptimal performance."
+                )
+
+            if fps_id < 5:
+                print("WARNING: Small fps value! This may lead to suboptimal performance.")
+
+            if fps_id > 30:
+                print("WARNING: Large fps value! This may lead to suboptimal performance.")
+
+            value_dict = {}
+            value_dict["motion_bucket_id"] = motion_bucket_id
+            value_dict["fps_id"] = fps_id
+            value_dict["cond_aug"] = cond_aug
+            value_dict["cond_frames_without_noise"] = image
+            value_dict["cond_frames"] = image + cond_aug * torch.randn_like(image)
+
+            with torch.no_grad():
+                with torch.autocast(device):
+                    batch, batch_uc = get_batch(
+                        get_unique_embedder_keys_from_conditioner(model.conditioner),
+                        value_dict,
+                        [1, num_frames],
+                        T=num_frames,
+                        device=device,
+                    )
+                    c, uc = model.conditioner.get_unconditional_conditioning(
+                        batch,
+                        batch_uc=batch_uc,
+                        force_uc_zero_embeddings=[
+                            "cond_frames",
+                            "cond_frames_without_noise",
+                        ],
+                    )
+
+                    # Repeat the conditioning for every frame and fold frames into the batch axis.
+                    for k in ["crossattn", "concat"]:
+                        uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
+                        uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
+                        c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
+                        c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
+
+                    additional_model_inputs = {}
+                    additional_model_inputs["image_only_indicator"] = torch.zeros(
+                        2, num_frames
+                    ).to(device)
+                    additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
+                    additional_model_inputs["RT"] = RT
+
+                    def denoiser(input, sigma, c):
+                        return model.denoiser(
+                            model.model, input, sigma, c, **additional_model_inputs
+                        )
+
+                    results = []
+                    for _ in range(sample_num):
+                        randn = torch.randn(shape, device=device)
+                        samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
+                        model.en_and_decode_n_samples_a_time = decoding_t
+                        samples_x = model.decode_first_stage(samples_z)
+                        samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0) # [1*t, c, h, w]
+                        results.append(samples)
+
+                    samples = torch.stack(results, dim=0) # [sample_num, t, c, h, w]
+                    samples = samples.data.cpu()
+
+                    video_path = os.path.join(output_folder, f"{filename}_{cur_pose_name}.mp4")
+                    save_results(samples, video_path, fps=save_fps)
+
+                    if save_images:
+                        for i in range(sample_num):
+                            cur_output_folder = os.path.join(output_folder, f"{filename}", f"{cur_pose_name}", f"{i}")
+                            os.makedirs(cur_output_folder, exist_ok=True)
+                            for j in range(num_frames):
+                                cur_img_path = os.path.join(cur_output_folder, f"{j:06d}.png")
+                                torchvision.utils.save_image(samples[i,j], cur_img_path)
+
+    print(f'Done! results saved in {output_folder}.')
+
+def save_results(resutls, filename, fps=10):
+    """Write samples laid out as [sample_num, t, c, h, w] (values in [0, 1]) to one H.264 video."""
+    frames_first = resutls.permute(1, 0, 2, 3, 4)  # [t, sample_num, c, h, w]
+    per_row = int(frames_first.shape[1])  # a single grid row holds every sample of a frame
+    sheets = torch.stack([torchvision.utils.make_grid(f, nrow=per_row) for f in frames_first], dim=0)
+    sheets = (sheets * 255).to(torch.uint8).permute(0, 2, 3, 1)  # uint8, channels moved last
+    torchvision.io.write_video(filename, sheets, fps=fps, video_codec='h264', options={'crf': '10'})
+
+def get_unique_embedder_keys_from_conditioner(conditioner):
+    return list({emb.input_key for emb in conditioner.embedders})  # de-duplicated; order not guaranteed
+
+
+def get_batch(keys, value_dict, N, T, device):
+    """Assemble the conditioner input batch and its unconditional twin.
+
+    Args:
+        keys: embedder input keys to fill in.
+        value_dict: source values, looked up by key.
+        N: batch dimensions; scalar conditions are repeated ``prod(N)`` times and the
+            conditioning frames ``N[0]`` times.
+        T: stored under ``"num_video_frames"`` unless it is ``None``.
+        device: where the scalar condition tensors are placed.
+
+    Returns:
+        ``(batch, batch_uc)``; ``batch_uc`` holds clones of the tensor entries of ``batch``.
+    """
+    batch = {}
+
+    for key in keys:
+        if key in ("fps_id", "motion_bucket_id"):
+            scalar = torch.tensor([value_dict[key]]).to(device)
+            batch[key] = scalar.repeat(int(math.prod(N)))
+        elif key == "cond_aug":
+            scalar = torch.tensor([value_dict["cond_aug"]]).to(device)
+            batch[key] = repeat(scalar, "1 -> b", b=math.prod(N))
+        elif key in ("cond_frames", "cond_frames_without_noise"):
+            batch[key] = repeat(value_dict[key], "1 ... -> b ...", b=N[0])
+        else:
+            # Any other key is passed through untouched.
+            batch[key] = value_dict[key]
+
+    if T is not None:
+        batch["num_video_frames"] = T
+
+    # Only tensor entries are mirrored into the unconditional batch; non-tensor
+    # values (e.g. an integer num_video_frames) are left out.
+    batch_uc = {
+        key: torch.clone(value) for key, value in batch.items()
+        if isinstance(value, torch.Tensor)
+    }
+    return batch, batch_uc
+
+
+def load_model(
+    config: str,
+    ckpt: str,
+    device: str,
+    num_frames: int,
+    num_steps: int,
+):
+    """Instantiate the model described by the YAML file ``config`` and put it in eval mode.
+
+    The checkpoint path, the sampler step count and the guider frame count are
+    written into the loaded config before instantiation. Returns ``(model, None)``;
+    the second slot is a placeholder for a data filter that is currently disabled.
+    """
+    cfg = OmegaConf.load(config)
+    model_params = cfg.model.params
+    model_params.ckpt_path = ckpt
+    if device == "cuda":
+        # Only for plain "cuda": the first embedder's OpenCLIP config gets that init device.
+        first_embedder = model_params.conditioner_config.params.emb_models[0]
+        first_embedder.params.open_clip_embedding_config.params.init_device = device
+
+    sampler_params = model_params.sampler_config.params
+    sampler_params.num_steps = num_steps
+    sampler_params.guider_config.params.num_frames = num_frames
+
+    return instantiate_from_config(cfg.model).to(device).eval(), None
+
+
+def get_parser():  # command-line options that the __main__ block forwards to sample()
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--seed", type=int, default=23, help="seed for seed_everything")
+    parser.add_argument("--ckpt", type=str, default=None, help="checkpoint path")
+    parser.add_argument("--config", type=str, help="config (yaml) path")
+    parser.add_argument("--input", type=str, default=None, help="image path or folder")
+    parser.add_argument("--savedir", type=str, default=None, help="results saving path")
+    parser.add_argument("--savefps", type=int, default=10, help="video fps to generate")
+    parser.add_argument("--n_samples", type=int, default=1, help="num of samples per prompt",)
+    parser.add_argument("--ddim_steps", type=int, default=50, help="steps of ddim if positive, otherwise use DDPM",)
+    parser.add_argument("--ddim_eta", type=float, default=1.0, help="eta for ddim sampling (0.0 yields deterministic sampling)",)
+    parser.add_argument("--frames", type=int, default=-1, help="frames num to inference")
+    parser.add_argument("--fps", type=int, default=6, help="control the fps")
+    parser.add_argument("--motion", type=int, default=127, help="control the motion magnitude")
+    parser.add_argument("--cond_aug", type=float, default=0.02, help="adding noise to input image")
+    parser.add_argument("--decoding_t", type=int, default=1, help="frames num to decoding per time")
+    parser.add_argument("--resize", action='store_true', default=False, help="resize all input to default resolution")
+    parser.add_argument("--sample_num", type=int, default=1, help="number of videos to sample per image and camera pose")
+    parser.add_argument("--pose_dir", type=str, default='', help="folder with the camera pose json files")
+    parser.add_argument("--height", type=int, default=576, help="target image height for --resize/--transform")
+    parser.add_argument("--width", type=int, default=1024, help="target image width for --resize/--transform")
+    parser.add_argument("--transform", action='store_true', default=False, help="resize all input to specific resolution")
+    parser.add_argument("--save_images", action='store_true', default=False, help="save images")
+    parser.add_argument("--speed", type=float, default=1.0, help="speed of camera motion")
+    return parser
+
+
+if __name__ == "__main__":
+    now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+    print("@MotionCrl+SVD Inference: %s"%now)
+    args = get_parser().parse_args()
+    # --frames defaults to -1 ("not set"): pass None so sample() falls back to its own frame count.
+    num_frames = args.frames if args.frames > 0 else None
+    sample(input_path=args.input, ckpt=args.ckpt, config=args.config, num_frames=num_frames, num_steps=args.ddim_steps,
+        fps_id=args.fps, motion_bucket_id=args.motion, cond_aug=args.cond_aug, seed=args.seed,
+        decoding_t=args.decoding_t, output_folder=args.savedir, save_fps=args.savefps, resize=args.resize,
+        pose_dir=args.pose_dir, sample_num=args.sample_num, height=args.height, width=args.width,
+        transform=args.transform, save_images=args.save_images, speed=args.speed)
+    
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000000000000000000000000000000000000..d79bd9b2efec0bcf3d4594b6261ca6bfc68b170c
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+markers = 
+  inference: mark as inference test (deselect with '-m "not inference"')
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e9e36300a14bf808560b20896d2cad5875657820
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,32 @@
+Pytorch-Lightning==1.9.0
+decord
+kornia
+timm
+open_clip_torch
+av
+omegaconf
+transformers
+einops
+scikit-learn
+taming-transformers-rom1504
+pandas
+triton
+xformers==0.0.16
+torch==1.13.1
+torchvision
+fairscale
+psutil==5.9.5
+annotated-types==0.5.0
+plotly
+imageio==2.14.1
+imageio-ffmpeg==0.4.7
+opencv-python==4.8.0.74
+moviepy
+Pillow
+tqdm
+gradio==3.37.0
+webdataset
+Fire
+natsort
+wandb
+clip @ git+https://github.com/openai/CLIP.git
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/scripts/demo/__init__.py b/scripts/demo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/scripts/demo/detect.py b/scripts/demo/detect.py
new file mode 100644
index 0000000000000000000000000000000000000000..96e9f212b868e60602b0b1880a0f3d7c66e16703
--- /dev/null
+++ b/scripts/demo/detect.py
@@ -0,0 +1,156 @@
+import argparse
+
+import cv2
+import numpy as np
+
+try:  # prefer the complete decoder from the imwatermark package
+    from imwatermark import WatermarkDecoder
+except ImportError as e:
+    try:
+        # Assume some of the other dependencies such as torch are not fulfilled:
+        # import only the needed file without loading unnecessary libraries.
+        import importlib.util
+        import sys
+
+        spec = importlib.util.find_spec("imwatermark.maxDct")
+        assert spec is not None
+        maxDct = importlib.util.module_from_spec(spec)
+        sys.modules["maxDct"] = maxDct  # registered under the bare name "maxDct"
+        spec.loader.exec_module(maxDct)
+
+        class WatermarkDecoder(object):
+            """A minimal version of
+            https://github.com/ShieldMnt/invisible-watermark/blob/main/imwatermark/watermark.py
+            to only reconstruct bits using dwtDct"""
+
+            def __init__(self, wm_type="bytes", length=0):  # the "bytes" default is rejected by the assert below
+                assert wm_type == "bits", "Only bits defined in minimal import"
+                self._wmType = wm_type
+                self._wmLen = length
+
+            def reconstruct(self, bits):  # returns bits unchanged after checking their length
+                if len(bits) != self._wmLen:
+                    raise RuntimeError("bits are not matched with watermark length")
+
+                return bits
+
+            def decode(self, cv2Image, method="dwtDct", **configs):  # only "dwtDct" is accepted
+                (r, c, channels) = cv2Image.shape
+                if r * c < 256 * 256:
+                    raise RuntimeError("image too small, should be larger than 256x256")
+
+                bits = []  # placeholder; overwritten by embed.decode below
+                assert method == "dwtDct"
+                embed = maxDct.EmbedMaxDct(watermarks=[], wmLen=self._wmLen, **configs)
+                bits = embed.decode(cv2Image)
+                return self.reconstruct(bits)
+
+    except:  # any failure in the fallback surfaces as the original ImportError
+        raise e
+
+
+# A fixed 48-bit message that was chosen at random
+# WATERMARK_MESSAGE = 0xB3EC907BB19E
+WATERMARK_MESSAGE = 0b101100111110110010010000011110111011000110011110
+# bin(x)[2:] gives bits of x as str, use int to convert them to 0/1
+WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]]
+MATCH_VALUES = [  # rows of [inclusive upper bound on matched bits, verdict]; the CLI takes the first row not exceeded
+    [27, "No watermark detected"],
+    [33, "Partial watermark match. Cannot determine with certainty."],
+    [
+        35,
+        (
+            "Likely watermarked. In our test 0.02% of real images were "
+            'falsely detected as "Likely watermarked"'
+        ),
+    ],
+    [
+        49,
+        (
+            "Very likely watermarked. In our test no real images were "
+            'falsely detected as "Very likely watermarked"'
+        ),
+    ],
+]
+
+
+class GetWatermarkMatch:
+    """Counts how many decoded watermark bits agree with a reference bit sequence."""
+
+    def __init__(self, watermark):
+        self.watermark = watermark
+        self.num_bits = len(self.watermark)
+        self.decoder = WatermarkDecoder("bits", self.num_bits)
+
+    def __call__(self, x: np.ndarray) -> np.ndarray:
+        """
+        Count, per image, the decoded bits that equal the reference watermark.
+        Images should be in cv2 format, e.g. h x w x c BGR.
+
+        Args:
+            x: ([B], h, w, c) in range [0, 255]
+
+        Returns:
+           number of matched bits ([B],); a scalar when a single image was passed
+        """
+        # A lone (h, w, c) image is wrapped into a batch of one and unwrapped on return.
+        single_image = x.ndim == 3
+        images = x[None, ...] if single_image else x
+
+        decoded = np.empty((images.shape[0], self.num_bits), dtype=bool)
+        for row, image in enumerate(images):
+            # Storing into the bool array turns the decoder output into True/False flags.
+            decoded[row] = self.decoder.decode(image, "dwtDct")
+
+        matches = np.sum(decoded == self.watermark, axis=-1)
+        return matches[0] if single_image else matches
+
+
+get_watermark_match = GetWatermarkMatch(WATERMARK_BITS)  # module-level matcher bound to the fixed watermark bits
+
+
+if __name__ == "__main__":
+    # One or more image paths are taken as positional arguments.
+    cli = argparse.ArgumentParser()
+    cli.add_argument(
+        "filename", nargs="+", type=str, help="Image files to check for watermarks"
+    )
+    image_files = cli.parse_args().filename
+
+    # The caveats are printed once, before any file is processed.
+    print(
+        """
+        This script tries to detect watermarked images. Please be aware of
+        the following:
+        - As the watermark is supposed to be invisible, there is the risk that
+          watermarked images may not be detected.
+        - To maximize the chance of detection make sure that the image has the same
+          dimensions as when the watermark was applied (most likely 1024x1024
+          or 512x512).
+        - Specific image manipulation may drastically decrease the chance that
+          watermarks can be detected.
+        - There is also the chance that an image has the characteristics of the
+          watermark by chance.
+        - The watermark script is public, anybody may watermark any images, and
+          could therefore claim it to be generated.
+        - All numbers below are based on a test using 10,000 images without any
+          modifications after applying the watermark.
+        """
+    )
+
+    for image_file in image_files:
+        bgr = cv2.imread(image_file)
+        if bgr is None:
+            print(f"Couldn't read {image_file}. Skipping")
+            continue
+
+        matched = get_watermark_match(bgr)
+        # Advance to the first verdict whose threshold the match count does not exceed.
+        verdict_idx = 0
+        while matched > MATCH_VALUES[verdict_idx][0]:
+            verdict_idx += 1
+        print(
+            f"{image_file}: {MATCH_VALUES[verdict_idx][1]}",
+            f"Bits that matched the watermark {matched} from {len(WATERMARK_BITS)}\n",
+            sep="\n\t",
+        )
diff --git a/scripts/demo/discretization.py b/scripts/demo/discretization.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7030a229692e25ddf8d2f516aef2aba0e6149b9
--- /dev/null
+++ b/scripts/demo/discretization.py
@@ -0,0 +1,59 @@
+import torch
+
+from sgm.modules.diffusionmodules.discretizer import Discretization
+
+
+class Img2ImgDiscretizationWrapper:
+    """
+    Wraps a discretizer and keeps only the trailing part of the schedule it returns.
+    params:
+        strength: float between 0.0 and 1.0. 1.0 means full sampling (all sigmas are returned)
+    """
+
+    def __init__(self, discretization: Discretization, strength: float = 1.0):
+        self.discretization = discretization
+        self.strength = strength
+        assert 0.0 <= self.strength <= 1.0
+
+    def __call__(self, *args, **kwargs):
+        sigmas = self.discretization(*args, **kwargs)  # sigmas start large first, and decrease then
+        print(f"sigmas after discretization, before pruning img2img: ", sigmas)
+        sigmas = torch.flip(sigmas, (0,))
+        prune_index = max(int(self.strength * len(sigmas)), 1)  # taken before slicing so the log matches the cut
+        sigmas = sigmas[:prune_index]
+        print("prune index:", prune_index)
+        sigmas = torch.flip(sigmas, (0,))
+        print(f"sigmas after pruning: ", sigmas)
+        return sigmas
+
+
+class Txt2NoisyDiscretizationWrapper:
+    """
+    Wraps a discretizer and drops sigmas from the end of the schedule it returns.
+    params:
+        strength: float between 0.0 and 1.0. 0.0 means full sampling (all sigmas are returned)
+        original_steps: if given, strength is scaled by original_steps + 1 instead of the schedule length
+    """
+
+    def __init__(
+        self, discretization: Discretization, strength: float = 0.0, original_steps=None
+    ):
+        self.discretization, self.strength = discretization, strength
+        self.original_steps = original_steps
+        assert 0.0 <= self.strength <= 1.0
+
+    def __call__(self, *args, **kwargs):
+        """Return the wrapped discretizer's sigmas with the last ``prune_index`` entries removed."""
+        schedule = self.discretization(*args, **kwargs)
+        print(f"sigmas after discretization, before pruning img2img: ", schedule)
+
+        # Work on the reversed schedule so that pruning is a plain prefix cut.
+        flipped = torch.flip(schedule, (0,))
+        steps = len(flipped) if self.original_steps is None else self.original_steps + 1
+        # Clamp the cut position into [0, steps - 1].
+        prune_index = max(min(int(self.strength * steps) - 1, steps - 1), 0)
+        kept = flipped[prune_index:]
+        print("prune index:", prune_index)
+        kept = torch.flip(kept, (0,))
+        print(f"sigmas after pruning: ", kept)
+        return kept
diff --git a/scripts/demo/sampling.py b/scripts/demo/sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..46c20048176aeb0db96114f95d760091a46d49cd
--- /dev/null
+++ b/scripts/demo/sampling.py
@@ -0,0 +1,364 @@
+from pytorch_lightning import seed_everything
+
+from scripts.demo.streamlit_helpers import *
+
+SAVE_PATH = "outputs/demo/txt2img/"
+
+SD_XL_BASE_RATIOS = {  # ratio label -> size pair; run_txt2img unpacks the selected pair as (W, H)
+    "0.5": (704, 1408),
+    "0.52": (704, 1344),
+    "0.57": (768, 1344),
+    "0.6": (768, 1280),
+    "0.68": (832, 1216),
+    "0.72": (832, 1152),
+    "0.78": (896, 1152),
+    "0.82": (896, 1088),
+    "0.88": (960, 1088),
+    "0.94": (960, 1024),
+    "1.0": (1024, 1024),
+    "1.07": (1024, 960),
+    "1.13": (1088, 960),
+    "1.21": (1088, 896),
+    "1.29": (1152, 896),
+    "1.38": (1152, 832),
+    "1.46": (1216, 832),
+    "1.67": (1280, 768),
+    "1.75": (1344, 768),
+    "1.91": (1344, 704),
+    "2.0": (1408, 704),
+    "2.09": (1472, 704),
+    "2.4": (1536, 640),
+    "2.5": (1600, 640),
+    "2.89": (1664, 576),
+    "3.0": (1728, 576),
+}
+
+VERSION2SPECS = {  # per model version: default H/W, C, f, is_legacy flag, config and checkpoint paths
+    "SDXL-base-1.0": {
+        "H": 1024,
+        "W": 1024,
+        "C": 4,
+        "f": 8,
+        "is_legacy": False,
+        "config": "configs/inference/sd_xl_base.yaml",
+        "ckpt": "checkpoints/sd_xl_base_1.0.safetensors",
+    },
+    "SDXL-base-0.9": {
+        "H": 1024,
+        "W": 1024,
+        "C": 4,
+        "f": 8,
+        "is_legacy": False,
+        "config": "configs/inference/sd_xl_base.yaml",
+        "ckpt": "checkpoints/sd_xl_base_0.9.safetensors",
+    },
+    "SD-2.1": {
+        "H": 512,
+        "W": 512,
+        "C": 4,
+        "f": 8,
+        "is_legacy": True,
+        "config": "configs/inference/sd_2_1.yaml",
+        "ckpt": "checkpoints/v2-1_512-ema-pruned.safetensors",
+    },
+    "SD-2.1-768": {
+        "H": 768,
+        "W": 768,
+        "C": 4,
+        "f": 8,
+        "is_legacy": True,
+        "config": "configs/inference/sd_2_1_768.yaml",
+        "ckpt": "checkpoints/v2-1_768-ema-pruned.safetensors",
+    },
+    "SDXL-refiner-0.9": {
+        "H": 1024,
+        "W": 1024,
+        "C": 4,
+        "f": 8,
+        "is_legacy": True,
+        "config": "configs/inference/sd_xl_refiner.yaml",
+        "ckpt": "checkpoints/sd_xl_refiner_0.9.safetensors",
+    },
+    "SDXL-refiner-1.0": {
+        "H": 1024,
+        "W": 1024,
+        "C": 4,
+        "f": 8,
+        "is_legacy": True,
+        "config": "configs/inference/sd_xl_refiner.yaml",
+        "ckpt": "checkpoints/sd_xl_refiner_1.0.safetensors",
+    },
+}
+
+
+def load_img(display=True, key=None, device="cuda"):
+    """Fetch the interactive image and return it as a (1, 3, H, W) float tensor on ``device``.
+
+    Returns ``None`` when no image is available. Both sides are rounded down to a
+    multiple of 64 before the pixels are rescaled with ``x / 127.5 - 1``.
+    """
+    pil_image = get_interactive_image(key=key)
+    if pil_image is None:
+        return None
+    if display:
+        st.image(pil_image)
+    w, h = pil_image.size
+    print(f"loaded input image of size ({w}, {h})")
+    resized = pil_image.resize((w - w % 64, h - h % 64)).convert("RGB")
+    pixels = torch.from_numpy(np.array(resized)[None].transpose(0, 3, 1, 2))
+    return (pixels.to(dtype=torch.float32) / 127.5 - 1.0).to(device)
+
+
+def run_txt2img(
+    state,
+    version,
+    version_dict,
+    is_legacy=False,
+    return_latents=False,
+    filter=None,
+    stage2strength=None,
+):  # Streamlit txt2img panel: returns do_sample's output once "Sample" is clicked, otherwise None
+    if version.startswith("SDXL-base"):
+        W, H = st.selectbox("Resolution:", list(SD_XL_BASE_RATIOS.values()), 10)
+    else:
+        H = st.number_input("H", value=version_dict["H"], min_value=64, max_value=2048)
+        W = st.number_input("W", value=version_dict["W"], min_value=64, max_value=2048)
+    C = version_dict["C"]
+    F = version_dict["f"]
+
+    init_dict = {  # original and target size are both set to the chosen W/H
+        "orig_width": W,
+        "orig_height": H,
+        "target_width": W,
+        "target_height": H,
+    }
+    value_dict = init_embedder_options(
+        get_unique_embedder_keys_from_conditioner(state["model"].conditioner),
+        init_dict,
+        prompt=prompt,  # NOTE(review): not a parameter; must exist as a module-level global at call time
+        negative_prompt=negative_prompt,  # NOTE(review): same as prompt above
+    )
+    sampler, num_rows, num_cols = init_sampling(stage2strength=stage2strength)
+    num_samples = num_rows * num_cols
+
+    if st.button("Sample"):
+        st.write(f"**Model I:** {version}")
+        out = do_sample(
+            state["model"],
+            sampler,
+            value_dict,
+            num_samples,
+            H,
+            W,
+            C,
+            F,
+            force_uc_zero_embeddings=["txt"] if not is_legacy else [],  # empty list for legacy models
+            return_latents=return_latents,
+            filter=filter,
+        )
+        return out
+
+
+def run_img2img(
+    state,
+    version_dict,
+    is_legacy=False,
+    return_latents=False,
+    filter=None,
+    stage2strength=None,
+):  # Streamlit img2img panel: returns do_img2img's output once "Sample" is clicked, otherwise None
+    img = load_img()
+    if img is None:  # no input image available yet
+        return None
+    H, W = img.shape[2], img.shape[3]
+
+    init_dict = {  # original and target size are both taken from the loaded image
+        "orig_width": W,
+        "orig_height": H,
+        "target_width": W,
+        "target_height": H,
+    }
+    value_dict = init_embedder_options(
+        get_unique_embedder_keys_from_conditioner(state["model"].conditioner),
+        init_dict,
+        prompt=prompt,  # NOTE(review): not a parameter; must exist as a module-level global at call time
+        negative_prompt=negative_prompt,  # NOTE(review): same as prompt above
+    )
+    strength = st.number_input(
+        "**Img2Img Strength**", value=0.75, min_value=0.0, max_value=1.0
+    )
+    sampler, num_rows, num_cols = init_sampling(
+        img2img_strength=strength,
+        stage2strength=stage2strength,
+    )
+    num_samples = num_rows * num_cols
+
+    if st.button("Sample"):
+        out = do_img2img(
+            repeat(img, "1 ... -> n ...", n=num_samples),  # one copy of the image per requested sample
+            state["model"],
+            sampler,
+            value_dict,
+            num_samples,
+            force_uc_zero_embeddings=["txt"] if not is_legacy else [],  # empty list for legacy models
+            return_latents=return_latents,
+            filter=filter,
+        )
+        return out
+
+
+def apply_refiner(
+    input,
+    state,
+    sampler,
+    num_samples,
+    prompt,
+    negative_prompt,
+    filter=None,
+    finish_denoising=False,
+):
+    """Run the refiner model from ``state`` on already-encoded latents.
+
+    Args:
+        input: Latent tensor; its axes 2 and 3 are multiplied by 8 to obtain
+            the height and width placed in the conditioning values.
+        state: Mapping whose ``"model"`` entry is the refiner model.
+        sampler: Sampler handed to ``do_img2img``.
+        num_samples: Number of samples handed to ``do_img2img``.
+        prompt: Positive prompt stored in the conditioning values.
+        negative_prompt: Negative prompt stored in the conditioning values.
+        filter: Optional callable forwarded to ``do_img2img``.
+        finish_denoising: When true, ``do_img2img`` is called with
+            ``add_noise=False``.
+
+    Returns:
+        Whatever ``do_img2img`` returns for these arguments.
+    """
+    pixel_height = input.shape[2] * 8
+    pixel_width = input.shape[3] * 8
+
+    # Fixed conditioning values for the refiner pass (no UI widgets here).
+    value_dict = {
+        "orig_width": pixel_width,
+        "orig_height": pixel_height,
+        "target_width": pixel_width,
+        "target_height": pixel_height,
+        "prompt": prompt,
+        "negative_prompt": negative_prompt,
+        "crop_coords_top": 0,
+        "crop_coords_left": 0,
+        "aesthetic_score": 6.0,
+        "negative_aesthetic_score": 2.5,
+    }
+
+    st.warning(f"refiner input shape: {input.shape}")
+    return do_img2img(
+        input,
+        state["model"],
+        sampler,
+        value_dict,
+        num_samples,
+        skip_encode=True,  # ``input`` is passed through as the latent
+        filter=filter,
+        add_noise=not finish_denoising,
+    )
+
+
+if __name__ == "__main__":
+    # Streamlit entry point. The statements below build the page from top to
+    # bottom, so their order is also the on-screen widget order.
+    st.title("Stable Diffusion")
+    version = st.selectbox("Model Version", list(VERSION2SPECS.keys()), 0)
+    version_dict = VERSION2SPECS[version]
+    # "skip" means no model is loaded and nothing is sampled on this run.
+    if st.checkbox("Load Model"):
+        mode = st.radio("Mode", ("txt2img", "img2img"), 0)
+    else:
+        mode = "skip"
+    st.write("__________________________")
+
+    set_lowvram_mode(st.checkbox("Low vram mode", True))
+
+    # The refiner option is only offered for versions named "SDXL-base*".
+    if version.startswith("SDXL-base"):
+        add_pipeline = st.checkbox("Load SDXL-refiner?", False)
+        st.write("__________________________")
+    else:
+        add_pipeline = False
+
+    seed = st.sidebar.number_input("seed", value=42, min_value=0, max_value=int(1e9))
+    seed_everything(seed)
+
+    save_locally, save_path = init_save_locally(os.path.join(SAVE_PATH, version))
+
+    # NOTE: ``state`` and ``model`` are only bound when a model is loaded;
+    # the later uses of ``state`` are all on paths where mode != "skip".
+    if mode != "skip":
+        state = init_st(version_dict, load_filter=True)
+        if state["msg"]:
+            st.info(state["msg"])
+        model = state["model"]
+
+    is_legacy = version_dict["is_legacy"]
+
+    # ``prompt`` / ``negative_prompt`` are module-level names that
+    # run_img2img reads directly.
+    prompt = st.text_input(
+        "prompt",
+        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
+    )
+    if is_legacy:
+        negative_prompt = st.text_input("negative prompt", "")
+    else:
+        negative_prompt = ""  # which is unused
+
+    stage2strength = None
+    finish_denoising = False
+
+    if add_pipeline:
+        # Second-stage (refiner) model and its own sampler, keyed "#2".
+        st.write("__________________________")
+        version2 = st.selectbox("Refiner:", ["SDXL-refiner-1.0", "SDXL-refiner-0.9"])
+        st.warning(
+            f"Running with {version2} as the second stage model. Make sure to provide (V)RAM :) "
+        )
+        st.write("**Refiner Options:**")
+
+        version_dict2 = VERSION2SPECS[version2]
+        state2 = init_st(version_dict2, load_filter=False)
+        # NOTE(review): unlike the first stage, the message is shown without
+        # checking whether it is empty.
+        st.info(state2["msg"])
+
+        stage2strength = st.number_input(
+            "**Refinement strength**", value=0.15, min_value=0.0, max_value=1.0
+        )
+
+        sampler2, *_ = init_sampling(
+            key=2,
+            img2img_strength=stage2strength,
+            specify_num_samples=False,
+        )
+        st.write("__________________________")
+        finish_denoising = st.checkbox("Finish denoising with refiner.", True)
+        # The first stage only receives stage2strength when the refiner is
+        # asked to finish the denoising.
+        if not finish_denoising:
+            stage2strength = None
+
+    # First stage. Latents are requested only when a refiner pass follows.
+    if mode == "txt2img":
+        out = run_txt2img(
+            state,
+            version,
+            version_dict,
+            is_legacy=is_legacy,
+            return_latents=add_pipeline,
+            filter=state.get("filter"),
+            stage2strength=stage2strength,
+        )
+    elif mode == "img2img":
+        out = run_img2img(
+            state,
+            version_dict,
+            is_legacy=is_legacy,
+            return_latents=add_pipeline,
+            filter=state.get("filter"),
+            stage2strength=stage2strength,
+        )
+    elif mode == "skip":
+        out = None
+    else:
+        raise ValueError(f"unknown mode {mode}")
+    # ``out`` is either a (samples, latents) pair, plain samples, or None.
+    if isinstance(out, (tuple, list)):
+        samples, samples_z = out
+    else:
+        samples = out
+        samples_z = None
+
+    if add_pipeline and samples_z is not None:
+        st.write("**Running Refinement Stage**")
+        samples = apply_refiner(
+            samples_z,
+            state2,
+            sampler2,
+            samples_z.shape[0],
+            prompt=prompt,
+            negative_prompt=negative_prompt if is_legacy else "",
+            filter=state.get("filter"),
+            finish_denoising=finish_denoising,
+        )
+
+    if save_locally and samples is not None:
+        perform_save_locally(save_path, samples)
diff --git a/scripts/demo/streamlit_helpers.py b/scripts/demo/streamlit_helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c5760e26fcffefc95ddc5435870b2a804257e51
--- /dev/null
+++ b/scripts/demo/streamlit_helpers.py
@@ -0,0 +1,887 @@
+import copy
+import math
+import os
+from glob import glob
+from typing import Dict, List, Optional, Tuple, Union
+
+import cv2
+import numpy as np
+import streamlit as st
+import torch
+import torch.nn as nn
+import torchvision.transforms as TT
+from einops import rearrange, repeat
+from imwatermark import WatermarkEncoder
+from omegaconf import ListConfig, OmegaConf
+from PIL import Image
+from safetensors.torch import load_file as load_safetensors
+from torch import autocast
+from torchvision import transforms
+from torchvision.utils import make_grid, save_image
+
+from scripts.demo.discretization import (Img2ImgDiscretizationWrapper,
+                                         Txt2NoisyDiscretizationWrapper)
+from scripts.util.detection.nsfw_and_watermark_dectection import \
+    DeepFloydDataFiltering
+from sgm.inference.helpers import embed_watermark
+from sgm.modules.diffusionmodules.guiders import (LinearPredictionGuider,
+                                                  VanillaCFG)
+from sgm.modules.diffusionmodules.sampling import (DPMPP2MSampler,
+                                                   DPMPP2SAncestralSampler,
+                                                   EulerAncestralSampler,
+                                                   EulerEDMSampler,
+                                                   HeunEDMSampler,
+                                                   LinearMultistepSampler)
+from sgm.util import append_dims, default, instantiate_from_config
+
+
+@st.cache_resource()
+def init_st(version_dict, load_ckpt=True, load_filter=True):
+    """Load the model described by ``version_dict`` and bundle it in a dict.
+
+    The function is wrapped in ``st.cache_resource``.
+
+    Args:
+        version_dict: Mapping with at least ``"config"`` (passed to
+            ``OmegaConf.load``) and ``"ckpt"`` entries.
+        load_ckpt: When false, no checkpoint is handed to the loader and the
+            returned ``"ckpt"`` entry is ``None``.
+        load_filter: When true, a ``DeepFloydDataFiltering`` instance is
+            stored under ``"filter"``; otherwise that key is absent.
+
+    Returns:
+        Dict with the keys ``"msg"``, ``"model"``, ``"ckpt"``, ``"config"``
+        and, optionally, ``"filter"``.
+    """
+    config_path, ckpt_path = version_dict["config"], version_dict["ckpt"]
+    used_ckpt = ckpt_path if load_ckpt else None
+
+    config = OmegaConf.load(config_path)
+    model, msg = load_model_from_config(config, used_ckpt)
+
+    state = {
+        "msg": msg,
+        "model": model,
+        "ckpt": used_ckpt,
+        "config": config,
+    }
+    if load_filter:
+        state["filter"] = DeepFloydDataFiltering(verbose=False)
+    return state
+
+
+def load_model(model):
+    """Call ``model.cuda()``; the call's result is discarded."""
+    model.cuda()
+
+
+# Module-wide switch read by initial_model_load() and unload_model().
+lowvram_mode = False
+
+
+def set_lowvram_mode(mode):
+    """Set the module-level ``lowvram_mode`` flag to ``mode``."""
+    global lowvram_mode
+    lowvram_mode = mode
+
+
+def initial_model_load(model):
+    """Prepare a freshly built model according to ``lowvram_mode``.
+
+    With the flag set, only ``model.model.half()`` is called and the model is
+    not moved here; otherwise ``model.cuda()`` is called.
+
+    Returns:
+        The same ``model`` object that was passed in.
+    """
+    if not lowvram_mode:
+        model.cuda()
+        return model
+    model.model.half()
+    return model
+
+
+def unload_model(model):
+    """In low-VRAM mode, call ``model.cpu()`` and empty the CUDA cache.
+
+    Does nothing when ``lowvram_mode`` is false.
+    """
+    if not lowvram_mode:
+        return
+    model.cpu()
+    torch.cuda.empty_cache()
+
+
+def load_model_from_config(config, ckpt=None, verbose=True):
+    """Instantiate ``config.model`` and optionally load checkpoint weights.
+
+    Args:
+        config: Object whose ``model`` attribute is passed to
+            ``instantiate_from_config``.
+        ckpt: Path ending in ``"ckpt"`` or ``"safetensors"``, or ``None`` to
+            skip weight loading.
+        verbose: When true, missing and unexpected state-dict keys are
+            printed.
+
+    Returns:
+        ``(model, msg)``; ``msg`` is always ``None`` in this implementation.
+
+    Raises:
+        NotImplementedError: If ``ckpt`` has neither supported suffix.
+    """
+    model = instantiate_from_config(config.model)
+    msg = None
+
+    if ckpt is not None:
+        print(f"Loading model from {ckpt}")
+        if ckpt.endswith("ckpt"):
+            # NOTE(review): torch.load unpickles the file; only use
+            # checkpoints from a trusted source.
+            checkpoint = torch.load(ckpt, map_location="cpu")
+            if "global_step" in checkpoint:
+                step = checkpoint["global_step"]
+                st.info(f"loaded ckpt from global step {step}")
+                print(f"Global Step: {step}")
+            weights = checkpoint["state_dict"]
+        elif ckpt.endswith("safetensors"):
+            weights = load_safetensors(ckpt)
+        else:
+            raise NotImplementedError
+
+        # strict=False: mismatching keys are reported, not fatal.
+        missing, unexpected = model.load_state_dict(weights, strict=False)
+        if verbose:
+            reports = (("missing keys:", missing), ("unexpected keys:", unexpected))
+            for title, key_names in reports:
+                if len(key_names) > 0:
+                    print(title)
+                    print(key_names)
+
+    model = initial_model_load(model)
+    model.eval()
+    return model, msg
+
+
+def get_unique_embedder_keys_from_conditioner(conditioner):
+    """Return the distinct ``input_key`` values of ``conditioner.embedders``.
+
+    The result is a list built from a set, so its order is not defined.
+    """
+    unique_keys = {embedder.input_key for embedder in conditioner.embedders}
+    return list(unique_keys)
+
+
+def init_embedder_options(keys, init_dict, prompt=None, negative_prompt=None):
+    """Render one group of Streamlit widgets per embedder key and collect values.
+
+    Args:
+        keys: Iterable of embedder input keys; keys not handled below are
+            ignored.
+        init_dict: Mapping providing ``"orig_width"``, ``"orig_height"``,
+            ``"target_width"`` and ``"target_height"`` (read only for the
+            corresponding keys).
+        prompt: Initial text of the prompt box; a built-in default is used
+            when ``None``.
+        negative_prompt: Initial text of the negative-prompt box; ``""`` when
+            ``None``.
+
+    Returns:
+        Dict of conditioning values filled in by the handled keys.
+    """
+    # Hardcoded demo settings; might undergo some changes in the future
+
+    value_dict = {}
+    for key in keys:
+        if key == "txt":
+            if prompt is None:
+                prompt = "A professional photograph of an astronaut riding a pig"
+            if negative_prompt is None:
+                negative_prompt = ""
+
+            # The widgets return the (possibly edited) text.
+            prompt = st.text_input("Prompt", prompt)
+            negative_prompt = st.text_input("Negative prompt", negative_prompt)
+
+            value_dict["prompt"] = prompt
+            value_dict["negative_prompt"] = negative_prompt
+
+        if key == "original_size_as_tuple":
+            orig_width = st.number_input(
+                "orig_width",
+                value=init_dict["orig_width"],
+                min_value=16,
+            )
+            orig_height = st.number_input(
+                "orig_height",
+                value=init_dict["orig_height"],
+                min_value=16,
+            )
+
+            value_dict["orig_width"] = orig_width
+            value_dict["orig_height"] = orig_height
+
+        if key == "crop_coords_top_left":
+            crop_coord_top = st.number_input("crop_coords_top", value=0, min_value=0)
+            crop_coord_left = st.number_input("crop_coords_left", value=0, min_value=0)
+
+            value_dict["crop_coords_top"] = crop_coord_top
+            value_dict["crop_coords_left"] = crop_coord_left
+
+        if key == "aesthetic_score":
+            # Fixed values; no widget is shown for these.
+            value_dict["aesthetic_score"] = 6.0
+            value_dict["negative_aesthetic_score"] = 2.5
+
+        if key == "target_size_as_tuple":
+            # Taken straight from init_dict; not user-editable here.
+            value_dict["target_width"] = init_dict["target_width"]
+            value_dict["target_height"] = init_dict["target_height"]
+
+        # NOTE(review): if ``keys`` contains both "fps_id" and "fps", the
+        # "fps" widget is created twice - confirm whether that can happen.
+        if key in ["fps_id", "fps"]:
+            fps = st.number_input("fps", value=6, min_value=1)
+
+            value_dict["fps"] = fps
+            value_dict["fps_id"] = fps - 1
+
+        if key == "motion_bucket_id":
+            mb_id = st.number_input("motion bucket id", 0, 511, value=127)
+            value_dict["motion_bucket_id"] = mb_id
+
+        if key == "pool_image":
+            st.text("Image for pool conditioning")
+            image = load_img(
+                key="pool_image_input",
+                size=224,
+                center_crop=True,
+            )
+            # Fall back to an all-zero image until the user uploads one.
+            if image is None:
+                st.info("Need an image here")
+                image = torch.zeros(1, 3, 224, 224)
+            value_dict["pool_image"] = image
+
+    return value_dict
+
+
+def perform_save_locally(save_path, samples):
+    """Watermark ``samples`` and write each one as a PNG into ``save_path``.
+
+    The directory is created if needed. Files are named with a zero-padded
+    nine-digit counter that starts at the number of entries already present
+    in the directory.
+    """
+    os.makedirs(save_path, exist_ok=True)
+    first_index = len(os.listdir(save_path))
+
+    for offset, sample in enumerate(embed_watermark(samples)):
+        # Reorder "c h w" to "h w c" and scale by 255 before the uint8 cast.
+        pixels = 255.0 * rearrange(sample.cpu().numpy(), "c h w -> h w c")
+        target = os.path.join(save_path, f"{first_index + offset:09}.png")
+        Image.fromarray(pixels.astype(np.uint8)).save(target)
+
+
+def init_save_locally(_dir, init_value: bool = False):
+    """Show the "save locally" checkbox and, if ticked, a save-path text box.
+
+    Args:
+        _dir: Directory whose ``samples`` sub-folder is the suggested path.
+        init_value: Initial state of the checkbox.
+
+    Returns:
+        ``(save_locally, save_path)``; ``save_path`` is ``None`` while the
+        checkbox is unticked.
+    """
+    save_locally = st.sidebar.checkbox("Save images locally", value=init_value)
+    save_path = (
+        st.text_input("Save path", value=os.path.join(_dir, "samples"))
+        if save_locally
+        else None
+    )
+    return save_locally, save_path
+
+
+def get_guider(options, key):
+    """Render the guider controls and build the matching guider config.
+
+    Args:
+        options: Dict of defaults. Reads ``"guider"`` (index of the initially
+            selected guider), ``"cfg"``, ``"min_cfg"`` and, for
+            ``LinearPredictionGuider``, the required ``"num_frames"``. The
+            ``"additional_guider_kwargs"`` entry is *popped*, i.e. removed
+            from the caller's dict.
+        key: Suffix appended to every widget label so that several samplers
+            can coexist on one page without widget-ID clashes.
+
+    Returns:
+        Dict with a ``"target"`` import path and, except for
+        ``IdentityGuider``, a ``"params"`` dict.
+
+    Raises:
+        KeyError: If ``LinearPredictionGuider`` is selected and ``options``
+            has no ``"num_frames"``.
+        NotImplementedError: For an unrecognised guider name.
+    """
+    guider = st.sidebar.selectbox(
+        # Was labelled "Discretization", duplicating the label of the real
+        # discretization selector in init_sampling.
+        f"Guider #{key}",
+        [
+            "VanillaCFG",
+            "IdentityGuider",
+            "LinearPredictionGuider",
+        ],
+        options.get("guider", 0),
+    )
+
+    additional_guider_kwargs = options.pop("additional_guider_kwargs", {})
+
+    if guider == "IdentityGuider":
+        guider_config = {
+            "target": "sgm.modules.diffusionmodules.guiders.IdentityGuider"
+        }
+    elif guider == "VanillaCFG":
+        scale = st.number_input(
+            f"cfg-scale #{key}",
+            value=options.get("cfg", 5.0),
+            min_value=0.0,
+        )
+
+        guider_config = {
+            "target": "sgm.modules.diffusionmodules.guiders.VanillaCFG",
+            "params": {
+                "scale": scale,
+                **additional_guider_kwargs,
+            },
+        }
+    elif guider == "LinearPredictionGuider":
+        max_scale = st.number_input(
+            f"max-cfg-scale #{key}",
+            value=options.get("cfg", 1.5),
+            min_value=1.0,
+        )
+        min_scale = st.number_input(
+            # Keyed like its siblings; an unkeyed label collides as soon as
+            # two guiders of this type are rendered.
+            f"min guidance scale #{key}",
+            value=options.get("min_cfg", 1.0),
+            min_value=1.0,
+            max_value=10.0,
+        )
+
+        guider_config = {
+            "target": "sgm.modules.diffusionmodules.guiders.LinearPredictionGuider",
+            "params": {
+                "max_scale": max_scale,
+                "min_scale": min_scale,
+                "num_frames": options["num_frames"],
+                **additional_guider_kwargs,
+            },
+        }
+    else:
+        raise NotImplementedError(f"unknown guider {guider}")
+    return guider_config
+
+
+def init_sampling(
+    key=1,
+    img2img_strength: Optional[float] = None,
+    specify_num_samples: bool = True,
+    stage2strength: Optional[float] = None,
+    options: Optional[Dict[str, int]] = None,
+):
+    """Render the sampling controls and build the configured sampler.
+
+    Args:
+        key: Suffix used in the widget labels of this sampler.
+        img2img_strength: When given, the sampler's discretization is wrapped
+            in ``Img2ImgDiscretizationWrapper`` with this strength.
+        specify_num_samples: When true, a "num cols" input is shown.
+        stage2strength: When given, the discretization is (additionally)
+            wrapped in ``Txt2NoisyDiscretizationWrapper``.
+        options: Optional defaults (``"num_steps"``, ``"sampler"``,
+            ``"discretization"`` plus whatever the helpers read).
+
+    Returns:
+        ``(sampler, num_rows, num_cols)``; ``num_rows`` is always 1.
+    """
+    if options is None:
+        options = {}
+
+    num_rows = num_cols = 1
+    if specify_num_samples:
+        num_cols = st.number_input(
+            f"num cols #{key}", value=1, min_value=1, max_value=10
+        )
+
+    steps = st.sidebar.number_input(
+        f"steps #{key}", value=options.get("num_steps", 40), min_value=1, max_value=1000
+    )
+    sampler_choices = [
+        "EulerEDMSampler",
+        "HeunEDMSampler",
+        "EulerAncestralSampler",
+        "DPMPP2SAncestralSampler",
+        "DPMPP2MSampler",
+        "LinearMultistepSampler",
+    ]
+    sampler_name = st.sidebar.selectbox(
+        f"Sampler #{key}", sampler_choices, options.get("sampler", 0)
+    )
+    discretization_choices = ["LegacyDDPMDiscretization", "EDMDiscretization"]
+    discretization_name = st.sidebar.selectbox(
+        f"Discretization #{key}",
+        discretization_choices,
+        options.get("discretization", 0),
+    )
+
+    # Arguments are evaluated left to right, so the discretization widgets
+    # are rendered before the guider widgets.
+    sampler = get_sampler(
+        sampler_name,
+        steps,
+        get_discretization(discretization_name, options=options, key=key),
+        get_guider(options=options, key=key),
+        key=key,
+    )
+
+    if img2img_strength is not None:
+        st.warning(
+            f"Wrapping {sampler.__class__.__name__} with Img2ImgDiscretizationWrapper"
+        )
+        sampler.discretization = Img2ImgDiscretizationWrapper(
+            sampler.discretization, strength=img2img_strength
+        )
+    if stage2strength is not None:
+        # Applied on top of the img2img wrapper when both are requested.
+        sampler.discretization = Txt2NoisyDiscretizationWrapper(
+            sampler.discretization, strength=stage2strength, original_steps=steps
+        )
+    return sampler, num_rows, num_cols
+
+
+def get_discretization(discretization, options, key=1):
+    """Build the config dict for the named discretization.
+
+    Args:
+        discretization: ``"LegacyDDPMDiscretization"`` or
+            ``"EDMDiscretization"``.
+        options: Dict of defaults; ``"sigma_min"``, ``"sigma_max"`` and
+            ``"rho"`` seed the EDM widgets.
+        key: Suffix used in the widget labels.
+
+    Returns:
+        Dict with a ``"target"`` import path and, for EDM, a ``"params"``
+        dict holding the values entered in the UI.
+
+    Raises:
+        ValueError: If ``discretization`` is not one of the supported names
+            (previously this surfaced as an ``UnboundLocalError``).
+    """
+    if discretization == "LegacyDDPMDiscretization":
+        return {
+            "target": "sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization",
+        }
+
+    if discretization == "EDMDiscretization":
+        sigma_min = st.number_input(
+            f"sigma_min #{key}", value=options.get("sigma_min", 0.03)
+        )  # 0.0292
+        sigma_max = st.number_input(
+            f"sigma_max #{key}", value=options.get("sigma_max", 14.61)
+        )  # 14.6146
+        rho = st.number_input(f"rho #{key}", value=options.get("rho", 3.0))
+        return {
+            "target": "sgm.modules.diffusionmodules.discretizer.EDMDiscretization",
+            "params": {
+                "sigma_min": sigma_min,
+                "sigma_max": sigma_max,
+                "rho": rho,
+            },
+        }
+
+    raise ValueError(f"unknown discretization {discretization}!")
+
+
+def get_sampler(sampler_name, steps, discretization_config, guider_config, key=1):
+    """Render the sampler-specific controls and instantiate the sampler.
+
+    Every widget label carries ``#{key}``. Previously the ancestral and
+    linear-multistep inputs ("s_noise", "eta", "order") were unkeyed, so two
+    samplers of the same type on one page produced identical widgets.
+
+    Args:
+        sampler_name: One of ``EulerEDMSampler``, ``HeunEDMSampler``,
+            ``EulerAncestralSampler``, ``DPMPP2SAncestralSampler``,
+            ``DPMPP2MSampler`` or ``LinearMultistepSampler``.
+        steps: Passed to the sampler as ``num_steps``.
+        discretization_config: Passed through to the sampler.
+        guider_config: Passed through to the sampler.
+        key: Suffix used in the widget labels.
+
+    Returns:
+        The constructed sampler instance.
+
+    Raises:
+        ValueError: For an unrecognised ``sampler_name``.
+    """
+    # Keyword arguments shared by every sampler class.
+    common = dict(
+        num_steps=steps,
+        discretization_config=discretization_config,
+        guider_config=guider_config,
+        verbose=True,
+    )
+    edm_samplers = {
+        "EulerEDMSampler": EulerEDMSampler,
+        "HeunEDMSampler": HeunEDMSampler,
+    }
+    ancestral_samplers = {
+        "EulerAncestralSampler": EulerAncestralSampler,
+        "DPMPP2SAncestralSampler": DPMPP2SAncestralSampler,
+    }
+
+    if sampler_name in edm_samplers:
+        s_churn = st.sidebar.number_input(f"s_churn #{key}", value=0.0, min_value=0.0)
+        s_tmin = st.sidebar.number_input(f"s_tmin #{key}", value=0.0, min_value=0.0)
+        s_tmax = st.sidebar.number_input(f"s_tmax #{key}", value=999.0, min_value=0.0)
+        s_noise = st.sidebar.number_input(f"s_noise #{key}", value=1.0, min_value=0.0)
+        return edm_samplers[sampler_name](
+            s_churn=s_churn,
+            s_tmin=s_tmin,
+            s_tmax=s_tmax,
+            s_noise=s_noise,
+            **common,
+        )
+
+    if sampler_name in ancestral_samplers:
+        s_noise = st.sidebar.number_input(f"s_noise #{key}", value=1.0, min_value=0.0)
+        eta = st.sidebar.number_input(f"eta #{key}", value=1.0, min_value=0.0)
+        return ancestral_samplers[sampler_name](eta=eta, s_noise=s_noise, **common)
+
+    if sampler_name == "DPMPP2MSampler":
+        return DPMPP2MSampler(**common)
+
+    if sampler_name == "LinearMultistepSampler":
+        order = st.sidebar.number_input(f"order #{key}", value=4, min_value=1)
+        return LinearMultistepSampler(order=order, **common)
+
+    raise ValueError(f"unknown sampler {sampler_name}!")
+
+
+def get_interactive_image(key=None) -> Optional[Image.Image]:
+    """Show a file uploader and return the uploaded picture as an RGB image.
+
+    Args:
+        key: Optional Streamlit widget key, so that several uploaders can be
+            shown on one page. Defaults to ``None`` as before.
+
+    Returns:
+        The uploaded image converted to mode ``"RGB"`` if necessary, or
+        ``None`` while nothing has been uploaded.
+    """
+    uploaded = st.file_uploader("Input", type=["jpg", "JPEG", "png"], key=key)
+    if uploaded is None:
+        return None
+    image = Image.open(uploaded)
+    if image.mode != "RGB":
+        image = image.convert("RGB")
+    return image
+
+
+def load_img(
+    display: bool = True,
+    size: Union[None, int, Tuple[int, int]] = None,
+    center_crop: bool = False,
+    key: Optional[str] = None,
+):
+    """Let the user upload an image and return it as a normalised tensor.
+
+    Args:
+        display: When true, the uploaded image is shown on the page.
+        size: Optional target passed to ``transforms.Resize`` (and to
+            ``transforms.CenterCrop`` when ``center_crop`` is set).
+        center_crop: When true, a ``CenterCrop(size)`` is applied after the
+            resize.
+        key: Optional Streamlit widget key forwarded to the uploader. Callers
+            in this file already pass ``key=...``, which the previous
+            signature rejected with a ``TypeError``.
+
+    Returns:
+        ``None`` while nothing is uploaded; otherwise the transformed image
+        with a leading axis of size 1 and values mapped by ``2 * x - 1``.
+    """
+    image = get_interactive_image(key=key)
+    if image is None:
+        return None
+    if display:
+        st.image(image)
+    w, h = image.size
+    print(f"loaded input image of size ({w}, {h})")
+
+    steps = []
+    if size is not None:
+        steps.append(transforms.Resize(size))
+    if center_crop:
+        steps.append(transforms.CenterCrop(size))
+    steps.append(transforms.ToTensor())
+    # Shift the ToTensor output with 2 * x - 1.
+    steps.append(transforms.Lambda(lambda x: 2.0 * x - 1.0))
+
+    img = transforms.Compose(steps)(image)[None, ...]
+    st.text(f"input min/max/mean: {img.min():.3f}/{img.max():.3f}/{img.mean():.3f}")
+    return img
+
+
+def get_init_img(batch_size=1, key=None):
+    """Load the uploaded image onto the GPU and repeat it along the batch axis.
+
+    Args:
+        batch_size: Number of copies along the leading axis.
+        key: Widget key forwarded to ``load_img``.
+
+    Returns:
+        The repeated image tensor, or ``None`` while no image has been
+        uploaded (previously this case failed with an ``AttributeError`` on
+        ``None.cuda()``).
+    """
+    init_image = load_img(key=key)
+    if init_image is None:
+        return None
+    init_image = init_image.cuda()
+    return repeat(init_image, "1 ... -> b ...", b=batch_size)
+
+
+def do_sample(
+    model,
+    sampler,
+    value_dict,
+    num_samples,
+    H,
+    W,
+    C,
+    F,
+    force_uc_zero_embeddings: Optional[List] = None,
+    force_cond_zero_embeddings: Optional[List] = None,
+    batch2model_input: List = None,
+    return_latents=False,
+    filter=None,
+    T=None,
+    additional_batch_uc_fields=None,
+    decoding_t=None,
+):
+    """Sample from random noise, decode, and display the result in Streamlit.
+
+    Args:
+        model: Model exposing ``conditioner``, ``denoiser``, ``model``,
+            ``first_stage_model``, ``ema_scope`` and ``decode_first_stage``.
+        sampler: Callable invoked as ``sampler(denoiser, x, cond=..., uc=...)``.
+        value_dict: Conditioning values handed to ``get_batch``.
+        num_samples: Number of samples; combined with ``T`` when that is set.
+        H, W, C, F: The initial noise has shape
+            ``(prod(num_samples), C, H // F, W // F)``.
+        force_uc_zero_embeddings: Forwarded to the conditioner (``[]`` if None).
+        force_cond_zero_embeddings: Forwarded to the conditioner unchanged.
+        batch2model_input: Batch keys to pass to the denoiser as extra inputs.
+        return_latents: When true, the sampled latents are returned as well.
+        filter: Optional callable applied to the decoded samples.
+        T: Optional frame count; enables the per-video display branch.
+        additional_batch_uc_fields: Forwarded to ``get_batch``.
+        decoding_t: Assigned to ``model.en_and_decode_n_samples_a_time``
+            (also when it is ``None``).
+
+    Returns:
+        ``samples`` clamped to ``[0, 1]``, or ``(samples, samples_z)`` when
+        ``return_latents`` is true.
+    """
+    force_uc_zero_embeddings = default(force_uc_zero_embeddings, [])
+    batch2model_input = default(batch2model_input, [])
+    additional_batch_uc_fields = default(additional_batch_uc_fields, [])
+
+    st.text("Sampling")
+
+    # Placeholder that is filled with the image grid once sampling is done.
+    outputs = st.empty()
+    precision_scope = autocast
+    with torch.no_grad():
+        with precision_scope("cuda"):
+            with model.ema_scope():
+                # From here on num_samples is a list: [n] or [n, T].
+                if T is not None:
+                    num_samples = [num_samples, T]
+                else:
+                    num_samples = [num_samples]
+
+                # Sub-models are loaded right before use and unloaded right
+                # after (unload_model only acts in low-VRAM mode).
+                load_model(model.conditioner)
+                batch, batch_uc = get_batch(
+                    get_unique_embedder_keys_from_conditioner(model.conditioner),
+                    value_dict,
+                    num_samples,
+                    T=T,
+                    additional_batch_uc_fields=additional_batch_uc_fields,
+                )
+
+                c, uc = model.conditioner.get_unconditional_conditioning(
+                    batch,
+                    batch_uc=batch_uc,
+                    force_uc_zero_embeddings=force_uc_zero_embeddings,
+                    force_cond_zero_embeddings=force_cond_zero_embeddings,
+                )
+                unload_model(model.conditioner)
+
+                for k in c:
+                    # Every entry except "crossattn" is truncated to the total
+                    # sample count and moved to the GPU.
+                    if not k == "crossattn":
+                        c[k], uc[k] = map(
+                            lambda y: y[k][: math.prod(num_samples)].to("cuda"), (c, uc)
+                        )
+                    # With T set, repeat these entries T times and fold the
+                    # new axis into the batch axis.
+                    if k in ["crossattn", "concat"] and T is not None:
+                        uc[k] = repeat(uc[k], "b ... -> b t ...", t=T)
+                        uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=T)
+                        c[k] = repeat(c[k], "b ... -> b t ...", t=T)
+                        c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=T)
+
+                additional_model_inputs = {}
+                for k in batch2model_input:
+                    if k == "image_only_indicator":
+                        assert T is not None
+
+                        # These two guider types get a first axis twice as
+                        # long as the sample count.
+                        if isinstance(
+                            sampler.guider, (VanillaCFG, LinearPredictionGuider)
+                        ):
+                            additional_model_inputs[k] = torch.zeros(
+                                num_samples[0] * 2, num_samples[1]
+                            ).to("cuda")
+                        else:
+                            additional_model_inputs[k] = torch.zeros(num_samples).to(
+                                "cuda"
+                            )
+                    else:
+                        additional_model_inputs[k] = batch[k]
+
+                shape = (math.prod(num_samples), C, H // F, W // F)
+                randn = torch.randn(shape).to("cuda")
+
+                # Closure passed to the sampler; injects the extra inputs.
+                def denoiser(input, sigma, c):
+                    return model.denoiser(
+                        model.model, input, sigma, c, **additional_model_inputs
+                    )
+
+                load_model(model.denoiser)
+                load_model(model.model)
+                samples_z = sampler(denoiser, randn, cond=c, uc=uc)
+                unload_model(model.model)
+                unload_model(model.denoiser)
+
+                load_model(model.first_stage_model)
+                model.en_and_decode_n_samples_a_time = (
+                    decoding_t  # Decode n frames at a time
+                )
+                samples_x = model.decode_first_stage(samples_z)
+                # Map the decoder output with (x + 1) / 2 and clamp to [0, 1].
+                samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
+                unload_model(model.first_stage_model)
+
+                if filter is not None:
+                    samples = filter(samples)
+
+                if T is None:
+                    # Single row: all samples side by side.
+                    grid = torch.stack([samples])
+                    grid = rearrange(grid, "n b c h w -> (n h) (b w) c")
+                    outputs.image(grid.cpu().numpy())
+                else:
+                    # One grid (4 frames per row) per video.
+                    as_vids = rearrange(samples, "(b t) c h w -> b t c h w", t=T)
+                    for i, vid in enumerate(as_vids):
+                        grid = rearrange(make_grid(vid, nrow=4), "c h w -> h w c")
+                        st.image(
+                            grid.cpu().numpy(),
+                            f"Sample #{i} as image",
+                        )
+
+                if return_latents:
+                    return samples, samples_z
+                return samples
+
+
+def get_batch(
+    keys,
+    value_dict: dict,
+    N: Union[List, ListConfig],
+    device: str = "cuda",
+    T: Optional[int] = None,
+    additional_batch_uc_fields: Optional[List[str]] = None,
+):
+    """Build the conditional and unconditional batches for the conditioner.
+
+    Args:
+        keys: Embedder input keys to populate.
+        value_dict: Source of the conditioning values; unrecognised keys are
+            copied through as ``value_dict[key]``.
+        N: Sample-count list; most entries are repeated ``math.prod(N)``
+            times, the ``cond_frames*`` entries ``N[0]`` times.
+        device: Device the created tensors are moved to. This now also
+            applies to ``"cond_aug"``, which previously always used "cuda".
+        T: When given, stored in the batch as ``"num_video_frames"``.
+        additional_batch_uc_fields: Non-tensor batch keys that should be
+            shallow-copied into the unconditional batch.
+
+    Returns:
+        ``(batch, batch_uc)``. Tensor entries missing from ``batch_uc`` are
+        cloned from ``batch``.
+    """
+    # Hardcoded demo setups; might undergo some changes in the future
+    if additional_batch_uc_fields is None:
+        additional_batch_uc_fields = []
+
+    total = math.prod(N)
+    batch = {}
+    batch_uc = {}
+
+    for key in keys:
+        if key == "txt":
+            batch["txt"] = [value_dict["prompt"]] * total
+            batch_uc["txt"] = [value_dict["negative_prompt"]] * total
+        elif key == "original_size_as_tuple":
+            batch["original_size_as_tuple"] = (
+                torch.tensor([value_dict["orig_height"], value_dict["orig_width"]])
+                .to(device)
+                .repeat(total, 1)
+            )
+        elif key == "crop_coords_top_left":
+            batch["crop_coords_top_left"] = (
+                torch.tensor(
+                    [value_dict["crop_coords_top"], value_dict["crop_coords_left"]]
+                )
+                .to(device)
+                .repeat(total, 1)
+            )
+        elif key == "aesthetic_score":
+            batch["aesthetic_score"] = (
+                torch.tensor([value_dict["aesthetic_score"]])
+                .to(device)
+                .repeat(total, 1)
+            )
+            # The unconditional branch uses the "negative" score instead.
+            batch_uc["aesthetic_score"] = (
+                torch.tensor([value_dict["negative_aesthetic_score"]])
+                .to(device)
+                .repeat(total, 1)
+            )
+        elif key == "target_size_as_tuple":
+            batch["target_size_as_tuple"] = (
+                torch.tensor([value_dict["target_height"], value_dict["target_width"]])
+                .to(device)
+                .repeat(total, 1)
+            )
+        elif key in ("fps", "fps_id", "motion_bucket_id"):
+            # 1-D tensors: the scalar is repeated once per sample.
+            batch[key] = torch.tensor([value_dict[key]]).to(device).repeat(total)
+        elif key == "pool_image":
+            batch[key] = repeat(value_dict[key], "1 ... -> b ...", b=total).to(
+                device, dtype=torch.half
+            )
+        elif key == "cond_aug":
+            batch[key] = repeat(
+                torch.tensor([value_dict["cond_aug"]]).to(device),
+                "1 -> b",
+                b=total,
+            )
+        elif key in ("cond_frames", "cond_frames_without_noise"):
+            batch[key] = repeat(value_dict[key], "1 ... -> b ...", b=N[0])
+        else:
+            batch[key] = value_dict[key]
+
+    if T is not None:
+        batch["num_video_frames"] = T
+
+    for key, value in batch.items():
+        if key in batch_uc:
+            continue
+        if isinstance(value, torch.Tensor):
+            batch_uc[key] = torch.clone(value)
+        elif key in additional_batch_uc_fields:
+            batch_uc[key] = copy.copy(value)
+    return batch, batch_uc
+
+
+@torch.no_grad()
+def do_img2img(
+    img,
+    model,
+    sampler,
+    value_dict,
+    num_samples,
+    force_uc_zero_embeddings: Optional[List] = None,
+    force_cond_zero_embeddings: Optional[List] = None,
+    additional_kwargs=None,
+    offset_noise_level: float = 0.0,
+    return_latents=False,
+    skip_encode=False,
+    filter=None,
+    add_noise=True,
+):
+    """Noise an image (or latent), denoise it with ``sampler`` and display it.
+
+    Args:
+        img: Input image batch, or a latent when ``skip_encode`` is true.
+        model: Model exposing ``conditioner``, ``denoiser``, ``model``,
+            ``first_stage_model``, ``ema_scope``, ``encode_first_stage`` and
+            ``decode_first_stage``.
+        sampler: Sampler with ``discretization`` and ``num_steps``; called as
+            ``sampler(denoiser, x, cond=..., uc=...)``.
+        value_dict: Conditioning values handed to ``get_batch``.
+        num_samples: Number of samples in the batch.
+        force_uc_zero_embeddings: Forwarded to the conditioner.
+        force_cond_zero_embeddings: Forwarded to the conditioner.
+        additional_kwargs: Optional extra entries written into both the
+            conditional and unconditional conditioning dicts.
+        offset_noise_level: When > 0, a per-sample offset scaled by this
+            value is added to the noise.
+        return_latents: When true, the sampled latents are returned as well.
+        skip_encode: When true, ``img`` is used as the latent directly.
+        filter: Optional callable applied to the decoded samples.
+        add_noise: When false, the latent is only rescaled, not noised.
+
+    Returns:
+        ``samples`` clamped to ``[0, 1]``, or ``(samples, samples_z)`` when
+        ``return_latents`` is true.
+    """
+    # Avoid a shared mutable default argument.
+    additional_kwargs = {} if additional_kwargs is None else additional_kwargs
+
+    st.text("Sampling")
+
+    # Placeholder that is filled with the image grid once sampling is done.
+    outputs = st.empty()
+    # Gradients are already disabled by the decorator.
+    with autocast("cuda"), model.ema_scope():
+        load_model(model.conditioner)
+        batch, batch_uc = get_batch(
+            get_unique_embedder_keys_from_conditioner(model.conditioner),
+            value_dict,
+            [num_samples],
+        )
+        c, uc = model.conditioner.get_unconditional_conditioning(
+            batch,
+            batch_uc=batch_uc,
+            force_uc_zero_embeddings=force_uc_zero_embeddings,
+            force_cond_zero_embeddings=force_cond_zero_embeddings,
+        )
+        unload_model(model.conditioner)
+        for k in c:
+            c[k], uc[k] = map(lambda y: y[k][:num_samples].to("cuda"), (c, uc))
+
+        for k in additional_kwargs:
+            c[k] = uc[k] = additional_kwargs[k]
+
+        if skip_encode:
+            z = img
+        else:
+            load_model(model.first_stage_model)
+            z = model.encode_first_stage(img)
+            unload_model(model.first_stage_model)
+
+        noise = torch.randn_like(z)
+
+        sigmas = sampler.discretization(sampler.num_steps).cuda()
+        # The first sigma of the schedule is the noising level.
+        sigma = sigmas[0]
+
+        st.info(f"all sigmas: {sigmas}")
+        st.info(f"noising sigma: {sigma}")
+        if offset_noise_level > 0.0:
+            noise = noise + offset_noise_level * append_dims(
+                torch.randn(z.shape[0], device=z.device), z.ndim
+            )
+
+        # Note: hardcoded to DDPM-like scaling. need to generalize later.
+        scale = torch.sqrt(1.0 + sigmas[0] ** 2.0)
+        if add_noise:
+            noised_z = (z + noise * append_dims(sigma, z.ndim).cuda()) / scale
+        else:
+            noised_z = z / scale
+
+        def denoiser(x, sigma, c):
+            return model.denoiser(model.model, x, sigma, c)
+
+        load_model(model.denoiser)
+        load_model(model.model)
+        samples_z = sampler(denoiser, noised_z, cond=c, uc=uc)
+        unload_model(model.model)
+        unload_model(model.denoiser)
+
+        load_model(model.first_stage_model)
+        samples_x = model.decode_first_stage(samples_z)
+        unload_model(model.first_stage_model)
+        # Map the decoder output with (x + 1) / 2 and clamp to [0, 1].
+        samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
+
+        if filter is not None:
+            samples = filter(samples)
+
+        # ``grid`` was previously used before assignment, which raised an
+        # UnboundLocalError; build it from the samples as do_sample does.
+        grid = torch.stack([samples])
+        grid = rearrange(grid, "n b c h w -> (n h) (b w) c")
+        outputs.image(grid.cpu().numpy())
+        if return_latents:
+            return samples, samples_z
+        return samples
+
+
+def get_resizing_factor(
+    desired_shape: Tuple[int, int], current_shape: Tuple[int, int]
+) -> float:
+    """Return the resize factor at which ``current_shape`` covers ``desired_shape``.
+
+    Both shapes are indexed as (height, width). ``load_img_for_prediction``
+    resizes by this factor and then crops whatever overhangs the target.
+    """
+    target_ratio = desired_shape[1] / desired_shape[0]
+    source_ratio = current_shape[1] / current_shape[0]
+    short_target, long_target = min(desired_shape), max(desired_shape)
+    short_source, long_source = min(current_shape), max(current_shape)
+    if target_ratio >= 1.0:  # target is landscape or square
+        overshoots = source_ratio >= target_ratio
+        flipped = source_ratio < 1.0
+    else:  # target is portrait
+        overshoots = source_ratio <= target_ratio
+        flipped = source_ratio > 1
+    if overshoots:  # source is at least as elongated as the target
+        return short_target / short_source
+    if flipped:  # source orientation is opposite to the target's
+        return long_target / short_source
+    return long_target / long_source
+
+
+def get_interactive_image(key=None) -> Image.Image:
+    """Return the uploaded file as an RGB PIL image, or None if nothing is uploaded."""
+    uploaded = st.file_uploader("Input", type=["jpg", "JPEG", "png"], key=key)
+    if uploaded is None:
+        return None
+    picture = Image.open(uploaded)
+    return picture if picture.mode == "RGB" else picture.convert("RGB")
+
+
+def load_img_for_prediction(
+    W: int, H: int, display=True, key=None, device="cuda"
+) -> torch.Tensor:
+    """Load the uploaded image as a ``1 x 3 x H x W`` tensor scaled to [-1, 1].
+
+    The upload is resized by ``get_resizing_factor`` and center-cropped to
+    ``H`` x ``W``. Returns None while no file has been uploaded. With ``display``
+    set, both the raw upload and the cropped result are shown via Streamlit.
+    """
+    upload = get_interactive_image(key=key)
+    if upload is None:
+        return None
+    if display:
+        st.image(upload)
+    src_w, src_h = upload.size  # PIL reports (width, height)
+    chw = np.array(upload).transpose(2, 0, 1)
+    pixels = (torch.from_numpy(chw).to(dtype=torch.float32) / 255.0).unsqueeze(0)
+
+    scale = get_resizing_factor((H, W), (src_h, src_w))
+    new_h, new_w = (int(np.ceil(scale * side)) for side in (src_h, src_w))
+    pixels = torch.nn.functional.interpolate(
+        pixels, [new_h, new_w], mode="area", antialias=False
+    )
+    top, left = (new_h - H) // 2, (new_w - W) // 2
+    pixels = TT.functional.crop(pixels, top=top, left=left, height=H, width=W)
+    if display:
+        preview = pixels[0].numpy().transpose(1, 2, 0)
+        st.image(Image.fromarray((preview * 255).astype(np.uint8)))
+    return pixels.to(device) * 2.0 - 1.0
+
+
+def save_video_as_grid_and_mp4(
+    video_batch: torch.Tensor, save_path: str, T: int, fps: int = 5
+):
+    """Save each video of the batch as a PNG frame grid plus MP4 files.
+
+    ``video_batch`` holds ``b * T`` frames stacked along dim 0. Each video is
+    written as ``mp4v``, re-encoded to H.264 by ffmpeg and shown in Streamlit;
+    numbering starts at the count of ``*.mp4`` files already in ``save_path``.
+    """
+    import subprocess  # function-scope import so the module header stays untouched
+
+    os.makedirs(save_path, exist_ok=True)
+    base_count = len(glob(os.path.join(save_path, "*.mp4")))
+    video_batch = rearrange(video_batch, "(b t) c h w -> b t c h w", t=T)
+    video_batch = embed_watermark(video_batch)
+    for vid in video_batch:
+        save_image(vid, fp=os.path.join(save_path, f"{base_count:06d}.png"), nrow=4)
+        video_path = os.path.join(save_path, f"{base_count:06d}.mp4")
+        frame_size = (vid.shape[-1], vid.shape[-2])  # (width, height)
+        frames = rearrange(vid, "t c h w -> t h w c") * 255
+        frames = frames.cpu().numpy().astype(np.uint8)
+        writer = cv2.VideoWriter(
+            video_path, cv2.VideoWriter_fourcc(*"MP4V"), fps, frame_size
+        )
+        try:
+            for frame in frames:
+                writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
+        finally:
+            writer.release()  # finalize the file even if a frame fails to encode
+
+        video_path_h264 = video_path[:-4] + "_h264.mp4"
+        # Argument list, not a shell string: paths containing spaces or shell
+        # metacharacters reach ffmpeg verbatim instead of being re-parsed.
+        subprocess.run(["ffmpeg", "-i", video_path, "-c:v", "libx264", video_path_h264])
+        with open(video_path_h264, "rb") as f:
+            st.video(f.read())
+        base_count += 1
diff --git a/scripts/demo/video_sampling.py b/scripts/demo/video_sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..95789020110f3abaf6e5a7755a7e5910b864c6df
--- /dev/null
+++ b/scripts/demo/video_sampling.py
@@ -0,0 +1,200 @@
+import os
+
+from pytorch_lightning import seed_everything
+
+from scripts.demo.streamlit_helpers import *
+
+SAVE_PATH = "outputs/demo/vid/"  # root output dir; the selected version name is appended before saving
+
+VERSION2SPECS = {  # per-checkpoint presets, keyed by the name offered in the "Model Version" selectbox
+    "svd": {
+        "T": 14,  # default frame count (sidebar "T"; copied into options["num_frames"])
+        "H": 576,  # default height (sidebar "H")
+        "W": 1024,  # default width (sidebar "W")
+        "C": 4,  # passed to do_sample as C
+        "f": 8,  # passed to do_sample as F
+        "config": "configs/inference/svd.yaml",  # presumably read by init_st(version_dict, ...) — verify
+        "ckpt": "checkpoints/svd.safetensors",  # presumably read by init_st(version_dict, ...) — verify
+        "options": {  # handed to init_sampling(options=...)
+            "discretization": 1,
+            "cfg": 2.5,  # same number as max_scale in scripts/sampling/configs/svd.yaml
+            "sigma_min": 0.002,
+            "sigma_max": 700.0,
+            "rho": 7.0,
+            "guider": 2,
+            "force_uc_zero_embeddings": ["cond_frames", "cond_frames_without_noise"],  # forwarded to do_sample
+            "num_steps": 25,
+        },
+    },
+    "svd_image_decoder": {  # same values as "svd" except for the config and ckpt paths
+        "T": 14,
+        "H": 576,
+        "W": 1024,
+        "C": 4,
+        "f": 8,
+        "config": "configs/inference/svd_image_decoder.yaml",
+        "ckpt": "checkpoints/svd_image_decoder.safetensors",
+        "options": {
+            "discretization": 1,
+            "cfg": 2.5,
+            "sigma_min": 0.002,
+            "sigma_max": 700.0,
+            "rho": 7.0,
+            "guider": 2,
+            "force_uc_zero_embeddings": ["cond_frames", "cond_frames_without_noise"],
+            "num_steps": 25,
+        },
+    },
+    "svd_xt": {
+        "T": 25,
+        "H": 576,
+        "W": 1024,
+        "C": 4,
+        "f": 8,
+        "config": "configs/inference/svd.yaml",  # same config path as "svd"; only the checkpoint differs
+        "ckpt": "checkpoints/svd_xt.safetensors",
+        "options": {
+            "discretization": 1,
+            "cfg": 3.0,  # same number as max_scale in scripts/sampling/configs/svd_xt.yaml
+            "min_cfg": 1.5,  # same number as min_scale in scripts/sampling/configs/svd_xt.yaml
+            "sigma_min": 0.002,
+            "sigma_max": 700.0,
+            "rho": 7.0,
+            "guider": 2,
+            "force_uc_zero_embeddings": ["cond_frames", "cond_frames_without_noise"],
+            "num_steps": 30,
+            "decoding_t": 14,  # default of the "Decode t frames at a time" input; entries without it default to T
+        },
+    },
+    "svd_xt_image_decoder": {  # same values as "svd_xt" except for the config and ckpt paths
+        "T": 25,
+        "H": 576,
+        "W": 1024,
+        "C": 4,
+        "f": 8,
+        "config": "configs/inference/svd_image_decoder.yaml",
+        "ckpt": "checkpoints/svd_xt_image_decoder.safetensors",
+        "options": {
+            "discretization": 1,
+            "cfg": 3.0,
+            "min_cfg": 1.5,
+            "sigma_min": 0.002,
+            "sigma_max": 700.0,
+            "rho": 7.0,
+            "guider": 2,
+            "force_uc_zero_embeddings": ["cond_frames", "cond_frames_without_noise"],
+            "num_steps": 30,
+            "decoding_t": 14,
+        },
+    },
+}
+
+
+if __name__ == "__main__":
+    # Streamlit entry point: pick a checkpoint preset, upload a conditioning
+    # image, tune the sampler and generate a video from it.
+    st.title("Stable Video Diffusion")
+    version = st.selectbox("Model Version", list(VERSION2SPECS), 0)
+    version_dict = VERSION2SPECS[version]
+    # The model-dependent part of the UI only runs once the box is ticked.
+    mode = "img2vid" if st.checkbox("Load Model") else "skip"
+
+    H = st.sidebar.number_input(
+        "H", value=version_dict["H"], min_value=64, max_value=2048
+    )
+    W = st.sidebar.number_input(
+        "W", value=version_dict["W"], min_value=64, max_value=2048
+    )
+    T = st.sidebar.number_input(
+        "T", value=version_dict["T"], min_value=0, max_value=128
+    )
+    C = version_dict["C"]
+    F = version_dict["f"]
+    options = version_dict["options"]
+
+    if mode != "skip":
+        state = init_st(version_dict, load_filter=True)
+        if state["msg"]:
+            st.info(state["msg"])
+        model = state["model"]
+
+        ukeys = set(get_unique_embedder_keys_from_conditioner(model.conditioner))
+        value_dict = init_embedder_options(ukeys, {})
+        value_dict["image_only_indicator"] = 0
+
+        img = None
+        if mode == "img2vid":
+            # Returns None until a file has been uploaded.
+            img = load_img_for_prediction(W, H)
+            cond_aug = st.number_input(
+                "Conditioning augmentation:", value=0.02, min_value=0.0
+            )
+            if img is None:
+                st.warning("Upload an input image to enable sampling.")
+            else:
+                value_dict["cond_frames_without_noise"] = img
+                # NOTE(review): this noise is drawn before seed_everything()
+                # below — confirm that is intended.
+                value_dict["cond_frames"] = img + cond_aug * torch.randn_like(img)
+            value_dict["cond_aug"] = cond_aug
+
+        seed = st.sidebar.number_input(
+            "seed", value=23, min_value=0, max_value=int(1e9)
+        )
+        seed_everything(seed)
+
+        save_locally, save_path = init_save_locally(
+            os.path.join(SAVE_PATH, version), init_value=True
+        )
+
+        options["num_frames"] = T
+
+        sampler, num_rows, num_cols = init_sampling(options=options)
+        num_samples = num_rows * num_cols
+
+        decoding_t = st.number_input(
+            "Decode t frames at a time (set small if you are low on VRAM)",
+            value=options.get("decoding_t", T),
+            min_value=1,
+            max_value=int(1e9),
+        )
+
+        if st.checkbox("Overwrite fps in mp4 generator", False):
+            saving_fps = st.number_input(
+                "saving video at fps:", value=value_dict["fps"], min_value=1
+            )
+        else:
+            saving_fps = value_dict["fps"]
+
+        sample_clicked = st.button("Sample")
+        if sample_clicked and img is None:
+            # No upload means value_dict lacks the conditioning frames.
+            st.error("Please upload an input image before sampling.")
+        elif sample_clicked:
+            out = do_sample(
+                model,
+                sampler,
+                value_dict,
+                num_samples,
+                H,
+                W,
+                C,
+                F,
+                T=T,
+                batch2model_input=["num_video_frames", "image_only_indicator"],
+                force_uc_zero_embeddings=options.get("force_uc_zero_embeddings", None),
+                force_cond_zero_embeddings=options.get(
+                    "force_cond_zero_embeddings", None
+                ),
+                return_latents=False,
+                decoding_t=decoding_t,
+            )
+
+            if isinstance(out, (tuple, list)):
+                samples, samples_z = out
+            else:
+                samples = out
+                samples_z = None
+
+            if save_locally:
+                save_video_as_grid_and_mp4(samples, save_path, T, fps=saving_fps)
diff --git a/scripts/sampling/configs/svd.yaml b/scripts/sampling/configs/svd.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ec8e80552ff47df39e59c5f2e3411677a51c09b1
--- /dev/null
+++ b/scripts/sampling/configs/svd.yaml
@@ -0,0 +1,146 @@
+model:  # config selected by sample(version="svd") in scripts/sampling/simple_video_sample.py
+  target: sgm.models.diffusion.DiffusionEngine  # dotted path; presumably resolved by sgm.util.instantiate_from_config — confirm
+  params:
+    scale_factor: 0.18215
+    disable_first_stage_autocast: True
+    ckpt_path: checkpoints/svd.safetensors  # same path as the default `ckpt` argument of sample()
+
+    denoiser_config:
+      target: sgm.modules.diffusionmodules.denoiser.Denoiser
+      params:
+        scaling_config:
+          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
+
+    network_config:
+      target: sgm.modules.diffusionmodules.video_model.VideoUNet
+      params:
+        adm_in_channels: 768  # 3 x 256; NOTE(review): presumably the fps_id, motion_bucket_id and cond_aug embeddings (outdim 256 each) concatenated — confirm
+        num_classes: sequential
+        use_checkpoint: True
+        in_channels: 8  # NOTE(review): 2 x out_channels — presumably noisy latent plus the cond_frames latent; confirm
+        out_channels: 4  # equals z_channels of the first-stage autoencoder in this file
+        model_channels: 320
+        attention_resolutions: [4, 2, 1]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4, 4]
+        num_head_channels: 64
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024  # NOTE(review): presumably the OpenCLIP image-embedding width — confirm
+        spatial_transformer_attn_type: softmax-xformers  # NOTE(review): presumably needs the xformers package — confirm
+        extra_ff_mix_layer: True
+        use_spatial_context: True
+        merge_strategy: learned_with_images
+        video_kernel_size: [3, 1, 1]
+
+    conditioner_config:
+      target: sgm.modules.GeneralConditioner
+      params:
+        emb_models:  # one embedder per conditioning input; all are marked non-trainable
+        - is_trainable: False
+          input_key: cond_frames_without_noise  # scripts/demo/video_sampling.py stores the clean input image under this key
+          target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
+          params:
+            n_cond_frames: 1
+            n_copies: 1
+            open_clip_embedding_config:
+              target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
+              params:
+                freeze: True
+
+        - input_key: fps_id  # sample() in simple_video_sample.py has a parameter of this name (default 6)
+          is_trainable: False
+          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+          params:
+            outdim: 256
+
+        - input_key: motion_bucket_id  # sample() in simple_video_sample.py has a parameter of this name (default 127)
+          is_trainable: False
+          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+          params:
+            outdim: 256
+
+        - input_key: cond_frames  # the demo stores the input image plus cond_aug-scaled Gaussian noise under this key
+          is_trainable: False
+          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
+          params:
+            disable_encoder_autocast: True
+            n_cond_frames: 1
+            n_copies: 1
+            is_ae: True
+            encoder_config:
+              target: sgm.models.autoencoder.AutoencoderKLModeOnly
+              params:
+                embed_dim: 4
+                monitor: val/rec_loss
+                ddconfig:
+                  attn_type: vanilla-xformers
+                  double_z: True
+                  z_channels: 4
+                  resolution: 256
+                  in_channels: 3
+                  out_ch: 3
+                  ch: 128
+                  ch_mult: [1, 2, 4, 4]
+                  num_res_blocks: 2
+                  attn_resolutions: []
+                  dropout: 0.0
+                lossconfig:
+                  target: torch.nn.Identity
+
+        - input_key: cond_aug  # noise strength used to build cond_frames (default 0.02 in both the demo and sample())
+          is_trainable: False
+          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+          params:
+            outdim: 256
+
+    first_stage_config:
+      target: sgm.models.autoencoder.AutoencodingEngine
+      params:
+        loss_config:
+          target: torch.nn.Identity
+        regularizer_config:
+          target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
+        encoder_config:  # same values as the ddconfig of the cond_frames encoder in conditioner_config, except attn_type
+          target: sgm.modules.diffusionmodules.model.Encoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4  # matches "C": 4 in VERSION2SPECS of scripts/demo/video_sampling.py
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [1, 2, 4, 4]
+            num_res_blocks: 2
+            attn_resolutions: []
+            dropout: 0.0
+        decoder_config:
+          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder  # svd_image_decoder.yaml uses a plain AutoencoderKL first stage instead
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [1, 2, 4, 4]
+            num_res_blocks: 2
+            attn_resolutions: []
+            dropout: 0.0
+            video_kernel_size: [3, 1, 1]  # same value as video_kernel_size in network_config
+
+    sampler_config:
+      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
+      params:  # no num_steps here; NOTE(review): presumably injected by the calling script (sample() defaults to 25 for "svd") — confirm
+        discretization_config:
+          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
+          params:
+            sigma_max: 700.0  # same value as "sigma_max" in the demo's VERSION2SPECS options
+
+        guider_config:
+          target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
+          params:
+            max_scale: 2.5  # same number as "cfg" for "svd" in scripts/demo/video_sampling.py
+            min_scale: 1.0
\ No newline at end of file
diff --git a/scripts/sampling/configs/svd_image_decoder.yaml b/scripts/sampling/configs/svd_image_decoder.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f731c5958f1935e4f3c1913c97fff4932520c12d
--- /dev/null
+++ b/scripts/sampling/configs/svd_image_decoder.yaml
@@ -0,0 +1,129 @@
+model:  # NOTE(review): presumably the config for sample(version="svd_image_decoder") — confirm in simple_video_sample.py
+  target: sgm.models.diffusion.DiffusionEngine  # dotted path; presumably resolved by sgm.util.instantiate_from_config — confirm
+  params:
+    scale_factor: 0.18215
+    disable_first_stage_autocast: True
+    ckpt_path: checkpoints/svd_image_decoder.safetensors  # same path as "ckpt" for "svd_image_decoder" in the demo's VERSION2SPECS
+
+    denoiser_config:
+      target: sgm.modules.diffusionmodules.denoiser.Denoiser
+      params:
+        scaling_config:
+          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
+
+    network_config:
+      target: sgm.modules.diffusionmodules.video_model.VideoUNet
+      params:
+        adm_in_channels: 768  # 3 x 256; NOTE(review): presumably the fps_id, motion_bucket_id and cond_aug embeddings (outdim 256 each) concatenated — confirm
+        num_classes: sequential
+        use_checkpoint: True
+        in_channels: 8  # NOTE(review): 2 x out_channels — presumably noisy latent plus the cond_frames latent; confirm
+        out_channels: 4  # equals z_channels of the first-stage autoencoder in this file
+        model_channels: 320
+        attention_resolutions: [4, 2, 1]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4, 4]
+        num_head_channels: 64
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024  # NOTE(review): presumably the OpenCLIP image-embedding width — confirm
+        spatial_transformer_attn_type: softmax-xformers  # NOTE(review): presumably needs the xformers package — confirm
+        extra_ff_mix_layer: True
+        use_spatial_context: True
+        merge_strategy: learned_with_images
+        video_kernel_size: [3, 1, 1]
+
+    conditioner_config:
+      target: sgm.modules.GeneralConditioner
+      params:
+        emb_models:  # one embedder per conditioning input; all are marked non-trainable
+        - is_trainable: False
+          input_key: cond_frames_without_noise  # scripts/demo/video_sampling.py stores the clean input image under this key
+          target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
+          params:
+            n_cond_frames: 1
+            n_copies: 1
+            open_clip_embedding_config:
+              target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
+              params:
+                freeze: True
+
+        - input_key: fps_id  # sample() in simple_video_sample.py has a parameter of this name (default 6)
+          is_trainable: False
+          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+          params:
+            outdim: 256
+
+        - input_key: motion_bucket_id  # sample() in simple_video_sample.py has a parameter of this name (default 127)
+          is_trainable: False
+          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+          params:
+            outdim: 256
+
+        - input_key: cond_frames  # the demo stores the input image plus cond_aug-scaled Gaussian noise under this key
+          is_trainable: False
+          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
+          params:
+            disable_encoder_autocast: True
+            n_cond_frames: 1
+            n_copies: 1
+            is_ae: True
+            encoder_config:
+              target: sgm.models.autoencoder.AutoencoderKLModeOnly
+              params:
+                embed_dim: 4
+                monitor: val/rec_loss
+                ddconfig:
+                  attn_type: vanilla-xformers
+                  double_z: True
+                  z_channels: 4
+                  resolution: 256
+                  in_channels: 3
+                  out_ch: 3
+                  ch: 128
+                  ch_mult: [1, 2, 4, 4]
+                  num_res_blocks: 2
+                  attn_resolutions: []
+                  dropout: 0.0
+                lossconfig:
+                  target: torch.nn.Identity
+
+        - input_key: cond_aug  # noise strength used to build cond_frames (default 0.02 in both the demo and sample())
+          is_trainable: False
+          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+          params:
+            outdim: 256
+
+    first_stage_config:
+      target: sgm.models.autoencoder.AutoencoderKL  # svd.yaml uses AutoencodingEngine with a VideoDecoder here instead
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:  # identical to the ddconfig of the cond_frames encoder in conditioner_config
+          attn_type: vanilla-xformers
+          double_z: True
+          z_channels: 4  # matches "C": 4 in VERSION2SPECS of scripts/demo/video_sampling.py
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 4, 4]
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    sampler_config:
+      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
+      params:  # no num_steps here; NOTE(review): presumably injected by the calling script — confirm
+        discretization_config:
+          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
+          params:
+            sigma_max: 700.0  # same value as "sigma_max" in the demo's VERSION2SPECS options
+
+        guider_config:
+          target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
+          params:
+            max_scale: 2.5  # same number as "cfg" for "svd_image_decoder" in scripts/demo/video_sampling.py
+            min_scale: 1.0
\ No newline at end of file
diff --git a/scripts/sampling/configs/svd_xt.yaml b/scripts/sampling/configs/svd_xt.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7e98c466aced5a540e2c8b0d81ee178d572400dc
--- /dev/null
+++ b/scripts/sampling/configs/svd_xt.yaml
@@ -0,0 +1,146 @@
+model:  # config selected by sample(version="svd_xt") in scripts/sampling/simple_video_sample.py
+  target: sgm.models.diffusion.DiffusionEngine  # dotted path; presumably resolved by sgm.util.instantiate_from_config — confirm
+  params:
+    scale_factor: 0.18215
+    disable_first_stage_autocast: True
+    ckpt_path: checkpoints/svd_xt.safetensors  # same path as "ckpt" for "svd_xt" in the demo's VERSION2SPECS
+
+    denoiser_config:
+      target: sgm.modules.diffusionmodules.denoiser.Denoiser
+      params:
+        scaling_config:
+          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
+
+    network_config:
+      target: sgm.modules.diffusionmodules.video_model.VideoUNet
+      params:
+        adm_in_channels: 768  # 3 x 256; NOTE(review): presumably the fps_id, motion_bucket_id and cond_aug embeddings (outdim 256 each) concatenated — confirm
+        num_classes: sequential
+        use_checkpoint: True
+        in_channels: 8  # NOTE(review): 2 x out_channels — presumably noisy latent plus the cond_frames latent; confirm
+        out_channels: 4  # equals z_channels of the first-stage autoencoder in this file
+        model_channels: 320
+        attention_resolutions: [4, 2, 1]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4, 4]
+        num_head_channels: 64
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024  # NOTE(review): presumably the OpenCLIP image-embedding width — confirm
+        spatial_transformer_attn_type: softmax-xformers  # NOTE(review): presumably needs the xformers package — confirm
+        extra_ff_mix_layer: True
+        use_spatial_context: True
+        merge_strategy: learned_with_images
+        video_kernel_size: [3, 1, 1]
+
+    conditioner_config:
+      target: sgm.modules.GeneralConditioner
+      params:
+        emb_models:  # one embedder per conditioning input; all are marked non-trainable
+        - is_trainable: False
+          input_key: cond_frames_without_noise  # scripts/demo/video_sampling.py stores the clean input image under this key
+          target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
+          params:
+            n_cond_frames: 1
+            n_copies: 1
+            open_clip_embedding_config:
+              target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
+              params:
+                freeze: True
+
+        - input_key: fps_id  # sample() in simple_video_sample.py has a parameter of this name (default 6)
+          is_trainable: False
+          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+          params:
+            outdim: 256
+
+        - input_key: motion_bucket_id  # sample() in simple_video_sample.py has a parameter of this name (default 127)
+          is_trainable: False
+          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+          params:
+            outdim: 256
+
+        - input_key: cond_frames  # the demo stores the input image plus cond_aug-scaled Gaussian noise under this key
+          is_trainable: False
+          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
+          params:
+            disable_encoder_autocast: True
+            n_cond_frames: 1
+            n_copies: 1
+            is_ae: True
+            encoder_config:
+              target: sgm.models.autoencoder.AutoencoderKLModeOnly
+              params:
+                embed_dim: 4
+                monitor: val/rec_loss
+                ddconfig:
+                  attn_type: vanilla-xformers
+                  double_z: True
+                  z_channels: 4
+                  resolution: 256
+                  in_channels: 3
+                  out_ch: 3
+                  ch: 128
+                  ch_mult: [1, 2, 4, 4]
+                  num_res_blocks: 2
+                  attn_resolutions: []
+                  dropout: 0.0
+                lossconfig:
+                  target: torch.nn.Identity
+
+        - input_key: cond_aug  # noise strength used to build cond_frames (default 0.02 in both the demo and sample())
+          is_trainable: False
+          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+          params:
+            outdim: 256
+
+    first_stage_config:
+      target: sgm.models.autoencoder.AutoencodingEngine
+      params:
+        loss_config:
+          target: torch.nn.Identity
+        regularizer_config:
+          target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
+        encoder_config:  # same values as the ddconfig of the cond_frames encoder in conditioner_config, except attn_type
+          target: sgm.modules.diffusionmodules.model.Encoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4  # matches "C": 4 in VERSION2SPECS of scripts/demo/video_sampling.py
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [1, 2, 4, 4]
+            num_res_blocks: 2
+            attn_resolutions: []
+            dropout: 0.0
+        decoder_config:
+          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder  # svd_xt_image_decoder.yaml uses a plain AutoencoderKL first stage instead
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [1, 2, 4, 4]
+            num_res_blocks: 2
+            attn_resolutions: []
+            dropout: 0.0
+            video_kernel_size: [3, 1, 1]  # same value as video_kernel_size in network_config
+
+    sampler_config:
+      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
+      params:  # no num_steps here; NOTE(review): presumably injected by the calling script (sample() defaults to 30 for "svd_xt") — confirm
+        discretization_config:
+          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
+          params:
+            sigma_max: 700.0  # same value as "sigma_max" in the demo's VERSION2SPECS options
+
+        guider_config:
+          target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
+          params:
+            max_scale: 3.0  # same number as "cfg" for "svd_xt" in scripts/demo/video_sampling.py
+            min_scale: 1.5  # same number as "min_cfg" for "svd_xt" in scripts/demo/video_sampling.py
\ No newline at end of file
diff --git a/scripts/sampling/configs/svd_xt_image_decoder.yaml b/scripts/sampling/configs/svd_xt_image_decoder.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..afb38387d2344b8f75371b3b51c281acca523272
--- /dev/null
+++ b/scripts/sampling/configs/svd_xt_image_decoder.yaml
@@ -0,0 +1,129 @@
+model:  # NOTE(review): the script that loads this config is not visible in this part of the diff — confirm
+  target: sgm.models.diffusion.DiffusionEngine  # dotted path; presumably resolved by sgm.util.instantiate_from_config — confirm
+  params:
+    scale_factor: 0.18215
+    disable_first_stage_autocast: True
+    ckpt_path: checkpoints/svd_xt_image_decoder.safetensors  # same path as "ckpt" for "svd_xt_image_decoder" in the demo's VERSION2SPECS
+
+    denoiser_config:
+      target: sgm.modules.diffusionmodules.denoiser.Denoiser
+      params:
+        scaling_config:
+          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
+
+    network_config:
+      target: sgm.modules.diffusionmodules.video_model.VideoUNet
+      params:
+        adm_in_channels: 768  # 3 x 256; NOTE(review): presumably the fps_id, motion_bucket_id and cond_aug embeddings (outdim 256 each) concatenated — confirm
+        num_classes: sequential
+        use_checkpoint: True
+        in_channels: 8  # NOTE(review): 2 x out_channels — presumably noisy latent plus the cond_frames latent; confirm
+        out_channels: 4  # equals z_channels of the first-stage autoencoder in this file
+        model_channels: 320
+        attention_resolutions: [4, 2, 1]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4, 4]
+        num_head_channels: 64
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024  # NOTE(review): presumably the OpenCLIP image-embedding width — confirm
+        spatial_transformer_attn_type: softmax-xformers  # NOTE(review): presumably needs the xformers package — confirm
+        extra_ff_mix_layer: True
+        use_spatial_context: True
+        merge_strategy: learned_with_images
+        video_kernel_size: [3, 1, 1]
+
+    conditioner_config:
+      target: sgm.modules.GeneralConditioner
+      params:
+        emb_models:  # one embedder per conditioning input; all are marked non-trainable
+        - is_trainable: False
+          input_key: cond_frames_without_noise  # scripts/demo/video_sampling.py stores the clean input image under this key
+          target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
+          params:
+            n_cond_frames: 1
+            n_copies: 1
+            open_clip_embedding_config:
+              target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
+              params:
+                freeze: True
+
+        - input_key: fps_id  # sample() in simple_video_sample.py has a parameter of this name (default 6)
+          is_trainable: False
+          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+          params:
+            outdim: 256
+
+        - input_key: motion_bucket_id  # sample() in simple_video_sample.py has a parameter of this name (default 127)
+          is_trainable: False
+          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+          params:
+            outdim: 256
+
+        - input_key: cond_frames  # the demo stores the input image plus cond_aug-scaled Gaussian noise under this key
+          is_trainable: False
+          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
+          params:
+            disable_encoder_autocast: True
+            n_cond_frames: 1
+            n_copies: 1
+            is_ae: True
+            encoder_config:
+              target: sgm.models.autoencoder.AutoencoderKLModeOnly
+              params:
+                embed_dim: 4
+                monitor: val/rec_loss
+                ddconfig:
+                  attn_type: vanilla-xformers
+                  double_z: True
+                  z_channels: 4
+                  resolution: 256
+                  in_channels: 3
+                  out_ch: 3
+                  ch: 128
+                  ch_mult: [1, 2, 4, 4]
+                  num_res_blocks: 2
+                  attn_resolutions: []
+                  dropout: 0.0
+                lossconfig:
+                  target: torch.nn.Identity
+
+        - input_key: cond_aug  # noise strength used to build cond_frames (default 0.02 in both the demo and sample())
+          is_trainable: False
+          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+          params:
+            outdim: 256
+
+    first_stage_config:
+      target: sgm.models.autoencoder.AutoencoderKL  # svd_xt.yaml uses AutoencodingEngine with a VideoDecoder here instead
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:  # identical to the ddconfig of the cond_frames encoder in conditioner_config
+          attn_type: vanilla-xformers
+          double_z: True
+          z_channels: 4  # matches "C": 4 in VERSION2SPECS of scripts/demo/video_sampling.py
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 4, 4]
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    sampler_config:
+      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
+      params:  # no num_steps here; NOTE(review): presumably injected by the calling script — confirm
+        discretization_config:
+          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
+          params:
+            sigma_max: 700.0  # same value as "sigma_max" in the demo's VERSION2SPECS options
+
+        guider_config:
+          target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
+          params:
+            max_scale: 3.0  # same number as "cfg" for "svd_xt_image_decoder" in scripts/demo/video_sampling.py
+            min_scale: 1.5  # same number as "min_cfg" for "svd_xt_image_decoder" in scripts/demo/video_sampling.py
\ No newline at end of file
diff --git a/scripts/sampling/simple_video_sample.py b/scripts/sampling/simple_video_sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ad738f17e3f054c89a17c331c607eb0d2a9d4ef
--- /dev/null
+++ b/scripts/sampling/simple_video_sample.py
@@ -0,0 +1,319 @@
+import datetime, time
+import os, sys, argparse
+import math
+from glob import glob
+from pathlib import Path
+from typing import Optional
+
+import cv2
+import numpy as np
+import torch
+from einops import rearrange, repeat
+from fire import Fire
+from omegaconf import OmegaConf
+from PIL import Image
+from torchvision.transforms import ToTensor
+
+sys.path.insert(1, os.path.join(sys.path[0], '..', '..'))
+from sgm.util import default, instantiate_from_config
+
+
+def sample(
+    input_path: str = "outputs/inputs/test_image.png",  # Can either be image file or folder with image files
+    ckpt: str = "checkpoints/svd.safetensors",
+    num_frames: Optional[int] = None,
+    num_steps: Optional[int] = None,
+    version: str = "svd",
+    fps_id: int = 6,
+    motion_bucket_id: int = 127,
+    cond_aug: float = 0.02,
+    seed: int = 23,
+    decoding_t: int = 1,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
+    device: str = "cuda",
+    output_folder: Optional[str] = None,
+    save_fps: int = 10,
+    resize: Optional[bool] = False,
+):
+    """
+    Generate one video per conditioning image and save it as `<output_folder>/<image stem>.mp4`.
+
+    `num_frames`, `num_steps` and `output_folder` fall back to per-`version` defaults when None.
+    An existing video of the same name is overwritten. If you run out of VRAM, try decreasing
+    `decoding_t`.
+
+    Args:
+        input_path: a jpg/jpeg/png file or a folder of such files (suffix match is case-insensitive).
+        ckpt: checkpoint path written into the model config.
+        num_frames: frames per video; None selects the version default.
+        num_steps: sampler steps; None selects the version default.
+        version: one of "svd", "svd_xt", "svd_image_decoder", "svd_xt_image_decoder".
+        fps_id: value fed to the `fps_id` conditioning key (a warning is printed outside 5..30).
+        motion_bucket_id: value fed to the `motion_bucket_id` conditioning key (warning above 255).
+        cond_aug: scale of the Gaussian noise added to the conditioning frame.
+        seed: passed to `torch.manual_seed` once, before the first image is processed.
+        decoding_t: number of frames decoded at a time.
+        device: torch device string for the model and all tensors.
+        output_folder: destination directory, created if missing.
+        save_fps: frame rate of the written mp4.
+        resize: when true, resize every input to 1024x576 before the divisible-by-64 check.
+
+    Raises:
+        ValueError: for an unknown `version`, a file that is not an image, a folder without images,
+            or an `input_path` that is neither a file nor a folder.
+    """
+
+    # (default num_frames, default num_steps) per version; config and output paths embed the name.
+    version_defaults = {
+        "svd": (14, 25),
+        "svd_xt": (25, 30),
+        "svd_image_decoder": (14, 25),
+        "svd_xt_image_decoder": (25, 30),
+    }
+    if version not in version_defaults:
+        raise ValueError(f"Version {version} does not exist.")
+    default_frames, default_steps = version_defaults[version]
+    num_frames = default(num_frames, default_frames)
+    num_steps = default(num_steps, default_steps)
+    output_folder = default(output_folder, f"outputs/{version}/")
+    model_config = f"scripts/sampling/configs/{version}.yaml"
+
+    # The second value returned by load_model is not used by this function.
+    model, _ = load_model(model_config, ckpt, device, num_frames, num_steps)
+    torch.manual_seed(seed)
+
+    image_suffixes = (".jpg", ".jpeg", ".png")
+    path = Path(input_path)
+    if path.is_file():
+        # Same case-insensitive suffix test as the folder branch, so "photo.PNG" is accepted too.
+        if path.suffix.lower() not in image_suffixes:
+            raise ValueError("Path is not valid image file.")
+        all_img_paths = [input_path]
+    elif path.is_dir():
+        all_img_paths = sorted(
+            f for f in path.iterdir() if f.is_file() and f.suffix.lower() in image_suffixes
+        )
+        if len(all_img_paths) == 0:
+            raise ValueError("Folder does not contain any images.")
+    else:
+        raise ValueError(f"Input path {input_path} is neither a file nor a folder.")
+
+    print(f'loaded {len(all_img_paths)} images.')
+    os.makedirs(output_folder, exist_ok=True)
+    for no, input_img_path in enumerate(all_img_paths):
+        filename = Path(input_img_path).stem
+        print(f'-sample {no+1}: {filename} ...')
+        with Image.open(input_img_path) as image:
+            # Grayscale, palette and RGBA inputs would not yield the 3 channels asserted below.
+            if image.mode != "RGB":
+                image = image.convert("RGB")
+            if resize:
+                image = image.resize((1024,576))
+            w, h = image.size
+
+            if h % 64 != 0 or w % 64 != 0:
+                width, height = map(lambda x: x - x % 64, (w, h))
+                image = image.resize((width, height))
+                print(
+                    f"WARNING: Your image is of size {h}x{w} which is not divisible by 64. We are resizing to {height}x{width}!"
+                )
+
+            image = ToTensor()(image)
+            image = image * 2.0 - 1.0  # [0, 1] -> [-1, 1]
+
+        image = image.unsqueeze(0).to(device)
+        H, W = image.shape[2:]
+        assert image.shape[1] == 3
+        F = 8
+        C = 4
+        shape = (num_frames, C, H // F, W // F)
+        if (H, W) != (576, 1024):
+            print(
+                "WARNING: The conditioning frame you provided is not 576x1024. This leads to suboptimal performance as model was only trained on 576x1024. Consider increasing `cond_aug`."
+            )
+        if motion_bucket_id > 255:
+            print(
+                "WARNING: High motion bucket! This may lead to suboptimal performance."
+            )
+
+        if fps_id < 5:
+            print("WARNING: Small fps value! This may lead to suboptimal performance.")
+
+        if fps_id > 30:
+            print("WARNING: Large fps value! This may lead to suboptimal performance.")
+
+        value_dict = {
+            "motion_bucket_id": motion_bucket_id,
+            "fps_id": fps_id,
+            "cond_aug": cond_aug,
+            "cond_frames_without_noise": image,
+            "cond_frames": image + cond_aug * torch.randn_like(image),
+        }
+
+        # autocast takes a device *type*, so strip any index such as the ":0" in "cuda:0".
+        with torch.no_grad(), torch.autocast(torch.device(device).type):
+            batch, batch_uc = get_batch(
+                get_unique_embedder_keys_from_conditioner(model.conditioner),
+                value_dict,
+                [1, num_frames],
+                T=num_frames,
+                device=device,
+            )
+            c, uc = model.conditioner.get_unconditional_conditioning(
+                batch,
+                batch_uc=batch_uc,
+                force_uc_zero_embeddings=[
+                    "cond_frames",
+                    "cond_frames_without_noise",
+                ],
+            )
+
+            # Repeat each conditioning entry once per frame, then fold the frame axis into the batch axis.
+            for k in ["crossattn", "concat"]:
+                uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
+                uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
+                c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
+                c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
+
+            randn = torch.randn(shape, device=device)
+
+            additional_model_inputs = {
+                "image_only_indicator": torch.zeros(2, num_frames).to(device),
+                "num_video_frames": batch["num_video_frames"],
+            }
+
+            def denoiser(input, sigma, c):
+                return model.denoiser(
+                    model.model, input, sigma, c, **additional_model_inputs
+                )
+
+            samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
+            model.en_and_decode_n_samples_a_time = decoding_t
+            samples_x = model.decode_first_stage(samples_z)
+            samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
+
+            # Videos are named after the input image, so re-running overwrites earlier results.
+            video_path = os.path.join(output_folder, f"{filename}.mp4")
+            writer = cv2.VideoWriter(
+                video_path,
+                cv2.VideoWriter_fourcc(*'mp4v'),
+                save_fps,
+                (samples.shape[-1], samples.shape[-2]),
+            )
+
+            vid = (
+                (rearrange(samples, "t c h w -> t h w c") * 255)
+                .cpu()
+                .numpy()
+                .astype(np.uint8)
+            )
+            for frame in vid:
+                # Frames are RGB here; convert to the BGR order that cv2.VideoWriter expects.
+                frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+                writer.write(frame)
+            writer.release()
+
+    print(f'Done! results saved in {output_folder}.')
+
+
+def get_unique_embedder_keys_from_conditioner(conditioner):  # de-duplicated `input_key`s, in set order
+    return list({embedder.input_key for embedder in conditioner.embedders})
+
+
+def get_batch(keys, value_dict, N, T, device):
+    """
+    Assemble the conditioning batch and its unconditional counterpart from `value_dict`.
+
+    Args:
+        keys: conditioner input keys to fill in.
+        value_dict: raw value for every entry of `keys`.
+        N: sequence of batch dimensions; scalar keys are expanded to `prod(N)` entries and the two
+            `cond_frames*` keys have their leading size-1 axis repeated `N[0]` times.
+        T: stored under `"num_video_frames"` unless it is None.
+        device: device the scalar-valued keys (`fps_id`, `motion_bucket_id`, `cond_aug`) are moved to.
+
+    Any key without special handling is copied from `value_dict` unchanged.
+
+    Returns:
+        `(batch, batch_uc)`, where `batch_uc` holds a clone of every tensor-valued entry of `batch`.
+    """
+    batch = {}
+    for name in keys:
+        if name in ("fps_id", "motion_bucket_id"):
+            scalar = torch.tensor([value_dict[name]]).to(device)
+            batch[name] = scalar.repeat(int(math.prod(N)))
+        elif name == "cond_aug":
+            scalar = torch.tensor([value_dict[name]]).to(device)
+            batch[name] = repeat(scalar, "1 -> b", b=math.prod(N))
+        elif name in ("cond_frames", "cond_frames_without_noise"):
+            batch[name] = repeat(value_dict[name], "1 ... -> b ...", b=N[0])
+        else:
+            batch[name] = value_dict[name]
+
+    if T is not None:
+        batch["num_video_frames"] = T
+
+    batch_uc = {
+        name: torch.clone(value)
+        for name, value in batch.items()
+        if isinstance(value, torch.Tensor)
+    }
+    return batch, batch_uc
+
+
+def load_model(
+    config: str,
+    ckpt: str,
+    device: str,
+    num_frames: int,
+    num_steps: int,
+):
+    """
+    Instantiate the model described by the YAML file `config` and move it to `device` in eval mode.
+
+    Before instantiation the loaded config is patched with the checkpoint path `ckpt`, the sampler's
+    `num_steps`, the guider's `num_frames` and, only when `device == "cuda"`, the `init_device` of the
+    first conditioner embedder's OpenCLIP config.
+
+    Returns `(model, None)`; the second slot is a placeholder for a currently disabled content filter.
+    """
+    cfg = OmegaConf.load(config)
+    model_params = cfg.model.params
+    model_params.ckpt_path = ckpt
+    sampler_params = model_params.sampler_config.params
+    sampler_params.num_steps = num_steps
+    sampler_params.guider_config.params.num_frames = num_frames
+    if device == "cuda":
+        first_embedder = model_params.conditioner_config.params.emb_models[0]
+        first_embedder.params.open_clip_embedding_config.params.init_device = device
+    return instantiate_from_config(cfg.model).to(device).eval(), None
+
+
+def get_parser():  # builds the argparse parser for this script's command-line entry point
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--seed", type=int, default=23, help="seed for seed_everything")
+    parser.add_argument("--ckpt", type=str, default=None, help="checkpoint path")
+    parser.add_argument("--config", type=str, help="config (yaml) path")  # not read by the __main__ block of this file
+    parser.add_argument("--input", type=str, default=None, help="image path or folder")
+    parser.add_argument("--savedir", type=str, default=None, help="results saving path")
+    parser.add_argument("--savefps", type=int, default=10, help="video fps to generate")
+    parser.add_argument("--n_samples", type=int, default=1, help="num of samples per prompt",)  # not read by the __main__ block of this file
+    parser.add_argument("--ddim_steps", type=int, default=50, help="steps of ddim if positive, otherwise use DDPM",)
+    parser.add_argument("--ddim_eta", type=float, default=1.0, help="eta for ddim sampling (0.0 yields deterministic sampling)",)  # not read by the __main__ block of this file
+    parser.add_argument("--frames", type=int, default=-1, help="frames num to inference")
+    parser.add_argument("--fps", type=int, default=6, help="control the fps")
+    parser.add_argument("--motion", type=int, default=127, help="control the motion magnitude")
+    parser.add_argument("--cond_aug", type=float, default=0.02, help="adding noise to input image")
+    parser.add_argument("--decoding_t", type=int, default=1, help="frames num to decoding per time")
+    parser.add_argument("--resize", action='store_true', default=False, help="resize all input to default resolution")
+    return parser
+
+
+if __name__ == "__main__":
+    now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+    print("@SVD Inference: %s"%now)
+    args = get_parser().parse_args()
+    # `sample` substitutes its own defaults only for missing/None arguments, so the CLI's "unset"
+    # values (--frames <= 0, omitted --input/--ckpt/--savedir) are dropped rather than forwarded.
+    cli = dict(input_path=args.input, ckpt=args.ckpt, num_steps=args.ddim_steps, fps_id=args.fps, motion_bucket_id=args.motion,
+        cond_aug=args.cond_aug, seed=args.seed, decoding_t=args.decoding_t, output_folder=args.savedir, save_fps=args.savefps,
+        resize=args.resize, num_frames=args.frames if args.frames > 0 else None)
+    sample(**{name: value for name, value in cli.items() if value is not None})
diff --git a/scripts/tests/attention.py b/scripts/tests/attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7c3f7c8da27c577a7ce0ea3a01ab7f9e9c1baa2
--- /dev/null
+++ b/scripts/tests/attention.py
@@ -0,0 +1,319 @@
+import einops
+import torch
+import torch.nn.functional as F
+import torch.utils.benchmark as benchmark
+from torch.backends.cuda import SDPBackend
+
+from sgm.modules.attention import BasicTransformerBlock, SpatialTransformer
+
+
+def benchmark_attn():
+    """Print timings and profiler tables for F.scaled_dot_product_attention under each SDP backend."""
+    # Helper adapted from https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    def benchmark_torch_function_in_microseconds(f, *args, **kwargs):
+        t0 = benchmark.Timer(
+            stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
+        )
+        return t0.blocked_autorange().mean * 1e6
+
+    # Lets define the hyper-parameters of our input
+    batch_size = 32
+    max_sequence_len = 1024
+    num_heads = 32
+    embed_dimension = 32
+
+    dtype = torch.float16
+
+    query = torch.rand(
+        batch_size,
+        num_heads,
+        max_sequence_len,
+        embed_dimension,
+        device=device,
+        dtype=dtype,
+    )
+    key = torch.rand(
+        batch_size,
+        num_heads,
+        max_sequence_len,
+        embed_dimension,
+        device=device,
+        dtype=dtype,
+    )
+    value = torch.rand(
+        batch_size,
+        num_heads,
+        max_sequence_len,
+        embed_dimension,
+        device=device,
+        dtype=dtype,
+    )
+
+    print("q/k/v shape:", query.shape, key.shape, value.shape)
+
+    from torch.backends.cuda import SDPBackend, sdp_kernel
+    from torch.profiler import ProfilerActivity, profile, record_function
+
+    # Keyword arguments for sdp_kernel() that leave exactly one backend enabled
+    backend_map = {
+        SDPBackend.MATH: {
+            "enable_math": True,
+            "enable_flash": False,
+            "enable_mem_efficient": False,
+        },
+        SDPBackend.FLASH_ATTENTION: {
+            "enable_math": False,
+            "enable_flash": True,
+            "enable_mem_efficient": False,
+        },
+        SDPBackend.EFFICIENT_ATTENTION: {
+            "enable_math": False,
+            "enable_flash": False,
+            "enable_mem_efficient": True,
+        },
+    }
+
+    activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
+
+    print(
+        f"The default implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
+    )
+    with profile(
+        activities=activities, record_shapes=False, profile_memory=True
+    ) as prof:
+        with record_function("Default detailed stats"):
+            for _ in range(25):
+                o = F.scaled_dot_product_attention(query, key, value)
+    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
+
+    with sdp_kernel(**backend_map[SDPBackend.MATH]):  # time inside the context; outside it the default dispatch is measured
+        print(
+            f"The math implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
+        )
+        with profile(
+            activities=activities, record_shapes=False, profile_memory=True
+        ) as prof:
+            with record_function("Math implmentation stats"):
+                for _ in range(25):
+                    o = F.scaled_dot_product_attention(query, key, value)
+        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
+
+    with sdp_kernel(**backend_map[SDPBackend.FLASH_ATTENTION]):
+        try:
+            print(
+                f"The flash attention implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
+            )
+        except RuntimeError:
+            print("FlashAttention is not supported. See warnings for reasons.")
+        else:  # profile only a usable backend; otherwise the calls below would raise the same RuntimeError
+            with profile(
+                activities=activities, record_shapes=False, profile_memory=True
+            ) as prof:
+                with record_function("FlashAttention stats"):
+                    for _ in range(25):
+                        o = F.scaled_dot_product_attention(query, key, value)
+            print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
+
+    with sdp_kernel(**backend_map[SDPBackend.EFFICIENT_ATTENTION]):
+        try:
+            print(
+                f"The memory efficient implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
+            )
+        except RuntimeError:
+            print("EfficientAttention is not supported. See warnings for reasons.")
+        else:  # profile only a usable backend; otherwise the calls below would raise the same RuntimeError
+            with profile(
+                activities=activities, record_shapes=False, profile_memory=True
+            ) as prof:
+                with record_function("EfficientAttention stats"):
+                    for _ in range(25):
+                        o = F.scaled_dot_product_attention(query, key, value)
+            print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
+
+
+def run_model(model, x, context):  # thin wrapper: call `model` on (x, context) and return its result
+    return model(x, context)
+
+
+def benchmark_transformer_blocks():  # compares a "softmax" and a "softmax-xformers" SpatialTransformer: timing, profile, peak memory
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    import torch.utils.benchmark as benchmark
+    # Helper: mean runtime of f(*args, **kwargs) from torch.utils.benchmark, scaled by 1e6 (microseconds).
+    def benchmark_torch_function_in_microseconds(f, *args, **kwargs):
+        t0 = benchmark.Timer(
+            stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
+        )
+        return t0.blocked_autorange().mean * 1e6
+    # torch.compile is applied further down only when `checkpoint` is False and `compile` is True.
+    checkpoint = True
+    compile = False
+
+    batch_size = 32
+    h, w = 64, 64
+    context_len = 77
+    embed_dimension = 1024
+    context_dim = 1024
+    d_head = 64
+
+    transformer_depth = 4
+
+    n_heads = embed_dimension // d_head
+
+    dtype = torch.float16
+
+    model_native = SpatialTransformer(
+        embed_dimension,
+        n_heads,
+        d_head,
+        context_dim=context_dim,
+        use_linear=True,
+        use_checkpoint=checkpoint,
+        attn_type="softmax",
+        depth=transformer_depth,
+        sdp_backend=SDPBackend.FLASH_ATTENTION,
+    ).to(device)
+    model_efficient_attn = SpatialTransformer(
+        embed_dimension,
+        n_heads,
+        d_head,
+        context_dim=context_dim,
+        use_linear=True,
+        depth=transformer_depth,
+        use_checkpoint=checkpoint,
+        attn_type="softmax-xformers",
+    ).to(device)
+    if not checkpoint and compile:
+        print("compiling models")
+        model_native = torch.compile(model_native)
+        model_efficient_attn = torch.compile(model_efficient_attn)
+
+    x = torch.rand(batch_size, embed_dimension, h, w, device=device, dtype=dtype)
+    c = torch.rand(batch_size, context_len, context_dim, device=device, dtype=dtype)
+
+    from torch.profiler import ProfilerActivity, profile, record_function
+
+    activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
+    # NOTE(review): autocast and the torch.cuda.* calls below are hard-coded to CUDA although `device` may be "cpu".
+    with torch.autocast("cuda"):
+        print(
+            f"The native model runs in {benchmark_torch_function_in_microseconds(model_native.forward, x, c):.3f} microseconds"
+        )
+        print(
+            f"The efficientattn model runs in {benchmark_torch_function_in_microseconds(model_efficient_attn.forward, x, c):.3f} microseconds"
+        )
+
+        print(75 * "+")
+        print("NATIVE")
+        print(75 * "+")
+        torch.cuda.reset_peak_memory_stats()  # so the peak printed below covers only the native-model section
+        with profile(
+            activities=activities, record_shapes=False, profile_memory=True
+        ) as prof:
+            with record_function("NativeAttention stats"):
+                for _ in range(25):
+                    model_native(x, c)
+        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
+        print(torch.cuda.max_memory_allocated() * 1e-9, "GB used by native block")
+
+        print(75 * "+")
+        print("Xformers")
+        print(75 * "+")
+        torch.cuda.reset_peak_memory_stats()  # so the peak printed below covers only the xformers section
+        with profile(
+            activities=activities, record_shapes=False, profile_memory=True
+        ) as prof:
+            with record_function("xformers stats"):
+                for _ in range(25):
+                    model_efficient_attn(x, c)
+        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
+        print(torch.cuda.max_memory_allocated() * 1e-9, "GB used by xformers block")
+
+
+def test01():
+    """Print outputs of a 1x1 Conv2d and of a Linear layer that reuses its parameters, for comparison."""
+    from sgm.util import count_params
+
+    pointwise_conv = torch.nn.Conv2d(3, 32, kernel_size=1).cuda()
+    print(count_params(pointwise_conv))
+    dense = torch.nn.Linear(3, 32).cuda()
+    print(count_params(dense))
+
+    print(pointwise_conv.weight.shape)
+
+    # Copy the conv parameters into the linear layer, dropping the two trailing kernel axes.
+    shared_weight = pointwise_conv.weight.squeeze(-1).squeeze(-1)
+    dense.weight = torch.nn.Parameter(shared_weight)
+    dense.bias = torch.nn.Parameter(pointwise_conv.bias)
+    print(dense.weight.shape)
+
+    images = torch.randn(11, 3, 64, 64).cuda()
+
+    # The linear layer acts on the last axis, so move channels last and flatten the spatial grid.
+    tokens = einops.rearrange(images, "b c h w -> b (h w) c").contiguous()
+    print(tokens.shape)
+    dense_out = dense(tokens)
+    print(dense_out.mean(), dense_out.shape)
+    conv_out = pointwise_conv(images)
+    print(conv_out.mean(), conv_out.shape)
+    print("done with test01.\n")
+
+
+def test02():
+    """
+    Time one forward pass of `BasicTransformerBlock` in "softmax" mode, then in "flash-cosine" mode.
+
+    A RuntimeError during the softmax pass (likely out-of-memory) is printed and does not stop the
+    flash-cosine pass.
+    """
+    import time
+
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cudnn.benchmark = True
+    print("testing cosine flash attention...")
+    DIM, SEQLEN, BS = 1024, 4096, 16
+
+    def build_block(attn_mode):
+        return BasicTransformerBlock(
+            dim=DIM,
+            n_heads=16,
+            d_head=64,
+            dropout=0.0,
+            context_dim=None,
+            attn_mode=attn_mode,
+        ).cuda()
+
+    def timed_forward(block):
+        # NOTE(review): plain wall-clock timing with no torch.cuda.synchronize(); asynchronous
+        # CUDA work may not be fully included in the printed duration — confirm if numbers matter.
+        inputs = torch.randn(BS, SEQLEN, DIM).cuda()
+        start = time.time()
+        outputs = block(inputs)
+        elapsed = time.time() - start
+        print(outputs.shape, elapsed)
+
+    print(" softmax (vanilla) first...")
+    block = build_block("softmax")
+    try:
+        timed_forward(block)
+    except RuntimeError as err:
+        # likely oom
+        print(str(err))
+
+    print("\n now flash-cosine...")
+    block = build_block("flash-cosine")
+    timed_forward(block)
+    print("done with test02.\n")
+
+
+if __name__ == "__main__":
+    # Only the transformer-block benchmark runs by default; uncomment a line to run another check.
+    # test01()
+    # test02()
+
+    # benchmark_attn()
+    benchmark_transformer_blocks()
+
+    print("done.")
diff --git a/scripts/util/__init__.py b/scripts/util/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/scripts/util/detection/__init__.py b/scripts/util/detection/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/scripts/util/detection/nsfw_and_watermark_dectection.py b/scripts/util/detection/nsfw_and_watermark_dectection.py
new file mode 100644
index 0000000000000000000000000000000000000000..1096b8177d8e3dbcf8e913f924e98d5ce58cb120
--- /dev/null
+++ b/scripts/util/detection/nsfw_and_watermark_dectection.py
@@ -0,0 +1,110 @@
+import os
+
+import clip
+import numpy as np
+import torch
+import torchvision.transforms as T
+from PIL import Image
+
+RESOURCES_ROOT = "scripts/util/detection/"
+
+
+def predict_proba(X, weights, biases):
+    """Return sigmoid(X @ weights.T + biases), transposed; exp() only sees values <= 0, so it cannot overflow."""
+    logits = X @ weights.T + biases
+    decay = np.exp(-np.abs(logits))
+    proba = np.where(logits >= 0, 1 / (1 + decay), decay / (1 + decay))
+    return proba.T
+
+
+def load_model_weights(path: str):  # returns the (weights, biases) arrays stored in the .npz archive at `path`
+    with np.load(path) as model_weights:  # closes the archive's file handle once both arrays are read
+        return model_weights["weights"], model_weights["biases"]
+
+
+def clip_process_images(images: torch.Tensor) -> torch.Tensor:
+    """Center-crop to a square, resize to 224 (bicubic, antialiased) and normalize with the constants below."""
+    shorter_side = min(images.shape[-2:])
+    mean = (0.48145466, 0.4578275, 0.40821073)
+    std = (0.26862954, 0.26130258, 0.27577711)
+    steps = [
+        T.CenterCrop(shorter_side),  # TODO: this might affect the watermark, check this
+        T.Resize(224, interpolation=T.InterpolationMode.BICUBIC, antialias=True),
+        T.Normalize(mean, std),
+    ]
+    preprocess = T.Compose(steps)
+    return preprocess(images)
+
+
+class DeepFloydDataFiltering(object):
+    """Blur images whose CLIP features score above threshold on the "p" or the "w" classifier head."""
+
+    def __init__(self, verbose: bool = False, device: torch.device = torch.device("cpu")):
+        super().__init__()
+        self.verbose = verbose
+        self._device = None  # looked up from the CLIP parameters on the first call
+        self.clip_model, _ = clip.load("ViT-L/14", device=device)
+        self.clip_model.eval()
+
+        w_head_path = os.path.join(RESOURCES_ROOT, "w_head_v1.npz")
+        self.cpu_w_weights, self.cpu_w_biases = load_model_weights(w_head_path)
+        p_head_path = os.path.join(RESOURCES_ROOT, "p_head_v1.npz")
+        self.cpu_p_weights, self.cpu_p_biases = load_model_weights(p_head_path)
+        self.w_threshold, self.p_threshold = 0.5, 0.5
+
+    @torch.inference_mode()
+    def __call__(self, images: torch.Tensor) -> torch.Tensor:
+        # Flagged entries are blurred in place, so the caller's tensor is modified and returned.
+        imgs = clip_process_images(images)
+        if self._device is None:
+            self._device = next(p for p in self.clip_model.parameters()).device
+        features = self.clip_model.encode_image(imgs.to(self._device))
+        features = features.detach().cpu().numpy().astype(np.float16)
+        p_pred = predict_proba(features, self.cpu_p_weights, self.cpu_p_biases)
+        w_pred = predict_proba(features, self.cpu_w_weights, self.cpu_w_biases)
+        if self.verbose:
+            print(f"p_pred = {p_pred}, w_pred = {w_pred}")
+        blur = T.GaussianBlur(99, sigma=(100.0, 100.0))
+        # NOTE(review): "p"/"w" presumably denote the NSFW and watermark heads (cf. module name) — confirm.
+        for tag, pred, threshold in (("p", p_pred, self.p_threshold), ("w", w_pred, self.w_threshold)):
+            hits = pred > threshold
+            if hits.sum() > 0:
+                if self.verbose:
+                    print(f"Hit for {tag}_threshold: {pred}")
+                images[hits] = blur(images[hits])
+        return images
+
+
+def load_img(path: str) -> torch.Tensor:
+    """
+    Load the image at `path`, convert it to RGB if needed and return it as a tensor with a leading
+    batch axis of size 1.
+    """
+    # The context manager closes the underlying file handle, which a bare Image.open() leaves open.
+    with Image.open(path) as image:
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        return T.ToTensor()(image)[None, ...]
+
+
+def test(root):
+    """Run the filter on every file in `root` and save each result next to it as `<stem>-filtered.jpg`."""
+    from einops import rearrange
+
+    data_filter = DeepFloydDataFiltering(verbose=True)
+    # NOTE(review): outputs are written into `root`, so a later run also picks up earlier "-filtered.jpg" files.
+    for name in os.listdir(root):
+        print(f"running on {name}...")
+        batch = data_filter(load_img(os.path.join(root, name)))
+        # First (only) batch entry, scaled to 0..255 and moved to channel-last layout for PIL.
+        pixels = rearrange(255.0 * batch.numpy()[0], "c h w -> h w c").astype(np.uint8)
+        stem = os.path.splitext(name)[0]
+        out_path = os.path.join(root, f"{stem}-filtered.jpg")
+        Image.fromarray(pixels).save(out_path)
+
+
+if __name__ == "__main__":
+    import fire
+
+    fire.Fire(test)  # exposes `test(root)` as the command-line interface of this script
+    print("done.")
diff --git a/scripts/util/detection/p_head_v1.npz b/scripts/util/detection/p_head_v1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c1a824795d85811de3192d8ac20403444e19510b
--- /dev/null
+++ b/scripts/util/detection/p_head_v1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4653a64d5f85d8d4c5f6c5ec175f1c5c5e37db8f38d39b2ed8b5979da7fdc76
+size 3588
diff --git a/scripts/util/detection/w_head_v1.npz b/scripts/util/detection/w_head_v1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..57789e17153038c529439b38f9a540ba0cb8bbac
--- /dev/null
+++ b/scripts/util/detection/w_head_v1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6af23687aa347073e692025f405ccc48c14aadc5dbe775b3312041006d496d1
+size 3588
diff --git a/sgm/__init__.py b/sgm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..24bc84af8b1041de34b9816e0507cb1ac207bd13
--- /dev/null
+++ b/sgm/__init__.py
@@ -0,0 +1,4 @@
+from .models import AutoencodingEngine, DiffusionEngine
+from .util import get_configs_path, instantiate_from_config
+
+__version__ = "0.1.0"
diff --git a/sgm/__pycache__/__init__.cpython-310.pyc b/sgm/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4d376806746b5645590c1d8bfcde7f01bdb18c06
Binary files /dev/null and b/sgm/__pycache__/__init__.cpython-310.pyc differ
diff --git a/sgm/data_org/__init__.py b/sgm/data_org/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7664a25c655c376bd1a7b0ccbaca7b983a2bf9ad
--- /dev/null
+++ b/sgm/data_org/__init__.py
@@ -0,0 +1 @@
+from .dataset import StableDataModuleFromConfig
diff --git a/sgm/data_org/cifar10.py b/sgm/data_org/cifar10.py
new file mode 100644
index 0000000000000000000000000000000000000000..6083646f136bad308a0485843b89234cf7a9d6cd
--- /dev/null
+++ b/sgm/data_org/cifar10.py
@@ -0,0 +1,67 @@
+import pytorch_lightning as pl
+import torchvision
+from torch.utils.data import DataLoader, Dataset
+from torchvision import transforms
+
+
+class CIFAR10DataDictWrapper(Dataset):  # adapts 2-tuple samples of `dset` to {"jpg": first, "cls": second} dicts
+    def __init__(self, dset):
+        super().__init__()
+        self.dset = dset
+
+    def __len__(self):
+        return len(self.dset)
+
+    def __getitem__(self, i):
+        image, label = self.dset[i]
+        return dict(jpg=image, cls=label)
+
+
+class CIFAR10Loader(pl.LightningDataModule):
+    """
+    LightningDataModule for CIFAR-10 whose samples are `{"jpg": image, "cls": label}` dicts.
+
+    Images are converted with `ToTensor` and then mapped through `x * 2.0 - 1.0`. Both splits are
+    created in the constructor with `root=".data/"` and `download=True`; the validation and test
+    loaders both serve the CIFAR-10 test split, and all three loaders share `batch_size`,
+    `num_workers` and `shuffle`.
+    """
+
+    def __init__(self, batch_size, num_workers=0, shuffle=True):
+        super().__init__()
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+        self.shuffle = shuffle
+
+        rescale = transforms.Compose(
+            [transforms.ToTensor(), transforms.Lambda(lambda x: x * 2.0 - 1.0)]
+        )
+        self.train_dataset, self.test_dataset = (
+            CIFAR10DataDictWrapper(
+                torchvision.datasets.CIFAR10(
+                    root=".data/", train=is_train, download=True, transform=rescale
+                )
+            )
+            for is_train in (True, False)
+        )
+
+    def _make_loader(self, dataset):
+        # Shared construction for the three public dataloader hooks below.
+        return DataLoader(
+            dataset,
+            batch_size=self.batch_size,
+            shuffle=self.shuffle,
+            num_workers=self.num_workers,
+        )
+
+    def prepare_data(self):
+        pass
+
+    def train_dataloader(self):
+        return self._make_loader(self.train_dataset)
+
+    def test_dataloader(self):
+        return self._make_loader(self.test_dataset)
+
+    def val_dataloader(self):
+        return self._make_loader(self.test_dataset)
diff --git a/sgm/data_org/dataset.py b/sgm/data_org/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..b726149996591c6c3db69230e1bb68c07d2faa12
--- /dev/null
+++ b/sgm/data_org/dataset.py
@@ -0,0 +1,80 @@
+from typing import Optional
+
+import torchdata.datapipes.iter
+import webdataset as wds
+from omegaconf import DictConfig
+from pytorch_lightning import LightningDataModule
+
+try:
+    from sdata import create_dataset, create_dummy_dataset, create_loader
+except ImportError as e:
+    print("#" * 100)
+    print("Datasets not yet available")
+    print("to enable, we need to add stable-datasets as a submodule")
+    print("please use ``git submodule update --init --recursive``")
+    print("and do ``pip install -e stable-datasets/`` from the root of this repo")
+    print("#" * 100)
+    exit(1)
+
+
+class StableDataModuleFromConfig(LightningDataModule):
+    """Lightning data module assembled from per-split config objects.
+
+    Every split config that gets checked must contain a ``datapipeline`` entry
+    (keyword arguments for the dataset factory) and a ``loader`` entry (keyword
+    arguments for ``create_loader``).
+
+    Args:
+        train: config of the training split (required).
+        validation: config of the validation split. When it is ``None`` and
+            ``skip_val_loader`` is false, the training config is reused.
+        test: optional config of the test split.
+        skip_val_loader: when true, the validation config is neither checked
+            nor defaulted to the training config.
+        dummy: build the pipelines with ``create_dummy_dataset`` instead of
+            ``create_dataset``.
+    """
+
+    def __init__(
+        self,
+        train: DictConfig,
+        validation: Optional[DictConfig] = None,
+        test: Optional[DictConfig] = None,
+        skip_val_loader: bool = False,
+        dummy: bool = False,
+    ):
+        super().__init__()
+
+        self.train_config = train
+        assert self._has_required_fields(
+            self.train_config
+        ), "train config requires the fields `datapipeline` and `loader`"
+
+        self.val_config = validation
+        if not skip_val_loader:
+            if self.val_config is None:
+                print(
+                    "Warning: No Validation datapipeline defined, using that one from training"
+                )
+                self.val_config = train
+            else:
+                assert self._has_required_fields(
+                    self.val_config
+                ), "validation config requires the fields `datapipeline` and `loader`"
+
+        self.test_config = test
+        if self.test_config is not None:
+            assert self._has_required_fields(
+                self.test_config
+            ), "test config requires the fields `datapipeline` and `loader`"
+
+        self.dummy = dummy
+        if self.dummy:
+            banner = "#" * 100
+            print(banner)
+            print("USING DUMMY DATASET: HOPE YOU'RE DEBUGGING ;)")
+            print(banner)
+
+    @staticmethod
+    def _has_required_fields(config) -> bool:
+        """Return whether ``config`` has both ``datapipeline`` and ``loader``."""
+        return "datapipeline" in config and "loader" in config
+
+    def setup(self, stage: str) -> None:
+        """Create the data pipelines for every configured split.
+
+        ``stage`` is accepted for the Lightning hook signature but not used.
+        """
+        print("Preparing datasets")
+        data_fn = create_dummy_dataset if self.dummy else create_dataset
+
+        self.train_datapipeline = data_fn(**self.train_config.datapipeline)
+        # Validation/test pipelines only exist when their config is truthy.
+        if self.val_config:
+            self.val_datapipeline = data_fn(**self.val_config.datapipeline)
+        if self.test_config:
+            self.test_datapipeline = data_fn(**self.test_config.datapipeline)
+
+    def train_dataloader(self) -> torchdata.datapipes.iter.IterDataPipe:
+        return create_loader(self.train_datapipeline, **self.train_config.loader)
+
+    def val_dataloader(self) -> wds.DataPipeline:
+        # Relies on ``setup`` having created ``val_datapipeline``.
+        return create_loader(self.val_datapipeline, **self.val_config.loader)
+
+    def test_dataloader(self) -> wds.DataPipeline:
+        # Relies on ``setup`` having created ``test_datapipeline``.
+        return create_loader(self.test_datapipeline, **self.test_config.loader)
diff --git a/sgm/data_org/mnist.py b/sgm/data_org/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..dea4d7e670666bec80ecb22aa89603345e173d09
--- /dev/null
+++ b/sgm/data_org/mnist.py
@@ -0,0 +1,85 @@
+import pytorch_lightning as pl
+import torchvision
+from torch.utils.data import DataLoader, Dataset
+from torchvision import transforms
+
+
+class MNISTDataDictWrapper(Dataset):
+    """Adapt an ``(x, y)`` dataset so that items come back as dicts.
+
+    Each item of the wrapped dataset is unpacked into two values and returned
+    as ``{"jpg": x, "cls": y}``.
+    """
+
+    def __init__(self, dset):
+        super().__init__()
+        # Wrapped dataset; indexing it must yield a two-element item.
+        self.dset = dset
+
+    def __len__(self):
+        return len(self.dset)
+
+    def __getitem__(self, i):
+        image, label = self.dset[i]
+        return dict(jpg=image, cls=label)
+
+
+class MNISTLoader(pl.LightningDataModule):
+    """Lightning data module that serves MNIST as ``{"jpg", "cls"}`` samples.
+
+    Both splits are instantiated in the constructor with ``download=True`` under
+    ``.data/``. Images go through ``ToTensor`` followed by ``x * 2.0 - 1.0``.
+    The validation loader reads the same split as the test loader.
+
+    Args:
+        batch_size: batch size used by every loader.
+        num_workers: number of ``DataLoader`` worker processes.
+        prefetch_factor: forwarded to ``DataLoader`` only when
+            ``num_workers > 0``.
+        shuffle: shuffle flag forwarded to the train, val and test loaders.
+    """
+
+    def __init__(self, batch_size, num_workers=0, prefetch_factor=2, shuffle=True):
+        super().__init__()
+
+        transform = transforms.Compose(
+            [transforms.ToTensor(), transforms.Lambda(lambda x: x * 2.0 - 1.0)]
+        )
+
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+        # ``DataLoader`` rejects an explicit prefetch factor when it runs without
+        # worker processes, so the value is only kept when workers are used.
+        self.prefetch_factor = prefetch_factor if num_workers > 0 else None
+        self.shuffle = shuffle
+        self.train_dataset = MNISTDataDictWrapper(
+            torchvision.datasets.MNIST(
+                root=".data/", train=True, download=True, transform=transform
+            )
+        )
+        self.test_dataset = MNISTDataDictWrapper(
+            torchvision.datasets.MNIST(
+                root=".data/", train=False, download=True, transform=transform
+            )
+        )
+
+    def prepare_data(self):
+        """No-op: the datasets are already created in ``__init__``."""
+
+    def _build_loader(self, dataset):
+        """Wrap ``dataset`` in a ``DataLoader`` using the stored settings."""
+        loader_kwargs = {
+            "batch_size": self.batch_size,
+            "shuffle": self.shuffle,
+            "num_workers": self.num_workers,
+        }
+        # Omit the argument entirely in the single-process case so that
+        # ``DataLoader`` falls back to its own default instead of raising.
+        if self.num_workers > 0:
+            loader_kwargs["prefetch_factor"] = self.prefetch_factor
+        return DataLoader(dataset, **loader_kwargs)
+
+    def train_dataloader(self):
+        return self._build_loader(self.train_dataset)
+
+    def test_dataloader(self):
+        return self._build_loader(self.test_dataset)
+
+    def val_dataloader(self):
+        # There is no separate validation split; the test split is reused.
+        return self._build_loader(self.test_dataset)
+
+
+if __name__ == "__main__":
+    # Manual smoke test: wrap the MNIST test split (downloaded into ``.data/``
+    # when missing) and fetch the first sample dict.
+    transform = transforms.Compose(
+        [transforms.ToTensor(), transforms.Lambda(lambda x: x * 2.0 - 1.0)]
+    )
+    mnist_test = torchvision.datasets.MNIST(
+        root=".data/", train=False, download=True, transform=transform
+    )
+    dset = MNISTDataDictWrapper(mnist_test)
+    ex = dset[0]
diff --git a/sgm/inference/api.py b/sgm/inference/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..7171ff4abb774556b638c98ad809e195082bdccf
--- /dev/null
+++ b/sgm/inference/api.py
@@ -0,0 +1,385 @@
+import pathlib
+from dataclasses import asdict, dataclass
+from enum import Enum
+from typing import Optional
+
+from omegaconf import OmegaConf
+
+from sgm.inference.helpers import (Img2ImgDiscretizationWrapper, do_img2img,
+                                   do_sample)
+from sgm.modules.diffusionmodules.sampling import (DPMPP2MSampler,
+                                                   DPMPP2SAncestralSampler,
+                                                   EulerAncestralSampler,
+                                                   EulerEDMSampler,
+                                                   HeunEDMSampler,
+                                                   LinearMultistepSampler)
+from sgm.util import load_model_from_config
+
+
+class ModelArchitecture(str, Enum):
+    """String-valued identifiers of the models registered in ``model_specs``."""
+
+    SD_2_1 = "stable-diffusion-v2-1"
+    SD_2_1_768 = "stable-diffusion-v2-1-768"
+    SDXL_V0_9_BASE = "stable-diffusion-xl-v0-9-base"
+    SDXL_V0_9_REFINER = "stable-diffusion-xl-v0-9-refiner"
+    SDXL_V1_BASE = "stable-diffusion-xl-v1-base"
+    SDXL_V1_REFINER = "stable-diffusion-xl-v1-refiner"
+
+
+class Sampler(str, Enum):
+    """Sampler choices understood by ``get_sampler_config``.
+
+    Each value is the name of the sampler class that gets instantiated.
+    """
+
+    EULER_EDM = "EulerEDMSampler"
+    HEUN_EDM = "HeunEDMSampler"
+    EULER_ANCESTRAL = "EulerAncestralSampler"
+    DPMPP2S_ANCESTRAL = "DPMPP2SAncestralSampler"
+    DPMPP2M = "DPMPP2MSampler"
+    LINEAR_MULTISTEP = "LinearMultistepSampler"
+
+
+class Discretization(str, Enum):
+    """Discretization choices understood by ``get_discretization_config``."""
+
+    LEGACY_DDPM = "LegacyDDPMDiscretization"
+    EDM = "EDMDiscretization"
+
+
+class Guider(str, Enum):
+    """Guider choices understood by ``get_guider_config``."""
+
+    VANILLA = "VanillaCFG"
+    IDENTITY = "IdentityGuider"
+
+
+class Thresholder(str, Enum):
+    """Thresholder choices understood by ``get_guider_config``."""
+
+    # Note: the value is the string "None", not the ``None`` object.
+    NONE = "None"
+
+
+@dataclass
+class SamplingParams:
+    """User-tunable sampling options consumed by ``SamplingPipeline``.
+
+    The pipeline converts an instance to a dict with ``asdict`` and hands it to
+    the sampling helpers as the conditioning ``value_dict``; the sampler,
+    discretization and guider fields are read by the ``get_*_config`` helpers.
+    """
+
+    # Output size in pixels; used by ``text_to_image`` as the target size.
+    width: int = 1024
+    height: int = 1024
+    # Passed to the sampler as ``num_steps``.
+    steps: int = 50
+    sampler: Sampler = Sampler.DPMPP2M
+    discretization: Discretization = Discretization.LEGACY_DDPM
+    guider: Guider = Guider.VANILLA
+    thresholder: Thresholder = Thresholder.NONE
+    # Guidance scale handed to the VanillaCFG guider.
+    scale: float = 6.0
+    aesthetic_score: float = 5.0
+    negative_aesthetic_score: float = 5.0
+    # Values below 1.0 make ``image_to_image`` prune the sigma schedule.
+    img2img_strength: float = 1.0
+    orig_width: int = 1024
+    orig_height: int = 1024
+    crop_coords_top: int = 0
+    crop_coords_left: int = 0
+    # Only used with the EDM discretization.
+    sigma_min: float = 0.0292
+    sigma_max: float = 14.6146
+    rho: float = 3.0
+    # Only used by the Euler/Heun EDM samplers (``s_noise`` also by the
+    # ancestral samplers).
+    s_churn: float = 0.0
+    s_tmin: float = 0.0
+    s_tmax: float = 999.0
+    s_noise: float = 1.0
+    # Only used by the ancestral samplers.
+    eta: float = 1.0
+    # Only used by the linear multistep sampler.
+    order: int = 4
+
+
+@dataclass
+class SamplingSpec:
+    """Static description of one supported model (see ``model_specs``)."""
+
+    # NOTE(review): width/height look like the model's native resolution; they
+    # are not read by the pipeline code in this module — confirm with callers.
+    width: int
+    height: int
+    # Latent channel count, passed to ``do_sample`` as ``C``.
+    channels: int
+    # Passed to ``do_sample`` as ``F``; the sampled latent is ``H // F`` by
+    # ``W // F``.
+    factor: int
+    # When false, the pipeline passes ``force_uc_zero_embeddings=["txt"]``.
+    is_legacy: bool
+    # File name of the model config, resolved against ``config_path``.
+    config: str
+    # File name of the checkpoint, resolved against ``model_path``.
+    ckpt: str
+    # NOTE(review): not read by the pipeline code in this module.
+    is_guided: bool
+
+
+# Registry of supported models: maps each ``ModelArchitecture`` to its
+# ``SamplingSpec``. ``SamplingPipeline`` rejects model ids that are not keys here.
+model_specs = {
+    ModelArchitecture.SD_2_1: SamplingSpec(
+        height=512,
+        width=512,
+        channels=4,
+        factor=8,
+        is_legacy=True,
+        config="sd_2_1.yaml",
+        ckpt="v2-1_512-ema-pruned.safetensors",
+        is_guided=True,
+    ),
+    ModelArchitecture.SD_2_1_768: SamplingSpec(
+        height=768,
+        width=768,
+        channels=4,
+        factor=8,
+        is_legacy=True,
+        config="sd_2_1_768.yaml",
+        ckpt="v2-1_768-ema-pruned.safetensors",
+        is_guided=True,
+    ),
+    ModelArchitecture.SDXL_V0_9_BASE: SamplingSpec(
+        height=1024,
+        width=1024,
+        channels=4,
+        factor=8,
+        is_legacy=False,
+        config="sd_xl_base.yaml",
+        ckpt="sd_xl_base_0.9.safetensors",
+        is_guided=True,
+    ),
+    ModelArchitecture.SDXL_V0_9_REFINER: SamplingSpec(
+        height=1024,
+        width=1024,
+        channels=4,
+        factor=8,
+        is_legacy=True,
+        config="sd_xl_refiner.yaml",
+        ckpt="sd_xl_refiner_0.9.safetensors",
+        is_guided=True,
+    ),
+    ModelArchitecture.SDXL_V1_BASE: SamplingSpec(
+        height=1024,
+        width=1024,
+        channels=4,
+        factor=8,
+        is_legacy=False,
+        config="sd_xl_base.yaml",
+        ckpt="sd_xl_base_1.0.safetensors",
+        is_guided=True,
+    ),
+    ModelArchitecture.SDXL_V1_REFINER: SamplingSpec(
+        height=1024,
+        width=1024,
+        channels=4,
+        factor=8,
+        is_legacy=True,
+        config="sd_xl_refiner.yaml",
+        ckpt="sd_xl_refiner_1.0.safetensors",
+        is_guided=True,
+    ),
+}
+
+
+class SamplingPipeline:
+    """Load one of the models in ``model_specs`` and run sampling with it.
+
+    Args:
+        model_id: key into ``model_specs``.
+        model_path: directory that contains the checkpoint file of the spec.
+        config_path: directory that contains the config file of the spec.
+        device: device the model is moved to.
+        use_fp16: when true, the conditioner and the inner model are cast to
+            half precision after loading.
+
+    Raises:
+        ValueError: if ``model_id`` is not registered or the model cannot be
+            loaded.
+    """
+
+    def __init__(
+        self,
+        model_id: ModelArchitecture,
+        model_path="checkpoints",
+        config_path="configs/inference",
+        device="cuda",
+        use_fp16=True,
+    ) -> None:
+        if model_id not in model_specs:
+            raise ValueError(f"Model {model_id} not supported")
+        self.model_id = model_id
+        self.specs = model_specs[self.model_id]
+        self.config = str(pathlib.Path(config_path, self.specs.config))
+        self.ckpt = str(pathlib.Path(model_path, self.specs.ckpt))
+        self.device = device
+        self.model = self._load_model(device=device, use_fp16=use_fp16)
+
+    def _load_model(self, device="cuda", use_fp16=True):
+        """Instantiate the model from ``self.config`` / ``self.ckpt``."""
+        config = OmegaConf.load(self.config)
+        model = load_model_from_config(config, self.ckpt)
+        if model is None:
+            raise ValueError(f"Model {self.model_id} could not be loaded")
+        model.to(device)
+        if use_fp16:
+            model.conditioner.half()
+            model.model.half()
+        return model
+
+    def _force_uc_zero_embeddings(self):
+        """Return the embedder keys whose unconditional embedding is zeroed."""
+        return [] if self.specs.is_legacy else ["txt"]
+
+    def text_to_image(
+        self,
+        params: SamplingParams,
+        prompt: str,
+        negative_prompt: str = "",
+        samples: int = 1,
+        return_latents: bool = False,
+    ):
+        """Sample ``samples`` images of size ``params.height`` x ``params.width``.
+
+        Returns whatever ``do_sample`` returns: the images, or an
+        ``(images, latents)`` pair when ``return_latents`` is true.
+        """
+        sampler = get_sampler_config(params)
+        value_dict = asdict(params)
+        value_dict["prompt"] = prompt
+        value_dict["negative_prompt"] = negative_prompt
+        value_dict["target_width"] = params.width
+        value_dict["target_height"] = params.height
+        return do_sample(
+            self.model,
+            sampler,
+            value_dict,
+            samples,
+            params.height,
+            params.width,
+            self.specs.channels,
+            self.specs.factor,
+            force_uc_zero_embeddings=self._force_uc_zero_embeddings(),
+            return_latents=return_latents,
+            filter=None,
+        )
+
+    def image_to_image(
+        self,
+        params: SamplingParams,
+        image,
+        prompt: str,
+        negative_prompt: str = "",
+        samples: int = 1,
+        return_latents: bool = False,
+    ):
+        """Run img2img on ``image``.
+
+        The target size is read from ``image.shape[2]`` (height) and
+        ``image.shape[3]`` (width). When ``params.img2img_strength`` is below
+        1.0 the sampler's sigma schedule is pruned accordingly.
+        """
+        sampler = get_sampler_config(params)
+
+        if params.img2img_strength < 1.0:
+            sampler.discretization = Img2ImgDiscretizationWrapper(
+                sampler.discretization,
+                strength=params.img2img_strength,
+            )
+        height, width = image.shape[2], image.shape[3]
+        value_dict = asdict(params)
+        value_dict["prompt"] = prompt
+        value_dict["negative_prompt"] = negative_prompt
+        value_dict["target_width"] = width
+        value_dict["target_height"] = height
+        return do_img2img(
+            image,
+            self.model,
+            sampler,
+            value_dict,
+            samples,
+            force_uc_zero_embeddings=self._force_uc_zero_embeddings(),
+            return_latents=return_latents,
+            filter=None,
+        )
+
+    def refiner(
+        self,
+        params: SamplingParams,
+        image,
+        prompt: str,
+        negative_prompt: Optional[str] = None,
+        samples: int = 1,
+        return_latents: bool = False,
+    ):
+        """Refine ``image``, which is passed on without being encoded again.
+
+        ``image`` is handed to ``do_img2img`` with ``skip_encode=True``; its
+        spatial size multiplied by the model's ``factor`` is used as the
+        original and target size. A ``negative_prompt`` of ``None`` is treated
+        like the empty string, matching the default of the other entry points.
+        """
+        sampler = get_sampler_config(params)
+        # Scale the latent size by the spec's factor instead of a hard-coded 8.
+        factor = self.specs.factor
+        pixel_height = image.shape[2] * factor
+        pixel_width = image.shape[3] * factor
+        value_dict = {
+            "orig_width": pixel_width,
+            "orig_height": pixel_height,
+            "target_width": pixel_width,
+            "target_height": pixel_height,
+            "prompt": prompt,
+            # ``None`` would be forwarded verbatim as the unconditional text.
+            "negative_prompt": "" if negative_prompt is None else negative_prompt,
+            "crop_coords_top": 0,
+            "crop_coords_left": 0,
+            "aesthetic_score": 6.0,
+            "negative_aesthetic_score": 2.5,
+        }
+
+        return do_img2img(
+            image,
+            self.model,
+            sampler,
+            value_dict,
+            samples,
+            skip_encode=True,
+            return_latents=return_latents,
+            filter=None,
+        )
+
+
+def get_guider_config(params: SamplingParams):
+    """Build the guider config dict selected by ``params.guider``.
+
+    Returns:
+        A dict with a ``target`` import path and, for the vanilla CFG guider,
+        ``params`` holding the guidance scale and the thresholding config.
+
+    Raises:
+        NotImplementedError: for an unknown guider or thresholder.
+    """
+    if params.guider == Guider.IDENTITY:
+        return {"target": "sgm.modules.diffusionmodules.guiders.IdentityGuider"}
+
+    if params.guider != Guider.VANILLA:
+        raise NotImplementedError(f"unknown guider {params.guider}")
+
+    if params.thresholder != Thresholder.NONE:
+        raise NotImplementedError(f"unknown thresholder {params.thresholder}")
+
+    dyn_thresh_config = {
+        "target": "sgm.modules.diffusionmodules.sampling_utils.NoDynamicThresholding"
+    }
+    return {
+        "target": "sgm.modules.diffusionmodules.guiders.VanillaCFG",
+        "params": {"scale": params.scale, "dyn_thresh_config": dyn_thresh_config},
+    }
+
+
+def get_discretization_config(params: SamplingParams):
+    """Build the discretization config dict selected by ``params.discretization``.
+
+    The EDM variant additionally carries ``sigma_min``, ``sigma_max`` and
+    ``rho`` taken from ``params``.
+
+    Raises:
+        ValueError: for an unknown discretization.
+    """
+    choice = params.discretization
+
+    if choice == Discretization.LEGACY_DDPM:
+        return {
+            "target": "sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization",
+        }
+
+    if choice == Discretization.EDM:
+        edm_params = {
+            "sigma_min": params.sigma_min,
+            "sigma_max": params.sigma_max,
+            "rho": params.rho,
+        }
+        return {
+            "target": "sgm.modules.diffusionmodules.discretizer.EDMDiscretization",
+            "params": edm_params,
+        }
+
+    raise ValueError(f"unknown discretization {params.discretization}")
+
+
+def get_sampler_config(params: SamplingParams):
+    """Instantiate the sampler selected by ``params.sampler``.
+
+    Every sampler receives the step count, the discretization config, the
+    guider config and ``verbose=True``; the remaining arguments depend on the
+    sampler family.
+
+    Raises:
+        ValueError: for an unknown sampler (or unknown discretization).
+        NotImplementedError: propagated from ``get_guider_config``.
+    """
+    # Built first so that an invalid discretization/guider is reported even
+    # when the sampler itself is unknown, as before.
+    discretization_config = get_discretization_config(params)
+    guider_config = get_guider_config(params)
+    common = {
+        "num_steps": params.steps,
+        "discretization_config": discretization_config,
+        "guider_config": guider_config,
+        "verbose": True,
+    }
+
+    if params.sampler == Sampler.EULER_EDM:
+        return EulerEDMSampler(
+            s_churn=params.s_churn,
+            s_tmin=params.s_tmin,
+            s_tmax=params.s_tmax,
+            s_noise=params.s_noise,
+            **common,
+        )
+    if params.sampler == Sampler.HEUN_EDM:
+        return HeunEDMSampler(
+            s_churn=params.s_churn,
+            s_tmin=params.s_tmin,
+            s_tmax=params.s_tmax,
+            s_noise=params.s_noise,
+            **common,
+        )
+    if params.sampler == Sampler.EULER_ANCESTRAL:
+        return EulerAncestralSampler(
+            eta=params.eta, s_noise=params.s_noise, **common
+        )
+    if params.sampler == Sampler.DPMPP2S_ANCESTRAL:
+        return DPMPP2SAncestralSampler(
+            eta=params.eta, s_noise=params.s_noise, **common
+        )
+    if params.sampler == Sampler.DPMPP2M:
+        return DPMPP2MSampler(**common)
+    if params.sampler == Sampler.LINEAR_MULTISTEP:
+        return LinearMultistepSampler(order=params.order, **common)
+
+    raise ValueError(f"unknown sampler {params.sampler}!")
diff --git a/sgm/inference/helpers.py b/sgm/inference/helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..31b0ec3dc414bf522261e35f73805810cd35582d
--- /dev/null
+++ b/sgm/inference/helpers.py
@@ -0,0 +1,305 @@
+import math
+import os
+from typing import List, Optional, Union
+
+import numpy as np
+import torch
+from einops import rearrange
+from imwatermark import WatermarkEncoder
+from omegaconf import ListConfig
+from PIL import Image
+from torch import autocast
+
+from sgm.util import append_dims
+
+
+class WatermarkEmbedder:
+    """Embed a fixed bit sequence into images via ``WatermarkEncoder``.
+
+    Args:
+        watermark: the bit sequence, registered on the encoder with
+            ``set_watermark("bits", watermark)``.
+    """
+
+    def __init__(self, watermark):
+        self.watermark = watermark
+        # Derive the length from the watermark actually passed in rather than
+        # from the module-level default bits.
+        self.num_bits = len(self.watermark)
+        self.encoder = WatermarkEncoder()
+        self.encoder.set_watermark("bits", self.watermark)
+
+    def __call__(self, image: torch.Tensor) -> torch.Tensor:
+        """
+        Adds a predefined watermark to the input image
+
+        Args:
+            image: ([N,] B, RGB, H, W) in range [0, 1]
+
+        Returns:
+            same as input but watermarked
+        """
+        # A 4-d input gets a temporary leading axis that is removed at the end.
+        squeeze = len(image.shape) == 4
+        if squeeze:
+            image = image[None, ...]
+        n = image.shape[0]
+        image_np = rearrange(
+            (255 * image).detach().cpu(), "n b c h w -> (n b) h w c"
+        ).numpy()[:, :, :, ::-1]
+        # torch (b, c, h, w) in [0, 1] -> numpy (b, h, w, c) [0, 255]
+        # watermarking library expects input as cv2 BGR format
+        for k in range(image_np.shape[0]):
+            image_np[k] = self.encoder.encode(image_np[k], "dwtDct")
+        # Reverse the channel axis again and restore the original layout.
+        image = torch.from_numpy(
+            rearrange(image_np[:, :, :, ::-1], "(n b) h w c -> n b c h w", n=n)
+        ).to(image.device)
+        image = torch.clamp(image / 255, min=0.0, max=1.0)
+        if squeeze:
+            image = image[0]
+        return image
+
+
+# A fixed 48-bit message that was chosen at random
+# WATERMARK_MESSAGE = 0xB3EC907BB19E
+WATERMARK_MESSAGE = 0b101100111110110010010000011110111011000110011110
+# bin(x)[2:] gives bits of x as str, use int to convert them to 0/1
+WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]]
+# Module-level embedder instance used by ``perform_save_locally``.
+embed_watermark = WatermarkEmbedder(WATERMARK_BITS)
+
+
+def get_unique_embedder_keys_from_conditioner(conditioner):
+    """Return the distinct ``input_key`` values of ``conditioner.embedders``.
+
+    The result is a list built from a set, so its order is not specified.
+    """
+    unique_keys = set()
+    for embedder in conditioner.embedders:
+        unique_keys.add(embedder.input_key)
+    return list(unique_keys)
+
+
+def perform_save_locally(save_path, samples):
+    """Watermark ``samples`` and write each one as a PNG into ``save_path``.
+
+    The directory is created if needed. File names are nine-digit counters that
+    start at the number of entries already present in the directory.
+    """
+    target_dir = os.path.join(save_path)
+    os.makedirs(target_dir, exist_ok=True)
+    first_index = len(os.listdir(target_dir))
+
+    watermarked = embed_watermark(samples)
+    for index, sample in enumerate(watermarked, start=first_index):
+        # (c, h, w) -> (h, w, c), scaled by 255 before the uint8 cast.
+        pixels = 255.0 * rearrange(sample.cpu().numpy(), "c h w -> h w c")
+        out_file = os.path.join(save_path, f"{index:09}.png")
+        Image.fromarray(pixels.astype(np.uint8)).save(out_file)
+
+
+class Img2ImgDiscretizationWrapper:
+    """
+    wraps a discretizer, and prunes the sigmas
+    params:
+        strength: float between 0.0 and 1.0. 1.0 means full sampling (all sigmas are returned)
+    """
+
+    def __init__(self, discretization, strength: float = 1.0):
+        self.discretization = discretization
+        self.strength = strength
+        assert 0.0 <= self.strength <= 1.0, "strength must be within [0.0, 1.0]"
+
+    def __call__(self, *args, **kwargs):
+        """Call the wrapped discretizer and keep only the trailing sigmas.
+
+        ``max(int(strength * len(sigmas)), 1)`` entries from the end of the
+        schedule are returned, in their original order.
+        """
+        # sigmas start large first, and decrease then
+        sigmas = self.discretization(*args, **kwargs)
+        print("sigmas after discretization, before pruning img2img: ", sigmas)
+        # Compute the index once, from the full schedule, so the logged value
+        # is the one actually used for slicing.
+        prune_index = max(int(self.strength * len(sigmas)), 1)
+        sigmas = torch.flip(sigmas, (0,))
+        sigmas = sigmas[:prune_index]
+        print("prune index:", prune_index)
+        sigmas = torch.flip(sigmas, (0,))
+        print("sigmas after pruning: ", sigmas)
+        return sigmas
+
+
+def do_sample(
+    model,
+    sampler,
+    value_dict,
+    num_samples,
+    H,
+    W,
+    C,
+    F,
+    force_uc_zero_embeddings: Optional[List] = None,
+    batch2model_input: Optional[List] = None,
+    return_latents=False,
+    filter=None,
+    device="cuda",
+):
+    """Sample ``num_samples`` images starting from Gaussian noise.
+
+    Args:
+        model: object providing ``conditioner``, ``denoiser``, ``model``,
+            ``ema_scope()`` and ``decode_first_stage()``.
+        sampler: callable invoked as ``sampler(denoiser, noise, cond=, uc=)``.
+        value_dict: conditioning values consumed by ``get_batch``.
+        num_samples: number of samples to draw.
+        H, W: output size; the latent is ``H // F`` by ``W // F``.
+        C: number of latent channels.
+        F: factor between image and latent size.
+        force_uc_zero_embeddings: forwarded to the conditioner; defaults to
+            an empty list.
+        batch2model_input: batch keys that are passed to the denoiser as extra
+            keyword arguments; defaults to an empty list.
+        return_latents: also return the sampled latents.
+        filter: optional callable applied to the decoded, clamped samples.
+        device: device for the conditioning tensors and the initial noise.
+
+    Returns:
+        The decoded samples mapped to ``[0, 1]``, or ``(samples, latents)``
+        when ``return_latents`` is true.
+    """
+    if force_uc_zero_embeddings is None:
+        force_uc_zero_embeddings = []
+    if batch2model_input is None:
+        batch2model_input = []
+
+    # autocast takes a device *type* ("cuda"), so strip a possible index
+    # such as the ":0" in "cuda:0".
+    autocast_device = torch.device(device).type
+    batch_shape = [num_samples]
+    total = math.prod(batch_shape)
+
+    with torch.no_grad(), autocast(autocast_device), model.ema_scope():
+        batch, batch_uc = get_batch(
+            get_unique_embedder_keys_from_conditioner(model.conditioner),
+            value_dict,
+            batch_shape,
+            # Build the conditioning tensors on the requested device instead
+            # of get_batch's own default.
+            device=device,
+        )
+        for key, value in batch.items():
+            if isinstance(value, torch.Tensor):
+                print(key, value.shape)
+            elif isinstance(value, list):
+                print(key, [len(item) for item in value])
+            else:
+                print(key, value)
+        c, uc = model.conditioner.get_unconditional_conditioning(
+            batch,
+            batch_uc=batch_uc,
+            force_uc_zero_embeddings=force_uc_zero_embeddings,
+        )
+
+        # Every entry except "crossattn" is truncated to the sample count and
+        # moved to the target device.
+        for k in c:
+            if k != "crossattn":
+                c[k], uc[k] = map(lambda y: y[k][:total].to(device), (c, uc))
+
+        additional_model_inputs = {k: batch[k] for k in batch2model_input}
+
+        shape = (total, C, H // F, W // F)
+        randn = torch.randn(shape).to(device)
+
+        def denoiser(x, sigma, c):
+            return model.denoiser(
+                model.model, x, sigma, c, **additional_model_inputs
+            )
+
+        samples_z = sampler(denoiser, randn, cond=c, uc=uc)
+        samples_x = model.decode_first_stage(samples_z)
+        samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
+
+        if filter is not None:
+            samples = filter(samples)
+
+        if return_latents:
+            return samples, samples_z
+        return samples
+
+
+def get_batch(keys, value_dict, N: Union[List, ListConfig], device="cuda"):
+    """Build the conditional and unconditional batches for the given keys.
+
+    Args:
+        keys: embedder input keys to populate.
+        value_dict: source values (prompt, sizes, crop coordinates, scores, ...).
+        N: batch shape; text entries are reshaped to it and tensor entries are
+            repeated along it.
+        device: device the tensor entries are moved to.
+
+    Returns:
+        ``(batch, batch_uc)``. Tensor entries missing from ``batch_uc`` are
+        cloned from ``batch``.
+    """
+    # Hardcoded demo setups; might undergo some changes in the future
+
+    def _tile_text(text):
+        return np.repeat([text], repeats=math.prod(N)).reshape(N).tolist()
+
+    def _tile_tensor(values):
+        return torch.tensor(values).to(device).repeat(*N, 1)
+
+    # Keys whose tensor is assembled from two ``value_dict`` entries.
+    pair_sources = {
+        "original_size_as_tuple": ("orig_height", "orig_width"),
+        "crop_coords_top_left": ("crop_coords_top", "crop_coords_left"),
+        "target_size_as_tuple": ("target_height", "target_width"),
+    }
+
+    batch, batch_uc = {}, {}
+
+    for key in keys:
+        if key == "txt":
+            batch["txt"] = _tile_text(value_dict["prompt"])
+            batch_uc["txt"] = _tile_text(value_dict["negative_prompt"])
+        elif key in pair_sources:
+            first, second = pair_sources[key]
+            batch[key] = _tile_tensor([value_dict[first], value_dict[second]])
+        elif key == "aesthetic_score":
+            batch["aesthetic_score"] = _tile_tensor([value_dict["aesthetic_score"]])
+            batch_uc["aesthetic_score"] = _tile_tensor(
+                [value_dict["negative_aesthetic_score"]]
+            )
+        else:
+            # Anything else is copied through unchanged.
+            batch[key] = value_dict[key]
+
+    for key, value in batch.items():
+        if key not in batch_uc and isinstance(value, torch.Tensor):
+            batch_uc[key] = torch.clone(value)
+    return batch, batch_uc
+
+
+def get_input_image_tensor(image: Image.Image, device="cuda"):
+    """Convert a PIL image into a ``(1, 3, H, W)`` float tensor on ``device``.
+
+    The image is resized so that both sides are rounded down to a multiple of
+    64, converted to RGB and scaled with ``value / 127.5 - 1.0``.
+    """
+    w, h = image.size
+    print(f"loaded input image of size ({w}, {h})")
+
+    # resize to integer multiple of 64
+    width = w - w % 64
+    height = h - h % 64
+    resized = image.resize((width, height))
+
+    # (H, W, C) -> (1, C, H, W)
+    pixels = np.array(resized.convert("RGB"))[None].transpose(0, 3, 1, 2)
+    image_tensor = torch.from_numpy(pixels).to(dtype=torch.float32) / 127.5 - 1.0
+    return image_tensor.to(device)
+
+
+def do_img2img(
+    img,
+    model,
+    sampler,
+    value_dict,
+    num_samples,
+    force_uc_zero_embeddings=None,
+    additional_kwargs=None,
+    offset_noise_level: float = 0.0,
+    return_latents=False,
+    skip_encode=False,
+    filter=None,
+    device="cuda",
+):
+    """Noise ``img`` to the first sigma of the schedule and sample from there.
+
+    Args:
+        img: input image, or an already encoded latent when ``skip_encode``
+            is true.
+        model: object providing ``conditioner``, ``denoiser``, ``model``,
+            ``ema_scope()``, ``encode_first_stage()`` and
+            ``decode_first_stage()``.
+        sampler: sampler exposing ``discretization`` and ``num_steps`` and
+            callable as ``sampler(denoiser, x, cond=, uc=)``.
+        value_dict: conditioning values consumed by ``get_batch``.
+        num_samples: number of samples to draw.
+        force_uc_zero_embeddings: forwarded to the conditioner; defaults to
+            an empty list.
+        additional_kwargs: extra entries written into both the conditional and
+            the unconditional conditioning; defaults to an empty dict.
+        offset_noise_level: when positive, adds per-sample offset noise.
+        return_latents: also return the sampled latents.
+        skip_encode: use ``img`` directly as the latent.
+        filter: optional callable applied to the decoded, clamped samples.
+        device: device for the conditioning tensors.
+
+    Returns:
+        The decoded samples mapped to ``[0, 1]``, or ``(samples, latents)``
+        when ``return_latents`` is true.
+    """
+    # Fresh containers per call instead of shared mutable defaults.
+    if force_uc_zero_embeddings is None:
+        force_uc_zero_embeddings = []
+    if additional_kwargs is None:
+        additional_kwargs = {}
+
+    # autocast takes a device *type* ("cuda"), so strip a possible index
+    # such as the ":0" in "cuda:0".
+    autocast_device = torch.device(device).type
+
+    with torch.no_grad(), autocast(autocast_device), model.ema_scope():
+        batch, batch_uc = get_batch(
+            get_unique_embedder_keys_from_conditioner(model.conditioner),
+            value_dict,
+            [num_samples],
+            # Build the conditioning tensors on the requested device instead
+            # of get_batch's own default.
+            device=device,
+        )
+        c, uc = model.conditioner.get_unconditional_conditioning(
+            batch,
+            batch_uc=batch_uc,
+            force_uc_zero_embeddings=force_uc_zero_embeddings,
+        )
+
+        for k in c:
+            c[k], uc[k] = map(lambda y: y[k][:num_samples].to(device), (c, uc))
+
+        for k in additional_kwargs:
+            c[k] = uc[k] = additional_kwargs[k]
+
+        z = img if skip_encode else model.encode_first_stage(img)
+        noise = torch.randn_like(z)
+        sigmas = sampler.discretization(sampler.num_steps)
+        sigma = sigmas[0].to(z.device)
+
+        if offset_noise_level > 0.0:
+            noise = noise + offset_noise_level * append_dims(
+                torch.randn(z.shape[0], device=z.device), z.ndim
+            )
+        noised_z = z + noise * append_dims(sigma, z.ndim)
+        # Note: hardcoded to DDPM-like scaling. need to generalize later.
+        noised_z = noised_z / torch.sqrt(1.0 + sigma**2.0)
+
+        def denoiser(x, sigma, c):
+            return model.denoiser(model.model, x, sigma, c)
+
+        samples_z = sampler(denoiser, noised_z, cond=c, uc=uc)
+        samples_x = model.decode_first_stage(samples_z)
+        samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
+
+        if filter is not None:
+            samples = filter(samples)
+
+        if return_latents:
+            return samples, samples_z
+        return samples
diff --git a/sgm/lr_scheduler.py b/sgm/lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2f4d384c1fcaff0df13e0564450d3fa972ace42
--- /dev/null
+++ b/sgm/lr_scheduler.py
@@ -0,0 +1,135 @@
+import numpy as np
+
+
+class LambdaWarmUpCosineScheduler:
+    """
+    Linear warm-up from ``lr_start`` to ``lr_max`` followed by a cosine decay
+    down to ``lr_min`` that ends at ``max_decay_steps``.
+
+    note: use with a base_lr of 1.0
+    """
+
+    def __init__(
+        self,
+        warm_up_steps,
+        lr_min,
+        lr_max,
+        lr_start,
+        max_decay_steps,
+        verbosity_interval=0,
+    ):
+        self.lr_warm_up_steps = warm_up_steps
+        self.lr_max_decay_steps = max_decay_steps
+        self.lr_start = lr_start
+        self.lr_max = lr_max
+        self.lr_min = lr_min
+        self.verbosity_interval = verbosity_interval
+        # Multiplier returned by the most recent call; reported when logging.
+        self.last_lr = 0.0
+
+    def schedule(self, n, **kwargs):
+        """Return the lr multiplier for step ``n`` and remember it."""
+        if self.verbosity_interval > 0 and n % self.verbosity_interval == 0:
+            print(f"current step: {n}, recent lr-multiplier: {self.last_lr}")
+
+        warm_up = self.lr_warm_up_steps
+        if n < warm_up:
+            # Linear ramp between lr_start and lr_max.
+            lr = (self.lr_max - self.lr_start) / warm_up * n + self.lr_start
+        else:
+            # Progress through the decay phase, capped at 1.0.
+            t = (n - warm_up) / (self.lr_max_decay_steps - warm_up)
+            t = min(t, 1.0)
+            lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (
+                1 + np.cos(t * np.pi)
+            )
+
+        self.last_lr = lr
+        return lr
+
+    def __call__(self, n, **kwargs):
+        return self.schedule(n, **kwargs)
+
+
+class LambdaWarmUpCosineScheduler2:
+    """
+    supports repeated iterations, configurable via lists
+    note: use with a base_lr of 1.0.
+
+    All list arguments must have the same length, one entry per cycle. Within a
+    cycle the multiplier ramps linearly from ``f_start`` to ``f_max`` during the
+    warm-up steps and then follows a cosine decay down to ``f_min``.
+    """
+
+    def __init__(
+        self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0
+    ):
+        assert (
+            len(warm_up_steps)
+            == len(f_min)
+            == len(f_max)
+            == len(f_start)
+            == len(cycle_lengths)
+        )
+        self.lr_warm_up_steps = warm_up_steps
+        self.f_start = f_start
+        self.f_min = f_min
+        self.f_max = f_max
+        self.cycle_lengths = cycle_lengths
+        # Cumulative cycle boundaries, starting with 0.
+        self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths))
+        self.last_f = 0.0
+        self.verbosity_interval = verbosity_interval
+
+    def find_in_interval(self, n):
+        """Return the index of the cycle that contains global step ``n``.
+
+        A step equal to a cycle boundary belongs to the cycle that ends there.
+
+        Raises:
+            ValueError: if ``n`` lies beyond the end of the last cycle.
+        """
+        for interval, cycle_end in enumerate(self.cum_cycles[1:]):
+            if n <= cycle_end:
+                return interval
+        # Previously this fell through and returned None, which only failed
+        # later with an unrelated indexing error.
+        raise ValueError(
+            f"step {n} exceeds the total length of all cycles "
+            f"({self.cum_cycles[-1]})"
+        )
+
+    def schedule(self, n, **kwargs):
+        """Return the lr multiplier for global step ``n`` and remember it."""
+        cycle = self.find_in_interval(n)
+        # Step index relative to the start of the current cycle.
+        n = n - self.cum_cycles[cycle]
+        if self.verbosity_interval > 0:
+            if n % self.verbosity_interval == 0:
+                print(
+                    f"current step: {n}, recent lr-multiplier: {self.last_f}, "
+                    f"current cycle {cycle}"
+                )
+        if n < self.lr_warm_up_steps[cycle]:
+            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[
+                cycle
+            ] * n + self.f_start[cycle]
+            self.last_f = f
+            return f
+        else:
+            t = (n - self.lr_warm_up_steps[cycle]) / (
+                self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle]
+            )
+            t = min(t, 1.0)
+            f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * (
+                1 + np.cos(t * np.pi)
+            )
+            self.last_f = f
+            return f
+
+    def __call__(self, n, **kwargs):
+        return self.schedule(n, **kwargs)
+
+
+class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
+    """Variant of the cyclic scheduler that decays linearly after warm-up."""
+
+    def schedule(self, n, **kwargs):
+        """Return the lr multiplier for global step ``n`` and remember it."""
+        cycle = self.find_in_interval(n)
+        # Step index relative to the start of the current cycle.
+        n = n - self.cum_cycles[cycle]
+
+        if self.verbosity_interval > 0 and n % self.verbosity_interval == 0:
+            print(
+                f"current step: {n}, recent lr-multiplier: {self.last_f}, "
+                f"current cycle {cycle}"
+            )
+
+        warm_up = self.lr_warm_up_steps[cycle]
+        f_max = self.f_max[cycle]
+        if n < warm_up:
+            # Linear ramp from f_start to f_max.
+            f = (f_max - self.f_start[cycle]) / warm_up * n + self.f_start[cycle]
+        else:
+            # Linear in the remaining fraction of the whole cycle length.
+            f_min = self.f_min[cycle]
+            cycle_length = self.cycle_lengths[cycle]
+            f = f_min + (f_max - f_min) * (cycle_length - n) / (cycle_length)
+
+        self.last_f = f
+        return f
diff --git a/sgm/models/__init__.py b/sgm/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c410b3747afc208e4204c8f140170e0a7808eace
--- /dev/null
+++ b/sgm/models/__init__.py
@@ -0,0 +1,2 @@
+from .autoencoder import AutoencodingEngine
+from .diffusion import DiffusionEngine
diff --git a/sgm/models/__pycache__/__init__.cpython-310.pyc b/sgm/models/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19816f04e6dc25d8869d6efaf5bccf565e11f878
Binary files /dev/null and b/sgm/models/__pycache__/__init__.cpython-310.pyc differ
diff --git a/sgm/models/__pycache__/autoencoder.cpython-310.pyc b/sgm/models/__pycache__/autoencoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8f698252719089b4d9ac553030ca767983c19d94
Binary files /dev/null and b/sgm/models/__pycache__/autoencoder.cpython-310.pyc differ
diff --git a/sgm/models/autoencoder.py b/sgm/models/autoencoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..2949b91011a2be7a6b8ca17ce260812f20ce8b75
--- /dev/null
+++ b/sgm/models/autoencoder.py
@@ -0,0 +1,615 @@
+import logging
+import math
+import re
+from abc import abstractmethod
+from contextlib import contextmanager
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import pytorch_lightning as pl
+import torch
+import torch.nn as nn
+from einops import rearrange
+from packaging import version
+
+from ..modules.autoencoding.regularizers import AbstractRegularizer
+from ..modules.ema import LitEma
+from ..util import (default, get_nested_attribute, get_obj_from_str,
+                    instantiate_from_config)
+
+logpy = logging.getLogger(__name__)
+
+
+class AbstractAutoencoder(pl.LightningModule):
+    """
+    This is the base class for all autoencoders, including image autoencoders, image autoencoders with discriminators,
+    unCLIP models, etc. Hence, it is fairly general, and specific features
+    (e.g. discriminator training, encoding, decoding) must be implemented in subclasses.
+    """
+
+    def __init__(
+        self,
+        ema_decay: Union[None, float] = None,  # None disables EMA tracking (use_ema becomes False)
+        monitor: Union[None, str] = None,  # stored as self.monitor only when given
+        input_key: str = "jpg",  # stored as self.input_key
+    ):
+        super().__init__()
+
+        self.input_key = input_key
+        self.use_ema = ema_decay is not None
+        if monitor is not None:
+            self.monitor = monitor
+
+        if self.use_ema:
+            self.model_ema = LitEma(self, decay=ema_decay)  # EMA helper built over this whole module
+            logpy.info(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
+
+        if version.parse(torch.__version__) >= version.parse("2.0.0"):
+            self.automatic_optimization = False  # on torch >= 2.0 optimizers are stepped manually
+
+    def apply_ckpt(self, ckpt: Union[None, str, dict]):  # no-op when ckpt is None
+        if ckpt is None:
+            return
+        if isinstance(ckpt, str):
+            ckpt = {  # a bare string is wrapped as ckpt_path for the default CheckpointEngine config
+                "target": "sgm.modules.checkpoint.CheckpointEngine",
+                "params": {"ckpt_path": ckpt},
+            }
+        engine = instantiate_from_config(ckpt)
+        engine(self)  # the instantiated engine is called with this module as its only argument
+
+    @abstractmethod
+    def get_input(self, batch) -> Any:  # subclasses extract the model input from a batch
+        raise NotImplementedError()
+
+    def on_train_batch_end(self, *args, **kwargs):
+        # for EMA computation: the EMA helper is called with the module after each training batch
+        if self.use_ema:
+            self.model_ema(self)
+
+    @contextmanager
+    def ema_scope(self, context=None):  # context: optional label used only in the log messages
+        if self.use_ema:
+            self.model_ema.store(self.parameters())  # stash current weights so they can be restored on exit
+            self.model_ema.copy_to(self)
+            if context is not None:
+                logpy.info(f"{context}: Switched to EMA weights")
+        try:
+            yield None
+        finally:  # runs even if the with-body raises
+            if self.use_ema:
+                self.model_ema.restore(self.parameters())
+                if context is not None:
+                    logpy.info(f"{context}: Restored training weights")
+
+    @abstractmethod
+    def encode(self, *args, **kwargs) -> torch.Tensor:
+        raise NotImplementedError("encode()-method of abstract base class called")
+
+    @abstractmethod
+    def decode(self, *args, **kwargs) -> torch.Tensor:
+        raise NotImplementedError("decode()-method of abstract base class called")
+
+    def instantiate_optimizer_from_config(self, params, lr, cfg):  # cfg: dict with "target" and optional "params"
+        logpy.info(f"loading >>> {cfg['target']} <<< optimizer from config")
+        return get_obj_from_str(cfg["target"])(  # resolve the target string, then call it like an optimizer class
+            params, lr=lr, **cfg.get("params", dict())
+        )
+
+    def configure_optimizers(self) -> Any:  # must be overridden by subclasses
+        raise NotImplementedError()
+
+
+class AutoencodingEngine(AbstractAutoencoder):
+    """
+    Base class for all image autoencoders that we train, like VQGAN or AutoencoderKL
+    (we also restore them explicitly as special cases for legacy reasons).
+    Regularizations such as KL or VQ are moved to the regularizer class.
+    """
+
+    def __init__(
+        self,
+        *args,
+        encoder_config: Dict,
+        decoder_config: Dict,
+        loss_config: Dict,
+        regularizer_config: Dict,
+        optimizer_config: Union[Dict, None] = None,  # defaults to torch.optim.Adam
+        lr_g_factor: float = 1.0,  # scales learning_rate for the autoencoder optimizer
+        trainable_ae_params: Optional[List[List[str]]] = None,  # regex patterns, one list per group
+        ae_optimizer_args: Optional[List[dict]] = None,  # extra optimizer kwargs, one dict per group
+        trainable_disc_params: Optional[List[List[str]]] = None,  # same, for the discriminator
+        disc_optimizer_args: Optional[List[dict]] = None,
+        disc_start_iter: int = 0,  # before this global step only optimizer 0 is stepped
+        diff_boost_factor: float = 3.0,  # gain applied for the "diff_boost" log images
+        ckpt_engine: Union[None, str, dict] = None,
+        ckpt_path: Optional[str] = None,  # deprecated; mutually exclusive with ckpt_engine
+        additional_decode_keys: Optional[List[str]] = None,  # batch keys forwarded to decode()
+        **kwargs,
+    ):
+        """Instantiate the sub-modules from their configs and store training options."""
+        super().__init__(*args, **kwargs)
+        self.automatic_optimization = False  # pytorch lightning
+        self.encoder: torch.nn.Module = instantiate_from_config(encoder_config)
+        self.decoder: torch.nn.Module = instantiate_from_config(decoder_config)
+        self.loss: torch.nn.Module = instantiate_from_config(loss_config)
+        self.regularization: AbstractRegularizer = instantiate_from_config(
+            regularizer_config
+        )
+        self.optimizer_config = default(
+            optimizer_config, {"target": "torch.optim.Adam"}
+        )
+        self.diff_boost_factor = diff_boost_factor
+        self.disc_start_iter = disc_start_iter
+        self.lr_g_factor = lr_g_factor
+        self.trainable_ae_params = trainable_ae_params
+        if self.trainable_ae_params is not None:
+            # one (possibly empty) kwargs dict per autoencoder parameter group
+            self.ae_optimizer_args = default(
+                ae_optimizer_args, [{} for _ in self.trainable_ae_params]
+            )
+            assert len(self.ae_optimizer_args) == len(self.trainable_ae_params)
+        else:
+            self.ae_optimizer_args = [{}]  # makes type consistent
+        # Same bookkeeping for the discriminator parameter groups.
+        self.trainable_disc_params = trainable_disc_params
+        if self.trainable_disc_params is not None:
+            # one (possibly empty) kwargs dict per discriminator parameter group
+            self.disc_optimizer_args = default(
+                disc_optimizer_args, [{} for _ in self.trainable_disc_params]
+            )
+            assert len(self.disc_optimizer_args) == len(self.trainable_disc_params)
+        else:
+            self.disc_optimizer_args = [{}]  # makes type consistent
+        # Checkpoint restore runs once all sub-modules above exist.
+        if ckpt_path is not None:
+            assert ckpt_engine is None, "Can't set ckpt_engine and ckpt_path"
+            logpy.warning("Checkpoint path is deprecated, use `ckpt_engine` instead")
+        self.apply_ckpt(default(ckpt_path, ckpt_engine))
+        self.additional_decode_keys = set(default(additional_decode_keys, []))
+
+    def get_input(self, batch: Dict) -> torch.Tensor:
+        # assuming unified data format, dataloader returns a dict.
+        # image tensors should be scaled to -1 ... 1 and in channels-first
+        # format (e.g., bchw instead of bhwc); the entry is returned as-is, without conversion
+        return batch[self.input_key]
+
+    def get_autoencoder_params(self) -> list:  # parameters handed to the autoencoder optimizer
+        params = []
+        if hasattr(self.loss, "get_trainable_autoencoder_parameters"):  # optional hook on the loss module
+            params += list(self.loss.get_trainable_autoencoder_parameters())
+        if hasattr(self.regularization, "get_trainable_parameters"):  # optional hook on the regularizer
+            params += list(self.regularization.get_trainable_parameters())
+        params = params + list(self.encoder.parameters())
+        params = params + list(self.decoder.parameters())
+        return params
+
+    def get_discriminator_params(self) -> list:  # empty list when the loss exposes no trainable parameters
+        if hasattr(self.loss, "get_trainable_parameters"):
+            params = list(self.loss.get_trainable_parameters())  # e.g., discriminator
+        else:
+            params = []
+        return params
+
+    def get_last_layer(self):  # passed to the loss as "last_layer" in the training/validation steps
+        return self.decoder.get_last_layer()  # delegates to the decoder; its return type is not visible here
+
+    def encode(
+        self,
+        x: torch.Tensor,
+        return_reg_log: bool = False,  # when True, return (z, reg_log) instead of z alone
+        unregularized: bool = False,  # when True, skip the regularizer entirely
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, dict]]:
+        z = self.encoder(x)
+        if unregularized:
+            return z, dict()  # NOTE: always a tuple here, regardless of return_reg_log
+        z, reg_log = self.regularization(z)
+        if return_reg_log:
+            return z, reg_log
+        return z
+
+    def decode(self, z: torch.Tensor, **kwargs) -> torch.Tensor:  # kwargs are passed straight to the decoder
+        x = self.decoder(z, **kwargs)
+        return x
+
+    def forward(
+        self, x: torch.Tensor, **additional_decode_kwargs  # extra kwargs go to decode() only
+    ) -> Tuple[torch.Tensor, torch.Tensor, dict]:
+        z, reg_log = self.encode(x, return_reg_log=True)
+        dec = self.decode(z, **additional_decode_kwargs)
+        return z, dec, reg_log  # (latent, reconstruction, regularizer log)
+
+    def inner_training_step(
+        self, batch: dict, batch_idx: int, optimizer_idx: int = 0  # 0: autoencoder, 1: discriminator
+    ) -> torch.Tensor:
+        x = self.get_input(batch)
+        additional_decode_kwargs = {  # batch entries whose keys are listed in additional_decode_keys
+            key: batch[key] for key in self.additional_decode_keys.intersection(batch)
+        }
+        z, xrec, regularization_log = self(x, **additional_decode_kwargs)
+        if hasattr(self.loss, "forward_keys"):
+            extra_info = {  # everything a loss may request; filtered down to forward_keys below
+                "z": z,
+                "optimizer_idx": optimizer_idx,
+                "global_step": self.global_step,
+                "last_layer": self.get_last_layer(),
+                "split": "train",
+                "regularization_log": regularization_log,
+                "autoencoder": self,
+            }
+            extra_info = {k: extra_info[k] for k in self.loss.forward_keys}
+        else:
+            extra_info = dict()
+        # Both branches call the same loss module on (x, xrec); only the logging differs here.
+        if optimizer_idx == 0:
+            # autoencode
+            out_loss = self.loss(x, xrec, **extra_info)
+            if isinstance(out_loss, tuple):
+                aeloss, log_dict_ae = out_loss
+            else:
+                # simple loss function
+                aeloss = out_loss
+                log_dict_ae = {"train/loss/rec": aeloss.detach()}
+            # full loss dict goes to the logger only; the scalar "loss" below goes to the progress bar only
+            self.log_dict(
+                log_dict_ae,
+                prog_bar=False,
+                logger=True,
+                on_step=True,
+                on_epoch=True,
+                sync_dist=False,
+            )
+            self.log(
+                "loss",
+                aeloss.mean().detach(),
+                prog_bar=True,
+                logger=False,
+                on_epoch=False,
+                on_step=True,
+            )
+            return aeloss
+        elif optimizer_idx == 1:
+            # discriminator
+            discloss, log_dict_disc = self.loss(x, xrec, **extra_info)
+            # -> discriminator always needs to return a tuple
+            self.log_dict(
+                log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True
+            )
+            return discloss
+        else:
+            raise NotImplementedError(f"Unknown optimizer {optimizer_idx}")
+
+    def training_step(self, batch: dict, batch_idx: int):  # manual optimization: steps one optimizer per batch
+        opts = self.optimizers()
+        if not isinstance(opts, list):
+            # Non-adversarial case
+            opts = [opts]
+        optimizer_idx = batch_idx % len(opts)  # alternate between the optimizers by batch index
+        if self.global_step < self.disc_start_iter:
+            optimizer_idx = 0  # before disc_start_iter only optimizer 0 is used
+        opt = opts[optimizer_idx]
+        opt.zero_grad()
+        with opt.toggle_model():
+            loss = self.inner_training_step(
+                batch, batch_idx, optimizer_idx=optimizer_idx
+            )
+            self.manual_backward(loss)
+        opt.step()
+
+    def validation_step(self, batch: dict, batch_idx: int) -> Dict:  # validates with current, then EMA weights
+        log_dict = self._validation_step(batch, batch_idx)
+        with self.ema_scope():  # the scope only swaps weights when use_ema is True
+            log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema")
+            log_dict.update(log_dict_ema)
+        return log_dict
+
+    def _validation_step(self, batch: dict, batch_idx: int, postfix: str = "") -> Dict:
+        x = self.get_input(batch)
+        # postfix is appended to "val" in the split name and in the logged keys (e.g. "_ema")
+        z, xrec, regularization_log = self(x)
+        if hasattr(self.loss, "forward_keys"):
+            extra_info = {  # candidate kwargs for the loss; filtered down to forward_keys below
+                "z": z,
+                "optimizer_idx": 0,
+                "global_step": self.global_step,
+                "last_layer": self.get_last_layer(),
+                "split": "val" + postfix,
+                "regularization_log": regularization_log,
+                "autoencoder": self,
+            }
+            extra_info = {k: extra_info[k] for k in self.loss.forward_keys}
+        else:
+            extra_info = dict()
+        out_loss = self.loss(x, xrec, **extra_info)
+        if isinstance(out_loss, tuple):
+            aeloss, log_dict_ae = out_loss
+        else:
+            # simple loss function
+            aeloss = out_loss
+            log_dict_ae = {f"val{postfix}/loss/rec": aeloss.detach()}
+        full_log_dict = log_dict_ae  # same dict object; the update() below also mutates log_dict_ae
+        # A loss that accepts "optimizer_idx" is evaluated a second time as discriminator (idx 1).
+        if "optimizer_idx" in extra_info:
+            extra_info["optimizer_idx"] = 1
+            discloss, log_dict_disc = self.loss(x, xrec, **extra_info)
+            full_log_dict.update(log_dict_disc)
+        self.log(
+            f"val{postfix}/loss/rec",
+            log_dict_ae[f"val{postfix}/loss/rec"],  # NOTE(review): tuple-returning losses must provide this key, else KeyError
+            sync_dist=True,
+        )
+        self.log_dict(full_log_dict, sync_dist=True)
+        return full_log_dict
+
+    def get_param_groups(
+        self, parameter_names: List[List[str]], optimizer_args: List[dict]
+    ) -> Tuple[List[Dict[str, Any]], int]:
+        """Build optimizer param groups from regex name patterns; return (groups, numel)."""
+        groups = []
+        num_params = 0
+        for names, args in zip(parameter_names, optimizer_args):
+            params = []
+            for pattern_ in names:
+                pattern = re.compile(pattern_)  # .match() anchors at the start of the name
+                pattern_params = [
+                    p for p_name, p in self.named_parameters() if pattern.match(p_name)
+                ]
+                num_params += sum(p.numel() for p in pattern_params)
+                if not pattern_params:
+                    logpy.warning(f"Did not find parameters for pattern {pattern_}")
+                params.extend(pattern_params)
+            groups.append({"params": params, **args})  # group-specific optimizer kwargs
+        return groups, num_params
+
+    def configure_optimizers(self) -> List[torch.optim.Optimizer]:  # [opt_ae] or [opt_ae, opt_disc]
+        if self.trainable_ae_params is None:
+            ae_params = self.get_autoencoder_params()  # flat list of parameters
+        else:
+            ae_params, num_ae_params = self.get_param_groups(  # list of param-group dicts
+                self.trainable_ae_params, self.ae_optimizer_args
+            )
+            logpy.info(f"Number of trainable autoencoder parameters: {num_ae_params:,}")
+        if self.trainable_disc_params is None:
+            disc_params = self.get_discriminator_params()
+        else:
+            disc_params, num_disc_params = self.get_param_groups(
+                self.trainable_disc_params, self.disc_optimizer_args
+            )
+            logpy.info(
+                f"Number of trainable discriminator parameters: {num_disc_params:,}"
+            )
+        opt_ae = self.instantiate_optimizer_from_config(
+            ae_params,
+            default(self.lr_g_factor, 1.0) * self.learning_rate,  # NOTE(review): learning_rate is not assigned in this class — presumably set externally; confirm
+            self.optimizer_config,
+        )
+        opts = [opt_ae]
+        if len(disc_params) > 0:  # a discriminator optimizer is added only when there is something to train
+            opt_disc = self.instantiate_optimizer_from_config(
+                disc_params, self.learning_rate, self.optimizer_config
+            )
+            opts.append(opt_disc)
+
+        return opts
+
+    @torch.no_grad()
+    def log_images(
+        self, batch: dict, additional_log_kwargs: Optional[Dict] = None, **kwargs  # **kwargs is accepted but unused
+    ) -> dict:
+        log = dict()
+        additional_decode_kwargs = {}
+        x = self.get_input(batch)
+        additional_decode_kwargs.update(  # batch entries whose keys are listed in additional_decode_keys
+            {key: batch[key] for key in self.additional_decode_keys.intersection(batch)}
+        )
+        # Reconstruction with the current weights.
+        _, xrec, _ = self(x, **additional_decode_kwargs)
+        log["inputs"] = x
+        log["reconstructions"] = xrec
+        diff = 0.5 * torch.abs(torch.clamp(xrec, -1.0, 1.0) - x)  # halved absolute error of the clamped reconstruction
+        diff.clamp_(0, 1.0)
+        log["diff"] = 2.0 * diff - 1.0  # map [0, 1] back to [-1, 1]
+        # diff_boost shows location of small errors, by boosting their
+        # brightness.
+        log["diff_boost"] = (
+            2.0 * torch.clamp(self.diff_boost_factor * diff, 0.0, 1.0) - 1
+        )
+        if hasattr(self.loss, "log_images"):  # optional hook: the loss may contribute its own images
+            log.update(self.loss.log_images(x, xrec))
+        with self.ema_scope():  # same images again, computed inside the EMA scope
+            _, xrec_ema, _ = self(x, **additional_decode_kwargs)
+            log["reconstructions_ema"] = xrec_ema
+            diff_ema = 0.5 * torch.abs(torch.clamp(xrec_ema, -1.0, 1.0) - x)
+            diff_ema.clamp_(0, 1.0)
+            log["diff_ema"] = 2.0 * diff_ema - 1.0
+            log["diff_boost_ema"] = (
+                2.0 * torch.clamp(self.diff_boost_factor * diff_ema, 0.0, 1.0) - 1
+            )
+        if additional_log_kwargs:  # optional extra reconstruction with caller-supplied decode kwargs
+            additional_decode_kwargs.update(additional_log_kwargs)
+            _, xrec_add, _ = self(x, **additional_decode_kwargs)
+            log_str = "reconstructions-" + "-".join(  # log key encodes the extra kwargs as key=value pairs
+                [f"{key}={additional_log_kwargs[key]}" for key in additional_log_kwargs]
+            )
+            log[log_str] = xrec_add
+        return log
+
+
+class AutoencodingEngineLegacy(AutoencodingEngine):  # adds quant_conv/post_quant_conv and optional chunked encode/decode
+    def __init__(self, embed_dim: int, **kwargs):
+        self.max_batch_size = kwargs.pop("max_batch_size", None)  # None: encode/decode the whole batch at once
+        ddconfig = kwargs.pop("ddconfig")  # required; used as params for both Encoder and Decoder
+        ckpt_path = kwargs.pop("ckpt_path", None)  # popped so the parent does not restore before the convs exist
+        ckpt_engine = kwargs.pop("ckpt_engine", None)
+        super().__init__(
+            encoder_config={
+                "target": "sgm.modules.diffusionmodules.model.Encoder",
+                "params": ddconfig,
+            },
+            decoder_config={
+                "target": "sgm.modules.diffusionmodules.model.Decoder",
+                "params": ddconfig,
+            },
+            **kwargs,
+        )
+        self.quant_conv = torch.nn.Conv2d(  # 1x1 conv; channel counts are doubled when ddconfig["double_z"] is truthy
+            (1 + ddconfig["double_z"]) * ddconfig["z_channels"],
+            (1 + ddconfig["double_z"]) * embed_dim,
+            1,
+        )
+        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)  # 1x1 conv back to z_channels
+        self.embed_dim = embed_dim
+        # Restore only now, after quant_conv/post_quant_conv have been created.
+        self.apply_ckpt(default(ckpt_path, ckpt_engine))
+
+    def get_autoencoder_params(self) -> list:  # currently identical to the parent implementation
+        params = super().get_autoencoder_params()
+        return params
+
+    def encode(
+        self, x: torch.Tensor, return_reg_log: bool = False  # NOTE: no `unregularized` option, unlike the parent
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, dict]]:
+        if self.max_batch_size is None:
+            z = self.encoder(x)
+            z = self.quant_conv(z)
+        else:
+            N = x.shape[0]
+            bs = self.max_batch_size
+            n_batches = int(math.ceil(N / bs))
+            z = list()
+            for i_batch in range(n_batches):  # encode in chunks of at most bs samples along dim 0
+                z_batch = self.encoder(x[i_batch * bs : (i_batch + 1) * bs])
+                z_batch = self.quant_conv(z_batch)
+                z.append(z_batch)
+            z = torch.cat(z, 0)
+        # The regularizer is applied once, to the concatenated result.
+        z, reg_log = self.regularization(z)
+        if return_reg_log:
+            return z, reg_log
+        return z
+
+    def decode(self, z: torch.Tensor, **decoder_kwargs) -> torch.Tensor:  # decoder_kwargs are reused for every chunk
+        if self.max_batch_size is None:
+            dec = self.post_quant_conv(z)
+            dec = self.decoder(dec, **decoder_kwargs)
+        else:
+            N = z.shape[0]
+            bs = self.max_batch_size
+            n_batches = int(math.ceil(N / bs))
+            dec = list()
+            for i_batch in range(n_batches):  # decode in chunks of at most bs samples along dim 0
+                dec_batch = self.post_quant_conv(z[i_batch * bs : (i_batch + 1) * bs])
+                dec_batch = self.decoder(dec_batch, **decoder_kwargs)
+                dec.append(dec_batch)
+            dec = torch.cat(dec, 0)
+
+        return dec
+
+
+class AutoencoderKL(AutoencodingEngineLegacy):  # legacy engine preconfigured with DiagonalGaussianRegularizer
+    def __init__(self, **kwargs):
+        if "lossconfig" in kwargs:  # accept the old keyword spelling
+            kwargs["loss_config"] = kwargs.pop("lossconfig")
+        super().__init__(
+            regularizer_config={  # no "params" entry is given here
+                "target": (
+                    "sgm.modules.autoencoding.regularizers"
+                    ".DiagonalGaussianRegularizer"
+                )
+            },
+            **kwargs,
+        )
+
+
+class AutoencoderLegacyVQ(AutoencodingEngineLegacy):  # legacy engine preconfigured with a VectorQuantizer
+    def __init__(
+        self,
+        embed_dim: int,  # forwarded to the parent and used as the quantizer's e_dim
+        n_embed: int,  # passed to the quantizer as n_e
+        sane_index_shape: bool = False,  # passed through to the quantizer unchanged
+        **kwargs,
+    ):
+        if "lossconfig" in kwargs:  # accept the old keyword spelling, with a warning
+            logpy.warning("Parameter `lossconfig` is deprecated, use `loss_config`.")
+            kwargs["loss_config"] = kwargs.pop("lossconfig")
+        super().__init__(
+            regularizer_config={
+                "target": (
+                    "sgm.modules.autoencoding.regularizers.quantize.VectorQuantizer"
+                ),
+                "params": {
+                    "n_e": n_embed,
+                    "e_dim": embed_dim,
+                    "sane_index_shape": sane_index_shape,
+                },
+            },
+            **kwargs,
+        )
+
+
+class IdentityFirstStage(AbstractAutoencoder):  # pass-through autoencoder: every method returns its input
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def get_input(self, x: Any) -> Any:  # returns the batch object itself, no key lookup
+        return x
+
+    def encode(self, x: Any, *args, **kwargs) -> Any:  # extra arguments are ignored
+        return x
+
+    def decode(self, x: Any, *args, **kwargs) -> Any:  # extra arguments are ignored
+        return x
+
+
+class AEIntegerWrapper(nn.Module):  # exposes a wrapped autoencoder through integer codebook indices
+    def __init__(
+        self,
+        model: nn.Module,  # must provide encode() and decode()
+        shape: Union[None, Tuple[int, int], List[int]] = (16, 16),  # default (h, w) used by decode()
+        regularization_key: str = "regularization",  # attribute path on model that holds the regularizer
+        encoder_kwargs: Optional[Dict[str, Any]] = None,  # defaults to {"return_reg_log": True}
+    ):
+        super().__init__()
+        self.model = model
+        assert hasattr(model, "encode") and hasattr(
+            model, "decode"
+        ), "Need AE interface"
+        self.regularization = get_nested_attribute(model, regularization_key)
+        self.shape = shape
+        self.encoder_kwargs = default(encoder_kwargs, {"return_reg_log": True})
+
+    def encode(self, x) -> torch.Tensor:  # returns the indices flattened to (b, -1)
+        assert (
+            not self.training
+        ), f"{self.__class__.__name__} only supports inference currently"
+        _, log = self.model.encode(x, **self.encoder_kwargs)  # only the log (second return value) is used
+        assert isinstance(log, dict)
+        inds = log["min_encoding_indices"]  # the wrapped model's log must contain this key
+        return rearrange(inds, "b ... -> b (...)")
+
+    def decode(
+        self, inds: torch.Tensor, shape: Union[None, tuple, list] = None  # shape overrides self.shape when given
+    ) -> torch.Tensor:
+        # expect inds shape (b, s) with s = h*w
+        shape = default(shape, self.shape)  # Optional[(h, w)]
+        if shape is not None:
+            assert len(shape) == 2, f"Unhandeled shape {shape}"
+            inds = rearrange(inds, "b (h w) -> b h w", h=shape[0], w=shape[1])
+        h = self.regularization.get_codebook_entry(inds)  # (b, h, w, c)
+        h = rearrange(h, "b h w c -> b c h w")  # to channels-first before decoding
+        return self.model.decode(h)
+
+
+class AutoencoderKLModeOnly(AutoencodingEngineLegacy):  # like AutoencoderKL but the regularizer gets sample=False
+    def __init__(self, **kwargs):
+        if "lossconfig" in kwargs:  # accept the old keyword spelling
+            kwargs["loss_config"] = kwargs.pop("lossconfig")
+        super().__init__(
+            regularizer_config={
+                "target": (
+                    "sgm.modules.autoencoding.regularizers"
+                    ".DiagonalGaussianRegularizer"
+                ),
+                "params": {"sample": False},  # presumably selects the distribution mode instead of sampling — confirm in the regularizer
+            },
+            **kwargs,
+        )
diff --git a/sgm/models/diffusion.py b/sgm/models/diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c7f393bdead9c7700c31a19682d0989db8773c7
--- /dev/null
+++ b/sgm/models/diffusion.py
@@ -0,0 +1,291 @@
+import math
+from contextlib import contextmanager
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import pytorch_lightning as pl
+import torch
+from einops import rearrange, repeat
+from omegaconf import ListConfig, OmegaConf
+from safetensors.torch import load_file as load_safetensors
+from torch.optim.lr_scheduler import LambdaLR
+
+from ..modules import UNCONDITIONAL_CONFIG
+from ..modules.autoencoding.temporal_ae import VideoDecoder
+from ..modules.diffusionmodules.wrappers import OPENAIUNETWRAPPER
+from ..modules.ema import LitEma
+from ..util import (default, disabled_train, get_obj_from_str,
+                    instantiate_from_config, log_txt_as_img)
+
+
+class DiffusionEngine(pl.LightningModule):
+    def __init__(
+        self,
+        network_config,  # must expose .params.num_frames (read below)
+        denoiser_config,
+        first_stage_config,
+        conditioner_config: Union[None, Dict, ListConfig, OmegaConf] = None,  # None -> UNCONDITIONAL_CONFIG
+        sampler_config: Union[None, Dict, ListConfig, OmegaConf] = None,  # None -> self.sampler is None
+        optimizer_config: Union[None, Dict, ListConfig, OmegaConf] = None,  # None -> torch.optim.AdamW
+        scheduler_config: Union[None, Dict, ListConfig, OmegaConf] = None,
+        loss_fn_config: Union[None, Dict, ListConfig, OmegaConf] = None,  # None -> self.loss_fn is None
+        network_wrapper: Union[None, str] = None,  # None -> OPENAIUNETWRAPPER
+        ckpt_path: Union[None, str] = None,
+        use_ema: bool = False,
+        ema_decay_rate: float = 0.9999,
+        scale_factor: float = 1.0,  # latents are multiplied by this on encode and divided on decode
+        disable_first_stage_autocast=False,
+        input_key: str = "jpg",
+        log_keys: Union[List, None] = None,
+        no_cond_log: bool = False,
+        compile_model: bool = False,  # forwarded to the network wrapper
+        en_and_decode_n_samples_a_time: Optional[int] = None,  # chunk size for first-stage encode/decode
+    ):
+        super().__init__()
+        self.log_keys = log_keys
+        self.input_key = input_key
+        self.optimizer_config = default(
+            optimizer_config, {"target": "torch.optim.AdamW"}
+        )
+        model = instantiate_from_config(network_config)
+        self.model = get_obj_from_str(default(network_wrapper, OPENAIUNETWRAPPER))(  # wrap the raw network
+            model, compile_model=compile_model
+        )
+
+        self.denoiser = instantiate_from_config(denoiser_config)
+        self.sampler = (
+            instantiate_from_config(sampler_config)
+            if sampler_config is not None
+            else None
+        )
+        self.conditioner = instantiate_from_config(
+            default(conditioner_config, UNCONDITIONAL_CONFIG)
+        )
+        self.scheduler_config = scheduler_config
+        self._init_first_stage(first_stage_config)
+
+        ## update with num_frames
+        self.num_frames = network_config.params.num_frames
+        if loss_fn_config is not None:
+            loss_fn_config.params.num_frames = self.num_frames  # NOTE: mutates the caller's config object in place
+        self.loss_fn = (
+            instantiate_from_config(loss_fn_config)
+            if loss_fn_config is not None
+            else None
+        )
+
+        self.use_ema = use_ema
+        if self.use_ema:
+            self.model_ema = LitEma(self.model, decay=ema_decay_rate)  # EMA covers self.model only
+            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
+
+        self.scale_factor = scale_factor
+        self.disable_first_stage_autocast = disable_first_stage_autocast
+        self.no_cond_log = no_cond_log
+
+        if ckpt_path is not None:
+            self.init_from_ckpt(ckpt_path)
+
+        self.en_and_decode_n_samples_a_time = en_and_decode_n_samples_a_time
+
+    def init_from_ckpt(self, path: str) -> None:
+        """Load weights non-strictly from a ``ckpt`` or ``safetensors`` file.
+
+        Raises NotImplementedError when ``path`` has any other suffix.
+        """
+        if path.endswith("ckpt"):
+            sd = torch.load(path, map_location="cpu")["state_dict"]  # NOTE: unpickles; trusted files only
+        elif path.endswith("safetensors"):
+            sd = load_safetensors(path)
+        else:
+            raise NotImplementedError(f"Unsupported checkpoint format: {path}")
+        missing, unexpected = self.load_state_dict(sd, strict=False)
+        print(
+            f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys"
+        )
+        if len(missing) > 0:
+            print(f"Missing Keys: {missing}")
+        if len(unexpected) > 0:
+            print(f"Unexpected Keys: {unexpected}")
+
+    def _init_first_stage(self, config):  # builds the first-stage model in eval mode with gradients disabled
+        model = instantiate_from_config(config).eval()
+        model.train = disabled_train  # replace .train on the instance; presumably keeps it in eval mode — confirm in util
+        for param in model.parameters():
+            param.requires_grad = False
+        self.first_stage_model = model
+
+    @torch.no_grad()
+    def decode_first_stage(self, z):  # undo the latent scaling, then decode in chunks along dim 0
+        z = 1.0 / self.scale_factor * z
+        n_samples = default(self.en_and_decode_n_samples_a_time, z.shape[0])  # chunk size; whole batch by default
+        # NOTE(review): an empty batch with no chunk size configured gives n_samples == 0 and a ZeroDivisionError below.
+        n_rounds = math.ceil(z.shape[0] / n_samples)
+        all_out = []
+        with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
+            for n in range(n_rounds):
+                if isinstance(self.first_stage_model.decoder, VideoDecoder):
+                    kwargs = {"timesteps": len(z[n * n_samples : (n + 1) * n_samples])}  # size of this chunk
+                else:
+                    kwargs = {}
+                out = self.first_stage_model.decode(
+                    z[n * n_samples : (n + 1) * n_samples], **kwargs
+                )
+                all_out.append(out)
+        out = torch.cat(all_out, dim=0)
+        return out
+
+    @torch.no_grad()
+    def encode_first_stage(self, x):  # encode in chunks along dim 0, then apply the latent scaling
+        n_samples = default(self.en_and_decode_n_samples_a_time, x.shape[0])  # chunk size; whole batch by default
+        n_rounds = math.ceil(x.shape[0] / n_samples)
+        all_out = []
+        with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
+            for n in range(n_rounds):
+                out = self.first_stage_model.encode(
+                    x[n * n_samples : (n + 1) * n_samples]
+                )
+                all_out.append(out)
+        z = torch.cat(all_out, dim=0)
+        z = self.scale_factor * z  # inverse of the division in decode_first_stage
+        return z
+
+    def forward(self, x, batch):  # returns (mean loss, {"loss": mean loss})
+        loss = self.loss_fn(self.model, self.denoiser, self.conditioner, x, batch)
+        loss_mean = loss.mean()
+        loss_dict = {"loss": loss_mean}
+        return loss_mean, loss_dict
+
+    def get_input(self, batch):
+        # assuming unified data format, dataloader returns a dict.
+        # tensors should be scaled to -1 ... 1; the rearrange below expects a 5-d "b c t h w" layout
+        x = batch[self.input_key]
+        x = rearrange(x, "b c t h w -> (b t) c h w")  # fold the t axis into the batch axis
+        return x
+
+    def shared_step(self, batch: Dict) -> Any:  # returns (loss, loss_dict) from forward()
+        x = self.get_input(batch)
+        x = self.encode_first_stage(x)  # the loss is computed on scaled first-stage latents
+        batch["global_step"] = self.global_step  # NOTE: writes into the caller's batch dict
+        loss, loss_dict = self(x, batch)
+        return loss, loss_dict
+
+    def training_step(self, batch, batch_idx):  # logs per step (not per epoch) and returns the loss
+        loss, loss_dict = self.shared_step(batch)
+        self.log_dict(loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=False)
+        self.log("global_step", self.global_step, prog_bar=True, logger=True, on_step=True, on_epoch=False)
+
+        if self.scheduler_config is not None:
+            lr = self.optimizers().param_groups[0]["lr"]  # current lr of the first param group
+            self.log("lr_abs", lr, prog_bar=True, logger=True, on_step=True, on_epoch=False)
+
+        return loss
+
+    def on_train_start(self, *args, **kwargs):  # both are optional in __init__ but required once training starts
+        if self.sampler is None or self.loss_fn is None:
+            raise ValueError("Sampler and loss function need to be set for training.")
+
+    def on_train_batch_end(self, *args, **kwargs):  # hook arguments are ignored
+        if self.use_ema:
+            self.model_ema(self.model)  # the EMA helper is called with the wrapped network after each batch
+
+    @contextmanager
+    def ema_scope(self, context=None):  # context: optional label used only in the printed messages
+        if self.use_ema:
+            self.model_ema.store(self.model.parameters())  # stash current weights so they can be restored on exit
+            self.model_ema.copy_to(self.model)
+            if context is not None:
+                print(f"{context}: Switched to EMA weights")
+        try:
+            yield None
+        finally:  # runs even if the with-body raises
+            if self.use_ema:
+                self.model_ema.restore(self.model.parameters())
+                if context is not None:
+                    print(f"{context}: Restored training weights")
+
+    def instantiate_optimizer_from_config(self, params, lr, cfg):
+        # Resolve cfg["target"] to a callable, then call it with lr and any cfg["params"] kwargs.
+        optimizer_factory = get_obj_from_str(cfg["target"])
+        return optimizer_factory(params, lr=lr, **cfg.get("params", dict()))
+
+    def configure_optimizers(self):
+        """Build the optimizer and, when a scheduler is configured, a per-step LambdaLR."""
+        params = list(self.model.parameters())
+        # Parameters of embedders flagged as trainable are optimised together with the model.
+        for embedder in self.conditioner.embedders:
+            if embedder.is_trainable:
+                params = params + list(embedder.parameters())
+        print(f"@Training [{len(params)}] paramters.")
+        opt = self.instantiate_optimizer_from_config(
+            params, self.learning_rate, self.optimizer_config
+        )
+        if self.scheduler_config is None:
+            return opt
+        schedule_obj = instantiate_from_config(self.scheduler_config)
+        print("Setting up LambdaLR scheduler...")
+        lr_scheduler = {
+            "scheduler": LambdaLR(opt, lr_lambda=schedule_obj.schedule),
+            "interval": "step",
+            "frequency": 1,
+        }
+        return [opt], [lr_scheduler]
+
+    @torch.no_grad()
+    def sample(
+        self,
+        cond: Dict,
+        uc: Union[Dict, None] = None,
+        batch_size: int = 16,
+        shape: Union[None, Tuple, List] = None,
+        **kwargs,
+    ):
+        """Run ``self.sampler`` from noise; pass ``randn=`` to supply the starting noise yourself."""
+        if "randn" in kwargs:
+            randn = kwargs.pop("randn")  # popped so it is not forwarded to the denoiser below
+        elif shape is None:
+            raise ValueError("sample() needs `shape` when no `randn` tensor is given.")
+        else:
+            randn = torch.randn(batch_size, *shape).to(self.device)
+        denoiser = lambda input, sigma, c: self.denoiser(
+            self.model, input, sigma, c, **kwargs
+        )
+        return self.sampler(denoiser, randn, cond, uc=uc)
+
+    @torch.no_grad()
+    def log_conditionings(self, batch: Dict, n: int) -> Dict:
+        """
+        Render the first ``n`` conditioning entries of each selected embedder via log_txt_as_img.
+        Handles 1-D / 2-D tensors and lists of strings; returns {embedder.input_key: rendering}.
+        """
+        image_h, image_w = batch[self.input_key].shape[-2:]  # last two axes: 4-D and 5-D inputs work
+        log = dict()
+
+        for embedder in self.conditioner.embedders:
+            if (
+                (self.log_keys is None) or (embedder.input_key in self.log_keys)
+            ) and not self.no_cond_log:
+                x = batch[embedder.input_key][:n]
+                if isinstance(x, torch.Tensor):
+                    if x.dim() == 1:
+                        # class-conditional, convert integer to string
+                        x = [str(x[i].item()) for i in range(x.shape[0])]
+                        xc = log_txt_as_img((image_h, image_w), x, size=image_h // 4)
+                    elif x.dim() == 2:
+                        # size and crop cond and the like
+                        x = [
+                            "x".join([str(xx) for xx in x[i].tolist()])
+                            for i in range(x.shape[0])
+                        ]
+                        xc = log_txt_as_img((image_h, image_w), x, size=image_h // 20)
+                    else:
+                        raise NotImplementedError()
+                elif isinstance(x, (List, ListConfig)):
+                    if isinstance(x[0], str):
+                        # strings
+                        xc = log_txt_as_img((image_h, image_w), x, size=image_h // 20)
+                    else:
+                        raise NotImplementedError()
+                else:
+                    raise NotImplementedError()
+                log[embedder.input_key] = xc
+        return log
\ No newline at end of file
diff --git a/sgm/modules/__init__.py b/sgm/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0db1d7716a6e48f77b86a4b59c9289d6fb76b50b
--- /dev/null
+++ b/sgm/modules/__init__.py
@@ -0,0 +1,6 @@
+from .encoders.modules import GeneralConditioner
+
+UNCONDITIONAL_CONFIG = {  # target/params config naming GeneralConditioner with an empty emb_models list
+    "target": "sgm.modules.GeneralConditioner",
+    "params": {"emb_models": []},
+}
diff --git a/sgm/modules/attention.py b/sgm/modules/attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..52a50b7bd744bea0f0cdca23b0cfd14ad87794be
--- /dev/null
+++ b/sgm/modules/attention.py
@@ -0,0 +1,759 @@
+import logging
+import math
+from inspect import isfunction
+from typing import Any, Optional
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from packaging import version
+from torch import nn
+from torch.utils.checkpoint import checkpoint
+
+logpy = logging.getLogger(__name__)
+
+if version.parse(torch.__version__) >= version.parse("2.0.0"):
+    SDP_IS_AVAILABLE = True
+    from torch.backends.cuda import SDPBackend, sdp_kernel
+    # Keyword arguments for sdp_kernel(), keyed by the requested backend (None enables all three).
+    BACKEND_MAP = {
+        SDPBackend.MATH: {
+            "enable_math": True,
+            "enable_flash": False,
+            "enable_mem_efficient": False,
+        },
+        SDPBackend.FLASH_ATTENTION: {
+            "enable_math": False,
+            "enable_flash": True,
+            "enable_mem_efficient": False,
+        },
+        SDPBackend.EFFICIENT_ATTENTION: {
+            "enable_math": False,
+            "enable_flash": False,
+            "enable_mem_efficient": True,
+        },
+        None: {"enable_math": True, "enable_flash": True, "enable_mem_efficient": True},
+    }
+else:
+    from contextlib import nullcontext
+    # Older torch: sdp_kernel becomes a no-op context manager and no backend kwargs exist.
+    SDP_IS_AVAILABLE = False
+    sdp_kernel = nullcontext
+    BACKEND_MAP = {}
+    logpy.warning(
+        f"No SDP backend available, likely because you are running in pytorch "
+        f"versions < 2.0. In fact, you are using PyTorch {torch.__version__}. "
+        f"You might want to consider upgrading."
+    )
+
+try:
+    import xformers
+    import xformers.ops
+    # Reaching this line means both imports succeeded.
+    XFORMERS_IS_AVAILABLE = True
+except Exception:  # any import-time failure disables xformers; Ctrl-C / SystemExit still propagate
+    XFORMERS_IS_AVAILABLE = False
+    logpy.warning("no module 'xformers'. Processing without...")
+
+# from .diffusionmodules.util import mixed_checkpoint as checkpoint
+
+
+def exists(val):  # True for every value except None (falsy values such as 0 or "" count as existing)
+    return val is not None
+
+
+def uniq(arr):  # distinct elements of arr as a dict-keys view, in first-seen order
+    return dict.fromkeys(arr, True).keys()
+
+
+def default(val, d):  # val unless it is None; otherwise d, called first when d is a plain function
+    if not exists(val):
+        return d() if isfunction(d) else d
+    return val
+
+
+def max_neg_value(t):  # negated finfo(t.dtype).max, i.e. the most negative finite value of t's dtype
+    return -torch.finfo(t.dtype).max
+
+
+def init_(tensor):
+    """Fill ``tensor`` in place from U(-b, b) with b = 1 / sqrt(size of its last axis); return it."""
+    bound = 1 / math.sqrt(tensor.shape[-1])
+    tensor.uniform_(-bound, bound)
+    return tensor
+
+
+# feedforward
+class GEGLU(nn.Module):  # gated GELU: one projection split into a value half and a gate half
+    def __init__(self, dim_in, dim_out):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out * 2)
+
+    def forward(self, x):
+        value, gate = torch.chunk(self.proj(x), 2, dim=-1)
+        return value * F.gelu(gate)
+
+
+class FeedForward(nn.Module):
+    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
+        super().__init__()
+        inner_dim = int(dim * mult)
+        dim_out = default(dim_out, dim)
+        # Input projection: GEGLU when `glu` is set, otherwise Linear followed by GELU.
+        if glu:
+            project_in = GEGLU(dim, inner_dim)
+        else:
+            project_in = nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU())
+        # Full pipeline: input projection -> dropout -> output projection to dim_out.
+        self.net = nn.Sequential(
+            project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
+        )
+
+    def forward(self, x):
+        return self.net(x)
+
+
+def zero_module(module):
+    """
+    Zero every parameter of ``module`` in place; the same module object is returned.
+    """
+    for weight in module.parameters():
+        weight.detach().zero_()  # the detached view shares storage, so the parameter itself is zeroed
+    return module
+
+
+def Normalize(in_channels):
+    # GroupNorm with 32 groups, eps=1e-6 and learnable affine parameters.
+    norm_layer = torch.nn.GroupNorm(32, in_channels, eps=1e-6, affine=True)
+    return norm_layer
+
+
+class LinearAttention(nn.Module):  # attention that first pools keys/values into a per-head context
+    def __init__(self, dim, heads=4, dim_head=32):
+        super().__init__()
+        self.heads = heads
+        hidden_dim = dim_head * heads
+        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)  # 1x1 conv emitting q, k, v together
+        self.to_out = nn.Conv2d(hidden_dim, dim, 1)  # 1x1 conv back to `dim` channels
+
+    def forward(self, x):
+        b, c, h, w = x.shape
+        qkv = self.to_qkv(x)
+        q, k, v = rearrange(
+            qkv, "b (qkv heads c) h w -> qkv b heads c (h w)", heads=self.heads, qkv=3
+        )
+        k = k.softmax(dim=-1)  # normalise keys over the flattened (h w) axis
+        context = torch.einsum("bhdn,bhen->bhde", k, v)  # sum over positions n -> (d, e) per head
+        out = torch.einsum("bhde,bhdn->bhen", context, q)  # contract d with q (q is used unnormalised)
+        out = rearrange(
+            out, "b heads c (h w) -> b (heads c) h w", heads=self.heads, h=h, w=w
+        )
+        return self.to_out(out)
+
+
+class SelfAttention(nn.Module):  # multi-head self-attention over (B, L, C) input, kernel chosen by attn_mode
+    ATTENTION_MODES = ("xformers", "torch", "math")
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        qk_scale: Optional[float] = None,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+        attn_mode: str = "xformers",
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+        # NOTE: self.scale and self.attn_drop are only read by the "math" branch of forward().
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        assert attn_mode in self.ATTENTION_MODES
+        self.attn_mode = attn_mode
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        B, L, C = x.shape
+        # One projection yields q, k and v; each branch below splits it into heads its own way.
+        qkv = self.qkv(x)
+        if self.attn_mode == "torch":
+            qkv = rearrange(
+                qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads
+            ).float()
+            q, k, v = qkv[0], qkv[1], qkv[2]  # B H L D
+            x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
+            x = rearrange(x, "B H L D -> B L (H D)")
+        elif self.attn_mode == "xformers":
+            qkv = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads)
+            q, k, v = qkv[0], qkv[1], qkv[2]  # B L H D
+            x = xformers.ops.memory_efficient_attention(q, k, v)
+            x = rearrange(x, "B L H D -> B L (H D)", H=self.num_heads)
+        elif self.attn_mode == "math":
+            qkv = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+            q, k, v = qkv[0], qkv[1], qkv[2]  # B H L D
+            attn = (q @ k.transpose(-2, -1)) * self.scale
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = (attn @ v).transpose(1, 2).reshape(B, L, C)
+        else:
+            raise NotImplementedError  # NotImplemented is a constant, not an exception class
+
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class SpatialSelfAttention(nn.Module):  # single-head self-attention across the (h w) positions
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+
+        self.norm = Normalize(in_channels)
+        self.q = torch.nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )
+        self.k = torch.nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )
+        self.v = torch.nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )
+        self.proj_out = torch.nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )
+
+    def forward(self, x):
+        h_ = x
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+
+        # compute attention
+        b, c, h, w = q.shape
+        q = rearrange(q, "b c h w -> b (h w) c")
+        k = rearrange(k, "b c h w -> b c (h w)")
+        w_ = torch.einsum("bij,bjk->bik", q, k)  # (b, hw, hw): rows index queries, columns keys
+
+        w_ = w_ * (int(c) ** (-0.5))  # scale by 1 / sqrt(channel count)
+        w_ = torch.nn.functional.softmax(w_, dim=2)  # normalise over the key axis
+
+        # attend to values
+        v = rearrange(v, "b c h w -> b c (h w)")
+        w_ = rearrange(w_, "b i j -> b j i")  # transpose so the key axis contracts with v below
+        h_ = torch.einsum("bij,bjk->bik", v, w_)
+        h_ = rearrange(h_, "b c (h w) -> b c h w", h=h)
+        h_ = self.proj_out(h_)
+
+        return x + h_  # residual connection around the attention branch
+
+
+class CrossAttention(nn.Module):  # multi-head attention built on F.scaled_dot_product_attention
+    def __init__(
+        self,
+        query_dim,
+        context_dim=None,
+        heads=8,
+        dim_head=64,
+        dropout=0.0,
+        backend=None,
+    ):
+        super().__init__()
+        inner_dim = dim_head * heads
+        context_dim = default(context_dim, query_dim)  # no context_dim: keys/values sized like queries
+
+        self.scale = dim_head**-0.5
+        self.heads = heads
+
+        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
+        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
+
+        self.to_out = nn.Sequential(
+            nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
+        )
+        self.backend = backend  # key into BACKEND_MAP, looked up on every forward()
+
+    def forward(
+        self,
+        x,
+        context=None,
+        mask=None,
+        additional_tokens=None,
+        n_times_crossframe_attn_in_self=0,
+    ):
+        h = self.heads
+
+        if additional_tokens is not None:
+            # get the number of masked tokens at the beginning of the output sequence
+            n_tokens_to_mask = additional_tokens.shape[1]
+            # add additional token
+            x = torch.cat([additional_tokens, x], dim=1)
+
+        q = self.to_q(x)
+        context = default(context, x)  # without a context this is plain self-attention
+        k = self.to_k(context)
+        v = self.to_v(context)
+
+        if n_times_crossframe_attn_in_self:
+            # reprogramming cross-frame attention as in https://arxiv.org/abs/2303.13439
+            assert x.shape[0] % n_times_crossframe_attn_in_self == 0
+            n_rep = n_times_crossframe_attn_in_self  # keep every n-th row, repeat it n times: batch size kept
+            k = repeat(
+                k[::n_rep], "b ... -> (b n) ...", n=n_rep
+            )
+            v = repeat(
+                v[::n_rep], "b ... -> (b n) ...", n=n_rep
+            )
+
+        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
+
+        ## old
+        """
+        sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
+        del q, k
+
+        if exists(mask):
+            mask = rearrange(mask, 'b ... -> b (...)')
+            max_neg_value = -torch.finfo(sim.dtype).max
+            mask = repeat(mask, 'b j -> (b h) () j', h=h)
+            sim.masked_fill_(~mask, max_neg_value)
+
+        # attention, what we cannot get enough of
+        sim = sim.softmax(dim=-1)
+
+        out = einsum('b i j, b j d -> b i d', sim, v)
+        """
+        ## new
+        with sdp_kernel(**BACKEND_MAP[self.backend]):
+            # print("dispatching into backend", self.backend, "q/k/v shape: ", q.shape, k.shape, v.shape)
+            out = F.scaled_dot_product_attention(
+                q, k, v, attn_mask=mask
+            )  # scale is dim_head ** -0.5 per default
+
+        del q, k, v
+        out = rearrange(out, "b h n d -> b n (h d)", h=h)
+
+        if additional_tokens is not None:
+            # remove additional token
+            out = out[:, n_tokens_to_mask:]
+        return self.to_out(out)
+
+
+class MemoryEfficientCrossAttention(nn.Module):
+    # https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
+    def __init__(
+        self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0, **kwargs
+    ):  # extra **kwargs are accepted and ignored
+        super().__init__()
+        logpy.debug(
+            f"Setting up {self.__class__.__name__}. Query dim is {query_dim}, "
+            f"context_dim is {context_dim} and using {heads} heads with a "
+            f"dimension of {dim_head}."
+        )
+        inner_dim = dim_head * heads
+        context_dim = default(context_dim, query_dim)
+
+        self.heads = heads
+        self.dim_head = dim_head
+
+        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
+        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
+
+        self.to_out = nn.Sequential(
+            nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
+        )
+        self.attention_op: Optional[Any] = None  # passed as `op=` to memory_efficient_attention
+
+    def forward(
+        self,
+        x,
+        context=None,
+        mask=None,
+        additional_tokens=None,
+        n_times_crossframe_attn_in_self=0,
+    ):
+        if additional_tokens is not None:
+            # get the number of masked tokens at the beginning of the output sequence
+            n_tokens_to_mask = additional_tokens.shape[1]
+            # add additional token
+            x = torch.cat([additional_tokens, x], dim=1)
+        q = self.to_q(x)
+        context = default(context, x)
+        k = self.to_k(context)
+        v = self.to_v(context)
+
+        if n_times_crossframe_attn_in_self:
+            # reprogramming cross-frame attention as in https://arxiv.org/abs/2303.13439
+            assert x.shape[0] % n_times_crossframe_attn_in_self == 0
+            # keep every n-th row of k/v and repeat it n times, so their batch size matches x's
+            k = repeat(
+                k[::n_times_crossframe_attn_in_self],
+                "b ... -> (b n) ...",
+                n=n_times_crossframe_attn_in_self,
+            )
+            v = repeat(
+                v[::n_times_crossframe_attn_in_self],
+                "b ... -> (b n) ...",
+                n=n_times_crossframe_attn_in_self,
+            )
+
+        b, _, _ = q.shape
+        q, k, v = map(  # (b, n, heads*dim_head) -> (b*heads, n, dim_head)
+            lambda t: t.unsqueeze(3)
+            .reshape(b, t.shape[1], self.heads, self.dim_head)
+            .permute(0, 2, 1, 3)
+            .reshape(b * self.heads, t.shape[1], self.dim_head)
+            .contiguous(),
+            (q, k, v),
+        )
+
+        # actually compute the attention, what we cannot get enough of
+        if version.parse(xformers.__version__) >= version.parse("0.0.21"):
+            # NOTE: workaround for
+            # https://github.com/facebookresearch/xformers/issues/845
+            max_bs = 32768  # rows of the flattened (b*heads) axis handled per xformers call
+            N = q.shape[0]
+            n_batches = math.ceil(N / max_bs)
+            out = list()
+            for i_batch in range(n_batches):
+                batch = slice(i_batch * max_bs, (i_batch + 1) * max_bs)
+                out.append(
+                    xformers.ops.memory_efficient_attention(
+                        q[batch],
+                        k[batch],
+                        v[batch],
+                        attn_bias=None,
+                        op=self.attention_op,
+                    )
+                )
+            out = torch.cat(out, 0)
+        else:
+            out = xformers.ops.memory_efficient_attention(
+                q, k, v, attn_bias=None, op=self.attention_op
+            )
+
+        # TODO: Use this directly in the attention operation, as a bias
+        if exists(mask):  # masks are unsupported; note this is only checked after attention has run
+            raise NotImplementedError
+        out = (  # (b*heads, n, dim_head) -> (b, n, heads*dim_head)
+            out.unsqueeze(0)
+            .reshape(b, self.heads, out.shape[1], self.dim_head)
+            .permute(0, 2, 1, 3)
+            .reshape(b, out.shape[1], self.heads * self.dim_head)
+        )
+        if additional_tokens is not None:
+            # remove additional token
+            out = out[:, n_tokens_to_mask:]
+        return self.to_out(out)
+
+
+class BasicTransformerBlock(nn.Module):  # attn1 -> attn2 -> feed-forward, each with a residual add
+    ATTENTION_MODES = {
+        "softmax": CrossAttention,  # vanilla attention
+        "softmax-xformers": MemoryEfficientCrossAttention,  # ampere
+    }
+
+    def __init__(
+        self,
+        dim,
+        n_heads,
+        d_head,
+        dropout=0.0,
+        context_dim=None,
+        gated_ff=True,
+        checkpoint=True,
+        disable_self_attn=False,
+        attn_mode="softmax",
+        sdp_backend=None,
+    ):
+        super().__init__()
+        assert attn_mode in self.ATTENTION_MODES
+        if attn_mode != "softmax" and not XFORMERS_IS_AVAILABLE:
+            logpy.warning(
+                f"Attention mode '{attn_mode}' is not available. Falling "
+                f"back to native attention. This is not a problem in "
+                f"Pytorch >= 2.0. FYI, you are running with PyTorch "
+                f"version {torch.__version__}."
+            )
+            attn_mode = "softmax"
+        elif attn_mode == "softmax" and not SDP_IS_AVAILABLE:
+            logpy.warning(
+                "We do not support vanilla attention anymore, as it is too "
+                "expensive. Sorry."
+            )
+            if not XFORMERS_IS_AVAILABLE:
+                assert (
+                    False
+                ), "Please install xformers via e.g. 'pip install xformers==0.0.16'"
+            else:
+                logpy.info("Falling back to xformers efficient attention.")
+                attn_mode = "softmax-xformers"
+        attn_cls = self.ATTENTION_MODES[attn_mode]
+        if version.parse(torch.__version__) >= version.parse("2.0.0"):
+            assert sdp_backend is None or isinstance(sdp_backend, SDPBackend)
+        else:
+            assert sdp_backend is None
+        self.disable_self_attn = disable_self_attn
+        self.attn1 = attn_cls(
+            query_dim=dim,
+            heads=n_heads,
+            dim_head=d_head,
+            dropout=dropout,
+            context_dim=context_dim if self.disable_self_attn else None,
+            backend=sdp_backend,
+        )  # is a self-attention if not self.disable_self_attn
+        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+        self.attn2 = attn_cls(
+            query_dim=dim,
+            context_dim=context_dim,
+            heads=n_heads,
+            dim_head=d_head,
+            dropout=dropout,
+            backend=sdp_backend,
+        )  # is self-attn if context is none
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.norm3 = nn.LayerNorm(dim)
+        self.checkpoint = checkpoint
+        if self.checkpoint:
+            logpy.debug(f"{self.__class__.__name__} is using checkpointing")
+
+    def forward(
+        self, x, context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0
+    ):
+        """Apply the block, under activation checkpointing when ``self.checkpoint`` is set.
+
+        All four inputs are handed to ``checkpoint`` positionally (it forwards extra
+        positional arguments to the wrapped function), so ``additional_tokens`` and
+        ``n_times_crossframe_attn_in_self`` take effect on both code paths.
+        """
+        if self.checkpoint:
+            return checkpoint(
+                self._forward,
+                x,
+                context,
+                additional_tokens,
+                n_times_crossframe_attn_in_self,
+            )
+        return self._forward(
+            x,
+            context=context,
+            additional_tokens=additional_tokens,
+            n_times_crossframe_attn_in_self=n_times_crossframe_attn_in_self,
+        )
+
+    def _forward(
+        self, x, context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0
+    ):
+        x = (
+            self.attn1(
+                self.norm1(x),
+                context=context if self.disable_self_attn else None,
+                additional_tokens=additional_tokens,
+                n_times_crossframe_attn_in_self=n_times_crossframe_attn_in_self
+                if not self.disable_self_attn
+                else 0,
+            )
+            + x
+        )
+        x = (
+            self.attn2(
+                self.norm2(x), context=context, additional_tokens=additional_tokens
+            )
+            + x
+        )
+        x = self.ff(self.norm3(x)) + x
+        return x
+
+
+class BasicTransformerSingleLayerBlock(nn.Module):  # one attention layer plus feed-forward, both residual
+    ATTENTION_MODES = {
+        "softmax": CrossAttention,  # vanilla attention
+        "softmax-xformers": MemoryEfficientCrossAttention  # on the A100s not quite as fast as the above version
+        # (todo might depend on head_dim, check, falls back to semi-optimized kernels for dim!=[16,32,64,128])
+    }
+
+    def __init__(
+        self,
+        dim,
+        n_heads,
+        d_head,
+        dropout=0.0,
+        context_dim=None,
+        gated_ff=True,
+        checkpoint=True,
+        attn_mode="softmax",
+    ):
+        super().__init__()
+        assert attn_mode in self.ATTENTION_MODES
+        attn_cls = self.ATTENTION_MODES[attn_mode]
+        self.attn1 = attn_cls(
+            query_dim=dim,
+            heads=n_heads,
+            dim_head=d_head,
+            dropout=dropout,
+            context_dim=context_dim,
+        )
+        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.checkpoint = checkpoint  # when false, forward() calls _forward directly
+
+    def forward(self, x, context=None):
+        if not self.checkpoint:  # honour the constructor flag instead of always checkpointing
+            return self._forward(x, context)
+        return checkpoint(self._forward, x, context)
+
+    def _forward(self, x, context=None):
+        x = self.attn1(self.norm1(x), context=context) + x
+        x = self.ff(self.norm2(x)) + x
+        return x
+
+
+class SpatialTransformer(nn.Module):
+    """
+    Transformer block for image-like data.
+    First, project the input (aka embedding)
+    and reshape to b, t, d.
+    Then apply standard transformer action.
+    Finally, reshape to image
+    NEW: use_linear for more efficiency instead of the 1x1 convs
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        n_heads,
+        d_head,
+        depth=1,
+        dropout=0.0,
+        context_dim=None,
+        disable_self_attn=False,
+        use_linear=False,
+        attn_type="softmax",
+        use_checkpoint=True,
+        # sdp_backend=SDPBackend.FLASH_ATTENTION
+        sdp_backend=None,
+    ):
+        super().__init__()
+        logpy.debug(
+            f"constructing {self.__class__.__name__} of depth {depth} w/ "
+            f"{in_channels} channels and {n_heads} heads."
+        )
+
+        if exists(context_dim) and not isinstance(context_dim, list):
+            context_dim = [context_dim]  # a scalar becomes a one-element list
+        if exists(context_dim) and isinstance(context_dim, list):
+            if depth != len(context_dim):
+                logpy.warning(
+                    f"{self.__class__.__name__}: Found context dims "
+                    f"{context_dim} of depth {len(context_dim)}, which does not "
+                    f"match the specified 'depth' of {depth}. Setting context_dim "
+                    f"to {depth * [context_dim[0]]} now."
+                )
+                # depth does not match context dims.
+                assert all(
+                    map(lambda x: x == context_dim[0], context_dim)
+                ), "need homogenous context_dim to match depth automatically"
+                context_dim = depth * [context_dim[0]]
+        elif context_dim is None:
+            context_dim = [None] * depth
+        self.in_channels = in_channels
+        inner_dim = n_heads * d_head
+        self.norm = Normalize(in_channels)
+        if not use_linear:
+            self.proj_in = nn.Conv2d(
+                in_channels, inner_dim, kernel_size=1, stride=1, padding=0
+            )
+        else:
+            self.proj_in = nn.Linear(in_channels, inner_dim)
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    n_heads,
+                    d_head,
+                    dropout=dropout,
+                    context_dim=context_dim[d],
+                    disable_self_attn=disable_self_attn,
+                    attn_mode=attn_type,
+                    checkpoint=use_checkpoint,
+                    sdp_backend=sdp_backend,
+                )
+                for d in range(depth)
+            ]
+        )
+        if not use_linear:
+            self.proj_out = zero_module(
+                nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
+            )
+        else:
+            # the output projection starts zeroed, so initially forward() returns x_in unchanged
+            self.proj_out = zero_module(nn.Linear(inner_dim, in_channels))
+        self.use_linear = use_linear
+
+    def forward(self, x, context=None):
+        # note: if no context is given, cross-attention defaults to self-attention
+        if not isinstance(context, list):
+            context = [context]
+        b, c, h, w = x.shape
+        x_in = x
+        x = self.norm(x)
+        if not self.use_linear:  # conv projection runs before flattening, linear projection after
+            x = self.proj_in(x)
+        x = rearrange(x, "b c h w -> b (h w) c").contiguous()
+        if self.use_linear:
+            x = self.proj_in(x)
+        for i, block in enumerate(self.transformer_blocks):
+            if i > 0 and len(context) == 1:
+                i = 0  # use same context for each block
+            x = block(x, context=context[i])
+        if self.use_linear:
+            x = self.proj_out(x)
+        x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w).contiguous()
+        if not self.use_linear:
+            x = self.proj_out(x)
+        return x + x_in
+
+
+class SimpleTransformer(nn.Module):  # `depth` stacked BasicTransformerBlocks in "softmax-xformers" mode
+    def __init__(
+        self,
+        dim: int,
+        depth: int,
+        heads: int,
+        dim_head: int,
+        context_dim: Optional[int] = None,
+        dropout: float = 0.0,
+        checkpoint: bool = True,
+    ):
+        super().__init__()
+        self.layers = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    dim,
+                    heads,
+                    dim_head,
+                    dropout=dropout,
+                    context_dim=context_dim,
+                    attn_mode="softmax-xformers",
+                    checkpoint=checkpoint,
+                )
+                for _ in range(depth)
+            ]
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        context: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        for block in self.layers:  # every block receives the same context
+            x = block(x, context)
+        return x
diff --git a/sgm/modules/autoencoding/__init__.py b/sgm/modules/autoencoding/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sgm/modules/autoencoding/losses/__init__.py b/sgm/modules/autoencoding/losses/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b316c7aa6ea1c5e31a58987aa3b37b2933eb7e2
--- /dev/null
+++ b/sgm/modules/autoencoding/losses/__init__.py
@@ -0,0 +1,7 @@
+__all__ = [
+    "GeneralLPIPSWithDiscriminator",
+    "LatentLPIPS",
+]
+
+from .discriminator_loss import GeneralLPIPSWithDiscriminator
+from .lpips import LatentLPIPS
diff --git a/sgm/modules/autoencoding/losses/discriminator_loss.py b/sgm/modules/autoencoding/losses/discriminator_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..09b6829267bf8e4d98c3f29abdc19e58dcbcbe64
--- /dev/null
+++ b/sgm/modules/autoencoding/losses/discriminator_loss.py
@@ -0,0 +1,306 @@
+from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torchvision
+from einops import rearrange
+from matplotlib import colormaps
+from matplotlib import pyplot as plt
+
+from ....util import default, instantiate_from_config
+from ..lpips.loss.lpips import LPIPS
+from ..lpips.model.model import weights_init
+from ..lpips.vqperceptual import hinge_d_loss, vanilla_d_loss
+
+
+class GeneralLPIPSWithDiscriminator(nn.Module):
+    def __init__(
+        self,
+        disc_start: int,  # global step from which the adversarial terms are active
+        logvar_init: float = 0.0,  # initial value of the scalar ``logvar``
+        disc_num_layers: int = 3,  # only used by the default discriminator config
+        disc_in_channels: int = 3,  # only used by the default discriminator config
+        disc_factor: float = 1.0,  # scales both the generator and discriminator GAN terms
+        disc_weight: float = 1.0,  # scales the adaptive generator weight
+        perceptual_weight: float = 1.0,  # LPIPS weight; the term is skipped unless > 0
+        disc_loss: str = "hinge",  # "hinge" or "vanilla"
+        scale_input_to_tgt_size: bool = False,  # resize inputs to the reconstruction size
+        dims: int = 2,  # > 2: tensors carry a time axis ("b c t h w")
+        learn_logvar: bool = False,  # whether ``logvar`` requires grad
+        regularization_weights: Union[None, Dict[str, float]] = None,
+        additional_log_keys: Optional[List[str]] = None,
+        discriminator_config: Optional[Dict] = None,
+    ):
+        super().__init__()
+        self.dims = dims
+        if self.dims > 2:
+            print(
+                f"running with dims={dims}. This means that for perceptual loss "
+                f"calculation, the LPIPS loss will be applied to each frame "
+                f"independently."
+            )
+        self.scale_input_to_tgt_size = scale_input_to_tgt_size
+        assert disc_loss in ["hinge", "vanilla"]
+        self.perceptual_loss = LPIPS().eval()  # perceptual network is kept in eval mode
+        self.perceptual_weight = perceptual_weight
+        # scalar log-variance used by get_nll_loss; trainable only if learn_logvar
+        self.logvar = nn.Parameter(
+            torch.full((), logvar_init), requires_grad=learn_logvar
+        )
+        self.learn_logvar = learn_logvar
+
+        discriminator_config = default(  # presumably falls back to the dict if None
+            discriminator_config,
+            {
+                "target": "sgm.modules.autoencoding.lpips.model.model.NLayerDiscriminator",
+                "params": {
+                    "input_nc": disc_in_channels,
+                    "n_layers": disc_num_layers,
+                    "use_actnorm": False,
+                },
+            },
+        )
+
+        self.discriminator = instantiate_from_config(discriminator_config).apply(
+            weights_init  # applied to every sub-module via Module.apply
+        )
+        self.discriminator_iter_start = disc_start
+        self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss
+        self.disc_factor = disc_factor
+        self.discriminator_weight = disc_weight
+        self.regularization_weights = default(regularization_weights, {})
+
+        self.forward_keys = [  # names of forward()'s keyword-only args except `weights`
+            "optimizer_idx",
+            "global_step",
+            "last_layer",
+            "split",
+            "regularization_log",
+        ]
+
+        self.additional_log_keys = set(default(additional_log_keys, []))
+        self.additional_log_keys.update(set(self.regularization_weights.keys()))  # weighted keys are logged too
+
+    def get_trainable_parameters(self) -> Iterator[nn.Parameter]:
+        return self.discriminator.parameters()  # only the discriminator's parameters
+
+    def get_trainable_autoencoder_parameters(self) -> Iterator[nn.Parameter]:
+        """Yield ``logvar`` when it is learnable; otherwise yield nothing."""
+        if self.learn_logvar:
+            yield self.logvar
+
+    @torch.no_grad()
+    def log_images(
+        self, inputs: torch.Tensor, reconstructions: torch.Tensor
+    ) -> Dict[str, torch.Tensor]:
+        """Visualise the discriminator's logits for real and reconstructed images.
+
+        Returns an empty dict when the discriminator output has fewer than four
+        dims; otherwise ``vis_logits`` and ``vis_logits_blended`` image grids.
+        """
+        logits_real = self.discriminator(inputs.contiguous().detach())
+        if len(logits_real.shape) < 4:
+            return dict()  # non patch-discriminator: there is no logit map to draw
+        logits_fake = self.discriminator(reconstructions.contiguous().detach())
+        # -> (b, 1, h, w)
+
+        # parameters for colormapping
+        high = max(logits_fake.abs().max(), logits_real.abs().max()).item()
+        cmap = colormaps["PiYG"]  # diverging colormap
+
+        def to_colormap(logits: torch.Tensor) -> torch.Tensor:
+            """(b, 1, ...) -> (b, 3, ...)"""
+            logits = (logits + high) / (2 * high)
+            logits_np = cmap(logits.cpu().numpy())[..., :3]  # truncate alpha channel
+            # -> (b, 1, ..., 3)
+            logits = torch.from_numpy(logits_np).to(logits.device)
+            return rearrange(logits, "b 1 ... c -> b c ...")
+
+        logits_real = torch.nn.functional.interpolate(
+            logits_real,
+            size=inputs.shape[-2:],
+            mode="nearest",
+            antialias=False,
+        )
+        logits_fake = torch.nn.functional.interpolate(
+            logits_fake,
+            size=reconstructions.shape[-2:],
+            mode="nearest",
+            antialias=False,
+        )
+
+        # alpha value of logits for overlay
+        alpha_real = torch.abs(logits_real) / high
+        alpha_fake = torch.abs(logits_fake) / high
+        # -> (b, 1, h, w) in range [0, 1], because high is the largest |logit|
+        # alpha of the grid lines is irrelevant: same for images and logits
+        grid_alpha_real = torchvision.utils.make_grid(alpha_real, nrow=4)
+        grid_alpha_fake = torchvision.utils.make_grid(alpha_fake, nrow=4)
+        grid_alpha = 0.8 * torch.cat((grid_alpha_real, grid_alpha_fake), dim=1)
+        # real grid and fake grid are concatenated along dim 1
+
+        # prepare logits for plotting
+        logits_real = to_colormap(logits_real)
+        logits_fake = to_colormap(logits_fake)
+        # -> (b, 3, h, w)
+
+        # add all logits to one plot (`nrow` is the number of images per row)
+        logits_real = torchvision.utils.make_grid(logits_real, nrow=4)
+        logits_fake = torchvision.utils.make_grid(logits_fake, nrow=4)
+        grid_logits = torch.cat((logits_real, logits_fake), dim=1)
+        # -> (3, h, w)
+
+        grid_images_real = torchvision.utils.make_grid(0.5 * inputs + 0.5, nrow=4)
+        grid_images_fake = torchvision.utils.make_grid(
+            0.5 * reconstructions + 0.5, nrow=4
+        )
+        grid_images = torch.cat((grid_images_real, grid_images_fake), dim=1)
+        # -> (3, h, w); in range [0, 1] if the images lie in [-1, 1]
+
+        # blend logits and images together
+        grid_blend = grid_alpha * grid_logits + (1 - grid_alpha) * grid_images
+
+        # Create labeled colorbar
+        dpi = 100
+        height = 128 / dpi
+        width = grid_logits.shape[2] / dpi
+        fig, ax = plt.subplots(figsize=(width, height), dpi=dpi)
+        img = ax.imshow(np.array([[-high, high]]), cmap=cmap)
+        plt.colorbar(
+            img,
+            cax=ax,
+            orientation="horizontal",
+            fraction=0.9,
+            aspect=width / height,
+            pad=0.0,
+        )
+        img.set_visible(False)
+        fig.tight_layout()
+        fig.canvas.draw()
+        # manually convert figure to numpy
+        cbar_np = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+        cbar_np = cbar_np.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+        plt.close(fig)  # pixels are copied out; free the figure so calls don't leak
+        cbar = torch.from_numpy(cbar_np.copy()).to(grid_logits.dtype) / 255.0
+        cbar = rearrange(cbar, "h w c -> c h w").to(grid_logits.device)
+
+        # Add colorbar to plot
+        annotated_grid = torch.cat((grid_logits, cbar), dim=1)
+        blended_grid = torch.cat((grid_blend, cbar), dim=1)
+        return {
+            "vis_logits": 2 * annotated_grid[None, ...] - 1,
+            "vis_logits_blended": 2 * blended_grid[None, ...] - 1,
+        }
+
+    def calculate_adaptive_weight(
+        self, nll_loss: torch.Tensor, g_loss: torch.Tensor, last_layer: torch.Tensor
+    ) -> torch.Tensor:
+        """Weight for g_loss: ratio of NLL to GAN gradient norms at ``last_layer``."""
+        (grad_nll,) = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)
+        (grad_g,) = torch.autograd.grad(g_loss, last_layer, retain_graph=True)
+        # 1e-4 keeps the ratio finite when the GAN gradient vanishes
+        ratio = torch.norm(grad_nll) / (torch.norm(grad_g) + 1e-4)
+        clipped = torch.clamp(ratio, 0.0, 1e4).detach()
+        return clipped * self.discriminator_weight
+
+    def forward(  # optimizer_idx 0 -> generator loss, 1 -> discriminator loss
+        self,
+        inputs: torch.Tensor,
+        reconstructions: torch.Tensor,
+        *,  # everything below is keyword-only (the argument order was changed)
+        regularization_log: Dict[str, torch.Tensor],
+        optimizer_idx: int,
+        global_step: int,
+        last_layer: torch.Tensor,  # tensor at which the two gradient norms are compared
+        split: str = "train",  # prefix of every key in the returned log dict
+        weights: Union[None, float, torch.Tensor] = None,  # optional NLL weighting
+    ) -> Tuple[torch.Tensor, dict]:
+        if self.scale_input_to_tgt_size:
+            inputs = torch.nn.functional.interpolate(  # match the reconstruction size
+                inputs, reconstructions.shape[2:], mode="bicubic", antialias=True
+            )
+
+        if self.dims > 2:
+            inputs, reconstructions = map(  # fold the time axis into the batch axis
+                lambda x: rearrange(x, "b c t h w -> (b t) c h w"),
+                (inputs, reconstructions),
+            )
+
+        rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())  # L1
+        if self.perceptual_weight > 0:
+            p_loss = self.perceptual_loss(
+                inputs.contiguous(), reconstructions.contiguous()
+            )
+            rec_loss = rec_loss + self.perceptual_weight * p_loss
+
+        nll_loss, weighted_nll_loss = self.get_nll_loss(rec_loss, weights)
+
+        # now the GAN part
+        if optimizer_idx == 0:
+            # generator update: GAN term is used after disc_start, and always in eval
+            if global_step >= self.discriminator_iter_start or not self.training:
+                logits_fake = self.discriminator(reconstructions.contiguous())
+                g_loss = -torch.mean(logits_fake)
+                if self.training:
+                    d_weight = self.calculate_adaptive_weight(
+                        nll_loss, g_loss, last_layer=last_layer
+                    )
+                else:
+                    d_weight = torch.tensor(1.0)  # eval: no gradient-based balancing
+            else:
+                d_weight = torch.tensor(0.0)  # before disc_start: GAN term switched off
+                g_loss = torch.tensor(0.0, requires_grad=True)
+
+            loss = weighted_nll_loss + d_weight * self.disc_factor * g_loss
+            log = dict()
+            for k in regularization_log:
+                if k in self.regularization_weights:  # weighted regularizers join the loss
+                    loss = loss + self.regularization_weights[k] * regularization_log[k]
+                if k in self.additional_log_keys:
+                    log[f"{split}/{k}"] = regularization_log[k].detach().float().mean()
+
+            log.update(
+                {
+                    f"{split}/loss/total": loss.clone().detach().mean(),
+                    f"{split}/loss/nll": nll_loss.detach().mean(),
+                    f"{split}/loss/rec": rec_loss.detach().mean(),
+                    f"{split}/loss/g": g_loss.detach().mean(),
+                    f"{split}/scalars/logvar": self.logvar.detach(),
+                    f"{split}/scalars/d_weight": d_weight.detach(),
+                }
+            )
+
+            return loss, log
+        elif optimizer_idx == 1:
+            # second pass for discriminator update; inputs are detached from the autoencoder
+            logits_real = self.discriminator(inputs.contiguous().detach())
+            logits_fake = self.discriminator(reconstructions.contiguous().detach())
+
+            if global_step >= self.discriminator_iter_start or not self.training:
+                d_loss = self.disc_factor * self.disc_loss(logits_real, logits_fake)
+            else:
+                d_loss = torch.tensor(0.0, requires_grad=True)  # before disc_start
+
+            log = {
+                f"{split}/loss/disc": d_loss.clone().detach().mean(),
+                f"{split}/logits/real": logits_real.detach().mean(),
+                f"{split}/logits/fake": logits_fake.detach().mean(),
+            }
+            return d_loss, log
+        else:
+            raise NotImplementedError(f"Unknown optimizer_idx {optimizer_idx}")
+
+    def get_nll_loss(
+        self,
+        rec_loss: torch.Tensor,
+        weights: Optional[Union[float, torch.Tensor]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Return ``(nll, weighted_nll)``, each summed and divided by dim-0 size."""
+        per_elem = rec_loss / torch.exp(self.logvar) + self.logvar
+        weighted = per_elem if weights is None else weights * per_elem
+
+        def _sum_over_batch(t: torch.Tensor) -> torch.Tensor:
+            return torch.sum(t) / t.shape[0]
+
+        return _sum_over_batch(per_elem), _sum_over_batch(weighted)
diff --git a/sgm/modules/autoencoding/losses/lpips.py b/sgm/modules/autoencoding/losses/lpips.py
new file mode 100644
index 0000000000000000000000000000000000000000..b329fcc2ee9477f0122aa7d066866cdfe71ce521
--- /dev/null
+++ b/sgm/modules/autoencoding/losses/lpips.py
@@ -0,0 +1,73 @@
+import torch
+import torch.nn as nn
+
+from ....util import default, instantiate_from_config
+from ..lpips.loss.lpips import LPIPS
+
+
+class LatentLPIPS(nn.Module):
+    def __init__(
+        self,
+        decoder_config,  # passed to instantiate_from_config in init_decoder
+        perceptual_weight=1.0,  # LPIPS between decoded targets and decoded predictions
+        latent_weight=1.0,  # weight of the latent L2 term
+        scale_input_to_tgt_size=False,  # resize image_inputs to the decoded size
+        scale_tgt_to_input_size=False,  # else-branch: resize decoded images to the inputs
+        perceptual_weight_on_inputs=0.0,  # LPIPS between image_inputs and decoded predictions
+    ):
+        super().__init__()
+        self.scale_input_to_tgt_size = scale_input_to_tgt_size
+        self.scale_tgt_to_input_size = scale_tgt_to_input_size
+        self.init_decoder(decoder_config)
+        self.perceptual_loss = LPIPS().eval()  # perceptual network is kept in eval mode
+        self.perceptual_weight = perceptual_weight
+        self.latent_weight = latent_weight
+        self.perceptual_weight_on_inputs = perceptual_weight_on_inputs
+
+    def init_decoder(self, config):  # build the model whose decode() maps latents to images
+        self.decoder = instantiate_from_config(config)
+        if hasattr(self.decoder, "encoder"):
+            del self.decoder.encoder  # forward() only ever calls self.decoder.decode
+
+    def forward(self, latent_inputs, latent_predictions, image_inputs, split="train"):
+        """Return ``(loss, log)``: latent L2 plus the enabled LPIPS terms."""
+        log = dict()
+        loss = (latent_inputs - latent_predictions) ** 2  # elementwise; only reduced in the branch below
+        log[f"{split}/latent_l2_loss"] = loss.mean().detach()
+        image_reconstructions = None
+        if self.perceptual_weight > 0.0:
+            image_reconstructions = self.decoder.decode(latent_predictions)
+            image_targets = self.decoder.decode(latent_inputs)
+            perceptual_loss = self.perceptual_loss(
+                image_targets.contiguous(), image_reconstructions.contiguous()
+            )
+            loss = (
+                self.latent_weight * loss.mean()
+                + self.perceptual_weight * perceptual_loss.mean()
+            )
+            log[f"{split}/perceptual_loss"] = perceptual_loss.mean().detach()
+
+        if self.perceptual_weight_on_inputs > 0.0:
+            if image_reconstructions is None:  # decode only if not already done above
+                image_reconstructions = self.decoder.decode(latent_predictions)
+            if self.scale_input_to_tgt_size:
+                image_inputs = torch.nn.functional.interpolate(
+                    image_inputs,
+                    image_reconstructions.shape[2:],
+                    mode="bicubic",
+                    antialias=True,
+                )
+            elif self.scale_tgt_to_input_size:
+                image_reconstructions = torch.nn.functional.interpolate(
+                    image_reconstructions,
+                    image_inputs.shape[2:],
+                    mode="bicubic",
+                    antialias=True,
+                )
+
+            perceptual_loss2 = self.perceptual_loss(
+                image_inputs.contiguous(), image_reconstructions.contiguous()
+            )
+            loss = loss + self.perceptual_weight_on_inputs * perceptual_loss2.mean()
+            log[f"{split}/perceptual_loss_on_inputs"] = perceptual_loss2.mean().detach()
+        return loss, log
diff --git a/sgm/modules/autoencoding/lpips/__init__.py b/sgm/modules/autoencoding/lpips/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sgm/modules/autoencoding/lpips/loss/.gitignore b/sgm/modules/autoencoding/lpips/loss/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..a92958a1cd4ffe005e1f5448ab3e6fd9c795a43a
--- /dev/null
+++ b/sgm/modules/autoencoding/lpips/loss/.gitignore
@@ -0,0 +1 @@
+vgg.pth
\ No newline at end of file
diff --git a/sgm/modules/autoencoding/lpips/loss/LICENSE b/sgm/modules/autoencoding/lpips/loss/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..924cfc85b8d63ef538f5676f830a2a8497932108
--- /dev/null
+++ b/sgm/modules/autoencoding/lpips/loss/LICENSE
@@ -0,0 +1,23 @@
+Copyright (c) 2018, Richard Zhang, Phillip Isola, Alexei A. Efros, Eli Shechtman, Oliver Wang
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/sgm/modules/autoencoding/lpips/loss/__init__.py b/sgm/modules/autoencoding/lpips/loss/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sgm/modules/autoencoding/lpips/loss/lpips.py b/sgm/modules/autoencoding/lpips/loss/lpips.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e34f3d083674f675a5ca024e9bd27fb77e2b6b5
--- /dev/null
+++ b/sgm/modules/autoencoding/lpips/loss/lpips.py
@@ -0,0 +1,147 @@
+"""Stripped version of https://github.com/richzhang/PerceptualSimilarity/tree/master/models"""
+
+from collections import namedtuple
+
+import torch
+import torch.nn as nn
+from torchvision import models
+
+from ..util import get_ckpt_path
+
+
+class LPIPS(nn.Module):
+    # Learned perceptual metric
+    def __init__(self, use_dropout: bool = True):
+        super().__init__()
+        self.scaling_layer = ScalingLayer()
+        self.chns = [64, 128, 256, 512, 512]  # input channels of lin0..lin4, one per vgg16 slice
+        self.net = vgg16(pretrained=True, requires_grad=False)
+        self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
+        self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
+        self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
+        self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
+        self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
+        self.load_from_pretrained()  # loads the "vgg_lpips" checkpoint into this module
+        for param in self.parameters():  # the whole metric is frozen
+            param.requires_grad = False
+
+    def load_from_pretrained(self, name="vgg_lpips"):
+        ckpt = get_ckpt_path(name, "sgm/modules/autoencoding/lpips/loss")  # relative root path
+        self.load_state_dict(  # strict=False: mismatching keys do not raise
+            torch.load(ckpt, map_location=torch.device("cpu")), strict=False  # NOTE(review): torch.load unpickles the file
+        )
+        print("loaded pretrained LPIPS loss from {}".format(ckpt))
+
+    @classmethod
+    def from_pretrained(cls, name="vgg_lpips"):
+        """Return a new instance carrying the pretrained ``vgg_lpips`` weights.
+
+        Raises NotImplementedError for any other ``name``. The weights are loaded by
+        ``__init__`` (via ``load_from_pretrained``), so nothing is loaded again here.
+        """
+        if name != "vgg_lpips":
+            raise NotImplementedError
+        return cls()
+
+    def forward(self, input, target):
+        """Return the summed per-layer perceptual distance of two image batches.
+
+        Both inputs go through the scaling layer and the VGG slices first.
+        """
+        scaled_in = self.scaling_layer(input)
+        scaled_tgt = self.scaling_layer(target)
+        feats_in, feats_tgt = self.net(scaled_in), self.net(scaled_tgt)
+        heads = (self.lin0, self.lin1, self.lin2, self.lin3, self.lin4)
+
+        total = None
+        for idx in range(len(self.chns)):
+            # channel-normalise both feature maps, then take the squared difference
+            delta = normalize_tensor(feats_in[idx]) - normalize_tensor(feats_tgt[idx])
+            # per-layer head, then mean over dims 2 and 3 (kept as size-1 dims)
+            score = spatial_average(heads[idx].model(delta**2), keepdim=True)
+            total = score if total is None else total + score
+
+        return total
+
+
+class ScalingLayer(nn.Module):
+    """Apply ``(inp - shift) / scale`` with fixed per-channel constants."""
+
+    def __init__(self):
+        super().__init__()
+        shift = torch.Tensor([-0.030, -0.088, -0.188]).view(1, -1, 1, 1)
+        scale = torch.Tensor([0.458, 0.448, 0.450]).view(1, -1, 1, 1)
+        self.register_buffer("shift", shift)
+        self.register_buffer("scale", scale)
+
+    def forward(self, inp):
+        return (inp - self.shift) / self.scale
+
+
+class NetLinLayer(nn.Module):
+    """Optional dropout followed by a bias-free 1x1 convolution.
+
+    The sub-modules live in ``self.model`` (an ``nn.Sequential``); the conv sits
+    at index 1 when dropout is enabled and at index 0 otherwise.
+    """
+
+    def __init__(self, chn_in, chn_out=1, use_dropout=False):
+        super().__init__()
+        stages = []
+        if use_dropout:
+            stages.append(nn.Dropout())
+        stages.append(
+            nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False)
+        )
+        self.model = nn.Sequential(*stages)
+
+
+class vgg16(torch.nn.Module):
+    """VGG16 ``features`` split into five consecutive slices.
+
+    ``forward`` returns the output of every slice as a ``VggOutputs`` namedtuple.
+    """
+
+    def __init__(self, requires_grad=False, pretrained=True):
+        super().__init__()
+        features = models.vgg16(pretrained=pretrained).features
+        self.N_slices = 5
+        # half-open index ranges into ``features`` for slice1 .. slice5
+        boundaries = [
+            (0, 4),
+            (4, 9),
+            (9, 16),
+            (16, 23),
+            (23, 30),
+        ]
+        for number, (start, stop) in enumerate(boundaries, start=1):
+            stage = torch.nn.Sequential()
+            for idx in range(start, stop):
+                # sub-modules keep their original layer index as name
+                stage.add_module(str(idx), features[idx])
+            setattr(self, f"slice{number}", stage)
+        if not requires_grad:
+            for param in self.parameters():
+                param.requires_grad = False
+
+    def forward(self, X):
+        """Run ``X`` through the slices in order, collecting each output."""
+        vgg_outputs = namedtuple(
+            "VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"]
+        )
+        stages = (self.slice1, self.slice2, self.slice3, self.slice4, self.slice5)
+        activations = []
+        h = X
+        for stage in stages:
+            h = stage(h)
+            activations.append(h)
+        return vgg_outputs(*activations)
+
+
+def normalize_tensor(x, eps=1e-10):
+    """Divide ``x`` by its L2 norm over dim 1 (``eps`` is added to the norm)."""
+    return x / (torch.sqrt(torch.sum(x**2, dim=1, keepdim=True)) + eps)
+
+
+def spatial_average(x, keepdim=True):  # mean over dims 2 and 3 (presumably H and W)
+    return x.mean([2, 3], keepdim=keepdim)
diff --git a/sgm/modules/autoencoding/lpips/model/LICENSE b/sgm/modules/autoencoding/lpips/model/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..4b356e66b5aa689b339f1a80a9f1b5ba378003bb
--- /dev/null
+++ b/sgm/modules/autoencoding/lpips/model/LICENSE
@@ -0,0 +1,58 @@
+Copyright (c) 2017, Jun-Yan Zhu and Taesung Park
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+--------------------------- LICENSE FOR pix2pix --------------------------------
+BSD License
+
+For pix2pix software
+Copyright (c) 2016, Phillip Isola and Jun-Yan Zhu
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+----------------------------- LICENSE FOR DCGAN --------------------------------
+BSD License
+
+For dcgan.torch software
+
+Copyright (c) 2015, Facebook, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+Neither the name Facebook nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/sgm/modules/autoencoding/lpips/model/__init__.py b/sgm/modules/autoencoding/lpips/model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sgm/modules/autoencoding/lpips/model/model.py b/sgm/modules/autoencoding/lpips/model/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..66357d4e627f9a69a5abbbad15546c96fcd758fe
--- /dev/null
+++ b/sgm/modules/autoencoding/lpips/model/model.py
@@ -0,0 +1,88 @@
+import functools
+
+import torch.nn as nn
+
+from ..util import ActNorm
+
+
+def weights_init(m):
+    kind = type(m).__name__  # dispatch on the class name, e.g. Conv2d / BatchNorm2d
+    if "Conv" in kind:
+        nn.init.normal_(m.weight.data, mean=0.0, std=0.02)
+    elif "BatchNorm" in kind:
+        nn.init.normal_(m.weight.data, mean=1.0, std=0.02)
+        nn.init.constant_(m.bias.data, val=0)
+
+
+class NLayerDiscriminator(nn.Module):
+    """Defines a PatchGAN discriminator as in Pix2Pix
+    --> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py
+
+    All layers are held in one ``nn.Sequential`` stored as ``self.main``.
+    """
+
+    def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False):
+        """Construct a PatchGAN discriminator
+
+        Parameters:
+            input_nc (int)     -- the number of channels in input images
+            ndf (int)          -- the number of filters in the first conv layer
+            n_layers (int)     -- the number of stride-2 conv layers
+            use_actnorm (bool) -- use ActNorm instead of BatchNorm2d for normalization
+        """
+        super().__init__()
+        norm_layer = ActNorm if use_actnorm else nn.BatchNorm2d
+        # A conv bias directly before BatchNorm2d is redundant (it has affine
+        # parameters), so the bias is only enabled on the ActNorm path.
+        use_bias = norm_layer != nn.BatchNorm2d
+
+        # every conv uses a 4x4 kernel with padding 1
+        kw = 4
+        padw = 1
+        # first layer: strided conv without normalization
+        sequence = [
+            nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw),
+            nn.LeakyReLU(0.2, True),
+        ]
+        nf_mult = 1
+        nf_mult_prev = 1
+        for n in range(1, n_layers):  # gradually increase the number of filters
+            nf_mult_prev = nf_mult
+            nf_mult = min(2**n, 8)  # doubles per layer, capped at 8 * ndf
+            sequence += [
+                nn.Conv2d(
+                    ndf * nf_mult_prev,
+                    ndf * nf_mult,
+                    kernel_size=kw,
+                    stride=2,
+                    padding=padw,
+                    bias=use_bias,
+                ),
+                norm_layer(ndf * nf_mult),
+                nn.LeakyReLU(0.2, True),
+            ]
+
+        # penultimate block: same conv/norm/activation layout, but with stride 1
+        nf_mult_prev = nf_mult
+        nf_mult = min(2**n_layers, 8)
+        sequence += [
+            nn.Conv2d(
+                ndf * nf_mult_prev,
+                ndf * nf_mult,
+                kernel_size=kw,
+                stride=1,
+                padding=padw,
+                bias=use_bias,
+            ),
+            norm_layer(ndf * nf_mult),
+            nn.LeakyReLU(0.2, True),
+        ]
+
+        sequence += [
+            nn.Conv2d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)
+        ]  # output 1 channel prediction map
+        self.main = nn.Sequential(*sequence)
+
+    def forward(self, input):
+        """Standard forward."""
+        return self.main(input)
diff --git a/sgm/modules/autoencoding/lpips/util.py b/sgm/modules/autoencoding/lpips/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..49c76e370bf16888ab61f42844b3c9f14ad9014c
--- /dev/null
+++ b/sgm/modules/autoencoding/lpips/util.py
@@ -0,0 +1,128 @@
+import hashlib
+import os
+
+import requests
+import torch
+import torch.nn as nn
+from tqdm import tqdm
+
+URL_MAP = {"vgg_lpips": "https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1"}
+
+CKPT_MAP = {"vgg_lpips": "vgg.pth"}
+
+MD5_MAP = {"vgg_lpips": "d507d7349b931f0638a25a48a722f98a"}
+
+
+def download(url, local_path, chunk_size=1024):
+    """Stream ``url`` into ``local_path`` with a byte progress bar; raises on HTTP errors."""
+    os.makedirs(os.path.dirname(local_path) or ".", exist_ok=True)
+    with requests.get(url, stream=True) as r:
+        r.raise_for_status()  # fail before an error page is saved as the file
+        total_size = int(r.headers.get("content-length", 0))
+        with tqdm(total=total_size, unit="B", unit_scale=True) as pbar, open(local_path, "wb") as f:
+            for data in r.iter_content(chunk_size=chunk_size):
+                if data:  # ignore empty chunks
+                    pbar.update(f.write(data))  # advance by the bytes actually written
+
+
+def md5_hash(path):
+    """Return the hex MD5 digest of the whole file at ``path``."""
+    with open(path, "rb") as handle:
+        return hashlib.md5(handle.read()).hexdigest()
+
+
+def get_ckpt_path(name, root, check=False):
+    """Return the checkpoint path for ``name`` under ``root``, downloading it when needed."""
+    assert name in URL_MAP
+    path, url = os.path.join(root, CKPT_MAP[name]), URL_MAP[name]
+    if not os.path.exists(path) or (check and md5_hash(path) != MD5_MAP[name]):
+        print("Downloading {} model from {} to {}".format(name, url, path))
+        download(url, path)
+        assert md5_hash(path) == MD5_MAP[name], md5_hash(path)
+    return path
+
+
+class ActNorm(nn.Module):
+    """Per-channel affine layer ``scale * (x + loc)`` with data-dependent initialization.
+
+    In training mode the first batch sets ``loc``/``scale`` from its per-channel mean
+    and standard deviation, and the ``initialized`` buffer is then set to 1. Inputs
+    are 4-D ``(N, C, H, W)`` or 2-D ``(N, C)``; the latter is handled as
+    ``(N, C, 1, 1)`` internally.
+    """
+
+    def __init__(
+        self, num_features, logdet=False, affine=True, allow_reverse_init=False
+    ):
+        assert affine
+        super().__init__()
+        self.logdet = logdet
+        self.loc = nn.Parameter(torch.zeros(1, num_features, 1, 1))
+        self.scale = nn.Parameter(torch.ones(1, num_features, 1, 1))
+        self.allow_reverse_init = allow_reverse_init
+
+        self.register_buffer("initialized", torch.tensor(0, dtype=torch.uint8))
+
+    def initialize(self, input):
+        """Set ``loc``/``scale`` from the per-channel mean and std of a 4-D ``input``."""
+        with torch.no_grad():
+            # (N, C, H, W) -> (C, N*H*W): one row of samples per channel
+            flatten = input.permute(1, 0, 2, 3).contiguous().view(input.shape[1], -1)
+            mean = flatten.mean(1).view(1, -1, 1, 1)
+            std = flatten.std(1).view(1, -1, 1, 1)
+
+            self.loc.data.copy_(-mean)
+            self.scale.data.copy_(1 / (std + 1e-6))
+
+    def forward(self, input, reverse=False):
+        """Normalize ``input`` (or invert if ``reverse``); gives ``(h, logdet)`` if ``self.logdet``."""
+        if reverse:
+            return self.reverse(input)
+        squeeze = len(input.shape) == 2
+        if squeeze:
+            input = input[:, :, None, None]
+
+        _, _, height, width = input.shape
+
+        if self.training and self.initialized.item() == 0:
+            self.initialize(input)
+            self.initialized.fill_(1)
+
+        h = self.scale * (input + self.loc)
+
+        if squeeze:
+            h = h.squeeze(-1).squeeze(-1)
+
+        if self.logdet:
+            log_abs = torch.log(torch.abs(self.scale))
+            logdet = height * width * torch.sum(log_abs)
+            logdet = logdet * torch.ones(input.shape[0]).to(input)
+            return h, logdet
+
+        return h
+
+    def reverse(self, output):
+        """Invert :meth:`forward`: return ``output / scale - loc``.
+
+        Raises RuntimeError if uninitialized in training mode without ``allow_reverse_init``.
+        """
+        # Expand 2-D input first so that a reverse-direction initialization sees the
+        # 4-D layout ``initialize`` requires.
+        squeeze = len(output.shape) == 2
+        if squeeze:
+            output = output[:, :, None, None]
+
+        if self.training and self.initialized.item() == 0:
+            if not self.allow_reverse_init:
+                raise RuntimeError(
+                    "Initializing ActNorm in reverse direction is "
+                    "disabled by default. Use allow_reverse_init=True to enable."
+                )
+            self.initialize(output)
+            self.initialized.fill_(1)
+
+        h = output / self.scale - self.loc
+
+        if squeeze:
+            h = h.squeeze(-1).squeeze(-1)
+        return h
diff --git a/sgm/modules/autoencoding/lpips/vqperceptual.py b/sgm/modules/autoencoding/lpips/vqperceptual.py
new file mode 100644
index 0000000000000000000000000000000000000000..6195f0a6ed7ee6fd32c1bccea071e6075e95ee43
--- /dev/null
+++ b/sgm/modules/autoencoding/lpips/vqperceptual.py
@@ -0,0 +1,17 @@
+import torch
+import torch.nn.functional as F
+
+
+def hinge_d_loss(logits_real, logits_fake):
+    """Return half the sum of ``mean(relu(1 - logits_real))`` and ``mean(relu(1 + logits_fake))``."""
+    real_term = F.relu(1.0 - logits_real).mean()
+    fake_term = F.relu(1.0 + logits_fake).mean()
+    return 0.5 * (real_term + fake_term)
+
+
+def vanilla_d_loss(logits_real, logits_fake):
+    """Return half the sum of ``mean(softplus(-logits_real))`` and ``mean(softplus(logits_fake))``."""
+    softplus = torch.nn.functional.softplus
+    real_term = softplus(-logits_real).mean()
+    fake_term = softplus(logits_fake).mean()
+    return 0.5 * (real_term + fake_term)
diff --git a/sgm/modules/autoencoding/regularizers/__init__.py b/sgm/modules/autoencoding/regularizers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff2b1815a5ba88892375e8ec9bedacea49024113
--- /dev/null
+++ b/sgm/modules/autoencoding/regularizers/__init__.py
@@ -0,0 +1,31 @@
+from abc import abstractmethod
+from typing import Any, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ....modules.distributions.distributions import \
+    DiagonalGaussianDistribution
+from .base import AbstractRegularizer
+
+
+class DiagonalGaussianRegularizer(AbstractRegularizer):
+    """Regularizer that wraps ``z`` in a ``DiagonalGaussianDistribution`` and logs its KL term."""
+
+    def __init__(self, sample: bool = True):
+        super().__init__()
+        self.sample = sample
+
+    def get_trainable_parameters(self) -> Any:
+        """Yield nothing: no parameters are registered by this class."""
+        yield from ()
+
+    def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, dict]:
+        posterior = DiagonalGaussianDistribution(z)
+        # draw a sample or take the mode, depending on ``self.sample``
+        z = posterior.sample() if self.sample else posterior.mode()
+        kl_values = posterior.kl()
+        # total KL divided by the size of the first dimension of the KL tensor
+        kl_loss = torch.sum(kl_values) / kl_values.shape[0]
+        return z, {"kl_loss": kl_loss}
diff --git a/sgm/modules/autoencoding/regularizers/base.py b/sgm/modules/autoencoding/regularizers/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..fca681bb3c1f4818b57e956e31b98f76077ccb67
--- /dev/null
+++ b/sgm/modules/autoencoding/regularizers/base.py
@@ -0,0 +1,40 @@
+from abc import abstractmethod
+from typing import Any, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+class AbstractRegularizer(nn.Module):
+    """Base interface for regularizers: ``forward`` maps ``z`` to ``(tensor, dict)``."""
+
+    def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, dict]:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_trainable_parameters(self) -> Any:
+        """Overridden by subclasses to provide the parameters they want trained."""
+        raise NotImplementedError()
+
+
+class IdentityRegularizer(AbstractRegularizer):  # regularizer that leaves the latent as is
+    def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, dict]:
+        return z, dict()  # z is passed through unchanged, with an empty log dict
+
+    def get_trainable_parameters(self) -> Any:
+        yield from ()  # empty generator: nothing to train
+
+
+def measure_perplexity(
+    predicted_indices: torch.Tensor, num_centroids: int
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Return ``(perplexity, cluster_use)`` for the code assignments in ``predicted_indices``.
+
+    Perplexity equals ``num_centroids`` when every code is used equally often.
+    Adapted from https://github.com/karpathy/deep-vector-quantization/blob/main/model.py
+    """
+    one_hot = F.one_hot(predicted_indices, num_centroids).float()
+    usage = one_hot.reshape(-1, num_centroids).mean(0)  # empirical code frequencies
+    entropy = -(usage * torch.log(usage + 1e-10)).sum()
+    return entropy.exp(), torch.sum(usage > 0)
diff --git a/sgm/modules/autoencoding/regularizers/quantize.py b/sgm/modules/autoencoding/regularizers/quantize.py
new file mode 100644
index 0000000000000000000000000000000000000000..86a4dbdd10101b24f03bba134c4f8d2ab007f0db
--- /dev/null
+++ b/sgm/modules/autoencoding/regularizers/quantize.py
@@ -0,0 +1,487 @@
+import logging
+from abc import abstractmethod
+from typing import Dict, Iterator, Literal, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from torch import einsum
+
+from .base import AbstractRegularizer, measure_perplexity
+
+logpy = logging.getLogger(__name__)
+
+
+class AbstractQuantizer(AbstractRegularizer):
+    def __init__(self):
+        super().__init__()
+        # Subclasses must assign these in their own __init__;
+        # ``used`` (when not None) has shape (N,) and drives the index remapping helpers.
+        self.used: Optional[torch.Tensor]
+        self.re_embed: int
+        self.unknown_index: Union[Literal["random"], int]
+
+    def remap_to_used(self, inds: torch.Tensor) -> torch.Tensor:
+        """Map indices to their position in ``self.used``; unmatched ones get ``unknown_index``."""
+        assert self.used is not None, "You need to define used indices for remap"
+        ishape = inds.shape
+        assert len(ishape) > 1
+        inds = inds.reshape(ishape[0], -1)
+        used = self.used.to(inds)
+        match = (inds[:, :, None] == used[None, None, ...]).long()
+        new = match.argmax(-1)
+        unknown = match.sum(2) < 1  # indices that equal no entry of ``used``
+        if self.unknown_index == "random":
+            fill = torch.randint(0, self.re_embed, size=new[unknown].shape)
+            new[unknown] = fill.to(device=new.device)
+        else:
+            new[unknown] = self.unknown_index
+        return new.reshape(ishape)
+
+    def unmap_to_all(self, inds: torch.Tensor) -> torch.Tensor:
+        """Look positions up in ``self.used`` (inverse of ``remap_to_used``); ``inds`` is not modified."""
+        assert self.used is not None, "You need to define used indices for remap"
+        assert len(inds.shape) > 1
+        flat = inds.reshape(inds.shape[0], -1)
+        used = self.used.to(flat)
+        if self.re_embed > self.used.shape[0]:  # extra token
+            # simply set to zero; out of place, since ``flat`` may be a view of ``inds``
+            flat = flat.masked_fill(flat >= self.used.shape[0], 0)
+        back = torch.gather(used[None, :][flat.shape[0] * [0], :], 1, flat)
+        return back.reshape(inds.shape)
+
+    @abstractmethod
+    def get_codebook_entry(
+        self, indices: torch.Tensor, shape: Optional[Tuple[int, ...]] = None
+    ) -> torch.Tensor:
+        raise NotImplementedError()
+
+    def get_trainable_parameters(self) -> Iterator[torch.nn.Parameter]:
+        yield from self.parameters()
+
+
+class GumbelQuantizer(AbstractQuantizer):
+    """Gumbel-Softmax quantizer: samples (relaxed) one-hot codes and embeds them.
+
+    Credit to @karpathy:
+    https://github.com/karpathy/deep-vector-quantization/blob/main/model.py (thanks!)
+    Categorical Reparameterization with Gumbel-Softmax, Jang et al. 2016
+    https://arxiv.org/abs/1611.01144
+    """
+
+    def __init__(
+        self,
+        num_hiddens: int,
+        embedding_dim: int,
+        n_embed: int,
+        straight_through: bool = True,
+        kl_weight: float = 5e-4,
+        temp_init: float = 1.0,
+        remap: Optional[str] = None,
+        unknown_index: str = "random",
+        loss_key: str = "loss/vq",
+    ) -> None:
+        super().__init__()
+
+        self.loss_key = loss_key
+        self.embedding_dim = embedding_dim
+        self.n_embed = n_embed
+
+        self.straight_through = straight_through
+        self.temperature = temp_init
+        self.kl_weight = kl_weight
+
+        self.proj = nn.Conv2d(num_hiddens, n_embed, 1)
+        self.embed = nn.Embedding(n_embed, embedding_dim)
+
+        self.remap = remap
+        if self.remap is not None:
+            self.register_buffer("used", torch.tensor(np.load(self.remap)))
+            self.re_embed = self.used.shape[0]
+        else:
+            self.used = None
+            self.re_embed = n_embed
+        if unknown_index == "extra":
+            self.unknown_index = self.re_embed
+            self.re_embed = self.re_embed + 1
+        else:
+            assert unknown_index == "random" or isinstance(
+                unknown_index, int
+            ), "unknown index needs to be 'random', 'extra' or any integer"
+            self.unknown_index = unknown_index  # "random" or an integer at this point
+        if self.remap is not None:
+            logpy.info(
+                f"Remapping {self.n_embed} indices to {self.re_embed} indices. "
+                f"Using {self.unknown_index} for unknown indices."
+            )
+
+    def forward(
+        self, z: torch.Tensor, temp: Optional[float] = None, return_logits: bool = False
+    ) -> Tuple[torch.Tensor, Dict]:
+        """Quantize ``z``; return ``(z_q, out_dict)`` holding the KL loss and code indices."""
+        # eval mode always samples hard one-hot codes, since the output must be quantized
+        hard = self.straight_through if self.training else True
+        temp = self.temperature if temp is None else temp
+        out_dict = {}
+        logits = self.proj(z)
+        if self.remap is not None:
+            # continue only with used logits
+            full_zeros = torch.zeros_like(logits)
+            logits = logits[:, self.used, ...]
+
+        soft_one_hot = F.gumbel_softmax(logits, tau=temp, dim=1, hard=hard)
+        if self.remap is not None:
+            # go back to all entries but unused set to zero
+            full_zeros[:, self.used, ...] = soft_one_hot
+            soft_one_hot = full_zeros
+        z_q = einsum("b n h w, n d -> b d h w", soft_one_hot, self.embed.weight)
+
+        # + kl divergence to the prior loss
+        qy = F.softmax(logits, dim=1)
+        diff = (
+            self.kl_weight
+            * torch.sum(qy * torch.log(qy * self.n_embed + 1e-10), dim=1).mean()
+        )
+        out_dict[self.loss_key] = diff
+
+        ind = soft_one_hot.argmax(dim=1)
+        if self.remap is not None:
+            ind = self.remap_to_used(ind)
+        out_dict["indices"] = ind  # stored after remapping, as get_codebook_entry expects
+
+        if return_logits:
+            out_dict["logits"] = logits
+
+        return z_q, out_dict
+
+    def get_codebook_entry(self, indices, shape):
+        """Embed flat ``indices``; ``shape`` = ``(b, h, w, c)`` is required (TODO: make optional)."""
+        b, h, w, c = shape
+        assert b * h * w == indices.shape[0]
+        indices = rearrange(indices, "(b h w) -> b h w", b=b, h=h, w=w)
+        if self.remap is not None:
+            indices = self.unmap_to_all(indices)
+        one_hot = (
+            F.one_hot(indices, num_classes=self.n_embed).permute(0, 3, 1, 2).float()
+        )
+        z_q = einsum("b n h w, n d -> b d h w", one_hot, self.embed.weight)
+        return z_q
+
+
+class VectorQuantizer(AbstractQuantizer):
+    """
+    ____________________________________________
+    Discretization bottleneck part of the VQ-VAE.
+    Inputs:
+    - n_e : number of embeddings
+    - e_dim : dimension of embedding
+    - beta : commitment cost used in loss term,
+        beta * ||z_e(x)-sg[e]||^2
+    _____________________________________________
+    """
+
+    def __init__(
+        self,
+        n_e: int,
+        e_dim: int,
+        beta: float = 0.25,
+        remap: Optional[str] = None,
+        unknown_index: str = "random",
+        sane_index_shape: bool = False,
+        log_perplexity: bool = False,
+        embedding_weight_norm: bool = False,
+        loss_key: str = "loss/vq",
+    ):
+        super().__init__()
+        self.n_e = n_e
+        self.e_dim = e_dim
+        self.beta = beta
+        self.loss_key = loss_key
+
+        if not embedding_weight_norm:
+            self.embedding = nn.Embedding(self.n_e, self.e_dim)
+            self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
+        else:
+            self.embedding = torch.nn.utils.weight_norm(
+                nn.Embedding(self.n_e, self.e_dim), dim=1
+            )
+
+        self.remap = remap
+        if self.remap is not None:
+            self.register_buffer("used", torch.tensor(np.load(self.remap)))
+            self.re_embed = self.used.shape[0]
+        else:
+            self.used = None
+            self.re_embed = n_e
+        if unknown_index == "extra":
+            self.unknown_index = self.re_embed
+            self.re_embed = self.re_embed + 1
+        else:
+            assert unknown_index == "random" or isinstance(
+                unknown_index, int
+            ), "unknown index needs to be 'random', 'extra' or any integer"
+            self.unknown_index = unknown_index  # "random" or an integer at this point
+        if self.remap is not None:
+            logpy.info(
+                f"Remapping {self.n_e} indices to {self.re_embed} indices. "
+                f"Using {self.unknown_index} for unknown indices."
+            )
+
+        self.sane_index_shape = sane_index_shape
+        self.log_perplexity = log_perplexity
+
+    def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, Dict]:
+        """Replace each vector of ``z`` by its nearest codebook entry.
+
+        Returns ``(z_q, loss_dict)``: the loss under ``loss_key`` plus the chosen indices.
+        """
+        do_reshape = z.ndim == 4
+        if do_reshape:
+            # reshape z -> (batch, height, width, channel); it is flattened below
+            z = rearrange(z, "b c h w -> b h w c").contiguous()
+        else:
+            assert z.ndim < 4, "No reshaping strategy for inputs > 4 dimensions defined"
+            z = z.contiguous()
+
+        z_flattened = z.view(-1, self.e_dim)
+        # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
+
+        d = (
+            torch.sum(z_flattened**2, dim=1, keepdim=True)
+            + torch.sum(self.embedding.weight**2, dim=1)
+            - 2
+            * torch.einsum(
+                "bd,dn->bn", z_flattened, rearrange(self.embedding.weight, "n d -> d n")
+            )
+        )
+
+        min_encoding_indices = torch.argmin(d, dim=1)
+        z_q = self.embedding(min_encoding_indices).view(z.shape)
+        loss_dict = {}
+        if self.log_perplexity:
+            perplexity, cluster_usage = measure_perplexity(
+                min_encoding_indices.detach(), self.n_e
+            )
+            loss_dict.update({"perplexity": perplexity, "cluster_usage": cluster_usage})
+
+        # compute loss for embedding
+        loss = self.beta * torch.mean((z_q.detach() - z) ** 2) + torch.mean(
+            (z_q - z.detach()) ** 2
+        )
+        loss_dict[self.loss_key] = loss
+
+        # preserve gradients
+        z_q = z + (z_q - z).detach()
+
+        # reshape back to match original input shape
+        if do_reshape:
+            z_q = rearrange(z_q, "b h w c -> b c h w").contiguous()
+
+        if self.remap is not None:
+            min_encoding_indices = min_encoding_indices.reshape(
+                z.shape[0], -1
+            )  # add batch axis
+            min_encoding_indices = self.remap_to_used(min_encoding_indices)
+            min_encoding_indices = min_encoding_indices.reshape(-1, 1)  # flatten
+
+        if self.sane_index_shape:
+            if do_reshape:
+                min_encoding_indices = min_encoding_indices.reshape(
+                    z_q.shape[0], z_q.shape[2], z_q.shape[3]
+                )
+            else:
+                # indices are flat (N,) without remap and (N, 1) with it; a plain
+                # reshape to (batch, -1) handles both layouts
+                min_encoding_indices = min_encoding_indices.reshape(z_q.shape[0], -1)
+
+        loss_dict["min_encoding_indices"] = min_encoding_indices
+
+        return z_q, loss_dict
+
+    def get_codebook_entry(
+        self, indices: torch.Tensor, shape: Optional[Tuple[int, ...]] = None
+    ) -> torch.Tensor:
+        """Embed ``indices``; ``shape`` is (batch, height, width, channel), required with remap."""
+        if self.remap is not None:
+            assert shape is not None, "Need to give shape for remap"
+            indices = indices.reshape(shape[0], -1)  # add batch axis
+            indices = self.unmap_to_all(indices)
+            indices = indices.reshape(-1)  # flatten again
+
+        # get quantized latent vectors
+        z_q = self.embedding(indices)
+
+        if shape is not None:
+            z_q = z_q.view(shape)
+            # reshape back to match original input shape
+            z_q = z_q.permute(0, 3, 1, 2).contiguous()
+
+        return z_q
+
+
+class EmbeddingEMA(nn.Module):
+    """Codebook whose ``weight`` is recomputed from EMA statistics, not from gradients."""
+
+    def __init__(self, num_tokens, codebook_dim, decay=0.99, eps=1e-5):
+        super().__init__()
+        self.decay, self.eps = decay, eps
+        init = torch.randn(num_tokens, codebook_dim)
+        self.weight = nn.Parameter(init, requires_grad=False)
+        self.cluster_size = nn.Parameter(torch.zeros(num_tokens), requires_grad=False)
+        self.embed_avg = nn.Parameter(init.clone(), requires_grad=False)
+        self.update = True
+
+    def forward(self, embed_id):
+        return F.embedding(embed_id, self.weight)
+
+    def cluster_size_ema_update(self, new_cluster_size):
+        keep = self.decay
+        self.cluster_size.data.mul_(keep).add_(new_cluster_size, alpha=1 - keep)
+
+    def embed_avg_ema_update(self, new_embed_avg):
+        keep = self.decay
+        self.embed_avg.data.mul_(keep).add_(new_embed_avg, alpha=1 - keep)
+
+    def weight_update(self, num_tokens):
+        total = self.cluster_size.sum()
+        # cluster sizes smoothed with eps, then rescaled by the running total
+        denom = total + num_tokens * self.eps
+        smoothed = (self.cluster_size + self.eps) / denom * total
+        # the codebook is the running embedding sum over the smoothed sizes
+        self.weight.data.copy_(self.embed_avg / smoothed.unsqueeze(1))
+
+
+class EMAVectorQuantizer(AbstractQuantizer):
+    """Vector quantizer whose ``EmbeddingEMA`` codebook is updated by EMA inside ``forward``."""
+
+    def __init__(
+        self,
+        n_embed: int,
+        embedding_dim: int,
+        beta: float,
+        decay: float = 0.99,
+        eps: float = 1e-5,
+        remap: Optional[str] = None,
+        unknown_index: str = "random",
+        loss_key: str = "loss/vq",
+    ):
+        super().__init__()
+        self.codebook_dim = embedding_dim
+        self.num_tokens = n_embed
+        self.beta = beta
+        self.loss_key = loss_key
+
+        self.embedding = EmbeddingEMA(self.num_tokens, self.codebook_dim, decay, eps)
+
+        self.remap = remap
+        if self.remap is not None:
+            self.register_buffer("used", torch.tensor(np.load(self.remap)))
+            self.re_embed = self.used.shape[0]
+        else:
+            self.used = None
+            self.re_embed = n_embed
+        if unknown_index == "extra":
+            self.unknown_index = self.re_embed
+            self.re_embed = self.re_embed + 1
+        else:
+            assert unknown_index == "random" or isinstance(
+                unknown_index, int
+            ), "unknown index needs to be 'random', 'extra' or any integer"
+            self.unknown_index = unknown_index  # "random" or an integer at this point
+        if self.remap is not None:
+            # the codebook size is stored as ``num_tokens``; there is no ``n_embed`` attribute
+            logpy.info(
+                f"Remapping {self.num_tokens} indices to {self.re_embed} indices. "
+                f"Using {self.unknown_index} for unknown indices."
+            )
+
+    def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, Dict]:
+        """Quantize ``z`` of layout (b, c, h, w); in training also updates the EMA codebook."""
+        # reshape z -> (batch, height, width, channel) and flatten
+        z = rearrange(z, "b c h w -> b h w c")
+        z_flattened = z.reshape(-1, self.codebook_dim)
+
+        # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
+        d = (
+            z_flattened.pow(2).sum(dim=1, keepdim=True)
+            + self.embedding.weight.pow(2).sum(dim=1)
+            - 2 * torch.einsum("bd,nd->bn", z_flattened, self.embedding.weight)
+        )
+
+        encoding_indices = torch.argmin(d, dim=1)
+        z_q = self.embedding(encoding_indices).view(z.shape)
+        encodings = F.one_hot(encoding_indices, self.num_tokens).type(z.dtype)
+        avg_probs = torch.mean(encodings, dim=0)
+        perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10)))
+
+        if self.training and self.embedding.update:
+            # EMA cluster size
+            encodings_sum = encodings.sum(0)
+            self.embedding.cluster_size_ema_update(encodings_sum)
+            # EMA embedding average
+            embed_sum = encodings.transpose(0, 1) @ z_flattened
+            self.embedding.embed_avg_ema_update(embed_sum)
+            # normalize embed_avg and update weight
+            self.embedding.weight_update(self.num_tokens)
+
+        # compute loss for embedding
+        loss = self.beta * F.mse_loss(z_q.detach(), z)
+
+        # preserve gradients
+        z_q = z + (z_q - z).detach()
+
+        # reshape back to match original input shape: 'b h w c -> b c h w'
+        z_q = rearrange(z_q, "b h w c -> b c h w")
+
+        out_dict = {
+            self.loss_key: loss,
+            "encodings": encodings,
+            "encoding_indices": encoding_indices,
+            "perplexity": perplexity,
+        }
+
+        return z_q, out_dict
+
+
+class VectorQuantizerWithInputProjection(VectorQuantizer):
+    """``VectorQuantizer`` preceded by a linear input projection, with an optional output one."""
+
+    def __init__(
+        self,
+        input_dim: int,
+        n_codes: int,
+        codebook_dim: int,
+        beta: float = 1.0,
+        output_dim: Optional[int] = None,
+        **kwargs,
+    ):
+        super().__init__(n_codes, codebook_dim, beta, **kwargs)
+        self.proj_in = nn.Linear(input_dim, codebook_dim)
+        self.output_dim = output_dim
+        self.proj_out = (
+            nn.Identity() if output_dim is None else nn.Linear(codebook_dim, output_dim)
+        )
+
+    def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, Dict]:
+        """Project, quantize and re-project ``z``; returns ``(z_q, loss_dict)``.
+
+        >3-D inputs are flattened to ``(b, n, c)`` and restored only if ``output_dim`` is set.
+        """
+        in_shape = z.shape
+        needs_flatten = z.ndim > 3
+        if needs_flatten:
+            z = rearrange(z, "b c ... -> b (...) c")
+        z_q, loss_dict = super().forward(self.proj_in(z))
+        z_q = self.proj_out(z_q)
+        if not (needs_flatten and self.output_dim is not None):
+            return z_q, loss_dict
+        if len(in_shape) == 4:
+            return rearrange(z_q, "b (h w) c -> b c h w ", w=in_shape[-1]), loss_dict
+        if len(in_shape) == 5:
+            z_q = rearrange(
+                z_q, "b (t h w) c -> b c t h w ", w=in_shape[-1], h=in_shape[-2]
+            )
+            return z_q, loss_dict
+        raise NotImplementedError(
+            f"rearranging not available for {len(in_shape)}-dimensional input."
+        )
diff --git a/sgm/modules/autoencoding/temporal_ae.py b/sgm/modules/autoencoding/temporal_ae.py
new file mode 100644
index 0000000000000000000000000000000000000000..374373e2e4330846ffef28d9061dcc64f70d2722
--- /dev/null
+++ b/sgm/modules/autoencoding/temporal_ae.py
@@ -0,0 +1,349 @@
+from typing import Callable, Iterable, Union
+
+import torch
+from einops import rearrange, repeat
+
+from sgm.modules.diffusionmodules.model import (
+    XFORMERS_IS_AVAILABLE,
+    AttnBlock,
+    Decoder,
+    MemoryEfficientAttnBlock,
+    ResnetBlock,
+)
+from sgm.modules.diffusionmodules.openaimodel import ResBlock, timestep_embedding
+from sgm.modules.video_attention import VideoTransformerBlock
+from sgm.util import partialclass
+
+
+class VideoResBlock(ResnetBlock):
+    """ResnetBlock followed by a temporal (dims=3) ResBlock, blended by mix_factor."""
+
+    def __init__(
+        self,
+        out_channels,
+        *args,
+        dropout=0.0,
+        video_kernel_size=3,
+        alpha=0.0,
+        merge_strategy="learned",
+        **kwargs,
+    ):
+        super().__init__(out_channels=out_channels, dropout=dropout, *args, **kwargs)
+        if video_kernel_size is None:
+            video_kernel_size = [3, 1, 1]
+        self.time_stack = ResBlock(
+            channels=out_channels,
+            emb_channels=0,
+            dropout=dropout,
+            dims=3,
+            use_scale_shift_norm=False,
+            use_conv=False,
+            up=False,
+            down=False,
+            kernel_size=video_kernel_size,
+            use_checkpoint=False,
+            skip_t_emb=True,
+        )
+
+        self.merge_strategy = merge_strategy
+        if self.merge_strategy == "fixed":
+            self.register_buffer("mix_factor", torch.Tensor([alpha]))
+        elif self.merge_strategy == "learned":
+            self.register_parameter(
+                "mix_factor", torch.nn.Parameter(torch.Tensor([alpha]))
+            )
+        else:
+            raise ValueError(f"unknown merge strategy {self.merge_strategy}")
+
+    def get_alpha(self, bs):
+        """Return the blend weight; ``bs`` is accepted but not used."""
+        if self.merge_strategy == "fixed":
+            return self.mix_factor
+        elif self.merge_strategy == "learned":
+            return torch.sigmoid(self.mix_factor)
+        else:
+            raise NotImplementedError()
+
+    def forward(self, x, temb, skip_video=False, timesteps=None):
+        """Apply the spatial block, then (unless ``skip_video``) the temporal block."""
+        if timesteps is None:
+            # NOTE(review): ``self.timesteps`` is not assigned in this ``__init__``;
+            # it has to be set on the instance elsewhere for this fallback to work.
+            timesteps = self.timesteps
+        b, _, _, _ = x.shape  # also rejects inputs that are not 4-dimensional
+        x = super().forward(x, temb)
+        if skip_video:
+            return x
+
+        # One rearrange serves both the temporal input and the blend residual.
+        x_mix = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
+        x = self.time_stack(x_mix, temb)
+        alpha = self.get_alpha(bs=b // timesteps)
+        x = alpha * x + (1.0 - alpha) * x_mix
+        return rearrange(x, "b c t h w -> (b t) c h w")
+
+
+class AE3DConv(torch.nn.Conv2d):
+    """Conv2d whose output is additionally passed through a Conv3d over time."""
+
+    def __init__(self, in_channels, out_channels, video_kernel_size=3, *args, **kwargs):
+        super().__init__(in_channels, out_channels, *args, **kwargs)
+        if not isinstance(video_kernel_size, Iterable):
+            time_padding = int(video_kernel_size // 2)
+        else:
+            time_padding = [int(size // 2) for size in video_kernel_size]
+        self.time_mix_conv = torch.nn.Conv3d(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=video_kernel_size,
+            padding=time_padding,
+        )
+
+    def forward(self, input, timesteps, skip_video=False):
+        frames = super().forward(input)
+        if skip_video:  # spatial-only path: the temporal conv is bypassed
+            return frames
+        clip = rearrange(frames, "(b t) c h w -> b c t h w", t=timesteps)
+        return rearrange(self.time_mix_conv(clip), "b c t h w -> (b t) c h w")
+
+
+class VideoBlock(AttnBlock):
+    """Spatial AttnBlock whose output is blended with a temporal transformer block.
+
+    ``alpha`` initialises ``mix_factor``: stored as a buffer when
+    ``merge_strategy`` is "fixed", as a trainable parameter when "learned".
+    """
+
+    def __init__(
+        self, in_channels: int, alpha: float = 0, merge_strategy: str = "learned"
+    ):
+        super().__init__(in_channels)
+        # no context, single headed, as in base class
+        self.time_mix_block = VideoTransformerBlock(
+            dim=in_channels,
+            n_heads=1,
+            d_head=in_channels,
+            checkpoint=False,
+            ff_in=True,
+            attn_mode="softmax",
+        )
+
+        # MLP applied to the frame-index embedding; hidden width is 4 * in_channels.
+        hidden_dim = self.in_channels * 4
+        self.video_time_embed = torch.nn.Sequential(
+            torch.nn.Linear(self.in_channels, hidden_dim),
+            torch.nn.SiLU(),
+            torch.nn.Linear(hidden_dim, self.in_channels),
+        )
+
+        self.merge_strategy = merge_strategy
+        if merge_strategy not in ("fixed", "learned"):
+            raise ValueError(f"unknown merge strategy {self.merge_strategy}")
+        mix_factor = torch.Tensor([alpha])
+        if merge_strategy == "fixed":
+            self.register_buffer("mix_factor", mix_factor)
+        else:
+            self.register_parameter("mix_factor", torch.nn.Parameter(mix_factor))
+
+    def forward(self, x, timesteps, skip_video=False):
+        """Attend spatially, mix across ``timesteps`` frames, add the residual."""
+        if skip_video:
+            return super().forward(x)
+
+        residual = x
+        x = self.attention(x)
+        h, w = x.shape[2:]
+        x = rearrange(x, "b c h w -> b (h w) c")
+
+        # Frame indices 0..timesteps-1, repeated once per clip in the batch.
+        frame_ids = torch.arange(timesteps, device=x.device)
+        frame_ids = repeat(frame_ids, "t -> (b t)", b=x.shape[0] // timesteps)
+        t_emb = timestep_embedding(frame_ids, self.in_channels, repeat_only=False)
+        emb = self.video_time_embed(t_emb)[:, None, :]  # broadcast over positions
+
+        alpha = self.get_alpha()
+        x_mix = self.time_mix_block(x + emb, timesteps=timesteps)
+        x = alpha * x + (1.0 - alpha) * x_mix  # alpha merge
+
+        x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w)
+        return residual + self.proj_out(x)
+
+    def get_alpha(self):
+        """Return the blend weight (sigmoid of the parameter when learned)."""
+        if self.merge_strategy == "fixed":
+            return self.mix_factor
+        if self.merge_strategy == "learned":
+            return torch.sigmoid(self.mix_factor)
+        raise NotImplementedError(f"unknown merge strategy {self.merge_strategy}")
+
+
+class MemoryEfficientVideoBlock(MemoryEfficientAttnBlock):
+    """MemoryEfficientAttnBlock blended with a temporal transformer block.
+
+    ``alpha`` initialises ``mix_factor``: stored as a buffer when
+    ``merge_strategy`` is "fixed", as a trainable parameter when "learned".
+    """
+
+    def __init__(
+        self, in_channels: int, alpha: float = 0, merge_strategy: str = "learned"
+    ):
+        super().__init__(in_channels)
+        # no context, single headed, as in base class
+        self.time_mix_block = VideoTransformerBlock(
+            dim=in_channels,
+            n_heads=1,
+            d_head=in_channels,
+            checkpoint=False,
+            ff_in=True,
+            attn_mode="softmax-xformers",
+        )
+
+        time_embed_dim = self.in_channels * 4
+        self.video_time_embed = torch.nn.Sequential(
+            torch.nn.Linear(self.in_channels, time_embed_dim),
+            torch.nn.SiLU(),
+            torch.nn.Linear(time_embed_dim, self.in_channels),
+        )
+
+        self.merge_strategy = merge_strategy
+        if self.merge_strategy == "fixed":
+            self.register_buffer("mix_factor", torch.Tensor([alpha]))
+        elif self.merge_strategy == "learned":
+            self.register_parameter(
+                "mix_factor", torch.nn.Parameter(torch.Tensor([alpha]))
+            )
+        else:
+            raise ValueError(f"unknown merge strategy {self.merge_strategy}")
+
+    def forward(self, x, timesteps, skip_time_block=False, skip_video=False):
+        """Attend spatially, mix across ``timesteps`` frames, add the residual.
+
+        ``skip_video`` aliases ``skip_time_block`` (the keyword VideoBlock uses).
+        """
+        if skip_time_block or skip_video:
+            return super().forward(x)
+
+        x_in = x
+        x = self.attention(x)
+        h, w = x.shape[2:]
+        x = rearrange(x, "b c h w -> b (h w) c")
+        num_frames = torch.arange(timesteps, device=x.device)
+        num_frames = repeat(num_frames, "t -> (b t)", b=x.shape[0] // timesteps)
+        t_emb = timestep_embedding(num_frames, self.in_channels, repeat_only=False)
+        emb = self.video_time_embed(t_emb)[:, None, :]  # broadcast over positions
+
+        alpha = self.get_alpha()
+        x_mix = self.time_mix_block(x + emb, timesteps=timesteps)
+        x = alpha * x + (1.0 - alpha) * x_mix  # alpha merge
+        x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w)
+        return x_in + self.proj_out(x)
+
+    def get_alpha(self):
+        """Return the blend weight (sigmoid of the parameter when learned)."""
+        if self.merge_strategy == "fixed":
+            return self.mix_factor
+        if self.merge_strategy == "learned":
+            return torch.sigmoid(self.mix_factor)
+        raise NotImplementedError(f"unknown merge strategy {self.merge_strategy}")
+
+
+def make_time_attn(
+    in_channels,
+    attn_type="vanilla",
+    attn_kwargs=None,
+    alpha: float = 0,
+    merge_strategy: str = "learned",
+):
+    """Return ``partialclass`` of a video attention block for ``in_channels``.
+
+    "vanilla" selects VideoBlock and "vanilla-xformers" selects
+    MemoryEfficientVideoBlock; the latter falls back to "vanilla" when
+    XFORMERS_IS_AVAILABLE is false. ``attn_kwargs`` must be None for "vanilla".
+    """
+    assert attn_type in [
+        "vanilla",
+        "vanilla-xformers",
+    ], f"attn_type {attn_type} not supported for spatio-temporal attention"
+    print(
+        f"making spatial and temporal attention of type '{attn_type}' with {in_channels} in_channels"
+    )
+    if not XFORMERS_IS_AVAILABLE and attn_type == "vanilla-xformers":
+        print(
+            f"Attention mode '{attn_type}' is not available. Falling back to vanilla attention. "
+            f"This is not a problem in Pytorch >= 2.0. FYI, you are running with PyTorch version {torch.__version__}"
+        )
+        attn_type = "vanilla"
+
+    mix = dict(alpha=alpha, merge_strategy=merge_strategy)
+    if attn_type == "vanilla":
+        assert attn_kwargs is None
+        return partialclass(VideoBlock, in_channels, **mix)
+    if attn_type == "vanilla-xformers":
+        print(f"building MemoryEfficientAttnBlock with {in_channels} in_channels...")
+        return partialclass(MemoryEfficientVideoBlock, in_channels, **mix)
+    # Only reachable when asserts are disabled; raise (not return) the error.
+    raise NotImplementedError()
+
+
+class Conv2DWrapper(torch.nn.Conv2d):  # Conv2d whose forward tolerates extra kwargs
+    def forward(self, input: torch.Tensor, **kwargs) -> torch.Tensor:
+        return torch.nn.Conv2d.forward(self, input)  # kwargs are dropped on purpose
+
+
+class VideoDecoder(Decoder):
+    """Decoder whose conv / attention / res-block factories depend on ``time_mode``."""
+
+    available_time_modes = ["all", "conv-only", "attn-only"]
+
+    def __init__(
+        self,
+        *args,
+        video_kernel_size: Union[int, list] = 3,
+        alpha: float = 0.0,
+        merge_strategy: str = "learned",
+        time_mode: str = "conv-only",
+        **kwargs,
+    ):
+        # Set before super().__init__ (presumably the base constructor calls the
+        # _make_* factories below -- TODO confirm against Decoder).
+        self.video_kernel_size = video_kernel_size
+        self.alpha = alpha
+        self.merge_strategy = merge_strategy
+        self.time_mode = time_mode
+        assert (
+            self.time_mode in self.available_time_modes
+        ), f"time_mode parameter has to be in {self.available_time_modes}"
+        super().__init__(*args, **kwargs)
+
+    def get_last_layer(self, skip_time_mix=False, **kwargs):
+        """Return the weight of ``conv_out`` or of its ``time_mix_conv``."""
+        if self.time_mode == "attn-only":
+            raise NotImplementedError("TODO")
+        if skip_time_mix:
+            return self.conv_out.weight
+        return self.conv_out.time_mix_conv.weight
+
+    def _make_attn(self) -> Callable:
+        # NOTE(review): "only-last-conv" is not listed in available_time_modes.
+        if self.time_mode in ["conv-only", "only-last-conv"]:
+            return super()._make_attn()
+        return partialclass(
+            make_time_attn, alpha=self.alpha, merge_strategy=self.merge_strategy
+        )
+
+    def _make_conv(self) -> Callable:
+        """Return AE3DConv (with the video kernel) unless the mode is attn-only."""
+        if self.time_mode == "attn-only":
+            return Conv2DWrapper
+        return partialclass(AE3DConv, video_kernel_size=self.video_kernel_size)
+
+    def _make_resblock(self) -> Callable:
+        """Return VideoResBlock unless the mode is attn-only / only-last-conv."""
+        if self.time_mode in ["attn-only", "only-last-conv"]:
+            return super()._make_resblock()
+        return partialclass(
+            VideoResBlock,
+            video_kernel_size=self.video_kernel_size,
+            alpha=self.alpha,
+            merge_strategy=self.merge_strategy,
+        )
diff --git a/sgm/modules/diffusionmodules/__init__.py b/sgm/modules/diffusionmodules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sgm/modules/diffusionmodules/denoiser.py b/sgm/modules/diffusionmodules/denoiser.py
new file mode 100644
index 0000000000000000000000000000000000000000..d86e7a262d1f036139e41f500d8579a2b95071ef
--- /dev/null
+++ b/sgm/modules/diffusionmodules/denoiser.py
@@ -0,0 +1,75 @@
+from typing import Dict, Union
+
+import torch
+import torch.nn as nn
+
+from ...util import append_dims, instantiate_from_config
+from .denoiser_scaling import DenoiserScaling
+from .discretizer import Discretization
+
+
+class Denoiser(nn.Module):
+    """Wraps a network call with sigma-dependent input/output/skip scalings."""
+
+    def __init__(self, scaling_config: Dict):
+        super().__init__()
+        self.scaling: DenoiserScaling = instantiate_from_config(scaling_config)
+
+    def possibly_quantize_sigma(self, sigma: torch.Tensor) -> torch.Tensor:
+        return sigma  # identity hook; overridden by DiscreteDenoiser
+
+    def possibly_quantize_c_noise(self, c_noise: torch.Tensor) -> torch.Tensor:
+        return c_noise  # identity hook; overridden by DiscreteDenoiser
+
+    def forward(
+        self,
+        network: nn.Module,
+        input: torch.Tensor,
+        sigma: torch.Tensor,
+        cond: Dict,
+        **additional_model_inputs,
+    ) -> torch.Tensor:
+        """Return ``network(input * c_in, c_noise, cond) * c_out + input * c_skip``."""
+        sigma = self.possibly_quantize_sigma(sigma)
+        orig_sigma_shape = sigma.shape
+        # append_dims presumably pads sigma up to input.ndim dims -- TODO confirm.
+        c_skip, c_out, c_in, c_noise = self.scaling(append_dims(sigma, input.ndim))
+        c_noise = self.possibly_quantize_c_noise(c_noise.reshape(orig_sigma_shape))
+        prediction = network(input * c_in, c_noise, cond, **additional_model_inputs)
+        return prediction * c_out + input * c_skip
+
+
+class DiscreteDenoiser(Denoiser):
+    """Denoiser that snaps sigma to a table of values built by a discretization."""
+
+    def __init__(
+        self,
+        scaling_config: Dict,
+        num_idx: int,
+        discretization_config: Dict,
+        do_append_zero: bool = False,
+        quantize_c_noise: bool = True,
+        flip: bool = True,
+    ):
+        super().__init__(scaling_config)
+        discretization: Discretization = instantiate_from_config(discretization_config)
+        self.discretization = discretization
+        sigma_table = discretization(num_idx, do_append_zero=do_append_zero, flip=flip)
+        self.register_buffer("sigmas", sigma_table)
+        self.quantize_c_noise = quantize_c_noise
+        self.num_idx = num_idx
+
+    def sigma_to_idx(self, sigma: torch.Tensor) -> torch.Tensor:
+        """Index of the closest table entry (assumes 1-D ``sigma`` -- TODO confirm)."""
+        gaps = (sigma - self.sigmas[:, None]).abs()
+        return gaps.argmin(dim=0).view(sigma.shape)
+
+    def idx_to_sigma(self, idx: Union[torch.Tensor, int]) -> torch.Tensor:
+        """Look up the table value(s) stored at ``idx``."""
+        return self.sigmas[idx]
+
+    def possibly_quantize_sigma(self, sigma: torch.Tensor) -> torch.Tensor:
+        return self.idx_to_sigma(self.sigma_to_idx(sigma))  # snap to the table
+
+    def possibly_quantize_c_noise(self, c_noise: torch.Tensor) -> torch.Tensor:
+        return self.sigma_to_idx(c_noise) if self.quantize_c_noise else c_noise
diff --git a/sgm/modules/diffusionmodules/denoiser_scaling.py b/sgm/modules/diffusionmodules/denoiser_scaling.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4e287bfe8a82839a9a12fbd25c3446f43ab493b
--- /dev/null
+++ b/sgm/modules/diffusionmodules/denoiser_scaling.py
@@ -0,0 +1,59 @@
+from abc import ABC, abstractmethod
+from typing import Tuple
+
+import torch
+
+
+class DenoiserScaling(ABC):  # interface: sigma -> (c_skip, c_out, c_in, c_noise)
+    @abstractmethod
+    def __call__(
+        self, sigma: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Return the four scaling tensors for ``sigma``; implemented by subclasses."""
+
+
+class EDMScaling:
+    def __init__(self, sigma_data: float = 0.5):
+        self.sigma_data = sigma_data
+
+    def __call__(
+        self, sigma: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Return ``(c_skip, c_out, c_in, c_noise)`` computed from ``sigma``."""
+        total_var = sigma**2 + self.sigma_data**2  # denominator shared below
+        c_skip = self.sigma_data**2 / total_var
+        c_out = sigma * self.sigma_data / total_var**0.5
+        return c_skip, c_out, 1 / total_var**0.5, 0.25 * sigma.log()
+
+
+class EpsScaling:
+    def __call__(
+        self, sigma: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Return (c_skip, c_out, c_in, c_noise) = (1, -sigma, 1/sqrt(sigma^2+1), sigma)."""
+        unit_skip = torch.ones_like(sigma)
+        inv_norm = 1 / (sigma**2 + 1.0) ** 0.5
+        # clone() keeps c_noise from aliasing the caller's sigma tensor
+        return unit_skip, -sigma, inv_norm, sigma.clone()
+
+
+class VScaling:
+    def __call__(
+        self, sigma: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Return (c_skip, c_out, c_in, c_noise); c_noise is a copy of sigma."""
+        var_plus_one = sigma**2 + 1.0
+        c_out = -sigma / var_plus_one**0.5
+        c_in = 1.0 / var_plus_one**0.5
+        return 1.0 / var_plus_one, c_out, c_in, sigma.clone()
+
+
+class VScalingWithEDMcNoise(DenoiserScaling):
+    def __call__(
+        self, sigma: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Return (c_skip, c_out, c_in, c_noise) with c_noise = 0.25 * log(sigma)."""
+        var_plus_one = sigma**2 + 1.0
+        c_out = -sigma / var_plus_one**0.5
+        c_in = 1.0 / var_plus_one**0.5
+        return 1.0 / var_plus_one, c_out, c_in, 0.25 * sigma.log()
diff --git a/sgm/modules/diffusionmodules/denoiser_weighting.py b/sgm/modules/diffusionmodules/denoiser_weighting.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8b03ca58f17ea3d7374f4bbb7bf1d2994755e00
--- /dev/null
+++ b/sgm/modules/diffusionmodules/denoiser_weighting.py
@@ -0,0 +1,24 @@
+import torch
+
+
+class UnitWeighting:  # weight of one for every sigma
+    def __call__(self, sigma):
+        return torch.ones_like(sigma)  # ones_like already keeps sigma's device
+
+
+class EDMWeighting:  # w(sigma) = (sigma^2 + sigma_data^2) / (sigma * sigma_data)^2
+    def __init__(self, sigma_data=0.5):
+        self.sigma_data = sigma_data
+
+    def __call__(self, sigma):
+        return (self.sigma_data**2 + sigma**2) / (self.sigma_data * sigma) ** 2
+
+
+class VWeighting(EDMWeighting):  # EDMWeighting with sigma_data fixed to 1.0
+    def __init__(self):
+        super().__init__(sigma_data=1.0)
+
+
+class EpsWeighting:  # weight = sigma ** -2
+    def __call__(self, sigma):
+        return sigma**-2.0
diff --git a/sgm/modules/diffusionmodules/discretizer.py b/sgm/modules/diffusionmodules/discretizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..02add6081c5e3164d4402619b44d5be235d3ec58
--- /dev/null
+++ b/sgm/modules/diffusionmodules/discretizer.py
@@ -0,0 +1,69 @@
+from abc import abstractmethod
+from functools import partial
+
+import numpy as np
+import torch
+
+from ...modules.diffusionmodules.util import make_beta_schedule
+from ...util import append_zero
+
+
+def generate_roughly_equally_spaced_steps(num_substeps: int, max_step: int) -> np.ndarray:
+    """Return ``num_substeps`` ascending int steps, the last being ``max_step - 1``."""
+    descending = np.linspace(max_step - 1, 0, num_substeps, endpoint=False)
+    return descending.astype(int)[::-1]
+
+
+class Discretization:  # callable: n -> sigma schedule produced by get_sigmas
+    def __call__(self, n, do_append_zero=True, device="cpu", flip=False):
+        schedule = self.get_sigmas(n, device=device)
+        if do_append_zero:
+            schedule = append_zero(schedule)
+        return torch.flip(schedule, (0,)) if flip else schedule
+
+    @abstractmethod  # NOTE(review): not enforced, the class does not derive from ABC
+    def get_sigmas(self, n, device): ...
+
+
+class EDMDiscretization(Discretization):
+    def __init__(self, sigma_min=0.002, sigma_max=80.0, rho=7.0):
+        self.sigma_min, self.sigma_max, self.rho = sigma_min, sigma_max, rho
+
+    def get_sigmas(self, n, device="cpu"):
+        """Return ``n`` sigmas from ``sigma_max`` (first) to ``sigma_min`` (last).
+
+        The values are evenly spaced in ``sigma ** (1 / rho)``.
+        """
+        inv_rho = 1 / self.rho
+        hi, lo = self.sigma_max**inv_rho, self.sigma_min**inv_rho
+        return (hi + torch.linspace(0, 1, n, device=device) * (lo - hi)) ** self.rho
+
+
+class LegacyDDPMDiscretization(Discretization):
+    """Sigma schedule derived from the cumulative alphas of a "linear" beta schedule."""
+
+    def __init__(
+        self,
+        linear_start=0.00085,
+        linear_end=0.0120,
+        num_timesteps=1000,
+    ):
+        super().__init__()
+        self.num_timesteps = num_timesteps
+        betas = make_beta_schedule(
+            "linear", num_timesteps, linear_start=linear_start, linear_end=linear_end
+        )
+        self.alphas_cumprod = np.cumprod(1.0 - betas, axis=0)
+        self.to_torch = partial(torch.tensor, dtype=torch.float32)
+
+    def get_sigmas(self, n, device="cpu"):
+        """Return ``n`` sigmas in reversed timestep order; needs n <= num_timesteps."""
+        if n > self.num_timesteps:
+            raise ValueError(f"n={n} exceeds num_timesteps={self.num_timesteps}")
+        alphas_cumprod = self.alphas_cumprod
+        if n < self.num_timesteps:
+            steps = generate_roughly_equally_spaced_steps(n, self.num_timesteps)
+            alphas_cumprod = alphas_cumprod[steps]
+        sigmas = (1 - alphas_cumprod) / alphas_cumprod
+        as_tensor = partial(torch.tensor, dtype=torch.float32, device=device)
+        return torch.flip(as_tensor(sigmas) ** 0.5, (0,))
diff --git a/sgm/modules/diffusionmodules/guiders.py b/sgm/modules/diffusionmodules/guiders.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8eca43e8b7b8b7b0e6b9f3e2bddbae6e3456fee
--- /dev/null
+++ b/sgm/modules/diffusionmodules/guiders.py
@@ -0,0 +1,99 @@
+import logging
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+from einops import rearrange, repeat
+
+from ...util import append_dims, default
+
+logpy = logging.getLogger(__name__)
+
+
+class Guider(ABC):  # abstract base: subclasses implement __call__
+    @abstractmethod
+    def __call__(self, x: torch.Tensor, sigma: float) -> torch.Tensor:
+        """Return the guided tensor computed from ``x``; implemented by subclasses."""
+
+    def prepare_inputs(
+        self, x: torch.Tensor, s: float, c: Dict, uc: Dict
+    ) -> Tuple[torch.Tensor, float, Dict]:
+        """Build the network inputs; this base version does nothing and returns None."""
+
+
+class VanillaCFG(Guider):  # classifier-free guidance with one constant scale
+    def __init__(self, scale: float):
+        self.scale = scale
+
+    def __call__(self, x: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor:
+        uncond, cond = x.chunk(2)  # first half unconditional, second conditional
+        return uncond + self.scale * (cond - uncond)
+
+    def prepare_inputs(self, x, s, c, uc):
+        """Duplicate ``x``/``s`` and stack ``uc`` before ``c`` for the batched keys."""
+        batched_keys = ("vector", "crossattn", "concat")
+        c_out = dict()
+        for key, value in c.items():
+            if key in batched_keys:
+                value = torch.cat((uc[key], value), 0)
+            else:
+                assert value == uc[key]
+            c_out[key] = value
+        return torch.cat([x] * 2), torch.cat([s] * 2), c_out
+
+
+class IdentityGuider(Guider):
+    """Guider that applies no guidance: inputs and outputs pass through."""
+
+    def __call__(self, x: torch.Tensor, sigma: float) -> torch.Tensor:
+        return x
+
+    def prepare_inputs(
+        self, x: torch.Tensor, s: float, c: Dict, uc: Dict
+    ) -> Tuple[torch.Tensor, float, Dict]:
+        # Copy the conditioning entries into a fresh dict (values are shared);
+        # ``uc`` is not read at all.
+        c_out = {key: c[key] for key in c}
+        return x, s, c_out
+
+
+class LinearPredictionGuider(Guider):
+    """Guidance with a per-frame scale ramping linearly from min to max."""
+
+    def __init__(
+        self,
+        max_scale: float,
+        num_frames: int,
+        min_scale: float = 1.0,
+        additional_cond_keys: Optional[Union[List[str], str]] = None,
+    ):
+        self.min_scale = min_scale
+        self.max_scale = max_scale
+        self.num_frames = num_frames
+        self.scale = torch.linspace(min_scale, max_scale, num_frames).unsqueeze(0)
+        extra_keys = default(additional_cond_keys, [])
+        if isinstance(extra_keys, str):
+            extra_keys = [extra_keys]
+        self.additional_cond_keys = extra_keys
+
+    def __call__(self, x: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor:
+        """Blend the two halves of ``x`` (uncond, cond) with the per-frame scale."""
+        uncond, cond = (
+            rearrange(half, "(b t) ... -> b t ...", t=self.num_frames)
+            for half in x.chunk(2)
+        )
+        scale = repeat(self.scale, "1 t -> b t", b=uncond.shape[0])
+        scale = append_dims(scale, uncond.ndim).to(uncond.device)
+        return rearrange(uncond + scale * (cond - uncond), "b t ... -> (b t) ...")
+
+    def prepare_inputs(
+        self, x: torch.Tensor, s: torch.Tensor, c: dict, uc: dict
+    ) -> Tuple[torch.Tensor, torch.Tensor, dict]:
+        c_out = dict()
+        for key, value in c.items():
+            if key in ["vector", "crossattn", "concat"] + self.additional_cond_keys:
+                value = torch.cat((uc[key], value), 0)
+            else:
+                assert value == uc[key]
+            c_out[key] = value
+        return torch.cat([x] * 2), torch.cat([s] * 2), c_out
diff --git a/sgm/modules/diffusionmodules/loss_weighting.py b/sgm/modules/diffusionmodules/loss_weighting.py
new file mode 100644
index 0000000000000000000000000000000000000000..e12c0a76635435babd1af33969e82fa284525af8
--- /dev/null
+++ b/sgm/modules/diffusionmodules/loss_weighting.py
@@ -0,0 +1,32 @@
+from abc import ABC, abstractmethod
+
+import torch
+
+
+class DiffusionLossWeighting(ABC):  # interface: sigma -> weight tensor
+    @abstractmethod
+    def __call__(self, sigma: torch.Tensor) -> torch.Tensor:
+        """Return the weight tensor for ``sigma``; implemented by subclasses."""
+
+
+class UnitWeighting(DiffusionLossWeighting):  # weight of one for every sigma
+    def __call__(self, sigma: torch.Tensor) -> torch.Tensor:
+        return torch.ones_like(sigma)  # ones_like already keeps sigma's device
+
+
+class EDMWeighting(DiffusionLossWeighting):  # (s^2 + sd^2) / (s * sd)^2
+    def __init__(self, sigma_data: float = 0.5):
+        self.sigma_data = sigma_data
+
+    def __call__(self, sigma: torch.Tensor) -> torch.Tensor:
+        return (self.sigma_data**2 + sigma**2) / (self.sigma_data * sigma) ** 2
+
+
+class VWeighting(EDMWeighting):  # EDMWeighting with sigma_data fixed to 1.0
+    def __init__(self):
+        super().__init__(sigma_data=1.0)
+
+
+class EpsWeighting(DiffusionLossWeighting):  # weight = sigma ** -2
+    def __call__(self, sigma: torch.Tensor) -> torch.Tensor:
+        return sigma**-2.0
diff --git a/sgm/modules/diffusionmodules/model.py b/sgm/modules/diffusionmodules/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cf9d92140dee8443a0ea6b5cf218f2879ad88f4
--- /dev/null
+++ b/sgm/modules/diffusionmodules/model.py
@@ -0,0 +1,748 @@
+# pytorch_diffusion + derived encoder decoder
+import logging
+import math
+from typing import Any, Callable, Optional
+
+import numpy as np
+import torch
+import torch.nn as nn
+from einops import rearrange
+from packaging import version
+
+logpy = logging.getLogger(__name__)
+
+try:
+    import xformers
+    import xformers.ops
+
+    XFORMERS_IS_AVAILABLE = True
+except Exception:  # not bare: KeyboardInterrupt/SystemExit must still propagate
+    XFORMERS_IS_AVAILABLE = False
+    logpy.warning("no module 'xformers'. Processing without...")
+
+from ...modules.attention import LinearAttention, MemoryEfficientCrossAttention
+
+
+def get_timestep_embedding(timesteps, embedding_dim):
+    """
+    Build sinusoidal timestep embeddings of shape ``(len(timesteps), embedding_dim)``.
+
+    The first half of the channels holds sines and the second half cosines of
+    ``timesteps * 10000 ** (-i / (half_dim - 1))``; an odd ``embedding_dim`` is
+    zero-padded by one channel. ``timesteps`` must be a 1-D tensor.
+    """
+    assert len(timesteps.shape) == 1
+
+    half_dim = embedding_dim // 2
+    # max(..., 1) avoids a ZeroDivisionError when embedding_dim is 2 or 3.
+    log_step = math.log(10000) / max(half_dim - 1, 1)
+    freqs = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -log_step)
+    angles = timesteps.float()[:, None] * freqs.to(device=timesteps.device)[None, :]
+    emb = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
+    if embedding_dim % 2 == 1:  # zero pad
+        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
+    return emb
+
+
+def nonlinearity(x):
+    """Swish activation: ``x * sigmoid(x)``."""
+    return torch.sigmoid(x) * x
+
+
+def Normalize(in_channels, num_groups=32):
+    """Return an affine GroupNorm over ``in_channels`` channels with eps 1e-6."""
+    group_norm = torch.nn.GroupNorm(num_groups, in_channels, eps=1e-6, affine=True)
+    return group_norm
+
+
+class Upsample(nn.Module):
+    """Nearest-neighbour upsampling by 2, optionally followed by a 3x3 conv."""
+
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if with_conv:
+            same_conv = dict(kernel_size=3, stride=1, padding=1)
+            self.conv = torch.nn.Conv2d(in_channels, in_channels, **same_conv)
+
+    def forward(self, x):
+        # The conv (if any) runs on the already upsampled tensor.
+        upsampled = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+        return self.conv(upsampled) if self.with_conv else upsampled
+
+
+class Downsample(nn.Module):
+    """Downsampling by 2: stride-2 3x3 conv (one-sided pad) or 2x2 average pool."""
+
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if with_conv:
+            # no asymmetric padding in torch conv, must do it ourselves
+            self.conv = torch.nn.Conv2d(
+                in_channels, in_channels, kernel_size=3, stride=2, padding=0
+            )
+
+    def forward(self, x):
+        if not self.with_conv:
+            return torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
+        # Zero-pad one pixel on the right and bottom only, then convolve.
+        padded = torch.nn.functional.pad(x, (0, 1, 0, 1), mode="constant", value=0)
+        return self.conv(padded)
+
+
+class ResnetBlock(nn.Module):
+    """Residual block built from two (Normalize, swish, 3x3 conv) stages.
+
+    An optional timestep embedding is projected and added between the stages.
+    When channel counts differ the skip path uses a 3x3 or a 1x1 conv.
+    ``out_channels`` defaults to ``in_channels``; arguments are keyword-only.
+    """
+
+    def __init__(
+        self,
+        *,
+        in_channels,
+        out_channels=None,
+        conv_shortcut=False,
+        dropout,
+        temb_channels=512,
+    ):
+        super().__init__()
+        if out_channels is None:
+            out_channels = in_channels
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+
+        conv3x3 = dict(kernel_size=3, stride=1, padding=1)
+        self.norm1 = Normalize(in_channels)
+        self.conv1 = torch.nn.Conv2d(in_channels, out_channels, **conv3x3)
+        if temb_channels > 0:
+            self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
+        self.norm2 = Normalize(out_channels)
+        self.dropout = torch.nn.Dropout(dropout)
+        self.conv2 = torch.nn.Conv2d(out_channels, out_channels, **conv3x3)
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                self.conv_shortcut = torch.nn.Conv2d(
+                    in_channels, out_channels, **conv3x3
+                )
+            else:
+                self.nin_shortcut = torch.nn.Conv2d(
+                    in_channels, out_channels, kernel_size=1, stride=1, padding=0
+                )
+
+    def forward(self, x, temb):
+        """Return ``shortcut(x) + residual(x, temb)``; ``temb`` may be None."""
+        h = self.norm1(x)
+        h = self.conv1(nonlinearity(h))
+
+        if temb is not None:
+            # Broadcast the projected embedding over the two spatial dims.
+            h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]
+
+        h = self.norm2(h)
+        h = self.conv2(self.dropout(nonlinearity(h)))
+
+        if self.in_channels == self.out_channels:
+            return x + h
+        shortcut = self.conv_shortcut if self.use_conv_shortcut else self.nin_shortcut
+        return shortcut(x) + h
+
+
+class LinAttnBlock(LinearAttention):
+    """Single-head LinearAttention constructible like ``AttnBlock(in_channels)``."""
+
+    def __init__(self, in_channels):
+        super().__init__(dim=in_channels, heads=1, dim_head=in_channels)
+
+
+class AttnBlock(nn.Module):
+    """Single-head self-attention over the spatial positions of a feature map.
+
+    q, k, v and the output projection are 1x1 convolutions; the block output
+    is the input plus the projected attention result.
+    """
+
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+
+        self.norm = Normalize(in_channels)
+        pointwise = dict(kernel_size=1, stride=1, padding=0)
+        self.q = torch.nn.Conv2d(in_channels, in_channels, **pointwise)
+        self.k = torch.nn.Conv2d(in_channels, in_channels, **pointwise)
+        self.v = torch.nn.Conv2d(in_channels, in_channels, **pointwise)
+        self.proj_out = torch.nn.Conv2d(in_channels, in_channels, **pointwise)
+
+    def attention(self, h_: torch.Tensor) -> torch.Tensor:
+        """Normalize ``h_``, project to q/k/v and attend across the h*w positions.
+
+        Returns a tensor with the same (b, c, h, w) layout as the projections.
+        """
+        normed = self.norm(h_)
+        q, k, v = self.q(normed), self.k(normed), self.v(normed)
+        b, c, h, w = q.shape
+
+        def flatten(t):
+            # (b, c, h, w) -> (b, 1, h*w, c): one head, positions as the sequence
+            return rearrange(t, "b c h w -> b 1 (h w) c").contiguous()
+
+        # scale is dim ** -0.5 per default
+        attended = torch.nn.functional.scaled_dot_product_attention(
+            flatten(q), flatten(k), flatten(v)
+        )
+        return rearrange(attended, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
+
+    def forward(self, x, **kwargs):
+        """Return ``x`` plus the projected attention output; ``kwargs`` are ignored."""
+        attended = self.attention(x)
+        return x + self.proj_out(attended)
+
+
+class MemoryEfficientAttnBlock(nn.Module):
+    """
+    Single-head self-attention block that runs the attention itself through
+    the xformers memory-efficient implementation, see
+    https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
+    The result is projected with a 1x1 convolution and added to the input.
+    """
+
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+
+        def pointwise_conv():
+            # 1x1 convolution that keeps the channel count unchanged
+            return torch.nn.Conv2d(
+                in_channels, in_channels, kernel_size=1, stride=1, padding=0
+            )
+
+        # creation order (norm, q, k, v, proj_out) is kept so that parameter
+        # registration and random initialisation order stay the same
+        self.norm = Normalize(in_channels)
+        self.q = pointwise_conv()
+        self.k = pointwise_conv()
+        self.v = pointwise_conv()
+        self.proj_out = pointwise_conv()
+        # forwarded as ``op`` to xformers; None leaves the choice to xformers
+        self.attention_op: Optional[Any] = None
+
+    def attention(self, h_: torch.Tensor) -> torch.Tensor:
+        """Return the attention output (before ``proj_out``) for ``h_``."""
+        normed = self.norm(h_)
+        q = self.q(normed)
+        k = self.k(normed)
+        v = self.v(normed)
+
+        # compute attention
+        B, C, H, W = q.shape
+
+        def to_tokens(t):
+            # (b, c, h, w) -> (b, h*w, c), followed by the head split/merge
+            # reshapes for a single head
+            t = rearrange(t, "b c h w -> b (h w) c")
+            n_tokens = t.shape[1]
+            t = t.unsqueeze(3).reshape(B, n_tokens, 1, C)
+            t = t.permute(0, 2, 1, 3).reshape(B * 1, n_tokens, C)
+            return t.contiguous()
+
+        q, k, v = to_tokens(q), to_tokens(k), to_tokens(v)
+        out = xformers.ops.memory_efficient_attention(
+            q, k, v, attn_bias=None, op=self.attention_op
+        )
+
+        # undo the (single) head split and restore the spatial grid
+        n_tokens = out.shape[1]
+        out = out.unsqueeze(0).reshape(B, 1, n_tokens, C)
+        out = out.permute(0, 2, 1, 3).reshape(B, n_tokens, C)
+        return rearrange(out, "b (h w) c -> b c h w", b=B, h=H, w=W, c=C)
+
+    def forward(self, x, **kwargs):
+        """Apply attention as a residual update; keyword arguments are ignored."""
+        return x + self.proj_out(self.attention(x))
+
+
+class MemoryEfficientCrossAttentionWrapper(MemoryEfficientCrossAttention):
+    """Adapter that runs ``MemoryEfficientCrossAttention`` on 2D feature maps.
+
+    The ``(b, c, h, w)`` input is flattened to a ``(b, h*w, c)`` token
+    sequence for the parent class, the result is reshaped back to
+    ``(b, c, h, w)`` and added to the input as a residual.
+    """
+
+    def forward(self, x, context=None, mask=None, **unused_kwargs):
+        """Attend over the spatial positions of ``x`` and add the result to ``x``.
+
+        :param x: feature map of shape ``(b, c, h, w)``.
+        :param context: forwarded unchanged to the parent ``forward``.
+        :param mask: forwarded unchanged to the parent ``forward``.
+        :param unused_kwargs: accepted and ignored.
+        :return: tensor of shape ``(b, c, h, w)``.
+        """
+        _, c, h, w = x.shape
+        tokens = rearrange(x, "b c h w -> b (h w) c")
+        out = super().forward(tokens, context=context, mask=mask)
+        out = rearrange(out, "b (h w) c -> b c h w", h=h, w=w, c=c)
+        # The residual must use the untouched (b, c, h, w) input: adding the
+        # flattened (b, h*w, c) tokens would mix two different layouts.
+        return x + out
+
+
+def make_attn(in_channels, attn_type="vanilla", attn_kwargs=None):
+    """Build the attention module selected by ``attn_type``.
+
+    On PyTorch older than 2.0.0 every type except "none" is replaced by
+    "vanilla-xformers", which requires xformers to be available.
+
+    :param in_channels: channel count of the feature map the module acts on.
+    :param attn_type: one of "vanilla", "vanilla-xformers",
+        "memory-efficient-cross-attn", "linear" or "none".
+    :param attn_kwargs: keyword arguments for
+        ``MemoryEfficientCrossAttentionWrapper``; must be ``None`` for
+        "vanilla". The mapping is copied, not modified.
+    :return: the attention module (``nn.Identity`` for "none").
+    """
+    assert attn_type in [
+        "vanilla",
+        "vanilla-xformers",
+        "memory-efficient-cross-attn",
+        "linear",
+        "none",
+    ], f"attn_type {attn_type} unknown"
+    if (
+        version.parse(torch.__version__) < version.parse("2.0.0")
+        and attn_type != "none"
+    ):
+        assert XFORMERS_IS_AVAILABLE, (
+            f"We do not support vanilla attention in {torch.__version__} anymore, "
+            f"as it is too expensive. Please install xformers via e.g. 'pip install xformers==0.0.16'"
+        )
+        attn_type = "vanilla-xformers"
+    logpy.info(f"making attention of type '{attn_type}' with {in_channels} in_channels")
+    if attn_type == "vanilla":
+        assert attn_kwargs is None
+        return AttnBlock(in_channels)
+    elif attn_type == "vanilla-xformers":
+        logpy.info(
+            f"building MemoryEfficientAttnBlock with {in_channels} in_channels..."
+        )
+        return MemoryEfficientAttnBlock(in_channels)
+    elif attn_type == "memory-efficient-cross-attn":
+        # This branch used to compare the builtin ``type`` instead of
+        # ``attn_type`` and was therefore unreachable.
+        # Work on a copy so the caller's dict is not modified, and tolerate
+        # attn_kwargs=None.
+        cross_attn_kwargs = dict(attn_kwargs or {})
+        cross_attn_kwargs["query_dim"] = in_channels
+        return MemoryEfficientCrossAttentionWrapper(**cross_attn_kwargs)
+    elif attn_type == "none":
+        return nn.Identity(in_channels)
+    else:
+        return LinAttnBlock(in_channels)
+
+
+class Model(nn.Module):
+    """U-Net style network built from ``ResnetBlock`` and attention modules.
+
+    The network consists of a downsampling path, a middle section
+    (ResnetBlock, attention, ResnetBlock) and an upsampling path whose blocks
+    take the concatenation of the running activation and a skip tensor saved
+    on the way down. It can be conditioned on a timestep embedding.
+
+    All constructor arguments are keyword-only.
+
+    :param ch: base channel count; level ``i`` uses ``ch * ch_mult[i]`` channels.
+    :param out_ch: number of channels produced by the final convolution.
+    :param ch_mult: per-level channel multipliers; its length is the number
+        of resolution levels.
+    :param num_res_blocks: ResnetBlocks per level on the way down (the way up
+        uses one more per level).
+    :param attn_resolutions: container of resolutions; a block is followed by
+        attention when the tracked resolution ``curr_res`` is in it.
+    :param dropout: forwarded to every ResnetBlock.
+    :param resamp_with_conv: forwarded to ``Downsample`` / ``Upsample``.
+    :param in_channels: channels expected by the first convolution.
+    :param resolution: starting value of the tracked resolution.
+    :param use_timestep: if True, build and use the timestep embedding MLP.
+    :param use_linear_attn: if True, ``attn_type`` is overridden by "linear".
+    :param attn_type: forwarded to ``make_attn``.
+    """
+
+    def __init__(
+        self,
+        *,
+        ch,
+        out_ch,
+        ch_mult=(1, 2, 4, 8),
+        num_res_blocks,
+        attn_resolutions,
+        dropout=0.0,
+        resamp_with_conv=True,
+        in_channels,
+        resolution,
+        use_timestep=True,
+        use_linear_attn=False,
+        attn_type="vanilla",
+    ):
+        super().__init__()
+        if use_linear_attn:
+            attn_type = "linear"
+        self.ch = ch
+        self.temb_ch = self.ch * 4
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+
+        self.use_timestep = use_timestep
+        if self.use_timestep:
+            # timestep embedding
+            self.temb = nn.Module()
+            self.temb.dense = nn.ModuleList(
+                [
+                    torch.nn.Linear(self.ch, self.temb_ch),
+                    torch.nn.Linear(self.temb_ch, self.temb_ch),
+                ]
+            )
+
+        # downsampling
+        self.conv_in = torch.nn.Conv2d(
+            in_channels, self.ch, kernel_size=3, stride=1, padding=1
+        )
+
+        curr_res = resolution
+        # input multiplier of level i is the output multiplier of level i - 1
+        # (and 1 for the first level, which follows conv_in)
+        in_ch_mult = (1,) + tuple(ch_mult)
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = ch * in_ch_mult[i_level]
+            block_out = ch * ch_mult[i_level]
+            for i_block in range(self.num_res_blocks):
+                block.append(
+                    ResnetBlock(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                        temb_channels=self.temb_ch,
+                        dropout=dropout,
+                    )
+                )
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(make_attn(block_in, attn_type=attn_type))
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            # every level except the last one halves the tracked resolution
+            if i_level != self.num_resolutions - 1:
+                down.downsample = Downsample(block_in, resamp_with_conv)
+                curr_res = curr_res // 2
+            self.down.append(down)
+
+        # middle
+        # block_in still holds the channel count of the last down level
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(
+            in_channels=block_in,
+            out_channels=block_in,
+            temb_channels=self.temb_ch,
+            dropout=dropout,
+        )
+        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
+        self.mid.block_2 = ResnetBlock(
+            in_channels=block_in,
+            out_channels=block_in,
+            temb_channels=self.temb_ch,
+            dropout=dropout,
+        )
+
+        # upsampling
+        # block_in and curr_res carry over from the loops above
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = ch * ch_mult[i_level]
+            # channel count of the skip tensor concatenated in forward()
+            skip_in = ch * ch_mult[i_level]
+            for i_block in range(self.num_res_blocks + 1):
+                # the last block of a level receives the tensor that entered
+                # the matching down level, which has the level's input width
+                if i_block == self.num_res_blocks:
+                    skip_in = ch * in_ch_mult[i_level]
+                block.append(
+                    ResnetBlock(
+                        in_channels=block_in + skip_in,
+                        out_channels=block_out,
+                        temb_channels=self.temb_ch,
+                        dropout=dropout,
+                    )
+                )
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(make_attn(block_in, attn_type=attn_type))
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:
+                up.upsample = Upsample(block_in, resamp_with_conv)
+                curr_res = curr_res * 2
+            self.up.insert(0, up)  # prepend to get consistent order
+
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv2d(
+            block_in, out_ch, kernel_size=3, stride=1, padding=1
+        )
+
+    def forward(self, x, t=None, context=None):
+        """Run the network.
+
+        :param x: input tensor for the first convolution.
+        :param t: timesteps; must not be None when ``use_timestep`` is set.
+        :param context: optional tensor concatenated to ``x`` along dim 1;
+            the combined channel count has to match ``conv_in``.
+        :return: output of the final convolution (``out_ch`` channels).
+        """
+        # assert x.shape[2] == x.shape[3] == self.resolution
+        if context is not None:
+            # assume aligned context, cat along channel axis
+            x = torch.cat((x, context), dim=1)
+        if self.use_timestep:
+            # timestep embedding
+            assert t is not None
+            temb = get_timestep_embedding(t, self.ch)
+            temb = self.temb.dense[0](temb)
+            temb = nonlinearity(temb)
+            temb = self.temb.dense[1](temb)
+        else:
+            temb = None
+
+        # downsampling
+        # hs collects every intermediate activation; the up path pops them
+        # in reverse order as skip connections
+        hs = [self.conv_in(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1], temb)
+                # attn is either empty or holds one module per block
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level != self.num_resolutions - 1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h, temb)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h, temb)
+
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks + 1):
+                h = self.up[i_level].block[i_block](
+                    torch.cat([h, hs.pop()], dim=1), temb
+                )
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+
+    def get_last_layer(self):
+        """Return the weight tensor of the final convolution."""
+        return self.conv_out.weight
+
+
+class Encoder(nn.Module):
+    """Convolutional encoder: downsampling path, middle section, output conv.
+
+    No timestep embedding is used (``temb_ch`` is 0 and ``temb`` is None in
+    ``forward``). The output has ``2 * z_channels`` channels when ``double_z``
+    is set and ``z_channels`` otherwise.
+
+    All constructor arguments are keyword-only.
+
+    :param ch: base channel count; level ``i`` uses ``ch * ch_mult[i]`` channels.
+    :param out_ch: accepted but not used in this class.
+    :param ch_mult: per-level channel multipliers; its length is the number
+        of resolution levels.
+    :param num_res_blocks: ResnetBlocks per level.
+    :param attn_resolutions: container of resolutions; a block is followed by
+        attention when the tracked resolution ``curr_res`` is in it.
+    :param dropout: forwarded to every ResnetBlock.
+    :param resamp_with_conv: forwarded to ``Downsample``.
+    :param in_channels: channels expected by the first convolution.
+    :param resolution: starting value of the tracked resolution.
+    :param z_channels: latent channel count (see ``double_z``).
+    :param double_z: if True, the output convolution emits ``2 * z_channels``.
+    :param use_linear_attn: if True, ``attn_type`` is overridden by "linear".
+    :param attn_type: forwarded to ``make_attn``.
+    :param ignore_kwargs: any further keyword arguments are discarded.
+    """
+
+    def __init__(
+        self,
+        *,
+        ch,
+        out_ch,
+        ch_mult=(1, 2, 4, 8),
+        num_res_blocks,
+        attn_resolutions,
+        dropout=0.0,
+        resamp_with_conv=True,
+        in_channels,
+        resolution,
+        z_channels,
+        double_z=True,
+        use_linear_attn=False,
+        attn_type="vanilla",
+        **ignore_kwargs,
+    ):
+        super().__init__()
+        if use_linear_attn:
+            attn_type = "linear"
+        self.ch = ch
+        self.temb_ch = 0
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+
+        # downsampling
+        self.conv_in = torch.nn.Conv2d(
+            in_channels, self.ch, kernel_size=3, stride=1, padding=1
+        )
+
+        curr_res = resolution
+        # input multiplier of level i is the output multiplier of level i - 1
+        # (and 1 for the first level, which follows conv_in)
+        in_ch_mult = (1,) + tuple(ch_mult)
+        self.in_ch_mult = in_ch_mult
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = ch * in_ch_mult[i_level]
+            block_out = ch * ch_mult[i_level]
+            for i_block in range(self.num_res_blocks):
+                block.append(
+                    ResnetBlock(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                        temb_channels=self.temb_ch,
+                        dropout=dropout,
+                    )
+                )
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(make_attn(block_in, attn_type=attn_type))
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            # every level except the last one halves the tracked resolution
+            if i_level != self.num_resolutions - 1:
+                down.downsample = Downsample(block_in, resamp_with_conv)
+                curr_res = curr_res // 2
+            self.down.append(down)
+
+        # middle
+        # block_in still holds the channel count of the last down level
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(
+            in_channels=block_in,
+            out_channels=block_in,
+            temb_channels=self.temb_ch,
+            dropout=dropout,
+        )
+        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
+        self.mid.block_2 = ResnetBlock(
+            in_channels=block_in,
+            out_channels=block_in,
+            temb_channels=self.temb_ch,
+            dropout=dropout,
+        )
+
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv2d(
+            block_in,
+            2 * z_channels if double_z else z_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+
+    def forward(self, x):
+        """Encode ``x`` and return the output of the final convolution."""
+        # timestep embedding
+        temb = None
+
+        # downsampling
+        # only hs[-1] is ever read here; the list is never popped
+        hs = [self.conv_in(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1], temb)
+                # attn is either empty or holds one module per block
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level != self.num_resolutions - 1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h, temb)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h, temb)
+
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+
+
+class Decoder(nn.Module):
+    """Convolutional decoder: input conv, middle section, upsampling path.
+
+    No timestep embedding is used (``temb_ch`` is 0 and ``temb`` is None in
+    ``forward``). The attention, residual-block and output-convolution
+    factories are obtained from ``_make_attn``, ``_make_resblock`` and
+    ``_make_conv`` (presumably so subclasses can swap them — confirm).
+
+    All constructor arguments are keyword-only.
+
+    :param ch: base channel count; level ``i`` uses ``ch * ch_mult[i]`` channels.
+    :param out_ch: number of channels produced by the final convolution.
+    :param ch_mult: per-level channel multipliers; its length is the number
+        of resolution levels.
+    :param num_res_blocks: each level gets ``num_res_blocks + 1`` blocks.
+    :param attn_resolutions: container of resolutions; a block is followed by
+        attention when the tracked resolution ``curr_res`` is in it.
+    :param dropout: forwarded to every residual block.
+    :param resamp_with_conv: forwarded to ``Upsample``.
+    :param in_channels: stored on the instance; not otherwise used here.
+    :param resolution: resolution used to derive the lowest resolution
+        (``resolution // 2 ** (levels - 1)``).
+    :param z_channels: channels expected by the first convolution.
+    :param give_pre_end: if True, ``forward`` returns before the final
+        normalisation, nonlinearity and convolution.
+    :param tanh_out: if True, ``tanh`` is applied to the output.
+    :param use_linear_attn: if True, ``attn_type`` is overridden by "linear".
+    :param attn_type: forwarded to the attention factory.
+    :param ignorekwargs: any further keyword arguments are discarded.
+    """
+
+    def __init__(
+        self,
+        *,
+        ch,
+        out_ch,
+        ch_mult=(1, 2, 4, 8),
+        num_res_blocks,
+        attn_resolutions,
+        dropout=0.0,
+        resamp_with_conv=True,
+        in_channels,
+        resolution,
+        z_channels,
+        give_pre_end=False,
+        tanh_out=False,
+        use_linear_attn=False,
+        attn_type="vanilla",
+        **ignorekwargs,
+    ):
+        super().__init__()
+        if use_linear_attn:
+            attn_type = "linear"
+        self.ch = ch
+        self.temb_ch = 0
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.give_pre_end = give_pre_end
+        self.tanh_out = tanh_out
+
+        # compute in_ch_mult, block_in and curr_res at lowest res
+        # (in_ch_mult is not read again in this constructor)
+        in_ch_mult = (1,) + tuple(ch_mult)
+        block_in = ch * ch_mult[self.num_resolutions - 1]
+        curr_res = resolution // 2 ** (self.num_resolutions - 1)
+        # z_shape is only used for the log message below within this class
+        self.z_shape = (1, z_channels, curr_res, curr_res)
+        logpy.info(
+            "Working with z of shape {} = {} dimensions.".format(
+                self.z_shape, np.prod(self.z_shape)
+            )
+        )
+
+        # factories for the building blocks, looked up once
+        make_attn_cls = self._make_attn()
+        make_resblock_cls = self._make_resblock()
+        make_conv_cls = self._make_conv()
+        # z to block_in
+        self.conv_in = torch.nn.Conv2d(
+            z_channels, block_in, kernel_size=3, stride=1, padding=1
+        )
+
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = make_resblock_cls(
+            in_channels=block_in,
+            out_channels=block_in,
+            temb_channels=self.temb_ch,
+            dropout=dropout,
+        )
+        self.mid.attn_1 = make_attn_cls(block_in, attn_type=attn_type)
+        self.mid.block_2 = make_resblock_cls(
+            in_channels=block_in,
+            out_channels=block_in,
+            temb_channels=self.temb_ch,
+            dropout=dropout,
+        )
+
+        # upsampling
+        # levels are built from the lowest resolution upwards
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = ch * ch_mult[i_level]
+            for i_block in range(self.num_res_blocks + 1):
+                block.append(
+                    make_resblock_cls(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                        temb_channels=self.temb_ch,
+                        dropout=dropout,
+                    )
+                )
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(make_attn_cls(block_in, attn_type=attn_type))
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            # every level except level 0 doubles the tracked resolution
+            if i_level != 0:
+                up.upsample = Upsample(block_in, resamp_with_conv)
+                curr_res = curr_res * 2
+            self.up.insert(0, up)  # prepend to get consistent order
+
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = make_conv_cls(
+            block_in, out_ch, kernel_size=3, stride=1, padding=1
+        )
+
+    def _make_attn(self) -> Callable:
+        """Return the factory used to build attention modules."""
+        return make_attn
+
+    def _make_resblock(self) -> Callable:
+        """Return the factory used to build residual blocks."""
+        return ResnetBlock
+
+    def _make_conv(self) -> Callable:
+        """Return the factory used to build the output convolution."""
+        return torch.nn.Conv2d
+
+    def get_last_layer(self, **kwargs):
+        """Return the weight tensor of the output convolution."""
+        return self.conv_out.weight
+
+    def forward(self, z, **kwargs):
+        """Decode ``z``.
+
+        :param z: latent tensor fed to the first convolution.
+        :param kwargs: forwarded to every residual block, every attention
+            module and the output convolution.
+        :return: the decoded tensor, or the activation before the output
+            head when ``give_pre_end`` is set.
+        """
+        # assert z.shape[1:] == self.z_shape[1:]
+        # remember the shape of the most recent input
+        self.last_z_shape = z.shape
+
+        # timestep embedding
+        temb = None
+
+        # z to block_in
+        h = self.conv_in(z)
+
+        # middle
+        h = self.mid.block_1(h, temb, **kwargs)
+        h = self.mid.attn_1(h, **kwargs)
+        h = self.mid.block_2(h, temb, **kwargs)
+
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks + 1):
+                h = self.up[i_level].block[i_block](h, temb, **kwargs)
+                # attn is either empty or holds one module per block
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h, **kwargs)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+
+        # end
+        if self.give_pre_end:
+            return h
+
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h, **kwargs)
+        if self.tanh_out:
+            h = torch.tanh(h)
+        return h
diff --git a/sgm/modules/diffusionmodules/openaimodel.py b/sgm/modules/diffusionmodules/openaimodel.py
new file mode 100644
index 0000000000000000000000000000000000000000..b58e1b0e9be031cd09803d451fc59f2a5ce88eea
--- /dev/null
+++ b/sgm/modules/diffusionmodules/openaimodel.py
@@ -0,0 +1,853 @@
+import logging
+import math
+from abc import abstractmethod
+from typing import Iterable, List, Optional, Tuple, Union
+
+import torch as th
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from torch.utils.checkpoint import checkpoint
+
+from ...modules.attention import SpatialTransformer
+from ...modules.diffusionmodules.util import (avg_pool_nd, conv_nd, linear,
+                                              normalization,
+                                              timestep_embedding, zero_module)
+from ...modules.video_attention import SpatialVideoTransformer
+from ...util import exists
+
+logpy = logging.getLogger(__name__)
+
+
+class AttentionPool2d(nn.Module):
+    """
+    Attention pooling over the positions of a feature map.
+    Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
+
+    The mean over all positions is prepended as an extra token, a learned
+    positional embedding is added, QKV attention is applied and the output at
+    the prepended token is returned.
+
+    :param spacial_dim: side length of the feature map; the positional
+        embedding holds ``spacial_dim**2 + 1`` positions.
+    :param embed_dim: number of input channels.
+    :param num_heads_channels: channels per attention head.
+    :param output_dim: output channels; defaults to ``embed_dim``.
+    """
+
+    def __init__(
+        self,
+        spacial_dim: int,
+        embed_dim: int,
+        num_heads_channels: int,
+        output_dim: Optional[int] = None,
+    ):
+        super().__init__()
+        self.positional_embedding = nn.Parameter(
+            th.randn(embed_dim, spacial_dim**2 + 1) / embed_dim**0.5
+        )
+        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
+        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
+        self.num_heads = embed_dim // num_heads_channels
+        self.attention = QKVAttention(self.num_heads)
+
+    def forward(self, x: th.Tensor) -> th.Tensor:
+        """
+        Pool `x` into one vector per batch element.
+        :param x: an [N x C x ...] tensor; the trailing axes are flattened
+                  and must hold `spacial_dim**2` positions in total.
+        :return: an [N x output_dim] tensor.
+        """
+        # Star-unpack so that [N x C x H x W] inputs are accepted as well as
+        # already flattened [N x C x T] ones; a plain three-name unpack
+        # rejects anything that is not exactly 3-D.
+        b, c, *_ = x.shape
+        x = x.reshape(b, c, -1)
+        # prepend the mean over all positions as the pooling token
+        x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1)
+        x = x + self.positional_embedding[None, :, :].to(x.dtype)
+        x = self.qkv_proj(x)
+        x = self.attention(x)
+        x = self.c_proj(x)
+        # keep only the output at the prepended token
+        return x[:, :, 0]
+
+
+class TimestepBlock(nn.Module):
+    """
+    Base class for modules whose forward() receives the timestep embeddings
+    as its second argument.
+    """
+
+    @abstractmethod
+    def forward(self, x: th.Tensor, emb: th.Tensor):
+        """
+        Run the module on `x`, conditioned on the timestep embeddings `emb`.
+        """
+
+
+class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
+    """
+    A sequential container that hands every child the extra inputs its type
+    expects (timestep embeddings, context, video information) and calls all
+    remaining children with `x` alone.
+    """
+
+    def forward(
+        self,
+        x: th.Tensor,
+        emb: th.Tensor,
+        context: Optional[th.Tensor] = None,
+        image_only_indicator: Optional[th.Tensor] = None,
+        time_context: Optional[int] = None,
+        num_video_frames: Optional[int] = None,
+    ):
+        # imported inside the method rather than at module level
+        # (presumably to avoid a circular import -- confirm)
+        from ...modules.diffusionmodules.video_model import VideoResBlock
+
+        for layer in self:
+            # VideoResBlock is tested first so that it never takes the
+            # generic TimestepBlock path
+            if isinstance(layer, VideoResBlock):
+                x = layer(x, emb, num_video_frames, image_only_indicator)
+                continue
+            if isinstance(layer, TimestepBlock):
+                x = layer(x, emb)
+                continue
+            # the video transformer is tested before the plain spatial one
+            if isinstance(layer, SpatialVideoTransformer):
+                x = layer(
+                    x,
+                    context,
+                    time_context,
+                    num_video_frames,
+                    image_only_indicator,
+                )
+                continue
+            if isinstance(layer, SpatialTransformer):
+                x = layer(x, context)
+                continue
+            x = layer(x)
+        return x
+
+
+class Upsample(nn.Module):
+    """
+    Nearest-neighbour upsampling layer with an optional convolution.
+    :param channels: channels in the inputs and outputs.
+    :param use_conv: a bool determining if a convolution is applied.
+    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
+                 upsampling occurs in the inner-two dimensions.
+    :param out_channels: output channels of the convolution; defaults to
+                         `channels`.
+    :param padding: padding of the convolution.
+    :param third_up: for 3D signals, also scale the first of the three
+                     trailing axes.
+    :param kernel_size: kernel size of the convolution.
+    :param scale_factor: factor by which the scaled axes grow.
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        use_conv: bool,
+        dims: int = 2,
+        out_channels: Optional[int] = None,
+        padding: int = 1,
+        third_up: bool = False,
+        kernel_size: int = 3,
+        scale_factor: int = 2,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.dims = dims
+        self.scale_factor = scale_factor
+        self.third_up = third_up
+        self.use_conv = use_conv
+        if not use_conv:
+            return
+        self.conv = conv_nd(
+            dims, self.channels, self.out_channels, kernel_size, padding=padding
+        )
+
+    def forward(self, x: th.Tensor) -> th.Tensor:
+        assert x.shape[1] == self.channels
+
+        factor = self.scale_factor
+        if self.dims != 3:
+            x = F.interpolate(x, scale_factor=factor, mode="nearest")
+        else:
+            # the first of the three trailing axes is only scaled on request
+            t_factor = factor if self.third_up else 1
+            target_size = (
+                t_factor * x.shape[2],
+                x.shape[3] * factor,
+                x.shape[4] * factor,
+            )
+            x = F.interpolate(x, target_size, mode="nearest")
+        return self.conv(x) if self.use_conv else x
+
+
+class Downsample(nn.Module):
+    """
+    Downsampling layer: a strided convolution or, without it, average pooling.
+    :param channels: channels in the inputs and outputs.
+    :param use_conv: a bool determining if a convolution is applied.
+    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
+                 downsampling occurs in the inner-two dimensions.
+    :param out_channels: output channels of the convolution; defaults to
+                         `channels` and must equal it when pooling is used.
+    :param padding: padding of the convolution.
+    :param third_down: for 3D signals, also downsample the first of the
+                       three trailing axes.
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        use_conv: bool,
+        dims: int = 2,
+        out_channels: Optional[int] = None,
+        padding: int = 1,
+        third_down: bool = False,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.dims = dims
+
+        if dims != 3:
+            stride = 2
+        elif third_down:
+            stride = (2, 2, 2)
+        else:
+            stride = (1, 2, 2)
+
+        if not use_conv:
+            # pooling cannot change the channel count
+            assert self.channels == self.out_channels
+            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
+            return
+
+        logpy.info(f"Building a Downsample layer with {dims} dims.")
+        logpy.info(
+            f"  --> settings are: \n in-chn: {self.channels}, out-chn: {self.out_channels}, "
+            f"kernel-size: 3, stride: {stride}, padding: {padding}"
+        )
+        if dims == 3:
+            logpy.info(f"  --> Downsampling third axis (time): {third_down}")
+        self.op = conv_nd(
+            dims,
+            self.channels,
+            self.out_channels,
+            3,
+            stride=stride,
+            padding=padding,
+        )
+
+    def forward(self, x: th.Tensor) -> th.Tensor:
+        assert x.shape[1] == self.channels
+
+        return self.op(x)
+
+
+class ResBlock(TimestepBlock):
+    """
+    A residual block, conditioned on timestep embeddings, that can optionally
+    change the number of channels.
+    :param channels: the number of input channels.
+    :param emb_channels: the number of timestep embedding channels.
+    :param dropout: the rate of dropout.
+    :param out_channels: if specified, the number of out channels.
+    :param use_conv: if True and out_channels is specified, use a spatial
+        convolution instead of a smaller 1x1 convolution to change the
+        channels in the skip connection.
+    :param use_scale_shift_norm: if True, the embedding provides a scale and
+        a shift applied after the output normalization instead of being
+        added to the features.
+    :param dims: determines if the signal is 1D, 2D, or 3D.
+    :param use_checkpoint: if True, use gradient checkpointing on this module.
+    :param up: if True, use this block for upsampling.
+    :param down: if True, use this block for downsampling.
+    :param kernel_size: kernel size of the convolutions, an int or an
+        iterable of ints; the padding is half of it.
+    :param exchange_temb_dims: if True, swap axes 1 and 2 of the embedding
+        before it is added to the features.
+    :param skip_t_emb: if True, build no embedding layers and use zeros in
+        place of the embedding.
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        emb_channels: int,
+        dropout: float,
+        out_channels: Optional[int] = None,
+        use_conv: bool = False,
+        use_scale_shift_norm: bool = False,
+        dims: int = 2,
+        use_checkpoint: bool = False,
+        up: bool = False,
+        down: bool = False,
+        kernel_size: int = 3,
+        exchange_temb_dims: bool = False,
+        skip_t_emb: bool = False,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.emb_channels = emb_channels
+        self.dropout = dropout
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.use_checkpoint = use_checkpoint
+        self.use_scale_shift_norm = use_scale_shift_norm
+        self.exchange_temb_dims = exchange_temb_dims
+
+        padding = (
+            [k // 2 for k in kernel_size]
+            if isinstance(kernel_size, Iterable)
+            else kernel_size // 2
+        )
+
+        # NOTE: sub-modules are created in the same order as before
+        # (in_layers, resamplers, emb_layers, out_layers, skip_connection) so
+        # that parameter registration and initialisation order do not change.
+        self.in_layers = nn.Sequential(
+            normalization(channels),
+            nn.SiLU(),
+            conv_nd(dims, channels, self.out_channels, kernel_size, padding=padding),
+        )
+
+        self.updown = up or down
+
+        if self.updown:
+            # `up` wins when both flags are set
+            resample_cls = Upsample if up else Downsample
+            self.h_upd = resample_cls(channels, False, dims)
+            self.x_upd = resample_cls(channels, False, dims)
+        else:
+            # one shared Identity instance serves both paths
+            passthrough = nn.Identity()
+            self.h_upd = passthrough
+            self.x_upd = passthrough
+
+        self.skip_t_emb = skip_t_emb
+        # scale-shift conditioning needs two values per output channel
+        self.emb_out_channels = (
+            2 * self.out_channels if use_scale_shift_norm else self.out_channels
+        )
+        if not self.skip_t_emb:
+            self.emb_layers = nn.Sequential(
+                nn.SiLU(),
+                linear(
+                    emb_channels,
+                    self.emb_out_channels,
+                ),
+            )
+        else:
+            logpy.info(f"Skipping timestep embedding in {self.__class__.__name__}")
+            assert not self.use_scale_shift_norm
+            self.emb_layers = None
+            self.exchange_temb_dims = False
+
+        self.out_layers = nn.Sequential(
+            normalization(self.out_channels),
+            nn.SiLU(),
+            nn.Dropout(p=dropout),
+            zero_module(
+                conv_nd(
+                    dims,
+                    self.out_channels,
+                    self.out_channels,
+                    kernel_size,
+                    padding=padding,
+                )
+            ),
+        )
+
+        if self.out_channels == channels:
+            self.skip_connection = nn.Identity()
+        else:
+            # match the channel count of the residual branch
+            skip_kernel = kernel_size if use_conv else 1
+            skip_kwargs = {"padding": padding} if use_conv else {}
+            self.skip_connection = conv_nd(
+                dims, channels, self.out_channels, skip_kernel, **skip_kwargs
+            )
+
+    def forward(self, x: th.Tensor, emb: th.Tensor) -> th.Tensor:
+        """
+        Apply the block to a Tensor, conditioned on a timestep embedding.
+        :param x: an [N x C x ...] Tensor of features.
+        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
+        :return: an [N x C x ...] Tensor of outputs.
+        """
+        if not self.use_checkpoint:
+            return self._forward(x, emb)
+        return checkpoint(self._forward, x, emb)
+
+    def _forward(self, x: th.Tensor, emb: th.Tensor) -> th.Tensor:
+        if not self.updown:
+            h = self.in_layers(x)
+        else:
+            # resample between the norm/activation and the convolution, and
+            # resample the skip input the same way
+            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
+            h = in_rest(x)
+            h = self.h_upd(h)
+            x = self.x_upd(x)
+            h = in_conv(h)
+
+        if not self.skip_t_emb:
+            emb_out = self.emb_layers(emb).type(h.dtype)
+        else:
+            emb_out = th.zeros_like(h)
+        # append singleton axes until the embedding broadcasts against h
+        while emb_out.dim() < h.dim():
+            emb_out = emb_out[..., None]
+
+        if not self.use_scale_shift_norm:
+            if self.exchange_temb_dims:
+                emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
+            h = self.out_layers(h + emb_out)
+        else:
+            out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
+            scale, shift = th.chunk(emb_out, 2, dim=1)
+            h = out_rest(out_norm(h) * (1 + scale) + shift)
+        return self.skip_connection(x) + h
+
+
+class AttentionBlock(nn.Module):
+    """
+    An attention block that allows spatial positions to attend to each other.
+    Originally ported from here, but adapted to the N-d case.
+    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
+    :param channels: the number of input (and output) channels.
+    :param num_heads: the number of attention heads; only used when
+        num_head_channels is -1.
+    :param num_head_channels: if not -1, the channels per head; the number of
+        heads is then channels // num_head_channels.
+    :param use_checkpoint: if True, use gradient checkpointing on this module.
+    :param use_new_attention_order: if True, use QKVAttention (split qkv
+        before heads), otherwise QKVAttentionLegacy.
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        num_heads: int = 1,
+        num_head_channels: int = -1,
+        use_checkpoint: bool = False,
+        use_new_attention_order: bool = False,
+    ):
+        super().__init__()
+        self.channels = channels
+        if num_head_channels == -1:
+            self.num_heads = num_heads
+        else:
+            assert (
+                channels % num_head_channels == 0
+            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
+            self.num_heads = channels // num_head_channels
+        self.use_checkpoint = use_checkpoint
+        self.norm = normalization(channels)
+        self.qkv = conv_nd(1, channels, channels * 3, 1)
+        if use_new_attention_order:
+            # split qkv before split heads
+            self.attention = QKVAttention(self.num_heads)
+        else:
+            # split heads before split qkv
+            self.attention = QKVAttentionLegacy(self.num_heads)
+
+        self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
+
+    def forward(self, x: th.Tensor, **kwargs) -> th.Tensor:
+        """
+        Apply the block to an [N x C x ...] tensor; extra keyword arguments
+        are ignored.
+        """
+        # Honour the constructor flag (as ResBlock does) instead of
+        # checkpointing unconditionally; the outputs are the same either way.
+        if self.use_checkpoint:
+            return checkpoint(self._forward, x)
+        return self._forward(x)
+
+    def _forward(self, x: th.Tensor) -> th.Tensor:
+        b, c, *spatial = x.shape
+        # attention runs over the flattened spatial positions
+        x = x.reshape(b, c, -1)
+        qkv = self.qkv(self.norm(x))
+        h = self.attention(qkv)
+        h = self.proj_out(h)
+        return (x + h).reshape(b, c, *spatial)
+
+
+class QKVAttentionLegacy(nn.Module):
+    """
+    QKV attention in the legacy layout: channels are grouped per head first, then split into q, k, v.
+    """
+
+    def __init__(self, n_heads: int):
+        super().__init__()
+        self.n_heads = n_heads
+
+    def forward(self, qkv: th.Tensor) -> th.Tensor:
+        """
+        Run scaled dot-product attention over the last axis.
+        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
+        :return: an [N x (H * C) x T] tensor after attention.
+        """
+        batch, width, seq_len = qkv.shape
+        assert width % (3 * self.n_heads) == 0
+        head_dim = width // (3 * self.n_heads)
+        per_head = qkv.reshape(batch * self.n_heads, head_dim * 3, seq_len)
+        query, key, value = per_head.split(head_dim, dim=1)
+        # Scaling q and k separately is more stable with f16 than dividing afterwards.
+        scale = 1 / math.sqrt(math.sqrt(head_dim))
+        logits = th.einsum("bct,bcs->bts", query * scale, key * scale)
+        probs = th.softmax(logits.float(), dim=-1).type(logits.dtype)
+        attended = th.einsum("bts,bcs->bct", probs, value)
+        return attended.reshape(batch, -1, seq_len)
+
+
+class QKVAttention(nn.Module):
+    """
+    QKV attention in the newer layout: the input is split into q, k, v first, then into heads.
+    """
+
+    def __init__(self, n_heads: int):
+        super().__init__()
+        self.n_heads = n_heads
+
+    def forward(self, qkv: th.Tensor) -> th.Tensor:
+        """
+        Run scaled dot-product attention over the last axis.
+        :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
+        :return: an [N x (H * C) x T] tensor after attention.
+        """
+        batch, width, seq_len = qkv.shape
+        assert width % (3 * self.n_heads) == 0
+        head_dim = width // (3 * self.n_heads)
+        head_shape = (batch * self.n_heads, head_dim, seq_len)
+        query, key, value = qkv.chunk(3, dim=1)
+        # Scaling q and k separately is more stable with f16 than dividing afterwards.
+        scale = 1 / math.sqrt(math.sqrt(head_dim))
+        query = (query * scale).view(*head_shape)
+        key = (key * scale).view(*head_shape)
+        logits = th.einsum("bct,bcs->bts", query, key)
+        probs = th.softmax(logits.float(), dim=-1).type(logits.dtype)
+        attended = th.einsum("bts,bcs->bct", probs, value.reshape(*head_shape))
+        return attended.reshape(batch, -1, seq_len)
+
+
+class Timestep(nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, t: th.Tensor) -> th.Tensor:
+        return timestep_embedding(t, self.dim)
+
+
+class UNetModel(nn.Module):
+    """
+    The full UNet model with attention and timestep embedding.
+    :param in_channels: channels in the input Tensor.
+    :param model_channels: base channel count for the model.
+    :param out_channels: channels in the output Tensor.
+    :param num_res_blocks: number of residual blocks per downsample.
+    :param attention_resolutions: a collection of downsample rates at which
+        attention will take place. May be a set, list, or tuple.
+        For example, if this contains 4, then at 4x downsampling, attention
+        will be used.
+    :param dropout: the dropout probability.
+    :param channel_mult: channel multiplier for each level of the UNet.
+    :param conv_resample: if True, use learned convolutions for upsampling and
+        downsampling.
+    :param dims: determines if the signal is 1D, 2D, or 3D.
+    :param num_classes: if specified (as an int), then this model will be
+        class-conditional with `num_classes` classes.
+    :param use_checkpoint: use gradient checkpointing to reduce memory usage.
+    :param num_heads: the number of attention heads in each attention layer.
+    :param num_heads_channels: if specified, ignore num_heads and instead use
+                               a fixed channel width per attention head.
+    :param num_heads_upsample: works with num_heads to set a different number
+                               of heads for upsampling. Deprecated.
+    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
+    :param resblock_updown: use residual blocks for up/downsampling.
+    :param use_new_attention_order: use a different attention pattern for potentially
+                                    increased efficiency.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        model_channels: int,
+        out_channels: int,
+        num_res_blocks: int,
+        attention_resolutions: int,
+        dropout: float = 0.0,
+        channel_mult: Union[List, Tuple] = (1, 2, 4, 8),
+        conv_resample: bool = True,
+        dims: int = 2,
+        num_classes: Optional[Union[int, str]] = None,
+        use_checkpoint: bool = False,
+        num_heads: int = -1,
+        num_head_channels: int = -1,
+        num_heads_upsample: int = -1,
+        use_scale_shift_norm: bool = False,
+        resblock_updown: bool = False,
+        transformer_depth: int = 1,
+        context_dim: Optional[int] = None,
+        disable_self_attentions: Optional[List[bool]] = None,
+        num_attention_blocks: Optional[List[int]] = None,
+        disable_middle_self_attn: bool = False,
+        disable_middle_transformer: bool = False,
+        use_linear_in_transformer: bool = False,
+        spatial_transformer_attn_type: str = "softmax",
+        adm_in_channels: Optional[int] = None,
+    ):
+        super().__init__()
+
+        if num_heads_upsample == -1:
+            num_heads_upsample = num_heads
+
+        if num_heads == -1:
+            assert (
+                num_head_channels != -1
+            ), "Either num_heads or num_head_channels has to be set"
+
+        if num_head_channels == -1:
+            assert (
+                num_heads != -1
+            ), "Either num_heads or num_head_channels has to be set"
+
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.out_channels = out_channels
+        if isinstance(transformer_depth, int):
+            transformer_depth = len(channel_mult) * [transformer_depth]
+        transformer_depth_middle = transformer_depth[-1]
+
+        if isinstance(num_res_blocks, int):
+            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
+        else:
+            if len(num_res_blocks) != len(channel_mult):
+                raise ValueError(
+                    "provide num_res_blocks either as an int (globally constant) or "
+                    "as a list/tuple (per-level) with the same length as channel_mult"
+                )
+            self.num_res_blocks = num_res_blocks
+
+        if disable_self_attentions is not None:
+            assert len(disable_self_attentions) == len(channel_mult)
+        if num_attention_blocks is not None:
+            assert len(num_attention_blocks) == len(self.num_res_blocks)
+            assert all(
+                map(
+                    lambda i: self.num_res_blocks[i] >= num_attention_blocks[i],
+                    range(len(num_attention_blocks)),
+                )
+            )
+            logpy.info(
+                f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
+                f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
+                f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
+                f"attention will still not be set."
+            )
+
+        self.attention_resolutions = attention_resolutions
+        self.dropout = dropout
+        self.channel_mult = channel_mult
+        self.conv_resample = conv_resample
+        self.num_classes = num_classes
+        self.use_checkpoint = use_checkpoint
+        self.num_heads = num_heads
+        self.num_head_channels = num_head_channels
+        self.num_heads_upsample = num_heads_upsample
+
+        time_embed_dim = model_channels * 4
+        self.time_embed = nn.Sequential(
+            linear(model_channels, time_embed_dim),
+            nn.SiLU(),
+            linear(time_embed_dim, time_embed_dim),
+        )
+
+        if self.num_classes is not None:
+            if isinstance(self.num_classes, int):
+                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
+            elif self.num_classes == "continuous":
+                logpy.info("setting up linear c_adm embedding layer")
+                self.label_emb = nn.Linear(1, time_embed_dim)
+            elif self.num_classes == "timestep":
+                self.label_emb = nn.Sequential(
+                    Timestep(model_channels),
+                    nn.Sequential(
+                        linear(model_channels, time_embed_dim),
+                        nn.SiLU(),
+                        linear(time_embed_dim, time_embed_dim),
+                    ),
+                )
+            elif self.num_classes == "sequential":
+                assert adm_in_channels is not None
+                self.label_emb = nn.Sequential(
+                    nn.Sequential(
+                        linear(adm_in_channels, time_embed_dim),
+                        nn.SiLU(),
+                        linear(time_embed_dim, time_embed_dim),
+                    )
+                )
+            else:
+                raise ValueError
+
+        self.input_blocks = nn.ModuleList(
+            [
+                TimestepEmbedSequential(
+                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
+                )
+            ]
+        )
+        self._feature_size = model_channels
+        input_block_chans = [model_channels]
+        ch = model_channels
+        ds = 1
+        for level, mult in enumerate(channel_mult):
+            for nr in range(self.num_res_blocks[level]):
+                layers = [
+                    ResBlock(
+                        ch,
+                        time_embed_dim,
+                        dropout,
+                        out_channels=mult * model_channels,
+                        dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                    )
+                ]
+                ch = mult * model_channels
+                if ds in attention_resolutions:
+                    if num_head_channels == -1:
+                        dim_head = ch // num_heads
+                    else:
+                        num_heads = ch // num_head_channels
+                        dim_head = num_head_channels
+
+                    if context_dim is not None and exists(disable_self_attentions):
+                        disabled_sa = disable_self_attentions[level]
+                    else:
+                        disabled_sa = False
+
+                    if (
+                        not exists(num_attention_blocks)
+                        or nr < num_attention_blocks[level]
+                    ):
+                        layers.append(
+                            SpatialTransformer(
+                                ch,
+                                num_heads,
+                                dim_head,
+                                depth=transformer_depth[level],
+                                context_dim=context_dim,
+                                disable_self_attn=disabled_sa,
+                                use_linear=use_linear_in_transformer,
+                                attn_type=spatial_transformer_attn_type,
+                                use_checkpoint=use_checkpoint,
+                            )
+                        )
+                self.input_blocks.append(TimestepEmbedSequential(*layers))
+                self._feature_size += ch
+                input_block_chans.append(ch)
+            if level != len(channel_mult) - 1:
+                out_ch = ch
+                self.input_blocks.append(
+                    TimestepEmbedSequential(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=out_ch,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            down=True,
+                        )
+                        if resblock_updown
+                        else Downsample(
+                            ch, conv_resample, dims=dims, out_channels=out_ch
+                        )
+                    )
+                )
+                ch = out_ch
+                input_block_chans.append(ch)
+                ds *= 2
+                self._feature_size += ch
+
+        if num_head_channels == -1:
+            dim_head = ch // num_heads
+        else:
+            num_heads = ch // num_head_channels
+            dim_head = num_head_channels
+
+        self.middle_block = TimestepEmbedSequential(
+            ResBlock(
+                ch,
+                time_embed_dim,
+                dropout,
+                out_channels=ch,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+            SpatialTransformer(
+                ch,
+                num_heads,
+                dim_head,
+                depth=transformer_depth_middle,
+                context_dim=context_dim,
+                disable_self_attn=disable_middle_self_attn,
+                use_linear=use_linear_in_transformer,
+                attn_type=spatial_transformer_attn_type,
+                use_checkpoint=use_checkpoint,
+            )
+            if not disable_middle_transformer
+            else th.nn.Identity(),
+            ResBlock(
+                ch,
+                time_embed_dim,
+                dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+        )
+        self._feature_size += ch
+
+        self.output_blocks = nn.ModuleList([])
+        for level, mult in list(enumerate(channel_mult))[::-1]:
+            for i in range(self.num_res_blocks[level] + 1):
+                ich = input_block_chans.pop()
+                layers = [
+                    ResBlock(
+                        ch + ich,
+                        time_embed_dim,
+                        dropout,
+                        out_channels=model_channels * mult,
+                        dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                    )
+                ]
+                ch = model_channels * mult
+                if ds in attention_resolutions:
+                    if num_head_channels == -1:
+                        dim_head = ch // num_heads
+                    else:
+                        num_heads = ch // num_head_channels
+                        dim_head = num_head_channels
+
+                    if exists(disable_self_attentions):
+                        disabled_sa = disable_self_attentions[level]
+                    else:
+                        disabled_sa = False
+
+                    if (
+                        not exists(num_attention_blocks)
+                        or i < num_attention_blocks[level]
+                    ):
+                        layers.append(
+                            SpatialTransformer(
+                                ch,
+                                num_heads,
+                                dim_head,
+                                depth=transformer_depth[level],
+                                context_dim=context_dim,
+                                disable_self_attn=disabled_sa,
+                                use_linear=use_linear_in_transformer,
+                                attn_type=spatial_transformer_attn_type,
+                                use_checkpoint=use_checkpoint,
+                            )
+                        )
+                if level and i == self.num_res_blocks[level]:
+                    out_ch = ch
+                    layers.append(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=out_ch,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            up=True,
+                        )
+                        if resblock_updown
+                        else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
+                    )
+                    ds //= 2
+                self.output_blocks.append(TimestepEmbedSequential(*layers))
+                self._feature_size += ch
+
+        self.out = nn.Sequential(
+            normalization(ch),
+            nn.SiLU(),
+            zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
+        )
+
+    def forward(
+        self,
+        x: th.Tensor,
+        timesteps: Optional[th.Tensor] = None,
+        context: Optional[th.Tensor] = None,
+        y: Optional[th.Tensor] = None,
+        **kwargs,
+    ) -> th.Tensor:
+        """
+        Apply the model to an input batch.
+        :param x: an [N x C x ...] Tensor of inputs.
+        :param timesteps: a 1-D batch of timesteps.
+        :param context: conditioning plugged in via crossattn
+        :param y: an [N] Tensor of labels, if class-conditional.
+        :return: an [N x C x ...] Tensor of outputs.
+        """
+        assert (y is not None) == (
+            self.num_classes is not None
+        ), "must specify y if and only if the model is class-conditional"
+        hs = []
+        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
+        emb = self.time_embed(t_emb)
+
+        if self.num_classes is not None:
+            assert y.shape[0] == x.shape[0]
+            emb = emb + self.label_emb(y)
+
+        h = x
+        for module in self.input_blocks:
+            h = module(h, emb, context)
+            hs.append(h)
+        h = self.middle_block(h, emb, context)
+        for module in self.output_blocks:
+            h = th.cat([h, hs.pop()], dim=1)
+            h = module(h, emb, context)
+        h = h.type(x.dtype)
+
+        return self.out(h)
diff --git a/sgm/modules/diffusionmodules/sampling.py b/sgm/modules/diffusionmodules/sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..af07566d599fdd6f255f8b9fd4592a962b0d2ace
--- /dev/null
+++ b/sgm/modules/diffusionmodules/sampling.py
@@ -0,0 +1,362 @@
+"""
+    Partially ported from https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/sampling.py
+"""
+
+
+from typing import Dict, Union
+
+import torch
+from omegaconf import ListConfig, OmegaConf
+from tqdm import tqdm
+
+from ...modules.diffusionmodules.sampling_utils import (get_ancestral_step,
+                                                        linear_multistep_coeff,
+                                                        to_d, to_neg_log_sigma,
+                                                        to_sigma)
+from ...util import append_dims, default, instantiate_from_config
+
+DEFAULT_GUIDER = {"target": "sgm.modules.diffusionmodules.guiders.IdentityGuider"}
+
+
+class BaseDiffusionSampler:
+    """Shared setup for samplers: sigma discretization, guider, and loop helpers."""
+
+    def __init__(
+        self,
+        discretization_config: Union[Dict, ListConfig, OmegaConf],
+        num_steps: Union[int, None] = None,
+        guider_config: Union[Dict, ListConfig, OmegaConf, None] = None,
+        verbose: bool = False,
+        device: str = "cuda",
+    ):
+        self.num_steps = num_steps
+        self.discretization = instantiate_from_config(discretization_config)
+        self.guider = instantiate_from_config(default(guider_config, DEFAULT_GUIDER))
+        self.verbose = verbose
+        self.device = device
+
+    def prepare_sampling_loop(self, x, cond, uc=None, num_steps=None):
+        """Build the sigma schedule and scale `x` by sqrt(1 + sigmas[0]**2).
+
+        Returns ``(x, s_in, sigmas, num_sigmas, cond, uc)``; `uc` is resolved with
+        ``default(uc, cond)`` and `s_in` is a vector of ones of length ``x.shape[0]``.
+        """
+        steps = self.num_steps if num_steps is None else num_steps
+        sigmas = self.discretization(steps, device=self.device)
+        uc = default(uc, cond)
+        # Out-of-place on purpose: `x *= ...` would also rescale the caller's tensor.
+        x = x * torch.sqrt(1.0 + sigmas[0] ** 2.0)
+        s_in = x.new_ones([x.shape[0]])
+        return x, s_in, sigmas, len(sigmas), cond, uc
+
+    def denoise(self, x, denoiser, sigma, cond, uc):
+        """Call `denoiser` on guider-prepared inputs and post-process with the guider."""
+        denoised = denoiser(*self.guider.prepare_inputs(x, sigma, cond, uc))
+        return self.guider(denoised, sigma)
+
+    def get_sigma_gen(self, num_sigmas):
+        """Return an iterable over the step indices ``0 .. num_sigmas - 2``.
+
+        With `verbose` set, prints the sampler setup and wraps the range in tqdm.
+        """
+        sigma_generator = range(num_sigmas - 1)
+        if self.verbose:
+            print("#" * 30, " Sampling setting ", "#" * 30)
+            print(f"Sampler: {self.__class__.__name__}")
+            print(f"Discretization: {self.discretization.__class__.__name__}")
+            print(f"Guider: {self.guider.__class__.__name__}")
+            # total must equal the range length, or the bar never reaches 100%.
+            desc = f"Sampling with {self.__class__.__name__} for {num_sigmas} steps"
+            sigma_generator = tqdm(sigma_generator, total=num_sigmas - 1, desc=desc)
+        return sigma_generator
+
+
+class SingleStepDiffusionSampler(BaseDiffusionSampler):
+    def sampler_step(self, sigma, next_sigma, denoiser, x, cond, uc, *args, **kwargs):
+        raise NotImplementedError
+
+    def euler_step(self, x, d, dt):
+        return x + dt * d
+
+
+class EDMSampler(SingleStepDiffusionSampler):
+    """Single-step sampler that can inject extra noise ("churn") before each step."""
+
+    def __init__(
+        self, s_churn=0.0, s_tmin=0.0, s_tmax=float("inf"), s_noise=1.0, *args, **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+        # s_churn / s_tmin / s_tmax drive gamma in __call__; s_noise scales the added noise.
+        self.s_churn = s_churn
+        self.s_tmin = s_tmin
+        self.s_tmax = s_tmax
+        self.s_noise = s_noise
+
+    def sampler_step(self, sigma, next_sigma, denoiser, x, cond, uc=None, gamma=0.0):
+        """Advance `x` from `sigma` to `next_sigma`.
+
+        The step starts from sigma_hat = sigma * (1 + gamma); noise is added if gamma > 0.
+        """
+        sigma_hat = sigma * (gamma + 1.0)
+        if gamma > 0:
+            # Inject noise with std sqrt(sigma_hat^2 - sigma^2), scaled by s_noise.
+            added_std = append_dims(sigma_hat**2 - sigma**2, x.ndim) ** 0.5
+            x = x + torch.randn_like(x) * self.s_noise * added_std
+
+        denoised = self.denoise(x, denoiser, sigma_hat, cond, uc)
+        d = to_d(x, sigma_hat, denoised)
+        dt = append_dims(next_sigma - sigma_hat, x.ndim)
+        predictor = self.euler_step(x, d, dt)
+        # Subclasses decide whether the Euler prediction gets corrected.
+        return self.possible_correction_step(
+            predictor, x, d, dt, next_sigma, denoiser, cond, uc
+        )
+
+    def __call__(self, denoiser, x, cond, uc=None, num_steps=None):
+        """Run the sampling loop over the sigma schedule and return the final `x`."""
+        x, s_in, sigmas, num_sigmas, cond, uc = self.prepare_sampling_loop(
+            x, cond, uc, num_steps
+        )
+
+        for i in self.get_sigma_gen(num_sigmas):
+            # Churn only applies while sigma lies inside [s_tmin, s_tmax].
+            if self.s_tmin <= sigmas[i] <= self.s_tmax:
+                gamma = min(self.s_churn / (num_sigmas - 1), 2**0.5 - 1)
+            else:
+                gamma = 0.0
+            sigma, next_sigma = s_in * sigmas[i], s_in * sigmas[i + 1]
+            x = self.sampler_step(sigma, next_sigma, denoiser, x, cond, uc, gamma)
+
+        return x
+
+
+class AncestralSampler(SingleStepDiffusionSampler):
+    """Base for samplers that step down to a lower sigma and then re-add noise."""
+
+    def __init__(self, eta=1.0, s_noise=1.0, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.eta = eta
+        self.s_noise = s_noise
+        # Draws standard-normal noise shaped like its argument.
+        self.noise_sampler = lambda x: torch.randn_like(x)
+
+    def ancestral_euler_step(self, x, denoised, sigma, sigma_down):
+        """Take one Euler step from `sigma` to `sigma_down`."""
+        d = to_d(x, sigma, denoised)
+        dt = append_dims(sigma_down - sigma, x.ndim)
+
+        return self.euler_step(x, d, dt)
+
+    def ancestral_step(self, x, sigma, next_sigma, sigma_up):
+        """Add noise of scale `s_noise * sigma_up` wherever `next_sigma` is positive.
+
+        `sigma` is accepted but not used here.
+        """
+        keep_noising = append_dims(next_sigma, x.ndim) > 0.0
+        noise = self.noise_sampler(x) * self.s_noise * append_dims(sigma_up, x.ndim)
+        return torch.where(keep_noising, x + noise, x)
+
+    def __call__(self, denoiser, x, cond, uc=None, num_steps=None):
+        """Run the sampling loop over the sigma schedule and return the final `x`."""
+        x, s_in, sigmas, num_sigmas, cond, uc = self.prepare_sampling_loop(
+            x, cond, uc, num_steps
+        )
+
+        for i in self.get_sigma_gen(num_sigmas):
+            sigma, next_sigma = s_in * sigmas[i], s_in * sigmas[i + 1]
+            x = self.sampler_step(sigma, next_sigma, denoiser, x, cond, uc)
+
+        return x
+
+
+class LinearMultistepSampler(BaseDiffusionSampler):
+    """Multistep sampler that combines up to `order` recent derivative estimates."""
+
+    def __init__(self, order=4, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.order = order
+
+    def __call__(self, denoiser, x, cond, uc=None, num_steps=None, **kwargs):
+        """Run the sampling loop and return the final `x`.
+
+        Extra keyword arguments are forwarded to `denoiser` on every step.
+        """
+        x, s_in, sigmas, num_sigmas, cond, uc = self.prepare_sampling_loop(
+            x, cond, uc, num_steps
+        )
+
+        history = []  # derivative estimates, newest last, at most `order` kept
+        sigmas_cpu = sigmas.detach().cpu().numpy()
+        for i in self.get_sigma_gen(num_sigmas):
+            sigma = s_in * sigmas[i]
+            model_inputs = self.guider.prepare_inputs(x, sigma, cond, uc)
+            denoised = self.guider(denoiser(*model_inputs, **kwargs), sigma)
+            history.append(to_d(x, sigma, denoised))
+            if len(history) > self.order:
+                history.pop(0)
+
+            # Early steps have fewer stored derivatives, so the order is capped by i + 1.
+            cur_order = min(i + 1, self.order)
+            update = 0
+            for j, past_d in zip(range(cur_order), reversed(history)):
+                coeff = linear_multistep_coeff(cur_order, sigmas_cpu, i, j)
+                update = update + coeff * past_d
+            x = x + update
+
+        return x
+
+
+class EulerEDMSampler(EDMSampler):
+    def possible_correction_step(
+        self, euler_step, x, d, dt, next_sigma, denoiser, cond, uc
+    ):
+        return euler_step
+
+
+class HeunEDMSampler(EDMSampler):
+    """EDM sampler whose Euler prediction is refined by averaging two slopes."""
+
+    def possible_correction_step(
+        self, euler_step, x, d, dt, next_sigma, denoiser, cond, uc
+    ):
+        """Return the corrected step, or `euler_step` where `next_sigma` is not positive."""
+        if torch.sum(next_sigma) < 1e-14:
+            # Every noise level is (numerically) zero: skip the extra network call.
+            return euler_step
+
+        denoised = self.denoise(euler_step, denoiser, next_sigma, cond, uc)
+        d_new = to_d(euler_step, next_sigma, denoised)
+        d_avg = (d + d_new) / 2.0
+        # Per-element fallback to the plain Euler result where next_sigma is not positive.
+        needs_fix = append_dims(next_sigma, x.ndim) > 0.0
+        return torch.where(needs_fix, x + d_avg * dt, euler_step)
+
+
+class EulerAncestralSampler(AncestralSampler):
+    def sampler_step(self, sigma, next_sigma, denoiser, x, cond, uc):
+        sigma_down, sigma_up = get_ancestral_step(sigma, next_sigma, eta=self.eta)
+        denoised = self.denoise(x, denoiser, sigma, cond, uc)
+        x = self.ancestral_euler_step(x, denoised, sigma, sigma_down)
+        x = self.ancestral_step(x, sigma, next_sigma, sigma_up)
+
+        return x
+
+
+class DPMPP2SAncestralSampler(AncestralSampler):
+    """Ancestral sampler using a two-stage update evaluated at a log-sigma midpoint."""
+
+    def get_variables(self, sigma, sigma_down):
+        """Return (h, s, t, t_next) in negative-log-sigma time; `s` is the midpoint."""
+        t = to_neg_log_sigma(sigma)
+        t_next = to_neg_log_sigma(sigma_down)
+        h = t_next - t
+        return h, t + 0.5 * h, t, t_next
+
+    def get_mult(self, h, s, t, t_next):
+        """Return the four coefficients consumed by `sampler_step`."""
+        half_ratio = to_sigma(s) / to_sigma(t)
+        half_expm1 = (-0.5 * h).expm1()
+        full_ratio = to_sigma(t_next) / to_sigma(t)
+        full_expm1 = (-h).expm1()
+        return half_ratio, half_expm1, full_ratio, full_expm1
+
+    def sampler_step(self, sigma, next_sigma, denoiser, x, cond, uc=None, **kwargs):
+        """Advance `x` from `sigma` to `next_sigma`; extra `kwargs` are ignored."""
+        sigma_down, sigma_up = get_ancestral_step(sigma, next_sigma, eta=self.eta)
+        denoised = self.denoise(x, denoiser, sigma, cond, uc)
+        x_euler = self.ancestral_euler_step(x, denoised, sigma, sigma_down)
+
+        if torch.sum(sigma_down) < 1e-14:
+            # Every target noise level is (numerically) zero: skip the second call.
+            x = x_euler
+        else:
+            h, s, t, t_next = self.get_variables(sigma, sigma_down)
+            coeffs = self.get_mult(h, s, t, t_next)
+            m1, m2, m3, m4 = (append_dims(c, x.ndim) for c in coeffs)
+            x_mid = m1 * x - m2 * denoised
+            denoised_mid = self.denoise(x_mid, denoiser, to_sigma(s), cond, uc)
+            x_dpmpp2s = m3 * x - m4 * denoised_mid
+            # Keep the Euler result for elements whose sigma_down is not positive.
+            x = torch.where(append_dims(sigma_down, x.ndim) > 0.0, x_dpmpp2s, x_euler)
+
+        return self.ancestral_step(x, sigma, next_sigma, sigma_up)
+
+
+class DPMPP2MSampler(BaseDiffusionSampler):
+    """Multistep sampler that reuses the previous step's denoised output."""
+
+    def get_variables(self, sigma, next_sigma, previous_sigma=None):
+        """Return (h, r, t, t_next) in neg-log-sigma time; r is None without previous_sigma."""
+        t = to_neg_log_sigma(sigma)
+        t_next = to_neg_log_sigma(next_sigma)
+        h = t_next - t
+        r = None
+        if previous_sigma is not None:
+            r = (t - to_neg_log_sigma(previous_sigma)) / h
+        return h, r, t, t_next
+
+    def get_mult(self, h, r, t, t_next, previous_sigma):
+        """Return 2 coefficients when `previous_sigma` is None, otherwise 4."""
+        mult1 = to_sigma(t_next) / to_sigma(t)
+        mult2 = (-h).expm1()
+        if previous_sigma is None:
+            return mult1, mult2
+
+        mult3 = 1 + 1 / (2 * r)
+        mult4 = 1 / (2 * r)
+        return mult1, mult2, mult3, mult4
+
+    def sampler_step(
+        self,
+        old_denoised,
+        previous_sigma,
+        sigma,
+        next_sigma,
+        denoiser,
+        x,
+        cond,
+        uc=None,
+    ):
+        """Advance `x` one step; returns ``(new_x, denoised)`` for the next call."""
+        denoised = self.denoise(x, denoiser, sigma, cond, uc)
+
+        h, r, t, t_next = self.get_variables(sigma, next_sigma, previous_sigma)
+        mult = [
+            append_dims(m, x.ndim)
+            for m in self.get_mult(h, r, t, t_next, previous_sigma)
+        ]
+
+        x_standard = mult[0] * x - mult[1] * denoised
+        if old_denoised is None or torch.sum(next_sigma) < 1e-14:
+            # First step, or every next noise level is (numerically) zero.
+            return x_standard, denoised
+
+        denoised_d = mult[2] * denoised - mult[3] * old_denoised
+        x_advanced = mult[0] * x - mult[1] * denoised_d
+        # Use the multistep result only where next_sigma is positive.
+        use_advanced = append_dims(next_sigma, x.ndim) > 0.0
+        return torch.where(use_advanced, x_advanced, x_standard), denoised
+
+    def __call__(self, denoiser, x, cond, uc=None, num_steps=None, **kwargs):
+        """Run the sampling loop and return the final `x`; `kwargs` are unused."""
+        x, s_in, sigmas, num_sigmas, cond, uc = self.prepare_sampling_loop(
+            x, cond, uc, num_steps
+        )
+
+        old_denoised = None
+        for i in self.get_sigma_gen(num_sigmas):
+            previous_sigma = None if i == 0 else s_in * sigmas[i - 1]
+            x, old_denoised = self.sampler_step(
+                old_denoised,
+                previous_sigma,
+                s_in * sigmas[i],
+                s_in * sigmas[i + 1],
+                denoiser,
+                x,
+                cond,
+                uc=uc,
+            )
+
+        return x
diff --git a/sgm/modules/diffusionmodules/sampling_utils.py b/sgm/modules/diffusionmodules/sampling_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce78527ea9052a8bfd0856ed2278901516fb9130
--- /dev/null
+++ b/sgm/modules/diffusionmodules/sampling_utils.py
@@ -0,0 +1,43 @@
+import torch
+from scipy import integrate
+
+from ...util import append_dims
+
+
+def linear_multistep_coeff(order, t, i, j, epsrel=1e-4):
+    """Integrate the j-th Lagrange basis polynomial over [t[i], t[i + 1]]."""
+    if i < order - 1:
+        raise ValueError(f"Order {order} too high for step {i}")
+    t_j = t[i - j]
+    nodes = [t[i - k] for k in range(order) if k != j]
+
+    def fn(tau):
+        prod = 1.0
+        for t_k in nodes:
+            prod *= (tau - t_k) / (t_j - t_k)
+        return prod
+    return integrate.quad(fn, t[i], t[i + 1], epsrel=epsrel)[0]
+
+
+def get_ancestral_step(sigma_from, sigma_to, eta=1.0):
+    """Return (sigma_down, sigma_up) with sigma_down**2 + sigma_up**2 == sigma_to**2."""
+    if not eta:
+        return sigma_to, 0.0
+    var_from, var_to = sigma_from**2, sigma_to**2
+    # Noise to re-inject, scaled by eta and never larger than sigma_to itself.
+    uncapped_up = eta * (var_to * (var_from - var_to) / var_from) ** 0.5
+    sigma_up = torch.minimum(sigma_to, uncapped_up)
+    sigma_down = (var_to - sigma_up**2) ** 0.5
+    return sigma_down, sigma_up
+
+
+def to_d(x, sigma, denoised):  # (x - denoised) / sigma, sigma passed through append_dims
+    return (x - denoised) / append_dims(sigma, x.ndim)
+
+
+def to_neg_log_sigma(sigma):  # sigma -> -log(sigma)
+    return -torch.log(sigma)
+
+
+def to_sigma(neg_log_sigma):  # neg_log_sigma -> exp(-neg_log_sigma)
+    return torch.exp(-neg_log_sigma)
diff --git a/sgm/modules/diffusionmodules/sigma_sampling.py b/sgm/modules/diffusionmodules/sigma_sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..d54724c6ef6a7b8067784a4192b0fe2f41123063
--- /dev/null
+++ b/sgm/modules/diffusionmodules/sigma_sampling.py
@@ -0,0 +1,31 @@
+import torch
+
+from ...util import default, instantiate_from_config
+
+
+class EDMSampling:
+    def __init__(self, p_mean=-1.2, p_std=1.2):
+        self.p_mean, self.p_std = p_mean, p_std  # shift / scale applied to log(sigma)
+
+    def __call__(self, n_samples, rand=None):
+        z = default(rand, torch.randn((n_samples,)))  # randn runs even if rand is given
+        log_sigma = self.p_mean + self.p_std * z
+        return log_sigma.exp()
+
+
+class DiscreteSampling:
+    """Pick sigmas by index from a table built by a configured discretization."""
+
+    def __init__(self, discretization_config, num_idx, do_append_zero=False, flip=True):
+        discretization = instantiate_from_config(discretization_config)
+        self.num_idx = num_idx
+        self.sigmas = discretization(num_idx, do_append_zero=do_append_zero, flip=flip)
+
+    def idx_to_sigma(self, idx):
+        """Look up the entry (or entries) of the sigma table at `idx`."""
+        return self.sigmas[idx]
+
+    def __call__(self, n_samples, rand=None):
+        # Presumably `default` returns `rand` when it is given — confirm in util.
+        random_idx = torch.randint(0, self.num_idx, (n_samples,))
+        return self.idx_to_sigma(default(rand, random_idx))
diff --git a/sgm/modules/diffusionmodules/util.py b/sgm/modules/diffusionmodules/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c63e122781a9626c36f1aac859e23890e9f44a4
--- /dev/null
+++ b/sgm/modules/diffusionmodules/util.py
@@ -0,0 +1,371 @@
+"""
+partially adopted from
+https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+and
+https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
+and
+https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
+
+thanks!
+"""
+
+import math
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from einops import rearrange, repeat
+
+
+def make_beta_schedule(
+    schedule,
+    n_timestep,
+    linear_start=1e-4,
+    linear_end=2e-2,
+):
+    """Return `n_timestep` float64 betas (NumPy array), linear in sqrt(beta)."""
+    if schedule != "linear":
+        # Only "linear" exists; fail clearly instead of hitting an unbound `betas`.
+        raise ValueError(f"unknown beta schedule: {schedule!r}")
+    sqrt_betas = torch.linspace(
+        linear_start**0.5, linear_end**0.5, n_timestep, dtype=torch.float64
+    )
+    return (sqrt_betas**2).numpy()
+
+
+def extract_into_tensor(a, t, x_shape):
+    """Gather `a` at indices `t` (last dim); reshape to (t.shape[0], 1, ..., 1)."""
+    singleton_dims = [1] * (len(x_shape) - 1)
+    return torch.gather(a, -1, t).reshape(t.shape[0], *singleton_dims)
+
+
+def mixed_checkpoint(func, inputs: dict, params, flag):
+    """
+    Call `func(**inputs)`, optionally without caching intermediate activations:
+    reduced memory at the expense of extra compute in the backward pass. Unlike the
+    original checkpoint function borrowed from
+    https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
+    this variant also works with non-tensor inputs.
+    :param func: the function to evaluate.
+    :param inputs: the argument dictionary to pass to `func`.
+    :param params: a sequence of parameters `func` depends on but does not
+                   explicitly take as arguments.
+    :param flag: if False, disable gradient checkpointing.
+    """
+    if not flag:
+        return func(**inputs)
+    # Split the keyword arguments by type, keeping dict order within each group.
+    tensor_keys, tensor_inputs = [], []
+    non_tensor_keys, non_tensor_inputs = [], []
+    for key, value in inputs.items():
+        if isinstance(value, torch.Tensor):
+            tensor_keys.append(key)
+            tensor_inputs.append(value)
+        else:
+            non_tensor_keys.append(key)
+            non_tensor_inputs.append(value)
+    # Flat positional layout: tensor inputs, non-tensor inputs, then params.
+    return MixedCheckpointFunction.apply(
+        func,
+        len(tensor_inputs),
+        len(non_tensor_inputs),
+        tensor_keys,
+        non_tensor_keys,
+        *tensor_inputs, *non_tensor_inputs, *params,
+    )
+
+
+class MixedCheckpointFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        run_function,
+        length_tensors,
+        length_non_tensors,
+        tensor_keys,
+        non_tensor_keys,
+        *args,  # tensor inputs, then non-tensor inputs, then params
+    ):
+        ctx.end_tensors = length_tensors
+        ctx.end_non_tensors = length_tensors + length_non_tensors
+        ctx.gpu_autocast_kwargs = {  # autocast state captured now, reused in backward
+            "enabled": torch.is_autocast_enabled(),
+            "dtype": torch.get_autocast_gpu_dtype(),
+            "cache_enabled": torch.is_autocast_cache_enabled(),
+        }
+        assert (
+            len(tensor_keys) == length_tensors
+            and len(non_tensor_keys) == length_non_tensors
+        )
+        # Rebuild the keyword dicts from the flat positional layout of `args`.
+        ctx.input_tensors = {
+            key: val for (key, val) in zip(tensor_keys, list(args[: ctx.end_tensors]))
+        }
+        ctx.input_non_tensors = {
+            key: val
+            for (key, val) in zip(
+                non_tensor_keys, list(args[ctx.end_tensors : ctx.end_non_tensors])
+            )
+        }
+        ctx.run_function = run_function
+        ctx.input_params = list(args[ctx.end_non_tensors :])
+        # Run without recording a graph; backward calls run_function again.
+        with torch.no_grad():
+            output_tensors = ctx.run_function(
+                **ctx.input_tensors, **ctx.input_non_tensors
+            )
+        return output_tensors
+
+    @staticmethod
+    def backward(ctx, *output_grads):
+        # Detach the saved tensor inputs and mark them as requiring grad.
+        ctx.input_tensors = {
+            key: ctx.input_tensors[key].detach().requires_grad_(True)
+            for key in ctx.input_tensors
+        }
+
+        with torch.enable_grad(), torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs):
+            # Fixes a bug where the first op in run_function modifies the
+            # Tensor storage in place, which is not allowed for detach()'d
+            # Tensors.
+            shallow_copies = {
+                key: ctx.input_tensors[key].view_as(ctx.input_tensors[key])
+                for key in ctx.input_tensors
+            }
+            # Second call of run_function, this time with grad enabled.
+            output_tensors = ctx.run_function(**shallow_copies, **ctx.input_non_tensors)
+        input_grads = torch.autograd.grad(
+            output_tensors,
+            list(ctx.input_tensors.values()) + ctx.input_params,
+            output_grads,
+            allow_unused=True,
+        )
+        del ctx.input_tensors
+        del ctx.input_params
+        del output_tensors
+        return (
+            (None, None, None, None, None)  # run_function, both lengths, both key lists
+            + input_grads[: ctx.end_tensors]  # grads of the tensor inputs
+            + (None,) * (ctx.end_non_tensors - ctx.end_tensors)  # non-tensor inputs
+            + input_grads[ctx.end_tensors :]  # grads of the params
+        )
+
+
+ckpt = torch.utils.checkpoint.checkpoint  # short alias used by checkpoint() below
+def checkpoint(func, inputs, params, flag):
+    """
+    Evaluate a function without caching intermediate activations, allowing for
+    reduced memory at the expense of extra compute in the backward pass.
+    :param func: the function to evaluate.
+    :param inputs: the argument sequence to pass to `func`.
+    :param params: a sequence of parameters `func` depends on but does not
+                   explicitly take as arguments (not forwarded to `ckpt`).
+    :param flag: if False, disable gradient checkpointing.
+    """
+    if flag:
+        # Delegates to torch.utils.checkpoint; `params` is not passed along.
+        # Former path: CheckpointFunction.apply(func, len(inputs), *inputs, *params)
+        return ckpt(func, *inputs)
+    else:
+        return func(*inputs)
+
+
+class CheckpointFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, run_function, length, *args):  # args = inputs, then params
+        ctx.run_function = run_function
+        ctx.input_tensors = list(args[:length])  # first `length` args are the inputs
+        ctx.input_params = list(args[length:])  # the remainder are the params
+        ctx.gpu_autocast_kwargs = {  # autocast state captured now, reused in backward
+            "enabled": torch.is_autocast_enabled(),
+            "dtype": torch.get_autocast_gpu_dtype(),
+            "cache_enabled": torch.is_autocast_cache_enabled(),
+        }
+        with torch.no_grad():  # no graph is recorded; backward calls run_function again
+            output_tensors = ctx.run_function(*ctx.input_tensors)
+        return output_tensors
+
+    @staticmethod
+    def backward(ctx, *output_grads):
+        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
+        with torch.enable_grad(), torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs):
+            # Fixes a bug where the first op in run_function modifies the
+            # Tensor storage in place, which is not allowed for detach()'d
+            # Tensors.
+            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
+            output_tensors = ctx.run_function(*shallow_copies)  # second call, grad on
+        input_grads = torch.autograd.grad(
+            output_tensors,
+            ctx.input_tensors + ctx.input_params,
+            output_grads,
+            allow_unused=True,
+        )
+        del ctx.input_tensors
+        del ctx.input_params
+        del output_tensors
+        return (None, None) + input_grads  # no grads for run_function and length
+
+
+def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
+    """
+    Create sinusoidal timestep embeddings.
+    :param timesteps: a 1-D Tensor of N indices, one per batch element.
+                      These may be fractional.
+    :param dim: the dimension of the output.
+    :param max_period: controls the minimum frequency of the embeddings.
+    :param repeat_only: if True, copy each timestep into all `dim` columns instead.
+    :return: an [N x dim] Tensor of positional embeddings.
+    """
+    if repeat_only:
+        return repeat(timesteps, "b -> b d", d=dim)
+
+    # Frequencies fall geometrically from 1 towards 1 / max_period.
+    half = dim // 2
+    steps = torch.arange(start=0, end=half, dtype=torch.float32)
+    freqs = torch.exp(-math.log(max_period) * steps / half).to(device=timesteps.device)
+
+    angles = timesteps[:, None].float() * freqs[None]
+    embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
+    if dim % 2:
+        # Odd `dim`: append one zero column so the width is exactly `dim`.
+        zero_column = torch.zeros_like(embedding[:, :1])
+        embedding = torch.cat([embedding, zero_column], dim=-1)
+    return embedding
+
+
+def zero_module(module):
+    """Set every parameter of `module` to zero in place and return the module."""
+    with torch.no_grad():
+        for param in module.parameters():
+            param.zero_()
+
+    return module
+
+
+def scale_module(module, scale):
+    """Multiply every parameter of `module` by `scale` in place and return it."""
+    with torch.no_grad():
+        for param in module.parameters():
+            param.mul_(scale)
+
+    return module
+
+
+def mean_flat(tensor):
+    """
+    Average over every dimension except the first (batch) one.
+    """
+    return tensor.mean(dim=list(range(1, tensor.ndim)))
+
+
+def normalization(channels):
+    """
+    Build the default normalization layer: a GroupNorm32 with 32 groups.
+    :param channels: number of input channels.
+    :return: an nn.Module for normalization.
+    """
+    return GroupNorm32(num_groups=32, num_channels=channels)
+
+
+# Hand-written SiLU: PyTorch 1.7 has SiLU, but PyTorch 1.5 is supported too.
+class SiLU(nn.Module):
+    def forward(self, x):  # x * sigmoid(x)
+        return torch.mul(x, x.sigmoid())
+
+
+class GroupNorm32(nn.GroupNorm):
+    def forward(self, x):  # normalize in float32, then cast back to the input dtype
+        return super().forward(x.float()).to(x.dtype)
+
+
+def conv_nd(dims, *args, **kwargs):
+    """
+    Create a 1D, 2D, or 3D convolution module.
+    :param dims: spatial dimensionality; must be 1, 2 or 3.
+    :param args, kwargs: forwarded unchanged to the torch.nn conv constructor.
+    :raises ValueError: if `dims` is not 1, 2 or 3.
+    """
+    conv_classes = {1: nn.Conv1d, 2: nn.Conv2d, 3: nn.Conv3d}
+    if dims not in conv_classes:
+        raise ValueError(f"unsupported dimensions: {dims}")
+    return conv_classes[dims](*args, **kwargs)
+
+
+def linear(*args, **kwargs):
+    """
+    Create a linear module; all arguments are forwarded to nn.Linear.
+    """
+    return nn.Linear(*args, **kwargs)
+
+
+def avg_pool_nd(dims, *args, **kwargs):
+    """
+    Create a 1D, 2D, or 3D average pooling module.
+    :param dims: spatial dimensionality; must be 1, 2 or 3.
+    :param args, kwargs: forwarded unchanged to the torch.nn pooling constructor.
+    :raises ValueError: if `dims` is not 1, 2 or 3.
+    """
+    pool_classes = {1: nn.AvgPool1d, 2: nn.AvgPool2d, 3: nn.AvgPool3d}
+    if dims not in pool_classes:
+        raise ValueError(f"unsupported dimensions: {dims}")
+    return pool_classes[dims](*args, **kwargs)
+
+
+class AlphaBlender(nn.Module):
+    """Blend a spatial and a temporal tensor with a fixed or learned weight."""
+    strategies = ["learned", "fixed", "learned_with_images"]
+
+    def __init__(
+        self,
+        alpha: float,
+        merge_strategy: str = "learned_with_images",
+        rearrange_pattern: str = "b t -> (b t) 1 1",
+    ):
+        super().__init__()
+        self.merge_strategy = merge_strategy
+        self.rearrange_pattern = rearrange_pattern
+
+        assert (
+            merge_strategy in self.strategies
+        ), f"merge_strategy needs to be in {self.strategies}"
+
+        initial_value = torch.Tensor([alpha])
+        if merge_strategy == "fixed":
+            # Constant blend weight: kept as a buffer, so it is never trained.
+            self.register_buffer("mix_factor", initial_value)
+        elif merge_strategy in ("learned", "learned_with_images"):
+            # Trainable value; get_alpha passes it through a sigmoid.
+            self.mix_factor = torch.nn.Parameter(initial_value)
+        else:
+            raise ValueError(f"unknown merge strategy {self.merge_strategy}")
+
+    def get_alpha(self, image_only_indicator: torch.Tensor) -> torch.Tensor:
+        """Return the weight applied to the spatial branch in `forward`."""
+        if self.merge_strategy == "fixed":
+            return self.mix_factor
+        if self.merge_strategy == "learned":
+            return torch.sigmoid(self.mix_factor)
+        if self.merge_strategy != "learned_with_images":
+            raise NotImplementedError
+
+        assert image_only_indicator is not None, "need image_only_indicator ..."
+        # Weight 1 where the indicator is set, sigmoid(mix_factor) elsewhere.
+        alpha = torch.where(
+            image_only_indicator.bool(),
+            torch.ones(1, 1, device=image_only_indicator.device),
+            rearrange(torch.sigmoid(self.mix_factor), "... -> ... 1"),
+        )
+        return rearrange(alpha, self.rearrange_pattern)
+
+    def forward(
+        self,
+        x_spatial: torch.Tensor,
+        x_temporal: torch.Tensor,
+        image_only_indicator: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Blend as alpha * x_spatial + (1 - alpha) * x_temporal."""
+        alpha = self.get_alpha(image_only_indicator)
+        # Cast both weights to the spatial input's dtype before mixing.
+        spatial_weight = alpha.to(x_spatial.dtype)
+        temporal_weight = (1.0 - alpha).to(x_spatial.dtype)
+        return spatial_weight * x_spatial + temporal_weight * x_temporal
diff --git a/sgm/modules/diffusionmodules/video_model.py b/sgm/modules/diffusionmodules/video_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..71e8264f9e5ff00d0d56c00a00c5df9dfd70aaaf
--- /dev/null
+++ b/sgm/modules/diffusionmodules/video_model.py
@@ -0,0 +1,502 @@
+from functools import partial
+from typing import List, Optional, Union
+
+from einops import rearrange
+
+import torch
+from ...modules.diffusionmodules.openaimodel import *
+from ...modules.video_attention import SpatialVideoTransformer
+from ...util import default
+from .util import AlphaBlender
+
+
+class VideoResBlock(ResBlock):
+    def __init__(
+        self,
+        channels: int,
+        emb_channels: int,
+        dropout: float,
+        video_kernel_size: Union[int, List[int]] = 3,
+        merge_strategy: str = "fixed",
+        merge_factor: float = 0.5,
+        out_channels: Optional[int] = None,
+        use_conv: bool = False,
+        use_scale_shift_norm: bool = False,
+        dims: int = 2,
+        use_checkpoint: bool = False,
+        up: bool = False,
+        down: bool = False,
+    ):
+        super().__init__(
+            channels,
+            emb_channels,
+            dropout,
+            out_channels=out_channels,
+            use_conv=use_conv,
+            use_scale_shift_norm=use_scale_shift_norm,
+            dims=dims,
+            use_checkpoint=use_checkpoint,
+            up=up,
+            down=down,
+        )
+        # The temporal block takes and returns the same number of channels.
+        temporal_channels = default(out_channels, channels)
+        self.time_stack = ResBlock(
+            temporal_channels,
+            emb_channels,
+            dropout=dropout,
+            dims=3,
+            out_channels=temporal_channels,
+            use_scale_shift_norm=False,
+            use_conv=False,
+            up=False,
+            down=False,
+            kernel_size=video_kernel_size,
+            use_checkpoint=use_checkpoint,
+            exchange_temb_dims=True,
+        )
+        self.time_mixer = AlphaBlender(
+            alpha=merge_factor,
+            merge_strategy=merge_strategy,
+            rearrange_pattern="b t -> b 1 t 1 1",
+        )
+
+    def forward(
+        self,
+        x: th.Tensor,
+        emb: th.Tensor,
+        num_video_frames: int,
+        image_only_indicator: Optional[th.Tensor] = None,
+    ) -> th.Tensor:
+        # Spatial pass on stacked frames, then unfold the frame axis for time_stack.
+        spatial = super().forward(x, emb)
+        spatial = rearrange(spatial, "(b t) c h w -> b c t h w", t=num_video_frames)
+        frame_emb = rearrange(emb, "(b t) ... -> b t ...", t=num_video_frames)
+        temporal = self.time_stack(spatial, frame_emb)
+        blended = self.time_mixer(
+            x_spatial=spatial,
+            x_temporal=temporal,
+            image_only_indicator=image_only_indicator,
+        )
+        # Fold the frame axis back into the batch axis.
+        return rearrange(blended, "b c t h w -> (b t) c h w")
+
+
+class VideoUNet(nn.Module):
+    """U-Net built from VideoResBlock and SpatialVideoTransformer layers."""
+    def __init__(
+        self,
+        in_channels: int,
+        model_channels: int,
+        out_channels: int,
+        num_frames: int,
+        num_res_blocks: int,
+        attention_resolutions: List[int],  # downsample rates that get attention
+        dropout: float = 0.0,
+        channel_mult: List[int] = (1, 2, 4, 8),
+        conv_resample: bool = True,
+        dims: int = 2,
+        num_classes: Optional[int] = None,
+        use_checkpoint: bool = False,
+        num_heads: int = -1,
+        num_head_channels: int = -1,
+        num_heads_upsample: int = -1,
+        use_scale_shift_norm: bool = False,
+        resblock_updown: bool = False,
+        transformer_depth: Union[List[int], int] = 1,
+        transformer_depth_middle: Optional[int] = None,
+        context_dim: Optional[int] = None,
+        time_downup: bool = False,
+        time_context_dim: Optional[int] = None,
+        extra_ff_mix_layer: bool = False,
+        use_spatial_context: bool = False,
+        merge_strategy: str = "fixed",
+        merge_factor: float = 0.5,
+        spatial_transformer_attn_type: str = "softmax",
+        video_kernel_size: Union[int, List[int]] = 3,
+        use_linear_in_transformer: bool = False,
+        adm_in_channels: Optional[int] = None,
+        disable_temporal_crossattention: bool = False,
+        max_ddpm_temb_period: int = 10000,
+    ):
+        super(VideoUNet, self).__init__()
+        assert context_dim is not None
+
+        if num_heads_upsample == -1:
+            num_heads_upsample = num_heads
+        if num_heads == -1:
+            assert num_head_channels != -1
+        if num_head_channels == -1:
+            assert num_heads != -1
+
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.out_channels = out_channels
+        self.num_frames = num_frames
+        if isinstance(transformer_depth, int):
+            transformer_depth = len(channel_mult) * [transformer_depth]
+        transformer_depth_middle = default(
+            transformer_depth_middle, transformer_depth[-1]
+        )
+
+        self.num_res_blocks = num_res_blocks
+        self.attention_resolutions = attention_resolutions
+        self.dropout = dropout
+        self.channel_mult = channel_mult
+        self.conv_resample = conv_resample
+        self.num_classes = num_classes
+        self.use_checkpoint = use_checkpoint
+        self.num_heads = num_heads
+        self.num_head_channels = num_head_channels
+        self.num_heads_upsample = num_heads_upsample
+
+        time_embed_dim = model_channels * 4
+        self.time_embed = nn.Sequential(
+            linear(model_channels, time_embed_dim),
+            nn.SiLU(),
+            linear(time_embed_dim, time_embed_dim),
+        )
+
+        if self.num_classes is not None:
+            if isinstance(self.num_classes, int):
+                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
+            elif self.num_classes == "continuous":
+                print("setting up linear c_adm embedding layer")
+                self.label_emb = nn.Linear(1, time_embed_dim)
+            elif self.num_classes == "timestep":
+                self.label_emb = nn.Sequential(
+                    Timestep(model_channels),
+                    nn.Sequential(
+                        linear(model_channels, time_embed_dim),
+                        nn.SiLU(),
+                        linear(time_embed_dim, time_embed_dim),
+                    ),
+                )
+            elif self.num_classes == "sequential":
+                assert adm_in_channels is not None
+                self.label_emb = nn.Sequential(
+                    nn.Sequential(
+                        linear(adm_in_channels, time_embed_dim),
+                        nn.SiLU(),
+                        linear(time_embed_dim, time_embed_dim),
+                    )
+                )
+            else:
+                raise ValueError()
+
+        self.input_blocks = nn.ModuleList(
+            [
+                TimestepEmbedSequential(
+                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
+                )
+            ]
+        )
+        self._feature_size = model_channels
+        input_block_chans = [model_channels]
+        ch = model_channels
+        ds = 1
+
+        def get_attention_layer(
+            ch,
+            num_heads,
+            dim_head,
+            depth=1,
+            context_dim=None,
+            use_checkpoint=False,
+            disabled_sa=False,
+        ):
+            return SpatialVideoTransformer(
+                ch,
+                num_heads,
+                dim_head,
+                depth=depth,
+                context_dim=context_dim,
+                time_context_dim=time_context_dim,
+                dropout=dropout,
+                ff_in=extra_ff_mix_layer,
+                use_spatial_context=use_spatial_context,
+                merge_strategy=merge_strategy,
+                merge_factor=merge_factor,
+                checkpoint=use_checkpoint,
+                use_linear=use_linear_in_transformer,
+                attn_mode=spatial_transformer_attn_type,
+                disable_self_attn=disabled_sa,
+                disable_temporal_crossattention=disable_temporal_crossattention,
+                max_time_embed_period=max_ddpm_temb_period,
+            )
+
+        def get_resblock(
+            merge_factor,
+            merge_strategy,
+            video_kernel_size,
+            ch,
+            time_embed_dim,
+            dropout,
+            out_ch,
+            dims,
+            use_checkpoint,
+            use_scale_shift_norm,
+            down=False,
+            up=False,
+        ):
+            return VideoResBlock(
+                merge_factor=merge_factor,
+                merge_strategy=merge_strategy,
+                video_kernel_size=video_kernel_size,
+                channels=ch,
+                emb_channels=time_embed_dim,
+                dropout=dropout,
+                out_channels=out_ch,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+                down=down,
+                up=up,
+            )
+
+        for level, mult in enumerate(channel_mult):
+            for _ in range(num_res_blocks):
+                layers = [
+                    get_resblock(
+                        merge_factor=merge_factor,
+                        merge_strategy=merge_strategy,
+                        video_kernel_size=video_kernel_size,
+                        ch=ch,
+                        time_embed_dim=time_embed_dim,
+                        dropout=dropout,
+                        out_ch=mult * model_channels,
+                        dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                    )
+                ]
+                ch = mult * model_channels
+                if ds in attention_resolutions:
+                    if num_head_channels == -1:
+                        dim_head = ch // num_heads
+                    else:
+                        num_heads = ch // num_head_channels
+                        dim_head = num_head_channels
+
+                    layers.append(
+                        get_attention_layer(
+                            ch,
+                            num_heads,
+                            dim_head,
+                            depth=transformer_depth[level],
+                            context_dim=context_dim,
+                            use_checkpoint=use_checkpoint,
+                            disabled_sa=False,
+                        )
+                    )
+                self.input_blocks.append(TimestepEmbedSequential(*layers))
+                self._feature_size += ch
+                input_block_chans.append(ch)
+            if level != len(channel_mult) - 1:
+                ds *= 2
+                out_ch = ch
+                self.input_blocks.append(
+                    TimestepEmbedSequential(
+                        get_resblock(
+                            merge_factor=merge_factor,
+                            merge_strategy=merge_strategy,
+                            video_kernel_size=video_kernel_size,
+                            ch=ch,
+                            time_embed_dim=time_embed_dim,
+                            dropout=dropout,
+                            out_ch=out_ch,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            down=True,
+                        )
+                        if resblock_updown
+                        else Downsample(
+                            ch,
+                            conv_resample,
+                            dims=dims,
+                            out_channels=out_ch,
+                            third_down=time_downup,
+                        )
+                    )
+                )
+                ch = out_ch
+                input_block_chans.append(ch)
+
+                self._feature_size += ch
+
+        if num_head_channels == -1:
+            dim_head = ch // num_heads
+        else:
+            num_heads = ch // num_head_channels
+            dim_head = num_head_channels
+
+        self.middle_block = TimestepEmbedSequential(
+            get_resblock(
+                merge_factor=merge_factor,
+                merge_strategy=merge_strategy,
+                video_kernel_size=video_kernel_size,
+                ch=ch,
+                time_embed_dim=time_embed_dim,
+                out_ch=None,
+                dropout=dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+            get_attention_layer(
+                ch,
+                num_heads,
+                dim_head,
+                depth=transformer_depth_middle,
+                context_dim=context_dim,
+                use_checkpoint=use_checkpoint,
+            ),
+            get_resblock(
+                merge_factor=merge_factor,
+                merge_strategy=merge_strategy,
+                video_kernel_size=video_kernel_size,
+                ch=ch,
+                out_ch=None,
+                time_embed_dim=time_embed_dim,
+                dropout=dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+        )
+        self._feature_size += ch
+
+        self.output_blocks = nn.ModuleList([])
+        for level, mult in list(enumerate(channel_mult))[::-1]:
+            for i in range(num_res_blocks + 1):
+                ich = input_block_chans.pop()
+                layers = [
+                    get_resblock(
+                        merge_factor=merge_factor,
+                        merge_strategy=merge_strategy,
+                        video_kernel_size=video_kernel_size,
+                        ch=ch + ich,
+                        time_embed_dim=time_embed_dim,
+                        dropout=dropout,
+                        out_ch=model_channels * mult,
+                        dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                    )
+                ]
+                ch = model_channels * mult
+                if ds in attention_resolutions:
+                    if num_head_channels == -1:
+                        dim_head = ch // num_heads
+                    else:
+                        num_heads = ch // num_head_channels
+                        dim_head = num_head_channels
+
+                    layers.append(
+                        get_attention_layer(
+                            ch,
+                            num_heads,
+                            dim_head,
+                            depth=transformer_depth[level],
+                            context_dim=context_dim,
+                            use_checkpoint=use_checkpoint,
+                            disabled_sa=False,
+                        )
+                    )
+                if level and i == num_res_blocks:
+                    out_ch = ch
+                    ds //= 2
+                    layers.append(
+                        get_resblock(
+                            merge_factor=merge_factor,
+                            merge_strategy=merge_strategy,
+                            video_kernel_size=video_kernel_size,
+                            ch=ch,
+                            time_embed_dim=time_embed_dim,
+                            dropout=dropout,
+                            out_ch=out_ch,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            up=True,
+                        )
+                        if resblock_updown
+                        else Upsample(
+                            ch,
+                            conv_resample,
+                            dims=dims,
+                            out_channels=out_ch,
+                            third_up=time_downup,
+                        )
+                    )
+
+                self.output_blocks.append(TimestepEmbedSequential(*layers))
+                self._feature_size += ch
+        # Here ch == model_channels * channel_mult[0]; norm and conv must both use it.
+        self.out = nn.Sequential(
+            normalization(ch),
+            nn.SiLU(),
+            zero_module(conv_nd(dims, ch, out_channels, 3, padding=1)),
+        )
+
+    def forward(
+        self,
+        x: th.Tensor,
+        timesteps: th.Tensor,
+        context: Optional[th.Tensor] = None,
+        y: Optional[th.Tensor] = None,
+        time_context: Optional[th.Tensor] = None,
+        num_video_frames: Optional[int] = None,
+        image_only_indicator: Optional[th.Tensor] = None,
+    ):
+        """Run the U-Net on `x`, whose frames are stacked on the batch axis."""
+        assert (y is not None) == (
+            self.num_classes is not None
+        ), "must specify y if and only if the model is class-conditional -> no, relax this TODO"
+        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
+        emb = self.time_embed(t_emb)
+
+        ## tbd: check the role of "image_only_indicator"
+        num_video_frames = self.num_frames  # NOTE: the passed-in argument is ignored
+        if image_only_indicator is None:
+            batch = x.shape[0] // num_video_frames
+            image_only_indicator = torch.zeros(batch, num_video_frames).to(x.device)
+
+        if self.num_classes is not None:
+            assert y.shape[0] == x.shape[0]
+            emb = emb + self.label_emb(y)
+
+        ## x shape: [bt,c,h,w]
+        h = x
+        hs = []
+        for module in self.input_blocks:
+            h = module(
+                h,
+                emb,
+                context=context,
+                image_only_indicator=image_only_indicator,
+                time_context=time_context,
+                num_video_frames=num_video_frames,
+            )
+            hs.append(h)
+        h = self.middle_block(
+            h,
+            emb,
+            context=context,
+            image_only_indicator=image_only_indicator,
+            time_context=time_context,
+            num_video_frames=num_video_frames,
+        )
+        for module in self.output_blocks:
+            h = th.cat([h, hs.pop()], dim=1)
+            h = module(
+                h,
+                emb,
+                context=context,
+                image_only_indicator=image_only_indicator,
+                time_context=time_context,
+                num_video_frames=num_video_frames,
+            )
+        h = h.type(x.dtype)
+        return self.out(h)
diff --git a/sgm/modules/diffusionmodules/wrappers.py b/sgm/modules/diffusionmodules/wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..37449ea63e992b9f89856f1f47c18ba68be8e334
--- /dev/null
+++ b/sgm/modules/diffusionmodules/wrappers.py
@@ -0,0 +1,34 @@
+import torch
+import torch.nn as nn
+from packaging import version
+
+OPENAIUNETWRAPPER = "sgm.modules.diffusionmodules.wrappers.OpenAIWrapper"
+
+
+class IdentityWrapper(nn.Module):
+    """Hold a diffusion model, optionally wrapped with ``torch.compile``."""
+
+    def __init__(self, diffusion_model, compile_model: bool = False):
+        super().__init__()
+        # Compilation needs both the opt-in flag and torch >= 2.0.0.
+        has_compile = version.parse(torch.__version__) >= version.parse("2.0.0")
+        if has_compile and compile_model:
+            diffusion_model = torch.compile(diffusion_model)
+        self.diffusion_model = diffusion_model
+
+    def forward(self, *args, **kwargs):
+        return self.diffusion_model(*args, **kwargs)
+
+
+class OpenAIWrapper(IdentityWrapper):
+    """Unpack the conditioning dict ``c`` into the wrapped model's arguments."""
+
+    def forward(
+        self, x: torch.Tensor, t: torch.Tensor, c: dict, **kwargs
+    ) -> torch.Tensor:
+        # "concat" conditioning is joined to x along dim 1; when the key is
+        # missing an empty tensor of x's type is concatenated instead.
+        extra = c.get("concat", torch.Tensor([]).type_as(x))
+        x = torch.cat((x, extra), dim=1)
+        context, y = c.get("crossattn", None), c.get("vector", None)
+        return self.diffusion_model(x, timesteps=t, context=context, y=y, **kwargs)
diff --git a/sgm/modules/distributions/__init__.py b/sgm/modules/distributions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sgm/modules/distributions/distributions.py b/sgm/modules/distributions/distributions.py
new file mode 100644
index 0000000000000000000000000000000000000000..016be35523187ea366db9ade391fe8ee276db60b
--- /dev/null
+++ b/sgm/modules/distributions/distributions.py
@@ -0,0 +1,102 @@
+import numpy as np
+import torch
+
+
+class AbstractDistribution:  # interface stub: both methods raise here
+    def sample(self):
+        raise NotImplementedError()
+
+    def mode(self):
+        raise NotImplementedError()
+
+
+class DiracDistribution(AbstractDistribution):  # wraps one fixed value
+    def __init__(self, value):
+        self.value = value
+
+    def sample(self):
+        return self.value  # no randomness: always the stored value
+
+    def mode(self):
+        return self.value
+
+
+class DiagonalGaussianDistribution(object):
+    """Diagonal Gaussian whose mean and log-variance are stacked along dim 1."""
+
+    def __init__(self, parameters, deterministic=False):
+        self.parameters = parameters
+        self.deterministic = deterministic
+        self.mean, logvar = torch.chunk(parameters, 2, dim=1)
+        # Bound the log-variance before it is exponentiated.
+        self.logvar = torch.clamp(logvar, -30.0, 20.0)
+        self.std = torch.exp(0.5 * self.logvar)
+        self.var = torch.exp(self.logvar)
+        if self.deterministic:
+            # zeros_like already lives on the same device as the parameters.
+            self.var = self.std = torch.zeros_like(self.mean)
+
+    def sample(self):
+        """Return ``mean + std * eps`` with ``eps`` standard normal noise."""
+        # The noise is created on the default device and then moved over.
+        eps = torch.randn(self.mean.shape).to(device=self.parameters.device)
+        return self.mean + self.std * eps
+
+    def kl(self, other=None):
+        """KL to ``other`` (standard normal if None), summed over dims 1-3."""
+        if self.deterministic:
+            # Keep the zero on the parameters' device so it mixes with losses.
+            return torch.zeros(1, device=self.parameters.device)
+        if other is None:
+            terms = torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar
+        else:
+            terms = (
+                torch.pow(self.mean - other.mean, 2) / other.var
+                + self.var / other.var
+                - 1.0
+                - self.logvar
+                + other.logvar
+            )
+        return 0.5 * torch.sum(terms, dim=[1, 2, 3])
+
+    def nll(self, sample, dims=(1, 2, 3)):
+        """Gaussian negative log-likelihood of ``sample`` summed over ``dims``."""
+        if self.deterministic:
+            return torch.zeros(1, device=self.parameters.device)
+        logtwopi = np.log(2.0 * np.pi)
+        sq_err = torch.pow(sample - self.mean, 2)
+        return 0.5 * torch.sum(logtwopi + self.logvar + sq_err / self.var, dim=dims)
+
+    def mode(self):
+        """Return the mean, which is the mode of a Gaussian."""
+        return self.mean
+
+
+def normal_kl(mean1, logvar1, mean2, logvar2):
+    """
+    Element-wise KL divergence KL(N(mean1, var1) || N(mean2, var2)).
+
+    Adapted from
+    https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12
+    Variances are given as logs. Inputs broadcast against each other, so a
+    batch can be compared with a scalar; at least one must be a Tensor.
+    """
+    args = (mean1, logvar1, mean2, logvar2)
+    tensor = next((a for a in args if isinstance(a, torch.Tensor)), None)
+    assert tensor is not None, "at least one argument must be a Tensor"
+
+    def as_tensor(value):
+        # torch.exp() needs a Tensor; scalars adopt dtype/device of `tensor`.
+        if isinstance(value, torch.Tensor):
+            return value
+        return torch.tensor(value).to(tensor)
+
+    logvar1, logvar2 = as_tensor(logvar1), as_tensor(logvar2)
+
+    return 0.5 * (
+        -1.0
+        + logvar2
+        - logvar1
+        + torch.exp(logvar1 - logvar2)
+        + ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
+    )
diff --git a/sgm/modules/ema.py b/sgm/modules/ema.py
new file mode 100644
index 0000000000000000000000000000000000000000..97b5ae2b230f89b4dba57e44c4f851478ad86f68
--- /dev/null
+++ b/sgm/modules/ema.py
@@ -0,0 +1,86 @@
+import torch
+from torch import nn
+
+
+class LitEma(nn.Module):
+    """Exponential moving average of a model's trainable parameters.
+
+    Every parameter with ``requires_grad`` is mirrored in a buffer named after
+    the parameter with its dots removed.
+    """
+
+    def __init__(self, model, decay=0.9999, use_num_upates=True):
+        super().__init__()
+        if decay < 0.0 or decay > 1.0:
+            raise ValueError("Decay must be between 0 and 1")
+
+        self.m_name2s_name = {}
+        self.register_buffer("decay", torch.tensor(decay, dtype=torch.float32))
+        # A negative counter switches off the warm-up schedule in forward().
+        start = 0 if use_num_upates else -1
+        self.register_buffer("num_updates", torch.tensor(start, dtype=torch.int))
+
+        for name, p in model.named_parameters():
+            if p.requires_grad:
+                # remove as '.'-character is not allowed in buffers
+                s_name = name.replace(".", "")
+                self.m_name2s_name[name] = s_name
+                self.register_buffer(s_name, p.clone().detach().data)
+
+        self.collected_params = []
+
+    def reset_num_updates(self):
+        """Restart the update counter at zero."""
+        del self.num_updates
+        self.register_buffer("num_updates", torch.tensor(0, dtype=torch.int))
+
+    def forward(self, model):
+        """Move every shadow buffer one EMA step towards ``model``'s parameters."""
+        decay = self.decay
+        if self.num_updates >= 0:
+            self.num_updates += 1
+            decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates))
+        one_minus_decay = 1.0 - decay
+
+        with torch.no_grad():
+            shadow_params = dict(self.named_buffers())
+            for key, param in model.named_parameters():
+                if not param.requires_grad:
+                    assert key not in self.m_name2s_name
+                    continue
+                shadow = shadow_params[self.m_name2s_name[key]]
+                # Cast the parameter to the buffer, not the reverse: casting the
+                # buffer would copy it and the in-place update would be lost.
+                shadow.sub_(one_minus_decay * (shadow - param.to(shadow)))
+
+    def copy_to(self, model):
+        """Overwrite ``model``'s trainable parameters with the shadow values."""
+        shadow_params = dict(self.named_buffers())
+        for key, param in model.named_parameters():
+            if param.requires_grad:
+                param.data.copy_(shadow_params[self.m_name2s_name[key]].data)
+            else:
+                assert key not in self.m_name2s_name
+
+    def store(self, parameters):
+        """
+        Save the current parameters for restoring later.
+        Args:
+          parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+            temporarily stored.
+        """
+        self.collected_params = [param.clone() for param in parameters]
+
+    def restore(self, parameters):
+        """
+        Restore the parameters stored with the `store` method.
+        Useful to validate the model with EMA parameters without affecting the
+        original optimization process. Store the parameters before the
+        `copy_to` method. After validation (or model saving), use this to
+        restore the former parameters.
+        Args:
+          parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+            updated with the stored parameters.
+        """
+        for c_param, param in zip(self.collected_params, parameters):
+            param.data.copy_(c_param.data)
diff --git a/sgm/modules/encoders/__init__.py b/sgm/modules/encoders/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sgm/modules/encoders/modules.py b/sgm/modules/encoders/modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..504d7a14409b6a5c7b76dea8ba60ecd236349ac6
--- /dev/null
+++ b/sgm/modules/encoders/modules.py
@@ -0,0 +1,1045 @@
+import math
+from contextlib import nullcontext
+from functools import partial
+from typing import Dict, List, Optional, Tuple, Union
+
+import kornia
+import numpy as np
+import open_clip
+import torch
+import torch.nn as nn
+from einops import rearrange, repeat
+from omegaconf import ListConfig
+from torch.utils.checkpoint import checkpoint
+from transformers import (ByT5Tokenizer, CLIPTextModel, CLIPTokenizer,
+                          T5EncoderModel, T5Tokenizer)
+
+from ...modules.autoencoding.regularizers import DiagonalGaussianRegularizer
+from ...modules.diffusionmodules.model import Encoder
+from ...modules.diffusionmodules.openaimodel import Timestep
+from ...modules.diffusionmodules.util import (extract_into_tensor,
+                                              make_beta_schedule)
+from ...modules.distributions.distributions import DiagonalGaussianDistribution
+from ...util import (append_dims, autocast, count_params, default,
+                     disabled_train, expand_dims_like, instantiate_from_config)
+
+
+class AbstractEmbModel(nn.Module):
+    """Base embedder exposing is_trainable, ucg_rate and input_key properties."""
+
+    def __init__(self):
+        super().__init__()
+        self._is_trainable = self._ucg_rate = self._input_key = None
+
+    @property
+    def is_trainable(self) -> bool:
+        return self._is_trainable
+
+    @is_trainable.setter
+    def is_trainable(self, value: bool):
+        self._is_trainable = value
+
+    @is_trainable.deleter
+    def is_trainable(self):
+        del self._is_trainable
+
+    @property
+    def ucg_rate(self) -> Union[float, torch.Tensor]:
+        return self._ucg_rate
+
+    @ucg_rate.setter
+    def ucg_rate(self, value: Union[float, torch.Tensor]):
+        self._ucg_rate = value
+
+    @ucg_rate.deleter
+    def ucg_rate(self):
+        del self._ucg_rate
+
+    @property
+    def input_key(self) -> str:
+        return self._input_key
+
+    @input_key.setter
+    def input_key(self, value: str):
+        self._input_key = value
+
+    @input_key.deleter
+    def input_key(self):
+        del self._input_key
+
+
+class GeneralConditioner(nn.Module):
+    OUTPUT_DIM2KEYS = {2: "vector", 3: "crossattn", 4: "concat", 5: "concat"}
+    KEY2CATDIM = {"vector": 1, "crossattn": 2, "concat": 1}
+
+    def __init__(self, emb_models: Union[List, ListConfig]):
+        super().__init__()
+        embedders = []
+        for n, embconfig in enumerate(emb_models):  # one embedder per config
+            embedder = instantiate_from_config(embconfig)
+            assert isinstance(
+                embedder, AbstractEmbModel
+            ), f"embedder model {embedder.__class__.__name__} has to inherit from AbstractEmbModel"
+            embedder.is_trainable = embconfig.get("is_trainable", False)
+            embedder.ucg_rate = embconfig.get("ucg_rate", 0.0)
+            if not embedder.is_trainable:  # freeze: eval mode, no gradients
+                embedder.train = disabled_train  # presumably pins eval mode -- confirm
+                for param in embedder.parameters():
+                    param.requires_grad = False
+                embedder.eval()
+            print(
+                f"Initialized embedder #{n}: {embedder.__class__.__name__} "
+                f"with {count_params(embedder, False)} params. Trainable: {embedder.is_trainable}"
+            )
+
+            if "input_key" in embconfig:  # single batch entry fed to the embedder
+                embedder.input_key = embconfig["input_key"]
+            elif "input_keys" in embconfig:  # several entries, passed positionally
+                embedder.input_keys = embconfig["input_keys"]
+            else:
+                raise KeyError(
+                    f"need either 'input_key' or 'input_keys' for embedder {embedder.__class__.__name__}"
+                )
+
+            embedder.legacy_ucg_val = embconfig.get("legacy_ucg_value", None)
+            if embedder.legacy_ucg_val is not None:
+                embedder.ucg_prng = np.random.RandomState()  # unseeded per-embedder RNG
+
+            embedders.append(embedder)
+        self.embedders = nn.ModuleList(embedders)
+
+    def possibly_get_ucg_val(self, embedder: AbstractEmbModel, batch: Dict) -> Dict:
+        # Each entry is independently replaced in place with prob. ucg_rate.
+        assert embedder.legacy_ucg_val is not None
+        p, entries = embedder.ucg_rate, batch[embedder.input_key]
+        for i in range(len(entries)):
+            if embedder.ucg_prng.choice(2, p=[1 - p, p]):
+                entries[i] = embedder.legacy_ucg_val
+        return batch
+
+    def forward(
+        self, batch: Dict, force_zero_embeddings: Optional[List] = None
+    ) -> Dict:  # output keys come from OUTPUT_DIM2KEYS, chosen by emb.dim()
+        output = dict()
+        if force_zero_embeddings is None:
+            force_zero_embeddings = []
+        for embedder in self.embedders:
+            embedding_context = nullcontext if embedder.is_trainable else torch.no_grad
+            with embedding_context():  # no_grad unless the embedder is trainable
+                if hasattr(embedder, "input_key") and (embedder.input_key is not None):
+                    if embedder.legacy_ucg_val is not None:
+                        batch = self.possibly_get_ucg_val(embedder, batch)
+                    emb_out = embedder(batch[embedder.input_key])
+                elif hasattr(embedder, "input_keys"):  # multi-input: args in key order
+                    emb_out = embedder(*[batch[k] for k in embedder.input_keys])
+            # NOTE(review): if neither branch ran, emb_out is unbound or stale here.
+            assert isinstance(
+                emb_out, (torch.Tensor, list, tuple)
+            ), f"encoder outputs must be tensors or a sequence, but got {type(emb_out)}"
+            if not isinstance(emb_out, (list, tuple)):
+                emb_out = [emb_out]
+            for emb in emb_out:
+                out_key = self.OUTPUT_DIM2KEYS[emb.dim()]
+                if embedder.ucg_rate > 0.0 and embedder.legacy_ucg_val is None:
+                    emb = (  # per-sample Bernoulli keep-mask with p = 1 - ucg_rate
+                        expand_dims_like(
+                            torch.bernoulli(
+                                (1.0 - embedder.ucg_rate)
+                                * torch.ones(emb.shape[0], device=emb.device)
+                            ),
+                            emb,
+                        )
+                        * emb
+                    )
+                if (
+                    hasattr(embedder, "input_key")
+                    and embedder.input_key in force_zero_embeddings
+                ):
+                    emb = torch.zeros_like(emb)  # caller asked to zero this input
+                if out_key in output:
+                    output[out_key] = torch.cat(  # join along KEY2CATDIM[out_key]
+                        (output[out_key], emb), self.KEY2CATDIM[out_key]
+                    )
+                else:
+                    output[out_key] = emb
+        return output
+
+    def get_unconditional_conditioning(
+        self,
+        batch_c: Dict,
+        batch_uc: Optional[Dict] = None,
+        force_uc_zero_embeddings: Optional[List[str]] = None,
+        force_cond_zero_embeddings: Optional[List[str]] = None,
+    ):
+        """Return ``(c, uc)`` embeddings computed with every ucg_rate set to 0."""
+        ucg_rates = [embedder.ucg_rate for embedder in self.embedders]
+        for embedder in self.embedders:
+            embedder.ucg_rate = 0.0
+        try:
+            c = self(batch_c, force_cond_zero_embeddings)
+            uc = self(batch_c if batch_uc is None else batch_uc, force_uc_zero_embeddings)
+        finally:
+            # Put the configured rates back even when an embedder raised.
+            for embedder, rate in zip(self.embedders, ucg_rates):
+                embedder.ucg_rate = rate
+        return c, uc
+
+
+class InceptionV3(nn.Module):
+    """Wrapper around the https://github.com/mseitzer/pytorch-fid inception
+    port with an additional squeeze at the end"""
+
+    def __init__(self, normalize_input=False, **kwargs):
+        super().__init__()
+        from pytorch_fid import inception  # imported only on instantiation
+
+        kwargs["resize_input"] = True  # overrides any caller-supplied value
+        self.model = inception.InceptionV3(normalize_input=normalize_input, **kwargs)
+
+    def forward(self, inp):
+        outp = self.model(inp)
+
+        if len(outp) == 1:  # single output: unwrap it
+            return outp[0].squeeze()  # drops every size-1 dim (also a batch of 1)
+
+        return outp
+
+
+class IdentityEncoder(AbstractEmbModel):  # returns its input unchanged
+    def encode(self, x):
+        return x
+
+    def forward(self, x):
+        return x
+
+
+class ClassEmbedder(AbstractEmbModel):
+    """Embed integer class labels with a learned ``nn.Embedding`` table."""
+
+    def __init__(self, embed_dim, n_classes=1000, add_sequence_dim=False):
+        super().__init__()
+        self.embedding = nn.Embedding(n_classes, embed_dim)
+        self.n_classes = n_classes
+        self.add_sequence_dim = add_sequence_dim
+
+    def forward(self, c):
+        c = self.embedding(c)
+        # Optionally insert a length-1 axis at position 1.
+        return c[:, None, :] if self.add_sequence_dim else c
+
+    def get_unconditional_conditioning(self, bs, device="cuda"):
+        """Return ``{input_key: labels}`` with ``bs`` copies of the last class."""
+        uc_class = self.n_classes - 1  # last index stands in for "unconditional"
+        # `input_key` is what AbstractEmbModel defines; `key` is never assigned.
+        labels = torch.full((bs,), uc_class, dtype=torch.long, device=device)
+        return {self.input_key: labels}
+
+
+class ClassEmbedderForMultiCond(ClassEmbedder):
+    def forward(self, batch, key=None, disable_dropout=False):
+        out = batch  # same object: the caller's batch is modified in place
+        key = default(key, self.key)  # NOTE(review): self.key is not set here; input_key meant?
+        islist = isinstance(batch[key], list)
+        if islist:
+            batch[key] = batch[key][0]  # unwrap a one-element list for embedding
+        c_out = super().forward(batch, key, disable_dropout)  # NOTE(review): parent forward takes one arg
+        out[key] = [c_out] if islist else c_out  # re-wrap to mirror the input
+        return out
+
+
+class FrozenT5Embedder(AbstractEmbModel):
+    """Uses the T5 transformer encoder for text"""
+
+    def __init__(
+        self, version="google/t5-v1_1-xxl", device="cuda", max_length=77, freeze=True
+    ):  # version is a pretrained checkpoint name, e.g. google/t5-v1_1-xl
+        super().__init__()
+        self.tokenizer = T5Tokenizer.from_pretrained(version)
+        self.transformer = T5EncoderModel.from_pretrained(version)
+        self.device = device  # token ids are moved here in forward()
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+
+    def freeze(self):
+        self.transformer = self.transformer.eval()
+
+        for param in self.parameters():
+            param.requires_grad = False
+
+    def forward(self, text):
+        batch_encoding = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            return_length=True,
+            return_overflowing_tokens=False,
+            padding="max_length",  # with truncation: exactly max_length tokens
+            return_tensors="pt",
+        )
+        tokens = batch_encoding["input_ids"].to(self.device)
+        with torch.autocast("cuda", enabled=False):  # encoder runs with autocast off
+            outputs = self.transformer(input_ids=tokens)
+        z = outputs.last_hidden_state
+        return z
+
+    def encode(self, text):
+        return self(text)  # alias for calling the module
+
+
+class FrozenByT5Embedder(AbstractEmbModel):
+    """
+    Uses the ByT5 transformer encoder for text. Is character-aware.
+    """
+
+    def __init__(
+        self, version="google/byt5-base", device="cuda", max_length=77, freeze=True
+    ):  # version is a pretrained ByT5 checkpoint name
+        super().__init__()
+        self.tokenizer = ByT5Tokenizer.from_pretrained(version)
+        self.transformer = T5EncoderModel.from_pretrained(version)
+        self.device = device  # token ids are moved here in forward()
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+
+    def freeze(self):
+        self.transformer = self.transformer.eval()
+
+        for param in self.parameters():
+            param.requires_grad = False
+
+    def forward(self, text):
+        batch_encoding = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            return_length=True,
+            return_overflowing_tokens=False,
+            padding="max_length",  # with truncation: exactly max_length tokens
+            return_tensors="pt",
+        )
+        tokens = batch_encoding["input_ids"].to(self.device)
+        with torch.autocast("cuda", enabled=False):  # encoder runs with autocast off
+            outputs = self.transformer(input_ids=tokens)
+        z = outputs.last_hidden_state
+        return z
+
+    def encode(self, text):
+        return self(text)  # alias for calling the module
+
+
+class FrozenCLIPEmbedder(AbstractEmbModel):
+    """Uses the CLIP transformer encoder for text (from huggingface)"""
+
+    LAYERS = ["last", "pooled", "hidden"]  # accepted values of `layer`
+
+    def __init__(
+        self,
+        version="openai/clip-vit-large-patch14",
+        device="cuda",
+        max_length=77,
+        freeze=True,
+        layer="last",
+        layer_idx=None,
+        always_return_pooled=False,
+    ):  # clip-vit-base-patch32
+        super().__init__()
+        assert layer in self.LAYERS
+        self.tokenizer = CLIPTokenizer.from_pretrained(version)
+        self.transformer = CLIPTextModel.from_pretrained(version)
+        self.device = device  # token ids are moved here in forward()
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+        self.layer = layer
+        self.layer_idx = layer_idx
+        self.return_pooled = always_return_pooled
+        if layer == "hidden":  # a hidden-state index is then mandatory
+            assert layer_idx is not None
+            assert 0 <= abs(layer_idx) <= 12
+
+    def freeze(self):
+        self.transformer = self.transformer.eval()
+
+        for param in self.parameters():
+            param.requires_grad = False
+
+    @autocast
+    def forward(self, text):
+        batch_encoding = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            return_length=True,
+            return_overflowing_tokens=False,
+            padding="max_length",  # with truncation: exactly max_length tokens
+            return_tensors="pt",
+        )
+        tokens = batch_encoding["input_ids"].to(self.device)
+        outputs = self.transformer(
+            input_ids=tokens, output_hidden_states=self.layer == "hidden"
+        )
+        if self.layer == "last":
+            z = outputs.last_hidden_state
+        elif self.layer == "pooled":
+            z = outputs.pooler_output[:, None, :]  # add a length-1 axis at dim 1
+        else:  # "hidden": pick the requested hidden state
+            z = outputs.hidden_states[self.layer_idx]
+        if self.return_pooled:  # also hand back the pooled output
+            return z, outputs.pooler_output
+        return z
+
+    def encode(self, text):
+        return self(text)  # alias for calling the module
+
+
+class FrozenOpenCLIPEmbedder2(AbstractEmbModel):
+    """
+    Uses the OpenCLIP transformer encoder for text
+    """
+
+    LAYERS = ["pooled", "last", "penultimate"]  # "pooled" passes the assert but raises below
+
+    def __init__(
+        self,
+        arch="ViT-H-14",
+        version="laion2b_s32b_b79k",
+        device="cuda",
+        max_length=77,
+        freeze=True,
+        layer="last",
+        always_return_pooled=False,
+        legacy=True,
+    ):
+        super().__init__()
+        assert layer in self.LAYERS
+        model, _, _ = open_clip.create_model_and_transforms(
+            arch,
+            device=torch.device("cpu"),
+            pretrained=version,
+        )
+        del model.visual  # the image tower is dropped; only text parts are used here
+        self.model = model
+
+        self.device = device  # tokens are moved here in forward()
+        self.max_length = max_length
+        self.return_pooled = always_return_pooled
+        if freeze:
+            self.freeze()
+        self.layer = layer
+        if self.layer == "last":
+            self.layer_idx = 0
+        elif self.layer == "penultimate":
+            self.layer_idx = 1
+        else:
+            raise NotImplementedError()
+        self.legacy = legacy
+
+    def freeze(self):
+        self.model = self.model.eval()
+        for param in self.parameters():
+            param.requires_grad = False
+
+    @autocast
+    def forward(self, text):
+        tokens = open_clip.tokenize(text)
+        z = self.encode_with_transformer(tokens.to(self.device))
+        if not self.return_pooled and self.legacy:  # legacy: z is already a tensor
+            return z
+        if self.return_pooled:
+            assert not self.legacy  # the pooled output exists only in non-legacy mode
+            return z[self.layer], z["pooled"]
+        return z[self.layer]  # non-legacy: z is a dict keyed by layer name
+
+    def encode_with_transformer(self, text):
+        x = self.model.token_embedding(text)  # [batch_size, n_ctx, d_model]
+        x = x + self.model.positional_embedding
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask)
+        if self.legacy:  # return only the selected layer, after ln_final
+            x = x[self.layer]
+            x = self.model.ln_final(x)
+            return x
+        else:
+            # x is a dict and will stay a dict
+            o = x["last"]
+            o = self.model.ln_final(o)
+            pooled = self.pool(o, text)
+            x["pooled"] = pooled  # the "last"/"penultimate" entries stay un-normalised
+            return x
+
+    def pool(self, x, text):
+        # take features from the eot embedding (eot_token is the highest number in each sequence)
+        x = (
+            x[torch.arange(x.shape[0]), text.argmax(dim=-1)]
+            @ self.model.text_projection
+        )
+        return x
+
+    def text_transformer_forward(self, x: torch.Tensor, attn_mask=None):
+        outputs = {}
+        for i, r in enumerate(self.model.transformer.resblocks):
+            if i == len(self.model.transformer.resblocks) - 1:  # input to the final block
+                outputs["penultimate"] = x.permute(1, 0, 2)  # LND -> NLD
+            if (
+                self.model.transformer.grad_checkpointing
+                and not torch.jit.is_scripting()
+            ):
+                x = checkpoint(r, x, attn_mask)
+            else:
+                x = r(x, attn_mask=attn_mask)
+        outputs["last"] = x.permute(1, 0, 2)  # LND -> NLD
+        return outputs
+
+    def encode(self, text):
+        return self(text)  # alias for calling the module
+
+
+class FrozenOpenCLIPEmbedder(AbstractEmbModel):  # OpenCLIP text encoder wrapper
+    LAYERS = [
+        # "pooled",
+        "last",
+        "penultimate",
+    ]
+
+    def __init__(
+        self,
+        arch="ViT-H-14",
+        version="laion2b_s32b_b79k",
+        device="cuda",
+        max_length=77,
+        freeze=True,
+        layer="last",
+    ):
+        super().__init__()
+        assert layer in self.LAYERS
+        model, _, _ = open_clip.create_model_and_transforms(
+            arch, device=torch.device("cpu"), pretrained=version
+        )
+        del model.visual  # the image tower is dropped; only text parts are used here
+        self.model = model
+
+        self.device = device  # tokens are moved here in forward()
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+        self.layer = layer
+        if self.layer == "last":
+            self.layer_idx = 0  # run every residual block
+        elif self.layer == "penultimate":
+            self.layer_idx = 1  # stop one block before the end
+        else:
+            raise NotImplementedError()
+
+    def freeze(self):
+        self.model = self.model.eval()
+        for param in self.parameters():
+            param.requires_grad = False
+
+    def forward(self, text):
+        tokens = open_clip.tokenize(text)
+        z = self.encode_with_transformer(tokens.to(self.device))
+        return z
+
+    def encode_with_transformer(self, text):
+        x = self.model.token_embedding(text)  # [batch_size, n_ctx, d_model]
+        x = x + self.model.positional_embedding
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.model.ln_final(x)
+        return x
+
+    def text_transformer_forward(self, x: torch.Tensor, attn_mask=None):
+        for i, r in enumerate(self.model.transformer.resblocks):
+            if i == len(self.model.transformer.resblocks) - self.layer_idx:
+                break  # skips the last layer_idx blocks; never taken when it is 0
+            if (
+                self.model.transformer.grad_checkpointing
+                and not torch.jit.is_scripting()
+            ):
+                x = checkpoint(r, x, attn_mask)
+            else:
+                x = r(x, attn_mask=attn_mask)
+        return x
+
+    def encode(self, text):
+        return self(text)  # alias for calling the module
+
+
+class FrozenOpenCLIPImageEmbedder(AbstractEmbModel):
+    """
+    Uses the OpenCLIP vision transformer encoder for images
+    """
+
+    def __init__(
+        self,
+        arch="ViT-H-14",
+        version="laion2b_s32b_b79k",
+        device="cuda",
+        max_length=77,
+        freeze=True,
+        antialias=True,
+        ucg_rate=0.0,
+        unsqueeze_dim=False,
+        repeat_to_max_len=False,
+        num_image_crops=0,
+        output_tokens=False,
+        init_device=None,
+    ):
+        super().__init__()
+        model, _, _ = open_clip.create_model_and_transforms(
+            arch,
+            device=torch.device(default(init_device, "cuda")),
+            pretrained=version,
+        )
+        del model.transformer
+        self.model = model
+        self.max_crops = num_image_crops
+        self.pad_to_max_len = self.max_crops > 0
+        self.repeat_to_max_len = repeat_to_max_len and (not self.pad_to_max_len)
+        self.device = device
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+
+        self.antialias = antialias
+
+        self.register_buffer(
+            "mean", torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False
+        )
+        self.register_buffer(
+            "std", torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False
+        )
+        self.ucg_rate = ucg_rate
+        self.unsqueeze_dim = unsqueeze_dim
+        self.stored_batch = None
+        self.model.visual.output_tokens = output_tokens
+        self.output_tokens = output_tokens
+
+    def preprocess(self, x):
+        # normalize to [0,1]
+        x = kornia.geometry.resize(
+            x,
+            (224, 224),
+            interpolation="bicubic",
+            align_corners=True,
+            antialias=self.antialias,
+        )
+        x = (x + 1.0) / 2.0
+        # renormalize according to clip
+        x = kornia.enhance.normalize(x, self.mean, self.std)
+        return x
+
+    def freeze(self):
+        self.model = self.model.eval()
+        for param in self.parameters():
+            param.requires_grad = False
+
+    @autocast
+    def forward(self, image, no_dropout=False):
+        """Embed `image` and post-process the embedding according to the instance flags.
+
+        Unless `no_dropout` is set or crops are used, whole samples are zeroed with
+        probability `ucg_rate`. Returns `(tokens, z)` when `output_tokens` is set,
+        `(z repeated to max_length, z)` when `repeat_to_max_len` is set,
+        `(z zero-padded to max_length, first slot of the padded tensor)` when
+        `pad_to_max_len` is set, and plain `z` otherwise.
+        """
+        z = self.encode_with_vision_transformer(image)
+        tokens = None
+        if self.output_tokens:
+            z, tokens = z[0], z[1]
+        z = z.to(image.dtype)
+        # Per-sample keep mask; with crops, dropout already happened in encode_with_vision_transformer.
+        if self.ucg_rate > 0.0 and not no_dropout and not (self.max_crops > 0):
+            z = (
+                torch.bernoulli(
+                    (1.0 - self.ucg_rate) * torch.ones(z.shape[0], device=z.device)
+                )[:, None]
+                * z
+            )
+            if tokens is not None:
+                tokens = (
+                    expand_dims_like(
+                        torch.bernoulli(
+                            (1.0 - self.ucg_rate)
+                            * torch.ones(tokens.shape[0], device=tokens.device)
+                        ),
+                        tokens,
+                    )
+                    * tokens
+                )
+        if self.unsqueeze_dim:
+            z = z[:, None, :]
+        if self.output_tokens:
+            assert not self.repeat_to_max_len
+            assert not self.pad_to_max_len
+            return tokens, z
+        if self.repeat_to_max_len:
+            if z.dim() == 2:
+                z_ = z[:, None, :]
+            else:
+                z_ = z
+            return repeat(z_, "b 1 d -> b n d", n=self.max_length), z
+        elif self.pad_to_max_len:
+            assert z.dim() == 3
+            # new_zeros inherits z's dtype and device, so the padding cannot promote z to float32.
+            pad = z.new_zeros(z.shape[0], self.max_length - z.shape[1], z.shape[2])
+            z_pad = torch.cat((z, pad), 1)
+            return z_pad, z_pad[:, 0, ...]
+        return z
+
+    def encode_with_vision_transformer(self, img):
+        # Run the visual model on `img`; returns x, or (x, tokens) when self.output_tokens is set.
+        # 5-D input is read as (b, n, c, h, w) with n == self.max_crops and flattened to ((b n), c, h, w).
+        if img.dim() == 5:
+            assert self.max_crops == img.shape[1]
+            img = rearrange(img, "b n c h w -> (b n) c h w")
+        img = self.preprocess(img)
+        if not self.output_tokens:
+            assert not self.model.visual.output_tokens  # module flag and visual model must agree
+            x = self.model.visual(img)
+            tokens = None
+        else:
+            assert self.model.visual.output_tokens
+            x, tokens = self.model.visual(img)
+        if self.max_crops > 0:
+            x = rearrange(x, "(b n) d -> b n d", n=self.max_crops)
+            # each crop embedding is kept independently with probability 1 - ucg_rate (mask shape: b, n, 1)
+            x = (
+                torch.bernoulli(
+                    (1.0 - self.ucg_rate)
+                    * torch.ones(x.shape[0], x.shape[1], 1, device=x.device)
+                )
+                * x
+            )
+            if tokens is not None:
+                tokens = rearrange(tokens, "(b n) t d -> b t (n d)", n=self.max_crops)
+                print(
+                    f"You are running very experimental token-concat in {self.__class__.__name__}. "
+                    f"Check what you are doing, and then remove this message."
+                )
+        if self.output_tokens:
+            return x, tokens
+        return x
+
+    def encode(self, text):  # delegates to calling the instance; `text` is forwarded unchanged
+        return self(text)
+
+
+class FrozenCLIPT5Encoder(AbstractEmbModel):
+    """Text encoder that pairs a FrozenCLIPEmbedder with a FrozenT5Embedder."""
+
+    def __init__(
+        self,
+        clip_version="openai/clip-vit-large-patch14",
+        t5_version="google/t5-v1_1-xl",
+        device="cuda",
+        clip_max_length=77,
+        t5_max_length=77,
+    ):
+        super().__init__()
+        self.clip_encoder = FrozenCLIPEmbedder(clip_version, device, max_length=clip_max_length)
+        self.t5_encoder = FrozenT5Embedder(t5_version, device, max_length=t5_max_length)
+        print(
+            f"{self.clip_encoder.__class__.__name__} has {count_params(self.clip_encoder) * 1.e-6:.2f} M parameters, "
+            f"{self.t5_encoder.__class__.__name__} comes with {count_params(self.t5_encoder) * 1.e-6:.2f} M params."
+        )
+
+    def encode(self, text):
+        # Convenience entry point: delegates to calling the instance itself.
+        return self(text)
+
+    def forward(self, text):
+        # Both encoders receive the same text; the result is the list [clip output, t5 output].
+        return [self.clip_encoder.encode(text), self.t5_encoder.encode(text)]
+
+
+class SpatialRescaler(nn.Module):
+    """Rescale inputs with `n_stages` interpolation passes and optionally remap channels.
+
+    With `wrap_video`, 5-D input (b, c, t, h, w) is folded to 4-D with time merged into
+    the batch axis for the rescaling and unfolded again before returning.
+    """
+
+    def __init__(
+        self,
+        n_stages=1,
+        method="bilinear",
+        multiplier=0.5,
+        in_channels=3,
+        out_channels=None,
+        bias=False,
+        wrap_video=False,
+        kernel_size=1,
+        remap_output=False,
+    ):
+        super().__init__()
+        self.n_stages = n_stages
+        assert self.n_stages >= 0
+        assert method in ["nearest", "linear", "bilinear", "trilinear", "bicubic", "area"]
+        self.multiplier = multiplier
+        self.interpolator = partial(torch.nn.functional.interpolate, mode=method)
+        self.remap_output = out_channels is not None or remap_output
+        if self.remap_output:
+            print(
+                f"Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing."
+            )
+            self.channel_mapper = nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                bias=bias,
+                padding=kernel_size // 2,
+            )
+        self.wrap_video = wrap_video
+
+    def forward(self, x):
+        # Re-wrap only what was unwrapped here: with wrap_video set, 4-D input passes through as-is.
+        is_video = self.wrap_video and x.ndim == 5
+        if is_video:
+            B, _, T, _, _ = x.shape
+            x = rearrange(x, "b c t h w -> (b t) c h w")
+
+        for _ in range(self.n_stages):
+            x = self.interpolator(x, scale_factor=self.multiplier)
+
+        if self.remap_output:
+            # Conv2d needs 4-D input, so channels are remapped before the time axis is restored.
+            x = self.channel_mapper(x)
+        if is_video:
+            x = rearrange(x, "(b t) c h w -> b c t h w", b=B, t=T)
+        return x
+
+    def encode(self, x):
+        return self(x)
+
+
+class LowScaleEncoder(nn.Module):  # encodes x with a configured model, then noises the latent at a random schedule step
+    def __init__(
+        self,
+        model_config,
+        linear_start,
+        linear_end,
+        timesteps=1000,
+        max_noise_level=250,
+        output_size=64,
+        scale_factor=1.0,
+    ):
+        super().__init__()
+        self.max_noise_level = max_noise_level  # exclusive upper bound of the noise levels drawn in forward()
+        self.model = instantiate_from_config(model_config)
+        self.augmentation_schedule = self.register_schedule(
+            timesteps=timesteps, linear_start=linear_start, linear_end=linear_end
+        )  # NOTE(review): register_schedule has no return statement, so this attribute ends up as None
+        self.out_size = output_size  # forward() resizes to this size unless it is None
+        self.scale_factor = scale_factor  # multiplied in forward(), divided out again in decode()
+
+    def register_schedule(
+        self,
+        beta_schedule="linear",
+        timesteps=1000,
+        linear_start=1e-4,
+        linear_end=2e-2,
+        cosine_s=8e-3,
+    ):  # builds the beta schedule and registers the tensors derived from it as float32 buffers
+        betas = make_beta_schedule(
+            beta_schedule,
+            timesteps,
+            linear_start=linear_start,
+            linear_end=linear_end,
+            cosine_s=cosine_s,
+        )
+        alphas = 1.0 - betas
+        alphas_cumprod = np.cumprod(alphas, axis=0)
+        alphas_cumprod_prev = np.append(1.0, alphas_cumprod[:-1])  # shifted by one step, starting at 1.0
+
+        (timesteps,) = betas.shape  # the returned schedule decides the step count; also requires betas to be 1-D
+        self.num_timesteps = int(timesteps)
+        self.linear_start = linear_start
+        self.linear_end = linear_end
+        assert (
+            alphas_cumprod.shape[0] == self.num_timesteps
+        ), "alphas have to be defined for each timestep"
+
+        to_torch = partial(torch.tensor, dtype=torch.float32)  # numpy array -> float32 tensor
+
+        self.register_buffer("betas", to_torch(betas))
+        self.register_buffer("alphas_cumprod", to_torch(alphas_cumprod))
+        self.register_buffer("alphas_cumprod_prev", to_torch(alphas_cumprod_prev))
+
+        # calculations for diffusion q(x_t | x_{t-1}) and others
+        self.register_buffer("sqrt_alphas_cumprod", to_torch(np.sqrt(alphas_cumprod)))
+        self.register_buffer(
+            "sqrt_one_minus_alphas_cumprod", to_torch(np.sqrt(1.0 - alphas_cumprod))
+        )
+        self.register_buffer(
+            "log_one_minus_alphas_cumprod", to_torch(np.log(1.0 - alphas_cumprod))
+        )
+        self.register_buffer(
+            "sqrt_recip_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod))
+        )
+        self.register_buffer(
+            "sqrt_recipm1_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod - 1))
+        )
+
+    def q_sample(self, x_start, t, noise=None):  # sqrt(acp[t]) * x_start + sqrt(1 - acp[t]) * noise
+        noise = default(noise, lambda: torch.randn_like(x_start))  # presumably falls back to Gaussian noise when noise is None
+        return (
+            extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
+            + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)
+            * noise
+        )
+
+    def forward(self, x):  # returns (noised latent, noise level drawn for each sample)
+        z = self.model.encode(x)
+        if isinstance(z, DiagonalGaussianDistribution):
+            z = z.sample()
+        z = z * self.scale_factor
+        noise_level = torch.randint(
+            0, self.max_noise_level, (x.shape[0],), device=x.device
+        ).long()  # one integer level per batch element, in [0, max_noise_level)
+        z = self.q_sample(z, noise_level)
+        if self.out_size is not None:
+            z = torch.nn.functional.interpolate(z, size=self.out_size, mode="nearest")
+        return z, noise_level
+
+    def decode(self, z):  # undoes the scale_factor applied in forward(), then decodes with the wrapped model
+        z = z / self.scale_factor
+        return self.model.decode(z)
+
+
+class ConcatTimestepEmbedderND(AbstractEmbModel):
+    """Embed every entry of the input on its own and concatenate the embeddings per row."""
+
+    def __init__(self, outdim):
+        super().__init__()
+        self.timestep = Timestep(outdim)
+        self.outdim = outdim
+
+    def forward(self, x):
+        """Return one row per input row holding the concatenated per-entry embeddings."""
+        if x.ndim == 1:
+            x = x[:, None]
+        assert len(x.shape) == 2
+        batch, n_dims = x.shape
+        # Embed all entries in a single flat call, then regroup them by their source row.
+        flat_emb = self.timestep(rearrange(x, "b d -> (b d)"))
+        return rearrange(flat_emb, "(b d) d2 -> b (d d2)", b=batch, d=n_dims, d2=self.outdim)
+
+
+class GaussianEncoder(Encoder, AbstractEmbModel):
+    """Encoder whose output is passed through a DiagonalGaussianRegularizer."""
+
+    def __init__(self, weight: float = 1.0, flatten_output: bool = True, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.posterior = DiagonalGaussianRegularizer()
+        self.weight = weight
+        self.flatten_output = flatten_output
+
+    def forward(self, x) -> Tuple[Dict, torch.Tensor]:
+        """Return (log, z); log gains 'loss' (a copy of its 'kl_loss') and 'weight'."""
+        z, log = self.posterior(super().forward(x))
+        log["loss"] = log["kl_loss"]
+        log["weight"] = self.weight
+        if self.flatten_output:
+            z = rearrange(z, "b c h w -> b (h w ) c")
+        return log, z
+
+
+class VideoPredictionEmbedderWithEncoder(AbstractEmbModel):
+    """Encode conditioning frames (optionally noise-augmented) and tile them `n_copies` times."""
+
+    def __init__(
+        self,
+        n_cond_frames: int,
+        n_copies: int,
+        encoder_config: dict,
+        sigma_sampler_config: Optional[dict] = None,
+        sigma_cond_config: Optional[dict] = None,
+        is_ae: bool = False,
+        scale_factor: float = 1.0,
+        disable_encoder_autocast: bool = False,
+        en_and_decode_n_samples_a_time: Optional[int] = None,
+    ):
+        super().__init__()
+
+        self.n_cond_frames = n_cond_frames
+        self.n_copies = n_copies
+        self.encoder = instantiate_from_config(encoder_config)
+        self.sigma_sampler = (
+            instantiate_from_config(sigma_sampler_config)
+            if sigma_sampler_config is not None
+            else None
+        )
+        self.sigma_cond = (
+            instantiate_from_config(sigma_cond_config)
+            if sigma_cond_config is not None
+            else None
+        )
+        self.is_ae = is_ae
+        self.scale_factor = scale_factor
+        self.disable_encoder_autocast = disable_encoder_autocast
+        self.en_and_decode_n_samples_a_time = en_and_decode_n_samples_a_time
+
+    def forward(
+        self, vid: torch.Tensor
+    ) -> Union[
+        torch.Tensor,
+        Tuple[torch.Tensor, torch.Tensor],
+        Tuple[torch.Tensor, dict],
+        Tuple[Tuple[torch.Tensor, torch.Tensor], dict],
+    ]:
+        """Return the encoded, scaled and tiled frames.
+
+        A `(frames, sigma embedding)` tuple is returned only when both a sigma sampler and
+        `sigma_cond` are configured; `sigma_cond` without a sampler yields plain frames.
+        """
+        sigma_cond = None  # stays None unless sigmas are actually sampled and embedded below
+        if self.sigma_sampler is not None:
+            b = vid.shape[0] // self.n_cond_frames
+            sigmas = self.sigma_sampler(b).to(vid.device)
+            if self.sigma_cond is not None:
+                sigma_cond = self.sigma_cond(sigmas)
+                sigma_cond = repeat(sigma_cond, "b d -> (b t) d", t=self.n_copies)
+            sigmas = repeat(sigmas, "b -> (b t)", t=self.n_cond_frames)
+            noise = torch.randn_like(vid)
+            vid = vid + noise * append_dims(sigmas, vid.ndim)
+
+        with torch.autocast("cuda", enabled=not self.disable_encoder_autocast):
+            n_samples = self.en_and_decode_n_samples_a_time
+            if n_samples is None:
+                n_samples = vid.shape[0]
+            n_rounds = math.ceil(vid.shape[0] / n_samples)
+            encode = self.encoder.encode if self.is_ae else self.encoder
+            all_out = [encode(vid[n * n_samples : (n + 1) * n_samples]) for n in range(n_rounds)]
+
+        vid = torch.cat(all_out, dim=0)
+        vid *= self.scale_factor
+
+        vid = rearrange(vid, "(b t) c h w -> b () (t c) h w", t=self.n_cond_frames)
+        vid = repeat(vid, "b 1 c h w -> (b t) c h w", t=self.n_copies)
+
+        return (vid, sigma_cond) if sigma_cond is not None else vid
+
+
+class FrozenOpenCLIPImagePredictionEmbedder(AbstractEmbModel):
+    """Embed frames with the embedder built from `open_clip_embedding_config`.
+
+    The flat frame axis is regrouped per sample and each sample is tiled `n_copies` times.
+    """
+
+    def __init__(self, open_clip_embedding_config: Dict, n_cond_frames: int, n_copies: int):
+        super().__init__()
+        self.n_cond_frames = n_cond_frames
+        self.n_copies = n_copies
+        self.open_clip = instantiate_from_config(open_clip_embedding_config)
+
+    def forward(self, vid):
+        # One embedding row per frame, as the first rearrange pattern requires.
+        frame_emb = self.open_clip(vid)
+        # Split the flat frame axis into (sample, frame).
+        per_sample = rearrange(frame_emb, "(b t) d -> b t d", t=self.n_cond_frames)
+        # Tile every sample n_copies times along the batch axis.
+        return repeat(per_sample, "b t d -> (b s) t d", s=self.n_copies)
diff --git a/sgm/modules/video_attention.py b/sgm/modules/video_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..5152dbb99f9ce8c3afc4464c6d4817544c5345c4
--- /dev/null
+++ b/sgm/modules/video_attention.py
@@ -0,0 +1,303 @@
+import torch
+
+from ..modules.attention import *
+from ..modules.diffusionmodules.util import AlphaBlender, linear, timestep_embedding
+
+
+class TimeMixSequential(nn.Sequential):  # nn.Sequential variant whose layers also receive context and timesteps
+    def forward(self, x, context=None, timesteps=None):
+        for layer in self:
+            x = layer(x, context, timesteps)  # context/timesteps are passed positionally, unchanged, to every layer
+        # x is now the output of the last layer (or the untouched input for an empty container)
+        return x
+
+
+class VideoTransformerBlock(nn.Module):  # transformer block applied along the time axis (see the rearrange in _forward)
+    ATTENTION_MODES = {  # attn_mode string -> attention class used for attn1/attn2
+        "softmax": CrossAttention,
+        "softmax-xformers": MemoryEfficientCrossAttention,
+    }
+
+    def __init__(
+        self,
+        dim,
+        n_heads,
+        d_head,
+        dropout=0.0,
+        context_dim=None,
+        gated_ff=True,
+        checkpoint=True,
+        timesteps=None,
+        ff_in=False,
+        inner_dim=None,
+        attn_mode="softmax",
+        disable_self_attn=False,
+        disable_temporal_crossattention=False,
+        switch_temporal_ca_to_sa=False,
+    ):
+        super().__init__()
+
+        attn_cls = self.ATTENTION_MODES[attn_mode]  # KeyError for an unknown attn_mode
+
+        self.ff_in = ff_in or inner_dim is not None  # bool flag here; replaced by the module below when truthy
+        if inner_dim is None:
+            inner_dim = dim
+
+        assert int(n_heads * d_head) == inner_dim
+
+        self.is_res = inner_dim == dim  # residual additions in _forward are applied only when widths match
+
+        if self.ff_in:
+            self.norm_in = nn.LayerNorm(dim)
+            self.ff_in = FeedForward(
+                dim, dim_out=inner_dim, dropout=dropout, glu=gated_ff
+            )
+
+        self.timesteps = timesteps  # optional fixed frame count; _forward checks it against its own argument
+        self.disable_self_attn = disable_self_attn
+        if self.disable_self_attn:
+            self.attn1 = attn_cls(
+                query_dim=inner_dim,
+                heads=n_heads,
+                dim_head=d_head,
+                context_dim=context_dim,
+                dropout=dropout,
+            )  # is a cross-attention
+        else:
+            self.attn1 = attn_cls(
+                query_dim=inner_dim, heads=n_heads, dim_head=d_head, dropout=dropout
+            )  # is a self-attention
+
+        self.ff = FeedForward(inner_dim, dim_out=dim, dropout=dropout, glu=gated_ff)
+
+        if disable_temporal_crossattention:
+            if switch_temporal_ca_to_sa:
+                raise ValueError  # the two flags are mutually exclusive
+            else:
+                self.attn2 = None  # note: norm2 is not created in this case
+        else:
+            self.norm2 = nn.LayerNorm(inner_dim)
+            if switch_temporal_ca_to_sa:
+                self.attn2 = attn_cls(
+                    query_dim=inner_dim, heads=n_heads, dim_head=d_head, dropout=dropout
+                )  # is a self-attention
+            else:
+                self.attn2 = attn_cls(
+                    query_dim=inner_dim,
+                    context_dim=context_dim,
+                    heads=n_heads,
+                    dim_head=d_head,
+                    dropout=dropout,
+                )  # is self-attn if context is none
+
+        self.norm1 = nn.LayerNorm(inner_dim)
+        self.norm3 = nn.LayerNorm(inner_dim)
+        self.switch_temporal_ca_to_sa = switch_temporal_ca_to_sa
+
+        self.checkpoint = checkpoint  # when truthy, forward() routes _forward through checkpoint()
+        '''
+        if self.checkpoint:
+            print(f"{self.__class__.__name__} is using checkpointing")
+        '''
+
+    def forward(
+        self, x: torch.Tensor, context: torch.Tensor = None, timesteps: int = None
+    ) -> torch.Tensor:
+        if self.checkpoint:
+            return checkpoint(self._forward, x, context, timesteps)
+        else:
+            return self._forward(x, context, timesteps=timesteps)
+
+    def _forward(self, x, context=None, timesteps=None):
+        assert self.timesteps or timesteps  # a frame count must come from the constructor or the call
+        assert not (self.timesteps and timesteps) or self.timesteps == timesteps  # and the two must agree if both are given
+        timesteps = self.timesteps or timesteps
+        B, S, C = x.shape
+        x = rearrange(x, "(b t) s c -> (b s) t c", t=timesteps)  # fold space into the batch axis, expose time as the sequence axis
+
+        if self.ff_in:
+            x_skip = x
+            x = self.ff_in(self.norm_in(x))
+            if self.is_res:
+                x += x_skip
+
+        if self.disable_self_attn:
+            x = self.attn1(self.norm1(x), context=context) + x
+        else:
+            x = self.attn1(self.norm1(x)) + x
+
+        if self.attn2 is not None:
+            if self.switch_temporal_ca_to_sa:
+                x = self.attn2(self.norm2(x)) + x
+            else:
+                x = self.attn2(self.norm2(x), context=context) + x
+        x_skip = x
+        x = self.ff(self.norm3(x))
+        if self.is_res:
+            x += x_skip
+
+        x = rearrange(
+            x, "(b s) t c -> (b t) s c", s=S, b=B // timesteps, c=C, t=timesteps
+        )  # restore the input layout; c=C also checks that the channel width is back to the input's
+        return x
+
+    def get_last_layer(self):  # weight of the final layer of the output feed-forward
+        return self.ff.net[-1].weight
+
+
+class SpatialVideoTransformer(SpatialTransformer):  # adds one temporal VideoTransformerBlock per spatial transformer block
+    def __init__(
+        self,
+        in_channels,
+        n_heads,
+        d_head,
+        depth=1,
+        dropout=0.0,
+        use_linear=False,
+        context_dim=None,
+        use_spatial_context=False,
+        timesteps=None,
+        merge_strategy: str = "fixed",
+        merge_factor: float = 0.5,
+        time_context_dim=None,
+        ff_in=False,
+        checkpoint=False,
+        time_depth=1,
+        attn_mode="softmax",
+        disable_self_attn=False,
+        disable_temporal_crossattention=False,
+        max_time_embed_period: int = 10000,
+    ):
+        super().__init__(
+            in_channels,
+            n_heads,
+            d_head,
+            depth=depth,
+            dropout=dropout,
+            attn_type=attn_mode,
+            use_checkpoint=checkpoint,
+            context_dim=context_dim,
+            use_linear=use_linear,
+            disable_self_attn=disable_self_attn,
+        )
+        self.time_depth = time_depth  # stored only; the temporal stack below is sized by depth
+        self.depth = depth
+        self.max_time_embed_period = max_time_embed_period
+
+        time_mix_d_head = d_head
+        n_time_mix_heads = n_heads
+
+        time_mix_inner_dim = int(time_mix_d_head * n_time_mix_heads)
+
+        inner_dim = n_heads * d_head
+        if use_spatial_context:
+            time_context_dim = context_dim  # temporal blocks then attend to the spatial context
+
+        self.time_stack = nn.ModuleList(
+            [
+                VideoTransformerBlock(
+                    inner_dim,
+                    n_time_mix_heads,
+                    time_mix_d_head,
+                    dropout=dropout,
+                    context_dim=time_context_dim,
+                    timesteps=timesteps,
+                    checkpoint=checkpoint,
+                    ff_in=ff_in,
+                    inner_dim=time_mix_inner_dim,
+                    attn_mode=attn_mode,
+                    disable_self_attn=disable_self_attn,
+                    disable_temporal_crossattention=disable_temporal_crossattention,
+                )
+                for _ in range(self.depth)
+            ]
+        )
+
+        assert len(self.time_stack) == len(self.transformer_blocks)  # forward() zips the two stacks
+
+        self.use_spatial_context = use_spatial_context
+        self.in_channels = in_channels
+
+        time_embed_dim = self.in_channels * 4
+        self.time_pos_embed = nn.Sequential(
+            linear(self.in_channels, time_embed_dim),
+            nn.SiLU(),
+            linear(time_embed_dim, self.in_channels),
+        )  # maps the frame-index embedding: in_channels -> 4 * in_channels -> in_channels
+
+        self.time_mixer = AlphaBlender(
+            alpha=merge_factor, merge_strategy=merge_strategy
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        context: Optional[torch.Tensor] = None,
+        time_context: Optional[torch.Tensor] = None,
+        timesteps: Optional[int] = None,
+        image_only_indicator: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        _, _, h, w = x.shape
+        x_in = x  # kept for the residual connection at the end
+        spatial_context = None
+        if exists(context):
+            spatial_context = context
+
+        if self.use_spatial_context:
+            assert (
+                context.ndim == 3
+            ), f"n dims of spatial context should be 3 but are {context.ndim}"
+
+            time_context = context
+            time_context_first_timestep = time_context[::timesteps]  # every timesteps-th row of the context
+            time_context = repeat(
+                time_context_first_timestep, "b ... -> (b n) ...", n=h * w
+            )  # one copy per spatial position
+        elif time_context is not None and not self.use_spatial_context:
+            time_context = repeat(time_context, "b ... -> (b n) ...", n=h * w)
+            if time_context.ndim == 2:
+                time_context = rearrange(time_context, "b c -> b 1 c")
+
+        x = self.norm(x)
+        if not self.use_linear:
+            x = self.proj_in(x)  # projection applied before flattening in this mode
+        x = rearrange(x, "b c h w -> b (h w) c")
+        if self.use_linear:
+            x = self.proj_in(x)  # projection applied after flattening in this mode
+
+        num_frames = torch.arange(timesteps, device=x.device)  # frame indices 0 .. timesteps - 1
+        num_frames = repeat(num_frames, "t -> b t", b=x.shape[0] // timesteps)
+        num_frames = rearrange(num_frames, "b t -> (b t)")  # one frame index per row of x
+        t_emb = timestep_embedding(
+            num_frames,
+            self.in_channels,
+            repeat_only=False,
+            max_period=self.max_time_embed_period,
+        )
+        emb = self.time_pos_embed(t_emb)
+        emb = emb[:, None, :]  # add an axis so it broadcasts over the (h w) positions
+
+        for it_, (block, mix_block) in enumerate(
+            zip(self.transformer_blocks, self.time_stack)
+        ):
+            x = block(
+                x,
+                context=spatial_context,
+            )
+
+            x_mix = x
+            x_mix = x_mix + emb  # only the temporal branch receives the frame-index embedding
+
+            x_mix = mix_block(x_mix, context=time_context, timesteps=timesteps)
+            x = self.time_mixer(
+                x_spatial=x,
+                x_temporal=x_mix,
+                image_only_indicator=image_only_indicator,
+            )
+        if self.use_linear:
+            x = self.proj_out(x)
+        x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w)
+        if not self.use_linear:
+            x = self.proj_out(x)
+        out = x + x_in
+        return out
diff --git a/sgm/motionctrl/camera_motion_control.py b/sgm/motionctrl/camera_motion_control.py
new file mode 100644
index 0000000000000000000000000000000000000000..329359fcdb20d9b802f81ac9092f2b0f0ef53104
--- /dev/null
+++ b/sgm/motionctrl/camera_motion_control.py
@@ -0,0 +1,63 @@
+
+import torch.nn as nn
+
+from sgm.models.diffusion import DiffusionEngine
+from sgm.motionctrl.modified_svd import (
+                                         _forward_VideoTransformerBlock_attan2,
+                                         forward_SpatialVideoTransformer,
+                                         forward_VideoTransformerBlock,
+                                         forward_VideoUnet)
+
+class CameraMotionControl(DiffusionEngine):
+    """DiffusionEngine with camera-pose (RT) conditioning hooks.
+
+    The forward methods of the video UNet, its VideoTransformerBlocks and its
+    SpatialVideoTransformers are rebound to the functions from modified_svd, and each
+    VideoTransformerBlock gains a `cc_projection` linear layer.
+    """
+
+    def __init__(self,
+                 pose_embedding_dim = 1,
+                 pose_dim = 12,
+                 *args, **kwargs):
+        # ckpt_path is kept away from the parent constructor and consumed at the very end,
+        # i.e. after every cc_projection layer below has been registered.
+        ckpt_path = kwargs.pop('ckpt_path', None)
+
+        self.use_checkpoint = kwargs['network_config']['params']['use_checkpoint']
+
+        super().__init__(*args, **kwargs)
+
+        unet = self.model.diffusion_model
+        unet.forward = forward_VideoUnet.__get__(unet, unet.__class__)
+
+        # Width contributed by the pose input of each cc_projection layer.
+        pose_width = pose_embedding_dim * pose_dim
+        self.train_module_names = []
+        for _name, _module in unet.named_modules():
+            module_kind = _module.__class__.__name__
+            if module_kind == 'VideoTransformerBlock':
+                module_cls = _module.__class__
+                _module.forward = forward_VideoTransformerBlock.__get__(_module, module_cls)
+                _module._forward = _forward_VideoTransformerBlock_attan2.__get__(_module, module_cls)
+
+                # Linear map from [features ; pose] back to the feature width: the
+                # feature-to-feature square starts as identity and the bias as zero.
+                width = _module.attn2.to_q.in_features
+                cc_projection = nn.Linear(width + pose_width, width)
+                nn.init.eye_(cc_projection.weight[:width, :width])
+                nn.init.zeros_(cc_projection.bias)
+                cc_projection.requires_grad_(True)
+                _module.add_module('cc_projection', cc_projection)
+
+                # Names recorded here: the new projection plus the block's attn2 and norm2.
+                self.train_module_names.extend(
+                    [f'{_name}.cc_projection', f'{_name}.attn2', f'{_name}.norm2']
+                )
+
+            if module_kind == 'SpatialVideoTransformer':
+                _module.forward = forward_SpatialVideoTransformer.__get__(
+                    _module, _module.__class__)
+
+        if ckpt_path is not None:
+            self.init_from_ckpt(ckpt_path)
\ No newline at end of file
diff --git a/sgm/motionctrl/modified_svd.py b/sgm/motionctrl/modified_svd.py
new file mode 100644
index 0000000000000000000000000000000000000000..87532b0b79ab0c88cf32d1abc3cce0a2d50931a0
--- /dev/null
+++ b/sgm/motionctrl/modified_svd.py
@@ -0,0 +1,240 @@
+from functools import partial
+from typing import List, Optional, Union
+
+import torch
+from einops import rearrange, repeat
+
+from sgm.modules.attention import checkpoint, exists
+from sgm.modules.diffusionmodules.util import timestep_embedding
+
+
+### VideoUnet #####
+def forward_VideoUnet(
+        self,
+        x: torch.Tensor,
+        timesteps: torch.Tensor,
+        context: Optional[torch.Tensor] = None,
+        y: Optional[torch.Tensor] = None,
+        time_context: Optional[torch.Tensor] = None,
+        num_video_frames: Optional[int] = None,
+        image_only_indicator: Optional[torch.Tensor] = None,
+        RT: Optional[torch.Tensor] = None
+    ):  # forward for the video UNet that threads an optional RT tensor through `context`
+        if RT is not None:
+             context = {'RT': RT, 'context': context}  # downstream forwards unpack this dict again
+
+        assert (y is not None) == (
+            self.num_classes is not None
+        ), "must specify y if and only if the model is class-conditional -> no, relax this TODO"
+        hs = []  # input-block outputs, popped in reverse as skip connections by the output blocks
+        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
+        emb = self.time_embed(t_emb)
+
+        ## tbd: check the role of "image_only_indicator"
+        num_video_frames = self.num_frames  # NOTE(review): overwrites the num_video_frames argument unconditionally
+        image_only_indicator = torch.zeros(
+                    x.shape[0]//num_video_frames, num_video_frames
+                ).to(x.device) if image_only_indicator is None else image_only_indicator  # all-zero default
+
+        if self.num_classes is not None:
+            assert y.shape[0] == x.shape[0]
+            emb = emb + self.label_emb(y)
+
+        h = x
+        for module in self.input_blocks:
+            h = module(
+                h,
+                emb,
+                context=context,
+                image_only_indicator=image_only_indicator,
+                time_context=time_context,
+                num_video_frames=num_video_frames
+            )
+            hs.append(h)
+        h = self.middle_block(
+            h,
+            emb,
+            context=context,
+            image_only_indicator=image_only_indicator,
+            time_context=time_context,
+            num_video_frames=num_video_frames
+        )
+        for module in self.output_blocks:
+            h = torch.cat([h, hs.pop()], dim=1)  # concatenate the matching skip connection on the channel axis
+            h = module(
+                h,
+                emb,
+                context=context,
+                image_only_indicator=image_only_indicator,
+                time_context=time_context,
+                num_video_frames=num_video_frames
+            )
+        h = h.type(x.dtype)
+        return self.out(h)
+
+### VideoTransformerBlock #####
+
+def forward_VideoTransformerBlock(self, x, context, timesteps):
+    # Dispatch to self._forward, routed through `checkpoint` when self.checkpoint is truthy.
+    if not self.checkpoint:
+        return self._forward(x, context, timesteps=timesteps)
+    return checkpoint(self._forward, x, context, timesteps)
+
+
+def _forward_VideoTransformerBlock_attan2(self, x, context=None, timesteps=None):  # _forward variant that injects RT before attn2
+    assert self.timesteps or timesteps
+    assert not (self.timesteps and timesteps) or self.timesteps == timesteps
+    timesteps = self.timesteps or timesteps
+    B, S, C = x.shape
+    x = rearrange(x, "(b t) s c -> (b s) t c", t=timesteps)  # fold space into the batch axis, expose time as the sequence axis
+
+    if isinstance(context, dict):  # dict form carries the camera pose next to the regular context
+        RT = context['RT'] # (b, t, 12)
+        context = context['context']
+    else:
+        RT = None
+
+    if self.ff_in:
+        x_skip = x
+        x = self.ff_in(self.norm_in(x))
+        if self.is_res:
+            x += x_skip
+
+    if self.disable_self_attn:
+        x = self.attn1(self.norm1(x), context=context) + x
+    else:
+        x = self.attn1(self.norm1(x)) + x
+
+    if RT is not None:
+        # repeat each sample's poses for its S spatial positions so RT lines up with x's (b s) batch axis
+        RT = RT.repeat_interleave(repeats=S, dim=0) # (b*s, t, 12)
+        x = torch.cat([x, RT], dim=-1)  # append the pose values to every token's features
+
+        x = self.cc_projection(x)  # cc_projection is attached to the block by CameraMotionControl.__init__
+
+    if self.attn2 is not None:
+        if self.switch_temporal_ca_to_sa:
+            x = self.attn2(self.norm2(x)) + x
+        else:
+            x = self.attn2(self.norm2(x), context=context) + x
+    x_skip = x
+    x = self.ff(self.norm3(x))
+    if self.is_res:
+        x += x_skip
+
+    x = rearrange(
+        x, "(b s) t c -> (b t) s c", s=S, b=B // timesteps, c=C, t=timesteps
+    )  # restore the input layout
+    return x
+
+
+#### BasicTransformerBlock #####
+def _forward_BasicTransformerBlock(
+    self, x, context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0
+):
+    """Residual attn1 -> attn2 -> feed-forward pass that also accepts a dict context.
+
+    A dict context (as built when RT is supplied) is reduced to its 'context' entry.
+    """
+    if isinstance(context, dict):
+        context = context['context']
+
+    # attn1 sees the context only when self-attention is disabled, and cross-frame
+    # attention is requested only when it runs as a true self-attention.
+    if self.disable_self_attn:
+        attn1_context, n_crossframe = context, 0
+    else:
+        attn1_context, n_crossframe = None, n_times_crossframe_attn_in_self
+    x = self.attn1(
+        self.norm1(x),
+        context=attn1_context,
+        additional_tokens=additional_tokens,
+        n_times_crossframe_attn_in_self=n_crossframe,
+    ) + x
+    x = self.attn2(self.norm2(x), context=context, additional_tokens=additional_tokens) + x
+    return self.ff(self.norm3(x)) + x
+    
+
+#### SpatialVideoTransformer #####
+def forward_SpatialVideoTransformer(
+        self,
+        x: torch.Tensor,
+        context: Optional[torch.Tensor] = None,
+        time_context: Optional[torch.Tensor] = None,
+        timesteps: Optional[int] = None,
+        image_only_indicator: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:  # SpatialVideoTransformer forward that also accepts context as a {'RT', 'context'} dict
+        _, _, h, w = x.shape
+        x_in = x  # kept for the residual connection at the end
+
+        if isinstance(context, dict):  # split the dict form back into pose and regular context
+            RT = context['RT']
+            context = context['context']
+        else:
+            RT = None
+
+        spatial_context = None
+        if exists(context):
+            spatial_context = context
+
+        if self.use_spatial_context:
+            assert (
+                context.ndim == 3
+            ), f"n dims of spatial context should be 3 but are {context.ndim}"
+
+            time_context = context
+            time_context_first_timestep = time_context[::timesteps]  # every timesteps-th row of the context
+            time_context = repeat(
+                time_context_first_timestep, "b ... -> (b n) ...", n=h * w
+            )  # one copy per spatial position
+        elif time_context is not None and not self.use_spatial_context:
+            time_context = repeat(time_context, "b ... -> (b n) ...", n=h * w)
+            if time_context.ndim == 2:
+                time_context = rearrange(time_context, "b c -> b 1 c")
+
+        x = self.norm(x)
+        if not self.use_linear:
+            x = self.proj_in(x)  # projection applied before flattening in this mode
+        x = rearrange(x, "b c h w -> b (h w) c")
+        if self.use_linear:
+            x = self.proj_in(x)  # projection applied after flattening in this mode
+
+        num_frames = torch.arange(timesteps, device=x.device)  # frame indices 0 .. timesteps - 1
+        num_frames = repeat(num_frames, "t -> b t", b=x.shape[0] // timesteps)
+        num_frames = rearrange(num_frames, "b t -> (b t)")  # one frame index per row of x
+        t_emb = timestep_embedding(
+            num_frames,
+            self.in_channels,
+            repeat_only=False,
+            max_period=self.max_time_embed_period,
+        )
+        emb = self.time_pos_embed(t_emb)
+        emb = emb[:, None, :]  # add an axis so it broadcasts over the (h w) positions
+
+        for it_, (block, mix_block) in enumerate(
+            zip(self.transformer_blocks, self.time_stack)
+        ):
+            x = block(
+                x,
+                context=spatial_context,
+            )  # spatial blocks never receive RT
+
+            x_mix = x
+            x_mix = x_mix + emb  # only the temporal branch receives the frame-index embedding
+
+            if RT is not None:
+                x_mix = mix_block(x_mix, context={'context': time_context, 'RT': RT}, timesteps=timesteps)
+            else:
+                x_mix = mix_block(x_mix, context=time_context, timesteps=timesteps)
+            x = self.time_mixer(
+                x_spatial=x,
+                x_temporal=x_mix,
+                image_only_indicator=image_only_indicator,
+            )
+        if self.use_linear:
+            x = self.proj_out(x)
+        x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w)
+        if not self.use_linear:
+            x = self.proj_out(x)
+        out = x + x_in
+        return out
\ No newline at end of file
diff --git a/sgm/util.py b/sgm/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..66d9b2a69db2898323cbf2ad26a09ac8b2facd11
--- /dev/null
+++ b/sgm/util.py
@@ -0,0 +1,275 @@
+import functools
+import importlib
+import os
+from functools import partial
+from inspect import isfunction
+
+import fsspec
+import numpy as np
+import torch
+from PIL import Image, ImageDraw, ImageFont
+from safetensors.torch import load_file as load_safetensors
+
+
+def disabled_train(self, mode=True):
+    """No-op replacement for ``model.train``: ``mode`` is ignored and ``self`` is
+    returned, so the module's train/eval mode does not change anymore."""
+    return self
+
+
+def get_string_from_tuple(s):
+    """Return the first element if ``s`` is the text of a tuple, otherwise ``s`` itself.
+
+    Any ordinary error (non-string, empty string/tuple, unparsable text) returns ``s`` unchanged.
+    """
+    try:
+        if s[0] == "(" and s[-1] == ")":
+            # NOTE(security): eval() runs arbitrary code -- only pass trusted text here.
+            t = eval(s)
+            if type(t) is tuple:
+                return t[0]
+    except Exception:  # was a bare `except:`, which also hid KeyboardInterrupt/SystemExit
+        pass
+    return s
+
+
+def is_power_of_two(n):
+    """
+    Return True if n is a power of 2, otherwise return False.
+
+    Values less than or equal to 0 are never powers of 2. A positive power of 2
+    has exactly one bit set; subtracting 1 clears that bit and sets every bit to
+    its right, so the bitwise AND of n and n - 1 is 0 only for powers of 2.
+
+    >>> is_power_of_two(8)
+    True
+    >>> is_power_of_two(12)
+    False
+    """
+    return False if n <= 0 else (n & (n - 1)) == 0
+
+
+def autocast(f, enabled=True):
+    """Wrap ``f`` so that every call runs inside ``torch.cuda.amp.autocast``."""
+    @functools.wraps(f)  # preserve f's __name__/__doc__ on the returned wrapper
+    def do_autocast(*args, **kwargs):
+        with torch.cuda.amp.autocast(
+            enabled=enabled, dtype=torch.get_autocast_gpu_dtype(),
+            cache_enabled=torch.is_autocast_cache_enabled(),
+        ):
+            return f(*args, **kwargs)
+    return do_autocast
+
+
+def load_partial_from_config(config):
+    return partial(get_obj_from_str(config["target"]), **config.get("params", {}))
+
+
+def log_txt_as_img(wh, xc, size=10):
+    """Render each caption in ``xc`` as black text on a white ``wh``-sized image.
+
+    Args:
+        wh: a tuple of (width, height).
+        xc: a list of captions to plot; a list entry contributes only its first item.
+        size: font size handed to ``ImageFont.truetype``.
+
+    Returns:
+        One tensor stacking a channel-first array per caption, with the image's
+        0..255 pixel values rescaled to the range [-1, 1].
+    """
+    # Caption-independent, so hoisted out of the loop: the font file is read once.
+    font = ImageFont.truetype("data/DejaVuSans.ttf", size=size)
+    nc = int(40 * (wh[0] / 256))  # characters per wrapped line
+    txts = []
+    for caption in xc:
+        text_seq = caption[0] if isinstance(caption, list) else caption
+        lines = "\n".join(
+            text_seq[start : start + nc] for start in range(0, len(text_seq), nc)
+        )
+        txt = Image.new("RGB", wh, color="white")
+        try:
+            ImageDraw.Draw(txt).text((0, 0), lines, fill="black", font=font)
+        except UnicodeEncodeError:
+            print("Cant encode string for logging. Skipping.")
+        txts.append(np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0)
+    return torch.tensor(np.stack(txts))
+
+
+def partialclass(cls, *args, **kwargs):
+    bound_init = functools.partialmethod(cls.__init__, *args, **kwargs)  # pre-bind ctor args
+    class NewCls(cls):
+        __init__ = bound_init
+    return NewCls
+
+
+def make_path_absolute(path):
+    """Return an absolute path when ``path`` is on the local "file" filesystem, else ``path``."""
+    fs, p = fsspec.core.url_to_fs(path)
+    protocols = (fs.protocol,) if isinstance(fs.protocol, str) else fs.protocol
+    return os.path.abspath(p) if "file" in protocols else path  # protocol may be a tuple
+
+
+def ismap(x):
+    """True only for a 4-D ``torch.Tensor`` with more than 3 entries along dim 1."""
+    is_4d_tensor = isinstance(x, torch.Tensor) and len(x.shape) == 4
+    return is_4d_tensor and x.shape[1] > 3
+
+
+def isimage(x):
+    """True only for a 4-D ``torch.Tensor`` with 3 or 1 entries along dim 1."""
+    is_4d_tensor = isinstance(x, torch.Tensor) and len(x.shape) == 4
+    return is_4d_tensor and x.shape[1] in (3, 1)
+
+
+def isheatmap(x):
+    """True only for a ``torch.Tensor`` with exactly two dimensions."""
+    if isinstance(x, torch.Tensor):
+        return x.ndim == 2
+    return False
+
+
+def isneighbors(x):
+    """True only for a 5-D ``torch.Tensor`` with 3 or 1 entries along dim 2."""
+    is_5d_tensor = isinstance(x, torch.Tensor) and x.ndim == 5
+    return is_5d_tensor and x.shape[2] in (3, 1)
+
+
+def exists(x):
+    return not (x is None)  # any value other than None counts as existing
+
+
+def expand_dims_like(x, y):  # pad x with trailing size-1 dims up to y.dim()
+    while x.dim() < y.dim():  # was `!=`, which never terminated when x had more dims
+        x = x.unsqueeze(-1)
+    return x
+
+
+def default(val, d):
+    if val is None:  # fall back to d; a plain-function d is called only in this case
+        return d() if isfunction(d) else d
+    return val
+
+
+def mean_flat(tensor):
+    """
+    Take the mean over all non-batch dimensions (every dimension except the first).
+    https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
+    """
+    return tensor.mean(dim=[axis for axis in range(1, tensor.ndim)])
+
+
+def count_params(model, verbose=False):  # total element count over model.parameters()
+    total_params = sum(weights.numel() for weights in model.parameters())
+    if verbose:
+        print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")
+    return total_params
+
+
+def instantiate_from_config(config):
+    """Build ``config["target"]`` with ``config["params"]`` (if any) as keyword arguments;
+    the two sentinel strings yield None, anything else lacking ``target`` raises KeyError."""
+    if "target" in config:
+        return get_obj_from_str(config["target"])(**config.get("params", dict()))
+    if config in ("__is_first_stage__", "__is_unconditional__"):
+        return None
+    raise KeyError("Expected key `target` to instantiate.")
+
+
+def get_obj_from_str(string, reload=False, invalidate_cache=True):
+    module_name, attr_name = string.rsplit(".", 1)  # "pkg.mod.Name" -> "pkg.mod", "Name"
+    if invalidate_cache:
+        importlib.invalidate_caches()
+    module_obj = importlib.import_module(module_name, package=None)
+    if reload:
+        module_obj = importlib.reload(module_obj)
+    return getattr(module_obj, attr_name)
+
+
+def append_zero(x):
+    return torch.cat((x, x.new_zeros(1)))  # one extra zero, same dtype/device as x
+
+
+def append_dims(x, target_dims):
+    """Index ``x`` with trailing ``None`` axes until it has ``target_dims`` dimensions."""
+    missing = target_dims - x.ndim
+    if missing >= 0:
+        return x[(..., *([None] * missing))]
+    raise ValueError(
+        f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
+    )
+
+
+def load_model_from_config(config, ckpt, verbose=True, freeze=True):
+    """Instantiate ``config.model``, load weights from ``ckpt`` and return it in eval mode.
+
+    A path ending in "ckpt" is read with ``torch.load`` (weights under "state_dict"),
+    one ending in "safetensors" with ``load_safetensors``; others raise NotImplementedError.
+    Loading is non-strict: missing/unexpected keys are only printed (when ``verbose``).
+    With ``freeze``, ``requires_grad`` is switched off on every parameter."""
+    print(f"Loading model from {ckpt}")
+    if ckpt.endswith("ckpt"):
+        pl_sd = torch.load(ckpt, map_location="cpu")
+        if "global_step" in pl_sd:
+            print(f"Global Step: {pl_sd['global_step']}")
+        state_dict = pl_sd["state_dict"]
+    elif ckpt.endswith("safetensors"):
+        state_dict = load_safetensors(ckpt)
+    else:
+        raise NotImplementedError
+    model = instantiate_from_config(config.model)
+    missing, unexpected = model.load_state_dict(state_dict, strict=False)
+    if verbose:
+        for heading, keys in (("missing keys:", missing), ("unexpected keys:", unexpected)):
+            if len(keys) > 0:
+                print(heading)
+                print(keys)
+    if freeze:
+        for weights in model.parameters():
+            weights.requires_grad = False
+    model.eval()
+    return model
+
+
+def get_configs_path() -> str:
+    """
+    Locate the `configs` directory.
+    Checks `configs` next to this file first (an installed copy keeps it in the
+    `sgm` package, see pyproject.toml), then `configs` one directory up (the
+    root of the repository for a working copy). Raises FileNotFoundError if neither exists.
+    """
+    here = os.path.dirname(__file__)
+    candidates = (
+        os.path.join(here, "configs"),
+        os.path.join(here, "..", "configs"),
+    )
+    found = next(filter(os.path.isdir, map(os.path.abspath, candidates)), None)
+    if found is None:
+        raise FileNotFoundError(f"Could not find SGM configs in {candidates}")
+    return found
+
+
+def get_nested_attribute(obj, attribute_path, depth=None, return_key=False):
+    """
+    Resolve a dotted attribute path against `obj`, one component at a time.
+    E.g.:
+        get_nested_attribute(a, "b.c")
+        = getattr(getattr(a, "b"), "c")
+        = a.b.c
+    A component that parses as an integer x is used as an index (a[x]) on the
+    current object rather than as an attribute name.
+    A positive `depth` keeps only the first `depth` components of the path.
+    With `return_key`, the result is a tuple of the resolved value and the
+    dotted path that was actually followed.
+    """
+    parts = attribute_path.split(".")
+    if depth is not None and depth > 0:
+        parts = parts[:depth]
+    assert len(parts) > 0, "At least one attribute should be selected"
+    node = obj
+    for part in parts:
+        try:
+            node = node[int(part)]
+        except ValueError:
+            node = getattr(node, part)
+    followed_key = ".".join(parts)
+    return (node, followed_key) if return_key else node