File size: 19,016 Bytes
d25ae12
8b54513
 
 
 
 
d25ae12
 
8b54513
d25ae12
8b54513
d25ae12
8b54513
 
 
 
 
 
 
 
 
 
 
1a84a43
 
 
 
288480f
5bbfa70
8b54513
 
 
 
 
 
288480f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b54513
 
 
 
 
 
 
 
 
 
1a84a43
 
 
 
 
 
 
 
 
 
 
8b54513
288480f
653564f
 
 
288480f
 
 
 
 
 
 
 
 
 
e7d575b
 
 
8f80216
8b54513
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d25ae12
8b54513
 
 
 
 
 
 
 
 
 
 
 
53238be
e79860f
 
8a794c5
 
 
 
8b54513
 
 
 
 
 
 
e7d575b
 
288480f
8f80216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b54513
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288480f
e7d575b
 
 
 
8b54513
288480f
8b54513
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225975e
1a84a43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b54513
 
 
 
 
 
5813aac
5a055ba
 
40b4b02
5a055ba
225975e
ec63b44
5a055ba
8b54513
225975e
8b54513
225975e
 
085922d
 
047b007
a927763
085922d
a927763
085922d
225975e
578e7e2
b5c97db
 
 
c0b5779
b5c97db
 
 
225975e
 
b5c97db
 
 
 
 
 
 
159853d
1a84a43
 
 
 
 
 
 
 
 
159853d
 
 
1a84a43
930047c
1a84a43
 
 
 
 
 
 
 
159853d
288480f
 
1c2dd5c
 
653564f
1c2dd5c
 
96f8446
 
1c2dd5c
159853d
288480f
 
eaf0661
 
 
effc1b6
eaf0661
 
 
 
8b54513
 
 
d25ae12
a66a8c1
 
 
288480f
a66a8c1
 
 
 
 
 
 
 
 
 
 
 
 
 
225975e
1a84a43
 
 
 
 
288480f
 
225975e
fc68709
 
 
 
 
 
 
 
8b54513
 
 
288480f
8b54513
 
 
 
288480f
8b54513
 
 
 
 
 
 
 
 
 
 
 
 
5b036b0
e79860f
8a794c5
 
1a84a43
8b54513
 
 
 
 
9b8e41b
8b54513
 
09f4959
8b54513
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d25ae12
578e7e2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
import sys
import os
import argparse
import glob
import multiprocessing as mp
from dataclasses import dataclass, field
from functools import partial
from typing import List, Optional

import numpy as np
import torch
import torch.distributed as dist
from fairscale.nn.model_parallel import initialize as fs_init
import gradio as gr
from PIL import Image
import torchvision.transforms as transforms
import torchvision.transforms.functional as F
from huggingface_hub import hf_hub_download
import plotly.graph_objects as go

from util.misc import setup_for_distributed
from util.misc import default_tensor_type
from model.meta import MetaModel
from data.conversation_lib import conv_templates, SeparatorStyle
from data.fintune_dataset import make_audio_features
from data.fintune_dataset import pc_norm
from data import video_utils

# Single-image preprocessing pipeline: random resized crop to 224x224,
# conversion to a tensor, then per-channel normalization.
T_random_resized_crop = transforms.Compose([
    transforms.RandomResizedCrop(
        size=(224, 224),
        scale=(0.9, 1.0),
        ratio=(0.75, 1.3333),
        interpolation=3,  # 3 is bicubic
        antialias=None,
    ),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.48145466, 0.4578275, 0.40821073],
        std=[0.26862954, 0.26130258, 0.27577711],
    ),
])

class PairRandomResizedCrop(transforms.RandomResizedCrop):
    """RandomResizedCrop variant that crops a list of images identically."""

    def forward(self, imgs):
        """Sample one crop box from the first image and apply it to all.

        Args:
            imgs: Sequence of images; crop parameters are drawn from imgs[0].

        Returns:
            A list with one cropped-and-resized image per input image.
        """
        top, left, height, width = self.get_params(
            imgs[0], self.scale, self.ratio
        )
        cropped = []
        for img in imgs:
            cropped.append(
                F.resized_crop(
                    img, top, left, height, width,
                    self.size, self.interpolation,
                    antialias=self.antialias,
                )
            )
        return cropped

class PairToTensor(transforms.ToTensor):
    """ToTensor variant that converts every picture in a list."""

    def __call__(self, pics):
        """Return a list holding F.to_tensor(pic) for each pic in pics."""
        converted = []
        for pic in pics:
            converted.append(F.to_tensor(pic))
        return converted

class PairNormalize(transforms.Normalize):
    """Normalize variant that normalizes every tensor in a list."""

    def forward(self, tensors):
        """Normalize each tensor with this transform's mean/std/inplace."""
        normalized = []
        for tensor in tensors:
            normalized.append(
                F.normalize(tensor, self.mean, self.std, self.inplace)
            )
        return normalized
    
# Paired-image preprocessing pipeline: both images of a pair receive the same
# random resized crop, then tensor conversion and per-channel normalization.
transform_pairimg_train = transforms.Compose([
    PairRandomResizedCrop(
        size=(224, 224),
        scale=(0.99, 1.0),
        ratio=(0.75, 1.3333),
        interpolation=3,  # 3 is bicubic
        antialias=None,
    ),
    PairToTensor(),
    PairNormalize(
        mean=[0.48145466, 0.4578275, 0.40821073],
        std=[0.26862954, 0.26130258, 0.27577711],
    ),
])

def load_audio(audio_path):
    """Build the model input for an audio file.

    Features come from make_audio_features with 128 mel bins; the first two
    axes are swapped and a leading axis is added.
    """
    features = make_audio_features(audio_path, mel_bins=128)
    # Per the original note, the result is shaped [1, 128, 1024].
    return features.transpose(0, 1)[None]
    
def load_video(video_path):
    """Load a video as clip features and keep index 0 of the third axis.

    The video is split into 5 clips of duration 1 by
    video_utils.load_and_transform_video_data.
    """
    clips = video_utils.load_and_transform_video_data(
        video_path, video_path, clip_duration=1, clips_per_video=5,
    )
    return clips[:, :, 0]

def load_point(point_path):
    """Load a point cloud from a .npy file and normalize it with pc_norm."""
    cloud = torch.tensor(np.load(point_path))
    return pc_norm(cloud)

def load_fmri(fmri_path):
    """Load an fMRI array from a .npy file.

    The array is averaged over its first axis and returned as a tensor with
    a new leading axis of size 1.
    """
    averaged = np.load(fmri_path).mean(axis=0)
    return torch.tensor(averaged[None])

def load_rgbx(image_path, x_image_path):
    """Load an RGB image with its companion map as one stacked tensor.

    Args:
        image_path: Path of the RGB image.
        x_image_path: Path of the companion image (e.g. depth or normal map).

    Returns:
        The two transformed images stacked along a new first dimension.
    """
    # trick: replace path if 'depth_scaled' in path
    x_source = x_image_path.replace('depth_scaled', 'depth')

    rgb = Image.open(image_path).convert('RGB')
    # The companion image is resized to the RGB image's size before the
    # shared crop is applied.
    companion = Image.open(x_source).convert('RGB').resize(rgb.size[-2:])

    rgb_tensor, companion_tensor = transform_pairimg_train([rgb, companion])

    # [2, 3, H, W]
    return torch.stack([rgb_tensor, companion_tensor], dim=0)


class Ready:
    """Marker object a model worker puts on the response queue before it
    waits for the next request."""


def _load_modality_inputs(
    modality, img_path, audio_path, video_path, point_path, fmri_path,
    depth_path, depth_rgb_path, normal_path, normal_rgb_path,
):
    """Load the input tensor for the selected modality.

    Branches are checked in a fixed order (image, video, audio, point, fmri,
    rgbd, rgbn); the first one whose keyword occurs in ``modality`` and whose
    path(s) are not None is used.

    Returns:
        The loaded tensor, or None when no branch matches or loading fails.
    """
    try:
        if 'image' in modality and img_path is not None:
            image = Image.open(img_path).convert('RGB')
            return T_random_resized_crop(image)
        if 'video' in modality and video_path is not None:
            return load_video(video_path)
        if 'audio' in modality and audio_path is not None:
            return load_audio(audio_path)
        if 'point' in modality and point_path is not None:
            return load_point(point_path)
        if 'fmri' in modality and fmri_path is not None:
            return load_fmri(fmri_path)
        if 'rgbd' in modality and depth_path is not None and depth_rgb_path is not None:
            return load_rgbx(depth_rgb_path, depth_path)
        if 'rgbn' in modality and normal_path is not None and normal_rgb_path is not None:
            return load_rgbx(normal_rgb_path, normal_path)
    except Exception as exc:
        # Best effort: a broken upload falls back to text-only generation,
        # but the failure is reported instead of being silently swallowed.
        # KeyboardInterrupt/SystemExit are deliberately not caught here.
        print(f"Failed to load '{modality}' input: {exc!r}")
    return None


def model_worker(
    rank: int, args: argparse.Namespace, barrier: mp.Barrier,
    request_queue: mp.Queue, response_queue: Optional[mp.Queue] = None,
) -> None:
    """
    The worker function that manipulates the GPU to run the inference.
    Exact n_gpu workers are started, with each one operating on a separate GPU.

    Args:
        rank (int): Distributed rank of the worker.
        args (argparse.Namespace): All command line arguments.
        barrier (multiprocessing.Barrier): A barrier used to delay the start
            of Web UI to be after the start of the model.
        request_queue (mp.Queue): Queue from which this worker reads one
            request tuple per generation (media paths, chat history,
            sampling parameters and modality).
        response_queue (Optional[mp.Queue]): Queue that receives a ``Ready``
            marker before each request and the streamed response dicts.
            When None, nothing is reported back.
    """

    world_size = len(args.gpu_ids)
    gpu_id = args.gpu_ids[rank]
    dist.init_process_group(
        backend="nccl", rank=rank, world_size=world_size,
        init_method=f"tcp://{args.master_addr}:{args.master_port}",
    )
    print(f"| distributed init on worker {rank}/{world_size}. "
          f"using gpu: {gpu_id}")
    fs_init.initialize_model_parallel(world_size)
    torch.cuda.set_device(gpu_id)

    torch.manual_seed(1)
    np.random.seed(1)

    # set the print behavior.
    setup_for_distributed(rank == 0)

    target_dtype = {
        "bf16": torch.bfloat16,
        "fp16": torch.float16
    }[args.dtype]
    with default_tensor_type(dtype=target_dtype, device="cuda"):
        model = MetaModel(args.llama_type, args.llama_config, tokenizer_path=args.tokenizer_path)
    for ckpt_id in range(args.num_ckpts):
        ckpt_path = hf_hub_download(repo_id=args.pretrained_path, filename=args.ckpt_format.format(str(ckpt_id)))
        # ckpt_path = os.path.join(args.pretrained_path, args.ckpt_format.format(str(ckpt_id)))
        print(f"Loading pretrained weights {ckpt_path}")
        # NOTE(review): torch.load unpickles the checkpoint file; only point
        # args.pretrained_path at a trusted repository.
        checkpoint = torch.load(ckpt_path, map_location='cpu')
        # strict=False: each checkpoint shard is loaded non-strictly.
        model.load_state_dict(checkpoint, strict=False)
    model.cuda()
    model.eval()
    print(f"Model = {str(model)}")

    barrier.wait()

    while True:
        # Announce that this worker is idle before blocking on a request.
        if response_queue is not None:
            response_queue.put(Ready())
        (
            img_path, audio_path, video_path, point_path, fmri_path,
            depth_path, depth_rgb_path, normal_path, normal_rgb_path,
            chatbot, max_gen_len, temperature, top_p, modality,
        ) = request_queue.get()

        inputs = _load_modality_inputs(
            modality, img_path, audio_path, video_path, point_path, fmri_path,
            depth_path, depth_rgb_path, normal_path, normal_rgb_path,
        )
        if inputs is not None:
            # Add a leading batch axis and move to the GPU in the model dtype.
            inputs = inputs[None].cuda().to(target_dtype)

        conv = conv_templates["v1"].copy()
        for user, bot in chatbot:
            conv.append_message(conv.roles[0], user)
            conv.append_message(conv.roles[1], bot)

        # The separator does not change while streaming, so pick it once.
        conv_sep = (
            conv.sep
            if conv.sep_style == SeparatorStyle.SINGLE
            else conv.sep2
        )

        with torch.cuda.amp.autocast(dtype=target_dtype):
            prompt = conv.get_prompt()
            print(prompt)
            for stream_response in model.stream_generate(
                prompt, inputs,
                max_gen_len=max_gen_len, temperature=temperature, top_p=top_p,
                modal=modality,
            ):
                # Cut the text at the separator and mark the reply finished.
                end_pos = stream_response["text"].find(conv_sep)
                if end_pos != -1:
                    stream_response["text"] = (
                        stream_response['text'][:end_pos].rstrip() + "\n"
                    )
                    stream_response["end_of_content"] = True

                # keep a few characters if not end_of_content to avoid sending
                # part of conv_sep before all of it is generated.
                if not stream_response["end_of_content"]:
                    if len(stream_response["text"]) < len(conv_sep):
                        continue
                    stream_response["text"] = (
                        stream_response["text"][:-len(conv_sep)]
                    )

                if response_queue is not None:
                    response_queue.put(stream_response)

                if stream_response["end_of_content"]:
                    break


def gradio_worker(
    request_queues: List[mp.Queue], response_queue: mp.Queue,
    args: argparse.Namespace, barrier: mp.Barrier,
) -> None:
    """
    The gradio worker is responsible for displaying the WebUI and relay the
    requests to model workers. It should be launched only once.

    Args:
        request_queues (List[mp.Queue]): A list of request queues (one for
            each model worker).
        response_queue (mp.Queue): Queue from which ``Ready`` markers and
            streamed response dicts are read.
        args (argparse.Namespace): All command line arguments.
        barrier (multiprocessing.Barrier): A barrier used to delay the start
            of Web UI to be after the start of the model.
    """

    def show_user_input(msg, chatbot):
        # Clear the textbox and append the new turn with an empty reply slot.
        return "", chatbot + [[msg, None]]

    def stream_model_output(img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, chatbot, max_gen_len, gen_t, top_p, modality):
        # Discard anything left on the queue until a Ready marker arrives.
        while True:
            content_piece = response_queue.get()
            if isinstance(content_piece, Ready):
                break
        # Every model worker receives the same request.
        for queue in request_queues:
            queue.put((img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, chatbot, max_gen_len, gen_t, top_p, modality))
        # Stream partial replies into the last chat turn until finished.
        while True:
            content_piece = response_queue.get()
            chatbot[-1][1] = content_piece["text"]
            yield chatbot
            if content_piece["end_of_content"]:
                break

    def undo(chatbot):
        # Drop the most recent turn, if any.
        if len(chatbot) > 0:
            chatbot = chatbot[:-1]
        return chatbot

    def clear():
        chatbot = []
        msg = ""
        return chatbot, msg

    def show_point_cloud(file):
        # Nothing uploaded yet: return None instead of failing in np.load.
        if file is None:
            return None
        point = load_point(file).numpy()
        fig = go.Figure(
            data=[
                go.Scatter3d(
                    x=point[:,0], y=point[:,1], z=point[:,2],
                    mode='markers',
                    marker=dict(
                    size=1.2,
                    color=['rgb({},{},{})'.format(r, g, b) for r,g,b in zip(point[:,3], point[:,4], point[:,5])]
                ))],
            layout=dict(
                scene=dict(
                    xaxis=dict(visible=False),
                    yaxis=dict(visible=False),
                    zaxis=dict(visible=False)
                )),)
        return fig

    def change_modality(modal):
        return modal

    CSS = """
    .contain { display: flex; flex-direction: column; }
    #component-0 { height: 100%; }
    #chatbot { flex-grow: 1; overflow: auto;}
    """

    header = """
    ## OneLLM: One Framework to Align All Modalities with Language
    [[Project Page](https://onellm.csuhan.com)] [[Paper](https://arxiv.org/abs/2312.03700)] [[Code](https://github.com/csuhan/OneLLM)]
    """

    with gr.Blocks(css=CSS, theme=gr.themes.Base()) as demo:
        gr.Markdown(header)
        with gr.Row(equal_height=True):
            # Hidden textbox holding the modality of the selected tab.
            modality = gr.Textbox(value='image', visible=False)
            with gr.Column(scale=1):
                with gr.Tab('Image') as img_tab:
                    img_path = gr.Image(label='Image Input', type='filepath')
                    gr.Examples(
                        examples=[
                            "examples/new_york.jpg",
                            "examples/food_menu.png",
                            ],
                        inputs=[img_path],
                    )
                with gr.Tab('Video') as video_tab:
                    video_path = gr.Video(label='Video Input', max_length=180)
                    gr.Examples(
                        examples=[
                            "examples/flower.mp4",
                            "examples/star_kun.mp4",
                            ],
                        inputs=[video_path],
                    )
                with gr.Tab('Audio') as audio_tab:
                    audio_path = gr.Audio(label='Audio Input', type='filepath', sources=['upload'])
                    gr.Examples(
                        examples=[
                            "examples/bell_ring.wav",
                            "examples/bird_audio.wav",
                            ],
                        inputs=[audio_path],
                    )
                with gr.Tab('Point Cloud') as point_tab:
                    point_path = gr.File(label='Point Cloud Input', elem_id="pointpath", elem_classes="")
                    point_vis = gr.Plot()
                    btn = gr.Button(value="Show Point Cloud")
                    btn.click(show_point_cloud, point_path, point_vis)
                    gr.Examples(
                        examples=glob.glob("examples/point/*.npy"),
                        inputs=[point_path],
                        examples_per_page=5,
                    )
                with gr.Tab('IMU') as imu_tab:
                    gr.Markdown('Coming soon🤗')
                with gr.Tab('fMRI') as fmri_tab:
                    fmri_path = gr.File(label='fMRI Input', elem_id="fmripath", elem_classes="")
                    fmri_image_path = gr.Image(label='Reference Image', interactive=False)
                    gr.Examples(
                        examples=[
                            [file.replace('.jpg', '.npy'), file]
                            for file in glob.glob("examples/fmri/*.jpg")
                        ],
                        inputs=[fmri_path, fmri_image_path],
                        examples_per_page=3,
                    )
                with gr.Tab('Depth Map') as depth_tab:
                    depth_path = gr.Image(label='Depth Map', type='filepath')
                    depth_rgb_path = gr.Image(label='RGB Image', type='filepath')
                    gr.Examples(
                        examples=[
                            [rgb_image.replace('rgb', 'depth_scaled'), rgb_image]
                            for rgb_image in glob.glob("examples/depth_normal/rgb/*.png")[:9]
                        ],
                        inputs=[depth_path, depth_rgb_path],
                        examples_per_page=3,
                    )
                with gr.Tab('Normal Map') as normal_tab:
                    normal_path = gr.Image(label='Normal Map', type='filepath')
                    normal_rgb_path = gr.Image(label='RGB Image', type='filepath')
                    gr.Examples(
                        examples=[
                            [rgb_image.replace('rgb', 'normal'), rgb_image]
                            for rgb_image in glob.glob("examples/depth_normal/rgb/*.png")[9:]
                        ],
                        inputs=[normal_path, normal_rgb_path],
                        examples_per_page=3,
                    )
            with gr.Column(scale=2):
                chatbot = gr.Chatbot(elem_id="chatbot")
                msg = gr.Textbox()

                with gr.Row():
                    submit_button = gr.Button("Submit", variant="primary")
                    undo_button = gr.Button("Undo")
                    clear_button = gr.ClearButton([chatbot, msg, img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, point_vis])
                with gr.Row():
                    max_gen_len = gr.Slider(
                        minimum=1, maximum=args.model_max_seq_len // 2,
                        value=args.model_max_seq_len // 2, interactive=True,
                        label="Single-turn max response length",
                    )
                    gen_t = gr.Slider(
                        minimum=0, maximum=1, value=0.1, interactive=True,
                        label="Temperature",
                    )
                    top_p = gr.Slider(
                        minimum=0, maximum=1, value=0.75, interactive=True,
                        label="Top-p",
                    )

        # Selecting a tab stores its modality keyword in the hidden textbox.
        # (The IMU tab has no handler.)
        tab_modalities = [
            (img_tab, 'image'),
            (video_tab, 'video'),
            (audio_tab, 'audio'),
            (point_tab, 'point'),
            (fmri_tab, 'fmri'),
            (depth_tab, 'rgbd'),
            (normal_tab, 'rgbn'),
        ]
        for tab, modal_name in tab_modalities:
            tab.select(partial(change_modality, modal_name), [], [modality])

        # Changing any of these inputs resets the conversation.
        resetting_inputs = [
            img_path, audio_path, video_path, point_path,
            fmri_path, depth_path, normal_path,
        ]
        for media_input in resetting_inputs:
            media_input.change(clear, [], [chatbot, msg])

        # Pressing Enter in the textbox and clicking Submit behave the same:
        # show the user message, then stream the model reply.
        generation_inputs = [
            img_path, audio_path, video_path, point_path, fmri_path,
            depth_path, depth_rgb_path, normal_path, normal_rgb_path,
            chatbot, max_gen_len, gen_t, top_p, modality,
        ]
        for trigger in (msg.submit, submit_button.click):
            trigger(
                show_user_input, [msg, chatbot], [msg, chatbot],
            ).then(
                stream_model_output, list(generation_inputs), chatbot,
            )
        undo_button.click(undo, chatbot, chatbot)
    # Wait until every model worker has finished loading before serving.
    barrier.wait()
    demo.queue(api_open=True).launch(share=True, max_threads=1)


@dataclass
class DemoConfig:
    """Settings for the demo, used in place of parsed command line arguments.

    Every attribute is an annotated dataclass field, so defaults can be
    overridden per instance, e.g. ``DemoConfig(gpu_ids=[0, 1])``.
    """

    # GPU ids; one model worker is started per entry.
    gpu_ids: List[int] = field(default_factory=lambda: [0])
    tokenizer_path: str = "config/llama2/tokenizer.model"
    llama_type: str = "onellm"
    llama_config: str = "config/llama2/7B.json"
    # The UI caps the response-length slider at half of this value.
    model_max_seq_len: int = 2048
    # Repo id passed to hf_hub_download when fetching checkpoints.
    pretrained_path: str = "csuhan/OneLLM-7B-hf"
    # pretrained_path = "/home/pgao/jiaming/weights/7B_v20_splits/"
    # Checkpoint file name template; "{}" is filled with the shard index.
    ckpt_format: str = "consolidated.00-of-01.s{}.pth"
    # Number of checkpoint shards to load (indices 0 .. num_ckpts - 1).
    num_ckpts: int = 10
    # Address and port used for the torch.distributed TCP init method.
    master_port: int = 23863
    master_addr: str = "127.0.0.1"
    # Model dtype key: "fp16" or "bf16".
    dtype: str = "fp16"

if __name__ == "__main__":
    args = DemoConfig()

    # using the default "fork" method messes up some imported libs (e.g.,
    # pandas); the switch to "spawn" below is currently disabled.
    # mp.set_start_method("spawn")

    # setup the queues and start the model workers
    request_queues = []
    response_queue = mp.Queue()
    worker_processes = []
    # One party per model worker plus one for the Gradio UI.
    barrier = mp.Barrier(len(args.gpu_ids) + 1)
    for rank in range(len(args.gpu_ids)):
        request_queue = mp.Queue()
        # Only rank 0 reports back to the UI.
        rank_response_queue = response_queue if rank == 0 else None
        process = mp.Process(
            target=model_worker,
            args=(rank, args, barrier, request_queue, rank_response_queue),
        )
        process.start()
        worker_processes.append(process)
        request_queues.append(request_queue)

    try:
        gradio_worker(request_queues, response_queue, args, barrier)
    finally:
        # The workers loop forever; stop them explicitly so that a UI failure
        # or shutdown does not leave processes holding the GPUs.
        for process in worker_processes:
            if process.is_alive():
                process.terminate()
        for process in worker_processes:
            process.join()