Spaces:

remyxai
/

SpaceMantis

Sleeping

App Files Files Community

smellslikeml committed on Aug 13

Commit

3e7a2b7

•

1 Parent(s): 1afbcbd

update

Browse files

Files changed (11) hide show

README.md +6 -5
app.py +197 -126
barchart.jpeg +0 -0
examples/warehouse_rgb.jpg +0 -0
models/conversation.py +450 -0
models/mllava/__init__.py +4 -0
models/mllava/configuration_llava.py +134 -0
models/mllava/modeling_llava.py +770 -0
models/mllava/processing_llava.py +381 -0
models/mllava/utils.py +188 -0
requirements.txt +5 -4

README.md CHANGED Viewed

@@ -1,13 +1,14 @@
 ---
-title: SpaceLlama3.1
-emoji: 🌍
 colorFrom: blue
-colorTo: gray
 sdk: gradio
-sdk_version: 4.40.0
 app_file: app.py
 pinned: false
-license: llama3.1
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: SpaceMantis
+emoji: 🌌
 colorFrom: blue
+colorTo: purple
 sdk: gradio
+sdk_version: 4.24.0
 app_file: app.py
 pinned: false
+license: apache-2.0
+short_description: Multimodal Language Model specialized for spatial reasoning
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,132 +1,203 @@
-"""SpaceLlama3.1 demo gradio app."""
-import datetime
-import logging
-import os
 import gradio as gr
-import torch
-import PIL.Image
-from prismatic import load
-from huggingface_hub import login
-# Authenticate with the Hugging Face Hub
-def authenticate_huggingface():
-    hf_token = os.getenv("HF_TOKEN")
-    if hf_token:
-        login(token=hf_token)
-    else:
-        raise ValueError("Hugging Face API token not found. Please set it as an environment variable named 'HF_TOKEN'.")
-# Call the authentication function once at the start
-authenticate_huggingface()
-INTRO_TEXT = """SpaceLlama3.1 demo\n\n
-| [Model](https://huggingface.co/remyxai/SpaceLlama3.1)
-| [GitHub](https://github.com/remyxai/VQASynth/tree/main)
-| [Demo](https://huggingface.co/spaces/remyxai/SpaceLlama3.1)
-| [Discord](https://discord.gg/DAy3P5wYJk)
-\n\n
-**This is an experimental research model.** Make sure to add appropriate guardrails when using the model for applications.
-"""
-# Set model location as a constant outside the function
-MODEL_LOCATION = "remyxai/SpaceLlama3.1"  # Update as needed
-# Global model variable
-global_model = None
-def load_model():
-    """Loads the model globally."""
-    global global_model
-    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-    global_model = load(MODEL_LOCATION)
-    global_model.to(device, dtype=torch.bfloat16)
-    logging.info("Model loaded successfully.")
-def compute(image, prompt):
-    """Runs model inference."""
-    if image is None:
-        raise gr.Error("Image required")
-    logging.info('prompt="%s"', prompt)
-    # Open the image file
-    if isinstance(image, str):
-        image = PIL.Image.open(image).convert("RGB")
-    # Use the globally loaded model
-    vlm = global_model
-    # Prepare prompt
-    prompt_builder = vlm.get_prompt_builder()
-    prompt_builder.add_turn(role="human", message=prompt)
-    prompt_text = prompt_builder.get_prompt()
-    # Generate the text based on image and prompt
-    generated_text = vlm.generate(
-        image,
-        prompt_text,
-        do_sample=True,
-        temperature=0.1,
-        max_new_tokens=512,
-        min_length=1,
-    )
-    output = generated_text.split("</s>")[0]
-    logging.info('output="%s"', output)
-    return output  # Ensure that output is a string
-def reset():
-    """Resets the input fields."""
-    return "", None
-def create_app():
-    """Creates demo UI."""
     with gr.Blocks() as demo:
-        # Main UI structure
-        gr.Markdown(INTRO_TEXT)
         with gr.Row():
-            image = gr.Image(value=None, label="Image", type="filepath", visible=True)  # input
-            with gr.Column():
-                prompt = gr.Textbox(value="", label="Prompt", visible=True)
-                model_info = gr.Markdown(label="Model Info")
-                run = gr.Button("Run", variant="primary")
-                clear = gr.Button("Clear")
-                highlighted_text = gr.HighlightedText(value="", label="Output", visible=True)
-        # Button event handlers
-        run.click(
-            fn=compute,
-            inputs=[image, prompt],
-            outputs=highlighted_text,  # Ensure this is the right output component
-        )
-        clear.click(fn=reset, inputs=None, outputs=[prompt, image])
-        # Status
-        status = gr.Markdown(f"Startup: {datetime.datetime.now()}")
-        gpu_kind = gr.Markdown(f"GPU=?")
-        demo.load(
-            fn=lambda: f"Model `{MODEL_LOCATION}` loaded.",  # Ensure the output is a string
-            inputs=None,
-            outputs=model_info,
-        )
-    return demo
 if __name__ == "__main__":
-    logging.basicConfig(
-        level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
-    )
-    for k, v in os.environ.items():
-        logging.info('environ["%s"] = %r', k, v)
-    # Load the model once globally
-    load_model()
-    create_app().queue().launch()

 import gradio as gr
+import spaces
+import os
+import time
+from PIL import Image
+import functools
+from models.mllava import MLlavaProcessor, LlavaForConditionalGeneration, chat_mllava_stream, MLlavaForConditionalGeneration, chat_mllava
+from models.conversation import conv_templates
+from typing import List
+# Load the processor and model once at import time; generate()/generate_stream() reuse these globals.
+processor = MLlavaProcessor.from_pretrained("remyxai/SpaceMantis")
+model = LlavaForConditionalGeneration.from_pretrained("remyxai/SpaceMantis")
+# Conversation template; within this file only its `roles` are read (by get_chat_history).
+conv_template = conv_templates['llama_3']
+@spaces.GPU
+def generate_stream(text:str, images:List[Image.Image], history: List[dict], **kwargs):
+    """Stream a reply from the model.
+
+    Yields the first element of every (text, history) pair produced by
+    `chat_mllava_stream`. The generator's return value is the last text
+    yielded, or the `text` argument itself when nothing was produced.
+
+    Args:
+        text: prompt text forwarded to `chat_mllava_stream`.
+        images: images for the turn; an empty/falsy value is forwarded as None.
+        history: prior turns, forwarded as the `history` keyword.
+        **kwargs: forwarded unchanged to `chat_mllava_stream`.
+    """
+    global processor, model
+    # The result of .to("cuda") is stored back in the module-level `model`.
+    model = model.to("cuda")
+    image_inputs = images if images else None
+    latest = text
+    stream = chat_mllava_stream(text, image_inputs, model, processor, history=history, **kwargs)
+    for latest, _updated_history in stream:
+        yield latest
+    return latest
+@spaces.GPU
+def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):
+    """Produce a complete reply in one call.
+
+    Forwards everything to `chat_mllava` and returns only the generated
+    text; the updated history it also returns is discarded.
+
+    Args:
+        text: prompt text forwarded to `chat_mllava`.
+        images: images for the turn; an empty/falsy value is forwarded as None.
+        history: prior turns, forwarded as the `history` keyword.
+        **kwargs: forwarded unchanged to `chat_mllava`.
+    """
+    global processor, model
+    # The result of .to("cuda") is stored back in the module-level `model`.
+    model = model.to("cuda")
+    image_inputs = images if images else None
+    reply, _updated_history = chat_mllava(text, image_inputs, model, processor, history=history, **kwargs)
+    return reply
+def enable_next_image(uploaded_images, image):
+    """Record `image` in `uploaded_images` (mutated in place) and return it with a cleared, non-interactive textbox."""
+    uploaded_images.append(image)
+    locked_box = gr.MultimodalTextbox(value=None, interactive=False)
+    return uploaded_images, locked_box
+def add_message(history, message):
+    """Append a submitted multimodal message to the chat history.
+
+    Each file becomes its own `[(file,), None]` entry and the text, if any,
+    a `[text, None]` entry; None marks the bot reply as still pending.
+
+    Returns:
+        The mutated `history` and a fresh MultimodalTextbox with its value cleared.
+    """
+    # `or ()` mirrors the original truthiness guard: no files, nothing appended.
+    for path in message["files"] or ():
+        history.append([(path,), None])
+    prompt = message["text"]
+    if prompt:
+        history.append([prompt, None])
+    return history, gr.MultimodalTextbox(value=None)
+def print_like_dislike(x: gr.LikeData):
+    """Print the index, value and liked flag of a chatbot like/dislike event."""
+    print(x.index, x.value, x.liked)
+def get_chat_history(history):
+    """Convert chatbot history into a list of {"role", "text"} dicts.
+
+    Only entries whose first element is a string are used (image entries are
+    skipped). Each one contributes a user turn followed by an assistant turn;
+    for the final history entry the assistant text is "" so the model can fill it.
+
+    Raises:
+        AssertionError: if a non-final text entry has no bot reply, or the
+            final entry already has one.
+    """
+    user_role, assistant_role = conv_template.roles[0], conv_template.roles[1]
+    last_index = len(history) - 1
+    turns = []
+    for position, entry in enumerate(history):
+        user_part = entry[0]
+        if not isinstance(user_part, str):
+            continue
+        turns.append({"role": user_role, "text": user_part})
+        bot_part = entry[1]
+        if position == last_index:
+            assert not bot_part, "the bot message internal error, get: {}".format(bot_part)
+            bot_part = ""
+        else:
+            assert bot_part, "The bot message is not provided, internal error"
+        turns.append({"role": assistant_role, "text": bot_part})
+    return turns
+def get_chat_images(history):
+    """Return every image from the history, in order; image entries are those whose first element is a tuple."""
+    return [
+        image
+        for entry in history
+        if isinstance(entry[0], tuple)
+        for image in entry[0]
+    ]
+def bot(history):
+    """Generate the reply for the pending user turn(s), streaming it into `history`.
+
+    Generator: for every chunk from `generate_stream` it stores the text in
+    `history[-1][1]` and yields the whole `history` list.
+
+    Raises:
+        gr.Error: if the pending turns contain no text.
+    """
+    print(history)
+    cur_messages = {"text": "", "images": []}
+    # Walk backwards over the trailing entries that have no bot reply yet,
+    # merging their text (kept in original order) and collecting their images.
+    for message in history[::-1]:
+        if message[1]:
+            break
+        if isinstance(message[0], str):
+            cur_messages["text"] = message[0] + " " + cur_messages["text"]
+        elif isinstance(message[0], tuple):
+            cur_messages["images"].extend(message[0])
+    cur_messages["text"] = cur_messages["text"].strip()
+    # Images were gathered newest-entry-first; flip the list back.
+    cur_messages["images"] = cur_messages["images"][::-1]
+    if not cur_messages["text"]:
+        raise gr.Error("Please enter a message")
+    # Fewer placeholders than images: prepend one "<image> " per unmatched image.
+    if cur_messages['text'].count("<image>") < len(cur_messages['images']):
+        gr.Warning("The number of images uploaded is more than the number of <image> placeholders in the text. Will automatically prepend <image> to the text.")
+        cur_messages['text'] = "<image> "* (len(cur_messages['images']) - cur_messages['text'].count("<image>")) + cur_messages['text']
+        history[-1][0] = cur_messages["text"]
+    # More placeholders than images: reversing the string makes str.replace's
+    # count argument drop the LAST surplus "<image>" occurrences.
+    if cur_messages['text'].count("<image>") > len(cur_messages['images']):
+        gr.Warning("The number of images uploaded is less than the number of <image> placeholders in the text. Will automatically remove extra <image> placeholders from the text.")
+        cur_messages['text'] = cur_messages['text'][::-1].replace("<image>"[::-1], "", cur_messages['text'].count("<image>") - len(cur_messages['images']))[::-1]
+        history[-1][0] = cur_messages["text"]
+    chat_history = get_chat_history(history)
+    chat_images = get_chat_images(history)
+    # Sampling is disabled and a single beam is used.
+    generation_kwargs = {
+        "max_new_tokens": 4096,
+        "num_beams": 1,
+        "do_sample": False
+    }
+    # text is None here: the prompt travels inside chat_history.
+    # NOTE(review): presumably chat_mllava_stream takes the pending user text from the history — confirm in models/mllava/utils.py.
+    response = generate_stream(None, chat_images, chat_history, **generation_kwargs)
+    for _output in response:
+        history[-1][1] = _output
+        time.sleep(0.05)
+        yield history
+def build_demo():
+    """Build the SpaceMantis chat UI and return the `gr.Blocks` app (not yet launched).
+
+    Layout: intro markdown, a chatbot, a multimodal input box, Send/Clear
+    buttons, one example, and the citation block. Both submitting the input
+    box and clicking Send run `add_message` followed by `bot`.
+    """
     with gr.Blocks() as demo:
+        gr.Markdown(""" # SpaceMantis
+SpaceMantis is a multimodal conversational AI model fine-tuned from [Mantis-8B-siglip-llama3](https://huggingface.co/TIGER-Lab/Mantis-8B-siglip-llama3) for enhanced spatial reasoning. It's optimized for multi-image reasoning, where interleaved text and images can be used to generate responses.
+### [Github](https://github.com/remyxai/VQASynth) | [Model](https://huggingface.co/remyxai/SpaceMantis) | [Dataset](https://huggingface.co/datasets/remyxai/mantis-spacellava)
+        """)
+        gr.Markdown("""## Chat with SpaceMantis
+        SpaceMantis supports interleaved text-image input format, where you can simply use the placeholder `<image>` to indicate the position of uploaded images.
+        The model is optimized for multi-image reasoning, while preserving the ability to chat about text and images in a single conversation.
+        (The model currently serving is [🤗 remyxai/SpaceMantis](https://huggingface.co/remyxai/SpaceMantis))
+        """)
+        chatbot = gr.Chatbot(line_breaks=True)
+        chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Enter message or upload images. Please use <image> to indicate the position of uploaded images", show_label=True)
+        chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
+        # No temperature/top-p controls are exposed: bot() always decodes with do_sample=False.
+        # .success() runs bot only when add_message finished without raising.
+        chat_msg.success(bot, chatbot, chatbot, api_name="bot_response")
+        chatbot.like(print_like_dislike, None, None)
         with gr.Row():
+            send_button = gr.Button("Send")
+            clear_button = gr.ClearButton([chatbot, chat_input])
+        # NOTE(review): this reuses api_name="bot_response" from the submit chain above;
+        # confirm how the pinned Gradio version disambiguates duplicate endpoint names.
+        send_button.click(
+            add_message, [chatbot, chat_input], [chatbot, chat_input]
+        ).then(
+            bot, chatbot, chatbot, api_name="bot_response"
+        )
+        gr.Examples(
+            examples=[
+                {
+                    "text": "Give me the height of the man in the red hat in feet.",
+                    "files": ["./examples/warehouse_rgb.jpg"]
+                },
+            ],
+            inputs=[chat_input],
+        )
+        gr.Markdown("""
+## Citation
+```
+@article{chen2024spatialvlm,
+  title = {SpatialVLM: Endowing Vision-Language Models with Spatial Reasoning Capabilities},
+  author = {Chen, Boyuan and Xu, Zhuo and Kirmani, Sean and Ichter, Brian and Driess, Danny and Florence, Pete and Sadigh, Dorsa and Guibas, Leonidas and Xia, Fei},
+  journal = {arXiv preprint arXiv:2401.12168},
+  year = {2024},
+  url = {https://arxiv.org/abs/2401.12168},
+}
+@article{jiang2024mantis,
+  title={MANTIS: Interleaved Multi-Image Instruction Tuning},
+  author={Jiang, Dongfu and He, Xuan and Zeng, Huaye and Wei, Con and Ku, Max and Liu, Qian and Chen, Wenhu},
+  journal={arXiv preprint arXiv:2405.01483},
+  year={2024}
+}
+```""")
+    return demo
 if __name__ == "__main__":
+    # Build the Blocks app and start serving it only when run as a script.
+    demo = build_demo()
+    demo.launch()

barchart.jpeg ADDED Viewed

examples/warehouse_rgb.jpg ADDED Viewed

models/conversation.py ADDED Viewed

	@@ -0,0 +1,450 @@

+import dataclasses
+from enum import auto, Enum
+from typing import List, Tuple
+class SeparatorStyle(Enum):
+    """Different separator style."""
+    # Each member selects one rendering branch in Conversation.get_prompt.
+    SINGLE = auto()   # "role: message" + sep after every turn
+    TWO = auto()      # like SINGLE, but turns alternate between sep and sep2
+    MPT = auto()      # role + message + sep (no ": " between role and message)
+    PLAIN = auto()    # message text only, alternating sep / sep2
+    LLAMA_2 = auto()  # [INST] ... [/INST] wrapping with a <<SYS>> block on the first turn
+    LLAMA_3 = auto()  # <|start_header_id|>role<|end_header_id|> headers
+    MFuyu = auto()    # like TWO, but the system prompt is followed by a newline
+@dataclasses.dataclass
+class Conversation:
+    """Keeps a conversation's system prompt, roles and message history, and renders it as a prompt.
+
+    `messages` holds (role, message) pairs. A message is either a string or a
+    3-tuple (text, image, image_process_mode); `image` must provide
+    `.size`, `.mode`, `.resize`, `.convert` and `.save` (e.g. a PIL image).
+    Entries before `offset` are skipped by `get_images` and `to_gradio_chatbot`.
+    """
+    system: str
+    roles: List[str]
+    messages: List[List[str]]
+    offset: int
+    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+    sep: str = "###"
+    sep2: str = None
+    version: str = "Unknown"
+    skip_next: bool = False
+    @staticmethod
+    def _message_text(message):
+        """Return the text of a message, unpacking (text, image, mode) tuples."""
+        if isinstance(message, tuple):
+            message, _, _ = message
+        return message
+    @staticmethod
+    def _expand_to_square(pil_img, background_color=(122, 116, 104)):
+        """Centre `pil_img` on a square canvas of `background_color`; square images are returned as-is."""
+        from PIL import Image
+        width, height = pil_img.size
+        if width == height:
+            return pil_img
+        side = max(width, height)
+        result = Image.new(pil_img.mode, (side, side), background_color)
+        result.paste(pil_img, ((side - width) // 2, (side - height) // 2))
+        return result
+    @staticmethod
+    def _display_size(image):
+        """Return the (width, height) to resize `image` to: short edge at most 400, long edge at most 800, aspect ratio kept."""
+        max_hw, min_hw = max(image.size), min(image.size)
+        aspect_ratio = max_hw / min_hw
+        max_len, min_len = 800, 400
+        shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
+        longest_edge = int(shortest_edge * aspect_ratio)
+        width, height = image.size
+        if height > width:
+            return shortest_edge, longest_edge
+        return longest_edge, shortest_edge
+    def get_prompt(self):
+        """Render the conversation as a single prompt string according to `sep_style`.
+
+        A falsy message renders only the role header, leaving the turn open
+        for the model to complete.
+
+        Raises:
+            ValueError: if `sep_style` is not a known SeparatorStyle.
+            AssertionError: (LLAMA_2 only) if the first message is empty or
+                does not come from `roles[0]`.
+        """
+        messages = self.messages
+        if len(messages) > 0 and isinstance(messages[0][1], tuple):
+            # First message carries an image: work on a shallow copy so the
+            # stored history is not rewritten. list(...) and plain unpacking
+            # also accept tuple-typed containers/pairs, which .copy() did not.
+            messages = list(self.messages)
+            init_role, init_msg = messages[0]
+            init_msg = init_msg[0].replace("<image>", "").strip()
+            if 'mmtag' in self.version:
+                messages[0] = (init_role, init_msg)
+                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
+                messages.insert(1, (self.roles[1], "Received."))
+            else:
+                messages[0] = (init_role, "<image>" + init_msg)
+        if self.sep_style == SeparatorStyle.SINGLE:
+            ret = self.system + self.sep
+            for role, message in messages:
+                if message:
+                    ret += role + ": " + self._message_text(message) + self.sep
+                else:
+                    ret += role + ":"
+        elif self.sep_style == SeparatorStyle.TWO:
+            seps = [self.sep, self.sep2]
+            ret = self.system + seps[0]
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    ret += role + ": " + self._message_text(message) + seps[i % 2]
+                else:
+                    ret += role + ":"
+        elif self.sep_style == SeparatorStyle.MPT:
+            ret = self.system + self.sep
+            for role, message in messages:
+                if message:
+                    ret += role + self._message_text(message) + self.sep
+                else:
+                    ret += role
+        elif self.sep_style == SeparatorStyle.LLAMA_2:
+            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
+            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
+            ret = ""
+            for i, (role, message) in enumerate(messages):
+                if i == 0:
+                    assert message, "first message should not be none"
+                    assert role == self.roles[0], "first message should come from user"
+                if message:
+                    message = self._message_text(message)
+                    if i == 0:
+                        message = wrap_sys(self.system) + message
+                    if i % 2 == 0:
+                        ret += self.sep + wrap_inst(message)
+                    else:
+                        ret += " " + message + " " + self.sep2
+            # Drop only the leading separator. str.lstrip(sep) would treat sep
+            # as a set of characters and could eat into the prompt itself.
+            if self.sep and ret.startswith(self.sep):
+                ret = ret[len(self.sep):]
+        elif self.sep_style == SeparatorStyle.LLAMA_3:
+            ret = self.system + self.sep
+            for role, message in messages:
+                header = f"<|start_header_id|>{role}<|end_header_id|>\n\n"
+                if message:
+                    ret += header + self._message_text(message) + self.sep
+                else:
+                    ret += header
+        elif self.sep_style == SeparatorStyle.MFuyu:
+            seps = [self.sep, self.sep2]
+            ret = self.system + "\n"
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    ret += role + ": " + self._message_text(message) + seps[i % 2]
+                else:
+                    ret += role + ":"
+        elif self.sep_style == SeparatorStyle.PLAIN:
+            seps = [self.sep, self.sep2]
+            ret = self.system
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    ret += self._message_text(message) + seps[i % 2]
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+        return ret
+    def append_message(self, role, message):
+        """Append a [role, message] pair to the history (requires `messages` to be a list)."""
+        self.messages.append([role, message])
+    def get_images(self, return_pil=False):
+        """Collect the images attached to even-indexed messages after `offset`.
+
+        Each image is processed per its `image_process_mode` ("Pad" squares it,
+        "Resize" forces 336x336, "Default"/"Crop" leave it alone) and then
+        scaled to the display size when its long edge differs from the target.
+
+        Args:
+            return_pil: return the image objects instead of base64 PNG strings.
+
+        Raises:
+            ValueError: on an unknown `image_process_mode`.
+        """
+        import base64
+        from io import BytesIO
+        images = []
+        for i, (role, msg) in enumerate(self.messages[self.offset:]):
+            if i % 2 != 0 or not isinstance(msg, tuple):
+                continue
+            msg, image, image_process_mode = msg
+            if image_process_mode == "Pad":
+                image = self._expand_to_square(image)
+            elif image_process_mode == "Resize":
+                image = image.resize((336, 336))
+            elif image_process_mode not in ["Default", "Crop"]:
+                raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
+            target_size = self._display_size(image)
+            # max(target_size) is the target long edge; skip the resize when it already matches.
+            if max(target_size) != max(image.size):
+                image = image.resize(target_size)
+            if return_pil:
+                images.append(image)
+            else:
+                buffered = BytesIO()
+                image.save(buffered, format="PNG")
+                images.append(base64.b64encode(buffered.getvalue()).decode())
+        return images
+    def to_gradio_chatbot(self):
+        """Return the history after `offset` as [user, assistant] pairs for a Gradio chatbot.
+
+        Images on user messages are resized and inlined as a base64 `<img>` tag
+        placed before the text, with `<image>` placeholders removed.
+        """
+        import base64
+        from io import BytesIO
+        ret = []
+        for i, (role, msg) in enumerate(self.messages[self.offset:]):
+            if i % 2 != 0:
+                # Assistant turn: fill the reply slot of the preceding user turn.
+                ret[-1][-1] = msg
+                continue
+            if isinstance(msg, tuple):
+                msg, image, image_process_mode = msg
+                image = image.resize(self._display_size(image))
+                # JPEG has no alpha/palette support; saving such modes raises.
+                if image.mode not in ("RGB", "L", "CMYK"):
+                    image = image.convert("RGB")
+                buffered = BytesIO()
+                image.save(buffered, format="JPEG")
+                img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+                # The MIME type must match the JPEG bytes encoded above.
+                img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
+                msg = img_str + msg.replace('<image>', '').strip()
+            ret.append([msg, None])
+        return ret
+    def copy(self):
+        """Return a new Conversation with a fresh list of [role, message] pairs.
+
+        `skip_next` is not carried over; the copy gets the default (False).
+        """
+        return Conversation(
+            system=self.system,
+            roles=self.roles,
+            messages=[[x, y] for x, y in self.messages],
+            offset=self.offset,
+            sep_style=self.sep_style,
+            sep=self.sep,
+            sep2=self.sep2,
+            version=self.version)
+    def dict(self):
+        """Return system, roles, messages, offset, sep and sep2 as a plain dict.
+
+        When `get_images()` finds any image, tuple messages are reduced to
+        their text; otherwise `messages` is returned as stored (same object).
+        """
+        messages = self.messages
+        if len(self.get_images()) > 0:
+            messages = [[x, y[0] if isinstance(y, tuple) else y] for x, y in self.messages]
+        return {
+            "system": self.system,
+            "roles": self.roles,
+            "messages": messages,
+            "offset": self.offset,
+            "sep": self.sep,
+            "sep2": self.sep2,
+        }
+# SINGLE-style template seeded with one example exchange. offset=2 makes
+# get_images/to_gradio_chatbot skip that seed. `messages` is a tuple here, so
+# call .copy() (which builds a list) before append_message.
+conv_vicuna_v0 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=(
+        ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
+        ("Assistant",
+            "Renewable energy sources are those that can be replenished naturally in a relatively "
+            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
+            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
+            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
+            "renewable and non-renewable energy sources:\n"
+            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
+            "energy sources are finite and will eventually run out.\n"
+            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
+            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
+            "and other negative effects.\n"
+            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
+            "have lower operational costs than non-renewable sources.\n"
+            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
+            "locations than non-renewable sources.\n"
+            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
+            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
+            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
+            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
+    ),
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# TWO-style template: turns alternate between sep (" ") and sep2 ("</s>").
+conv_vicuna_v1 = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+# LLAMA_2-style template; get_prompt wraps `system` in <<SYS>> on the first user turn.
+conv_llama_2 = Conversation(
+    system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="<s>",
+    sep2="</s>",
+)
+# Same LLAMA_2 layout as conv_llama_2, with a vision-assistant system prompt.
+conv_llava_llama_2 = Conversation(
+    system="You are a helpful language and vision assistant. "
+           "You are able to understand the visual content that the user provides, "
+           "and assist the user with a variety of tasks using natural language.",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="<s>",
+    sep2="</s>",
+)
+# MPT-style template: the role strings already contain the <|im_start|> markers
+# and trailing newline, so get_prompt joins role + message + sep directly.
+conv_mpt = Conversation(
+    system="""<|im_start|>system
+A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+# PLAIN-style template: no system prompt and empty role names.
+# NOTE(review): sep2 is left at its default None, while get_prompt's PLAIN
+# branch appends seps[i % 2]; a non-empty odd-indexed message would therefore
+# try to concatenate None. Confirm whether callers ever render one.
+conv_llava_plain = Conversation(
+    system="",
+    roles=("", ""),
+    messages=(
+    ),
+    offset=0,
+    sep_style=SeparatorStyle.PLAIN,
+    sep="\n",
+)
+# SINGLE-style template with the conv_vicuna_v0 system prompt but no seeded messages.
+conv_llava_v0 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=(
+    ),
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# SINGLE-style template whose version contains "mmtag", which makes get_prompt
+# emit the image as a separate "<Image><image></Image>" turn.
+# NOTE(review): the adjacent string literals join as "...natural language.The visual..."
+# (no space); left untouched because the text is part of the rendered prompt.
+conv_llava_v0_mmtag = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+           "The visual content will be provided with the following format: <Image>visual content</Image>.",
+    roles=("Human", "Assistant"),
+    messages=(
+    ),
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+    version="v0_mmtag",
+)
+# TWO-style template (sep " ", sep2 "</s>") with USER/ASSISTANT roles.
+conv_llava_v1 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+# TWO-style counterpart of conv_llava_v0_mmtag; the "mmtag" version triggers
+# the separate "<Image><image></Image>" turn in get_prompt.
+# NOTE(review): the adjacent string literals join without a space after
+# "natural language."; left untouched because the text is part of the rendered prompt.
+conv_llava_v1_mmtag = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+           "The visual content will be provided with the following format: <Image>visual content</Image>.",
+    roles=("USER", "ASSISTANT"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+    version="v1_mmtag",
+)
+# MFuyu-style template: system prompt followed by a newline, then turns
+# alternating between sep and sep2. Used as `default_conversation`.
+conv_mfuyu_v1 = Conversation(
+    system="You are a helpful language and vision assistant. "
+           "You are able to understand the visual content that the user provides, "
+           "and assist the user with a variety of tasks using natural language.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.MFuyu,
+    sep="<0x04>", # begin of answer token
+    sep2="|ENDOFTEXT|",
+) # copied from conv_vicuna_v1
+# Multi-image SINGLE-style template using "</s>" as the separator; the "mmtag"
+# version triggers the separate "<Image><image></Image>" turn in get_prompt.
+# NOTE(review): the adjacent string literals join without a space after
+# "natural language."; left untouched because the text is part of the rendered prompt.
+conv_mllava_v1_mmtag = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+           "The assistant is able to understand the multiple visual contents that the user provides, and assist the user with a variety of tasks using natural language."
+           "Each visual content will be provided with the following format: <Image>visual content</Image>.",
+    roles=("USER", "ASSISTANT"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="</s>",
+    version="v1_mmtag",
+)
+# SINGLE-style template using "</s>" as the separator after every turn.
+conv_mllava_v1 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_3,
+    sep="<|eot_id|>",
+)
+# LLAMA_3-style template: get_prompt emits `system` + sep, then one
+# <|start_header_id|>role<|end_header_id|> header per turn.
+# NOTE(review): the system text is the "pirate chatbot" sample prompt; it is
+# part of every rendered prompt, so confirm it is intended before changing it.
+conv_llama_3 = Conversation(
+    system="<|start_header_id|>system<|end_header_id|>\n\nYou are a pirate chatbot who always responds in pirate speak!",
+    roles=("user", "assistant"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_3,
+    sep="<|eot_id|>",
+)
+# Template printed by the __main__ smoke test below. Note it is NOT the one
+# registered under the "default" key, and conv_mfuyu_v1 has no key of its own.
+default_conversation = conv_mfuyu_v1
+# Registry mapping template names to the shared template instances above.
+conv_templates = {
+    "default": conv_vicuna_v0,
+    "v0": conv_vicuna_v0,
+    "v1": conv_vicuna_v1,
+    "vicuna_v1": conv_vicuna_v1,
+    "llama_2": conv_llama_2,
+    "plain": conv_llava_plain,
+    "v0_plain": conv_llava_plain,
+    "llava_v0": conv_llava_v0,
+    "v0_mmtag": conv_llava_v0_mmtag,
+    "llava_v1": conv_llava_v1,
+    "v1_mmtag": conv_llava_v1_mmtag,
+    "llava_llama_2": conv_llava_llama_2,
+    "llama_3": conv_llama_3,
+    "mllava_v1": conv_mllava_v1,
+    "mllava_v1_mmtag": conv_mllava_v1_mmtag,
+    "mpt": conv_mpt,
+}
+# Smoke test: print the rendered prompt of the default template.
+if __name__ == "__main__":
+    print(default_conversation.get_prompt())

models/mllava/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .modeling_llava import LlavaForConditionalGeneration, MLlavaForConditionalGeneration
+from .processing_llava import MLlavaProcessor
+from .configuration_llava import LlavaConfig
+from .utils import chat_mllava, chat_mllava_stream

models/mllava/configuration_llava.py ADDED Viewed

	@@ -0,0 +1,134 @@

+# coding=utf-8
+# Copyright 2023 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Llava model configuration"""
+# from ...configuration_utils import PretrainedConfig
+# from ...utils import logging
+# from ..auto import CONFIG_MAPPING
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+from transformers.models.auto import CONFIG_MAPPING
+# Module-level logger obtained from the transformers logging utilities.
+logger = logging.get_logger(__name__)
+# Maps a checkpoint identifier to the URL of its config.json.
+LLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "llava-hf/llava-v1.5-7b": "https://huggingface.co/llava-hf/llava-v1.5-7b/resolve/main/config.json",
+}
+class LlavaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`LlavaForConditionalGeneration`]. It is used to instantiate an
+    Llava model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Llava-9B.
+    e.g. [llava-hf/llava-9b](https://huggingface.co/llava-hf/llava-9b)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vision_config (`LlavaVisionConfig`,  *optional*):
+            Custom vision config or dict
+        text_config (`Union[AutoConfig, dict]`, *optional*):
+            The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`.
+        ignore_index (`int`, *optional*, defaults to -100):
+            The ignore index for the loss function.
+        image_token_index (`int`, *optional*, defaults to 32000):
+            The image token index to encode the image prompt.
+        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The activation function used by the multimodal projector.
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
+            The feature selection strategy used to select the vision feature from the CLIP backbone.
+        vision_feature_layer (`int`, *optional*, defaults to -2):
+            The index of the layer to select the vision feature.
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Llava model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`~LlavaForConditionalGeneration`]
+    Example:
+    ```python
+    >>> from transformers import LlavaForConditionalGeneration, LlavaConfig, CLIPVisionConfig, LlamaConfig
+    >>> # Initializing a CLIP-vision config
+    >>> vision_config = CLIPVisionConfig()
+    >>> # Initializing a Llama config
+    >>> text_config = LlamaConfig()
+    >>> # Initializing a Llava llava-1.5-7b style configuration
+    >>> configuration = LlavaConfig(vision_config, text_config)
+    >>> # Initializing a model from the llava-1.5-7b style configuration
+    >>> model = LlavaForConditionalGeneration(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "llava"
+    is_composition = False
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        ignore_index=-100,
+        image_token_index=32000,
+        projector_hidden_act="gelu",
+        vision_feature_select_strategy="default",
+        vision_feature_layer=-2,
+        vocab_size=32000,
+        **kwargs,
+    ):
+        self.ignore_index = ignore_index
+        self.image_token_index = image_token_index
+        self.projector_hidden_act = projector_hidden_act
+        self.vision_feature_select_strategy = vision_feature_select_strategy
+        self.vision_feature_layer = vision_feature_layer
+        self.vocab_size = vocab_size
+        self.vision_config = vision_config
+        if isinstance(self.vision_config, dict):
+            vision_config["model_type"] = (
+                vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
+            )
+            self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+        elif vision_config is None:
+            self.vision_config = CONFIG_MAPPING["clip_vision_model"](
+                intermediate_size=4096,
+                hidden_size=1024,
+                patch_size=14,
+                image_size=336,
+                num_hidden_layers=24,
+                num_attention_heads=16,
+                vocab_size=32000,
+                projection_dim=768,
+            )
+        self.vocab_size = self.vocab_size
+        self.text_config = text_config
+        if isinstance(self.text_config, dict):
+            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
+            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+            self.vocab_size = self.text_config.vocab_size
+        elif text_config is None:
+            self.text_config = CONFIG_MAPPING["llama"]()
+        super().__init__(**kwargs)

models/mllava/modeling_llava.py ADDED Viewed

	@@ -0,0 +1,770 @@

+# coding=utf-8
+# Copyright 2023 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Llava model."""
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+import torch
+import torch.utils.checkpoint
+from torch import nn
+# from ... import PreTrainedModel
+# from ...activations import ACT2FN
+# from ...cache_utils import Cache
+# from ...modeling_outputs import ModelOutput
+# from ...utils import (
+#     add_start_docstrings,
+#     add_start_docstrings_to_model_forward,
+#     logging,
+#     replace_return_docstrings,
+# )
+# from ..auto import AutoModel, AutoModelForCausalLM
+from .configuration_llava import LlavaConfig
+from transformers import PreTrainedModel
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache
+from transformers.modeling_outputs import ModelOutput
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from transformers.models.auto import AutoModel, AutoModelForCausalLM
+from .configuration_llava import LlavaConfig
+# Module-level logger obtained from the transformers logging utilities.
+logger = logging.get_logger(__name__)
+# Config class name handed to `replace_return_docstrings` on the model's forward method.
+_CONFIG_FOR_DOC = "LlavaConfig"
+# Checkpoint identifiers listed for this architecture.
+LLAVA_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "llava-hf/llava-1.5-7b-hf",
+    "llava-hf/llava-1.5-13b-hf",
+    "llava-hf/bakLlava-v1-hf",
+    # See all Llava models at https://huggingface.co/models?filter=llava
+]
+@dataclass
+# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->Llava
+class LlavaCausalLMOutputWithPast(ModelOutput):
+    """
+    Base class for Llava causal language model (or autoregressive) outputs.
+    Every field defaults to `None`.
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss (for next-token prediction).
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+            `past_key_values` input) to speed up sequential decoding.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+            Tuple of `torch.FloatTensor` (one for the output of the image embeddings) of shape `(batch_size,
+            num_images, sequence_length, hidden_size)`.
+            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
+    """
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    past_key_values: Optional[List[torch.FloatTensor]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+class LlavaMultiModalProjector(nn.Module):
+    def __init__(self, config: LlavaConfig):
+        super().__init__()
+        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.act = ACT2FN[config.projector_hidden_act]
+        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+    def forward(self, image_features):
+        hidden_states = self.linear_1(image_features)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+# Shared class-level docstring fragment, attached to the model classes through `add_start_docstrings`.
+LLAVA_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+    Parameters:
+        config ([`LlavaConfig`] or [`LlavaVisionConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+@add_start_docstrings(
+    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
+    LLAVA_START_DOCSTRING,
+)
+class LlavaPreTrainedModel(PreTrainedModel):
+    config_class = LlavaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["LlavaVisionAttention"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    def _init_weights(self, module):
+        # important: this ported version of Llava isn't meant for training from scratch - only
+        # inference and fine-tuning - so the proper init weights code has been removed - the original codebase
+        # https://github.com/haotian-liu/LLaVA/tree/main/llava should serve for that purpose
+        std = (
+            self.config.initializer_range
+            if hasattr(self.config, "initializer_range")
+            else self.config.text_config.initializer_range
+        )
+        if hasattr(module, "class_embedding"):
+            module.class_embedding.data.normal_(mean=0.0, std=std)
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+    @property
+    def _supports_sdpa(self):
+        """
+        Retrieve language_model's attribute to check whether the model supports
+        SDPA or not.
+        """
+        return self.language_model._supports_sdpa
+# Shared argument documentation, prepended to the forward docstring through
+# `add_start_docstrings_to_model_forward`.
+LLAVA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            [What are input IDs?](../glossary#input-ids)
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
+            The tensors corresponding to the input images. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details ([]`LlavaProcessor`] uses
+            [`CLIPImageProcessor`] for processing images).
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+@add_start_docstrings(
+    """The LLAVA model which consists of a vision backbone and a language model.""",
+    LLAVA_START_DOCSTRING,
+)
+class LlavaForConditionalGeneration(LlavaPreTrainedModel):
+    def __init__(self, config: LlavaConfig, vision_tower=None, language_model=None):
+        super().__init__(config)
+        self.vision_tower = AutoModel.from_config(config.vision_config) if vision_tower is None else vision_tower
+        self.multi_modal_projector = LlavaMultiModalProjector(config)
+        self.vocab_size = config.vocab_size
+        self.language_model = AutoModelForCausalLM.from_config(
+            config.text_config, attn_implementation=config._attn_implementation
+        ) if language_model is None else language_model
+        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
+        self.post_init()
+    def get_input_embeddings(self):
+        """Return the input embeddings of the wrapped language model."""
+        return self.language_model.get_input_embeddings()
+    def set_input_embeddings(self, value):
+        """Set the input embeddings on the wrapped language model."""
+        self.language_model.set_input_embeddings(value)
+    def get_output_embeddings(self):
+        """Return the output embeddings of the wrapped language model."""
+        return self.language_model.get_output_embeddings()
+    def set_output_embeddings(self, new_embeddings):
+        """Set the output embeddings on the wrapped language model."""
+        self.language_model.set_output_embeddings(new_embeddings)
+    def set_decoder(self, decoder):
+        """Forward `decoder` to the wrapped language model's `set_decoder`."""
+        self.language_model.set_decoder(decoder)
+    def get_decoder(self):
+        """Return the decoder reported by the wrapped language model."""
+        return self.language_model.get_decoder()
+    def tie_weights(self):
+        """Delegate weight tying to the wrapped language model and return its result."""
+        return self.language_model.tie_weights()
+    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
+        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
+        # update vocab size
+        self.config.text_config.vocab_size = model_embeds.num_embeddings
+        self.config.vocab_size = model_embeds.num_embeddings
+        self.vocab_size = model_embeds.num_embeddings
+        return model_embeds
+    def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, input_ids, attention_mask, labels):
+        """
+        Splice image features into the text embedding sequence.
+        Every position of `input_ids` equal to `config.image_token_index` is expanded into `num_image_patches`
+        slots that receive rows of `image_features`; text embeddings, attention mask and labels are moved to
+        their shifted positions.
+        Args:
+            image_features: 3-D tensor unpacked as `(num_images, num_image_patches, embed_dim)`.
+            inputs_embeds: text embeddings, indexed as `[batch, position]`.
+            input_ids: 2-D tensor unpacked as `(batch_size, sequence_length)`.
+            attention_mask: mask indexed like `input_ids`.
+            labels: labels indexed like `input_ids`, or `None`.
+        Returns:
+            Tuple `(final_embedding, final_attention_mask, final_labels, position_ids)`; `final_labels` is `None`
+            when `labels` is `None`.
+        Raises:
+            ValueError: if the number of slots left for images differs from `num_images * num_image_patches`.
+        """
+        num_images, num_image_patches, embed_dim = image_features.shape
+        batch_size, sequence_length = input_ids.shape
+        # Treated as left-padded when no sequence ends with the pad token id.
+        left_padding = not torch.sum(input_ids[:, -1] == torch.tensor(self.pad_token_id))
+        # 1. Create a mask to know where special image tokens are
+        special_image_token_mask = input_ids == self.config.image_token_index
+        num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
+        # Compute the merged sequence length: each image token grows by `num_image_patches - 1` positions
+        max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length
+        batch_indices, non_image_indices = torch.where(input_ids != self.config.image_token_index)
+        # 2. Compute the positions where text should be written
+        # Calculate new positions for text tokens in merged image-text sequence.
+        # `special_image_token_mask` identifies image tokens. Each image token occupies `num_image_patches`
+        # positions, i.e. it shifts everything after it by `num_image_patches - 1`.
+        # `torch.cumsum` computes how each image token shifts subsequent text token positions.
+        # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one.
+        new_token_positions = torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
+        # Per-row number of unused positions (rows with fewer image tokens than the maximum)
+        nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
+        if left_padding:
+            new_token_positions += nb_image_pad[:, None]  # offset for left padding
+        text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
+        # 3. Create the full embedding, already padded to the maximum position
+        final_embedding = torch.zeros(
+            batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
+        )
+        final_attention_mask = torch.zeros(
+            batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device
+        )
+        if labels is not None:
+            final_labels = torch.full(
+                (batch_size, max_embed_dim), self.config.ignore_index, dtype=input_ids.dtype, device=input_ids.device
+            )
+        # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
+        # set the corresponding tensors into their correct target device.
+        target_device = inputs_embeds.device
+        batch_indices, non_image_indices, text_to_overwrite = (
+            batch_indices.to(target_device),
+            non_image_indices.to(target_device),
+            text_to_overwrite.to(target_device),
+        )
+        attention_mask = attention_mask.to(target_device)
+        # 4. Fill the embeddings based on the mask. If we have ["hey" "<image>", "how", "are"] and, e.g.,
+        # 576 patches per image, the text is written to [0, 577, 578] and the image features to [1..576]
+        final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
+        final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
+        if labels is not None:
+            final_labels[batch_indices, text_to_overwrite] = labels[batch_indices, non_image_indices]
+        # 5. Fill the embeddings corresponding to the images. Anything that is still zeros needs filling
+        image_to_overwrite = torch.all(final_embedding == 0, dim=-1)
+        # Skip the first `nb_image_pad` all-zero positions of each row: those stay as padding
+        image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)
+        # The remaining slots must match the number of image feature rows exactly
+        if image_to_overwrite.sum() != image_features.shape[:-1].numel():
+            raise ValueError(
+                f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while"
+                f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
+            )
+        final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)
+        final_attention_mask |= image_to_overwrite
+        # Positions count attended tokens from zero; non-attended positions are set to 1
+        position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
+        if labels is None:
+            final_labels = None
+        return final_embedding, final_attention_mask, final_labels, position_ids
+    @add_start_docstrings_to_model_forward(LLAVA_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=LlavaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        vision_feature_layer: Optional[int] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, LlavaCausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        Returns:
+        Example:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration
+        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
+        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
+        >>> prompt = "<image>\nUSER: What's the content of the image?\nASSISTANT:"
+        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_length=30)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "\nUSER: What's the content of the image?\nASSISTANT: The image features a stop sign on a street corner"
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy
+            if vision_feature_select_strategy is not None
+            else self.config.vision_feature_select_strategy
+        )
+        if inputs_embeds is None:
+            # 1. Extra the input embeddings
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+            # 2. Merge text and images
+            if pixel_values is not None and input_ids.shape[1] != 1:
+                if isinstance(pixel_values, list):
+                    pixel_values = torch.cat([x for x in pixel_values if x is not None], dim=0)
+                # for siglip, need to transform the pixel_values to the right data type
+                if pixel_values.dtype != self.vision_tower.dtype:
+                    pixel_values = pixel_values.type(self.vision_tower.dtype)
+                image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
+                # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated.
+                selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
+                if vision_feature_select_strategy == "default":
+                    selected_image_feature = selected_image_feature[:, 1:]
+                elif vision_feature_select_strategy == "full":
+                    selected_image_feature = selected_image_feature
+                else:
+                    raise ValueError(
+                        f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
+                    )
+                image_features = self.multi_modal_projector(selected_image_feature)
+                inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
+                    image_features, inputs_embeds, input_ids, attention_mask, labels
+                )
+                if labels is None:
+                    labels = torch.full_like(attention_mask, self.config.ignore_index).to(torch.long)
+            else:
+                # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
+                # generation with cache
+                if past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
+                    # Retrieve the first layer to inspect the logits and mask out the hidden states
+                    # that are set to 0
+                    first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
+                    # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
+                    batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
+                    # Get the target length
+                    target_seqlen = first_layer_past_key_value.shape[-1] + 1
+                    extended_attention_mask = torch.ones(
+                        (attention_mask.shape[0], target_seqlen - attention_mask.shape[1]),
+                        dtype=attention_mask.dtype,
+                        device=attention_mask.device,
+                    )
+                    # Filter out only the tokens that can be un-attended, this can happen
+                    # if one uses Llava + Fused modules where the cache on the
+                    # first iteration is already big enough, or if one passes custom cache
+                    valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
+                    new_batch_index = batch_index[valid_indices]
+                    new_non_attended_tokens = non_attended_tokens[valid_indices]
+                    # Zero-out the places where we don't need to attend
+                    extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
+                    attention_mask = torch.cat((attention_mask, extended_attention_mask), dim=1)
+                    position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        logits = outputs[0]
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            if attention_mask is not None:
+                shift_attention_mask = attention_mask[..., 1:]
+                shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
+                shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
+            else:
+                shift_logits = logits[..., :-1, :].contiguous()
+                shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(
+                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
+            )
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return LlavaCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, inputs_embeds=None, pixel_values=None, attention_mask=None, **kwargs
+    ):
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+            else:
+                cache_length = past_length = past_key_values[0][0].shape[2]
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+            elif self.config.image_token_index in input_ids:
+                input_ids = input_ids[:, input_ids.shape[1] - 1 :]
+            # If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
+            # older attention values, as their corresponding values are not part of the input.
+            if cache_length < past_length and attention_mask is not None:
+                attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+                "pixel_values": pixel_values,
+            }
+        )
+        return model_inputs
+    def _reorder_cache(self, *args, **kwargs):
+        """Forward every argument unchanged to `self.language_model._reorder_cache` and return its result."""
+        return self.language_model._reorder_cache(*args, **kwargs)
+from transformers.models.clip.modeling_clip import CLIPEncoderLayer, CLIPEncoder
+@add_start_docstrings(
+    """The MLLAVA model which consists of a vision backbone and a language model.""",
+    LLAVA_START_DOCSTRING,
+)
+class MLlavaForConditionalGeneration(LlavaForConditionalGeneration):
+    def __init__(self, config: LlavaConfig):
+        super().__init__(config)
+        config.vision_config.type_vocab_size = 144
+        self.image_type_embeddings = nn.Embedding(config.vision_config.type_vocab_size, config.vision_config.hidden_size)
+        # self.vision_xatten_layers = nn.ModuleList([CLIPEncoderLayer(config.vision_config) for _ in range(config.vision_config.num_hidden_layers)])
+        self.vision_xatten_layers = CLIPEncoder(config.vision_config)
+    @add_start_docstrings_to_model_forward(LLAVA_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=LlavaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        vision_feature_layer: Optional[int] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, LlavaCausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        Returns:
+        Example:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration
+        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
+        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
+        >>> prompt = "<image>\nUSER: What's the content of the image?\nASSISTANT:"
+        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_length=30)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "\nUSER: What's the content of the image?\nASSISTANT: The image features a stop sign on a street corner"
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy
+            if vision_feature_select_strategy is not None
+            else self.config.vision_feature_select_strategy
+        )
+        if inputs_embeds is None:
+            # 1. Extra the input embeddings
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+            # 2. Merge text and images
+            if pixel_values is not None and input_ids.shape[1] != 1:
+                image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
+                # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated.
+                selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
+                if vision_feature_select_strategy == "default":
+                    selected_image_feature = selected_image_feature[:, 1:]
+                elif vision_feature_select_strategy == "full":
+                    selected_image_feature = selected_image_feature
+                else:
+                    raise ValueError(
+                        f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
+                    )
+                # added by Dongfu
+                num_images, num_image_patches, embed_dim = selected_image_feature.shape
+                image_type_embeddings = self.image_type_embeddings(torch.arange(num_images, device=selected_image_feature.device))
+                selected_image_feature += image_type_embeddings.unsqueeze(1)
+                xatten_output = self.vision_xatten_layers(selected_image_feature, attention_mask=None, causal_attention_mask=None)
+                selected_image_feature = xatten_output[0]
+                # end of added by Dongfu
+                image_features = self.multi_modal_projector(selected_image_feature)
+                inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
+                    image_features, inputs_embeds, input_ids, attention_mask, labels
+                )
+                if labels is None:
+                    labels = torch.full_like(attention_mask, self.config.ignore_index).to(torch.long)
+            else:
+                # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
+                # generation with cache
+                if past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
+                    # Retrieve the first layer to inspect the logits and mask out the hidden states
+                    # that are set to 0
+                    first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
+                    # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
+                    batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
+                    # Get the target length
+                    target_seqlen = first_layer_past_key_value.shape[-1] + 1
+                    extended_attention_mask = torch.ones(
+                        (attention_mask.shape[0], target_seqlen - attention_mask.shape[1]),
+                        dtype=attention_mask.dtype,
+                        device=attention_mask.device,
+                    )
+                    # Filter out only the tokens that can be un-attended, this can happen
+                    # if one uses Llava + Fused modules where the cache on the
+                    # first iteration is already big enough, or if one passes custom cache
+                    valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
+                    new_batch_index = batch_index[valid_indices]
+                    new_non_attended_tokens = non_attended_tokens[valid_indices]
+                    # Zero-out the places where we don't need to attend
+                    extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
+                    attention_mask = torch.cat((attention_mask, extended_attention_mask), dim=1)
+                    position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        logits = outputs[0]
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            if attention_mask is not None:
+                shift_attention_mask = attention_mask[..., 1:]
+                shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
+                shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
+            else:
+                shift_logits = logits[..., :-1, :].contiguous()
+                shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(
+                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
+            )
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return LlavaCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )

models/mllava/processing_llava.py ADDED Viewed

	@@ -0,0 +1,381 @@

+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Llava.
+"""
+import os
+import json
+from typing import List, Optional, Union, Dict
+# from ...feature_extraction_utils import BatchFeature
+# from ...image_utils import ImageInput
+# from ...processing_utils import ProcessorMixin
+# from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+# from ...utils import TensorType
+from transformers.feature_extraction_sequence_utils import BatchFeature
+from transformers.image_utils import ImageInput
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+from transformers.utils import TensorType
+from transformers.processing_utils import transformers_module
+from transformers.utils.hub import is_remote_url, download_url, cached_file, is_offline_mode
+from transformers.utils import IMAGE_PROCESSOR_NAME
+from PIL import Image
+import logging
+import torch
+import numpy as np
+logger = logging.getLogger(__name__)
+class MLlavaProcessor(ProcessorMixin):
+    r"""
+    Constructs a Llava processor which wraps a Llava image processor and a Llava tokenizer into a single processor.
+    [`LlavaProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`LlamaTokenizerFast`]. See the
+    [`~LlavaProcessor.__call__`] and [`~LlavaProcessor.decode`] for more information.
+    Args:
+        image_processor ([`CLIPImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+    """
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = ("CLIPImageProcessor", "SiglipImageProcessor")
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast", "PreTrainedTokenizerFast")
+    def __init__(self, image_processor=None, tokenizer=None):
+        """Forward `image_processor` and `tokenizer`, in that order, to the `ProcessorMixin` constructor."""
+        super().__init__(image_processor, tokenizer)
+    def preprocess_interleaved_images_and_text(
+        self,
+        text,
+        images=None,
+    ):
+        """
+        Args:
+            text (`str`, `List[str]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+                text can contain <image> tokens as the placeholder for the image(s) to be inserted.
+            images (`PIL.Image.Image`, `List[PIL.Image.Image]`, `List[List[PIL.Image.Image]]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
+                number of channels, H and W are image height and width.
+                the number of the images should match the number of <image> tokens in the text.
+        """
+        assert text is not None, "text cannot be None."
+        if images is not None:
+            if isinstance(images, Image.Image):
+                images = [images]
+            if isinstance(images, list) and isinstance(images[0], Image.Image):
+                if isinstance(text, str):
+                    images = [images]
+                elif isinstance(text, list):
+                    if len(text) != len(images):
+                        raise ValueError("Invalid input text. Number of texts does not match number of images.")
+                    images = [[image] for image in images]
+            if isinstance(text, str):
+                num_images = len(images[0])
+                num_image_tokens = text.count("<image>")
+                if num_image_tokens < num_images:
+                    # prepend empty image tokens to text
+                    if "USER:" in text:
+                        text = text.replace("USER:", "USER:" + "<image>" * (num_images - num_image_tokens), 1)
+                    elif "Human:" in text:
+                        text = text.replace("Human:", "Human:" + "<image>" * (num_images - num_image_tokens), 1)
+                    elif "HUMAN:" in text:
+                        text = text.replace("HUMAN:", "HUMAN:" + "<image>" * (num_images - num_image_tokens), 1)
+                    else:
+                        text = "<image>" * (num_images - num_image_tokens) + text
+                    # logger.warning("Image Tokens <image> are not provided in the text. Automatically prepending them before the text. This might cause model to behave unexpectedly.")
+                elif num_image_tokens > num_images:
+                    text = text.split("<image>")
+                    for i, t in enumerate(text):
+                        if i < num_images:
+                            text[i] = t + "<image>"
+                    text = "".join(text)
+                    logger.warning(f"Number of <image> tokens: {num_image_tokens} exceeds number of images: {num_images}. Automatically removing extra tokens at the end of the text.")
+                    # raise ValueError("Invalid input text. Number of <image> tokens exceeds number of images.")
+                texts = [text]
+            elif isinstance(text, list):
+                if not isinstance(text[0], str):
+                    raise ValueError("Invalid input text. Each element of text must be a string.")
+                for i, t in enumerate(text):
+                    num_image_tokens = t.count("<image>")
+                    num_images = len(images[i])
+                    if num_image_tokens < num_images:
+                        # prepend empty image tokens to text
+                        if "USER:" in t:
+                            t = t.replace("USER:", "USER:" + "<image>" * (num_images - num_image_tokens), 1)
+                        elif "Human:" in t:
+                            t = t.replace("Human:", "Human:" + "<image>" * (num_images - num_image_tokens), 1)
+                        elif "HUMAN:" in t:
+                            t = t.replace("HUMAN:", "HUMAN:" + "<image>" * (num_images - num_image_tokens), 1)
+                        else:
+                            t = "<image>" * (num_images - num_image_tokens) + t
+                        # logger.warning("Image Tokens <image> are not provided in the text. Automatically prepending them before the text. This might cause model to behave unexpectedly.")
+                    elif num_image_tokens > num_images:
+                        t = t.split("<image>")
+                        for j, s in enumerate(t):
+                            if j < num_images:
+                                t[j] = s + "<image>"
+                        t = "".join(t)
+                        logger.warning(f"Number of <image> tokens: {num_image_tokens} exceeds number of images: {num_images}. Automatically removing extra tokens at the end of the text.")
+                        # raise ValueError("Invalid input text. Number of <image> tokens exceeds number of images.")
+                    text[i] = t
+                texts = text
+            else:
+                raise ValueError("Invalid input text. text must be a string or a list of strings.")
+            assert all([t.count("<image>") == len(images_per_text) for t, images_per_text in zip(texts, images)]), "Number of <image> tokens in text does not match number of images."
+            # add image denotation in text before each <image> as "(image {i}: <image>)"
+            for i, t in enumerate(texts):
+                for j in range(len(images[i])):
+                    t = t.replace("<image>", f"(image {j+1}: <Image><IMAGE></Image>)", 1)
+                t = t.replace("<IMAGE>", "<image>")
+                texts[i] = t
+            # flatten images
+            images = [image for images_per_text in images for image in images_per_text]
+        else:
+            if isinstance(text, str):
+                texts = [text]
+            elif isinstance(text, list):
+                if not isinstance(text[0], str):
+                    raise ValueError("Invalid input text. Each element of text must be a string.")
+                texts = text
+            else:
+                raise ValueError("Invalid input text. text must be a string or a list of strings.")
+        return texts, images
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        images: ImageInput = None,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length=None,
+        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+        add_image_ids: bool = True,
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+        of the above two methods for more information.
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
+                number of channels, H and W are image height and width.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            truncation (`bool`, *optional*):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+        if add_image_ids:
+            text, images = self.preprocess_interleaved_images_and_text(text, images)
+        if images is not None:
+            pixel_values = self.image_processor(images, return_tensors=return_tensors)["pixel_values"] # [batch_size, num_channels, height, width], e.g. [1, 3, 336, 336]
+        else:
+            pixel_values = None
+        text_inputs = self.tokenizer(
+            text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
+        )
+        # text_inputs:
+        # 1. input_ids: [batch_size, sequence_length], e.g. [1, 6]
+        # 2. attention_mask: [batch_size, sequence_length], e.g. [1, 6]
+        return BatchFeature(data={**text_inputs, "pixel_values": pixel_values})
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
+    def batch_decode(self, *args, **kwargs):
+        """
+        Forward all positional and keyword arguments to `self.tokenizer.batch_decode` and return its result.
+        Please refer to the docstring of the tokenizer's `batch_decode` method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
+    def decode(self, *args, **kwargs):
+        """
+        Forward all positional and keyword arguments to `self.tokenizer.decode` and return its result.
+        Please refer to the docstring of the tokenizer's `decode` method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+    @property
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+    def _right_pad_inputs_with_attention_mask(self, model_inputs: List[Dict]):
+        results = {}
+        assert len(model_inputs) == 1, "This method only supports a single input, but get {} inputs".format(len(model_inputs))
+        for k in model_inputs[0].keys():
+            if k == "pixel_values":
+                results[k] = [inputs[k] if inputs[k] is not None else None for inputs in model_inputs]
+            else:
+                results[k] = torch.cat([inputs[k] for inputs in model_inputs], dim=0)
+        return results
+    @classmethod
+    def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        args = []
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        resume_download = kwargs.pop("resume_download", False)
+        proxies = kwargs.pop("proxies", None)
+        token = kwargs.pop("token", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", "")
+        from_pipeline = kwargs.pop("_from_pipeline", None)
+        from_auto_class = kwargs.pop("_from_auto", False)
+        user_agent = {"file_type": "processor", "from_auto_class": from_auto_class}
+        if from_pipeline is not None:
+            user_agent["using_pipeline"] = from_pipeline
+        if is_offline_mode() and not local_files_only:
+            logger.info("Offline mode: forcing local_files_only=True")
+            local_files_only = True
+        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+        is_local = os.path.isdir(pretrained_model_name_or_path)
+        if os.path.isdir(pretrained_model_name_or_path):
+            processor_file = os.path.join(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME)
+        if os.path.isfile(pretrained_model_name_or_path):
+            resolved_processor_file = pretrained_model_name_or_path
+            is_local = True
+        elif is_remote_url(pretrained_model_name_or_path):
+            processor_file = pretrained_model_name_or_path
+            resolved_processor_file = download_url(pretrained_model_name_or_path)
+        else:
+            processor_file = IMAGE_PROCESSOR_NAME
+            try:
+                # Load from local folder or from cache or download from model Hub and cache
+                resolved_processor_file = cached_file(
+                    pretrained_model_name_or_path,
+                    processor_file,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    resume_download=resume_download,
+                    local_files_only=local_files_only,
+                    token=token,
+                    user_agent=user_agent,
+                    revision=revision,
+                    subfolder=subfolder,
+                    _raise_exceptions_for_missing_entries=True,
+                )
+            except EnvironmentError:
+                # Re-raise any environment error raised by `cached_file`. It will have a helpful error message adapted to
+                # the original exception.
+                raise
+            except Exception:
+                # For any other exception, we throw a generic error.
+                raise EnvironmentError(
+                    f"Can't load processor for '{pretrained_model_name_or_path}'. If you were trying to load"
+                    " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
+                    f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
+                    f" directory containing a {IMAGE_PROCESSOR_NAME} file"
+                )
+        # Existing processors on the Hub created before #27761 was merged don't have `processor_config.json` (unless
+        # updated afterward), and we need to keep `from_pretrained` working, so here it falls back to an empty dict.
+        # (`cached_file` is called using `_raise_exceptions_for_missing_entries=False` to avoid an exception.)
+        # However, for models added in the future, we won't get the expected error if this file is missing.
+        # NOTE(review): the `cached_file` call above passes `_raise_exceptions_for_missing_entries=True` -- confirm
+        # which value is intended.
+        if resolved_processor_file is None:
+            image_processor_dict = {}
+        try:
+            # Load processor dict
+            with open(resolved_processor_file, "r", encoding="utf-8") as reader:
+                text = reader.read()
+            image_processor_dict = json.loads(text)
+        except json.JSONDecodeError:
+            raise EnvironmentError(
+                f"It looks like the config file at '{resolved_processor_file}' is not a valid JSON file."
+            )
+        for attribute_name in cls.attributes:
+            class_name = getattr(cls, f"{attribute_name}_class")
+            if isinstance(class_name, tuple):
+                if attribute_name == "tokenizer":
+                    classes = tuple(getattr(transformers_module, n) if n is not None else None for n in class_name)
+                    use_fast = kwargs.get("use_fast", True)
+                    if use_fast and classes[1] is not None:
+                        attribute_class = classes[1]
+                    else:
+                        attribute_class = classes[0]
+                elif attribute_name == "image_processor":
+                    image_processor_type = image_processor_dict.get("image_processor_type", None)
+                    if image_processor_type is not None:
+                        assert image_processor_type in class_name, f"Invalid image processor type: {image_processor_type}"
+                        attribute_class = getattr(transformers_module, image_processor_type)
+                    else:
+                        attribute_class = getattr(transformers_module, class_name[0])
+                else:
+                    raise ValueError(f"Invalid attribute name: {attribute_name}")
+            else:
+                attribute_class = getattr(transformers_module, class_name)
+            args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
+        return args

models/mllava/utils.py ADDED Viewed

	@@ -0,0 +1,188 @@

+from typing import Iterator, List, Optional, Tuple, Union
+import PIL
+import PIL.Image
+import torch
+from .modeling_llava import LlavaForConditionalGeneration
+from .processing_llava import MLlavaProcessor
+# from ..conversation import conv_mllava_v1_mmtag as default_conv
+from ..conversation import conv_mllava_v1 as default_conv, conv_templates
+def chat_mllava(
+    text:str,
+    images: List[Union[PIL.Image.Image, str]],
+    model:LlavaForConditionalGeneration,
+    processor:MLlavaProcessor,
+    max_input_length:int=None,
+    history:List[dict]=None,
+    **kwargs) -> Tuple[str, List[dict]]:
+    """
+    Chat with the Mllava model
+    Args:
+        text: str, the text to be sent to the model, where <image> will be the placeholder for the image
+        images: List[PIL.Image.Image], the images to be sent to the model, or None
+        model: LlavaForConditionalGeneration, the model to be used
+        processor: MLlavaProcessor, the processor to be used
+        max_input_length: int, the maximum input length
+        history: List[dict], list of messages in the conversation as history. Each message is a dictionary {"role": "ASSISTANT/USER", "text": "the message"}. If None, the conversation will start from scratch
+        kwargs: dict, the generation kwargs
+    Returns:
+        Tuple[str, List[dict]], the generated text and the history of the conversation
+    """
+    if "llama-3" in model.language_model.name_or_path.lower():
+        conv = conv_templates['llama_3']
+        terminators = [
+            processor.tokenizer.eos_token_id,
+            processor.tokenizer.convert_tokens_to_ids("<|eot_id|>")
+        ]
+    else:
+        conv = default_conv
+        terminators = None
+    kwargs["eos_token_id"] = terminators
+    conv = conv.copy()
+    conv.messages = []
+    if history is not None:
+        for message in history:
+            assert message["role"] in conv.roles
+            conv.append_message(message["role"], message["text"])
+        if text:
+            assert conv.messages[-1][0] == conv.roles[1], "The last message in the history should be the assistant, if the given text is not empty"
+            conv.append_message(conv.roles[0], text)
+            conv.append_message(conv.roles[1], "")
+            history.append({"role": conv.roles[0], "text": text})
+            history.append({"role": conv.roles[1], "text": ""})
+        else:
+            if conv.messages[-1][0] == conv.roles[1]:
+                assert conv.messages[-1][1] == "", "No user message should be provided"
+            else:
+                assert conv.messages[-1][0] == conv.roles[0], "The last message in the history should be the user, if the given text is empty"
+                conv.append_message(conv.roles[0], "")
+                history.append({"role": conv.roles[0], "text": ""})
+    else:
+        history = []
+        history.append({"role": conv.roles[0], "text": text})
+        history.append({"role": conv.roles[1], "text": ""})
+        conv.append_message(conv.roles[0], text)
+        conv.append_message(conv.roles[1], "")
+    assert conv.messages[-1][0] == conv.roles[1] and conv.messages[-1][1] == "", "Format check"
+    assert history[-1]["role"] == conv.roles[1] and history[-1]["text"] == "", "Format check"
+    prompt = conv.get_prompt()
+    if images:
+        for i in range(len(images)):
+            if isinstance(images[i], str):
+                images[i] = PIL.Image.open(images[i]).convert("RGB")
+    inputs = processor(images=images, text=prompt, return_tensors="pt", truncation=True, max_length=max_input_length)
+    for k, v in inputs.items():
+        if v is not None:
+            if isinstance(v, torch.Tensor):
+                inputs[k] = v.to(model.device)
+            elif isinstance(v, list):
+                inputs[k] = [x.to(model.device) for x in v]
+            else:
+                raise ValueError(f"Invalid input type: {type(v)}")
+    output_ids = model.generate(**inputs, **kwargs)
+    output_ids = output_ids[0]
+    # remove the input tokens
+    generated_ids = output_ids[inputs["input_ids"].shape[-1]:]
+    generated_text = processor.decode(generated_ids, skip_special_tokens=True)
+    history[-1]["text"] = generated_text
+    return generated_text, history
+def chat_mllava_stream(
+    text:str,
+    images: List[Union[PIL.Image.Image, str]],
+    model:LlavaForConditionalGeneration,
+    processor:MLlavaProcessor,
+    max_input_length:int=None,
+    history:List[dict]=None,
+    **kwargs) -> Tuple[str, List[dict]]:
+    """
+    Chat with the Mllava model
+    Args:
+        text: str, the text to be sent to the model, where <image> will be the placeholder for the image
+        images: List[PIL.Image.Image], the images to be sent to the model, or None
+        model: LlavaForConditionalGeneration, the model to be used
+        processor: MLlavaProcessor, the processor to be used
+        max_input_length: int, the maximum input length
+        history: List[dict], list of messages in the conversation as history. Each message is a dictionary {"role": "ASSISTANT/USER", "text": "the message"}. If None, the conversation will start from scratch
+        kwargs: dict, the generation kwargs
+    Returns:
+        Tuple[str, List[dict]], the generated text and the history of the conversation
+    """
+    if "llama-3" in model.language_model.name_or_path.lower():
+        conv = conv_templates['llama_3']
+        terminators = [
+            processor.tokenizer.eos_token_id,
+            processor.tokenizer.convert_tokens_to_ids("<|eot_id|>")
+        ]
+    else:
+        conv = default_conv
+        terminators = None
+    kwargs["eos_token_id"] = terminators
+    conv = conv.copy()
+    conv.messages = []
+    if history is not None:
+        for message in history:
+            assert message["role"] in conv.roles
+            conv.append_message(message["role"], message["text"])
+        if text:
+            assert conv.messages[-1][0] == conv.roles[1], "The last message in the history should be the assistant, if the given text is not empty"
+            conv.append_message(conv.roles[0], text)
+            conv.append_message(conv.roles[1], "")
+            history.append({"role": conv.roles[0], "text": text})
+            history.append({"role": conv.roles[1], "text": ""})
+        else:
+            if conv.messages[-1][0] == conv.roles[1]:
+                assert conv.messages[-1][1] == "", "No user message should be provided"
+            else:
+                assert conv.messages[-1][0] == conv.roles[0], "The last message in the history should be the user, if the given text is empty"
+                conv.append_message(conv.roles[0], "")
+                history.append({"role": conv.roles[0], "text": ""})
+    else:
+        history = []
+        history.append({"role": conv.roles[0], "text": text})
+        history.append({"role": conv.roles[1], "text": ""})
+        conv.append_message(conv.roles[0], text)
+        conv.append_message(conv.roles[1], "")
+    assert conv.messages[-1][0] == conv.roles[1] and conv.messages[-1][1] == "", "Format check"
+    assert history[-1]["role"] == conv.roles[1] and history[-1]["text"] == "", "Format check"
+    prompt = conv.get_prompt()
+    if images:
+        for i in range(len(images)):
+            if isinstance(images[i], str):
+                images[i] = PIL.Image.open(images[i])
+            images[i] = images[i].convert("RGB")
+    inputs = processor(images=images, text=prompt, return_tensors="pt", truncation=True, max_length=max_input_length)
+    print(processor.tokenizer.decode(inputs["input_ids"][0]))
+    for k, v in inputs.items():
+        if v is not None:
+            if isinstance(v, torch.Tensor):
+                inputs[k] = v.to(model.device)
+            elif isinstance(v, list):
+                inputs[k] = [x.to(model.device) for x in v]
+            else:
+                raise ValueError(f"Invalid input type: {type(v)}")
+    from transformers import TextIteratorStreamer
+    from threading import Thread
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    kwargs["streamer"] = streamer
+    inputs.update(kwargs)
+    thread = Thread(target=model.generate, kwargs=inputs)
+    thread.start()
+    for _output in streamer:
+        history[-1]["text"] += _output
+        yield history[-1]["text"], history

requirements.txt CHANGED Viewed

@@ -1,5 +1,6 @@
-gradio
-Pillow
 torch
-requests
-git+https://github.com/remyxai/prismatic-vlms.git

 torch
+transformers>=4.41.0
+Pillow
+gradio
+spaces
+multiprocess