Salma Mayorquin committed on
Commit 0945ad6
1 Parent(s): a6e329b

initial commit

Files changed (5)
  1. README.md +3 -3
  2. app.py +83 -0
  3. examples/warehouse_1.jpg +0 -0
  4. examples/warehouse_2.jpg +0 -0
  5. requirements.txt +28 -0
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
  title: SpaceLLaVA
- emoji: 📚
- colorFrom: green
- colorTo: green
+ emoji: 🛸
+ colorFrom: blue
+ colorTo: purple
  sdk: gradio
  sdk_version: 4.20.1
  app_file: app.py
app.py ADDED
@@ -0,0 +1,83 @@
+ import io
+ import base64
+ import numpy as np
+ import torch
+ import matplotlib
+ import matplotlib.cm
+ import gradio as gr
+ from PIL import Image
+
+ from llama_cpp import Llama
+ from llama_cpp.llama_chat_format import Llava15ChatHandler
+
+ # Converts an image input (PIL Image or file path) into a base64 data URI
+ def image_to_base64_data_uri(image_input):
+     if isinstance(image_input, str):
+         with open(image_input, "rb") as img_file:
+             base64_data = base64.b64encode(img_file.read()).decode('utf-8')
+     elif isinstance(image_input, Image.Image):
+         buffer = io.BytesIO()
+         image_input.save(buffer, format="PNG")
+         base64_data = base64.b64encode(buffer.getvalue()).decode('utf-8')
+     else:
+         raise ValueError("Unsupported input type. Input must be a file path or a PIL.Image.Image instance.")
+     return f"data:image/png;base64,{base64_data}"
+
+ # Thin wrapper around llama.cpp's LLaVA 1.5 chat handler
+ class Llava:
+     def __init__(self, mmproj="model/mmproj-model-f16.gguf", model_path="model/ggml-model-q4_0.gguf", gpu=False):
+         chat_handler = Llava15ChatHandler(clip_model_path=mmproj, verbose=True)
+         n_gpu_layers = 0
+         if gpu:
+             n_gpu_layers = -1  # offload all layers to the GPU
+         self.llm = Llama(model_path=model_path, chat_handler=chat_handler, n_ctx=2048, logits_all=True, n_gpu_layers=n_gpu_layers)
+
+     def run_inference(self, image, prompt):
+         # Send the image as a base64 data URI alongside the text prompt
+         data_uri = image_to_base64_data_uri(image)
+         res = self.llm.create_chat_completion(
+             messages=[
+                 {"role": "system", "content": "You are an assistant who perfectly describes images."},
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "image_url", "image_url": {"url": data_uri}},
+                         {"type": "text", "text": prompt}
+                     ]
+                 }
+             ]
+         )
+         return res["choices"][0]["message"]["content"]
+
+ # Initialize the model
+ llm_model = Llava()
+
+ title_and_links_markdown = """
+ # 🛸SpaceLLaVA🌋: A spatial reasoning multi-modal model
+ This space hosts our initial release of LLaVA 1.5 LoRA tuned for spatial reasoning using data generated with [VQASynth](https://github.com/remyxai/VQASynth).
+ Upload an image and ask a question.
+
+ [Model](https://huggingface.co/remyxai/SpaceLLaVA) | [Code](https://github.com/remyxai/VQASynth) | [Paper](https://spatial-vlm.github.io)
+ """
+
+ def predict(image, prompt):
+     result = llm_model.run_inference(image, prompt)
+     return result
+
+ image_input = gr.Image(type="pil", label="Input Image")
+ text_input = gr.Textbox(label="Prompt")
+
+ examples = [
+     ["examples/warehouse_1.jpg", "Is the man wearing gray pants to the left of the pile of boxes on a pallet?"],
+     ["examples/warehouse_2.jpg", "Is the forklift taller than the shelves of boxes?"],
+ ]
+
+ # Initialize interface with examples
+ iface = gr.Interface(
+     fn=predict,
+     inputs=[image_input, text_input],
+     outputs="text",
+     title="Llava Model Inference",
+     description="Input an image and a prompt to receive a description.",
+     examples=examples
+ )
+
+ iface.launch()
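
The interface above exposes a single `predict` endpoint (image plus prompt in, text out). Since `gradio_client` is pinned in this commit's requirements.txt, that endpoint can also be called programmatically. A minimal sketch, assuming the Space is published under the hypothetical ID `remyxai/SpaceLLaVA` and that the installed client version matches the running Gradio server:

```python
# Hypothetical remote-usage sketch for the Space's predict endpoint via gradio_client.
# The Space ID below is an assumption; substitute the actual <owner>/<space> name.
from gradio_client import Client

client = Client("remyxai/SpaceLLaVA")
result = client.predict(
    "examples/warehouse_1.jpg",  # image input: a local file path or URL
    "Is the man wearing gray pants to the left of the pile of boxes on a pallet?",
    api_name="/predict",
)
print(result)
```

Uploading an image through the web UI with the bundled warehouse examples exercises the same code path.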
examples/warehouse_1.jpg ADDED
examples/warehouse_2.jpg ADDED
requirements.txt ADDED
@@ -0,0 +1,28 @@
+ pip
+ einops
+ fastapi
+ gradio==3.35.2
+ markdown2[all]
+ numpy
+ requests
+ sentencepiece
+ tokenizers>=0.12.1
+ torch==2.0.1
+ torchvision==0.15.2
+ uvicorn
+ wandb
+ shortuuid
+ pillow
+ httpx==0.24.0
+ deepspeed==0.9.5
+ peft==0.4.0
+ transformers==4.31.0
+ accelerate==0.21.0
+ bitsandbytes==0.41.0
+ scikit-learn==1.2.2
+ sentencepiece==0.1.99
+ einops==0.6.1
+ einops-exts==0.0.4
+ llama-cpp-python==0.2.55
+ timm==0.6.13
+ gradio_client==0.2.9