from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp
import gradio as gr

# Initialize the callback manager so generated tokens stream to stdout
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Load the quantized Llama 2 chat model from a local GGUF file
llm = LlamaCpp(
    model_path="llama-2-7b-chat.Q3_K_M.gguf",  # Adjust to wherever the model file is stored
    temperature=0.75,
    max_tokens=2000,
    top_p=1,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

# Define the function to interact with the LLM
def llama_llm(prompt):
    # Wrap the user prompt in the Llama 2 chat template with a fixed system prompt
    llama_prompt = (
        "[INST]<<SYS>>\n"
        "Eve lives in Hamburg.; Bob lives in Cape Town.; Alice lives in Mumbai.\n"
        "<</SYS>>\n"
        f"{prompt}[/INST]"
    )
    response = llm.invoke(llama_prompt)
    return response

# Create the Gradio interface
iface = gr.Interface(
    fn=llama_llm,
    inputs="text",
    outputs="text",
    title="Llama LLM Chat Interface",
    description="Ask a question based on the system prompt: 'Eve lives in Hamburg.; Bob lives in Cape Town.; Alice lives in Mumbai.'",
)

# Launch the Gradio interface
iface.launch()