from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp
import gradio as gr

# Initialize the callback manager so generated tokens stream to stdout
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Load the quantized Llama 2 chat model from a local GGUF file
llm = LlamaCpp(
    model_path="llama-2-7b-chat.Q3_K_M.gguf",  # Adjust to wherever the model file is stored
    temperature=0.75,
    max_tokens=2000,
    top_p=1,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

# Define the function to interact with the LLM
def llama_llm(prompt):
    # Wrap the user prompt in the Llama 2 chat template with a fixed system prompt
    llama_prompt = (
        "[INST]<<SYS>>\n"
        "Eve lives in Hamburg.; Bob lives in Cape Town.; Alice lives in Mumbai.\n"
        "<</SYS>>\n"
        f"{prompt}[/INST]"
    )
    response = llm.invoke(llama_prompt)
    return response

# Create the Gradio interface
iface = gr.Interface(
    fn=llama_llm,
    inputs="text",
    outputs="text",
    title="Llama LLM Chat Interface",
    description="Ask a question based on the system prompt: 'Eve lives in Hamburg.; Bob lives in Cape Town.; Alice lives in Mumbai.'",
)

# Launch the Gradio interface
iface.launch()