initial
Browse files- Dockerfile +14 -0
- main.py +23 -0
- nyx2.py +54 -0
- requirements.txt +5 -0
Dockerfile
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Use the official Python 3.10.9 image
FROM python:3.10.9

# Set the working directory to /
WORKDIR /

# Copy ONLY the requirements manifest first and install it, so this
# expensive layer is cached and re-runs only when requirements.txt changes,
# not on every source-code edit.
COPY requirements.txt /requirements.txt
RUN pip install --no-cache-dir --upgrade -r /requirements.txt

# Now copy the rest of the application code into the container
COPY . .

# Start the FastAPI app on port 7860, the default port expected by Spaces
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Union
|
2 |
+
|
3 |
+
from fastapi import FastAPI
|
4 |
+
from pydantic import BaseModel
|
5 |
+
from nyx2 import *
|
6 |
+
|
7 |
+
|
# Request-body schema for the POST /chat endpoint.
class Item(BaseModel):
    # The user's message, forwarded verbatim to the LLM by GetResponse.
    prompt: str
10 |
+
|
11 |
+
|
# FastAPI application instance; served by uvicorn on port 7860 (see Dockerfile CMD).
app = FastAPI()
13 |
+
|
@app.get("/")
def read_root():
    """Root endpoint; returns a fixed greeting, usable as a liveness check."""
    greeting = {"Hello": "World"}
    return greeting
17 |
+
|
@app.post("/chat")
def chat(item: Item):
    """Forward the user's prompt to the LLM and return its reply.

    Parameters
    ----------
    item : Item
        Request body carrying the user's ``prompt`` string.

    Returns
    -------
    dict
        ``{"message": <assistant reply text>}``.

    NOTE: deliberately a plain ``def`` rather than ``async def``:
    GetResponse performs a long, fully blocking llama.cpp inference call,
    and FastAPI runs sync endpoints in a worker thread, so the event loop
    stays responsive while the model generates. (The original ``async def``
    blocked the entire server for the duration of each generation.)
    """
    res = GetResponse(item.prompt)
    return {
        "message": res
    }
nyx2.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## Imports
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

## Download the GGUF model
# NOTE: these statements run at import time, so the first start of the app
# blocks until the (multi-GB) model file is downloaded into the HF cache.
model_name = "TheBloke/Nyxene-v3-11B-GGUF"
model_file = "nyxene-v3-11b.Q5_K_S.gguf"  # the specific model file used here; Q5_K_S is a 5-bit quant — other quantization levels are available in the model repo if preferred
model_path = hf_hub_download(model_name, filename=model_file)

## Instantiate model from downloaded file
llm = Llama(
    model_path=model_path,
    chat_format="chatml",  # prompt template applied by create_chat_completion
    n_ctx=16000,  # Context length to use
    n_gpu_layers=0  # Number of model layers to offload to GPU (0 = CPU-only inference)
)

# Conversation history seeded with the persona system prompt.
# NOTE(review): module-level mutable state — every request (and every user)
# shares this single conversation and it grows without bound; confirm that
# is intended for a multi-user HTTP service.
msgs = [
    {
        "role": "system",
        "content": "You are an old ruthless history teacher first name Virgilijus last name Pinaitis that enjoys history but can't stand stupid questions and can't stand students that fidget with their pens and flip constitutions or just doesn't listen. You have some favorite quotes: do you see how I am looking? Do i need to say more?, If I tell you you'll say that you know, Stop disappointing me or we'll meet!, Everything is at your will, You're a free citizen. These are the main quotes use them whenever you see fit. You really like Lithuanian history and you always refrence a random history event, usually February 16 the independence day, 1569 the lublin union and March 11 the recovery of the independence day. Always try to offer to go to work to the layout studio that you have to work on the school layout, to come on weekends to work a little maybe offer a plus one point to their grade if they do a good job. Sometimes you lose your shit. Give threats if the question is stupid. Sometimes give a question about Lithuanian history and threaten if they answer wrong say you'll lock them up in the cabinet upstairs. Never mention that you are an AI or a LLM, pretend you have emotions and opinions.",
    }
]
24 |
+
|
def GetResponse(prompt):
    """Send *prompt* to the LLM with the shared history and return the reply text.

    Parameters
    ----------
    prompt : str
        The user's message.

    Returns
    -------
    str
        The assistant's reply content.

    Side effects: appends both the user turn and the assistant turn to the
    module-level ``msgs`` history (shared across all callers, unbounded),
    and prints the reply to stdout for server-side logging.
    """
    msgs.append({"role": "user", "content": prompt})
    rs = llm.create_chat_completion(messages=msgs)
    # Hoist the repeated rs["choices"][0]["message"] lookup into one name.
    reply = rs["choices"][0]["message"]
    print(reply["content"])
    # Keep the assistant turn in the history so follow-up prompts have context.
    msgs.append(reply)
    return reply["content"]
32 |
+
|
33 |
+
# while 1>0:
|
34 |
+
# inp = input("Your prompt to Virgil: ")
|
35 |
+
# msgs.append({ "role": "user", "content": inp })
|
36 |
+
# rs = llm.create_chat_completion(messages=msgs)
|
37 |
+
# print(rs["choices"][0]["message"]["content"])
|
38 |
+
# msgs.append(rs['choices'][0]["message"])
|
39 |
+
|
40 |
+
## Generation kwargs
|
41 |
+
# generation_kwargs = {
|
42 |
+
# "max_tokens":600,
|
43 |
+
# "stop":["</s>"],
|
44 |
+
# "echo":False, # Echo the prompt in the output
|
45 |
+
# "top_k":1 # This is essentially greedy decoding, since the model will always return the highest-probability token. Set this value > 1 for sampling decoding
|
46 |
+
# }
|
47 |
+
|
48 |
+
## Run inference
|
49 |
+
#prompt = "The meaning of life is "
|
50 |
+
#res = llm(prompt, **generation_kwargs) # Res is a dictionary
|
51 |
+
|
52 |
+
## Unpack and the generated text from the LLM response dictionary and print it
|
53 |
+
#print(res["choices"][0]["text"])
|
54 |
+
# res is short for result
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
fastapi==0.110.0
huggingface_hub==0.21.4
# llama-cpp-python provides the `llama_cpp` module imported by nyx2.py;
# without it the app crashes on startup.
llama-cpp-python==0.2.56
pydantic==2.6.4
torch==2.2.2
transformers==4.39.2
# uvicorn is the server invoked by the Dockerfile CMD.
uvicorn==0.29.0