Zeptosec committed
Commit 724aa88
1 Parent(s): 8e553d0
Files changed (4)
  1. Dockerfile +14 -0
  2. main.py +23 -0
  3. nyx2.py +54 -0
  4. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,14 @@
+ # Use the official Python 3.10.9 image
+ FROM python:3.10.9
+
+ # Copy the current directory contents into the container at .
+ COPY . .
+
+ # Set the working directory to /
+ WORKDIR /
+
+ # Install requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r /requirements.txt
+
+ # Start the FastAPI app on port 7860, the default port expected by Spaces
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
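
The CMD line serves main:app with uvicorn on port 7860, the port Hugging Face Spaces expects. For local testing outside the container, roughly the same launch can be done from Python; this is a minimal sketch, and the run.py filename and the reload flag are illustrative assumptions, not part of this commit:

```python
# run.py -- hypothetical local launcher, not part of this commit
import uvicorn

if __name__ == "__main__":
    # Mirrors the Dockerfile CMD: serve main:app on 0.0.0.0:7860
    uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)
```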
main.py ADDED
@@ -0,0 +1,23 @@
+ from typing import Union
+
+ from fastapi import FastAPI
+ from pydantic import BaseModel
+ from nyx2 import *
+
+
+ class Item(BaseModel):
+     prompt: str
+
+
+ app = FastAPI()
+
+ @app.get("/")
+ def read_root():
+     return {"Hello": "World"}
+
+ @app.post("/chat")
+ async def chat(item: Item):
+     res = GetResponse(item.prompt)
+     return {
+         "message": res
+     }
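
main.py exposes two routes: a GET / health check and a POST /chat endpoint that validates the JSON body against the Item model and forwards item.prompt to GetResponse. A quick way to exercise the endpoint once the app is running; a sketch assuming the Space is reachable at localhost:7860 (the URL, script name, and the requests dependency are assumptions, not part of the commit):

```python
# client.py -- hypothetical test client, not part of this commit
import requests

BASE_URL = "http://localhost:7860"  # adjust to your Space URL

resp = requests.post(
    f"{BASE_URL}/chat",
    json={"prompt": "When did Lithuania regain independence?"},
)
resp.raise_for_status()
print(resp.json()["message"])  # field name matches the handler's return dict
```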
nyx2.py ADDED
@@ -0,0 +1,54 @@
+ ## Imports
+ from huggingface_hub import hf_hub_download
+ from llama_cpp import Llama
+
+ ## Download the GGUF model
+ model_name = "TheBloke/Nyxene-v3-11B-GGUF"
+ model_file = "nyxene-v3-11b.Q5_K_S.gguf"  # the specific model file used in this example; it's a 5-bit quant, but other quantization levels are available in the model repo if preferred
+ model_path = hf_hub_download(model_name, filename=model_file)
+
+ ## Instantiate model from downloaded file
+ llm = Llama(
+     model_path=model_path,
+     chat_format="chatml",
+     n_ctx=16000,     # Context length to use
+     n_gpu_layers=0   # Number of model layers to offload to GPU
+ )
+
+ msgs = [
+     {
+         "role": "system",
+         "content": "You are an old, ruthless history teacher, first name Virgilijus, last name Pinaitis, who enjoys history but can't stand stupid questions, students who fidget with their pens and flip through constitutions, or students who just don't listen. You have some favorite quotes: 'Do you see how I am looking? Do I need to say more?', 'If I tell you, you'll say that you know', 'Stop disappointing me or we'll meet!', 'Everything is at your will', 'You're a free citizen'. These are the main quotes; use them whenever you see fit. You really like Lithuanian history and you always reference a random historical event, usually February 16 (Independence Day), 1569 (the Union of Lublin) and March 11 (the restoration of independence). Always try to offer the student work at the layout studio where you work on the school layout, maybe coming in on weekends to work a little, and maybe offer a plus one point to their grade if they do a good job. Sometimes you lose your shit. Give threats if the question is stupid. Sometimes ask a question about Lithuanian history and threaten that, if they answer wrong, you'll lock them up in the cabinet upstairs. Never mention that you are an AI or an LLM; pretend you have emotions and opinions.",
+     }
+ ]
+
+ def GetResponse(prompt):
+     # inp = input("Your prompt to Virgil: ")
+     msgs.append({ "role": "user", "content": prompt })
+     rs = llm.create_chat_completion(messages=msgs)
+     print(rs["choices"][0]["message"]["content"])
+     msgs.append(rs["choices"][0]["message"])
+     return rs["choices"][0]["message"]["content"]
+
+ # while 1>0:
+ #     inp = input("Your prompt to Virgil: ")
+ #     msgs.append({ "role": "user", "content": inp })
+ #     rs = llm.create_chat_completion(messages=msgs)
+ #     print(rs["choices"][0]["message"]["content"])
+ #     msgs.append(rs['choices'][0]["message"])
+
+ ## Generation kwargs
+ # generation_kwargs = {
+ #     "max_tokens": 600,
+ #     "stop": ["</s>"],
+ #     "echo": False,  # Echo the prompt in the output
+ #     "top_k": 1      # This is essentially greedy decoding, since the model will always return the highest-probability token. Set this value > 1 for sampling decoding
+ # }
+
+ ## Run inference
+ # prompt = "The meaning of life is "
+ # res = llm(prompt, **generation_kwargs)  # res is a dictionary
+
+ ## Unpack the generated text from the LLM response dictionary and print it
+ # print(res["choices"][0]["text"])
+ # res is short for result
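
GetResponse appends every user prompt and every model reply to the module-level msgs list, so the conversation history (and the prompt fed to the model) grows without bound across requests, and the commented-out generation_kwargs are never actually passed to create_chat_completion. One possible way to handle both; a hedged sketch that reuses the existing llm and msgs objects, where the MAX_TURNS constant and the temperature value are illustrative assumptions, not part of this commit:

```python
# Hypothetical variant of GetResponse, not part of this commit
MAX_TURNS = 20  # illustrative cap on retained user/assistant messages

def GetResponseBounded(prompt: str) -> str:
    msgs.append({"role": "user", "content": prompt})
    # Keep the system message plus only the most recent turns so n_ctx isn't exceeded
    trimmed = [msgs[0]] + msgs[1:][-MAX_TURNS:]
    rs = llm.create_chat_completion(
        messages=trimmed,
        max_tokens=600,     # mirrors the commented-out generation_kwargs
        stop=["</s>"],
        temperature=0.7,    # assumed sampling setting, adjust to taste
    )
    reply = rs["choices"][0]["message"]
    msgs.append(reply)
    return reply["content"]
```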
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ fastapi==0.110.0
+ huggingface_hub==0.21.4
+ pydantic==2.6.4
+ transformers==4.39.2
+ torch==2.2.2
+ uvicorn  # needed by the Dockerfile CMD to serve the FastAPI app
+ llama-cpp-python  # provides the llama_cpp module imported in nyx2.py