Spaces:
Running
Running
Upload with huggingface_hub
Browse files- Dockerfile +20 -0
- app.py +22 -0
- requirements.txt +5 -0
- utils/extractor.py +39 -0
- utils/summarizer.py +81 -0
- workcell.yaml +10 -0
Dockerfile
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.8

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user
# Switch to the "user" user
USER user
# Set home to the user's home directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Copy the current directory contents into the container at $HOME/app setting the owner to the user
COPY --chown=user . $HOME/app
RUN pip install --no-cache-dir --upgrade -r $HOME/app/requirements.txt

# Serve the workcell app on the port Hugging Face Spaces expects (7860).
CMD ["workcell", "serve", "--config", "workcell.yaml", "--host", "0.0.0.0", "--port", "7860"]
|
app.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import openai
|
3 |
+
from typing import Dict, List
|
4 |
+
from pydantic import BaseModel, Field
|
5 |
+
from utils.summarizer import get_analyze_result
|
6 |
+
from utils.extractor import get_html_text
|
7 |
+
from workcell.integrations.types import MarkdownMixin
|
8 |
+
|
9 |
+
|
10 |
+
class Input(BaseModel):
    """Request schema for ``analyze_url``: the web page to analyze."""

    # URL of the article to fetch; defaults to an OpenAI blog post as a demo.
    url: str = Field(default="https://openai.com/blog/introducing-chatgpt-and-whisper-apis", description="An url string which you want to analyze automatically.")
|
12 |
+
|
13 |
+
def analyze_url(input: Input) -> MarkdownMixin:
    """Returns a thought provoking discussion questions from url provided, generated by OpenAI GPT3 API."""
    # The API key is injected via the Space's secret store at request time.
    openai.api_key = os.getenv('SECRET_OPENAI_WORKCELL_WEBPAGE_QA')
    # Pipeline: download + clean the page, then ask the model for the
    # question/answer markdown and wrap it for workcell rendering.
    page_text = get_html_text(input.url)
    analysis_markdown = get_analyze_result(page_text)
    return MarkdownMixin(data=analysis_markdown)
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
workcell
|
2 |
+
openai
|
3 |
+
selectolax
|
4 |
+
transformers
|
5 |
+
torch
|
utils/extractor.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
from selectolax.parser import HTMLParser
|
3 |
+
import re
|
4 |
+
from string import punctuation
|
5 |
+
|
6 |
+
|
7 |
+
def preprocess_text(text):
    """Normalize raw page text: lowercase, drop punctuation, collapse whitespace."""
    lowered = text.lower()
    # Delete every ASCII punctuation character in a single C-level pass.
    stripped = lowered.translate(str.maketrans("", "", punctuation))
    # Collapse runs of spaces, tabs and newlines into single spaces.
    return " ".join(stripped.split())
|
13 |
+
|
14 |
+
def get_html(url, timeout=10):
    """Download *url* and return the response body as text (HTML).

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up
            (new parameter, defaults to 10). Without a timeout a stalled
            server would hang the request — and the whole Space — forever.

    Returns:
        The decoded response text.

    Raises:
        requests.RequestException: on connection errors or timeout.
    """
    # request web page; bounded wait so a dead host cannot block the worker
    resp = requests.get(url, timeout=timeout)
    # get the response text. in this case it is HTML
    return resp.text
|
20 |
+
|
21 |
+
def get_text(html):
    """Extract the visible, normalized text of an HTML document.

    Args:
        html: Raw HTML source.

    Returns:
        ``None`` when the document has no <body>; otherwise the body text
        with <script>/<style> content removed and run through
        ``preprocess_text`` (lowercased, punctuation stripped, whitespace
        collapsed).
    """
    tree = HTMLParser(html)
    if tree.body is None:
        return None
    # Drop non-visible content before extracting text.
    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()
    # Use a space separator so text from adjacent elements does not fuse
    # into one word (separator='' produced e.g. "TitleFirst paragraph" ->
    # "titlefirst", which corrupts the text fed to the model).
    text = tree.body.text(separator=' ')
    # preprocess
    text = preprocess_text(text)
    return text
|
34 |
+
|
35 |
+
def get_html_text(url):
    """Fetch *url* and return its cleaned, preprocessed body text."""
    # Convenience pipeline: download -> strip markup -> normalize.
    return get_text(get_html(url))
|
39 |
+
|
utils/summarizer.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import ast
|
2 |
+
import openai
|
3 |
+
from transformers import GPT2Tokenizer
|
4 |
+
|
5 |
+
# Initialize tokenizer
# NOTE: module-level, so the "gpt2" vocabulary is fetched (and cached) at
# import time; used by limit_tokens() to bound prompt length.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
|
7 |
+
|
8 |
+
# Prompt engineering
def get_prompt(text):
    """Build the chat prompt asking for 3 discussion questions about *text*.

    The instructions pin the model to a fixed markdown layout (## Q1/Q2/Q3
    headings, each followed by its answer) so the response can be rendered
    directly as markdown. An earlier JSON-output variant of this prompt was
    left here commented out; it has been removed as dead code.

    Args:
        text: The (already token-limited) article content.

    Returns:
        The full prompt string with *text* embedded at the end.
    """
    prompt_prefix = """Generate exactly 3 different and thought provoking discussion questions about given article below, and return the answers of these questions with the evidence.

Desired output should be a markdown format like this:

## Q1: <question>

<answer>

## Q2: <question>

<answer>

## Q3: <question>

<answer>

"""
    prompt_postfix ="""
Given article content: \"""{}.\"""
"""
    prompt = prompt_prefix + prompt_postfix.format(text)
    return prompt
|
36 |
+
|
37 |
+
def limit_tokens(text, n=3000):
    """Truncate *text* to its first *n* GPT-2 tokens.

    Keeps the prompt within the model's context window; the text is
    round-tripped through the tokenizer (encode, slice, decode).
    """
    # Tokenize, keep only the leading n ids, then detokenize back to text.
    token_ids = tokenizer.encode(text, return_tensors="pt")[0][:n]
    return tokenizer.decode(token_ids, skip_special_tokens=True)
|
44 |
+
|
45 |
+
|
46 |
+
# Chat completion
|
47 |
+
def get_openai_chatcompletion(text):
    """Get OpenAI Chat Completion result.

    The input is truncated to a model-safe token budget, wrapped in the
    question-generation prompt, and sent as a single user message to
    gpt-3.5-turbo.

    Args:
        text: Cleaned article text to analyze.

    Returns:
        The raw ChatCompletion response object.

    Raises:
        Whatever ``openai.ChatCompletion.create`` raises (the original
        bare ``except: raise`` was a no-op and has been removed).
    """
    processed_text = limit_tokens(text)
    augmented_prompt = get_prompt(processed_text)
    messages = [{"role": "user", "content": augmented_prompt}]
    return openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0.7,
    )
|
64 |
+
|
65 |
+
|
66 |
+
def get_analyze(result):
    """Extract the assistant's message text from a ChatCompletion response.

    Args:
        result: Response mapping from ``openai.ChatCompletion.create``.

    Returns:
        The generated markdown string.

    Raises:
        KeyError/IndexError: if the response is not in the expected shape
        (previously wrapped in a no-op bare ``except: raise``; earlier
        text-completion parsing variants were dead commented-out code).
    """
    # Chat API responses nest the text under choices[0].message.content.
    return result["choices"][0]["message"]["content"]
|
75 |
+
|
76 |
+
|
77 |
+
def get_analyze_result(text):
    """End-to-end analysis: query the chat model about *text*, return its markdown."""
    response = get_openai_chatcompletion(text)
    return get_analyze(response)
|
81 |
+
|
workcell.yaml
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: analyze_url
provider:
  name: huggingface
  repository: weanalyze/analyze_url
  branch: main
  version: latest
runtime: python3.8
entrypoint: app:analyze_url
tags: {}
envs: {}
|