jiandong committed
Commit: 4ed95aa
Parent(s): e660645

Upload with huggingface_hub

Files changed (6)
  1. Dockerfile +20 -0
  2. app.py +22 -0
  3. requirements.txt +5 -0
  4. utils/extractor.py +39 -0
  5. utils/summarizer.py +81 -0
  6. workcell.yaml +10 -0
Dockerfile ADDED
@@ -0,0 +1,20 @@
+ # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+ # you will also find guides on how best to write your Dockerfile
+
+ FROM python:3.8
+
+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+ # Switch to the "user" user
+ USER user
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+ COPY --chown=user . $HOME/app
+ RUN pip install --no-cache-dir --upgrade -r $HOME/app/requirements.txt
+
+ CMD ["workcell", "serve", "--config", "workcell.yaml", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,22 @@
+ import os
+ import openai
+ from typing import Dict, List
+ from pydantic import BaseModel, Field
+ from utils.summarizer import get_analyze_result
+ from utils.extractor import get_html_text
+ from workcell.integrations.types import MarkdownMixin
+
+
+ class Input(BaseModel):
+     url: str = Field(default="https://openai.com/blog/introducing-chatgpt-and-whisper-apis", description="A URL you want to analyze automatically.")
+
+ def analyze_url(input: Input) -> MarkdownMixin:
+     """Returns thought-provoking discussion questions about the provided URL, generated via the OpenAI chat completion API (gpt-3.5-turbo)."""
+     openai.api_key = os.getenv('SECRET_OPENAI_WORKCELL_WEBPAGE_QA')
+     # extract the page text and summarize it into markdown
+     text = get_html_text(input.url)
+     markdown = get_analyze_result(text)
+     output = MarkdownMixin(
+         data=markdown
+     )
+     return output
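For a quick sanity check outside of the Space, the new endpoint can be called directly. The snippet below is an illustrative sketch, not part of this commit; it assumes a valid OpenAI key in the SECRET_OPENAI_WORKCELL_WEBPAGE_QA environment variable and that MarkdownMixin exposes the generated markdown via its data field, as app.py constructs it:

    # Hypothetical local invocation of app.analyze_url (not part of this commit)
    import os
    os.environ.setdefault("SECRET_OPENAI_WORKCELL_WEBPAGE_QA", "sk-...")  # placeholder key

    from app import Input, analyze_url

    result = analyze_url(Input(url="https://openai.com/blog/introducing-chatgpt-and-whisper-apis"))
    print(result.data)  # MarkdownMixin is built with data=markdown in app.py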
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ workcell
+ openai
+ selectolax
+ transformers
+ torch
utils/extractor.py ADDED
@@ -0,0 +1,39 @@
+ import requests
+ from selectolax.parser import HTMLParser
+ import re
+ from string import punctuation
+
+
+ def preprocess_text(text):
+     text = text.lower()  # lowercase text
+     # punctuation = r'\'\":'
+     text = re.sub(f"[{re.escape(punctuation)}]", "", text)  # remove punctuation
+     text = " ".join(text.split())  # remove extra spaces, tabs, and newlines
+     return text
+
+ def get_html(url):
+     # request the web page
+     resp = requests.get(url)
+     # get the response text; in this case it is HTML
+     html = resp.text
+     return html
+
+ def get_text(html):
+     tree = HTMLParser(html)
+     if tree.body is None:
+         return None
+     for tag in tree.css('script'):
+         tag.decompose()
+     for tag in tree.css('style'):
+         tag.decompose()
+     # get the text from the body tag
+     text = tree.body.text(separator='')
+     # preprocess
+     text = preprocess_text(text)
+     return text
+
+ def get_html_text(url):
+     html = get_html(url)
+     text = get_text(html)
+     return text
+
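The extractor can also be exercised on its own. A minimal sketch follows (the example URL and network access are assumptions, not part of this commit):

    # Hypothetical standalone check of utils/extractor.py (not part of this commit)
    from utils.extractor import get_html_text

    text = get_html_text("https://openai.com/blog/introducing-chatgpt-and-whisper-apis")
    print(text[:200] if text else "no <body> tag found")  # get_text returns None when the page has no body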
utils/summarizer.py ADDED
@@ -0,0 +1,81 @@
+ import ast
+ import openai
+ from transformers import GPT2Tokenizer
+
+ # Initialize tokenizer
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+
+ # Prompt engineering
+ def get_prompt(text):
+     # prompt_prefix = """Generate exactly 3 different and thought provoking discussion questions about given article below, and return the answers of these questions with the evidence.
+
+     # Desired output format: [{"Q":<question>,"A":<answer>},{"Q":<question>,"A":<answer>},{"Q":<question>,"A":<answer>}].
+     # """
+     prompt_prefix = """Generate exactly 3 different and thought-provoking discussion questions about the given article below, and return the answers to these questions with the evidence.
+
+     Desired output should be in markdown format like this:
+
+     ## Q1: <question>
+
+     <answer>
+
+     ## Q2: <question>
+
+     <answer>
+
+     ## Q3: <question>
+
+     <answer>
+
+     """
+     prompt_postfix = """
+     Given article content: \"""{}.\"""
+     """
+     prompt = prompt_prefix + prompt_postfix.format(text)
+     return prompt
+
+ def limit_tokens(text, n=3000):
+     # Get the first n tokens from the input text
+     input_ids = tokenizer.encode(text, return_tensors="pt")
+     first_n_tokens = input_ids[:, :n]
+     # Convert the first n tokens back to text format
+     processed_text = tokenizer.decode(first_n_tokens[0], skip_special_tokens=True)
+     return processed_text
+
+
+ # Chat completion
+ def get_openai_chatcompletion(text):
+     """Get OpenAI Chat Completion result.
+     """
+     messages = []
+     processed_text = limit_tokens(text)
+     augmented_prompt = get_prompt(processed_text)
+     messages.append({"role": "user", "content": augmented_prompt})
+
+     try:
+         result = openai.ChatCompletion.create(
+             model="gpt-3.5-turbo",
+             messages=messages,
+             temperature=0.7
+         )
+     except:
+         raise
+     return result
+
+
+ def get_analyze(result):
+     try:
+         # analyze = ast.literal_eval(result["choices"][0]['text'])
+         # analyze = eval(result["choices"][0]['text'])
+         # analyze = result["choices"][0]['text']
+         analyze = result["choices"][0]["message"]["content"]
+     except:
+         raise
+     return analyze
+
+
+ def get_analyze_result(text):
+     result = get_openai_chatcompletion(text)
+     analyze = get_analyze(result)
+     return analyze
+
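Because limit_tokens truncates the article to at most 3000 GPT-2 tokens before prompting, long pages can be passed straight through. A rough composition sketch, assuming openai.api_key has already been set as in app.py (the stand-in article text below is illustrative, not from this commit):

    # Hypothetical end-to-end call of utils/summarizer.py on pre-extracted text (not part of this commit)
    import os
    import openai
    openai.api_key = os.getenv("SECRET_OPENAI_WORKCELL_WEBPAGE_QA")

    from utils.summarizer import limit_tokens, get_analyze_result

    article = "OpenAI released new APIs for ChatGPT and Whisper. " * 500  # stand-in article text
    truncated = limit_tokens(article)                 # keeps at most the first 3000 tokens
    print(len(truncated.split()), "words after truncation")
    print(get_analyze_result(article))                # markdown with three Q&A sections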
workcell.yaml ADDED
@@ -0,0 +1,10 @@
+ name: analyze_url
+ provider:
+   name: huggingface
+   repository: weanalyze/analyze_url
+   branch: main
+   version: latest
+ runtime: python3.8
+ entrypoint: app:analyze_url
+ tags: {}
+ envs: {}