Spaces:
Running
Running
Upload with huggingface_hub
Browse files- Dockerfile +20 -0
- app.py +22 -0
- requirements.txt +5 -0
- utils/extractor.py +39 -0
- utils/summarizer.py +81 -0
- workcell.yaml +10 -0
Dockerfile
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.8

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user
# Switch to the "user" user
USER user
# Set home to the user's home directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Copy the current directory contents into the container at $HOME/app setting the owner to the user
COPY --chown=user . $HOME/app
RUN pip install --no-cache-dir --upgrade -r $HOME/app/requirements.txt

# Serve the workcell app on the port Hugging Face Spaces expects (7860).
CMD ["workcell", "serve", "--config", "workcell.yaml", "--host", "0.0.0.0", "--port", "7860"]
|
app.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import openai
|
3 |
+
from typing import Dict, List
|
4 |
+
from pydantic import BaseModel, Field
|
5 |
+
from utils.summarizer import get_analyze_result
|
6 |
+
from utils.extractor import get_html_text
|
7 |
+
from workcell.integrations.types import MarkdownMixin
|
8 |
+
|
9 |
+
|
10 |
+
class Input(BaseModel):
    """Request schema for ``analyze_url``: the web page to analyze."""

    # URL of the article to fetch; defaults to an OpenAI blog post as a demo.
    url: str = Field(default="https://openai.com/blog/introducing-chatgpt-and-whisper-apis", description="An url string which you want to analyze automatically.")
|
12 |
+
|
13 |
+
def analyze_url(input: Input) -> MarkdownMixin:
    """Returns a thought provoking discussion questions from url provided, generated by OpenAI GPT3 API."""
    # The API key is injected via the Space's secret store at request time.
    openai.api_key = os.getenv('SECRET_OPENAI_WORKCELL_WEBPAGE_QA')
    # Pipeline: download + clean the page, then ask the model for the
    # question/answer markdown and wrap it for workcell rendering.
    page_text = get_html_text(input.url)
    analysis_markdown = get_analyze_result(page_text)
    return MarkdownMixin(data=analysis_markdown)
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
workcell
|
2 |
+
openai
|
3 |
+
selectolax
|
4 |
+
transformers
|
5 |
+
torch
|
utils/extractor.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
from selectolax.parser import HTMLParser
|
3 |
+
import re
|
4 |
+
from string import punctuation
|
5 |
+
|
6 |
+
|
7 |
+
def preprocess_text(text):
    """Normalize raw page text: lowercase, drop punctuation, collapse whitespace."""
    lowered = text.lower()
    # Delete every ASCII punctuation character in a single C-level pass.
    stripped = lowered.translate(str.maketrans("", "", punctuation))
    # Collapse runs of spaces, tabs and newlines into single spaces.
    return " ".join(stripped.split())
|
13 |
+
|
14 |
+
def get_html(url, timeout=10):
    """Download *url* and return the response body as text (HTML).

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up
            (new parameter, defaults to 10). Without a timeout a stalled
            server would hang the request — and the whole Space — forever.

    Returns:
        The decoded response text.

    Raises:
        requests.RequestException: on connection errors or timeout.
    """
    # request web page; bounded wait so a dead host cannot block the worker
    resp = requests.get(url, timeout=timeout)
    # get the response text. in this case it is HTML
    return resp.text
|
20 |
+
|
21 |
+
def get_text(html):
    """Extract the visible, normalized text of an HTML document.

    Args:
        html: Raw HTML source.

    Returns:
        ``None`` when the document has no <body>; otherwise the body text
        with <script>/<style> content removed and run through
        ``preprocess_text`` (lowercased, punctuation stripped, whitespace
        collapsed).
    """
    tree = HTMLParser(html)
    if tree.body is None:
        return None
    # Drop non-visible content before extracting text.
    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()
    # Use a space separator so text from adjacent elements does not fuse
    # into one word (separator='' produced e.g. "TitleFirst paragraph" ->
    # "titlefirst", which corrupts the text fed to the model).
    text = tree.body.text(separator=' ')
    # preprocess
    text = preprocess_text(text)
    return text
|
34 |
+
|
35 |
+
def get_html_text(url):
    """Fetch *url* and return its cleaned, preprocessed body text."""
    # Convenience pipeline: download -> strip markup -> normalize.
    return get_text(get_html(url))
|
39 |
+
|
utils/summarizer.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import ast
|
2 |
+
import openai
|
3 |
+
from transformers import GPT2Tokenizer
|
4 |
+
|
5 |
+
# Initialize tokenizer
# NOTE: module-level, so the "gpt2" vocabulary is fetched (and cached) at
# import time; used by limit_tokens() to bound prompt length.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
|
7 |
+
|
8 |
+
# Prompt engineering
def get_prompt(text):
    """Build the chat prompt asking for 3 discussion questions about *text*.

    The instructions pin the model to a fixed markdown layout (## Q1/Q2/Q3
    headings, each followed by its answer) so the response can be rendered
    directly as markdown. An earlier JSON-output variant of this prompt was
    left here commented out; it has been removed as dead code.

    Args:
        text: The (already token-limited) article content.

    Returns:
        The full prompt string with *text* embedded at the end.
    """
    prompt_prefix = """Generate exactly 3 different and thought provoking discussion questions about given article below, and return the answers of these questions with the evidence.

Desired output should be a markdown format like this:

## Q1: <question>

<answer>

## Q2: <question>

<answer>

## Q3: <question>

<answer>

"""
    prompt_postfix ="""
Given article content: \"""{}.\"""
"""
    prompt = prompt_prefix + prompt_postfix.format(text)
    return prompt
|
36 |
+
|
37 |
+
def limit_tokens(text, n=3000):
    """Truncate *text* to its first *n* GPT-2 tokens.

    Keeps the prompt within the model's context window; the text is
    round-tripped through the tokenizer (encode, slice, decode).
    """
    # Tokenize, keep only the leading n ids, then detokenize back to text.
    token_ids = tokenizer.encode(text, return_tensors="pt")[0][:n]
    return tokenizer.decode(token_ids, skip_special_tokens=True)
|
44 |
+
|
45 |
+
|
46 |
+
# Chat completion
|
47 |
+
def get_openai_chatcompletion(text):
    """Get OpenAI Chat Completion result.

    The input is truncated to a model-safe token budget, wrapped in the
    question-generation prompt, and sent as a single user message to
    gpt-3.5-turbo.

    Args:
        text: Cleaned article text to analyze.

    Returns:
        The raw ChatCompletion response object.

    Raises:
        Whatever ``openai.ChatCompletion.create`` raises (the original
        bare ``except: raise`` was a no-op and has been removed).
    """
    processed_text = limit_tokens(text)
    augmented_prompt = get_prompt(processed_text)
    messages = [{"role": "user", "content": augmented_prompt}]
    return openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0.7,
    )
|
64 |
+
|
65 |
+
|
66 |
+
def get_analyze(result):
    """Extract the assistant's message text from a ChatCompletion response.

    Args:
        result: Response mapping from ``openai.ChatCompletion.create``.

    Returns:
        The generated markdown string.

    Raises:
        KeyError/IndexError: if the response is not in the expected shape
        (previously wrapped in a no-op bare ``except: raise``; earlier
        text-completion parsing variants were dead commented-out code).
    """
    # Chat API responses nest the text under choices[0].message.content.
    return result["choices"][0]["message"]["content"]
|
75 |
+
|
76 |
+
|
77 |
+
def get_analyze_result(text):
    """End-to-end analysis: query the chat model about *text*, return its markdown."""
    response = get_openai_chatcompletion(text)
    return get_analyze(response)
|
81 |
+
|
workcell.yaml
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: analyze_url
provider:
  name: huggingface
  repository: weanalyze/analyze_url
  branch: main
  version: latest
runtime: python3.8
entrypoint: app:analyze_url
tags: {}
envs: {}
|