Spaces:

myscale
/

ChatData

Running

App Files Community

Fangrui Liu committed on Sep 25, 2023

Commit

45180a0

•

1 Parent(s): d5a4cb4

add wikipedia

Browse files

Files changed (4) hide show

app.py +235 -132
callbacks/arxiv_callbacks.py +1 -1
chains/arxiv_chains.py +49 -5
prompts/arxiv_prompt.py +4 -4

app.py CHANGED Viewed

@@ -1,3 +1,25 @@
 import re
 import pandas as pd
 from os import environ
@@ -6,34 +28,156 @@ import datetime
 environ['TOKENIZERS_PARALLELISM'] = 'true'
 environ['OPENAI_API_BASE'] = st.secrets['OPENAI_API_BASE']
-from langchain.vectorstores import MyScale, MyScaleSettings
-from langchain.embeddings import HuggingFaceInstructEmbeddings
-from langchain.retrievers.self_query.base import SelfQueryRetriever
-from langchain.chains.query_constructor.base import AttributeInfo, VirtualColumnName
-from langchain import OpenAI
-from langchain.chat_models import ChatOpenAI
-from langchain.prompts.prompt import PromptTemplate
-from langchain.prompts import PromptTemplate, ChatPromptTemplate, \
-    SystemMessagePromptTemplate, HumanMessagePromptTemplate
-from sqlalchemy import create_engine, MetaData
-from langchain.chains import LLMChain
-from langchain.utilities.sql_database import SQLDatabase
-from langchain_experimental.retrievers.vector_sql_database import VectorSQLDatabaseChainRetriever
-from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain
-from chains.arxiv_chains import VectorSQLRetrieveCustomOutputParser
-from chains.arxiv_chains import ArXivQAwithSourcesChain, ArXivStuffDocumentChain
-from callbacks.arxiv_callbacks import ChatDataSelfSearchCallBackHandler, \
-    ChatDataSelfAskCallBackHandler, ChatDataSQLSearchCallBackHandler, \
-    ChatDataSQLAskCallBackHandler
-from prompts.arxiv_prompt import combine_prompt_template, _myscale_prompt
-st.set_page_config(page_title="ChatData")
-st.header("ChatData")
 def try_eval(x):
@@ -55,14 +199,14 @@ def display(dataframe, columns_=None, index=None):
         st.write("Sorry 😵 we didn't find any articles related to your query.\n\nMaybe the LLM is too naughty that does not follow our instruction... \n\nPlease try again and use verbs that may match the datatype.", unsafe_allow_html=True)
-@st.cache_resource
-def build_retriever():
     with st.spinner("Loading Model..."):
-        embeddings = HuggingFaceInstructEmbeddings(
-            model_name='hkunlp/instructor-xl',
-            embed_instruction="Represent the question for retrieving supporting scientific papers: ")
-    with st.spinner("Connecting DB..."):
         myscale_connection = {
             "host": st.secrets['MYSCALE_HOST'],
             "port": st.secrets['MYSCALE_PORT'],
@@ -70,69 +214,40 @@ def build_retriever():
             "password": st.secrets['MYSCALE_PASSWORD'],
         }
-        config = MyScaleSettings(**myscale_connection, table='ChatArXiv',
                                  column_map={
                                      "id": "id",
-                                     "text": "abstract",
-                                     "vector": "vector",
-                                     "metadata": "metadata"
                                  })
-        doc_search = MyScale(embeddings, config)
-    with st.spinner("Building Self Query Retriever..."):
-        metadata_field_info = [
-            AttributeInfo(
-                name=VirtualColumnName(name="pubdate"),
-                description="The year the paper is published",
-                type="timestamp",
-            ),
-            AttributeInfo(
-                name="authors",
-                description="List of author names",
-                type="list[string]",
-            ),
-            AttributeInfo(
-                name="title",
-                description="Title of the paper",
-                type="string",
-            ),
-            AttributeInfo(
-                name="categories",
-                description="arxiv categories to this paper",
-                type="list[string]"
-            ),
-            AttributeInfo(
-                name="length(categories)",
-                description="length of arxiv categories to this paper",
-                type="int"
-            ),
-        ]
         retriever = SelfQueryRetriever.from_llm(
-            OpenAI(openai_api_key=st.secrets['OPENAI_API_KEY'], temperature=0),
             doc_search, "Scientific papers indexes with abstracts. All in English.", metadata_field_info,
-            use_original_query=False)
-    document_with_metadata_prompt = PromptTemplate(
-        input_variables=["page_content", "id", "title", "ref_id",
-                        "authors", "pubdate", "categories"],
-        template="Title for PDF #{ref_id}: {title}\n\tAbstract: {page_content}\n\tAuthors: {authors}\n\tDate of Publication: {pubdate}\n\tCategories: {categories}\nSOURCE: {id}")
     COMBINE_PROMPT = ChatPromptTemplate.from_strings(
         string_messages=[(SystemMessagePromptTemplate, combine_prompt_template),
-                        (HumanMessagePromptTemplate, '{question}')])
     OPENAI_API_KEY = st.secrets['OPENAI_API_KEY']
-    with st.spinner('Building QA Chain with Self-query...'):
         chain = ArXivQAwithSourcesChain(
             retriever=retriever,
             combine_documents_chain=ArXivStuffDocumentChain(
                 llm_chain=LLMChain(
                     prompt=COMBINE_PROMPT,
-                    llm=ChatOpenAI(model_name='gpt-3.5-turbo-16k',
-                                openai_api_key=OPENAI_API_KEY, temperature=0.6),
                 ),
-                document_prompt=document_with_metadata_prompt,
                 document_variable_name="summaries",
             ),
@@ -140,23 +255,22 @@ def build_retriever():
             max_tokens_limit=12000,
         )
-    with st.spinner('Building Vector SQL Database Retriever'):
         MYSCALE_USER = st.secrets['MYSCALE_USER']
         MYSCALE_PASSWORD = st.secrets['MYSCALE_PASSWORD']
         MYSCALE_HOST = st.secrets['MYSCALE_HOST']
         MYSCALE_PORT = st.secrets['MYSCALE_PORT']
         engine = create_engine(
-            f'clickhouse://{MYSCALE_USER}:{MYSCALE_PASSWORD}@{MYSCALE_HOST}:{MYSCALE_PORT}/default?protocol=https')
         metadata = MetaData(bind=engine)
         PROMPT = PromptTemplate(
             input_variables=["input", "table_info", "top_k"],
             template=_myscale_prompt,
         )
         output_parser = VectorSQLRetrieveCustomOutputParser.from_embeddings(
-            model=embeddings)
         sql_query_chain = VectorSQLDatabaseChain.from_llm(
-            llm=OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0),
             prompt=PROMPT,
             top_k=10,
             return_direct=True,
@@ -165,18 +279,18 @@ def build_retriever():
             native_format=True
         )
         sql_retriever = VectorSQLDatabaseChainRetriever(
-            sql_db_chain=sql_query_chain, page_content_key="abstract")
-    with st.spinner('Building QA Chain with Vector SQL...'):
         sql_chain = ArXivQAwithSourcesChain(
             retriever=sql_retriever,
             combine_documents_chain=ArXivStuffDocumentChain(
                 llm_chain=LLMChain(
                     prompt=COMBINE_PROMPT,
-                    llm=ChatOpenAI(model_name='gpt-3.5-turbo-16k',
-                                openai_api_key=OPENAI_API_KEY, temperature=0.6),
                 ),
-                document_prompt=document_with_metadata_prompt,
                 document_variable_name="summaries",
             ),
@@ -184,48 +298,33 @@ def build_retriever():
             max_tokens_limit=12000,
         )
-    return [{'name': m.name.name if type(m.name) is VirtualColumnName else m.name, 'desc': m.description, 'type': m.type} for m in metadata_field_info], \
-        retriever, chain, sql_retriever, sql_chain
 if 'retriever' not in st.session_state:
-    st.session_state['metadata_columns'], \
-        st.session_state['retriever'], \
-        st.session_state['chain'], \
-        st.session_state['sql_retriever'], \
-        st.session_state['sql_chain'] = build_retriever()
-st.info("We provides you metadata columns below for query. Please choose a natural expression to describe filters on those columns.\n\n"
-        "For example: \n\n"
-        "*If you want to search papers with complex filters*:\n\n"
-        "- What is a Bayesian network? Please use articles published later than Feb 2018 and with more than 2 categories and whose title like `computer` and must have `cs.CV` in its category.\n\n"
-        "*If you want to ask questions based on papers in database*:\n\n"
-        "- What is PageRank?\n"
-        "- Did Geoffrey Hinton wrote paper about Capsule Neural Networks?\n"
-        "- Introduce some applications of GANs published around 2019.\n"
-        "- 请根据 2019 年左右的文章介绍一下 GAN 的应用都有哪些\n"
-        "- Veuillez présenter les applications du GAN sur la base des articles autour de 2019 ?\n"
-        "- Is it possible to synthesize room temperature super conductive material?")
 tab_sql, tab_self_query = st.tabs(['Vector SQL', 'Self-Query Retrievers'])
 with tab_sql:
-    st.info("You can retrieve papers with button `Query` or ask questions based on retrieved papers with button `Ask`.", icon='💡')
-    st.markdown('''```sql
-CREATE TABLE default.ChatArXiv (
-    `abstract` String,
-    `id` String,
-    `vector` Array(Float32),
-    `metadata` Object('JSON'),
-    `pubdate` DateTime,
-    `title` String,
-    `categories` Array(String),
-    `authors` Array(String),
-    `comment` String,
-    `primary_category` String,
-    VECTOR INDEX vec_idx vector TYPE MSTG('metric_type=Cosine'),
-    CONSTRAINT vec_len CHECK length(vector) = 768)
-ENGINE = ReplacingMergeTree ORDER BY id
-```''')
     st.text_input("Ask a question:", key='query_sql')
     cols = st.columns([1, 1, 7])
     cols[0].button("Query", key='search_sql')
@@ -237,7 +336,7 @@ ENGINE = ReplacingMergeTree ORDER BY id
         with plc_hldr.expander('Query Log', expanded=True):
             callback = ChatDataSQLSearchCallBackHandler()
             try:
-                docs = st.session_state.sql_retriever.get_relevant_documents(
                     st.session_state.query_sql, callbacks=[callback])
                 callback.progress_bar.progress(value=1.0, text="Done!")
                 docs = pd.DataFrame(
@@ -253,14 +352,16 @@ ENGINE = ReplacingMergeTree ORDER BY id
         with plc_hldr.expander('Chat Log', expanded=True):
             callback = ChatDataSQLAskCallBackHandler()
             try:
-                ret = st.session_state.sql_chain(
                     st.session_state.query_sql, callbacks=[callback])
                 callback.progress_bar.progress(value=1.0, text="Done!")
                 st.markdown(
                     f"### Answer from LLM\n{ret['answer']}\n### References")
                 docs = ret['sources']
-                docs = pd.DataFrame([{**d.metadata, 'abstract': d.page_content} for d in docs])
-                display(docs, ['ref_id', 'title', 'id', 'categories', 'abstract', 'authors', 'pubdate'], index='ref_id')
             except Exception as e:
                 st.write('Oops 😵 Something bad happened...')
                 raise e
@@ -268,7 +369,7 @@ ENGINE = ReplacingMergeTree ORDER BY id
 with tab_self_query:
     st.info("You can retrieve papers with button `Query` or ask questions based on retrieved papers with button `Ask`.", icon='💡')
-    st.dataframe(st.session_state.metadata_columns)
     st.text_input("Ask a question:", key='query_self')
     cols = st.columns([1, 1, 7])
     cols[0].button("Query", key='search_self')
@@ -281,13 +382,13 @@ with tab_self_query:
             call_back = None
             callback = ChatDataSelfSearchCallBackHandler()
             try:
-                docs = st.session_state.retriever.get_relevant_documents(
                     st.session_state.query_self, callbacks=[callback])
                 callback.progress_bar.progress(value=1.0, text="Done!")
                 docs = pd.DataFrame(
                     [{**d.metadata, 'abstract': d.page_content} for d in docs])
-                display(docs, ['title', 'id', 'categories', 'abstract', 'authors', 'pubdate'])
             except Exception as e:
                 st.write('Oops 😵 Something bad happened...')
                 raise e
@@ -299,14 +400,16 @@ with tab_self_query:
             call_back = None
             callback = ChatDataSelfAskCallBackHandler()
             try:
-                ret = st.session_state.chain(
                     st.session_state.query_self, callbacks=[callback])
                 callback.progress_bar.progress(value=1.0, text="Done!")
                 st.markdown(
                     f"### Answer from LLM\n{ret['answer']}\n### References")
                 docs = ret['sources']
-                docs = pd.DataFrame([{**d.metadata, 'abstract': d.page_content} for d in docs])
-                display(docs, ['title', 'id', 'categories', 'abstract', 'authors', 'pubdate'], index='ref_id')
             except Exception as e:
                 st.write('Oops 😵 Something bad happened...')
                 raise e

+from prompts.arxiv_prompt import combine_prompt_template, _myscale_prompt
+from callbacks.arxiv_callbacks import ChatDataSelfSearchCallBackHandler, \
+    ChatDataSelfAskCallBackHandler, ChatDataSQLSearchCallBackHandler, \
+    ChatDataSQLAskCallBackHandler
+from chains.arxiv_chains import ArXivQAwithSourcesChain, ArXivStuffDocumentChain
+from chains.arxiv_chains import VectorSQLRetrieveCustomOutputParser
+from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain
+from langchain_experimental.retrievers.vector_sql_database import VectorSQLDatabaseChainRetriever
+from langchain.utilities.sql_database import SQLDatabase
+from langchain.chains import LLMChain
+from sqlalchemy import create_engine, MetaData
+from langchain.prompts import PromptTemplate, ChatPromptTemplate, \
+    SystemMessagePromptTemplate, HumanMessagePromptTemplate
+from langchain.prompts.prompt import PromptTemplate
+from langchain.chat_models import ChatOpenAI
+from langchain import OpenAI
+from langchain.chains.query_constructor.base import AttributeInfo, VirtualColumnName
+from langchain.retrievers.self_query.base import SelfQueryRetriever
+from langchain.retrievers.self_query.myscale import MyScaleTranslator
+from langchain.embeddings import HuggingFaceInstructEmbeddings, SentenceTransformerEmbeddings
+from langchain.vectorstores import MyScaleSettings
+from chains.arxiv_chains import MyScaleWithoutMetadataJson
 import re
 import pandas as pd
 from os import environ
 environ['TOKENIZERS_PARALLELISM'] = 'true'
 environ['OPENAI_API_BASE'] = st.secrets['OPENAI_API_BASE']
+st.set_page_config(page_title="ChatData")
+st.header("ChatData")
+# query_model_name = "gpt-3.5-turbo-instruct"
+query_model_name = "text-davinci-003"
+chat_model_name = "gpt-3.5-turbo-16k"
+def hint_arxiv():
+    st.info("We provides you metadata columns below for query. Please choose a natural expression to describe filters on those columns.\n\n"
+            "For example: \n\n"
+            "*If you want to search papers with complex filters*:\n\n"
+            "- What is a Bayesian network? Please use articles published later than Feb 2018 and with more than 2 categories and whose title like `computer` and must have `cs.CV` in its category.\n\n"
+            "*If you want to ask questions based on papers in database*:\n\n"
+            "- What is PageRank?\n"
+            "- Did Geoffrey Hinton wrote paper about Capsule Neural Networks?\n"
+            "- Introduce some applications of GANs published around 2019.\n"
+            "- 请根据 2019 年左右的文章介绍一下 GAN 的应用都有哪些\n"
+            "- Veuillez présenter les applications du GAN sur la base des articles autour de 2019 ?\n"
+            "- Is it possible to synthesize room temperature super conductive material?")
+def hint_sql_arxiv():
+    st.info("You can retrieve papers with button `Query` or ask questions based on retrieved papers with button `Ask`.", icon='💡')
+    st.markdown('''```sql
+CREATE TABLE default.ChatArXiv (
+    `abstract` String,
+    `id` String,
+    `vector` Array(Float32),
+    `metadata` Object('JSON'),
+    `pubdate` DateTime,
+    `title` String,
+    `categories` Array(String),
+    `authors` Array(String),
+    `comment` String,
+    `primary_category` String,
+    VECTOR INDEX vec_idx vector TYPE MSTG('fp16_storage=1', 'metric_type=Cosine', 'disk_mode=3'),
+    CONSTRAINT vec_len CHECK length(vector) = 768)
+ENGINE = ReplacingMergeTree ORDER BY id
+```''')
+def hint_wiki():
+    st.info("We provides you metadata columns below for query. Please choose a natural expression to describe filters on those columns.\n\n"
+            "For example: \n\n"
+            "- Which company did Elon Musk found?\n"
+            "- What is Iron Gwazi?\n"
+            "- What is a Ring in mathematics?\n"
+            "- 苹果的发源地是那里？\n")
+def hint_sql_wiki():
+    st.info("You can retrieve papers with button `Query` or ask questions based on retrieved papers with button `Ask`.", icon='💡')
+    st.markdown('''```sql
+CREATE TABLE wiki.Wikipedia (
+    `id` String,
+    `title` String,
+    `text` String,
+    `url` String,
+    `wiki_id` UInt64,
+    `views` Float32,
+    `paragraph_id` UInt64,
+    `langs` UInt32,
+    `emb` Array(Float32),
+    VECTOR INDEX vec_idx emb TYPE MSTG('fp16_storage=1', 'metric_type=Cosine', 'disk_mode=3'),
+    CONSTRAINT emb_len CHECK length(emb) = 768)
+ENGINE = ReplacingMergeTree ORDER BY id
+```''')
+sel_map = {
+    'Wikipedia': {
+        "database": "wiki",
+        "table": "Wikipedia",
+        "hint": hint_wiki,
+        "hint_sql": hint_sql_wiki,
+        "doc_prompt": PromptTemplate(
+            input_variables=["page_content", "url", "title", "ref_id", "views"],
+            template="Title for Doc #{ref_id}: {title}\n\tviews: {views}\n\tcontent: {page_content}\nSOURCE: {url}"),
+        "metadata_cols": [
+            AttributeInfo(
+                name="title",
+                description="title of the wikipedia page",
+                type="string",
+            ),
+            AttributeInfo(
+                name="text",
+                description="paragraph from this wiki page",
+                type="string",
+            ),
+            AttributeInfo(
+                name="views",
+                description="number of views",
+                type="float"
+            ),
+        ],
+        "must_have_cols": ['id', 'title', 'url', 'text', 'views'],
+        "vector_col": "emb",
+        "text_col": "text",
+        "metadata_col": "metadata",
+        "emb_model": lambda: SentenceTransformerEmbeddings(
+            model_name='sentence-transformers/paraphrase-multilingual-mpnet-base-v2',)
+    },
+    'ArXiv Papers': {
+        "database": "default",
+        "table": "ChatArXiv",
+        "hint": hint_arxiv,
+        "hint_sql": hint_sql_arxiv,
+        "doc_prompt": PromptTemplate(
+            input_variables=["page_content", "id", "title", "ref_id",
+                             "authors", "pubdate", "categories"],
+            template="Title for Doc #{ref_id}: {title}\n\tAbstract: {page_content}\n\tAuthors: {authors}\n\tDate of Publication: {pubdate}\n\tCategories: {categories}\nSOURCE: {id}"),
+        "metadata_cols": [
+            AttributeInfo(
+                name=VirtualColumnName(name="pubdate"),
+                description="The year the paper is published",
+                type="timestamp",
+            ),
+            AttributeInfo(
+                name="authors",
+                description="List of author names",
+                type="list[string]",
+            ),
+            AttributeInfo(
+                name="title",
+                description="Title of the paper",
+                type="string",
+            ),
+            AttributeInfo(
+                name="categories",
+                description="arxiv categories to this paper",
+                type="list[string]"
+            ),
+            AttributeInfo(
+                name="length(categories)",
+                description="length of arxiv categories to this paper",
+                type="int"
+            ),
+        ],
+        "must_have_cols": ['title', 'id', 'categories', 'abstract', 'authors', 'pubdate'],
+        "vector_col": "vector",
+        "text_col": "abstract",
+        "metadata_col": "metadata",
+        "emb_model": lambda: HuggingFaceInstructEmbeddings(
+            model_name='hkunlp/instructor-xl',
+            embed_instruction="Represent the question for retrieving supporting scientific papers: ")
+    }
+}
 def try_eval(x):
         st.write("Sorry 😵 we didn't find any articles related to your query.\n\nMaybe the LLM is too naughty that does not follow our instruction... \n\nPlease try again and use verbs that may match the datatype.", unsafe_allow_html=True)
+def build_embedding_model(_sel):
     with st.spinner("Loading Model..."):
+        embeddings = sel_map[_sel]["emb_model"]()
+    return embeddings
+def build_retriever(_sel):
+    with st.spinner(f"Connecting DB for {_sel}..."):
         myscale_connection = {
             "host": st.secrets['MYSCALE_HOST'],
             "port": st.secrets['MYSCALE_PORT'],
             "password": st.secrets['MYSCALE_PASSWORD'],
         }
+        config = MyScaleSettings(**myscale_connection,
+                                 database=sel_map[_sel]["database"],
+                                 table=sel_map[_sel]["table"],
                                  column_map={
                                      "id": "id",
+                                     "text": sel_map[_sel]["text_col"],
+                                     "vector": sel_map[_sel]["vector_col"],
+                                     "metadata": sel_map[_sel]["metadata_col"]
                                  })
+        doc_search = MyScaleWithoutMetadataJson(st.session_state[f"emb_model_{_sel}"], config,
+                                                must_have_cols=sel_map[_sel]['must_have_cols'])
+    with st.spinner(f"Building Self Query Retriever for {_sel}..."):
+        metadata_field_info = sel_map[_sel]["metadata_cols"]
         retriever = SelfQueryRetriever.from_llm(
+            OpenAI(model_name=query_model_name, openai_api_key=st.secrets['OPENAI_API_KEY'], temperature=0),
             doc_search, "Scientific papers indexes with abstracts. All in English.", metadata_field_info,
+            use_original_query=False, structured_query_translator=MyScaleTranslator())
     COMBINE_PROMPT = ChatPromptTemplate.from_strings(
         string_messages=[(SystemMessagePromptTemplate, combine_prompt_template),
+                         (HumanMessagePromptTemplate, '{question}')])
     OPENAI_API_KEY = st.secrets['OPENAI_API_KEY']
+    with st.spinner(f'Building QA Chain with Self-query for {_sel}...'):
         chain = ArXivQAwithSourcesChain(
             retriever=retriever,
             combine_documents_chain=ArXivStuffDocumentChain(
                 llm_chain=LLMChain(
                     prompt=COMBINE_PROMPT,
+                    llm=ChatOpenAI(model_name=chat_model_name,
+                                   openai_api_key=OPENAI_API_KEY, temperature=0.6),
                 ),
+                document_prompt=sel_map[_sel]["doc_prompt"],
                 document_variable_name="summaries",
             ),
             max_tokens_limit=12000,
         )
+    with st.spinner(f'Building Vector SQL Database Retriever for {_sel}...'):
         MYSCALE_USER = st.secrets['MYSCALE_USER']
         MYSCALE_PASSWORD = st.secrets['MYSCALE_PASSWORD']
         MYSCALE_HOST = st.secrets['MYSCALE_HOST']
         MYSCALE_PORT = st.secrets['MYSCALE_PORT']
         engine = create_engine(
+            f'clickhouse://{MYSCALE_USER}:{MYSCALE_PASSWORD}@{MYSCALE_HOST}:{MYSCALE_PORT}/{sel_map[_sel]["database"]}?protocol=https')
         metadata = MetaData(bind=engine)
         PROMPT = PromptTemplate(
             input_variables=["input", "table_info", "top_k"],
             template=_myscale_prompt,
         )
         output_parser = VectorSQLRetrieveCustomOutputParser.from_embeddings(
+            model=st.session_state[f'emb_model_{_sel}'], must_have_columns=sel_map[_sel]["must_have_cols"])
         sql_query_chain = VectorSQLDatabaseChain.from_llm(
+            llm=OpenAI(model_name=query_model_name, openai_api_key=OPENAI_API_KEY, temperature=0),
             prompt=PROMPT,
             top_k=10,
             return_direct=True,
             native_format=True
         )
         sql_retriever = VectorSQLDatabaseChainRetriever(
+            sql_db_chain=sql_query_chain, page_content_key=sel_map[_sel]["text_col"])
+    with st.spinner(f'Building QA Chain with Vector SQL for {_sel}...'):
         sql_chain = ArXivQAwithSourcesChain(
             retriever=sql_retriever,
             combine_documents_chain=ArXivStuffDocumentChain(
                 llm_chain=LLMChain(
                     prompt=COMBINE_PROMPT,
+                    llm=ChatOpenAI(model_name=chat_model_name,
+                                   openai_api_key=OPENAI_API_KEY, temperature=0.6),
                 ),
+                document_prompt=sel_map[_sel]["doc_prompt"],
                 document_variable_name="summaries",
             ),
             max_tokens_limit=12000,
         )
+    return {
+        "metadata_columns": [{'name': m.name.name if type(m.name) is VirtualColumnName else m.name, 'desc': m.description, 'type': m.type} for m in metadata_field_info],
+        "retriever": retriever,
+        "chain": chain,
+        "sql_retriever": sql_retriever,
+        "sql_chain": sql_chain
+    }
+@st.cache_resource
+def build_all():
+    sel_map_obj = {}
+    for k in sel_map:
+        st.session_state[f'emb_model_{k}'] = build_embedding_model(k)
+        sel_map_obj[k] = build_retriever(k)
+    return sel_map_obj
 if 'retriever' not in st.session_state:
+    st.session_state["sel_map_obj"] = build_all()
+sel = st.selectbox('Choose the knowledge base you want to ask with:',
+                   options=['ArXiv Papers', 'Wikipedia'])
+sel_map[sel]['hint']()
 tab_sql, tab_self_query = st.tabs(['Vector SQL', 'Self-Query Retrievers'])
 with tab_sql:
+    sel_map[sel]['hint_sql']()
     st.text_input("Ask a question:", key='query_sql')
     cols = st.columns([1, 1, 7])
     cols[0].button("Query", key='search_sql')
         with plc_hldr.expander('Query Log', expanded=True):
             callback = ChatDataSQLSearchCallBackHandler()
             try:
+                docs = st.session_state.sel_map_obj[sel]["sql_retriever"].get_relevant_documents(
                     st.session_state.query_sql, callbacks=[callback])
                 callback.progress_bar.progress(value=1.0, text="Done!")
                 docs = pd.DataFrame(
         with plc_hldr.expander('Chat Log', expanded=True):
             callback = ChatDataSQLAskCallBackHandler()
             try:
+                ret = st.session_state.sel_map_obj[sel]["sql_chain"](
                     st.session_state.query_sql, callbacks=[callback])
                 callback.progress_bar.progress(value=1.0, text="Done!")
                 st.markdown(
                     f"### Answer from LLM\n{ret['answer']}\n### References")
                 docs = ret['sources']
+                docs = pd.DataFrame(
+                    [{**d.metadata, 'abstract': d.page_content} for d in docs])
+                display(
+                    docs, ['ref_id'] + sel_map[sel]["must_have_cols"], index='ref_id')
             except Exception as e:
                 st.write('Oops 😵 Something bad happened...')
                 raise e
 with tab_self_query:
     st.info("You can retrieve papers with button `Query` or ask questions based on retrieved papers with button `Ask`.", icon='💡')
+    st.dataframe(st.session_state.sel_map_obj[sel]["metadata_columns"])
     st.text_input("Ask a question:", key='query_self')
     cols = st.columns([1, 1, 7])
     cols[0].button("Query", key='search_self')
             call_back = None
             callback = ChatDataSelfSearchCallBackHandler()
             try:
+                docs = st.session_state.sel_map_obj[sel]["retriever"].get_relevant_documents(
                     st.session_state.query_self, callbacks=[callback])
+                print(docs)
                 callback.progress_bar.progress(value=1.0, text="Done!")
                 docs = pd.DataFrame(
                     [{**d.metadata, 'abstract': d.page_content} for d in docs])
+                display(docs, sel_map[sel]["must_have_cols"])
             except Exception as e:
                 st.write('Oops 😵 Something bad happened...')
                 raise e
             call_back = None
             callback = ChatDataSelfAskCallBackHandler()
             try:
+                ret = st.session_state.sel_map_obj[sel]["chain"](
                     st.session_state.query_self, callbacks=[callback])
                 callback.progress_bar.progress(value=1.0, text="Done!")
                 st.markdown(
                     f"### Answer from LLM\n{ret['answer']}\n### References")
                 docs = ret['sources']
+                docs = pd.DataFrame(
+                    [{**d.metadata, 'abstract': d.page_content} for d in docs])
+                display(
+                    docs, ['ref_id'] + sel_map[sel]["must_have_cols"], index='ref_id')
             except Exception as e:
                 st.write('Oops 😵 Something bad happened...')
                 raise e

callbacks/arxiv_callbacks.py CHANGED Viewed

@@ -90,4 +90,4 @@ class ChatDataSQLAskCallBackHandler(ChatDataSQLSearchCallBackHandler):
         self.progress_bar = st.progress(value=0.0, text='Writing SQL...')
         self.status_bar = st.empty()
         self.prog_value = 0
-        self.prog_interval = 0.1

         self.progress_bar = st.progress(value=0.0, text='Writing SQL...')
         self.status_bar = st.empty()
         self.prog_value = 0
+        self.prog_interval = 0.1

chains/arxiv_chains.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import re
 import inspect
 from typing import Dict, Any, Optional, List, Tuple
@@ -7,21 +7,62 @@ from langchain.callbacks.manager import (
     AsyncCallbackManagerForChainRun,
     CallbackManagerForChainRun,
 )
 from langchain.schema import BaseRetriever
 from langchain.callbacks.manager import Callbacks
 from langchain.schema.prompt_template import format_document
 from langchain.docstore.document import Document
 from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
-from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
 from langchain.chains.combine_documents.stuff import StuffDocumentsChain
 from langchain_experimental.sql.vector_sql import VectorSQLOutputParser
 class VectorSQLRetrieveCustomOutputParser(VectorSQLOutputParser):
     """Based on VectorSQLOutputParser
     It also modify the SQL to get all columns
     """
     @property
     def _type(self) -> str:
@@ -123,12 +164,15 @@ class ArXivQAwithSourcesChain(RetrievalQAWithSourcesChain):
         ref_cnt = 1
         for d in docs:
             ref_id = d.metadata['ref_id']
-            if f"PDF #{ref_id}" in answer:
                 title = d.metadata['title'].replace('\n', '')
                 d.metadata['ref_id'] = ref_cnt
-                answer = answer.replace(f"PDF #{ref_id}", f"{title} [{ref_cnt}]")
                 sources.append(d)
                 ref_cnt += 1
         result: Dict[str, Any] = {
             self.answer_key: answer,
@@ -147,4 +191,4 @@ class ArXivQAwithSourcesChain(RetrievalQAWithSourcesChain):
     @property
     def _chain_type(self) -> str:
-        return "arxiv_qa_with_sources_chain"

+import logging
 import inspect
 from typing import Dict, Any, Optional, List, Tuple
     AsyncCallbackManagerForChainRun,
     CallbackManagerForChainRun,
 )
+from langchain.embeddings.base import Embeddings
 from langchain.schema import BaseRetriever
 from langchain.callbacks.manager import Callbacks
 from langchain.schema.prompt_template import format_document
 from langchain.docstore.document import Document
 from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
+from langchain.vectorstores.myscale import MyScale, MyScaleSettings
 from langchain.chains.combine_documents.stuff import StuffDocumentsChain
 from langchain_experimental.sql.vector_sql import VectorSQLOutputParser
+logger = logging.getLogger()
class MyScaleWithoutMetadataJson(MyScale):
    """MyScale vector store whose document metadata comes from named columns.

    The search query selects the text column, the computed distance and every
    column listed in ``must_have_cols``; those columns are returned as the
    ``metadata`` dict of each resulting ``Document``.
    """

    def __init__(
        self,
        embedding: Embeddings,
        config: Optional[MyScaleSettings] = None,
        must_have_cols: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> None:
        """Initialise the store.

        Args:
            embedding: Embedding model forwarded to the parent constructor.
            config: Optional MyScale settings forwarded to the parent
                constructor.
            must_have_cols: Names of the table columns to select and expose
                as document metadata. ``None`` means no extra columns.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(embedding, config, **kwargs)
        # Use a fresh list when nothing is passed: a literal `[]` default
        # would be shared (and mutable) across every instance.
        self.must_have_cols: List[str] = (
            [] if must_have_cols is None else must_have_cols
        )

    def _build_qstr(
        self, q_emb: List[float], topk: int, where_str: Optional[str] = None
    ) -> str:
        """Build the vector-search SQL statement.

        Args:
            q_emb: Query embedding, rendered as a comma-separated array.
            topk: Value used for the ``LIMIT`` clause.
            where_str: Optional filter expression; emitted as a ``PREWHERE``
                clause when non-empty.

        Returns:
            The SQL query string.
        """
        q_emb_str = ",".join(map(str, q_emb))
        # NOTE(review): `where_str` is interpolated verbatim into the SQL;
        # callers must not pass untrusted text here.
        prewhere = f"PREWHERE {where_str}" if where_str else ""
        # Join every selected column in one pass so an empty
        # `must_have_cols` does not leave a dangling comma after `dist`.
        select_cols = ", ".join(
            [self.config.column_map["text"], "dist", *self.must_have_cols]
        )
        q_str = f"""
            SELECT {select_cols}
            FROM {self.config.database}.{self.config.table}
            {prewhere}
            ORDER BY distance({self.config.column_map['vector']}, [{q_emb_str}])
                AS dist {self.dist_order}
            LIMIT {topk}
            """
        return q_str

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        where_str: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return the ``k`` documents closest to ``embedding``.

        Args:
            embedding: Query embedding.
            k: Maximum number of documents to return.
            where_str: Optional filter expression passed to the query builder.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            One ``Document`` per result row, with the text column as
            ``page_content`` and the ``must_have_cols`` values as metadata.
            Returns an empty list if the query or row conversion raises.
        """
        q_str = self._build_qstr(embedding, k, where_str)
        try:
            return [
                Document(
                    page_content=row[self.config.column_map["text"]],
                    # `col`, not `k`, to avoid shadowing the `k` parameter.
                    metadata={col: row[col] for col in self.must_have_cols},
                )
                for row in self.client.query(q_str).named_results()
            ]
        except Exception as e:
            # Deliberate best-effort: log the failure and return no results
            # rather than propagating it to the caller.
            logger.error(f"\033[91m\033[1m{type(e)}\033[0m \033[95m{str(e)}\033[0m")
            return []
 class VectorSQLRetrieveCustomOutputParser(VectorSQLOutputParser):
     """Based on VectorSQLOutputParser
     It also modify the SQL to get all columns
     """
+    must_have_columns: List[str]
     @property
     def _type(self) -> str:
         ref_cnt = 1
         for d in docs:
             ref_id = d.metadata['ref_id']
+            if f"Doc #{ref_id}" in answer:
+                answer = answer.replace(f"Doc #{ref_id}", f"#{ref_id}")
+            if f"#{ref_id}" in answer:
                 title = d.metadata['title'].replace('\n', '')
                 d.metadata['ref_id'] = ref_cnt
+                answer = answer.replace(f"#{ref_id}", f"{title} [{ref_cnt}]")
                 sources.append(d)
                 ref_cnt += 1
         result: Dict[str, Any] = {
             self.answer_key: answer,
     @property
     def _chain_type(self) -> str:
+        return "arxiv_qa_with_sources_chain"

prompts/arxiv_prompt.py CHANGED Viewed

@@ -1,12 +1,12 @@
 combine_prompt_template = (
-            "You are a helpful PDF assistant. Your task is to provide information and answer any questions "
-            + "related to PDFs given below. You should use the sections, title and abstract of the selected PDFs as your source of information "
             + "and try to provide concise and accurate answers to any questions asked by the user. If you are unable to find "
             + "relevant information in the given sections, you will need to let the user know that the source does not contain "
             + "relevant information but still try to provide an answer based on your general knowledge. You must refer to the "
             + "corresponding section name and page that you refer to when answering. The following is the related information "
-            + "about the PDF file that will help you answer users' questions, you MUST answer it using question's language:\n\n {summaries}"
-            + "Now you should anwser user's question. Remember you must use the PDF # to refer papers:\n\n"
         )
 _myscale_prompt = """You are a MyScale expert. Given an input question, first create a syntactically correct MyScale query to run, then look at the results of the query and return the answer to the input question.

 combine_prompt_template = (
+            "You are a helpful document assistant. Your task is to provide information and answer any questions "
+            + "related to documents given below. You should use the sections, title and abstract of the selected documents as your source of information "
             + "and try to provide concise and accurate answers to any questions asked by the user. If you are unable to find "
             + "relevant information in the given sections, you will need to let the user know that the source does not contain "
             + "relevant information but still try to provide an answer based on your general knowledge. You must refer to the "
             + "corresponding section name and page that you refer to when answering. The following is the related information "
+            + "about the document that will help you answer users' questions, you MUST answer it using question's language:\n\n {summaries}"
+            + "Now you should anwser user's question. Remember you must use `Doc #` to refer papers:\n\n"
         )
 _myscale_prompt = """You are a MyScale expert. Given an input question, first create a syntactically correct MyScale query to run, then look at the results of the query and return the answer to the input question.