# nlp2024_papers / app.py
# Streamlit app for full-text search over NLP2024 conference papers (author: yutohub).
import datetime
import json
from functools import lru_cache
from typing import List, Tuple

import streamlit as st
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document
from sudachipy import dictionary, tokenizer
@lru_cache(maxsize=1)
def _get_sudachi_tokenizer():
    """Build the SudachiPy tokenizer once; loading the full dictionary is expensive."""
    return dictionary.Dictionary(dict="full").create()


def generate_word_ngrams(
    text: str, min_len: int, max_len: int, binary: bool = False
) -> List[Tuple[str, ...]]:
    """
    Tokenize the input text into words and generate n-grams of specified lengths.

    Args:
        text (str): The input string.
        min_len (int): The minimum length of the n-grams.
        max_len (int): The maximum length of the n-grams.
        binary (bool, optional): If True, remove duplicates. Defaults to False.

    Returns:
        List[Tuple[str, ...]]: A list of n-grams as tuples of words.
    """
    # Split with SplitMode.A (shortest units) using the cached tokenizer.
    mode = tokenizer.Tokenizer.SplitMode.A
    tokens = _get_sudachi_tokenizer().tokenize(text, mode)
    words = [token.surface() for token in tokens]

    ngrams: List[Tuple[str, ...]] = []
    for n in range(min_len, max_len + 1):
        ngrams.extend(tuple(words[k:k + n]) for k in range(len(words) - n + 1))

    if binary:
        # dict.fromkeys removes duplicates while keeping first-seen order,
        # so the output is deterministic (set iteration order is not).
        ngrams = list(dict.fromkeys(ngrams))

    return ngrams
def preprocess_func(text: str) -> List[str]:
    """Convert *text* into a deduplicated list of unigram strings for BM25."""
    unigrams = generate_word_ngrams(text, 1, 1, True)
    return [' '.join(gram) for gram in unigrams]
def load_docs_from_json(json_path):
    """Load NLP2024 paper records from a JSON file into langchain Documents.

    Args:
        json_path: Path to a JSON file holding a list of paper dicts with keys
            session_id, session_title, session_info, pid, ptitle, pdf_link,
            abstract, pauthors.

    Returns:
        list[Document]: One Document per paper. The searchable page_content
        combines the title and abstract; the remaining fields are kept as
        metadata for display in the UI.
    """
    # Explicit encoding: the corpus is Japanese and the platform default
    # codec may not be UTF-8.
    with open(json_path, encoding="utf-8") as f:
        papers = json.load(f)

    return [
        Document(
            page_content=f"Title: {paper['ptitle']}\n\nAbstract: {paper['abstract']}",
            metadata={
                'session_id': paper['session_id'],
                'session_title': paper['session_title'],
                'session_info': paper['session_info'],
                'id': paper['pid'],
                'title': paper['ptitle'],
                'pdf_link': paper['pdf_link'],
                'authors': paper['pauthors'],
            },
        )
        for paper in papers
    ]
# --- Initialization: build the BM25 index over the paper corpus ---
json_path = "nlp2024_papers.json"
docs = load_docs_from_json(json_path)
retriever = BM25Retriever.from_documents(docs, preprocess_func=preprocess_func)
retriever.k = 10  # number of papers returned per query

# --- Streamlit UI ---
st.title("NLP2024 Papers Search")
st.markdown("Search papers from [NLP2024](https://www.anlp.jp/nlp2024/).")
st.markdown(f"Number of documents: `{len(docs)}`.")
st.markdown("This app uses [BM25](https://en.wikipedia.org/wiki/Okapi_BM25), allowing you to search not only with keywords like \"machine learning\" \n but also with documents like \"How to generate synthetic data using LLM.\"")

prompt = st.chat_input("Search anything...")
if prompt:
    results = retriever.invoke(prompt)
    st.markdown(f"Top `{len(results)}` related papers")
    for result in results:
        # One collapsible panel per hit: metadata fields first, then the
        # indexed title+abstract text.
        with st.expander(label=result.metadata['title'], expanded=False):
            for key, value in result.metadata.items():
                st.write(f"{key}: {value}")
            st.divider()
            st.markdown(result.page_content)