Spaces:
Sleeping
Sleeping
import datetime | |
import json | |
from typing import List, Tuple | |
from langchain_community.retrievers import BM25Retriever | |
from langchain_core.documents import Document | |
import streamlit as st | |
from sudachipy import dictionary, tokenizer | |
# Sudachi dictionary shared by every call. It is loaded lazily on first use
# instead of once per call, because this function runs for every indexed
# document and for every query.
_SUDACHI_DICTIONARY = None


def _get_sudachi_dictionary():
    """Return the shared Sudachi "full" dictionary, loading it on first use."""
    global _SUDACHI_DICTIONARY
    if _SUDACHI_DICTIONARY is None:
        _SUDACHI_DICTIONARY = dictionary.Dictionary(dict="full")
    return _SUDACHI_DICTIONARY


def generate_word_ngrams(
    text: str, min_len: int, max_len: int, binary: bool = False
) -> List[Tuple[str, ...]]:
    """
    Tokenize the input text into words and generate n-grams of specified lengths.

    The text is split with Sudachi (split mode A) and the surface form of each
    token is used as a word.

    Args:
        text (str): The input string.
        min_len (int): The minimum length of the n-grams. Values below 1 are
            treated as 1, since shorter n-grams would only be empty tuples.
        max_len (int): The maximum length of the n-grams (inclusive).
        binary (bool, optional): If True, remove duplicates while keeping the
            order of first occurrence. Defaults to False.

    Returns:
        List[Tuple[str, ...]]: A list of n-grams as tuples of words, grouped by
        n-gram length in ascending order. Empty when the text has fewer words
        than the shortest requested length or when ``max_len < min_len``.
    """
    # Only the dictionary is cached; a tokenizer is still created per call so
    # that no tokenizer instance is shared between callers.
    tokenizer_obj = _get_sudachi_dictionary().create()
    mode = tokenizer.Tokenizer.SplitMode.A
    words = [token.surface() for token in tokenizer_obj.tokenize(text, mode)]

    ngrams: List[Tuple[str, ...]] = []
    for n in range(max(min_len, 1), max_len + 1):
        # range() is empty when the text has fewer than n words.
        ngrams.extend(
            tuple(words[start:start + n]) for start in range(len(words) - n + 1)
        )

    if binary:
        # dict.fromkeys de-duplicates with a deterministic order, unlike set().
        ngrams = list(dict.fromkeys(ngrams))
    return ngrams
def preprocess_func(text: str) -> List[str]:
    """
    Turn text into the list of terms handed to the BM25 retriever.

    Args:
        text (str): The input string.

    Returns:
        List[str]: The de-duplicated word unigrams of the text, each one
        rendered as a plain string.
    """
    # Length-1 n-grams with binary=True: every distinct word appears once.
    unigrams = generate_word_ngrams(text, 1, 1, True)
    return list(map(' '.join, unigrams))
def load_docs_from_json(json_path):
    """
    Load paper records from a JSON file and wrap each one in a Document.

    The file must contain an iterable of mappings, each providing the keys
    'ptitle', 'abstract', 'session_id', 'session_title', 'session_info',
    'pid', 'pdf_link' and 'pauthors'.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        list: One Document per paper. The page content combines title and
        abstract; the remaining fields are stored as metadata.

    Raises:
        KeyError: If a paper record lacks one of the required keys.
    """
    # Decode explicitly as UTF-8 so non-ASCII text loads the same way
    # regardless of the platform's default locale encoding.
    with open(json_path, encoding="utf-8") as f:
        papers = json.load(f)

    return [
        Document(
            page_content=f"Title: {paper['ptitle']}\n\nAbstract: {paper['abstract']}",
            metadata={
                'session_id': paper['session_id'],
                'session_title': paper['session_title'],
                'session_info': paper['session_info'],
                'id': paper['pid'],
                'title': paper['ptitle'],
                'pdf_link': paper['pdf_link'],
                'authors': paper['pauthors'],
            },
        )
        for paper in papers
    ]
# init
json_path = "nlp2024_papers.json"


@st.cache_resource
def _build_search_index(path):
    """
    Load the papers at *path* and build the BM25 retriever over them.

    Streamlit re-executes this script on every user interaction; caching the
    result avoids re-reading the JSON file and re-tokenizing every document
    for each query. The cached objects are reused until the app restarts or
    the cache is cleared, so changes to the file are not picked up before then.

    Returns:
        tuple: ``(docs, retriever)`` - the loaded documents and a BM25
        retriever configured to return the top 10 matches.
    """
    loaded_docs = load_docs_from_json(path)
    bm25 = BM25Retriever.from_documents(loaded_docs, preprocess_func=preprocess_func)
    bm25.k = 10
    return loaded_docs, bm25


docs, retriever = _build_search_index(json_path)

# streamlit
st.title("NLP2024 Papers Search")
st.markdown("Search papers from [NLP2024](https://www.anlp.jp/nlp2024/).")
st.markdown(f"Number of documents: `{len(docs)}`.")
st.markdown("This app uses [BM25](https://en.wikipedia.org/wiki/Okapi_BM25), allowing you to search not only with keywords like \"machine learning\" \n but also with documents like \"How to generate synthetic data using LLM.\"")

prompt = st.chat_input("Search anything...")

if prompt:
    results = retriever.invoke(prompt)

    st.markdown(f"Top `{len(results)}` related papers")

    for result in results:
        # One collapsed expander per hit: metadata first, then title + abstract.
        with st.expander(label=result.metadata['title'], expanded=False):
            for k in result.metadata:
                st.write(f"{k}: {result.metadata[k]}")
            st.divider()
            st.markdown(result.page_content)