Spaces:

yutohub
/

nlp2024_papers

Sleeping

App Files Files Community

nlp2024_papers / app.py

yutohub

Create app.py

749fb56 verified 3 months ago

raw

history blame contribute delete

No virus

3.15 kB

	import datetime
	import json
	from typing import List, Tuple

	from langchain_community.retrievers import BM25Retriever
	from langchain_core.documents import Document
	import streamlit as st
	from sudachipy import dictionary, tokenizer


	# Lazily-created Sudachi dictionary shared by every call in this module.
	_SUDACHI_DICTIONARY = None


	def _get_sudachi_dictionary():
	    """Return the shared Sudachi "full" dictionary, loading it on first use."""
	    global _SUDACHI_DICTIONARY
	    if _SUDACHI_DICTIONARY is None:
	        _SUDACHI_DICTIONARY = dictionary.Dictionary(dict="full")
	    return _SUDACHI_DICTIONARY


	def generate_word_ngrams(
	    text: str, min_len: int, max_len: int, binary: bool = False
	) -> List[Tuple[str, ...]]:
	    """
	    Tokenize the input text into words and generate n-grams of specified lengths.

	    The text is split with Sudachi (split mode A) and the surface form of each
	    token is used as a word.

	    Args:
	        text (str): The input string.
	        min_len (int): The minimum length of the n-grams. Values below 1 are
	            treated as 1, because an n-gram of zero words carries no content.
	        max_len (int): The maximum length of the n-grams (inclusive).
	        binary (bool, optional): If True, remove duplicates while keeping the
	            order of first occurrence. Defaults to False.

	    Returns:
	        List[Tuple[str, ...]]: A list of n-grams as tuples of words, grouped by
	        n-gram length (shortest first) and, within a length, in text order.
	    """
	    # Reuse one dictionary instead of re-loading it on every call; this
	    # function runs once per indexed document and once per query.
	    # A new tokenizer is still created per call so that no tokenizer object
	    # is shared between callers (NOTE(review): sharing the tokenizer itself
	    # may also be safe - confirm against the SudachiPy docs before changing).
	    tokenizer_obj = _get_sudachi_dictionary().create()
	    mode = tokenizer.Tokenizer.SplitMode.A
	    words = [token.surface() for token in tokenizer_obj.tokenize(text, mode)]

	    ngrams: List[Tuple[str, ...]] = []

	    # Start at 1 at the lowest: n == 0 would only emit empty tuples.
	    for n in range(max(min_len, 1), max_len + 1):
	        ngrams.extend(tuple(words[k:k + n]) for k in range(len(words) - n + 1))

	    if binary:
	        # dict.fromkeys dedupes like set() but keeps a deterministic order.
	        ngrams = list(dict.fromkeys(ngrams))

	    return ngrams


	def preprocess_func(text: str) -> List[str]:
	    """Split *text* into its de-duplicated single-word tokens.

	    Requests 1-grams only (min and max length 1) with duplicates removed,
	    then flattens each one-word tuple into a plain string.
	    """
	    unique_unigrams = generate_word_ngrams(text, 1, 1, True)

	    tokens: List[str] = []
	    for gram in unique_unigrams:
	        tokens.append(' '.join(gram))
	    return tokens


	def load_docs_from_json(json_path: str) -> List[Document]:
	    """Load paper records from a JSON file as a list of ``Document`` objects.

	    The file must contain a JSON array of objects, each providing the keys
	    ``ptitle``, ``abstract``, ``session_id``, ``session_title``,
	    ``session_info``, ``pid``, ``pdf_link`` and ``pauthors``.

	    Args:
	        json_path: Path of the JSON file to read.

	    Returns:
	        One ``Document`` per paper. ``page_content`` holds the title and
	        abstract as text; the remaining fields are stored in ``metadata``.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If the file is not valid JSON.
	        KeyError: If a paper record lacks one of the required keys.
	    """
	    # Read as UTF-8 explicitly: without it open() falls back to the platform
	    # default encoding, which can fail or mis-decode non-ASCII text.
	    with open(json_path, encoding="utf-8") as f:
	        papers = json.load(f)

	    return [
	        Document(
	            page_content=f"Title: {paper['ptitle']}\n\nAbstract: {paper['abstract']}",
	            metadata={
	                'session_id': paper['session_id'],
	                'session_title': paper['session_title'],
	                'session_info': paper['session_info'],
	                'id': paper['pid'],
	                'title': paper['ptitle'],
	                'pdf_link': paper['pdf_link'],
	                'authors': paper['pauthors'],
	            },
	        )
	        for paper in papers
	    ]


	# init
	json_path = "nlp2024_papers.json"


	@st.cache_resource
	def _build_index(path):
	    """Load the papers at *path* and build the BM25 retriever over them.

	    Streamlit re-executes this script on every interaction; caching the
	    result keeps the corpus from being re-read and re-tokenized per query.
	    """
	    loaded_docs = load_docs_from_json(path)
	    bm25 = BM25Retriever.from_documents(loaded_docs, preprocess_func=preprocess_func)
	    bm25.k = 10  # number of papers returned per search
	    return loaded_docs, bm25


	docs, retriever = _build_index(json_path)

	# streamlit
	st.title("NLP2024 Papers Search")
	st.markdown("Search papers from [NLP2024](https://www.anlp.jp/nlp2024/).")
	st.markdown(f"Number of documents: `{len(docs)}`.")
	st.markdown("This app uses [BM25](https://en.wikipedia.org/wiki/Okapi_BM25), allowing you to search not only with keywords like \"machine learning\" \n but also with documents like \"How to generate synthetic data using LLM.\"")

	prompt = st.chat_input("Search anything...")

	if prompt:
	    results = retriever.invoke(prompt)

	    st.markdown(f"Top `{len(results)}` related papers")

	    for result in results:
	        # One collapsed panel per hit: metadata first, then title + abstract.
	        with st.expander(label=result.metadata['title'], expanded=False):
	            for key, value in result.metadata.items():
	                st.write(f"{key}: {value}")
	            st.divider()
	            st.markdown(result.page_content)