Spaces:

MohammedNasser
/

Arabic-PDF-Chat

Running

App Files Files Community

Arabic-PDF-Chat / app.py

MohammedNasser

Update app.py

1a197f9 verified about 1 hour ago

raw

history blame contribute delete

No virus

11.6 kB

	import gradio as gr
	import os
	import subprocess
	import uuid
	import fitz
	from dotenv import load_dotenv
	from langchain_community.document_loaders import UnstructuredPDFLoader
	from langchain_community.vectorstores import FAISS
	from langchain_huggingface import HuggingFaceEmbeddings
	from langchain_text_splitters import CharacterTextSplitter
	from langchain_groq import ChatGroq
	from langchain.memory import ConversationBufferMemory
	from langchain.chains import ConversationalRetrievalChain
	from gtts import gTTS
	import sys
	import pytesseract
	from pdf2image import convert_from_path
	from huggingface_hub import Repository, login
	from huggingface_hub import hf_hub_download
	from langchain.schema import Document
	from PyPDF2 import PdfReader # Make sure to install PyPDF2 for PDF handling
	from langdetect import detect # Install langdetect to detect language



	# Load environment variables and read the API credentials from them.
	load_dotenv()
	secret_key = os.getenv("GROQ_API_KEY")
	hf_key = os.getenv("HF_TOKEN")

	# os.environ only accepts str values, so a missing key would otherwise fail
	# on the assignment below with an opaque "str expected, not NoneType" error.
	if secret_key is None:
	    raise RuntimeError(
	        "GROQ_API_KEY is not set; define it in the environment or in a .env file."
	    )
	os.environ["GROQ_API_KEY"] = secret_key
	login(token=hf_key, add_to_git_credential=True)

	# Multilingual sentence-embedding model shared by every vector store built below.
	embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

	# Ensure the necessary folders exist. exist_ok=True makes this idempotent
	# and avoids the check-then-create race of os.path.exists() + os.makedirs().
	UPLOAD_FOLDER = 'uploads/'
	AUDIO_FOLDER = 'audio/'
	for folder in (UPLOAD_FOLDER, AUDIO_FOLDER):
	    os.makedirs(folder, exist_ok=True)





	def load_pdf(file_path):
	    """Run Arabic OCR over every page of a PDF and return the page texts.

	    The PDF is rendered to page images with ``convert_from_path`` (second
	    argument 500) and each image is passed to
	    ``pytesseract.image_to_string`` with ``lang="ara"``.

	    Args:
	        file_path: Path of the PDF to read.

	    Returns:
	        A list with one string per page. A page whose OCR raised contributes
	        an empty string; an empty list is returned when the PDF could not be
	        converted to images at all.
	    """
	    try:
	        page_images = convert_from_path(file_path, 500)
	    except Exception as e:
	        print(f"Error loading PDF: {e}")
	        return []

	    page_texts = []
	    for page_index, page_image in enumerate(page_images):
	        try:
	            ocr_text = pytesseract.image_to_string(page_image, lang="ara")
	        except Exception as e:
	            print(f"Error processing page {page_index}: {e}")
	            # Keep a placeholder so the list still has one entry per page.
	            ocr_text = ""
	        page_texts.append(ocr_text)

	    return page_texts

	def prepare_vectorstore(data):
	    """Build a FAISS vector store from a list of page texts.

	    Each string in ``data`` is wrapped in a ``Document``, the documents are
	    split on newlines into chunks (chunk_size=1000, chunk_overlap=20), and the
	    chunks are indexed with the module-level ``embeddings`` model.

	    Args:
	        data: Iterable of text strings.

	    Returns:
	        The FAISS vector store created from the chunks.
	    """
	    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=20)
	    pages = [Document(page_content=page_text) for page_text in data]
	    return FAISS.from_documents(splitter.split_documents(pages), embeddings)

	def create_chain(vectorstore):
	    """Create a conversational retrieval chain over ``vectorstore``.

	    The chain uses the Groq-hosted ``gemma2-9b-it`` model at temperature 0,
	    the store's default retriever, a fresh ``ConversationBufferMemory`` keyed
	    on ``chat_history`` / ``answer``, and the ``map_reduce`` chain type.

	    Args:
	        vectorstore: Vector store exposing ``as_retriever()``.

	    Returns:
	        The configured ``ConversationalRetrievalChain``.
	    """
	    groq_llm = ChatGroq(model="gemma2-9b-it", temperature=0)
	    conversation_memory = ConversationBufferMemory(
	        llm=groq_llm,
	        output_key="answer",
	        memory_key="chat_history",
	        return_messages=True,
	    )
	    return ConversationalRetrievalChain.from_llm(
	        llm=groq_llm,
	        retriever=vectorstore.as_retriever(),
	        memory=conversation_memory,
	        verbose=False,
	        chain_type="map_reduce",
	    )



	# Stylesheet passed to gr.Blocks(css=...): RTL layout, Arabic web fonts and
	# colours for the chat bubbles and buttons. The bot-avatar rule targets the
	# "arabic-chatbox" class (the chatbot's elem_classes), matching the user rule.
	custom_css = """
	@import url('https://fonts.googleapis.com/css2?family=Noto+Kufi+Arabic:wght@400;700&display=swap');
	@import url('https://fonts.googleapis.com/css2?family=Cairo:wght@400;700&display=swap');

	body {
	font-family: 'Noto Kufi Arabic', sans-serif;
	background: linear-gradient(135deg, #799351 0%, #A67B5B 100%);
	background-size: cover;
	background-position: center;
	background-attachment: fixed;
	}

	.gradio-container {
	direction: rtl;
	font-family: 'Noto Kufi Arabic', sans-serif;
	font-size: 16px;
	max-width: 800px !important;
	margin: auto !important;
	background: rgba(255, 255, 255, 0.9);
	border-radius: 20px;
	box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
	backdrop-filter: blur(4px);
	border: 1px solid rgba(255, 255, 255, 0.18);
	padding: 20px;
	}


	.gr-textbox input, .gr-textbox textarea {
	text-align: right !important; /* Align text to the right */
	direction: rtl !important; /* Set RTL text direction */
	font-family: 'Cairo', sans-serif !important;
	}



	.gr-file, .gr-audio {
	text-align: right !important; /* Align text to the right */
	direction: rtl !important; /* Set RTL text direction */
	}

	label {
	font-size: 14px !important;
	color: #000000 !important;
	background-color: #EEEEEE;
	}


	.arabic-chatbox .message.user {
	font-family: 'Cairo', sans-serif !important;
	background-color: #FFFBE6; /* Light gray background for user messages */
	border-radius: 10px;
	padding: 10px;
	margin-bottom: 10px;
	}

	.arabic-chatbox .message.bot {
	font-family: 'Cairo', sans-serif !important;
	background-color: #E7FBE6; /* Light cyan background for bot messages */
	border-radius: 10px;
	padding: 10px;
	margin-bottom: 10px;
	}

	.arabic-chatbox .message.user img,
	.arabic-chatbox .message.bot img {
	margin-right: 10px;
	}

	.custom-submit-button {
	background-color: #E68369 !important;
	border: none !important;
	border-radius: 5px !important;
	padding: 10px 20px !important;
	font-size: 16px !important;
	cursor: pointer !important;
	}

	.custom-submit-button:hover {
	background-color: white !important;
	color: #E6B9A6 !important;
	}

	#clear_btn {
	background-color: #698474;
	color: white;
	border: none;
	border-radius: 5px;
	padding: 10px 20px;
	font-size: 16px;
	cursor: pointer;
	}

	#clear_btn:hover {
	background-color: white;
	color: #698474;
	}

	"""

	def validate_pdf(pdf):
	    """Check that an uploaded file is an Arabic PDF no larger than 10 MB.

	    Args:
	        pdf: Uploaded file object whose ``name`` attribute is the path of the
	            file on disk, or ``None`` when nothing was selected.

	    Returns:
	        A ``(message, is_valid)`` tuple: an Arabic status message for the UI
	        and ``True`` only when every check passed.
	    """
	    if pdf is None:
	        return "لم يتم اختيار أي ملف", False
	    # Compare case-insensitively so files named e.g. "REPORT.PDF" are accepted.
	    if not pdf.name.lower().endswith(".pdf"):
	        return "الملف الذي اخترته ليس PDF", False
	    if os.path.getsize(pdf.name) > 10 * 1024 * 1024:
	        return "حجم الملف أكبر من 10 ميجا بايت", False

	    # Check if PDF content is Arabic. Reading the file is inside the try block
	    # as well, so an unreadable PDF is reported instead of raising into the UI.
	    try:
	        reader = PdfReader(pdf.name)
	        # "or ''" guards against a page yielding no text object; join avoids
	        # repeated string concatenation over many pages.
	        text = "".join(page.extract_text() or "" for page in reader.pages)
	        language = detect(text)
	    except Exception:
	        return "فشل في تحليل اللغة", False

	    if language != "ar":
	        return "الملف ليس باللغة العربية", False
	    return "الملف صالح للدردشة", True

	def upload_pdf(pdf_file):
	    """OCR the uploaded PDF, rebuild the shared vector store and reset the chat.

	    Args:
	        pdf_file: The uploaded PDF, forwarded unchanged to ``load_pdf``.

	    Returns:
	        A ``(message, success)`` tuple: an Arabic status message and whether a
	        vector store was built.
	    """
	    global vectorstore, chathistory
	    chathistory = []
	    data = load_pdf(pdf_file)

	    # load_pdf returns [] when conversion failed and "" for pages whose OCR
	    # failed. With no text at all there is nothing to index, so report the
	    # failure instead of attempting to build a store from nothing.
	    if not any(page_text.strip() for page_text in data):
	        return "تعذر استخراج نص من الملف", False

	    vectorstore = prepare_vectorstore(data)
	    return "تم تحميل الملف بنجاح !", True


	def chat(user_input):
	    """Answer a question about the uploaded PDF and read the answer aloud.

	    Args:
	        user_input: The question typed by the user.

	    Returns:
	        Three values, in the order of the ``[chat_label, chatbot,
	        audio_output]`` outputs this handler is wired to: an update for the
	        status label, the chat history as ``(user, bot)`` tuples, and the path
	        of the generated MP3 (``None`` when no audio could be produced).
	    """
	    global chathistory, vectorstore

	    if not user_input or not user_input.strip():  # Empty or whitespace-only input
	        # Only show the warning; no-op updates leave the existing conversation
	        # and audio player untouched instead of wiping them.
	        return gr.update(value='<span style="color:red;">الرجاء إدخال سؤال.</span>'), gr.update(), gr.update()

	    prompt = f"""
	You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.
	When responding, ensure the following:
	- Your answer directly reflects the content of the document.
	- If the requested information is not available in the document, clearly state that in Arabic.
	- Keep your response concise yet comprehensive, addressing the question fully.
	- Respond only in a professional and well-versed Arabic Language.
	Question: {user_input}.
	"""
	    # NOTE(review): a new chain (and a new memory object) is created for every
	    # question, so the chain's own memory starts empty on each call.
	    chain = create_chain(vectorstore)
	    response = chain.invoke({"question": prompt})
	    assistant_response = response["answer"]

	    chathistory.append({"user_content": f"👤 {user_input}", "bot_content": f"🤖 {assistant_response}"})

	    # Speech synthesis is best-effort: a failure here must not discard the
	    # text answer, so the audio output is simply left empty in that case.
	    audio_file = None
	    if assistant_response.strip():
	        try:
	            # A unique name per answer, stored in the folder created for audio.
	            candidate = os.path.join(AUDIO_FOLDER, f"{uuid.uuid4()}.mp3")
	            gTTS(text=assistant_response, lang='ar').save(candidate)
	            audio_file = candidate
	        except Exception as e:
	            print(f"Error generating audio: {e}")

	    history_display = [(msg["user_content"], msg["bot_content"]) for msg in chathistory]
	    return gr.update(value=''), history_display, audio_file

	# Page layout and event wiring for the Arabic PDF chat UI.
	with gr.Blocks(css=custom_css) as demo:
	    with gr.Row():
	        gr.Markdown("<h2 style='text-align: center; color: #000000;'>المساعد العربي ar-pdf-chat للدردشة </h2>", rtl=True)

	    with gr.Row():
	        pdf_input = gr.File(label="اختر ملف PDF")
	    with gr.Row():
	        output_label = gr.HTML(value='')  # Status line for validation/upload messages
	    with gr.Row():
	        submit_button_pdf = gr.Button("ارفع الملف", interactive=False)
	    with gr.Row():
	        with gr.Column(scale=3):
	            chatbot = gr.Chatbot(label="الشات", elem_classes="arabic-chatbox", height=400, rtl=True, layout='bubble', bubble_full_width=False)
	    with gr.Row():
	        chat_label = gr.HTML(value='')
	    with gr.Row():
	        chat_input = gr.Textbox(label="💬", rtl=True, visible=False, placeholder="أدخل سؤالك هنا ..", elem_id="inputbox", lines=2)
	    with gr.Row():
	        audio_output = gr.Audio(label="🔊", visible=False)

	    with gr.Row():
	        submit_button_chat = gr.Button("إرسال", interactive=True, visible=False, elem_classes="custom-submit-button", variant='primary')
	    with gr.Row():
	        # elem_id lets the "#clear_btn" rules in custom_css apply to this button.
	        clear_btn = gr.Button("مسح", interactive=True, visible=False, variant='secondary', elem_id="clear_btn")

	    def handle_file_upload(pdf):
	        """Validate the selected file and enable the upload button if it passes.

	        Returns one update per output wired below: the status label, the
	        upload button, then five "hide" updates for the chat widgets.
	        """
	        message, is_valid = validate_pdf(pdf)
	        label_html = '' if is_valid else f'<span style="color:red">{message}</span>'
	        hide_chat_widgets = [gr.update(visible=False) for _ in range(5)]
	        return (gr.update(value=label_html), gr.update(interactive=is_valid), *hide_chat_widgets)

	    def show_processing_notice():
	        """Return the "processing" message for the status label.

	        Assigning to ``output_label.value`` inside a handler does not update
	        the page; the message has to be returned as an output.
	        """
	        return gr.update(value='<span style="color:blue">جاري معالجة الملف...</span>')

	    def process_pdf_and_enable_components(pdf):
	        """Index the PDF and reveal the chat widgets when that succeeded.

	        Returns one update per output wired below: the status label, the chat
	        submit button, the upload button, then the chatbot, question box,
	        audio player and clear button.
	        """
	        message, is_valid = upload_pdf(pdf)
	        color = "green" if is_valid else "red"
	        return (
	            gr.update(value=f'<span style="color:{color}">{message}</span>'),
	            gr.update(visible=is_valid),
	            gr.update(interactive=False),
	            gr.update(visible=is_valid),
	            gr.update(visible=is_valid),
	            gr.update(visible=is_valid),
	            gr.update(visible=is_valid),
	        )

	    def clear_chat():
	        """Reset the question box and the audio player."""
	        return "", None

	    # When the user selects a file, validate it and then allow PDF upload
	    pdf_input.change(handle_file_upload, inputs=pdf_input, outputs=[output_label, submit_button_pdf, submit_button_chat, chatbot, chat_input, audio_output, clear_btn])

	    # When the user presses the upload button, first show the "processing"
	    # notice, then process the PDF and enable the other components.
	    submit_button_pdf.click(show_processing_notice, outputs=output_label).then(
	        process_pdf_and_enable_components,
	        inputs=pdf_input,
	        outputs=[output_label, submit_button_chat, submit_button_pdf, chatbot, chat_input, audio_output, clear_btn],
	    )
	    clear_btn.click(clear_chat, outputs=[chat_input, audio_output])
	    # Chat button connection
	    submit_button_chat.click(chat, inputs=chat_input, outputs=[chat_label, chatbot, audio_output])


	# Launch the Gradio app when the module is executed; inbrowser=True asks
	# Gradio to open the interface in a browser tab.
	demo.launch(inbrowser=True)