Spaces:

MohammedNasser
/

Arabic-PDF-Chat

Running

App Files Files Community

MohammedNasser committed 3 days ago

Commit

cba3641

•

1 Parent(s): c1f9d40

Update app.py

Browse files

Files changed (1) hide show

app.py +72 -25

app.py CHANGED Viewed

@@ -18,7 +18,8 @@ from pdf2image import convert_from_path
 from huggingface_hub import Repository, login
 from huggingface_hub import hf_hub_download
 from langchain.schema import Document
@@ -39,8 +40,35 @@ for folder in [UPLOAD_FOLDER, AUDIO_FOLDER]:
     if not os.path.exists(folder):
         os.makedirs(folder)
 def load_pdf(file_path):
     """Load and preprocess Arabic text from a PDF file."""
     try:
         pages = convert_from_path(file_path, 500)
     except Exception as e:
@@ -51,6 +79,7 @@ def load_pdf(file_path):
     for pageNum, imgBlob in enumerate(pages):
         try:
             text = pytesseract.image_to_string(imgBlob, lang="ara")
             documents.append(text)
         except Exception as e:
             print(f"Error processing page {pageNum}: {e}")
@@ -159,19 +188,17 @@ p {
 def upload_pdf(pdf_file):
     global vectorstore, chain  # Use global variables to store state
     data = load_pdf(pdf_file)
     vectorstore = prepare_vectorstore(data)
     chain = create_chain(vectorstore)
-    return "تم تحميل الملف بنجاح !"
 def chat(user_input):
     global chain  # Access the global chain variable
-    if chain is None:
-        return "Please upload a PDF file first.", None  # Prompt user to upload a PDF
     prompt = f"""
         You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.
         When responding, ensure the following:
@@ -193,31 +220,51 @@ def chat(user_input):
     audio_file = f"{audio_id}.mp3"
     tts.save(audio_file)
     return assistant_response, audio_file
 with gr.Blocks(css=custom_css) as demo:
     pdf_input = gr.File(label="اختر ملف PDF")
-    output_text = gr.Textbox(label=" ")
-    submit_button_pdf = gr.Button("ارفع الملف")
-    chat_input = gr.Textbox(label="أدخل سؤالك هنا")
-    chat_output = gr.Textbox(label="الرد الآلي")
-    audio_output = gr.Audio(label="استمع إلى الرد")
-    submit_button_chat = gr.Button("إرسال")
-    # Connect upload button to upload function
-    submit_button_pdf.click(upload_pdf, inputs=[pdf_input], outputs=[output_text])
-    # Connect chat button to chat function
-    submit_button_chat.click(chat, inputs=[chat_input], outputs=[chat_output, audio_output])
 # Launch the Gradio app
 demo.launch(inbrowser=True)

 from huggingface_hub import Repository, login
 from huggingface_hub import hf_hub_download
 from langchain.schema import Document
+from PyPDF2 import PdfReader  # Make sure to install PyPDF2 for PDF handling
+from langdetect import detect  # Install langdetect to detect language
     if not os.path.exists(folder):
         os.makedirs(folder)
# Function to check if the file is a valid PDF in Arabic and less than 10MB
def validate_pdf(pdf):
    """Check that an uploaded file is a readable Arabic PDF of at most 10 MB.

    Args:
        pdf: The uploaded file value. May be ``None`` (nothing selected),
            a path string, or an object whose ``name`` attribute holds the
            path of the file on disk.

    Returns:
        A ``(message, is_valid)`` tuple. ``message`` is the Arabic status
        text for the user and ``is_valid`` is ``True`` only when every
        check passed.
    """
    if pdf is None:
        return "لم يتم اختيار أي ملف", False

    # Accept both file-like objects exposing ``.name`` and plain path strings.
    path = getattr(pdf, "name", pdf)

    # Compare case-insensitively so that e.g. "REPORT.PDF" is not rejected.
    if not str(path).lower().endswith(".pdf"):
        return "الملف الذي اخترته ليس PDF", False
    if os.path.getsize(path) > 10 * 1024 * 1024:
        return "حجم الملف أكبر من 10 ميجا بايت", False

    # Check if PDF content is Arabic
    try:
        reader = PdfReader(path)
        # ``or ""`` guards against pages that yield no text; join avoids
        # repeated string concatenation across many pages.
        text = "".join(page.extract_text() or "" for page in reader.pages)
        language = detect(text)
    except Exception as e:
        # An unreadable PDF or text with no detectable language both mean
        # the language could not be determined; report it rather than crash
        # the UI callback.
        print(f"Error validating PDF {path}: {e}")
        return "فشل في تحليل اللغة", False

    if language != "ar":
        return "الملف ليس باللغة العربية", False
    return "الملف صالح للدردشة", True
 def load_pdf(file_path):
     """Load and preprocess Arabic text from a PDF file."""
     try:
         pages = convert_from_path(file_path, 500)
     except Exception as e:
     for pageNum, imgBlob in enumerate(pages):
         try:
             text = pytesseract.image_to_string(imgBlob, lang="ara")
             documents.append(text)
         except Exception as e:
             print(f"Error processing page {pageNum}: {e}")
 def upload_pdf(pdf_file):
     """Load an uploaded PDF and rebuild the global vector store and chain.

     Args:
         pdf_file: The uploaded file value: either a path string or an
             object whose ``name`` attribute holds the path on disk.

     Returns:
         A ``(message, success)`` tuple: the Arabic confirmation text and
         ``True``.
     """
     global vectorstore, chain  # Use global variables to store state
     # Resolve the on-disk path the same way validate_pdf does (via ``.name``),
     # falling back to the value itself when it is already a plain path.
     file_path = getattr(pdf_file, "name", pdf_file)
     data = load_pdf(file_path)
     vectorstore = prepare_vectorstore(data)
     chain = create_chain(vectorstore)
     return "تم تحميل الملف بنجاح !", True
 def chat(user_input):
     global chain  # Access the global chain variable
     prompt = f"""
         You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.
         When responding, ensure the following:
     audio_file = f"{audio_id}.mp3"
     tts.save(audio_file)
     return assistant_response, audio_file
 # Build the UI. Chat widgets start disabled and are enabled only after a
 # PDF has been validated and processed.
 with gr.Blocks(css=custom_css) as demo:
     pdf_input = gr.File(label="اختر ملف PDF")
     output_label = gr.HTML()  # Replaced Textbox with HTML for label
     submit_button_pdf = gr.Button("ارفع الملف", interactive=False)
     chat_input = gr.Textbox(label="أدخل سؤالك هنا", interactive=False)
     chat_output = gr.Textbox(label="الرد الآلي", interactive=False)
     audio_output = gr.Audio(label="استمع إلى الرد", interactive=False)
     submit_button_chat = gr.Button("إرسال", interactive=False)

     def handle_file_upload(pdf):
         """Validate the selected file and toggle the upload button.

         Returns updates for ``(output_label, submit_button_pdf)``. Event
         handlers must *return* their updates; calling ``.update()`` on a
         component inside the handler has no effect on the page.
         """
         message, is_valid = validate_pdf(pdf)
         if is_valid:
             # Valid file: clear any previous error and enable the upload button.
             return gr.update(value=""), gr.update(interactive=True)
         # Invalid file: show the reason in red and keep the button disabled.
         return (
             gr.update(value=f'<span style="color:red">{message}</span>'),
             gr.update(interactive=False),
         )

     def show_processing_message():
         """Return the 'processing' notice shown before the PDF is handled."""
         return gr.update(value='<span style="color:green">جاري معالجة الملف...</span>')

     def process_pdf_and_enable_components(pdf):
         """Process the PDF and enable the chat widgets on success.

         Returns updates for ``(output_label, chat_input, chat_output,
         audio_output, submit_button_chat)``.
         """
         try:
             message, is_valid = upload_pdf(pdf)
         except Exception as e:
             # Surface a readable message instead of a raw callback error.
             print(f"Error processing PDF: {e}")
             message, is_valid = "حدث خطأ أثناء معالجة الملف", False
         color = "green" if is_valid else "red"
         label_update = gr.update(value=f'<span style="color:{color}">{message}</span>')
         # One independent update object per component being enabled.
         widget_updates = [gr.update(interactive=bool(is_valid)) for _ in range(4)]
         return (label_update, *widget_updates)

     # When the user selects a file, validate it and then allow PDF upload
     pdf_input.change(
         handle_file_upload,
         inputs=pdf_input,
         outputs=[output_label, submit_button_pdf],
     )
     # When the user presses the upload button, show the progress notice first,
     # then process the PDF and enable the other components
     submit_button_pdf.click(
         show_processing_message,
         inputs=None,
         outputs=[output_label],
     ).then(
         process_pdf_and_enable_components,
         inputs=pdf_input,
         outputs=[output_label, chat_input, chat_output, audio_output, submit_button_chat],
     )
     # Chat button connection
     submit_button_chat.click(chat, inputs=chat_input, outputs=[chat_output, audio_output])
 # Launch the Gradio app
 demo.launch(inbrowser=True)