Spaces:

MohammedNasser
/

Arabic-PDF-Chat

Running

File size: 8,810 Bytes

bf18e69
bee2b96
631b794
0c4a24a
bee2b96
 
 
 
7b54e65
bee2b96
 
 
 
 
bf18e69
4132a28
 
3759483
c2777d8
6ebe94a
cba3641
 
c2777d8
bf18e69
 
bee2b96
 
2cc1efc
3759483
2cc1efc
e70a2d0
345a26b
7b54e65
d187736
7b54e65
bf18e69
bee2b96
bf18e69
bee2b96
 
 
 
cba3641
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bee2b96
bf18e69
cba3641
d800d23
 
 
 
 
 
bee2b96
bf18e69
d800d23
 
cba3641
d800d23
 
 
 
c2777d8
d800d23
c2777d8
bee2b96
 
6ebe94a
 
1511464
6ebe94a
 
 
 
 
bee2b96
 
 
 
 
 
 
 
 
 
 
 
 
 
d800d23
bf18e69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d800d23
4fa6022
 
cba3641
f348de6
 
4fa6022
cba3641
f348de6
 
4fa6022
 
 
cba3641
4fa6022
 
f348de6
 
71e197c
f348de6
71e197c
f348de6
71e197c
f348de6
4fa6022
 
 
 
 
f348de6
4fa6022
 
 
 
f348de6
cba3641
4fa6022
 
 
 
 
cba3641
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4fa6022
f348de6
 
c1f9d40
f348de6
39c89fc
55a1f5b

import gradio as gr
import os
import subprocess
import uuid
import fitz
from dotenv import load_dotenv
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from gtts import gTTS
import sys
import pytesseract
from pdf2image import convert_from_path
from huggingface_hub import Repository, login
from huggingface_hub import hf_hub_download
from langchain.schema import Document
from PyPDF2 import PdfReader  # Make sure to install PyPDF2 for PDF handling
from langdetect import detect  # Install langdetect to detect language



# Load environment variables (a local .env file is read when one exists).
load_dotenv()
secret_key = os.getenv("GROQ_API_KEY")
hf_key = os.getenv("HF_TOKEN")

# Fail early with a readable message: storing None in os.environ raises an
# opaque TypeError that does not say which setting is missing.
if not secret_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; define it in the environment or in a .env file."
    )
os.environ["GROQ_API_KEY"] = secret_key

# Authenticate with the Hugging Face Hub only when a token is configured.
# NOTE(review): login() without a token falls back to an interactive prompt,
# which cannot be answered in a headless deployment - confirm that skipping
# the login is acceptable for the embedding model used below.
if hf_key:
    login(token=hf_key, add_to_git_credential=True)

# Multilingual sentence-embedding model shared by every vector store built here.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

# Ensure the necessary folders exist
UPLOAD_FOLDER = 'uploads/'
AUDIO_FOLDER = 'audio/'
for folder in (UPLOAD_FOLDER, AUDIO_FOLDER):
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(folder, exist_ok=True)



# Function to check if the file is a valid PDF in Arabic and less than 10MB
def validate_pdf(pdf):
    """Check that an upload is an Arabic-language PDF no larger than 10 MB.

    Args:
        pdf: The uploaded file as delivered by the file input. Either a path
            string or an object exposing the path as ``.name`` is accepted;
            ``None`` means no file was selected.

    Returns:
        A ``(message, is_valid)`` tuple: ``message`` is the Arabic status text
        for the user and ``is_valid`` is ``True`` only when every check passed.
    """
    if pdf is None:
        return "لم يتم اختيار أي ملف", False

    # Accept both a plain path and a file wrapper that carries it in .name.
    path = getattr(pdf, "name", pdf)

    # Compare case-insensitively so that "REPORT.PDF" is not rejected.
    if not str(path).lower().endswith(".pdf"):
        return "الملف الذي اخترته ليس PDF", False
    if os.path.getsize(path) > 10 * 1024 * 1024:
        return "حجم الملف أكبر من 10 ميجا بايت", False

    # Check if PDF content is Arabic
    try:
        reader = PdfReader(path)
        # Guard against a page yielding None instead of a string.
        text = "".join(page.extract_text() or "" for page in reader.pages)
    except Exception:
        # An unreadable file must produce a user message, not a traceback.
        return "تعذر قراءة ملف PDF", False

    try:
        if detect(text) != "ar":
            return "الملف ليس باللغة العربية", False
    except Exception:
        # Language detection failed (for example, no usable text was found).
        return "فشل في تحليل اللغة", False

    return "الملف صالح للدردشة", True


def load_pdf(file_path):
    """Rasterise a PDF and OCR every page as Arabic text.

    Args:
        file_path: Location of the PDF handed to ``convert_from_path``.

    Returns:
        A list with one string per page, in page order. A page whose OCR
        raised contributes ``""``. If the PDF cannot be converted to images
        at all, the error is printed and an empty list is returned.
    """
    try:
        # Second positional argument is the rendering resolution (500).
        page_images = convert_from_path(file_path, 500)
    except Exception as err:
        print(f"Error loading PDF: {err}")
        return []

    def _ocr_page(index, image):
        # Recognise one page; a failure is logged and yields an empty string
        # so the result keeps one entry per page.
        try:
            return pytesseract.image_to_string(image, lang="ara")
        except Exception as err:
            print(f"Error processing page {index}: {err}")
            return ""

    return [_ocr_page(index, image) for index, image in enumerate(page_images)]

def prepare_vectorstore(data):
    """Build a FAISS index from a sequence of page texts.

    Args:
        data: Iterable of strings; each one becomes a ``Document``.

    Returns:
        The FAISS vector store created from the split chunks using the
        module-level ``embeddings`` model.
    """
    # Newline-separated splitting, 1000-character chunks with 20 of overlap.
    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=20)

    page_documents = []
    for page_text in data:
        page_documents.append(Document(page_content=page_text))

    pieces = splitter.split_documents(page_documents)
    return FAISS.from_documents(pieces, embeddings)

def create_chain(vectorstore):
    """Assemble the conversational retrieval chain over ``vectorstore``.

    Args:
        vectorstore: Store whose ``as_retriever()`` supplies context documents.

    Returns:
        A ``ConversationalRetrievalChain`` using a Groq chat model, a
        buffer memory keyed on ``chat_history`` and the ``map_reduce``
        chain type.
    """
    chat_model = ChatGroq(model="gemma2-9b-it", temperature=0)
    doc_retriever = vectorstore.as_retriever()

    # output_key tells the memory which of the chain's outputs to record.
    history = ConversationBufferMemory(
        llm=chat_model,
        memory_key="chat_history",
        output_key="answer",
        return_messages=True,
    )

    chain_options = {
        "llm": chat_model,
        "retriever": doc_retriever,
        "memory": history,
        "verbose": False,
        "chain_type": "map_reduce",
    }
    return ConversationalRetrievalChain.from_llm(**chain_options)
    
# Stylesheet passed to gr.Blocks(css=...) when the interface is built.
# It sets the page font and gradient background, renders .gradio-container as
# a centered translucent card, and defines heading, paragraph, button and
# chat-message colors (with emoji markers for user and bot messages).
# NOTE(review): whether the rendered Gradio markup actually carries the
# .gradio-button / .chat-message classes cannot be told from this file - confirm.
custom_css = """
body {
    font-family: 'Noto Kufi Arabic', sans-serif;
    background: linear-gradient(135deg, #799351 0%, #A67B5B 100%);
    background-size: cover;
    background-position: center;
    background-attachment: fixed;
}

.gradio-container {
    max-width: 800px !important;
    margin: auto !important;
    background: rgba(255, 255, 255, 0.9);
    border-radius: 20px;
    box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
    backdrop-filter: blur(4px);
    border: 1px solid rgba(255, 255, 255, 0.18);
    padding: 20px;
}

h1, h2, h3 {
    color: #1A4D2E;
    font-weight: bold;
    text-align: center;
}

p {
    color: #A89F91;
}

.gradio-button {
    background-color: #5F6F65 !important;
    color: #FFFFFF !important;
}

.gradio-button:hover {
    background-color: #FFFFFF !important;
    color: #5F6F65 !important;
}

.chat-message {
    border-radius: 10px;
    padding: 10px;
    margin-bottom: 10px;
}

.chat-message.user {
    background-color: #E7F0DC;
}

.chat-message.bot {
    background-color: #F7EED3;
}

.chat-message::before {
    content: '';
    display: inline-block;
    width: 24px;
    height: 24px;
    background-size: contain;
    background-repeat: no-repeat;
    margin-right: 10px;
    vertical-align: middle;
}

.chat-message.user::before {
    content: '👤';
}

.chat-message.bot::before {
    content: '🤖';
}
"""

def upload_pdf(pdf_file):
    """OCR the uploaded PDF and build the retrieval chain used by ``chat``.

    Args:
        pdf_file: The upload from the file input, either a path string or an
            object exposing the path as ``.name``.

    Returns:
        A ``(message, success)`` tuple. ``success`` is ``False`` (and the
        global state is left untouched) when no text could be extracted.
    """
    global vectorstore, chain  # Use global variables to store state

    # load_pdf needs a filesystem path, not the upload wrapper object.
    file_path = getattr(pdf_file, "name", pdf_file)
    data = load_pdf(file_path)

    # load_pdf returns [] when conversion fails and "" for pages whose OCR
    # failed; with no text at all there is nothing to index, so report the
    # problem instead of failing later while building the vector store.
    if not any(page_text.strip() for page_text in data):
        return "تعذر استخراج النص من الملف", False

    vectorstore = prepare_vectorstore(data)
    chain = create_chain(vectorstore)
    return "تم تحميل الملف بنجاح !", True


def chat(user_input):
    """Answer a question about the uploaded PDF and synthesise Arabic speech.

    Args:
        user_input: The question typed by the user.

    Returns:
        A ``(answer_text, audio_path)`` tuple. ``audio_path`` is the MP3
        written for the answer, or ``None`` when no audio was produced (no
        document loaded yet, blank answer, or text-to-speech failure).
    """
    # The chain only exists after upload_pdf() has succeeded; look it up
    # defensively so an early call returns guidance instead of a NameError.
    active_chain = globals().get("chain")
    if active_chain is None:
        return "يرجى رفع ملف PDF أولاً", None

    prompt = f"""
        You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.
        When responding, ensure the following:
           - Your answer directly reflects the content of the document.
           - If the requested information is not available in the document, clearly state that in Arabic.
           - Keep your response concise yet comprehensive, addressing the question fully.
           - Always respond only in a professional and well-versed Arabic Language.\n
        Question: {user_input}\n
        """

    response = active_chain({"question": prompt})
    assistant_response = response["answer"]

    # There is nothing to read aloud for a blank answer.
    if not assistant_response or not assistant_response.strip():
        return assistant_response, None

    # Store the clip in the dedicated audio folder under a unique name so
    # responses never overwrite each other or clutter the working directory.
    os.makedirs(AUDIO_FOLDER, exist_ok=True)
    audio_file = os.path.join(AUDIO_FOLDER, f"{uuid.uuid4()}.mp3")

    # Speech is best-effort: a synthesis failure must not discard the answer.
    try:
        gTTS(text=assistant_response, lang='ar').save(audio_file)
    except Exception as e:
        print(f"Error generating audio: {e}")
        return assistant_response, None

    return assistant_response, audio_file

with gr.Blocks(css=custom_css) as demo:

    pdf_input = gr.File(label="اختر ملف PDF")
    output_label = gr.HTML()  # Replaced Textbox with HTML for label
    submit_button_pdf = gr.Button("ارفع الملف", interactive=False)
    chat_input = gr.Textbox(label="أدخل سؤالك هنا", interactive=False)
    chat_output = gr.Textbox(label="الرد الآلي", interactive=False)
    audio_output = gr.Audio(label="استمع إلى الرد", interactive=False)
    submit_button_chat = gr.Button("إرسال", interactive=False)

    # Component changes only take effect when the handler RETURNS them for
    # the components listed in `outputs`; calling `.update()` on a component
    # and discarding the result changes nothing on screen.

    def handle_file_upload(pdf):
        """Validate the selected file and gate the upload button on the result.

        Returns the HTML for the status label (empty when the file is valid,
        a red message otherwise) and an update for the upload button.
        """
        message, is_valid = validate_pdf(pdf)
        label_html = "" if is_valid else f'<span style="color:red">{message}</span>'
        return label_html, gr.update(interactive=is_valid)

    def show_processing_notice():
        """Return the "processing" notice shown while the PDF is handled."""
        return '<span style="color:green">جاري معالجة الملف...</span>'

    def process_pdf_and_enable_components(pdf):
        """Process the PDF and unlock the chat controls on success.

        Returns the status HTML plus updates for the question box and the
        send button. On failure those two are left exactly as they were.
        """
        message, is_valid = upload_pdf(pdf)
        color = "green" if is_valid else "red"
        # gr.update() with no arguments leaves a component unchanged.
        unlock = gr.update(interactive=True) if is_valid else gr.update()
        return f'<span style="color:{color}">{message}</span>', unlock, unlock

    # When the user picks a file, validate it and enable/disable the upload button
    pdf_input.change(
        handle_file_upload,
        inputs=pdf_input,
        outputs=[output_label, submit_button_pdf],
    )

    # When the user presses the upload button, show the notice first, then
    # process the PDF and enable the chat controls. The answer box and audio
    # player stay non-interactive: they only display results.
    submit_button_pdf.click(
        show_processing_notice,
        inputs=None,
        outputs=[output_label],
    ).then(
        process_pdf_and_enable_components,
        inputs=pdf_input,
        outputs=[output_label, chat_input, submit_button_chat],
    )

    # Chat button connection
    submit_button_chat.click(chat, inputs=chat_input, outputs=[chat_output, audio_output])


# Launch the Gradio app
demo.launch(inbrowser=True)