MohammedNasser committed on
Commit
cba3641
1 Parent(s): c1f9d40

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -25
app.py CHANGED
@@ -18,7 +18,8 @@ from pdf2image import convert_from_path
18
  from huggingface_hub import Repository, login
19
  from huggingface_hub import hf_hub_download
20
  from langchain.schema import Document
21
-
 
22
 
23
 
24
 
@@ -39,8 +40,35 @@ for folder in [UPLOAD_FOLDER, AUDIO_FOLDER]:
39
  if not os.path.exists(folder):
40
  os.makedirs(folder)
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  def load_pdf(file_path):
43
  """Load and preprocess Arabic text from a PDF file."""
 
44
  try:
45
  pages = convert_from_path(file_path, 500)
46
  except Exception as e:
@@ -51,6 +79,7 @@ def load_pdf(file_path):
51
  for pageNum, imgBlob in enumerate(pages):
52
  try:
53
  text = pytesseract.image_to_string(imgBlob, lang="ara")
 
54
  documents.append(text)
55
  except Exception as e:
56
  print(f"Error processing page {pageNum}: {e}")
@@ -159,19 +188,17 @@ p {
159
 
160
  def upload_pdf(pdf_file):
161
  global vectorstore, chain # Use global variables to store state
162
-
163
  data = load_pdf(pdf_file)
164
  vectorstore = prepare_vectorstore(data)
165
  chain = create_chain(vectorstore)
166
- return "تم تحميل الملف بنجاح !"
167
 
168
 
169
  def chat(user_input):
170
  global chain # Access the global chain variable
171
 
172
- if chain is None:
173
- return "Please upload a PDF file first.", None # Prompt user to upload a PDF
174
-
175
  prompt = f"""
176
  You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.
177
  When responding, ensure the following:
@@ -193,31 +220,51 @@ def chat(user_input):
193
  audio_file = f"{audio_id}.mp3"
194
  tts.save(audio_file)
195
 
 
196
  return assistant_response, audio_file
197
 
198
  with gr.Blocks(css=custom_css) as demo:
199
 
200
  pdf_input = gr.File(label="اختر ملف PDF")
201
-
202
- output_text = gr.Textbox(label=" ")
203
-
204
- submit_button_pdf = gr.Button("ارفع الملف")
205
-
206
- chat_input = gr.Textbox(label="أدخل سؤالك هنا")
207
-
208
- chat_output = gr.Textbox(label="الرد الآلي")
209
-
210
- audio_output = gr.Audio(label="استمع إلى الرد")
211
-
212
- submit_button_chat = gr.Button("إرسال")
213
-
214
-
215
- # Connect upload button to upload function
216
- submit_button_pdf.click(upload_pdf, inputs=[pdf_input], outputs=[output_text])
217
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
- # Connect chat button to chat function
220
- submit_button_chat.click(chat, inputs=[chat_input], outputs=[chat_output, audio_output])
221
 
222
  # Launch the Gradio app
223
  demo.launch(inbrowser=True)
 
18
  from huggingface_hub import Repository, login
19
  from huggingface_hub import hf_hub_download
20
  from langchain.schema import Document
21
+ from PyPDF2 import PdfReader # Make sure to install PyPDF2 for PDF handling
22
+ from langdetect import detect # Install langdetect to detect language
23
 
24
 
25
 
 
40
  if not os.path.exists(folder):
41
  os.makedirs(folder)
42
 
43
+
44
+
45
+ # Function to check if the file is a valid PDF in Arabic and less than 10MB
46
def validate_pdf(pdf):
    """Check that an uploaded file is an Arabic PDF no larger than 10 MB.

    Args:
        pdf: The value delivered by the upload widget: ``None`` when nothing
            is selected, a filesystem path (``str`` / ``os.PathLike``), or an
            object exposing the path through a ``name`` attribute.

    Returns:
        A ``(message, is_valid)`` tuple. ``message`` is the Arabic status
        text for the user; ``is_valid`` is ``True`` only when every check
        passed.
    """
    if pdf is None:
        return "لم يتم اختيار أي ملف", False

    # Accept plain paths as well as file-like wrappers that carry ``.name``.
    if isinstance(pdf, (str, os.PathLike)):
        path = os.fspath(pdf)
    else:
        path = pdf.name

    # Case-insensitive so that e.g. "REPORT.PDF" is not rejected.
    if not path.lower().endswith(".pdf"):
        return "الملف الذي اخترته ليس PDF", False

    try:
        if os.path.getsize(path) > 10 * 1024 * 1024:
            return "حجم الملف أكبر من 10 ميجا بايت", False

        reader = PdfReader(path)
        # extract_text() may yield nothing for image-only pages; treat that
        # as an empty string instead of failing on concatenation.
        text = "".join(page.extract_text() or "" for page in reader.pages)
    except Exception:
        # Missing, unreadable, encrypted or corrupt file: report it to the
        # user instead of crashing the UI callback.
        return "تعذر قراءة ملف PDF", False

    try:
        language = detect(text)
    except Exception:
        # Language detection fails when there is no usable text to analyse
        # (for instance a scanned PDF without a text layer).
        return "فشل في تحليل اللغة", False

    if language != "ar":
        return "الملف ليس باللغة العربية", False

    return "الملف صالح للدردشة", True
67
+
68
+
69
  def load_pdf(file_path):
70
  """Load and preprocess Arabic text from a PDF file."""
71
+
72
  try:
73
  pages = convert_from_path(file_path, 500)
74
  except Exception as e:
 
79
  for pageNum, imgBlob in enumerate(pages):
80
  try:
81
  text = pytesseract.image_to_string(imgBlob, lang="ara")
82
+
83
  documents.append(text)
84
  except Exception as e:
85
  print(f"Error processing page {pageNum}: {e}")
 
188
 
189
  def upload_pdf(pdf_file):
190
  global vectorstore, chain # Use global variables to store state
191
+
192
  data = load_pdf(pdf_file)
193
  vectorstore = prepare_vectorstore(data)
194
  chain = create_chain(vectorstore)
195
+ return "تم تحميل الملف بنجاح !", True
196
 
197
 
198
  def chat(user_input):
199
  global chain # Access the global chain variable
200
 
201
+
 
 
202
  prompt = f"""
203
  You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.
204
  When responding, ensure the following:
 
220
  audio_file = f"{audio_id}.mp3"
221
  tts.save(audio_file)
222
 
223
+
224
  return assistant_response, audio_file
225
 
226
  with gr.Blocks(css=custom_css) as demo:
227
 
228
  pdf_input = gr.File(label="اختر ملف PDF")
229
+ output_label = gr.HTML() # Replaced Textbox with HTML for label
230
+ submit_button_pdf = gr.Button("ارفع الملف", interactive=False)
231
+ chat_input = gr.Textbox(label="أدخل سؤالك هنا", interactive=False)
232
+ chat_output = gr.Textbox(label="الرد الآلي", interactive=False)
233
+ audio_output = gr.Audio(label="استمع إلى الرد", interactive=False)
234
+ submit_button_chat = gr.Button("إرسال", interactive=False)
235
+
236
+ def handle_file_upload(pdf):
237
+ output_label.update('')
238
+ message, is_valid = validate_pdf(pdf)
239
+ color = "red" if not is_valid else "green"
240
+ # Update HTML label instead of Textbox
241
+
242
+ if is_valid:
243
+ # Enable the upload button if the file is valid
244
+ submit_button_pdf.update(interactive=True)
245
+ output_label.update('')
246
+ else:
247
+ output_label.update(f'<span style="color:{color}">{message}</span>')
248
+
249
+ def process_pdf_and_enable_components(pdf):
250
+ # Process PDF and activate the other components
251
+ output_label.update('<span style="color:green">جاري معالجة الملف...</span>')
252
+ message, is_valid = upload_pdf(pdf)
253
+ output_label.update(f'<span style="color:green">{message}</span>')
254
+ if is_valid:
255
+ chat_input.update(interactive=True)
256
+ chat_output.update(interactive=True)
257
+ audio_output.update(interactive=True)
258
+ submit_button_chat.update(interactive=True)
259
+ # When the user uploads a file, validate it and then allow PDF upload
260
+ pdf_input.change(handle_file_upload, inputs=pdf_input, outputs=[output_label])
261
+
262
+ # When the user presses the upload button, process the PDF and enable other components
263
+ submit_button_pdf.click(process_pdf_and_enable_components, inputs=pdf_input, outputs=[output_label])
264
+
265
+ # Chat button connection
266
+ submit_button_chat.click(chat, inputs=chat_input, outputs=[chat_output, audio_output])
267
 
 
 
268
 
269
  # Launch the Gradio app
270
  demo.launch(inbrowser=True)