pierreguillou committed
Commit 0b0cce3
Parent: e3661c2

Update app.py

Files changed (1)
  1. app.py +18 -13
app.py CHANGED
@@ -85,14 +85,14 @@ def app_outputs(uploaded_pdf):
     if not msg.startswith("Error with the PDF"):
 
         # Extraction of image data (text and bounding boxes)
-        dataset, lines, row_indexes, par_boxes, line_boxes = extraction_data_from_image(images)
+        dataset, texts_lines, texts_pars, texts_lines_par, row_indexes, par_boxes, line_boxes, lines_par_boxes = extraction_data_from_image(images)
         # prepare our data in the format of the model
-        encoded_dataset = dataset.map(prepare_inference_features, batched=True, batch_size=64, remove_columns=dataset.column_names)
+        encoded_dataset = dataset.map(prepare_inference_features_paragraph, batched=True, batch_size=64, remove_columns=dataset.column_names)
         custom_encoded_dataset = CustomDataset(encoded_dataset, tokenizer)
         # Get predictions (token level)
         outputs, images_ids_list, chunk_ids, input_ids, bboxes = predictions_token_level(images, custom_encoded_dataset)
-        # Get predictions (line level)
-        probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes)
+        # Get predictions (paragraph level)
+        probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = predictions_paragraph_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes)
         # Get labeled images with lines bounding boxes
         images = get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict)
 
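The mapped function in the hunk above turns each page's OCR words and boxes into fixed-size chunks for the model. A minimal sketch of what such a feature-preparation step looks like with the Hugging Face LayoutXLM processor (the app's actual prepare_inference_features_paragraph is defined elsewhere in app.py and works on precomputed Tesseract boxes, so the names and details below are illustrative assumptions, not the app's code):

    # Illustrative sketch of 512-token chunking with LayoutXLMProcessor;
    # the app's own prepare_inference_features_paragraph differs in detail.
    from transformers import (LayoutLMv2FeatureExtractor, LayoutXLMProcessor,
                              LayoutXLMTokenizerFast)

    feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)  # boxes come from the OCR step
    tokenizer = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base")
    processor = LayoutXLMProcessor(feature_extractor, tokenizer)

    def prepare_features(batch):
        # Encode words together with their bounding boxes; pages longer than
        # 512 tokens overflow into extra chunks instead of being truncated away.
        return processor(
            images=batch["image"],
            text=batch["words"],
            boxes=batch["bboxes"],
            truncation=True,
            padding="max_length",
            max_length=512,
            stride=128,
            return_overflowing_tokens=True,
        )

    # encoded_dataset = dataset.map(prepare_features, batched=True, batch_size=64,
    #                               remove_columns=dataset.column_names)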
@@ -101,6 +101,7 @@ def app_outputs(uploaded_pdf):
     for i in range(num_images):
         if filename != "files/blank.png": img_file = f"img_{i}_" + filename.replace(".pdf", ".png")
         else: img_file = filename.replace(".pdf", ".png")
+        img_file = img_file.replace("/", "_")
         images[i].save(img_file)
         img_files.append(img_file)
 
@@ -118,6 +119,7 @@ def app_outputs(uploaded_pdf):
     csv_files = list()
     for i in range(max_imgboxes):
         csv_file = f"csv_{i}_" + filename.replace(".pdf", ".csv")
+        csv_file = csv_file.replace("/", "_")
         csv_files.append(gr.File.update(value=csv_file, visible=True))
         df[i].to_csv(csv_file, encoding="utf-8", index=False)
 
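Both replace("/", "_") lines added in the two hunks above serve the same purpose: filenames derived from the upload (or from the default "files/blank.png") can contain a path separator, and without the substitution save() and to_csv() would target a files/ directory that may not exist in the Space's working directory. For example:

    # Effect of the added sanitization on the default blank-page filename:
    filename = "files/blank.png"
    img_file = filename.replace(".pdf", ".png")  # still "files/blank.png" (no .pdf suffix)
    img_file = img_file.replace("/", "_")        # "files_blank.png": a flat, safe filename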
@@ -132,15 +134,17 @@ def app_outputs(uploaded_pdf):
 
     return msg, img_files[0], img_files[1], images[0], images[1], csv_files[0], csv_files[1], df[0], df[1]
 
-# gradio APP
+# Gradio APP
 with gr.Blocks(title="Inference APP for Document Understanding at paragraph level (v2 - LayoutXLM base)", css=".gradio-container") as demo:
     gr.HTML("""
-    <div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>[ WARNING: this APP hs to be updated ] Inference APP for Document Understanding at paragraph level (v2 - LayoutXLM base)</h1></div>
-    <div style="margin-top: 40px"><p>(03/31/2023) This Inference APP uses the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-linelevel-ml384" target="_blank">model Layout XLM base combined with XLM-RoBERTa base and finetuned on the dataset DocLayNet base at line level</a> (chunk size of 384 tokens).</p></div>
+    <div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>Inference APP for Document Understanding at paragraph level (v2 - LayoutXLM base)</h1></div>
+    <div style="margin-top: 40px"><p>(03/31/2023) This Inference APP uses the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512" target="_blank">model LayoutXLM base combined with XLM-RoBERTa base and finetuned on the dataset DocLayNet base at paragraph level</a> (chunk size of 512 tokens).</p></div>
     <div><p><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://arxiv.org/abs/2104.08836" target="_blank">LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding</a> is a Document Understanding model that uses both layout and text in order to detect labels of bounding boxes. Combined with the model <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/xlm-roberta-base" target="_blank">XLM-RoBERTa base</a>, this finetuned model has the capacity to <b>understand any language</b>. Finetuned on the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a>, it can <b>classify any bounding box (and its OCR text) into 11 labels</b> (Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header, Table, Text, Title).</p></div>
-    <div><p>It relies on an external OCR engine to get words and bounding boxes from the document image. Thus, let's run in this APP an OCR engine (<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/madmaze/pytesseract#python-tesseract" target="_blank">PyTesseract</a>) to get the bounding boxes, then run Layout XLM base (already fine-tuned on the dataset DocLayNet base at line level) on the individual tokens and then, visualize the result at line level!</p></div>
-    <div><p><b>It allows to get all pages of any PDF (of any language) with bounding boxes labeled at line level and the associated dataframes with labeled data (bounding boxes, texts, labels) :-)</b></p></div>
-    <div><p>However, the inference time per page can be high when running the model on CPU due to the number of line predictions to be made. Therefore, to avoid running this APP for too long, <b>only the first 2 pages are processed by this APP</b>. If you want to increase this limit, you can either clone this APP in Hugging Face Space (or run its <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/Gradio_inference_on_LayoutXLM_base_model_finetuned_on_DocLayNet_base_in_any_language_at_levellines_ml384.ipynb" target="_blank">notebook</a> on your own plateform) and change the value of the parameter <code>max_imgboxes</code>, or run the inference notebook "<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/inference_on_LayoutXLM_base_model_finetuned_on_DocLayNet_base_in_any_language_at_levellines_ml384.ipynb" target="_blank">Document AI | Inference at line level with a Document Understanding model (LayoutXLM base fine-tuned on DocLayNet dataset)</a>" on your own platform as it does not have this limit.</p></div><div style="margin-top: 20px"><p>Links to Document Understanding APPs:</p><ul><li>Line level: <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v1" target="_blank">v1 (LiLT base)</a> | <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v2" target="_blank">v2 (LayoutXLM base)</a> | <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-linelevel-LiLT-base-LayoutXLM-base-v1" target="_blank">v1 (LilT base vs LayoutXLM base)</a></li><li>Paragraph level: <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v1" target="_blank">v1 (LiLT base)</a></li></ul></div><div style="margin-top: 20px"><p>More information about the DocLayNet datasets, the finetuning of the model and this APP in the following blog posts:</p><ul><li>(03/05/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-and-fine-tuning-notebook-for-document-understanding-at-line-level-with-b08fdca5f4dc" target="_blank">Document AI | Inference APP and fine-tuning notebook for Document Understanding at line level with LayoutXLM base</a></li><li>(02/14/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-for-document-understanding-at-line-level-a35bbfa98893" target="_blank">Document AI | Inference APP for Document Understanding at line level</a></li><li>(02/10/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-document-understanding-model-at-line-level-with-lilt-tesseract-and-doclaynet-dataset-347107a643b8" target="_blank">Document AI | Document Understanding model at line level with LiLT, Tesseract and DocLayNet dataset</a></li><li>(01/31/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-doclaynet-image-viewer-app-3ac54c19956" target="_blank">Document AI | DocLayNet image viewer APP</a></li><li>(01/27/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li></ul></div>
+    <div><p>It relies on an external OCR engine to get words and bounding boxes from the document image. Thus, this APP runs an OCR engine (<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/madmaze/pytesseract#python-tesseract" target="_blank">PyTesseract</a>) to get the bounding boxes, then runs LayoutXLM base (already fine-tuned on the dataset DocLayNet base at paragraph level) on the individual tokens, and then visualizes the result at paragraph level!</p></div>
+    <div><p><b>It allows you to get all pages of any PDF (in any language) with bounding boxes labeled at paragraph level, and the associated dataframes with labeled data (bounding boxes, texts, labels) :-)</b></p></div>
+    <div><p>However, the inference time per page can be high when running the model on CPU due to the number of paragraph predictions to be made. Therefore, to avoid running this APP for too long, <b>only the first 2 pages are processed by this APP</b>. If you want to increase this limit, you can either clone this APP in Hugging Face Space (or run its <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/Gradio_inference_on_LayoutXLM_base_model_finetuned_on_DocLayNet_base_in_any_language_at_levelparagraphs_ml512.ipynb" target="_blank">notebook</a> on your own platform) and change the value of the parameter <code>max_imgboxes</code>, or run the inference notebook "<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/inference_on_LayoutXLM_base_model_finetuned_on_DocLayNet_base_in_any_language_at_levelparagraphs_ml512.ipynb" target="_blank">Document AI | Inference at paragraph level with a Document Understanding model (LayoutXLM base fine-tuned on DocLayNet dataset)</a>" on your own platform, as it does not have this limit.</p></div>
+    <div style="margin-top: 20px"><p>More information about the DocLayNet datasets, the finetuning of the model and this APP can be found in the following blog posts:</p>
+    <ul><li>(03/31/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-and-fine-tuning-notebook-for-document-understanding-at-paragraph-level-3507af80573d" target="_blank">Document AI | Inference APP and fine-tuning notebook for Document Understanding at paragraph level with LayoutXLM base</a></li><li>(03/25/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-app-to-compare-the-document-understanding-lilt-and-layoutxlm-base-models-at-line-1c53eb481a15" target="_blank">Document AI | APP to compare the Document Understanding LiLT and LayoutXLM (base) models at line level</a></li><li>(03/05/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-and-fine-tuning-notebook-for-document-understanding-at-line-level-with-b08fdca5f4dc" target="_blank">Document AI | Inference APP and fine-tuning notebook for Document Understanding at line level with LayoutXLM base</a></li><li>(02/14/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-for-document-understanding-at-line-level-a35bbfa98893" target="_blank">Document AI | Inference APP for Document Understanding at line level</a></li><li>(02/10/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-document-understanding-model-at-line-level-with-lilt-tesseract-and-doclaynet-dataset-347107a643b8" target="_blank">Document AI | Document Understanding model at line level with LiLT, Tesseract and DocLayNet dataset</a></li><li>(01/31/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-doclaynet-image-viewer-app-3ac54c19956" target="_blank">Document AI | DocLayNet image viewer APP</a></li><li>(01/27/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li></ul></div>
     """)
     with gr.Row():
         pdf_file = gr.File(label="PDF")
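The OCR step described in the hunk above (PyTesseract producing the words and bounding boxes that LayoutXLM then labels) reduces to a single call. A minimal sketch, assuming one page already rendered to an image; the app's extraction_data_from_image additionally groups the OCR words into lines and paragraphs:

    # Minimal PyTesseract word/bounding-box extraction; illustrative only.
    import pytesseract
    from pytesseract import Output
    from PIL import Image

    image = Image.open("page_image.png")  # one PDF page rendered as an image
    ocr = pytesseract.image_to_data(image, output_type=Output.DICT)

    words, boxes = [], []
    for i, word in enumerate(ocr["text"]):
        if word.strip():  # Tesseract emits empty cells for layout-only nodes
            x, y, w, h = ocr["left"][i], ocr["top"][i], ocr["width"][i], ocr["height"][i]
            words.append(word)
            boxes.append([x, y, x + w, y + h])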
@@ -162,7 +166,7 @@ with gr.Blocks(title="Inference APP for Document Understanding at paragraph leve
     with gr.Row():
         csvboxes = []
         for num_page in range(max_imgboxes):
-            csv = gr.File(visible=True, label=f"CSV file at line level (page {num_page})")
+            csv = gr.File(visible=True, label=f"CSV file at paragraph level (page {num_page})")
             csvboxes.append(csv)
     with gr.Row():
         dfboxes = []
@@ -180,11 +184,12 @@ with gr.Blocks(title="Inference APP for Document Understanding at paragraph leve
 
     outputboxes = [output_msg] + fileboxes + imgboxes + csvboxes + dfboxes
     submit_btn.click(app_outputs, inputs=[pdf_file], outputs=outputboxes)
+    # https://github.com/gradio-app/gradio/pull/2044/files#diff-a91dd2749f68bb7d0099a0f4079a4fd2d10281e299e7b451cb1bb876a7c21975R91
     reset_btn.click(
         lambda: [pdf_file.update(value=None), output_msg.update(value=None)] + [filebox.update(value=None) for filebox in fileboxes] + [imgbox.update(value=None) for imgbox in imgboxes] + [csvbox.update(value=None) for csvbox in csvboxes] + [dfbox.update(value=None) for dfbox in dfboxes],
         inputs=[],
-        outputs=[pdf_file, output_msg] + fileboxes + imgboxes + csvboxes + dfboxes,
-    )
+        outputs=[pdf_file, output_msg] + fileboxes + imgboxes + csvboxes + dfboxes
+    )
 
     gr.Examples(
         [["files/example.pdf"]],
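The reset handler in the hunk above follows the pattern from the Gradio pull request linked in the added comment: the callback returns one update(value=None) per output component, clearing them all in a single click. A self-contained sketch of the same pattern (Gradio 3.x API, as used by this app; component names here are placeholders):

    # Reset pattern sketch (Gradio 3.x): one update per output component.
    import gradio as gr

    with gr.Blocks() as demo:
        pdf_in = gr.File(label="PDF")
        msg_out = gr.Textbox(label="message")
        reset_btn = gr.Button("Reset")
        # The lambda returns a list of updates, matched positionally to outputs.
        reset_btn.click(lambda: [pdf_in.update(value=None), msg_out.update(value=None)],
                        inputs=[],
                        outputs=[pdf_in, msg_out])

    # demo.launch()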
 