mrfakename committed
Commit 7c7f70a
1 Parent(s): 4d3835d

Create app.py

Files changed (1)
  1. app.py +74 -0
app.py ADDED
@@ -0,0 +1,74 @@
+ # Raw string so the \( ... \) delimiters in the text below are not parsed as escape sequences.
+ ABOUT = r"""
+ # TB-OCR Unofficial Demo
+
+ This is an unofficial demo of [yifeihu/TB-OCR-preview-0.1](https://huggingface.co/yifeihu/TB-OCR-preview-0.1).
+
+ Overview of TB-OCR:
+
+ > TB-OCR-preview (Text Block OCR), created by [Yifei Hu](https://x.com/hu_yifei), is an end-to-end OCR model handling text, math latex, and markdown formats all at once. The model takes a block of text as the input and returns clean markdown output. Headers are marked with `##`. Math expressions are guaranteed to be wrapped in brackets `\( inline math \) \[ display math \]` for easier parsing. This model does not require line-detection or math formula detection.
+
+ (From the [model card](https://huggingface.co/yifeihu/TB-OCR-preview-0.1))
+ """
+ # check out https://huggingface.co/microsoft/Phi-3.5-vision-instruct for more details
+
+ import torch
+ import gradio as gr
+ from transformers import AutoModelForCausalLM, AutoProcessor
+ from PIL import Image
+
+ model_id = "yifeihu/TB-OCR-preview-0.1"
+
+ # The model itself is placed on CUDA below; DEVICE is used for the input tensors.
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     device_map="cuda",
+     trust_remote_code=True,
+     torch_dtype="auto",
+     _attn_implementation="flash_attention_2",  # requires the flash-attn package and a CUDA GPU
+     load_in_4bit=True,  # optional: load the model in 4-bit mode to save memory
+ )
+
+ processor = AutoProcessor.from_pretrained(
+     model_id,
+     trust_remote_code=True,
+     num_crops=16,
+ )
+
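+ # Note: newer transformers releases deprecate the load_in_4bit shortcut in favor of an
+ # explicit quantization config. A minimal sketch, assuming bitsandbytes is installed:
+ #
+ #     from transformers import BitsAndBytesConfig
+ #
+ #     model = AutoModelForCausalLM.from_pretrained(
+ #         model_id,
+ #         device_map="cuda",
+ #         trust_remote_code=True,
+ #         torch_dtype="auto",
+ #         _attn_implementation="flash_attention_2",
+ #         quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+ #     )
+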
+ def phi_ocr(image_path):
+     question = "Convert the text to markdown format."
+     image = Image.open(image_path)
+     prompt_message = [{
+         'role': 'user',
+         'content': f'<|image_1|>\n{question}',
+     }]
+
+     prompt = processor.tokenizer.apply_chat_template(prompt_message, tokenize=False, add_generation_prompt=True)
+     inputs = processor(prompt, [image], return_tensors="pt").to(DEVICE)
+
+     generation_args = {
+         "max_new_tokens": 1024,
+         "temperature": 0.1,  # has no effect while do_sample=False (greedy decoding)
+         "do_sample": False
+     }
+
+     generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args)
+
+     # Drop the prompt tokens so only the newly generated text is decoded.
+     generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+     response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+
+     response = response.split("<image_end>")[0]  # remove the image_end token
+
+     return response
+
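+ # Quick local sanity check when running outside Gradio (the file path is hypothetical):
+ # print(phi_ocr("text_block.png"))
+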
+ with gr.Blocks() as demo:
+     gr.Markdown(ABOUT)
+     with gr.Row():
+         with gr.Column():
+             # "filepath" hands phi_ocr the uploaded file's path ("filename" is not a valid type)
+             img = gr.Image(label="Input image", type="filepath")
+             btn = gr.Button("OCR")
+         with gr.Column():
+             out = gr.Markdown()
+     btn.click(phi_ocr, inputs=img, outputs=out)
+
+ demo.launch()
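
The model card quoted in ABOUT promises that math is always wrapped in \( ... \) and \[ ... \] delimiters "for easier parsing". A minimal sketch of what that downstream parsing can look like, using only the standard library (the sample string is hypothetical, not real model output):

import re

# Hypothetical output in the format the model card describes.
sample = r"""## Results

The area is \( \pi r^2 \) and the identity \[ e^{i\pi} + 1 = 0 \] holds."""

inline_math = re.findall(r"\\\((.+?)\\\)", sample, flags=re.DOTALL)   # contents of \( ... \)
display_math = re.findall(r"\\\[(.+?)\\\]", sample, flags=re.DOTALL)  # contents of \[ ... \]

print(inline_math)   # [' \\pi r^2 ']
print(display_math)  # [' e^{i\\pi} + 1 = 0 ']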