QuanSun committed on
Commit d5d3d11
Parent: 6f98381

add Emu2 README

Files changed (1): README.md (+255 −0)
---
language:
- en
---

[🤗 HF Demo](https://huggingface.co/spaces/BAAI/Emu2) | [Demo](https://emu.ssi.plus) | [Project Page](https://baaivision.github.io/emu2/)

## Model Weights

| Model name    | Weight                                              |
| ------------- | --------------------------------------------------- |
| **Emu2**      | [🤗 HF link](https://huggingface.co/BAAI/Emu2)      |
| **Emu2-Chat** | [🤗 HF link](https://huggingface.co/BAAI/Emu2-Chat) |
| **Emu2-Gen**  | [🤗 HF link](https://huggingface.co/BAAI/Emu2-Gen)  |

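The Multi GPU example below loads the checkpoint from a local directory. If you want to fetch any of the weights above ahead of time, `huggingface_hub` can download a full snapshot; a minimal sketch (the `local_dir` value is illustrative, not a required path):

```python
# Sketch: pre-download a checkpoint listed above for local loading.
# `local_dir` is an illustrative target directory, not a fixed convention.
from huggingface_hub import snapshot_download

local_path = snapshot_download(repo_id="BAAI/Emu2", local_dir="./Emu2")
print(local_path)  # directory that now contains the model files
```
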
## Inference (Hugging Face Version)

#### Single GPU

```python
from PIL import Image
import requests
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2")

model = AutoModelForCausalLM.from_pretrained(
    "BAAI/Emu2",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).to('cuda').eval()

# `[<IMG_PLH>]` is the image placeholder, which will be replaced by image embeddings.
# The number of `[<IMG_PLH>]` placeholders should equal the number of input images.

query = '[<IMG_PLH>]Describe the image in details:'
image = Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/blue_black_1_top_left.jpg?raw=true', stream=True).raw).convert('RGB')

inputs = model.build_input_ids(
    text=[query],
    tokenizer=tokenizer,
    image=[image]
)

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        image=inputs["image"].to(torch.bfloat16),
        max_new_tokens=64,
        length_penalty=-1)

output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
```
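
`batch_decode` returns one decoded string per sequence in the batch, so for this single-query example the generated description is `output_text[0]`:

```python
# The batch contains a single query, so the generated description
# is the first (and only) element of output_text.
print(output_text[0])
```
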

Interleaved image and text

```python
from PIL import Image
import requests
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2")

model = AutoModelForCausalLM.from_pretrained(
    "BAAI/Emu2",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).to('cuda').eval()

# `[<IMG_PLH>]` is the image placeholder, which will be replaced by image embeddings.
# The number of `[<IMG_PLH>]` placeholders should equal the number of input images.

query = "[<IMG_PLH>][red, white, 3, bottom left].[<IMG_PLH>][yellow, white, 2, top left].[<IMG_PLH>][green, black, 4, bottom right][<IMG_PLH>]"

images = [
    Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/red_white_3_bottom_left.jpg?raw=true', stream=True).raw).convert('RGB'),
    Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/yellow_white_2_top_right.jpg?raw=true', stream=True).raw).convert('RGB'),
    Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/green_black_4_bottom_right.jpg?raw=true', stream=True).raw).convert('RGB'),
    Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/blue_black_1_top_left.jpg?raw=true', stream=True).raw).convert('RGB'),
]

inputs = model.build_input_ids(
    text=[query],
    tokenizer=tokenizer,
    image=images
)

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        image=inputs["image"].to(torch.bfloat16),
        max_new_tokens=64,
        length_penalty=-1)

output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
```
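
As the comments in the snippets note, the number of `[<IMG_PLH>]` placeholders in the query has to match the number of images passed to `build_input_ids`. An optional sanity check (not part of the model's API) can catch a mismatch before generation:

```python
# Optional sanity check: one `[<IMG_PLH>]` placeholder per input image.
num_placeholders = query.count("[<IMG_PLH>]")
assert num_placeholders == len(images), (
    f"query contains {num_placeholders} placeholders but {len(images)} images were provided"
)
```
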

#### Multi GPU

```python
from PIL import Image
import requests
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch

tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2")

with init_empty_weights():
    model = AutoModelForCausalLM.from_pretrained(
        "BAAI/Emu2",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True)

device_map = infer_auto_device_map(model, max_memory={0: '38GiB', 1: '38GiB'}, no_split_module_classes=['Block', 'LlamaDecoderLayer'])
# Input and output logits should be on the same device.
device_map["model.decoder.lm.lm_head"] = 0

model = load_checkpoint_and_dispatch(
    model,
    'local/path/to/hf/version/Emu2/model',
    device_map=device_map).eval()

# `[<IMG_PLH>]` is the image placeholder, which will be replaced by image embeddings.
# The number of `[<IMG_PLH>]` placeholders should equal the number of input images.

query = '[<IMG_PLH>]Describe the image in details:'
image = Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/blue_black_1_top_left.jpg?raw=true', stream=True).raw).convert('RGB')

inputs = model.build_input_ids(
    text=[query],
    tokenizer=tokenizer,
    image=[image]
)

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        image=inputs["image"].to(torch.bfloat16),
        max_new_tokens=64,
        length_penalty=-1)

output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
```
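
The device map computed by `infer_auto_device_map` is a plain dictionary mapping module names to devices, so it can be inspected before dispatching, for example to confirm that the `lm_head` really ends up on GPU 0. A minimal sketch:

```python
# Optional: print where each module was placed before calling
# load_checkpoint_and_dispatch. Values are a GPU index, "cpu", or "disk".
for module_name, device in device_map.items():
    print(f"{module_name} -> {device}")
```
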

Interleaved image and text

```python
from PIL import Image
import requests
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch

tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2")

with init_empty_weights():
    model = AutoModelForCausalLM.from_pretrained(
        "BAAI/Emu2",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True)

device_map = infer_auto_device_map(model, max_memory={0: '38GiB', 1: '38GiB'}, no_split_module_classes=['Block', 'LlamaDecoderLayer'])
# Input and output logits should be on the same device.
device_map["model.decoder.lm.lm_head"] = 0

model = load_checkpoint_and_dispatch(
    model,
    'local/path/to/hf/version/Emu2/model',
    device_map=device_map).eval()

# `[<IMG_PLH>]` is the image placeholder, which will be replaced by image embeddings.
# The number of `[<IMG_PLH>]` placeholders should equal the number of input images.

query = "[<IMG_PLH>][red, white, 3, bottom left].[<IMG_PLH>][yellow, white, 2, top left].[<IMG_PLH>][green, black, 4, bottom right][<IMG_PLH>]"

images = [
    Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/red_white_3_bottom_left.jpg?raw=true', stream=True).raw).convert('RGB'),
    Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/yellow_white_2_top_right.jpg?raw=true', stream=True).raw).convert('RGB'),
    Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/green_black_4_bottom_right.jpg?raw=true', stream=True).raw).convert('RGB'),
    Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/blue_black_1_top_left.jpg?raw=true', stream=True).raw).convert('RGB'),
]

inputs = model.build_input_ids(
    text=[query],
    tokenizer=tokenizer,
    image=images
)

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        image=inputs["image"].to(torch.bfloat16),
        max_new_tokens=64,
        length_penalty=-1)

output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
```

#### Quantization

See the quantization guidance in the [transformers documentation](https://huggingface.co/docs/transformers/v4.28.0/main_classes/quantization).

```python
from PIL import Image
import requests
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2")

model = AutoModelForCausalLM.from_pretrained(
    "BAAI/Emu2",
    load_in_4bit=True,
    trust_remote_code=True,
    bnb_4bit_compute_dtype=torch.float16).eval()

query = '[<IMG_PLH>]Describe the image in details:'
image = Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/blue_black_1_top_left.jpg?raw=true', stream=True).raw).convert('RGB')

inputs = model.build_input_ids(
    text=[query],
    tokenizer=tokenizer,
    image=[image]
)

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        image=inputs["image"].to(torch.float16),  # should be torch.float16 for the 4-bit model
        max_new_tokens=64,
        length_penalty=-1)

output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
```
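
The snippet above passes the bitsandbytes options straight to `from_pretrained`; on more recent transformers releases the same 4-bit setup is typically expressed through a `BitsAndBytesConfig`. A minimal sketch, assuming `bitsandbytes` is installed:

```python
# Sketch: equivalent 4-bit load expressed with BitsAndBytesConfig
# (assumes a recent transformers release with bitsandbytes installed).
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    "BAAI/Emu2",
    quantization_config=quant_config,
    trust_remote_code=True,
).eval()
```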