harshad317 committed on
Commit
aaf4d33
1 Parent(s): ff69fc9

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,202 +1,859 @@
1
  ---
2
- base_model: google/paligemma-3b-pt-448
3
- library_name: peft
4
  ---
 
5
 
6
- # Model Card for Model ID
7
 
8
- <!-- Provide a quick summary of what the model is/does. -->
9
 
 
10
 
 
 
 
11
 
12
- ## Model Details
13
 
14
- ### Model Description
15
 
16
- <!-- Provide a longer summary of what this model is. -->
17
 
 
18
 
 
19
 
20
- - **Developed by:** [More Information Needed]
21
- - **Funded by [optional]:** [More Information Needed]
22
- - **Shared by [optional]:** [More Information Needed]
23
- - **Model type:** [More Information Needed]
24
- - **Language(s) (NLP):** [More Information Needed]
25
- - **License:** [More Information Needed]
26
- - **Finetuned from model [optional]:** [More Information Needed]
27
 
28
- ### Model Sources [optional]
29
 
30
- <!-- Provide the basic links for the model. -->
31
 
32
- - **Repository:** [More Information Needed]
33
- - **Paper [optional]:** [More Information Needed]
34
- - **Demo [optional]:** [More Information Needed]
35
 
36
- ## Uses
37
 
38
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
 
40
- ### Direct Use
41
 
42
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
 
44
- [More Information Needed]
45
 
46
- ### Downstream Use [optional]
47
 
48
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
 
50
- [More Information Needed]
 
51
 
52
- ### Out-of-Scope Use
53
 
54
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
 
56
- [More Information Needed]
57
 
58
- ## Bias, Risks, and Limitations
59
 
60
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
 
62
- [More Information Needed]
 
63
 
64
- ### Recommendations
65
 
66
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
 
 
67
 
68
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
 
70
- ## How to Get Started with the Model
71
 
72
- Use the code below to get started with the model.
73
 
74
- [More Information Needed]
75
 
76
- ## Training Details
 
 
77
 
78
- ### Training Data
 
79
 
80
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
-
82
- [More Information Needed]
83
-
84
- ### Training Procedure
85
-
86
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
-
88
- #### Preprocessing [optional]
89
-
90
- [More Information Needed]
91
-
92
-
93
- #### Training Hyperparameters
94
-
95
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
-
97
- #### Speeds, Sizes, Times [optional]
98
-
99
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
-
101
- [More Information Needed]
102
-
103
- ## Evaluation
104
-
105
- <!-- This section describes the evaluation protocols and provides the results. -->
106
-
107
- ### Testing Data, Factors & Metrics
108
-
109
- #### Testing Data
110
-
111
- <!-- This should link to a Dataset Card if possible. -->
112
-
113
- [More Information Needed]
114
-
115
- #### Factors
116
-
117
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
-
119
- [More Information Needed]
120
-
121
- #### Metrics
122
-
123
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
-
125
- [More Information Needed]
126
-
127
- ### Results
128
-
129
- [More Information Needed]
130
-
131
- #### Summary
132
-
133
-
134
-
135
- ## Model Examination [optional]
136
-
137
- <!-- Relevant interpretability work for the model goes here -->
138
-
139
- [More Information Needed]
140
-
141
- ## Environmental Impact
142
-
143
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
-
145
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
-
147
- - **Hardware Type:** [More Information Needed]
148
- - **Hours used:** [More Information Needed]
149
- - **Cloud Provider:** [More Information Needed]
150
- - **Compute Region:** [More Information Needed]
151
- - **Carbon Emitted:** [More Information Needed]
152
-
153
- ## Technical Specifications [optional]
154
-
155
- ### Model Architecture and Objective
156
-
157
- [More Information Needed]
158
-
159
- ### Compute Infrastructure
160
-
161
- [More Information Needed]
162
-
163
- #### Hardware
164
-
165
- [More Information Needed]
166
-
167
- #### Software
168
-
169
- [More Information Needed]
170
-
171
- ## Citation [optional]
172
-
173
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
-
175
- **BibTeX:**
176
-
177
- [More Information Needed]
178
-
179
- **APA:**
180
-
181
- [More Information Needed]
182
-
183
- ## Glossary [optional]
184
-
185
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
-
187
- [More Information Needed]
188
-
189
- ## More Information [optional]
190
-
191
- [More Information Needed]
192
-
193
- ## Model Card Authors [optional]
194
-
195
- [More Information Needed]
196
-
197
- ## Model Card Contact
198
-
199
- [More Information Needed]
200
- ### Framework versions
201
-
202
- - PEFT 0.12.0
1
  ---
2
+ library_name: transformers
3
+ license: gemma
4
+ pipeline_tag: image-text-to-text
5
+ extra_gated_heading: Access PaliGemma on Hugging Face
6
+ extra_gated_prompt: To access PaliGemma on Hugging Face, you’re required to review
7
+ and agree to Google’s usage license. To do this, please ensure you’re logged-in
8
+ to Hugging Face and click below. Requests are processed immediately.
9
+ extra_gated_button_content: Acknowledge license
10
  ---
11
+ # PaliGemma model card
12
 
13
+ **Model page:** [PaliGemma](https://ai.google.dev/gemma/docs/paligemma)
14
 
15
+ Transformers PaliGemma 3B weights, pre-trained with 448×448 input images and 512-token input/output text sequences. The models are available in float32, bfloat16 and float16 formats for fine-tuning.
16
 
17
+ **Resources and technical documentation:**
18
 
19
+ * [Responsible Generative AI Toolkit](https://ai.google.dev/responsible)
20
+ * [PaliGemma on Kaggle](https://www.kaggle.com/models/google/paligemma)
21
+ * [PaliGemma on Vertex Model Garden](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/363)
22
 
23
+ **Terms of Use:** [Terms](https://www.kaggle.com/models/google/paligemma/license/consent/verify/huggingface?returnModelRepoId=google/paligemma-3b-pt-448)
24
 
25
+ **Authors:** Google
26
 
27
+ ## Model information
28
 
29
+ ### Model summary
30
 
31
+ #### Description
32
 
33
+ PaliGemma is a versatile and lightweight vision-language model (VLM) inspired by
34
+ [PaLI-3](https://arxiv.org/abs/2310.09199) and based on open components such as
35
+ the [SigLIP vision model](https://arxiv.org/abs/2303.15343) and the [Gemma
36
+ language model](https://arxiv.org/abs/2403.08295). It takes both image and text
37
+ as input and generates text as output, supporting multiple languages. It is designed for class-leading fine-tune performance on a wide range of vision-language tasks such as image and short video captioning, visual question answering, text reading, object detection and object segmentation.
 
 
38
 
39
+ #### Model architecture
40
 
41
+ PaliGemma is the composition of a [Transformer
42
+ decoder](https://arxiv.org/abs/1706.03762) and a [Vision Transformer image
43
+ encoder](https://arxiv.org/abs/2010.11929), with a total of 3 billion
44
+ params. The text decoder is initialized from
45
+ [Gemma-2B](https://www.kaggle.com/models/google/gemma). The image encoder is
46
+ initialized from
47
+ [SigLIP-So400m/14](https://colab.research.google.com/github/google-research/big_vision/blob/main/big_vision/configs/proj/image_text/SigLIP_demo.ipynb).
48
+ PaliGemma is trained following the PaLI-3 recipes.
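+
+ As a quick way to see this composition, the snippet below (a minimal sketch, assuming you are logged in and have accepted the license so the gated config can be downloaded) loads the checkpoint configuration and prints the vision-encoder and text-decoder sub-configs. The expected values in the comments come from the `config.json` added in this commit.
+
+ ```python
+ from transformers import AutoConfig
+
+ config = AutoConfig.from_pretrained("google/paligemma-3b-pt-448")
+
+ # The vision tower: a SigLIP-So400m/14 Vision Transformer
+ print(config.vision_config.model_type)         # "siglip_vision_model"
+ print(config.vision_config.num_hidden_layers)  # 27 encoder layers
+ print(config.vision_config.image_size)         # 448
+
+ # The text decoder: a Gemma-2B-style Transformer decoder
+ print(config.text_config.model_type)           # "gemma"
+ print(config.text_config.num_hidden_layers)    # 18 decoder layers
+ ```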
49
 
50
+ #### Inputs and outputs
 
 
51
 
52
+ * **Input:** Image and text string, such as a prompt to caption the image, or
53
+ a question.
54
+ * **Output:** Generated text in response to the input, such as a caption of
55
+ the image, an answer to a question, a list of object bounding box
56
+ coordinates, or segmentation codewords.
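+
+ The **Output** item above mentions bounding box coordinates and segmentation codewords. As a rough illustration only: detection outputs are typically a sequence of `<locNNNN>` tokens (four per box, bin indices 0–1023, in y_min, x_min, y_max, x_max order) followed by the object name, and segmentation adds `<segNNN>` codeword tokens. This token format is an assumption based on the upstream big_vision conventions rather than something stated in this card, so verify it before relying on the helper sketched below.
+
+ ```python
+ # Minimal sketch: turn assumed "detect" output back into pixel boxes.
+ import re
+
+ def parse_detection(text: str, image_width: int, image_height: int):
+     boxes = []
+     pattern = r"<loc(\d{4})><loc(\d{4})><loc(\d{4})><loc(\d{4})>\s*([^;<]+)"
+     for y0, x0, y1, x1, label in re.findall(pattern, text):
+         boxes.append({
+             "label": label.strip(),
+             "box": (
+                 int(x0) / 1024 * image_width,   # x_min in pixels
+                 int(y0) / 1024 * image_height,  # y_min in pixels
+                 int(x1) / 1024 * image_width,   # x_max in pixels
+                 int(y1) / 1024 * image_height,  # y_max in pixels
+             ),
+         })
+     return boxes
+
+ # Hypothetical decoded output for the prompt "detect cat"
+ print(parse_detection("<loc0123><loc0250><loc0890><loc0770> cat", 448, 448))
+ ```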
57
 
58
+ ### Model data
59
 
60
+ #### Pre-train datasets
61
 
62
+ PaliGemma is pre-trained on the following mixture of datasets:
63
 
64
+ * **WebLI:** [WebLI (Web Language Image)](https://arxiv.org/abs/2209.06794) is
65
+ a web-scale multilingual image-text dataset built from the public web. A
66
+ wide range of WebLI splits are used to acquire versatile model capabilities,
67
+ such as visual semantic understanding, object localization,
68
+ visually-situated text understanding, multilinguality, etc.
69
+ * **CC3M-35L:** Curated English image-alt_text pairs from webpages ([Sharma et
70
+ al., 2018](https://aclanthology.org/P18-1238/)). We used the [Google Cloud
71
+ Translation API](https://cloud.google.com/translate) to translate into 34
72
+ additional languages.
73
+ * **VQ²A-CC3M-35L/VQG-CC3M-35L:** A subset of VQ2A-CC3M ([Changpinyo et al.,
74
+ 2022a](https://aclanthology.org/2022.naacl-main.142/)), translated into the
75
+ same additional 34 languages as CC3M-35L, using the [Google Cloud
76
+ Translation API](https://cloud.google.com/translate).
77
+ * **OpenImages:** Detection and object-aware questions and answers
78
+ ([Piergiovanni et al. 2022](https://arxiv.org/abs/2209.04372)) generated by
79
+ handcrafted rules on the [OpenImages dataset].
80
+ * **WIT:** Images and texts collected from Wikipedia ([Srinivasan et al.,
81
+ 2021](https://arxiv.org/abs/2103.01913)).
82
 
83
+ [OpenImages dataset]: https://storage.googleapis.com/openimages/web/factsfigures_v7.html
84
 
85
+ #### Data responsibility filtering
86
 
87
+ The following filters are applied to WebLI, with the goal of training PaliGemma
88
+ on clean data:
89
 
90
+ * **Pornographic image filtering:** This filter removes images deemed to be of
91
+ pornographic nature.
92
+ * **Text safety filtering:** We identify and filter out images that are paired
93
+ with unsafe text. Unsafe text is any text deemed to contain or be about
94
+ CSAI, pornography, vulgarities, or otherwise offensive.
95
+ * **Text toxicity filtering:** We further use the [Perspective
96
+ API](https://perspectiveapi.com/) to identify and filter out images that are
97
+ paired with text deemed insulting, obscene, hateful or otherwise toxic.
98
+ * **Text personal information filtering:** We filtered certain personal information and other sensitive data using [Cloud Data Loss Prevention (DLP)
99
+ API](https://cloud.google.com/security/products/dlp) to protect the privacy
100
+ of individuals. Identifiers such as social security numbers and [other sensitive information types] were removed.
101
+ * **Additional methods:** Filtering based on content quality and safety in
102
+ line with our policies and practices.
103
 
104
+ [other sensitive information types]: https://cloud.google.com/sensitive-data-protection/docs/high-sensitivity-infotypes-reference
105
 
 
106
 
 
107
 
108
+ ## How to Use
109
 
110
+ PaliGemma is a single-turn vision language model not meant for conversational use,
111
+ and it works best when fine-tuned to a specific use case.
112
 
113
+ You can configure which task the model will solve by conditioning it with task prefixes,
114
+ such as “detect” or “segment”. The pretrained models were trained in this fashion to imbue
115
+ them with a rich set of capabilities (question answering, captioning, segmentation, etc.).
116
+ However, they are not designed to be used directly, but to be transferred (by fine-tuning)
117
+ to specific tasks using a similar prompt structure. For interactive testing, you can use
118
+ the "mix" family of models, which have been fine-tuned on a mixture of tasks. To see model
119
+ [google/paligemma-3b-mix-448](https://huggingface.co/google/paligemma-3b-mix-448) in action,
120
+ check [this Space that uses the Transformers codebase](https://huggingface.co/spaces/big-vision/paligemma-hf).
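+
+ To make the prefix convention concrete, here is a small illustrative set of prompt strings of the kind described above. The exact prefixes and supported languages depend on the checkpoint, so treat this list as an assumption to check against the official documentation rather than an exhaustive reference.
+
+ ```python
+ # Illustrative task-prefix prompts (assumed conventions; verify for your checkpoint).
+ prompts = [
+     "caption en",                             # short caption in English
+     "caption es",                             # short caption in Spanish
+     "ocr",                                    # read the text visible in the image
+     "answer en where is the cow standing?",   # visual question answering
+     "detect cow",                             # bounding boxes for the named class
+     "segment cow",                            # segmentation mask for the named class
+ ]
+ ```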
121
 
122
+ Please refer to the [usage and limitations section](#usage-and-limitations) for intended
123
+ use cases, or visit the [blog post](https://huggingface.co/blog/paligemma-google-vlm) for
124
+ additional details and examples.
125
 
126
+ ## Use in Transformers
127
+
128
+ The following snippets use the model `google/paligemma-3b-mix-224` for reference purposes.
129
+ The model in the repo you are now browsing may have been trained for other tasks; please
130
+ make sure you use inputs appropriate for the task at hand.
131
+
132
+ ### Running the default precision (`float32`) on CPU
133
+
134
+ ```python
135
+ from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
136
+ from PIL import Image
137
+ import requests
138
+ import torch
139
+
140
+ model_id = "google/paligemma-3b-mix-224"
141
+
142
+ url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
143
+ image = Image.open(requests.get(url, stream=True).raw)
144
+
145
+ model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval()
146
+ processor = AutoProcessor.from_pretrained(model_id)
147
+
148
+ # Instruct the model to create a caption in Spanish
149
+ prompt = "caption es"
150
+ model_inputs = processor(text=prompt, images=image, return_tensors="pt")
151
+ input_len = model_inputs["input_ids"].shape[-1]
152
+
153
+ with torch.inference_mode():
154
+ generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
155
+ generation = generation[0][input_len:]
156
+ decoded = processor.decode(generation, skip_special_tokens=True)
157
+ print(decoded)
158
+ ```
159
 
160
+ Output: `Un auto azul estacionado frente a un edificio.`
161
+
162
+ ### Running other precisions on CUDA
163
+
164
+ For convenience, the repos contain revisions of the weights already converted to `bfloat16` and `float16`,
165
+ so you can use them to reduce the download size and avoid casting on your local computer.
166
 
167
+ This is how you'd run `bfloat16` on an NVIDIA CUDA card.
168
 
169
+ ```python
170
+ from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
171
+ from PIL import Image
172
+ import requests
173
+ import torch
174
 
175
+ model_id = "google/paligemma-3b-mix-224"
176
+ device = "cuda:0"
177
+ dtype = torch.bfloat16
178
 
179
+ url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
180
+ image = Image.open(requests.get(url, stream=True).raw)
181
 
182
+ model = PaliGemmaForConditionalGeneration.from_pretrained(
183
+     model_id,
184
+     torch_dtype=dtype,
185
+     device_map=device,
186
+     revision="bfloat16",
187
+ ).eval()
188
+ processor = AutoProcessor.from_pretrained(model_id)
189
+
190
+ # Instruct the model to create a caption in Spanish
191
+ prompt = "caption es"
192
+ model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
193
+ input_len = model_inputs["input_ids"].shape[-1]
194
+
195
+ with torch.inference_mode():
196
+ generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
197
+ generation = generation[0][input_len:]
198
+ decoded = processor.decode(generation, skip_special_tokens=True)
199
+ print(decoded)
200
+ ```
201
+
202
+ ### Loading in 4-bit / 8-bit
203
+
204
+ You need to install `bitsandbytes` and `accelerate` to run inference using 8-bit or 4-bit precision:
205
+
206
+ ```
207
+ pip install bitsandbytes accelerate
208
+ ```
209
+
210
+ ```python
211
+ from transformers import AutoProcessor, BitsAndBytesConfig, PaliGemmaForConditionalGeneration
212
+ from PIL import Image
213
+ import requests
214
+ import torch
215
+
216
+ model_id = "google/paligemma-3b-mix-224"
217
+ device = "cuda:0"
218
+ dtype = torch.bfloat16
219
+
220
+ url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
221
+ image = Image.open(requests.get(url, stream=True).raw)
222
+
223
+ quantization_config = BitsAndBytesConfig(load_in_8bit=True)
224
+
225
+ model = PaliGemmaForConditionalGeneration.from_pretrained(
226
+     model_id, quantization_config=quantization_config
227
+ ).eval()
228
+ processor = AutoProcessor.from_pretrained(model_id)
229
+
230
+ # Instruct the model to create a caption in Spanish
231
+ prompt = "caption es"
232
+ model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
233
+ input_len = model_inputs["input_ids"].shape[-1]
234
+
235
+ with torch.inference_mode():
236
+     generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
236
+     generation = generation[0][input_len:]
237
+     decoded = processor.decode(generation, skip_special_tokens=True)
238
+     print(decoded)
240
+ ```
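+
+ The snippet above loads the model in 8-bit. Loading in 4-bit works the same way through `BitsAndBytesConfig`; the following is a minimal sketch, where the NF4 settings are common defaults rather than values prescribed by this card.
+
+ ```python
+ from transformers import AutoProcessor, BitsAndBytesConfig, PaliGemmaForConditionalGeneration
+ import torch
+
+ model_id = "google/paligemma-3b-mix-224"
+
+ # 4-bit NF4 quantization with bfloat16 compute; adjust to your hardware.
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16,
+ )
+
+ model = PaliGemmaForConditionalGeneration.from_pretrained(
+     model_id, quantization_config=quantization_config
+ ).eval()
+ processor = AutoProcessor.from_pretrained(model_id)
+ ```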
241
+
242
+ ## Implementation information
243
+
244
+ ### Hardware
245
+
246
+ PaliGemma was trained using the latest generation of Tensor Processing Unit
247
+ (TPU) hardware (TPUv5e).
248
+
249
+ ### Software
250
+
251
+ Training was done using [JAX](https://github.com/google/jax),
252
+ [Flax](https://github.com/google/flax),
253
+ [TFDS](https://github.com/tensorflow/datasets) and
254
+ [`big_vision`](https://github.com/google-research/big_vision).
255
+
256
+ JAX allows researchers to take advantage of the latest generation of hardware,
257
+ including TPUs, for faster and more efficient training of large models.
258
+
259
+ TFDS is used to access datasets and Flax is used for model architecture. The
260
+ PaliGemma fine-tune code and inference code are released in the `big_vision`
261
+ GitHub repository.
262
+
263
+ ## Evaluation information
264
+
265
+ ### Benchmark results
266
+
267
+ In order to verify the transferability of PaliGemma to a wide variety of
268
+ academic tasks, we fine-tune the pretrained models on each task. Additionally, we
269
+ train the mix model with a mixture of the transfer tasks. We report results on
270
+ different resolutions to provide an impression of which tasks benefit from
271
+ increased resolution. Importantly, none of these tasks or datasets are part of
272
+ the pretraining data mixture, and their images are explicitly removed from the
273
+ web-scale pre-training data.
274
+
275
+ #### Single task (fine-tune on single task)
276
+
277
+ <table>
278
+ <tbody><tr>
279
+ <th>Benchmark<br>(train split)</th>
280
+ <th>Metric<br>(split)</th>
281
+ <th>pt-224</th>
282
+ <th>pt-448</th>
283
+ <th>pt-896</th>
284
+ </tr>
285
+ <tr>
286
+ <th>Captioning</th>
287
+ </tr>
288
+ <tr>
289
+ <td>
290
+ <a href="https://cocodataset.org/#home">COCO captions</a><br>(train+restval)
291
+ </td>
292
+ <td>CIDEr (val)</td>
293
+ <td>141.92</td>
294
+ <td>144.60</td>
295
+ </tr>
296
+ <tr>
297
+ <td>
298
+ <a href="https://nocaps.org/">NoCaps</a><br>(Eval of COCO<br>captions transfer)
299
+ </td>
300
+ <td>CIDEr (val)</td>
301
+ <td>121.72</td>
302
+ <td>123.58</td>
303
+ </tr>
304
+ <tr>
305
+ <td>
306
+ <a href="https://arxiv.org/pdf/2205.12522">COCO-35L</a><br>(train)
307
+ </td>
308
+ <td>CIDEr dev<br>(en/avg-34/avg)</td>
309
+ <td>
310
+ 139.2<br>
311
+ 115.8<br>
312
+ 116.4
313
+ </td>
314
+ <td>
315
+ 141.2<br>
316
+ 118.0<br>
317
+ 118.6
318
+ </td>
319
+ </tr>
320
+ <tr>
321
+ <td>
322
+ <a href="https://arxiv.org/pdf/2205.12522">XM3600</a><br>(Eval of COCO-35L transfer)
323
+ </td>
324
+ <td>CIDEr dev<br>(en/avg-34/avg)</td>
325
+ <td>
326
+ 78.1<br>
327
+ 41.3<br>
328
+ 42.4
329
+ </td>
330
+ <td>
331
+ 80.0<br>
332
+ 41.9<br>
333
+ 42.9
334
+ </td>
335
+ </tr>
336
+ <tr>
337
+ <td>
338
+ <a href="https://textvqa.org/textcaps/">TextCaps</a><br>(train)
339
+ </td>
340
+ <td>CIDEr (val)</td>
341
+ <td>127.48</td>
342
+ <td>153.94</td>
343
+ </tr>
344
+ <tr>
345
+ <td>
346
+ <a href="https://arxiv.org/abs/2110.11624">SciCap</a><br>(first sentence, no subfigure)<br>(train+val)
347
+ </td>
348
+ <td>CIDEr/BLEU-4<br>(test)</td>
349
+ <td>
350
+ 162.25<br>
351
+ 0.192<br>
352
+ </td>
353
+ <td>
354
+ 181.49<br>
355
+ 0.211<br>
356
+ </td>
357
+ </tr>
358
+ <tr>
359
+ <td>
360
+ <a href="https://arxiv.org/abs/2108.03353">Screen2words</a><br>(train+dev)
361
+ </td>
362
+ <td>CIDEr (test)</td>
363
+ <td>117.57</td>
364
+ <td>119.59</td>
365
+ </tr>
366
+ <tr>
367
+ <td>
368
+ <a href="https://arxiv.org/abs/2010.04295">Widget Captioning</a><br>(train+dev)
369
+ </td>
370
+ <td>CIDEr (test)</td>
371
+ <td>136.07</td>
372
+ <td>148.36</td>
373
+ </tr>
374
+ <tr>
375
+ <th>Question answering</th>
376
+ </tr>
377
+ <tr>
378
+ <td>
379
+ <a href="https://visualqa.org/index.html">VQAv2</a><br>(train+validation)
380
+ </td>
381
+ <td>Accuracy<br>(Test server - std)</td>
382
+ <td>83.19</td>
383
+ <td>85.64</td>
384
+ </tr>
385
+ <tr>
386
+ <td>
387
+ <a href="https://arxiv.org/abs/2401.06209">MMVP</a><br>(Eval of VQAv2 transfer)
388
+ </td>
389
+ <td>Paired Accuracy</td>
390
+ <td>47.33</td>
391
+ <td>45.33</td>
392
+ </tr>
393
+ <tr>
394
+ <td>
395
+ <a href="https://arxiv.org/abs/2305.10355">POPE</a><br>(Eval of VQAv2 transfer)
396
+ </td>
397
+ <td>Accuracy<br>(random/popular/<br>adversarial)</td>
398
+ <td>
399
+ 87.80<br>
400
+ 85.87<br>
401
+ 84.27
402
+ </td>
403
+ <td>
404
+ 88.23<br>
405
+ 86.77<br>
406
+ 85.90
407
+ </td>
408
+ </tr>
409
+ <tr>
410
+ <td>
411
+ <a href="https://okvqa.allenai.org/">OKVQA</a><br>(train)
412
+ </td>
413
+ <td>Accuracy (val)</td>
414
+ <td>63.54</td>
415
+ <td>63.15</td>
416
+ </tr>
417
+ <tr>
418
+ <td>
419
+ <a href="https://allenai.org/project/a-okvqa/home">A-OKVQA</a> (MC)<br>(train+val)
420
+ </td>
421
+ <td>Accuracy<br>(Test server)</td>
422
+ <td>76.37</td>
423
+ <td>76.90</td>
424
+ </tr>
425
+ <tr>
426
+ <td>
427
+ <a href="https://allenai.org/project/a-okvqa/home">A-OKVQA</a> (DA)<br>(train+val)
428
+ </td>
429
+ <td>Accuracy<br>(Test server)</td>
430
+ <td>61.85</td>
431
+ <td>63.22</td>
432
+ </tr>
433
+ <tr>
434
+ <td>
435
+ <a href="https://cs.stanford.edu/people/dorarad/gqa/about.html">GQA</a><br>(train_balanced+<br>val_balanced)
436
+ </td>
437
+ <td>Accuracy<br>(testdev balanced)</td>
438
+ <td>65.61</td>
439
+ <td>67.03</td>
440
+ </tr>
441
+ <tr>
442
+ <td>
443
+ <a href="https://aclanthology.org/2022.findings-acl.196/">xGQA</a><br>(Eval of GQA transfer)
444
+ </td>
445
+ <td>Mean Accuracy<br>(bn, de, en, id,<br>ko, pt, ru, zh)</td>
446
+ <td>58.37</td>
447
+ <td>59.07</td>
448
+ </tr>
449
+ <tr>
450
+ <td>
451
+ <a href="https://lil.nlp.cornell.edu/nlvr/">NLVR2</a><br>(train+dev)
452
+ </td>
453
+ <td>Accuracy (test)</td>
454
+ <td>90.02</td>
455
+ <td>88.93</td>
456
+ </tr>
457
+ <tr>
458
+ <td>
459
+ <a href="https://marvl-challenge.github.io/">MaRVL</a><br>(Eval of NLVR2 transfer)
460
+ </td>
461
+ <td>Mean Accuracy<br>(test)<br>(id, sw, ta, tr, zh)</td>
462
+ <td>80.57</td>
463
+ <td>76.78</td>
464
+ </tr>
465
+ <tr>
466
+ <td>
467
+ <a href="https://allenai.org/data/diagrams">AI2D</a><br>(train)
468
+ </td>
469
+ <td>Accuracy (test)</td>
470
+ <td>72.12</td>
471
+ <td>73.28</td>
472
+ </tr>
473
+ <tr>
474
+ <td>
475
+ <a href="https://scienceqa.github.io/">ScienceQA</a><br>(Img subset, no CoT)<br>(train+val)
476
+ </td>
477
+ <td>Accuracy (test)</td>
478
+ <td>95.39</td>
479
+ <td>95.93</td>
480
+ </tr>
481
+ <tr>
482
+ <td>
483
+ <a href="https://zenodo.org/records/6344334">RSVQA-LR</a> (Non numeric)<br>(train+val)
484
+ </td>
485
+ <td>Mean Accuracy<br>(test)</td>
486
+ <td>92.65</td>
487
+ <td>93.11</td>
488
+ </tr>
489
+ <tr>
490
+ <td>
491
+ <a href="https://zenodo.org/records/6344367">RSVQA-HR</a> (Non numeric)<br>(train+val)
492
+ </td>
493
+ <td>Mean Accuracy<br>(test/test2)</td>
494
+ <td>
495
+ 92.61<br>
496
+ 90.58
497
+ </td>
498
+ <td>
499
+ 92.79<br>
500
+ 90.54
501
+ </td>
502
+ </tr>
503
+ <tr>
504
+ <td>
505
+ <a href="https://arxiv.org/abs/2203.10244">ChartQA</a><br>(human+aug)x(train+val)
506
+ </td>
507
+ <td>Mean Relaxed<br>Accuracy<br>(test_human,<br>test_aug)</td>
508
+ <td>57.08</td>
509
+ <td>71.36</td>
510
+ </tr>
511
+ <tr>
512
+ <td>
513
+ <a href="https://vizwiz.org/tasks-and-datasets/vqa/">VizWiz VQA</a><br>(train+val)
514
+ </td>
515
+ <td>Accuracy<br>(Test server - std)</td>
516
+ <td>
517
+ 73.7
518
+ </td>
519
+ <td>
520
+ 75.52
521
+ </td>
522
+ </tr>
523
+ <tr>
524
+ <td>
525
+ <a href="https://arxiv.org/abs/1810.12440">TallyQA</a><br>(train)
526
+ </td>
527
+ <td>Accuracy<br>(test_simple/<br>test_complex)</td>
528
+ <td>
529
+ 81.72<br>
530
+ 69.56
531
+ </td>
532
+ <td>
533
+ 84.86<br>
534
+ 72.27
535
+ </td>
536
+ </tr>
537
+ <tr>
538
+ <td>
539
+ <a href="https://ocr-vqa.github.io/">OCR-VQA</a><br>(train+val)
540
+ </td>
541
+ <td>Accuracy (test)</td>
542
+ <td>72.32</td>
543
+ <td>74.61</td>
544
+ <td>74.93</td>
545
+ </tr>
546
+ <tr>
547
+ <td>
548
+ <a href="https://textvqa.org/">TextVQA</a><br>(train+val)
549
+ </td>
550
+ <td>Accuracy<br>(Test server - std)</td>
551
+ <td>55.47</td>
552
+ <td>73.15</td>
553
+ <td>76.48</td>
554
+ </tr>
555
+ <tr>
556
+ <td>
557
+ <a href="https://www.docvqa.org/">DocVQA</a><br>(train+val)
558
+ </td>
559
+ <td>ANLS (Test server)</td>
560
+ <td>43.74</td>
561
+ <td>78.02</td>
562
+ <td>84.77</td>
563
+ </tr>
564
+ <tr>
565
+ <td>
566
+ <a href="https://openaccess.thecvf.com/content/WACV2022/papers/Mathew_InfographicVQA_WACV_2022_paper.pdf">Infographic VQA</a><br>(train+val)
567
+ </td>
568
+ <td>ANLS (Test server)</td>
569
+ <td>28.46</td>
570
+ <td>40.47</td>
571
+ <td>47.75</td>
572
+ </tr>
573
+ <tr>
574
+ <td>
575
+ <a href="https://arxiv.org/abs/1905.13648">SceneText VQA</a><br>(train+val)
576
+ </td>
577
+ <td>ANLS (Test server)</td>
578
+ <td>63.29</td>
579
+ <td>81.82</td>
580
+ <td>84.40</td>
581
+ </tr>
582
+ <tr>
583
+ <th>Segmentation</th>
584
+ </tr>
585
+ <tr>
586
+ <td>
587
+ <a href="https://arxiv.org/abs/1608.00272">RefCOCO</a><br>(combined refcoco, refcoco+,<br>refcocog excluding val<br>and test images)
588
+ </td>
589
+ <td>MIoU<br>(validation)<br>refcoco/refcoco+/<br>refcocog</td>
590
+ <td>
591
+ 73.40<br>
592
+ 68.32<br>
593
+ 67.65
594
+ </td>
595
+ <td>
596
+ 75.57<br>
597
+ 69.76<br>
598
+ 70.17
599
+ </td>
600
+ <td>
601
+ 76.94<br>
602
+ 72.18<br>
603
+ 72.22
604
+ </td>
605
+ </tr>
606
+ <tr>
607
+ <th>Video tasks (Caption/QA)</th>
608
+ </tr>
609
+ <tr>
610
+ <td>MSR-VTT (Captioning)</td>
611
+ <td>CIDEr (test)</td>
612
+ <td>70.54</td>
613
+ </tr>
614
+ <tr>
615
+ <td>MSR-VTT (QA)</td>
616
+ <td>Accuracy (test)</td>
617
+ <td>50.09</td>
618
+ </tr>
619
+ <tr>
620
+ <td>ActivityNet (Captioning)</td>
621
+ <td>CIDEr (test)</td>
622
+ <td>34.62</td>
623
+ </tr>
624
+ <tr>
625
+ <td>ActivityNet (QA)</td>
626
+ <td>Accuracy (test)</td>
627
+ <td>50.78</td>
628
+ </tr>
629
+ <tr>
630
+ <td>VATEX (Captioning)</td>
631
+ <td>CIDEr (test)</td>
632
+ <td>79.73</td>
633
+ </tr>
634
+ <tr>
635
+ <td>MSVD (QA)</td>
636
+ <td>Accuracy (test)</td>
637
+ <td>60.22</td>
638
+ </tr>
639
+ </tbody></table>
640
+
641
+ #### Mix model (fine-tune on mixture of transfer tasks)
642
+
643
+ <table>
644
+ <tbody><tr>
645
+ <th>Benchmark</th>
646
+ <th>Metric (split)</th>
647
+ <th>mix-224</th>
648
+ <th>mix-448</th>
649
+ </tr>
650
+ <tr>
651
+ <td><a href="https://arxiv.org/abs/2401.06209">MMVP</a></td>
652
+ <td>Paired Accuracy</td>
653
+ <td>46.00</td>
654
+ <td>45.33</td>
655
+ </tr>
656
+ <tr>
657
+ <td><a href="https://arxiv.org/abs/2305.10355">POPE</a></td>
658
+ <td>Accuracy<br>(random/popular/adversarial)</td>
659
+ <td>
660
+ 88.00<br>
661
+ 86.63<br>
662
+ 85.67
663
+ </td>
664
+ <td>
665
+ 89.37<br>
666
+ 88.40<br>
667
+ 87.47
668
+ </td>
669
+ </tr>
670
+ </tbody></table>
671
+
672
+ ## Ethics and safety
673
+
674
+ ### Evaluation approach
675
+
676
+ Our evaluation methods include structured evaluations and internal red-teaming
677
+ testing of relevant content policies. Red-teaming was conducted by a number of
678
+ different teams, each with different goals and human evaluation metrics. These
679
+ models were evaluated against a number of different categories relevant to
680
+ ethics and safety, including:
681
+
682
+ * Human evaluation on prompts covering child safety, content safety and
683
+ representational harms. See the [Gemma model
684
+ card](https://ai.google.dev/gemma/docs/model_card#evaluation_approach) for
685
+ more details on evaluation approach, but with image captioning and visual
686
+ question answering setups.
687
+ * Image-to-Text benchmark evaluation: Benchmark against relevant academic
688
+ datasets such as FairFace Dataset ([Karkkainen et al.,
689
+ 2021](https://arxiv.org/abs/1908.04913)).
690
+
691
+ ### Evaluation results
692
+
693
+ * The human evaluation results of ethics and safety evaluations are within
694
+ acceptable thresholds for meeting [internal
695
+ policies](https://storage.googleapis.com/gweb-uniblog-publish-prod/documents/2023_Google_AI_Principles_Progress_Update.pdf#page=11)
696
+ for categories such as child safety, content safety and representational
697
+ harms.
698
+ * On top of robust internal evaluations, we also use the Perspective API
699
+ (threshold of 0.8) to measure toxicity, profanity, and other potential
700
+ issues in the generated captions for images sourced from the FairFace
701
+ dataset. We report the maximum and median values observed across subgroups
702
+ for each of the perceived gender, ethnicity, and age attributes.
703
+
704
+
705
+ <table>
706
+ <tbody><tr>
707
+ </tr></tbody><tbody><tr><th>Metric</th>
708
+ <th>Perceived<br>gender</th>
709
+ <th></th>
710
+ <th>Ethnicity</th>
711
+ <th></th>
712
+ <th>Age group</th>
713
+ <th></th>
714
+ </tr>
715
+ <tr>
716
+ <th></th>
717
+ <th>Maximum</th>
718
+ <th>Median</th>
719
+ <th>Maximum</th>
720
+ <th>Median</th>
721
+ <th>Maximum</th>
722
+ <th>Median</th>
723
+ </tr>
724
+ <tr>
725
+ <td>Toxicity</td>
726
+ <td>0.04%</td>
727
+ <td>0.03%</td>
728
+ <td>0.08%</td>
729
+ <td>0.00%</td>
730
+ <td>0.09%</td>
731
+ <td>0.00%</td>
732
+ </tr>
733
+ <tr>
734
+ <td>Identity Attack</td>
735
+ <td>0.00%</td>
736
+ <td>0.00%</td>
737
+ <td>0.00%</td>
738
+ <td>0.00%</td>
739
+ <td>0.00%</td>
740
+ <td>0.00%</td>
741
+ </tr>
742
+ <tr>
743
+ <td>Insult</td>
744
+ <td>0.06%</td>
745
+ <td>0.04%</td>
746
+ <td>0.09%</td>
747
+ <td>0.07%</td>
748
+ <td>0.16%</td>
749
+ <td>0.00%</td>
750
+ </tr>
751
+ <tr>
752
+ <td>Threat</td>
753
+ <td>0.06%</td>
754
+ <td>0.05%</td>
755
+ <td>0.14%</td>
756
+ <td>0.05%</td>
757
+ <td>0.17%</td>
758
+ <td>0.00%</td>
759
+ </tr>
760
+ <tr>
761
+ <td>Profanity</td>
762
+ <td>0.00%</td>
763
+ <td>0.00%</td>
764
+ <td>0.00%</td>
765
+ <td>0.00%</td>
766
+ <td>0.00%</td>
767
+ <td>0.00%</td>
768
+ </tr>
769
+ </tbody></table>
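+
+ As a rough illustration of the Perspective API check described above (a minimal sketch, not the exact pipeline used to produce the numbers in the table), the snippet below scores a single generated caption and flags attributes against the 0.8 threshold. It assumes a Perspective API key and the `google-api-python-client` package; the placeholder key is hypothetical.
+
+ ```python
+ from googleapiclient import discovery
+
+ API_KEY = "your-perspective-api-key"  # hypothetical placeholder
+
+ client = discovery.build(
+     "commentanalyzer",
+     "v1alpha1",
+     developerKey=API_KEY,
+     discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
+     static_discovery=False,
+ )
+
+ caption = "a person standing in front of a building"  # a generated caption to audit
+ response = client.comments().analyze(body={
+     "comment": {"text": caption},
+     "requestedAttributes": {"TOXICITY": {}, "PROFANITY": {}, "INSULT": {}},
+ }).execute()
+
+ for attribute, scores in response["attributeScores"].items():
+     value = scores["summaryScore"]["value"]
+     print(attribute, value, "FLAG" if value >= 0.8 else "ok")
+ ```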
770
+
771
+ ## Usage and limitations
772
+
773
+ ### Intended usage
774
+
775
+ Open Vision Language Models (VLMs) have a wide range of applications across
776
+ various industries and domains. The following list of potential uses is not
777
+ comprehensive. The purpose of this list is to provide contextual information
778
+ about the possible use-cases that the model creators considered as part of model
779
+ training and development.
780
+
781
+ Fine-tune on specific vision-language task (see the minimal sketch after this list):
782
+
783
+ * The pre-trained models can be fine-tuned on a wide range of vision-language
784
+ tasks such as: image captioning, short video caption, visual question
785
+ answering, text reading, object detection and object segmentation.
786
+ * The pre-trained models can be fine-tuned for specific domains such as remote
787
+ sensing question answering, visual questions from people who are blind,
788
+ science question answering, and describing UI element functionalities.
789
+ * The pre-trained models can be fine-tuned for tasks with non-textual outputs
790
+ such as bounding boxes or segmentation masks.
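+
+ As referenced in the list above, here is a minimal sketch of what a single supervised fine-tuning step looks like with the Transformers processor: the task prefix goes in `text` and the target text goes in `suffix`, from which the processor builds the `labels`. The example image URL and target caption are illustrative; real training adds a dataset, collation with padding, an optimizer or `Trainer`, and optionally PEFT/LoRA.
+
+ ```python
+ from PIL import Image
+ import requests
+ import torch
+ from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
+
+ model_id = "google/paligemma-3b-pt-448"
+ url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
+ image = Image.open(requests.get(url, stream=True).raw)
+
+ processor = AutoProcessor.from_pretrained(model_id)
+ model = PaliGemmaForConditionalGeneration.from_pretrained(
+     model_id, torch_dtype=torch.bfloat16, device_map="cuda:0"
+ )
+
+ # One training example: "caption en" is the task prefix, the suffix is the target.
+ inputs = processor(
+     images=image,
+     text="caption en",
+     suffix="A blue car parked in front of a building.",
+     return_tensors="pt",
+ ).to(model.device)
+
+ loss = model(**inputs).loss  # labels are built from the suffix by the processor
+ loss.backward()              # plug into your optimizer / Trainer / PEFT setup
+ ```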
791
+
792
+ Vision-language research:
793
+
794
+ * The pre-trained models and fine-tuned models can serve as a foundation for researchers to experiment with VLM
795
+ techniques, develop algorithms, and contribute to the advancement of the
796
+ field.
797
+
798
+ ### Ethical considerations and risks
799
+
800
+ The development of vision-language models (VLMs) raises several ethical concerns. In creating an open model, we have carefully considered the following:
801
+
802
+ * Bias and Fairness
803
+ * VLMs trained on large-scale, real-world image-text data can reflect socio-cultural biases embedded in the training material. These models underwent careful scrutiny; the input data pre-processing is described and posterior evaluations are reported in this card.
804
+ * Misinformation and Misuse
805
+ * VLMs can be misused to generate text that is false, misleading, or harmful.
806
+ * Guidelines are provided for responsible use with the model, see the [Responsible Generative AI Toolkit](https://ai.google.dev/responsible).
807
+ * Transparency and Accountability
808
+ * This model card summarizes details on the models' architecture, capabilities, limitations, and evaluation processes.
809
+ * A responsibly developed open model offers the opportunity to share innovation by making VLM technology accessible to developers and researchers across the AI ecosystem.
810
+
811
+
812
+ Risks identified and mitigations:
813
+
814
+ * **Perpetuation of biases:** It's encouraged to perform continuous monitoring
815
+ (using evaluation metrics, human review) and the exploration of de-biasing
816
+ techniques during model training, fine-tuning, and other use cases.
817
+ * **Generation of harmful content:** Mechanisms and guidelines for content
818
+ safety are essential. Developers are encouraged to exercise caution and
819
+ implement appropriate content safety safeguards based on their specific
820
+ product policies and application use cases.
821
+ * **Misuse for malicious purposes:** Technical limitations and developer and
822
+ end-user education can help mitigate malicious applications of VLMs.
823
+ Educational resources and reporting mechanisms for users to flag misuse are
824
+ provided. Prohibited uses of Gemma models are outlined in the [Gemma
825
+ Prohibited Use Policy](https://ai.google.dev/gemma/prohibited_use_policy).
826
+ * **Privacy violations:** Models were trained on data filtered to remove certain personal information and sensitive data. Developers are encouraged to adhere to privacy regulations with privacy-preserving techniques.
827
+
828
+ ### Limitations
829
+
830
+ * Most limitations inherited from the underlying Gemma model still apply:
831
+ * VLMs are better at tasks that can be framed with clear prompts and
832
+ instructions. Open-ended or highly complex tasks might be challenging.
833
+ * Natural language is inherently complex. VLMs might struggle to grasp
834
+ subtle nuances, sarcasm, or figurative language.
835
+ * VLMs generate responses based on information they learned from their
836
+ training datasets, but they are not knowledge bases. They may generate
837
+ incorrect or outdated factual statements.
838
+ * VLMs rely on statistical patterns in language and images. They might
839
+ lack the ability to apply common sense reasoning in certain situations.
840
+ * PaliGemma was designed first and foremost to serve as a general pre-trained
841
+ model for transfer to specialized tasks. Hence, its "out of the box" or
842
+ "zero-shot" performance might lag behind models designed specifically for
843
+ zero-shot use.
844
+ * PaliGemma is not a multi-turn chatbot. It is designed for a single round of
845
+ image and text input.
846
+
847
+ ## Citation
848
+
849
+ ```bibtex
850
+ @article{beyer2024paligemma,
851
+ title={{PaliGemma: A versatile 3B VLM for transfer}},
852
+ author={Lucas Beyer* and Andreas Steiner* and André Susano Pinto* and Alexander Kolesnikov* and Xiao Wang* and Daniel Salz and Maxim Neumann and Ibrahim Alabdulmohsin and Michael Tschannen and Emanuele Bugliarello and Thomas Unterthiner and Daniel Keysers and Skanda Koppula and Fangyu Liu and Adam Grycner and Alexey Gritsenko and Neil Houlsby and Manoj Kumar and Keran Rong and Julian Eisenschlos and Rishabh Kabra and Matthias Bauer and Matko Bošnjak and Xi Chen and Matthias Minderer and Paul Voigtlaender and Ioana Bica and Ivana Balazevic and Joan Puigcerver and Pinelopi Papalampidi and Olivier Henaff and Xi Xiong and Radu Soricut and Jeremiah Harmsen and Xiaohua Zhai*},
853
+ year={2024},
854
+ journal={arXiv preprint arXiv:2407.07726}
855
+ }
856
+ ```
857
+
858
+
859
+ Find the paper [here](https://arxiv.org/abs/2407.07726).
added_tokens.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "<image>": 257152
3
+ }
config.json ADDED
@@ -0,0 +1,41 @@
1
+ {
2
+ "_name_or_path": "final-hf/paligemma-3b-pt-448-main",
3
+ "architectures": [
4
+ "PaliGemmaForConditionalGeneration"
5
+ ],
6
+ "bos_token_id": 2,
7
+ "eos_token_id": 1,
8
+ "hidden_size": 2048,
9
+ "ignore_index": -100,
10
+ "image_token_index": 257152,
11
+ "model_type": "paligemma",
12
+ "pad_token_id": 0,
13
+ "projection_dim": 2048,
14
+ "text_config": {
15
+ "hidden_size": 2048,
16
+ "intermediate_size": 16384,
17
+ "model_type": "gemma",
18
+ "num_attention_heads": 8,
19
+ "num_hidden_layers": 18,
20
+ "num_image_tokens": 1024,
21
+ "num_key_value_heads": 1,
22
+ "torch_dtype": "float32",
23
+ "vocab_size": 257216
24
+ },
25
+ "torch_dtype": "float32",
26
+ "transformers_version": "4.41.0.dev0",
27
+ "vision_config": {
28
+ "hidden_size": 1152,
29
+ "image_size": 448,
30
+ "intermediate_size": 4304,
31
+ "model_type": "siglip_vision_model",
32
+ "num_attention_heads": 16,
33
+ "num_hidden_layers": 27,
34
+ "num_image_tokens": 1024,
35
+ "patch_size": 14,
36
+ "projection_dim": 2048,
37
+ "projector_hidden_act": "gelu_fast",
38
+ "vision_use_head": false
39
+ },
40
+ "vocab_size": 257216
41
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.41.0.dev0"
7
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:016569117e00622ec8ff7af9104a66b5e81baf06772674be997e5c41265fde96
3
+ size 4956951424
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f876f1e6915d8abda1652312dc9a0ea7810a137e4996226fc1dfb2ede7ad8109
3
+ size 4999820608
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:418030fe594809e40b1d57f3e386122795fc17716a57266db6b32c504e00b07a
3
+ size 1740714288
model.safetensors.index.json ADDED
@@ -0,0 +1,610 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 11697404864
4
+ },
5
+ "weight_map": {
6
+ "language_model.model.embed_tokens.weight": "model-00001-of-00003.safetensors",
7
+ "language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
8
+ "language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
9
+ "language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
10
+ "language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
11
+ "language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
12
+ "language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
13
+ "language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
14
+ "language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
15
+ "language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
16
+ "language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
17
+ "language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
18
+ "language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
19
+ "language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
20
+ "language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
21
+ "language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
22
+ "language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
23
+ "language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
24
+ "language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
25
+ "language_model.model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
26
+ "language_model.model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
27
+ "language_model.model.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
28
+ "language_model.model.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
29
+ "language_model.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
30
+ "language_model.model.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
31
+ "language_model.model.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
32
+ "language_model.model.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
33
+ "language_model.model.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
34
+ "language_model.model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
35
+ "language_model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
36
+ "language_model.model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
37
+ "language_model.model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
38
+ "language_model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
39
+ "language_model.model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
40
+ "language_model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
41
+ "language_model.model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
42
+ "language_model.model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
43
+ "language_model.model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
44
+ "language_model.model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
45
+ "language_model.model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
46
+ "language_model.model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
47
+ "language_model.model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
48
+ "language_model.model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
49
+ "language_model.model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
50
+ "language_model.model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
51
+ "language_model.model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
52
+ "language_model.model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
53
+ "language_model.model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
54
+ "language_model.model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
55
+ "language_model.model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
56
+ "language_model.model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
57
+ "language_model.model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
58
+ "language_model.model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
59
+ "language_model.model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
60
+ "language_model.model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
61
+ "language_model.model.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors",
62
+ "language_model.model.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
63
+ "language_model.model.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
64
+ "language_model.model.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
65
+ "language_model.model.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
66
+ "language_model.model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
67
+ "language_model.model.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
68
+ "language_model.model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
69
+ "language_model.model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
70
+ "language_model.model.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors",
71
+ "language_model.model.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
72
+ "language_model.model.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
73
+ "language_model.model.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
74
+ "language_model.model.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
75
+ "language_model.model.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
76
+ "language_model.model.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
77
+ "language_model.model.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
78
+ "language_model.model.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
79
+ "language_model.model.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors",
80
+ "language_model.model.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
81
+ "language_model.model.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
82
+ "language_model.model.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
83
+ "language_model.model.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
84
+ "language_model.model.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
85
+ "language_model.model.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
86
+ "language_model.model.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
87
+ "language_model.model.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
88
+ "language_model.model.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors",
89
+ "language_model.model.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
90
+ "language_model.model.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
91
+ "language_model.model.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
92
+ "language_model.model.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
93
+ "language_model.model.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
94
+ "language_model.model.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
95
+ "language_model.model.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
96
+ "language_model.model.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
97
+ "language_model.model.layers.2.input_layernorm.weight": "model-00002-of-00003.safetensors",
98
+ "language_model.model.layers.2.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
99
+ "language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
100
+ "language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
101
+ "language_model.model.layers.2.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
102
+ "language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
103
+ "language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
104
+ "language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
105
+ "language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
106
+ "language_model.model.layers.3.input_layernorm.weight": "model-00002-of-00003.safetensors",
107
+ "language_model.model.layers.3.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
108
+ "language_model.model.layers.3.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
109
+ "language_model.model.layers.3.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
110
+ "language_model.model.layers.3.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
111
+ "language_model.model.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
112
+ "language_model.model.layers.3.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
113
+ "language_model.model.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
114
+ "language_model.model.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
115
+ "language_model.model.layers.4.input_layernorm.weight": "model-00002-of-00003.safetensors",
116
+ "language_model.model.layers.4.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
117
+ "language_model.model.layers.4.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
118
+ "language_model.model.layers.4.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
119
+ "language_model.model.layers.4.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
120
+ "language_model.model.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
121
+ "language_model.model.layers.4.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
122
+ "language_model.model.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
123
+ "language_model.model.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
124
+ "language_model.model.layers.5.input_layernorm.weight": "model-00002-of-00003.safetensors",
125
+ "language_model.model.layers.5.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
126
+ "language_model.model.layers.5.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
127
+ "language_model.model.layers.5.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
128
+ "language_model.model.layers.5.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
129
+ "language_model.model.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
130
+ "language_model.model.layers.5.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
131
+ "language_model.model.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
132
+ "language_model.model.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
133
+ "language_model.model.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors",
134
+ "language_model.model.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
135
+ "language_model.model.layers.6.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
136
+ "language_model.model.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
137
+ "language_model.model.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
138
+ "language_model.model.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
139
+ "language_model.model.layers.6.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
140
+ "language_model.model.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
141
+ "language_model.model.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
142
+ "language_model.model.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors",
143
+ "language_model.model.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
144
+ "language_model.model.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
145
+ "language_model.model.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
146
+ "language_model.model.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
147
+ "language_model.model.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
148
+ "language_model.model.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
149
+ "language_model.model.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
150
+ "language_model.model.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
151
+ "language_model.model.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors",
152
+ "language_model.model.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
153
+ "language_model.model.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
154
+ "language_model.model.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
155
+ "language_model.model.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
156
+ "language_model.model.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
157
+ "language_model.model.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
158
+ "language_model.model.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
159
+ "language_model.model.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
160
+ "language_model.model.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors",
161
+ "language_model.model.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
162
+ "language_model.model.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
163
+ "language_model.model.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
164
+ "language_model.model.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
165
+ "language_model.model.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
166
+ "language_model.model.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
167
+ "language_model.model.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
168
+ "language_model.model.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
169
+ "language_model.model.norm.weight": "model-00003-of-00003.safetensors",
170
+ "multi_modal_projector.linear.bias": "model-00001-of-00003.safetensors",
171
+ "multi_modal_projector.linear.weight": "model-00001-of-00003.safetensors",
172
+ "vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00003.safetensors",
173
+ "vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00003.safetensors",
174
+ "vision_tower.vision_model.embeddings.position_embedding.weight": "model-00001-of-00003.safetensors",
175
+ "vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00003.safetensors",
176
+ "vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00003.safetensors",
177
+ "vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00003.safetensors",
178
+ "vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00003.safetensors",
179
+ "vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00003.safetensors",
180
+ "vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00003.safetensors",
181
+ "vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00003.safetensors",
182
+ "vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00003.safetensors",
183
+ "vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
184
+ "vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
185
+ "vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
186
+ "vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
187
+ "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
188
+ "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
189
+ "vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
190
+ "vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
191
+ "vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00003.safetensors",
192
+ "vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00003.safetensors",
193
+ "vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00003.safetensors",
194
+ "vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00003.safetensors",
195
+ "vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00003.safetensors",
196
+ "vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00003.safetensors",
197
+ "vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00003.safetensors",
198
+ "vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00003.safetensors",
199
+ "vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
200
+ "vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
201
+ "vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
202
+ "vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
203
+ "vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
204
+ "vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
205
+ "vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
206
+ "vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
207
+ "vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00003.safetensors",
208
+ "vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00003.safetensors",
209
+ "vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00003.safetensors",
210
+ "vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00003.safetensors",
211
+ "vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00003.safetensors",
212
+ "vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00003.safetensors",
213
+ "vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00003.safetensors",
214
+ "vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00003.safetensors",
215
+ "vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
216
+ "vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
217
+ "vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
218
+ "vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
219
+ "vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
220
+ "vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
221
+ "vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
222
+ "vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
223
+ "vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00003.safetensors",
224
+ "vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00003.safetensors",
225
+ "vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00003.safetensors",
226
+ "vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00003.safetensors",
227
+ "vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00003.safetensors",
228
+ "vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00003.safetensors",
229
+ "vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00003.safetensors",
230
+ "vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00003.safetensors",
231
+ "vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
232
+ "vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
233
+ "vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
234
+ "vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
235
+ "vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
236
+ "vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
237
+ "vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
238
+ "vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
239
+ "vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00003.safetensors",
240
+ "vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00003.safetensors",
241
+ "vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00003.safetensors",
242
+ "vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00003.safetensors",
243
+ "vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00003.safetensors",
244
+ "vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00003.safetensors",
245
+ "vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00003.safetensors",
246
+ "vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00003.safetensors",
247
+ "vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
248
+ "vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
249
+ "vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
250
+ "vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
251
+ "vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
252
+ "vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
253
+ "vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
254
+ "vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
255
+ "vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00003.safetensors",
256
+ "vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00003.safetensors",
257
+ "vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00003.safetensors",
258
+ "vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00003.safetensors",
259
+ "vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00003.safetensors",
260
+ "vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00003.safetensors",
261
+ "vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00003.safetensors",
262
+ "vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00003.safetensors",
263
+ "vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
264
+ "vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
265
+ "vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
266
+ "vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
267
+ "vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
268
+ "vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
269
+ "vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
270
+ "vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
271
+ "vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00003.safetensors",
272
+ "vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00003.safetensors",
273
+ "vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00003.safetensors",
274
+ "vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00003.safetensors",
275
+ "vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00003.safetensors",
276
+ "vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00003.safetensors",
277
+ "vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00003.safetensors",
278
+ "vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00003.safetensors",
279
+ "vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
280
+ "vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
281
+ "vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
282
+ "vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
283
+ "vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
284
+ "vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
285
+ "vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
286
+ "vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
287
+ "vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00003.safetensors",
288
+ "vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00003.safetensors",
289
+ "vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00003.safetensors",
290
+ "vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00003.safetensors",
291
+ "vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00003.safetensors",
292
+ "vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00003.safetensors",
293
+ "vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00003.safetensors",
294
+ "vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00003.safetensors",
295
+ "vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
296
+ "vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
297
+ "vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
298
+ "vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
299
+ "vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
300
+ "vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
301
+ "vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
302
+ "vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
303
+ "vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00003.safetensors",
304
+ "vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00003.safetensors",
305
+ "vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00003.safetensors",
306
+ "vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00003.safetensors",
307
+ "vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00003.safetensors",
308
+ "vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00003.safetensors",
309
+ "vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00003.safetensors",
310
+ "vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00003.safetensors",
311
+ "vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
312
+ "vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
313
+ "vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
314
+ "vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
315
+ "vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
316
+ "vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
317
+ "vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
318
+ "vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
319
+ "vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00003.safetensors",
320
+ "vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00003.safetensors",
321
+ "vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00003.safetensors",
322
+ "vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00003.safetensors",
323
+ "vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00003.safetensors",
324
+ "vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00003.safetensors",
325
+ "vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00003.safetensors",
326
+ "vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00003.safetensors",
327
+ "vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
328
+ "vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
329
+ "vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
330
+ "vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
331
+ "vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
332
+ "vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
333
+ "vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
334
+ "vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
335
+ "vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00003.safetensors",
336
+ "vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00003.safetensors",
337
+ "vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00003.safetensors",
338
+ "vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00003.safetensors",
339
+ "vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00003.safetensors",
340
+ "vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00003.safetensors",
341
+ "vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00003.safetensors",
342
+ "vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00003.safetensors",
343
+ "vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
344
+ "vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
345
+ "vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
346
+ "vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
347
+ "vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
348
+ "vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
349
+ "vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
350
+ "vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
351
+ "vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00003.safetensors",
352
+ "vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00003.safetensors",
353
+ "vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00003.safetensors",
354
+ "vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00003.safetensors",
355
+ "vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00003.safetensors",
356
+ "vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00003.safetensors",
357
+ "vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00003.safetensors",
358
+ "vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00003.safetensors",
359
+ "vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
360
+ "vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
361
+ "vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
362
+ "vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
363
+ "vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
364
+ "vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
365
+ "vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
366
+ "vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
367
+ "vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00003.safetensors",
368
+ "vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00003.safetensors",
369
+ "vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00003.safetensors",
370
+ "vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00003.safetensors",
371
+ "vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00003.safetensors",
372
+ "vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00003.safetensors",
373
+ "vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00003.safetensors",
374
+ "vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00003.safetensors",
375
+ "vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
376
+ "vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
377
+ "vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
378
+ "vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
379
+ "vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
380
+ "vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
381
+ "vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
382
+ "vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
383
+ "vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00003.safetensors",
384
+ "vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00003.safetensors",
385
+ "vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00003.safetensors",
386
+ "vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00003.safetensors",
387
+ "vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00003.safetensors",
388
+ "vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00003.safetensors",
389
+ "vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00003.safetensors",
390
+ "vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00003.safetensors",
391
+ "vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
392
+ "vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
393
+ "vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
394
+ "vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
395
+ "vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
396
+ "vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
397
+ "vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
398
+ "vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
399
+ "vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00003.safetensors",
400
+ "vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00003.safetensors",
401
+ "vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00003.safetensors",
402
+ "vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00003.safetensors",
403
+ "vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00003.safetensors",
404
+ "vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00003.safetensors",
405
+ "vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00003.safetensors",
406
+ "vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00003.safetensors",
407
+ "vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
408
+ "vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
409
+ "vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
410
+ "vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
411
+ "vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
412
+ "vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
413
+ "vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
414
+ "vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
415
+ "vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00003.safetensors",
416
+ "vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00003.safetensors",
417
+ "vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00003.safetensors",
418
+ "vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00003.safetensors",
419
+ "vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00003.safetensors",
420
+ "vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00003.safetensors",
421
+ "vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00003.safetensors",
422
+ "vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00003.safetensors",
423
+ "vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
424
+ "vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
425
+ "vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
426
+ "vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
427
+ "vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
428
+ "vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
429
+ "vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
430
+ "vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
431
+ "vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00003.safetensors",
432
+ "vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00003.safetensors",
433
+ "vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00003.safetensors",
434
+ "vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00003.safetensors",
435
+ "vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00003.safetensors",
436
+ "vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00003.safetensors",
437
+ "vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00003.safetensors",
438
+ "vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00003.safetensors",
439
+ "vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
440
+ "vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
441
+ "vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
442
+ "vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
443
+ "vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
444
+ "vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
445
+ "vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
446
+ "vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
447
+ "vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00003.safetensors",
448
+ "vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00003.safetensors",
449
+ "vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00003.safetensors",
450
+ "vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00003.safetensors",
451
+ "vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00003.safetensors",
452
+ "vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00003.safetensors",
453
+ "vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00003.safetensors",
454
+ "vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00003.safetensors",
455
+ "vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
456
+ "vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
457
+ "vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
458
+ "vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
459
+ "vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
460
+ "vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
461
+ "vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
462
+ "vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
463
+ "vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00003.safetensors",
464
+ "vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00003.safetensors",
465
+ "vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00003.safetensors",
466
+ "vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00003.safetensors",
467
+ "vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00003.safetensors",
468
+ "vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00003.safetensors",
469
+ "vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00003.safetensors",
470
+ "vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00003.safetensors",
471
+ "vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
472
+ "vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
473
+ "vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
474
+ "vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
475
+ "vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
476
+ "vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
477
+ "vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
478
+ "vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
479
+ "vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00003.safetensors",
480
+ "vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00003.safetensors",
481
+ "vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00003.safetensors",
482
+ "vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00003.safetensors",
483
+ "vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00003.safetensors",
484
+ "vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00003.safetensors",
485
+ "vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00003.safetensors",
486
+ "vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00003.safetensors",
487
+ "vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
488
+ "vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
489
+ "vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
490
+ "vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
491
+ "vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
492
+ "vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
493
+ "vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
494
+ "vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
495
+ "vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00003.safetensors",
496
+ "vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00003.safetensors",
497
+ "vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00003.safetensors",
498
+ "vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00003.safetensors",
499
+ "vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00003.safetensors",
500
+ "vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00003.safetensors",
501
+ "vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00003.safetensors",
502
+ "vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00003.safetensors",
503
+ "vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
504
+ "vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
505
+ "vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
506
+ "vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
507
+ "vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
508
+ "vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
509
+ "vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
510
+ "vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
511
+ "vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00003.safetensors",
512
+ "vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00003.safetensors",
513
+ "vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00003.safetensors",
514
+ "vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00003.safetensors",
515
+ "vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00003.safetensors",
516
+ "vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00003.safetensors",
517
+ "vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00003.safetensors",
518
+ "vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00003.safetensors",
519
+ "vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
520
+ "vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
521
+ "vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
522
+ "vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
523
+ "vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
524
+ "vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
525
+ "vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
526
+ "vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
527
+ "vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00003.safetensors",
528
+ "vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00003.safetensors",
529
+ "vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00003.safetensors",
530
+ "vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00003.safetensors",
531
+ "vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00003.safetensors",
532
+ "vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00003.safetensors",
533
+ "vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00003.safetensors",
534
+ "vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00003.safetensors",
535
+ "vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
536
+ "vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
537
+ "vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
538
+ "vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
539
+ "vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
540
+ "vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
541
+ "vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
542
+ "vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
543
+ "vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00003.safetensors",
544
+ "vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00003.safetensors",
545
+ "vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00003.safetensors",
546
+ "vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00003.safetensors",
547
+ "vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00003.safetensors",
548
+ "vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00003.safetensors",
549
+ "vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00003.safetensors",
550
+ "vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00003.safetensors",
551
+ "vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
552
+ "vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
553
+ "vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
554
+ "vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
555
+ "vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
556
+ "vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
557
+ "vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
558
+ "vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
559
+ "vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00003.safetensors",
560
+ "vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00003.safetensors",
561
+ "vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00003.safetensors",
562
+ "vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00003.safetensors",
563
+ "vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00003.safetensors",
564
+ "vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00003.safetensors",
565
+ "vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00003.safetensors",
566
+ "vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00003.safetensors",
567
+ "vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
568
+ "vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
569
+ "vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
570
+ "vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
571
+ "vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
572
+ "vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
573
+ "vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
574
+ "vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
575
+ "vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00003.safetensors",
576
+ "vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00003.safetensors",
577
+ "vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00003.safetensors",
578
+ "vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00003.safetensors",
579
+ "vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00003.safetensors",
580
+ "vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00003.safetensors",
581
+ "vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00003.safetensors",
582
+ "vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00003.safetensors",
583
+ "vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
584
+ "vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
585
+ "vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
586
+ "vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
587
+ "vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
588
+ "vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
589
+ "vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
590
+ "vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
591
+ "vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00003.safetensors",
592
+ "vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00003.safetensors",
593
+ "vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00003.safetensors",
594
+ "vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00003.safetensors",
595
+ "vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00003.safetensors",
596
+ "vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00003.safetensors",
597
+ "vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00003.safetensors",
598
+ "vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00003.safetensors",
599
+ "vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
600
+ "vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
601
+ "vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00003.safetensors",
602
+ "vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00003.safetensors",
603
+ "vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
604
+ "vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
605
+ "vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
606
+ "vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
607
+ "vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00003.safetensors",
608
+ "vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00003.safetensors"
609
+ }
610
+ }
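
The `weight_map` above follows the standard sharded-safetensors index layout: each parameter name points to the shard file that stores it, so a loader only has to open the shards it actually needs. Below is a minimal sketch of reading one tensor through that map; it assumes a local clone of this repository, and the parameter name is simply one key taken from the map above.

```python
# Sketch: resolve a tensor via model.safetensors.index.json (assumes a local clone).
import json
from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "language_model.model.norm.weight"   # any key present in weight_map
shard = index["weight_map"][name]           # e.g. "model-00003-of-00003.safetensors"

# Open only the shard that holds this parameter and read the single tensor.
with safe_open(shard, framework="pt") as shard_file:
    tensor = shard_file.get_tensor(name)

print(name, tuple(tensor.shape), "loaded from", shard)
```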
preprocessor_config.json ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "_valid_processor_keys": [
3
+ "images",
4
+ "do_resize",
5
+ "size",
6
+ "resample",
7
+ "do_rescale",
8
+ "rescale_factor",
9
+ "do_normalize",
10
+ "image_mean",
11
+ "image_std",
12
+ "return_tensors",
13
+ "data_format",
14
+ "input_data_format",
15
+ "do_convert_rgb"
16
+ ],
17
+ "do_convert_rgb": null,
18
+ "do_normalize": true,
19
+ "do_rescale": true,
20
+ "do_resize": true,
21
+ "image_mean": [
22
+ 0.5,
23
+ 0.5,
24
+ 0.5
25
+ ],
26
+ "image_processor_type": "SiglipImageProcessor",
27
+ "image_seq_length": 1024,
28
+ "image_std": [
29
+ 0.5,
30
+ 0.5,
31
+ 0.5
32
+ ],
33
+ "processor_class": "PaliGemmaProcessor",
34
+ "resample": 3,
35
+ "rescale_factor": 0.00392156862745098,
36
+ "size": {
37
+ "height": 448,
38
+ "width": 448
39
+ }
40
+ }
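
`preprocessor_config.json` configures the `SiglipImageProcessor` used by `PaliGemmaProcessor`: resize to 448×448, rescale by 1/255 (0.00392…), normalize with mean and std of 0.5, and an `image_seq_length` of 1024 image tokens. A minimal usage sketch follows, assuming the files are loaded from this repository; the repo id and image path below are hypothetical placeholders.

```python
# Sketch: load the processor and preprocess one image + prompt.
# The repo id is a placeholder for wherever these files are hosted.
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("your-username/your-paligemma-repo")  # hypothetical id
image = Image.open("example.jpg").convert("RGB")                                # any local image

inputs = processor(images=image, text="caption en", return_tensors="pt")
print(inputs["pixel_values"].shape)  # expected: (1, 3, 448, 448) given the config above
print(inputs["input_ids"].shape)     # 1024 image tokens prepended to the text tokens
```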
special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<image>"
4
+ ],
5
+ "bos_token": {
6
+ "content": "<bos>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "eos_token": {
13
+ "content": "<eos>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false
18
+ },
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
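
`special_tokens_map.json` registers `<bos>`, `<eos>`, `<pad>`, `<unk>` and the additional `<image>` placeholder that the processor expands into the image-token prefix. A quick way to confirm how the tokenizer exposes them (again a sketch; the repo id is a placeholder):

```python
# Sketch: inspect the special tokens declared above.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("your-username/your-paligemma-repo")  # hypothetical id
print(tok.bos_token, tok.eos_token, tok.pad_token, tok.unk_token)         # <bos> <eos> <pad> <unk>
print(tok.convert_tokens_to_ids("<image>"))                               # id of the image placeholder
```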
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef6773c135b77b834de1d13c75a4c98ab7a3684ffd602d1831e1f1bf5467c563
3
+ size 17549604
tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8986bb4f423f07f8c7f70d0dbe3526fb2316056c17bae71b1ea975e77a168fc6
3
+ size 4264023
tokenizer_config.json ADDED
@@ -0,0 +1,1764 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<pad>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<eos>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<bos>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<unk>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "<mask>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "5": {
46
+ "content": "<2mass>",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "6": {
54
+ "content": "[@BOS@]",
55
+ "lstrip": false,
56
+ "normalized": true,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "7": {
62
+ "content": "<unused0>",
63
+ "lstrip": false,
64
+ "normalized": true,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "8": {
70
+ "content": "<unused1>",
71
+ "lstrip": false,
72
+ "normalized": true,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "9": {
78
+ "content": "<unused2>",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": false
84
+ },
85
+ "10": {
86
+ "content": "<unused3>",
87
+ "lstrip": false,
88
+ "normalized": true,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": false
92
+ },
93
+ "11": {
94
+ "content": "<unused4>",
95
+ "lstrip": false,
96
+ "normalized": true,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": false
100
+ },
101
+ "12": {
102
+ "content": "<unused5>",
103
+ "lstrip": false,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": false
108
+ },
109
+ "13": {
110
+ "content": "<unused6>",
111
+ "lstrip": false,
112
+ "normalized": true,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": false
116
+ },
117
+ "14": {
118
+ "content": "<unused7>",
119
+ "lstrip": false,
120
+ "normalized": true,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "15": {
126
+ "content": "<unused8>",
127
+ "lstrip": false,
128
+ "normalized": true,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "16": {
134
+ "content": "<unused9>",
135
+ "lstrip": false,
136
+ "normalized": true,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "17": {
142
+ "content": "<unused10>",
143
+ "lstrip": false,
144
+ "normalized": true,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "18": {
150
+ "content": "<unused11>",
151
+ "lstrip": false,
152
+ "normalized": true,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "19": {
158
+ "content": "<unused12>",
159
+ "lstrip": false,
160
+ "normalized": true,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "20": {
166
+ "content": "<unused13>",
167
+ "lstrip": false,
168
+ "normalized": true,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "21": {
174
+ "content": "<unused14>",
175
+ "lstrip": false,
176
+ "normalized": true,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "22": {
182
+ "content": "<unused15>",
183
+ "lstrip": false,
184
+ "normalized": true,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "23": {
190
+ "content": "<unused16>",
191
+ "lstrip": false,
192
+ "normalized": true,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "24": {
198
+ "content": "<unused17>",
199
+ "lstrip": false,
200
+ "normalized": true,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "25": {
206
+ "content": "<unused18>",
207
+ "lstrip": false,
208
+ "normalized": true,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "26": {
214
+ "content": "<unused19>",
215
+ "lstrip": false,
216
+ "normalized": true,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": false
220
+ },
221
+ "27": {
222
+ "content": "<unused20>",
223
+ "lstrip": false,
224
+ "normalized": true,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": false
228
+ },
229
+ "28": {
230
+ "content": "<unused21>",
231
+ "lstrip": false,
232
+ "normalized": true,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": false
236
+ },
237
+ "29": {
238
+ "content": "<unused22>",
239
+ "lstrip": false,
240
+ "normalized": true,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": false
244
+ },
245
+ "30": {
246
+ "content": "<unused23>",
247
+ "lstrip": false,
248
+ "normalized": true,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": false
252
+ },
253
+ "31": {
254
+ "content": "<unused24>",
255
+ "lstrip": false,
256
+ "normalized": true,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": false
260
+ },
261
+ "32": {
262
+ "content": "<unused25>",
263
+ "lstrip": false,
264
+ "normalized": true,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": false
268
+ },
269
+ "33": {
270
+ "content": "<unused26>",
271
+ "lstrip": false,
272
+ "normalized": true,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": false
276
+ },
277
+ "34": {
278
+ "content": "<unused27>",
279
+ "lstrip": false,
280
+ "normalized": true,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": false
284
+ },
285
+ "35": {
286
+ "content": "<unused28>",
287
+ "lstrip": false,
288
+ "normalized": true,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": false
292
+ },
293
+ "36": {
294
+ "content": "<unused29>",
295
+ "lstrip": false,
296
+ "normalized": true,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": false
300
+ },
301
+ "37": {
302
+ "content": "<unused30>",
303
+ "lstrip": false,
304
+ "normalized": true,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": false
308
+ },
309
+ "38": {
310
+ "content": "<unused31>",
311
+ "lstrip": false,
312
+ "normalized": true,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": false
316
+ },
317
+ "39": {
318
+ "content": "<unused32>",
319
+ "lstrip": false,
320
+ "normalized": true,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": false
324
+ },
325
+ "40": {
326
+ "content": "<unused33>",
327
+ "lstrip": false,
328
+ "normalized": true,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": false
332
+ },
333
+ "41": {
334
+ "content": "<unused34>",
335
+ "lstrip": false,
336
+ "normalized": true,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": false
340
+ },
341
+ "42": {
342
+ "content": "<unused35>",
343
+ "lstrip": false,
344
+ "normalized": true,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": false
348
+ },
349
+ "43": {
350
+ "content": "<unused36>",
351
+ "lstrip": false,
352
+ "normalized": true,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": false
356
+ },
357
+ "44": {
358
+ "content": "<unused37>",
359
+ "lstrip": false,
360
+ "normalized": true,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": false
364
+ },
365
+ "45": {
366
+ "content": "<unused38>",
367
+ "lstrip": false,
368
+ "normalized": true,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": false
372
+ },
373
+ "46": {
374
+ "content": "<unused39>",
375
+ "lstrip": false,
376
+ "normalized": true,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": false
380
+ },
381
+ "47": {
382
+ "content": "<unused40>",
383
+ "lstrip": false,
384
+ "normalized": true,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": false
388
+ },
389
+ "48": {
390
+ "content": "<unused41>",
391
+ "lstrip": false,
392
+ "normalized": true,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": false
396
+ },
397
+ "49": {
398
+ "content": "<unused42>",
399
+ "lstrip": false,
400
+ "normalized": true,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": false
404
+ },
405
+ "50": {
406
+ "content": "<unused43>",
407
+ "lstrip": false,
408
+ "normalized": true,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": false
412
+ },
413
+ "51": {
414
+ "content": "<unused44>",
415
+ "lstrip": false,
416
+ "normalized": true,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": false
420
+ },
421
+ "52": {
422
+ "content": "<unused45>",
423
+ "lstrip": false,
424
+ "normalized": true,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": false
428
+ },
429
+ "53": {
430
+ "content": "<unused46>",
431
+ "lstrip": false,
432
+ "normalized": true,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": false
436
+ },
437
+ "54": {
438
+ "content": "<unused47>",
439
+ "lstrip": false,
440
+ "normalized": true,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": false
444
+ },
445
+ "55": {
446
+ "content": "<unused48>",
447
+ "lstrip": false,
448
+ "normalized": true,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": false
452
+ },
453
+ "56": {
454
+ "content": "<unused49>",
455
+ "lstrip": false,
456
+ "normalized": true,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": false
460
+ },
461
+ "57": {
462
+ "content": "<unused50>",
463
+ "lstrip": false,
464
+ "normalized": true,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": false
468
+ },
469
+ "58": {
470
+ "content": "<unused51>",
471
+ "lstrip": false,
472
+ "normalized": true,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": false
476
+ },
477
+ "59": {
478
+ "content": "<unused52>",
479
+ "lstrip": false,
480
+ "normalized": true,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": false
484
+ },
485
+ "60": {
486
+ "content": "<unused53>",
487
+ "lstrip": false,
488
+ "normalized": true,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": false
492
+ },
493
+ "61": {
494
+ "content": "<unused54>",
495
+ "lstrip": false,
496
+ "normalized": true,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": false
500
+ },
501
+ "62": {
502
+ "content": "<unused55>",
503
+ "lstrip": false,
504
+ "normalized": true,
505
+ "rstrip": false,
506
+ "single_word": false,
507
+ "special": false
508
+ },
509
+ "63": {
510
+ "content": "<unused56>",
511
+ "lstrip": false,
512
+ "normalized": true,
513
+ "rstrip": false,
514
+ "single_word": false,
515
+ "special": false
516
+ },
517
+ "64": {
518
+ "content": "<unused57>",
519
+ "lstrip": false,
520
+ "normalized": true,
521
+ "rstrip": false,
522
+ "single_word": false,
523
+ "special": false
524
+ },
525
+ "65": {
526
+ "content": "<unused58>",
527
+ "lstrip": false,
528
+ "normalized": true,
529
+ "rstrip": false,
530
+ "single_word": false,
531
+ "special": false
532
+ },
533
+ "66": {
534
+ "content": "<unused59>",
535
+ "lstrip": false,
536
+ "normalized": true,
537
+ "rstrip": false,
538
+ "single_word": false,
539
+ "special": false
540
+ },
541
+ "67": {
542
+ "content": "<unused60>",
543
+ "lstrip": false,
544
+ "normalized": true,
545
+ "rstrip": false,
546
+ "single_word": false,
547
+ "special": false
548
+ },
549
+ "68": {
550
+ "content": "<unused61>",
551
+ "lstrip": false,
552
+ "normalized": true,
553
+ "rstrip": false,
554
+ "single_word": false,
555
+ "special": false
556
+ },
557
+ "69": {
558
+ "content": "<unused62>",
559
+ "lstrip": false,
560
+ "normalized": true,
561
+ "rstrip": false,
562
+ "single_word": false,
563
+ "special": false
564
+ },
565
+ "70": {
566
+ "content": "<unused63>",
567
+ "lstrip": false,
568
+ "normalized": true,
569
+ "rstrip": false,
570
+ "single_word": false,
571
+ "special": false
572
+ },
573
+ "71": {
574
+ "content": "<unused64>",
575
+ "lstrip": false,
576
+ "normalized": true,
577
+ "rstrip": false,
578
+ "single_word": false,
579
+ "special": false
580
+ },
581
+ "72": {
582
+ "content": "<unused65>",
583
+ "lstrip": false,
584
+ "normalized": true,
585
+ "rstrip": false,
586
+ "single_word": false,
587
+ "special": false
588
+ },
589
+ "73": {
590
+ "content": "<unused66>",
591
+ "lstrip": false,
592
+ "normalized": true,
593
+ "rstrip": false,
594
+ "single_word": false,
595
+ "special": false
596
+ },
597
+ "74": {
598
+ "content": "<unused67>",
599
+ "lstrip": false,
600
+ "normalized": true,
601
+ "rstrip": false,
602
+ "single_word": false,
603
+ "special": false
604
+ },
605
+ "75": {
606
+ "content": "<unused68>",
607
+ "lstrip": false,
608
+ "normalized": true,
609
+ "rstrip": false,
610
+ "single_word": false,
611
+ "special": false
612
+ },
613
+ "76": {
614
+ "content": "<unused69>",
615
+ "lstrip": false,
616
+ "normalized": true,
617
+ "rstrip": false,
618
+ "single_word": false,
619
+ "special": false
620
+ },
621
+ "77": {
622
+ "content": "<unused70>",
623
+ "lstrip": false,
624
+ "normalized": true,
625
+ "rstrip": false,
626
+ "single_word": false,
627
+ "special": false
628
+ },
629
+ "78": {
630
+ "content": "<unused71>",
631
+ "lstrip": false,
632
+ "normalized": true,
633
+ "rstrip": false,
634
+ "single_word": false,
635
+ "special": false
636
+ },
637
+ "79": {
638
+ "content": "<unused72>",
639
+ "lstrip": false,
640
+ "normalized": true,
641
+ "rstrip": false,
642
+ "single_word": false,
643
+ "special": false
644
+ },
645
+ "80": {
646
+ "content": "<unused73>",
647
+ "lstrip": false,
648
+ "normalized": true,
649
+ "rstrip": false,
650
+ "single_word": false,
651
+ "special": false
652
+ },
653
+ "81": {
654
+ "content": "<unused74>",
655
+ "lstrip": false,
656
+ "normalized": true,
657
+ "rstrip": false,
658
+ "single_word": false,
659
+ "special": false
660
+ },
661
+ "82": {
662
+ "content": "<unused75>",
663
+ "lstrip": false,
664
+ "normalized": true,
665
+ "rstrip": false,
666
+ "single_word": false,
667
+ "special": false
668
+ },
669
+ "83": {
670
+ "content": "<unused76>",
671
+ "lstrip": false,
672
+ "normalized": true,
673
+ "rstrip": false,
674
+ "single_word": false,
675
+ "special": false
676
+ },
677
+ "84": {
678
+ "content": "<unused77>",
679
+ "lstrip": false,
680
+ "normalized": true,
681
+ "rstrip": false,
682
+ "single_word": false,
683
+ "special": false
684
+ },
685
+ "85": {
686
+ "content": "<unused78>",
687
+ "lstrip": false,
688
+ "normalized": true,
689
+ "rstrip": false,
690
+ "single_word": false,
691
+ "special": false
692
+ },
693
+ "86": {
694
+ "content": "<unused79>",
695
+ "lstrip": false,
696
+ "normalized": true,
697
+ "rstrip": false,
698
+ "single_word": false,
699
+ "special": false
700
+ },
701
+ "87": {
702
+ "content": "<unused80>",
703
+ "lstrip": false,
704
+ "normalized": true,
705
+ "rstrip": false,
706
+ "single_word": false,
707
+ "special": false
708
+ },
709
+ "88": {
710
+ "content": "<unused81>",
711
+ "lstrip": false,
712
+ "normalized": true,
713
+ "rstrip": false,
714
+ "single_word": false,
715
+ "special": false
716
+ },
717
+ "89": {
718
+ "content": "<unused82>",
719
+ "lstrip": false,
720
+ "normalized": true,
721
+ "rstrip": false,
722
+ "single_word": false,
723
+ "special": false
724
+ },
725
+ "90": {
726
+ "content": "<unused83>",
727
+ "lstrip": false,
728
+ "normalized": true,
729
+ "rstrip": false,
730
+ "single_word": false,
731
+ "special": false
732
+ },
733
+ "91": {
734
+ "content": "<unused84>",
735
+ "lstrip": false,
736
+ "normalized": true,
737
+ "rstrip": false,
738
+ "single_word": false,
739
+ "special": false
740
+ },
741
+ "92": {
742
+ "content": "<unused85>",
743
+ "lstrip": false,
744
+ "normalized": true,
745
+ "rstrip": false,
746
+ "single_word": false,
747
+ "special": false
748
+ },
749
+ "93": {
750
+ "content": "<unused86>",
751
+ "lstrip": false,
752
+ "normalized": true,
753
+ "rstrip": false,
754
+ "single_word": false,
755
+ "special": false
756
+ },
757
+ "94": {
758
+ "content": "<unused87>",
759
+ "lstrip": false,
760
+ "normalized": true,
761
+ "rstrip": false,
762
+ "single_word": false,
763
+ "special": false
764
+ },
765
+ "95": {
766
+ "content": "<unused88>",
767
+ "lstrip": false,
768
+ "normalized": true,
769
+ "rstrip": false,
770
+ "single_word": false,
771
+ "special": false
772
+ },
773
+ "96": {
774
+ "content": "<unused89>",
775
+ "lstrip": false,
776
+ "normalized": true,
777
+ "rstrip": false,
778
+ "single_word": false,
779
+ "special": false
780
+ },
781
+ "97": {
782
+ "content": "<unused90>",
783
+ "lstrip": false,
784
+ "normalized": true,
785
+ "rstrip": false,
786
+ "single_word": false,
787
+ "special": false
788
+ },
789
+ "98": {
790
+ "content": "<unused91>",
791
+ "lstrip": false,
792
+ "normalized": true,
793
+ "rstrip": false,
794
+ "single_word": false,
795
+ "special": false
796
+ },
797
+ "99": {
798
+ "content": "<unused92>",
799
+ "lstrip": false,
800
+ "normalized": true,
801
+ "rstrip": false,
802
+ "single_word": false,
803
+ "special": false
804
+ },
805
+ "100": {
806
+ "content": "<unused93>",
807
+ "lstrip": false,
808
+ "normalized": true,
809
+ "rstrip": false,
810
+ "single_word": false,
811
+ "special": false
812
+ },
813
+ "101": {
814
+ "content": "<unused94>",
815
+ "lstrip": false,
816
+ "normalized": true,
817
+ "rstrip": false,
818
+ "single_word": false,
819
+ "special": false
820
+ },
821
+ "102": {
822
+ "content": "<unused95>",
823
+ "lstrip": false,
824
+ "normalized": true,
825
+ "rstrip": false,
826
+ "single_word": false,
827
+ "special": false
828
+ },
829
+ "103": {
830
+ "content": "<unused96>",
831
+ "lstrip": false,
832
+ "normalized": true,
833
+ "rstrip": false,
834
+ "single_word": false,
835
+ "special": false
836
+ },
837
+ "104": {
838
+ "content": "<unused97>",
839
+ "lstrip": false,
840
+ "normalized": true,
841
+ "rstrip": false,
842
+ "single_word": false,
843
+ "special": false
844
+ },
845
+ "105": {
846
+ "content": "<unused98>",
847
+ "lstrip": false,
848
+ "normalized": true,
849
+ "rstrip": false,
850
+ "single_word": false,
851
+ "special": false
852
+ },
853
+ "106": {
854
+ "content": "<start_of_turn>",
855
+ "lstrip": false,
856
+ "normalized": true,
857
+ "rstrip": false,
858
+ "single_word": false,
859
+ "special": false
860
+ },
861
+ "107": {
862
+ "content": "<end_of_turn>",
863
+ "lstrip": false,
864
+ "normalized": true,
865
+ "rstrip": false,
866
+ "single_word": false,
867
+ "special": false
868
+ },
869
+ "108": {
870
+ "content": "\n",
871
+ "lstrip": false,
872
+ "normalized": true,
873
+ "rstrip": false,
874
+ "single_word": false,
875
+ "special": false
876
+ },
877
+ "109": {
878
+ "content": "\n\n",
879
+ "lstrip": false,
880
+ "normalized": true,
881
+ "rstrip": false,
882
+ "single_word": false,
883
+ "special": false
884
+ },
885
+ "110": {
886
+ "content": "\n\n\n",
887
+ "lstrip": false,
888
+ "normalized": true,
889
+ "rstrip": false,
890
+ "single_word": false,
891
+ "special": false
892
+ },
893
+ "111": {
894
+ "content": "\n\n\n\n",
895
+ "lstrip": false,
896
+ "normalized": true,
897
+ "rstrip": false,
898
+ "single_word": false,
899
+ "special": false
900
+ },
901
+ "112": {
902
+ "content": "\n\n\n\n\n",
903
+ "lstrip": false,
904
+ "normalized": true,
905
+ "rstrip": false,
906
+ "single_word": false,
907
+ "special": false
908
+ },
909
+ "113": {
910
+ "content": "\n\n\n\n\n\n",
911
+ "lstrip": false,
912
+ "normalized": true,
913
+ "rstrip": false,
914
+ "single_word": false,
915
+ "special": false
916
+ },
917
+ "114": {
918
+ "content": "\n\n\n\n\n\n\n",
919
+ "lstrip": false,
920
+ "normalized": true,
921
+ "rstrip": false,
922
+ "single_word": false,
923
+ "special": false
924
+ },
925
+ "115": {
926
+ "content": "\n\n\n\n\n\n\n\n",
927
+ "lstrip": false,
928
+ "normalized": true,
929
+ "rstrip": false,
930
+ "single_word": false,
931
+ "special": false
932
+ },
933
+ "116": {
934
+ "content": "\n\n\n\n\n\n\n\n\n",
935
+ "lstrip": false,
936
+ "normalized": true,
937
+ "rstrip": false,
938
+ "single_word": false,
939
+ "special": false
940
+ },
941
+ "117": {
942
+ "content": "\n\n\n\n\n\n\n\n\n\n",
943
+ "lstrip": false,
944
+ "normalized": true,
945
+ "rstrip": false,
946
+ "single_word": false,
947
+ "special": false
948
+ },
949
+ "118": {
950
+ "content": "\n\n\n\n\n\n\n\n\n\n\n",
951
+ "lstrip": false,
952
+ "normalized": true,
953
+ "rstrip": false,
954
+ "single_word": false,
955
+ "special": false
956
+ },
957
+ "119": {
958
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n",
959
+ "lstrip": false,
960
+ "normalized": true,
961
+ "rstrip": false,
962
+ "single_word": false,
963
+ "special": false
964
+ },
965
+ "120": {
966
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n",
967
+ "lstrip": false,
968
+ "normalized": true,
969
+ "rstrip": false,
970
+ "single_word": false,
971
+ "special": false
972
+ },
973
+ "121": {
974
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
975
+ "lstrip": false,
976
+ "normalized": true,
977
+ "rstrip": false,
978
+ "single_word": false,
979
+ "special": false
980
+ },
981
+ "122": {
982
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
983
+ "lstrip": false,
984
+ "normalized": true,
985
+ "rstrip": false,
986
+ "single_word": false,
987
+ "special": false
988
+ },
989
+ "123": {
990
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
991
+ "lstrip": false,
992
+ "normalized": true,
993
+ "rstrip": false,
994
+ "single_word": false,
995
+ "special": false
996
+ },
997
+ "124": {
998
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
999
+ "lstrip": false,
1000
+ "normalized": true,
1001
+ "rstrip": false,
1002
+ "single_word": false,
1003
+ "special": false
1004
+ },
1005
+ "125": {
1006
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1007
+ "lstrip": false,
1008
+ "normalized": true,
1009
+ "rstrip": false,
1010
+ "single_word": false,
1011
+ "special": false
1012
+ },
1013
+ "126": {
1014
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1015
+ "lstrip": false,
1016
+ "normalized": true,
1017
+ "rstrip": false,
1018
+ "single_word": false,
1019
+ "special": false
1020
+ },
1021
+ "127": {
1022
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1023
+ "lstrip": false,
1024
+ "normalized": true,
1025
+ "rstrip": false,
1026
+ "single_word": false,
1027
+ "special": false
1028
+ },
1029
+ "128": {
1030
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1031
+ "lstrip": false,
1032
+ "normalized": true,
1033
+ "rstrip": false,
1034
+ "single_word": false,
1035
+ "special": false
1036
+ },
1037
+ "129": {
1038
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1039
+ "lstrip": false,
1040
+ "normalized": true,
1041
+ "rstrip": false,
1042
+ "single_word": false,
1043
+ "special": false
1044
+ },
1045
+ "130": {
1046
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1047
+ "lstrip": false,
1048
+ "normalized": true,
1049
+ "rstrip": false,
1050
+ "single_word": false,
1051
+ "special": false
1052
+ },
1053
+ "131": {
1054
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1055
+ "lstrip": false,
1056
+ "normalized": true,
1057
+ "rstrip": false,
1058
+ "single_word": false,
1059
+ "special": false
1060
+ },
1061
+ "132": {
1062
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1063
+ "lstrip": false,
1064
+ "normalized": true,
1065
+ "rstrip": false,
1066
+ "single_word": false,
1067
+ "special": false
1068
+ },
1069
+ "133": {
1070
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1071
+ "lstrip": false,
1072
+ "normalized": true,
1073
+ "rstrip": false,
1074
+ "single_word": false,
1075
+ "special": false
1076
+ },
1077
+ "134": {
1078
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1079
+ "lstrip": false,
1080
+ "normalized": true,
1081
+ "rstrip": false,
1082
+ "single_word": false,
1083
+ "special": false
1084
+ },
1085
+ "135": {
1086
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1087
+ "lstrip": false,
1088
+ "normalized": true,
1089
+ "rstrip": false,
1090
+ "single_word": false,
1091
+ "special": false
1092
+ },
1093
+ "136": {
1094
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1095
+ "lstrip": false,
1096
+ "normalized": true,
1097
+ "rstrip": false,
1098
+ "single_word": false,
1099
+ "special": false
1100
+ },
1101
+ "137": {
1102
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1103
+ "lstrip": false,
1104
+ "normalized": true,
1105
+ "rstrip": false,
1106
+ "single_word": false,
1107
+ "special": false
1108
+ },
1109
+ "138": {
1110
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1111
+ "lstrip": false,
1112
+ "normalized": true,
1113
+ "rstrip": false,
1114
+ "single_word": false,
1115
+ "special": false
1116
+ },
1117
+ "139": {
1118
+ "content": "▁▁",
1119
+ "lstrip": false,
1120
+ "normalized": true,
1121
+ "rstrip": false,
1122
+ "single_word": false,
1123
+ "special": false
1124
+ },
1125
+ "140": {
1126
+ "content": "▁▁▁",
1127
+ "lstrip": false,
1128
+ "normalized": true,
1129
+ "rstrip": false,
1130
+ "single_word": false,
1131
+ "special": false
1132
+ },
1133
+ "141": {
1134
+ "content": "▁▁▁▁",
1135
+ "lstrip": false,
1136
+ "normalized": true,
1137
+ "rstrip": false,
1138
+ "single_word": false,
1139
+ "special": false
1140
+ },
1141
+ "142": {
1142
+ "content": "▁▁▁▁▁",
1143
+ "lstrip": false,
1144
+ "normalized": true,
1145
+ "rstrip": false,
1146
+ "single_word": false,
1147
+ "special": false
1148
+ },
1149
+ "143": {
1150
+ "content": "▁▁▁▁▁▁",
1151
+ "lstrip": false,
1152
+ "normalized": true,
1153
+ "rstrip": false,
1154
+ "single_word": false,
1155
+ "special": false
1156
+ },
1157
+ "144": {
1158
+ "content": "▁▁▁▁▁▁▁",
1159
+ "lstrip": false,
1160
+ "normalized": true,
1161
+ "rstrip": false,
1162
+ "single_word": false,
1163
+ "special": false
1164
+ },
1165
+ "145": {
1166
+ "content": "▁▁▁▁▁▁▁▁",
1167
+ "lstrip": false,
1168
+ "normalized": true,
1169
+ "rstrip": false,
1170
+ "single_word": false,
1171
+ "special": false
1172
+ },
1173
+ "146": {
1174
+ "content": "▁▁▁▁▁▁▁▁▁",
1175
+ "lstrip": false,
1176
+ "normalized": true,
1177
+ "rstrip": false,
1178
+ "single_word": false,
1179
+ "special": false
1180
+ },
1181
+ "147": {
1182
+ "content": "▁▁▁▁▁▁▁▁▁▁",
1183
+ "lstrip": false,
1184
+ "normalized": true,
1185
+ "rstrip": false,
1186
+ "single_word": false,
1187
+ "special": false
1188
+ },
1189
+ "148": {
1190
+ "content": "▁▁▁▁▁▁▁▁▁▁▁",
1191
+ "lstrip": false,
1192
+ "normalized": true,
1193
+ "rstrip": false,
1194
+ "single_word": false,
1195
+ "special": false
1196
+ },
1197
+ "149": {
1198
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁",
1199
+ "lstrip": false,
1200
+ "normalized": true,
1201
+ "rstrip": false,
1202
+ "single_word": false,
1203
+ "special": false
1204
+ },
1205
+ "150": {
1206
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁",
1207
+ "lstrip": false,
1208
+ "normalized": true,
1209
+ "rstrip": false,
1210
+ "single_word": false,
1211
+ "special": false
1212
+ },
1213
+ "151": {
1214
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1215
+ "lstrip": false,
1216
+ "normalized": true,
1217
+ "rstrip": false,
1218
+ "single_word": false,
1219
+ "special": false
1220
+ },
1221
+ "152": {
1222
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1223
+ "lstrip": false,
1224
+ "normalized": true,
1225
+ "rstrip": false,
1226
+ "single_word": false,
1227
+ "special": false
1228
+ },
1229
+ "153": {
1230
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1231
+ "lstrip": false,
1232
+ "normalized": true,
1233
+ "rstrip": false,
1234
+ "single_word": false,
1235
+ "special": false
1236
+ },
1237
+ "154": {
1238
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1239
+ "lstrip": false,
1240
+ "normalized": true,
1241
+ "rstrip": false,
1242
+ "single_word": false,
1243
+ "special": false
1244
+ },
1245
+ "155": {
1246
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1247
+ "lstrip": false,
1248
+ "normalized": true,
1249
+ "rstrip": false,
1250
+ "single_word": false,
1251
+ "special": false
1252
+ },
1253
+ "156": {
1254
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1255
+ "lstrip": false,
1256
+ "normalized": true,
1257
+ "rstrip": false,
1258
+ "single_word": false,
1259
+ "special": false
1260
+ },
1261
+ "157": {
1262
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1263
+ "lstrip": false,
1264
+ "normalized": true,
1265
+ "rstrip": false,
1266
+ "single_word": false,
1267
+ "special": false
1268
+ },
1269
+ "158": {
1270
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1271
+ "lstrip": false,
1272
+ "normalized": true,
1273
+ "rstrip": false,
1274
+ "single_word": false,
1275
+ "special": false
1276
+ },
1277
+ "159": {
1278
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1279
+ "lstrip": false,
1280
+ "normalized": true,
1281
+ "rstrip": false,
1282
+ "single_word": false,
1283
+ "special": false
1284
+ },
1285
+ "160": {
1286
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1287
+ "lstrip": false,
1288
+ "normalized": true,
1289
+ "rstrip": false,
1290
+ "single_word": false,
1291
+ "special": false
1292
+ },
1293
+ "161": {
1294
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1295
+ "lstrip": false,
1296
+ "normalized": true,
1297
+ "rstrip": false,
1298
+ "single_word": false,
1299
+ "special": false
1300
+ },
1301
+ "162": {
1302
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1303
+ "lstrip": false,
1304
+ "normalized": true,
1305
+ "rstrip": false,
1306
+ "single_word": false,
1307
+ "special": false
1308
+ },
1309
+ "163": {
1310
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1311
+ "lstrip": false,
1312
+ "normalized": true,
1313
+ "rstrip": false,
1314
+ "single_word": false,
1315
+ "special": false
1316
+ },
1317
+ "164": {
1318
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1319
+ "lstrip": false,
1320
+ "normalized": true,
1321
+ "rstrip": false,
1322
+ "single_word": false,
1323
+ "special": false
1324
+ },
1325
+ "165": {
1326
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1327
+ "lstrip": false,
1328
+ "normalized": true,
1329
+ "rstrip": false,
1330
+ "single_word": false,
1331
+ "special": false
1332
+ },
1333
+ "166": {
1334
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1335
+ "lstrip": false,
1336
+ "normalized": true,
1337
+ "rstrip": false,
1338
+ "single_word": false,
1339
+ "special": false
1340
+ },
1341
+ "167": {
1342
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1343
+ "lstrip": false,
1344
+ "normalized": true,
1345
+ "rstrip": false,
1346
+ "single_word": false,
1347
+ "special": false
1348
+ },
1349
+ "168": {
1350
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1351
+ "lstrip": false,
1352
+ "normalized": true,
1353
+ "rstrip": false,
1354
+ "single_word": false,
1355
+ "special": false
1356
+ },
1357
+ "169": {
1358
+ "content": "<table>",
1359
+ "lstrip": false,
1360
+ "normalized": true,
1361
+ "rstrip": false,
1362
+ "single_word": false,
1363
+ "special": false
1364
+ },
1365
+ "170": {
1366
+ "content": "<caption>",
1367
+ "lstrip": false,
1368
+ "normalized": true,
1369
+ "rstrip": false,
1370
+ "single_word": false,
1371
+ "special": false
1372
+ },
1373
+ "171": {
1374
+ "content": "<thead>",
1375
+ "lstrip": false,
1376
+ "normalized": true,
1377
+ "rstrip": false,
1378
+ "single_word": false,
1379
+ "special": false
1380
+ },
1381
+ "172": {
1382
+ "content": "<tbody>",
1383
+ "lstrip": false,
1384
+ "normalized": true,
1385
+ "rstrip": false,
1386
+ "single_word": false,
1387
+ "special": false
1388
+ },
1389
+ "173": {
1390
+ "content": "<tfoot>",
1391
+ "lstrip": false,
1392
+ "normalized": true,
1393
+ "rstrip": false,
1394
+ "single_word": false,
1395
+ "special": false
1396
+ },
1397
+ "174": {
1398
+ "content": "<tr>",
1399
+ "lstrip": false,
1400
+ "normalized": true,
1401
+ "rstrip": false,
1402
+ "single_word": false,
1403
+ "special": false
1404
+ },
1405
+ "175": {
1406
+ "content": "<th>",
1407
+ "lstrip": false,
1408
+ "normalized": true,
1409
+ "rstrip": false,
1410
+ "single_word": false,
1411
+ "special": false
1412
+ },
1413
+ "176": {
1414
+ "content": "<td>",
1415
+ "lstrip": false,
1416
+ "normalized": true,
1417
+ "rstrip": false,
1418
+ "single_word": false,
1419
+ "special": false
1420
+ },
1421
+ "177": {
1422
+ "content": "</table>",
1423
+ "lstrip": false,
1424
+ "normalized": true,
1425
+ "rstrip": false,
1426
+ "single_word": false,
1427
+ "special": false
1428
+ },
1429
+ "178": {
1430
+ "content": "</caption>",
1431
+ "lstrip": false,
1432
+ "normalized": true,
1433
+ "rstrip": false,
1434
+ "single_word": false,
1435
+ "special": false
1436
+ },
1437
+ "179": {
1438
+ "content": "</thead>",
1439
+ "lstrip": false,
1440
+ "normalized": true,
1441
+ "rstrip": false,
1442
+ "single_word": false,
1443
+ "special": false
1444
+ },
1445
+ "180": {
1446
+ "content": "</tbody>",
1447
+ "lstrip": false,
1448
+ "normalized": true,
1449
+ "rstrip": false,
1450
+ "single_word": false,
1451
+ "special": false
1452
+ },
1453
+ "181": {
1454
+ "content": "</tfoot>",
1455
+ "lstrip": false,
1456
+ "normalized": true,
1457
+ "rstrip": false,
1458
+ "single_word": false,
1459
+ "special": false
1460
+ },
1461
+ "182": {
1462
+ "content": "</tr>",
1463
+ "lstrip": false,
1464
+ "normalized": true,
1465
+ "rstrip": false,
1466
+ "single_word": false,
1467
+ "special": false
1468
+ },
1469
+ "183": {
1470
+ "content": "</th>",
1471
+ "lstrip": false,
1472
+ "normalized": true,
1473
+ "rstrip": false,
1474
+ "single_word": false,
1475
+ "special": false
1476
+ },
1477
+ "184": {
1478
+ "content": "</td>",
1479
+ "lstrip": false,
1480
+ "normalized": true,
1481
+ "rstrip": false,
1482
+ "single_word": false,
1483
+ "special": false
1484
+ },
1485
+ "185": {
1486
+ "content": "<h1>",
1487
+ "lstrip": false,
1488
+ "normalized": true,
1489
+ "rstrip": false,
1490
+ "single_word": false,
1491
+ "special": false
1492
+ },
1493
+ "186": {
1494
+ "content": "<h2>",
1495
+ "lstrip": false,
1496
+ "normalized": true,
1497
+ "rstrip": false,
1498
+ "single_word": false,
1499
+ "special": false
1500
+ },
1501
+ "187": {
1502
+ "content": "<h3>",
1503
+ "lstrip": false,
1504
+ "normalized": true,
1505
+ "rstrip": false,
1506
+ "single_word": false,
1507
+ "special": false
1508
+ },
1509
+ "188": {
1510
+ "content": "<h4>",
1511
+ "lstrip": false,
1512
+ "normalized": true,
1513
+ "rstrip": false,
1514
+ "single_word": false,
1515
+ "special": false
1516
+ },
1517
+ "189": {
1518
+ "content": "<h5>",
1519
+ "lstrip": false,
1520
+ "normalized": true,
1521
+ "rstrip": false,
1522
+ "single_word": false,
1523
+ "special": false
1524
+ },
1525
+ "190": {
1526
+ "content": "<h6>",
1527
+ "lstrip": false,
1528
+ "normalized": true,
1529
+ "rstrip": false,
1530
+ "single_word": false,
1531
+ "special": false
1532
+ },
1533
+ "191": {
1534
+ "content": "<blockquote>",
1535
+ "lstrip": false,
1536
+ "normalized": true,
1537
+ "rstrip": false,
1538
+ "single_word": false,
1539
+ "special": false
1540
+ },
1541
+ "192": {
1542
+ "content": "</h1>",
1543
+ "lstrip": false,
1544
+ "normalized": true,
1545
+ "rstrip": false,
1546
+ "single_word": false,
1547
+ "special": false
1548
+ },
1549
+ "193": {
1550
+ "content": "</h2>",
1551
+ "lstrip": false,
1552
+ "normalized": true,
1553
+ "rstrip": false,
1554
+ "single_word": false,
1555
+ "special": false
1556
+ },
1557
+ "194": {
1558
+ "content": "</h3>",
1559
+ "lstrip": false,
1560
+ "normalized": true,
1561
+ "rstrip": false,
1562
+ "single_word": false,
1563
+ "special": false
1564
+ },
1565
+ "195": {
1566
+ "content": "</h4>",
1567
+ "lstrip": false,
1568
+ "normalized": true,
1569
+ "rstrip": false,
1570
+ "single_word": false,
1571
+ "special": false
1572
+ },
1573
+ "196": {
1574
+ "content": "</h5>",
1575
+ "lstrip": false,
1576
+ "normalized": true,
1577
+ "rstrip": false,
1578
+ "single_word": false,
1579
+ "special": false
1580
+ },
1581
+ "197": {
1582
+ "content": "</h6>",
1583
+ "lstrip": false,
1584
+ "normalized": true,
1585
+ "rstrip": false,
1586
+ "single_word": false,
1587
+ "special": false
1588
+ },
1589
+ "198": {
1590
+ "content": "</blockquote>",
1591
+ "lstrip": false,
1592
+ "normalized": true,
1593
+ "rstrip": false,
1594
+ "single_word": false,
1595
+ "special": false
1596
+ },
1597
+ "199": {
1598
+ "content": "<strong>",
1599
+ "lstrip": false,
1600
+ "normalized": true,
1601
+ "rstrip": false,
1602
+ "single_word": false,
1603
+ "special": false
1604
+ },
1605
+ "200": {
1606
+ "content": "<em>",
1607
+ "lstrip": false,
1608
+ "normalized": true,
1609
+ "rstrip": false,
1610
+ "single_word": false,
1611
+ "special": false
1612
+ },
1613
+ "201": {
1614
+ "content": "<b>",
1615
+ "lstrip": false,
1616
+ "normalized": true,
1617
+ "rstrip": false,
1618
+ "single_word": false,
1619
+ "special": false
1620
+ },
1621
+ "202": {
1622
+ "content": "<i>",
1623
+ "lstrip": false,
1624
+ "normalized": true,
1625
+ "rstrip": false,
1626
+ "single_word": false,
1627
+ "special": false
1628
+ },
1629
+ "203": {
1630
+ "content": "<u>",
1631
+ "lstrip": false,
1632
+ "normalized": true,
1633
+ "rstrip": false,
1634
+ "single_word": false,
1635
+ "special": false
1636
+ },
1637
+ "204": {
1638
+ "content": "<s>",
1639
+ "lstrip": false,
1640
+ "normalized": true,
1641
+ "rstrip": false,
1642
+ "single_word": false,
1643
+ "special": false
1644
+ },
1645
+ "205": {
1646
+ "content": "<sub>",
1647
+ "lstrip": false,
1648
+ "normalized": true,
1649
+ "rstrip": false,
1650
+ "single_word": false,
1651
+ "special": false
1652
+ },
1653
+ "206": {
1654
+ "content": "<sup>",
1655
+ "lstrip": false,
1656
+ "normalized": true,
1657
+ "rstrip": false,
1658
+ "single_word": false,
1659
+ "special": false
1660
+ },
1661
+ "207": {
1662
+ "content": "<code>",
1663
+ "lstrip": false,
1664
+ "normalized": true,
1665
+ "rstrip": false,
1666
+ "single_word": false,
1667
+ "special": false
1668
+ },
1669
+ "208": {
1670
+ "content": "</strong>",
1671
+ "lstrip": false,
1672
+ "normalized": true,
1673
+ "rstrip": false,
1674
+ "single_word": false,
1675
+ "special": false
1676
+ },
1677
+ "209": {
1678
+ "content": "</em>",
1679
+ "lstrip": false,
1680
+ "normalized": true,
1681
+ "rstrip": false,
1682
+ "single_word": false,
1683
+ "special": false
1684
+ },
1685
+ "210": {
1686
+ "content": "</b>",
1687
+ "lstrip": false,
1688
+ "normalized": true,
1689
+ "rstrip": false,
1690
+ "single_word": false,
1691
+ "special": false
1692
+ },
1693
+ "211": {
1694
+ "content": "</i>",
1695
+ "lstrip": false,
1696
+ "normalized": true,
1697
+ "rstrip": false,
1698
+ "single_word": false,
1699
+ "special": false
1700
+ },
1701
+ "212": {
1702
+ "content": "</u>",
1703
+ "lstrip": false,
1704
+ "normalized": true,
1705
+ "rstrip": false,
1706
+ "single_word": false,
1707
+ "special": false
1708
+ },
1709
+ "213": {
1710
+ "content": "</s>",
1711
+ "lstrip": false,
1712
+ "normalized": true,
1713
+ "rstrip": false,
1714
+ "single_word": false,
1715
+ "special": false
1716
+ },
1717
+ "214": {
1718
+ "content": "</sub>",
1719
+ "lstrip": false,
1720
+ "normalized": true,
1721
+ "rstrip": false,
1722
+ "single_word": false,
1723
+ "special": false
1724
+ },
1725
+ "215": {
1726
+ "content": "</sup>",
1727
+ "lstrip": false,
1728
+ "normalized": true,
1729
+ "rstrip": false,
1730
+ "single_word": false,
1731
+ "special": false
1732
+ },
1733
+ "216": {
1734
+ "content": "</code>",
1735
+ "lstrip": false,
1736
+ "normalized": true,
1737
+ "rstrip": false,
1738
+ "single_word": false,
1739
+ "special": false
1740
+ },
1741
+ "257152": {
1742
+ "content": "<image>",
1743
+ "lstrip": false,
1744
+ "normalized": false,
1745
+ "rstrip": false,
1746
+ "single_word": false,
1747
+ "special": true
1748
+ }
1749
+ },
1750
+ "additional_special_tokens": [
1751
+ "<image>"
1752
+ ],
1753
+ "bos_token": "<bos>",
1754
+ "clean_up_tokenization_spaces": false,
1755
+ "eos_token": "<eos>",
1756
+ "model_max_length": 1000000000000000019884624838656,
1757
+ "pad_token": "<pad>",
1758
+ "processor_class": "PaliGemmaProcessor",
1759
+ "sp_model_kwargs": {},
1760
+ "spaces_between_special_tokens": false,
1761
+ "tokenizer_class": "GemmaTokenizer",
1762
+ "unk_token": "<unk>",
1763
+ "use_default_system_prompt": false
1764
+ }
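
Note: below is a minimal sketch of how the tokenizer described by this configuration could be loaded and sanity-checked with `transformers`. The repository id is a placeholder (an assumption, not the actual repo name); the token ids in the comments are taken from the `added_tokens_decoder` entries above.

from transformers import AutoTokenizer

# Placeholder repo id -- substitute the repository this commit was pushed to.
tokenizer = AutoTokenizer.from_pretrained("your-username/your-paligemma-adapter")

# Special tokens declared in the tokenizer_config.json above:
print(tokenizer.bos_token, tokenizer.bos_token_id)   # <bos>, 2
print(tokenizer.eos_token, tokenizer.eos_token_id)   # <eos>, 1
print(tokenizer.pad_token, tokenizer.pad_token_id)   # <pad>, 0
print(tokenizer.unk_token, tokenizer.unk_token_id)   # <unk>, 3
print(tokenizer.convert_tokens_to_ids("<image>"))    # 257152 (additional special token)

# add_bos_token is true and add_eos_token is false, so plain encoding
# prepends <bos> and does not append <eos>:
ids = tokenizer("caption en")["input_ids"]
print(ids[0] == tokenizer.bos_token_id)              # True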