matteogabburo committed on
Commit
46d882e
·
verified ·
1 Parent(s): 9d55c3f

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image>": 256000
3
+ }
chat_template.jinja ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {% for message in messages %}{% if message['role'] == 'user' %}user: {{ message['content'] }}
2
+ {% elif message['role'] == 'assistant' %}assistant: {{ message['content'] }}
3
+ {% endif %}{% endfor %}{% if add_generation_prompt %}assistant: {% endif %}
config.json ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "return_dict": true,
3
+ "output_hidden_states": false,
4
+ "torchscript": false,
5
+ "dtype": null,
6
+ "pruned_heads": {},
7
+ "tie_word_embeddings": true,
8
+ "chunk_size_feed_forward": 0,
9
+ "is_encoder_decoder": false,
10
+ "is_decoder": false,
11
+ "cross_attention_hidden_size": null,
12
+ "add_cross_attention": false,
13
+ "tie_encoder_decoder": false,
14
+ "architectures": [
15
+ "VillanovaVLMForConditionalGeneration"
16
+ ],
17
+ "finetuning_task": null,
18
+ "id2label": {
19
+ "0": "LABEL_0",
20
+ "1": "LABEL_1"
21
+ },
22
+ "label2id": {
23
+ "LABEL_0": 0,
24
+ "LABEL_1": 1
25
+ },
26
+ "task_specific_params": null,
27
+ "problem_type": null,
28
+ "tokenizer_class": null,
29
+ "prefix": null,
30
+ "bos_token_id": null,
31
+ "pad_token_id": null,
32
+ "eos_token_id": null,
33
+ "sep_token_id": null,
34
+ "decoder_start_token_id": null,
35
+ "max_length": 20,
36
+ "min_length": 0,
37
+ "do_sample": false,
38
+ "early_stopping": false,
39
+ "num_beams": 1,
40
+ "temperature": 1.0,
41
+ "top_k": 50,
42
+ "top_p": 1.0,
43
+ "typical_p": 1.0,
44
+ "repetition_penalty": 1.0,
45
+ "length_penalty": 1.0,
46
+ "no_repeat_ngram_size": 0,
47
+ "encoder_no_repeat_ngram_size": 0,
48
+ "bad_words_ids": null,
49
+ "num_return_sequences": 1,
50
+ "output_scores": false,
51
+ "return_dict_in_generate": false,
52
+ "forced_bos_token_id": null,
53
+ "forced_eos_token_id": null,
54
+ "remove_invalid_values": false,
55
+ "exponential_decay_length_penalty": null,
56
+ "suppress_tokens": null,
57
+ "begin_suppress_tokens": null,
58
+ "num_beam_groups": 1,
59
+ "diversity_penalty": 0.0,
60
+ "_name_or_path": "villanova-vlm",
61
+ "transformers_version": "4.57.3",
62
+ "tf_legacy_loss": false,
63
+ "use_bfloat16": false,
64
+ "vision_config": {
65
+ "hidden_size": 1024,
66
+ "image_size": 384,
67
+ "patch_size": 16,
68
+ "num_patches": 576,
69
+ "num_hidden_layers": 24,
70
+ "num_attention_heads": 16,
71
+ "intermediate_size": 4096,
72
+ "model_name": "ViT-L-16-SigLIP-384",
73
+ "pretrained": "webli"
74
+ },
75
+ "projector_config": {
76
+ "num_layers": 2,
77
+ "input_size": 1024,
78
+ "output_size": 2560,
79
+ "hidden_size": 2560,
80
+ "activation": "gelu",
81
+ "use_layer_norm": false,
82
+ "bias": true,
83
+ "output_scale": 1.0
84
+ },
85
+ "text_config": {
86
+ "architectures": [
87
+ "VillanovaVLM"
88
+ ],
89
+ "auto_map": {
90
+ "AutoConfig": "villanova_config.VillanovaVLMConfig",
91
+ "AutoModelForCausalLM": "villanova_vlm.VillanovaVLM"
92
+ },
93
+ "bos_token_id": 1,
94
+ "dtype": "float32",
95
+ "eos_token_id": 2,
96
+ "freeze_vision_encoder": true,
97
+ "image_seq_length": 576,
98
+ "image_token_index": 256000,
99
+ "model_type": "villanova_vlm",
100
+ "pad_token_id": 2,
101
+ "projector_hidden_act": "gelu",
102
+ "projector_hidden_size": 2560,
103
+ "projector_num_layers": 2,
104
+ "projector_output_scale": 1.0,
105
+ "projector_use_output_norm": false,
106
+ "text_config": {
107
+ "_name_or_path": "/media/storage/store1/gabburo/models/villanova-sal-2b-w_const_pretrain_dcos1000k-1100k_to3e-6_v2-step=1099999",
108
+ "architectures": [
109
+ "LlamaForCausalLM"
110
+ ],
111
+ "attention_bias": false,
112
+ "attention_dropout": 0.0,
113
+ "dtype": "bfloat16",
114
+ "head_dim": 128,
115
+ "hidden_act": "silu",
116
+ "hidden_size": 2560,
117
+ "initializer_range": 0.014,
118
+ "intermediate_size": 10240,
119
+ "max_position_embeddings": 4096,
120
+ "mlp_bias": false,
121
+ "model_type": "llama",
122
+ "num_attention_heads": 20,
123
+ "num_hidden_layers": 18,
124
+ "num_key_value_heads": 4,
125
+ "pretraining_tp": 1,
126
+ "rms_norm_eps": 1e-05,
127
+ "rope_scaling": null,
128
+ "rope_theta": 10000,
129
+ "tie_word_embeddings": true,
130
+ "use_cache": true,
131
+ "vocab_size": 256001
132
+ },
133
+ "transformers_version": "4.57.3",
134
+ "vision_config": {
135
+ "backend": "openclip",
136
+ "encoder_name": "ViT-L-16-SigLIP-384",
137
+ "hidden_size": 1024,
138
+ "image_size": 384,
139
+ "num_patches": 576,
140
+ "patch_size": 16,
141
+ "pretrained": "webli"
142
+ },
143
+ "vision_feature_layer": -1,
144
+ "vision_feature_select_strategy": "default"
145
+ },
146
+ "image_token_index": 256000,
147
+ "vocab_size": 256001,
148
+ "hidden_size": 2560,
149
+ "model_type": "villanova",
150
+ "output_attentions": false,
151
+ "auto_map": {
152
+ "AutoConfig": "configuration_villanova.VillanovaConfig",
153
+ "AutoModelForImageTextToText": "modeling_villanova.VillanovaVLMForConditionalGeneration",
154
+ "AutoProcessor": "processing_villanova.VillanovaProcessor"
155
+ }
156
+ }
configuration_villanova.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Villanova VLM Configuration for HuggingFace.
2
+
3
+ This is a standalone configuration file for use with trust_remote_code=True.
4
+ It contains no imports from aithlas_trainer to ensure self-containment.
5
+ """
6
+
7
+ from typing import Any
8
+
9
+ from transformers import PretrainedConfig
10
+
11
+
12
class VillanovaTextConfig(PretrainedConfig):
    """Thin ``PretrainedConfig`` wrapper around the text/LLM settings.

    The language-model settings reach the Villanova VLM config as a plain
    dict. Wrapping them in a config object supplies the ``to_dict()`` method
    that transformers' ``GenerationConfig`` requires.
    """

    model_type = "villanova_text"

    def __init__(self, **kwargs: Any) -> None:
        # No extra fields of its own: every keyword goes to the base class.
        super().__init__(**kwargs)
23
+
24
+
25
class VillanovaConfig(PretrainedConfig):
    """Configuration class for Villanova VLM.

    This configuration extends HuggingFace's PretrainedConfig to enable
    loading with AutoConfig and trust_remote_code=True.

    Args:
        vision_config: Vision encoder configuration dict. When omitted (or
            empty), the ViT-L-14-CLIPA-336 defaults below are used.
        projector_config: MLP projector configuration dict. When omitted (or
            empty), a 2-layer GELU MLP default is used.
        text_config: LLM configuration dict; wrapped in a
            ``VillanovaTextConfig``.
        image_token_index: Token ID for <image> placeholder
        vocab_size: Vocabulary size. Falls back to ``text_config["vocab_size"]``
            and then to 32000 when not given.
        hidden_size: LLM hidden dimension. Falls back to
            ``text_config["hidden_size"]`` and then to 2048 when not given.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.

    Note:
        The dicts passed in are shallow-copied, so the caller's objects are
        never modified. This matters for ``projector_config``, whose
        ``output_size``/``hidden_size`` entries are rewritten to match the LLM
        hidden size.

    Example:
        >>> config = VillanovaConfig.from_pretrained("VillanovaAI/Villanova-2B-VL-2512-Preview")
        >>> print(config.vision_config)
    """

    model_type = "villanova"

    def __init__(
        self,
        vision_config: dict[str, Any] | None = None,
        projector_config: dict[str, Any] | None = None,
        text_config: dict[str, Any] | None = None,
        image_token_index: int = 32000,
        vocab_size: int | None = None,
        hidden_size: int | None = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)

        # Vision config: copy the caller's dict, or fall back to the
        # ViT-L-14-CLIPA-336 defaults.
        if vision_config:
            self.vision_config = dict(vision_config)
        else:
            self.vision_config = {
                "hidden_size": 1024,
                "image_size": 336,
                "patch_size": 14,
                "num_patches": 576,
                "num_hidden_layers": 24,
                "num_attention_heads": 16,
                "intermediate_size": 4096,
                "model_name": "ViT-L-14-CLIPA-336",
                "pretrained": "datacomp1b",
            }

        # Projector config: copy the caller's dict (it is modified further
        # down), or fall back to a 2-layer MLP with GELU and no LayerNorm
        # (like LLaVA).
        if projector_config:
            self.projector_config = dict(projector_config)
        else:
            self.projector_config = {
                "num_layers": 2,
                "input_size": 1024,
                "output_size": 2048,
                "hidden_size": 2048,
                "activation": "gelu",
                "use_layer_norm": False,  # No LayerNorm on output (like LLaVA)
                "bias": True,
            }

        # Text/LLM config - wrap as PretrainedConfig for compatibility with
        # GenerationConfig.
        text_config_dict = dict(text_config) if text_config else {}
        self.text_config = VillanovaTextConfig(**text_config_dict)

        # Special tokens
        self.image_token_index = image_token_index

        # Derive from text_config if not provided
        self.vocab_size = vocab_size or text_config_dict.get("vocab_size", 32000)
        self.hidden_size = hidden_size or text_config_dict.get("hidden_size", 2048)

        # Keep the projector in sync with the LLM: when its output width does
        # not match the LLM hidden size, both the output and the hidden width
        # of the projector are set to the LLM hidden size. This only touches
        # the private copy made above.
        if self.projector_config.get("output_size") != self.hidden_size:
            self.projector_config["output_size"] = self.hidden_size
            self.projector_config["hidden_size"] = self.hidden_size
generation_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "max_length": 2048,
7
+ "do_sample": false,
8
+ "temperature": 1.0,
9
+ "top_p": 1.0,
10
+ "top_k": 50
11
+ }
image_processing_villanova.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Villanova VLM Image Processor for HuggingFace.
2
+
3
+ This is a standalone image processor file for use with trust_remote_code=True.
4
+ It contains no imports from aithlas_trainer to ensure self-containment.
5
+ """
6
+
7
+ from typing import Any
8
+
9
+ import numpy as np
10
+ from PIL import Image
11
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
12
+ from transformers.image_utils import (
13
+ ChannelDimension,
14
+ ImageInput,
15
+ make_list_of_images,
16
+ to_numpy_array,
17
+ valid_images,
18
+ )
19
+
20
+
21
class VillanovaImageProcessor(BaseImageProcessor):
    """Image processor for Villanova VLM.

    With its default arguments this processes images for the
    ViT-L-14-CLIPA-336 vision encoder:
    - Resize to 336x336
    - Normalize with ImageNet statistics (as used by OpenCLIP CLIPA models)
    - Convert to RGB if needed

    Every default can be overridden through the constructor or per call
    through ``preprocess``.

    Args:
        do_resize: Whether to resize images
        size: Target size {"height": 336, "width": 336}
        resample: PIL resampling filter (default: BILINEAR as used by OpenCLIP)
        do_rescale: Whether to rescale pixel values
        rescale_factor: Rescale factor (1/255)
        do_normalize: Whether to normalize
        image_mean: Normalization mean (ImageNet: [0.485, 0.456, 0.406])
        image_std: Normalization std (ImageNet: [0.229, 0.224, 0.225])
        do_convert_rgb: Convert to RGB if needed

    Example:
        >>> processor = VillanovaImageProcessor()
        >>> image = Image.open("image.jpg")
        >>> inputs = processor(image, return_tensors="pt")
        >>> print(inputs.pixel_values.shape)
        torch.Size([1, 3, 336, 336])
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: dict[str, int] | None = None,
        resample: int = 2,  # PIL.Image.BILINEAR (as used by OpenCLIP)
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: list[float] | None = None,
        image_std: list[float] | None = None,
        do_convert_rgb: bool = True,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)

        self.do_resize = do_resize
        self.size = size or {"height": 336, "width": 336}
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        # ImageNet normalization (same as OpenCLIP ViT-L-14-CLIPA-336)
        self.image_mean = image_mean or [0.485, 0.456, 0.406]
        self.image_std = image_std or [0.229, 0.224, 0.225]
        self.do_convert_rgb = do_convert_rgb

    def resize(
        self,
        image: np.ndarray,
        size: dict[str, int],
        resample: int = 2,
        data_format: ChannelDimension | None = None,
        **kwargs: Any,
    ) -> np.ndarray:
        """Resize image to target size.

        Args:
            image: Image as a numpy array (or a PIL image, which is used as is).
            size: Target size with "height" and "width" keys.
            resample: PIL resampling filter passed to ``Image.resize``.
            data_format: Accepted for interface compatibility; not used here.

        Returns:
            The resized image as a numpy array.
        """
        height, width = size["height"], size["width"]

        # Convert to PIL for resizing. The uint8 cast means array input is
        # expected to hold 0-255 pixel values (preprocess resizes before it
        # rescales).
        if isinstance(image, np.ndarray):
            pil_image = Image.fromarray(image.astype(np.uint8))
        else:
            pil_image = image

        # PIL takes the target size as (width, height).
        resized = pil_image.resize((width, height), resample=resample)

        # Convert back to numpy
        return np.array(resized)

    def rescale(
        self,
        image: np.ndarray,
        scale: float,
        data_format: ChannelDimension | None = None,
        **kwargs: Any,
    ) -> np.ndarray:
        """Rescale pixel values.

        Casts the image to float32 and multiplies it by ``scale``.
        ``data_format`` is accepted for interface compatibility and not used.
        """
        return image.astype(np.float32) * scale

    def normalize(
        self,
        image: np.ndarray,
        mean: list[float],
        std: list[float],
        data_format: ChannelDimension | None = None,
        **kwargs: Any,
    ) -> np.ndarray:
        """Normalize image with mean and std.

        The image is cast to float32. A 3-D image has ``mean`` subtracted and
        is divided by ``std``, broadcast over the last axis (HWC layout is
        assumed). An image with any other number of dimensions is returned
        without normalization. ``data_format`` is accepted for interface
        compatibility and not used.
        """
        mean = np.array(mean, dtype=np.float32)
        std = np.array(std, dtype=np.float32)

        # Ensure image is float
        image = image.astype(np.float32)

        # Normalize (assuming HWC format)
        if image.ndim == 3:
            image = (image - mean) / std

        return image

    def preprocess(
        self,
        images: ImageInput,
        do_resize: bool | None = None,
        size: dict[str, int] | None = None,
        resample: int | None = None,
        do_rescale: bool | None = None,
        rescale_factor: float | None = None,
        do_normalize: bool | None = None,
        image_mean: list[float] | None = None,
        image_std: list[float] | None = None,
        do_convert_rgb: bool | None = None,
        return_tensors: str | None = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
        **kwargs: Any,
    ) -> BatchFeature:
        """Preprocess images for the model.

        Each image goes through, in order: optional RGB conversion, conversion
        to a numpy array, optional resize, optional rescale, optional
        normalization and, for ``ChannelDimension.FIRST``, an HWC -> CHW
        transpose. The results are stacked into one batch.

        Args:
            images: Single image or list of images
            do_resize: Override resize setting
            size: Override target size
            resample: Override resampling filter
            do_rescale: Override rescale setting
            rescale_factor: Override rescale factor
            do_normalize: Override normalize setting
            image_mean: Override mean
            image_std: Override std
            do_convert_rgb: Override RGB conversion
            return_tensors: Output tensor format ("pt", "np", etc.)
            data_format: Channel dimension format

        Returns:
            BatchFeature with pixel_values

        Raises:
            ValueError: If ``valid_images`` rejects the input.
        """
        # Per-call arguments win; None means "use the value set at init".
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        # Handle single image
        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError("Invalid image input")

        processed_images = []
        for image in images:
            # Convert to RGB if needed
            if do_convert_rgb:
                if isinstance(image, Image.Image):
                    image = image.convert("RGB")
                elif isinstance(image, np.ndarray):
                    # Check dimensionality before the channel count: a 2-D
                    # grayscale array that happens to be 4 pixels wide must
                    # not be mistaken for RGBA and cropped to 3 columns.
                    if image.ndim == 2:  # Grayscale
                        image = np.stack([image] * 3, axis=-1)
                    elif image.shape[-1] == 4:  # RGBA
                        image = image[..., :3]

            # Convert to numpy
            image = to_numpy_array(image)

            # Resize
            if do_resize:
                image = self.resize(image, size, resample)

            # Rescale
            if do_rescale:
                image = self.rescale(image, rescale_factor)

            # Normalize
            if do_normalize:
                image = self.normalize(image, image_mean, image_std)

            # Convert to CHW format
            if data_format == ChannelDimension.FIRST:
                image = np.transpose(image, (2, 0, 1))

            processed_images.append(image)

        # Stack into batch
        pixel_values = np.stack(processed_images, axis=0)

        data = {"pixel_values": pixel_values}

        return BatchFeature(data=data, tensor_type=return_tensors)
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea2b1700a60c63cfb1f33fffcb4c16e6e43420e53f095b4cb74a3a101105e2b4
3
+ size 4981776984
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a2523a1e6bf438d62fd5644da5f6b3dbc2f176e4e8c9e28ca1a5b8916d07df9
3
+ size 377510136
model.safetensors.index.json ADDED
@@ -0,0 +1,481 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 5359228928
4
+ },
5
+ "weight_map": {
6
+ "vision_encoder.trunk.pos_embed": "model-00001-of-00002.safetensors",
7
+ "vision_encoder.trunk.patch_embed.proj.weight": "model-00001-of-00002.safetensors",
8
+ "vision_encoder.trunk.patch_embed.proj.bias": "model-00001-of-00002.safetensors",
9
+ "vision_encoder.trunk.blocks.0.norm1.weight": "model-00001-of-00002.safetensors",
10
+ "vision_encoder.trunk.blocks.0.norm1.bias": "model-00001-of-00002.safetensors",
11
+ "vision_encoder.trunk.blocks.0.attn.qkv.weight": "model-00001-of-00002.safetensors",
12
+ "vision_encoder.trunk.blocks.0.attn.qkv.bias": "model-00001-of-00002.safetensors",
13
+ "vision_encoder.trunk.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
14
+ "vision_encoder.trunk.blocks.0.attn.proj.bias": "model-00001-of-00002.safetensors",
15
+ "vision_encoder.trunk.blocks.0.norm2.weight": "model-00001-of-00002.safetensors",
16
+ "vision_encoder.trunk.blocks.0.norm2.bias": "model-00001-of-00002.safetensors",
17
+ "vision_encoder.trunk.blocks.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
18
+ "vision_encoder.trunk.blocks.0.mlp.fc1.bias": "model-00001-of-00002.safetensors",
19
+ "vision_encoder.trunk.blocks.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
20
+ "vision_encoder.trunk.blocks.0.mlp.fc2.bias": "model-00001-of-00002.safetensors",
21
+ "vision_encoder.trunk.blocks.1.norm1.weight": "model-00001-of-00002.safetensors",
22
+ "vision_encoder.trunk.blocks.1.norm1.bias": "model-00001-of-00002.safetensors",
23
+ "vision_encoder.trunk.blocks.1.attn.qkv.weight": "model-00001-of-00002.safetensors",
24
+ "vision_encoder.trunk.blocks.1.attn.qkv.bias": "model-00001-of-00002.safetensors",
25
+ "vision_encoder.trunk.blocks.1.attn.proj.weight": "model-00001-of-00002.safetensors",
26
+ "vision_encoder.trunk.blocks.1.attn.proj.bias": "model-00001-of-00002.safetensors",
27
+ "vision_encoder.trunk.blocks.1.norm2.weight": "model-00001-of-00002.safetensors",
28
+ "vision_encoder.trunk.blocks.1.norm2.bias": "model-00001-of-00002.safetensors",
29
+ "vision_encoder.trunk.blocks.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
30
+ "vision_encoder.trunk.blocks.1.mlp.fc1.bias": "model-00001-of-00002.safetensors",
31
+ "vision_encoder.trunk.blocks.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
32
+ "vision_encoder.trunk.blocks.1.mlp.fc2.bias": "model-00001-of-00002.safetensors",
33
+ "vision_encoder.trunk.blocks.2.norm1.weight": "model-00001-of-00002.safetensors",
34
+ "vision_encoder.trunk.blocks.2.norm1.bias": "model-00001-of-00002.safetensors",
35
+ "vision_encoder.trunk.blocks.2.attn.qkv.weight": "model-00001-of-00002.safetensors",
36
+ "vision_encoder.trunk.blocks.2.attn.qkv.bias": "model-00001-of-00002.safetensors",
37
+ "vision_encoder.trunk.blocks.2.attn.proj.weight": "model-00001-of-00002.safetensors",
38
+ "vision_encoder.trunk.blocks.2.attn.proj.bias": "model-00001-of-00002.safetensors",
39
+ "vision_encoder.trunk.blocks.2.norm2.weight": "model-00001-of-00002.safetensors",
40
+ "vision_encoder.trunk.blocks.2.norm2.bias": "model-00001-of-00002.safetensors",
41
+ "vision_encoder.trunk.blocks.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
42
+ "vision_encoder.trunk.blocks.2.mlp.fc1.bias": "model-00001-of-00002.safetensors",
43
+ "vision_encoder.trunk.blocks.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
44
+ "vision_encoder.trunk.blocks.2.mlp.fc2.bias": "model-00001-of-00002.safetensors",
45
+ "vision_encoder.trunk.blocks.3.norm1.weight": "model-00001-of-00002.safetensors",
46
+ "vision_encoder.trunk.blocks.3.norm1.bias": "model-00001-of-00002.safetensors",
47
+ "vision_encoder.trunk.blocks.3.attn.qkv.weight": "model-00001-of-00002.safetensors",
48
+ "vision_encoder.trunk.blocks.3.attn.qkv.bias": "model-00001-of-00002.safetensors",
49
+ "vision_encoder.trunk.blocks.3.attn.proj.weight": "model-00001-of-00002.safetensors",
50
+ "vision_encoder.trunk.blocks.3.attn.proj.bias": "model-00001-of-00002.safetensors",
51
+ "vision_encoder.trunk.blocks.3.norm2.weight": "model-00001-of-00002.safetensors",
52
+ "vision_encoder.trunk.blocks.3.norm2.bias": "model-00001-of-00002.safetensors",
53
+ "vision_encoder.trunk.blocks.3.mlp.fc1.weight": "model-00001-of-00002.safetensors",
54
+ "vision_encoder.trunk.blocks.3.mlp.fc1.bias": "model-00001-of-00002.safetensors",
55
+ "vision_encoder.trunk.blocks.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
56
+ "vision_encoder.trunk.blocks.3.mlp.fc2.bias": "model-00001-of-00002.safetensors",
57
+ "vision_encoder.trunk.blocks.4.norm1.weight": "model-00001-of-00002.safetensors",
58
+ "vision_encoder.trunk.blocks.4.norm1.bias": "model-00001-of-00002.safetensors",
59
+ "vision_encoder.trunk.blocks.4.attn.qkv.weight": "model-00001-of-00002.safetensors",
60
+ "vision_encoder.trunk.blocks.4.attn.qkv.bias": "model-00001-of-00002.safetensors",
61
+ "vision_encoder.trunk.blocks.4.attn.proj.weight": "model-00001-of-00002.safetensors",
62
+ "vision_encoder.trunk.blocks.4.attn.proj.bias": "model-00001-of-00002.safetensors",
63
+ "vision_encoder.trunk.blocks.4.norm2.weight": "model-00001-of-00002.safetensors",
64
+ "vision_encoder.trunk.blocks.4.norm2.bias": "model-00001-of-00002.safetensors",
65
+ "vision_encoder.trunk.blocks.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
66
+ "vision_encoder.trunk.blocks.4.mlp.fc1.bias": "model-00001-of-00002.safetensors",
67
+ "vision_encoder.trunk.blocks.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
68
+ "vision_encoder.trunk.blocks.4.mlp.fc2.bias": "model-00001-of-00002.safetensors",
69
+ "vision_encoder.trunk.blocks.5.norm1.weight": "model-00001-of-00002.safetensors",
70
+ "vision_encoder.trunk.blocks.5.norm1.bias": "model-00001-of-00002.safetensors",
71
+ "vision_encoder.trunk.blocks.5.attn.qkv.weight": "model-00001-of-00002.safetensors",
72
+ "vision_encoder.trunk.blocks.5.attn.qkv.bias": "model-00001-of-00002.safetensors",
73
+ "vision_encoder.trunk.blocks.5.attn.proj.weight": "model-00001-of-00002.safetensors",
74
+ "vision_encoder.trunk.blocks.5.attn.proj.bias": "model-00001-of-00002.safetensors",
75
+ "vision_encoder.trunk.blocks.5.norm2.weight": "model-00001-of-00002.safetensors",
76
+ "vision_encoder.trunk.blocks.5.norm2.bias": "model-00001-of-00002.safetensors",
77
+ "vision_encoder.trunk.blocks.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
78
+ "vision_encoder.trunk.blocks.5.mlp.fc1.bias": "model-00001-of-00002.safetensors",
79
+ "vision_encoder.trunk.blocks.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
80
+ "vision_encoder.trunk.blocks.5.mlp.fc2.bias": "model-00001-of-00002.safetensors",
81
+ "vision_encoder.trunk.blocks.6.norm1.weight": "model-00001-of-00002.safetensors",
82
+ "vision_encoder.trunk.blocks.6.norm1.bias": "model-00001-of-00002.safetensors",
83
+ "vision_encoder.trunk.blocks.6.attn.qkv.weight": "model-00001-of-00002.safetensors",
84
+ "vision_encoder.trunk.blocks.6.attn.qkv.bias": "model-00001-of-00002.safetensors",
85
+ "vision_encoder.trunk.blocks.6.attn.proj.weight": "model-00001-of-00002.safetensors",
86
+ "vision_encoder.trunk.blocks.6.attn.proj.bias": "model-00001-of-00002.safetensors",
87
+ "vision_encoder.trunk.blocks.6.norm2.weight": "model-00001-of-00002.safetensors",
88
+ "vision_encoder.trunk.blocks.6.norm2.bias": "model-00001-of-00002.safetensors",
89
+ "vision_encoder.trunk.blocks.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
90
+ "vision_encoder.trunk.blocks.6.mlp.fc1.bias": "model-00001-of-00002.safetensors",
91
+ "vision_encoder.trunk.blocks.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
92
+ "vision_encoder.trunk.blocks.6.mlp.fc2.bias": "model-00001-of-00002.safetensors",
93
+ "vision_encoder.trunk.blocks.7.norm1.weight": "model-00001-of-00002.safetensors",
94
+ "vision_encoder.trunk.blocks.7.norm1.bias": "model-00001-of-00002.safetensors",
95
+ "vision_encoder.trunk.blocks.7.attn.qkv.weight": "model-00001-of-00002.safetensors",
96
+ "vision_encoder.trunk.blocks.7.attn.qkv.bias": "model-00001-of-00002.safetensors",
97
+ "vision_encoder.trunk.blocks.7.attn.proj.weight": "model-00001-of-00002.safetensors",
98
+ "vision_encoder.trunk.blocks.7.attn.proj.bias": "model-00001-of-00002.safetensors",
99
+ "vision_encoder.trunk.blocks.7.norm2.weight": "model-00001-of-00002.safetensors",
100
+ "vision_encoder.trunk.blocks.7.norm2.bias": "model-00001-of-00002.safetensors",
101
+ "vision_encoder.trunk.blocks.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
102
+ "vision_encoder.trunk.blocks.7.mlp.fc1.bias": "model-00001-of-00002.safetensors",
103
+ "vision_encoder.trunk.blocks.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
104
+ "vision_encoder.trunk.blocks.7.mlp.fc2.bias": "model-00001-of-00002.safetensors",
105
+ "vision_encoder.trunk.blocks.8.norm1.weight": "model-00001-of-00002.safetensors",
106
+ "vision_encoder.trunk.blocks.8.norm1.bias": "model-00001-of-00002.safetensors",
107
+ "vision_encoder.trunk.blocks.8.attn.qkv.weight": "model-00001-of-00002.safetensors",
108
+ "vision_encoder.trunk.blocks.8.attn.qkv.bias": "model-00001-of-00002.safetensors",
109
+ "vision_encoder.trunk.blocks.8.attn.proj.weight": "model-00001-of-00002.safetensors",
110
+ "vision_encoder.trunk.blocks.8.attn.proj.bias": "model-00001-of-00002.safetensors",
111
+ "vision_encoder.trunk.blocks.8.norm2.weight": "model-00001-of-00002.safetensors",
112
+ "vision_encoder.trunk.blocks.8.norm2.bias": "model-00001-of-00002.safetensors",
113
+ "vision_encoder.trunk.blocks.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
114
+ "vision_encoder.trunk.blocks.8.mlp.fc1.bias": "model-00001-of-00002.safetensors",
115
+ "vision_encoder.trunk.blocks.8.mlp.fc2.weight": "model-00001-of-00002.safetensors",
116
+ "vision_encoder.trunk.blocks.8.mlp.fc2.bias": "model-00001-of-00002.safetensors",
117
+ "vision_encoder.trunk.blocks.9.norm1.weight": "model-00001-of-00002.safetensors",
118
+ "vision_encoder.trunk.blocks.9.norm1.bias": "model-00001-of-00002.safetensors",
119
+ "vision_encoder.trunk.blocks.9.attn.qkv.weight": "model-00001-of-00002.safetensors",
120
+ "vision_encoder.trunk.blocks.9.attn.qkv.bias": "model-00001-of-00002.safetensors",
121
+ "vision_encoder.trunk.blocks.9.attn.proj.weight": "model-00001-of-00002.safetensors",
122
+ "vision_encoder.trunk.blocks.9.attn.proj.bias": "model-00001-of-00002.safetensors",
123
+ "vision_encoder.trunk.blocks.9.norm2.weight": "model-00001-of-00002.safetensors",
124
+ "vision_encoder.trunk.blocks.9.norm2.bias": "model-00001-of-00002.safetensors",
125
+ "vision_encoder.trunk.blocks.9.mlp.fc1.weight": "model-00001-of-00002.safetensors",
126
+ "vision_encoder.trunk.blocks.9.mlp.fc1.bias": "model-00001-of-00002.safetensors",
127
+ "vision_encoder.trunk.blocks.9.mlp.fc2.weight": "model-00001-of-00002.safetensors",
128
+ "vision_encoder.trunk.blocks.9.mlp.fc2.bias": "model-00001-of-00002.safetensors",
129
+ "vision_encoder.trunk.blocks.10.norm1.weight": "model-00001-of-00002.safetensors",
130
+ "vision_encoder.trunk.blocks.10.norm1.bias": "model-00001-of-00002.safetensors",
131
+ "vision_encoder.trunk.blocks.10.attn.qkv.weight": "model-00001-of-00002.safetensors",
132
+ "vision_encoder.trunk.blocks.10.attn.qkv.bias": "model-00001-of-00002.safetensors",
133
+ "vision_encoder.trunk.blocks.10.attn.proj.weight": "model-00001-of-00002.safetensors",
134
+ "vision_encoder.trunk.blocks.10.attn.proj.bias": "model-00001-of-00002.safetensors",
135
+ "vision_encoder.trunk.blocks.10.norm2.weight": "model-00001-of-00002.safetensors",
136
+ "vision_encoder.trunk.blocks.10.norm2.bias": "model-00001-of-00002.safetensors",
137
+ "vision_encoder.trunk.blocks.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
138
+ "vision_encoder.trunk.blocks.10.mlp.fc1.bias": "model-00001-of-00002.safetensors",
139
+ "vision_encoder.trunk.blocks.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
140
+ "vision_encoder.trunk.blocks.10.mlp.fc2.bias": "model-00001-of-00002.safetensors",
141
+ "vision_encoder.trunk.blocks.11.norm1.weight": "model-00001-of-00002.safetensors",
142
+ "vision_encoder.trunk.blocks.11.norm1.bias": "model-00001-of-00002.safetensors",
143
+ "vision_encoder.trunk.blocks.11.attn.qkv.weight": "model-00001-of-00002.safetensors",
144
+ "vision_encoder.trunk.blocks.11.attn.qkv.bias": "model-00001-of-00002.safetensors",
145
+ "vision_encoder.trunk.blocks.11.attn.proj.weight": "model-00001-of-00002.safetensors",
146
+ "vision_encoder.trunk.blocks.11.attn.proj.bias": "model-00001-of-00002.safetensors",
147
+ "vision_encoder.trunk.blocks.11.norm2.weight": "model-00001-of-00002.safetensors",
148
+ "vision_encoder.trunk.blocks.11.norm2.bias": "model-00001-of-00002.safetensors",
149
+ "vision_encoder.trunk.blocks.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
150
+ "vision_encoder.trunk.blocks.11.mlp.fc1.bias": "model-00001-of-00002.safetensors",
151
+ "vision_encoder.trunk.blocks.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
152
+ "vision_encoder.trunk.blocks.11.mlp.fc2.bias": "model-00001-of-00002.safetensors",
153
+ "vision_encoder.trunk.blocks.12.norm1.weight": "model-00001-of-00002.safetensors",
154
+ "vision_encoder.trunk.blocks.12.norm1.bias": "model-00001-of-00002.safetensors",
155
+ "vision_encoder.trunk.blocks.12.attn.qkv.weight": "model-00001-of-00002.safetensors",
156
+ "vision_encoder.trunk.blocks.12.attn.qkv.bias": "model-00001-of-00002.safetensors",
157
+ "vision_encoder.trunk.blocks.12.attn.proj.weight": "model-00001-of-00002.safetensors",
158
+ "vision_encoder.trunk.blocks.12.attn.proj.bias": "model-00001-of-00002.safetensors",
159
+ "vision_encoder.trunk.blocks.12.norm2.weight": "model-00001-of-00002.safetensors",
160
+ "vision_encoder.trunk.blocks.12.norm2.bias": "model-00001-of-00002.safetensors",
161
+ "vision_encoder.trunk.blocks.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
162
+ "vision_encoder.trunk.blocks.12.mlp.fc1.bias": "model-00001-of-00002.safetensors",
163
+ "vision_encoder.trunk.blocks.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
164
+ "vision_encoder.trunk.blocks.12.mlp.fc2.bias": "model-00001-of-00002.safetensors",
165
+ "vision_encoder.trunk.blocks.13.norm1.weight": "model-00001-of-00002.safetensors",
166
+ "vision_encoder.trunk.blocks.13.norm1.bias": "model-00001-of-00002.safetensors",
167
+ "vision_encoder.trunk.blocks.13.attn.qkv.weight": "model-00001-of-00002.safetensors",
168
+ "vision_encoder.trunk.blocks.13.attn.qkv.bias": "model-00001-of-00002.safetensors",
169
+ "vision_encoder.trunk.blocks.13.attn.proj.weight": "model-00001-of-00002.safetensors",
170
+ "vision_encoder.trunk.blocks.13.attn.proj.bias": "model-00001-of-00002.safetensors",
171
+ "vision_encoder.trunk.blocks.13.norm2.weight": "model-00001-of-00002.safetensors",
172
+ "vision_encoder.trunk.blocks.13.norm2.bias": "model-00001-of-00002.safetensors",
173
+ "vision_encoder.trunk.blocks.13.mlp.fc1.weight": "model-00001-of-00002.safetensors",
174
+ "vision_encoder.trunk.blocks.13.mlp.fc1.bias": "model-00001-of-00002.safetensors",
175
+ "vision_encoder.trunk.blocks.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
176
+ "vision_encoder.trunk.blocks.13.mlp.fc2.bias": "model-00001-of-00002.safetensors",
177
+ "vision_encoder.trunk.blocks.14.norm1.weight": "model-00001-of-00002.safetensors",
178
+ "vision_encoder.trunk.blocks.14.norm1.bias": "model-00001-of-00002.safetensors",
179
+ "vision_encoder.trunk.blocks.14.attn.qkv.weight": "model-00001-of-00002.safetensors",
180
+ "vision_encoder.trunk.blocks.14.attn.qkv.bias": "model-00001-of-00002.safetensors",
181
+ "vision_encoder.trunk.blocks.14.attn.proj.weight": "model-00001-of-00002.safetensors",
182
+ "vision_encoder.trunk.blocks.14.attn.proj.bias": "model-00001-of-00002.safetensors",
183
+ "vision_encoder.trunk.blocks.14.norm2.weight": "model-00001-of-00002.safetensors",
184
+ "vision_encoder.trunk.blocks.14.norm2.bias": "model-00001-of-00002.safetensors",
185
+ "vision_encoder.trunk.blocks.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
186
+ "vision_encoder.trunk.blocks.14.mlp.fc1.bias": "model-00001-of-00002.safetensors",
187
+ "vision_encoder.trunk.blocks.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
188
+ "vision_encoder.trunk.blocks.14.mlp.fc2.bias": "model-00001-of-00002.safetensors",
189
+ "vision_encoder.trunk.blocks.15.norm1.weight": "model-00001-of-00002.safetensors",
190
+ "vision_encoder.trunk.blocks.15.norm1.bias": "model-00001-of-00002.safetensors",
191
+ "vision_encoder.trunk.blocks.15.attn.qkv.weight": "model-00001-of-00002.safetensors",
192
+ "vision_encoder.trunk.blocks.15.attn.qkv.bias": "model-00001-of-00002.safetensors",
193
+ "vision_encoder.trunk.blocks.15.attn.proj.weight": "model-00001-of-00002.safetensors",
194
+ "vision_encoder.trunk.blocks.15.attn.proj.bias": "model-00001-of-00002.safetensors",
195
+ "vision_encoder.trunk.blocks.15.norm2.weight": "model-00001-of-00002.safetensors",
196
+ "vision_encoder.trunk.blocks.15.norm2.bias": "model-00001-of-00002.safetensors",
197
+ "vision_encoder.trunk.blocks.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
198
+ "vision_encoder.trunk.blocks.15.mlp.fc1.bias": "model-00001-of-00002.safetensors",
199
+ "vision_encoder.trunk.blocks.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
200
+ "vision_encoder.trunk.blocks.15.mlp.fc2.bias": "model-00001-of-00002.safetensors",
201
+ "vision_encoder.trunk.blocks.16.norm1.weight": "model-00001-of-00002.safetensors",
202
+ "vision_encoder.trunk.blocks.16.norm1.bias": "model-00001-of-00002.safetensors",
203
+ "vision_encoder.trunk.blocks.16.attn.qkv.weight": "model-00001-of-00002.safetensors",
204
+ "vision_encoder.trunk.blocks.16.attn.qkv.bias": "model-00001-of-00002.safetensors",
205
+ "vision_encoder.trunk.blocks.16.attn.proj.weight": "model-00001-of-00002.safetensors",
206
+ "vision_encoder.trunk.blocks.16.attn.proj.bias": "model-00001-of-00002.safetensors",
207
+ "vision_encoder.trunk.blocks.16.norm2.weight": "model-00001-of-00002.safetensors",
208
+ "vision_encoder.trunk.blocks.16.norm2.bias": "model-00001-of-00002.safetensors",
209
+ "vision_encoder.trunk.blocks.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
210
+ "vision_encoder.trunk.blocks.16.mlp.fc1.bias": "model-00001-of-00002.safetensors",
211
+ "vision_encoder.trunk.blocks.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
212
+ "vision_encoder.trunk.blocks.16.mlp.fc2.bias": "model-00001-of-00002.safetensors",
213
+ "vision_encoder.trunk.blocks.17.norm1.weight": "model-00001-of-00002.safetensors",
214
+ "vision_encoder.trunk.blocks.17.norm1.bias": "model-00001-of-00002.safetensors",
215
+ "vision_encoder.trunk.blocks.17.attn.qkv.weight": "model-00001-of-00002.safetensors",
216
+ "vision_encoder.trunk.blocks.17.attn.qkv.bias": "model-00001-of-00002.safetensors",
217
+ "vision_encoder.trunk.blocks.17.attn.proj.weight": "model-00001-of-00002.safetensors",
218
+ "vision_encoder.trunk.blocks.17.attn.proj.bias": "model-00001-of-00002.safetensors",
219
+ "vision_encoder.trunk.blocks.17.norm2.weight": "model-00001-of-00002.safetensors",
220
+ "vision_encoder.trunk.blocks.17.norm2.bias": "model-00001-of-00002.safetensors",
221
+ "vision_encoder.trunk.blocks.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
222
+ "vision_encoder.trunk.blocks.17.mlp.fc1.bias": "model-00001-of-00002.safetensors",
223
+ "vision_encoder.trunk.blocks.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
224
+ "vision_encoder.trunk.blocks.17.mlp.fc2.bias": "model-00001-of-00002.safetensors",
225
+ "vision_encoder.trunk.blocks.18.norm1.weight": "model-00001-of-00002.safetensors",
226
+ "vision_encoder.trunk.blocks.18.norm1.bias": "model-00001-of-00002.safetensors",
227
+ "vision_encoder.trunk.blocks.18.attn.qkv.weight": "model-00001-of-00002.safetensors",
228
+ "vision_encoder.trunk.blocks.18.attn.qkv.bias": "model-00001-of-00002.safetensors",
229
+ "vision_encoder.trunk.blocks.18.attn.proj.weight": "model-00001-of-00002.safetensors",
230
+ "vision_encoder.trunk.blocks.18.attn.proj.bias": "model-00001-of-00002.safetensors",
231
+ "vision_encoder.trunk.blocks.18.norm2.weight": "model-00001-of-00002.safetensors",
232
+ "vision_encoder.trunk.blocks.18.norm2.bias": "model-00001-of-00002.safetensors",
233
+ "vision_encoder.trunk.blocks.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
234
+ "vision_encoder.trunk.blocks.18.mlp.fc1.bias": "model-00001-of-00002.safetensors",
235
+ "vision_encoder.trunk.blocks.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
236
+ "vision_encoder.trunk.blocks.18.mlp.fc2.bias": "model-00001-of-00002.safetensors",
237
+ "vision_encoder.trunk.blocks.19.norm1.weight": "model-00001-of-00002.safetensors",
238
+ "vision_encoder.trunk.blocks.19.norm1.bias": "model-00001-of-00002.safetensors",
239
+ "vision_encoder.trunk.blocks.19.attn.qkv.weight": "model-00001-of-00002.safetensors",
240
+ "vision_encoder.trunk.blocks.19.attn.qkv.bias": "model-00001-of-00002.safetensors",
241
+ "vision_encoder.trunk.blocks.19.attn.proj.weight": "model-00001-of-00002.safetensors",
242
+ "vision_encoder.trunk.blocks.19.attn.proj.bias": "model-00001-of-00002.safetensors",
243
+ "vision_encoder.trunk.blocks.19.norm2.weight": "model-00001-of-00002.safetensors",
244
+ "vision_encoder.trunk.blocks.19.norm2.bias": "model-00001-of-00002.safetensors",
245
+ "vision_encoder.trunk.blocks.19.mlp.fc1.weight": "model-00001-of-00002.safetensors",
246
+ "vision_encoder.trunk.blocks.19.mlp.fc1.bias": "model-00001-of-00002.safetensors",
247
+ "vision_encoder.trunk.blocks.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
248
+ "vision_encoder.trunk.blocks.19.mlp.fc2.bias": "model-00001-of-00002.safetensors",
249
+ "vision_encoder.trunk.blocks.20.norm1.weight": "model-00001-of-00002.safetensors",
250
+ "vision_encoder.trunk.blocks.20.norm1.bias": "model-00001-of-00002.safetensors",
251
+ "vision_encoder.trunk.blocks.20.attn.qkv.weight": "model-00001-of-00002.safetensors",
252
+ "vision_encoder.trunk.blocks.20.attn.qkv.bias": "model-00001-of-00002.safetensors",
253
+ "vision_encoder.trunk.blocks.20.attn.proj.weight": "model-00001-of-00002.safetensors",
254
+ "vision_encoder.trunk.blocks.20.attn.proj.bias": "model-00001-of-00002.safetensors",
255
+ "vision_encoder.trunk.blocks.20.norm2.weight": "model-00001-of-00002.safetensors",
256
+ "vision_encoder.trunk.blocks.20.norm2.bias": "model-00001-of-00002.safetensors",
257
+ "vision_encoder.trunk.blocks.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
258
+ "vision_encoder.trunk.blocks.20.mlp.fc1.bias": "model-00001-of-00002.safetensors",
259
+ "vision_encoder.trunk.blocks.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
260
+ "vision_encoder.trunk.blocks.20.mlp.fc2.bias": "model-00001-of-00002.safetensors",
261
+ "vision_encoder.trunk.blocks.21.norm1.weight": "model-00001-of-00002.safetensors",
262
+ "vision_encoder.trunk.blocks.21.norm1.bias": "model-00001-of-00002.safetensors",
263
+ "vision_encoder.trunk.blocks.21.attn.qkv.weight": "model-00001-of-00002.safetensors",
264
+ "vision_encoder.trunk.blocks.21.attn.qkv.bias": "model-00001-of-00002.safetensors",
265
+ "vision_encoder.trunk.blocks.21.attn.proj.weight": "model-00001-of-00002.safetensors",
266
+ "vision_encoder.trunk.blocks.21.attn.proj.bias": "model-00001-of-00002.safetensors",
267
+ "vision_encoder.trunk.blocks.21.norm2.weight": "model-00001-of-00002.safetensors",
268
+ "vision_encoder.trunk.blocks.21.norm2.bias": "model-00001-of-00002.safetensors",
269
+ "vision_encoder.trunk.blocks.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
270
+ "vision_encoder.trunk.blocks.21.mlp.fc1.bias": "model-00001-of-00002.safetensors",
271
+ "vision_encoder.trunk.blocks.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
272
+ "vision_encoder.trunk.blocks.21.mlp.fc2.bias": "model-00001-of-00002.safetensors",
273
+ "vision_encoder.trunk.blocks.22.norm1.weight": "model-00001-of-00002.safetensors",
274
+ "vision_encoder.trunk.blocks.22.norm1.bias": "model-00001-of-00002.safetensors",
275
+ "vision_encoder.trunk.blocks.22.attn.qkv.weight": "model-00001-of-00002.safetensors",
276
+ "vision_encoder.trunk.blocks.22.attn.qkv.bias": "model-00001-of-00002.safetensors",
277
+ "vision_encoder.trunk.blocks.22.attn.proj.weight": "model-00001-of-00002.safetensors",
278
+ "vision_encoder.trunk.blocks.22.attn.proj.bias": "model-00001-of-00002.safetensors",
279
+ "vision_encoder.trunk.blocks.22.norm2.weight": "model-00001-of-00002.safetensors",
280
+ "vision_encoder.trunk.blocks.22.norm2.bias": "model-00001-of-00002.safetensors",
281
+ "vision_encoder.trunk.blocks.22.mlp.fc1.weight": "model-00001-of-00002.safetensors",
282
+ "vision_encoder.trunk.blocks.22.mlp.fc1.bias": "model-00001-of-00002.safetensors",
283
+ "vision_encoder.trunk.blocks.22.mlp.fc2.weight": "model-00001-of-00002.safetensors",
284
+ "vision_encoder.trunk.blocks.22.mlp.fc2.bias": "model-00001-of-00002.safetensors",
285
+ "vision_encoder.trunk.blocks.23.norm1.weight": "model-00001-of-00002.safetensors",
286
+ "vision_encoder.trunk.blocks.23.norm1.bias": "model-00001-of-00002.safetensors",
287
+ "vision_encoder.trunk.blocks.23.attn.qkv.weight": "model-00001-of-00002.safetensors",
288
+ "vision_encoder.trunk.blocks.23.attn.qkv.bias": "model-00001-of-00002.safetensors",
289
+ "vision_encoder.trunk.blocks.23.attn.proj.weight": "model-00001-of-00002.safetensors",
290
+ "vision_encoder.trunk.blocks.23.attn.proj.bias": "model-00001-of-00002.safetensors",
291
+ "vision_encoder.trunk.blocks.23.norm2.weight": "model-00001-of-00002.safetensors",
292
+ "vision_encoder.trunk.blocks.23.norm2.bias": "model-00001-of-00002.safetensors",
293
+ "vision_encoder.trunk.blocks.23.mlp.fc1.weight": "model-00001-of-00002.safetensors",
294
+ "vision_encoder.trunk.blocks.23.mlp.fc1.bias": "model-00001-of-00002.safetensors",
295
+ "vision_encoder.trunk.blocks.23.mlp.fc2.weight": "model-00001-of-00002.safetensors",
296
+ "vision_encoder.trunk.blocks.23.mlp.fc2.bias": "model-00001-of-00002.safetensors",
297
+ "vision_encoder.trunk.norm.weight": "model-00001-of-00002.safetensors",
298
+ "vision_encoder.trunk.norm.bias": "model-00001-of-00002.safetensors",
299
+ "vision_encoder.trunk.attn_pool.latent": "model-00001-of-00002.safetensors",
300
+ "vision_encoder.trunk.attn_pool.q.weight": "model-00001-of-00002.safetensors",
301
+ "vision_encoder.trunk.attn_pool.q.bias": "model-00001-of-00002.safetensors",
302
+ "vision_encoder.trunk.attn_pool.kv.weight": "model-00001-of-00002.safetensors",
303
+ "vision_encoder.trunk.attn_pool.kv.bias": "model-00001-of-00002.safetensors",
304
+ "vision_encoder.trunk.attn_pool.proj.weight": "model-00001-of-00002.safetensors",
305
+ "vision_encoder.trunk.attn_pool.proj.bias": "model-00001-of-00002.safetensors",
306
+ "vision_encoder.trunk.attn_pool.norm.weight": "model-00001-of-00002.safetensors",
307
+ "vision_encoder.trunk.attn_pool.norm.bias": "model-00001-of-00002.safetensors",
308
+ "vision_encoder.trunk.attn_pool.mlp.fc1.weight": "model-00001-of-00002.safetensors",
309
+ "vision_encoder.trunk.attn_pool.mlp.fc1.bias": "model-00001-of-00002.safetensors",
310
+ "vision_encoder.trunk.attn_pool.mlp.fc2.weight": "model-00001-of-00002.safetensors",
311
+ "vision_encoder.trunk.attn_pool.mlp.fc2.bias": "model-00001-of-00002.safetensors",
312
+ "projector.mlp.0.weight": "model-00001-of-00002.safetensors",
313
+ "projector.mlp.0.bias": "model-00001-of-00002.safetensors",
314
+ "projector.mlp.2.weight": "model-00001-of-00002.safetensors",
315
+ "projector.mlp.2.bias": "model-00001-of-00002.safetensors",
316
+ "language_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors",
317
+ "language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
318
+ "language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
319
+ "language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
320
+ "language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
321
+ "language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
322
+ "language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
323
+ "language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
324
+ "language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
325
+ "language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
326
+ "language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
327
+ "language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
328
+ "language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
329
+ "language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
330
+ "language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
331
+ "language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
332
+ "language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
333
+ "language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
334
+ "language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
335
+ "language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
336
+ "language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
337
+ "language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
338
+ "language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
339
+ "language_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
340
+ "language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
341
+ "language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
342
+ "language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
343
+ "language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
344
+ "language_model.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
345
+ "language_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
346
+ "language_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
347
+ "language_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
348
+ "language_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
349
+ "language_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
350
+ "language_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
351
+ "language_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
352
+ "language_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
353
+ "language_model.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
354
+ "language_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
355
+ "language_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
356
+ "language_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
357
+ "language_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
358
+ "language_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
359
+ "language_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
360
+ "language_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
361
+ "language_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
362
+ "language_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
363
+ "language_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
364
+ "language_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
365
+ "language_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
366
+ "language_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
367
+ "language_model.model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
368
+ "language_model.model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
369
+ "language_model.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
370
+ "language_model.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
371
+ "language_model.model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
372
+ "language_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
373
+ "language_model.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
374
+ "language_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
375
+ "language_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
376
+ "language_model.model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
377
+ "language_model.model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
378
+ "language_model.model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
379
+ "language_model.model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
380
+ "language_model.model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
381
+ "language_model.model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
382
+ "language_model.model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
383
+ "language_model.model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
384
+ "language_model.model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
385
+ "language_model.model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
386
+ "language_model.model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
387
+ "language_model.model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
388
+ "language_model.model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
389
+ "language_model.model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
390
+ "language_model.model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
391
+ "language_model.model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
392
+ "language_model.model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
393
+ "language_model.model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
394
+ "language_model.model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
395
+ "language_model.model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
396
+ "language_model.model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
397
+ "language_model.model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
398
+ "language_model.model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
399
+ "language_model.model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
400
+ "language_model.model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
401
+ "language_model.model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
402
+ "language_model.model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
403
+ "language_model.model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
404
+ "language_model.model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
405
+ "language_model.model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
406
+ "language_model.model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
407
+ "language_model.model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
408
+ "language_model.model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
409
+ "language_model.model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
410
+ "language_model.model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
411
+ "language_model.model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
412
+ "language_model.model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
413
+ "language_model.model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
414
+ "language_model.model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
415
+ "language_model.model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
416
+ "language_model.model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
417
+ "language_model.model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
418
+ "language_model.model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
419
+ "language_model.model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
420
+ "language_model.model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
421
+ "language_model.model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
422
+ "language_model.model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
423
+ "language_model.model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
424
+ "language_model.model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
425
+ "language_model.model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
426
+ "language_model.model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
427
+ "language_model.model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
428
+ "language_model.model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
429
+ "language_model.model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
430
+ "language_model.model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
431
+ "language_model.model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
432
+ "language_model.model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
433
+ "language_model.model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
434
+ "language_model.model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
435
+ "language_model.model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
436
+ "language_model.model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
437
+ "language_model.model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
438
+ "language_model.model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
439
+ "language_model.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
440
+ "language_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
441
+ "language_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
442
+ "language_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
443
+ "language_model.model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
444
+ "language_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
445
+ "language_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
446
+ "language_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
447
+ "language_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
448
+ "language_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
449
+ "language_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
450
+ "language_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
451
+ "language_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
452
+ "language_model.model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
453
+ "language_model.model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
454
+ "language_model.model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
455
+ "language_model.model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
456
+ "language_model.model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
457
+ "language_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
458
+ "language_model.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
459
+ "language_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
460
+ "language_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
461
+ "language_model.model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
462
+ "language_model.model.layers.8.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
463
+ "language_model.model.layers.8.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
464
+ "language_model.model.layers.8.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
465
+ "language_model.model.layers.8.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
466
+ "language_model.model.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
467
+ "language_model.model.layers.8.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
468
+ "language_model.model.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
469
+ "language_model.model.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
470
+ "language_model.model.layers.9.input_layernorm.weight": "model-00002-of-00002.safetensors",
471
+ "language_model.model.layers.9.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
472
+ "language_model.model.layers.9.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
473
+ "language_model.model.layers.9.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
474
+ "language_model.model.layers.9.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
475
+ "language_model.model.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
476
+ "language_model.model.layers.9.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
477
+ "language_model.model.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
478
+ "language_model.model.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
479
+ "language_model.model.norm.weight": "model-00002-of-00002.safetensors"
480
+ }
481
+ }
modeling_villanova.py ADDED
@@ -0,0 +1,598 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Villanova VLM Model for HuggingFace.
2
+
3
+ This is a standalone model file for use with trust_remote_code=True.
4
+ It contains no imports from aithlas_trainer to ensure self-containment.
5
+ """
6
+
7
+ from typing import Any
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from transformers import AutoModelForCausalLM, PreTrainedModel
12
+ from transformers.modeling_outputs import CausalLMOutputWithPast
13
+
14
+ from .configuration_villanova import VillanovaConfig
15
+
16
+
17
class ViTEncoder(nn.Module):
    """Frozen OpenCLIP vision tower that turns images into patch-token embeddings.

    Two encoder families are handled, selected by whether the configured model
    name contains ``"SigLIP"``:

    - CLIPA-style OpenCLIP models (e.g. ``ViT-L-14-CLIPA-336``), driven through
      the OpenCLIP ``visual`` transformer directly.
    - SigLIP models (e.g. ``ViT-L-16-SigLIP-384``), driven through the wrapped
      ``visual.trunk`` module.

    The weights are fetched lazily from OpenCLIP on first use; they are not read
    from this repository's safetensors files.

    The forward pass is written out step by step instead of calling the OpenCLIP
    module directly. Per the original author's note this is required to match
    the training code, and ``output_tokens=True`` must not be used because it
    yields different outputs.
    """

    def __init__(self, config: dict[str, Any]) -> None:
        """Read encoder settings from ``config``; no weights are loaded here.

        Args:
            config: Mapping with optional keys ``hidden_size`` (default 1024),
                ``encoder_name`` / legacy ``model_name`` (default
                ``"ViT-L-14-CLIPA-336"``) and ``pretrained`` (default
                ``"datacomp1b"``).
        """
        super().__init__()
        self.hidden_size = config.get("hidden_size", 1024)
        # Support both old key (model_name) and new key (encoder_name)
        self.model_name = config.get("encoder_name") or config.get("model_name", "ViT-L-14-CLIPA-336")
        self.pretrained = config.get("pretrained", "datacomp1b")

        # Placeholder - populated by _ensure_clip_loaded() on first forward()
        self._clip_model: nn.Module | None = None
        self._is_siglip: bool = "SigLIP" in self.model_name

    def _ensure_clip_loaded(self) -> None:
        """Load the OpenCLIP visual tower once, put it in eval mode and freeze it."""
        if self._clip_model is None:
            # Imported lazily so the module can be imported without open_clip.
            import open_clip

            model, _, _ = open_clip.create_model_and_transforms(
                self.model_name,
                pretrained=self.pretrained,
            )
            # Only the visual tower is kept; the text tower is discarded.
            self._clip_model = model.visual
            self._clip_model.eval()

            # Freeze all parameters
            for param in self._clip_model.parameters():
                param.requires_grad = False

    def _forward_siglip(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """Run the SigLIP trunk manually and return patch tokens only.

        Args:
            pixel_values: Image batch fed to ``trunk.patch_embed``.

        Returns:
            The normalized token sequence with the CLS token (if the trunk has
            one) removed.
        """
        visual = self._clip_model
        trunk = visual.trunk

        # The trunk may or may not carry a CLS token; decide once and reuse.
        has_cls = trunk.cls_token is not None and trunk.cls_token.numel() > 0

        # Patch embedding
        x = trunk.patch_embed(pixel_values)

        if has_cls:
            cls_tokens = trunk.cls_token.expand(x.shape[0], -1, -1)
            x = torch.cat([cls_tokens, x], dim=1)

        # Positional embedding followed by the (usually identity) dropouts
        x = x + trunk.pos_embed
        x = trunk.pos_drop(x)

        if hasattr(trunk, "patch_drop") and trunk.patch_drop is not None:
            x = trunk.patch_drop(x)

        # Optional pre-norm (only present on some trunks)
        if hasattr(trunk, "norm_pre") and trunk.norm_pre is not None:
            x = trunk.norm_pre(x)

        # Transformer blocks and final norm
        x = trunk.blocks(x)
        x = trunk.norm(x)

        # Drop the CLS token if one was prepended
        return x[:, 1:, :] if has_cls else x

    def _forward_clipa(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """Run the OpenCLIP visual transformer manually and return patch tokens.

        Args:
            pixel_values: Image batch fed to ``visual.conv1``.

        Returns:
            Token sequence after ``ln_post`` with the CLS token removed when one
            was prepended.
        """
        visual = self._clip_model

        # Step 1: patch embeddings via conv1, flattened to (B, num_patches, dim)
        x = visual.conv1(pixel_values)
        x = x.reshape(x.shape[0], x.shape[1], -1)
        x = x.permute(0, 2, 1)

        # Step 2: positional embeddings. Track whether a CLS token was added so
        # that step 6 only strips it when it actually exists.
        prepended_cls = False
        if hasattr(visual, "positional_embedding"):
            # OpenCLIP style: prepend the class embedding, then add positions
            cls_pos = visual.class_embedding.expand(x.shape[0], 1, -1)
            x = torch.cat([cls_pos, x], dim=1)
            x = x + visual.positional_embedding.unsqueeze(0)
            prepended_cls = True
        elif hasattr(visual, "pos_embed"):
            # Alternative style: patch-only positions, no CLS token in x
            x = x + visual.pos_embed[:, 1:, :]

        # Step 3: layer norm before the transformer
        x = visual.ln_pre(x)

        # Step 4: the transformer is fed sequence-first tensors
        x = x.permute(1, 0, 2)
        x = visual.transformer(x)
        x = x.permute(1, 0, 2)

        # Step 5: post-transformer layer norm (kept from the original; the
        # author flags it as critical for the output scale)
        x = visual.ln_post(x)

        # Step 6: return patch tokens only. Previously token 0 was dropped
        # unconditionally, which discarded a real patch in the pos_embed branch.
        return x[:, 1:, :] if prepended_cls else x

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """Encode images to visual embeddings without tracking gradients.

        The encoder is moved/cast to the dtype and device of ``pixel_values``
        before running, so the caller decides the precision and placement.

        Args:
            pixel_values: Image tensor (batch_size, 3, H, W)

        Returns:
            Visual embeddings (batch_size, num_patches, hidden_size)
        """
        self._ensure_clip_loaded()

        visual = self._clip_model

        # Align the encoder with the input's dtype and device in one step.
        reference = next(visual.parameters())
        if reference.dtype != pixel_values.dtype or reference.device != pixel_values.device:
            self._clip_model = visual.to(device=pixel_values.device, dtype=pixel_values.dtype)

        with torch.no_grad():
            if self._is_siglip:
                return self._forward_siglip(pixel_values)
            return self._forward_clipa(pixel_values)
167
+
168
+
169
class MLPProjector(nn.Module):
    """Two-layer GELU MLP that maps vision features into the LLM embedding space.

    Submodule names are chosen so the parameter keys line up with the training
    checkpoint:

    - ``mlp.0``: Linear(input_size, hidden_size)
    - ``mlp.1``: GELU (no parameters)
    - ``mlp.2``: Linear(hidden_size, output_size)
    - ``output_norm``: ``nn.Identity`` unless ``use_layer_norm`` is set

    Original author's note on why LayerNorm is off by default: LLaVA does not
    normalize the projector output; LLM embeddings have std≈0.008 whereas a
    LayerNorm forces std≈1, which the author reports as a ~140x scale mismatch.
    """

    def __init__(self, config: dict[str, Any]) -> None:
        """Build the projector from a plain config mapping.

        Args:
            config: Optional keys ``input_size`` (1024), ``output_size`` (2048),
                ``hidden_size`` (defaults to ``output_size``), ``bias`` (True),
                ``use_layer_norm`` (False) and ``output_scale`` (1.0).
        """
        super().__init__()

        in_dim = config.get("input_size", 1024)
        out_dim = config.get("output_size", 2048)
        mid_dim = config.get("hidden_size", out_dim)
        with_bias = config.get("bias", True)
        wants_norm = config.get("use_layer_norm", False)

        # Multiplier applied to the final output; 1.0 leaves it untouched.
        self.output_scale = config.get("output_scale", 1.0)

        # Index positions 0/1/2 inside the Sequential define the checkpoint keys.
        layers = [
            nn.Linear(in_dim, mid_dim, bias=with_bias),
            nn.GELU(),
            nn.Linear(mid_dim, out_dim, bias=with_bias),
        ]
        self.mlp = nn.Sequential(*layers)

        # Kept outside ``mlp`` so its parameters get their own key prefix.
        self.output_norm = nn.LayerNorm(out_dim) if wants_norm else nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project ``x`` through the MLP, the optional norm and the output scale."""
        projected = self.output_norm(self.mlp(x))
        if self.output_scale == 1.0:
            return projected
        return projected * self.output_scale
216
+
217
+
218
class VillanovaVLMForConditionalGeneration(PreTrainedModel):
    """Villanova Vision-Language Model for conditional generation.

    Wires together three parts: a frozen OpenCLIP vision encoder
    (:class:`ViTEncoder`), a 2-layer MLP projector (:class:`MLPProjector`) and a
    causal language model that is attached by :meth:`from_pretrained`.

    Example:
        >>> from transformers import AutoModelForImageTextToText, AutoProcessor
        >>> model = AutoModelForImageTextToText.from_pretrained(
        ...     "VillanovaAI/Villanova-2B-VL-2512-Preview",
        ...     trust_remote_code=True,
        ... )
        >>> processor = AutoProcessor.from_pretrained(
        ...     "VillanovaAI/Villanova-2B-VL-2512-Preview",
        ...     trust_remote_code=True,
        ... )
    """

    config_class = VillanovaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MLPProjector"]

    def __init__(self, config: VillanovaConfig) -> None:
        """Create the vision encoder and projector; the LLM is attached later."""
        super().__init__(config)

        # Vision encoder
        self.vision_encoder = ViTEncoder(config.vision_config)

        # Projector
        self.projector = MLPProjector(config.projector_config)

        # Language model (set by from_pretrained; None until then)
        self.language_model: PreTrainedModel | None = None

        # Token id that marks where image features are inserted
        self.image_token_index = config.image_token_index

        self.post_init()

    def get_input_embeddings(self) -> nn.Module | None:
        """Return the language model's input embeddings, or None if no LLM is attached."""
        if self.language_model is not None:
            return self.language_model.get_input_embeddings()
        return None

    def set_input_embeddings(self, value: nn.Module) -> None:
        """Set the language model's input embeddings (no-op if no LLM is attached)."""
        if self.language_model is not None:
            self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Module | None:
        """Return the language model's output embeddings, or None if no LLM is attached."""
        if self.language_model is not None:
            return self.language_model.get_output_embeddings()
        return None

    def set_output_embeddings(self, new_embeddings: nn.Module) -> None:
        """Set the language model's output embeddings (no-op if no LLM is attached)."""
        if self.language_model is not None:
            self.language_model.set_output_embeddings(new_embeddings)

    def _encode_images(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """Run the vision encoder and projector on ``pixel_values``."""
        image_features = self.vision_encoder(pixel_values)
        # Cast to projector dtype (the vision encoder may output another dtype)
        image_features = image_features.to(self.projector.mlp[0].weight.dtype)
        return self.projector(image_features)

    def _merge_input_ids_with_image_features(
        self,
        input_ids: torch.Tensor,
        image_features: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Merge text embeddings with image features at ``<image>`` positions.

        Per batch row, depending on how many image tokens it contains:

        - none: the text embeddings are kept unchanged;
        - exactly one: the token is replaced by all ``num_patches`` feature
          vectors, so the row grows by ``num_patches - 1``;
        - several (pre-expanded input): the i-th image token is overwritten by
          the i-th feature vector, for at most ``num_patches`` tokens; the row
          length is unchanged.

        Rows are then right-padded with zeros to the longest row.

        Args:
            input_ids: Token ids, indexed as (batch, seq).
            image_features: Projected features, indexed as (batch, patch, dim).
            attention_mask: Optional mask aligned with ``input_ids``.

        Returns:
            ``(embeddings, attention_mask)``; the mask is None when none was given.
        """
        batch_size = input_ids.shape[0]
        num_patches = image_features.shape[1]

        # Get text embeddings
        text_embeddings = self.get_input_embeddings()(input_ids)

        # Find image token positions
        image_token_mask = input_ids == self.image_token_index

        new_embeddings_list = []
        new_attention_mask_list = [] if attention_mask is not None else None

        for b in range(batch_size):
            image_positions = torch.where(image_token_mask[b])[0]
            num_image_tokens = len(image_positions)

            if num_image_tokens == 0:
                # No image tokens - keep original embeddings
                new_embeddings_list.append(text_embeddings[b])
                if attention_mask is not None:
                    new_attention_mask_list.append(attention_mask[b])
            elif num_image_tokens == 1:
                # Single <image> token - expand to num_patches visual features
                pos = image_positions[0].item()
                before = text_embeddings[b, :pos]
                after = text_embeddings[b, pos + 1:]
                merged = torch.cat([before, image_features[b], after], dim=0)
                new_embeddings_list.append(merged)

                if attention_mask is not None:
                    mask_before = attention_mask[b, :pos]
                    mask_after = attention_mask[b, pos + 1:]
                    # Every inserted image position is attended to.
                    image_mask = torch.ones(
                        num_patches, dtype=attention_mask.dtype, device=attention_mask.device
                    )
                    merged_mask = torch.cat([mask_before, image_mask, mask_after], dim=0)
                    new_attention_mask_list.append(merged_mask)
            else:
                # Multiple <image> tokens - overwrite each with its visual feature
                output = text_embeddings[b].clone()
                actual_patches = min(num_patches, num_image_tokens)
                for i in range(actual_patches):
                    pos = image_positions[i].item()
                    output[pos] = image_features[b, i]
                new_embeddings_list.append(output)
                if attention_mask is not None:
                    new_attention_mask_list.append(attention_mask[b])

        # Pad to same length
        max_len = max(e.shape[0] for e in new_embeddings_list)
        padded_embeddings = torch.zeros(
            batch_size, max_len, text_embeddings.shape[-1],
            dtype=text_embeddings.dtype, device=text_embeddings.device
        )
        for b, emb in enumerate(new_embeddings_list):
            padded_embeddings[b, :emb.shape[0]] = emb

        padded_attention_mask = None
        if new_attention_mask_list is not None:
            padded_attention_mask = torch.zeros(
                batch_size, max_len, dtype=attention_mask.dtype, device=attention_mask.device
            )
            for b, mask in enumerate(new_attention_mask_list):
                padded_attention_mask[b, :mask.shape[0]] = mask

        return padded_embeddings, padded_attention_mask

    def _expand_labels_for_image_features(
        self,
        input_ids: torch.Tensor,
        labels: torch.Tensor,
        num_patches: int,
        target_len: int,
    ) -> torch.Tensor:
        """Re-align ``labels`` with embeddings produced by the merge step.

        Mirrors ``_merge_input_ids_with_image_features``: rows holding exactly one
        image token get ``num_patches`` ignore labels in its place; other rows are
        copied as-is. Everything is right-padded to ``target_len`` with -100, the
        value the Hugging Face causal-LM losses skip.
        """
        ignore_index = -100
        expanded = torch.full(
            (labels.shape[0], target_len), ignore_index, dtype=labels.dtype, device=labels.device
        )
        image_token_mask = input_ids == self.image_token_index
        for b in range(labels.shape[0]):
            image_positions = torch.where(image_token_mask[b])[0]
            if len(image_positions) == 1:
                pos = image_positions[0].item()
                row = torch.cat(
                    [
                        labels[b, :pos],
                        labels.new_full((num_patches,), ignore_index),
                        labels[b, pos + 1:],
                    ],
                    dim=0,
                )
            else:
                row = labels[b]
            expanded[b, :row.shape[0]] = row
        return expanded

    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        pixel_values: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        past_key_values: tuple | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        **kwargs: Any,
    ) -> CausalLMOutputWithPast | tuple:
        """Run the language model, injecting image features when images are given.

        When ``pixel_values`` is provided (and ``inputs_embeds`` is not), the
        images are encoded, merged into the text embeddings, and the language
        model is called with ``inputs_embeds`` instead of ``input_ids``. Extra
        ``**kwargs`` are accepted but not forwarded.

        Raises:
            RuntimeError: If no language model has been attached.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.language_model is None:
            raise RuntimeError("Language model not initialized")

        # Process image if provided
        if pixel_values is not None and inputs_embeds is None:
            image_features = self._encode_images(pixel_values)
            inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(
                input_ids, image_features, attention_mask
            )
            # Expanding a single <image> token lengthens the sequence; labels
            # that were aligned with input_ids must grow the same way or the
            # loss sees mismatched lengths.
            if (
                labels is not None
                and labels.shape == input_ids.shape
                and labels.shape[1] != inputs_embeds.shape[1]
            ):
                labels = self._expand_labels_for_image_features(
                    input_ids, labels, image_features.shape[1], inputs_embeds.shape[1]
                )
            input_ids = None

        return self.language_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            labels=labels,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

    def generate(
        self,
        input_ids: torch.Tensor | None = None,
        pixel_values: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        max_new_tokens: int = 256,
        do_sample: bool = False,
        temperature: float = 1.0,
        top_p: float = 1.0,
        top_k: int = 50,
        **kwargs: Any,
    ) -> torch.Tensor:
        """Generate text conditioned on an optional image and a prompt.

        With ``pixel_values`` the prompt is converted to merged embeddings and the
        language model generates from ``inputs_embeds``; ``pad_token_id`` and
        ``eos_token_id`` then default to the values on ``config.text_config``
        when not passed. Without images the call is forwarded with ``input_ids``.

        Raises:
            RuntimeError: If no language model has been attached.
        """
        if self.language_model is None:
            raise RuntimeError("Language model not initialized")

        if pixel_values is not None:
            image_features = self._encode_images(pixel_values)
            inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(
                input_ids, image_features, attention_mask
            )

            # Get token IDs from kwargs, falling back to text_config. Compare
            # against None explicitly: 0 is a valid token id and must not be
            # treated as "missing".
            text_config = self.config.text_config
            pad_token_id = kwargs.pop("pad_token_id", None)
            if pad_token_id is None:
                pad_token_id = getattr(text_config, "pad_token_id", None)
            eos_token_id = kwargs.pop("eos_token_id", None)
            if eos_token_id is None:
                eos_token_id = getattr(text_config, "eos_token_id", None)

            return self.language_model.generate(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                do_sample=do_sample,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                pad_token_id=pad_token_id,
                eos_token_id=eos_token_id,
                **kwargs,
            )

        return self.language_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            **kwargs,
        )

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        *model_args: Any,
        config: VillanovaConfig | None = None,
        torch_dtype: torch.dtype | str | None = None,
        device_map: str | dict | None = None,
        **kwargs: Any,
    ) -> "VillanovaVLMForConditionalGeneration":
        """Build the model and load projector and LLM weights from safetensors.

        Steps: resolve the config, instantiate the wrapper, build the language
        model from ``config.text_config``, read every ``*.safetensors`` file from
        the local directory or the Hub repo, route tensors by key prefix, then
        apply ``torch_dtype`` and ``device_map``. Tensors under
        ``vision_encoder.`` are skipped because the vision tower is loaded from
        OpenCLIP. Weights are loaded with ``strict=False``.

        Args:
            pretrained_model_name_or_path: Local directory or Hub repo id.
            *model_args: Accepted for signature compatibility; unused.
            config: Pre-built config; loaded via ``AutoConfig`` when None.
            torch_dtype: Target dtype, or its name (``"bfloat16"`` /
                ``"torch.bfloat16"``). ``dtype=`` in kwargs is accepted as an alias.
            device_map: ``"auto"``, an explicit module-to-device dict, or a
                single device passed to ``model.to``.
            **kwargs: Forwarded to ``AutoConfig.from_pretrained`` when the
                config has to be loaded.

        Returns:
            The assembled model.
        """
        import warnings
        from pathlib import Path

        from safetensors.torch import load_file
        from transformers import AutoConfig

        # Remove trust_remote_code from kwargs to avoid passing it twice
        kwargs.pop("trust_remote_code", None)

        # Newer transformers passes 'dtype' instead of 'torch_dtype'; an explicit
        # torch_dtype wins, but 'dtype' is always removed from kwargs.
        if torch_dtype is None:
            torch_dtype = kwargs.pop("dtype", None)
        else:
            kwargs.pop("dtype", None)

        # Load config
        if config is None:
            config = AutoConfig.from_pretrained(
                pretrained_model_name_or_path,
                trust_remote_code=True,
                **kwargs,
            )

        # Handle torch_dtype string conversion
        if isinstance(torch_dtype, str):
            torch_dtype = getattr(torch, torch_dtype.replace("torch.", ""))

        # Create model (vision encoder + projector; LLM attached below)
        model = cls(config)

        # text_config may be a config object or a plain dict
        if hasattr(config.text_config, "to_dict"):
            text_config_dict = config.text_config.to_dict()
        else:
            text_config_dict = dict(config.text_config)

        # The training export may nest the real LLM config one level deeper.
        nested_config = text_config_dict.get("text_config")
        llm_config_dict = dict(nested_config) if isinstance(nested_config, dict) else text_config_dict

        # Model type selects the LLM architecture
        model_type = llm_config_dict.pop("model_type", "llama")

        # Remove non-config keys
        for key in ["_name_or_path", "transformers_version", "torch_dtype", "dtype"]:
            llm_config_dict.pop(key, None)

        llm_config = AutoConfig.for_model(model_type, **llm_config_dict)
        model.language_model = AutoModelForCausalLM.from_config(llm_config, torch_dtype=torch_dtype)

        # Locate the safetensors shards
        model_path = Path(pretrained_model_name_or_path)

        if model_path.exists():
            safetensors_files = sorted(model_path.glob("*.safetensors"))
        else:
            from huggingface_hub import hf_hub_download, list_repo_files
            try:
                repo_files = list_repo_files(pretrained_model_name_or_path)
                safetensors_files = [
                    Path(hf_hub_download(pretrained_model_name_or_path, name))
                    for name in repo_files
                    if name.endswith(".safetensors")
                ]
            except Exception as exc:
                # Stay best-effort, but never hide the failure: without the
                # shards the model keeps its random initialization.
                warnings.warn(
                    f"Could not download safetensors weights for "
                    f"{pretrained_model_name_or_path!r}: {exc}",
                    stacklevel=2,
                )
                safetensors_files = []

        if not safetensors_files:
            warnings.warn(
                f"No .safetensors files found for {pretrained_model_name_or_path!r}; "
                "the projector and language model keep randomly initialized weights.",
                stacklevel=2,
            )

        projector_state_dict = {}
        llm_state_dict = {}

        for sf_file in safetensors_files:
            for key, value in load_file(sf_file).items():
                # vision_encoder uses OpenCLIP pretrained weights, not these files
                if key.startswith("vision_encoder."):
                    continue

                # Convert dtype if needed
                if torch_dtype is not None:
                    value = value.to(torch_dtype)

                # Strip only the leading prefix; str.replace would also remove
                # any later occurrence of the same text inside the key.
                if key.startswith("projector."):
                    projector_state_dict[key.removeprefix("projector.")] = value
                elif key.startswith("language_model."):
                    llm_state_dict[key.removeprefix("language_model.")] = value
                else:
                    # LLM weights without prefix (legacy format)
                    llm_state_dict[key] = value

        # Load weights into model components
        if projector_state_dict:
            model.projector.load_state_dict(projector_state_dict, strict=False)
        if llm_state_dict:
            model.language_model.load_state_dict(llm_state_dict, strict=False)

        # load_state_dict copies values into the existing parameters and keeps
        # their dtype, so convert explicitly after loading.
        if torch_dtype is not None:
            model.projector = model.projector.to(dtype=torch_dtype)
            model.language_model = model.language_model.to(dtype=torch_dtype)

        # Handle device_map
        if device_map is not None:
            import accelerate

            if device_map == "auto":
                # Infer device map automatically
                device_map = accelerate.infer_auto_device_map(
                    model,
                    max_memory=None,
                    no_split_module_classes=["MLPProjector", "ViTEncoder"],
                )

            if isinstance(device_map, dict):
                model = accelerate.dispatch_model(model, device_map=device_map)
            else:
                # Simple device placement
                model = model.to(device_map)

        return model
preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "VillanovaProcessor",
3
+ "image_processor_type": "VillanovaImageProcessor",
4
+ "auto_map": {
5
+ "AutoProcessor": "processing_villanova.VillanovaProcessor",
6
+ "AutoImageProcessor": "image_processing_villanova.VillanovaImageProcessor"
7
+ },
8
+ "do_resize": true,
9
+ "size": {
10
+ "height": 384,
11
+ "width": 384
12
+ },
13
+ "resample": 3,
14
+ "do_rescale": true,
15
+ "rescale_factor": 0.00392156862745098,
16
+ "do_normalize": true,
17
+ "image_mean": [
18
+ 0.5,
19
+ 0.5,
20
+ 0.5
21
+ ],
22
+ "image_std": [
23
+ 0.5,
24
+ 0.5,
25
+ 0.5
26
+ ],
27
+ "do_convert_rgb": true
28
+ }
processing_villanova.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Villanova VLM Processor for HuggingFace.
2
+
3
+ This is a standalone processor file for use with trust_remote_code=True.
4
+ It contains no imports from aithlas_trainer to ensure self-containment.
5
+ """
6
+
7
+ from typing import Any
8
+
9
+ from PIL import Image
10
+ from transformers import AutoTokenizer
11
+ from transformers.feature_extraction_utils import BatchFeature
12
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
13
+
14
+ from .image_processing_villanova import VillanovaImageProcessor
15
+
16
+
17
class VillanovaProcessor:
    """Single entry point that pairs the image processor with the LLM tokenizer.

    Calling the processor runs whichever of the two components has input and
    merges their outputs into one ``BatchFeature``.

    Args:
        image_processor: VillanovaImageProcessor instance; a default one is
            created when omitted.
        tokenizer: LLM tokenizer instance.

    Example:
        >>> processor = VillanovaProcessor.from_pretrained("VillanovaAI/Villanova-2B-VL-2512-Preview")
        >>> image = Image.open("image.jpg")
        >>> inputs = processor(images=image, text="Describe this image.", return_tensors="pt")
        >>> print(inputs.keys())
        dict_keys(['pixel_values', 'input_ids', 'attention_mask'])
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "VillanovaImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor: VillanovaImageProcessor | None = None,
        tokenizer: Any | None = None,
        **kwargs: Any,
    ) -> None:
        # Extra keyword arguments are accepted and ignored.
        self.image_processor = VillanovaImageProcessor() if image_processor is None else image_processor
        self.tokenizer = tokenizer

    def __call__(
        self,
        images: Image.Image | list[Image.Image] | None = None,
        text: TextInput | PreTokenizedInput | list[TextInput] | None = None,
        padding: bool | str = False,
        truncation: bool | None = None,
        max_length: int | None = None,
        return_tensors: str | None = None,
        **kwargs: Any,
    ) -> BatchFeature:
        """Preprocess images and/or text into model inputs.

        Args:
            images: One image or a list of images, handed to the image processor.
            text: One text or a list of texts, handed to the tokenizer.
            padding: Tokenizer padding strategy.
            truncation: Tokenizer truncation flag.
            max_length: Tokenizer maximum sequence length.
            return_tensors: Tensor format requested from both components.
            **kwargs: Forwarded to both the image processor and the tokenizer.

        Returns:
            BatchFeature holding the outputs of every component that ran.

        Raises:
            ValueError: If neither images nor text is provided
        """
        has_images = images is not None
        has_text = text is not None
        if not (has_images or has_text):
            raise ValueError("You must provide either images or text or both")

        batch = BatchFeature()

        # Image side
        if has_images:
            batch.update(
                self.image_processor(
                    images,
                    return_tensors=return_tensors,
                    **kwargs,
                )
            )

        # Text side
        if has_text:
            batch.update(
                self.tokenizer(
                    text,
                    padding=padding,
                    truncation=truncation,
                    max_length=max_length,
                    return_tensors=return_tensors,
                    **kwargs,
                )
            )

        return batch

    def batch_decode(self, *args: Any, **kwargs: Any) -> list[str]:
        """Forward to the tokenizer's ``batch_decode``."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args: Any, **kwargs: Any) -> str:
        """Forward to the tokenizer's ``decode``."""
        return self.tokenizer.decode(*args, **kwargs)

    def apply_chat_template(
        self,
        conversation: list[dict],
        add_generation_prompt: bool = False,
        **kwargs: Any,
    ) -> str:
        """Render a conversation with the tokenizer's chat template, untokenized.

        Args:
            conversation: List of message dicts with "role" and "content"
            add_generation_prompt: Whether to add generation prompt

        Returns:
            Formatted prompt string

        Example:
            >>> messages = [{"role": "user", "content": "<image>\\nDescribe this."}]
            >>> prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        """
        template_options = dict(add_generation_prompt=add_generation_prompt, tokenize=False)
        return self.tokenizer.apply_chat_template(conversation, **template_options, **kwargs)

    @property
    def model_input_names(self) -> list[str]:
        """Tokenizer input names followed by image-processor ones, de-duplicated in order."""
        combined = self.tokenizer.model_input_names + self.image_processor.model_input_names
        unique_names: list[str] = []
        for name in combined:
            if name not in unique_names:
                unique_names.append(name)
        return unique_names

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        **kwargs: Any,
    ) -> "VillanovaProcessor":
        """Load the image processor and tokenizer from one location.

        Args:
            pretrained_model_name_or_path: Model ID or local path
            **kwargs: Forwarded to both loaders; ``trust_remote_code`` is
                stripped first.

        Returns:
            VillanovaProcessor instance
        """
        # Dropped so it is not passed twice to the tokenizer loader below.
        kwargs.pop("trust_remote_code", None)

        loaded_image_processor = VillanovaImageProcessor.from_pretrained(
            pretrained_model_name_or_path,
            **kwargs,
        )
        loaded_tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            trust_remote_code=True,
            **kwargs,
        )

        return cls(image_processor=loaded_image_processor, tokenizer=loaded_tokenizer)

    def save_pretrained(
        self,
        save_directory: str,
        **kwargs: Any,
    ) -> None:
        """Save both components into ``save_directory``.

        Args:
            save_directory: Directory to save to
            **kwargs: Forwarded to both ``save_pretrained`` calls.
        """
        for component in (self.image_processor, self.tokenizer):
            component.save_pretrained(save_directory, **kwargs)

    @classmethod
    def register_for_auto_class(cls, auto_class: str = "AutoProcessor") -> None:
        """Accept an auto-class registration request and do nothing.

        Kept because, per the original author's note, the transformers
        auto-loading mechanism calls it on custom classes.

        Args:
            auto_class: The auto class to register with (default: "AutoProcessor")
        """
        return None
special_tokens_map.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<image>"
4
+ ],
5
+ "bos_token": {
6
+ "content": "<s>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "cls_token": {
13
+ "content": "<s>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false
18
+ },
19
+ "eos_token": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "pad_token": {
27
+ "content": "</s>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ },
33
+ "sep_token": {
34
+ "content": "</s>",
35
+ "lstrip": false,
36
+ "normalized": false,
37
+ "rstrip": false,
38
+ "single_word": false
39
+ },
40
+ "unk_token": {
41
+ "content": "<unk>",
42
+ "lstrip": false,
43
+ "normalized": false,
44
+ "rstrip": false,
45
+ "single_word": false
46
+ }
47
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23632cdff814fe6ae5eb6159980453467d5e93ca315c82e4e13dadc78da7d525
3
+ size 37007600
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab94ddf46d14f0279254858d53770c5319c5129d47291ee2bada530271cb1292
3
+ size 4813276
tokenizer_config.json ADDED
@@ -0,0 +1,1113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "3": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "4": {
39
+ "content": "<|im_start|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "5": {
47
+ "content": "<|im_end|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "6": {
55
+ "content": "<|reserved_token_1|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "7": {
63
+ "content": "<|reserved_token_2|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "8": {
71
+ "content": "<|reserved_token_3|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "9": {
79
+ "content": "<|reserved_token_4|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "10": {
87
+ "content": "<|reserved_token_5|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "11": {
95
+ "content": "<|reserved_token_6|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "12": {
103
+ "content": "<|reserved_token_7|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "13": {
111
+ "content": "<|reserved_token_8|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "14": {
119
+ "content": "<|reserved_token_9|>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": true
125
+ },
126
+ "15": {
127
+ "content": "<|reserved_token_10|>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": true
133
+ },
134
+ "16": {
135
+ "content": "<|reserved_token_11|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": true
141
+ },
142
+ "17": {
143
+ "content": "<|reserved_token_12|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": true
149
+ },
150
+ "18": {
151
+ "content": "<|reserved_token_13|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": true
157
+ },
158
+ "19": {
159
+ "content": "<|reserved_token_14|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": true
165
+ },
166
+ "20": {
167
+ "content": "<|reserved_token_15|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": true
173
+ },
174
+ "21": {
175
+ "content": "<|reserved_token_16|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": true
181
+ },
182
+ "22": {
183
+ "content": "<|reserved_token_17|>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": true
189
+ },
190
+ "23": {
191
+ "content": "<|reserved_token_18|>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": true
197
+ },
198
+ "24": {
199
+ "content": "<|reserved_token_19|>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": true
205
+ },
206
+ "25": {
207
+ "content": "<|reserved_token_20|>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": true
213
+ },
214
+ "26": {
215
+ "content": "<|reserved_token_21|>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "27": {
223
+ "content": "<|reserved_token_22|>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "28": {
231
+ "content": "<|reserved_token_23|>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "29": {
239
+ "content": "<|reserved_token_24|>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "30": {
247
+ "content": "<|reserved_token_25|>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ },
254
+ "31": {
255
+ "content": "<|reserved_token_26|>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": true
261
+ },
262
+ "32": {
263
+ "content": "<|reserved_token_27|>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false,
268
+ "special": true
269
+ },
270
+ "33": {
271
+ "content": "<|reserved_token_28|>",
272
+ "lstrip": false,
273
+ "normalized": false,
274
+ "rstrip": false,
275
+ "single_word": false,
276
+ "special": true
277
+ },
278
+ "34": {
279
+ "content": "<|reserved_token_29|>",
280
+ "lstrip": false,
281
+ "normalized": false,
282
+ "rstrip": false,
283
+ "single_word": false,
284
+ "special": true
285
+ },
286
+ "35": {
287
+ "content": "<|reserved_token_30|>",
288
+ "lstrip": false,
289
+ "normalized": false,
290
+ "rstrip": false,
291
+ "single_word": false,
292
+ "special": true
293
+ },
294
+ "36": {
295
+ "content": "<|reserved_token_31|>",
296
+ "lstrip": false,
297
+ "normalized": false,
298
+ "rstrip": false,
299
+ "single_word": false,
300
+ "special": true
301
+ },
302
+ "37": {
303
+ "content": "<|reserved_token_32|>",
304
+ "lstrip": false,
305
+ "normalized": false,
306
+ "rstrip": false,
307
+ "single_word": false,
308
+ "special": true
309
+ },
310
+ "38": {
311
+ "content": "<|reserved_token_33|>",
312
+ "lstrip": false,
313
+ "normalized": false,
314
+ "rstrip": false,
315
+ "single_word": false,
316
+ "special": true
317
+ },
318
+ "39": {
319
+ "content": "<|reserved_token_34|>",
320
+ "lstrip": false,
321
+ "normalized": false,
322
+ "rstrip": false,
323
+ "single_word": false,
324
+ "special": true
325
+ },
326
+ "40": {
327
+ "content": "<|reserved_token_35|>",
328
+ "lstrip": false,
329
+ "normalized": false,
330
+ "rstrip": false,
331
+ "single_word": false,
332
+ "special": true
333
+ },
334
+ "41": {
335
+ "content": "<|reserved_token_36|>",
336
+ "lstrip": false,
337
+ "normalized": false,
338
+ "rstrip": false,
339
+ "single_word": false,
340
+ "special": true
341
+ },
342
+ "42": {
343
+ "content": "<|reserved_token_37|>",
344
+ "lstrip": false,
345
+ "normalized": false,
346
+ "rstrip": false,
347
+ "single_word": false,
348
+ "special": true
349
+ },
350
+ "43": {
351
+ "content": "<|reserved_token_38|>",
352
+ "lstrip": false,
353
+ "normalized": false,
354
+ "rstrip": false,
355
+ "single_word": false,
356
+ "special": true
357
+ },
358
+ "44": {
359
+ "content": "<|reserved_token_39|>",
360
+ "lstrip": false,
361
+ "normalized": false,
362
+ "rstrip": false,
363
+ "single_word": false,
364
+ "special": true
365
+ },
366
+ "45": {
367
+ "content": "<|reserved_token_40|>",
368
+ "lstrip": false,
369
+ "normalized": false,
370
+ "rstrip": false,
371
+ "single_word": false,
372
+ "special": true
373
+ },
374
+ "46": {
375
+ "content": "<|reserved_token_41|>",
376
+ "lstrip": false,
377
+ "normalized": false,
378
+ "rstrip": false,
379
+ "single_word": false,
380
+ "special": true
381
+ },
382
+ "47": {
383
+ "content": "<|reserved_token_42|>",
384
+ "lstrip": false,
385
+ "normalized": false,
386
+ "rstrip": false,
387
+ "single_word": false,
388
+ "special": true
389
+ },
390
+ "48": {
391
+ "content": "<|reserved_token_43|>",
392
+ "lstrip": false,
393
+ "normalized": false,
394
+ "rstrip": false,
395
+ "single_word": false,
396
+ "special": true
397
+ },
398
+ "49": {
399
+ "content": "<|reserved_token_44|>",
400
+ "lstrip": false,
401
+ "normalized": false,
402
+ "rstrip": false,
403
+ "single_word": false,
404
+ "special": true
405
+ },
406
+ "50": {
407
+ "content": "<|reserved_token_45|>",
408
+ "lstrip": false,
409
+ "normalized": false,
410
+ "rstrip": false,
411
+ "single_word": false,
412
+ "special": true
413
+ },
414
+ "51": {
415
+ "content": "<|reserved_token_46|>",
416
+ "lstrip": false,
417
+ "normalized": false,
418
+ "rstrip": false,
419
+ "single_word": false,
420
+ "special": true
421
+ },
422
+ "52": {
423
+ "content": "<|reserved_token_47|>",
424
+ "lstrip": false,
425
+ "normalized": false,
426
+ "rstrip": false,
427
+ "single_word": false,
428
+ "special": true
429
+ },
430
+ "53": {
431
+ "content": "<|reserved_token_48|>",
432
+ "lstrip": false,
433
+ "normalized": false,
434
+ "rstrip": false,
435
+ "single_word": false,
436
+ "special": true
437
+ },
438
+ "54": {
439
+ "content": "<|reserved_token_49|>",
440
+ "lstrip": false,
441
+ "normalized": false,
442
+ "rstrip": false,
443
+ "single_word": false,
444
+ "special": true
445
+ },
446
+ "55": {
447
+ "content": "<|reserved_token_50|>",
448
+ "lstrip": false,
449
+ "normalized": false,
450
+ "rstrip": false,
451
+ "single_word": false,
452
+ "special": true
453
+ },
454
+ "56": {
455
+ "content": "<|reserved_token_51|>",
456
+ "lstrip": false,
457
+ "normalized": false,
458
+ "rstrip": false,
459
+ "single_word": false,
460
+ "special": true
461
+ },
462
+ "57": {
463
+ "content": "<|reserved_token_52|>",
464
+ "lstrip": false,
465
+ "normalized": false,
466
+ "rstrip": false,
467
+ "single_word": false,
468
+ "special": true
469
+ },
470
+ "58": {
471
+ "content": "<|reserved_token_53|>",
472
+ "lstrip": false,
473
+ "normalized": false,
474
+ "rstrip": false,
475
+ "single_word": false,
476
+ "special": true
477
+ },
478
+ "59": {
479
+ "content": "<|reserved_token_54|>",
480
+ "lstrip": false,
481
+ "normalized": false,
482
+ "rstrip": false,
483
+ "single_word": false,
484
+ "special": true
485
+ },
486
+ "60": {
487
+ "content": "<|reserved_token_55|>",
488
+ "lstrip": false,
489
+ "normalized": false,
490
+ "rstrip": false,
491
+ "single_word": false,
492
+ "special": true
493
+ },
494
+ "61": {
495
+ "content": "<|reserved_token_56|>",
496
+ "lstrip": false,
497
+ "normalized": false,
498
+ "rstrip": false,
499
+ "single_word": false,
500
+ "special": true
501
+ },
502
+ "62": {
503
+ "content": "<|reserved_token_57|>",
504
+ "lstrip": false,
505
+ "normalized": false,
506
+ "rstrip": false,
507
+ "single_word": false,
508
+ "special": true
509
+ },
510
+ "63": {
511
+ "content": "<|reserved_token_58|>",
512
+ "lstrip": false,
513
+ "normalized": false,
514
+ "rstrip": false,
515
+ "single_word": false,
516
+ "special": true
517
+ },
518
+ "64": {
519
+ "content": "<|reserved_token_59|>",
520
+ "lstrip": false,
521
+ "normalized": false,
522
+ "rstrip": false,
523
+ "single_word": false,
524
+ "special": true
525
+ },
526
+ "65": {
527
+ "content": "<|reserved_token_60|>",
528
+ "lstrip": false,
529
+ "normalized": false,
530
+ "rstrip": false,
531
+ "single_word": false,
532
+ "special": true
533
+ },
534
+ "66": {
535
+ "content": "<|reserved_token_61|>",
536
+ "lstrip": false,
537
+ "normalized": false,
538
+ "rstrip": false,
539
+ "single_word": false,
540
+ "special": true
541
+ },
542
+ "67": {
543
+ "content": "<|reserved_token_62|>",
544
+ "lstrip": false,
545
+ "normalized": false,
546
+ "rstrip": false,
547
+ "single_word": false,
548
+ "special": true
549
+ },
550
+ "68": {
551
+ "content": "<|reserved_token_63|>",
552
+ "lstrip": false,
553
+ "normalized": false,
554
+ "rstrip": false,
555
+ "single_word": false,
556
+ "special": true
557
+ },
558
+ "69": {
559
+ "content": "<|reserved_token_64|>",
560
+ "lstrip": false,
561
+ "normalized": false,
562
+ "rstrip": false,
563
+ "single_word": false,
564
+ "special": true
565
+ },
566
+ "70": {
567
+ "content": "<|reserved_token_65|>",
568
+ "lstrip": false,
569
+ "normalized": false,
570
+ "rstrip": false,
571
+ "single_word": false,
572
+ "special": true
573
+ },
574
+ "71": {
575
+ "content": "<|reserved_token_66|>",
576
+ "lstrip": false,
577
+ "normalized": false,
578
+ "rstrip": false,
579
+ "single_word": false,
580
+ "special": true
581
+ },
582
+ "72": {
583
+ "content": "<|reserved_token_67|>",
584
+ "lstrip": false,
585
+ "normalized": false,
586
+ "rstrip": false,
587
+ "single_word": false,
588
+ "special": true
589
+ },
590
+ "73": {
591
+ "content": "<|reserved_token_68|>",
592
+ "lstrip": false,
593
+ "normalized": false,
594
+ "rstrip": false,
595
+ "single_word": false,
596
+ "special": true
597
+ },
598
+ "74": {
599
+ "content": "<|reserved_token_69|>",
600
+ "lstrip": false,
601
+ "normalized": false,
602
+ "rstrip": false,
603
+ "single_word": false,
604
+ "special": true
605
+ },
606
+ "75": {
607
+ "content": "<|reserved_token_70|>",
608
+ "lstrip": false,
609
+ "normalized": false,
610
+ "rstrip": false,
611
+ "single_word": false,
612
+ "special": true
613
+ },
614
+ "76": {
615
+ "content": "<|reserved_token_71|>",
616
+ "lstrip": false,
617
+ "normalized": false,
618
+ "rstrip": false,
619
+ "single_word": false,
620
+ "special": true
621
+ },
622
+ "77": {
623
+ "content": "<|reserved_token_72|>",
624
+ "lstrip": false,
625
+ "normalized": false,
626
+ "rstrip": false,
627
+ "single_word": false,
628
+ "special": true
629
+ },
630
+ "78": {
631
+ "content": "<|reserved_token_73|>",
632
+ "lstrip": false,
633
+ "normalized": false,
634
+ "rstrip": false,
635
+ "single_word": false,
636
+ "special": true
637
+ },
638
+ "79": {
639
+ "content": "<|reserved_token_74|>",
640
+ "lstrip": false,
641
+ "normalized": false,
642
+ "rstrip": false,
643
+ "single_word": false,
644
+ "special": true
645
+ },
646
+ "80": {
647
+ "content": "<|reserved_token_75|>",
648
+ "lstrip": false,
649
+ "normalized": false,
650
+ "rstrip": false,
651
+ "single_word": false,
652
+ "special": true
653
+ },
654
+ "81": {
655
+ "content": "<|reserved_token_76|>",
656
+ "lstrip": false,
657
+ "normalized": false,
658
+ "rstrip": false,
659
+ "single_word": false,
660
+ "special": true
661
+ },
662
+ "82": {
663
+ "content": "<|reserved_token_77|>",
664
+ "lstrip": false,
665
+ "normalized": false,
666
+ "rstrip": false,
667
+ "single_word": false,
668
+ "special": true
669
+ },
670
+ "83": {
671
+ "content": "<|reserved_token_78|>",
672
+ "lstrip": false,
673
+ "normalized": false,
674
+ "rstrip": false,
675
+ "single_word": false,
676
+ "special": true
677
+ },
678
+ "84": {
679
+ "content": "<|reserved_token_79|>",
680
+ "lstrip": false,
681
+ "normalized": false,
682
+ "rstrip": false,
683
+ "single_word": false,
684
+ "special": true
685
+ },
686
+ "85": {
687
+ "content": "<|reserved_token_80|>",
688
+ "lstrip": false,
689
+ "normalized": false,
690
+ "rstrip": false,
691
+ "single_word": false,
692
+ "special": true
693
+ },
694
+ "86": {
695
+ "content": "<|reserved_token_81|>",
696
+ "lstrip": false,
697
+ "normalized": false,
698
+ "rstrip": false,
699
+ "single_word": false,
700
+ "special": true
701
+ },
702
+ "87": {
703
+ "content": "<|reserved_token_82|>",
704
+ "lstrip": false,
705
+ "normalized": false,
706
+ "rstrip": false,
707
+ "single_word": false,
708
+ "special": true
709
+ },
710
+ "88": {
711
+ "content": "<|reserved_token_83|>",
712
+ "lstrip": false,
713
+ "normalized": false,
714
+ "rstrip": false,
715
+ "single_word": false,
716
+ "special": true
717
+ },
718
+ "89": {
719
+ "content": "<|reserved_token_84|>",
720
+ "lstrip": false,
721
+ "normalized": false,
722
+ "rstrip": false,
723
+ "single_word": false,
724
+ "special": true
725
+ },
726
+ "90": {
727
+ "content": "<|reserved_token_85|>",
728
+ "lstrip": false,
729
+ "normalized": false,
730
+ "rstrip": false,
731
+ "single_word": false,
732
+ "special": true
733
+ },
734
+ "91": {
735
+ "content": "<|reserved_token_86|>",
736
+ "lstrip": false,
737
+ "normalized": false,
738
+ "rstrip": false,
739
+ "single_word": false,
740
+ "special": true
741
+ },
742
+ "92": {
743
+ "content": "<|reserved_token_87|>",
744
+ "lstrip": false,
745
+ "normalized": false,
746
+ "rstrip": false,
747
+ "single_word": false,
748
+ "special": true
749
+ },
750
+ "93": {
751
+ "content": "<|reserved_token_88|>",
752
+ "lstrip": false,
753
+ "normalized": false,
754
+ "rstrip": false,
755
+ "single_word": false,
756
+ "special": true
757
+ },
758
+ "94": {
759
+ "content": "<|reserved_token_89|>",
760
+ "lstrip": false,
761
+ "normalized": false,
762
+ "rstrip": false,
763
+ "single_word": false,
764
+ "special": true
765
+ },
766
+ "95": {
767
+ "content": "<|reserved_token_90|>",
768
+ "lstrip": false,
769
+ "normalized": false,
770
+ "rstrip": false,
771
+ "single_word": false,
772
+ "special": true
773
+ },
774
+ "96": {
775
+ "content": "<|reserved_token_91|>",
776
+ "lstrip": false,
777
+ "normalized": false,
778
+ "rstrip": false,
779
+ "single_word": false,
780
+ "special": true
781
+ },
782
+ "97": {
783
+ "content": "<|reserved_token_92|>",
784
+ "lstrip": false,
785
+ "normalized": false,
786
+ "rstrip": false,
787
+ "single_word": false,
788
+ "special": true
789
+ },
790
+ "98": {
791
+ "content": "<|reserved_token_93|>",
792
+ "lstrip": false,
793
+ "normalized": false,
794
+ "rstrip": false,
795
+ "single_word": false,
796
+ "special": true
797
+ },
798
+ "99": {
799
+ "content": "<|reserved_token_94|>",
800
+ "lstrip": false,
801
+ "normalized": false,
802
+ "rstrip": false,
803
+ "single_word": false,
804
+ "special": true
805
+ },
806
+ "100": {
807
+ "content": "<|reserved_token_95|>",
808
+ "lstrip": false,
809
+ "normalized": false,
810
+ "rstrip": false,
811
+ "single_word": false,
812
+ "special": true
813
+ },
814
+ "101": {
815
+ "content": "<|reserved_token_96|>",
816
+ "lstrip": false,
817
+ "normalized": false,
818
+ "rstrip": false,
819
+ "single_word": false,
820
+ "special": true
821
+ },
822
+ "102": {
823
+ "content": "<|reserved_token_97|>",
824
+ "lstrip": false,
825
+ "normalized": false,
826
+ "rstrip": false,
827
+ "single_word": false,
828
+ "special": true
829
+ },
830
+ "103": {
831
+ "content": "<|reserved_token_98|>",
832
+ "lstrip": false,
833
+ "normalized": false,
834
+ "rstrip": false,
835
+ "single_word": false,
836
+ "special": true
837
+ },
838
+ "104": {
839
+ "content": "\\r",
840
+ "lstrip": false,
841
+ "normalized": false,
842
+ "rstrip": false,
843
+ "single_word": false,
844
+ "special": false
845
+ },
846
+ "105": {
847
+ "content": "▁▁",
848
+ "lstrip": false,
849
+ "normalized": false,
850
+ "rstrip": false,
851
+ "single_word": false,
852
+ "special": false
853
+ },
854
+ "106": {
855
+ "content": "▁▁▁",
856
+ "lstrip": false,
857
+ "normalized": false,
858
+ "rstrip": false,
859
+ "single_word": false,
860
+ "special": false
861
+ },
862
+ "107": {
863
+ "content": "▁▁▁▁",
864
+ "lstrip": false,
865
+ "normalized": false,
866
+ "rstrip": false,
867
+ "single_word": false,
868
+ "special": false
869
+ },
870
+ "108": {
871
+ "content": "▁▁▁▁▁",
872
+ "lstrip": false,
873
+ "normalized": false,
874
+ "rstrip": false,
875
+ "single_word": false,
876
+ "special": false
877
+ },
878
+ "109": {
879
+ "content": "▁▁▁▁▁▁",
880
+ "lstrip": false,
881
+ "normalized": false,
882
+ "rstrip": false,
883
+ "single_word": false,
884
+ "special": false
885
+ },
886
+ "110": {
887
+ "content": "▁▁▁▁▁▁▁",
888
+ "lstrip": false,
889
+ "normalized": false,
890
+ "rstrip": false,
891
+ "single_word": false,
892
+ "special": false
893
+ },
894
+ "111": {
895
+ "content": "▁▁▁▁▁▁▁▁",
896
+ "lstrip": false,
897
+ "normalized": false,
898
+ "rstrip": false,
899
+ "single_word": false,
900
+ "special": false
901
+ },
902
+ "112": {
903
+ "content": "▁▁▁▁▁▁▁▁▁",
904
+ "lstrip": false,
905
+ "normalized": false,
906
+ "rstrip": false,
907
+ "single_word": false,
908
+ "special": false
909
+ },
910
+ "113": {
911
+ "content": "▁▁▁▁▁▁▁▁▁▁",
912
+ "lstrip": false,
913
+ "normalized": false,
914
+ "rstrip": false,
915
+ "single_word": false,
916
+ "special": false
917
+ },
918
+ "114": {
919
+ "content": "▁▁▁▁▁▁▁▁▁▁▁",
920
+ "lstrip": false,
921
+ "normalized": false,
922
+ "rstrip": false,
923
+ "single_word": false,
924
+ "special": false
925
+ },
926
+ "115": {
927
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁",
928
+ "lstrip": false,
929
+ "normalized": false,
930
+ "rstrip": false,
931
+ "single_word": false,
932
+ "special": false
933
+ },
934
+ "116": {
935
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁",
936
+ "lstrip": false,
937
+ "normalized": false,
938
+ "rstrip": false,
939
+ "single_word": false,
940
+ "special": false
941
+ },
942
+ "117": {
943
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
944
+ "lstrip": false,
945
+ "normalized": false,
946
+ "rstrip": false,
947
+ "single_word": false,
948
+ "special": false
949
+ },
950
+ "118": {
951
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
952
+ "lstrip": false,
953
+ "normalized": false,
954
+ "rstrip": false,
955
+ "single_word": false,
956
+ "special": false
957
+ },
958
+ "119": {
959
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
960
+ "lstrip": false,
961
+ "normalized": false,
962
+ "rstrip": false,
963
+ "single_word": false,
964
+ "special": false
965
+ },
966
+ "120": {
967
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
968
+ "lstrip": false,
969
+ "normalized": false,
970
+ "rstrip": false,
971
+ "single_word": false,
972
+ "special": false
973
+ },
974
+ "121": {
975
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
976
+ "lstrip": false,
977
+ "normalized": false,
978
+ "rstrip": false,
979
+ "single_word": false,
980
+ "special": false
981
+ },
982
+ "122": {
983
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
984
+ "lstrip": false,
985
+ "normalized": false,
986
+ "rstrip": false,
987
+ "single_word": false,
988
+ "special": false
989
+ },
990
+ "123": {
991
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
992
+ "lstrip": false,
993
+ "normalized": false,
994
+ "rstrip": false,
995
+ "single_word": false,
996
+ "special": false
997
+ },
998
+ "124": {
999
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1000
+ "lstrip": false,
1001
+ "normalized": false,
1002
+ "rstrip": false,
1003
+ "single_word": false,
1004
+ "special": false
1005
+ },
1006
+ "125": {
1007
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1008
+ "lstrip": false,
1009
+ "normalized": false,
1010
+ "rstrip": false,
1011
+ "single_word": false,
1012
+ "special": false
1013
+ },
1014
+ "126": {
1015
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1016
+ "lstrip": false,
1017
+ "normalized": false,
1018
+ "rstrip": false,
1019
+ "single_word": false,
1020
+ "special": false
1021
+ },
1022
+ "127": {
1023
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1024
+ "lstrip": false,
1025
+ "normalized": false,
1026
+ "rstrip": false,
1027
+ "single_word": false,
1028
+ "special": false
1029
+ },
1030
+ "128": {
1031
+ "content": "\t\t",
1032
+ "lstrip": false,
1033
+ "normalized": false,
1034
+ "rstrip": false,
1035
+ "single_word": false,
1036
+ "special": false
1037
+ },
1038
+ "129": {
1039
+ "content": "\t\t\t",
1040
+ "lstrip": false,
1041
+ "normalized": false,
1042
+ "rstrip": false,
1043
+ "single_word": false,
1044
+ "special": false
1045
+ },
1046
+ "130": {
1047
+ "content": "\t\t\t\t",
1048
+ "lstrip": false,
1049
+ "normalized": false,
1050
+ "rstrip": false,
1051
+ "single_word": false,
1052
+ "special": false
1053
+ },
1054
+ "131": {
1055
+ "content": "\t\t\t\t\t",
1056
+ "lstrip": false,
1057
+ "normalized": false,
1058
+ "rstrip": false,
1059
+ "single_word": false,
1060
+ "special": false
1061
+ },
1062
+ "132": {
1063
+ "content": "\t\t\t\t\t\t",
1064
+ "lstrip": false,
1065
+ "normalized": false,
1066
+ "rstrip": false,
1067
+ "single_word": false,
1068
+ "special": false
1069
+ },
1070
+ "133": {
1071
+ "content": "\n\n",
1072
+ "lstrip": false,
1073
+ "normalized": false,
1074
+ "rstrip": false,
1075
+ "single_word": false,
1076
+ "special": false
1077
+ },
1078
+ "134": {
1079
+ "content": "\n\n\n",
1080
+ "lstrip": false,
1081
+ "normalized": false,
1082
+ "rstrip": false,
1083
+ "single_word": false,
1084
+ "special": false
1085
+ },
1086
+ "256000": {
1087
+ "content": "<image>",
1088
+ "lstrip": false,
1089
+ "normalized": false,
1090
+ "rstrip": false,
1091
+ "single_word": false,
1092
+ "special": true
1093
+ }
1094
+ },
1095
+ "additional_special_tokens": [
1096
+ "<image>"
1097
+ ],
1098
+ "bos_token": "<s>",
1099
+ "clean_up_tokenization_spaces": false,
1100
+ "cls_token": "<s>",
1101
+ "eos_token": "</s>",
1102
+ "extra_special_tokens": {},
1103
+ "legacy": false,
1104
+ "local_files_only": true,
1105
+ "model_max_length": 1000000000000000019884624838656,
1106
+ "pad_token": "</s>",
1107
+ "sep_token": "</s>",
1108
+ "sp_model_kwargs": {},
1109
+ "spaces_between_special_tokens": false,
1110
+ "tokenizer_class": "LlamaTokenizer",
1111
+ "unk_token": "<unk>",
1112
+ "use_default_system_prompt": false
1113
+ }