| { |
| "architectures": [ |
| "VTPModel" |
| ], |
| "decoder_depth": 12, |
| "decoder_embed_dim": 384, |
| "decoder_ffn_layer": "swiglu", |
| "decoder_init_values": null, |
| "decoder_norm_layer": "layernorm", |
| "decoder_num_heads": 6, |
| "decoder_use_qk_norm": false, |
| "image_size": 256, |
| "init_logit_bias": null, |
| "init_logit_scale": null, |
| "model_type": "vtp", |
| "nonscalar_logit_scale": false, |
| "text_context_length": 77, |
| "text_depth": 12, |
| "text_embed_cls": false, |
| "text_embed_dim": 768, |
| "text_ls_init_value": null, |
| "text_mlp_ratio": 4.0, |
| "text_no_causal_mask": false, |
| "text_num_heads": 12, |
| "text_output_tokens": false, |
| "text_pad_id": 0, |
| "text_pool_type": "argmax", |
| "text_proj_bias": false, |
| "text_proj_type": "linear", |
| "text_quick_gelu": false, |
| "text_vocab_size": 49408, |
| "torch_dtype": "float32", |
| "train_clip": true, |
| "train_reconstruction": true, |
| "transformers_version": "4.55.4", |
| "vision_bottleneck_ae_only": true, |
| "vision_clip_feat": "cls", |
| "vision_depth": 12, |
| "vision_embed_dim": 384, |
| "vision_feature_bottleneck": 64, |
| "vision_ffn_layer": "swiglu", |
| "vision_init_values": null, |
| "vision_mlp_ratio": 4, |
| "vision_norm_layer": "rmsnorm", |
| "vision_num_heads": 6, |
| "vision_patch_size": 16, |
| "vision_use_qk_norm": false |
| } |
|
|