# Copyright (c) Meta Platforms, Inc. and affiliates.
"""
Include all available vision encoder configurations.
"""
from dataclasses import dataclass, replace
from typing import Optional
from huggingface_hub import hf_hub_download


def fetch_pe_checkpoint(name: str, path: Optional[str] = None) -> str:
    """Resolve a PE checkpoint to a local file path.

    If no explicit path is given, defaults to the Hugging Face Hub using the
    "hf://<repo>:<filename>" convention.
    """
    path = path or f"hf://facebook/{name}:{name}.pt"

    if path.startswith("hf://"):
        # Download (or reuse the cached copy) from the Hugging Face Hub.
        path = path[len("hf://"):]
        repo, file = path.split(":")
        return hf_hub_download(repo_id=repo, filename=file)
    else:
        return path
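
# Example usage (a sketch; "PE-Core-B16-224" is one of the configs registered
# below, and the second call just illustrates the local-path passthrough):
#
#   ckpt = fetch_pe_checkpoint("PE-Core-B16-224")              # fetched from HF
#   ckpt = fetch_pe_checkpoint("unused", "/path/to/local.pt")  # returns the path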


@dataclass
class PEConfig:
    """Vision Tower Config."""

    patch_size: int
    width: int
    layers: int
    heads: int
    mlp_ratio: float
    output_dim: Optional[int]
    ls_init_value: Optional[float] = None
    drop_path: float = 0.0
    image_size: int = 224
    use_abs_posemb: bool = True
    use_cls_token: bool = False
    use_rope2d: bool = True
    pool_type: str = "attn"
    attn_pooler_heads: int = 8
    use_ln_pre: bool = True
    use_ln_post: bool = True


@dataclass
class PETextConfig:
    """Text Tower Config."""

    context_length: int
    width: int
    heads: int
    layers: int
    output_dim: int
    mlp_ratio: float = 4.0
    vocab_size: int = 49408


PE_VISION_CONFIG: dict[str, PEConfig] = {}
PE_TEXT_CONFIG: dict[str, PETextConfig] = {}
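
# A minimal sketch of intended use (`build_vision_tower` is a hypothetical
# downstream constructor; only the registries and `fetch_pe_checkpoint` are
# defined in this module):
#
#   cfg = PE_VISION_CONFIG["PE-Core-L14-336"]
#   ckpt = fetch_pe_checkpoint("PE-Core-L14-336")
#   model = build_vision_tower(cfg)
#   model.load_state_dict(torch.load(ckpt, map_location="cpu"))
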
#########################################
# PE CORE #
#########################################
PE_VISION_CONFIG["PE-Core-G14-448"] = PEConfig(
image_size=448,
patch_size=14,
width=1536,
layers=50,
heads=16,
mlp_ratio=8960 / 1536,
pool_type="attn",
output_dim=1280,
use_cls_token=False,
)
PE_TEXT_CONFIG["PE-Core-G14-448"] = PETextConfig(
context_length=72,
width=1280,
heads=20,
layers=24,
output_dim=1280
)
PE_VISION_CONFIG["PE-Core-L14-336"] = PEConfig(
image_size=336,
patch_size=14,
width=1024,
layers=24,
heads=16,
mlp_ratio=4.0,
pool_type="attn",
output_dim=1024,
use_cls_token=True,
)
PE_TEXT_CONFIG["PE-Core-L14-336"] = PETextConfig(
context_length=32,
width=1024,
heads=16,
layers=24,
output_dim=1024
)
PE_VISION_CONFIG["PE-Core-B16-224"] = PEConfig(
image_size=224,
patch_size=16,
width=768,
layers=12,
heads=12,
mlp_ratio=4.0,
pool_type="attn",
output_dim=1024,
use_cls_token=True,
)
PE_TEXT_CONFIG["PE-Core-B16-224"] = PE_TEXT_CONFIG["PE-Core-L14-336"]
PE_VISION_CONFIG["PE-Core-S16-384"] = PEConfig(
image_size=384,
patch_size=16,
width=384,
layers=12,
heads=6,
mlp_ratio=4.0,
pool_type="attn",
output_dim=512,
use_cls_token=True,
)
PE_TEXT_CONFIG["PE-Core-S16-384"] = PETextConfig(
context_length=32,
width=512,
heads=8,
layers=12,
output_dim=512
)
PE_VISION_CONFIG["PE-Core-T16-384"] = PEConfig(
image_size=384,
patch_size=16,
width=192,
layers=12,
heads=3,
mlp_ratio=4.0,
pool_type="attn",
output_dim=512,
use_cls_token=True,
)
PE_TEXT_CONFIG["PE-Core-T16-384"] = PE_TEXT_CONFIG["PE-Core-S16-384"]
#########################################
# PE Lang #
#########################################
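# PE-Lang towers are derived from the PE-Core ones via `dataclasses.replace`:
# the attention pooler, final LayerNorm, and output projection are dropped,
# layerscale is enabled, and the last transformer layers are truncated
# (50 -> 47 for G14, 24 -> 23 for L14).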
PE_VISION_CONFIG["PE-Lang-G14-448"] = replace(
PE_VISION_CONFIG["PE-Core-G14-448"],
image_size=448,
pool_type="none",
use_ln_post=False,
output_dim=None,
ls_init_value=0.1,
layers=47,
)
PE_VISION_CONFIG["PE-Lang-L14-448"] = replace(
PE_VISION_CONFIG["PE-Core-L14-336"],
image_size=448,
pool_type="none",
use_ln_post=False,
output_dim=None,
ls_init_value=0.1,
layers=23
)
# Stage 2 checkpoints for PLM-8B and PLM-3B respectively. Pretrained with tiling.
# Use these checkpoints if you're building a model that uses tiling downstream!
PE_VISION_CONFIG["PE-Lang-G14-448-Tiling"] = PE_VISION_CONFIG["PE-Lang-G14-448"]
PE_VISION_CONFIG["PE-Lang-L14-448-Tiling"] = PE_VISION_CONFIG["PE-Lang-L14-448"]
#########################################
# PE Spatial #
#########################################
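# PE-Spatial towers likewise reuse the PE-Core configs via `replace`, removing
# pooling, the final LayerNorm, and the output projection so that per-patch
# features are exposed; unlike PE-Lang, no transformer layers are truncated.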
PE_VISION_CONFIG["PE-Spatial-G14-448"] = replace(
PE_VISION_CONFIG["PE-Core-G14-448"],
image_size=448,
pool_type="none",
use_ln_post=False,
output_dim=None,
ls_init_value=0.1,
)
# No layerscale on the smaller spatial models
PE_VISION_CONFIG["PE-Spatial-L14-448"] = replace(
PE_VISION_CONFIG["PE-Core-L14-336"],
image_size=448,
pool_type="none",
use_ln_post=False,
output_dim=None,
)
PE_VISION_CONFIG["PE-Spatial-B16-512"] = replace(
PE_VISION_CONFIG["PE-Core-B16-224"],
image_size=512,
pool_type="none",
use_ln_post=False,
output_dim=None,
)
PE_VISION_CONFIG["PE-Spatial-S16-512"] = replace(
PE_VISION_CONFIG["PE-Core-S16-384"],
image_size=512,
pool_type="none",
use_ln_post=False,
output_dim=None,
)
PE_VISION_CONFIG["PE-Spatial-T16-512"] = replace(
PE_VISION_CONFIG["PE-Core-T16-384"],
image_size=512,
pool_type="none",
use_ln_post=False,
output_dim=None,
)
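

if __name__ == "__main__":
    # A quick sanity check (an illustrative sketch): print every registered
    # vision config and verify that each CLIP-style PE-Core model pairs with
    # a text tower of matching embedding dimension.
    for name, cfg in PE_VISION_CONFIG.items():
        print(f"{name}: image_size={cfg.image_size}, width={cfg.width}, layers={cfg.layers}")
    for name, text_cfg in PE_TEXT_CONFIG.items():
        assert PE_VISION_CONFIG[name].output_dim == text_cfg.output_dim, name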