|
|
|
|
|
|
|
|
""" |
|
|
Includes all available PE vision tower configurations, the text tower configurations paired with them, and a helper for resolving checkpoint paths.
|
|
""" |
|
|
|
|
|
from dataclasses import dataclass, replace |
|
|
|
|
|
from typing import Optional |
|
|
|
|
|
from huggingface_hub import hf_hub_download |
|
|
|
|
|
|
|
|
|
|
|
def fetch_pe_checkpoint(name: str, path: Optional[str] = None) -> str:
    """Resolve a PE checkpoint location to a local file path.

    Args:
        name: Model name. Only used when ``path`` is not given, to build the
            default location ``hf://facebook/{name}:{name}.pt``.
        path: Either a regular filesystem path, which is returned unchanged,
            or a hub URI of the form ``hf://<repo_id>:<filename>``. ``None``
            (or an empty string) selects the default location for ``name``.

    Returns:
        ``path`` itself when it does not start with ``hf://``; otherwise the
        value returned by ``hf_hub_download`` for the requested repo and file.

    Raises:
        ValueError: If an ``hf://`` URI is missing the repo id, the ``:``
            separator, or the filename.
    """
    hf_prefix = "hf://"
    path = path or f"hf://facebook/{name}:{name}.pt"

    if not path.startswith(hf_prefix):
        # Not a hub URI: hand the path back untouched, nothing to download.
        return path

    # Split on the first ":" only, so everything after it is kept as the
    # filename even if it contains further colons.
    repo, separator, filename = path[len(hf_prefix):].partition(":")
    if not (separator and repo and filename):
        raise ValueError(
            f"Invalid checkpoint URI {path!r}: "
            "expected the form 'hf://<repo_id>:<filename>'."
        )

    return hf_hub_download(repo_id=repo, filename=filename)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class PEConfig:
    """Vision Tower Config.

    Plain hyper-parameter record for one vision tower. The six leading fields
    have no default and must always be supplied; every field after them is
    optional. Field order defines the positional order of the generated
    ``__init__``, so new fields should be appended rather than inserted.

    NOTE(review): what each field means to the model is not visible in this
    file; the comments below only record what the configs defined here do
    with them. Confirm semantics against the code that consumes this config.
    """

    # Required fields (no defaults).
    patch_size: int
    width: int
    layers: int
    heads: int
    mlp_ratio: float
    # None is a legal value: configs in this file that use pool_type="none"
    # also pass output_dim=None.
    output_dim: Optional[int]

    # Defaults to None (unset), hence Optional; some configs in this file
    # override it with 0.1.
    ls_init_value: Optional[float] = None
    drop_path: float = 0.0

    # Must be a bare int: a trailing comma after the literal would silently
    # turn the default into the one-element tuple (224,).
    image_size: int = 224
    use_abs_posemb: bool = True
    use_cls_token: bool = False
    use_rope2d: bool = True

    # Configs in this file use either "attn" or "none" here.
    pool_type: str = "attn"
    attn_pooler_heads: int = 8

    use_ln_pre: bool = True
    use_ln_post: bool = True
|
|
|
|
|
|
|
|
@dataclass
class PETextConfig:
    """Text Tower Config.

    Hyper-parameter record for one text tower. ``context_length``, ``width``,
    ``heads``, ``layers`` and ``output_dim`` are required, in that positional
    order; ``mlp_ratio`` and ``vocab_size`` fall back to the defaults below.
    """

    # Required fields; their order is the positional order of __init__.
    context_length: int
    width: int
    heads: int
    layers: int

    output_dim: int

    # Optional fields; no config in this file overrides either of them.
    mlp_ratio: float = 4.0
    vocab_size: int = 49408
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Model-name -> config registries. Both start out empty and are two distinct
# dict objects: one for vision towers, one for text towers.
PE_VISION_CONFIG, PE_TEXT_CONFIG = {}, {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# PE-Core-G14-448: vision tower and its paired text tower.
PE_VISION_CONFIG["PE-Core-G14-448"] = PEConfig(
    # input geometry
    patch_size=14,
    image_size=448,
    # transformer size
    layers=50,
    width=1536,
    heads=16,
    mlp_ratio=8960 / 1536,  # ~5.83, kept as a fraction rather than a rounded literal
    # output head
    output_dim=1280,
    pool_type="attn",
    use_cls_token=False,
)
PE_TEXT_CONFIG["PE-Core-G14-448"] = PETextConfig(
    context_length=72,
    layers=24,
    width=1280,
    heads=20,
    output_dim=1280,
)
|
|
|
|
|
|
|
|
# PE-Core-L14-336: vision tower and its paired text tower.
PE_VISION_CONFIG["PE-Core-L14-336"] = PEConfig(
    # input geometry
    patch_size=14,
    image_size=336,
    # transformer size
    layers=24,
    width=1024,
    heads=16,
    mlp_ratio=4.0,
    # output head
    output_dim=1024,
    pool_type="attn",
    use_cls_token=True,
)
PE_TEXT_CONFIG["PE-Core-L14-336"] = PETextConfig(
    context_length=32,
    layers=24,
    width=1024,
    heads=16,
    output_dim=1024,
)
|
|
|
|
|
|
|
|
# PE-Core-B16-224: vision tower.
PE_VISION_CONFIG["PE-Core-B16-224"] = PEConfig(
    # input geometry
    patch_size=16,
    image_size=224,
    # transformer size
    layers=12,
    width=768,
    heads=12,
    mlp_ratio=4.0,
    # output head
    output_dim=1024,
    pool_type="attn",
    use_cls_token=True,
)
# The text tower is an alias, not a copy: both keys refer to the very same
# PETextConfig object registered for PE-Core-L14-336.
PE_TEXT_CONFIG["PE-Core-B16-224"] = PE_TEXT_CONFIG["PE-Core-L14-336"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# PE-Core-S16-384: vision tower and its paired text tower.
PE_VISION_CONFIG["PE-Core-S16-384"] = PEConfig(
    # input geometry
    patch_size=16,
    image_size=384,
    # transformer size
    layers=12,
    width=384,
    heads=6,
    mlp_ratio=4.0,
    # output head
    output_dim=512,
    pool_type="attn",
    use_cls_token=True,
)
PE_TEXT_CONFIG["PE-Core-S16-384"] = PETextConfig(
    context_length=32,
    layers=12,
    width=512,
    heads=8,
    output_dim=512,
)
|
|
|
|
|
|
|
|
|
|
|
# PE-Core-T16-384: vision tower.
PE_VISION_CONFIG["PE-Core-T16-384"] = PEConfig(
    # input geometry
    patch_size=16,
    image_size=384,
    # transformer size
    layers=12,
    width=192,
    heads=3,
    mlp_ratio=4.0,
    # output head
    output_dim=512,
    pool_type="attn",
    use_cls_token=True,
)
# The text tower is an alias, not a copy: both keys refer to the very same
# PETextConfig object registered for PE-Core-S16-384.
PE_TEXT_CONFIG["PE-Core-T16-384"] = PE_TEXT_CONFIG["PE-Core-S16-384"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# PE-Lang-G14-448: a modified copy of the PE-Core-G14-448 vision config.
# replace() builds a new PEConfig, so the Core entry itself is not altered.
PE_VISION_CONFIG["PE-Lang-G14-448"] = replace(
    PE_VISION_CONFIG["PE-Core-G14-448"],
    layers=47,  # the Core config it derives from has 50
    image_size=448,
    ls_init_value=0.1,
    # no pooling, no final layer norm, no output projection size
    pool_type="none",
    use_ln_post=False,
    output_dim=None,
)
|
|
|
|
|
# PE-Lang-L14-448: a modified copy of the PE-Core-L14-336 vision config.
# replace() builds a new PEConfig, so the Core entry itself is not altered.
PE_VISION_CONFIG["PE-Lang-L14-448"] = replace(
    PE_VISION_CONFIG["PE-Core-L14-336"],
    layers=23,  # the Core config it derives from has 24
    image_size=448,  # up from 336 in the Core config
    ls_init_value=0.1,
    # no pooling, no final layer norm, no output projection size
    pool_type="none",
    use_ln_post=False,
    output_dim=None,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# The "-Tiling" names are aliases: each maps to the very same PEConfig object
# as its non-tiling PE-Lang entry (no copy is made). Insertion order is G14
# first, then L14.
PE_VISION_CONFIG.update(
    {
        "PE-Lang-G14-448-Tiling": PE_VISION_CONFIG["PE-Lang-G14-448"],
        "PE-Lang-L14-448-Tiling": PE_VISION_CONFIG["PE-Lang-L14-448"],
    }
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# PE-Spatial-G14-448: a modified copy of the PE-Core-G14-448 vision config.
# It keeps the Core layer count and additionally sets ls_init_value.
PE_VISION_CONFIG["PE-Spatial-G14-448"] = replace(
    PE_VISION_CONFIG["PE-Core-G14-448"],
    image_size=448,
    ls_init_value=0.1,
    # no pooling, no final layer norm, no output projection size
    pool_type="none",
    use_ln_post=False,
    output_dim=None,
)
|
|
|
|
|
|
|
|
# The L14 / B16 / S16 / T16 PE-Spatial variants apply the same overrides to
# their PE-Core counterpart and differ only in the base config and the image
# size, so they are generated from a small table. Row order fixes the
# registry insertion order (L14, B16, S16, T16). Every base is a PE-Core
# entry, so no row depends on another row of this table.
PE_VISION_CONFIG.update(
    {
        spatial_name: replace(
            PE_VISION_CONFIG[core_name],
            image_size=image_size,
            pool_type="none",
            use_ln_post=False,
            output_dim=None,
        )
        for spatial_name, core_name, image_size in (
            ("PE-Spatial-L14-448", "PE-Core-L14-336", 448),
            ("PE-Spatial-B16-512", "PE-Core-B16-224", 512),
            ("PE-Spatial-S16-512", "PE-Core-S16-384", 512),
            ("PE-Spatial-T16-512", "PE-Core-T16-384", 512),
        )
    }
)
|
|
|