# Source: Hugging Face Space "Antuke/FaR-FT-PE" (Space status: Sleeping)
# File size: 5,112 bytes
# Revision: c69c4af

# Copyright (c) Meta Platforms, Inc. and affiliates.

"""
Configurations for all available PE vision and text encoders, plus a helper
for locating their checkpoints.
"""

from dataclasses import dataclass, replace

from typing import Optional

from huggingface_hub import hf_hub_download



def fetch_pe_checkpoint(name: str, path: Optional[str] = None) -> str:
    """Resolve a PE checkpoint reference to a file path.

    Args:
        name: Checkpoint name. Only used when ``path`` is ``None`` or empty,
            in which case the reference defaults to
            ``hf://facebook/{name}:{name}.pt``.
        path: Either a plain path, which is returned unchanged, or a
            Hugging Face Hub reference of the form
            ``hf://<repo_id>:<filename>``.

    Returns:
        ``path`` itself when it does not start with ``hf://``; otherwise the
        value returned by ``hf_hub_download`` for the referenced file.

    Raises:
        ValueError: If an ``hf://`` reference does not contain exactly one
            ``:`` separating a non-empty repo id from a non-empty filename.
    """
    hf_prefix = "hf://"
    location = path or f"hf://facebook/{name}:{name}.pt"

    # Anything without the hf:// scheme is treated as an already-usable path.
    if not location.startswith(hf_prefix):
        return location

    # Validate the "<repo_id>:<filename>" shape up front so a typo produces a
    # readable message instead of a bare tuple-unpacking error.
    parts = location[len(hf_prefix):].split(":")
    if len(parts) != 2 or not all(parts):
        raise ValueError(
            f"Invalid Hugging Face checkpoint reference {location!r}; "
            "expected 'hf://<repo_id>:<filename>'"
        )

    repo_id, filename = parts
    return hf_hub_download(repo_id=repo_id, filename=filename)




@dataclass
class PEConfig:
    """Vision tower config.

    The first six fields have no default and must always be supplied; the
    remaining fields are optional.
    """
    # Required fields.
    patch_size: int
    width: int
    layers: int
    heads: int
    mlp_ratio: float
    # May be None; the presets in this file that set pool_type="none" also
    # set output_dim=None.
    output_dim: Optional[int]

    # None by default; presumably the layer-scale init value, with None
    # meaning "no layer scale" -- confirm against the model code.
    ls_init_value: Optional[float] = None
    drop_path: float = 0.0

    # Plain int default. A trailing comma here would make the default the
    # tuple (224,) instead of 224.
    image_size: int = 224
    use_abs_posemb: bool = True
    use_cls_token: bool = False
    use_rope2d: bool = True

    # The presets in this file use "attn" or "none".
    pool_type: str = "attn"
    attn_pooler_heads: int = 8

    use_ln_pre: bool = True
    use_ln_post: bool = True


@dataclass
class PETextConfig:
    """Text tower config."""

    # Required fields (no defaults); their positional order is part of the
    # constructor interface.
    context_length: int
    width: int
    heads: int
    layers: int
    output_dim: int

    # Optional fields.
    mlp_ratio: float = 4.0
    vocab_size: int = 49_408




# Registries keyed by model name (e.g. "PE-Core-L14-336"); filled in below.
# Annotations are strings so they are not evaluated at import time.
PE_VISION_CONFIG: "dict[str, PEConfig]" = {}
PE_TEXT_CONFIG: "dict[str, PETextConfig]" = {}



#########################################
#                PE CORE                #
#########################################

# Each PE Core model registers a vision config and a text config under the
# same key. Keyword arguments follow the dataclass field order.

# --- G14 @ 448 ---
PE_VISION_CONFIG["PE-Core-G14-448"] = PEConfig(
    patch_size=14,
    width=1536,
    layers=50,
    heads=16,
    mlp_ratio=8960 / 1536,
    output_dim=1280,
    image_size=448,
    use_cls_token=False,
    pool_type="attn",
)
PE_TEXT_CONFIG["PE-Core-G14-448"] = PETextConfig(
    context_length=72,
    width=1280,
    heads=20,
    layers=24,
    output_dim=1280,
)


# --- L14 @ 336 ---
PE_VISION_CONFIG["PE-Core-L14-336"] = PEConfig(
    patch_size=14,
    width=1024,
    layers=24,
    heads=16,
    mlp_ratio=4.0,
    output_dim=1024,
    image_size=336,
    use_cls_token=True,
    pool_type="attn",
)
PE_TEXT_CONFIG["PE-Core-L14-336"] = PETextConfig(
    context_length=32,
    width=1024,
    heads=16,
    layers=24,
    output_dim=1024,
)


# --- B16 @ 224 ---
PE_VISION_CONFIG["PE-Core-B16-224"] = PEConfig(
    patch_size=16,
    width=768,
    layers=12,
    heads=12,
    mlp_ratio=4.0,
    output_dim=1024,
    image_size=224,
    use_cls_token=True,
    pool_type="attn",
)
# B16 reuses the L14 text config (the very same object, not a copy).
PE_TEXT_CONFIG["PE-Core-B16-224"] = PE_TEXT_CONFIG["PE-Core-L14-336"]


# --- S16 @ 384 ---
PE_VISION_CONFIG["PE-Core-S16-384"] = PEConfig(
    patch_size=16,
    width=384,
    layers=12,
    heads=6,
    mlp_ratio=4.0,
    output_dim=512,
    image_size=384,
    use_cls_token=True,
    pool_type="attn",
)
PE_TEXT_CONFIG["PE-Core-S16-384"] = PETextConfig(
    context_length=32,
    width=512,
    heads=8,
    layers=12,
    output_dim=512,
)


# --- T16 @ 384 ---
PE_VISION_CONFIG["PE-Core-T16-384"] = PEConfig(
    patch_size=16,
    width=192,
    layers=12,
    heads=3,
    mlp_ratio=4.0,
    output_dim=512,
    image_size=384,
    use_cls_token=True,
    pool_type="attn",
)
# T16 reuses the S16 text config (the very same object, not a copy).
PE_TEXT_CONFIG["PE-Core-T16-384"] = PE_TEXT_CONFIG["PE-Core-S16-384"]







#########################################
#                PE Lang                #
#########################################

# PE Lang vision configs are derived from the matching PE Core config with
# pooling, the post layer norm and the output projection dim turned off,
# ls_init_value set, and a reduced layer count.
PE_VISION_CONFIG["PE-Lang-G14-448"] = replace(
    PE_VISION_CONFIG["PE-Core-G14-448"],
    layers=47,
    output_dim=None,
    ls_init_value=0.1,
    image_size=448,
    pool_type="none",
    use_ln_post=False,
)

PE_VISION_CONFIG["PE-Lang-L14-448"] = replace(
    PE_VISION_CONFIG["PE-Core-L14-336"],
    layers=23,
    output_dim=None,
    ls_init_value=0.1,
    image_size=448,
    pool_type="none",
    use_ln_post=False,
)


# Stage 2 checkpoints for PLM-8B and PLM-3B respectively, pretrained with tiling.
# Use these checkpoints if you are building a model that uses tiling downstream.
# Both keys point at the same config objects as their non-tiling counterparts.
PE_VISION_CONFIG["PE-Lang-G14-448-Tiling"] = PE_VISION_CONFIG["PE-Lang-G14-448"]
PE_VISION_CONFIG["PE-Lang-L14-448-Tiling"] = PE_VISION_CONFIG["PE-Lang-L14-448"]








#########################################
#               PE Spatial              #
#########################################

# PE Spatial vision configs are derived from the matching PE Core config with
# pooling, the post layer norm and the output projection dim turned off.
# Only the G14 variant sets ls_init_value.
PE_VISION_CONFIG["PE-Spatial-G14-448"] = replace(
    PE_VISION_CONFIG["PE-Core-G14-448"],
    output_dim=None,
    ls_init_value=0.1,
    image_size=448,
    pool_type="none",
    use_ln_post=False,
)

# No layerscale on the smaller spatial models
PE_VISION_CONFIG["PE-Spatial-L14-448"] = replace(
    PE_VISION_CONFIG["PE-Core-L14-336"],
    output_dim=None,
    image_size=448,
    pool_type="none",
    use_ln_post=False,
)


PE_VISION_CONFIG["PE-Spatial-B16-512"] = replace(
    PE_VISION_CONFIG["PE-Core-B16-224"],
    output_dim=None,
    image_size=512,
    pool_type="none",
    use_ln_post=False,
)


PE_VISION_CONFIG["PE-Spatial-S16-512"] = replace(
    PE_VISION_CONFIG["PE-Core-S16-384"],
    output_dim=None,
    image_size=512,
    pool_type="none",
    use_ln_post=False,
)


PE_VISION_CONFIG["PE-Spatial-T16-512"] = replace(
    PE_VISION_CONFIG["PE-Core-T16-384"],
    output_dim=None,
    image_size=512,
    pool_type="none",
    use_ln_post=False,
)