Spaces:

Antuke
/

FaR-FT-PE

Sleeping

App Files Files Community

FaR-FT-PE / core /vision_encoder /config.py

Antuke

init

c69c4af about 1 month ago

raw

history blame contribute delete

5.11 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.

	"""
	Include all available vision encoder configurations.
	"""

	from dataclasses import dataclass, replace

	from typing import Optional

	from huggingface_hub import hf_hub_download



	def fetch_pe_checkpoint(name: str, path: Optional[str] = None) -> str:
	    """Resolve a PE checkpoint location to something loadable.

	    Args:
	        name: Checkpoint name. Only used when ``path`` is empty or ``None``,
	            in which case the location defaults to
	            ``hf://facebook/{name}:{name}.pt``.
	        path: Optional checkpoint location. A value of the form
	            ``hf://<repo_id>:<filename>`` is fetched with
	            ``hf_hub_download``; any other value is returned unchanged.

	    Returns:
	        The result of ``hf_hub_download`` for ``hf://`` locations, otherwise
	        ``path`` itself.

	    Raises:
	        ValueError: If an ``hf://`` location has no ``:`` separator, or an
	            empty repo id or filename.
	    """
	    hf_scheme = "hf://"
	    path = path or f"hf://facebook/{name}:{name}.pt"

	    if not path.startswith(hf_scheme):
	        # Not a hub reference: hand the caller's path back untouched.
	        return path

	    # Split on the FIRST colon only, so a filename that itself contains ':'
	    # no longer breaks the two-way unpacking.
	    repo, separator, filename = path[len(hf_scheme):].partition(":")
	    if not (separator and repo and filename):
	        raise ValueError(
	            f"Malformed checkpoint location {hf_scheme + repo + separator + filename!r}; "
	            "expected 'hf://<repo_id>:<filename>'."
	        )

	    return hf_hub_download(repo_id=repo, filename=filename)




	@dataclass
	class PEConfig:
	    """Vision Tower Config.

	    Plain record of the hyperparameters that describe one vision tower
	    preset. The first six fields have no default and must always be given;
	    the remaining fields fall back to the defaults below.
	    """

	    # Required fields (no defaults).
	    patch_size: int
	    width: int
	    layers: int
	    heads: int
	    mlp_ratio: float
	    # May be None: the presets in this module that set pool_type="none"
	    # also set output_dim=None.
	    output_dim: Optional[int]

	    # Left at None unless a preset overrides it (some presets use 0.1).
	    # NOTE(review): presumably the layer-scale init value - confirm in the model code.
	    ls_init_value: Optional[float] = None
	    drop_path: float = 0.0

	    # Must be a plain int. A trailing comma after the literal would turn the
	    # default into the one-element tuple (224,).
	    image_size: int = 224
	    use_abs_posemb: bool = True
	    use_cls_token: bool = False
	    use_rope2d: bool = True

	    # The presets in this module use "attn" or "none".
	    pool_type: str = "attn"
	    attn_pooler_heads: int = 8

	    use_ln_pre: bool = True
	    use_ln_post: bool = True


	@dataclass
	class PETextConfig:
	    """Text Tower Config.

	    Hyperparameter record for one text tower preset. The five leading
	    fields are mandatory; ``mlp_ratio`` and ``vocab_size`` have defaults.
	    """

	    # Mandatory fields - every preset has to supply these.
	    context_length: int
	    width: int
	    heads: int
	    layers: int
	    output_dim: int

	    # Optional fields with defaults.
	    mlp_ratio: float = 4.0
	    vocab_size: int = 49_408




	# Preset registries, keyed by model name (e.g. "PE-Core-L14-336").
	# They start empty and are filled in by the assignments further down.
	PE_VISION_CONFIG: "dict[str, PEConfig]" = {}
	PE_TEXT_CONFIG: "dict[str, PETextConfig]" = {}



	#########################################
	#               PE CORE                 #
	#########################################
	# Each Core model registers a vision tower and a text tower under the same
	# key. Keyword arguments follow the field order of the config dataclasses.

	# --- G14 @ 448 ---
	PE_VISION_CONFIG["PE-Core-G14-448"] = PEConfig(
	    patch_size=14,
	    width=1536,
	    layers=50,
	    heads=16,
	    mlp_ratio=8960 / 1536,
	    output_dim=1280,
	    image_size=448,
	    use_cls_token=False,
	    pool_type="attn",
	)
	PE_TEXT_CONFIG["PE-Core-G14-448"] = PETextConfig(
	    context_length=72,
	    width=1280,
	    heads=20,
	    layers=24,
	    output_dim=1280,
	)

	# --- L14 @ 336 ---
	PE_VISION_CONFIG["PE-Core-L14-336"] = PEConfig(
	    patch_size=14,
	    width=1024,
	    layers=24,
	    heads=16,
	    mlp_ratio=4.0,
	    output_dim=1024,
	    image_size=336,
	    use_cls_token=True,
	    pool_type="attn",
	)
	PE_TEXT_CONFIG["PE-Core-L14-336"] = PETextConfig(
	    context_length=32,
	    width=1024,
	    heads=16,
	    layers=24,
	    output_dim=1024,
	)

	# --- B16 @ 224 ---
	PE_VISION_CONFIG["PE-Core-B16-224"] = PEConfig(
	    patch_size=16,
	    width=768,
	    layers=12,
	    heads=12,
	    mlp_ratio=4.0,
	    output_dim=1024,
	    image_size=224,
	    use_cls_token=True,
	    pool_type="attn",
	)
	# Shares the very same text config object as L14-336 (an alias, not a copy).
	PE_TEXT_CONFIG["PE-Core-B16-224"] = PE_TEXT_CONFIG["PE-Core-L14-336"]

	# --- S16 @ 384 ---
	PE_VISION_CONFIG["PE-Core-S16-384"] = PEConfig(
	    patch_size=16,
	    width=384,
	    layers=12,
	    heads=6,
	    mlp_ratio=4.0,
	    output_dim=512,
	    image_size=384,
	    use_cls_token=True,
	    pool_type="attn",
	)
	PE_TEXT_CONFIG["PE-Core-S16-384"] = PETextConfig(
	    context_length=32,
	    width=512,
	    heads=8,
	    layers=12,
	    output_dim=512,
	)

	# --- T16 @ 384 ---
	PE_VISION_CONFIG["PE-Core-T16-384"] = PEConfig(
	    patch_size=16,
	    width=192,
	    layers=12,
	    heads=3,
	    mlp_ratio=4.0,
	    output_dim=512,
	    image_size=384,
	    use_cls_token=True,
	    pool_type="attn",
	)
	# Shares the very same text config object as S16-384 (an alias, not a copy).
	PE_TEXT_CONFIG["PE-Core-T16-384"] = PE_TEXT_CONFIG["PE-Core-S16-384"]







	#########################################
	#               PE Lang                 #
	#########################################
	# Lang towers are copies of the Core vision presets with pooling and the
	# post layer norm switched off, no output projection, ls_init_value=0.1,
	# and a reduced layer count (47 vs. 50 for G14, 23 vs. 24 for L14).

	PE_VISION_CONFIG["PE-Lang-G14-448"] = replace(
	    PE_VISION_CONFIG["PE-Core-G14-448"],
	    layers=47,
	    ls_init_value=0.1,
	    output_dim=None,
	    use_ln_post=False,
	    pool_type="none",
	    image_size=448,
	)

	PE_VISION_CONFIG["PE-Lang-L14-448"] = replace(
	    PE_VISION_CONFIG["PE-Core-L14-336"],
	    layers=23,
	    ls_init_value=0.1,
	    output_dim=None,
	    use_ln_post=False,
	    pool_type="none",
	    image_size=448,
	)

	# Stage 2 checkpoints for PLM-8B and PLM-3B respectively. Pretrained with tiling.
	# Use these checkpoints if you're building a model that uses tiling downstream!
	# Both entries reuse the non-tiling config objects (aliases, not copies).
	PE_VISION_CONFIG.update(
	    {
	        "PE-Lang-G14-448-Tiling": PE_VISION_CONFIG["PE-Lang-G14-448"],
	        "PE-Lang-L14-448-Tiling": PE_VISION_CONFIG["PE-Lang-L14-448"],
	    }
	)








	#########################################
	#              PE Spatial               #
	#########################################
	# Spatial towers are copies of the Core vision presets with pooling and
	# the post layer norm switched off and no output projection.

	# The G14 model is the only spatial preset that sets ls_init_value.
	PE_VISION_CONFIG["PE-Spatial-G14-448"] = replace(
	    PE_VISION_CONFIG["PE-Core-G14-448"],
	    ls_init_value=0.1,
	    output_dim=None,
	    use_ln_post=False,
	    pool_type="none",
	    image_size=448,
	)

	# No layerscale on the smaller spatial models: ls_init_value is inherited
	# unchanged from the corresponding Core preset.
	# Table rows are (spatial key, source Core key, image size), in registration order.
	PE_VISION_CONFIG.update(
	    {
	        spatial_name: replace(
	            PE_VISION_CONFIG[core_name],
	            image_size=resolution,
	            pool_type="none",
	            use_ln_post=False,
	            output_dim=None,
	        )
	        for spatial_name, core_name, resolution in (
	            ("PE-Spatial-L14-448", "PE-Core-L14-336", 448),
	            ("PE-Spatial-B16-512", "PE-Core-B16-224", 512),
	            ("PE-Spatial-S16-512", "PE-Core-S16-384", 512),
	            ("PE-Spatial-T16-512", "PE-Core-T16-384", 512),
	        )
	    }
	)