Upload deepseek_python_20250816_b5cee4.py
Python file for model creation, created by DeepSeek AI for guidance and development
deepseek_python_20250816_b5cee4.py
ADDED
@@ -0,0 +1,55 @@
import torch
import torch.nn as nn
from transformers import DistilBertModel
from diffusers import UNet2DConditionModel


class VideoJEPA(nn.Module):
    def __init__(self, text_dim=768, video_dim=512, latent_dim=1024):
        super().__init__()

        # Video Encoder (Hierarchical 3D CNN)
        self.video_encoder = nn.Sequential(
            nn.Conv3d(3, 64, kernel_size=(3, 5, 5), stride=(1, 2, 2)),
            nn.ReLU(),
            nn.MaxPool3d((1, 2, 2)),
            nn.Conv3d(64, 128, kernel_size=(3, 3, 3)),
            nn.ReLU(),
            nn.AdaptiveAvgPool3d((None, 8, 8))
        )
        self.video_proj = nn.Linear(128 * 8 * 8, video_dim)
        # Added projection: video tokens must be in latent_dim before they can be
        # concatenated with the text embedding and fed to the fusion transformer.
        self.video_to_latent = nn.Linear(video_dim, latent_dim)

        # Text Encoder (DistilBERT)
        self.text_encoder = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.text_proj = nn.Linear(text_dim, latent_dim)

        # Joint Fusion Transformer (batch_first so inputs are [B, seq, dim])
        self.fusion_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=latent_dim, nhead=8, batch_first=True),
            num_layers=4
        )

        # Diffusion Decoder (Conditional UNet); constructed for downstream frame
        # generation but not called inside forward().
        self.diffusion_decoder = UNet2DConditionModel(
            sample_size=64,
            in_channels=3,
            out_channels=3,
            cross_attention_dim=latent_dim
        )

    def forward(self, video, text_input):
        # Video encoding: video is [B, C, T, H, W]
        B, C, T, H, W = video.shape
        video_features = self.video_encoder(video)  # [B, 128, T', 8, 8]
        video_features = video_features.permute(0, 2, 1, 3, 4).contiguous()
        # The un-padded 3D convolutions shrink the temporal axis, so read the
        # effective frame count T' from the tensor instead of reusing T.
        T_out = video_features.shape[1]
        video_features = video_features.view(B * T_out, -1)
        video_emb = self.video_proj(video_features)
        video_emb = self.video_to_latent(video_emb).view(B, T_out, -1)

        # Text encoding
        text_emb = self.text_encoder(**text_input).last_hidden_state
        text_emb = self.text_proj(text_emb[:, 0])  # [CLS] token

        # Contextual fusion: append the text token to the video token sequence
        fused_emb = torch.cat([video_emb, text_emb.unsqueeze(1)], dim=1)
        context_emb = self.fusion_transformer(fused_emb)

        return context_emb
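
For orientation, a minimal smoke test of the module above is sketched below. It is illustrative only: the DistilBertTokenizer checkpoint name, the 2-clip batch, and the 8-frame 64x64 dummy video are assumptions rather than part of the uploaded file, and running it downloads the pretrained DistilBERT weights while the UNet decoder stays randomly initialized.

import torch
from transformers import DistilBertTokenizer

# Hypothetical smoke test; shapes are illustrative, not from the uploaded file.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = VideoJEPA().eval()

video = torch.randn(2, 3, 8, 64, 64)  # [B, C, T, H, W] dummy clip
text_input = tokenizer(
    ["a cat jumps onto a table", "waves crash on a beach"],
    return_tensors="pt",
    padding=True,
)

with torch.no_grad():
    context_emb = model(video, text_input)

# One latent token per surviving frame plus one text token:
# T=8 shrinks to T'=4 after the un-padded 3D convolutions, so the
# expected shape is torch.Size([2, 5, 1024]).
print(context_emb.shape)

The diffusion_decoder is built but never called in forward(), so context_emb presumably serves as its cross-attention conditioning downstream. A single denoising call could then look like the following; this usage is an assumption, not something documented in the uploaded file.

# Assumed conditioning of the unused diffusion decoder on the fused context.
noisy_frame = torch.randn(2, 3, 64, 64)  # matches sample_size=64
timestep = torch.tensor([10, 10])        # arbitrary diffusion timesteps
noise_pred = model.diffusion_decoder(
    noisy_frame,
    timestep,
    encoder_hidden_states=context_emb,   # [B, T'+1, latent_dim] context tokens
).sample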