atanu2531 committed on
Commit
24bca9f
·
verified ·
1 Parent(s): b9fa9b2

Upload deepseek_python_20250816_b5cee4.py


Python file for model creation, created by DeepSeek AI for guidance and development

Files changed (1)

deepseek_python_20250816_b5cee4.py (new file, +55 -0)
import torch
import torch.nn as nn
from transformers import DistilBertModel
from diffusers import UNet2DConditionModel


class VideoJEPA(nn.Module):
    def __init__(self, text_dim=768, video_dim=512, latent_dim=1024):
        super().__init__()

        # Video Encoder (hierarchical 3D CNN). Temporal padding of 1 on both
        # convolutions keeps the frame count T unchanged, so the features can
        # be flattened back out per frame in forward().
        self.video_encoder = nn.Sequential(
            nn.Conv3d(3, 64, kernel_size=(3, 5, 5), stride=(1, 2, 2), padding=(1, 0, 0)),
            nn.ReLU(),
            nn.MaxPool3d((1, 2, 2)),
            nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 0, 0)),
            nn.ReLU(),
            nn.AdaptiveAvgPool3d((None, 8, 8)),
        )
        self.video_proj = nn.Linear(128 * 8 * 8, video_dim)
        self.video_to_latent = nn.Linear(video_dim, latent_dim)  # match the fusion width

        # Text Encoder (DistilBERT); the [CLS] position is projected to latent_dim
        self.text_encoder = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.text_proj = nn.Linear(text_dim, latent_dim)

        # Joint Fusion Transformer over T frame tokens plus one text token.
        # batch_first=True because the inputs built in forward() are (B, seq, dim).
        self.fusion_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=latent_dim, nhead=8, batch_first=True),
            num_layers=4,
        )

        # Diffusion Decoder (conditional UNet). Not called in forward(); it is
        # conditioned on the fused context via cross-attention in the
        # training/sampling loop.
        self.diffusion_decoder = UNet2DConditionModel(
            sample_size=64,
            in_channels=3,
            out_channels=3,
            cross_attention_dim=latent_dim,
        )

    def forward(self, video, text_input):
        # Video encoding: (B, C, T, H, W) -> (B, T, latent_dim)
        B, C, T, H, W = video.shape
        video_features = self.video_encoder(video)  # (B, 128, T, 8, 8)
        video_features = video_features.permute(0, 2, 1, 3, 4).contiguous()
        video_features = video_features.view(B * T, -1)
        video_emb = self.video_proj(video_features)
        video_emb = self.video_to_latent(video_emb).view(B, T, -1)

        # Text encoding: take the [CLS] token and project to latent_dim
        text_emb = self.text_encoder(**text_input).last_hidden_state
        text_emb = self.text_proj(text_emb[:, 0])  # [CLS] token

        # Contextual fusion: frame tokens + one text token -> (B, T + 1, latent_dim)
        fused_emb = torch.cat([video_emb, text_emb.unsqueeze(1)], dim=1)
        context_emb = self.fusion_transformer(fused_emb)

        return context_emb
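
Below is a minimal smoke test for the class above. It is not part of the original commit: the dummy tensor shapes, the DistilBertTokenizer usage, and the single denoising call are illustrative assumptions about how the module might be exercised.

import torch
from transformers import DistilBertTokenizer

model = VideoJEPA()
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Dummy batch: 2 clips of 8 RGB frames at 128x128 (shapes are assumptions)
video = torch.randn(2, 3, 8, 128, 128)
text = tokenizer(["a cat jumps", "a dog runs"], return_tensors="pt", padding=True)

context = model(video, text)
print(context.shape)  # torch.Size([2, 9, 1024]) -- 8 frame tokens + 1 text token

# One conditional denoising step with the (untrained) diffusion decoder,
# sketching how the fused context conditions the UNet via cross-attention
noisy = torch.randn(2, 3, 64, 64)
timestep = torch.tensor([10, 10])
pred = model.diffusion_decoder(noisy, timestep, encoder_hidden_states=context).sample
print(pred.shape)  # torch.Size([2, 3, 64, 64])

In a full training loop the decoder's noise prediction would be scored against scheduler-added noise (e.g. with diffusers' DDPMScheduler); that loop is outside the scope of this file.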