From 8d48a1146c81e2ad98e38256a8703483d7547d4e Mon Sep 17 00:00:00 2001
From: Rezatagi1224
Date: Sat, 20 Dec 2025 23:08:52 +0330
Subject: [PATCH] Create MiniCLIP-ViT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Goal: understand the CLIP architecture.
---
 MiniCLIP-ViT | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 169 insertions(+)
 create mode 100644 MiniCLIP-ViT

diff --git a/MiniCLIP-ViT b/MiniCLIP-ViT
new file mode 100644
index 00000000..4cb68a07
--- /dev/null
+++ b/MiniCLIP-ViT
@@ -0,0 +1,169 @@
+# In this file we build a mini CLIP, with the goal of understanding the
+# architecture of the real CLIP, which is at the core of AGI.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+#--- create patch embedding ---#
+class PatchEmbedding(nn.Module):
+
+    def __init__(self, img_size=32, patch_size=8, in_channels=3, embd_dim=128):
+        super().__init__()
+
+        # Number of non-overlapping patches per image.
+        self.num_patches = (img_size // patch_size) ** 2
+
+        # A strided convolution projects each patch to an embd_dim vector.
+        self.project = nn.Conv2d(
+            in_channels, embd_dim,
+            kernel_size=patch_size, stride=patch_size
+        )
+
+    def forward(self, x):
+        x = self.project(x)    # (B, embd_dim, H/p, W/p)
+        x = x.flatten(2)       # (B, embd_dim, num_patches)
+        x = x.transpose(1, 2)  # (B, num_patches, embd_dim)
+        return x
+
+
+#--- ViT block ---#
+class VitBlock(nn.Module):
+
+    def __init__(self, embd_dim, num_heads):
+        super().__init__()
+
+        self.att_head = nn.MultiheadAttention(embd_dim, num_heads, batch_first=True)
+
+        # Standard transformer MLP: expand to 4x the width, then project back.
+        self.mlp = nn.Sequential(
+            nn.Linear(embd_dim, embd_dim * 4),
+            nn.GELU(),
+            nn.Linear(embd_dim * 4, embd_dim)
+        )
+
+        self.norm1 = nn.LayerNorm(embd_dim)
+        self.norm2 = nn.LayerNorm(embd_dim)
+
+    def forward(self, x):
+        # Pre-norm self-attention with residual connections.
+        h = self.norm1(x)
+        att, _ = self.att_head(h, h, h)
+        x = x + att
+        x = x + self.mlp(self.norm2(x))
+        return x
+
+
+#--- create mini ViT image encoder ---#
+class VitImageEncoder(nn.Module):
+
+    def __init__(self, embd_dim=128, patch_size=8, num_heads=4, depth=4, img_size=32):
+        super().__init__()
+
+        self.patch_embedding = PatchEmbedding(img_size, patch_size, 3, embd_dim)
+
+        # Learnable [CLS] token and positional embeddings for [CLS] + patches.
+        self.cls = nn.Parameter(torch.randn(1, 1, embd_dim))
+        self.pos_embed = nn.Parameter(
+            torch.randn(1, 1 + self.patch_embedding.num_patches, embd_dim)
+        )
+
+        self.blocks = nn.Sequential(
+            *[VitBlock(embd_dim, num_heads) for _ in range(depth)]
+        )
+
+        self.norm = nn.LayerNorm(embd_dim)
+
+    def forward(self, x):
+        B = x.size(0)
+
+        x = self.patch_embedding(x)
+
+        # Prepend the [CLS] token, then add positional embeddings.
+        cls = self.cls.expand(B, -1, -1)
+        x = torch.cat((cls, x), dim=1)
+        x = x + self.pos_embed
+
+        x = self.blocks(x)
+
+        out = self.norm(x[:, 0])  # take the [CLS] token representation
+        return F.normalize(out, dim=-1)
+
+
+#--- create text encoder ---#
+class TextEncoder(nn.Module):
+
+    def __init__(self, embd_dim, vocab_size):
+        super().__init__()
+
+        self.embedding = nn.Embedding(vocab_size, embd_dim)
+
+    def forward(self, x):
+        # x: token indices of shape (batch_size, sequence_length)
+        embd = self.embedding(x)  # (batch_size, sequence_length, embd_dim)
+        # Average over the sequence to get a single text representation.
+        embd = embd.mean(dim=1)   # (batch_size, embd_dim)
+        return F.normalize(embd, dim=-1)
+
+
+#--- create mini CLIP ---#
+class MiniCLIP(nn.Module):
+
+    def __init__(self, image_encoder_instance, text_encoder_instance, embd_dim):
+        super().__init__()
+
+        self.ViTimage_encoder = image_encoder_instance
+        self.text_encoder = text_encoder_instance
+        # Learnable temperature, initialized to log(1/0.07) as in the CLIP paper.
+        self.logits_scale = nn.Parameter(torch.ones([]) * torch.log(torch.tensor(1 / 0.07)))
+
+    def forward(self, image_input, text_input):
+        image_features = self.ViTimage_encoder(image_input)
+        text_features = self.text_encoder(text_input)
+
+        # The encoders already return unit-norm features; re-normalizing here is a cheap safeguard.
+        image_features = F.normalize(image_features, dim=-1)
+        text_features = F.normalize(text_features, dim=-1)
+
+        scale = self.logits_scale.exp()
+        logits = scale * image_features @ text_features.T
+        return logits
+
+
+def lossClip(logits):
+
+    # The i-th image matches the i-th text, so the targets are the diagonal.
+    labels = torch.arange(logits.size(0)).to(logits.device)
+
+    i_loss = F.cross_entropy(logits, labels)    # image -> text
+    t_loss = F.cross_entropy(logits.T, labels)  # text -> image
+
+    return (i_loss + t_loss) / 2
+
+
+#------ Create model ------#
+embd_dim = 128
+vocab_size = 512
+
+# embd_dim (128) must be divisible by num_heads, so use 8 heads of dim 16 each.
+image_encoder_instance = VitImageEncoder(embd_dim=embd_dim, num_heads=8)
+text_encoder_instance = TextEncoder(embd_dim, vocab_size)
+
+model = MiniCLIP(image_encoder_instance, text_encoder_instance, embd_dim)
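
A minimal smoke test of the new module, not part of the patch itself; the batch size of 4 and sequence length of 16 are made-up values for illustration, and the image shape matches the img_size=32 default above:

    # Hypothetical check: random images and token IDs through MiniCLIP plus the loss.
    images = torch.randn(4, 3, 32, 32)              # (B, C, H, W), img_size=32
    tokens = torch.randint(0, vocab_size, (4, 16))  # (B, seq_len) token IDs below vocab_size
    logits = model(images, tokens)                  # (4, 4) image-text similarity matrix
    loss = lossClip(logits)
    print(logits.shape, loss.item())

The diagonal of logits corresponds to the matched image-text pairs, which is exactly what the arange labels in lossClip assume.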