# moPPIt / classifier_code / binding_affinity_unpooled_2.py
# Hosting-page header kept for provenance: "AlienChen's picture",
# "Upload 72 files", "3527383 verified".
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from scipy.stats import spearmanr
from collections import defaultdict
import pandas as pd
import logging
import os
import torch.optim as optim
from datetime import datetime
from transformers import AutoModel, AutoConfig, AutoTokenizer
import os
# point HF_ENDPOINT at your mirror
# os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
class UnpooledBindingPredictor(nn.Module):
    """Predict protein-binder binding affinity from unpooled ESM embeddings.

    Pipeline (see ``forward``):
      1. Both sequences are embedded with the same pretrained ESM encoder.
      2. Each embedding runs through its own bank of 1-D convolutions (one per
         kernel size) and is reduced to a fixed-size vector by masked global
         max + average pooling.
      3. The two vectors are projected to ``hidden_dim``, layer-normalised and
         refined by ``n_layers`` bidirectional cross-attention blocks.
      4. A shared MLP feeds a regression head (one affinity value) and a
         3-way classification head (tight / medium / weak).
    """

    def __init__(self,
                 esm_model_name="facebook/esm2_t33_650M_UR50D",
                 hidden_dim=512,
                 kernel_sizes=(3, 5, 7),
                 n_heads=8,
                 n_layers=3,
                 dropout=0.1,
                 freeze_esm=True):
        """Build the encoder, CNN banks, cross-attention stack and heads.

        Args:
            esm_model_name: Hugging Face identifier of the pretrained encoder.
            hidden_dim: Width of the projected features and attention blocks.
            kernel_sizes: Convolution kernel sizes; one ``Conv1d`` per size is
                created for the protein and for the binder.
            n_heads: Number of heads in every ``nn.MultiheadAttention``.
            n_layers: Number of cross-attention blocks.
            dropout: Dropout probability (attention, feed-forward, shared head).
            freeze_esm: If true, ESM parameters get ``requires_grad = False``.

        Raises:
            ValueError: If ``kernel_sizes`` is empty.
        """
        super().__init__()

        # An immutable default avoids the shared-mutable-default pitfall; any
        # sequence passed by the caller is still accepted.
        kernel_sizes = tuple(kernel_sizes)
        if not kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one kernel size")

        # Define binding thresholds
        self.tight_threshold = 7.5  # Kd/Ki/IC50 <= ~30nM
        self.weak_threshold = 6.0   # Kd/Ki/IC50 > 1uM

        # Load ESM model for computing embeddings on the fly. The loaded model
        # already carries its configuration, so no second hub lookup is needed.
        self.esm_model = AutoModel.from_pretrained(esm_model_name)
        self.config = self.esm_model.config

        # Freeze ESM parameters if needed
        if freeze_esm:
            for param in self.esm_model.parameters():
                param.requires_grad = False

        # Get ESM hidden size
        esm_dim = self.config.hidden_size

        # Output channels for CNN layers
        output_channels_per_kernel = 64

        # CNN layers for handling variable length sequences; padding='same'
        # keeps the sequence length unchanged for every kernel size.
        self.protein_conv_layers = nn.ModuleList([
            nn.Conv1d(
                in_channels=esm_dim,
                out_channels=output_channels_per_kernel,
                kernel_size=k,
                padding='same'
            ) for k in kernel_sizes
        ])
        self.binder_conv_layers = nn.ModuleList([
            nn.Conv1d(
                in_channels=esm_dim,
                out_channels=output_channels_per_kernel,
                kernel_size=k,
                padding='same'
            ) for k in kernel_sizes
        ])

        # Features per sequence after pooling: every kernel contributes its
        # channels twice (once max-pooled, once average-pooled).
        total_features_per_seq = output_channels_per_kernel * len(kernel_sizes) * 2

        # Project to same dimension after CNN processing
        self.protein_projection = nn.Linear(total_features_per_seq, hidden_dim)
        self.binder_projection = nn.Linear(total_features_per_seq, hidden_dim)
        self.protein_norm = nn.LayerNorm(hidden_dim)
        self.binder_norm = nn.LayerNorm(hidden_dim)

        # Cross attention blocks with layer norm
        self.cross_attention_layers = nn.ModuleList([
            nn.ModuleDict({
                'attention': nn.MultiheadAttention(hidden_dim, n_heads, dropout=dropout),
                'norm1': nn.LayerNorm(hidden_dim),
                'ffn': nn.Sequential(
                    nn.Linear(hidden_dim, hidden_dim * 4),
                    nn.ReLU(),
                    nn.Dropout(dropout),
                    nn.Linear(hidden_dim * 4, hidden_dim)
                ),
                'norm2': nn.LayerNorm(hidden_dim)
            }) for _ in range(n_layers)
        ])

        # Prediction heads
        self.shared_head = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

        # Regression head
        self.regression_head = nn.Linear(hidden_dim, 1)

        # Classification head (3 classes: tight, medium, loose binding)
        self.classification_head = nn.Linear(hidden_dim, 3)

    def get_binding_class(self, affinity):
        """Convert affinity values to class indices.

        0: tight binding  (>= ``self.tight_threshold``, 7.5 by default)
        1: medium binding (between the two thresholds)
        2: weak binding   (< ``self.weak_threshold``, 6.0 by default)

        Args:
            affinity: A ``torch.Tensor`` of affinities or a single number.

        Returns:
            A ``torch.long`` tensor with the shape of ``affinity`` when a
            tensor is given, otherwise a plain ``int``.
        """
        if isinstance(affinity, torch.Tensor):
            # Start from "medium" and overwrite the two outer classes. The
            # weak class is written last, as in the original ordering.
            classes = torch.ones_like(affinity, dtype=torch.long)
            classes[affinity >= self.tight_threshold] = 0
            classes[affinity < self.weak_threshold] = 2
            return classes

        if affinity >= self.tight_threshold:
            return 0  # tight binding
        if affinity < self.weak_threshold:
            return 2  # weak binding
        return 1  # medium binding

    def compute_embeddings(self, input_ids, attention_mask=None):
        """Compute ESM embeddings on the fly.

        Returns the encoder's ``last_hidden_state``, i.e. the unpooled
        per-token states (batch_size x seq_length x hidden_size).
        """
        esm_outputs = self.esm_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        return esm_outputs.last_hidden_state

    def process_sequence(self, unpooled_emb, conv_layers, attention_mask=None):
        """Run a sequence through a CNN bank and pool it to one vector.

        Args:
            unpooled_emb: Per-token embeddings, [batch_size, seq_length, hidden_size].
            conv_layers: Iterable of modules applied to the transposed
                embeddings; their outputs are concatenated along channels.
            attention_mask: Optional [batch_size, seq_length] mask where zero
                marks padding. Padded positions are excluded from both
                poolings.

        Returns:
            Tensor [batch_size, 2 * total_channels]: max-pooled features
            followed by average-pooled features. A row whose mask has no
            valid position pools to zeros instead of ``-inf``/NaN.
        """
        # Transpose for CNN: [batch_size, hidden_size, seq_length]
        x = unpooled_emb.transpose(1, 2)

        # Apply every convolution (with ReLU) and concatenate along channels.
        conv_output = torch.cat([F.relu(conv(x)) for conv in conv_layers], dim=1)

        if attention_mask is None:
            # No mask: plain global pooling over the sequence dimension.
            max_pooled = conv_output.max(dim=2)[0]
            avg_pooled = conv_output.mean(dim=2)
        else:
            # [batch_size, 1, seq_length]; broadcasts over the channel axis.
            valid = attention_mask.unsqueeze(1) != 0

            # Max pooling: padding is pushed to -inf so it can never win.
            # masked_fill is out-of-place, so no defensive clone is needed.
            max_pooled = conv_output.masked_fill(~valid, float('-inf')).max(dim=2)[0]
            # A fully padded row would otherwise yield -inf and turn into NaN
            # in the following Linear/LayerNorm; fall back to zeros there.
            has_valid = valid.any(dim=2)
            max_pooled = torch.where(has_valid, max_pooled, torch.zeros_like(max_pooled))

            # Average pooling: sum over valid positions divided by their count.
            weights = valid.to(conv_output.dtype)
            valid_positions = weights.sum(dim=2).clamp(min=1.0)  # Avoid division by zero
            avg_pooled = (conv_output * weights).sum(dim=2) / valid_positions

        # Concatenate the pooled features
        return torch.cat([max_pooled, avg_pooled], dim=1)

    @staticmethod
    def _cross_attention_block(layer, query, context):
        """Apply one attention + feed-forward block with residuals and norms."""
        attended = layer['attention'](query, context, context)[0]
        query = layer['norm1'](query + attended)
        return layer['norm2'](query + layer['ffn'](query))

    def forward(self, protein_input_ids, binder_input_ids, protein_mask=None, binder_mask=None):
        """Predict affinity and binding class for protein/binder pairs.

        Args:
            protein_input_ids: Token ids of the protein sequences.
            binder_input_ids: Token ids of the binder sequences.
            protein_mask: Optional attention mask for the protein tokens.
            binder_mask: Optional attention mask for the binder tokens.

        Returns:
            Tuple ``(regression_output, classification_logits)`` with shapes
            [batch_size, 1] and [batch_size, 3].
        """
        # Compute embeddings on the fly using the ESM model
        protein_unpooled = self.compute_embeddings(protein_input_ids, protein_mask)
        binder_unpooled = self.compute_embeddings(binder_input_ids, binder_mask)

        # Process protein and binder sequences through CNN layers
        protein_features = self.process_sequence(protein_unpooled, self.protein_conv_layers, protein_mask)
        binder_features = self.process_sequence(binder_unpooled, self.binder_conv_layers, binder_mask)

        # Project to same dimension
        protein = self.protein_norm(self.protein_projection(protein_features))
        binder = self.binder_norm(self.binder_projection(binder_features))

        # Reshape for attention: [batch_size, hidden_dim] -> [1, batch_size, hidden_dim].
        # nn.MultiheadAttention is sequence-first by default, so each pooled
        # vector is treated as a length-1 sequence.
        protein = protein.unsqueeze(0)
        binder = binder.unsqueeze(0)

        # Cross attention layers. Each block's modules are shared by both
        # directions, and the binder attends to the already-updated protein.
        for layer in self.cross_attention_layers:
            protein = self._cross_attention_block(layer, protein, binder)
            binder = self._cross_attention_block(layer, binder, protein)

        # Remove sequence dimension
        protein_pool = protein.squeeze(0)
        binder_pool = binder.squeeze(0)

        # Concatenate both representations and run the shared trunk
        combined = torch.cat([protein_pool, binder_pool], dim=-1)
        shared_features = self.shared_head(combined)

        regression_output = self.regression_head(shared_features)
        classification_logits = self.classification_head(shared_features)
        return regression_output, classification_logits
def load_model(checkpoint_path, device, **model_kwargs):
    """Load a trained ``UnpooledBindingPredictor`` from a checkpoint.

    Args:
        checkpoint_path: Path to a file saved with ``torch.save`` that holds
            the weights under the ``'model_state_dict'`` key.
        device: Device the checkpoint is mapped to and the model is moved to.
        **model_kwargs: Optional overrides for the constructor arguments
            listed below. With no overrides, the model is built with exactly
            the hyperparameters this function always used.

    Returns:
        The model with weights loaded, in evaluation mode.
    """
    # NOTE(review): torch.load deserialises with pickle unless weights-only
    # loading is in effect for the installed PyTorch version, so only load
    # checkpoints from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location=device)

    # The architecture must match the one used during training; these are
    # the defaults, which callers may override per checkpoint.
    hyperparameters = {
        'esm_model_name': "facebook/esm2_t33_650M_UR50D",
        'hidden_dim': 384,
        'kernel_sizes': [3, 5, 7],
        'n_heads': 8,
        'n_layers': 4,
        'dropout': 0.14561457009902096,
        'freeze_esm': True,
    }
    hyperparameters.update(model_kwargs)

    model = UnpooledBindingPredictor(**hyperparameters).to(device)

    # Load the trained weights
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()  # Set to evaluation mode
    return model
def prepare_inputs(protein_sequence, binder_sequence, tokenizer, max_length=1024, device='cuda'):
    """Tokenize a protein and a binder sequence and move the tensors to ``device``.

    Both sequences are encoded with identical settings: PyTorch tensors,
    padded to ``max_length`` and truncated beyond it.

    Returns:
        Dict with the keys ``protein_input_ids``, ``protein_attention_mask``,
        ``binder_input_ids`` and ``binder_attention_mask``.
    """
    def encode(sequence):
        # One place for the tokenizer settings shared by both sequences.
        return tokenizer(
            sequence,
            return_tensors="pt",
            padding="max_length",
            max_length=max_length,
            truncation=True
        )

    protein_encoding = encode(protein_sequence)
    binder_encoding = encode(binder_sequence)

    return {
        'protein_input_ids': protein_encoding['input_ids'].to(device),
        'protein_attention_mask': protein_encoding['attention_mask'].to(device),
        'binder_input_ids': binder_encoding['input_ids'].to(device),
        'binder_attention_mask': binder_encoding['attention_mask'].to(device)
    }
# Perform prediction
# Tokenizers already loaded by _get_tokenizer, keyed by model name, so that
# repeated predictions do not reload the tokenizer from disk or the hub.
_TOKENIZER_CACHE = {}


def _get_tokenizer(model_name):
    """Return the tokenizer for ``model_name``, loading it at most once."""
    tokenizer = _TOKENIZER_CACHE.get(model_name)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        _TOKENIZER_CACHE[model_name] = tokenizer
    return tokenizer


def predict_binding(model, protein_sequence, binder_sequence, device='cuda', tokenizer=None):
    """Predict binding affinity between one protein and one binder sequence.

    Args:
        model: Model whose call returns ``(regression_output,
            classification_logits)`` and which exposes ``tight_threshold``
            and ``weak_threshold``.
        protein_sequence: Protein sequence string.
        binder_sequence: Binder sequence string.
        device: Device the tokenized inputs are moved to.
        tokenizer: Optional tokenizer to use. When omitted, the
            "facebook/esm2_t33_650M_UR50D" tokenizer is loaded once and
            reused by later calls.

    Returns:
        Dict with ``predicted_affinity`` (float), ``binding_class`` (str),
        ``class_probabilities`` (class name -> probability) and the model's
        ``tight_threshold`` / ``weak_threshold``.
    """
    if tokenizer is None:
        tokenizer = _get_tokenizer("facebook/esm2_t33_650M_UR50D")
    inputs = prepare_inputs(protein_sequence, binder_sequence, tokenizer, device=device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        regression_output, classification_logits = model(
            inputs['protein_input_ids'],
            inputs['binder_input_ids'],
            inputs['protein_attention_mask'],
            inputs['binder_attention_mask']
        )

    # Get numerical prediction (pKd/pKi); .item() requires a single pair.
    predicted_affinity = regression_output.item()

    # Get classification prediction (tight, medium, weak)
    class_names = ['Tight binding', 'Medium binding', 'Weak binding']
    predicted_class_idx = torch.argmax(classification_logits, dim=1).item()
    predicted_class = class_names[predicted_class_idx]

    # Get class probabilities for the single pair in the batch
    class_probs = F.softmax(classification_logits, dim=1).cpu().numpy()[0]

    return {
        'predicted_affinity': predicted_affinity,
        'binding_class': predicted_class,
        'class_probabilities': dict(zip(class_names, class_probs)),
        'tight_threshold': model.tight_threshold,
        'weak_threshold': model.weak_threshold
    }
# Example usage
if __name__ == "__main__":
    # Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Restore the trained predictor from its checkpoint.
    model = load_model(
        '/scratch/pranamlab/tong/checkpoints/MOG-DFM/classifier_ckpt/binding_affinity_unpooled.pt',
        device,
    )

    # Example sequences (replace with actual sequences): two candidate
    # binders scored against one target protein.
    binders = ['GLSKGCFGLKLDRIGSMSGLGC', 'RGLSDGFLKLKMGISGSLGC']
    protein_sequence = "RNLTLAVVLPEHNLSYAWAWPRVGPAVALAVEALGRALPVDLRFVSSELEGACSEYLAPLSAVDLKLYHDPDLLLGPGCVYPAASVARFASHWRLPLLTAGAVASGFSAKNDHYRTLVRTGPSAPKLGEFVVTLHGHFNWTARAALLYLDARTDDRPHYFTIEGVFEALQGSNLSVQHQVYAREPGGPEQATHFIRANGRIVYICGPLEMLHEILLQAQRENLTNGDYVFFYLDVFGESLRAGPTRATGRPWQDNRTREQAQALREAFQTVLVITYREPPNPEYQEFQNRLLIRAREDFGVELGPSLMNLIAGCFYDGILLYAEVLNETIQEGGTREDGLRIVEKMQGRRYHGVTGLVVMDKNNDRETDFVLWAMGDLDSGDFQPAAHYSGAEKQIWWTGRPIPWVKGAPPSDNPPCAFDLDDPSCDKTPLSTLAI"

    # Disabled alternative: read the binders from a sample file instead.
    # name = "CLIC1_10_moppit"
    # print(name)
    # with open(f'/home/tc415/flow_matching/samples/unconditional_samples/12.txt', 'r') as f:
    #     binders = f.readlines()
    #     binders = [binder.strip() for binder in binders]
    #     binders = binders[:100]

    # Score every binder, printing each affinity and collecting all of them.
    affinities = []
    for binder in binders:
        result = predict_binding(model, protein_sequence, binder, device)
        affinity = result['predicted_affinity']
        print(affinity)
        affinities.append(affinity)
# with open('/home/tc415/flow_matching/scores/affinity/EWSFLI1_12_unconditional.txt', 'w') as f:
# for score in affinities:
# f.write(str(score) + '\n')
# print(sum(affinities) / len(affinities))
# with open(f'/home/tc415/flow_matching/scores/affinity/{name}.txt', 'w') as f:
# for score in affinities:
# f.write(str(round(score, 4)) + '\n')
# Display results
# print(f"Predicted binding affinity (pKd/pKi): {result['predicted_affinity']:.2f}")
# print(f"Binding class: {result['binding_class']}")
# print("Class probabilities:")
# for class_name, prob in result['class_probabilities'].items():
# print(f" {class_name}: {prob:.2f}")