import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoConfig, AutoTokenizer

# Point HF_ENDPOINT at your mirror if huggingface.co is unreachable:
# os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

class UnpooledBindingPredictor(nn.Module):
    def __init__(self,
                 esm_model_name="facebook/esm2_t33_650M_UR50D",
                 hidden_dim=512,
                 kernel_sizes=(3, 5, 7),
                 n_heads=8,
                 n_layers=3,
                 dropout=0.1,
                 freeze_esm=True):
        super().__init__()
        # Binding thresholds on the pKd/pKi scale
        self.tight_threshold = 7.5  # Kd/Ki/IC50 ≤ ~30 nM
        self.weak_threshold = 6.0   # Kd/Ki/IC50 > 1 μM
        # Load the ESM model for computing embeddings on the fly
        self.esm_model = AutoModel.from_pretrained(esm_model_name)
        self.config = AutoConfig.from_pretrained(esm_model_name)
        # Freeze ESM parameters if requested
        if freeze_esm:
            for param in self.esm_model.parameters():
                param.requires_grad = False
        # ESM hidden size (1280 for esm2_t33_650M_UR50D)
        esm_dim = self.config.hidden_size
        # Output channels per CNN kernel size
        output_channels_per_kernel = 64
        # Multi-kernel CNNs over the token embeddings; 'same' padding keeps
        # the sequence length unchanged so the attention masks stay aligned
        self.protein_conv_layers = nn.ModuleList([
            nn.Conv1d(
                in_channels=esm_dim,
                out_channels=output_channels_per_kernel,
                kernel_size=k,
                padding='same'
            ) for k in kernel_sizes
        ])
        self.binder_conv_layers = nn.ModuleList([
            nn.Conv1d(
                in_channels=esm_dim,
                out_channels=output_channels_per_kernel,
                kernel_size=k,
                padding='same'
            ) for k in kernel_sizes
        ])
        # Total features per sequence after convolution and (max + avg) pooling
        total_features_per_seq = output_channels_per_kernel * len(kernel_sizes) * 2
        # Project both sequences to the same dimension after CNN processing
        self.protein_projection = nn.Linear(total_features_per_seq, hidden_dim)
        self.binder_projection = nn.Linear(total_features_per_seq, hidden_dim)
        self.protein_norm = nn.LayerNorm(hidden_dim)
        self.binder_norm = nn.LayerNorm(hidden_dim)
        # Cross-attention blocks with layer norm; each layer's modules are
        # shared by both attention directions (protein->binder and back)
        self.cross_attention_layers = nn.ModuleList([
            nn.ModuleDict({
                'attention': nn.MultiheadAttention(hidden_dim, n_heads, dropout=dropout),
                'norm1': nn.LayerNorm(hidden_dim),
                'ffn': nn.Sequential(
                    nn.Linear(hidden_dim, hidden_dim * 4),
                    nn.ReLU(),
                    nn.Dropout(dropout),
                    nn.Linear(hidden_dim * 4, hidden_dim)
                ),
                'norm2': nn.LayerNorm(hidden_dim)
            }) for _ in range(n_layers)
        ])
        # Prediction heads
        self.shared_head = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
        )
        # Regression head (predicted pKd/pKi)
        self.regression_head = nn.Linear(hidden_dim, 1)
        # Classification head (3 classes: tight, medium, weak binding)
        self.classification_head = nn.Linear(hidden_dim, 3)

    def get_binding_class(self, affinity):
        """Convert affinity values to class indices.

        0: tight binding (pKd >= 7.5)
        1: medium binding (6.0 <= pKd < 7.5)
        2: weak binding (pKd < 6.0)
        """
        if isinstance(affinity, torch.Tensor):
            tight_mask = affinity >= self.tight_threshold
            weak_mask = affinity < self.weak_threshold
            medium_mask = ~(tight_mask | weak_mask)
            classes = torch.zeros_like(affinity, dtype=torch.long)
            classes[medium_mask] = 1
            classes[weak_mask] = 2
            return classes
        else:
            if affinity >= self.tight_threshold:
                return 0  # tight binding
            elif affinity < self.weak_threshold:
                return 2  # weak binding
            else:
                return 1  # medium binding
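
    # Examples: pKd 8.2 -> class 0 (tight), pKd 6.8 -> class 1 (medium),
    # pKd 5.1 -> class 2 (weak).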

    def compute_embeddings(self, input_ids, attention_mask=None):
        """Compute ESM embeddings on the fly."""
        esm_outputs = self.esm_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        # Unpooled last hidden states: [batch_size, seq_length, hidden_size]
        return esm_outputs.last_hidden_state
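
    # For esm2_t33_650M_UR50D the hidden size is 1280, so this returns a
    # [batch_size, seq_length, 1280] tensor; with freeze_esm=True the ESM
    # weights receive no gradient updates.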

    def process_sequence(self, unpooled_emb, conv_layers, attention_mask=None):
        """Process a sequence through CNN layers and masked global pooling."""
        # Transpose for CNN: [batch_size, hidden_size, seq_length]
        x = unpooled_emb.transpose(1, 2)
        # Apply each CNN kernel size and collect the outputs
        conv_outputs = [F.relu(conv(x)) for conv in conv_layers]
        # Concatenate along the channel dimension
        conv_output = torch.cat(conv_outputs, dim=1)
        # Global pooling (both max and average); if an attention mask is
        # provided, use it so padding positions do not contribute
        if attention_mask is not None:
            # Expand the mask (1 = valid, 0 = padding) to the conv channels and
            # cast to float so the sums and clamp below are well defined
            expanded_mask = attention_mask.unsqueeze(1).expand(-1, conv_output.size(1), -1).float()
            # Max pooling: padding positions get -inf so they are never selected
            masked_output = conv_output.masked_fill(expanded_mask == 0, float('-inf'))
            max_pooled = torch.max(masked_output, dim=2)[0]
            # Average pooling: sum over valid positions / number of valid positions
            sum_pooled = torch.sum(conv_output * expanded_mask, dim=2)
            valid_positions = torch.clamp(torch.sum(expanded_mask, dim=2), min=1.0)  # avoid division by zero
            avg_pooled = sum_pooled / valid_positions
        else:
            # No mask: standard pooling over the full sequence
            max_pooled = torch.max(conv_output, dim=2)[0]
            avg_pooled = torch.mean(conv_output, dim=2)
        # Concatenate the pooled features
        return torch.cat([max_pooled, avg_pooled], dim=1)
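
    # With the defaults (64 channels per kernel, three kernel sizes, max +
    # average pooling) this yields 64 * 3 * 2 = 384 features per sequence,
    # matching total_features_per_seq in __init__.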

    def forward(self, protein_input_ids, binder_input_ids, protein_mask=None, binder_mask=None):
        # Compute embeddings on the fly using the ESM model
        protein_unpooled = self.compute_embeddings(protein_input_ids, protein_mask)
        binder_unpooled = self.compute_embeddings(binder_input_ids, binder_mask)
        # Process protein and binder sequences through the CNN layers
        protein_features = self.process_sequence(protein_unpooled, self.protein_conv_layers, protein_mask)
        binder_features = self.process_sequence(binder_unpooled, self.binder_conv_layers, binder_mask)
        # Project to the same dimension
        protein = self.protein_norm(self.protein_projection(protein_features))
        binder = self.binder_norm(self.binder_projection(binder_features))
        # Reshape for attention: [batch_size, hidden_dim] -> [1, batch_size, hidden_dim]
        # (nn.MultiheadAttention expects sequence-first input by default)
        protein = protein.unsqueeze(0)
        binder = binder.unsqueeze(0)
        # Cross-attention layers
        for layer in self.cross_attention_layers:
            # Protein attending to binder
            attended_protein = layer['attention'](
                protein, binder, binder
            )[0]
            protein = layer['norm1'](protein + attended_protein)
            protein = layer['norm2'](protein + layer['ffn'](protein))
            # Binder attending to protein
            attended_binder = layer['attention'](
                binder, protein, protein
            )[0]
            binder = layer['norm1'](binder + attended_binder)
            binder = layer['norm2'](binder + layer['ffn'](binder))
        # Remove the sequence dimension
        protein_pool = protein.squeeze(0)
        binder_pool = binder.squeeze(0)
        # Concatenate both representations
        combined = torch.cat([protein_pool, binder_pool], dim=-1)
        # Shared features feed both prediction heads
        shared_features = self.shared_head(combined)
        regression_output = self.regression_head(shared_features)
        classification_logits = self.classification_head(shared_features)
        return regression_output, classification_logits
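
# For a batch of size B, UnpooledBindingPredictor.forward returns a
# regression output of shape [B, 1] (the predicted pKd/pKi) and
# classification logits of shape [B, 3] (tight / medium / weak).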

def load_model(checkpoint_path, device):
    """Load a trained model from a checkpoint."""
    checkpoint = torch.load(checkpoint_path, map_location=device)
    # Initialize the model with the same hyperparameters used during training
    # (UnpooledBindingPredictor is defined above in this file)
    model = UnpooledBindingPredictor(
        esm_model_name="facebook/esm2_t33_650M_UR50D",
        hidden_dim=384,
        kernel_sizes=(3, 5, 7),
        n_heads=8,
        n_layers=4,
        dropout=0.14561457009902096,
        freeze_esm=True
    ).to(device)
    # Load the trained weights
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()  # Set to evaluation mode
    return model
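
# The constructor arguments above must match the run that produced the
# checkpoint; otherwise load_state_dict raises shape-mismatch errors.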

def prepare_inputs(protein_sequence, binder_sequence, tokenizer, max_length=1024, device='cuda'):
    """Tokenize the protein and binder sequences."""
    protein_tokens = tokenizer(
        protein_sequence,
        return_tensors="pt",
        padding="max_length",
        max_length=max_length,
        truncation=True
    )
    binder_tokens = tokenizer(
        binder_sequence,
        return_tensors="pt",
        padding="max_length",
        max_length=max_length,
        truncation=True
    )
    return {
        'protein_input_ids': protein_tokens['input_ids'].to(device),
        'protein_attention_mask': protein_tokens['attention_mask'].to(device),
        'binder_input_ids': binder_tokens['input_ids'].to(device),
        'binder_attention_mask': binder_tokens['attention_mask'].to(device)
    }
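
# padding="max_length" pads every input (even a 20-residue binder) to 1024
# tokens; the attention masks returned here are what process_sequence uses to
# keep the pooled features free of padding contributions.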

def predict_binding(model, protein_sequence, binder_sequence, device='cuda'):
    """Predict the binding affinity between a protein and a binder sequence."""
    # Loaded on every call for simplicity; cache the tokenizer when predicting in a loop
    tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
    inputs = prepare_inputs(protein_sequence, binder_sequence, tokenizer, device=device)
    with torch.no_grad():
        regression_output, classification_logits = model(
            inputs['protein_input_ids'],
            inputs['binder_input_ids'],
            inputs['protein_attention_mask'],
            inputs['binder_attention_mask']
        )
    # Numerical prediction (pKd/pKi)
    predicted_affinity = regression_output.item()
    # Classification prediction (tight, medium, weak)
    predicted_class_idx = torch.argmax(classification_logits, dim=1).item()
    class_names = ['Tight binding', 'Medium binding', 'Weak binding']
    predicted_class = class_names[predicted_class_idx]
    # Class probabilities
    class_probs = F.softmax(classification_logits, dim=1).cpu().numpy()[0]
    return {
        'predicted_affinity': predicted_affinity,
        'binding_class': predicted_class,
        'class_probabilities': {name: prob for name, prob in zip(class_names, class_probs)},
        'tight_threshold': model.tight_threshold,  # 7.5 (≤ ~30 nM)
        'weak_threshold': model.weak_threshold     # 6.0 (> 1 μM)
    }
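
# Optional convenience helper (illustrative; not used elsewhere in this
# script): convert a predicted pKd/pKi back to a dissociation constant in nM.
# pKd = -log10(Kd in M), so Kd[nM] = 10 ** (9 - pKd); e.g. pKd 7.5 -> ~31.6 nM
# and pKd 6.0 -> 1000 nM, matching the tight/weak thresholds above.
def pkd_to_kd_nm(pkd):
    return 10 ** (9.0 - pkd)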

# Example usage
if __name__ == "__main__":
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the model
    model = load_model('/scratch/pranamlab/tong/checkpoints/MOG-DFM/classifier_ckpt/binding_affinity_unpooled.pt', device)

    # Example binder sequences and target protein (replace with actual sequences)
    binders = ['GLSKGCFGLKLDRIGSMSGLGC', 'RGLSDGFLKLKMGISGSLGC']
    protein_sequence = "RNLTLAVVLPEHNLSYAWAWPRVGPAVALAVEALGRALPVDLRFVSSELEGACSEYLAPLSAVDLKLYHDPDLLLGPGCVYPAASVARFASHWRLPLLTAGAVASGFSAKNDHYRTLVRTGPSAPKLGEFVVTLHGHFNWTARAALLYLDARTDDRPHYFTIEGVFEALQGSNLSVQHQVYAREPGGPEQATHFIRANGRIVYICGPLEMLHEILLQAQRENLTNGDYVFFYLDVFGESLRAGPTRATGRPWQDNRTREQAQALREAFQTVLVITYREPPNPEYQEFQNRLLIRAREDFGVELGPSLMNLIAGCFYDGILLYAEVLNETIQEGGTREDGLRIVEKMQGRRYHGVTGLVVMDKNNDRETDFVLWAMGDLDSGDFQPAAHYSGAEKQIWWTGRPIPWVKGAPPSDNPPCAFDLDDPSCDKTPLSTLAI"

    # name = "CLIC1_10_moppit"
    # print(name)
    # with open(f'/home/tc415/flow_matching/samples/unconditional_samples/12.txt', 'r') as f:
    #     binders = f.readlines()
    # binders = [binder.strip() for binder in binders]
    # binders = binders[:100]

    # Make predictions
    affinities = []
    for binder in binders:
        result = predict_binding(model, protein_sequence, binder, device)
        print(result['predicted_affinity'])
        affinities.append(result['predicted_affinity'])

    # with open('/home/tc415/flow_matching/scores/affinity/EWSFLI1_12_unconditional.txt', 'w') as f:
    #     for score in affinities:
    #         f.write(str(score) + '\n')
    # print(sum(affinities) / len(affinities))

    # with open(f'/home/tc415/flow_matching/scores/affinity/{name}.txt', 'w') as f:
    #     for score in affinities:
    #         f.write(str(round(score, 4)) + '\n')

    # Display results
    # print(f"Predicted binding affinity (pKd/pKi): {result['predicted_affinity']:.2f}")
    # print(f"Binding class: {result['binding_class']}")
    # print("Class probabilities:")
    # for class_name, prob in result['class_probabilities'].items():
    #     print(f"    {class_name}: {prob:.2f}")