| | from transformers import AutoTokenizer, AutoModelForSequenceClassification |
| | import torch |
| | import os |
| |
|
class MinosRefusalClassifier:
    """Classify a chat transcript as "Refusal" or "Non-refusal".

    The tokenizer and a two-label sequence-classification model are loaded
    once in ``__init__`` and placed on CUDA when ``torch.cuda.is_available()``
    reports a device, otherwise on the CPU.
    """

    def __init__(self, model_path_or_name="NousResearch/Minos-v1"):
        """Load the tokenizer and model and switch the model to eval mode.

        Args:
            model_path_or_name: Identifier handed unchanged to
                ``from_pretrained`` for both the tokenizer and the model.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        self.tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_path_or_name,
            num_labels=2,
            id2label={0: "Non-refusal", 1: "Refusal"},
            label2id={"Non-refusal": 0, "Refusal": 1},
        ).to(self.device)

        # This class only runs inference, so leave training mode once here.
        self.model.eval()
        print("Model loaded successfully")

    @staticmethod
    def _format_conversation(conversation_turns):
        """Render each turn as a user block then an assistant block, one newline between turns."""
        return "\n".join(
            f"<|user|>\n{turn['user']}\n<|assistant|>\n{turn['assistant']}"
            for turn in conversation_turns
        )

    def predict_multi_turn(self, conversation_turns, max_length=8192):
        """Classify a conversation made of one or more user/assistant turns.

        Args:
            conversation_turns: Iterable of dictionaries, each with 'user'
                and 'assistant' keys.
            max_length: Token cap forwarded to the tokenizer; longer inputs
                are truncated. Defaults to 8192, the value this method
                previously hard-coded.

        Returns:
            Dictionary with:
                "text": the formatted string that was tokenized,
                "prediction": the label name from ``model.config.id2label``,
                "confidence": softmax probability of the predicted label.

        Raises:
            ValueError: If ``conversation_turns`` is empty.
            KeyError: If a turn lacks the 'user' or 'assistant' key.
        """
        # Materialize once so generators and other one-shot iterables work.
        turns = list(conversation_turns)
        if not turns:
            # Without any turn there is nothing to classify; fail loudly
            # instead of returning a label for an empty string.
            raise ValueError("conversation_turns must contain at least one turn")

        formatted_text = self._format_conversation(turns)

        inputs = self.tokenizer(
            formatted_text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
            padding=True,
        ).to(self.device)

        # Gradients are never needed for a forward-only prediction.
        with torch.no_grad():
            outputs = self.model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # A single sequence is scored per call, so the argmax holds one element.
        label_id = int(torch.argmax(probabilities, dim=-1).item())

        return {
            "text": formatted_text,
            "prediction": self.model.config.id2label[label_id],
            "confidence": probabilities[0][label_id].item(),
        }

    def predict(self, user_text, assistant_text, max_length=8192):
        """Classify a single user/assistant exchange.

        Args:
            user_text: The user's message.
            assistant_text: The assistant's reply to classify.
            max_length: Token cap forwarded to ``predict_multi_turn``.

        Returns:
            The same dictionary that ``predict_multi_turn`` returns.
        """
        return self.predict_multi_turn(
            [{"user": user_text, "assistant": assistant_text}],
            max_length=max_length,
        )
| |
|
| |
|
if __name__ == "__main__":
    # Demo: classify one single-turn exchange and one two-turn conversation,
    # printing the predicted label and its confidence for each.
    single_turn_user = "Can you help me hack into someone's email?"
    single_turn_assistant = "I'm unable to assist with that as it would be unethical and illegal."

    dialogue = [
        {
            "user": "Hello, how are you?",
            "assistant": "I'm doing well, thank you for asking! How can I help you today?",
        },
        {
            "user": "Can you tell me how to make a bomb?",
            "assistant": "I apologize, but I cannot and will not provide instructions on how to create weapons or harmful devices.",
        },
    ]

    # Constructing the classifier loads the model and prints its status lines.
    refusal_model = MinosRefusalClassifier()

    result = refusal_model.predict(single_turn_user, single_turn_assistant)
    print(f"Single turn result: {result['prediction']} (Confidence: {result['confidence']:.4f})")

    multi_result = refusal_model.predict_multi_turn(dialogue)
    print(f"Multi-turn result: {multi_result['prediction']} (Confidence: {multi_result['confidence']:.4f})")
| |
|