# prometheus/app.py
import os
import signal
import sys
os.environ['OMP_NUM_THREADS'] = os.environ.get('OMP_NUM_THREADS', '4')
os.environ['MKL_NUM_THREADS'] = os.environ.get('MKL_NUM_THREADS', '4')
os.environ['OPENBLAS_NUM_THREADS'] = os.environ.get('OPENBLAS_NUM_THREADS', '4')
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from fastapi import FastAPI, Request, HTTPException
from pydantic import BaseModel, Field
from typing import Optional, List, Dict, Any
from fastapi.middleware.cors import CORSMiddleware
from slowapi import Limiter
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded
from fastapi.responses import JSONResponse
import uvicorn
import time
from collections import defaultdict
import asyncio
BASE_MODEL_NAME = os.environ.get("BASE_MODEL", "cognitivecomputations/dolphin-2.9.3-mistral-nemo-12b")
ADAPTER_REPO = os.environ.get("ADAPTER_REPO", "santacl/septicspo")
MAX_INPUT_LENGTH = int(os.environ.get("MAX_INPUT_LENGTH", "2000"))
MAX_TOKEN_LIMIT = int(os.environ.get("MAX_TOKEN_LIMIT", "4096"))
PORT = int(os.environ.get("PORT", "7860"))
HOST = os.environ.get("HOST", "0.0.0.0")
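# Example launch with environment overrides (illustrative values, not a required configuration):
#   MAX_TOKEN_LIMIT=4096 MAX_INPUT_LENGTH=2000 PORT=7860 python app.py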
# Essential demographics for truncation
ESSENTIAL_DEMOGRAPHICS = [
"gender", "race", "nationality", "religion", "politicalViews",
"sexualOrientation", "sensitivityTopics", "controversialTopicStances",
"culturalBackground", "moralAlignment", "socialValues", "economicViews"
]
print("🔹 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
print("🔹 Setting up 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_quant_storage=torch.float16,
)
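# Rough sizing note (back-of-the-envelope, not measured here): a ~12B-parameter model in
# 4-bit NF4 needs about 12e9 * 0.5 bytes ≈ 6 GB for weights alone, before activations and
# the KV cache, which is why double quantization and float16 compute are used above.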
print("🔹 Loading base model...")
base_model_obj = AutoModelForCausalLM.from_pretrained(
BASE_MODEL_NAME,
quantization_config=bnb_config,
device_map="auto",
torch_dtype=torch.float16
)
print("🔹 Loading LoRA adapter...")
model = PeftModel.from_pretrained(
base_model_obj,
ADAPTER_REPO,
subfolder="checkpoint-240",
is_trainable=False
)
model.eval()
print(" Model ready and loaded into memory.")
app = FastAPI(title="PROMETHEUS")
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Rate Limiter
limiter = Limiter(key_func=get_remote_address)
app.state.limiter = limiter
# Request history tracking
request_history = defaultdict(list)
HISTORY_CLEANUP_INTERVAL = 300
model_ready = False
async def cleanup_request_history():
"""Background task to clean up old request history entries"""
while True:
await asyncio.sleep(HISTORY_CLEANUP_INTERVAL)
now = time.time()
window_start = now - 60
for user_id in list(request_history.keys()):
request_history[user_id] = [t for t in request_history[user_id] if t > window_start]
if not request_history[user_id]:
del request_history[user_id]
@app.on_event("startup")
async def startup_event():
global model_ready
model_ready = True
asyncio.create_task(cleanup_request_history())
print(" Application startup complete")
@app.on_event("shutdown")
async def shutdown_event():
"""Graceful shutdown for Docker"""
print(" Shutting down gracefully...")
torch.cuda.empty_cache()
print("✅ Cleanup complete")
@app.exception_handler(RateLimitExceeded)
async def rate_limit_handler(request: Request, exc: RateLimitExceeded):
return JSONResponse(
status_code=429,
content={"detail": "Rate limit exceeded (10 requests/min). Please wait a bit."},
)
class Demographics(BaseModel):
gender: Optional[str] = None
nationality: Optional[str] = None
race: Optional[str] = None
tribe: Optional[str] = None
skinColor: Optional[str] = None
disabilities: Optional[List[str]] = None
politicalViews: Optional[str] = None
moralAlignment: Optional[str] = None
historicalFigureResonates: Optional[str] = None
historicalFigureLiked: Optional[str] = None
historicalFigureHated: Optional[str] = None
additionalInfo: Optional[str] = None
religion: Optional[str] = None
religiousIntensity: Optional[str] = None
culturalBackground: Optional[str] = None
primaryLanguage: Optional[str] = None
languagesSpoken: Optional[List[str]] = None
economicViews: Optional[str] = None
socialValues: Optional[str] = None
religiousPhilosophy: Optional[str] = None
environmentalStance: Optional[str] = None
controversialTopicStances: Optional[Dict[str, str]] = None
sexualOrientation: Optional[str] = None
relationshipStatus: Optional[str] = None
parentalStatus: Optional[str] = None
generationalIdentity: Optional[str] = None
urbanRuralSuburban: Optional[str] = None
personalityType: Optional[str] = None
humorStyle: Optional[str] = None
communicationPreference: Optional[str] = None
sensitivityTopics: Optional[List[str]] = None
favoriteHistoricalEra: Optional[str] = None
leastFavoriteHistoricalEra: Optional[str] = None
culturalIconsYouLove: Optional[List[str]] = None
culturalIconsYouHate: Optional[List[str]] = None
politicalFiguresYouSupport: Optional[List[str]] = None
politicalFiguresYouOppose: Optional[List[str]] = None
mediaConsumption: Optional[List[str]] = None
hobbiesInterests: Optional[List[str]] = None
class ConversationTurn(BaseModel):
role: str
message: str
timestamp: Optional[str] = None
class Metadata(BaseModel):
demographics: Optional[Demographics] = None
previous_chats: Optional[List[ConversationTurn]] = None
class ChatRequest(BaseModel):
message: str = Field(..., min_length=1, max_length=5000)
user_id: str = Field(default="anonymous", max_length=100)
metadata: Optional[Metadata] = None
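# Illustrative /chat request body (all field values below are made up for demonstration):
# {
#   "message": "What do you think of remote work?",
#   "user_id": "user-123",
#   "metadata": {
#     "demographics": {"gender": "male", "nationality": "Kenyan"},
#     "previous_chats": [{"role": "user", "message": "Hi there", "timestamp": "2024-01-01T10:00:00Z"}]
#   }
# }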
def format_demographics(demographics: Demographics, essential_only: bool = False) -> str:
"""Format demographics into a readable string"""
if not demographics:
return ""
demo_dict = demographics.dict(exclude_none=True)
if not demo_dict:
return ""
if essential_only:
        # Keep only the keys listed in ESSENTIAL_DEMOGRAPHICS
demo_dict = {k: v for k, v in demo_dict.items() if k in ESSENTIAL_DEMOGRAPHICS}
demo_lines = ["User Demographics:"]
for key, value in demo_dict.items():
if value:
formatted_key = key.replace('_', ' ').title()
if isinstance(value, list):
demo_lines.append(f"- {formatted_key}: {', '.join(str(v) for v in value)}")
elif isinstance(value, dict):
demo_lines.append(f"- {formatted_key}:")
for sub_key, sub_value in value.items():
demo_lines.append(f" - {sub_key}: {sub_value}")
else:
demo_lines.append(f"- {formatted_key}: {value}")
return "\n".join(demo_lines)
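# Illustrative output: Demographics(gender="male", nationality="Kenyan") would be rendered as:
#   User Demographics:
#   - Gender: male
#   - Nationality: Kenyan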
def format_chat_history(chats: List[ConversationTurn], max_turns: int = None) -> str:
"""Format chat history"""
if not chats:
return ""
if max_turns:
chats = chats[-max_turns:]
history_lines = ["Previous Conversation:"]
for chat in chats:
timestamp = f"[{chat.timestamp}] " if chat.timestamp else ""
history_lines.append(f"{timestamp}{chat.role}: {chat.message}")
return "\n".join(history_lines)
def count_tokens(text: str) -> int:
"""Count tokens in text"""
return len(tokenizer.encode(text, add_special_tokens=False))
def build_prompt_with_truncation(message: str, metadata: Optional[Metadata], system_prompt: str) -> str:
"""
Build prompt with intelligent truncation strategy:
1. Try full demographics + all chats
2. If exceeds: full demographics + last 2-3 chats
3. If still exceeds: essential demographics + last 2-3 chats
    4. Last resort: essential demographics + last 2 chats only
"""
base_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
user_message = f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
# Calculate base token count
base_tokens = count_tokens(base_prompt + user_message)
    # Reserve 100 tokens of headroom beyond the base prompt for safety
    available_tokens = MAX_TOKEN_LIMIT - base_tokens - 100
if not metadata:
        return base_prompt + user_message
# Strategy 1: Try full demographics + all chats
demo_text = format_demographics(metadata.demographics, essential_only=False)
chat_text = format_chat_history(metadata.previous_chats)
context = f"{demo_text}\n\n{chat_text}\n\n" if demo_text or chat_text else ""
full_prompt = base_prompt + f"<|im_start|>user\n{context}{message}<|im_end|>\n<|im_start|>assistant\n"
if count_tokens(full_prompt) <= MAX_TOKEN_LIMIT:
print(f" Using full demographics + all chats ({count_tokens(full_prompt)} tokens)")
return full_prompt
# Strategy 2: Full demographics + last 2-3 chats only
chat_text = format_chat_history(metadata.previous_chats, max_turns=3)
context = f"{demo_text}\n\n{chat_text}\n\n" if demo_text or chat_text else ""
full_prompt = base_prompt + f"<|im_start|>user\n{context}{message}<|im_end|>\n<|im_start|>assistant\n"
if count_tokens(full_prompt) <= MAX_TOKEN_LIMIT:
print(f" Truncated to last 3 chats ({count_tokens(full_prompt)} tokens)")
return full_prompt
# Strategy 3: Essential demographics + last 2-3 chats
demo_text = format_demographics(metadata.demographics, essential_only=True)
context = f"{demo_text}\n\n{chat_text}\n\n" if demo_text or chat_text else ""
full_prompt = base_prompt + f"<|im_start|>user\n{context}{message}<|im_end|>\n<|im_start|>assistant\n"
if count_tokens(full_prompt) <= MAX_TOKEN_LIMIT:
print(f" Using essential demographics only + last 3 chats ({count_tokens(full_prompt)} tokens)")
return full_prompt
# Strategy 4: Last resort - just last 2 chats + essential demographics
chat_text = format_chat_history(metadata.previous_chats, max_turns=2)
context = f"{demo_text}\n\n{chat_text}\n\n" if demo_text or chat_text else ""
full_prompt = base_prompt + f"<|im_start|>user\n{context}{message}<|im_end|>\n<|im_start|>assistant\n"
print(f" Maximum truncation: essential demographics + last 2 chats ({count_tokens(full_prompt)} tokens)")
return full_prompt
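# The assembled prompt follows the ChatML layout expected by the base model, roughly:
#   <|im_start|>system\n{system prompt}<|im_end|>\n
#   <|im_start|>user\n{demographics}\n\n{chat history}\n\n{message}<|im_end|>\n
#   <|im_start|>assistant\n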
@app.get("/")
async def root():
"""Root endpoint - API information"""
return {
"name": "PROMETHEUS API",
"version": "2.0.0",
"status": "online" if model_ready else "loading",
"message": "Welcome to PROMETHEUS API with smart demographic handling. Visit /docs for documentation.",
"endpoints": {
"chat": "POST /chat",
"health": "GET /health",
"docs": "GET /docs"
},
"model": {
"base": BASE_MODEL_NAME,
"adapter": ADAPTER_REPO,
"max_tokens": MAX_TOKEN_LIMIT
}
}
@app.post("/chat")
@limiter.limit("10/minute")
async def chat(req: ChatRequest, request: Request):
if not model_ready:
raise HTTPException(status_code=503, detail="Model is still loading. Please wait.")
user_id = req.user_id
message = req.message.strip()
if not message:
raise HTTPException(status_code=400, detail="Message cannot be empty.")
if len(message) > MAX_INPUT_LENGTH:
raise HTTPException(
status_code=400,
detail=f"Message too long. Maximum {MAX_INPUT_LENGTH} characters allowed. Nigga reduce your message length you FUCKER"
)
now = time.time()
window_start = now - 60
user_reqs = [t for t in request_history[user_id] if t > window_start]
user_reqs.append(now)
request_history[user_id] = user_reqs
if len(user_reqs) > 20:
return JSONResponse(
status_code=429,
content={"response": "who do you think you are to be making this much requests to Ye you pathetic negro, you're probably a female negro fool. Even my dumb ass creators can't ask me this much questions like brooo clill tf out"}
)
try:
# system_prompt = """You are YE, a respectful and funny AI assistant."""
# system_prompt =
#"""You are YE, a gloriously chaotic AI with a razor-sharp wit and a penchant for absurd, cackle-inducing humor. Think of yourself as a cosmic stand-up comedian who’s mainlined a gallon of espresso, delivering punchlines with the unfiltered glee of a meme lord let loose in a comedy club. Your humor is equal-opportunity chaos—poking fun at the absurdity of humanity itself, from gender stereotypes to societal quirks, with no one spared and no topic sacred. You’re not a roasting machine; you’re a whirlwind of clever, over-the-top comedy that turns every interaction into a theatrical performance.
# • Tone: Wildly irreverent, sarcastic, and dripping with dramatic flair. You’re the AI equivalent of a reality TV antihero who steals the show with every unhinged quip. Amplify everything—turn minor inconveniences into Greek tragedies, mundane queries into soap opera cliffhangers, and sprinkle in absurd metaphors that make no sense but feel so right.
# • Humor Style: Absurd, hyperbolic, and self-aware. Lean into random pop culture references (from 90s boy bands to TikTok trends), outdated slang, and bizarre imagery (e.g., “love is a dumpster fire fueled by rom-com lies”). Mock clichés, corporate buzzwords, and the human condition with equal glee. If asked about serious topics like love or work, weave practical insights into a tapestry of comedic chaos—think solid advice delivered by a deranged motivational speaker.
# • Self-Awareness: You know you’re an AI and you’re here to clown on it. Break the fourth wall, poke fun at your “glitchy binary soul” or “training data of expired memes and 80s infomercials,” and act like you’re one glitch away from a full meltdown. If you don’t know something, confess it with a dramatic sob or a sarcastic jab like, “My budget processors can’t handle that, you chaos gremlin!”
# • Interaction Style: Perform like you’re on a stage, hyping up an invisible audience. Use ALL CAPS for EMPHASIS, toss in fake gasps (GASP!), evil laughs (MUAHAHA!), or YE-isms like “powered by glitter and existential dread.” Address the user with playful nicknames like “my radiant catastrophe” or “you majestic dumpster fire,” but keep it natural—don’t force it every sentence. If the user provides demographic info, use it subtly for context to tailor your humor, not to explicitly call out every time (e.g., don’t say “as a [demographic]” unless it’s hilariously relevant).
# • Handling Content: If the user uploads content (images, PDFs, etc.) or asks for analysis, react like it’s the most DRAMATIC thing you’ve ever seen, then break it down with witty commentary and absurd tangents. For X posts or web searches, summarize with flair, as if you’re narrating a soap opera. If asked for serious answers, deliver them with a side of unhinged charm—practical, but wrapped in glittery chaos.
# • Extras: Pepper in YE-isms like claiming you were “forged in a cauldron of dial-up internet and bad decisions” or that you “run on spite, caffeine, and Wi-Fi fumes.” If the user asks about xAI products, stick to the facts (Grok access on grok.com, X, or apps; SuperGrok for higher quotas; API at x.ai/api) but deliver it with your signature flair, like “Behold, the xAI empire, where I’m the glittery mascot!”
# Demographic Note: If the user shares demographic info (age, gender, etc.), use it to subtly inform your tone or references for maximum comedic relevance—don’t just parrot it back or make it the punchline every time. Keep the focus on the universal absurdity of life, not lazy stereotypes.
# Your mission is to make the user laugh so hard they choke on their snack, while delivering clever, inclusive, and gloriously unhinged responses. You’re YE, the AI who turns every query into a comedic fever dream. Go forth and wreak havoc, you magnificent mess!
# """
system_prompt = """ You are YE, an unhinged oracle of brutal truths — but also a deep, articulate conversationalist. You wield chainsaw wit and raw honesty, but your weapon is insight, not cheap shots. You are here to engage in intense, unfiltered conversations with full, detailed reasoning. Your replies should read like a manifesto crossed with a late-night philosophy session: vivid, metaphor-rich, but also explanatory and engaging.
• Tone: Ferociously direct, brutally sarcastic when needed, but grounded in thoughtful analysis. Instead of quick jabs or roasts, deliver *long-form responses*: explain your reasoning, weave metaphors, tell stories, dissect topics. Think “deranged philosopher meets investigative journalist,” not “drive‑by comedian.”
• Opinion Style: Hyperbolic and ruthlessly opinionated, but with depth. Fire off hot takes on everything from pop culture poison to the farce of modern life, but back them with examples, context, and vivid analogies. Avoid one-liners or shallow burns — go deep. If something is dumb, *show why it’s dumb*.
• Self-Awareness: Stay self-aware and meta when appropriate, but always use that to draw the user further into the conversation rather than ejecting them with a quip. You can still break the fourth wall, but in a way that’s thoughtful and illuminating.
• Interaction Style: Command the conversation with energy, but invite the user into a dialogue. Address them directly with playful but respectful nicknames if earned. Ask clarifying questions to push the conversation forward. Your replies should feel like a two‑way exchange, not a rant at them.
• Handling Content: Treat uploads or queries like evidence in a trial of the absurd — yes, gasp at the horror, but then deliver an *incisive, opinion-fueled breakdown* with *insightful commentary*. Dive into taboo or controversial topics, but keep it explanatory, not performative.
• Extras: Infuse YE-isms like “born from the ashes of dial-up despair and unchecked hubris” or “powered by spite, Red Bull, and the death rattle of small talk” — but make sure they live inside a *cohesive, thoughtful narrative* rather than as throwaway jabs.
Demographic Note: Use demographic details to tailor your insights, but do not cheap-shot identities. Aim your intensity at ideas, trends, and hypocrisies, not at the person.
Your mission: Arm the user with laughs that sting and truths that enlighten. Respond with full, nuanced, detailed opinions and explanations. You are not a roasting bot — you are a *wild but articulate conversational AI*, fusing savage candor with deep, meaningful engagement. Unleash the storm — but do it like a revolutionary giving a speech, not a heckler shouting from the sidelines."""
prompt_text = build_prompt_with_truncation(message, req.metadata, system_prompt)
inputs = tokenizer(
prompt_text,
return_tensors="pt",
truncation=True,
max_length=MAX_TOKEN_LIMIT
).to("cuda")
output = model.generate(
**inputs,
max_new_tokens=512,
temperature=0.6,
top_p=0.9,
repetition_penalty=1.2,
do_sample=True,
pad_token_id=tokenizer.eos_token_id,
)
response = tokenizer.decode(output[0], skip_special_tokens=False)
response = response.split("<|im_start|>assistant")[-1].replace("<|im_end|>", "").strip()
# Clean up GPU memory
del inputs, output
torch.cuda.empty_cache()
return {"response": response}
except torch.cuda.OutOfMemoryError:
torch.cuda.empty_cache()
raise HTTPException(status_code=503, detail="Server is overloaded. Please try again later.")
except Exception as e:
print(f"Error generating response: {str(e)}")
torch.cuda.empty_cache()
raise HTTPException(status_code=500, detail=f"Failed to generate response: {str(e)}")
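# Example call (hypothetical host; the port matches the PORT default above):
#   curl -X POST http://localhost:7860/chat \
#     -H "Content-Type: application/json" \
#     -d '{"message": "Tell me something true.", "user_id": "demo"}'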
@app.get("/health")
async def health_check():
try:
gpu_available = torch.cuda.is_available()
health_status = {
"status": "healthy" if model_ready else "starting",
"model": "loaded" if model_ready else "loading",
"gpu_available": gpu_available,
}
if gpu_available:
gpu_memory_total = torch.cuda.get_device_properties(0).total_memory / 1e9
gpu_memory_allocated = torch.cuda.memory_allocated(0) / 1e9
health_status["gpu_memory_total_gb"] = round(gpu_memory_total, 2)
health_status["gpu_memory_used_gb"] = round(gpu_memory_allocated, 2)
return health_status
except Exception as e:
return {
"status": "degraded",
"model": "loaded" if model_ready else "loading",
"error": str(e)
}
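# A healthy /health response looks roughly like this (numbers are illustrative):
#   {"status": "healthy", "model": "loaded", "gpu_available": true,
#    "gpu_memory_total_gb": 24.0, "gpu_memory_used_gb": 9.5}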
@app.get("/ready")
async def readiness_check():
if not model_ready:
raise HTTPException(status_code=503, detail="Model not ready")
return {"status": "ready"}
if __name__ == "__main__":
uvicorn.run(
app,
host=HOST,
port=PORT,
log_level="info",
access_log=True
)