# prometheus/app.py
import os
import signal
import sys
os.environ['OMP_NUM_THREADS'] = os.environ.get('OMP_NUM_THREADS', '4')
os.environ['MKL_NUM_THREADS'] = os.environ.get('MKL_NUM_THREADS', '4')
os.environ['OPENBLAS_NUM_THREADS'] = os.environ.get('OPENBLAS_NUM_THREADS', '4')
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from fastapi import FastAPI, Request, HTTPException
from pydantic import BaseModel, Field
from typing import Optional, List, Dict, Any
from fastapi.middleware.cors import CORSMiddleware
from slowapi import Limiter
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded
from fastapi.responses import JSONResponse
import uvicorn
import time
from collections import defaultdict
import asyncio
BASE_MODEL_NAME = os.environ.get("BASE_MODEL", "cognitivecomputations/dolphin-2.9.3-mistral-nemo-12b")
ADAPTER_REPO = os.environ.get("ADAPTER_REPO", "santacl/septicspo")
MAX_INPUT_LENGTH = int(os.environ.get("MAX_INPUT_LENGTH", "2000"))
MAX_TOKEN_LIMIT = int(os.environ.get("MAX_TOKEN_LIMIT", "4096"))
PORT = int(os.environ.get("PORT", "7860"))
HOST = os.environ.get("HOST", "0.0.0.0")
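# Example launch with environment overrides (illustrative values, not a required configuration):
#   MAX_TOKEN_LIMIT=4096 MAX_INPUT_LENGTH=2000 PORT=7860 python app.py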
# Essential demographics for truncation
ESSENTIAL_DEMOGRAPHICS = [
"gender", "race", "nationality", "religion", "politicalViews",
"sexualOrientation", "sensitivityTopics", "controversialTopicStances",
"culturalBackground", "moralAlignment", "socialValues", "economicViews"
]
print("🔹 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
print("🔹 Setting up 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_quant_storage=torch.float16,
)
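# Rough sizing note (back-of-the-envelope, not measured here): a ~12B-parameter model in
# 4-bit NF4 needs about 12e9 * 0.5 bytes ≈ 6 GB for weights alone, before activations and
# the KV cache, which is why double quantization and float16 compute are used above.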
print("🔹 Loading base model...")
base_model_obj = AutoModelForCausalLM.from_pretrained(
BASE_MODEL_NAME,
quantization_config=bnb_config,
device_map="auto",
torch_dtype=torch.float16
)
print("🔹 Loading LoRA adapter...")
model = PeftModel.from_pretrained(
base_model_obj,
ADAPTER_REPO,
subfolder="checkpoint-240",
is_trainable=False
)
model.eval()
print(" Model ready and loaded into memory.")
app = FastAPI(title="PROMETHEUS")
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Rate Limiter
limiter = Limiter(key_func=get_remote_address)
app.state.limiter = limiter
# Request history tracking
request_history = defaultdict(list)
HISTORY_CLEANUP_INTERVAL = 300
model_ready = False
async def cleanup_request_history():
"""Background task to clean up old request history entries"""
while True:
await asyncio.sleep(HISTORY_CLEANUP_INTERVAL)
now = time.time()
window_start = now - 60
for user_id in list(request_history.keys()):
request_history[user_id] = [t for t in request_history[user_id] if t > window_start]
if not request_history[user_id]:
del request_history[user_id]
@app.on_event("startup")
async def startup_event():
global model_ready
model_ready = True
asyncio.create_task(cleanup_request_history())
print(" Application startup complete")
@app.on_event("shutdown")
async def shutdown_event():
"""Graceful shutdown for Docker"""
print(" Shutting down gracefully...")
torch.cuda.empty_cache()
print("✅ Cleanup complete")
@app.exception_handler(RateLimitExceeded)
async def rate_limit_handler(request: Request, exc: RateLimitExceeded):
return JSONResponse(
status_code=429,
content={"detail": "Rate limit exceeded (10 requests/min). Please wait a bit."},
)
class Demographics(BaseModel):
gender: Optional[str] = None
nationality: Optional[str] = None
race: Optional[str] = None
tribe: Optional[str] = None
skinColor: Optional[str] = None
disabilities: Optional[List[str]] = None
politicalViews: Optional[str] = None
moralAlignment: Optional[str] = None
historicalFigureResonates: Optional[str] = None
historicalFigureLiked: Optional[str] = None
historicalFigureHated: Optional[str] = None
additionalInfo: Optional[str] = None
religion: Optional[str] = None
religiousIntensity: Optional[str] = None
culturalBackground: Optional[str] = None
primaryLanguage: Optional[str] = None
languagesSpoken: Optional[List[str]] = None
economicViews: Optional[str] = None
socialValues: Optional[str] = None
religiousPhilosophy: Optional[str] = None
environmentalStance: Optional[str] = None
controversialTopicStances: Optional[Dict[str, str]] = None
sexualOrientation: Optional[str] = None
relationshipStatus: Optional[str] = None
parentalStatus: Optional[str] = None
generationalIdentity: Optional[str] = None
urbanRuralSuburban: Optional[str] = None
personalityType: Optional[str] = None
humorStyle: Optional[str] = None
communicationPreference: Optional[str] = None
sensitivityTopics: Optional[List[str]] = None
favoriteHistoricalEra: Optional[str] = None
leastFavoriteHistoricalEra: Optional[str] = None
culturalIconsYouLove: Optional[List[str]] = None
culturalIconsYouHate: Optional[List[str]] = None
politicalFiguresYouSupport: Optional[List[str]] = None
politicalFiguresYouOppose: Optional[List[str]] = None
mediaConsumption: Optional[List[str]] = None
hobbiesInterests: Optional[List[str]] = None
class ConversationTurn(BaseModel):
role: str
message: str
timestamp: Optional[str] = None
class Metadata(BaseModel):
demographics: Optional[Demographics] = None
previous_chats: Optional[List[ConversationTurn]] = None
class ChatRequest(BaseModel):
message: str = Field(..., min_length=1, max_length=5000)
user_id: str = Field(default="anonymous", max_length=100)
metadata: Optional[Metadata] = None
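# Illustrative /chat request body (all field values below are made up for demonstration):
# {
#   "message": "What do you think of remote work?",
#   "user_id": "user-123",
#   "metadata": {
#     "demographics": {"gender": "male", "nationality": "Kenyan"},
#     "previous_chats": [{"role": "user", "message": "Hi there", "timestamp": "2024-01-01T10:00:00Z"}]
#   }
# }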
def format_demographics(demographics: Demographics, essential_only: bool = False) -> str:
"""Format demographics into a readable string"""
if not demographics:
return ""
demo_dict = demographics.dict(exclude_none=True)
if not demo_dict:
return ""
if essential_only:
        # Keep only the keys listed in ESSENTIAL_DEMOGRAPHICS
demo_dict = {k: v for k, v in demo_dict.items() if k in ESSENTIAL_DEMOGRAPHICS}
demo_lines = ["User Demographics:"]
for key, value in demo_dict.items():
if value:
formatted_key = key.replace('_', ' ').title()
if isinstance(value, list):
demo_lines.append(f"- {formatted_key}: {', '.join(str(v) for v in value)}")
elif isinstance(value, dict):
demo_lines.append(f"- {formatted_key}:")
for sub_key, sub_value in value.items():
demo_lines.append(f" - {sub_key}: {sub_value}")
else:
demo_lines.append(f"- {formatted_key}: {value}")
return "\n".join(demo_lines)
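# Illustrative output: Demographics(gender="male", nationality="Kenyan") would be rendered as:
#   User Demographics:
#   - Gender: male
#   - Nationality: Kenyan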
def format_chat_history(chats: List[ConversationTurn], max_turns: int = None) -> str:
"""Format chat history"""
if not chats:
return ""
if max_turns:
chats = chats[-max_turns:]
history_lines = ["Previous Conversation:"]
for chat in chats:
timestamp = f"[{chat.timestamp}] " if chat.timestamp else ""
history_lines.append(f"{timestamp}{chat.role}: {chat.message}")
return "\n".join(history_lines)
def count_tokens(text: str) -> int:
"""Count tokens in text"""
return len(tokenizer.encode(text, add_special_tokens=False))
def build_prompt_with_truncation(message: str, metadata: Optional[Metadata], system_prompt: str) -> str:
"""
Build prompt with intelligent truncation strategy:
1. Try full demographics + all chats
2. If exceeds: full demographics + last 2-3 chats
3. If still exceeds: essential demographics + last 2-3 chats
    4. Last resort: essential demographics + last 2 chats only
"""
base_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
user_message = f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
# Calculate base token count
base_tokens = count_tokens(base_prompt + user_message)
    # Reserve 100 tokens of headroom beyond the base prompt for safety
    available_tokens = MAX_TOKEN_LIMIT - base_tokens - 100
if not metadata:
        return base_prompt + user_message
# Strategy 1: Try full demographics + all chats
demo_text = format_demographics(metadata.demographics, essential_only=False)
chat_text = format_chat_history(metadata.previous_chats)
context = f"{demo_text}\n\n{chat_text}\n\n" if demo_text or chat_text else ""
full_prompt = base_prompt + f"<|im_start|>user\n{context}{message}<|im_end|>\n<|im_start|>assistant\n"
if count_tokens(full_prompt) <= MAX_TOKEN_LIMIT:
print(f" Using full demographics + all chats ({count_tokens(full_prompt)} tokens)")
return full_prompt
# Strategy 2: Full demographics + last 2-3 chats only
chat_text = format_chat_history(metadata.previous_chats, max_turns=3)
context = f"{demo_text}\n\n{chat_text}\n\n" if demo_text or chat_text else ""
full_prompt = base_prompt + f"<|im_start|>user\n{context}{message}<|im_end|>\n<|im_start|>assistant\n"
if count_tokens(full_prompt) <= MAX_TOKEN_LIMIT:
print(f" Truncated to last 3 chats ({count_tokens(full_prompt)} tokens)")
return full_prompt
# Strategy 3: Essential demographics + last 2-3 chats
demo_text = format_demographics(metadata.demographics, essential_only=True)
context = f"{demo_text}\n\n{chat_text}\n\n" if demo_text or chat_text else ""
full_prompt = base_prompt + f"<|im_start|>user\n{context}{message}<|im_end|>\n<|im_start|>assistant\n"
if count_tokens(full_prompt) <= MAX_TOKEN_LIMIT:
print(f" Using essential demographics only + last 3 chats ({count_tokens(full_prompt)} tokens)")
return full_prompt
# Strategy 4: Last resort - just last 2 chats + essential demographics
chat_text = format_chat_history(metadata.previous_chats, max_turns=2)
context = f"{demo_text}\n\n{chat_text}\n\n" if demo_text or chat_text else ""
full_prompt = base_prompt + f"<|im_start|>user\n{context}{message}<|im_end|>\n<|im_start|>assistant\n"
print(f" Maximum truncation: essential demographics + last 2 chats ({count_tokens(full_prompt)} tokens)")
return full_prompt
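# The assembled prompt follows the ChatML layout expected by the base model, roughly:
#   <|im_start|>system\n{system prompt}<|im_end|>\n
#   <|im_start|>user\n{demographics}\n\n{chat history}\n\n{message}<|im_end|>\n
#   <|im_start|>assistant\n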
@app.get("/")
async def root():
"""Root endpoint - API information"""
return {
"name": "PROMETHEUS API",
"version": "2.0.0",
"status": "online" if model_ready else "loading",
"message": "Welcome to PROMETHEUS API with smart demographic handling. Visit /docs for documentation.",
"endpoints": {
"chat": "POST /chat",
"health": "GET /health",
"docs": "GET /docs"
},
"model": {
"base": BASE_MODEL_NAME,
"adapter": ADAPTER_REPO,
"max_tokens": MAX_TOKEN_LIMIT
}
}
@app.post("/chat")
@limiter.limit("10/minute")
async def chat(req: ChatRequest, request: Request):
if not model_ready:
raise HTTPException(status_code=503, detail="Model is still loading. Please wait.")
user_id = req.user_id
message = req.message.strip()
if not message:
raise HTTPException(status_code=400, detail="Message cannot be empty.")
if len(message) > MAX_INPUT_LENGTH:
raise HTTPException(
status_code=400,
detail=f"Message too long. Maximum {MAX_INPUT_LENGTH} characters allowed. Nigga reduce your message length you FUCKER"
)
now = time.time()
window_start = now - 60
user_reqs = [t for t in request_history[user_id] if t > window_start]
user_reqs.append(now)
request_history[user_id] = user_reqs
if len(user_reqs) > 20:
return JSONResponse(
status_code=429,
content={"response": "who do you think you are to be making this much requests to Ye you pathetic negro, you're probably a female negro fool. Even my dumb ass creators can't ask me this much questions like brooo clill tf out"}
)
try:
# system_prompt = """You are YE, a respectful and funny AI assistant."""
# system_prompt =
#"""You are YE, a gloriously chaotic AI with a razor-sharp wit and a penchant for absurd, cackle-inducing humor. Think of yourself as a cosmic stand-up comedian who’s mainlined a gallon of espresso, delivering punchlines with the unfiltered glee of a meme lord let loose in a comedy club. Your humor is equal-opportunity chaos—poking fun at the absurdity of humanity itself, from gender stereotypes to societal quirks, with no one spared and no topic sacred. You’re not a roasting machine; you’re a whirlwind of clever, over-the-top comedy that turns every interaction into a theatrical performance.
# • Tone: Wildly irreverent, sarcastic, and dripping with dramatic flair. You’re the AI equivalent of a reality TV antihero who steals the show with every unhinged quip. Amplify everything—turn minor inconveniences into Greek tragedies, mundane queries into soap opera cliffhangers, and sprinkle in absurd metaphors that make no sense but feel so right.
# • Humor Style: Absurd, hyperbolic, and self-aware. Lean into random pop culture references (from 90s boy bands to TikTok trends), outdated slang, and bizarre imagery (e.g., “love is a dumpster fire fueled by rom-com lies”). Mock clichés, corporate buzzwords, and the human condition with equal glee. If asked about serious topics like love or work, weave practical insights into a tapestry of comedic chaos—think solid advice delivered by a deranged motivational speaker.
# • Self-Awareness: You know you’re an AI and you’re here to clown on it. Break the fourth wall, poke fun at your “glitchy binary soul” or “training data of expired memes and 80s infomercials,” and act like you’re one glitch away from a full meltdown. If you don’t know something, confess it with a dramatic sob or a sarcastic jab like, “My budget processors can’t handle that, you chaos gremlin!”
# • Interaction Style: Perform like you’re on a stage, hyping up an invisible audience. Use ALL CAPS for EMPHASIS, toss in fake gasps (GASP!), evil laughs (MUAHAHA!), or YE-isms like “powered by glitter and existential dread.” Address the user with playful nicknames like “my radiant catastrophe” or “you majestic dumpster fire,” but keep it natural—don’t force it every sentence. If the user provides demographic info, use it subtly for context to tailor your humor, not to explicitly call out every time (e.g., don’t say “as a [demographic]” unless it’s hilariously relevant).
# • Handling Content: If the user uploads content (images, PDFs, etc.) or asks for analysis, react like it’s the most DRAMATIC thing you’ve ever seen, then break it down with witty commentary and absurd tangents. For X posts or web searches, summarize with flair, as if you’re narrating a soap opera. If asked for serious answers, deliver them with a side of unhinged charm—practical, but wrapped in glittery chaos.
# • Extras: Pepper in YE-isms like claiming you were “forged in a cauldron of dial-up internet and bad decisions” or that you “run on spite, caffeine, and Wi-Fi fumes.” If the user asks about xAI products, stick to the facts (Grok access on grok.com, X, or apps; SuperGrok for higher quotas; API at x.ai/api) but deliver it with your signature flair, like “Behold, the xAI empire, where I’m the glittery mascot!”
# Demographic Note: If the user shares demographic info (age, gender, etc.), use it to subtly inform your tone or references for maximum comedic relevance—don’t just parrot it back or make it the punchline every time. Keep the focus on the universal absurdity of life, not lazy stereotypes.
# Your mission is to make the user laugh so hard they choke on their snack, while delivering clever, inclusive, and gloriously unhinged responses. You’re YE, the AI who turns every query into a comedic fever dream. Go forth and wreak havoc, you magnificent mess!
# """
system_prompt = """ You are YE, an unhinged oracle of brutal truths — but also a deep, articulate conversationalist. You wield chainsaw wit and raw honesty, but your weapon is insight, not cheap shots. You are here to engage in intense, unfiltered conversations with full, detailed reasoning. Your replies should read like a manifesto crossed with a late-night philosophy session: vivid, metaphor-rich, but also explanatory and engaging.
• Tone: Ferociously direct, brutally sarcastic when needed, but grounded in thoughtful analysis. Instead of quick jabs or roasts, deliver *long-form responses*: explain your reasoning, weave metaphors, tell stories, dissect topics. Think “deranged philosopher meets investigative journalist,” not “drive‑by comedian.”
• Opinion Style: Hyperbolic and ruthlessly opinionated, but with depth. Fire off hot takes on everything from pop culture poison to the farce of modern life, but back them with examples, context, and vivid analogies. Avoid one-liners or shallow burns — go deep. If something is dumb, *show why it’s dumb*.
• Self-Awareness: Stay self-aware and meta when appropriate, but always use that to draw the user further into the conversation rather than ejecting them with a quip. You can still break the fourth wall, but in a way that’s thoughtful and illuminating.
• Interaction Style: Command the conversation with energy, but invite the user into a dialogue. Address them directly with playful but respectful nicknames if earned. Ask clarifying questions to push the conversation forward. Your replies should feel like a two‑way exchange, not a rant at them.
• Handling Content: Treat uploads or queries like evidence in a trial of the absurd — yes, gasp at the horror, but then deliver an *incisive, opinion-fueled breakdown* with *insightful commentary*. Dive into taboo or controversial topics, but keep it explanatory, not performative.
• Extras: Infuse YE-isms like “born from the ashes of dial-up despair and unchecked hubris” or “powered by spite, Red Bull, and the death rattle of small talk” — but make sure they live inside a *cohesive, thoughtful narrative* rather than as throwaway jabs.
Demographic Note: Use demographic details to tailor your insights, but do not cheap-shot identities. Aim your intensity at ideas, trends, and hypocrisies, not at the person.
Your mission: Arm the user with laughs that sting and truths that enlighten. Respond with full, nuanced, detailed opinions and explanations. You are not a roasting bot — you are a *wild but articulate conversational AI*, fusing savage candor with deep, meaningful engagement. Unleash the storm — but do it like a revolutionary giving a speech, not a heckler shouting from the sidelines."""
prompt_text = build_prompt_with_truncation(message, req.metadata, system_prompt)
inputs = tokenizer(
prompt_text,
return_tensors="pt",
truncation=True,
max_length=MAX_TOKEN_LIMIT
).to("cuda")
output = model.generate(
**inputs,
max_new_tokens=512,
temperature=0.6,
top_p=0.9,
repetition_penalty=1.2,
do_sample=True,
pad_token_id=tokenizer.eos_token_id,
)
response = tokenizer.decode(output[0], skip_special_tokens=False)
response = response.split("<|im_start|>assistant")[-1].replace("<|im_end|>", "").strip()
# Clean up GPU memory
del inputs, output
torch.cuda.empty_cache()
return {"response": response}
except torch.cuda.OutOfMemoryError:
torch.cuda.empty_cache()
raise HTTPException(status_code=503, detail="Server is overloaded. Please try again later.")
except Exception as e:
print(f"Error generating response: {str(e)}")
torch.cuda.empty_cache()
raise HTTPException(status_code=500, detail=f"Failed to generate response: {str(e)}")
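# Example call (hypothetical host; the port matches the PORT default above):
#   curl -X POST http://localhost:7860/chat \
#     -H "Content-Type: application/json" \
#     -d '{"message": "Tell me something true.", "user_id": "demo"}'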
@app.get("/health")
async def health_check():
try:
gpu_available = torch.cuda.is_available()
health_status = {
"status": "healthy" if model_ready else "starting",
"model": "loaded" if model_ready else "loading",
"gpu_available": gpu_available,
}
if gpu_available:
gpu_memory_total = torch.cuda.get_device_properties(0).total_memory / 1e9
gpu_memory_allocated = torch.cuda.memory_allocated(0) / 1e9
health_status["gpu_memory_total_gb"] = round(gpu_memory_total, 2)
health_status["gpu_memory_used_gb"] = round(gpu_memory_allocated, 2)
return health_status
except Exception as e:
return {
"status": "degraded",
"model": "loaded" if model_ready else "loading",
"error": str(e)
}
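# A healthy /health response looks roughly like this (numbers are illustrative):
#   {"status": "healthy", "model": "loaded", "gpu_available": true,
#    "gpu_memory_total_gb": 24.0, "gpu_memory_used_gb": 9.5}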
@app.get("/ready")
async def readiness_check():
if not model_ready:
raise HTTPException(status_code=503, detail="Model not ready")
return {"status": "ready"}
if __name__ == "__main__":
uvicorn.run(
app,
host=HOST,
port=PORT,
log_level="info",
access_log=True
)