Spaces:

Supra-Nexus
/

supra-nexus-o2

Sleeping

App Files Files Community

Jan Biermeyer committed on Nov 3

Commit

34fc1eb

1 Parent(s): a905164

cpu optimization

Browse files

Files changed (3) hide show

rag/model_loader.py +203 -60
rag/{rag_m2max.py → rag.py} +29 -26
requirements.txt +5 -1

rag/model_loader.py CHANGED Viewed

@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
-SUPRA Enhanced Model Loader for M2 Max
-Optimized model loading with MPS acceleration and Streamlit caching
 """
 import torch
@@ -28,41 +28,43 @@ except ImportError:
     logger.warning("⚠️  PEFT not available. LoRA adapter loading will be disabled.")
 def setup_m2_max_optimizations():
-    """Configure optimizations for M2 Max."""
-    logger.info("🍎 Setting up M2 Max optimizations for model loading...")
-    # M2 Max specific environment variables
-    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
-    # Disable bitsandbytes for M2 Max (not needed with MPS)
-    os.environ["DISABLE_BITSANDBYTES"] = "1"
     # Set up Hugging Face token from HUGGINGFACE_TOKEN
     if os.environ.get("HUGGINGFACE_TOKEN") and not os.environ.get("HF_TOKEN"):
         os.environ["HF_TOKEN"] = os.environ["HUGGINGFACE_TOKEN"]
         logger.info("🔑 Using HUGGINGFACE_TOKEN for Hugging Face authentication")
-    # Memory management
     if torch.backends.mps.is_available():
-        logger.info("✅ MPS (Metal Performance Shaders) available")
         device = "mps"
     else:
-        logger.info("⚠️ MPS not available, using CPU")
         device = "cpu"
-    # Optimize PyTorch for M2 Max
-    torch.backends.mps.is_built()
     logger.info(f"🔧 Using device: {device}")
     return device
 @st.cache_resource
 def load_enhanced_model_m2max() -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
-    """Load the enhanced SUPRA model optimized for M2 Max with caching."""
-    logger.info("📥 Loading enhanced SUPRA model for M2 Max...")
-    # Setup M2 Max optimizations
     device = setup_m2_max_optimizations()
     # Model paths - try local lora/ folder first (for deployment), then outputs directory
@@ -111,23 +113,23 @@ def load_enhanced_model_m2max() -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
                 base_model_name = adapter_config.get("base_model_name_or_path")
                 logger.info(f"📖 Base model from adapter config: {base_model_name}")
-                # Use non-quantized version for M2 Max (MPS), quantized for CUDA
-                # Check if we're on MPS (M2 Max) or CUDA
                 is_mps = torch.backends.mps.is_available()
                 if base_model_name and "llama" in base_model_name.lower():
                     if is_mps:
-                        # M2 Max: Use non-quantized model (no bitsandbytes needed)
                         base_model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
                     else:
-                        # CUDA: Use quantized Unsloth version
                         base_model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
                 elif base_model_name and "mistral" in base_model_name.lower():
                     if is_mps:
-                        # M2 Max: Use non-quantized model
                         base_model_name = "mistralai/Mistral-7B-Instruct-v0.3"
                     else:
-                        # CUDA: Use quantized Unsloth version
                         base_model_name = "unsloth/Mistral-7B-Instruct-v0.3-bnb-4bit"
         except Exception as e:
             logger.warning(f"⚠️  Could not read adapter config: {e}")
@@ -137,6 +139,7 @@ def load_enhanced_model_m2max() -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
                 if is_mps:
                     base_model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
                 else:
                     base_model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
     # Fallback to old checkpoint structure
@@ -163,9 +166,9 @@ def load_enhanced_model_m2max() -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
     if base_model_name is None:
         is_mps = torch.backends.mps.is_available()
         if is_mps:
-            base_model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # M2 Max: non-quantized
         else:
-            base_model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"  # CUDA: quantized
     if use_local:
         logger.info(f"📚 Loading base model: {base_model_name}")
@@ -196,21 +199,72 @@ def load_enhanced_model_m2max() -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
         logger.info("✅ Tokenizer loaded successfully")
-        # Load base model with M2 Max optimizations
-        logger.info("🤖 Loading base model with M2 Max optimizations...")
         # Use /workspace/.cache if WORKSPACE is set, otherwise use .cache relative to current dir
         cache_dir = os.getenv("HF_HOME") or os.getenv("TRANSFORMERS_CACHE") or "/workspace/.cache/huggingface" if os.getenv("WORKSPACE") else ".cache/huggingface"
         offload_dir = os.getenv("WORKSPACE", "") + "/.cache/offload" if os.getenv("WORKSPACE") else ".cache/offload"
         base_model = AutoModelForCausalLM.from_pretrained(
             base_model_name,
-            cache_dir=cache_dir,
-            torch_dtype=torch.float16,  # Use float16 for memory efficiency
-            device_map="auto",  # Let transformers handle device placement
-            offload_folder=offload_dir,  # Allow CPU offload when needed
-            trust_remote_code=True,
-            low_cpu_mem_usage=True,  # Optimize for M2 Max memory
-            load_in_8bit=False,  # Disable 8-bit quantization (not needed for M2 Max)
-            load_in_4bit=False   # Disable 4-bit quantization (not needed for M2 Max)
         )
         logger.info("✅ Base model loaded successfully")
@@ -249,21 +303,64 @@ def load_enhanced_model_m2max() -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
         logger.info("✅ Tokenizer loaded successfully")
-        # Load base model (no LoRA adapter)
-        logger.info("🤖 Loading base model with M2 Max optimizations (no fine-tuning)...")
         # Use /workspace/.cache if WORKSPACE is set, otherwise use .cache relative to current dir
         cache_dir = os.getenv("HF_HOME") or os.getenv("TRANSFORMERS_CACHE") or "/workspace/.cache/huggingface" if os.getenv("WORKSPACE") else ".cache/huggingface"
         offload_dir = os.getenv("WORKSPACE", "") + "/.cache/offload" if os.getenv("WORKSPACE") else ".cache/offload"
         model = AutoModelForCausalLM.from_pretrained(
             base_model_name,
-            cache_dir=cache_dir,
-            torch_dtype=torch.float16,
-            device_map="auto",
-            offload_folder=offload_dir,
-            trust_remote_code=True,
-            low_cpu_mem_usage=True,
-            load_in_8bit=False,
-            load_in_4bit=False
         )
         logger.info("✅ Base model loaded successfully (no fine-tuning)")
@@ -287,17 +384,40 @@ def load_enhanced_model_m2max() -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
             if tokenizer.pad_token is None:
                 tokenizer.pad_token = tokenizer.eos_token
-            # Load model
             model = AutoModelForCausalLM.from_pretrained(
                 base_model_name,
-                cache_dir=cache_dir,
-                torch_dtype=torch.float16,
-                device_map="auto",
-                offload_folder=offload_dir,
-                trust_remote_code=True,
-                low_cpu_mem_usage=True,
-                load_in_8bit=False,  # Disable 8-bit quantization (not needed for M2 Max)
-                load_in_4bit=False   # Disable 4-bit quantization (not needed for M2 Max)
             )
             logger.info("✅ Model loaded from Hugging Face successfully")
@@ -338,6 +458,7 @@ def get_model_info() -> dict:
         # Determine base model based on device
         is_mps = torch.backends.mps.is_available()
         if tiny_models and tiny_models[0].exists() or small_models and small_models[0].exists() or prod_models and prod_models[0].exists():
             base_model = "meta-llama/Meta-Llama-3.1-8B-Instruct" if is_mps else "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
         else:
@@ -366,7 +487,7 @@ def generate_response_optimized(
     temperature: float = 0.7,  # Adjusted for better quality
     top_p: float = 0.9
 ) -> str:
-    """Generate response with M2 Max optimizations and full-sentence stopping."""
     try:
         # Import inference utilities
         from .inference_utils import create_stopping_criteria, ensure_supra_close
@@ -411,18 +532,40 @@ def generate_response_optimized(
             padding=False
         )
-        # Move to same device as model
-        device = next(model.parameters()).device
-        inputs = {k: v.to(device) for k, v in inputs.items()}
         # Create stopping criteria for full-sentence stopping
         stopping_criteria = create_stopping_criteria(tokenizer)
         # Generate response with full-sentence stopping
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
-                max_new_tokens=max_new_tokens,
                 temperature=temperature,
                 top_p=top_p,
                 do_sample=True,

 #!/usr/bin/env python3
 """
+SUPRA Enhanced Model Loader
+Optimized model loading with CPU/MPS/CUDA support and Streamlit caching
 """
 import torch
     logger.warning("⚠️  PEFT not available. LoRA adapter loading will be disabled.")
 def setup_m2_max_optimizations():
+    """Configure the process environment for the detected backend.
+
+    Sets TOKENIZERS_PARALLELISM to "false", copies HUGGINGFACE_TOKEN into
+    HF_TOKEN when only the former is set, then picks a device in the order
+    MPS > CUDA > CPU and sets or clears PYTORCH_ENABLE_MPS_FALLBACK and
+    DISABLE_BITSANDBYTES in os.environ to match that device.
+
+    The function keeps its historical "m2_max" name although it now handles
+    all three backends.
+
+    Returns:
+        The device name as a string: "mps", "cuda" or "cpu".
+    """
+    logger.info("🔧 Setting up device optimizations for model loading...")
+    # Environment variables
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
     # Set up Hugging Face token from HUGGINGFACE_TOKEN
+    # (an already-set HF_TOKEN is never overwritten)
     if os.environ.get("HUGGINGFACE_TOKEN") and not os.environ.get("HF_TOKEN"):
         os.environ["HF_TOKEN"] = os.environ["HUGGINGFACE_TOKEN"]
         logger.info("🔑 Using HUGGINGFACE_TOKEN for Hugging Face authentication")
+    # Detect device: MPS > CUDA > CPU
     if torch.backends.mps.is_available():
+        logger.info("✅ MPS (Metal Performance Shaders) available - using MPS")
         device = "mps"
+        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+        os.environ["DISABLE_BITSANDBYTES"] = "1"  # Disable for MPS
+        # NOTE(review): the return value of is_built() is discarded, so this
+        # call has no visible effect in this function.
+        torch.backends.mps.is_built()
+    elif torch.cuda.is_available():
+        logger.info("✅ CUDA available - using GPU")
+        device = "cuda"
+        os.environ.pop("DISABLE_BITSANDBYTES", None)  # Enable bitsandbytes for CUDA
     else:
+        logger.info("💻 CPU detected - enabling CPU optimizations")
         device = "cpu"
+        os.environ.pop("DISABLE_BITSANDBYTES", None)  # Enable bitsandbytes for CPU
+        # Clear any MPS fallback flag left in the environment.
+        os.environ.pop("PYTORCH_ENABLE_MPS_FALLBACK", None)
     logger.info(f"🔧 Using device: {device}")
     return device
 @st.cache_resource
 def load_enhanced_model_m2max() -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
+    """Load the enhanced SUPRA model with device-specific optimizations (CPU/MPS/CUDA) with caching."""
+    logger.info("📥 Loading enhanced SUPRA model with device optimizations...")
+    # Setup device optimizations
     device = setup_m2_max_optimizations()
     # Model paths - try local lora/ folder first (for deployment), then outputs directory
                 base_model_name = adapter_config.get("base_model_name_or_path")
                 logger.info(f"📖 Base model from adapter config: {base_model_name}")
+                # Select model version based on device: non-quantized for MPS, quantized for CPU/CUDA
                 is_mps = torch.backends.mps.is_available()
+                is_cpu = not is_mps and not torch.cuda.is_available()
                 if base_model_name and "llama" in base_model_name.lower():
                     if is_mps:
+                        # MPS: Use non-quantized model (no bitsandbytes needed)
                         base_model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
                     else:
+                        # CPU/CUDA: Use quantized Unsloth version
                         base_model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
                 elif base_model_name and "mistral" in base_model_name.lower():
                     if is_mps:
+                        # MPS: Use non-quantized model
                         base_model_name = "mistralai/Mistral-7B-Instruct-v0.3"
                     else:
+                        # CPU/CUDA: Use quantized Unsloth version
                         base_model_name = "unsloth/Mistral-7B-Instruct-v0.3-bnb-4bit"
         except Exception as e:
             logger.warning(f"⚠️  Could not read adapter config: {e}")
                 if is_mps:
                     base_model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
                 else:
+                    # CPU/CUDA: Use quantized version
                     base_model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
     # Fallback to old checkpoint structure
     if base_model_name is None:
         is_mps = torch.backends.mps.is_available()
         if is_mps:
+            base_model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # MPS: non-quantized
         else:
+            base_model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"  # CPU/CUDA: quantized
     if use_local:
         logger.info(f"📚 Loading base model: {base_model_name}")
         logger.info("✅ Tokenizer loaded successfully")
+        # Load base model with device-specific optimizations
+        logger.info("🤖 Loading base model with device optimizations...")
         # Use /workspace/.cache if WORKSPACE is set, otherwise use .cache relative to current dir
         cache_dir = os.getenv("HF_HOME") or os.getenv("TRANSFORMERS_CACHE") or "/workspace/.cache/huggingface" if os.getenv("WORKSPACE") else ".cache/huggingface"
         offload_dir = os.getenv("WORKSPACE", "") + "/.cache/offload" if os.getenv("WORKSPACE") else ".cache/offload"
+        # Detect device type for optimization
+        is_cpu = device == "cpu"
+        is_mps = device == "mps"
+        is_cuda = device == "cuda"
+        # Configure quantization for CPU
+        quantization_config = None
+        if is_cpu:
+            try:
+                from transformers import BitsAndBytesConfig
+                quantization_config = BitsAndBytesConfig(
+                    load_in_8bit=True,
+                    llm_int8_enable_fp32_cpu_offload=True
+                )
+                logger.info("💻 Using 8-bit quantization for CPU")
+            except ImportError:
+                logger.warning("⚠️ bitsandbytes not available, loading without quantization")
+        # Set dtype and quantization settings based on device
+        if is_cpu:
+            torch_dtype = torch.float32  # CPU: use float32
+            # If quantization_config is provided, don't also pass load_in_8bit
+            load_in_8bit = False if quantization_config else False
+            load_in_4bit = False
+        elif is_mps:
+            torch_dtype = torch.float16  # MPS: use float16
+            load_in_8bit = False
+            load_in_4bit = False
+        else:  # CUDA
+            torch_dtype = torch.float16  # CUDA: use float16
+            load_in_8bit = False  # CUDA can use 4-bit if needed
+            load_in_4bit = False
+        # Build model loading kwargs
+        model_kwargs = {
+            "cache_dir": cache_dir,
+            "torch_dtype": torch_dtype,
+            "trust_remote_code": True,
+            "low_cpu_mem_usage": True,
+        }
+        # Add device-specific settings
+        if is_cpu:
+            if quantization_config:
+                model_kwargs["quantization_config"] = quantization_config
+            # For CPU, don't use device_map (model stays on CPU)
+            model_kwargs["offload_folder"] = offload_dir
+        else:
+            model_kwargs["device_map"] = "auto"
+            if not is_mps:  # For CUDA, we can add offload if needed
+                model_kwargs["offload_folder"] = offload_dir
+        # Add quantization flags only if quantization_config is None
+        if not quantization_config:
+            model_kwargs["load_in_8bit"] = load_in_8bit
+            model_kwargs["load_in_4bit"] = load_in_4bit
         base_model = AutoModelForCausalLM.from_pretrained(
             base_model_name,
+            **model_kwargs
         )
         logger.info("✅ Base model loaded successfully")
         logger.info("✅ Tokenizer loaded successfully")
+        # Load base model (no LoRA adapter) with device-specific optimizations
+        logger.info("🤖 Loading base model with device optimizations (no fine-tuning)...")
         # Use /workspace/.cache if WORKSPACE is set, otherwise use .cache relative to current dir
         cache_dir = os.getenv("HF_HOME") or os.getenv("TRANSFORMERS_CACHE") or "/workspace/.cache/huggingface" if os.getenv("WORKSPACE") else ".cache/huggingface"
         offload_dir = os.getenv("WORKSPACE", "") + "/.cache/offload" if os.getenv("WORKSPACE") else ".cache/offload"
+        # Detect device type for optimization
+        is_cpu = device == "cpu"
+        is_mps = device == "mps"
+        # Configure quantization for CPU
+        quantization_config = None
+        if is_cpu:
+            try:
+                from transformers import BitsAndBytesConfig
+                quantization_config = BitsAndBytesConfig(
+                    load_in_8bit=True,
+                    llm_int8_enable_fp32_cpu_offload=True
+                )
+                logger.info("💻 Using 8-bit quantization for CPU")
+            except ImportError:
+                logger.warning("⚠️ bitsandbytes not available, loading without quantization")
+        # Set dtype and quantization settings based on device
+        if is_cpu:
+            torch_dtype = torch.float32
+            load_in_8bit = False if quantization_config else False
+            load_in_4bit = False
+        else:
+            torch_dtype = torch.float16
+            load_in_8bit = False
+            load_in_4bit = False
+        # Build model loading kwargs
+        model_kwargs = {
+            "cache_dir": cache_dir,
+            "torch_dtype": torch_dtype,
+            "trust_remote_code": True,
+            "low_cpu_mem_usage": True,
+        }
+        # Add device-specific settings
+        if is_cpu:
+            if quantization_config:
+                model_kwargs["quantization_config"] = quantization_config
+            model_kwargs["offload_folder"] = offload_dir
+        else:
+            model_kwargs["device_map"] = "auto"
+            model_kwargs["offload_folder"] = offload_dir
+        # Add quantization flags only if quantization_config is None
+        if not quantization_config:
+            model_kwargs["load_in_8bit"] = load_in_8bit
+            model_kwargs["load_in_4bit"] = load_in_4bit
         model = AutoModelForCausalLM.from_pretrained(
             base_model_name,
+            **model_kwargs
         )
         logger.info("✅ Base model loaded successfully (no fine-tuning)")
             if tokenizer.pad_token is None:
                 tokenizer.pad_token = tokenizer.eos_token
+            # Load model with device-specific optimizations (fallback code - usually not used)
+            is_cpu = device == "cpu"
+            quantization_config = None
+            if is_cpu:
+                try:
+                    from transformers import BitsAndBytesConfig
+                    quantization_config = BitsAndBytesConfig(
+                        load_in_8bit=True,
+                        llm_int8_enable_fp32_cpu_offload=True
+                    )
+                except ImportError:
+                    pass
+            # Build model loading kwargs
+            model_kwargs = {
+                "cache_dir": cache_dir,
+                "torch_dtype": torch.float32 if is_cpu else torch.float16,
+                "trust_remote_code": True,
+                "low_cpu_mem_usage": True,
+            }
+            if is_cpu:
+                if quantization_config:
+                    model_kwargs["quantization_config"] = quantization_config
+                model_kwargs["offload_folder"] = offload_dir
+            else:
+                model_kwargs["device_map"] = "auto"
+                model_kwargs["offload_folder"] = offload_dir
+                model_kwargs["load_in_8bit"] = False
+                model_kwargs["load_in_4bit"] = False
             model = AutoModelForCausalLM.from_pretrained(
                 base_model_name,
+                **model_kwargs
             )
             logger.info("✅ Model loaded from Hugging Face successfully")
         # Determine base model based on device
         is_mps = torch.backends.mps.is_available()
+        is_cpu = not is_mps and not torch.cuda.is_available()
         if tiny_models and tiny_models[0].exists() or small_models and small_models[0].exists() or prod_models and prod_models[0].exists():
             base_model = "meta-llama/Meta-Llama-3.1-8B-Instruct" if is_mps else "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
         else:
     temperature: float = 0.7,  # Adjusted for better quality
     top_p: float = 0.9
 ) -> str:
+    """Generate response with device-specific optimizations and full-sentence stopping."""
     try:
         # Import inference utilities
         from .inference_utils import create_stopping_criteria, ensure_supra_close
             padding=False
         )
+        # Move to same device as model (handle quantized models on CPU)
+        try:
+            device = next(model.parameters()).device
+            inputs = {k: v.to(device) for k, v in inputs.items()}
+        except (StopIteration, AttributeError):
+            # Quantized models on CPU might not have .device on parameters
+            # Check if model has a device attribute or default to CPU
+            if hasattr(model, 'device'):
+                device = model.device
+            else:
+                device = torch.device('cpu')
+            inputs = {k: v.to(device) for k, v in inputs.items()}
         # Create stopping criteria for full-sentence stopping
         stopping_criteria = create_stopping_criteria(tokenizer)
+        # Reduce max_new_tokens for CPU to optimize performance
+        try:
+            model_device = next(model.parameters()).device if hasattr(model, 'parameters') else None
+            is_cpu_device = model_device is None or str(model_device) == 'cpu'
+        except (StopIteration, AttributeError):
+            is_cpu_device = True
+        # Adjust max_new_tokens for CPU (reduce for faster inference)
+        effective_max_tokens = max_new_tokens
+        if is_cpu_device and max_new_tokens > 512:
+            effective_max_tokens = 512
+            logger.info(f"💻 CPU detected: reducing max_new_tokens from {max_new_tokens} to {effective_max_tokens} for faster inference")
         # Generate response with full-sentence stopping
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
+                max_new_tokens=effective_max_tokens,
                 temperature=temperature,
                 top_p=top_p,
                 do_sample=True,

rag/{rag_m2max.py → rag.py} RENAMED Viewed

@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
-SUPRA RAG System with M2 Max Optimizations
-Optimized for Apple Silicon with efficient memory management
 """
 import json
@@ -18,7 +18,7 @@ import logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-class SupraRAGM2Max:
     def __init__(self, rag_data_path: str = None):
         # Default RAG data path (for HF Spaces deployment)
         if rag_data_path is None:
@@ -37,17 +37,19 @@ class SupraRAGM2Max:
                 rag_data_path = "data/processed/rag_seeds/rag_seeds.jsonl"
         self.rag_data_path = Path(rag_data_path)
-        # M2 Max optimizations
-        self._setup_m2_max_optimizations()
-        # Initialize ChromaDB with M2 Max optimizations
         self.client = chromadb.Client()
         self.collection_name = "supra_knowledge"
-        # Use efficient embedding model for M2 Max
         self.embedding_model = SentenceTransformer(
             'all-MiniLM-L6-v2',
-            device='cpu'  # Force CPU for M2 Max compatibility
         )
         # Initialize or load collection
@@ -71,29 +73,30 @@ class SupraRAGM2Max:
             self.collection = self.client.create_collection(self.collection_name)
             self._load_rag_documents()
-    def _setup_m2_max_optimizations(self):
-        """Configure optimizations for M2 Max."""
-        logger.info("🍎 Setting up M2 Max optimizations...")
-        # M2 Max specific environment variables
-        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
         os.environ["TOKENIZERS_PARALLELISM"] = "false"
-        # Memory management
         if torch.backends.mps.is_available():
-            logger.info("✅ MPS (Metal Performance Shaders) available")
             self.device = "mps"
         else:
-            logger.info("⚠️ MPS not available, using CPU")
             self.device = "cpu"
-        # Optimize PyTorch for M2 Max
-        torch.backends.mps.is_built()
         logger.info(f"🔧 Using device: {self.device}")
     def _load_rag_documents(self):
-        """Load RAG documents from JSONL file with M2 Max optimizations."""
         if not self.rag_data_path.exists():
             logger.warning("⚠️ RAG data file not found")
             if st:
@@ -112,7 +115,7 @@ class SupraRAGM2Max:
                     try:
                         doc = json.loads(line)
                         if 'content' in doc and 'id' in doc:
-                            # Truncate content for M2 Max memory efficiency
                             content = doc['content']
                             if len(content) > 2000:  # Limit content length
                                 content = content[:2000] + "..."
@@ -131,8 +134,8 @@ class SupraRAGM2Max:
                         logger.warning(f"⚠️ Skipping line {line_num}: JSON decode error - {e}")
         if documents:
-            # Add to ChromaDB with batch processing for M2 Max
-            batch_size = 50  # Smaller batches for M2 Max
             for i in range(0, len(documents), batch_size):
                 batch_docs = documents[i:i+batch_size]
                 batch_metadatas = metadatas[i:i+batch_size]
@@ -265,13 +268,13 @@ class SupraRAGM2Max:
                 st.error(f"Error generating response: {e}")
             return f"I apologize, but I encountered an error while generating a response: {e}"
-# Global RAG instance with M2 Max optimizations
 @st.cache_resource
 def get_supra_rag_m2max():
-    """Get cached SUPRA RAG instance optimized for M2 Max."""
     return SupraRAGM2Max()
 # Backward compatibility
 def get_supra_rag():
-    """Backward compatible function that returns M2 Max optimized RAG."""
     return get_supra_rag_m2max()

 #!/usr/bin/env python3
 """
+SUPRA RAG System with CPU/MPS/CUDA Optimizations
+Optimized for CPU (HF Spaces), MPS (Apple Silicon), and CUDA with efficient memory management
 """
 import json
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+class SupraRAG:
     def __init__(self, rag_data_path: str = None):
         # Default RAG data path (for HF Spaces deployment)
         if rag_data_path is None:
                 rag_data_path = "data/processed/rag_seeds/rag_seeds.jsonl"
         self.rag_data_path = Path(rag_data_path)
+        # Device-specific optimizations
+        self._setup_device_optimizations()
+        # Initialize ChromaDB with device optimizations
         self.client = chromadb.Client()
         self.collection_name = "supra_knowledge"
+        # Use efficient embedding model (CPU for HF Spaces free tier)
+        # CPU is optimal for sentence-transformers on CPU-only deployments
+        embedding_device = 'cpu' if self.device == 'cpu' else self.device
         self.embedding_model = SentenceTransformer(
             'all-MiniLM-L6-v2',
+            device=embedding_device
         )
         # Initialize or load collection
             self.collection = self.client.create_collection(self.collection_name)
             self._load_rag_documents()
+    def _setup_device_optimizations(self):
+        """Pick the compute device for this instance and store it on self.device.
+
+        Sets TOKENIZERS_PARALLELISM to "false" in os.environ, then selects a
+        device in the order MPS > CUDA > CPU. On MPS it also sets
+        PYTORCH_ENABLE_MPS_FALLBACK to "1". The result is stored as the string
+        "mps", "cuda" or "cpu"; nothing is returned.
+        """
+        logger.info("🔧 Setting up device optimizations...")
+        # Environment variables
         os.environ["TOKENIZERS_PARALLELISM"] = "false"
+        # Detect device: MPS > CUDA > CPU
         if torch.backends.mps.is_available():
+            logger.info("✅ MPS (Metal Performance Shaders) available - using MPS")
             self.device = "mps"
+            os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+            # NOTE(review): the return value of is_built() is discarded, so
+            # this call has no visible effect in this method.
+            torch.backends.mps.is_built()
+        elif torch.cuda.is_available():
+            logger.info("✅ CUDA available - using GPU")
+            self.device = "cuda"
         else:
+            logger.info("💻 CPU detected - using CPU optimizations")
             self.device = "cpu"
         logger.info(f"🔧 Using device: {self.device}")
     def _load_rag_documents(self):
+        """Load RAG documents from JSONL file with device optimizations."""
         if not self.rag_data_path.exists():
             logger.warning("⚠️ RAG data file not found")
             if st:
                     try:
                         doc = json.loads(line)
                         if 'content' in doc and 'id' in doc:
+                            # Truncate content for memory efficiency
                             content = doc['content']
                             if len(content) > 2000:  # Limit content length
                                 content = content[:2000] + "..."
                         logger.warning(f"⚠️ Skipping line {line_num}: JSON decode error - {e}")
         if documents:
+            # Add to ChromaDB with batch processing
+            batch_size = 50  # Smaller batches for memory efficiency
             for i in range(0, len(documents), batch_size):
                 batch_docs = documents[i:i+batch_size]
                 batch_metadatas = metadatas[i:i+batch_size]
                 st.error(f"Error generating response: {e}")
             return f"I apologize, but I encountered an error while generating a response: {e}"
+# Global RAG instance with device-specific optimizations
 @st.cache_resource
 def get_supra_rag_m2max():
+    """Get cached SUPRA RAG instance optimized for CPU/MPS/CUDA."""
     return SupraRAGM2Max()
 # Backward compatibility
 def get_supra_rag():
+    """Backward compatible function that returns device-optimized RAG.
+
+    Delegates to get_supra_rag_m2max() and returns its result unchanged.
+    """
     return get_supra_rag_m2max()

requirements.txt CHANGED Viewed

@@ -1,5 +1,5 @@
 # SUPRA-Nexus RAG UI Dependencies
-# For Hugging Face Spaces Deployment
 # Streamlit UI Framework
 streamlit>=1.28.0
@@ -15,6 +15,10 @@ torch>=2.0.0
 # PEFT for LoRA loading
 peft>=0.6.0
 # NLP utilities
 nltk>=3.8.0

 # SUPRA-Nexus RAG UI Dependencies
+# For Hugging Face Spaces Deployment (CPU Optimized)
 # Streamlit UI Framework
 streamlit>=1.28.0
 # PEFT for LoRA loading
 peft>=0.6.0
+# CPU Optimizations
+accelerate>=0.30.0  # For CPU inference optimization
+bitsandbytes>=0.43.0  # For 8-bit quantization (NOTE: CPU support depends on the installed bitsandbytes build/backend - verify before relying on it)
 # NLP utilities
 nltk>=3.8.0