#!/bin/bash # MLX Model Conversion Utility for Dragon M3 Ultra # Updated: January 2025 for MLX 0.26+ and modern uv workflow # Supports Q5 quantization and M3 Ultra optimizations # Text formatting BOLD="\033[1m" BLUE="\033[34m" GREEN="\033[32m" YELLOW="\033[33m" RED="\033[31m" CYAN="\033[36m" MAGENTA="\033[35m" RESET="\033[0m" # Detect system specs TOTAL_MEMORY=$(sysctl -n hw.memsize 2>/dev/null || echo 0) TOTAL_MEMORY_GB=$((TOTAL_MEMORY / 1073741824)) CPU_BRAND=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown") # Check if running on M3 Ultra if [[ "$CPU_BRAND" == *"M3 Ultra"* ]] || [[ "$TOTAL_MEMORY_GB" -ge 400 ]]; then IS_M3_ULTRA=true echo -e "${BOLD}${MAGENTA}🐉 Dragon M3 Ultra detected! (${TOTAL_MEMORY_GB}GB RAM)${RESET}" else IS_M3_ULTRA=false fi echo -e "${BOLD}${BLUE}=====================================${RESET}" echo -e "${BOLD}${BLUE} MLX Model Conversion Utility v2.0 ${RESET}" echo -e "${BOLD}${BLUE}=====================================${RESET}" echo -e "Updated for MLX 0.26+ with Q5 support and M3 Ultra optimizations\n" # Default values DEFAULT_HF_PATH="meta-llama/Llama-3.1-405B" DEFAULT_OUTPUT_DIR="models/Llama-3.1-405B-MLX-Q5" DEFAULT_QUANTIZE="y" DEFAULT_BITS="5" # Changed to Q5 as default for better quality/size ratio DEFAULT_GROUP_SIZE="64" DEFAULT_DTYPE="float16" # hf-xet optimization for Dragon M3 Ultra if [[ "$IS_M3_ULTRA" == true ]]; then export HF_XET_HIGH_PERFORMANCE_MODE=1 export HF_XET_CHUNK_CACHE_SIZE_BYTES=107374182400 # 100GB cache export HF_XET_CONCURRENT_DOWNLOADS=32 echo -e "${CYAN}✓ hf-xet optimizations enabled for Dragon${RESET}" fi # Get HF Path echo -e "${BOLD}Hugging Face model path or local directory:${RESET}" echo -e "(Default: ${DEFAULT_HF_PATH})" echo -e "${CYAN}Examples:${RESET}" echo -e " HF repo: meta-llama/Llama-3.1-405B" echo -e " Local: /Users/polyversai/.lmstudio/models/mlx-community/model-name" read -p "> " HF_PATH HF_PATH=${HF_PATH:-$DEFAULT_HF_PATH} # Check if it's a local path if [[ -d "$HF_PATH" ]]; then echo -e "${GREEN}✓ Local model detected: ${HF_PATH}${RESET}" IS_LOCAL=true else IS_LOCAL=false # Ask about hf-xet for remote models echo -e "\n${BOLD}Use hf-xet for faster download? [y/n]${RESET}" echo -e "(10x faster downloads with chunk deduplication)" echo -e "Default: y" read -p "> " USE_HF_XET USE_HF_XET=${USE_HF_XET:-y} if [[ "$USE_HF_XET" == "y" || "$USE_HF_XET" == "Y" ]]; then # Check if hf-xet is installed if ! uv run python -c "import hf_xet" 2>/dev/null; then echo -e "${YELLOW}⚠️ hf-xet not installed. Installing...${RESET}" echo -e "Run: uv add 'huggingface_hub[hf_xet]'" echo -e "${CYAN}Note: hf-xet only works with Xet-backed repos${RESET}" else echo -e "${GREEN}✓ hf-xet enabled for download${RESET}" fi fi fi # Get output directory echo -e "\n${BOLD}Output MLX model directory:${RESET}" echo -e "(Default: ${DEFAULT_OUTPUT_DIR})" read -p "> " MLX_PATH MLX_PATH=${MLX_PATH:-$DEFAULT_OUTPUT_DIR} # Ask about data type echo -e "\n${BOLD}Model data type:${RESET}" echo -e "(Default: ${DEFAULT_DTYPE}, Options: float16, bfloat16, float32)" read -p "> " DTYPE DTYPE=${DTYPE:-$DEFAULT_DTYPE} # Ask about quantization echo -e "\n${BOLD}Quantize the model? [y/n]${RESET}" echo -e "(Default: ${DEFAULT_QUANTIZE})" read -p "> " QUANTIZE QUANTIZE=${QUANTIZE:-$DEFAULT_QUANTIZE} # If quantizing, get more details if [[ "$QUANTIZE" == "y" || "$QUANTIZE" == "Y" ]]; then echo -e "\n${BOLD}Quantization bits:${RESET}" echo -e "${CYAN}Options:${RESET}" echo -e " 2 - Extreme compression (lowest quality)" echo -e " 3 - High compression" echo -e " 4 - Standard compression (good balance)" echo -e " ${GREEN}5 - Recommended (best quality/size ratio)${RESET}" echo -e " 6 - Low compression" echo -e " 8 - Minimal compression (highest quality)" echo -e "(Default: ${DEFAULT_BITS})" read -p "> " BITS BITS=${BITS:-$DEFAULT_BITS} echo -e "\n${BOLD}Group size:${RESET}" echo -e "(Default: ${DEFAULT_GROUP_SIZE}, Options: 32, 64, 128)" if [[ "$IS_M3_ULTRA" == true ]]; then echo -e "${CYAN}💡 M3 Ultra tip: Use 64 or 128 for better performance${RESET}" fi read -p "> " GROUP_SIZE GROUP_SIZE=${GROUP_SIZE:-$DEFAULT_GROUP_SIZE} echo -e "\n${BOLD}Quantization strategy:${RESET}" echo -e "${CYAN}Options:${RESET}" echo -e " none - Uniform quantization (default)" echo -e " mixed_2_6 - Mix of 2 and 6 bit" echo -e " ${GREEN}mixed_3_4 - Mix of 3 and 4 bit${RESET}" echo -e " mixed_3_6 - Mix of 3 and 6 bit" echo -e " mixed_4_6 - Mix of 4 and 6 bit" echo -e "Leave empty for uniform quantization" read -p "> " QUANT_PREDICATE QUANT_OPTIONS="-q --q-bits ${BITS} --q-group-size ${GROUP_SIZE}" if [[ -n "$QUANT_PREDICATE" ]]; then QUANT_OPTIONS="${QUANT_OPTIONS} --quant-predicate ${QUANT_PREDICATE}" fi else QUANT_OPTIONS="" fi # Memory optimization options for M3 Ultra if [[ "$IS_M3_ULTRA" == true ]]; then echo -e "\n${BOLD}${MAGENTA}M3 Ultra optimization note:${RESET}" echo -e "${CYAN}MLX will automatically optimize for your 512GB system${RESET}" echo -e "${CYAN}The framework uses unified memory efficiently${RESET}" M3_ULTRA_FLAGS="" else M3_ULTRA_FLAGS="" fi # Ask about upload repository (optional) echo -e "\n${BOLD}Upload to Hugging Face Hub? (optional):${RESET}" echo -e "(Leave empty to skip upload)" read -p "> " UPLOAD_REPO if [[ -n "$UPLOAD_REPO" ]]; then UPLOAD_OPTION="--upload-repo ${UPLOAD_REPO}" else UPLOAD_OPTION="" fi # Build the command - UV is now default UV_CMD="uv run mlx_lm.convert --hf-path ${HF_PATH} --mlx-path ${MLX_PATH} --dtype ${DTYPE} ${QUANT_OPTIONS} ${UPLOAD_OPTION}" # Alternative commands DIRECT_CMD="mlx_lm.convert --hf-path ${HF_PATH} --mlx-path ${MLX_PATH} --dtype ${DTYPE} ${QUANT_OPTIONS} ${UPLOAD_OPTION}" PYTHON_CMD="python -m mlx_lm.convert --hf-path ${HF_PATH} --mlx-path ${MLX_PATH} --dtype ${DTYPE} ${QUANT_OPTIONS} ${UPLOAD_OPTION}" # Print the preview echo -e "\n${BOLD}${YELLOW}Command Preview:${RESET}" echo -e "$UV_CMD" # Expected outcomes based on options echo -e "\n${BOLD}${YELLOW}Expected outcomes:${RESET}" if [[ "$QUANTIZE" == "y" || "$QUANTIZE" == "Y" ]]; then MODEL_SIZE_GB=500 # Approximate for 405B model case "$BITS" in 2) EXPECTED_SIZE=$((MODEL_SIZE_GB / 8)) echo -e "- ${GREEN}Q2: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}" echo -e "- ${YELLOW}⚠️ Significant quality loss expected${RESET}" ;; 3) EXPECTED_SIZE=$((MODEL_SIZE_GB * 3 / 16)) echo -e "- ${GREEN}Q3: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}" echo -e "- ${YELLOW}Moderate quality loss${RESET}" ;; 4) EXPECTED_SIZE=$((MODEL_SIZE_GB / 4)) echo -e "- ${GREEN}Q4: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}" echo -e "- ${GREEN}Good balance of quality and size${RESET}" ;; 5) EXPECTED_SIZE=$((MODEL_SIZE_GB * 5 / 16)) echo -e "- ${GREEN}Q5: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}" echo -e "- ${GREEN}✨ Excellent quality/size ratio${RESET}" ;; 6) EXPECTED_SIZE=$((MODEL_SIZE_GB * 6 / 16)) echo -e "- ${GREEN}Q6: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}" echo -e "- ${GREEN}High quality preservation${RESET}" ;; 8) EXPECTED_SIZE=$((MODEL_SIZE_GB / 2)) echo -e "- ${GREEN}Q8: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}" echo -e "- ${GREEN}Near-lossless quality${RESET}" ;; esac if [[ -n "$QUANT_PREDICATE" ]]; then echo -e "- ${CYAN}Using mixed precision: ${QUANT_PREDICATE}${RESET}" fi if [[ "$IS_M3_ULTRA" == true ]]; then echo -e "- ${MAGENTA}Expected memory usage: ${EXPECTED_SIZE}-$((EXPECTED_SIZE * 2))GB peak${RESET}" echo -e "- ${MAGENTA}M3 Ultra can handle this comfortably${RESET}" else echo -e "- ${YELLOW}Expected memory usage: High - monitor closely${RESET}" fi else echo -e "- ${GREEN}No quantization - model remains in ${DTYPE} format${RESET}" echo -e "- ${YELLOW}Very high memory requirements (400-500GB)${RESET}" fi echo -e "- ${CYAN}Expected conversion time: 2-6 hours${RESET}" # Ask for command format choice echo -e "\n${BOLD}${GREEN}Choose command format:${RESET}" echo -e "1. ${YELLOW}UV (recommended): ${RESET}${UV_CMD}" echo -e "2. ${YELLOW}Direct command: ${RESET}${DIRECT_CMD}" echo -e "3. ${YELLOW}Python module: ${RESET}${PYTHON_CMD}" read -p "> " FORMAT_CHOICE case "$FORMAT_CHOICE" in 2) FINAL_CMD="${DIRECT_CMD}" ;; 3) FINAL_CMD="${PYTHON_CMD}" ;; *) FINAL_CMD="${UV_CMD}" ;; esac # M3 Ultra specific preparation tips if [[ "$IS_M3_ULTRA" == true ]]; then echo -e "\n${BOLD}${MAGENTA}🐉 Dragon M3 Ultra Preparation:${RESET}" echo -e "1. ${CYAN}Your 512GB RAM can handle even 405B models${RESET}" echo -e "2. ${CYAN}Enable High Power Mode in Energy Saver${RESET}" echo -e "3. ${CYAN}Consider using Activity Monitor to track memory${RESET}" echo -e "4. ${CYAN}MLX will use unified memory efficiently${RESET}" else echo -e "\n${BOLD}${BLUE}Preparation tips:${RESET}" echo -e "1. ${YELLOW}Ensure Mac is plugged in and won't sleep${RESET}" echo -e "2. ${YELLOW}Close other memory-intensive applications${RESET}" echo -e "3. ${YELLOW}Be prepared for high fan speeds${RESET}" echo -e "4. ${YELLOW}The process may appear to hang - this is normal${RESET}" fi # Print the final command echo -e "\n${BOLD}${RED}Your conversion command:${RESET}" echo -e "${FINAL_CMD}" # Copy to clipboard option echo -e "\n${BOLD}${GREEN}Copy command to clipboard? [y/n]${RESET}" read -p "> " COPY_CMD if [[ "$COPY_CMD" == "y" || "$COPY_CMD" == "Y" ]]; then echo "${FINAL_CMD}" | pbcopy echo -e "${GREEN}✓ Command copied to clipboard!${RESET}" fi # Download command if using remote model if [[ "$IS_LOCAL" == false ]]; then echo -e "\n${BOLD}${CYAN}Optional: Download model first (if needed):${RESET}" if [[ "$USE_HF_XET" == "y" || "$USE_HF_XET" == "Y" ]]; then echo -e "# With hf-xet (10x faster):" echo -e "uv run huggingface-cli download ${HF_PATH} --local-dir ./downloads/${HF_PATH##*/}" else echo -e "# Standard download:" echo -e "uv run huggingface-cli download ${HF_PATH} --local-dir ./downloads/${HF_PATH##*/}" fi fi # Test commands echo -e "\n${BOLD}${BLUE}After conversion, test with:${RESET}" echo -e "uv run mlx_lm.generate --model ${MLX_PATH} --prompt \"Hello, I am\" --max-tokens 50" # Memory monitoring for M3 Ultra if [[ "$IS_M3_ULTRA" == true ]]; then echo -e "\n${BOLD}${MAGENTA}Monitor Dragon performance:${RESET}" echo -e "uv run python -c \"import mlx.core as mx; print(f'Peak: {mx.metal.get_peak_memory()/1e9:.2f}GB of ${TOTAL_MEMORY_GB}GB')\"" echo -e "\n${BOLD}${CYAN}Pro tip for large models:${RESET}" echo -e "# Set memory limit before conversion (optional):" echo -e "export MLX_METAL_MEMORY_LIMIT=$((TOTAL_MEMORY_GB * 95 / 100))GB" fi # Benchmark command echo -e "\n${BOLD}${CYAN}Benchmark the converted model:${RESET}" echo -e "uv run mlx_lm.generate --model ${MLX_PATH} --prompt \"The\" --max-tokens 100 --verbose" echo -e "\n${BOLD}${BLUE}=====================================${RESET}" echo -e "${BOLD}${GREEN}✨ Conversion setup complete!${RESET}" if [[ "$IS_M3_ULTRA" == true ]]; then echo -e "${BOLD}${MAGENTA}🐉 Dragon M3 Ultra ready to roar!${RESET}" fi echo -e "${BOLD}${BLUE}=====================================${RESET}"