{ "model": { "base_model": "zai-org/GLM-4.5-Air", "final_model_path": "outputs_fsdp/final_model" }, "training_config": { "lora_r": 128, "lora_alpha": 256, "lora_dropout": 0.05, "lora_target_modules": [ "q_proj", "k_proj", "v_proj", "o_proj" ], "learning_rate": 2.5e-05, "lr_scheduler_type": "cosine", "micro_batch_size": 1, "gradient_accumulation_steps": 2, "effective_batch_size": 32, "sequence_length": 16384, "chunk_overlap": 2048, "weight_decay": 0.01, "max_grad_norm": 1.0, "warmup_ratio": 0.1, "eval_split": 0.05, "bf16": true, "seed": 42 }, "hardware": { "num_gpus": 16, "gpu_name": "NVIDIA H200", "num_nodes": 1, "gpus_per_node": 8 }, "phases": [ { "phase": 1, "name": "phase1_foundation", "description": "Foundation: Learn codebase structure and file patterns", "dataset": "dataset/phase1_foundation.jsonl", "epochs": 2, "learning_rate": 2.5e-05, "warmup_ratio": 0.15, "num_train_samples": 9293, "num_eval_samples": 512, "num_chunks": 9805, "train_metrics": { "train_runtime": 45748.92132782936, "train_runtime_minutes": 762.4820221304893, "train_steps": 581, "train_loss": 0.5921854273129171, "train_perplexity": 1.8079352121008547, "samples_per_second": 0.40626094475136876, "steps_per_second": 0.012699752980767526 }, "eval_metrics": { "eval_loss": 0.36529209305808763, "eval_perplexity": 1.4409348337482015, "eval_accuracy": 88.77101374493351, "best_eval_loss": 0.36561795309899026 } }, { "phase": 2, "name": "phase2_evolution", "description": "Evolution: Learn commit patterns and code changes", "dataset": "dataset/phase2_evolution.jsonl", "epochs": 2, "learning_rate": 2e-05, "warmup_ratio": 0.1, "num_train_samples": 16622, "num_eval_samples": 1545, "num_chunks": 18167, "train_metrics": { "train_runtime": 88820.11419820786, "train_runtime_minutes": 1480.3352366367976, "train_steps": 1039, "train_loss": 0.790716471444525, "train_perplexity": 2.204975662547615, "samples_per_second": 0.37428458970243894, "steps_per_second": 0.0116978007670808 }, "eval_metrics": { "eval_loss": 2.551615942151948, "eval_perplexity": 12.827816051917177, "eval_accuracy": 40.84345327062199, "best_eval_loss": 2.5516352893463 } }, { "phase": 3, "name": "phase3_pr_mastery", "description": "PR Mastery: Learn PR review patterns and discussions", "dataset": "dataset/phase3_pr_mastery.jsonl", "epochs": 1, "learning_rate": 1.5e-05, "warmup_ratio": 0.05, "num_train_samples": 9797, "num_eval_samples": 509, "num_chunks": 10306, "train_metrics": { "train_runtime": 24744.46716451645, "train_runtime_minutes": 412.40778607527415, "train_steps": 306, "train_loss": 0.49508867293498876, "train_perplexity": 1.6406437133004639, "samples_per_second": 0.3959268928631, "steps_per_second": 0.012366400859049565 }, "eval_metrics": { "eval_loss": 0.5012174650255474, "eval_perplexity": 1.6507297535648182, "eval_accuracy": 90.171501333015, "best_eval_loss": 0.5012283607793506 } } ], "phase_checkpoints": [ "outputs_fsdp/phase1_foundation/final", "outputs_fsdp/phase2_evolution/final", "outputs_fsdp/phase3_pr_mastery/final" ], "summary": { "initial_train_loss": 0.5921854273129171, "final_train_loss": 0.49508867293498876, "initial_eval_loss": 0.36529209305808763, "final_eval_loss": 0.5012174650255474, "initial_perplexity": 1.4409348337482015, "final_perplexity": 1.6507297535648182, "total_epochs": 5, "total_phases": 3, "total_steps": 1926, "total_training_time_seconds": 161561.2551908493, "total_training_time_hours": 44.87812644190259 }, "timestamp": "20251211_212051", "run_name": "glm-air-curriculum-16gpu", "output_directory": "outputs_fsdp" }