{
"model": {
"base_model": "zai-org/GLM-4.5-Air",
"final_model_path": "outputs_fsdp/final_model"
},
"training_config": {
"lora_r": 128,
"lora_alpha": 256,
"lora_dropout": 0.05,
"lora_target_modules": [
"q_proj",
"k_proj",
"v_proj",
"o_proj"
],
"learning_rate": 2.5e-05,
"lr_scheduler_type": "cosine",
"micro_batch_size": 1,
"gradient_accumulation_steps": 2,
"effective_batch_size": 32,
"sequence_length": 16384,
"chunk_overlap": 2048,
"weight_decay": 0.01,
"max_grad_norm": 1.0,
"warmup_ratio": 0.1,
"eval_split": 0.05,
"bf16": true,
"seed": 42
},
"hardware": {
"num_gpus": 16,
"gpu_name": "NVIDIA H200",
"num_nodes": 2,
"gpus_per_node": 8
},
"phases": [
{
"phase": 1,
"name": "phase1_foundation",
"description": "Foundation: Learn codebase structure and file patterns",
"dataset": "dataset/phase1_foundation.jsonl",
"epochs": 2,
"learning_rate": 2.5e-05,
"warmup_ratio": 0.15,
"num_train_samples": 9293,
"num_eval_samples": 512,
"num_chunks": 9805,
"train_metrics": {
"train_runtime": 45748.92132782936,
"train_runtime_minutes": 762.4820221304893,
"train_steps": 581,
"train_loss": 0.5921854273129171,
"train_perplexity": 1.8079352121008547,
"samples_per_second": 0.40626094475136876,
"steps_per_second": 0.012699752980767526
},
"eval_metrics": {
"eval_loss": 0.36529209305808763,
"eval_perplexity": 1.4409348337482015,
"eval_accuracy": 88.77101374493351,
"best_eval_loss": 0.36561795309899026
}
},
{
"phase": 2,
"name": "phase2_evolution",
"description": "Evolution: Learn commit patterns and code changes",
"dataset": "dataset/phase2_evolution.jsonl",
"epochs": 2,
"learning_rate": 2e-05,
"warmup_ratio": 0.1,
"num_train_samples": 16622,
"num_eval_samples": 1545,
"num_chunks": 18167,
"train_metrics": {
"train_runtime": 88820.11419820786,
"train_runtime_minutes": 1480.3352366367976,
"train_steps": 1039,
"train_loss": 0.790716471444525,
"train_perplexity": 2.204975662547615,
"samples_per_second": 0.37428458970243894,
"steps_per_second": 0.0116978007670808
},
"eval_metrics": {
"eval_loss": 2.551615942151948,
"eval_perplexity": 12.827816051917177,
"eval_accuracy": 40.84345327062199,
"best_eval_loss": 2.5516352893463
}
},
{
"phase": 3,
"name": "phase3_pr_mastery",
"description": "PR Mastery: Learn PR review patterns and discussions",
"dataset": "dataset/phase3_pr_mastery.jsonl",
"epochs": 1,
"learning_rate": 1.5e-05,
"warmup_ratio": 0.05,
"num_train_samples": 9797,
"num_eval_samples": 509,
"num_chunks": 10306,
"train_metrics": {
"train_runtime": 24744.46716451645,
"train_runtime_minutes": 412.40778607527415,
"train_steps": 306,
"train_loss": 0.49508867293498876,
"train_perplexity": 1.6406437133004639,
"samples_per_second": 0.3959268928631,
"steps_per_second": 0.012366400859049565
},
"eval_metrics": {
"eval_loss": 0.5012174650255474,
"eval_perplexity": 1.6507297535648182,
"eval_accuracy": 90.171501333015,
"best_eval_loss": 0.5012283607793506
}
}
],
"phase_checkpoints": [
"outputs_fsdp/phase1_foundation/final",
"outputs_fsdp/phase2_evolution/final",
"outputs_fsdp/phase3_pr_mastery/final"
],
"summary": {
"initial_train_loss": 0.5921854273129171,
"final_train_loss": 0.49508867293498876,
"initial_eval_loss": 0.36529209305808763,
"final_eval_loss": 0.5012174650255474,
"initial_perplexity": 1.4409348337482015,
"final_perplexity": 1.6507297535648182,
"total_epochs": 5,
"total_phases": 3,
"total_steps": 1926,
"total_training_time_seconds": 161561.2551908493,
"total_training_time_hours": 44.87812644190259
},
"timestamp": "20251211_212051",
"run_name": "glm-air-curriculum-16gpu",
"output_directory": "outputs_fsdp"
}