ethanker committed on
Commit
2a9b282
·
verified ·
1 Parent(s): 325ba65

Upload step_002000 checkpoint, training script and run command.

Browse files
RUN_COMMAND.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ nohup python /workspace/nanomind/train.py --data_path /workspace/nanomind_data/pretrain_1m.jsonl.gz --out_dir /workspace/nanomind_runs/run1 --tokenizer_name hf-internal-testing/llama-tokenizer --seq_len 2048 --hidden_size 512 --n_layers 16 --n_heads 8 --n_kv_heads 1 --global_batch_size 64 --micro_batch_size 1 --lr 1e-3 --warmup_steps 2000 --max_steps 50000 --save_every 1000 --bf16 > /workspace/nanomind_runs/run1/train.log 2>&1
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 512,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 1126,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 8,
19
+ "num_hidden_layers": 16,
20
+ "num_key_value_heads": 1,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-05,
23
+ "rope_scaling": null,
24
+ "rope_theta": 1000000.0,
25
+ "tie_word_embeddings": true,
26
+ "transformers_version": "4.57.0.dev0",
27
+ "use_cache": true,
28
+ "vocab_size": 32000
29
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.57.0.dev0"
6
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db6eefbdc543761743970f19ef7a504ba5be8e7980fb2f30dfa6a5f44f9259c7
3
+ size 214058792
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": true,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "sp_model_kwargs": {},
39
+ "tokenizer_class": "LlamaTokenizer",
40
+ "unk_token": "<unk>",
41
+ "use_default_system_prompt": false
42
+ }
train_run1.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Nanomind pretraining script for decoder-only causal LM on JSONL.gz data.
4
+
5
+ - Expects input file with one JSON object per line containing a `text` field.
6
+ - Streams, tokenizes, and packs sequences to a fixed length for efficient training.
7
+ - Uses a small LLaMA-style config by default (RMSNorm + SwiGLU + RoPE, MQA).
8
+
9
+ Usage example:
10
+ python /workspace/nanomind/train.py \
11
+ --data_path /workspace/nanomind_data/pretrain_1m.jsonl.gz \
12
+ --out_dir /workspace/nanomind_runs/run1 \
13
+ --tokenizer_name hf-internal-testing/llama-tokenizer \
14
+ --seq_len 4096 --global_batch_size 256 \
15
+ --lr 1e-3 --warmup_steps 2000 --max_steps 50000 --bf16
16
+ """
17
+
18
+ import os
19
+ import io
20
+ import gc
21
+ import gzip
22
+ import json
23
+ import math
24
+ import time
25
+ import random
26
+ import argparse
27
+ from pathlib import Path
28
+ from typing import Iterator, List, Dict, Optional
29
+
30
+ import torch
31
+ from torch import nn
32
+ from torch.utils.data import IterableDataset, DataLoader
33
+
34
+ from transformers import (
35
+ AutoTokenizer,
36
+ LlamaConfig,
37
+ LlamaForCausalLM,
38
+ get_cosine_schedule_with_warmup,
39
+ )
40
+
41
+
42
class JsonlPackedDataset(IterableDataset):
    """
    Stream a JSONL(.gz) file of objects with a `text` field, tokenize each
    document, and pack the tokens into fixed-length blocks of `seq_len`.

    When iterated inside DataLoader worker processes, each worker reads a
    disjoint subset of the file (line index modulo the worker count), so
    samples are not duplicated across workers.

    Args:
        data_path: Path to the JSONL file; a `.gz` suffix selects gzip decoding.
        tokenizer: Object exposing `encode(text, add_special_tokens=...)` and,
            optionally, `bos_token_id` / `eos_token_id`.
        seq_len: Number of tokens in every emitted block.
        shuffle_lines: Stored for interface compatibility; this class performs
            no shuffling.
        add_bos_eos: Wrap every document in explicit BOS/EOS ids when the
            tokenizer defines both; otherwise let the tokenizer add its own
            special tokens.
        repeat: Re-read the file indefinitely instead of stopping after one pass.
        buffer_tokens_limit: Upper bound on the number of buffered tokens.

    Yields:
        Dicts with `input_ids`, `attention_mask` (all ones) and `labels`
        (a copy of `input_ids`); each value is a 1-D `torch.long` tensor of
        length `seq_len`.
    """

    def __init__(
        self,
        data_path: str,
        tokenizer,
        seq_len: int,
        shuffle_lines: bool = False,
        add_bos_eos: bool = True,
        repeat: bool = True,
        buffer_tokens_limit: int = 4_000_000,
    ) -> None:
        super().__init__()
        self.data_path = str(data_path)
        self.tokenizer = tokenizer
        self.seq_len = int(seq_len)
        self.shuffle_lines = bool(shuffle_lines)
        self.add_bos_eos = bool(add_bos_eos)
        self.repeat = bool(repeat)
        self.buffer_tokens_limit = int(buffer_tokens_limit)

        # Tokens that have been read but not yet emitted as a full block.
        self._token_buffer: List[int] = []

    def _line_iter(self) -> Iterator[str]:
        """Yield this worker's share of raw lines, looping when `repeat` is set."""
        # Inside a DataLoader worker the dataset object is a copy; without
        # sharding every worker would replay the whole file and the loader
        # would emit each sample `num_workers` times.
        worker = torch.utils.data.get_worker_info()
        num_shards = worker.num_workers if worker is not None else 1
        shard_id = worker.id if worker is not None else 0

        opener = gzip.open if self.data_path.endswith(".gz") else open
        while True:
            produced = False
            with opener(self.data_path, "rt", encoding="utf-8") as f:
                for line_no, line in enumerate(f):
                    if line_no % num_shards != shard_id:
                        continue
                    produced = True
                    yield line
            # Stop after one pass when not repeating, and also when the pass
            # produced nothing: re-reading an empty shard would spin forever.
            if not self.repeat or not produced:
                break

    def _yield_blocks(self) -> Iterator[Dict[str, torch.Tensor]]:
        """Tokenize lines and emit packed `seq_len`-token training examples."""
        bos_id = getattr(self.tokenizer, "bos_token_id", None)
        eos_id = getattr(self.tokenizer, "eos_token_id", None)
        wrap_manually = self.add_bos_eos and bos_id is not None and eos_id is not None

        # local references for speed
        token_buffer = self._token_buffer
        seq_len = self.seq_len

        for raw_line in self._line_iter():
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            try:
                obj = json.loads(raw_line)
            except json.JSONDecodeError:
                continue
            # A line can be valid JSON without being an object (e.g. a list).
            if not isinstance(obj, dict):
                continue
            text = obj.get("text")
            # Skip missing, non-string, or very short documents.
            if not isinstance(text, str) or len(text) < 10:
                continue

            if wrap_manually:
                encoded = self.tokenizer.encode(text, add_special_tokens=False)
                # Guard against empty/None encodings
                if not encoded:
                    continue
                token_buffer.append(bos_id)
                token_buffer.extend(encoded)
                token_buffer.append(eos_id)
            else:
                encoded = self.tokenizer.encode(text, add_special_tokens=True)
                if not encoded:
                    continue
                token_buffer.extend(encoded)

            # If the buffer grows too large, drop the oldest tokens (the head)
            # and keep the most recent `buffer_tokens_limit` to constrain RAM.
            if len(token_buffer) > self.buffer_tokens_limit:
                del token_buffer[: len(token_buffer) - self.buffer_tokens_limit]

            # Emit fixed-length blocks
            while len(token_buffer) >= seq_len:
                block = token_buffer[:seq_len]
                del token_buffer[:seq_len]

                input_ids = torch.tensor(block, dtype=torch.long)
                # Packed blocks contain no padding, so every position is attended.
                attention_mask = torch.ones_like(input_ids)
                # Labels are a copy of the inputs.
                yield {
                    "input_ids": input_ids,
                    "attention_mask": attention_mask,
                    "labels": input_ids.clone(),
                }

    def __iter__(self) -> Iterator[Dict[str, torch.Tensor]]:
        # Per-worker sharding happens in `_line_iter`, which runs inside the
        # worker process once iteration starts.
        return self._yield_blocks()
140
+
141
+
142
def build_model_and_tokenizer(
    tokenizer_name: Optional[str],
    tokenizer_dir: Optional[str],
    model_name: Optional[str],
    vocab_size_override: Optional[int],
    hidden_size: int,
    n_layers: int,
    n_heads: int,
    n_kv_heads: int,
    rope_theta: float,
    max_position_embeddings: int,
) -> tuple:
    """
    Load the tokenizer and build (or load) the causal LM.

    Args:
        tokenizer_name: Tokenizer identifier; takes precedence over `tokenizer_dir`.
        tokenizer_dir: Tokenizer location used when `tokenizer_name` is empty.
        model_name: If given, weights are loaded with `from_pretrained`;
            otherwise a fresh model is created from the size arguments.
        vocab_size_override: Vocabulary size to use instead of `len(tokenizer)`.
        hidden_size, n_layers, n_heads, n_kv_heads, rope_theta,
        max_position_embeddings: Shape of the freshly initialised model
            (ignored when `model_name` is given).

    Returns:
        A `(model, tokenizer)` tuple.

    Raises:
        ValueError: If neither `tokenizer_name` nor `tokenizer_dir` is provided.
    """
    # --- tokenizer -------------------------------------------------------
    tokenizer_source = tokenizer_name or tokenizer_dir
    if not tokenizer_source:
        raise ValueError("Provide --tokenizer_name or --tokenizer_dir")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, use_fast=True)

    # A pad token is required for batching: reuse EOS when there is one,
    # otherwise register a dedicated [PAD] token.
    if tokenizer.pad_token_id is None:
        if tokenizer.eos_token_id is None:
            tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        else:
            tokenizer.pad_token = tokenizer.eos_token

    vocab_size = vocab_size_override or len(tokenizer)

    # --- model -----------------------------------------------------------
    if not model_name:
        # Fresh LLaMA-style model with tied input/output embeddings.
        fresh_config = LlamaConfig(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            intermediate_size=int(hidden_size * 2.2),  # MLP width = 2.2 x hidden
            num_hidden_layers=n_layers,
            num_attention_heads=n_heads,
            num_key_value_heads=n_kv_heads,
            rms_norm_eps=1e-5,
            rope_theta=rope_theta,
            max_position_embeddings=max_position_embeddings,
            tie_word_embeddings=True,
        )
        return LlamaForCausalLM(fresh_config), tokenizer

    # Continue from an existing checkpoint; match the embedding table to the
    # tokenizer if their sizes disagree.
    pretrained = LlamaForCausalLM.from_pretrained(model_name)
    current_rows = pretrained.get_input_embeddings().weight.shape[0]
    if current_rows != vocab_size:
        pretrained.resize_token_embeddings(vocab_size)
    return pretrained, tokenizer
194
+
195
+
196
def get_dataloader(
    data_path: str,
    tokenizer,
    seq_len: int,
    micro_batch_size: int,
    num_workers: int,
) -> DataLoader:
    """
    Build the training DataLoader over a repeating, packed JSONL stream.

    Args:
        data_path: Path to the JSONL(.gz) training file.
        tokenizer: Tokenizer handed to `JsonlPackedDataset`.
        seq_len: Length of every packed block.
        micro_batch_size: Number of blocks per yielded batch.
        num_workers: DataLoader worker process count.

    Returns:
        A DataLoader yielding dicts of stacked tensors (see `_collate_batch`).
    """
    packed_stream = JsonlPackedDataset(
        data_path,
        tokenizer,
        seq_len,
        shuffle_lines=False,
        add_bos_eos=True,
        repeat=True,
    )
    loader_options = {
        "batch_size": micro_batch_size,
        "num_workers": num_workers,
        "pin_memory": True,
        "drop_last": True,
        "collate_fn": _collate_batch,
    }
    return DataLoader(packed_stream, **loader_options)
219
+
220
+
221
def _collate_batch(features: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
    """
    Stack per-example tensors into batch tensors along a new leading dimension.

    Every example must carry same-shaped `input_ids`, `attention_mask` and
    `labels` tensors, so no padding is needed.
    """
    field_names = ("input_ids", "attention_mask", "labels")
    return {
        name: torch.stack([example[name] for example in features], dim=0)
        for name in field_names
    }
227
+
228
+
229
def parse_args() -> argparse.Namespace:
    """
    Parse the command-line options of the training script.

    `--data_path` and `--out_dir` are required; every other option has a
    default. Returns the populated `argparse.Namespace`.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    # ---- data ----
    add("--data_path", required=True, help="Path to JSONL(.gz) with {text}")
    add("--seq_len", type=int, default=4096)
    add("--num_workers", type=int, default=2)

    # ---- tokenizer & model source ----
    add("--tokenizer_name", default=None, help="HF tokenizer name")
    add("--tokenizer_dir", default=None, help="Local dir of HF tokenizer")
    add("--model_name", default=None, help="HF model name to continue from (CPT)")
    add("--vocab_size_override", type=int, default=None)

    # ---- architecture of a freshly initialised model (no --model_name) ----
    add("--hidden_size", type=int, default=768)
    add("--n_layers", type=int, default=24)
    add("--n_heads", type=int, default=12)
    add("--n_kv_heads", type=int, default=1)
    add("--rope_theta", type=float, default=1e6)
    add("--max_position_embeddings", type=int, default=4096)

    # ---- optimisation & checkpointing ----
    add("--out_dir", required=True)
    add("--global_batch_size", type=int, default=256)
    add(
        "--micro_batch_size",
        type=int,
        default=None,
        help="Per-step batch size before grad accumulation",
    )
    add("--lr", type=float, default=1e-3)
    add("--weight_decay", type=float, default=0.05)
    add("--warmup_steps", type=int, default=2000)
    add("--max_steps", type=int, default=50_000)
    add("--save_every", type=int, default=2000)
    add("--clip_grad", type=float, default=1.0)
    add("--bf16", action="store_true")
    add("--seed", type=int, default=42)

    parsed = parser.parse_args()
    return parsed
264
+
265
+
266
def set_seed(seed: int) -> None:
    """Seed Python's `random`, `PYTHONHASHSEED`, and torch (CPU and all CUDA devices)."""
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    # Same seed for the CPU generator and for every CUDA device generator.
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)
271
+
272
+
273
def main() -> None:
    """
    Run pretraining end to end.

    Steps: parse CLI arguments, seed the RNGs, build model and tokenizer,
    stream packed batches, and train with AdamW and a warmup + cosine
    schedule using gradient accumulation. Progress is printed every 10
    optimizer steps. Checkpoints are written to `<out_dir>/step_XXXXXX`
    every `--save_every` steps and at the last step, and the final model is
    saved to `<out_dir>/final`.
    """
    args = parse_args()
    set_seed(args.seed)

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    model, tokenizer = build_model_and_tokenizer(
        tokenizer_name=args.tokenizer_name,
        tokenizer_dir=args.tokenizer_dir,
        model_name=args.model_name,
        vocab_size_override=args.vocab_size_override,
        hidden_size=args.hidden_size,
        n_layers=args.n_layers,
        n_heads=args.n_heads,
        n_kv_heads=args.n_kv_heads,
        rope_theta=args.rope_theta,
        max_position_embeddings=args.max_position_embeddings,
    )

    model = model.to(device)

    # Data: default micro batch is 1/8 of the global batch (at least 1).
    micro_bs = args.micro_batch_size or min(max(1, args.global_batch_size // 8), args.global_batch_size)
    grad_accum = max(1, args.global_batch_size // micro_bs)
    # Integer division can make the real batch differ from the requested one.
    effective_batch = micro_bs * grad_accum
    if effective_batch != args.global_batch_size:
        print(
            f"warning: global_batch_size {args.global_batch_size} is not a multiple of "
            f"micro_batch_size {micro_bs}; effective batch size is {effective_batch}",
            flush=True,
        )
    train_loader = get_dataloader(
        data_path=args.data_path,
        tokenizer=tokenizer,
        seq_len=args.seq_len,
        micro_batch_size=micro_bs,
        num_workers=args.num_workers,
    )

    # Optimizer & Scheduler
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay, betas=(0.9, 0.95))
    scheduler = get_cosine_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=args.max_steps,
    )

    # bf16 autocast is only enabled on CUDA; otherwise training runs in the
    # model's own dtype.
    use_bf16 = args.bf16 and device.type == "cuda"
    if args.bf16 and not use_bf16:
        print("warning: --bf16 requested but CUDA is unavailable; autocast disabled", flush=True)

    model.train()
    step = 0
    running_loss = 0.0
    log_every = 10
    # Tokens actually consumed per optimizer step.
    tokens_per_step = effective_batch * args.seq_len
    last_log = time.time()

    # Simple training loop over streaming dataloader
    data_iter = iter(train_loader)
    while step < args.max_steps:
        optimizer.zero_grad(set_to_none=True)
        for _ in range(grad_accum):
            try:
                batch = next(data_iter)
            except StopIteration:
                # Restart the loader if the stream ever ends.
                data_iter = iter(train_loader)
                batch = next(data_iter)

            input_ids = batch["input_ids"].to(device, non_blocking=True)
            attention_mask = batch["attention_mask"].to(device, non_blocking=True)
            labels = batch["labels"].to(device, non_blocking=True)

            with torch.autocast(device_type=device.type, dtype=torch.bfloat16, enabled=use_bf16):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                # Scale so the accumulated gradient is the mean over micro-batches.
                loss = outputs.loss / grad_accum

            loss.backward()
            running_loss += loss.item()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad)
        optimizer.step()
        scheduler.step()
        step += 1

        # Logging
        if step % log_every == 0:
            now = time.time()
            dt = now - last_log
            last_log = now
            avg_loss = running_loss / log_every
            running_loss = 0.0
            ppl = math.exp(avg_loss) if avg_loss < 30 else float("inf")
            # `dt` spans `log_every` optimizer steps, so count all of their tokens.
            tokens_sec = tokens_per_step * log_every / dt if dt > 0 else 0.0
            print(
                f"step {step:6d} | loss {avg_loss:.4f} | ppl {ppl:.2f} | tokens/s {tokens_sec:,.0f} | lr {scheduler.get_last_lr()[0]:.2e}",
                flush=True,
            )

        # Checkpointing
        if step % args.save_every == 0 or step == args.max_steps:
            ckpt_dir = out_dir / f"step_{step:06d}"
            ckpt_dir.mkdir(parents=True, exist_ok=True)
            model.save_pretrained(ckpt_dir)
            tokenizer.save_pretrained(ckpt_dir)

        # Small memory hygiene
        if step % 100 == 0:
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    # Final save
    model.save_pretrained(out_dir / "final")
    tokenizer.save_pretrained(out_dir / "final")


if __name__ == "__main__":
    main()
389
+
390
+