# train_gpt2.py
import os
import numpy as np
from dataclasses import dataclass
import inspect
import tiktoken
import torch
import torch.nn as nn
from torch.nn import functional as F
from utils import Logger, VariableTracker
from hellaswag import HellaSwagEvaluator
from lambada_openai import LambadaEvaluator
from storycloze import StoryClozeEvaluator
class CausalSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
assert config.n_embd % config.n_head == 0
# key, query, value projections for all heads, but in a batch
self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
# output projection
self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1  # flag: divide this layer's init std by a scaling factor
# regularization
self.n_head = config.n_head
self.n_embd = config.n_embd
def forward(self, x):
B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
# calculate query, key, values for all heads in batch and move head forward to be the batch dim
# nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
# e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
qkv = self.c_attn(x)
q, k, v = qkv.split(self.n_embd, dim=2)
k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
y = F.scaled_dot_product_attention(q, k, v, is_causal=True) # flash attention
y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
# output projection
y = self.c_proj(y)
return y
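# A rough shape walk-through of the attention above, using the GPT-2 (124M) sizes as an
# illustrative assumption (B=2, T=2048, C=768, n_head=12, head size 64):
#   x:   (2, 2048, 768)
#   qkv: (2, 2048, 2304)  -> split into three (2, 2048, 768) tensors
#   q/k/v after view+transpose: (2, 12, 2048, 64)
#   scaled_dot_product_attention(..., is_causal=True) keeps (2, 12, 2048, 64)
#   transpose + view re-assembles the heads back into (2, 2048, 768) before c_proj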
class MLP(nn.Module):
def __init__(self, config):
super().__init__()
self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
self.gelu = nn.GELU(approximate='tanh')
self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1  # flag: divide this layer's init std by a scaling factor
def forward(self, x):
x = self.c_fc(x)
x = self.gelu(x)
x = self.c_proj(x)
return x
class Block(nn.Module):
def __init__(self, config):
super().__init__()
self.ln_1 = nn.LayerNorm(config.n_embd)
self.attn = CausalSelfAttention(config)
self.ln_2 = nn.LayerNorm(config.n_embd)
self.mlp = MLP(config)
def forward(self, x):
x = x + self.attn(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x
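# The block above uses the pre-norm residual layout: LayerNorm sits *inside* each residual
# branch (x + attn(ln(x)), then x + mlp(ln(x))) rather than after the addition, which is the
# arrangement GPT-2 uses and which leaves the residual stream itself un-normalized.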
@dataclass
class GPTConfig:
block_size: int = 2048 # max sequence length
vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
n_layer: int = 12 # number of layers
n_head: int = 12 # number of heads
n_embd: int = 768 # embedding dimension
class GPT(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.transformer = nn.ModuleDict(dict(
wte = nn.Embedding(config.vocab_size, config.n_embd),# token embedding
wpe = nn.Embedding(config.block_size, config.n_embd),# position embedding
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),  # repeat the Block module n_layer times
ln_f = nn.LayerNorm(config.n_embd),
))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)  # final linear output head
        # weight sharing (from the Transformer paper: "In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation, similar to [24]")
        self.transformer.wte.weight = self.lm_head.weight
        # weight initialization
self.apply(self._init_weights)
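    # A rough, back-of-the-envelope parameter count for the default config
    # (n_layer=12, n_head=12, n_embd=768, vocab_size=50257, block_size=2048), assuming the
    # modules defined above and the wte/lm_head weight sharing:
    #   wte: 50257*768 ~= 38.6M    wpe: 2048*768 ~= 1.6M
    #   per block: c_attn 768*2304+2304 ~= 1.77M, attn c_proj 768*768+768 ~= 0.59M,
    #              c_fc 768*3072+3072 ~= 2.36M, mlp c_proj 3072*768+768 ~= 2.36M,
    #              plus two LayerNorms ~= 3k   => ~7.1M per block, ~85M for 12 blocks
    #   total ~= 125M parameters (the original "124M" GPT-2 uses block_size=1024, hence slightly fewer)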
def _init_weights(self, module):
if isinstance(module, nn.Linear):
std = 0.02
if hasattr(module, 'NANOGPT_SCALE_INIT'):
std *= (2 * self.config.n_layer)**-0.5
torch.nn.init.normal_(module.weight, mean=0.0, std=std)
if module.bias is not None:
torch.nn.init.zeros_(module.bias)
elif isinstance(module, nn.Embedding):
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
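    # Worked example of the NANOGPT_SCALE_INIT scaling above, for the default n_layer=12:
    # each block adds two residual contributions (attn and mlp), so the residual stream sums
    # 2*12 = 24 such projections; scaling their init std by (2*n_layer)**-0.5 gives
    # 0.02 / sqrt(24) ~= 0.0041, keeping the variance of the residual stream roughly
    # independent of depth at initialization.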
def forward(self, idx, targets=None):
# idx is of shape (B, T)
B, T = idx.size()
assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        # forward the token and position embeddings
pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
x = tok_emb + pos_emb
# forward the blocks of the transformer
for block in self.transformer.h:
x = block(x)
# forward the final layernorm and the classifier
x = self.transformer.ln_f(x)
logits = self.lm_head(x) # (B, T, vocab_size)
loss = None
if targets is not None:
            # flatten logits to (B*T, vocab_size) and targets to (B*T,) before the cross entropy
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
return logits, loss
@classmethod
def from_pretrained(cls, model_type, model_path):
"""Loads pretrained GPT-2 model weights from huggingface"""
assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
from transformers import GPT2LMHeadModel
logger.mes("loading weights from pretrained gpt: %s" % model_type)
# n_layer, n_head and n_embd are determined from model_type
config_args = {
'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
}[model_type]
config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
# create a from-scratch initialized minGPT model
config = GPTConfig(**config_args)
model = GPT(config)
sd = model.state_dict()
sd_keys = sd.keys()
sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
# init a huggingface/transformers model
model_hf = GPT2LMHeadModel.from_pretrained(model_path)
sd_hf = model_hf.state_dict()
# copy while ensuring all of the parameters are aligned and match in names and shapes
sd_keys_hf = sd_hf.keys()
sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
# basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
# this means that we have to transpose these weights when we import them
assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
for k in sd_keys_hf:
if any(k.endswith(w) for w in transposed):
# special treatment for the Conv1D weights we need to transpose
assert sd_hf[k].shape[::-1] == sd[k].shape
with torch.no_grad():
sd[k].copy_(sd_hf[k].t())
else:
# vanilla copy over the other parameters
assert sd_hf[k].shape == sd[k].shape
with torch.no_grad():
sd[k].copy_(sd_hf[k])
return model
def configure_optimizers(self, weight_decay, learning_rate, device_type):
# start with all of the candidate parameters (that require grad)
param_dict = {pn: p for pn, p in self.named_parameters()}
param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameter that is 2-D will be weight decayed, otherwise no.
# i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
optim_groups = [
{'params': decay_params, 'weight_decay': weight_decay},
{'params': nodecay_params, 'weight_decay': 0.0}
]
num_decay_params = sum(p.numel() for p in decay_params)
num_nodecay_params = sum(p.numel() for p in nodecay_params)
logger.mes(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
logger.mes(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
# Create AdamW optimizer and use the fused version if it is available
fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
use_fused = fused_available and device_type == "cuda"
logger.mes(f"using fused AdamW: {use_fused}")
optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
return optimizer
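# How the split above lands for this model: Linear weights and the two Embedding tables are
# 2-D, so they receive weight decay; every bias and every LayerNorm parameter is 1-D and is
# excluded. The fused AdamW path is only taken when the installed torch exposes the 'fused'
# argument and we are running on CUDA.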
class DataLoaderLite:
def __init__(self, B, T, process_rank, num_processes, split):
self.B = B
self.T = T
self.process_rank = process_rank
self.num_processes = num_processes
assert split in {'train', 'val'}
# get the shard filenames
data_root = "edu_fineweb10B"
shards = os.listdir(data_root)
        shards = [s for s in shards if split in s]  # keep only the shards for this split (train or val)
        shards = sorted(shards)
        shards = [os.path.join(data_root, s) for s in shards]  # build the full paths to the shard files
self.shards = shards
assert len(shards) > 0, f"no shards found for split {split}"
logger.mes(f"found {len(shards)} shards for split {split}")
self.reset()
def _load_tokens(self, filename):
npt = np.load(filename)
npt = npt.astype(np.int32) # added after video
ptt = torch.tensor(npt, dtype=torch.long)
return ptt
def reset(self):
# state, init at shard zero
self.current_shard = 0
self.tokens = self._load_tokens(self.shards[self.current_shard])
        self.current_position = self.B * self.T * self.process_rank  # self.process_rank ranges over 0..num_processes-1 (e.g. 0-7 on 8 GPUs)
def next_batch(self):
B, T = self.B, self.T
buf = self.tokens[self.current_position : self.current_position+B*T+1]
x = (buf[:-1]).view(B, T) # inputs
y = (buf[1:]).view(B, T) # targets
# advance the position in the tensor
self.current_position += B * T * self.num_processes
# if loading the next batch would be out of bounds, advance to next shard
if self.current_position + (B * T * self.num_processes + 1) > len(self.tokens):
self.current_shard = (self.current_shard + 1) % len(self.shards)
self.tokens = self._load_tokens(self.shards[self.current_shard])
self.current_position = B * T * self.process_rank
return x, y
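# Sketch of how the ranks stride through a shard, assuming B=2, T=2048 and 8 processes
# (these numbers are illustrative, not required by the class): rank r starts at position
# r*B*T = r*4096 tokens and advances by B*T*num_processes = 32768 tokens per call, so the
# 8 ranks read disjoint, interleaved windows and together consume each shard without overlap
# before moving on to the next one.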
#--------------------------------------------------------------------------------
# run the training loop
# simple launch:
# python train_gpt2.py
# DDP launch for e.g. 8 GPUs:
# torchrun --standalone --nproc_per_node=8 train_gpt2.py
import time
import math
from datetime import datetime
from torch.distributed import init_process_group, destroy_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
# set up DDP (distributed data parallel).
# torchrun command sets the env variables RANK, LOCAL_RANK, and WORLD_SIZE
ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run?
if ddp:
    # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # (make the CUDA indices seen by the program match the physical device order)
    # os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"  # which CUDA devices to use
# use of DDP atm demands CUDA, we set the device appropriately according to rank
assert torch.cuda.is_available(), "for now i think we need CUDA for DDP"
    init_process_group(backend='nccl')  # initialize the process group with the nccl backend
    ddp_rank = int(os.environ['RANK'])  # global rank of this process
    ddp_local_rank = int(os.environ['LOCAL_RANK'])  # rank of this process on its node
    ddp_world_size = int(os.environ['WORLD_SIZE'])  # total number of processes
    device = f'cuda:{ddp_local_rank}'  # map this process to its own CUDA device
torch.cuda.set_device(device)
master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
else:
# vanilla, non-DDP run
ddp_rank = 0
ddp_local_rank = 0
ddp_world_size = 1
master_process = True
    # note: this remapping makes cuda:x appear as cuda:0 to the program, so device queries report cuda:0 even though the physical device is cuda:x
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # (make the CUDA indices seen by the program match the physical device order)
    os.environ["CUDA_VISIBLE_DEVICES"] = "4"  # which physical CUDA device to use
device = "cuda"
# RECORD_CACHE_DIR holds all training records: the logs, checkpoints and tracked variables of every run
# RECORD_DIR is the record folder for the current training run
RECORD_CACHE_DIR = os.path.join(os.path.dirname(__file__), "gpt2_training_record")
RECORD_DIR = os.path.join(RECORD_CACHE_DIR, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
if master_process:
os.makedirs(RECORD_CACHE_DIR, exist_ok=True)
os.makedirs(os.path.join(RECORD_DIR, "checkpoints"), exist_ok=True)
os.makedirs(os.path.join(RECORD_DIR, "variables"), exist_ok=True)
HELLASWAG_CACHE_DIR = os.path.join(os.path.dirname(__file__), "hellaswag")
HELLASWAG_PATH = os.path.join(HELLASWAG_CACHE_DIR, "hellaswag_val.jsonl")
STORYCLOZE_CACHE_DIR = os.path.join(os.path.dirname(__file__), "StoryCloze")
STORYCLOZE_PATH = os.path.join(STORYCLOZE_CACHE_DIR, "cloze_test_val__winter2018-cloze_test_ALL_val - 1 - 1.csv")
LAMBADA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "lambada_openai")
LAMBADA_PATH = os.path.join(LAMBADA_CACHE_DIR, "lambada_test_en.jsonl")
logger = Logger(log_directory=RECORD_DIR, use_logging=master_process)  # only the master process writes logs
vt = VariableTracker(keys=['step', 'train_loss','val_loss','hellaswg_acc', 'storycloze_acc', 'lambada_openai','lr'], master_process=master_process)  # only the master process records variables
# added after video, pytorch can be serious about its device vs. device_type distinction
# device is the concrete device (e.g. "cuda:0"); device_type is just its type, which is what autocast expects
device_type = "cuda" if device.startswith("cuda") else "cpu"
torch.manual_seed(1337)
torch.cuda.manual_seed(1337)
enc = tiktoken.get_encoding("gpt2")
total_batch_size = 524288 # 2**19, ~0.5M, in number of tokens
B = 2 # micro batch size
T = 2048 # sequence length
assert total_batch_size % (B * T * ddp_world_size) == 0, "make sure total_batch_size is divisible by B * T * ddp_world_size"
grad_accum_steps = total_batch_size // (B * T * ddp_world_size)
logger.mes(f"total batch size: {total_batch_size}")
logger.mes(f"gradient accumulation steps: {grad_accum_steps}")
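# Worked example of the gradient accumulation arithmetic above: with B=2, T=2048 and a single
# process (ddp_world_size=1), grad_accum_steps = 524288 / (2*2048*1) = 128 micro-batches per
# optimizer step; with 8 GPUs it drops to 16. Either way each optimizer step still sees
# exactly 2**19 = 524,288 tokens.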
train_loader = DataLoaderLite(B=B, T=T, process_rank=ddp_rank, num_processes=ddp_world_size, split='train')
val_loader = DataLoaderLite(B=B, T=T, process_rank=ddp_rank, num_processes=ddp_world_size, split="val")
torch.set_float32_matmul_precision('high')
# create the model
model = GPT(GPTConfig(vocab_size=50304))
model.to(device)
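# Note on vocab_size=50304: the GPT-2 tokenizer only has 50257 tokens, but 50304 is the next
# multiple of 128 (50304 = 128*393), so padding the embedding/output dimension this way keeps
# the matmul shapes friendlier for the GPU; the extra logits are simply never targeted.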
use_compile = False # torch.compile interferes with HellaSwag eval and Generation. TODO fix
if use_compile:
model = torch.compile(model)
if ddp:
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module if ddp else model  # after DDP wrapping, pull out the underlying module
max_lr = 6e-4
min_lr = max_lr * 0.1
warmup_steps = 715
# with 10B tokens of data and 0.5M tokens per step, 19,073 steps is exactly one pass over the 10B tokens
max_steps = 19073 * 4 # 19,073 steps is ~1 epoch, if data is 10B tokens and batch size 0.5M tokens
def get_lr(it):
    # 1) warmup: while it < warmup_steps, increase lr linearly
    if it < warmup_steps:
        return max_lr * (it+1) / warmup_steps
    # 2) after max_steps, return the minimum learning rate
    if it > max_steps:
        return min_lr
    # 3) in between, decay with cosine annealing down to min_lr
decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
assert 0 <= decay_ratio <= 1
coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff starts at 1 and goes to 0
return min_lr + coeff * (max_lr - min_lr)
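# A few illustrative values of the schedule above (max_lr=6e-4, min_lr=6e-5, warmup_steps=715):
#   it=0          -> 6e-4 * 1/715   ~= 8.4e-7
#   it=714        -> 6e-4 * 715/715  = 6e-4   (end of warmup)
#   halfway through the cosine decay, coeff=0.5 -> 6e-5 + 0.5*(6e-4 - 6e-5) = 3.3e-4
#   it > max_steps -> 6e-5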
# optimize
# optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9, 0.95),eps=1e-8)
optimizer = raw_model.configure_optimizers(weight_decay=0.1, learning_rate=max_lr, device_type=device_type)
for step in range(max_steps):
vt.append("step", step)
t0 = time.time()
last_step = (step == max_steps - 1)
# once in a while evaluate our validation loss
if step % 250 == 0 or last_step:
model.eval()
val_loader.reset()
with torch.no_grad():
val_loss_accum = 0.0
val_loss_steps = 20
for _ in range(val_loss_steps):
x, y = val_loader.next_batch()
x, y = x.to(device), y.to(device)
with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
logits, loss = model(x, y)
loss = loss / val_loss_steps
val_loss_accum += loss.detach()
if ddp:
dist.all_reduce(val_loss_accum, op=dist.ReduceOp.AVG)
if master_process:
logger.mes(f"validation loss: {val_loss_accum.item():.4f}")
vt.append("val_loss", val_loss_accum.item())
if step > 0 and (step % 5000 == 0 or last_step):
# optionally write model checkpoints
checkpoint_path = os.path.join(RECORD_DIR, "checkpoints", f"model_{step:05d}.pt")
checkpoint = {
'model': raw_model.state_dict(),
'config': raw_model.config,
'step': step,
'val_loss': val_loss_accum.item()
}
# you might also want to add optimizer.state_dict() and
# rng seeds etc., if you wanted to more exactly resume training
torch.save(checkpoint, checkpoint_path)
vt.save(os.path.join(RECORD_DIR, "variables", f"variables_{step:05d}.npy"))
else:
vt.append("val_loss", None)
# evaluate hellaswag-------------------------------------
if (step % 250 == 0 or last_step) and (not use_compile):
hellaswag_evaluator = HellaSwagEvaluator()
num_correct_norm = 0
num_total = 0
for i, example in enumerate(hellaswag_evaluator.iterate_examples(HELLASWAG_PATH)):
# only process examples where i % ddp_world_size == ddp_rank
if i % ddp_world_size != ddp_rank:
continue
# render the example into tokens and labels
_, tokens, mask, label = hellaswag_evaluator.render_example(example, enc)
tokens = tokens.to(device)
mask = mask.to(device)
# get the logits
with torch.no_grad():
with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
logits, _ = model(tokens)
_, pred_norm = hellaswag_evaluator.evaluate_in_training(tokens, mask, logits)
num_total += 1
num_correct_norm += int(pred_norm == label)
# reduce the stats across all processes
if ddp:
num_total = torch.tensor(num_total, dtype=torch.long, device=device)
num_correct_norm = torch.tensor(num_correct_norm, dtype=torch.long, device=device)
dist.all_reduce(num_total, op=dist.ReduceOp.SUM)
dist.all_reduce(num_correct_norm, op=dist.ReduceOp.SUM)
num_total = num_total.item()
num_correct_norm = num_correct_norm.item()
acc_norm = num_correct_norm / num_total
logger.mes(f"HellaSwag accuracy: {num_correct_norm}/{num_total}={acc_norm:.4f}")
vt.append("hellaswg_acc", acc_norm)
else:
vt.append("hellaswg_acc", None)
# evaluate storycloze-------------------------------------
if (step % 250 == 0 or last_step) and (not use_compile):
storycloze_evaluator = StoryClozeEvaluator()
num_correct_norm = 0
num_total = 0
for i, example in enumerate(storycloze_evaluator.iterate_examples(STORYCLOZE_PATH)):
if i % ddp_world_size != ddp_rank:
continue
tokens, mask, label = storycloze_evaluator.render_example(example, enc)
tokens, mask = tokens.to(device), mask.to(device)
            label = label - 1  # labels are 1 or 2; shift them to be 0-indexed
with torch.no_grad():
with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
logits, _ = model(tokens)
_, pred_norm = storycloze_evaluator.evaluate_in_training(tokens, mask, logits)
num_total += 1
num_correct_norm += int(pred_norm == label)
if ddp:
num_total = torch.tensor(num_total, dtype=torch.long, device=device)
num_correct_norm = torch.tensor(num_correct_norm, dtype=torch.long, device=device)
dist.all_reduce(num_total, op=dist.ReduceOp.SUM)
dist.all_reduce(num_correct_norm, op=dist.ReduceOp.SUM)
num_total = num_total.item()
num_correct_norm = num_correct_norm.item()
acc_norm = num_correct_norm / num_total
logger.mes(f"StoryCloze accuracy: {num_correct_norm}/{num_total}={acc_norm:.4f}")
vt.append("storycloze_acc", acc_norm)
else:
vt.append("storycloze_acc", None)
# evaluate lambada_openai-----------------------------------
if (step % 250 == 0 or last_step) and (not use_compile):
lambada_evaluator = LambadaEvaluator()
num_correct = 0
num_total = 0
for i, example in enumerate(lambada_evaluator.iterate_examples(LAMBADA_PATH)):
if i % ddp_world_size != ddp_rank:
continue
tokens = lambada_evaluator.render_example(example, enc)
tokens = tokens.to(device)
with torch.no_grad():
with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
logits, _ = model(tokens)
predicted_tokens = lambada_evaluator.evaluate_in_training(logits)
num_total += 1
num_correct += int(predicted_tokens == tokens.view(-1)[-1])
if ddp:
num_total = torch.tensor(num_total, dtype=torch.long, device=device)
num_correct = torch.tensor(num_correct, dtype=torch.long, device=device)
dist.all_reduce(num_total, op=dist.ReduceOp.SUM)
dist.all_reduce(num_correct, op=dist.ReduceOp.SUM)
num_total = num_total.item()
num_correct = num_correct.item()
acc = num_correct / num_total
logger.mes(f"Lambada_openai accuracy: {num_correct}/{num_total}={acc:.4f}")
vt.append("lambada_openai", acc)
else:
vt.append("lambada_openai", None)
# do one step of the optimization
model.train()
    optimizer.zero_grad()  # zero the gradients
loss_accum = 0.0
for micro_step in range(grad_accum_steps):
x, y = train_loader.next_batch()
x, y = x.to(device), y.to(device)
if ddp:
            model.require_backward_grad_sync = (micro_step == grad_accum_steps-1)  # only all-reduce gradients on the last micro step
        # train with mixed precision (bfloat16 autocast)
with torch.autocast(device_type=device_type,dtype=torch.bfloat16):
logits, loss = model(x, y)
loss = loss/grad_accum_steps
loss_accum += loss.detach()
        loss.backward()  # accumulate gradients
if ddp:
        dist.all_reduce(loss_accum, op=dist.ReduceOp.AVG)  # average loss_accum across all processes and write the result back to each of them
norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    lr = get_lr(step)  # set the learning rate for this step
for param_group in optimizer.param_groups:
param_group['lr'] = lr
optimizer.step()
torch.cuda.synchronize() # wait for the GPU to finish work
t1 = time.time()
    dt = (t1-t0)  # time for this step, in seconds
tokens_processed = train_loader.B * train_loader.T * grad_accum_steps * ddp_world_size
tokens_per_sec = tokens_processed / dt
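    # Sanity check on the throughput math: tokens_processed = B*T*grad_accum_steps*ddp_world_size
    # equals, by construction of grad_accum_steps above, exactly total_batch_size (524,288 tokens),
    # so tok/sec measures whole optimizer steps' worth of tokens per wall-clock second.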
logger.mes(f"step{step:5d} | loss: {loss_accum.item():.8f} | lr: {lr:.4e} | norm: {norm:.4f} | tok/sec: {tokens_per_sec:.2f}")
vt.append("train_loss", loss_accum.item())
vt.append("lr", lr)
if ddp:
destroy_process_group()
# exit the program
import sys; sys.exit(0)