Mega #442

Open · wants to merge 2 commits into base: master

3 changes: 2 additions & 1 deletion eval_model.py
@@ -21,7 +21,8 @@ def init_model(args):
num_hidden_layers=args.num_hidden_layers,
use_moe=args.use_moe
))

+model = torch.compile(model)

model.load_state_dict(torch.load(ckp, map_location=args.device), strict=True)

if args.lora_name != 'None':
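A caveat on this hunk: compiling before `load_state_dict` means the checkpoint keys must match the compiled module's naming. Depending on the PyTorch version, torch.compile wraps the network in an `OptimizedModule` whose parameters appear under an `_orig_mod.` prefix, so a checkpoint saved from an uncompiled model can fail a strict load (or vice versa). A minimal defensive sketch; `load_checkpoint_flexibly` is a hypothetical helper, not part of this PR:

```python
import torch

def load_checkpoint_flexibly(model, ckp, device):
    """Load a state dict whether or not model/checkpoint went through torch.compile."""
    state = torch.load(ckp, map_location=device)
    # strip the prefix torch.compile's OptimizedModule adds to parameter names
    state = {k.removeprefix("_orig_mod."): v for k, v in state.items()}
    # unwrap the compiled module (if any) so key names line up
    target = getattr(model, "_orig_mod", model)
    target.load_state_dict(state, strict=True)
    return model
```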
Binary file removed images/1-wiki.png
Binary file removed images/2-wiki.png
Binary file removed images/3-wiki.png
Binary file removed images/4-wiki.png
Binary file removed images/5-wiki.png
Binary file removed images/LLM-structure-moe.png
Binary file removed images/LLM-structure.png
Binary file removed images/and_huggingface.png
Binary file removed images/and_modelscope.png
Binary file removed images/compare_radar.png
Binary file removed images/dataset.jpg
Binary file removed images/gpt3_config.png
Binary file removed images/logo.png
Binary file removed images/logo2.png
Binary file removed images/minimind2.gif
Binary file removed images/pre_512_loss.png
Binary file removed images/pre_768_loss.png
Binary file removed images/sft_512_loss.png
Binary file removed images/sft_768_loss.png
14 changes: 12 additions & 2 deletions trainer/train_full_sft.py
@@ -103,6 +103,15 @@ def init_model(lm_config):

Logger(f'LLM total trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6:.3f} million')
model = model.to(args.device)

+# Optimize the model with torch.compile for speed (requires PyTorch 2.0+)
+try:
+    # "reduce-overhead" mode uses CUDA graphs to cut per-iteration launch overhead
+    model = torch.compile(model, mode="reduce-overhead")
+    Logger("Model compiled with torch.compile() for optimization.")
+except Exception as e:
+    Logger(f"torch.compile() failed: {e}")

return model, tokenizer
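The guarded compile above degrades gracefully: on platforms where the backend is unavailable, the model simply runs eager. A self-contained toy illustrating the same pattern (the toy `nn.Sequential` stands in for the MiniMind model):

```python
import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(64, 64), nn.GELU(), nn.Linear(64, 1))
try:
    # same pattern as the hunk above: fall back to eager if compile fails
    net = torch.compile(net, mode="reduce-overhead")
except Exception as e:
    print(f"torch.compile() unavailable, running eager: {e}")

x = torch.randn(8, 64)
print(net(x).shape)  # the first call triggers compilation; later calls reuse it
```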


@@ -130,7 +139,7 @@ def init_distributed_mode():
parser.add_argument("--wandb_project", type=str, default="MiniMind-Full-SFT")
parser.add_argument("--num_workers", type=int, default=1)
parser.add_argument("--ddp", action="store_true")
parser.add_argument("--accumulation_steps", type=int, default=1)
parser.add_argument("--accumulation_steps", type=int, default=2)
parser.add_argument("--grad_clip", type=float, default=1.0)
parser.add_argument("--warmup_iters", type=int, default=0)
parser.add_argument("--log_interval", type=int, default=100)
@@ -191,7 +200,8 @@ def init_distributed_mode():
)

scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype in ['float16', 'bfloat16']))
-optimizer = optim.AdamW(model.parameters(), lr=args.learning_rate)
+# Use fused AdamW on CUDA for a potential speedup
+optimizer = optim.AdamW(model.parameters(), lr=args.learning_rate, fused=(device_type == 'cuda'))

if ddp:
model._ddp_params_and_buffers_to_ignore = {"pos_cis"}
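`fused=True` dispatches AdamW's parameter update to a single fused CUDA kernel, which typically shaves optimizer overhead on GPU, but it requires CUDA tensors and a PyTorch recent enough to expose the kwarg. A defensive sketch that falls back when either is absent; the version probing is an assumption, not code from this PR:

```python
import inspect
import torch
from torch import optim

def make_adamw(params, lr, device_type):
    # fused AdamW needs CUDA params and a PyTorch new enough to expose the kwarg
    fused_ok = device_type == "cuda" and "fused" in inspect.signature(optim.AdamW).parameters
    return optim.AdamW(params, lr=lr, fused=fused_ok)
```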
9 changes: 6 additions & 3 deletions trainer/train_pretrain.py
@@ -18,6 +18,7 @@
from dataset.lm_dataset import PretrainDataset

warnings.filterwarnings('ignore')
+torch.set_float32_matmul_precision('high')  # <--- added this line


def Logger(content):
@@ -118,7 +119,7 @@ def init_distributed_mode():
parser = argparse.ArgumentParser(description="MiniMind Pretraining")
parser.add_argument("--out_dir", type=str, default="../out")
# To get a working zero-from-scratch model as fast as possible, set epochs to 1; otherwise make the most of the limited data with 2~6 epochs.
-parser.add_argument("--epochs", type=int, default=1)
+parser.add_argument("--epochs", type=int, default=2)
parser.add_argument("--batch_size", type=int, default=32)
parser.add_argument("--learning_rate", type=float, default=5e-4)
parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu")
@@ -174,6 +175,7 @@ def init_distributed_mode():
wandb = None

model, tokenizer = init_model(lm_config)
+model = torch.compile(model)  # <--- added this line
train_ds = PretrainDataset(args.data_path, tokenizer, max_length=args.max_seq_len)
train_sampler = DistributedSampler(train_ds) if ddp else None
train_loader = DataLoader(
@@ -187,11 +189,12 @@ def init_distributed_mode():
)

scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype in ['float16', 'bfloat16']))
-optimizer = optim.AdamW(model.parameters(), lr=args.learning_rate)
+optimizer = optim.AdamW(model.parameters(), lr=args.learning_rate, fused=(device_type == 'cuda'))  # <--- modified this line

if ddp:
    model._ddp_params_and_buffers_to_ignore = {"pos_cis"}
-    model = DistributedDataParallel(model, device_ids=[ddp_local_rank])
+    # Note: torch.compile should be called before wrapping with DDP
+    model = DistributedDataParallel(model, device_ids=[ddp_local_rank], static_graph=True)  # <--- modified this line

iter_per_epoch = len(train_loader)
for epoch in range(args.epochs):
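Taken together, the pretrain changes establish an ordering this PR relies on: enable TF32 matmuls first, compile the bare module, and only then wrap it with DDP (`static_graph=True` tells DDP the computation graph is fixed across iterations, which plays well with a compiled module). A condensed sketch of that order; `build_model` and `ddp_local_rank` are placeholders for the script's own init_model(...) and rank handling:

```python
import torch
from torch.nn.parallel import DistributedDataParallel

torch.set_float32_matmul_precision('high')  # allow TF32 matmuls on Ampere+ GPUs

model = build_model().to(f"cuda:{ddp_local_rank}")
model = torch.compile(model)  # compile the bare module before DDP wraps it
model = DistributedDataParallel(model, device_ids=[ddp_local_rank], static_graph=True)
```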