Code Samples
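The samples below cover the two main workflows of ptdbg_ascend: compare_data.py compares NPU dump data against benchmark dump data, while ddp_basic.py (together with model.py and utils.py) is a small DistributedDataParallel training script instrumented with PrecisionDebugger for data dump and overflow detection.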
compare_data.py
from ptdbg_ascend import compare

pkl_path = "/home/npu_dump_path/ptdbg_dump_v3.2/rank0/api_stack_dump.pkl"
dump_data_dir = "/home/npu_dump_path/ptdbg_dump_v3.2/rank0/api_stack_dump"
dump_path_param = {
    "npu_pkl_path": pkl_path,
    "bench_pkl_path": "/home/bench_dump_path/ptdbg_dump_v3.2/rank0/api_stack_dump.pkl",
    "npu_dump_data_dir": dump_data_dir,
    "bench_dump_data_dir": "/home/bench_dump_path/ptdbg_dump_v3.2/rank0/api_stack_dump",
    "is_print_compare_log": True
}
compare(dump_path_param, output_path="/home/output", stack_mode=True)
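With stack_mode=True, compare() expects dump data collected in api_stack mode, which is what the api_stack_dump.pkl and api_stack_dump paths above contain. The comparison result file is written under output_path, and is_print_compare_log=True prints progress to the console while the comparison runs.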
ddp_basic.py
import os
import sys

import torch
import torch.multiprocessing as mp
import torch_npu
from torch.distributed import init_process_group, destroy_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
# Imported for its side effect: transparently redirects CUDA calls to NPU.
from torch_npu.contrib import transfer_to_npu
from ptdbg_ascend import PrecisionDebugger

from model import SimpleNet, ResNetOverflow
from utils import MyTrainDataset

# dump
debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", rank=0, step=[0], enable_dataloader=True)
# Dump only the statistics pkl
# debugger.configure_hook(summary_only=True)
# Dump a specified list of APIs
# debugger.configure_hook(mode="list", scope=["Functional_batch_norm_1_forward", "Functional_conv2d_5_backward", "Tensor___iadd___2_forward"])
# Dump a specified range of APIs
# debugger.configure_hook(mode="range", scope=["Functional_conv2d_5_forward", "Tensor___iadd___2_backward"])
# Dump API-level input/output data for a specified category of APIs
# debugger.configure_hook(mode="api_list", api_list=["relu"])
# Overflow detection
# debugger = PrecisionDebugger(dump_path="./dump_overflow_path", hook_name="overflow_check")
def ddp_setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    init_process_group(backend='hccl', rank=rank, world_size=world_size)
class Trainer:
    def __init__(self, model, train_loader, optimizer, gpu_id, save_every, world_size):
        self.gpu_id = gpu_id
        self.model = model.to(f"npu:{gpu_id}")
        self.train_loader = train_loader
        self.optimizer = optimizer
        self.save_every = save_every
        self.world_size = world_size
        # world_size == -1 marks single-device (non-DDP) mode
        if world_size != -1:
            self.model = DDP(self.model, device_ids=[self.gpu_id])

    def _run_batch(self, source, targets):
        self.optimizer.zero_grad()
        output = self.model(source)
        loss = torch.nn.CrossEntropyLoss()(output, targets)
        loss.backward()
        self.optimizer.step()
    def _run_epoch(self, epoch):
        b_sz = len(next(iter(self.train_loader))[0])
        print(f"[NPU{self.gpu_id}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_loader)}")
        for i, (source, targets) in enumerate(self.train_loader):
            if i == 0:
                # When enable_dataloader is False, call debugger.start() here
                # PrecisionDebugger.start()
                pass
            source = source.to(self.gpu_id)
            targets = targets.to(self.gpu_id).long()
            self._run_batch(source, targets)
            if i == 0:
                # When enable_dataloader is False, call debugger.stop() here
                # PrecisionDebugger.stop()
                pass
    def _save_checkpoint(self, epoch):
        if self.world_size != -1 and self.gpu_id == 0:
            ckp = self.model.module.state_dict()
        else:
            ckp = self.model.state_dict()
        torch.save(ckp, "checkpoint.pt")
        print(f"Epoch {epoch} | checkpoint saved")

    def train(self, max_epochs):
        for epoch in range(max_epochs):
            self._run_epoch(epoch)
            if self.gpu_id == 0 and epoch % self.save_every == 0:
                self._save_checkpoint(epoch)
def load_train_objs():
    train_set = MyTrainDataset(2048, shape=(3, 64, 64))
    model = ResNetOverflow()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    return train_set, model, optimizer

def prepare_dataloader(dataset: Dataset, batch_size: int, world_size: int):
    return DataLoader(
        dataset,
        batch_size=batch_size,
        pin_memory=True,
        shuffle=False if world_size != -1 else True,
        sampler=DistributedSampler(dataset) if world_size != -1 else None
    )
def main(rank, world_size, total_epochs, save_every):
    torch.npu.set_device(f"npu:{rank}")
    if world_size != -1:
        ddp_setup(rank, world_size)
    dataset, model, optimizer = load_train_objs()
    train_data = prepare_dataloader(dataset, batch_size=32, world_size=world_size)
    trainer = Trainer(model, train_data, optimizer, rank, save_every, world_size)
    trainer.train(total_epochs)
    if world_size != -1:
        destroy_process_group()

if __name__ == "__main__":
    total_epochs = 1
    save_every = 5
    n_device = int(sys.argv[1])
    if n_device >= 2:
        world_size = n_device
        mp.spawn(main, args=(world_size, total_epochs, save_every), nprocs=world_size)
    else:
        device = 0
        main(device, -1, total_epochs, save_every)
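Launch the script with the device count as its only argument, e.g. python ddp_basic.py 8 to spawn one DDP process per NPU, or python ddp_basic.py 1 to run on a single device without DDP. Because the debugger is created with enable_dataloader=True, dumping is started and stopped around the dataloader iterations of the configured step automatically; with enable_dataloader=False you would instead uncomment the PrecisionDebugger.start()/stop() calls in _run_epoch.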
model.py
import torch
import torch.nn as nn
from torchvision.models import resnet18

class ModelParallelNet(nn.Module):
    def __init__(self):
        super(ModelParallelNet, self).__init__()
        self.linear1 = nn.Linear(20, 10).to("npu:0")
        self.linear2 = nn.Linear(10, 1).to("npu:1")

    def forward(self, x):
        x = self.linear1(x.to("npu:0"))
        x = self.linear2(x.to("npu:1"))
        return x

class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.linear1 = nn.Linear(20, 10)
        self.linear2 = nn.Linear(10, 1)

    def forward(self, x):
        x = self.linear1(x)
        x = self.linear2(x)
        x = x.half()
        x += 65536 * 2
        return x

class ResNetOverflow(nn.Module):
    def __init__(self):
        super().__init__()
        self.resnet = resnet18()
        self.linear = nn.Linear(1000, 100)

    def forward(self, x):
        x = self.resnet(x)
        x = self.linear(x)
        x = x.half()
        x += 65536 * 2
        return x
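SimpleNet and ResNetOverflow end their forward pass by casting to float16 and adding 2 × 65536. Since the largest finite float16 value is 65504, this deliberately overflows the activations to inf, giving the overflow_check hook of PrecisionDebugger something to detect.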
utils.py
import torch
import torch_npu
from torch.utils.data import Dataset

class MyTrainDataset(Dataset):
    def __init__(self, size, shape=(20,)):
        super().__init__()
        self.data = torch.randn(size, *shape)
        # Class-index labels in [0, 100) to match ResNetOverflow's 100-way
        # output; CrossEntropyLoss requires non-negative integer targets.
        self.label = torch.randint(0, 100, (size,))
        self.size = size

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        return self.data[idx], self.label[idx]