{
"ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 1, "tid": 7,
"ts": 1713351140570122, "dur": 386,
"args": {
"External id": 1529,
"device": 1, "context": 1,
"stream": 7, "correlation": 1529,
"bytes": 163840000, "memory bandwidth (GB/s)": 424.2921773719471
}
}
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.profiler
from torch.nn.parallel import DistributedDataParallel as DDP
def setup(rank, world_size):
    """Join the NCCL process group and bind this process to GPU ``rank``.

    Args:
        rank: Index of this process in the group; also used as the CUDA
            device index for this process.
        world_size: Total number of processes taking part in the group.
    """
    import os

    # Rendezvous address/port for the process group; every rank runs on this
    # machine, so the master is reachable on localhost.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    # Make GPU `rank` the current CUDA device for this process.
    torch.cuda.set_device(rank)
def cleanup():
    """Destroy the default ``torch.distributed`` process group."""
    dist.destroy_process_group()
class SimpleModel(nn.Module):
    """Toy network made of a single 6400 -> 6400 fully connected layer."""

    def __init__(self):
        super().__init__()
        # The weight matrix is 6400 x 6400; with PyTorch's default float32
        # dtype that is 6400 * 6400 * 4 = 163,840,000 bytes.
        self.fc = nn.Linear(6400, 6400)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)
def demo_basic(rank, world_size):
    """Run a short, profiled DDP training loop on GPU ``rank``.

    Builds a ``SimpleModel`` wrapped in ``DistributedDataParallel``, trains it
    for 10 SGD steps on fixed random data, and records CPU and CUDA activity
    with ``torch.profiler``. Traces are written to ``./logs`` through the
    TensorBoard trace handler.

    Args:
        rank: Index of this process; also the CUDA device index it uses.
        world_size: Total number of processes in the process group.
    """
    setup(rank, world_size)
    try:
        # Create model and move it to GPU with id rank
        model = SimpleModel().to(rank)
        model = DDP(model, device_ids=[rank])
        optimizer = optim.SGD(model.parameters(), lr=0.01)

        # Create a random tensor to simulate input data
        inputs = torch.randn(200, 6400).to(rank)
        labels = torch.randn(200, 6400).to(rank)

        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ],
            # One cycle is 1 idle step + 1 warm-up step + 3 recorded steps.
            schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),
            on_trace_ready=torch.profiler.tensorboard_trace_handler('./logs'),
            profile_memory=True,  # Track memory allocation/deallocation.
            with_stack=True,
        ) as prof:
            for _ in range(10):
                outputs = model(inputs)
                loss = nn.functional.mse_loss(outputs, labels)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                # Tell the profiler that one training step has finished so
                # that its schedule can advance.
                prof.step()
    finally:
        # Always release the process group, even if the loop above raised.
        cleanup()
def main(world_size=2):
    """Spawn ``world_size`` worker processes running ``demo_basic``.

    ``torch.multiprocessing.spawn`` passes each worker its process index as
    the first positional argument, which ``demo_basic`` receives as ``rank``.
    The call blocks until every worker has exited (``join=True``).

    Args:
        world_size: Number of worker processes to start. Defaults to 2.
    """
    torch.multiprocessing.spawn(
        demo_basic,
        args=(world_size,),
        nprocs=world_size,
        join=True,
    )
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
第一个代码块是第二个代码块运行后生成的日志中的一条记录。我想测试两张卡之间的通信带宽,但看不懂代码块一中的 bandwidth:为什么能达到 400+ GB/s?硬件是 PCIe 4.0 x16、单机双卡 4090。我用 https://github.com/NVIDIA/cuda-samples/tree/master/Samples/5_Domain_Specific/p2pBandwidthLatencyTest 测试了 p2p=disable 时的带宽(见代码块三),求老哥/师傅们解惑。
Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)
D\D 0 1
0 919.12 2.28
1 2.49 812.51