Pipeline parallelism on 2 cards: GPU memory all accumulates on card 1 #71413
Only the GPU memory of one of the cards is printed in the terminal for inspection. Reproduction script:

# Launch commands:
# Multi-card: python3 -m paddle.distributed.launch --device=0,2 test_pp.py
# Single-card: python3 test_pp.py
import paddle
import paddle.distributed as dist

mesh0 = dist.ProcessMesh([3], dim_names=['x'])
mesh1 = dist.ProcessMesh([4], dim_names=['x'])


class MLP_PP(paddle.nn.Layer):
    def __init__(self):
        super(MLP_PP, self).__init__()
        self.w0 = dist.shard_tensor(
            self.create_parameter(shape=[1024, 40960]),
            mesh0, [dist.Replicate()])
        self.w1 = dist.shard_tensor(
            self.create_parameter(shape=[40960, 1024]),
            mesh1, [dist.Replicate()])

    def forward(self, x):
        card1 = 0
        card2 = 2
        print("\nstep 0 : x", x.place, x.placements, x.process_mesh)
        print("step 0 : self.w0", self.w0.place, self.w0.placements, self.w0.process_mesh)
        y = paddle.matmul(x, self.w0)
        print(f"GPU {card1} max memory allocated at this point: {paddle.device.cuda.max_memory_allocated(card1) / (1024 ** 3):.2f} GB")
        print(f"GPU {card2} max memory allocated at this point: {paddle.device.cuda.max_memory_allocated(card2) / (1024 ** 3):.2f} GB")
        print("\nstep 1 : x", x.place, x.placements, x.process_mesh)
        print("step 1 : y", y.place, y.placements, y.process_mesh)
        print("step 1 : self.w0", self.w0.place, self.w0.placements, self.w0.process_mesh)
        # Reshard: send the intermediate result on stage0 to stage1
        y = dist.reshard(y, mesh1, [dist.Replicate()])
        print(f"GPU {card1} max memory allocated at this point: {paddle.device.cuda.max_memory_allocated(card1) / (1024 ** 3):.2f} GB")
        print(f"GPU {card2} max memory allocated at this point: {paddle.device.cuda.max_memory_allocated(card2) / (1024 ** 3):.2f} GB")
        print("\nstep 2 : x", x.place, x.placements, x.process_mesh)
        print("step 2 : y", y.place, y.placements, y.process_mesh)
        print("step 2 : self.w1", self.w1.place, self.w1.placements, self.w1.process_mesh)
        z = paddle.matmul(y, self.w1)
        print(f"GPU {card1} max memory allocated at this point: {paddle.device.cuda.max_memory_allocated(card1) / (1024 ** 3):.2f} GB")
        print(f"GPU {card2} max memory allocated at this point: {paddle.device.cuda.max_memory_allocated(card2) / (1024 ** 3):.2f} GB")
        print("\nstep 3 : x", x.place, x.placements, x.process_mesh)
        print("step 3 : y", y.place, y.placements, y.process_mesh)
        print("step 3 : z", z.place, z.placements, z.process_mesh)
        print("step 3 : self.w1", self.w1.place, self.w1.placements, self.w1.process_mesh)
        return z


class MLP(paddle.nn.Layer):
    def __init__(self):
        super(MLP, self).__init__()
        self.w0 = self.create_parameter(shape=[1024, 40960])
        self.w1 = self.create_parameter(shape=[40960, 1024])

    def forward(self, x):
        card1 = 0
        card2 = 2
        print("\nstep 0 : x", x.place, x.placements, x.process_mesh)
        print("step 0 : self.w0", self.w0.place, self.w0.placements, self.w0.process_mesh)
        y = paddle.matmul(x, self.w0)
        print(f"GPU {card1} max memory allocated at this point: {paddle.device.cuda.max_memory_allocated(card1) / (1024 ** 3):.2f} GB")
        print(f"GPU {card2} max memory allocated at this point: {paddle.device.cuda.max_memory_allocated(card2) / (1024 ** 3):.2f} GB")
        print("\nstep 1 : x", x.place, x.placements, x.process_mesh)
        print("step 1 : y", y.place, y.placements, y.process_mesh)
        print("step 1 : self.w0", self.w0.place, self.w0.placements, self.w0.process_mesh)
        # Reshard step omitted in the single-card version
        print(f"GPU {card1} max memory allocated at this point: {paddle.device.cuda.max_memory_allocated(card1) / (1024 ** 3):.2f} GB")
        print(f"GPU {card2} max memory allocated at this point: {paddle.device.cuda.max_memory_allocated(card2) / (1024 ** 3):.2f} GB")
        print("\nstep 2 : x", x.place, x.placements, x.process_mesh)
        print("step 2 : y", y.place, y.placements, y.process_mesh)
        print("step 2 : self.w1", self.w1.place, self.w1.placements, self.w1.process_mesh)
        z = paddle.matmul(y, self.w1)
        print(f"GPU {card1} max memory allocated at this point: {paddle.device.cuda.max_memory_allocated(card1) / (1024 ** 3):.2f} GB")
        print(f"GPU {card2} max memory allocated at this point: {paddle.device.cuda.max_memory_allocated(card2) / (1024 ** 3):.2f} GB")
        print("\nstep 3 : x", x.place, x.placements, x.process_mesh)
        print("step 3 : y", y.place, y.placements, y.process_mesh)
        print("step 3 : z", z.place, z.placements, z.process_mesh)
        print("step 3 : self.w1", self.w1.place, self.w1.placements, self.w1.process_mesh)
        return z


# Multi-card
# model = MLP_PP()
# Single-card
model = MLP()

for name, layer in model.named_sublayers():
    layer_type = str(type(layer))
    print(name, layer_type)

x = paddle.ones([1024,])
y = model(x)
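As a sanity check on the rank-to-device mapping (not part of the original report), each launched process can print its global rank and the physical GPU it actually runs on. `paddle.distributed.get_rank()`, `paddle.distributed.get_world_size()` and `paddle.device.get_device()` are standard Paddle APIs; the snippet assumes it is started with the same `paddle.distributed.launch` command shown above.

```python
# Minimal per-process diagnostic: which rank am I, and which physical GPU did I get?
import paddle
import paddle.distributed as dist

dist.init_parallel_env()              # join the process group created by the launcher
rank = dist.get_rank()                # global rank of this worker (0, 1, ...)
world_size = dist.get_world_size()    # number of launched workers
device = paddle.device.get_device()   # e.g. 'gpu:0' or 'gpu:2'
print(f"rank {rank}/{world_size} runs on {device}")
```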
https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/paddle_v3_features/auto_parallel_cn.html#liushuibingxing
The goal is to run pipeline parallelism:
Parameter w0 should be placed on card 3.
Parameter w1 should be placed on card 4.
During execution, the memory on card 4 does not change at all, and inspecting place shows that both meshes' process ids, 3 and 4, were assigned gpu:3 as their device.
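If ProcessMesh entries denote the ranks of the launched processes rather than physical GPU ids (an assumption here, not a confirmed diagnosis of this issue), a two-process launch pinned to cards 3 and 4 would look roughly like the sketch below; the physical cards would then be selected through the launcher's device list rather than through the mesh entries.

```python
# Sketch only, assuming ProcessMesh entries are process ranks, not GPU ids.
# Launch two workers pinned to physical cards 3 and 4 via the launcher's device list,
# analogous to the command used in the report above.
import paddle
import paddle.distributed as dist

# Rank 0 of the launch acts as pipeline stage 0, rank 1 as stage 1.
mesh0 = dist.ProcessMesh([0], dim_names=['x'])  # stage 0 -> first launched worker (card 3)
mesh1 = dist.ProcessMesh([1], dim_names=['x'])  # stage 1 -> second launched worker (card 4)

# Replicated parameters, one per stage, mirroring the reproduction script above.
w0 = dist.shard_tensor(paddle.randn([1024, 40960]), mesh0, [dist.Replicate()])
w1 = dist.shard_tensor(paddle.randn([40960, 1024]), mesh1, [dist.Replicate()])
print(w0.process_mesh, w1.process_mesh)
```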