
Pipeline parallelism on 2 GPUs: all GPU memory accumulates on card 1 #71413

Open
wangguan1995 opened this issue Mar 4, 2025 · 1 comment
wangguan1995 commented Mar 4, 2025

https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/paddle_v3_features/auto_parallel_cn.html#liushuibingxing

The goal is to run pipeline parallelism.
Parameter w0 is expected to live on card 3.
Parameter w1 is expected to live on card 4.
During execution, the memory of card 4 does not change at all, and inspecting place shows that the process_ids of the two meshes, 3 and 4, both resolve to gpu:3 as the device.

# Launch command:
# python3 -m paddle.distributed.launch --device=3,4 test_pp.py
import paddle
import paddle.distributed as dist
# One 1-D process mesh per pipeline stage; process ids 3 and 4 are intended to map to GPU 3 and GPU 4
mesh0 = dist.ProcessMesh([3], dim_names=['x'])
mesh1 = dist.ProcessMesh([4], dim_names=['x'])

class MlpModel(paddle.nn.Layer):
    def __init__(self):
        super(MlpModel, self).__init__()
        self.w0 = dist.shard_tensor(
                    self.create_parameter(shape=[1024, 40960]),
                    mesh0, [dist.Replicate()])
        self.w1 = dist.shard_tensor(
                    self.create_parameter(shape=[40960, 1024]),
                    mesh1, [dist.Replicate()])

    def forward(self, x):
        print("\nstep 0 : x", x.place, x.placements, x.process_mesh)
        print("step 0 : self.w0", self.w0.place, self.w0.placements, self.w0.process_mesh)
        y = paddle.matmul(x, self.w0)
        print("\nstep 1 : x", x.place, x.placements, x.process_mesh)
        print("step 1 : y", y.place, y.placements, y.process_mesh)
        print("step 1 : self.w0", self.w0.place, self.w0.placements, self.w0.process_mesh)
        # Reshard: transfer the intermediate result from stage0 to stage1
        print(f"GPU:3 memory consumed at this point: {paddle.device.cuda.max_memory_allocated(3) / (1024 ** 3):.2f} GB")
        print(f"GPU:4 memory consumed at this point: {paddle.device.cuda.max_memory_allocated(4) / (1024 ** 3):.2f} GB")
        y = dist.reshard(y, mesh1, [dist.Replicate()])
        print("\nstep 2 : x", x.place, x.placements, x.process_mesh)
        print("step 2 : y", y.place, y.placements, y.process_mesh)
        print("step 2 : self.w1", self.w1.place, self.w1.placements, self.w1.process_mesh)
        z = paddle.matmul(y, self.w1)
        print(f"代码执行到此处时,消耗的GPU:3显存:{paddle.device.cuda.max_memory_allocated(3) / (1024 ** 3):.2f} GB")
        print(f"代码执行到此处时,消耗的GPU:4显存:{paddle.device.cuda.max_memory_allocated(4) / (1024 ** 3):.2f} GB")
        print("\nstep 3 : x", x.place, x.placements, x.process_mesh)
        print("step 3 : y", y.place, y.placements, y.process_mesh)
        print("step 3 : z", z.place, z.placements, z.process_mesh)
        print("step 3 : self.w1", self.w1.place, self.w1.placements, self.w1.process_mesh)
        return z

model = MlpModel()
for name, layer in model.named_sublayers():
    layer_type = str(type(layer))
    print(name, layer_type)
x = paddle.ones([1024,])
y = model(x)
python3 -m paddle.distributed.launch --device=3,4 test_pp.py
grep: warning: GREP_OPTIONS is deprecated; please use an alias or script
LAUNCH INFO 2025-03-04 12:52:00,930 -----------  Configuration  ----------------------
LAUNCH INFO 2025-03-04 12:52:00,930 auto_cluster_config: 0
LAUNCH INFO 2025-03-04 12:52:00,930 auto_parallel_config: None
LAUNCH INFO 2025-03-04 12:52:00,931 auto_tuner_json: None
LAUNCH INFO 2025-03-04 12:52:00,931 devices: 3,4
LAUNCH INFO 2025-03-04 12:52:00,931 elastic_level: -1
LAUNCH INFO 2025-03-04 12:52:00,931 elastic_timeout: 30
LAUNCH INFO 2025-03-04 12:52:00,931 enable_gpu_log: True
LAUNCH INFO 2025-03-04 12:52:00,931 gloo_port: 6767
LAUNCH INFO 2025-03-04 12:52:00,931 host: None
LAUNCH INFO 2025-03-04 12:52:00,931 ips: None
LAUNCH INFO 2025-03-04 12:52:00,931 job_id: default
LAUNCH INFO 2025-03-04 12:52:00,931 legacy: False
LAUNCH INFO 2025-03-04 12:52:00,931 log_dir: log
LAUNCH INFO 2025-03-04 12:52:00,931 log_level: INFO
LAUNCH INFO 2025-03-04 12:52:00,931 log_overwrite: False
LAUNCH INFO 2025-03-04 12:52:00,931 master: None
LAUNCH INFO 2025-03-04 12:52:00,931 max_restart: 3
LAUNCH INFO 2025-03-04 12:52:00,931 nnodes: 1
LAUNCH INFO 2025-03-04 12:52:00,931 nproc_per_node: None
LAUNCH INFO 2025-03-04 12:52:00,932 rank: -1
LAUNCH INFO 2025-03-04 12:52:00,932 run_mode: collective
LAUNCH INFO 2025-03-04 12:52:00,932 server_num: None
LAUNCH INFO 2025-03-04 12:52:00,932 servers: 
LAUNCH INFO 2025-03-04 12:52:00,932 sort_ip: False
LAUNCH INFO 2025-03-04 12:52:00,932 start_port: 6070
LAUNCH INFO 2025-03-04 12:52:00,932 trainer_num: None
LAUNCH INFO 2025-03-04 12:52:00,932 trainers: 
LAUNCH INFO 2025-03-04 12:52:00,932 training_script: test_pp.py
LAUNCH INFO 2025-03-04 12:52:00,932 training_script_args: []
LAUNCH INFO 2025-03-04 12:52:00,932 with_gloo: 1
LAUNCH INFO 2025-03-04 12:52:00,932 --------------------------------------------------
LAUNCH INFO 2025-03-04 12:52:00,933 Job: default, mode collective, replicas 1[1:1], elastic False
LAUNCH INFO 2025-03-04 12:52:00,937 Run Pod: dzutfz, replicas 2, status ready
LAUNCH INFO 2025-03-04 12:52:01,005 Watching Pod: dzutfz, replicas 2, status running
grep: warning: GREP_OPTIONS is deprecated; please use an alias or script
W0304 12:52:03.075841 38336 gpu_resources.cc:119] Please NOTE: device: 3, GPU Compute Capability: 8.0, Driver API Version: 12.3, Runtime API Version: 12.3
W0304 12:52:03.076944 38336 gpu_resources.cc:164] device: 3, cuDNN Version: 9.1.

step 0 : x Place(gpu:3) None None
step 0 : self.w0 Place(gpu:3) [Replicate()] {shape: [1], process_ids: [3], dim_names: [x]}

step 1 : x Place(gpu:3) [Replicate()] {shape: [1], process_ids: [3], dim_names: [x]}
step 1 : y Place(gpu:3) [Replicate()] {shape: [1], process_ids: [3], dim_names: [x]}
step 1 : self.w0 Place(gpu:3) [Replicate()] {shape: [1], process_ids: [3], dim_names: [x]}
GPU:3 memory consumed at this point: 0.16 GB
GPU:4 memory consumed at this point: 0.00 GB

step 2 : x Place(gpu:3) [Replicate()] {shape: [1], process_ids: [3], dim_names: [x]}
step 2 : y Place(gpu:3) [Replicate()] {shape: [1], process_ids: [4], dim_names: [x]}
step 2 : self.w1 Place(gpu:3) [Replicate()] {shape: [1], process_ids: [4], dim_names: [x]}
GPU:3 memory consumed at this point: 0.16 GB
GPU:4 memory consumed at this point: 0.00 GB

step 3 : x Place(gpu:3) [Replicate()] {shape: [1], process_ids: [3], dim_names: [x]}
step 3 : y Place(gpu:3) [Replicate()] {shape: [1], process_ids: [4], dim_names: [x]}
step 3 : z Place(gpu:3) [Replicate()] {shape: [1], process_ids: [4], dim_names: [x]}
step 3 : self.w1 Place(gpu:3) [Replicate()] {shape: [1], process_ids: [4], dim_names: [x]}
LAUNCH INFO 2025-03-04 12:52:04,009 Pod completed
LAUNCH INFO 2025-03-04 12:52:04,009 Exit code 0
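
For reference, a minimal hypothetical variant of the two stage meshes, under the assumption that the process ids in dist.ProcessMesh refer to the launcher's logical ranks (0 and 1 for a two-process job) rather than physical GPU indices, with --device=3,4 mapping rank 0 to GPU 3 and rank 1 to GPU 4; whether this matches the intended semantics is an assumption, not a confirmed fix:

# Hypothetical sketch (assumption): process ids are launcher ranks, not physical GPU indices
import paddle.distributed as dist
mesh0 = dist.ProcessMesh([0], dim_names=['x'])  # pipeline stage 0 (rank 0, launched on GPU 3)
mesh1 = dist.ProcessMesh([1], dim_names=['x'])  # pipeline stage 1 (rank 1, launched on GPU 4)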
@wangguan1995 wangguan1995 changed the title from "Pipeline parallelism on 2 GPUs: abnormal GPU memory allocation" to "Pipeline parallelism on 2 GPUs: all GPU memory accumulates on card 1" Mar 4, 2025
wangguan1995 (Author) commented
The terminal only prints the GPU memory usage of one of the cards.
The memory usage of each individual card has to be checked in

./log/workerlog.0
./log/workerlog.1
./log/workerlog.2
...

instead.

# Launch command:
# Multi-card: python3 -m paddle.distributed.launch --device=0,2 test_pp.py
# Single-card: python3 test_pp.py

import paddle
import paddle.distributed as dist
mesh0 = dist.ProcessMesh([3], dim_names=['x'])
mesh1 = dist.ProcessMesh([4], dim_names=['x'])

class MLP_PP(paddle.nn.Layer):
    def __init__(self):
        super(MLP_PP, self).__init__()
        self.w0 = dist.shard_tensor(
                    self.create_parameter(shape=[1024, 40960]),
                    mesh0, [dist.Replicate()])
        self.w1 = dist.shard_tensor(
                    self.create_parameter(shape=[40960, 1024]),
                    mesh1, [dist.Replicate()])

    def forward(self, x):
        card1 = 0
        card2 = 2
        print("\nstep 0 : x", x.place, x.placements, x.process_mesh)
        print("step 0 : self.w0", self.w0.place, self.w0.placements, self.w0.process_mesh)
        y = paddle.matmul(x, self.w0)
        print(f"代码执行到此处时,消耗的GPU:{card1}显存:{paddle.device.cuda.max_memory_allocated(card1) / (1024 ** 3):.2f} GB")
        print(f"代码执行到此处时,消耗的GPU:{card2}显存:{paddle.device.cuda.max_memory_allocated(card2) / (1024 ** 3):.2f} GB")
        print("\nstep 1 : x", x.place, x.placements, x.process_mesh)
        print("step 1 : y", y.place, y.placements, y.process_mesh)
        print("step 1 : self.w0", self.w0.place, self.w0.placements, self.w0.process_mesh)
        # Reshard: transfer the intermediate result from stage0 to stage1
        y = dist.reshard(y, mesh1, [dist.Replicate()])
        print(f"代码执行到此处时,消耗的GPU:{card1}显存:{paddle.device.cuda.max_memory_allocated(card1) / (1024 ** 3):.2f} GB")
        print(f"代码执行到此处时,消耗的GPU:{card2}显存:{paddle.device.cuda.max_memory_allocated(card2) / (1024 ** 3):.2f} GB")
        print("\nstep 2 : x", x.place, x.placements, x.process_mesh)
        print("step 2 : y", y.place, y.placements, y.process_mesh)
        print("step 2 : self.w1", self.w1.place, self.w1.placements, self.w1.process_mesh)
        z = paddle.matmul(y, self.w1)
        print(f"代码执行到此处时,消耗的GPU:{card1}显存:{paddle.device.cuda.max_memory_allocated(card1) / (1024 ** 3):.2f} GB")
        print(f"代码执行到此处时,消耗的GPU:{card2}显存:{paddle.device.cuda.max_memory_allocated(card2) / (1024 ** 3):.2f} GB")
        print("\nstep 3 : x", x.place, x.placements, x.process_mesh)
        print("step 3 : y", y.place, y.placements, y.process_mesh)
        print("step 3 : z", z.place, z.placements, z.process_mesh)
        print("step 3 : self.w1", self.w1.place, self.w1.placements, self.w1.process_mesh)
        return z

class MLP(paddle.nn.Layer):
    def __init__(self):
        super(MLP, self).__init__()
        self.w0 = self.create_parameter(shape=[1024, 40960])
        self.w1 = self.create_parameter(shape=[40960, 1024])

    def forward(self, x):
        card1 = 0
        card2 = 2
        print("\nstep 0 : x", x.place, x.placements, x.process_mesh)
        print("step 0 : self.w0", self.w0.place, self.w0.placements, self.w0.process_mesh)
        y = paddle.matmul(x, self.w0)
        print(f"代码执行到此处时,消耗的GPU:{card1}显存:{paddle.device.cuda.max_memory_allocated(card1) / (1024 ** 3):.2f} GB")
        print(f"代码执行到此处时,消耗的GPU:{card2}显存:{paddle.device.cuda.max_memory_allocated(card2) / (1024 ** 3):.2f} GB")
        print("\nstep 1 : x", x.place, x.placements, x.process_mesh)
        print("step 1 : y", y.place, y.placements, y.process_mesh)
        print("step 1 : self.w0", self.w0.place, self.w0.placements, self.w0.process_mesh)
        # Reshard: transfer the intermediate result from stage0 to stage1 (not applicable in the single-card MLP)
        print(f"GPU:{card1} memory consumed at this point: {paddle.device.cuda.max_memory_allocated(card1) / (1024 ** 3):.2f} GB")
        print(f"GPU:{card2} memory consumed at this point: {paddle.device.cuda.max_memory_allocated(card2) / (1024 ** 3):.2f} GB")
        print("\nstep 2 : x", x.place, x.placements, x.process_mesh)
        print("step 2 : y", y.place, y.placements, y.process_mesh)
        print("step 2 : self.w1", self.w1.place, self.w1.placements, self.w1.process_mesh)
        z = paddle.matmul(y, self.w1)
        print(f"代码执行到此处时,消耗的GPU:{card1}显存:{paddle.device.cuda.max_memory_allocated(card1) / (1024 ** 3):.2f} GB")
        print(f"代码执行到此处时,消耗的GPU:{card2}显存:{paddle.device.cuda.max_memory_allocated(card2) / (1024 ** 3):.2f} GB")
        print("\nstep 3 : x", x.place, x.placements, x.process_mesh)
        print("step 3 : y", y.place, y.placements, y.process_mesh)
        print("step 3 : z", z.place, z.placements, z.process_mesh)
        print("step 3 : self.w1", self.w1.place, self.w1.placements, self.w1.process_mesh)
        return z

# Multi-card
# model = MLP_PP()

# Single-card
model = MLP()

for name, layer in model.named_sublayers():
    layer_type = str(type(layer))
    print(name, layer_type)
x = paddle.ones([1024,])
y = model(x)
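
As a minimal sketch of per-rank memory reporting, assuming each launched process only sees the single GPU assigned to it (so querying another card's index from within the process is not meaningful), each rank could print only its own device's peak memory, which then shows up in that rank's ./log/workerlog.* file; the variable names below are illustrative:

import paddle
import paddle.distributed as dist

# Each rank reports only the device it actually runs on.
rank = dist.get_rank()
device = paddle.device.get_device()  # e.g. "gpu:0" inside this process
mem_gb = paddle.device.cuda.max_memory_allocated() / (1024 ** 3)
print(f"rank {rank} on {device}: peak GPU memory {mem_gb:.2f} GB")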
