Federated learning GPU memory overflow issue

Help needed!!!
I'm migrating a federated learning project from torch to mindspore.
The code below leaks GPU memory, but the torch and paddle versions with the same logic do not. Why?
The code is a simplified version of the federated-learning logic; each call to train needs its own model.
Each jump in GPU memory is 1024 MB.
What I have already tried (an extra diagnostic sketch follows the list):

# 1. Tried both execution modes; the result is the same.
mindspore.set_context(device_id=gpu_id, mode=mindspore.GRAPH_MODE) 
mindspore.set_context(device_id=gpu_id, mode=mindspore.PYNATIVE_MODE) 

# 2. Memory reuse mode: neither dynamic nor static helps.
mindspore.runtime.set_memory(optimize_level="O0")

# 3. Deleting variables, forcing gc, and calling mindspore.ms_memory_recycle(); none of this helps.
mindspore.ms_memory_recycle()

# 4. Whether the model in train() is created inside the function or passed in as an argument, it still leaks.

# 5. Resetting all peak stats of the memory pool.
# mindspore.runtime.reset_peak_memory_stats()
for batch_idx, kkk in enumerate(range(1)):
    images = mindspore.Tensor(np.random.rand(1280, 1, 28, 28), mindspore.float32)
    labels = mindspore.Tensor(np.random.randint(0, 10, size=(1280,)), mindspore.int32)
    mem_before = get_gpu_memory_usage(gpu_id)
    # (loss,loss_ce,loss_1_item,loss_pi_item,loss_pa_item,loss_con_item,loss_con_2_item),grads = grad_fn(images, labels)
    # optimizer(grads)
    result = train_net(images, labels)
    mem_after = get_gpu_memory_usage(gpu_id)
    print(f"Epoch {epoch} GPU {gpu_id} - batch_idx {batch_idx} - before: {mem_before} - after: {mem_after} - memory delta: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
# del model,loss_fn,optimizer,grad_fn,images,labels,loss,loss_ce,loss_1_item,loss_pi_item,loss_pa_item,loss_con_item,loss_con_2_item
del model,loss_fn,optimizer,images,labels,train_net
gc.collect()
mindspore.ms_memory_recycle()
mindspore.runtime.reset_peak_memory_stats()
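
On top of attempts 1-5 above, one extra diagnostic I plan to add around the training step is sketched below. It is only a sketch, and it assumes this dev build exposes the torch-style counters mindspore.runtime.memory_allocated() / mindspore.runtime.memory_reserved() (runtime.set_memory() and reset_peak_memory_stats() are already there, so I expect the rest of that family too). The point is to tell whether the 1024 MiB steps come from live tensors or just from the framework's memory pool growing.

# Sketch only: compare NVML's process-level "used" number with MindSpore's own
# allocator counters, to tell live tensors apart from memory-pool growth.
# memory_allocated()/memory_reserved() are an assumption on my part (they mirror
# the torch.cuda API); if this build does not have them, skip this block.
import pynvml
import mindspore

pynvml.nvmlInit()

def report_memory(tag, gpu_id=0):
    handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
    nvml_used = pynvml.nvmlDeviceGetMemoryInfo(handle).used / (1024**2)   # whole process, pool included
    allocated = mindspore.runtime.memory_allocated() / (1024**2)          # bytes currently held by live tensors
    reserved = mindspore.runtime.memory_reserved() / (1024**2)            # bytes grabbed by the memory pool
    print(f"[{tag}] nvml={nvml_used:.0f} MiB  allocated={allocated:.0f} MiB  reserved={reserved:.0f} MiB")

# Intended use inside the batch loop:
#   report_memory("before", gpu_id); result = train_net(images, labels); report_memory("after", gpu_id)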

Does mindspore hold on to (lock) resources? Is there a way to force it to release them? Below are the mindspore, torch, and paddle versions of the code.

Any pointers would be much appreciated!!! How can I fix this GPU memory leak?

mindspore

# mindspore
import os
os.environ['GLOG_v'] = '3'
import numpy as np
import mindspore
import mindspore.nn as nn
import pynvml
import gc
def get_gpu_memory_usage(device_id=0):
    """获取指定GPU的已用显存(单位:MiB)"""
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return mem_info.used / (1024**2)
pynvml.nvmlInit()

class Network(nn.Cell):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.dense_relu_sequential = nn.SequentialCell(
            nn.Dense(28*28, 512),
            nn.ReLU(),
            nn.Dense(512, 512),
            nn.ReLU(),
            nn.Dense(512, 10)
        )

    def construct(self, x):
        x = self.flatten(x)
        logits = self.dense_relu_sequential(x)
        return logits

def train0(epoch=0, gpu_id=0):
    model = Network()
    model.set_train()
    model.set_grad(False)
    
    
    loss_fn = nn.CrossEntropyLoss()
    def forward_fn(data, label):
        logits = model(data)
        loss = loss_fn(logits, label)
        return loss, 0, 0, 0, 0, 0, 0  # zeros stand in for the extra loss terms of the real project
    optimizer = nn.SGD(model.trainable_params(), 1e-2)
    grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters, has_aux=True)
    
    for batch_idx, kkk in enumerate(range(2)):
        images = mindspore.Tensor(np.random.rand(1280, 1, 28, 28), mindspore.float32)
        labels = mindspore.Tensor(np.random.randint(0, 10, size=(1280,)), mindspore.int32)
        mem_before = get_gpu_memory_usage(gpu_id)
        (loss,loss_ce,loss_1_item,loss_pi_item,loss_pa_item,loss_con_item,loss_con_2_item),grads = grad_fn(images, labels)
        optimizer(grads)
        mem_after = get_gpu_memory_usage(gpu_id)
        print(f"Epoch {epoch} GPU {gpu_id} - batch_idx {batch_idx} - before: {mem_before} - after: {mem_after} - 显存变化: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
    del model,loss_fn,optimizer,grad_fn,images,labels,loss,loss_ce,loss_1_item,loss_pi_item,loss_pa_item,loss_con_item,loss_con_2_item
    gc.collect()
    mindspore.ms_memory_recycle()
    
    return


def train(model: nn.Cell, epoch=0, gpu_id=0):
    # The model argument is deliberately ignored: per attempt #4 above, the leak occurs
    # whether the model is created inside the function or passed in from outside.
    model = Network()
    # model.set_train()
    # model.set_grad(False)
    
    
    loss_fn = nn.CrossEntropyLoss()
    def forward_fn(data, label):
        logits = model(data)
        loss = loss_fn(logits, label)
        return loss, 0, 0, 0, 0, 0, 0  # zeros stand in for the extra loss terms of the real project
    optimizer = nn.SGD(model.trainable_params(), 1e-2)
    # grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters, has_aux=True)

    train_net= nn.TrainOneStepCell(nn.WithLossCell(model, loss_fn), optimizer=optimizer)
    train_net.set_train(True)
    for batch_idx, kkk in enumerate(range(2)):
        images = mindspore.Tensor(np.random.rand(1280, 1, 28, 28), mindspore.float32)
        labels = mindspore.Tensor(np.random.randint(0, 10, size=(1280,)), mindspore.int32)
        mem_before = get_gpu_memory_usage(gpu_id)
        # (loss,loss_ce,loss_1_item,loss_pi_item,loss_pa_item,loss_con_item,loss_con_2_item),grads = grad_fn(images, labels)
        # optimizer(grads)
        
        result = train_net(images, labels)

        mem_after = get_gpu_memory_usage(gpu_id)
        print(f"Epoch {epoch} GPU {gpu_id} - batch_idx {batch_idx} - before: {mem_before} - after: {mem_after} - 显存变化: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
    # del model,loss_fn,optimizer,grad_fn,images,labels,loss,loss_ce,loss_1_item,loss_pi_item,loss_pa_item,loss_con_item,loss_con_2_item
    del model,loss_fn,optimizer,images,labels,train_net
    gc.collect()
    mindspore.ms_memory_recycle()
    # mindspore._pynative_executor.clear_resource()
    
    return

gpu_id = 0  # set as needed
# mindspore.set_context(device_id=gpu_id, mode=mindspore.PYNATIVE_MODE) 
mindspore.set_context(device_id=gpu_id, mode=mindspore.GRAPH_MODE) 
mindspore.runtime.set_memory(optimize_level="O0")
# global rounds
# for i in range(100000):
#     # model = Network()
#     # clients training
#     for j in range(10):
#         # train(model, j, gpu_id)
#         train(None,j, gpu_id)
for j in range(10000000):
    train(None,j, gpu_id)

torch


import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pynvml
import gc
def get_gpu_memory_usage(device_id=0):
    """获取指定GPU的已用显存(单位:MiB)"""
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return mem_info.used / (1024**2)
pynvml.nvmlInit()

class Network(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.dense_relu_sequential = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.dense_relu_sequential(x)
        return logits

def train(epoch=0, gpu_id=0):
    device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    model = Network().to(device)
    model.train()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=1e-2)

    for batch_idx, _ in enumerate(range(2)):
        images = torch.tensor(np.random.rand(1280, 1, 28, 28), dtype=torch.float32).to(device)
        labels = torch.tensor(np.random.randint(0, 10, size=(1280,)), dtype=torch.long).to(device)
        mem_before = get_gpu_memory_usage(gpu_id)
        optimizer.zero_grad()
        logits = model(images)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        mem_after = get_gpu_memory_usage(gpu_id)
        print(f"Epoch {epoch} GPU {gpu_id} - batch_idx {batch_idx} - before: {mem_before} - after: {mem_after} - 显存变化: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
    del model, optimizer, loss_fn
    gc.collect()
    return

gpu_id = 1  # set as needed
for j in range(100000):
    train(j, gpu_id)
exit()
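
One note on the measurement in this torch baseline: pynvml reports the whole process's usage, which includes PyTorch's caching allocator, so the number can stay flat even while tensors are created and freed. To make the comparison with mindspore fairer, the allocator's own counters can be logged as well; a minimal sketch using torch.cuda.memory_allocated() / memory_reserved():

# Minimal sketch: log PyTorch's allocator counters next to the NVML number so that
# "bytes in live tensors" and "bytes cached by the allocator" can be told apart.
import torch

def report_torch_memory(tag, gpu_id=0):
    allocated = torch.cuda.memory_allocated(gpu_id) / (1024**2)   # bytes held by live tensors
    reserved = torch.cuda.memory_reserved(gpu_id) / (1024**2)     # bytes cached by the allocator
    print(f"[{tag}] allocated={allocated:.0f} MiB  reserved={reserved:.0f} MiB")

# Intended use around the training step:
#   report_torch_memory("before", gpu_id); loss.backward(); optimizer.step(); report_torch_memory("after", gpu_id)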

paddle


import numpy as np
import paddle
import paddle.nn as nn
import paddle.optimizer as optim
import pynvml
import gc
def get_gpu_memory_usage(device_id=0):
    """获取指定GPU的已用显存(单位:MiB)"""
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return mem_info.used / (1024**2)
pynvml.nvmlInit()

class Network(nn.Layer):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.dense_relu_sequential = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.dense_relu_sequential(x)
        return logits

def train(epoch=0, gpu_id=0):
    model = Network()
    model.train()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.SGD(parameters=model.parameters(), learning_rate=1e-2)

    for batch_idx, _ in enumerate(range(2)):
        images = paddle.to_tensor(np.random.rand(1280, 1, 28, 28), dtype='float32')
        labels = paddle.to_tensor(np.random.randint(0, 10, size=(1280,)), dtype='int64')
        mem_before = get_gpu_memory_usage(gpu_id)
        logits = model(images)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()
        mem_after = get_gpu_memory_usage(gpu_id)
        print(f"Epoch {epoch} GPU {gpu_id} - batch_idx {batch_idx} - before: {mem_before} - after: {mem_after} - 显存变化: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
    del model, optimizer, loss_fn
    gc.collect()
    return

gpu_id = 2  # set as needed
paddle.set_device(f"gpu:{gpu_id}" if paddle.is_compiled_with_cuda() else "cpu")

for j in range(100000):
    train(j, gpu_id)
exit()
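
The same measurement caveat applies to the paddle version. If needed, Paddle's own allocator counters can be logged as well (a sketch, assuming paddle.device.cuda.memory_allocated() / memory_reserved() are available in the installed version):

# Sketch: log Paddle's allocator counters next to the NVML number.
# memory_allocated()/memory_reserved() under paddle.device.cuda are assumed to be
# available in the installed Paddle version.
import paddle

def report_paddle_memory(tag, gpu_id=0):
    allocated = paddle.device.cuda.memory_allocated(gpu_id) / (1024**2)   # bytes held by live tensors
    reserved = paddle.device.cuda.memory_reserved(gpu_id) / (1024**2)     # bytes cached by the allocator
    print(f"[{tag}] allocated={allocated:.0f} MiB  reserved={reserved:.0f} MiB")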

#### Log entries where the memory changes

Epoch 246 GPU 2 - batch_idx 0 - before: 3692.9375 - after: 3692.9375 - memory delta: 0.00 MiB - Tag: False
Epoch 246 GPU 2 - batch_idx 1 - before: 3692.9375 - after: 4716.9375 - memory delta: 1024.00 MiB - Tag: True

Each jump is exactly 1024 MiB, and it happens repeatedly as the run goes on.
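
A quick back-of-the-envelope check of my own (not taken from the logs): no single tensor in this toy workload is anywhere near 1024 MiB, so each jump is far larger than anything the model or one batch could account for by itself.

# Rough size estimate for the toy workload above (fp32 = 4 bytes per element).
batch_bytes = 1280 * 1 * 28 * 28 * 4                                   # one batch of images: ~3.8 MiB
param_count = (28*28*512 + 512) + (512*512 + 512) + (512*10 + 10)      # Dense layer weights + biases
param_bytes = param_count * 4                                          # model parameters: ~2.6 MiB
print(batch_bytes / 2**20, param_bytes / 2**20)                        # both a few MiB, far below the 1024 MiB steps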

Hello, welcome to MindSpore. We have received your question above; please wait patiently for a reply~

Hello, is there any solution yet?

Hi, the information provided so far is not enough to locate the problem. Please provide source code that reproduces the issue, environment information, the environment variables used, the expected result, and the current incorrect result. You can enable full logging with export GLOG_v=0 and export VLOG_v="(12900,12905)"; since the logs can be large, please reproduce the problem with the simplest possible scenario.

Environment information

Device: NVIDIA RTX 5880 Ada
Version: mindspore-dev 2.6.0.dev20250323
Python 3.11
Environment variables: GLOG_v="3"

Source code

Problem: GPU memory leaks until the card runs out of memory and the program cannot finish. Each jump is 1024 MiB, i.e. mem_after - mem_before == 1024.
Expected result: mem_after - mem_before == 0. That is, GPU memory usage should stay constant while the program runs, should not leak or grow every few epochs, and the loop should run to completion.
Comparison with the torch version: the torch version does not leak; mem_after - mem_before == 0, GPU memory stays constant during the run, and the loop finishes.

The full logs are too large to attach; the final error is simply an OOM because no more GPU memory can be allocated.

def debug_mindspore():
    import os
    os.environ['GLOG_v'] = '3'
    import numpy as np
    import mindspore
    import mindspore.nn as nn
    import pynvml
    import gc
    import copy
    gpu_id = 0
    mindspore.set_context(device_id=gpu_id, mode=mindspore.PYNATIVE_MODE) 
    def get_gpu_memory_usage(device_id=0):
        """获取指定GPU的已用显存(单位:MiB)"""
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        return mem_info.used / (1024**2)
    pynvml.nvmlInit()
    for i in range(100000):
        for j in range(10000):
            mem_before = get_gpu_memory_usage(gpu_id)
            model = nn.Dense(10000,1000)
            loss_fn = nn.MSELoss()
            optimizer = nn.SGD(model.trainable_params(), 1e-2)
            def forward_fn(data, label):
                logits = model(data)
                loss = loss_fn(logits, label)
                return loss, logits
            grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters, has_aux=True)
            for batch_idx, kkk in enumerate(range(1)):
                x = mindspore.Tensor(np.random.rand(12800, 10000), mindspore.float32)
                y = mindspore.Tensor(np.random.rand(12800, 1000), mindspore.float32)
                (loss, _), grads = grad_fn(x, y)
                optimizer(grads)
                mem_after = get_gpu_memory_usage(gpu_id)
                break
            print(f"Epoch {i} Client {j} GPU {gpu_id} - before: {mem_before} - after: {mem_after} - 显存变化: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
            if mem_after - mem_before == 1024:
                exit()
debug_mindspore()

Below is the torch version with the same logic as the mindspore code above.

def debug_torch():
    import numpy as np
    import torch
    import torch.nn as nn
    import torch.optim as optim
    import pynvml
    import gc
    import copy
    gpu_id = 0
    device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    def get_gpu_memory_usage(device_id=0):
        """获取指定GPU的已用显存(单位:MiB)"""
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        return mem_info.used / (1024**2)
    pynvml.nvmlInit()
    
    for i in range(100000):
        for j in range(10):
            mem_before = get_gpu_memory_usage(gpu_id)
            model = nn.Linear(10000,1000).to(device)
            loss_fn = nn.MSELoss()
            optimizer = optim.SGD(model.parameters(), 1e-2)
            for batch_idx, kkk in enumerate(range(1)):
                x = torch.from_numpy(np.random.rand(12800, 10000).astype(np.float32)).to(device)
                y = torch.from_numpy(np.random.rand(12800, 1000).astype(np.float32)).to(device)
                logits = model(x)
                loss = loss_fn(logits, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                break
            mem_after = get_gpu_memory_usage(gpu_id)
            print(f"Epoch {i} Client {j} GPU {gpu_id} - batch_idx {batch_idx} - before: {mem_before} - after: {mem_after} - 显存变化: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
            if mem_after - mem_before  == 1024:
                exit()
debug_torch()
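
In the meantime, the workaround I am going to try is sketched below: build the Network, optimizer and TrainOneStepCell once outside the federated loop and only load each client's weights per round, so that no new Cell (and no new graph) is created per train() call. This is only a sketch under my own assumptions about how to swap client weights (parameters_dict() / load_param_into_net()); I have not verified yet whether it avoids the 1024 MiB steps.

# Sketch of a "build once, reuse every round" variant of the mindspore script above.
# Whether this avoids the 1024 MiB growth is unverified; the structure is my assumption,
# not a confirmed fix.
import numpy as np
import mindspore
import mindspore.nn as nn

model = Network()                       # the same Network class as in the mindspore script above
loss_fn = nn.CrossEntropyLoss()
optimizer = nn.SGD(model.trainable_params(), 1e-2)
train_net = nn.TrainOneStepCell(nn.WithLossCell(model, loss_fn), optimizer=optimizer)
train_net.set_train(True)

def train_reuse(client_weights=None, epoch=0, gpu_id=0):
    # Load this client's weights into the shared model instead of rebuilding it.
    if client_weights is not None:
        mindspore.load_param_into_net(model, client_weights)
    for batch_idx in range(2):
        images = mindspore.Tensor(np.random.rand(1280, 1, 28, 28), mindspore.float32)
        labels = mindspore.Tensor(np.random.randint(0, 10, size=(1280,)), mindspore.int32)
        train_net(images, labels)
    # Hand the trained weights back for server-side aggregation (a deep copy may be
    # needed here depending on how aggregation is implemented).
    return model.parameters_dict()

# for j in range(10):
#     weights = train_reuse(None, j, gpu_id)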