Federated learning GPU memory overflow problem

Help!!!!
I have recently been migrating a torch federated learning project to mindspore.
The code below leaks GPU memory, but the torch and paddle versions with the same logic do not. Why?
The code is a simplified version of the federated learning logic; each call to train needs its own model.
Each jump in GPU memory is 1024 MB.
Changes I have already tried:

# 1. Tried both execution modes; the behaviour is the same.
mindspore.set_context(device_id=gpu_id, mode=mindspore.GRAPH_MODE) 
mindspore.set_context(device_id=gpu_id, mode=mindspore.PYNATIVE_MODE) 

# 2. Memory-reuse mode; neither dynamic nor static helps.
mindspore.runtime.set_memory(optimize_level="O0")

# 3. Deleting variables, forcing gc, and calling mindspore.ms_memory_recycle(); no effect either.
mindspore.ms_memory_recycle()

# 4. Whether the model in train() is created inside the method or passed in as an argument, the memory still overflows/leaks.

# 5. Resetting all peak values in the memory pool.
# mindspore.runtime.reset_peak_memory_stats()
for batch_idx, kkk in enumerate(range(1)):
    images = mindspore.Tensor(np.random.rand(1280, 1, 28, 28), mindspore.float32)
    labels = mindspore.Tensor(np.random.randint(0, 10, size=(1280,)), mindspore.int32)
    mem_before = get_gpu_memory_usage(gpu_id)
    # (loss,loss_ce,loss_1_item,loss_pi_item,loss_pa_item,loss_con_item,loss_con_2_item),grads = grad_fn(images, labels)
    # optimizer(grads)
    result = train_net(images, labels)
    mem_after = get_gpu_memory_usage(gpu_id)
    print(f"Epoch {epoch} GPU {gpu_id} - batch_idx {batch_idx} - before: {mem_before} - after: {mem_after} - memory delta: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
del model, loss_fn, optimizer, images, labels, train_net
gc.collect()
mindspore.ms_memory_recycle()
mindspore.runtime.reset_peak_memory_stats()

Does mindspore lock these resources somehow? Is there a way to force it to release them? Below are the mindspore, torch, and paddle versions of the code.

Any pointers would be much appreciated!!! How can I fix this GPU memory leak?

mindspore

# mindspore
import os
os.environ['GLOG_v'] = '3'
import numpy as np
import mindspore
import mindspore.nn as nn
import pynvml
import gc
def get_gpu_memory_usage(device_id=0):
    """获取指定GPU的已用显存(单位:MiB)"""
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return mem_info.used / (1024**2)
pynvml.nvmlInit()

class Network(nn.Cell):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.dense_relu_sequential = nn.SequentialCell(
            nn.Dense(28*28, 512),
            nn.ReLU(),
            nn.Dense(512, 512),
            nn.ReLU(),
            nn.Dense(512, 10)
        )

    def construct(self, x):
        x = self.flatten(x)
        logits = self.dense_relu_sequential(x)
        return logits

def train0(epoch=0, gpu_id=0):
    model = Network()
    model.set_train()
    model.set_grad(False)
    
    
    loss_fn = nn.CrossEntropyLoss()
    def forward_fn(data, label):
        logits = model(data)
        loss = loss_fn(logits, label)
        return loss, 0, 0, 0, 0, 0, 0  # placeholder zeros for the project's auxiliary loss terms
        # return loss
    optimizer = nn.SGD(model.trainable_params(), 1e-2)
    grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters, has_aux=True)
    
    for batch_idx, kkk in enumerate(range(2)):
        images = mindspore.Tensor(np.random.rand(1280, 1, 28, 28), mindspore.float32)
        labels = mindspore.Tensor(np.random.randint(0, 10, size=(1280,)), mindspore.int32)
        mem_before = get_gpu_memory_usage(gpu_id)
        (loss,loss_ce,loss_1_item,loss_pi_item,loss_pa_item,loss_con_item,loss_con_2_item),grads = grad_fn(images, labels)
        optimizer(grads)
        mem_after = get_gpu_memory_usage(gpu_id)
        print(f"Epoch {epoch} GPU {gpu_id} - batch_idx {batch_idx} - before: {mem_before} - after: {mem_after} - 显存变化: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
    del model,loss_fn,optimizer,grad_fn,images,labels,loss,loss_ce,loss_1_item,loss_pi_item,loss_pa_item,loss_con_item,loss_con_2_item
    gc.collect()
    mindspore.ms_memory_recycle()
    
    return


def train(model:nn.Cell,epoch=0, gpu_id=0):
    model = Network()
    # model.set_train()
    # model.set_grad(False)
    
    
    loss_fn = nn.CrossEntropyLoss()
    def forward_fn(data, label):
        logits = model(data)
        loss = loss_fn(logits, label)
        return loss, 0, 0, 0, 0, 0, 0  # placeholder zeros for the project's auxiliary loss terms
        # return loss
    optimizer = nn.SGD(model.trainable_params(), 1e-2)
    # grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters, has_aux=True)

    train_net= nn.TrainOneStepCell(nn.WithLossCell(model, loss_fn), optimizer=optimizer)
    train_net.set_train(True)
    for batch_idx, kkk in enumerate(range(2)):
        images = mindspore.Tensor(np.random.rand(1280, 1, 28, 28), mindspore.float32)
        labels = mindspore.Tensor(np.random.randint(0, 10, size=(1280,)), mindspore.int32)
        mem_before = get_gpu_memory_usage(gpu_id)
        # (loss,loss_ce,loss_1_item,loss_pi_item,loss_pa_item,loss_con_item,loss_con_2_item),grads = grad_fn(images, labels)
        # optimizer(grads)
        
        result = train_net(images, labels)

        mem_after = get_gpu_memory_usage(gpu_id)
        print(f"Epoch {epoch} GPU {gpu_id} - batch_idx {batch_idx} - before: {mem_before} - after: {mem_after} - 显存变化: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
    # del model,loss_fn,optimizer,grad_fn,images,labels,loss,loss_ce,loss_1_item,loss_pi_item,loss_pa_item,loss_con_item,loss_con_2_item
    del model,loss_fn,optimizer,images,labels,train_net
    gc.collect()
    mindspore.ms_memory_recycle()
    # mindspore._pynative_executor.clear_resource()
    
    return

gpu_id = 0  # set as needed
# mindspore.set_context(device_id=gpu_id, mode=mindspore.PYNATIVE_MODE) 
mindspore.set_context(device_id=gpu_id, mode=mindspore.GRAPH_MODE) 
mindspore.runtime.set_memory(optimize_level="O0")
# global rounds
# for i in range(100000):
#     # model = Network()
#     # clients training
#     for j in range(10):
#         # train(model, j, gpu_id)
#         train(None,j, gpu_id)
for j in range(10000000):
    train(None,j, gpu_id)

torch


import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pynvml
import gc
def get_gpu_memory_usage(device_id=0):
    """获取指定GPU的已用显存(单位:MiB)"""
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return mem_info.used / (1024**2)
pynvml.nvmlInit()

class Network(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.dense_relu_sequential = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.dense_relu_sequential(x)
        return logits

def train(epoch=0, gpu_id=0):
    device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    model = Network().to(device)
    model.train()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=1e-2)

    for batch_idx, _ in enumerate(range(2)):
        images = torch.tensor(np.random.rand(1280, 1, 28, 28), dtype=torch.float32).to(device)
        labels = torch.tensor(np.random.randint(0, 10, size=(1280,)), dtype=torch.long).to(device)
        mem_before = get_gpu_memory_usage(gpu_id)
        optimizer.zero_grad()
        logits = model(images)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        mem_after = get_gpu_memory_usage(gpu_id)
        print(f"Epoch {epoch} GPU {gpu_id} - batch_idx {batch_idx} - before: {mem_before} - after: {mem_after} - 显存变化: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
    del model, optimizer, loss_fn
    gc.collect()
    return

gpu_id = 1  # set as needed
for j in range(100000):
    train(j, gpu_id)
exit()

paddle


import numpy as np
import paddle
import paddle.nn as nn
import paddle.optimizer as optim
import pynvml
import gc
def get_gpu_memory_usage(device_id=0):
    """获取指定GPU的已用显存(单位:MiB)"""
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return mem_info.used / (1024**2)
pynvml.nvmlInit()

class Network(nn.Layer):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.dense_relu_sequential = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.dense_relu_sequential(x)
        return logits

def train(epoch=0, gpu_id=0):
    model = Network()
    model.train()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.SGD(parameters=model.parameters(), learning_rate=1e-2)

    for batch_idx, _ in enumerate(range(2)):
        images = paddle.to_tensor(np.random.rand(1280, 1, 28, 28), dtype='float32')
        labels = paddle.to_tensor(np.random.randint(0, 10, size=(1280,)), dtype='int64')
        mem_before = get_gpu_memory_usage(gpu_id)
        logits = model(images)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()
        mem_after = get_gpu_memory_usage(gpu_id)
        print(f"Epoch {epoch} GPU {gpu_id} - batch_idx {batch_idx} - before: {mem_before} - after: {mem_after} - 显存变化: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
    del model, optimizer, loss_fn
    gc.collect()
    return

gpu_id = 2  # set as needed
paddle.set_device(f"gpu:{gpu_id}" if paddle.is_compiled_with_cuda() else "cpu")

for j in range(100000):
    train(j, gpu_id)
exit()

#### Log entries where the GPU memory change occurs

Epoch 246 GPU 2 - batch_idx 0 - before: 3692.9375 - after: 3692.9375 - memory delta: 0.00 MiB - Tag: False
Epoch 246 GPU 2 - batch_idx 1 - before: 3692.9375 - after: 4716.9375 - memory delta: 1024.00 MiB - Tag: True

Each jump is exactly 1024 MB, and further jumps keep happening as the run continues.

Hello, welcome to MindSpore. We have received your question above; please be patient while waiting for a reply~

Hello, is there a solution for this yet?

Hi, the problem cannot be located from the information provided so far. Please provide source code that reproduces the problem, environment information, the environment variables you use, the expected behaviour of the code, and the faulty behaviour you currently see. You can enable full logging with export GLOG_v=0; export VLOG_v="(12900,12905)". Since the logs may be large, please reproduce the problem with the simplest scenario you can.
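(For reference, the same switches can also be set from Python as long as this happens before mindspore is imported; this is just a sketch of the Python equivalent of the export commands above:)

# Python equivalent of the export commands above; must run before `import mindspore`.
import os
os.environ['GLOG_v'] = '0'               # 0 = DEBUG, the most verbose GLOG level
os.environ['VLOG_v'] = '(12900,12905)'   # the verbose-log tags requested above
import mindspore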

Environment information

Device: NVIDIA 5880 Ada
Version: mindspore-dev 2.6.0.dev20250323
Python 3.11
Environment variable: GLOG_v="3"

Source code

Problem: GPU memory leaks and eventually runs out, so the program cannot finish. Each jump is 1024 MB, i.e. mem_after - mem_before = 1024.
Expected result: mem_after - mem_before == 0. That is, GPU memory usage does not change while the program runs, nothing leaks, usage does not grow every few epochs, and the loop can run to completion.
Comparison with the torch version: the torch version does not leak, i.e. mem_after - mem_before == 0; GPU memory usage stays constant while the code runs, and the loop completes.

The full logs are far too large; the final error is simply an OOM once there is no GPU memory left to allocate.

def debug_mindspore():
    import os
    os.environ['GLOG_v'] = '3'
    import numpy as np
    import mindspore
    import mindspore.nn as nn
    import pynvml
    import gc
    import copy
    gpu_id = 0
    mindspore.set_context(device_id=gpu_id, mode=mindspore.PYNATIVE_MODE) 
    def get_gpu_memory_usage(device_id=0):
        """获取指定GPU的已用显存(单位:MiB)"""
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        return mem_info.used / (1024**2)
    pynvml.nvmlInit()
    for i in range(100000):
        for j in range(10000):
            mem_before = get_gpu_memory_usage(gpu_id)
            model = nn.Dense(10000,1000)
            loss_fn = nn.MSELoss()
            optimizer = nn.SGD(model.trainable_params(), 1e-2)
            def forward_fn(data, label):
                logits = model(data)
                loss = loss_fn(logits, label)
                return loss, logits
            grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters, has_aux=True)
            for batch_idx, kkk in enumerate(range(1)):
                x = mindspore.Tensor(np.random.rand(12800, 10000), mindspore.float32)
                y = mindspore.Tensor(np.random.rand(12800, 1000), mindspore.float32)
                (loss, _), grads = grad_fn(x, y)
                optimizer(grads)
                mem_after = get_gpu_memory_usage(gpu_id)
                break
            print(f"Epoch {i} Client {j} GPU {gpu_id} - before: {mem_before} - after: {mem_after} - 显存变化: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
            if mem_after - mem_before == 1024:
                exit()
debug_mindspore()

Below is a torch version with the same logic as the mindspore code above.

def debug_torch():
    import numpy as np
    import torch
    import torch.nn as nn
    import torch.optim as optim
    import pynvml
    import gc
    import copy
    gpu_id = 0
    device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    def get_gpu_memory_usage(device_id=0):
        """获取指定GPU的已用显存(单位:MiB)"""
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        return mem_info.used / (1024**2)
    pynvml.nvmlInit()
    
    for i in range(100000):
        for j in range(10):
            mem_before = get_gpu_memory_usage(gpu_id)
            model = nn.Linear(10000,1000).to(device)
            loss_fn = nn.MSELoss()
            optimizer = optim.SGD(model.parameters(), 1e-2)
            for batch_idx, kkk in enumerate(range(1)):
                x = torch.from_numpy(np.random.rand(12800, 10000).astype(np.float32)).to(device)
                y = torch.from_numpy(np.random.rand(12800, 1000).astype(np.float32)).to(device)
                logits = model(x)
                loss = loss_fn(logits, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                break
            mem_after = get_gpu_memory_usage(gpu_id)
            print(f"Epoch {i} Client {j} GPU {gpu_id} - batch_idx {batch_idx} - before: {mem_before} - after: {mem_after} - 显存变化: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
            if mem_after - mem_before  == 1024:
                exit()
debug_torch()

I understand your problem now: you mean that after each for-loop iteration mindspore should not keep holding any device memory (i.e. the code below).

...
            mem_before = get_gpu_memory_usage(gpu_id)
            model = nn.Dense(10000,1000)
            loss_fn = nn.MSELoss()
            optimizer = nn.SGD(model.trainable_params(), 1e-2)
            def forward_fn(data, label):
                logits = model(data)
                loss = loss_fn(logits, label)
                return loss, logits
            grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters, has_aux=True)
            for batch_idx, kkk in enumerate(range(1)):
                x = mindspore.Tensor(np.random.rand(12800, 10000), mindspore.float32)
                y = mindspore.Tensor(np.random.rand(12800, 1000), mindspore.float32)
                (loss, _), grads = grad_fn(x, y)
                optimizer(grads)
                mem_after = get_gpu_memory_usage(gpu_id)
                break
...

You can follow these steps to help locate the problem:

  1. Use mindspore.runtime.memory_allocated instead of get_gpu_memory_usage to measure how much memory MindSpore currently has in use, and print the memory statistics with mindspore.runtime.memory_stats.
  2. Reduce the loop counts to the minimum that still reproduces the problem; for example, if one iteration is enough to reproduce it, set both the i and j loop counts to 1. Judging from the code you should not need both the i and j loops; a single loop should reproduce the problem.
  3. Before running the Python script, enable INFO logging with export VLOG_v="(12900,12905)"; export GLOG_v=1, and upload the logs to the platform. We will continue the analysis based on the logs you upload.

Yes, my requirement is that after each for-loop iteration mindspore should not hold any GPU memory.
I changed the code following your suggestions so the problem is easier to reproduce. Specifically:

  1. Used mindspore.runtime.memory_allocated() to get the amount of memory MindSpore currently has in use
  2. Used mindspore.runtime.memory_stats() to get the current memory usage statistics
  3. Reduced it to a single loop with the minimum count that reproduces the issue, which is 3
  4. Set the environment variables os.environ['GLOG_v'] = '1' and os.environ['VLOG_v'] = '(12900,12905)'
Here is the minimal reproducible code:

def debug_mindspore():
    import os
    os.environ['GLOG_v'] = '1'
    os.environ['VLOG_v'] = '(12900,12905)'
    import numpy as np
    import mindspore
    import mindspore.nn as nn
    mindspore.set_context(device_id=0, mode=mindspore.PYNATIVE_MODE) 
    for j in range(3):
        mem_before = mindspore.runtime.memory_allocated()
        model = nn.Dense(10000,10000)
        loss_fn = nn.MSELoss()
        optimizer = nn.SGD(model.trainable_params(), 1e-2)
        def forward_fn(data, label):
            logits = model(data)
            loss = loss_fn(logits, label)
            return loss, logits
        grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters, has_aux=True)
        x = mindspore.Tensor(np.random.rand(12800, 10000), mindspore.float32)
        y = mindspore.Tensor(np.random.rand(12800, 10000), mindspore.float32)
        (loss, _), grads = grad_fn(x, y)
        optimizer(grads)
        mem_after = mindspore.runtime.memory_allocated()
        print(f"Epoch {j} before: {mem_before} - after: {mem_after} - 显存变化: {mem_after - mem_before:.2f}  - Tag: {mem_after - mem_before>0}")
        print("memory_stats: ")
        for k, v in mindspore.runtime.memory_stats().items():
            print(f"  {k}: {v}")
        print("="*75,"\n")
debug_mindspore()

The log is attached.
ms_debug.zip (54.5 KB)

The output printed by the program is as follows:

Epoch 0 before: 0 - after: 3136167936 - memory delta: 3136167936.00  - Tag: True
memory_stats: 
  total_reserved_memory: 5368709120
  total_allocated_memory: 3136167936
  total_idle_memory: 2232541184
  total_eager_free_memory: 0
  max_reserved_memory: 5368709120
  max_allocated_memory: 4384128512
  commom_mem_pool_stats: {'block_unit_size': 1073741824, 'block_counts': 4, 'blocks_info': {<capsule object NULL at 0x7febad6216b0>: {'block_stream_id': 0, 'block_memory_size': 1073741824}, <capsule object NULL at 0x7febad621380>: {'block_stream_id': 0, 'block_memory_size': 1073741824}, <capsule object NULL at 0x7febad620660>: {'block_stream_id': 0, 'block_memory_size': 1073741824}, <capsule object NULL at 0x7febad620570>: {'block_stream_id': 0, 'block_memory_size': 1073741824}}}
  persistent_mem_pool_stats: {'block_counts': 1, 'block_unit_size': 1073741824, 'blocks_info': {<capsule object NULL at 0x7febad620a20>: {'block_stream_id': 0, 'block_memory_size': 1073741824}}}
=========================================================================== 

Epoch 1 before: 3136167936 - after: 4336292352 - memory delta: 1200124416.00  - Tag: True
memory_stats: 
  total_reserved_memory: 7516192768
  total_allocated_memory: 4336292352
  total_idle_memory: 3179900416
  total_eager_free_memory: 0
  max_reserved_memory: 7516192768
  max_allocated_memory: 6096293888
  commom_mem_pool_stats: {'block_unit_size': 1073741824, 'block_counts': 6, 'blocks_info': {<capsule object NULL at 0x7febad799410>: {'block_stream_id': 0, 'block_memory_size': 1073741824}, <capsule object NULL at 0x7febad620120>: {'block_stream_id': 0, 'block_memory_size': 1073741824}, <capsule object NULL at 0x7febad620060>: {'block_stream_id': 0, 'block_memory_size': 1073741824}, <capsule object NULL at 0x7febad6201e0>: {'block_stream_id': 0, 'block_memory_size': 1073741824}, <capsule object NULL at 0x7febad6201b0>: {'block_stream_id': 0, 'block_memory_size': 1073741824}, <capsule object NULL at 0x7febad620240>: {'block_stream_id': 0, 'block_memory_size': 1073741824}}}
  persistent_mem_pool_stats: {'block_counts': 1, 'block_unit_size': 1073741824, 'blocks_info': {<capsule object NULL at 0x7febad620090>: {'block_stream_id': 0, 'block_memory_size': 1073741824}}}
=========================================================================== 

Epoch 2 before: 4336292352 - after: 5536416768 - memory delta: 1200124416.00  - Tag: True
memory_stats: 
  total_reserved_memory: 8589934592
  total_allocated_memory: 5536416768
  total_idle_memory: 3053517824
  total_eager_free_memory: 0
  max_reserved_memory: 8589934592
  max_allocated_memory: 7296418304
  commom_mem_pool_stats: {'block_unit_size': 1073741824, 'block_counts': 7, 'blocks_info': {<capsule object NULL at 0x7febad6204b0>: {'block_stream_id': 0, 'block_memory_size': 1073741824}, <capsule object NULL at 0x7febad620480>: {'block_stream_id': 0, 'block_memory_size': 1073741824}, <capsule object NULL at 0x7febad620f60>: {'block_stream_id': 0, 'block_memory_size': 1073741824}, <capsule object NULL at 0x7febad622d00>: {'block_stream_id': 0, 'block_memory_size': 1073741824}, <capsule object NULL at 0x7febad620450>: {'block_stream_id': 0, 'block_memory_size': 1073741824}, <capsule object NULL at 0x7febad620540>: {'block_stream_id': 0, 'block_memory_size': 1073741824}, <capsule object NULL at 0x7febad6205a0>: {'block_stream_id': 0, 'block_memory_size': 1073741824}}}
  persistent_mem_pool_stats: {'block_counts': 1, 'block_unit_size': 1073741824, 'blocks_info': {<capsule object NULL at 0x7febad621650>: {'block_stream_id': 0, 'block_memory_size': 1073741824}}}
=========================================================================== 

This behaviour has been reproduced on Ascend hardware, but locating and analysing the cause will take some time. Please be patient; we will share any progress as soon as we have it, thanks.

OK

We have identified the root cause: the memory growth comes from resources that are not released after each loop iteration in this code. Whether this counts as a defect is still being investigated; there should be a conclusion within about a week.

Hi, the problem has been traced to the fact that a new optimizer instance is created in every loop iteration. Moving that line out of the for loop resolves the memory growth in the minimal reproducible code.
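For reference, a minimal sketch of that restructuring applied to the repro script above (same shapes and calls as the reporter's code, only the construction of model / loss_fn / optimizer / grad_fn hoisted out of the loop; not separately verified here):

# Sketch: same workload as the minimal repro, but model/loss_fn/optimizer/grad_fn
# are built once, outside the loop, per the suggestion above.
import numpy as np
import mindspore
import mindspore.nn as nn

mindspore.set_context(device_id=0, mode=mindspore.PYNATIVE_MODE)

model = nn.Dense(10000, 10000)
loss_fn = nn.MSELoss()
optimizer = nn.SGD(model.trainable_params(), 1e-2)

def forward_fn(data, label):
    logits = model(data)
    loss = loss_fn(logits, label)
    return loss, logits

grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters, has_aux=True)

for j in range(3):
    mem_before = mindspore.runtime.memory_allocated()
    x = mindspore.Tensor(np.random.rand(12800, 10000), mindspore.float32)
    y = mindspore.Tensor(np.random.rand(12800, 10000), mindspore.float32)
    (loss, _), grads = grad_fn(x, y)
    optimizer(grads)
    mem_after = mindspore.runtime.memory_allocated()
    print(f"Epoch {j} before: {mem_before} - after: {mem_after} - delta: {mem_after - mem_before}")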

Could you explain why your example needs to create a new model and optimizer in every iteration?

I am migrating a federated learning project. In the original project, each round passes the updated model into the loop for training. I could rewrite the logic and move the clients outside the loop, but after that change the accuracy no longer matches the original project, so I would like to keep the changes to the original project's logic as small as possible.

Is there a way to remove the memory leak caused by the optimizer inside the loop, without changing the loop logic?

At the moment there is no way to solve this without modifying the code; we still recommend moving the model and optimizer definitions outside the loop.
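One possible compromise, sketched below (not something verified in this thread; Network is the class defined in the scripts above, and the aggregation step is only a placeholder), is to build the model, optimizer and TrainOneStepCell once and push the aggregated weights into the existing model each round with mindspore.load_param_into_net, so the per-round training body stays almost unchanged:

# Sketch: reuse one model/optimizer/train_net across federated rounds.
# load_param_into_net and parameters_dict are standard MindSpore APIs;
# how the global weights are aggregated across clients is out of scope here.
import numpy as np
import mindspore
import mindspore.nn as nn

model = Network()                      # Network as defined above, built once
loss_fn = nn.CrossEntropyLoss()
optimizer = nn.SGD(model.trainable_params(), 1e-2)
train_net = nn.TrainOneStepCell(nn.WithLossCell(model, loss_fn), optimizer=optimizer)
train_net.set_train(True)

def train(global_params, epoch=0, gpu_id=0):
    # Instead of `model = Network()` inside the round, overwrite the reused
    # model's parameters with the server's aggregated weights.
    mindspore.load_param_into_net(model, global_params)
    for batch_idx in range(2):
        images = mindspore.Tensor(np.random.rand(1280, 1, 28, 28), mindspore.float32)
        labels = mindspore.Tensor(np.random.randint(0, 10, size=(1280,)), mindspore.int32)
        train_net(images, labels)
    # Hand back copies of the updated weights for server-side aggregation.
    return {k: v.clone() for k, v in model.parameters_dict().items()}

global_params = {k: v.clone() for k, v in model.parameters_dict().items()}
for j in range(10):
    local_params = train(global_params, epoch=j)
    global_params = local_params       # placeholder for real cross-client aggregation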

@xiuguangli Hello, the MindSpore support engineer has analysed the problem and given its cause. Since there has been no reply from you for quite a while, the moderator will mark the answer as accepted and close the thread. If you have further questions, please open a new topic. Thanks for your support~
