Help!!!!
I've recently been porting a federated-learning project from torch to mindspore.
The code below leaks GPU memory, yet the torch and paddle versions with the same logic do not. Why?
The code is a simplified version of the federated-learning logic; each call to train needs its own model.
The GPU memory grows by exactly 1024 MB each time.
Modifications I've already tried:
# 1. Set both execution modes; the result is the same either way.
mindspore.set_context(device_id=gpu_id, mode=mindspore.GRAPH_MODE)
mindspore.set_context(device_id=gpu_id, mode=mindspore.PYNATIVE_MODE)
# 2. Memory reuse settings; neither the dynamic nor the static option has any effect.
mindspore.runtime.set_memory(optimize_level="O0")
# 3. Deleting the variables, forcing gc, and calling mindspore.ms_memory_recycle(); also no effect.
mindspore.ms_memory_recycle()
# 4. Whether the model in train() is created inside the method or passed in as an argument, it still leaks.
# 5. Resetting all peak values in the memory pool.
# mindspore.runtime.reset_peak_memory_stats()
for batch_idx, kkk in enumerate(range(1)):
    images = mindspore.Tensor(np.random.rand(1280, 1, 28, 28), mindspore.float32)
    labels = mindspore.Tensor(np.random.randint(0, 10, size=(1280,)), mindspore.int32)
    mem_before = get_gpu_memory_usage(gpu_id)
    # (loss,loss_ce,loss_1_item,loss_pi_item,loss_pa_item,loss_con_item,loss_con_2_item),grads = grad_fn(images, labels)
    # optimizer(grads)
    result = train_net(images, labels)
    mem_after = get_gpu_memory_usage(gpu_id)
    print(f"Epoch {epoch} GPU {gpu_id} - batch_idx {batch_idx} - before: {mem_before} - after: {mem_after} - memory delta: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
del model, loss_fn, optimizer, images, labels, train_net
gc.collect()
mindspore.ms_memory_recycle()
mindspore.runtime.reset_peak_memory_stats()
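To tell whether that 1024 MB is genuinely stuck inside MindSpore or just retained by its memory pool, I also want to log the framework-side counters next to the pynvml reading. A minimal sketch; it assumes mindspore.runtime.memory_allocated() and mindspore.runtime.memory_reserved() are available in the same mindspore.runtime module as the reset_peak_memory_stats() call above (they are in recent MindSpore releases):

# Sketch: compare the framework memory pool with what pynvml reports.
# Assumes mindspore.runtime.memory_allocated()/memory_reserved() exist,
# like the mindspore.runtime.reset_peak_memory_stats() used above.
def log_ms_memory(tag, gpu_id=0):
    allocated = mindspore.runtime.memory_allocated() / (1024**2)  # bytes held by live tensors
    reserved = mindspore.runtime.memory_reserved() / (1024**2)    # bytes cached by the pool
    nvml_used = get_gpu_memory_usage(gpu_id)                      # what the driver reports
    print(f"[{tag}] allocated: {allocated:.2f} MiB - reserved: {reserved:.2f} MiB - nvml: {nvml_used:.2f} MiB")

# If reserved grows by 1024 MB per round while allocated returns to its
# baseline, the pool is only caching; if allocated itself keeps climbing,
# some graph or optimizer state is surviving across rounds.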
Does mindspore lock resources somehow? Is there any way to force it to release them? Below are the mindspore, torch, and paddle versions of the code.
Any pointers would be greatly appreciated!!! How can I fix this GPU memory leak?
mindspore
# mindspore
import os
os.environ['GLOG_v'] = '3'
import numpy as np
import mindspore
import mindspore.nn as nn
import pynvml
import gc
def get_gpu_memory_usage(device_id=0):
    """Return the used memory of the given GPU, in MiB."""
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return mem_info.used / (1024**2)
pynvml.nvmlInit()
class Network(nn.Cell):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.dense_relu_sequential = nn.SequentialCell(
            nn.Dense(28*28, 512),
            nn.ReLU(),
            nn.Dense(512, 512),
            nn.ReLU(),
            nn.Dense(512, 10)
        )

    def construct(self, x):
        x = self.flatten(x)
        logits = self.dense_relu_sequential(x)
        return logits
def train0(epoch=0, gpu_id=0):
    model = Network()
    model.set_train()
    model.set_grad(False)
    loss_fn = nn.CrossEntropyLoss()

    def forward_fn(data, label):
        logits = model(data)
        loss = loss_fn(logits, label)
        # Dummy aux outputs so has_aux=True works in this simplified version.
        return loss, 0, 0, 0, 0, 0, 0

    optimizer = nn.SGD(model.trainable_params(), 1e-2)
    grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters, has_aux=True)
    for batch_idx, kkk in enumerate(range(2)):
        images = mindspore.Tensor(np.random.rand(1280, 1, 28, 28), mindspore.float32)
        labels = mindspore.Tensor(np.random.randint(0, 10, size=(1280,)), mindspore.int32)
        mem_before = get_gpu_memory_usage(gpu_id)
        (loss, loss_ce, loss_1_item, loss_pi_item, loss_pa_item, loss_con_item, loss_con_2_item), grads = grad_fn(images, labels)
        optimizer(grads)
        mem_after = get_gpu_memory_usage(gpu_id)
        print(f"Epoch {epoch} GPU {gpu_id} - batch_idx {batch_idx} - before: {mem_before} - after: {mem_after} - memory delta: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
    del model, loss_fn, optimizer, grad_fn, images, labels, loss, loss_ce, loss_1_item, loss_pi_item, loss_pa_item, loss_con_item, loss_con_2_item
    gc.collect()
    mindspore.ms_memory_recycle()
    return
def train(model: nn.Cell, epoch=0, gpu_id=0):
    # Per item 4 above: the leak happens whether the model is built here
    # or passed in as an argument.
    model = Network()
    # model.set_train()
    # model.set_grad(False)
    loss_fn = nn.CrossEntropyLoss()

    def forward_fn(data, label):
        logits = model(data)
        loss = loss_fn(logits, label)
        return loss, 0, 0, 0, 0, 0, 0

    optimizer = nn.SGD(model.trainable_params(), 1e-2)
    # grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters, has_aux=True)
    train_net = nn.TrainOneStepCell(nn.WithLossCell(model, loss_fn), optimizer=optimizer)
    train_net.set_train(True)
    for batch_idx, kkk in enumerate(range(2)):
        images = mindspore.Tensor(np.random.rand(1280, 1, 28, 28), mindspore.float32)
        labels = mindspore.Tensor(np.random.randint(0, 10, size=(1280,)), mindspore.int32)
        mem_before = get_gpu_memory_usage(gpu_id)
        # (loss,loss_ce,loss_1_item,loss_pi_item,loss_pa_item,loss_con_item,loss_con_2_item),grads = grad_fn(images, labels)
        # optimizer(grads)
        result = train_net(images, labels)
        mem_after = get_gpu_memory_usage(gpu_id)
        print(f"Epoch {epoch} GPU {gpu_id} - batch_idx {batch_idx} - before: {mem_before} - after: {mem_after} - memory delta: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
    # del model,loss_fn,optimizer,grad_fn,images,labels,loss,loss_ce,loss_1_item,loss_pi_item,loss_pa_item,loss_con_item,loss_con_2_item
    del model, loss_fn, optimizer, images, labels, train_net
    gc.collect()
    mindspore.ms_memory_recycle()
    # mindspore._pynative_executor.clear_resource()
    return
gpu_id = 0  # set as needed
# mindspore.set_context(device_id=gpu_id, mode=mindspore.PYNATIVE_MODE)
mindspore.set_context(device_id=gpu_id, mode=mindspore.GRAPH_MODE)
mindspore.runtime.set_memory(optimize_level="O0")
# Global rounds with a shared model (commented-out variant):
# for i in range(100000):
#     # model = Network()
#     # clients training
#     for j in range(10):
#         # train(model, j, gpu_id)
#         train(None, j, gpu_id)
for j in range(10000000):
    train(None, j, gpu_id)
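For completeness, this is the reuse variant hinted at by the commented-out global loop above: build the Network, optimizer, and TrainOneStepCell once, then only reset the weights between rounds, so no new graph or optimizer state is created per round. A minimal sketch under the assumption that each round only needs fresh parameter values; mindspore.load_param_into_net and Parameter.clone() are standard MindSpore APIs:

# Sketch: one persistent train_net; weights are reset per round instead of
# rebuilding model/optimizer/graph every round.
global_model = Network()
global_loss_fn = nn.CrossEntropyLoss()
global_optimizer = nn.SGD(global_model.trainable_params(), 1e-2)
global_train_net = nn.TrainOneStepCell(nn.WithLossCell(global_model, global_loss_fn), optimizer=global_optimizer)
global_train_net.set_train(True)

# Snapshot of the initial weights; in the real federated setup this would
# be the server's global parameters for the round.
init_params = {name: param.clone() for name, param in global_model.parameters_dict().items()}

def train_reuse(epoch=0, gpu_id=0):
    mindspore.load_param_into_net(global_model, init_params)  # reset weights, reuse the compiled graph
    for batch_idx in range(2):
        images = mindspore.Tensor(np.random.rand(1280, 1, 28, 28), mindspore.float32)
        labels = mindspore.Tensor(np.random.randint(0, 10, size=(1280,)), mindspore.int32)
        global_train_net(images, labels)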
torch
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pynvml
import gc
def get_gpu_memory_usage(device_id=0):
    """Return the used memory of the given GPU, in MiB."""
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return mem_info.used / (1024**2)
pynvml.nvmlInit()
class Network(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.dense_relu_sequential = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.dense_relu_sequential(x)
        return logits
def train(epoch=0, gpu_id=0):
    device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    model = Network().to(device)
    model.train()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=1e-2)
    for batch_idx, _ in enumerate(range(2)):
        images = torch.tensor(np.random.rand(1280, 1, 28, 28), dtype=torch.float32).to(device)
        labels = torch.tensor(np.random.randint(0, 10, size=(1280,)), dtype=torch.long).to(device)
        mem_before = get_gpu_memory_usage(gpu_id)
        optimizer.zero_grad()
        logits = model(images)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        mem_after = get_gpu_memory_usage(gpu_id)
        print(f"Epoch {epoch} GPU {gpu_id} - batch_idx {batch_idx} - before: {mem_before} - after: {mem_after} - memory delta: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
    del model, optimizer, loss_fn
    gc.collect()
    return
gpu_id = 1  # set as needed
for j in range(100000):
    train(j, gpu_id)
exit()
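One caveat I noticed while comparing: pynvml only shows what the driver has handed out, and torch's caching allocator keeps freed blocks around, so the flat nvml reading alone doesn't distinguish "cached" from "in use". In the torch version I cross-check with the standard torch.cuda allocator counters:

# Cross-check with torch's own allocator counters (standard torch.cuda API):
# memory_allocated = bytes held by live tensors,
# memory_reserved  = bytes cached by the allocator on top of that.
alloc_mb = torch.cuda.memory_allocated(gpu_id) / (1024**2)
reserved_mb = torch.cuda.memory_reserved(gpu_id) / (1024**2)
print(f"torch allocated: {alloc_mb:.2f} MiB, reserved: {reserved_mb:.2f} MiB")
# torch.cuda.empty_cache()  # hands cached blocks back to the driver if needed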
paddle
import numpy as np
import paddle
import paddle.nn as nn
import paddle.optimizer as optim
import pynvml
import gc
def get_gpu_memory_usage(device_id=0):
    """Return the used memory of the given GPU, in MiB."""
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return mem_info.used / (1024**2)
pynvml.nvmlInit()
class Network(nn.Layer):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.dense_relu_sequential = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.dense_relu_sequential(x)
        return logits
def train(epoch=0, gpu_id=0):
    model = Network()
    model.train()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.SGD(parameters=model.parameters(), learning_rate=1e-2)
    for batch_idx, _ in enumerate(range(2)):
        images = paddle.to_tensor(np.random.rand(1280, 1, 28, 28), dtype='float32')
        labels = paddle.to_tensor(np.random.randint(0, 10, size=(1280,)), dtype='int64')
        mem_before = get_gpu_memory_usage(gpu_id)
        logits = model(images)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()
        mem_after = get_gpu_memory_usage(gpu_id)
        print(f"Epoch {epoch} GPU {gpu_id} - batch_idx {batch_idx} - before: {mem_before} - after: {mem_after} - memory delta: {mem_after - mem_before:.2f} MiB - Tag: {mem_after - mem_before>0}")
    del model, optimizer, loss_fn
    gc.collect()
    return
gpu_id = 2  # set as needed
paddle.set_device(f"gpu:{gpu_id}" if paddle.is_compiled_with_cuda() else "cpu")
for j in range(100000):
    train(j, gpu_id)
exit()
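The paddle version allows the same cross-check through paddle.device.cuda; as far as I know these counters exist in Paddle 2.x (treat that as an assumption for older releases):

# Cross-check with paddle's allocator counters (paddle.device.cuda API,
# assumed available in Paddle 2.x).
alloc_mb = paddle.device.cuda.memory_allocated(gpu_id) / (1024**2)
reserved_mb = paddle.device.cuda.memory_reserved(gpu_id) / (1024**2)
print(f"paddle allocated: {alloc_mb:.2f} MiB, reserved: {reserved_mb:.2f} MiB")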
#### Log entries where the memory jump occurs
Epoch 246 GPU 2 - batch_idx 0 - before: 3692.9375 - after: 3692.9375 - memory delta: 0.00 MiB - Tag: False
Epoch 246 GPU 2 - batch_idx 1 - before: 3692.9375 - after: 4716.9375 - memory delta: 1024.00 MiB - Tag: True
Each jump is exactly 1024 MB, and more such jumps follow as the loop keeps running.
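Since killing the process is the only thing that has reliably given the memory back so far, the stopgap I'm testing is to run each round in a short-lived subprocess so the OS reclaims all device memory when it exits. A minimal sketch; run_round and the ms_train module are hypothetical names, and the spawn start method is used so the child initializes CUDA cleanly:

# Stopgap sketch: isolate each round in a subprocess; all device memory is
# reclaimed by the OS when the child exits. run_round / ms_train are
# hypothetical names, not from the project above.
import multiprocessing as mp

def run_round(j, gpu_id):
    from ms_train import train  # import inside the child, after spawn
    train(None, j, gpu_id)

if __name__ == "__main__":
    ctx = mp.get_context("spawn")
    for j in range(10000000):
        p = ctx.Process(target=run_round, args=(j, 0))
        p.start()
        p.join()

I'd still prefer a real fix, though, since spawning a process per round adds noticeable overhead.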