1 系统环境
- 硬件环境(Ascend/GPU/CPU): Ascend910
- MindSpore版本: mindspore=2.4.0
- 执行模式(PyNative/ Graph): 不限
- Python版本: Python=3.8
- 操作系统平台: linux
2 报错信息
2.1 问题描述
使用昇腾hub仓库镜像
swr.cn-south-1.myhuaweicloud.com/ascendhub/ascend-mindspore:24.0.RC1-A2-ubuntu20.04
2.2 脚本信息
run.py代码内容:
# Fine-tune DenseNet-121 on CIFAR-10 using MindCV on MindSpore.
import os

from mindcv.data import create_dataset, create_transforms, create_loader
from mindcv.models import create_model
from mindcv.loss import create_loss
from mindcv.scheduler import create_scheduler
from mindcv.optim import create_optimizer
from mindspore import Model
from mindspore import LossMonitor, TimeMonitor, CheckpointConfig, ModelCheckpoint

cifar10_dir = '/home/HwHiAiUser/datasets-cifar10-bin/cifar-10-batches-bin/'  # path to your local dataset
num_classes = 10  # number of classes
num_workers = 8   # number of parallel workers for reading/loading data

# Create the training dataset.
dataset_train = create_dataset(
    name='cifar10', root=cifar10_dir, split='train', shuffle=True, num_parallel_workers=num_workers
)

# Build the list of data-augmentation transforms.
trans = create_transforms(dataset_name='cifar10', image_resize=224)

# Apply the transforms and batch the data into the final training loader.
loader_train = create_loader(dataset=dataset_train,
                             batch_size=64,
                             is_training=True,
                             num_classes=num_classes,
                             transform=trans,
                             num_parallel_workers=num_workers)
num_batches = loader_train.get_dataset_size()

# Instantiate DenseNet-121 and load pretrained weights.
# (The classifier head is re-initialized for num_classes=10, hence the
# "classifier.weight/bias not loaded" warnings in the log are expected.)
network = create_model(model_name='densenet121', num_classes=num_classes, pretrained=True)

loss = create_loss(name='CE')

# Learning-rate schedule (constant LR).
lr_scheduler = create_scheduler(steps_per_epoch=num_batches,
                                scheduler='constant',
                                lr=0.0001)

# Optimizer.
opt = create_optimizer(network.trainable_params(), opt='adam', lr=lr_scheduler)

# Wrap network/loss/optimizer into a trainable (and evaluable) Model.
model = Model(network, loss_fn=loss, optimizer=opt, metrics={'accuracy'})

# Callback that saves network parameters during training.
ckpt_save_dir = './ckpt'
ckpt_config = CheckpointConfig(save_checkpoint_steps=num_batches)
ckpt_cb = ModelCheckpoint(prefix='densenet121-cifar10',
                          directory=ckpt_save_dir,
                          config=ckpt_config)
model.train(5, loader_train, callbacks=[LossMonitor(num_batches//5), TimeMonitor(num_batches//5), ckpt_cb], dataset_sink_mode=False)

# Load the validation dataset.
# BUG FIX: the original passed `download=download`, but no variable named
# `download` is ever defined in this script, so reaching this line raises
# NameError. The dataset is already on disk, so pass download=False.
dataset_val = create_dataset(name='cifar10', root=cifar10_dir, split='test', shuffle=True,
                             num_parallel_workers=num_workers, download=False)

# Apply the same transforms to generate the validation loader.
loader_val = create_loader(dataset=dataset_val,
                           batch_size=64,
                           is_training=False,
                           num_classes=num_classes,
                           transform=trans,
                           num_parallel_workers=num_workers)

# Evaluate the accuracy of the fine-tuned DenseNet-121.
acc = model.eval(loader_val, dataset_sink_mode=False)
print(acc)
2.3 报错信息
root@aa7694f83d88:/home/HwHiAiUser# python run.py
[WARNING] GE_ADPT(236,ffff84256010,python):2025-02-19-01:05:14.181.688 [mindspore/ccsrc/utils/dlopen_macro.h:82] DlsymAscend] Dynamically load symbol acltdtGetSliceInfoFromItem failed, ret/lib64/libacl_tdt_channel.so: undefined symbol: acltdtGetSliceInfoFromItem
/usr/local/python3.9.2/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for <class 'numpy.float64'> type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/usr/local/python3.9.2/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for <class 'numpy.float64'> type is zero.
return self._float_to_str(self.smallest_subnormal)
/usr/local/python3.9.2/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for <class 'numpy.float32'> type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/usr/local/python3.9.2/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for <class 'numpy.float32'> type is zero.
return self._float_to_str(self.smallest_subnormal)
32293888B [00:01, 20216098.00B/s]
[WARNING] ME(236:281472898785296,MainProcess):2025-02-19-01:06:05.518.737 [mindspore/train/serialization.py:1456] For 'load_param_into_net', 2 parameters in the 'net' are not loaded, because they are not in the 'parameter_dict', please check whether the network structure is consistent when training and loading checkpoint.
[WARNING] ME(236:281472898785296,MainProcess):2025-02-19-01:06:05.522.433 [mindspore/train/serialization.py:1460] ['classifier.weight', 'classifier.bias'] are not loaded.
/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/impl/util/util_conv2d_dynamic.py:130: UserWarning: conv2d fmap ori_range changed from [[32, 2147483647], [128, 128], [16, 63], [16, 63]] to [[32, 2147483647], [128, 128], [16, 63], (16, 63)].
warnings.warn(to_print)
/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/impl/util/util_conv2d_dynamic.py:130: UserWarning: conv2d fmap ori_range changed from [[32, 2147483647], [256, 256], [16, 63], [16, 63]] to [[32, 2147483647], [256, 256], [16, 63], (16, 63)].
warnings.warn(to_print)
/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/impl/util/util_conv2d_dynamic.py:130: UserWarning: conv2d fmap ori_range changed from [[32, 2147483647], [512, 512], [4, 15], [4, 15]] to [[32, 2147483647], [512, 512], [4, 15], (4, 15)].
warnings.warn(to_print)
[ERROR] KERNEL(236,fffc917fa1e0,python):2025-02-19-01:06:32.168.621 [mindspore/ccsrc/plugin/device/ascend/kernel/acl/acl_kernel_mod.cc:261] Launch] Kernel launch failed, msg: Malloc Mem From Mem Pool failed, size:25690144
----------------------------------------------------
- C++ Call Stack: (For framework developers)
----------------------------------------------------
mindspore/ccsrc/transform/acl_ir/acl_allocator.cc:32 AllocFunc
[ERROR] DEVICE(236,fffc917fa1e0,python):2025-02-19-01:06:32.168.737 [mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_kernel_executor.cc:948] LaunchKernel] Launch kernel failed, kernel full name: Default/AvgPool-op0
[ERROR] DEVICE(236,fffc917fa1e0,python):2025-02-19-01:06:32.217.974 [mindspore/ccsrc/runtime/pynative/op_runner.cc:324] MallocForKernelOutput] Allocate output memory failed, node:Default/BNTrainingUpdate-op0
Traceback (most recent call last):
File "/home/HwHiAiUser/run.py", line 60, in <module>
model.train(5, loader_train, callbacks=[LossMonitor(num_batches//5), TimeMonitor(num_batches//5), ckpt_cb], dataset_sink_mode=False)
File "/usr/local/python3.9.2/lib/python3.9/site-packages/mindspore/train/model.py", line 1082, in train
self._train(epoch,
File "/usr/local/python3.9.2/lib/python3.9/site-packages/mindspore/train/model.py", line 115, in wrapper
func(self, *args, **kwargs)
File "/usr/local/python3.9.2/lib/python3.9/site-packages/mindspore/train/model.py", line 630, in _train
self._train_process(epoch, train_dataset, list_callback, cb_params, initial_epoch, valid_infos)
File "/usr/local/python3.9.2/lib/python3.9/site-packages/mindspore/train/model.py", line 932, in _train_process
outputs = self._train_network(*next_element)
File "/usr/local/python3.9.2/lib/python3.9/site-packages/mindspore/nn/cell.py", line 693, in __call__
raise err
File "/usr/local/python3.9.2/lib/python3.9/site-packages/mindspore/nn/cell.py", line 689, in __call__
output = self._run_construct(args, kwargs)
File "/usr/local/python3.9.2/lib/python3.9/site-packages/mindspore/nn/cell.py", line 477, in _run_construct
output = self.construct(*cast_inputs, **kwargs)
File "/usr/local/python3.9.2/lib/python3.9/site-packages/mindspore/nn/wrap/cell_wrapper.py", line 418, in construct
return self._no_sens_impl(*inputs)
File "/usr/local/python3.9.2/lib/python3.9/site-packages/mindspore/nn/wrap/cell_wrapper.py", line 433, in _no_sens_impl
loss = self.network(*inputs)
File "/usr/local/python3.9.2/lib/python3.9/site-packages/mindspore/nn/cell.py", line 693, in __call__
raise err
File "/usr/local/python3.9.2/lib/python3.9/site-packages/mindspore/nn/cell.py", line 690, in __call__
_pynative_executor.end_graph(self, output, *args, **kwargs)
File "/usr/local/python3.9.2/lib/python3.9/site-packages/mindspore/common/api.py", line 1264, in end_graph
self._executor.end_graph(obj, output, *args, *(kwargs.values()))
RuntimeError: Malloc for kernel input failed, Memory isn't enough, node:Default/ReduceMean-op0
----------------------------------------------------
- C++ Call Stack: (For framework developers)
----------------------------------------------------
mindspore/ccsrc/runtime/pynative/op_runner.cc:587 LaunchKernels
root@aa7694f83d88:/home/HwHiAiUser#
3 根因分析
根据报错信息的提示,这是因为设备内存(Device Memory)不足造成的:算子执行时无法从内存池中申请到所需的内存。
RuntimeError: Malloc for kernel input failed, Memory isn't enough, node:Default/ReduceMean-op0
Kernel launch failed, msg: Malloc Mem From Mem Pool failed, size:25690144
4 解决方案
对于解决设备内存不足的问题,有两个解决方向。
第一是可以使用内存更大的硬件,这样可以在不改动脚本的情况下保证最终的模型精度。
第二是修改脚本中的超参数,比如将batch_size设置得更小,这样能减小每个batch计算时所需的设备内存,从而在不更换硬件的情况下继续训练模型。还可以使用更低的计算精度(如混合精度训练),这样同样能节省设备内存,但这些方式都可能导致模型精度有所下降。