1 系统环境
硬件环境(Ascend/GPU/CPU): GPU
MindSpore版本: 2.0rc1
执行模式(PyNative/ Graph): 不限
Python版本: 3.7.5
操作系统平台: 不限
2 报错信息
2.1 问题描述
使用自定义数据集运行mindformers的gpt2大模型,报错
2.2 报错信息
[WARNING] ME(287:139835160470080,MainProcess):2023-05-31-14:04:41.269.777 [mindspore/train/model.py:1096] For MFLossMonitor callback, {'epoch_end', 'step_begin', 'step_end', 'epoch_begin'} methods may not be supported in later version, Use methods prefixed with 'on_train' or 'on_eval' instead when using customized callbacks.
[WARNING] ME(287:139835160470080,MainProcess):2023-05-31-14:04:41.270.411 [mindspore/train/model.py:1096] For Local2ObsMonitor callback, {'step_end'} methods may not be supported in later version, Use methods prefixed with 'on_train' or 'on_eval' instead when using customized callbacks.
[WARNING] DEVICE(287,7f2adeffd700,python):2023-05-31-14:04:42.063.418 [mindspore/ccsrc/runtime/pynative/async/async_queue.cc:67] WorkerLoop] Run task failed, error msg:The predict type and infer type is not match, predict type is Tuple, infer type is Tensor[Int32], the name of operator is [GetNext]. Please modify or add predict type of operator in predict_out_type_map.h.
----------------------------------------------------
- C++ Call Stack: (For framework developers)
----------------------------------------------------
mindspore/ccsrc/pipeline/pynative/forward/forward.cc:154 UpdateStubNodeAbs
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/tmp/ipykernel_287/3074327597.py in <module>
13 optimizers=opt)
14 # 方式1: 开启训练,并使用训练好的权重进行推理
---> 15 trainer.train()
/opt/conda/lib/python3.7/site-packages/mindformers/trainer/trainer.py in train(self, resume_or_finetune_from_checkpoint, initial_epoch, do_eval, do_finetune, **kwargs)
406 wrapper=self.wrapper,
407 callbacks=self.callbacks,
--> 408 is_full_config=True, **kwargs)
409
410 def evaluate(self, eval_checkpoint: Optional[Union[str, bool]] = False, **kwargs):
/opt/conda/lib/python3.7/site-packages/mindformers/trainer/causal_language_modeling/causal_language_modeling.py in train(self, config, network, dataset, wrapper, optimizer, callbacks, **kwargs)
93 wrapper=wrapper,
94 optimizer=optimizer,
---> 95 **kwargs)
96
97 def evaluate(self, *args, **kwargs):
/opt/conda/lib/python3.7/site-packages/mindformers/trainer/base_trainer.py in training_process(self, config, network, dataset, optimizer, wrapper, callbacks, **kwargs)
519 dataset_sink_mode=config.runner_config.sink_mode,
520 sink_size=config.runner_config.per_epoch_size,
--> 521 initial_epoch=config.runner_config.initial_epoch)
522 logger.info(".........Training Over!.............")
523
/opt/conda/lib/python3.7/site-packages/mindspore/train/model.py in train(self, epoch, train_dataset, callbacks, dataset_sink_mode, sink_size, initial_epoch)
1059 dataset_sink_mode=dataset_sink_mode,
1060 sink_size=sink_size,
-> 1061 initial_epoch=initial_epoch)
1062
1063 # When it's distributed training and using MindRT,
/opt/conda/lib/python3.7/site-packages/mindspore/train/model.py in wrapper(self, *args, **kwargs)
98 raise e
99 else:
--> 100 func(self, *args, **kwargs)
101 return wrapper
102
/opt/conda/lib/python3.7/site-packages/mindspore/train/model.py in _train(self, epoch, train_dataset, callbacks, dataset_sink_mode, sink_size, initial_epoch, valid_dataset, valid_frequency, valid_dataset_sink_mode)
615 else:
616 self._train_dataset_sink_process(epoch, train_dataset, list_callback,
--> 617 cb_params, sink_size, initial_epoch, valid_infos)
618
619 @staticmethod
/opt/conda/lib/python3.7/site-packages/mindspore/train/model.py in _train_dataset_sink_process(self, epoch, train_dataset, list_callback, cb_params, sink_size, initial_epoch, valid_infos)
693 list_callback.on_train_step_begin(run_context)
694 train_network = self._check_network_mode(train_network, True)
--> 695 outputs = train_network(*inputs)
696 cb_params.net_outputs = outputs
697
/opt/conda/lib/python3.7/site-packages/mindspore/nn/cell.py in __call__(self, *args, **kwargs)
659 except Exception as err:
660 _pynative_executor.clear_res()
--> 661 raise err
662
663 if isinstance(output, Parameter):
/opt/conda/lib/python3.7/site-packages/mindspore/nn/cell.py in __call__(self, *args, **kwargs)
655 try:
656 _pynative_executor.new_graph(self, *args, **kwargs)
--> 657 output = self._run_construct(args, kwargs)
658 _pynative_executor.end_graph(self, output, *args, **kwargs)
659 except Exception as err:
/opt/conda/lib/python3.7/site-packages/mindspore/nn/cell.py in _run_construct(self, cast_inputs, kwargs)
443 output = self._shard_fn(*cast_inputs, **kwargs)
444 else:
--> 445 output = self.construct(*cast_inputs, **kwargs)
446 if self._enable_forward_hook:
447 output = self._run_forward_hook(cast_inputs, output)
/opt/conda/lib/python3.7/site-packages/mindspore/train/dataset_helper.py in construct(self)
98
99 def construct(self):
--> 100 outputs = self.get_next()
101 return self.network(*outputs)
102
/opt/conda/lib/python3.7/site-packages/mindspore/ops/primitive.py in __call__(self, *args)
315 if should_elim:
316 return output
--> 317 return _run_op(self, self.name, args)
318
319 def __getstate__(self):
/opt/conda/lib/python3.7/site-packages/mindspore/ops/primitive.py in _run_op(obj, op_name, args)
890 if _RUN_OP_ASYNC:
891 stub = _pynative_executor.run_op_async(obj, args)
--> 892 return _convert_stub(stub)
893 return _run_op_sync(obj, op_name, args)
894
/opt/conda/lib/python3.7/site-packages/mindspore/common/_stub_tensor.py in _convert_stub(stub)
191 return tuple(_convert_stub(e) for e in stub)
192 if isinstance(stub, SequenceNode):
--> 193 elements = stub.get_elements()
194 return tuple(_convert_stub(e) for e in elements)
195 if isinstance(stub, NoneTypeNode):
TypeError: The predict type and infer type is not match, predict type is Tuple, infer type is Tensor[Int32], the name of operator is [GetNext]. Please modify or add predict type of operator in predict_out_type_map.h.
----------------------------------------------------
- C++ Call Stack: (For framework developers)
----------------------------------------------------
mindspore/ccsrc/pipeline/pynative/forward/forward.cc:154 UpdateStubNodeAbs
2.3 脚本代码
https://openi.pcl.ac.cn/kewei/mindspore-gpt2-finetune/src/branch/master/gpt2-finetune.ipynb
# Reproduction script: fine-tune GPT2 (mindformers) on a custom text dataset with a
# differential-privacy optimizer; trainer.train() triggers the reported GetNext
# "predict type (Tuple) vs infer type (Tensor)" mismatch in PyNative mode.
from mindformers import GPT2Tokenizer
from mindspore.dataset import GeneratorDataset
import mindspore as ms
# Run on GPU, matching the environment described in section 1.
ms.set_context(device_target="GPU")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
class TextSet:
    # Random-access dataset: one tokenized sentence per item.
    def __init__(self, tokenizer):
        # Split the corpus on the Chinese full stop '。' after stripping newlines.
        with open('data.txt', 'r') as f:
            self.content = f.read().replace('\n', '').split('。')
        self.tokenizer = tokenizer
    def __getitem__(self, idx):
        # Pad to a fixed length of 1025 tokens — presumably 1024 inputs + 1 for the
        # causal-LM label shift; TODO confirm against the gpt2 model config.
        # NOTE(review): the sample printed in section 3 shows a shape-[1025] Int32
        # Tensor, so the tokenizer output here is effectively a single row.
        input_ids = self.tokenizer(
            self.content[idx],
            padding='max_length',
            max_length=1025,
            return_tensors='ms')
        return input_ids['input_ids'].astype(ms.int32)
    def __len__(self):
        return len(self.content)
txtset = TextSet(tokenizer)
# Single output column; GeneratorDataset still wraps each row in a tuple,
# which is the Tuple(Tensor) structure the framework later mis-unpacks.
dataset = GeneratorDataset(source=txtset, column_names=['input_ids'])
from mindarmour.privacy.diff_privacy import DPOptimizerClassFactory
from mindformers import AutoModel
model = AutoModel.from_pretrained("gpt2")
# Differential-privacy optimizer: Gaussian noise mechanism over 2 micro-batches.
GaussianSGD = DPOptimizerClassFactory(micro_batches=2)
GaussianSGD.set_mechanisms('Gaussian', norm_bound=1.0, initial_noise_multiplier=1.5)
opt = GaussianSGD.create('Momentum')(params=model.trainable_params(),learning_rate=0.001,momentum=0.911)
from mindformers.trainer import Trainer
# Initialize the text-generation training task.
trainer = Trainer(task='text_generation',model=model,train_dataset=dataset.batch(4),optimizers=opt)
# Option 1: start training (this call raises the TypeError reported in section 2.2);
# trained weights would then be used for inference.
trainer.train()
3 根因分析
1、根据日志提示,分析报错代码:问题和GetNext算子的输出类型相关。GetNext算子预期输出为Tuple类型,实际输出为Tensor类型。
TypeError: The predict type and infer type is not match, predict type is Tuple, infer type is Tensor[Int32], the name of operator is [GetNext]. Please modify or add predict type of operator in predict_out_type_map.h.
if (!success) {
const auto &op_name = op_run_info->base_op_run_info.op_name;
MS_EXCEPTION(TypeError) << "The predict type and infer type is not match, predict type is "
<< PredictOutTypeByName(op_name) << ", infer type is " << abs->BuildType()
<< ", the name of operator is [" << op_name
<< "]. Please modify or add predict type of operator in predict_out_type_map.h.";
}
2、查看并执行dataset相关脚本:dataset的输出为Tuple(Tensor)类型,该结果和上述报错中的预期类型(predict type为Tuple)相符,说明框架代码中存在将Tuple(Tensor)解包为Tensor的逻辑。
# Verification snippet from the root-cause analysis: same dataset definition as the
# reproduction script, plus a loop that prints the first item so the actual output
# structure can be inspected (a one-element list wrapping an Int32 Tensor).
class TextSet:
    # Random-access dataset: one tokenized sentence per item (same as section 2.3).
    def __init__(self, tokenizer):
        # Split the corpus on the Chinese full stop '。' after stripping newlines.
        with open('data.txt', 'r') as f:
            self.content = f.read().replace('\n', '').split('。')
        self.tokenizer = tokenizer
    def __getitem__(self, idx):
        # Fixed-length padding to 1025 tokens; returns an Int32 Tensor.
        input_ids = self.tokenizer(
            self.content[idx],
            padding='max_length',
            max_length=1025,
            return_tensors='ms')
        return input_ids['input_ids'].astype(ms.int32)
    def __len__(self):
        return len(self.content)
txtset = TextSet(tokenizer)
dataset = GeneratorDataset(source=txtset, column_names=['input_ids'])
# Even with a single column, iterating the dataset yields a tuple per row —
# this is the Tuple(Tensor) that UpdateOutputAbstract collapses to a bare Tensor.
for item in dataset:
    print(item)
    break
执行结果如下
[Tensor(shape=[1025], dtype=Int32, value= [50256, 447, 251 ... 50256, 50256, 50256])]
3、分析代码:UpdateOutputAbstract函数在outputs个数为1时,将Tuple(Tensor)解包为Tensor,问题根因明确。
void UpdateOutputAbstract(const VectorRef &outputs, const session::BackendOpRunInfoPtr &op_run_info) {
auto output_size = outputs.size();
if (output_size == 1) {
auto output_tensor = utils::cast<tensor::TensorPtr>(outputs[0]);
MS_EXCEPTION_IF_NULL(output_tensor);
op_run_info->base_op_run_info.abstract = output_tensor->ToAbstract();
return;
}
4 解决方案
修改UpdateOutputAbstract逻辑:当算子为GetNext时,即使输出个数为1也不做解包处理,保持其Tuple输出类型。
void UpdateOutputAbstract(const VectorRef &outputs, const session::BackendOpRunInfoPtr &op_run_info) {
auto output_size = outputs.size();
if (output_size == 1 && op_run_info->base_op_run_info.op_name != kGetNextOpName) {
auto output_tensor = utils::cast<tensor::TensorPtr>(outputs[0]);
MS_EXCEPTION_IF_NULL(output_tensor);
op_run_info->base_op_run_info.abstract = output_tensor->ToAbstract();
return;
}