使用自定义数据集运行mindformers的gpt2大模型,报错TypeError: The predict type and infer type is not match, predict type is Tuple

1 系统环境

硬件环境(Ascend/GPU/CPU): GPU
MindSpore版本: 2.0rc1
执行模式(PyNative/ Graph): 不限
Python版本: 3.7.5
操作系统平台: 不限

2 报错信息

2.1 问题描述

使用自定义数据集运行mindformers的gpt2大模型,报错

2.2 报错信息

[WARNING] ME(287:139835160470080,MainProcess):2023-05-31-14:04:41.269.777 [mindspore/train/model.py:1096] For MFLossMonitor callback, {'epoch_end', 'step_begin', 'step_end', 'epoch_begin'} methods may not be supported in later version, Use methods prefixed with 'on_train' or 'on_eval' instead when using customized callbacks.
[WARNING] ME(287:139835160470080,MainProcess):2023-05-31-14:04:41.270.411 [mindspore/train/model.py:1096] For Local2ObsMonitor callback, {'step_end'} methods may not be supported in later version, Use methods prefixed with 'on_train' or 'on_eval' instead when using customized callbacks.
[WARNING] DEVICE(287,7f2adeffd700,python):2023-05-31-14:04:42.063.418 [mindspore/ccsrc/runtime/pynative/async/async_queue.cc:67] WorkerLoop] Run task failed, error msg:The predict type and infer type is not match, predict type is Tuple, infer type is Tensor[Int32], the name of operator is [GetNext]. Please modify or add predict type of operator in predict_out_type_map.h.

----------------------------------------------------
- C++ Call Stack: (For framework developers)
----------------------------------------------------
mindspore/ccsrc/pipeline/pynative/forward/forward.cc:154 UpdateStubNodeAbs
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/tmp/ipykernel_287/3074327597.py in <module>
     13     optimizers=opt)
     14 # 方式1: 开启训练,并使用训练好的权重进行推理
---> 15 trainer.train()

/opt/conda/lib/python3.7/site-packages/mindformers/trainer/trainer.py in train(self, resume_or_finetune_from_checkpoint, initial_epoch, do_eval, do_finetune, **kwargs)
    406             wrapper=self.wrapper,
    407             callbacks=self.callbacks,
--> 408             is_full_config=True, **kwargs)
    409 
    410     def evaluate(self, eval_checkpoint: Optional[Union[str, bool]] = False, **kwargs):

/opt/conda/lib/python3.7/site-packages/mindformers/trainer/causal_language_modeling/causal_language_modeling.py in train(self, config, network, dataset, wrapper, optimizer, callbacks, **kwargs)
     93             wrapper=wrapper,
     94             optimizer=optimizer,
---> 95             **kwargs)
     96 
     97     def evaluate(self, *args, **kwargs):

/opt/conda/lib/python3.7/site-packages/mindformers/trainer/base_trainer.py in training_process(self, config, network, dataset, optimizer, wrapper, callbacks, **kwargs)
    519                     dataset_sink_mode=config.runner_config.sink_mode,
    520                     sink_size=config.runner_config.per_epoch_size,
--> 521                     initial_epoch=config.runner_config.initial_epoch)
    522         logger.info(".........Training Over!.............")
    523 

/opt/conda/lib/python3.7/site-packages/mindspore/train/model.py in train(self, epoch, train_dataset, callbacks, dataset_sink_mode, sink_size, initial_epoch)
   1059                     dataset_sink_mode=dataset_sink_mode,
   1060                     sink_size=sink_size,
-> 1061                     initial_epoch=initial_epoch)
   1062 
   1063         # When it's distributed training and using MindRT,

/opt/conda/lib/python3.7/site-packages/mindspore/train/model.py in wrapper(self, *args, **kwargs)
     98                 raise e
     99         else:
--> 100             func(self, *args, **kwargs)
    101     return wrapper
    102 

/opt/conda/lib/python3.7/site-packages/mindspore/train/model.py in _train(self, epoch, train_dataset, callbacks, dataset_sink_mode, sink_size, initial_epoch, valid_dataset, valid_frequency, valid_dataset_sink_mode)
    615             else:
    616                 self._train_dataset_sink_process(epoch, train_dataset, list_callback,
--> 617                                                  cb_params, sink_size, initial_epoch, valid_infos)
    618 
    619     @staticmethod

/opt/conda/lib/python3.7/site-packages/mindspore/train/model.py in _train_dataset_sink_process(self, epoch, train_dataset, list_callback, cb_params, sink_size, initial_epoch, valid_infos)
    693                 list_callback.on_train_step_begin(run_context)
    694                 train_network = self._check_network_mode(train_network, True)
--> 695                 outputs = train_network(*inputs)
    696                 cb_params.net_outputs = outputs
    697 

/opt/conda/lib/python3.7/site-packages/mindspore/nn/cell.py in __call__(self, *args, **kwargs)
    659         except Exception as err:
    660             _pynative_executor.clear_res()
--> 661             raise err
    662 
    663         if isinstance(output, Parameter):

/opt/conda/lib/python3.7/site-packages/mindspore/nn/cell.py in __call__(self, *args, **kwargs)
    655         try:
    656             _pynative_executor.new_graph(self, *args, **kwargs)
--> 657             output = self._run_construct(args, kwargs)
    658             _pynative_executor.end_graph(self, output, *args, **kwargs)
    659         except Exception as err:

/opt/conda/lib/python3.7/site-packages/mindspore/nn/cell.py in _run_construct(self, cast_inputs, kwargs)
    443             output = self._shard_fn(*cast_inputs, **kwargs)
    444         else:
--> 445             output = self.construct(*cast_inputs, **kwargs)
    446         if self._enable_forward_hook:
    447             output = self._run_forward_hook(cast_inputs, output)

/opt/conda/lib/python3.7/site-packages/mindspore/train/dataset_helper.py in construct(self)
     98 
     99     def construct(self):
--> 100         outputs = self.get_next()
    101         return self.network(*outputs)
    102 

/opt/conda/lib/python3.7/site-packages/mindspore/ops/primitive.py in __call__(self, *args)
    315         if should_elim:
    316             return output
--> 317         return _run_op(self, self.name, args)
    318 
    319     def __getstate__(self):

/opt/conda/lib/python3.7/site-packages/mindspore/ops/primitive.py in _run_op(obj, op_name, args)
    890     if _RUN_OP_ASYNC:
    891         stub = _pynative_executor.run_op_async(obj, args)
--> 892         return _convert_stub(stub)
    893     return _run_op_sync(obj, op_name, args)
    894 

/opt/conda/lib/python3.7/site-packages/mindspore/common/_stub_tensor.py in _convert_stub(stub)
    191         return tuple(_convert_stub(e) for e in stub)
    192     if isinstance(stub, SequenceNode):
--> 193         elements = stub.get_elements()
    194         return tuple(_convert_stub(e) for e in elements)
    195     if isinstance(stub, NoneTypeNode):

TypeError: The predict type and infer type is not match, predict type is Tuple, infer type is Tensor[Int32], the name of operator is [GetNext]. Please modify or add predict type of operator in predict_out_type_map.h.

----------------------------------------------------
- C++ Call Stack: (For framework developers)
----------------------------------------------------
mindspore/ccsrc/pipeline/pynative/forward/forward.cc:154 UpdateStubNodeAbs

2.3 脚本代码

https://openi.pcl.ac.cn/kewei/mindspore-gpt2-finetune/src/branch/master/gpt2-finetune.ipynb

from mindformers import GPT2Tokenizer
from mindspore.dataset import GeneratorDataset
import mindspore as ms
ms.set_context(device_target="GPU")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
class TextSet:
    def __init__(self, tokenizer):
        with open('data.txt', 'r') as f:
            self.content = f.read().replace('\n', '').split('。')
        self.tokenizer = tokenizer
        
    def __getitem__(self, idx):
        input_ids = self.tokenizer(
            self.content[idx], 
            padding='max_length',
            max_length=1025,
            return_tensors='ms')
        return input_ids['input_ids'].astype(ms.int32)
    
    def __len__(self):
        return len(self.content)
txtset = TextSet(tokenizer)
dataset = GeneratorDataset(source=txtset, column_names=['input_ids'])

from mindarmour.privacy.diff_privacy import DPOptimizerClassFactory
from mindformers import AutoModel
model = AutoModel.from_pretrained("gpt2")
GaussianSGD = DPOptimizerClassFactory(micro_batches=2)
GaussianSGD.set_mechanisms('Gaussian', norm_bound=1.0, initial_noise_multiplier=1.5)
opt = GaussianSGD.create('Momentum')(params=model.trainable_params(),learning_rate=0.001,momentum=0.911)


from mindformers.trainer import Trainer

# 初始化预训练任务
trainer = Trainer(task='text_generation',model=model,train_dataset=dataset.batch(4),optimizers=opt)

# 方式1: 开启训练,并使用训练好的权重进行推理
trainer.train()

3 根因分析

1、根据日志提示,分析报错代码:问题和GetNext算子的输出类型相关——GetNext算子预期输出为Tuple类型,实际输出为Tensor

TypeError: The predict type and infer type is not match, predict type is Tuple, infer type is Tensor[Int32], the name of operator is [GetNext]. Please modify or add predict type of operator in predict_out_type_map.h.

  if (!success) {  
    const auto &op_name = op_run_info->base_op_run_info.op_name;  
    MS_EXCEPTION(TypeError) << "The predict type and infer type is not match, predict type is "  
                            << PredictOutTypeByName(op_name) << ", infer type is " << abs->BuildType()  
                            << ", the name of operator is [" << op_name  
                            << "]. Please modify or add predict type of operator in predict_out_type_map.h.";  
  }

2、查看并执行dataset相关脚本:dataset输出为Tuple类型,该结果和上述预期相符,说明框架代码中存在将Tuple(Tensor)解包为Tensor的逻辑

class TextSet:   
    def __init__(self, tokenizer):   
        with open('data.txt', 'r') as f:   
            self.content = f.read().replace('\n', '').split('。')   
        self.tokenizer = tokenizer   
            
    def __getitem__(self, idx):   
        input_ids = self.tokenizer(   
            self.content[idx],    
            padding='max_length',   
            max_length=1025,   
            return_tensors='ms')   
        return input_ids['input_ids'].astype(ms.int32)   
        
    def __len__(self):   
        return len(self.content)   
txtset = TextSet(tokenizer)   
dataset = GeneratorDataset(source=txtset, column_names=['input_ids'])   

for item in dataset:   
    print(item)   
    break

执行结果如下

[Tensor(shape=[1025], dtype=Int32, value= [50256,   447,   251 ... 50256, 50256, 50256])]

3、分析代码:UpdateOutputAbstract函数在outputs个数为1时,将Tuple(Tensor)解包为Tensor,问题根因明确。

void UpdateOutputAbstract(const VectorRef &outputs, const session::BackendOpRunInfoPtr &op_run_info) {  
  auto output_size = outputs.size();  
  if (output_size == 1) {  
    auto output_tensor = utils::cast<tensor::TensorPtr>(outputs[0]);  
    MS_EXCEPTION_IF_NULL(output_tensor);  
    op_run_info->base_op_run_info.abstract = output_tensor->ToAbstract();  
    return;  
  }

4 解决方案

修改UpdateOutputAbstract逻辑:当算子为GetNext且输出类型为Tuple时,不做解包处理

void UpdateOutputAbstract(const VectorRef &outputs, const session::BackendOpRunInfoPtr &op_run_info) {  
    auto output_size = outputs.size();  
    if (output_size == 1 && op_run_info->base_op_run_info.op_name != kGetNextOpName) {  
    auto output_tensor = utils::cast<tensor::TensorPtr>(outputs[0]);  
    MS_EXCEPTION_IF_NULL(output_tensor);  
    op_run_info->base_op_run_info.abstract = output_tensor->ToAbstract();  
    return;  
    }