1 系统环境
硬件环境(Ascend/GPU/CPU): CPU
MindSpore版本: 1.6.0
执行模式(动态图): 不限模式
Python版本: 3.7.5
操作系统平台: Ubuntu 16.04
2 报错信息
2.1 问题描述
将自定义数据集转换为MindSpore Record(MindRecord)数据格式时报错,转换失败。
2.2 报错信息
RuntimeError: Unexpected error. Invalid data, the number of schema should be positive but got: 0. Please check the input schema.
2.3 脚本代码
import os
import numpy as np
import mindspore.mindrecord as record
# Sample data: three input_ids rows and one intent id per row.
input_ids = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
intent_ids = [1, 2, 3]

# Full path of the MindSpore Record file to generate.
MINDRECORD_FILE = "test1.mindrecord"

# Remove leftovers from a previous run. Each file is checked on its own so
# that a missing companion ".db" file does not raise FileNotFoundError.
for _stale_path in (MINDRECORD_FILE, MINDRECORD_FILE + ".db"):
    if os.path.exists(_stale_path):
        os.remove(_stale_path)

# Fields contained in every sample. "input_ids" carries a "shape" entry
# while "id" and "intent_ids" do not; this is the schema of the failing
# script and is intentionally left as reported.
nlp_schema = {
    "id": {"type": "int32"},
    "input_ids": {"type": "int32", "shape": [-1]},
    "intent_ids": {"type": "int32"},
}
def get_data(input_ids, intent_ids):
    """Build the list of samples handed to ``FileWriter.write_raw_data``.

    Args:
        input_ids: Sequence of integer sequences, one per sample.
        intent_ids: Sequence of integer labels, one per sample; its length
            sets the number of samples produced.

    Returns:
        A list of dicts with keys ``"id"`` (sample index, Python int),
        ``"input_ids"`` (``np.int32`` array) and ``"intent_ids"``
        (0-d ``np.int32`` array).
    """
    data_list = []
    length = len(intent_ids)
    for i in range(length):
        data_json = {
            "id": i,
            "input_ids": np.array(input_ids[i]).astype(np.int32),
            # Left as a numpy array on purpose: this is the value the
            # failing script passes for the "intent_ids" field.
            "intent_ids": np.array(intent_ids[i]).astype(np.int32),
        }
        data_list.append(data_json)
    return data_list
# Declare the MindSpore Record writer (a single output shard).
writer = record.FileWriter(file_name=MINDRECORD_FILE, shard_num=1)
# Register the sample schema together with a short description.
writer.add_schema(nlp_schema, "preprocessed nlp dataset.")
data = get_data(input_ids, intent_ids)
# Write the samples, then commit to finish the file.
writer.write_raw_data(data)
writer.commit()
3 根因分析
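“Invalid data”意为无效数据。分析原因为:字段“intent_ids”在schema中定义的类型与实际写入的数据类型不一致,导致数据与schema匹配不上。
从上面的代码可以看到,schema中id和intent_ids的定义是一样的(都是不带shape的int32,即标量),但是get_data生成的data_json里,id和intent_ids的实际类型却不一样:
id是Python整数(标量),而intent_ids却是np.array(numpy.ndarray)。schema中不带shape的字段应写入标量,带shape的字段才应写入np.array。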
4 解决方案
有两个解决方法:
方法一:
在data_json中,把intent_ids改为跟id一样的int数据
代码:
import os
import numpy as np
import mindspore.mindrecord as record
# Sample data: three input_ids rows and one intent id per row.
input_ids = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
intent_ids = [1, 2, 3]

# Full path of the MindSpore Record file to generate.
MINDRECORD_FILE = "test1.mindrecord"

# Remove leftovers from a previous run. Each file is checked on its own so
# that a missing companion ".db" file does not raise FileNotFoundError.
for _stale_path in (MINDRECORD_FILE, MINDRECORD_FILE + ".db"):
    if os.path.exists(_stale_path):
        os.remove(_stale_path)

# Fields contained in every sample: "id" and "intent_ids" are declared
# without a "shape" entry, "input_ids" with "shape": [-1].
nlp_schema = {
    "id": {"type": "int32"},
    "input_ids": {"type": "int32", "shape": [-1]},
    "intent_ids": {"type": "int32"},
}
def get_data(input_ids, intent_ids):
    """Build the list of samples handed to ``FileWriter.write_raw_data``.

    Args:
        input_ids: Sequence of integer sequences, one per sample.
        intent_ids: Sequence of integer labels, one per sample; its length
            sets the number of samples produced.

    Returns:
        A list of dicts with keys ``"id"`` (sample index, Python int),
        ``"input_ids"`` (``np.int32`` array) and ``"intent_ids"``
        (the sample's label as a Python int).
    """
    data_list = []
    length = len(intent_ids)
    print("murongmeng_length", length)
    for i in range(length):
        data_json = {
            "id": i,
            "input_ids": np.array(input_ids[i]).astype(np.int32),
            # Store the label as a plain int, like "id", instead of a
            # numpy array.
            # NOTE: an earlier revision stored the loop index ``i`` here,
            # which recorded 0, 1, 2 instead of the actual labels.
            "intent_ids": int(intent_ids[i]),
        }
        data_list.append(data_json)
    print("murongmeng_data_list:", data_list)
    return data_list
# Declare the MindSpore Record writer (a single output shard) and register
# the sample schema together with a short description.
writer = record.FileWriter(file_name=MINDRECORD_FILE, shard_num=1)
writer.add_schema(nlp_schema, "preprocessed nlp dataset.")

# Build the samples, write them, then commit to finish the file.
data = get_data(input_ids, intent_ids)
writer.write_raw_data(data)
writer.commit()
执行结果:
(wys_op) root@ubuntu:/home/wys# python test.py
murongmeng_length 3
murongmeng_data_list: [{'id': 0, 'input_ids': array([1, 2, 3], dtype=int32), 'intent_ids': 0}, {'id': 1, 'input_ids': array([4, 5, 6], dtype=int32), 'intent_ids': 1}, {'id': 2, 'input_ids': array([7, 8, 9], dtype=int32), 'intent_ids': 2}]
(wys_op) root@ubuntu:/home/wys#

方法二:
在定义schema(nlp_schema)的时候,给intent_ids加上"shape": [-1],使其与input_ids一样被声明为数组字段,这样就可以写入np.array
代码:
import os
import numpy as np
import mindspore.mindrecord as record
# Sample data: three input_ids rows and one intent id per row.
input_ids = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
intent_ids = [1, 2, 3]

# Full path of the MindSpore Record file to generate.
MINDRECORD_FILE = "test1.mindrecord"

# Remove leftovers from a previous run. Each file is checked on its own so
# that a missing companion ".db" file does not raise FileNotFoundError.
for _stale_path in (MINDRECORD_FILE, MINDRECORD_FILE + ".db"):
    if os.path.exists(_stale_path):
        os.remove(_stale_path)

# Fields contained in every sample. "intent_ids" is declared with
# "shape": [-1], the same way as "input_ids".
nlp_schema = {
    "id": {"type": "int32"},
    "input_ids": {"type": "int32", "shape": [-1]},
    "intent_ids": {"type": "int32", "shape": [-1]},
}
def get_data(input_ids, intent_ids):
    """Build the list of samples handed to ``FileWriter.write_raw_data``.

    Args:
        input_ids: Sequence of integer sequences, one per sample.
        intent_ids: Sequence of integer labels, one per sample; its length
            sets the number of samples produced.

    Returns:
        A list of dicts with keys ``"id"`` (sample index, Python int),
        ``"input_ids"`` (``np.int32`` array) and ``"intent_ids"``
        (``np.int32`` array built from the single label).
    """
    data_list = []
    length = len(intent_ids)
    print("murongmeng_length", length)
    for i in range(length):
        data_json = {
            "id": i,
            "input_ids": np.array(input_ids[i]).astype(np.int32),
            # Kept as a numpy array; the schema used with this variant
            # declares "intent_ids" with a "shape" entry.
            "intent_ids": np.array(intent_ids[i]).astype(np.int32),
        }
        data_list.append(data_json)
    print("murongmeng_data_list:", data_list)
    return data_list
# Declare the MindSpore Record writer (a single output shard) and register
# the sample schema together with a short description.
writer = record.FileWriter(file_name=MINDRECORD_FILE, shard_num=1)
writer.add_schema(nlp_schema, "preprocessed nlp dataset.")

# Build the samples, write them, then commit to finish the file.
data = get_data(input_ids, intent_ids)
writer.write_raw_data(data)
writer.commit()
执行结果:
(wys_op) root@ubuntu:/home/wys# python test.py
murongmeng_length 3
murongmeng_data_list: [{'id': 0, 'input_ids': array([1, 2, 3], dtype=int32), 'intent_ids': array(1, dtype=int32)}, {'id': 1, 'input_ids': array([4, 5, 6], dtype=int32), 'intent_ids': array(2, dtype=int32)}, {'id': 2, 'input_ids': array([7, 8, 9], dtype=int32), 'intent_ids': array(3, dtype=int32)}]
(wys_op) root@ubuntu:/home/wys#
