---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[16], line 4
1 # for x in full_dataset.combined.create_dict_iterator():
2 # print(x)
3 # break
----> 4 for x in full_dataset.combined:
5 print(x)
6 break
File ~/anaconda3/envs/MindSpore/lib/python3.9/site-packages/mindspore/dataset/engine/iterators.py:152, in Iterator.__next__(self)
149 raise RuntimeError("Iterator does not have a running C++ pipeline.")
151 # Note offload is applied inside _get_next() if applicable since get_next converts to output format
--> 152 data = self._get_next()
153 if not data:
154 if self.__index == 0:
File ~/anaconda3/envs/MindSpore/lib/python3.9/site-packages/mindspore/dataset/engine/iterators.py:301, in TupleIterator._get_next(self)
293 """
294 Returns the next record in the dataset as a list
295
296 Returns:
297 List, the next record in the dataset.
298 """
300 if self.offload_model is None:
--> 301 return [self._transform_md_to_output(t) for t in self._iterator.GetNextAsList()]
302 data = [self._transform_md_to_tensor(t) for t in self._iterator.GetNextAsList()]
303 if data:
RuntimeError: Exception thrown from user defined Python function in dataset.
------------------------------------------------------------------
- Dataset Pipeline Error Message:
------------------------------------------------------------------
[ERROR] map operation: [PyFunc] failed. The corresponding data file is: /home/ma-user/work/dataset/ExDark/ExDark/images/People/2015_06418.jpg, ./temp_annotations/exdark_coco.json. Error description:
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (4,) + inhomogeneous part.
------------------------------------------------------------------
- C++ Call Stack: (For framework developers)
------------------------------------------------------------------
mindspore/ccsrc/minddata/dataset/engine/datasetops/map_op/map_job.h(57).
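
What the pipeline error actually reports: the Python function passed to map() returns (norm_img, low_img, (image, bbox, category_id, iscrowd)), and MindSpore converts each output column to a NumPy array. The third column is a 4-tuple of arrays with different shapes, which NumPy (>= 1.24) refuses to stack; that is the "(4,) + inhomogeneous part" in the message. A MindSpore-free reproduction:

import numpy as np
image = np.zeros((480, 640, 3), dtype=np.uint8)  # illustrative shapes
bbox = np.zeros((2, 4), dtype=np.float32)
category_id = np.zeros((2, 1), dtype=np.int32)
iscrowd = np.zeros((2, 1), dtype=np.int32)
np.array((image, bbox, category_id, iscrowd))    # raises the same ValueError
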
import os
import json

import numpy as np
from PIL import Image
from tqdm import tqdm

import mindspore.dataset as ds
import mindspore.dataset.vision as vision

# ================== ExDark dataset conversion ==================
def convert_exdark_to_coco(img_root, ann_root, output_path):
    """Convert ExDark's TXT annotations to COCO format."""
    # 1. Define the COCO structure
    coco_data = {
        "images": [],
        "annotations": [],
        "categories": [
            {"id": i + 1, "name": cls} for i, cls in enumerate([
                "Bicycle", "Boat", "Bottle", "Bus", "Car",
                "Cat", "Chair", "Cup", "Dog", "Motorbike", "People", "Table"
            ])
        ]
    }
    annotation_id = 1
    # 2. Recursively collect all image paths
    image_paths = []
    for cls_dir in os.listdir(img_root):
        cls_path = os.path.join(img_root, cls_dir)
        if os.path.isdir(cls_path):
            for img_file in os.listdir(cls_path):
                if img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
                    image_paths.append((cls_dir, img_file, os.path.join(cls_path, img_file)))
    # 3. Process each image
    for img_id, (cls_name, img_file, img_path) in enumerate(tqdm(image_paths), 1):
        # Read the image dimensions
        with Image.open(img_path) as img:
            width, height = img.size
        # Append to the images list
        coco_data["images"].append({
            "id": img_id,
            "file_name": os.path.join(cls_name, img_file),  # keep the class-relative path
            "width": width,
            "height": height
        })
        # 4. Parse the matching annotation file
        txt_path = os.path.join(ann_root, cls_name, img_file + ".txt")
        if not os.path.exists(txt_path):
            print('Warning: missing ' + txt_path)
            continue  # skip images without annotations
        with open(txt_path, 'r') as f:
            lines = f.readlines()
        for line in lines[1:]:  # skip the header line
            if not line.strip():
                continue
            parts = line.split()
            if len(parts) < 5:
                continue
            class_name = parts[0]
            x, y, w, h = map(float, parts[1:5])  # extract the coordinates
            # Append to the annotations list
            coco_data["annotations"].append({
                "id": annotation_id,
                "image_id": img_id,
                "category_id": next(c["id"] for c in coco_data["categories"] if c["name"] == class_name),
                "bbox": [x, y, w, h],  # [x_top_left, y_top_left, width, height]
                "area": w * h,
                "iscrowd": 0
            })
            annotation_id += 1
    # 5. Write the COCO-format JSON
    with open(output_path, 'w') as f:
        json.dump(coco_data, f, indent=2)
    return output_path
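
# The parser above assumes ExDark's bbGt-style annotation layout: the first
# line is a header and each following line is one object, with fields beyond
# the first five ignored. Illustrative example (field values are made up):
#   % bbGt version=3
#   People 204 28 271 193 0 0 0 0 0 0 0
#   <class> <x> <y> <w> <h> ...
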
# ================== Dataset processor class ==================
class DualPathProcessor:
    """Dual-path data processor (normal-light + low-light)."""

    def __init__(self, img_size=640):
        self.resize = vision.Resize((img_size, img_size))
        self.hflip = vision.RandomHorizontalFlip(prob=0.5)
        self.color_jitter = vision.RandomColorAdjust(
            brightness=0.2, contrast=0.2, saturation=0.2
        )
        self.to_tensor = vision.ToTensor()  # currently unused

    def dark_isp(self, image):
        """Synthesize a low-light image."""
        gamma = np.random.uniform(1.8, 3.0)
        noise_sigma = np.random.uniform(0.03, 0.1)
        # Brightness attenuation via a gamma curve
        low_light = np.power(image.astype(np.float32) / 255.0, gamma) * 255
        # Add sensor noise
        noise = np.random.randn(*image.shape) * noise_sigma * 255
        result = np.clip(low_light + noise, 0, 255).astype(np.uint8)
        return result
    def __call__(self, image, bbox, category_id, iscrowd):
        print(f'{image.shape=}, {bbox.shape=}, {category_id.shape=}, {iscrowd.shape=}')
        # Normal-light path with augmentation
        norm_img = self.color_jitter(self.hflip(self.resize(image)))
        # Synthetic low-light path
        low_img = self.resize(self.dark_isp(image))
        print(f'{norm_img.shape=}, {low_img.shape=}')
        # Return each annotation array as its own column. Packing
        # (image, bbox, category_id, iscrowd) into a single tuple makes
        # MindSpore build a ragged NumPy array for that column, which is
        # the ValueError in the traceback above. Note that bbox is not
        # rescaled to match the resized image; scale it here if the boxes
        # are consumed downstream.
        return norm_img, low_img, bbox, category_id, iscrowd
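
# Eager sanity check of the processor on a dummy frame (MindSpore vision
# transforms can also run eagerly on NumPy arrays; shapes are illustrative):
_proc = DualPathProcessor(img_size=640)
_outs = _proc(np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8),
              np.zeros((1, 4), np.float32),
              np.zeros((1, 1), np.int32),
              np.zeros((1, 1), np.int32))
print('processor output shapes:', [o.shape for o in _outs])
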
# ================== Dataset loader class ==================
class MultiDatasetLoader:
    """Unified loader for multiple datasets."""

    @staticmethod
    def merge_annotations(image, bbox, category_id, iscrowd):
        # Merging bbox, category_id and iscrowd into one tuple triggers the
        # same ragged-array ValueError; kept only for reference.
        annotation = (bbox, category_id, iscrowd)
        return image, annotation

    def __init__(self, dataset_dirs, img_size=640):
        """
        dataset_dirs: dict {
            'coco': {'img_dir': ..., 'ann_file': ...},
            'exdark': {'img_dir': ..., 'ann_dir': ...},
            'darkface': {'img_dir': ..., 'ann_file': ...},
            'lolv2': {'img_dir': ...}
        }
        """
        self.datasets = []
        self.processor = DualPathProcessor(img_size)
        # Temporary directory for the converted annotations
        self.temp_dir = "./temp_annotations"
        os.makedirs(self.temp_dir, exist_ok=True)
        # Load COCO (primary normal-light data)
        if 'coco' in dataset_dirs:
            print("Loading COCO dataset...")
            coco_data = ds.CocoDataset(
                dataset_dir=dataset_dirs['coco']['img_dir'],
                annotation_file=dataset_dirs['coco']['ann_file'],
                task='Detection',
                decode=True,
                shuffle=True,
                num_parallel_workers=100  # must not exceed the host's CPU count
            )
            print(f'{coco_data.get_col_names()=}')
            # coco_data = coco_data.map(operations=self.merge_annotations,
            #                           input_columns=["image", "bbox", "category_id", "iscrowd"],
            #                           output_columns=["image", "annotation"],
            #                           num_parallel_workers=100)
            # print(f'{coco_data.get_col_names()=}')
            # coco_data = coco_data.map(operations=self.processor,
            #                           input_columns=["image", "bbox", "category_id", "iscrowd"],
            #                           output_columns=["norm_img", "low_img", "annotation"],
            #                           num_parallel_workers=100,
            #                           python_multiprocessing=True)
            self.datasets.append(coco_data)
        # Load ExDark (converted to COCO format)
        if 'exdark' in dataset_dirs:
            print("Converting and loading ExDark dataset...")
            # Convert the annotation format
            exdark_coco = os.path.join(self.temp_dir, "exdark_coco.json")
            convert_exdark_to_coco(
                dataset_dirs['exdark']['img_dir'],
                dataset_dirs['exdark']['ann_dir'],
                exdark_coco
            )
            print(f'{exdark_coco=}')
            # Load the converted dataset
            exdark_data = ds.CocoDataset(
                dataset_dir=dataset_dirs['exdark']['img_dir'],
                annotation_file=exdark_coco,
                task='Detection',
                decode=True,
                num_parallel_workers=100
            )
            print(f'{exdark_data.get_col_names()=}')
            # exdark_data = exdark_data.map(operations=self.merge_annotations,
            #                               input_columns=["image", "bbox", "category_id", "iscrowd"],
            #                               output_columns=["image", "annotation"],
            #                               num_parallel_workers=100)
            # print(f'{exdark_data.get_col_names()=}')
            # exdark_data = exdark_data.map(operations=self.processor,
            #                               input_columns=["image", "bbox", "category_id", "iscrowd"],
            #                               output_columns=["norm_img", "low_img", "annotation"],
            #                               num_parallel_workers=100)
            # print(f'{exdark_data.get_col_names()=}')
            self.datasets.append(exdark_data)
        # Load DARK FACE (custom format)
        # if 'darkface' in dataset_dirs:
        #     print("Loading DARK FACE dataset...")
        #     # 1. Load the ground-truth annotations
        #     def load_darkface_annotations(img_path):
        #         """Parse a DarkFace annotation file."""
        #         # Build the annotation path (assumes the same basename with a .txt extension)
        #         ann_path = img_path.replace('image', 'label').replace('.jpg', '.txt')
        #         annotations = []
        #         if os.path.exists(ann_path):
        #             with open(ann_path, 'r') as f:
        #                 for line in f.readlines():
        #                     if not line.strip():
        #                         continue
        #                     # Assumed format: class_id x_min y_min x_max y_max
        #                     parts = line.split()
        #                     if len(parts) == 1:
        #                         class_id = int(parts[0])
        #                     if len(parts) >= 4:
        #                         bbox = list(map(float, parts[0:4]))
        #                         annotations.append(bbox)
        #         return np.array(annotations, dtype=np.float32) if annotations else np.zeros((0, 5), dtype=np.float32)
        #     # 2. Create the data generator
        #     def darkface_generator():
        #         """DarkFace data generator."""
        #         img_dir = dataset_dirs['darkface']['img_dir']
        #         for img_file in os.listdir(img_dir):
        #             if not img_file.lower().endswith('.jpg'):
        #                 continue
        #             img_path = os.path.join(img_dir, img_file)
        #             # Load the raw image bytes
        #             with open(img_path, 'rb') as f:
        #                 img_data = np.frombuffer(f.read(), dtype=np.uint8)
        #             # Load the ground-truth annotations
        #             annotations = load_darkface_annotations(img_path)
        #             yield img_data, annotations
        #     # 3. Build the dataset
        #     darkface_data = ds.GeneratorDataset(
        #         source=darkface_generator(),
        #         column_names=["image", "annotation"],
        #         num_parallel_workers=100
        #     )
        #     # 4. Decode the images
        #     darkface_data = darkface_data.map(
        #         operations=vision.Decode(),
        #         input_columns=["image"],
        #         num_parallel_workers=100
        #     )
        #     # 5. Apply the processor
        #     darkface_data = darkface_data.map(
        #         operations=self.processor,
        #         input_columns=["image", "annotation"],
        #         output_columns=["norm_img", "low_img", "annotation"],
        #         num_parallel_workers=100
        #     )
        #     print(f'{darkface_data.get_datasets()=}')
        #     self.datasets.append(darkface_data)
        # Load LOL-v2 (paired data)
        if 'lolv2' in dataset_dirs:
            print("Loading LOL-v2 dataset...")
            lol_data = ds.GeneratorDataset(
                source=self.lolv2_generator(dataset_dirs['lolv2']['img_dir']),
                column_names=["image", "annotation"],
                num_parallel_workers=100
            )
            print(f'{lol_data.get_col_names()=}')
            print(f'{next(lol_data.create_tuple_iterator())}')
            # Apply the processor. Caution: DualPathProcessor expects four
            # input columns, so this two-column map would fail if 'lolv2'
            # were enabled in the config below.
            lol_data = lol_data.map(
                operations=self.processor,
                input_columns=["image", "annotation"],
                output_columns=["norm_img", "low_img", "annotation"],
                num_parallel_workers=100
            )
            self.datasets.append(lol_data)
        # Merge the datasets
        if self.datasets:
            self.combined = self.datasets[0]
            for i in range(1, len(self.datasets)):
                self.combined = self.combined + self.datasets[i]
            self.combined.use_sampler(ds.RandomSampler())
            # One output column per array returned by DualPathProcessor.__call__
            self.combined = self.combined.map(operations=self.processor,
                                              input_columns=["image", "bbox", "category_id", "iscrowd"],
                                              output_columns=["norm_img", "low_img", "bbox", "category_id", "iscrowd"],
                                              num_parallel_workers=100)
            # self.combined = self.combined.batch(1, drop_remainder=True,
            #                                     num_parallel_workers=100)
        else:
            raise ValueError("No datasets loaded")

    def lolv2_generator(self, data_dir):
        """LOL-v2 data generator (yields paired images)."""
        # Collect all low-light image paths
        low_light_files = []
        for root, _, files in os.walk(data_dir):
            for f in files:
                if f.lower().endswith(('.jpg', '.jpeg', '.png')) and 'low' in f.lower():
                    low_light_files.append(os.path.join(root, f))
        for low_path in low_light_files:
            # Find the paired normal-light image
            norm_path = low_path.replace('low', 'high').replace('LOW', 'HIGH')
            if not os.path.exists(norm_path):
                continue
            # Load both images
            with open(norm_path, 'rb') as fp:
                norm_img = vision.decode_image(fp.read())
            with open(low_path, 'rb') as fp:
                low_img = vision.decode_image(fp.read())
            # Yield a dummy annotation; low_img is decoded but unused because
            # DualPathProcessor synthesizes its own low-light view.
            yield norm_img, np.array([[0, 0, 1, 1, 0]], dtype=np.float32)
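
# Note on the pairing above: the 'low' -> 'high' substitution and the
# "'low' in f.lower()" filter both look at the file name. With a layout like
# LOLdataset/our485/low/1.png (where the folder, not the file name, carries
# 'low'), no files would be collected; adjust the filter to your copy.
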
# ================== NPU training configuration ==================
def configure_for_npu(dataset):
    """NPU-specific pipeline tuning."""
    # Enable automatic pipeline offload
    ds.config.set_auto_offload(True)
    ds.config.set_enable_shared_mem(False)  # avoid shared-memory conflicts
    ds.config.set_numa_enable(True)
    if not hasattr(dataset, 'prefetch'):
        # MindSpore Dataset objects expose no per-dataset prefetch() method,
        # so this fallback branch is the one actually taken.
        print("Warning: this dataset type does not support prefetch; falling back")
        # dataset = dataset.repeat(1)
        return dataset
    # Tune the data queue (prevents NPU starvation)
    dataset = dataset.prefetch(buffer_size=64,
                               num_parallel_workers=100)
    dataset = dataset.repeat(1)
    return dataset
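
# The MindSpore analogue of a per-dataset prefetch buffer is this global
# pipeline setting (assumption: available in this MindSpore build):
ds.config.set_prefetch_size(64)
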
# # ================== Main execution flow ==================
# if __name__ == "__main__":
dspath = '/home/ma-user/work/dataset'  # dataset root; matches the paths in the traceback above
dataset_config = {
    'coco': {
        'img_dir': dspath + '/coco2017/val2017',
        'ann_file': dspath + '/coco2017/annotations/instances_val2017.json'
    },
    'exdark': {
        'img_dir': dspath + '/ExDark/ExDark/images',
        'ann_dir': dspath + '/ExDark/ExDark/Annnotations'
    },
    # 'darkface': {
    #     'img_dir': dspath + '/DarkFace_Train_2021/image'
    # },
    # 'lolv2': {
    #     'img_dir': dspath + '/LOLdataset/'
    # }
}

# Initialize the dataset loader
print("Initializing dataset loader...")
full_dataset = MultiDatasetLoader(dataset_config)
print(f'{full_dataset=}')
print(f'{full_dataset.combined=}')
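# Each sample now yields five columns: norm_img, low_img, bbox,
# category_id, iscrowd.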
for x in full_dataset.combined:
print(x)
break