使用.om格式模型结合gradio框架进行推理出现模型执行错误

1 系统环境

硬件环境(Ascend/GPU/CPU): Ascend
MindSpore版本: mindspore=2.4.1
执行模式(PyNative/ Graph):不限
Python版本: Python=3.9
操作系统平台: Ubuntu 22.04

2 报错信息

2.1 问题描述

在香橙派上使用.om格式模型并结合gradio框架进行模型推理时,出现如下报错:

Execute model failed for acl.mdl.execute error 107004

2.2 脚本信息

资源初始化的几行代码:

# Module-level (outside the Gradio handler) resource setup: create and  
# initialise the ACL resources, then load the .om model once.  
acl_resource = AclLiteResource()  
acl_resource.init()  
# The model file is looked up in the current working directory.  
path = os.getcwd()  
model_path = os.path.join(path, "nested_unet.om")  
model = AclLiteModel(model_path)

若写在处理函数外,会导致该错误;若将其挪到gradio的处理函数内部,则脚本如下:

import os  
import time  
import cv2  
import numpy as np  
import matplotlib.pyplot as plt  
import gradio as gr  
import acl  
import acllite_utils as utils  
from acllite_model import AclLiteModel  
from acllite_resource import resource_list  
from src.deep_learning.utils import get_time  
    
class AclLiteResource:
    """Holder for the ACL device, context and stream used for inference.

    ``init`` acquires the resources; ``__del__`` releases them again when the
    object is garbage-collected.
    """

    def __init__(self, device_id=0):
        """Remember the device id; nothing is acquired until ``init``."""
        self.device_id = device_id
        self.context = self.stream = self.run_mode = None

    def init(self):
        """Initialise ACL, select the device and create context and stream.

        Every step after ``acl.init()`` is validated with
        ``utils.check_ret``.
        """
        print("init resource stage:")
        # The status returned by acl.init() is not checked here.
        # NOTE(review): presumably so that repeated initialisation does not
        # abort -- confirm against the pyACL documentation.
        acl.init()

        status = acl.rt.set_device(self.device_id)
        utils.check_ret("acl.rt.set_device", status)

        # Attributes are stored before the status check, so a handle that was
        # returned is still visible to __del__ if the check raises.
        self.context, status = acl.rt.create_context(self.device_id)
        utils.check_ret("acl.rt.create_context", status)

        self.stream, status = acl.rt.create_stream()
        utils.check_ret("acl.rt.create_stream", status)

        self.run_mode, status = acl.rt.get_run_mode()
        utils.check_ret("acl.rt.get_run_mode", status)

        print("Init resource success")

    def __del__(self):
        """Release registered resources, then stream, context and device."""
        print("acl resource release all resource")
        resource_list.destroy()

        stream = self.stream
        if stream:
            print("acl resource release stream")
            acl.rt.destroy_stream(stream)

        context = self.context
        if context:
            print("acl resource release context")
            acl.rt.destroy_context(context)

        print("Reset acl device ", self.device_id)
        acl.rt.reset_device(self.device_id)
        print("Release acl resource success")
    
def infer_ultrasound_image(image):
    """Segment an ultrasound image and draw the detected contours on it.

    The ACL resources and the ``nested_unet.om`` model are created inside
    this function on every call, so they are initialised in the same thread
    that later runs the inference.

    Args:
        image: Input image supplied by Gradio in RGB format; either a 2-D
            (grayscale) or a 3-D (height, width, channel) array.

    Returns:
        A 572x572 resized copy of the input with the contours of the
        post-processed segmentation mask drawn in colour (100, 255, 0).
    """
    # Gradio loads the image automatically and passes it in RGB format.
    start = time.time()

    # Resource setup moved in from module level (see the marker below).
    """挪进来的代码"""
    # acl_resource is not used again, but it stays referenced until the
    # function returns; its __del__ is what releases the ACL resources.
    acl_resource = AclLiteResource()
    acl_resource.init()
    path = os.getcwd()
    model_path = os.path.join(path, "nested_unet.om")
    model = AclLiteModel(model_path)

    copied_image = np.copy(image)
    image = cv2.resize(np.array(image), dsize=(256, 256))
    pixels = image.astype(np.float32)
    if len(image.shape) == 3:
        # HWC -> CHW
        channels_first = pixels.transpose((2, 0, 1))
    else:
        # Grayscale: replicate the single plane into three channels.
        # Fix: ``axis=0`` belongs to ``np.expand_dims`` below; ``np.tile``
        # has no ``axis`` parameter and raised TypeError here.
        channels_first = np.tile(pixels, reps=(3, 1, 1))
    # Add the batch dimension and scale pixel values from [0, 255] to [-1, 1].
    input_array = np.expand_dims(channels_first, axis=0) / 127.5 - 1

    result = model.execute([input_array, ])
    # Arg-max over axis 1 turns the model output into a 0/255 mask.
    # NOTE(review): assumes result[0] is (1, classes, 256, 256) -- confirm.
    output_as_numpy = np.argmax(result[0], axis=1).astype(np.uint8) * 255
    output_as_numpy = output_as_numpy.reshape(256, 256)

    # Morphological open then close with a 5x5 kernel to clean up the mask.
    kernel = np.ones((5, 5), np.uint8)
    opened_output = cv2.morphologyEx(output_as_numpy, cv2.MORPH_OPEN, kernel)
    processed_output = cv2.morphologyEx(opened_output, cv2.MORPH_CLOSE, kernel)

    resized_output = cv2.resize(processed_output, dsize=(572, 572))
    contours, hierarchy = cv2.findContours(resized_output, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
    res = cv2.drawContours(cv2.resize(copied_image, dsize=(572, 572)), contours, -1, (100, 255, 0), 1)
    end = time.time()
    print(get_time(start=start, end=end))
    return res
    
# Gradio widgets: one image input (the ultrasound picture) and one image
# output (the same picture with the detected contours drawn on it).
input_data = gr.Image(label='请输入甲状腺超声图像')
output_data = gr.Image(label="结节位置如下图所示")

# Wire the handler to the widgets and start the web UI.
iface = gr.Interface(
    fn=infer_ultrasound_image,
    inputs=input_data,
    outputs=output_data,
    title="基于UNet++的甲状腺超声结节区域检测器",
    description="选择甲状腺超声图像,通过图像分割和轮廓检测确定结节区域。",
)
iface.launch()

此时运行结果正常,能够在gradio构建的网页中返回预期图片,但每次调用处理函数都要重新初始化一次ACL资源和模型,导致性能大幅降低,比直接用checkpoint推理还要慢不少。

3 根因分析

推测gradio每次响应请求时是在不同的子线程中调用处理函数,而pyACL中子线程默认使用的是与主线程不同、且未经初始化的上下文(context),从而导致推理出错。

4 解决方案

需要在子线程(即处理函数)中手动将当前上下文设置为主线程初始化时创建的上下文,参考代码:

# Run inside the Gradio handler: make the calling thread use the context that
# acl_resource.init() created, if it is not already the current one.
context, _ = acl.rt.get_context()
if context != acl_resource.context:
    print("do set context...")
    # Fix: the pyACL call that binds a context to the current thread is
    # acl.rt.set_context; acl.rt.context is not a pyACL function.
    ret = acl.rt.set_context(acl_resource.context)
    utils.check_ret("acl.rt.set_context", ret)