使用MindSpore静态图速度慢的问题

1 系统环境

硬件环境(Ascend/GPU/CPU): Ascend
MindSpore版本: mindspore=2.3.1
执行模式(PyNative/ Graph): Graph
Python版本: Python=3.9
操作系统平台: linux

2 报错信息

2.1 问题描述

使用如下代码运行速度太慢,使用torch+gpu版本运行只需要不到1秒的时间,改为用mindspore+npu需要十多秒。

2.2 脚本信息

# 测试脚本如下 
from mindspore import nn  
import math,time  
import mindspore.ops.functional as mF  
import mindspore as ms  
from mindspore import jit  
import torch  
import numpy as np  
from mindspore.profiler import Profiler  
    
    
def merge(container, item):
    """Append `item` to `container` along axis 0, seeding with `item` when empty."""
    if container is not None:
        return ms.ops.cat((container, item), axis=0)
    return item
    
    
def backProcess(class_tokens, patch_tokens):
    """Compute per-image anomaly scores by mutual nearest-patch comparison.

    For each patch size (3 and 5), every image's patch features are compared
    (via cdist) against the patches of all other images; the mean of an
    interval of the smallest distances becomes the patch's anomaly score.
    The per-image maximum is finally refined through `backFun`.

    Args:
        class_tokens: stacked cls tokens, passed through to `backFun`.
        patch_tokens: stacked per-image features; assumes each
            `patch_tokens[i]` has shape (layers, B, L+1, C) — TODO confirm
            against the caller (main builds (10, 4, 1, 325, 1024)).

    Returns:
        Refined anomaly scores from `backFun` (one score per image).
    """
    anomaly_maps_r, N = None, patch_tokens.shape[0]  # N = number of images
    # anomaly_maps_r, N = None, len(patch_tokens)
    B, L, C = patch_tokens[0].shape[1:]  # 1, 324, 1024
    l2, L = int(math.sqrt(L - 1)), L - 1  # drop the 1 cls token from L; l2 = spatial side length
    k_min, k_max = 0.,0.3
    # fractional k's are converted to absolute counts over the N-1 reference images
    if k_max < 1: k_max = int((N - 1) * k_max)
    if k_min < 1: k_min = int((N - 1) * k_min)
    gamma = ms.ops.ones((C, l2, l2), ms.float16)
    beta = ms.ops.zeros((C, l2, l2), ms.float16)
    # nn.LayerNorm()
    layer_norm = ms.ops.LayerNorm(begin_norm_axis=1, begin_params_axis=1)
    for patchsize in [3,5]:
        Z_layers = None
        for patch_token in patch_tokens:  # iterate over the images (axis 0)
            features_layers = []
            for feature in patch_token:  # iterate over the layers (4)
                feature = feature[:, 1:, :]  # remove the cls token
                feature = feature.reshape(B, l2, l2, C)
                feature = feature.transpose((0, 3, 1, 2))  # B, C, l2, l2
                feature, _, _ = layer_norm(feature, gamma, beta)
                padding = int((patchsize - 1) / 2)
                # sliding-window neighborhood extraction, then pool back to C dims
                unfolded_feature = ms.ops.unfold(feature, patchsize, 1, padding, 1)
                unfolded_feature = unfolded_feature.reshape(B, C, patchsize, patchsize, -1)  # B,C,ps,ps,L
                feature = unfolded_feature.permute(0, 4, 1, 2, 3)  # B,L,C,ps,ps,
                feature = feature.reshape(-1, 1, C * patchsize * patchsize)  # B*L,1,C*ps*ps
                feature = ms.ops.adaptive_avg_pool1d(feature, C).squeeze(1)  # B*L,C\
                features_layers.append(feature)
            features_layers = ms.ops.stack(features_layers, axis=1)  # B*L,4,C
            features = features_layers.reshape(B, L, -1, C)  # (B, L, 4, C)
            features /= features.norm(dim=-1, keepdim=True)  # L2-normalize per patch
            Z_layers = merge(Z_layers, features)
        anomaly_maps_l = None
        # print('121:',Z_layers.shape)
        for i in range(Z_layers.shape[2]):  # per layer
            Z = Z_layers[:, :, i]  # (N, L, C)
            anomaly_maps_msm = None
            # NOTE(review): the inner loop reuses the name `i`, shadowing the
            # layer index above. Harmless here (the outer `i` is only read
            # before this loop) but worth renaming to avoid confusion.
            for i in range(N):
                Z_ref = ms.ops.cat((Z[:i], Z[i + 1:]), axis=0)  # reference set: all images except i
                x1 = Z[i:i + 1].to(dtype=ms.float16)  # 1,L,C
                x2 = Z_ref.reshape(-1, C).unsqueeze(0).to(dtype=ms.float16)  # 1,(N-1)*L,C
                rscdist = ms.ops.cdist(x1, x2)
                # print('234:', rscdist.shape, (L, N - 1, L))
                patch2image = rscdist.reshape(L, N - 1, L)
                patch2image = mF.reduce_min(patch2image, -1)  # nearest patch per reference image
                # interval average: mean of the [k_min, k_max) smallest distances
                vals, _ = ms.ops.topk(patch2image.float(), k_max, largest=False, sorted=True)
                vals, _ = ms.ops.topk(vals.float(), k_max - k_min, largest=True, sorted=True)
                anomaly_scores = ms.ops.mean(vals, axis=1).unsqueeze(0)
                anomaly_maps_msm = merge(anomaly_maps_msm, anomaly_scores)
            anomaly_maps_l = merge(anomaly_maps_l, anomaly_maps_msm.unsqueeze(0))
        anomaly_maps_l = ms.ops.mean(anomaly_maps_l, axis=0).unsqueeze(0)  # average over layers
        anomaly_maps_r = merge(anomaly_maps_r, anomaly_maps_l)
    anomaly_maps = ms.ops.mean(anomaly_maps_r, axis=0)  # average over patch sizes
    ac_score, _ = ms.ops.max(anomaly_maps, axis=-1)  # per-image max patch score
    scores_cls = backFun(ac_score, class_tokens, k_list=[1, 2, 3])
    return scores_cls
    
    
def backFun(scores_old, cls_tokens=None, k_list=(0,)):
    """Refine anomaly scores by propagating them over a cls-token similarity graph.

    For each k in `k_list`, only each row's top-k similarities are kept, the
    resulting matrix is row-normalized into a propagation matrix P, and the
    min-max-normalized scores are smoothed once via P @ S. The results for all
    k are averaged.

    Args:
        scores_old: raw per-image anomaly scores (1-D tensor of length n).
        cls_tokens: stacked cls tokens, shape (n, C); if None the raw scores
            are returned unchanged.
        k_list: neighborhood sizes; containing 0 disables propagation.
            (Default changed from the mutable `[0]` to an equivalent immutable
            tuple — mutable default arguments are a Python pitfall.)

    Returns:
        Refined scores, or `scores_old` when propagation is disabled.
    """
    if cls_tokens is None or 0 in k_list:
        return scores_old
    cls_tokens = cls_tokens.float()
    # min-max normalize scores into [0, 1]
    scores = (scores_old - scores_old.min()) / (scores_old.max() - scores_old.min())
    W = cls_tokens @ cls_tokens.T  # similarity_matrix
    S_list = []
    for k in k_list:
        # zero out everything except each row's k largest similarities
        _, topk_matrix = ms.ops.topk(W.float(), W.shape[0] - k, largest=False, sorted=True)
        W_mask = W.copy()
        for i in range(W.shape[0]):
            W_mask[i, topk_matrix[i]] = 0
        n = W.shape[-1]
        # D_ is the inverse degree matrix used for row normalization.
        # NOTE(review): a row of W_mask summing to 0 would divide by zero —
        # presumably k >= 1 keeps the self-similarity entry; confirm.
        D_ = ms.ops.zeros_like(W).float()
        for i in range(n):
            D_[i, i] = 1 / (W_mask[i, :].sum())
        P = D_ @ W_mask  # row-stochastic propagation matrix
        S = scores.copy().unsqueeze(-1)
        S = P @ S  # one propagation step
        S_list.append(S)
    scores_new = ms.ops.cat(S_list, -1).mean(-1)  # average over all k
    return scores_new
    
    
if __name__ == '__main__':
    t1 = time.time()
    ms.context.set_context(mode=ms.context.GRAPH_MODE, device_target="Ascend", device_id=0)
    # Build the random inputs once. The original script wrapped this in
    # `for _ in range(10)` which rebuilt identical-shaped tensors ten times
    # and discarded all but the last set; the two initial empty-list
    # assignments were likewise dead.
    patch_tokens_list = [ms.Tensor.from_numpy(np.random.randn(4, 1, 325, 1024).astype(np.float16)) for _ in range(10)]
    class_token_list = [ms.Tensor.from_numpy(np.random.randn(768,).astype(np.float16)) for _ in range(10)]
    class_tokens = ms.ops.stack(class_token_list)
    patch_tokens = ms.ops.stack(patch_tokens_list)
    scores_cls = backProcess(class_tokens, patch_tokens)
    t2 = time.time()
    print(scores_cls)
    # total time includes graph compilation on the first run (see root-cause analysis)
    print('timeconsume:', t2 - t1)

3 根因分析

这是因为设置成了静态图:静态图在第一次执行前需要进行图编译,在昇腾(Ascend)上还可能有额外的初始化操作,这些都会使首次执行的时间明显变长。

4 解决方案

如果使用的是动态图,一般可以排除前面一两次执行的时间后再观察;如果是静态图,则要确保每次输入模型的 shape 一致,否则会触发重新编译——每次执行都重新编译,速度自然就变慢了。