1 系统环境
硬件环境(Ascend/GPU/CPU): Ascend
MindSpore版本: mindspore=2.3.1
执行模式(PyNative/ Graph): Graph
Python版本: Python=3.9
操作系统平台: linux
2 报错信息
2.1 问题描述
使用如下代码运行速度太慢,使用torch+gpu版本运行只需要不到1秒的时间,改为用mindspore+npu需要十多秒。
2.2 脚本信息
# 测试脚本如下
from mindspore import nn
import math,time
import mindspore.ops.functional as mF
import mindspore as ms
from mindspore import jit
import torch
import numpy as np
from mindspore.profiler import Profiler
def merge(container, item):
    """Accumulate tensors along axis 0: return *item* when the accumulator is
    still empty (None), otherwise concatenate *item* onto *container*."""
    return item if container is None else ms.ops.cat((container, item), axis=0)
def backProcess(class_tokens, patch_tokens):
    """Score each image for anomalies by mutual patch-feature comparison.

    For each neighbourhood size (3, 5) the patch features are layer-normalised,
    aggregated over a local window (unfold + adaptive average pool) and
    L2-normalised; each image's patches are then scored by their minimum cdist
    to the patches of all other images, and an interval average of the smallest
    per-patch distances yields the anomaly score. The raw scores are finally
    refined through ``backFun``.

    Args:
        class_tokens: stacked class tokens, one per image (forwarded to backFun).
        patch_tokens: stacked patch tokens; shape[0] is the number of images N.
            # assumes remaining dims are (layers, B, L+1, C) — TODO confirm

    Returns:
        Refined per-image anomaly scores (output of ``backFun``).
    """
    anomaly_maps_r, N = None, patch_tokens.shape[0]
    # anomaly_maps_r, N = None, len(patch_tokens)
    B, L, C = patch_tokens[0].shape[1:] # 1, 324, 1024
    l2, L = int(math.sqrt(L - 1)), L - 1 # drop the one-slot cls token from L
    k_min, k_max = 0.,0.3
    # fractional k bounds are converted to absolute counts over the N-1 reference images
    if k_max < 1: k_max = int((N - 1) * k_max)
    if k_min < 1: k_min = int((N - 1) * k_min)
    # identity affine parameters so LayerNorm only normalises (gamma=1, beta=0)
    gamma = ms.ops.ones((C, l2, l2), ms.float16)
    beta = ms.ops.zeros((C, l2, l2), ms.float16)
    # nn.LayerNorm()
    layer_norm = ms.ops.LayerNorm(begin_norm_axis=1, begin_params_axis=1)
    for patchsize in [3,5]:
        Z_layers = None
        for patch_token in patch_tokens: # iterate over the layers (4)
            features_layers = []
            for feature in patch_token:
                feature = feature[:, 1:, :] # remove the cls token
                feature = feature.reshape(B, l2, l2, C)
                feature = feature.transpose((0, 3, 1, 2)) # B, C, l2, l2
                feature, _, _ = layer_norm(feature, gamma, beta)
                padding = int((patchsize - 1) / 2)
                unfolded_feature = ms.ops.unfold(feature, patchsize, 1, padding, 1)
                unfolded_feature = unfolded_feature.reshape(B, C, patchsize, patchsize, -1) # B,C,ps,ps,L
                feature = unfolded_feature.permute(0, 4, 1, 2, 3) # B,L,C,ps,ps,
                feature = feature.reshape(-1, 1, C * patchsize * patchsize) # B*L,1,C*ps*ps
                feature = ms.ops.adaptive_avg_pool1d(feature, C).squeeze(1) # B*L,C\
                features_layers.append(feature)
            features_layers = ms.ops.stack(features_layers, axis=1) # B*L,4,C
            features = features_layers.reshape(B, L, -1, C) # (B, L, 4, C)
            features /= features.norm(dim=-1, keepdim=True) # L2-normalise along C
            Z_layers = merge(Z_layers, features)
        anomaly_maps_l = None
        # print('121:',Z_layers.shape)
        for i in range(Z_layers.shape[2]):
            Z = Z_layers[:, :, i] # (N, L, C)
            anomaly_maps_msm = None
            # NOTE(review): the inner loop below reuses the name `i`, shadowing
            # the outer loop variable. Harmless in Python (the outer `for`
            # rebinds `i` each iteration) but worth renaming for readability.
            for i in range(N):
                Z_ref = ms.ops.cat((Z[:i], Z[i + 1:]), axis=0) # reference Z: all other images
                x1 = Z[i:i + 1].to(dtype=ms.float16) # 1,L,C
                x2 = Z_ref.reshape(-1, C).unsqueeze(0).to(dtype=ms.float16) # 1,(N-1)*L,C
                rscdist = ms.ops.cdist(x1, x2)
                # print('234:', rscdist.shape, (L, N - 1, L))
                patch2image = rscdist.reshape(L, N - 1, L)
                patch2image = mF.reduce_min(patch2image, -1)
                # interval average: mean of the [k_min, k_max) smallest distances
                vals, _ = ms.ops.topk(patch2image.float(), k_max, largest=False, sorted=True)
                vals, _ = ms.ops.topk(vals.float(), k_max - k_min, largest=True, sorted=True)
                anomaly_scores = ms.ops.mean(vals, axis=1).unsqueeze(0)
                anomaly_maps_msm = merge(anomaly_maps_msm, anomaly_scores)
            anomaly_maps_l = merge(anomaly_maps_l, anomaly_maps_msm.unsqueeze(0))
        anomaly_maps_l = ms.ops.mean(anomaly_maps_l, axis=0).unsqueeze(0)
        anomaly_maps_r = merge(anomaly_maps_r, anomaly_maps_l)
    anomaly_maps = ms.ops.mean(anomaly_maps_r, axis=0)
    ac_score, _ = ms.ops.max(anomaly_maps, axis=-1)
    scores_cls = backFun(ac_score, class_tokens, k_list=[1, 2, 3])
    return scores_cls
def backFun(scores_old, cls_tokens=None, k_list=None):
    """Refine anomaly scores by diffusing them over a class-token similarity graph.

    Builds a row-normalised affinity matrix ``P`` from pairwise class-token
    similarities, keeping only each row's k largest entries, and replaces each
    score with the affinity-weighted average of its neighbours' scores. The
    propagated scores are averaged over every k in ``k_list``.

    Args:
        scores_old: 1-D tensor of raw anomaly scores, one per image.
        cls_tokens: 2-D tensor of class tokens (one row per image), or None.
        k_list: neighbour counts to average over. ``None`` defaults to ``[0]``;
            if 0 is present (or cls_tokens is None) refinement is skipped and
            ``scores_old`` is returned unchanged.

    Returns:
        The refined scores, or ``scores_old`` when refinement is disabled.
    """
    # Avoid the mutable-default-argument pitfall; [0] preserves the original
    # "no-op" default behaviour.
    if k_list is None:
        k_list = [0]
    if cls_tokens is None or 0 in k_list:
        return scores_old
    cls_tokens = cls_tokens.float()
    # Min-max normalise the scores to [0, 1] before propagation.
    scores = (scores_old - scores_old.min()) / (scores_old.max() - scores_old.min())
    W = cls_tokens @ cls_tokens.T  # similarity_matrix (n x n)
    n = W.shape[-1]  # loop-invariant: hoisted out of the k loop
    S_list = []
    for k in k_list:
        # Zero out everything except each row's k largest similarities.
        _, topk_matrix = ms.ops.topk(W.float(), W.shape[0] - k, largest=False, sorted=True)
        W_mask = W.copy()
        for i in range(W.shape[0]): W_mask[i, topk_matrix[i]] = 0
        # D_ holds the inverse row sums, so P = D_ @ W_mask row-normalises the graph.
        D_ = ms.ops.zeros_like(W).float()
        for i in range(n): D_[i, i] = 1 / (W_mask[i, :].sum())
        P = D_ @ W_mask
        S = scores.copy().unsqueeze(-1)
        S = P @ S  # one propagation step over the neighbour graph
        S_list.append(S)
    # Average the propagated scores over all requested k values.
    scores_new = ms.ops.cat(S_list, -1).mean(-1)
    return scores_new
if __name__ == '__main__':
    # Reproduction driver: builds random fp16 inputs and times backProcess.
    # NOTE(review): t1 is taken BEFORE set_context, so the measured interval
    # includes backend initialisation and, in GRAPH_MODE, graph compilation —
    # which is the first-run overhead discussed in the root-cause section.
    t1 = time.time()
    ms.context.set_context(mode=ms.context.GRAPH_MODE, device_target="Ascend", device_id=0)
    patch_tokens_list = []
    class_token_list = []
    # NOTE(review): indentation was lost in the paste; the loop body below is
    # reconstructed as running the full pipeline 10 times (the list
    # comprehensions rebuild fresh inputs each iteration) — confirm against
    # the original script.
    for _ in range(10):
        # 10 images, each with 4 layers of (1, 325, 1024) fp16 patch tokens.
        patch_tokens_list = [ms.Tensor.from_numpy(np.random.randn(4, 1, 325, 1024).astype(np.float16)) for x in range(10)]
        class_token_list = [ms.Tensor.from_numpy(np.random.randn(768, ).astype(np.float16)) for x in range(10)]
        class_tokens = ms.ops.stack(class_token_list)
        patch_tokens = ms.ops.stack(patch_tokens_list)
        scores_cls = backProcess(class_tokens,patch_tokens)
    t2 = time.time()
    print(scores_cls)
    print('timeconsume:',t2-t1)
3 根因分析
这是因为设置成了静态图,静态图在一开始会有图编译相关操作的时间,在昇腾上可能还有一些初始化操作,这也会造成第一次执行时间的加长。
4 解决方案
如果是使用动态图,一般可以排除前面一两次的时间再观察执行时间,如果是静态图的话,要确保每次输入模型的shape是一致的,不然会触发重编译,每次都编译,速度就变慢了。