使用ops.nonzero算子报错TypeError

1 系统环境

  • 硬件环境(Ascend/GPU/CPU): Ascend/GPU/CPU
  • MindSpore版本: mindspore=1.9.0
  • 执行模式(动态图):GRAPH/PYNATIVE
  • Python版本: Python=3.9
  • 操作系统平台: win10

2 报错信息

Traceback (most recent call last):
	File "D:lailrw.py", line 35,in <module>
		main()
	File "D:lailrw.py", line 30, in main
		(loss, logits, extra_output),inputs_gradient = grad_fn(x, y, labels)
	File "D: \python3.9\liblsite-packages\mindspore\commonlapi.py",line 594, in staging_specialize
		out = _MindsporeFunctionExecutor(func, hash_obj, input_signature,process_obj, jit_config)(*args)
	File "D: \python3.9\liblsite-packages\mindsporelcommonlapi.py", line 98, in wrapper
		results = fn(*arg, **kwargs)
	File "D:\python3.9\liblsite-packagles\mindsporelcommonlapi.py",line 405, in -_call_
		phase = self.compile(args_list, self.fn.-_name-
	File "D: \python3.9\liblsite-packages\mindspore\commonlapi.py",line 379, in compile
		is_compile = self._graph_executor.compile(self.fn, compile_args, phase, True)
TypeError: Type Join Failed: dtype1 = Float32, dtype2 = Intó4.
For more details, please refer to https://www.mindspore.cn/search?inputValue=Type%20Join%20Failed

Inner Message:
This: AbstractScalar(Type: FLoat32, VaLue: AnyValue, Shape: Noshape), other:AbstractScalar(Type: Intó4, Value: AnyValue, Shape: NoShape). Please check the node
The function call stack:
In file D:\python3.9\liblsite-packages\mindsporelops\compositelbase.py:527	return grad_(fn, weights)(*args)/

----------------------------------------------------------
- C++ Call Stack: (For frameworkdevelopers)
----------------------------------------------------------
mindspore\corelabstractlabstract_value.cc:65 TypeJoinLogging

2.1 问题描述

It raised TypeError: Type Join Failed: dtype1 = Float32, dtype2 = Int64.

2.2 脚本代码

import numpy as np
import mindspore as ms
import mindspore.ops as ops
from mindspore import Tensor, nn, context

class Graph(nn.Cell):
    def __init__(self):
        super().__init__()
        self.net = nn.Dense(10, 1)
    def construct(self, x, y):
        logits = self.net(x)
        extra_output = ops.nonzero(y > 0)
        return logits, extra_output

def main():
    context.set_context(mode=context.GRAPH_MODE)
    net = Graph()
    x = Tensor(np.random.randn(16, 10), dtype=ms.float32)
    y = Tensor([0, 0, 1, 1])
    labels = Tensor(np.random.randn(16, 1), dtype=ms.float32)
    logit, extra_output = net(x, y)
    assert extra_output.shape == (2, 1)
    loss_fn = nn.MSELoss()
    def forward(x, y, labels):
        logits, extra_output = net(x, y)
        loss = loss_fn(logits, labels)
        return loss, logits, extra_output
    grad_fn = ops.value_and_grad(forward, grad_position=None, weights=net.trainable_params(), has_aux=True)
    # raised TypeError here
    (loss, logits, extra_output), inputs_gradient = grad_fn(x, y, labels)
    assert extra_output.shape == (2, 1)

if __name__ == '__main__':
    main()

3 根因分析

  • value_and_grad 是同时计算网络的正向输出和梯度。Graph网络输出是一个tuple,(logits, extra_output)

    • logits是nn.Dense的输出结果,类型float32。
    • extra_output是ops.nonzero的输出结果,类型int64。
  • 梯度计算时由于MindSpore采用的是反向自动微分机制, 会对输出结果求和后再对输入求导。

    • 当输出类型不一致的情况下就会报错。

4 解决方案

  • 两种方法:
    • 1.把nonzero算子的返回值转换成float32,转换的算子是Cast。
    • 2.使用返回值是float32的算子。
  • 比如ops.count_nonzero
    • 第1种方案:
import numpy as np
import mindspore as ms
import mindspore.ops as ops
from mindspore import Tensor, nn, context

class Graph(nn.Cell):
    def __init__(self):
        super().__init__()
        self.net = nn.Dense(10, 1)
        self.cast = ops.Cast()
    def construct(self, x, y):
        logits = self.net(x)
        extra_output = ops.nonzero(y > 0)
        extra_output = self.cast(extra_output, ms.float32)
        return logits, extra_output
    
def main():
    context.set_context(mode=context.GRAPH_MODE)
    net = Graph()
    x = Tensor(np.random.randn(16, 10), dtype=ms.float32)
    y = Tensor([0, 0, 1, 1])
    labels = Tensor(np.random.randn(16, 1), dtype=ms.float32)
    logit, extra_output = net(x, y)
    print(extra_output.shape)
    print(extra_output)
    assert extra_output.shape == (2, 1)
    loss_fn = nn.MSELoss()
    def forward(x, y, labels):
        logits, extra_output = net(x, y)
        loss = loss_fn(logits, labels)
        return loss, logits, extra_output
    grad_fn = ops.value_and_grad(forward, grad_position=None, weights=net.trainable_params(), has_aux=True)
    # raised TypeError here
    (loss, logits, extra_output), inputs_gradient = grad_fn(x, y, labels)
    assert extra_output.shape == (2, 1)
    
if __name__ == '__main__':
    main()
  • 第2种方案:
import numpy as np
import mindspore as ms
import mindspore.ops as ops
from mindspore import Tensor, nn, context

class Graph(nn.Cell):
    def __init__(self):
        super().__init__()
        self.net = nn.Dense(10, 1)
    def construct(self, x, y):
        logits = self.net(x)
        extra_output = ops.count_nonzero(x=y, keep_dims=True, dtype=ms.float32)

        return logits, extra_output
    
def main():
    context.set_context(mode=context.GRAPH_MODE)
    net = Graph()
    x = Tensor(np.random.randn(16, 10), dtype=ms.float32)
    y = Tensor([0, 0, 1, 1])
    labels = Tensor(np.random.randn(16, 1), dtype=ms.float32)
    logit, extra_output = net(x, y)
    # assert extra_output.shape == (2, 1)
    loss_fn = nn.MSELoss()
    def forward(x, y, labels):
        logits, extra_output = net(x, y)
        loss = loss_fn(logits, labels)
        return loss, logits, extra_output
    grad_fn = ops.value_and_grad(forward, grad_position=None, weights=net.trainable_params(), has_aux=True)
    # raised TypeError here
    (loss, logits, extra_output), inputs_gradient = grad_fn(x, y, labels)
    # assert extra_output.shape == (2, 1)

if __name__ == '__main__':
    main()

最终可以正常执行。