From 02d74da9ae920ad3a07e5f064c8acd9399cd19f9 Mon Sep 17 00:00:00 2001
From: DaiFu
Date: Thu, 3 Jul 2025 20:26:35 +0800
Subject: [PATCH] add ut

---
 test/_inductor/test_add.py               | 10 ++
 test/_inductor/test_add_layernorm.py     | 29 ++++++
 test/_inductor/test_argmax_unalign.py    | 15 ++-
 test/_inductor/test_attncp.py            | 89 +++++++++++++++++++
 test/_inductor/test_broadcast.py         | 14 ++-
 test/_inductor/test_broadcast_permute.py | 37 ++++++++
 test/_inductor/test_deberta.py           | 95 ++++++++++++++++++++
 test/_inductor/test_issue61.py           | 30 ++++++
 test/_inductor/test_mamba.py             | 32 ++++++
 test/_inductor/test_permute.py           | 32 ++++++
 test/_inductor/test_permute_reshape.py   | 28 ++++++
 test/_inductor/test_reduction.py         | 34 ++++++
 test/_inductor/test_repeat.py            | 11 ++-
 test/_inductor/test_reshape_permute.py   | 29 ++++++
 test/_inductor/test_softmax.py           | 53 +++++++++++
 test/_inductor/test_store_permute.py     | 29 ++++++
 16 files changed, 559 insertions(+), 8 deletions(-)
 create mode 100644 test/_inductor/test_add_layernorm.py
 create mode 100644 test/_inductor/test_broadcast_permute.py
 create mode 100644 test/_inductor/test_deberta.py
 create mode 100644 test/_inductor/test_issue61.py
 create mode 100644 test/_inductor/test_mamba.py
 create mode 100644 test/_inductor/test_permute_reshape.py
 create mode 100644 test/_inductor/test_reduction.py
 create mode 100644 test/_inductor/test_reshape_permute.py
 create mode 100644 test/_inductor/test_softmax.py
 create mode 100644 test/_inductor/test_store_permute.py

diff --git a/test/_inductor/test_add.py b/test/_inductor/test_add.py
index f34078e105..182787656d 100644
--- a/test/_inductor/test_add.py
+++ b/test/_inductor/test_add.py
@@ -22,6 +22,16 @@ class TestAdd(TestUtils):
 
         self.assertEqual(std_sum, inductor_sum)
 
+    @parametrize('shape', [(8, 8, 1024, 2048), (8, 8, 2048, 1024), (8, 1024, 2048, 8), (2048, 1024, 8, 8),
+                           (2048, 8, 1024, 8), (8, 2048, 8, 1024)])
+    def test_pointwise(self, shape):
+        a = self._generate_tensor(shape, 'float32')
+        b = self._generate_tensor(shape, 'float32')
+        r = self.op_calc(a, b)
+        func = torch.compile(self.op_calc, backend="inductor")
+        r1 = func(a, b)
+        self.assertEqual(r, r1, rtol=1e-3, atol=1e-3)
+
 
 instantiate_parametrized_tests(TestAdd)
 
diff --git a/test/_inductor/test_add_layernorm.py b/test/_inductor/test_add_layernorm.py
new file mode 100644
index 0000000000..e1ae481b4f
--- /dev/null
+++ b/test/_inductor/test_add_layernorm.py
@@ -0,0 +1,29 @@
+import torch
+from torch.testing._internal.common_utils import run_tests
+from testutils import TestUtils
+import torch_npu
+
+
+class TestAddLayerNorm(TestUtils):
+    def add_LayerNorm(self, a, b):
+        x = a + b
+        mean = torch.mean(x, dim=2, keepdim=True)
+        var = torch.mean((x - mean) ** 2, dim=2, keepdim=True) + 1e-5
+        y = (x - mean) / torch.sqrt(var)
+        return y
+
+    def test_add_layernorm(self):
+        Z = 64
+        X = 512
+        Y = 256
+        hidden_states = torch.randn((Z, X, Y), dtype=torch.float32).npu()
+        add_layer = torch.randn((Z, X, Y), dtype=torch.float32).npu()
+
+        eager_output = self.add_LayerNorm(hidden_states, add_layer)
+        comp_func = torch.compile(self.add_LayerNorm, backend="inductor", dynamic=False)
+        output = comp_func(hidden_states, add_layer)
+        self.assertEqual(eager_output, output, rtol=1e-4, atol=1e-4)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_argmax_unalign.py b/test/_inductor/test_argmax_unalign.py
index 34baef1ba1..c3bce7d40d 100644
--- a/test/_inductor/test_argmax_unalign.py
+++ b/test/_inductor/test_argmax_unalign.py
@@ -8,16 +8,27 @@
 class TestMaxWithIndex(TestUtils):
     def op_calc(self, input_element, dim):
         return torch.argmax(input_element, dim)
-    @parametrize('shape', [(512, 64)])  # (513, 64), (514,33)
+    @parametrize('shape', [(512, 64)])
     @parametrize('dim', [-1])
     @parametrize('dtype', ['float32'])
     def test_reduction_cases(self, shape, dim, dtype):
-        input_element = torch.randn(size=shape, dtype=eval('torch.' + dtype), device=torch.device("npu")) * 2000
+        input_element = self._generate_tensor(shape, dtype) * 2000
         std_argmax = self.op_calc(input_element, dim)
         compiled_op_calc = torch.compile(self.op_calc, backend="inductor", dynamic=False)
         inductor_argmax = compiled_op_calc(input_element, dim)
         self.assertEqual(std_argmax, inductor_argmax, atol=1e-2, rtol=1e-2)
 
+    @parametrize('shape', [(513, 64)])
+    @parametrize('dim', [-1])
+    @parametrize('dtype', ['float32', 'int64'])
+    def test_reduction_cases1(self, shape, dim, dtype):
+        input_element = self._generate_tensor(shape, dtype) * 2000
+
+        std_argmax = self.op_calc(input_element, dim)
+        compiled_op_calc = torch.compile(self.op_calc, backend="inductor")
+        inductor_argmax = compiled_op_calc(input_element, dim)
+        self.assertEqual(std_argmax, inductor_argmax, rtol=1e-2, atol=1e-2)
+
 instantiate_parametrized_tests(TestMaxWithIndex)
 
 if __name__ == "__main__":
diff --git a/test/_inductor/test_attncp.py b/test/_inductor/test_attncp.py
index 966ecc855f..1828ce872c 100644
--- a/test/_inductor/test_attncp.py
+++ b/test/_inductor/test_attncp.py
@@ -1,5 +1,8 @@
 import torch
-from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests
+from torch.testing._internal.common_utils import run_tests
+from einops import rearrange
 from testutils import TestUtils
 import torch_npu
 
+F32_BLK_SIZE = 8  # float32 elements per 32-byte block; matches the [b, n, s, 8] softmax max/sum layout
+
@@ -16,8 +19,7 @@ class TestAttnCp(TestUtils):
         y = c + y.permute(0, 1, 3, 2)
         return y
 
-
-    def test_pointwise_cases(self):
+    def test_basic_pointwise_cases(self):
         a, b = [torch.randn(self.shape, dtype=torch.float32, device="npu") for _ in range(2)]
         d = torch.randn(self.shape, dtype=torch.float32, device="npu")
         c = d.permute(0, 1, 3, 2).contiguous()
@@ -26,7 +28,86 @@ class TestAttnCp(TestUtils):
         r1 = self.foo(a, b, c)
         self.assertEqual(r, r1, atol=1e-3, rtol=1e-3)
 
-instantiate_parametrized_tests(TestAttnCp)
+
+    def trans_BNSD2SBH(self, x):
+        """Trans data layout from BNSD to SBH"""
+        return rearrange(x, 'b n s d -> s b (n d)').contiguous()
+
+    def broadcast_and_trans_BNSD2SBH(self, x, h):
+        """Broadcast and trans a tensor from [b, n, s, 8] to [s, b, h]"""
+        n = x.shape[1]
+        d = h // n
+        # [b, n, s, 8] -> [b, n, s, d]
+        new_x = x[..., 0].unsqueeze(3)
+        new_x = new_x.repeat(1, 1, 1, d)
+        return self.trans_BNSD2SBH(new_x)
+
+    def forward_update(self, prev_attn_out, prev_softmax_max, prev_softmax_sum,
+                       cur_attn_out, cur_softmax_max, cur_softmax_sum):
+        org_dtype = prev_attn_out.dtype
+        softmax_max = torch.maximum(prev_softmax_max, cur_softmax_max)
+        prev_scale = torch.exp(prev_softmax_max - softmax_max)
+        cur_scale = torch.exp(cur_softmax_max - softmax_max)
+        # update softmax_sum
+        prev_softmax_sum_scaled = prev_softmax_sum * prev_scale
+        cur_softmax_sum_scaled = cur_softmax_sum * cur_scale
+        softmax_sum = prev_softmax_sum_scaled + cur_softmax_sum_scaled
+        # out updating scale
+        prev_out_scale = prev_softmax_sum_scaled / softmax_sum
+        cur_out_scale = cur_softmax_sum_scaled / softmax_sum
+        # [b, n, s, 8] -> [s, b, h]
+        prev_out_scale_sbh = self.broadcast_and_trans_BNSD2SBH(prev_out_scale, prev_attn_out.shape[-1])
+        cur_out_scale_sbh = self.broadcast_and_trans_BNSD2SBH(cur_out_scale, prev_attn_out.shape[-1])
+        # update output
+        attn_out = prev_attn_out * prev_out_scale_sbh + cur_attn_out * cur_out_scale_sbh
+        attn_out = attn_out.to(org_dtype)
+        return attn_out, softmax_max, softmax_sum
+
+    def data_validation(self, forward_update_triton, prev_softmax_max, cur_softmax_max, prev_softmax_sum,
+                        cur_softmax_sum, prev_attn_out, cur_attn_out):
+        (attn_out, softmax_max, softmax_sum) = self.forward_update(prev_attn_out, prev_softmax_max, prev_softmax_sum,
+                                                                   cur_attn_out, cur_softmax_max, cur_softmax_sum)
+
+        (tt_attn_out, tt_softmax_max, tt_softmax_sum) = forward_update_triton(prev_attn_out,
+                                                                              prev_softmax_max,
+                                                                              prev_softmax_sum,
+                                                                              cur_attn_out,
+                                                                              cur_softmax_max,
+                                                                              cur_softmax_sum)
+
+        self.assertEqual(softmax_max, tt_softmax_max)
+        self.assertEqual(softmax_sum, tt_softmax_sum)
+        self.assertEqual(attn_out, tt_attn_out)
+
+    def test_pointwise_cases(self):
+        enable_npu_indexing_ori = torch_npu._inductor.config.enable_npu_indexing
+        torch_npu._inductor.config.enable_npu_indexing = True
+        (S, B, H, N) = (4096, 1, 1536, 12)
+        DS = 2 * S
+        DTYPE_ATTN = torch.float32
+        DTYPE = torch.float32
+
+        prev_attn_out = torch.randn((DS, B, H), dtype=DTYPE_ATTN).npu()
+        prev_softmax_max = torch.rand((B, N, DS), dtype=DTYPE).npu().unsqueeze(3).repeat(1, 1, 1, F32_BLK_SIZE)
+        prev_softmax_sum = torch.rand((B, N, DS), dtype=DTYPE).npu().unsqueeze(3).repeat(1, 1, 1, F32_BLK_SIZE)
+        cur_attn_out = torch.randn((DS, B, H), dtype=DTYPE_ATTN).npu()
+        cur_softmax_max = torch.rand((B, N, DS), dtype=DTYPE).npu().unsqueeze(3).repeat(1, 1, 1, F32_BLK_SIZE)
+        cur_softmax_sum = torch.rand((B, N, DS), dtype=DTYPE).npu().unsqueeze(3).repeat(1, 1, 1, F32_BLK_SIZE)
+        forward_update_triton_2s = torch.compile(self.forward_update, backend="inductor", options={"aggressive_fusion": True})
+        self.data_validation(forward_update_triton_2s, prev_softmax_max, cur_softmax_max, prev_softmax_sum, cur_softmax_sum,
+                             prev_attn_out, cur_attn_out)
+
+        prev_attn_out_s = prev_attn_out.view(2, S, B, H)[1]
+        prev_softmax_max_s = prev_softmax_max.view(B, N, 2, S, F32_BLK_SIZE)[:, :, 1, :, :]
+        prev_softmax_sum_s = prev_softmax_sum.view(B, N, 2, S, F32_BLK_SIZE)[:, :, 1, :, :]
+        cur_attn_out_s = torch.randn((S, B, H), dtype=DTYPE_ATTN).npu()
+        cur_softmax_max_s = torch.rand((B, N, S), dtype=DTYPE).npu().unsqueeze(3).repeat(1, 1, 1, F32_BLK_SIZE)
+        cur_softmax_sum_s = torch.rand((B, N, S), dtype=DTYPE).npu().unsqueeze(3).repeat(1, 1, 1, F32_BLK_SIZE)
+        forward_update_triton = torch.compile(self.forward_update, backend="inductor")
+        self.data_validation(forward_update_triton, prev_softmax_max_s, cur_softmax_max_s, prev_softmax_sum_s,
+                             cur_softmax_sum_s, prev_attn_out_s, cur_attn_out_s)
+
+        torch_npu._inductor.config.enable_npu_indexing = enable_npu_indexing_ori
 
 if __name__ == "__main__":
     run_tests()
diff --git a/test/_inductor/test_broadcast.py b/test/_inductor/test_broadcast.py
index 93e78f0351..f9ec526f95 100644
--- a/test/_inductor/test_broadcast.py
+++ b/test/_inductor/test_broadcast.py
@@ -16,7 +16,6 @@ class TestBroadcast(TestUtils):
         y = a + b
         return y
 
-
     @parametrize('shape', [(8, 8, 256)])
     @parametrize('dtype', ['float32', 'int32', 'float16', 'bfloat16'])
     def test_view_cases(self, shape, dtype):
@@ -33,6 +32,19 @@ class TestBroadcast(TestUtils):
 
         self.assertEqual(std_broadcast.float(), inductor_broadcast.float(), atol=1e-3, rtol=1e-3)
 
+    def bar(self, a, b):
+        return a + b
+
+    def test_add_cases(self):
+        a = torch.randn((16, 1), device='npu', dtype=torch.float16)
+        b = torch.randn((1, 16), device='npu', dtype=torch.float16)
+
+        ret = self.bar(a, b)
+        compiled_bar = torch.compile(self.bar, backend="inductor")
+        inductor_ret = compiled_bar(a, b)
+        self.assertEqual(ret, inductor_ret, rtol=1e-3, atol=1e-3)
+
+
 instantiate_parametrized_tests(TestBroadcast)
 
 if __name__ == "__main__":
diff --git a/test/_inductor/test_broadcast_permute.py b/test/_inductor/test_broadcast_permute.py
new file mode 100644
index 0000000000..9dec6421b8
--- /dev/null
+++ b/test/_inductor/test_broadcast_permute.py
@@ -0,0 +1,37 @@
+import torch
+from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests
+from testutils import TestUtils
+import torch_npu
+
+torch_npu._inductor.config.enable_npu_indexing = True
+
+
+class TestBroadcastPermute(TestUtils):
+    base_shape = (8, 8, 256, 128)
+
+    def foo(self, a, b, c, dim, permute_shape):
+        y = a + b
+        y = y.sum(dim)
+        y = y.unsqueeze(dim)
+        y = y.broadcast_to(self.base_shape) + b
+        y = c + y.permute(permute_shape)
+        return y
+
+    a = torch.randn(base_shape, requires_grad=False, dtype=torch.float32, device="npu")
+    b = torch.randn(base_shape, requires_grad=False, dtype=torch.float32, device="npu")
+    d = torch.randn(base_shape, requires_grad=False, dtype=torch.float32, device="npu")
+
+    @parametrize('shape', [(0, 1, 3, 2), (2, 0, 1, 3), (1, 0, 2, 3), (3, 0, 1, 2), (0, 2, 1, 3)])
+    @parametrize('dim', [3, 2, 1, 0])
+    def test_broadcast_permute(self, shape, dim):
+        c = self.d.permute(shape).contiguous()
+        func = torch.compile(self.foo, backend="inductor")
+        r = func(self.a, self.b, c, dim, shape)
+        r1 = self.foo(self.a, self.b, c, dim, shape)
+        self.assertEqual(r, r1, rtol=1e-3, atol=1e-3)
+
+
+instantiate_parametrized_tests(TestBroadcastPermute)
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_deberta.py b/test/_inductor/test_deberta.py
new file mode 100644
index 0000000000..72e5b11413
--- /dev/null
+++ b/test/_inductor/test_deberta.py
@@ -0,0 +1,95 @@
+import torch
+from torch.testing._internal.common_utils import run_tests
+from torch._dynamo.testing import rand_strided
+from testutils import TestUtils
+import torch_npu
+
+
+class TestDeberta(TestUtils):
+    def fused_14_eager(self, mul_6, arg23_1):
+        eq_2: "b8[10, 255]" = torch.ops.aten.eq.Scalar(arg23_1, 1)
+        npu_dtype_cast_2: "f16[10, 255]" = torch.ops.npu.npu_dtype_cast.default(eq_2, torch.float16)
+        unsqueeze_19: "f16[10, 1, 255]" = torch.ops.aten.unsqueeze.default(npu_dtype_cast_2, 1)
+        unsqueeze_20: "f16[10, 1, 255, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_19, 3)
+        mul_7: "f16[10, 15, 255, 96]" = torch.ops.aten.mul.Tensor(mul_6, unsqueeze_20)
+        amax: "f16[10, 15, 96]" = torch.ops.aten.amax.default(mul_7, [2])
+        eq_3: "b8[10, 255]" = torch.ops.aten.eq.Scalar(arg23_1, 2)
+        npu_dtype_cast_3: "f16[10, 255]" = torch.ops.npu.npu_dtype_cast.default(eq_3, torch.float16)
+        unsqueeze_21: "f16[10, 1, 255]" = torch.ops.aten.unsqueeze.default(npu_dtype_cast_3, 1)
+        unsqueeze_22: "f16[10, 1, 255, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_21, 3)
+        mul_8: "f16[10, 15, 255, 96]" = torch.ops.aten.mul.Tensor(mul_6, unsqueeze_22)
+        amax_1: "f16[10, 15, 96]" = torch.ops.aten.amax.default(mul_8, [2])
+        eq_4: "b8[10, 255]" = torch.ops.aten.eq.Scalar(arg23_1, 3)
+        npu_dtype_cast_4: "f16[10, 255]" = torch.ops.npu.npu_dtype_cast.default(eq_4, torch.float16)
+        unsqueeze_23: "f16[10, 1, 255]" = torch.ops.aten.unsqueeze.default(npu_dtype_cast_4, 1)
+        unsqueeze_24: "f16[10, 1, 255, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_23, 3)
+        mul_9: "f16[10, 15, 255, 96]" = torch.ops.aten.mul.Tensor(mul_6, unsqueeze_24)
+        amax_2: "f16[10, 15, 96]" = torch.ops.aten.amax.default(mul_9, [2])
+        eq_5: "b8[10, 255]" = torch.ops.aten.eq.Scalar(arg23_1, 4)
+        npu_dtype_cast_5: "f16[10, 255]" = torch.ops.npu.npu_dtype_cast.default(eq_5, torch.float16)
+        unsqueeze_25: "f16[10, 1, 255]" = torch.ops.aten.unsqueeze.default(npu_dtype_cast_5, 1)
+        unsqueeze_26: "f16[10, 1, 255, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_25, 3)
+        mul_10: "f16[10, 15, 255, 96]" = torch.ops.aten.mul.Tensor(mul_6, unsqueeze_26)
+        amax_3: "f16[10, 15, 96]" = torch.ops.aten.amax.default(mul_10, [2])
+        amax_4: "f16[10, 15, 96]" = torch.ops.aten.amax.default(mul_6, [2])
+        return amax, amax_1, amax_2, amax_3, amax_4
+
+    def reduction_test(self, mul_6, arg23_1):
+        eq_2: "b8[10, 255]" = torch.ops.aten.eq.Scalar(arg23_1, 1)
+        npu_dtype_cast_2: "f16[10, 255]" = torch.ops.npu.npu_dtype_cast.default(eq_2, torch.float16)
+        unsqueeze_19: "f16[10, 1, 255]" = torch.ops.aten.unsqueeze.default(npu_dtype_cast_2, 1)
+        unsqueeze_20: "f16[10, 1, 255, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_19, 3)
+        mul_7: "f16[10, 15, 255, 96]" = torch.ops.aten.mul.Tensor(mul_6, unsqueeze_20)
+        return torch.amax(mul_7, [2])
+
+    arg23_1 = rand_strided((10, 255), (255, 1), device='npu:0', dtype=torch.float16)
+    buf50 = torch.empty_strided((10, 15, 255, 96), (367200, 1, 1440, 15), device='npu', dtype=torch.float16)
+
+    def test_fused_14_eager(self):
+        _, _, _, _, r = self.fused_14_eager(self.buf50, self.arg23_1)
+        fused_14_triton = torch.compile(self.fused_14_eager, backend="inductor")
+        _, _, _, _, r1 = fused_14_triton(self.buf50, self.arg23_1)
+        self.assertEqual(r, r1)
+
+    def test_reduction(self):
+        r = self.reduction_test(self.buf50, self.arg23_1)
+        reduction_triton = torch.compile(self.reduction_test, backend="inductor")
+        r1 = reduction_triton(self.buf50, self.arg23_1)
+        self.assertEqual(r, r1)
+
+    def triton_unk_fused_div_mul_sum_14_eager(self, cat, sum_7, cat_1, sum_8):
+        view_151: "f16[10, 256, 4, 128]" = torch.ops.aten.view.default(cat_1, [10, 256, 4, 128])
+        slice_19: "f16[10, 255, 4, 128]" = torch.ops.aten.slice.Tensor(view_151, 1, 1, 9223372036854775807)
+        unsqueeze_6: "f16[10, 1, 255, 4, 128]" = torch.ops.aten.unsqueeze.default(slice_19, 1)
+        view_150: "f16[10, 16, 4, 128]" = torch.ops.aten.view.default(cat, [10, 16, 4, 128])
+        slice_18: "f16[10, 15, 4, 128]" = torch.ops.aten.slice.Tensor(view_150, 1, 1, 9223372036854775807)
+        unsqueeze_5: "f16[10, 15, 1, 4, 128]" = torch.ops.aten.unsqueeze.default(slice_18, 2)
+        pow_2: "f32[10, 15, 1, 4, 1]" = torch.ops.aten.pow.Tensor_Scalar(sum_7, 0.5)
+        convert_element_type_63: "f16[10, 15, 1, 4, 1]" = torch.ops.prims.convert_element_type.default(pow_2, torch.float16)
+        clamp_min_2: "f16[10, 15, 1, 4, 1]" = torch.ops.aten.clamp_min.default(convert_element_type_63, 1e-06)
+        expand_13: "f16[10, 15, 1, 4, 128]" = torch.ops.aten.expand.default(clamp_min_2, [10, 15, 1, 4, 128])
+        div_6: "f16[10, 15, 1, 4, 128]" = torch.ops.aten.div.Tensor(unsqueeze_5, expand_13)
+        pow_4: "f32[10, 1, 255, 4, 1]" = torch.ops.aten.pow.Tensor_Scalar(sum_8, 0.5)
+        convert_element_type_65: "f16[10, 1, 255, 4, 1]" = torch.ops.prims.convert_element_type.default(pow_4, torch.float16)
+        clamp_min_3: "f16[10, 1, 255, 4, 1]" = torch.ops.aten.clamp_min.default(convert_element_type_65, 1e-06)
+        expand_14: "f16[10, 1, 255, 4, 128]" = torch.ops.aten.expand.default(clamp_min_3, [10, 1, 255, 4, 128])
+        div_7: "f16[10, 1, 255, 4, 128]" = torch.ops.aten.div.Tensor(unsqueeze_6, expand_14)
+        mul_45: "f16[10, 15, 255, 4, 128]" = torch.ops.aten.mul.Tensor(div_6, div_7)
+        sum_9: "f16[10, 15, 255, 4]" = torch.ops.aten.sum.dim_IntList(mul_45, [4])
+        mul_46: "f16[10, 15, 255, 4]" = torch.ops.aten.mul.Tensor(sum_9, 0.5)
+        return mul_46
+
+    def test_fused_div_mul_sum_14(self):
+        buf377 = rand_strided((10, 16, 512), (8192, 512, 1), device='npu', dtype=torch.float16)
+        buf379 = rand_strided((10, 15, 1, 4, 1), (60, 4, 600, 1, 600), device='npu', dtype=torch.float32)
+        buf380 = rand_strided((10, 256, 512), (131072, 512, 1), device='npu', dtype=torch.float16)
+        buf382 = rand_strided((10, 1, 255, 4, 1), (1020, 10208, 4, 1, 10208), device='npu', dtype=torch.float32)
+        r1 = self.triton_unk_fused_div_mul_sum_14_eager(buf377, buf379, buf380, buf382)
+        compiled_14 = torch.compile(self.triton_unk_fused_div_mul_sum_14_eager, backend="inductor")
+        r = compiled_14(buf377, buf379, buf380, buf382)
+
+        self.assertEqual(r1, r, rtol=1e-3, atol=1e-3)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_issue61.py b/test/_inductor/test_issue61.py
new file mode 100644
index 0000000000..3869ef4dc2
--- /dev/null
+++ b/test/_inductor/test_issue61.py
@@ -0,0 +1,30 @@
+import torch
+from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests
+from testutils import TestUtils
+import torch_npu
+
+
+class Test_issue61(TestUtils):
+    def fx_graph(self, view_12, embedding_1):
+        permute_7 = torch.ops.aten.permute.default(embedding_1, [2, 0, 1])
+        unsqueeze_2 = torch.ops.aten.unsqueeze.default(permute_7, 0)
+        slice_3 = torch.ops.aten.slice.Tensor(unsqueeze_2, 0, 0, 9223372036854775807)
+        slice_4 = torch.ops.aten.slice.Tensor(slice_3, 1, 0, 9223372036854775807)
+        slice_5 = torch.ops.aten.slice.Tensor(slice_4, 2, -128, 9223372036854775807)
+        slice_6 = torch.ops.aten.slice.Tensor(slice_5, 3, 0, 9223372036854775807)
+        add_5 = torch.ops.aten.add.Tensor(view_12, slice_6)
+        view_13 = torch.ops.aten.view.default(add_5, [384, 128, 128])
+        view_14 = torch.ops.aten.view.default(view_13, [64, 6, 128, 128])
+        return view_14
+
+    def test_issue61(self):
+        buf85 = torch.empty_strided((64, 6, 128, 128), (98304, 16384, 128, 1), device='npu', dtype=torch.float32)
+        buf84 = torch.empty_strided((128, 128, 6), (768, 6, 1), device='npu', dtype=torch.float32)
+        model = torch.compile(self.fx_graph, backend="inductor")
+        data_t = model(buf85, buf84)
+        data = self.fx_graph(buf85, buf84)
+        self.assertEqual(data, data_t, atol=1e-3, rtol=1e-3)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_mamba.py b/test/_inductor/test_mamba.py
new file mode 100644
index 0000000000..e74833ea13
--- /dev/null
+++ b/test/_inductor/test_mamba.py
@@ -0,0 +1,32 @@
+import torch
+from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests
+from testutils import TestUtils
+import torch_npu
+
+
+class TestMamba(TestUtils):
+    def foo(self, a, b):
+        permute = a.permute(1, 0, 2).contiguous()
+        _, _, bt = torch.split(permute, [2048, 4096, 32], 2)
+        clone = bt.contiguous()
+        clone_1 = clone.contiguous()
+
+        add = clone_1 + b
+        y = torch.exp(add)
+        log1p = torch.log1p(y)
+        where = torch.where(add < 20, add, log1p)
+        return add, where, bt
+
+    def test_mamba(self):
+        a = torch.randn(4096, 3, 6176, requires_grad=False, dtype=torch.float32, device="npu")
+        b = torch.randn(32, requires_grad=False, dtype=torch.float32, device="npu")
+
+        compile_foo = torch.compile(self.foo, backend="inductor")
+        r, s, _ = self.foo(a, b)
+        r1, s1, _ = compile_foo(a, b)
+        torch.testing.assert_close(r, r1)
+        torch.testing.assert_close(s, s1)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_permute.py b/test/_inductor/test_permute.py
index d59e2f9ee4..1169bf486a 100644
--- a/test/_inductor/test_permute.py
+++ b/test/_inductor/test_permute.py
@@ -33,6 +33,38 @@ class TestPermute(TestUtils):
 
         self.assertEqual(std_permute, inductor_permute, atol=1e-3, rtol=1e-3)
 
+
+    def foo(self, a, b, c, shape):
+        y = a + b
+        y = c + y.permute(shape)
+        return y.clone()
+
+    def deberta_permute(self, permute_2):
+        permute_7: "f16[10, 4, 128, 255, 1, 1, 1, 1]" = torch.ops.aten.permute.default(permute_2, [0, 4, 6, 2, 7, 1, 3, 5])
+        clone_3: "f16[10, 4, 128, 255, 1, 1, 1, 1]" = torch.ops.aten.clone.default(permute_7, memory_format=torch.contiguous_format)
+        view_4: "f16[40, 128, 255]" = torch.ops.aten.view.default(clone_3, [40, 128, 255])
+        return view_4
+
+    def test_permute_deberta(self):
+        permute_2 = torch.randn((10, 1, 255, 1, 4, 1, 128, 1), device='npu', dtype=torch.float16)
+        eager_ret = self.deberta_permute(permute_2)
+        compiled_func = torch.compile(self.deberta_permute, backend="inductor")
+        inductor_ret = compiled_func(permute_2)
+        self.assertEqual(eager_ret, inductor_ret, rtol=1e-3, atol=1e-3)
+
+    base_shape = (8, 8, 512, 128)
+    a = torch.randn(base_shape, requires_grad=False, dtype=torch.float32, device="npu")
+    b = torch.randn(base_shape, requires_grad=False, dtype=torch.float32, device="npu")
+    d = torch.randn(base_shape, requires_grad=False, dtype=torch.float32, device="npu")
+
+    @parametrize('shape', [(2, 0, 1, 3), (1, 0, 2, 3), (0, 1, 3, 2), (3, 0, 1, 2), (0, 2, 1, 3)])
+    def test_pointwise_cases(self, shape):
+        c = self.d.permute(shape).contiguous()
+        func = torch.compile(self.foo, backend="inductor")
+        r = func(self.a, self.b, c, shape)
+        r1 = self.foo(self.a, self.b, c, shape)
+        self.assertEqual(r, r1, rtol=1e-3, atol=1e-3)
+
 instantiate_parametrized_tests(TestPermute)
 
 if __name__ == "__main__":
diff --git a/test/_inductor/test_permute_reshape.py b/test/_inductor/test_permute_reshape.py
new file mode 100644
index 0000000000..4edb66af59
--- /dev/null
+++ b/test/_inductor/test_permute_reshape.py
@@ -0,0 +1,28 @@
+import torch
+from torch.testing._internal.common_utils import run_tests
+from einops import rearrange
+from testutils import TestUtils
+import torch_npu
+
+
+class TestAddRearrange(TestUtils):
+    def foo(self, a, b, c):
+        y = a + b
+        y = c + rearrange(y, 'b n s d -> s b (n d)').contiguous()
+        return y
+
+    def test_pointwise_cases(self):
+        torch_npu._inductor.config.enable_npu_indexing = True
+
+        a, b = [torch.randn(1, 12, 4096, 8, requires_grad=False, dtype=torch.float32, device="npu") for _ in range(2)]
+        c = torch.randn(4096, 1, 96, requires_grad=False, dtype=torch.float32, device="npu")
+
+        func = torch.compile(self.foo, backend="inductor")
+
+        r = func(a, b, c)
+        r1 = self.foo(a, b, c)
+        torch.testing.assert_close(r, r1, rtol=1e-3, atol=1e-3)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_reduction.py b/test/_inductor/test_reduction.py
new file mode 100644
index 0000000000..3b0efba353
--- /dev/null
+++ b/test/_inductor/test_reduction.py
@@ -0,0 +1,34 @@
+import torch
+from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests
+from testutils import TestUtils
+import torch_npu
+
+torch_npu._inductor.config.enable_npu_indexing = True
+
+
+class TestReduction(TestUtils):
+    def reduction(self, a, b, dim, reduction_type="sum"):
+        y = a + b
+        if reduction_type == "sum":
+            return torch.sum(y, dim)
+        if reduction_type == "mean":
+            return torch.mean(y, dim)
+        return y
+
+    @parametrize('shape', [(1, 1, 1, 1024), (2053, 1023, 7, 9), (8, 8, 1024, 2048), (8, 8, 2048, 1024),
+                           (8, 1024, 2048, 8), (2048, 8, 1024, 8), (2048, 1024, 8, 8)])
+    @parametrize('reduction_type', ['sum', 'mean'])
+    @parametrize('dim', range(4))
+    def test_reduction(self, shape, reduction_type, dim):
+        a, b = [self._generate_tensor(shape, 'float32') for _ in range(2)]
+
+        reduction_func = torch.compile(self.reduction, backend="inductor", dynamic=False)
+        r = self.reduction(a, b, dim, reduction_type)
+        r1 = reduction_func(a, b, dim, reduction_type)
+        self.assertEqual(r, r1, rtol=1e-3, atol=1e-3)
+
+
+instantiate_parametrized_tests(TestReduction)
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_repeat.py b/test/_inductor/test_repeat.py
index 9df53202ac..3eab9dda9e 100644
--- a/test/_inductor/test_repeat.py
+++ b/test/_inductor/test_repeat.py
@@ -8,7 +8,6 @@ class TestRepeat(TestUtils):
     def op_calc(self, input_element, dim):
         return input_element.repeat(dim)
 
-    # case:change shapes
     @parametrize('shape', [(16, 128, 64)])
     @parametrize('dim', [(1, 1, 2), (1, 2, 1), (2, 1, 1)])
     @parametrize('dtype', ['float32'])
@@ -22,6 +21,16 @@ class TestRepeat(TestUtils):
 
         self.assertEqual(std_ret, inductor_ret, atol=1e-1, rtol=1e-1)
 
+    @parametrize('shape', [(8, 1024, 64)])
+    @parametrize('dim', [(1, 2, 1), (2, 1, 1), (1, 2, 2)])
+    def test_repeat(self, shape, dim):
+        a = torch.randn(shape, requires_grad=False, dtype=torch.float32, device="npu")
+
+        repeat_triton = torch.compile(self.op_calc, backend="inductor")
+        r = self.op_calc(a, dim=dim)
+        r1 = repeat_triton(a, dim=dim)
+        self.assertEqual(r, r1, rtol=1e-3, atol=1e-3)
+
 
 instantiate_parametrized_tests(TestRepeat)
 
diff --git a/test/_inductor/test_reshape_permute.py b/test/_inductor/test_reshape_permute.py
new file mode 100644
index 0000000000..c7ed7652cf
--- /dev/null
+++ b/test/_inductor/test_reshape_permute.py
@@ -0,0 +1,29 @@
+import torch
+from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests
+from testutils import TestUtils
+import torch_npu
+
+
+class TestReshapePermute(TestUtils):
+    def foo(self, a, d, shape):
+        y = a.reshape(shape)
+        y = y.permute(0, 2, 1) + d
+        return y
+
+    @parametrize('shape', [(8, 2048, 4), (8, 2048, 3), (8, 526, 3), (50, 526, 3), (50, 526, 129)])
+    @parametrize('dtype', ['float32'])
+    def test_reshape_and_permute(self, shape, dtype):
+        a = self._generate_tensor([shape[0], shape[1] * shape[2]], dtype)
+        d = self._generate_tensor([shape[0], shape[2], shape[1]], dtype)
+
+        func = torch.compile(self.foo, backend="inductor", dynamic=False)
+
+        r = func(a, d, shape)
+        r1 = self.foo(a, d, shape)
+        self.assertEqual(r, r1, rtol=1e-3, atol=1e-3)
+
+
+instantiate_parametrized_tests(TestReshapePermute)
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_softmax.py b/test/_inductor/test_softmax.py
new file mode 100644
index 0000000000..8f7559e79b
--- /dev/null
+++ b/test/_inductor/test_softmax.py
@@ -0,0 +1,53 @@
+import torch
+from torch.testing._internal.common_utils import run_tests
+from testutils import TestUtils
+import torch_npu
+
+DATA_TYPE = torch.float16
+
+
+class TestSoftmax(TestUtils):
+    def torch_cal(self, view_7, gather, gather_1, arg107_1, full_default):
+        slice_3 = torch.ops.aten.slice.Tensor(arg107_1, 1, 0, 256)
+        unsqueeze_1 = torch.ops.aten.unsqueeze.default(slice_3, 1)
+        unsqueeze_2 = torch.ops.aten.unsqueeze.default(unsqueeze_1, 2)
+        expand = torch.ops.aten.expand.default(unsqueeze_2, [11, 1, 256, 256])
+        npu_dtype_cast_1 = torch.ops.npu.npu_dtype_cast.default(expand, dtype=DATA_TYPE)
+
+        sub_1 = torch.ops.aten.sub.Tensor(1.0, npu_dtype_cast_1)
+
+        npu_dtype_cast_2 = torch.ops.npu.npu_dtype_cast.default(sub_1, torch.bool)
+        where = torch.ops.aten.where.self(npu_dtype_cast_2, full_default, sub_1)
+
+        permute_10 = torch.ops.aten.permute.default(gather_1, [0, 1, 3, 2])
+        add_4 = torch.ops.aten.add.Tensor(gather, permute_10)
+        mul_4 = torch.ops.aten.mul.Tensor(add_4, 0.07216878364870322)
+        add_5 = torch.ops.aten.add.Tensor(view_7, mul_4)
+        add_6 = torch.ops.aten.add.Tensor(add_5, where)
+
+        convert_element_type_20 = torch.ops.prims.convert_element_type.default(add_6, torch.float32)
+        amax = torch.ops.aten.amax.default(convert_element_type_20, [-1], True)
+        sub_4 = torch.ops.aten.sub.Tensor(convert_element_type_20, amax)
+        exp = torch.ops.aten.exp.default(sub_4)
+        sum_1 = torch.ops.aten.sum.dim_IntList(exp, [-1], True)
+        div = torch.ops.aten.div.Tensor(exp, sum_1)
+        convert_element_type_21 = torch.ops.prims.convert_element_type.default(div, dtype=DATA_TYPE)
+        return convert_element_type_21
+
+    def test_softmax(self):
+        buf34 = torch.rand([132, 256, 256], dtype=DATA_TYPE).npu()
+        buf47 = torch.rand([11, 12, 256, 256], dtype=DATA_TYPE).npu()
+        buf59 = torch.rand([11, 12, 256, 256], dtype=DATA_TYPE).npu()
+        arg107_1 = torch.randint(1, 1000, (11, 256), dtype=torch.int64).npu()
+        buf61 = torch.full([], -65504.0, dtype=DATA_TYPE).npu()
+        view_7 = torch.ops.aten.view.default(buf34, [11, 12, 256, 256]).npu()
+
+        res_ref = self.torch_cal(view_7, buf47, buf59, arg107_1, buf61)
+
+        compiled = torch.compile(self.torch_cal, backend="inductor")
+        res1 = compiled(view_7, buf47, buf59, arg107_1, buf61)
+        self.assertEqual(res1, res_ref, rtol=1e-2, atol=1e-2)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_store_permute.py b/test/_inductor/test_store_permute.py
new file mode 100644
index 0000000000..befb0f49bf
--- /dev/null
+++ b/test/_inductor/test_store_permute.py
@@ -0,0 +1,29 @@
+import torch
+from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests
+from torch._dynamo.testing import rand_strided
+from testutils import TestUtils
+import torch_npu
+
+
+class TestStorePermute(TestUtils):
+    def store_permute(self, bmm_25, arg113_1):
+        view_157: "f16[10, 4, 15, 4, 4, 1, 255, 1]" = torch.ops.aten.view.default(bmm_25, [10, 4, 15, 4, 4, 1, 255, 1])
+        permute_95: "f16[10, 15, 255, 4, 4, 4, 1, 1]" = torch.ops.aten.permute.default(view_157, [0, 2, 6, 3, 1, 4, 5, 7])
+        view_158: "f16[10, 15, 255, 4, 4, 4]" = torch.ops.aten.view.default(permute_95, [10, 15, 255, 4, 4, 4])
+        add_57: "f16[10, 15, 255, 4, 4, 4]" = torch.ops.aten.add.Tensor(view_158, arg113_1)
+        tanh_1: "f16[10, 15, 255, 4, 4, 4]" = torch.ops.aten.tanh.default(add_57)
+        clone_40: "f16[10, 15, 255, 4, 4, 4]" = torch.ops.aten.clone.default(tanh_1, memory_format=torch.contiguous_format)
+        return clone_40
+
+    def test_store_permute(self):
+        buf391 = rand_strided((40, 240, 255), (61200, 255, 1), device='npu:0', dtype=torch.float16)
+        arg113_1 = rand_strided((1, 1, 1, 4, 4, 4), (64, 64, 64, 16, 4, 1), device='npu:0', dtype=torch.float16)
+        compiled_func = torch.compile(self.store_permute, backend='inductor')
+        r = self.store_permute(buf391, arg113_1)
+        r1 = compiled_func(buf391, arg113_1)
+
+        self.assertEqual(r, r1, rtol=0.01, atol=0.01)
+
+
+if __name__ == "__main__":
+    run_tests()
-- 
Gitee