From 02d74da9ae920ad3a07e5f064c8acd9399cd19f9 Mon Sep 17 00:00:00 2001
From: DaiFu
Date: Thu, 3 Jul 2025 20:26:35 +0800
Subject: [PATCH] add ut

---
 test/_inductor/test_add.py               | 10 ++
 test/_inductor/test_add_layernorm.py     | 29 ++++++
 test/_inductor/test_argmax_unalign.py    | 15 ++-
 test/_inductor/test_attncp.py            | 89 +++++++++++++++++++
 test/_inductor/test_broadcast.py         | 14 ++-
 test/_inductor/test_broadcast_permute.py | 37 ++++++++
 test/_inductor/test_deberta.py           | 95 ++++++++++++++++++++
 test/_inductor/test_issue61.py           | 30 ++++++
 test/_inductor/test_mamba.py             | 32 ++++++
 test/_inductor/test_permute.py           | 32 ++++++
 test/_inductor/test_permute_reshape.py   | 28 ++++++
 test/_inductor/test_reduction.py         | 34 ++++++
 test/_inductor/test_repeat.py            | 11 ++-
 test/_inductor/test_reshape_permute.py   | 29 ++++++
 test/_inductor/test_softmax.py           | 53 +++++++++++
 test/_inductor/test_store_permute.py     | 29 ++++++
 16 files changed, 559 insertions(+), 8 deletions(-)
 create mode 100644 test/_inductor/test_add_layernorm.py
 create mode 100644 test/_inductor/test_broadcast_permute.py
 create mode 100644 test/_inductor/test_deberta.py
 create mode 100644 test/_inductor/test_issue61.py
 create mode 100644 test/_inductor/test_mamba.py
 create mode 100644 test/_inductor/test_permute_reshape.py
 create mode 100644 test/_inductor/test_reduction.py
 create mode 100644 test/_inductor/test_reshape_permute.py
 create mode 100644 test/_inductor/test_softmax.py
 create mode 100644 test/_inductor/test_store_permute.py

diff --git a/test/_inductor/test_add.py b/test/_inductor/test_add.py
index f34078e105..182787656d 100644
--- a/test/_inductor/test_add.py
+++ b/test/_inductor/test_add.py
@@ -22,6 +22,16 @@ class TestAdd(TestUtils):
 
         self.assertEqual(std_sum, inductor_sum)
 
+    @parametrize('shape', [(8, 8, 1024, 2048), (8, 8, 2048, 1024), (8, 1024, 2048, 8), (2048, 1024, 8, 8),
+                           (2048, 8, 1024, 8), (8, 2048, 8, 1024)])
+    def test_pointwise(self, shape):
+        a = self._generate_tensor(shape, 'float32')
+        b = self._generate_tensor(shape, 'float32')
+        r = self.op_calc(a, b)
+        func = torch.compile(self.op_calc, backend="inductor")
+        r1 = func(a, b)
+        self.assertEqual(r, r1, rtol=1e-3, atol=1e-3)
+
 
 instantiate_parametrized_tests(TestAdd)
 
diff --git a/test/_inductor/test_add_layernorm.py b/test/_inductor/test_add_layernorm.py
new file mode 100644
index 0000000000..e1ae481b4f
--- /dev/null
+++ b/test/_inductor/test_add_layernorm.py
@@ -0,0 +1,29 @@
+import torch
+from torch.testing._internal.common_utils import run_tests
+from testutils import TestUtils
+import torch_npu
+
+
+class TestAddLayerNorm(TestUtils):
+    def add_LayerNorm(self, a, b):
+        x = a + b
+        mean = torch.mean(x, dim=2, keepdim=True)
+        var = torch.mean((x - mean) ** 2, dim=2, keepdim=True) + 1e-5
+        y = (x - mean) / torch.sqrt(var)
+        return y
+
+    def test_add_layernorm(self):
+        Z = 64
+        X = 512
+        Y = 256
+        hidden_states = torch.randn((Z, X, Y), dtype=torch.float32).npu()
+        add_layer = torch.randn((Z, X, Y), dtype=torch.float32).npu()
+
+        eager_output = self.add_LayerNorm(hidden_states, add_layer)
+        comp_func = torch.compile(self.add_LayerNorm, backend="inductor", dynamic=False)
+        output = comp_func(hidden_states, add_layer)
+        self.assertEqual(eager_output, output, rtol=1e-4, atol=1e-4)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_argmax_unalign.py b/test/_inductor/test_argmax_unalign.py
index 34baef1ba1..c3bce7d40d 100644
--- a/test/_inductor/test_argmax_unalign.py
+++ b/test/_inductor/test_argmax_unalign.py
@@ -8,16 +8,27 @@
 class TestMaxWithIndex(TestUtils):
     def op_calc(self, input_element, dim):
         return torch.argmax(input_element, dim)
-    @parametrize('shape', [(512, 64)])  # (513, 64), (514,33)
+    @parametrize('shape', [(512, 64)])
     @parametrize('dim', [-1])
     @parametrize('dtype', ['float32'])
     def test_reduction_cases(self, shape, dim, dtype):
-        input_element = torch.randn(size=shape, dtype=eval('torch.' + dtype), device=torch.device("npu")) * 2000
+        input_element = self._generate_tensor(shape, dtype) * 2000
         std_argmax = self.op_calc(input_element, dim)
         compiled_op_calc = torch.compile(self.op_calc, backend="inductor", dynamic=False)
         inductor_argmax = compiled_op_calc(input_element, dim)
         self.assertEqual(std_argmax, inductor_argmax, atol=1e-2, rtol=1e-2)
 
+    @parametrize('shape', [(513, 64)])
+    @parametrize('dim', [-1])
+    @parametrize('dtype', ['float32', 'int64'])
+    def test_reduction_cases1(self, shape, dim, dtype):
+        input_element = self._generate_tensor(shape, dtype) * 2000
+
+        std_argmax = self.op_calc(input_element, dim)
+        compiled_op_calc = torch.compile(self.op_calc, backend="inductor")
+        inductor_argmax = compiled_op_calc(input_element, dim)
+        self.assertEqual(std_argmax, inductor_argmax, rtol=1e-2, atol=1e-2)
+
 instantiate_parametrized_tests(TestMaxWithIndex)
 
 if __name__ == "__main__":
diff --git a/test/_inductor/test_attncp.py b/test/_inductor/test_attncp.py
index 966ecc855f..1828ce872c 100644
--- a/test/_inductor/test_attncp.py
+++ b/test/_inductor/test_attncp.py
@@ -1,5 +1,8 @@
 import torch
-from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests
+from torch.testing._internal.common_utils import run_tests
+from einops import rearrange
 from testutils import TestUtils
 import torch_npu
 
+F32_BLK_SIZE = 8  # float32 elements per 32-byte block; matches the [b, n, s, 8] softmax max/sum layout
+
@@ -16,8 +19,7 @@ class TestAttnCp(TestUtils):
         y = c + y.permute(0, 1, 3, 2)
         return y
 
-
-    def test_pointwise_cases(self):
+    def test_basic_pointwise_cases(self):
         a, b = [torch.randn(self.shape, dtype=torch.float32, device="npu") for _ in range(2)]
         d = torch.randn(self.shape, dtype=torch.float32, device="npu")
         c = d.permute(0, 1, 3, 2).contiguous()
@@ -26,7 +28,86 @@ class TestAttnCp(TestUtils):
         r1 = self.foo(a, b, c)
         self.assertEqual(r, r1, atol=1e-3, rtol=1e-3)
 
-instantiate_parametrized_tests(TestAttnCp)
+
+    def trans_BNSD2SBH(self, x):
+        """Trans data layout from BNSD to SBH"""
+        return rearrange(x, 'b n s d -> s b (n d)').contiguous()
+
+    def broadcast_and_trans_BNSD2SBH(self, x, h):
+        """Broadcast and trans a tensor from [b, n, s, 8] to [s, b, h]"""
+        n = x.shape[1]
+        d = h // n
+        # [b, n, s, 8] -> [b, n, s, d]
+        new_x = x[..., 0].unsqueeze(3)
+        new_x = new_x.repeat(1, 1, 1, d)
+        return self.trans_BNSD2SBH(new_x)
+
+    def forward_update(self, prev_attn_out, prev_softmax_max, prev_softmax_sum,
+                       cur_attn_out, cur_softmax_max, cur_softmax_sum):
+        org_dtype = prev_attn_out.dtype
+        softmax_max = torch.maximum(prev_softmax_max, cur_softmax_max)
+        prev_scale = torch.exp(prev_softmax_max - softmax_max)
+        cur_scale = torch.exp(cur_softmax_max - softmax_max)
+        # update softmax_sum
+        prev_softmax_sum_scaled = prev_softmax_sum * prev_scale
+        cur_softmax_sum_scaled = cur_softmax_sum * cur_scale
+        softmax_sum = prev_softmax_sum_scaled + cur_softmax_sum_scaled
+        # out updating scale
+        prev_out_scale = prev_softmax_sum_scaled / softmax_sum
+        cur_out_scale = cur_softmax_sum_scaled / softmax_sum
+        # [b, n, s, 8] -> [s, b, h]
+        prev_out_scale_sbh = self.broadcast_and_trans_BNSD2SBH(prev_out_scale, prev_attn_out.shape[-1])
+        cur_out_scale_sbh = self.broadcast_and_trans_BNSD2SBH(cur_out_scale, prev_attn_out.shape[-1])
+        # update output
+        attn_out = prev_attn_out * prev_out_scale_sbh + cur_attn_out * cur_out_scale_sbh
+        attn_out = attn_out.to(org_dtype)
+        return attn_out, softmax_max, softmax_sum
+
+    def data_validation(self, forward_update_triton, prev_softmax_max, cur_softmax_max, prev_softmax_sum,
+                        cur_softmax_sum, prev_attn_out, cur_attn_out):
+        (attn_out, softmax_max, softmax_sum) = self.forward_update(prev_attn_out, prev_softmax_max, prev_softmax_sum,
+                                                                   cur_attn_out, cur_softmax_max, cur_softmax_sum)
+
+        (tt_attn_out, tt_softmax_max, tt_softmax_sum) = forward_update_triton(prev_attn_out,
+                                                                              prev_softmax_max,
+                                                                              prev_softmax_sum,
+                                                                              cur_attn_out,
+                                                                              cur_softmax_max,
+                                                                              cur_softmax_sum)
+
+        self.assertEqual(softmax_max, tt_softmax_max)
+        self.assertEqual(softmax_sum, tt_softmax_sum)
+        self.assertEqual(attn_out, tt_attn_out)
+
+    def test_pointwise_cases(self):
+        enable_npu_indexing_ori = torch_npu._inductor.config.enable_npu_indexing
+        torch_npu._inductor.config.enable_npu_indexing = True
+        (S, B, H, N) = (4096, 1, 1536, 12)
+        DS = 2 * S
+        DTYPE_ATTN = torch.float32
+        DTYPE = torch.float32
+
+        prev_attn_out = torch.randn((DS, B, H), dtype=DTYPE_ATTN).npu()
+        prev_softmax_max = torch.rand((B, N, DS), dtype=DTYPE).npu().unsqueeze(3).repeat(1, 1, 1, F32_BLK_SIZE)
+        prev_softmax_sum = torch.rand((B, N, DS), dtype=DTYPE).npu().unsqueeze(3).repeat(1, 1, 1, F32_BLK_SIZE)
+        cur_attn_out = torch.randn((DS, B, H), dtype=DTYPE_ATTN).npu()
+        cur_softmax_max = torch.rand((B, N, DS), dtype=DTYPE).npu().unsqueeze(3).repeat(1, 1, 1, F32_BLK_SIZE)
+        cur_softmax_sum = torch.rand((B, N, DS), dtype=DTYPE).npu().unsqueeze(3).repeat(1, 1, 1, F32_BLK_SIZE)
+        forward_update_triton_2s = torch.compile(self.forward_update, backend="inductor", options={"aggressive_fusion": True})
+        self.data_validation(forward_update_triton_2s, prev_softmax_max, cur_softmax_max, prev_softmax_sum, cur_softmax_sum,
+                             prev_attn_out, cur_attn_out)
+
+        prev_attn_out_s = prev_attn_out.view(2, S, B, H)[1]
+        prev_softmax_max_s = prev_softmax_max.view(B, N, 2, S, F32_BLK_SIZE)[:, :, 1, :, :]
+        prev_softmax_sum_s = prev_softmax_sum.view(B, N, 2, S, F32_BLK_SIZE)[:, :, 1, :, :]
+        cur_attn_out_s = torch.randn((S, B, H), dtype=DTYPE_ATTN).npu()
+        cur_softmax_max_s = torch.rand((B, N, S), dtype=DTYPE).npu().unsqueeze(3).repeat(1, 1, 1, F32_BLK_SIZE)
+        cur_softmax_sum_s = torch.rand((B, N, S), dtype=DTYPE).npu().unsqueeze(3).repeat(1, 1, 1, F32_BLK_SIZE)
+        forward_update_triton = torch.compile(self.forward_update, backend="inductor")
+        self.data_validation(forward_update_triton, prev_softmax_max_s, cur_softmax_max_s, prev_softmax_sum_s,
+                             cur_softmax_sum_s, prev_attn_out_s, cur_attn_out_s)
+
+        torch_npu._inductor.config.enable_npu_indexing = enable_npu_indexing_ori
 
 if __name__ == "__main__":
     run_tests()
diff --git a/test/_inductor/test_broadcast.py b/test/_inductor/test_broadcast.py
index 93e78f0351..f9ec526f95 100644
--- a/test/_inductor/test_broadcast.py
+++ b/test/_inductor/test_broadcast.py
@@ -16,7 +16,6 @@ class TestBroadcast(TestUtils):
         y = a + b
         return y
 
-
     @parametrize('shape', [(8, 8, 256)])
     @parametrize('dtype', ['float32', 'int32', 'float16', 'bfloat16'])
     def test_view_cases(self, shape, dtype):
@@ -33,6 +32,19 @@ class TestBroadcast(TestUtils):
 
         self.assertEqual(std_broadcast.float(), inductor_broadcast.float(), atol=1e-3, rtol=1e-3)
 
+    def bar(self, a, b):
+        return a + b
+
+    def test_add_cases(self):
+        a = torch.randn((16, 1), device='npu', dtype=torch.float16)
+        b = torch.randn((1, 16), device='npu', dtype=torch.float16)
+
+        ret = self.bar(a, b)
+        compiled_bar = torch.compile(self.bar, backend="inductor")
+        inductor_ret = compiled_bar(a, b)
+        self.assertEqual(ret, inductor_ret, rtol=1e-3, atol=1e-3)
+
+
 instantiate_parametrized_tests(TestBroadcast)
 
 if __name__ == "__main__":
diff --git a/test/_inductor/test_broadcast_permute.py b/test/_inductor/test_broadcast_permute.py
new file mode 100644
index 0000000000..9dec6421b8
--- /dev/null
+++ b/test/_inductor/test_broadcast_permute.py
@@ -0,0 +1,37 @@
+import torch
+from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests
+from testutils import TestUtils
+import torch_npu
+
+torch_npu._inductor.config.enable_npu_indexing = True
+
+
+class TestBroadcastPermute(TestUtils):
+    base_shape = (8, 8, 256, 128)
+
+    def foo(self, a, b, c, dim, permute_shape):
+        y = a + b
+        y = y.sum(dim)
+        y = y.unsqueeze(dim)
+        y = y.broadcast_to(self.base_shape) + b
+        y = c + y.permute(permute_shape)
+        return y
+
+    a = torch.randn(base_shape, requires_grad=False, dtype=torch.float32, device="npu")
+    b = torch.randn(base_shape, requires_grad=False, dtype=torch.float32, device="npu")
+    d = torch.randn(base_shape, requires_grad=False, dtype=torch.float32, device="npu")
+
+    @parametrize('shape', [(0, 1, 3, 2), (2, 0, 1, 3), (1, 0, 2, 3), (3, 0, 1, 2), (0, 2, 1, 3)])
+    @parametrize('dim', [3, 2, 1, 0])
+    def test_broadcast_permute(self, shape, dim):
+        c = self.d.permute(shape).contiguous()
+        func = torch.compile(self.foo, backend="inductor")
+        r = func(self.a, self.b, c, dim, shape)
+        r1 = self.foo(self.a, self.b, c, dim, shape)
+        self.assertEqual(r, r1, rtol=1e-3, atol=1e-3)
+
+
+instantiate_parametrized_tests(TestBroadcastPermute)
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_deberta.py b/test/_inductor/test_deberta.py
new file mode 100644
index 0000000000..72e5b11413
--- /dev/null
+++ b/test/_inductor/test_deberta.py
@@ -0,0 +1,95 @@
+import torch
+from torch.testing._internal.common_utils import run_tests
+from torch._dynamo.testing import rand_strided
+from testutils import TestUtils
+import torch_npu
+
+
+class TestDeberta(TestUtils):
+    def fused_14_eager(self, mul_6, arg23_1):
+        eq_2: "b8[10, 255]" = torch.ops.aten.eq.Scalar(arg23_1, 1)
+        npu_dtype_cast_2: "f16[10, 255]" = torch.ops.npu.npu_dtype_cast.default(eq_2, torch.float16)
+        unsqueeze_19: "f16[10, 1, 255]" = torch.ops.aten.unsqueeze.default(npu_dtype_cast_2, 1)
+        unsqueeze_20: "f16[10, 1, 255, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_19, 3)
+        mul_7: "f16[10, 15, 255, 96]" = torch.ops.aten.mul.Tensor(mul_6, unsqueeze_20)
+        amax: "f16[10, 15, 96]" = torch.ops.aten.amax.default(mul_7, [2])
+        eq_3: "b8[10, 255]" = torch.ops.aten.eq.Scalar(arg23_1, 2)
+        npu_dtype_cast_3: "f16[10, 255]" = torch.ops.npu.npu_dtype_cast.default(eq_3, torch.float16)
+        unsqueeze_21: "f16[10, 1, 255]" = torch.ops.aten.unsqueeze.default(npu_dtype_cast_3, 1)
+        unsqueeze_22: "f16[10, 1, 255, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_21, 3)
+        mul_8: "f16[10, 15, 255, 96]" = torch.ops.aten.mul.Tensor(mul_6, unsqueeze_22)
+        amax_1: "f16[10, 15, 96]" = torch.ops.aten.amax.default(mul_8, [2])
+        eq_4: "b8[10, 255]" = torch.ops.aten.eq.Scalar(arg23_1, 3)
+        npu_dtype_cast_4: "f16[10, 255]" = torch.ops.npu.npu_dtype_cast.default(eq_4, torch.float16)
+        unsqueeze_23: "f16[10, 1, 255]" = torch.ops.aten.unsqueeze.default(npu_dtype_cast_4, 1)
+        unsqueeze_24: "f16[10, 1, 255, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_23, 3)
+        mul_9: "f16[10, 15, 255, 96]" = torch.ops.aten.mul.Tensor(mul_6, unsqueeze_24)
+        amax_2: "f16[10, 15, 96]" = torch.ops.aten.amax.default(mul_9, [2])
+        eq_5: "b8[10, 255]" = torch.ops.aten.eq.Scalar(arg23_1, 4)
+        npu_dtype_cast_5: "f16[10, 255]" = torch.ops.npu.npu_dtype_cast.default(eq_5, torch.float16)
+        unsqueeze_25: "f16[10, 1, 255]" = torch.ops.aten.unsqueeze.default(npu_dtype_cast_5, 1)
+        unsqueeze_26: "f16[10, 1, 255, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_25, 3)
+        mul_10: "f16[10, 15, 255, 96]" = torch.ops.aten.mul.Tensor(mul_6, unsqueeze_26)
+        amax_3: "f16[10, 15, 96]" = torch.ops.aten.amax.default(mul_10, [2])
+        amax_4: "f16[10, 15, 96]" = torch.ops.aten.amax.default(mul_6, [2])
+        return amax, amax_1, amax_2, amax_3, amax_4
+
+    def reduction_test(self, mul_6, arg23_1):
+        eq_2: "b8[10, 255]" = torch.ops.aten.eq.Scalar(arg23_1, 1)
+        npu_dtype_cast_2: "f16[10, 255]" = torch.ops.npu.npu_dtype_cast.default(eq_2, torch.float16)
+        unsqueeze_19: "f16[10, 1, 255]" = torch.ops.aten.unsqueeze.default(npu_dtype_cast_2, 1)
+        unsqueeze_20: "f16[10, 1, 255, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_19, 3)
+        mul_7: "f16[10, 15, 255, 96]" = torch.ops.aten.mul.Tensor(mul_6, unsqueeze_20)
+        return torch.amax(mul_7, [2])
+
+    arg23_1 = rand_strided((10, 255), (255, 1), device='npu:0', dtype=torch.float16)
+    buf50 = torch.empty_strided((10, 15, 255, 96), (367200, 1, 1440, 15), device='npu', dtype=torch.float16)
+
+    def test_fused_14_eager(self):
+        _, _, _, _, r = self.fused_14_eager(self.buf50, self.arg23_1)
+        fused_14_triton = torch.compile(self.fused_14_eager, backend="inductor")
+        _, _, _, _, r1 = fused_14_triton(self.buf50, self.arg23_1)
+        self.assertEqual(r, r1)
+
+    def test_reduction(self):
+        r = self.reduction_test(self.buf50, self.arg23_1)
+        reduction_triton = torch.compile(self.reduction_test, backend="inductor")
+        r1 = reduction_triton(self.buf50, self.arg23_1)
+        self.assertEqual(r, r1)
+
+    def triton_unk_fused_div_mul_sum_14_eager(self, cat, sum_7, cat_1, sum_8):
+        view_151: "f16[10, 256, 4, 128]" = torch.ops.aten.view.default(cat_1, [10, 256, 4, 128])
+        slice_19: "f16[10, 255, 4, 128]" = torch.ops.aten.slice.Tensor(view_151, 1, 1, 9223372036854775807)
+        unsqueeze_6: "f16[10, 1, 255, 4, 128]" = torch.ops.aten.unsqueeze.default(slice_19, 1)
+        view_150: "f16[10, 16, 4, 128]" = torch.ops.aten.view.default(cat, [10, 16, 4, 128])
+        slice_18: "f16[10, 15, 4, 128]" = torch.ops.aten.slice.Tensor(view_150, 1, 1, 9223372036854775807)
+        unsqueeze_5: "f16[10, 15, 1, 4, 128]" = torch.ops.aten.unsqueeze.default(slice_18, 2)
+        pow_2: "f32[10, 15, 1, 4, 1]" = torch.ops.aten.pow.Tensor_Scalar(sum_7, 0.5)
+        convert_element_type_63: "f16[10, 15, 1, 4, 1]" = torch.ops.prims.convert_element_type.default(pow_2, torch.float16)
+        clamp_min_2: "f16[10, 15, 1, 4, 1]" = torch.ops.aten.clamp_min.default(convert_element_type_63, 1e-06)
+        expand_13: "f16[10, 15, 1, 4, 128]" = torch.ops.aten.expand.default(clamp_min_2, [10, 15, 1, 4, 128])
+        div_6: "f16[10, 15, 1, 4, 128]" = torch.ops.aten.div.Tensor(unsqueeze_5, expand_13)
+        pow_4: "f32[10, 1, 255, 4, 1]" = torch.ops.aten.pow.Tensor_Scalar(sum_8, 0.5)
+        convert_element_type_65: "f16[10, 1, 255, 4, 1]" = torch.ops.prims.convert_element_type.default(pow_4, torch.float16)
+        clamp_min_3: "f16[10, 1, 255, 4, 1]" = torch.ops.aten.clamp_min.default(convert_element_type_65, 1e-06)
+        expand_14: "f16[10, 1, 255, 4, 128]" = torch.ops.aten.expand.default(clamp_min_3, [10, 1, 255, 4, 128])
+        div_7: "f16[10, 1, 255, 4, 128]" = torch.ops.aten.div.Tensor(unsqueeze_6, expand_14)
+        mul_45: "f16[10, 15, 255, 4, 128]" = torch.ops.aten.mul.Tensor(div_6, div_7)
+        sum_9: "f16[10, 15, 255, 4]" = torch.ops.aten.sum.dim_IntList(mul_45, [4])
+        mul_46: "f16[10, 15, 255, 4]" = torch.ops.aten.mul.Tensor(sum_9, 0.5)
+        return mul_46
+
+    def test_fused_div_mul_sum_14(self):
+        buf377 = rand_strided((10, 16, 512), (8192, 512, 1), device='npu', dtype=torch.float16)
+        buf379 = rand_strided((10, 15, 1, 4, 1), (60, 4, 600, 1, 600), device='npu', dtype=torch.float32)
+        buf380 = rand_strided((10, 256, 512), (131072, 512, 1), device='npu', dtype=torch.float16)
+        buf382 = rand_strided((10, 1, 255, 4, 1), (1020, 10208, 4, 1, 10208), device='npu', dtype=torch.float32)
+        r1 = self.triton_unk_fused_div_mul_sum_14_eager(buf377, buf379, buf380, buf382)
+        compiled_14 = torch.compile(self.triton_unk_fused_div_mul_sum_14_eager, backend="inductor")
+        r = compiled_14(buf377, buf379, buf380, buf382)
+
+        self.assertEqual(r1, r, rtol=1e-3, atol=1e-3)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_issue61.py b/test/_inductor/test_issue61.py
new file mode 100644
index 0000000000..3869ef4dc2
--- /dev/null
+++ b/test/_inductor/test_issue61.py
@@ -0,0 +1,30 @@
+import torch
+from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests
+from testutils import TestUtils
+import torch_npu
+
+
+class Test_issue61(TestUtils):
+    def fx_graph(self, view_12, embedding_1):
+        permute_7 = torch.ops.aten.permute.default(embedding_1, [2, 0, 1])
+        unsqueeze_2 = torch.ops.aten.unsqueeze.default(permute_7, 0)
+        slice_3 = torch.ops.aten.slice.Tensor(unsqueeze_2, 0, 0, 9223372036854775807)
+        slice_4 = torch.ops.aten.slice.Tensor(slice_3, 1, 0, 9223372036854775807)
+        slice_5 = torch.ops.aten.slice.Tensor(slice_4, 2, -128, 9223372036854775807)
+        slice_6 = torch.ops.aten.slice.Tensor(slice_5, 3, 0, 9223372036854775807)
+        add_5 = torch.ops.aten.add.Tensor(view_12, slice_6)
+        view_13 = torch.ops.aten.view.default(add_5, [384, 128, 128])
+        view_14 = torch.ops.aten.view.default(view_13, [64, 6, 128, 128])
+        return view_14
+
+    def test_issue61(self):
+        buf85 = torch.empty_strided((64, 6, 128, 128), (98304, 16384, 128, 1), device='npu', dtype=torch.float32)
+        buf84 = torch.empty_strided((128, 128, 6), (768, 6, 1), device='npu', dtype=torch.float32)
+        model = torch.compile(self.fx_graph, backend="inductor")
+        data_t = model(buf85, buf84)
+        data = self.fx_graph(buf85, buf84)
+        self.assertEqual(data, data_t, atol=1e-3, rtol=1e-3)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_mamba.py b/test/_inductor/test_mamba.py
new file mode 100644
index 0000000000..e74833ea13
--- /dev/null
+++ b/test/_inductor/test_mamba.py
@@ -0,0 +1,32 @@
+import torch
+from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests
+from testutils import TestUtils
+import torch_npu
+
+
+class TestMamba(TestUtils):
+    def foo(self, a, b):
+        permute = a.permute(1, 0, 2).contiguous()
+        _, _, bt = torch.split(permute, [2048, 4096, 32], 2)
+        clone = bt.contiguous()
+        clone_1 = clone.contiguous()
+
+        add = clone_1 + b
+        y = torch.exp(add)
+        log1p = torch.log1p(y)
+        where = torch.where(add < 20, add, log1p)
+        return add, where, bt
+
+    def test_mamba(self):
+        a = torch.randn(4096, 3, 6176, requires_grad=False, dtype=torch.float32, device="npu")
+        b = torch.randn(32, requires_grad=False, dtype=torch.float32, device="npu")
+
+        compile_foo = torch.compile(self.foo, backend="inductor")
+        r, s, _ = self.foo(a, b)
+        r1, s1, _ = compile_foo(a, b)
+        torch.testing.assert_close(r, r1)
+        torch.testing.assert_close(s, s1)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_permute.py b/test/_inductor/test_permute.py
index d59e2f9ee4..1169bf486a 100644
--- a/test/_inductor/test_permute.py
+++ b/test/_inductor/test_permute.py
@@ -33,6 +33,38 @@ class TestPermute(TestUtils):
 
         self.assertEqual(std_permute, inductor_permute, atol=1e-3, rtol=1e-3)
 
+
+    def foo(self, a, b, c, shape):
+        y = a + b
+        y = c + y.permute(shape)
+        return y.clone()
+
+    def deberta_permute(self, permute_2):
+        permute_7: "f16[10, 4, 128, 255, 1, 1, 1, 1]" = torch.ops.aten.permute.default(permute_2, [0, 4, 6, 2, 7, 1, 3, 5])
+        clone_3: "f16[10, 4, 128, 255, 1, 1, 1, 1]" = torch.ops.aten.clone.default(permute_7, memory_format=torch.contiguous_format)
+        view_4: "f16[40, 128, 255]" = torch.ops.aten.view.default(clone_3, [40, 128, 255])
+        return view_4
+
+    def test_permute_deberta(self):
+        permute_2 = torch.randn((10, 1, 255, 1, 4, 1, 128, 1), device='npu', dtype=torch.float16)
+        eager_ret = self.deberta_permute(permute_2)
+        compiled_func = torch.compile(self.deberta_permute, backend="inductor")
+        inductor_ret = compiled_func(permute_2)
+        self.assertEqual(eager_ret, inductor_ret, rtol=1e-3, atol=1e-3)
+
+    base_shape = (8, 8, 512, 128)
+    a = torch.randn(base_shape, requires_grad=False, dtype=torch.float32, device="npu")
+    b = torch.randn(base_shape, requires_grad=False, dtype=torch.float32, device="npu")
+    d = torch.randn(base_shape, requires_grad=False, dtype=torch.float32, device="npu")
+
+    @parametrize('shape', [(2, 0, 1, 3), (1, 0, 2, 3), (0, 1, 3, 2), (3, 0, 1, 2), (0, 2, 1, 3)])
+    def test_pointwise_cases(self, shape):
+        c = self.d.permute(shape).contiguous()
+        func = torch.compile(self.foo, backend="inductor")
+        r = func(self.a, self.b, c, shape)
+        r1 = self.foo(self.a, self.b, c, shape)
+        self.assertEqual(r, r1, rtol=1e-3, atol=1e-3)
+
 instantiate_parametrized_tests(TestPermute)
 
 if __name__ == "__main__":
diff --git a/test/_inductor/test_permute_reshape.py b/test/_inductor/test_permute_reshape.py
new file mode 100644
index 0000000000..4edb66af59
--- /dev/null
+++ b/test/_inductor/test_permute_reshape.py
@@ -0,0 +1,28 @@
+import torch
+from torch.testing._internal.common_utils import run_tests
+from einops import rearrange
+from testutils import TestUtils
+import torch_npu
+
+
+class TestAddRearrange(TestUtils):
+    def foo(self, a, b, c):
+        y = a + b
+        y = c + rearrange(y, 'b n s d -> s b (n d)').contiguous()
+        return y
+
+    def test_pointwise_cases(self):
+        torch_npu._inductor.config.enable_npu_indexing = True
+
+        a, b = [torch.randn(1, 12, 4096, 8, requires_grad=False, dtype=torch.float32, device="npu") for _ in range(2)]
+        c = torch.randn(4096, 1, 96, requires_grad=False, dtype=torch.float32, device="npu")
+
+        func = torch.compile(self.foo, backend="inductor")
+
+        r = func(a, b, c)
+        r1 = self.foo(a, b, c)
+        torch.testing.assert_close(r, r1, rtol=1e-3, atol=1e-3)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_reduction.py b/test/_inductor/test_reduction.py
new file mode 100644
index 0000000000..3b0efba353
--- /dev/null
+++ b/test/_inductor/test_reduction.py
@@ -0,0 +1,34 @@
+import torch
+from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests
+from testutils import TestUtils
+import torch_npu
+
+torch_npu._inductor.config.enable_npu_indexing = True
+
+
+class TestReduction(TestUtils):
+    def reduction(self, a, b, dim, reduction_type="sum"):
+        y = a + b
+        if reduction_type == "sum":
+            return torch.sum(y, dim)
+        if reduction_type == "mean":
+            return torch.mean(y, dim)
+        return y
+
+    @parametrize('shape', [(1, 1, 1, 1024), (2053, 1023, 7, 9), (8, 8, 1024, 2048), (8, 8, 2048, 1024),
+                           (8, 1024, 2048, 8), (2048, 8, 1024, 8), (2048, 1024, 8, 8)])
+    @parametrize('reduction_type', ['sum', 'mean'])
+    @parametrize('dim', range(4))
+    def test_reduction(self, shape, reduction_type, dim):
+        a, b = [self._generate_tensor(shape, 'float32') for _ in range(2)]
+
+        reduction_func = torch.compile(self.reduction, backend="inductor", dynamic=False)
+        r = self.reduction(a, b, dim, reduction_type)
+        r1 = reduction_func(a, b, dim, reduction_type)
+        self.assertEqual(r, r1, rtol=1e-3, atol=1e-3)
+
+
+instantiate_parametrized_tests(TestReduction)
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_repeat.py b/test/_inductor/test_repeat.py
index 9df53202ac..3eab9dda9e 100644
--- a/test/_inductor/test_repeat.py
+++ b/test/_inductor/test_repeat.py
@@ -8,7 +8,6 @@ class TestRepeat(TestUtils):
     def op_calc(self, input_element, dim):
         return input_element.repeat(dim)
 
-    # case:change shapes
     @parametrize('shape', [(16, 128, 64)])
     @parametrize('dim', [(1, 1, 2), (1, 2, 1), (2, 1, 1)])
     @parametrize('dtype', ['float32'])
@@ -22,6 +21,16 @@ class TestRepeat(TestUtils):
 
         self.assertEqual(std_ret, inductor_ret, atol=1e-1, rtol=1e-1)
 
+    @parametrize('shape', [(8, 1024, 64)])
+    @parametrize('dim', [(1, 2, 1), (2, 1, 1), (1, 2, 2)])
+    def test_repeat(self, shape, dim):
+        a = torch.randn(shape, requires_grad=False, dtype=torch.float32, device="npu")
+
+        repeat_triton = torch.compile(self.op_calc, backend="inductor")
+        r = self.op_calc(a, dim=dim)
+        r1 = repeat_triton(a, dim=dim)
+        self.assertEqual(r, r1, rtol=1e-3, atol=1e-3)
+
 
 instantiate_parametrized_tests(TestRepeat)
 
diff --git a/test/_inductor/test_reshape_permute.py b/test/_inductor/test_reshape_permute.py
new file mode 100644
index 0000000000..c7ed7652cf
--- /dev/null
+++ b/test/_inductor/test_reshape_permute.py
@@ -0,0 +1,29 @@
+import torch
+from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests
+from testutils import TestUtils
+import torch_npu
+
+
+class TestReshapePermute(TestUtils):
+    def foo(self, a, d, shape):
+        y = a.reshape(shape)
+        y = y.permute(0, 2, 1) + d
+        return y
+
+    @parametrize('shape', [(8, 2048, 4), (8, 2048, 3), (8, 526, 3), (50, 526, 3), (50, 526, 129)])
+    @parametrize('dtype', ['float32'])
+    def test_reshape_and_permute(self, shape, dtype):
+        a = self._generate_tensor([shape[0], shape[1] * shape[2]], dtype)
+        d = self._generate_tensor([shape[0], shape[2], shape[1]], dtype)
+
+        func = torch.compile(self.foo, backend="inductor", dynamic=False)
+
+        r = func(a, d, shape)
+        r1 = self.foo(a, d, shape)
+        self.assertEqual(r, r1, rtol=1e-3, atol=1e-3)
+
+
+instantiate_parametrized_tests(TestReshapePermute)
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_softmax.py b/test/_inductor/test_softmax.py
new file mode 100644
index 0000000000..8f7559e79b
--- /dev/null
+++ b/test/_inductor/test_softmax.py
@@ -0,0 +1,53 @@
+import torch
+from torch.testing._internal.common_utils import run_tests
+from testutils import TestUtils
+import torch_npu
+
+DATA_TYPE = torch.float16
+
+
+class TestSoftmax(TestUtils):
+    def torch_cal(self, view_7, gather, gather_1, arg107_1, full_default):
+        slice_3 = torch.ops.aten.slice.Tensor(arg107_1, 1, 0, 256)
+        unsqueeze_1 = torch.ops.aten.unsqueeze.default(slice_3, 1)
+        unsqueeze_2 = torch.ops.aten.unsqueeze.default(unsqueeze_1, 2)
+        expand = torch.ops.aten.expand.default(unsqueeze_2, [11, 1, 256, 256])
+        npu_dtype_cast_1 = torch.ops.npu.npu_dtype_cast.default(expand, dtype=DATA_TYPE)
+
+        sub_1 = torch.ops.aten.sub.Tensor(1.0, npu_dtype_cast_1)
+
+        npu_dtype_cast_2 = torch.ops.npu.npu_dtype_cast.default(sub_1, torch.bool)
+        where = torch.ops.aten.where.self(npu_dtype_cast_2, full_default, sub_1)
+
+        permute_10 = torch.ops.aten.permute.default(gather_1, [0, 1, 3, 2])
+        add_4 = torch.ops.aten.add.Tensor(gather, permute_10)
+        mul_4 = torch.ops.aten.mul.Tensor(add_4, 0.07216878364870322)
+        add_5 = torch.ops.aten.add.Tensor(view_7, mul_4)
+        add_6 = torch.ops.aten.add.Tensor(add_5, where)
+
+        convert_element_type_20 = torch.ops.prims.convert_element_type.default(add_6, torch.float32)
+        amax = torch.ops.aten.amax.default(convert_element_type_20, [-1], True)
+        sub_4 = torch.ops.aten.sub.Tensor(convert_element_type_20, amax)
+        exp = torch.ops.aten.exp.default(sub_4)
+        sum_1 = torch.ops.aten.sum.dim_IntList(exp, [-1], True)
+        div = torch.ops.aten.div.Tensor(exp, sum_1)
+        convert_element_type_21 = torch.ops.prims.convert_element_type.default(div, dtype=DATA_TYPE)
+        return convert_element_type_21
+
+    def test_softmax(self):
+        buf34 = torch.rand([132, 256, 256], dtype=DATA_TYPE).npu()
+        buf47 = torch.rand([11, 12, 256, 256], dtype=DATA_TYPE).npu()
+        buf59 = torch.rand([11, 12, 256, 256], dtype=DATA_TYPE).npu()
+        arg107_1 = torch.randint(1, 1000, (11, 256), dtype=torch.int64).npu()
+        buf61 = torch.full([], -65504.0, dtype=DATA_TYPE).npu()
+        view_7 = torch.ops.aten.view.default(buf34, [11, 12, 256, 256]).npu()
+
+        res_ref = self.torch_cal(view_7, buf47, buf59, arg107_1, buf61)
+
+        compiled = torch.compile(self.torch_cal, backend="inductor")
+        res1 = compiled(view_7, buf47, buf59, arg107_1, buf61)
+        self.assertEqual(res1, res_ref, rtol=1e-2, atol=1e-2)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_store_permute.py b/test/_inductor/test_store_permute.py
new file mode 100644
index 0000000000..befb0f49bf
--- /dev/null
+++ b/test/_inductor/test_store_permute.py
@@ -0,0 +1,29 @@
+import torch
+from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests
+from torch._dynamo.testing import rand_strided
+from testutils import TestUtils
+import torch_npu
+
+
+class TestStorePermute(TestUtils):
+    def store_permute(self, bmm_25, arg113_1):
+        view_157: "f16[10, 4, 15, 4, 4, 1, 255, 1]" = torch.ops.aten.view.default(bmm_25, [10, 4, 15, 4, 4, 1, 255, 1])
+        permute_95: "f16[10, 15, 255, 4, 4, 4, 1, 1]" = torch.ops.aten.permute.default(view_157, [0, 2, 6, 3, 1, 4, 5, 7])
+        view_158: "f16[10, 15, 255, 4, 4, 4]" = torch.ops.aten.view.default(permute_95, [10, 15, 255, 4, 4, 4])
+        add_57: "f16[10, 15, 255, 4, 4, 4]" = torch.ops.aten.add.Tensor(view_158, arg113_1)
+        tanh_1: "f16[10, 15, 255, 4, 4, 4]" = torch.ops.aten.tanh.default(add_57)
+        clone_40: "f16[10, 15, 255, 4, 4, 4]" = torch.ops.aten.clone.default(tanh_1, memory_format=torch.contiguous_format)
+        return clone_40
+
+    def test_store_permute(self):
+        buf391 = rand_strided((40, 240, 255), (61200, 255, 1), device='npu:0', dtype=torch.float16)
+        arg113_1 = rand_strided((1, 1, 1, 4, 4, 4), (64, 64, 64, 16, 4, 1), device='npu:0', dtype=torch.float16)
+        compiled_func = torch.compile(self.store_permute, backend='inductor')
+        r = self.store_permute(buf391, arg113_1)
+        r1 = compiled_func(buf391, arg113_1)
+
+        self.assertEqual(r, r1, rtol=0.01, atol=0.01)
+
+
+if __name__ == "__main__":
+    run_tests()
-- 
Gitee