From a7ad16619ef15874d15809714ee0be45e451fd55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=98=89=E5=B7=8D?= <843972097@qq.com> Date: Thu, 19 Sep 2024 06:58:11 +0000 Subject: [PATCH 01/96] =?UTF-8?q?!14658=20Checkout=20branch=20to=20RC3.=20?= =?UTF-8?q?Merge=20pull=20request=20!14658=20from=20=E5=88=98=E5=98=89?= =?UTF-8?q?=E5=B7=8D/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitmodules | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitmodules b/.gitmodules index 8c21a3877b..ca763662de 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,12 +2,14 @@ path = third_party/op-plugin url = https://gitee.com/ascend/op-plugin.git ignore = dirty + branch = 6.0.rc3 [submodule "third_party/googletest"] path = third_party/googletest url = https://gitee.com/mirrors/googletest.git [submodule "third_party/torchair/torchair"] path = third_party/torchair/torchair url = https://gitee.com/ascend/torchair.git + branch = 6.0.rc3 [submodule "third_party/Tensorpipe"] path = third_party/Tensorpipe url = https://gitee.com/ascend/Tensorpipe.git -- Gitee From bf98f5c570998aaa60ac15884e41c4c35da65c45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E5=85=89=E6=B5=A9?= Date: Thu, 19 Sep 2024 09:20:37 +0000 Subject: [PATCH 02/96] =?UTF-8?q?!14705=20modify=20version=20Merge=20pull?= =?UTF-8?q?=20request=20!14705=20from=20=E9=83=AD=E5=85=89=E6=B5=A9/v2.1.0?= =?UTF-8?q?-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b2e74958dd..cfed83ba12 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ from wheel.bdist_wheel import bdist_wheel BASE_DIR = os.path.dirname(os.path.realpath(__file__)) THIRD_PARTY_PATH = os.path.join(BASE_DIR, "third_party") -VERSION = '2.1.0.post7' +VERSION = '2.1.0.post8' UNKNOWN = "Unknown" BUILD_PERMISSION = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP -- Gitee From 4b0c253d926b2a1067b29f02d696cac31018cb11 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 20 Sep 2024 08:43:55 +0000 Subject: [PATCH 03/96] !14785 Update op_plugin commit id Merge pull request !14785 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index cd3de98674..785ba5248e 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit cd3de98674aac44775448672d7193c8e1339fc7d +Subproject commit 785ba5248e2d1d51e13719177f7328c4aa38e836 -- Gitee From fddba1498bb422f1269d7ff7b9920dabd86e3714 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Fri, 20 Sep 2024 09:35:13 +0000 Subject: [PATCH 04/96] =?UTF-8?q?!14698=20silentCheckV2:=20filter=20models?= =?UTF-8?q?=20with=20fp16=20dtype=20Merge=20pull=20request=20!14698=20from?= =?UTF-8?q?=20=E7=8E=8B=E8=B6=85/v2.1.0-6.0.rc3=5Fsilent3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/utils/_step.py | 43 +++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/torch_npu/utils/_step.py b/torch_npu/utils/_step.py index 2b78813f30..960c0afff2 100644 --- a/torch_npu/utils/_step.py +++ b/torch_npu/utils/_step.py @@ -78,8 +78,9 @@ class SilentCheckState: def __init__(self): self.init_param() self.init_marks = {} - self.weight_hook_flags = {} - self.last_weight_hook_flags = 
{} + self.weight_hook_handles = {} + self.last_weight_hook_handles = {} + self.dtype_support = True def init_param(self): self.first_forward = True @@ -101,6 +102,18 @@ class SilentCheckState: else: torch_npu._C._npu_set_module_train_state("infer") + def check_tensor_dtype(self, tensor): + if not self.dtype_support: + return + if isinstance(tensor, torch.Tensor) and tensor.requires_grad and tensor.dtype == torch.float16: + self.dtype_support = False + + def check_dtype(self, module, *args): + for x in args: + self.check_tensor_dtype(x) + for param_name, param in module._parameters.items(): + self.check_tensor_dtype(param) + def search_first_weight(self, module): # Search the first weight if not self.init_marks.get(self.first_module_id, False) and self.first_weight is None: @@ -145,15 +158,15 @@ class SilentCheckState: if self.first_tensor_id != self.last_tensor_id: if self.last_tensor is not None: self.last_tensor.register_hook(output_hook) - if not self.last_weight_hook_flags.get(self.first_module_id, False): + if self.last_weight_hook_handles.get(self.first_module_id, None) is None: if self.last_weight is not None: - self.last_weight.register_hook(output_hook) - self.last_weight_hook_flags[self.first_module_id] = True - if not self.weight_hook_flags.get(self.first_module_id, False): + last_weight_handle = self.last_weight.register_hook(output_hook) + self.last_weight_hook_handles[self.first_module_id] = last_weight_handle + if self.weight_hook_handles.get(self.first_module_id, None) is None: if self.first_weight is not None: - self.first_weight.register_hook(input_hook("", asd_flag)) - self.weight_hook_flags[self.first_module_id] = True - self.init_marks[self.first_module_id] = True + first_weight_handle = self.first_weight.register_hook(input_hook("", asd_flag)) + self.weight_hook_handles[self.first_module_id] = first_weight_handle + self.init_marks[self.first_module_id] = True silent_check = SilentCheckState() @@ -275,6 +288,18 @@ def _custom_call(self, *args, **kwargs): silent_check.init_module_info(id(self), self.training) self.outer = True + if silent_check.is_training and not silent_check.init_marks.get(silent_check.first_module_id, False): + silent_check.check_dtype(self, *args) + if not silent_check.dtype_support: + for value in silent_check.weight_hook_handles.values(): + if value is not None: + value.remove() + for value in silent_check.last_weight_hook_handles.values(): + if value is not None: + value.remove() + asd_enable = 0 + warnings.warn(f"Warning: Module has unsupported dtype tensor, silent check will be closed.") + # Search the first tensor (if the first tensor is input) silent_check.register_input_hook_before_call(asd_enable, *args) -- Gitee From 37b954f0cd4208a8b185fbdb5bf629ba4c8f28ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Sat, 21 Sep 2024 03:33:27 +0000 Subject: [PATCH 05/96] =?UTF-8?q?!14802=20update=20torchair=20commitid=20M?= =?UTF-8?q?erge=20pull=20request=20!14802=20from=20=E5=85=B3=E9=BE=99?= =?UTF-8?q?=E9=94=8B/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 5143b41229..485484ca71 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 5143b41229264549c27510ebbd310169568e7758 +Subproject commit 485484ca7143cdf47415793ca76db9210cff8a4c -- Gitee 
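A note on PATCH 04/96 above (silentCheckV2): the rework replaces boolean "hook registered" flags with the RemovableHandle objects returned by Tensor.register_hook, which is what allows the new fp16 filter to unregister every silent-check hook once an unsupported dtype is detected. A minimal standalone sketch of that pattern in plain PyTorch (names invented for illustration, not code from this patch series):

    import torch

    handles = {}

    def grad_check_hook(grad):
        # a real check would inspect grad here; returning None leaves it unchanged
        return None

    w = torch.randn(4, 4, requires_grad=True)
    # register_hook returns a torch.utils.hooks.RemovableHandle; keep it for later removal
    handles[id(w)] = w.register_hook(grad_check_hook)

    w.sum().backward()  # the hook fires during backward

    # on detecting an unsupported dtype (e.g. torch.float16), detach all hooks again;
    # a bare True/False flag, as in the old code, could not undo the registration
    for handle in handles.values():
        if handle is not None:
            handle.remove()
    handles.clear()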
From cc375cc5804ac2c4c43eb5f60b593c6d1a61614e Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 23 Sep 2024 04:43:42 +0000 Subject: [PATCH 06/96] !14834 Update op_plugin commit id Merge pull request !14834 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 785ba5248e..3b738f2d6c 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 785ba5248e2d1d51e13719177f7328c4aa38e836 +Subproject commit 3b738f2d6c2aff15a77b94c37a71873038877df7 -- Gitee From 074ae4b7a1fe7e47e55e3b736d1ca69bd49c22dc Mon Sep 17 00:00:00 2001 From: sunjiayang Date: Mon, 23 Sep 2024 07:02:08 +0000 Subject: [PATCH 07/96] !14770 only last error Merge pull request !14770 from sunjiayang/last_error_210_rc3 --- third_party/acl/inc/acl/acl_base.h | 2 +- third_party/acl/inc/acl/acl_rt.h | 55 ++++++++++++++++++- third_party/acl/libs/acl.cpp | 3 + torch_npu/csrc/core/npu/NPUEventManager.cpp | 5 +- torch_npu/csrc/core/npu/NPUException.h | 20 ++++++- torch_npu/csrc/core/npu/NPUFunctions.cpp | 14 +++-- torch_npu/csrc/core/npu/NPUQueue.cpp | 4 +- torch_npu/csrc/core/npu/NPUStream.cpp | 6 +- .../core/npu/THNPUCachingHostAllocator.cpp | 8 +-- .../csrc/core/npu/interface/AclInterface.cpp | 28 ++++++++++ .../csrc/core/npu/interface/AclInterface.h | 4 ++ .../csrc/distributed/ProcessGroupHCCL.cpp | 7 ++- torch_npu/csrc/framework/OpParamMaker.cpp | 46 ++++++++++++---- 13 files changed, 168 insertions(+), 34 deletions(-) diff --git a/third_party/acl/inc/acl/acl_base.h b/third_party/acl/inc/acl/acl_base.h index 091e45aa23..6411f94794 100644 --- a/third_party/acl/inc/acl/acl_base.h +++ b/third_party/acl/inc/acl/acl_base.h @@ -132,7 +132,7 @@ static const int ACL_ERROR_GE_FAILURE = 500002; static const int ACL_ERROR_RT_FAILURE = 500003; static const int ACL_ERROR_DRV_FAILURE = 500004; static const int ACL_ERROR_PROFILING_FAILURE = 500005; -static const int ACL_ERROR_RT_DEVICE_MTE_ERROR = 507053; +static const int ACL_ERROR_RT_DEVICE_MEM_ERROR = 507053; #define ACL_TENSOR_SHAPE_RANGE_NUM 2 #define ACL_TENSOR_VALUE_RANGE_NUM 2 diff --git a/third_party/acl/inc/acl/acl_rt.h b/third_party/acl/inc/acl/acl_rt.h index 6a4add2c16..33052829bf 100644 --- a/third_party/acl/inc/acl/acl_rt.h +++ b/third_party/acl/inc/acl/acl_rt.h @@ -168,6 +168,10 @@ typedef enum aclrtCmoType { ACL_RT_CMO_TYPE_PREFETCH = 0, } aclrtCmoType; +typedef enum aclrtLastErrLevel { + ACL_RT_THREAD_LEVEL = 0, +} aclrtLastErrLevel; + typedef void* aclrtDrvMemHandle; typedef void (*aclrtCallback)(void *userData); @@ -1453,11 +1457,58 @@ ACL_FUNC_VISIBILITY aclError aclrtResetOverflowStatus(aclrtStream stream); */ ACL_FUNC_VISIBILITY aclError aclrtCmoAsync(void *src, size_t size, aclrtCmoType cmoType, aclrtStream stream); -ACL_FUNC_VISIBILITY aclError aclrtGetMemUceInfo(int32_t deviceId, aclrtMemUceInfo* memUceInfoArray, size_t arraySize, size_t *retSize); +/** + * @ingroup AscendCL + * @brief get the mem uce info + * @param [in] deviceId + * @param [in/out] memUceInfoArray + * @param [in] arraySize + * @param [out] retSize + * @retval ACL_SUCCESS The function is successfully executed. 
+ * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclrtGetMemUceInfo(int32_t deviceId, aclrtMemUceInfo *memUceInfoArray, size_t arraySize, size_t *retSize); +/** + * @ingroup AscendCL + * @brief stop the task on specified device + * @param [in] deviceId + * @param [in] timeout + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ ACL_FUNC_VISIBILITY aclError aclrtDeviceTaskAbort(int32_t deviceId, uint32_t timeout); -ACL_FUNC_VISIBILITY aclError aclrtMemUceRepair(int32_t deviceId, aclrtMemUceInfo* memUceInfoArray, size_t arraySize); +/** + * @ingroup AscendCL + * @brief repair the mem uce + * @param [in] deviceId + * @param [in/out] memUceInfoArray + * @param [in] arraySize + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclrtMemUceRepair(int32_t deviceId, aclrtMemUceInfo *memUceInfoArray, size_t arraySize); + +/** + * @ingroup AscendCL + * @brief peek at last error by level + * + * @param level [IN] error level + * + * @retval Runtime error code + */ +ACL_FUNC_VISIBILITY aclError aclrtPeekAtLastError(aclrtLastErrLevel level); + +/** + * @ingroup AscendCL + * @brief get last error by level + * + * @param level [IN] error level + * + * @retval Runtime error code + */ +ACL_FUNC_VISIBILITY aclError aclrtGetLastError(aclrtLastErrLevel level); #ifdef __cplusplus } diff --git a/third_party/acl/libs/acl.cpp b/third_party/acl/libs/acl.cpp index b8f598f163..4f24e6bf04 100644 --- a/third_party/acl/libs/acl.cpp +++ b/third_party/acl/libs/acl.cpp @@ -51,6 +51,9 @@ aclError aclrtGetMemInfo(aclrtMemAttr attr, size_t *free, size_t *total){return aclError aclrtGetMemUceInfo(int32_t deviceId, aclrtMemUceInfo* memUceInfoArray, size_t arraySize, size_t *retSize){return 0;} aclError aclrtMemUceRepair(int32_t deviceId, aclrtMemUceInfo* memUceInfoArray, size_t arraySize){return 0;} aclError aclrtCmoAsync(void *src, size_t size, aclrtCmoType cmoType, aclrtStream stream){return 0;} +aclError aclrtGetLastError(aclrtLastErrLevel flag){return 0;} +aclError aclrtPeekAtLastError(aclrtLastErrLevel flag){return 0;} + // op相关操作 aclopAttr *aclopCreateAttr(){return NULL;} diff --git a/torch_npu/csrc/core/npu/NPUEventManager.cpp b/torch_npu/csrc/core/npu/NPUEventManager.cpp index d69dd4622e..75cb33c240 100644 --- a/torch_npu/csrc/core/npu/NPUEventManager.cpp +++ b/torch_npu/csrc/core/npu/NPUEventManager.cpp @@ -64,8 +64,9 @@ aclError NPUEventManager::LazyDestroy(aclrtEvent npu_event) int err = aclrtDestroyEvent(npu_event); if (err == ACL_ERROR_NONE) { ASCEND_LOGI("Event: aclrtDestroyEvent is successfully executed, event=%p", npu_event); + } else { + CHECK_AND_THROW_FORCE_STOP(err); } - CHECK_AND_THROW_FORCE_STOP(err); return err; } std::lock_guard guard(event_queue_mutex_); @@ -88,8 +89,8 @@ void NPUEventManager::ClearEvent() } #endif auto err = aclrtDestroyEvent(event); - CHECK_AND_THROW_FORCE_STOP(err); if (err != ACL_ERROR_NONE) { + CHECK_AND_THROW_FORCE_STOP(err); NPU_CHECK_WARN(err); } else { ASCEND_LOGI("Event: aclrtDestroyEvent is successfully executed, event=%p", event); diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h index d6d09d443c..bd2f49c5d6 100644 --- a/torch_npu/csrc/core/npu/NPUException.h +++ b/torch_npu/csrc/core/npu/NPUException.h @@ -89,6 +89,9 @@ std::string formatErrorCode(SubModule submodule, ErrCode errorCode); #define GRAPH_ERROR(error) formatErrorCode(SubModule::GRAPH, error) #define PROF_ERROR(error) 
formatErrorCode(SubModule::PROF, error) +#define DEVICE_TASK_ABORT "107022" +#define DEVICE_MEM_ERROR "507053" + inline const char* getErrorFunction(const char* msg) { return msg; @@ -101,7 +104,12 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) } #define CHECK_AND_THROW_FORCE_STOP(err_code) \ - if ((err_code) == ACL_ERROR_RT_DEVICE_TASK_ABORT) { \ + auto Error_stop = (int)(err_code); \ + auto stop_error = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); \ + if ((stop_error) != ACL_ERROR_NONE) { \ + Error_stop = stop_error; \ + } \ + if ((Error_stop) == ACL_ERROR_RT_DEVICE_TASK_ABORT) { \ c10_npu::set_has_throw_error(true); \ TORCH_CHECK( \ false, \ @@ -111,12 +119,17 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) ":", \ __LINE__, \ " NPU function error: FORCE STOP.", \ - ", error code is ", err_code, \ + ", error code is ", Error_stop, \ PTA_ERROR(ErrCode::ACL)); \ } \ #define CHECK_AND_THROW_UCE_ERROR(err_code) \ - if ((err_code) == ACL_ERROR_RT_DEVICE_MTE_ERROR && \ + auto Error_uce = (int)(err_code); \ + auto uce_error = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); \ + if ((uce_error) != ACL_ERROR_NONE) { \ + Error_uce = uce_error; \ + } \ + if ((Error_uce) == ACL_ERROR_RT_DEVICE_MEM_ERROR && \ c10_npu::get_has_throw_error() == false && c10_npu::checkUceErrAndRepair()) { \ c10_npu::set_has_throw_error(true); \ TORCH_CHECK( \ @@ -127,6 +140,7 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) ":", \ __LINE__, \ " NPU function error: UCE ERROR.", \ + ", error code is ", Error_uce, \ PTA_ERROR(ErrCode::ACL)); \ } \ diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index f27bc24ed0..59456b3349 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -20,9 +20,9 @@ c10::DeviceIndex device_count() noexcept // initialize number of devices only once if (dev_count == 0) { aclError error = aclrtGetDeviceCount(&dev_count); - CHECK_AND_THROW_FORCE_STOP(error); - CHECK_AND_THROW_UCE_ERROR(error); if (error != ACL_ERROR_NONE) { + CHECK_AND_THROW_FORCE_STOP(error); + CHECK_AND_THROW_UCE_ERROR(error); ASCEND_LOGE("get device count of NPU failed"); return 0; } @@ -48,8 +48,10 @@ aclError GetDevice(int32_t *device) return ACL_ERROR_NONE; } aclError err = aclrtGetDevice(device); - CHECK_AND_THROW_FORCE_STOP(err); - CHECK_AND_THROW_UCE_ERROR(err); + if (err != ACL_ERROR_NONE) { + CHECK_AND_THROW_FORCE_STOP(err); + CHECK_AND_THROW_UCE_ERROR(err); + } if (err == ACL_ERROR_NONE) { local_device = *device; } else if (err == ACL_ERROR_RT_CONTEXT_NULL && aclrtSetDevice(0) == ACL_ERROR_NONE) { @@ -155,9 +157,9 @@ aclError SynchronizeUsedDevices() for (const auto it : used_devices) { NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(it.first)); aclError acl_ret = aclrtSynchronizeDevice(); - CHECK_AND_THROW_FORCE_STOP(acl_ret); - CHECK_AND_THROW_UCE_ERROR(acl_ret); if (acl_ret != ACL_ERROR_NONE) { + CHECK_AND_THROW_FORCE_STOP(acl_ret); + CHECK_AND_THROW_UCE_ERROR(acl_ret); return acl_ret; } } diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 0cdd29b6ef..b5b762942c 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -265,7 +265,7 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) } #endif read_idx.idx = write_idx.idx; - if (call_ret == ACL_ERROR_RT_DEVICE_MTE_ERROR && checkUceErrAndRepair()) { + if (call_ret == 
ACL_ERROR_RT_DEVICE_MEM_ERROR && checkUceErrAndRepair()) { set_has_throw_error(true); call_ret = 0; if (check_error) { @@ -387,7 +387,7 @@ void Repository::Enqueue(void* cur_paras) { SetStatus(CAN_EXIT); read_idx.idx = write_idx.idx; - if (call_ret == ACL_ERROR_RT_DEVICE_MTE_ERROR && checkUceErrAndRepair()) { + if (call_ret == ACL_ERROR_RT_DEVICE_MEM_ERROR && checkUceErrAndRepair()) { set_has_throw_error(true); call_ret = 0; throw std::runtime_error("UCE ERROR" + PTA_ERROR(ErrCode::ACL)); diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index cf9baf20d5..3df354ce50 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -429,8 +429,10 @@ bool npuSynchronizeDevice(bool check_error) } } auto acl_ret = aclrtSynchronizeDevice(); - CHECK_AND_THROW_FORCE_STOP(acl_ret); - CHECK_AND_THROW_UCE_ERROR(acl_ret); + if (acl_ret != ACL_ERROR_NONE) { + CHECK_AND_THROW_FORCE_STOP(acl_ret); + CHECK_AND_THROW_UCE_ERROR(acl_ret); + } #ifndef BUILD_LIBTORCH if (acl_ret == ACL_ERROR_NONE) { const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); diff --git a/torch_npu/csrc/core/npu/THNPUCachingHostAllocator.cpp b/torch_npu/csrc/core/npu/THNPUCachingHostAllocator.cpp index 8820c9f864..15645f966b 100644 --- a/torch_npu/csrc/core/npu/THNPUCachingHostAllocator.cpp +++ b/torch_npu/csrc/core/npu/THNPUCachingHostAllocator.cpp @@ -132,9 +132,9 @@ struct HostAllocator { // allocate a new block if no cached allocation is found err = aclrtMallocHost(ptr, size); - CHECK_AND_THROW_FORCE_STOP(err); - CHECK_AND_THROW_UCE_ERROR(err); if (err != ACL_ERROR_NONE) { + CHECK_AND_THROW_FORCE_STOP(err); + CHECK_AND_THROW_UCE_ERROR(err); return err; } @@ -161,9 +161,9 @@ struct HostAllocator { // insert npu events for each stream on which this block was used. 
This aclError err = insertEvents(block); - CHECK_AND_THROW_FORCE_STOP(err); - CHECK_AND_THROW_UCE_ERROR(err); if (err != ACL_ERROR_NONE) { + CHECK_AND_THROW_FORCE_STOP(err); + CHECK_AND_THROW_UCE_ERROR(err); return err; } diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 04bb398e76..dc31bd985b 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -65,6 +65,8 @@ LOAD_FUNCTION(aclrtGetMemUceInfo) LOAD_FUNCTION(aclrtDeviceTaskAbort) LOAD_FUNCTION(aclrtMemUceRepair) LOAD_FUNCTION(aclrtCmoAsync) +LOAD_FUNCTION(aclrtGetLastError) +LOAD_FUNCTION(aclrtPeekAtLastError) aclprofStepInfoPtr init_stepinfo() { typedef aclprofStepInfoPtr(*npdInitFunc)(); @@ -607,6 +609,32 @@ aclError AclrtCmoAsync(void* src, size_t size, aclrtCmoType cmoType, aclrtStream return func(src, size, cmoType, stream); } +aclError AclrtGetLastError(aclrtLastErrLevel flag) +{ + typedef aclError (*AclrtGetLastError)(aclrtLastErrLevel flag); + static AclrtGetLastError func = nullptr; + if (func == nullptr) { + func = (AclrtGetLastError) GET_FUNC(aclrtGetLastError); + } + if (func == nullptr) { + return ACL_ERROR_NONE; + } + return func(flag); +} + +aclError AclrtPeekAtLastError(aclrtLastErrLevel flag) +{ + typedef aclError (*AclrtPeekAtLastError)(aclrtLastErrLevel flag); + static AclrtPeekAtLastError func = nullptr; + if (func == nullptr) { + func = (AclrtPeekAtLastError) GET_FUNC(aclrtPeekAtLastError); + } + if (func == nullptr) { + return ACL_ERROR_NONE; + } + return func(flag); +} + aclError AclStressDetect(int32_t deviceId, void *workspace, size_t workspaceSize) { typedef aclError (*AclStressDetect)(int32_t, void*, size_t); diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index 30270d2aeb..d868d46423 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -168,6 +168,10 @@ aclError AclrtMemUceRepair(int32_t deviceId, aclrtMemUceInfo* memUceInfoArray, s aclError AclrtCmoAsync(void* src, size_t size, aclrtCmoType cmoType, aclrtStream stream); +aclError AclrtGetLastError(aclrtLastErrLevel flag); + +aclError AclrtPeekAtLastError(aclrtLastErrLevel flag); + aclError AclStressDetect(int32_t deviceId, void *workspace, size_t workspaceSize); } // namespace acl diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 0e5ac539b8..61c018dda3 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -1488,12 +1488,15 @@ void ProcessGroupHCCL::workEnqueue(c10::intrusive_ptrcustomHandler(); } catch (std::exception &e) { - if (std::string(e.what()).find("device task abort") != std::string::npos) { - ret = ACL_ERROR_RT_DEVICE_TASK_ABORT; + if (std::string(e.what()).find(DEVICE_TASK_ABORT) != std::string::npos || + std::string(e.what()).find(DEVICE_MEM_ERROR) != std::string::npos) { + ret =c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); } else { ret = ACL_ERROR_INVALID_PARAM; LOG(ERROR) << e.what(); } ASCEND_LOGE("Custom hand error:%s", e.what()); } - if (ret != ACL_ERROR_NONE && ret!= ACL_ERROR_RT_DEVICE_TASK_ABORT) { + if (ret != ACL_ERROR_NONE && ret != ACL_ERROR_RT_DEVICE_TASK_ABORT && ret != ACL_ERROR_RT_DEVICE_MEM_ERROR) { ASCEND_LOGE("Custom hand fail! 
name=%s, ret=0x%#x", cur_paras->opType, ret); C10_NPU_SHOW_ERR_MSG(); } @@ -304,7 +306,11 @@ int ExecFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) ACL_ENGINE_SYS, at_npu::native::aoe::aoe_manager().GetDumpGraphPath().c_str(), nullptr); - if (ret != ACL_ERROR_NONE && ret!= ACL_ERROR_RT_DEVICE_TASK_ABORT) { + if (ret != ACL_ERROR_NONE) { + auto ret_temp = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); + if (ret_temp != ACL_ERROR_NONE) { + ret = ret_temp; + } ASCEND_LOGE("In aoe mode, AclGenGraphAndDumpForOp failed!"); C10_NPU_SHOW_ERR_MSG(); return ret; @@ -327,7 +333,11 @@ int ExecFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) NPU_CHECK_ERROR_WITHOUT_UCE(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, "disable")); } - if (ret != ACL_ERROR_NONE && ret!= ACL_ERROR_RT_DEVICE_TASK_ABORT) { + if (ret != ACL_ERROR_NONE) { + auto ret_temp = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); + if (ret_temp != ACL_ERROR_NONE) { + ret = ret_temp; + } printErrorLog(cur_paras); C10_NPU_SHOW_ERR_MSG(); } @@ -340,7 +350,11 @@ int MemcopyAsyncFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) auto cur_paras = static_cast(in->paramVal); aclError ret = aclrtMemcpyAsync(cur_paras->dst, cur_paras->dstLen, cur_paras->src, cur_paras->srcLen, cur_paras->kind, stream); - if (ret != ACL_ERROR_NONE && ret!= ACL_ERROR_RT_DEVICE_TASK_ABORT) { + if (ret != ACL_ERROR_NONE) { + auto ret_temp = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); + if (ret_temp != ACL_ERROR_NONE) { + ret = ret_temp; + } ASCEND_LOGE( "aclrtMemcpyAsync error! ret = %d, dstLen = %zu, srcLen = %zu, kind = %d", ret, @@ -357,7 +371,11 @@ int RecordEventFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) auto cur_paras = static_cast(in->paramVal); aclError ret = aclrtRecordEvent(cur_paras->event, stream); - if (ret != ACL_ERROR_NONE && ret!= ACL_ERROR_RT_DEVICE_TASK_ABORT) { + if (ret != ACL_ERROR_NONE) { + auto ret_temp = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); + if (ret_temp != ACL_ERROR_NONE) { + ret = ret_temp; + } ASCEND_LOGE("aclrtRecordEvent error! ret = %d, eventAllocatorType = %d", ret, cur_paras->eventAllocatorType); C10_NPU_SHOW_ERR_MSG(); } @@ -374,7 +392,11 @@ int WaitEventFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) { auto cur_paras = static_cast(in->paramVal); aclError ret = aclrtStreamWaitEvent(stream, cur_paras->event); - if (ret != ACL_ERROR_NONE && ret!= ACL_ERROR_RT_DEVICE_TASK_ABORT) { + if (ret != ACL_ERROR_NONE) { + auto ret_temp = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); + if (ret_temp != ACL_ERROR_NONE) { + ret = ret_temp; + } ASCEND_LOGE( "aclrtStreamWaitEvent error! ret = %d, eventAllocatorType = %d", ret, @@ -392,7 +414,11 @@ int LazyDestroyEventFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) { auto cur_paras = static_cast(in->paramVal); aclError ret = c10_npu::NPUEventManager::GetInstance().LazyDestroy(cur_paras->event); - if (ret != ACL_ERROR_NONE && ret!= ACL_ERROR_RT_DEVICE_TASK_ABORT) { + if (ret != ACL_ERROR_NONE) { + auto ret_temp = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); + if (ret_temp != ACL_ERROR_NONE) { + ret = ret_temp; + } ASCEND_LOGE("LazyDestroy error! ret = %d, eventAllocatorType = %d", ret, cur_paras->eventAllocatorType); C10_NPU_SHOW_ERR_MSG(); } -- Gitee From 3a34bb7eccee08e762fa5232e22d91518ee761c6 Mon Sep 17 00:00:00 2001 From: will-devil Date: Mon, 23 Sep 2024 09:34:58 +0000 Subject: [PATCH 08/96] !14796 [Bugfix] Reduce unnecessary memory allocation. 
Merge pull request !14796 from will-devil/v2.1.0-6.0.rc3 --- .../csrc/aten/ops/op_api/CopyKernelOpApi.cpp | 43 ++++++------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp b/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp index 76d2a6a6b9..7baad2af45 100644 --- a/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp +++ b/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp @@ -182,45 +182,30 @@ void copy_d2d_baseformat_opapi(at::Tensor& dst, const at::Tensor& src, bool non_ at::Tensor& NPUNativeOpApiFunctions::copy_(at::Tensor& self, const at::Tensor& src, bool non_blocking) { DO_COMPATIBILITY(aclnnInplaceCopy, NPUNativeFunctions::copy_(self, src, non_blocking)); + if (self.numel() == 0) { return self; } - auto result = OpPreparation::apply_tensor_without_format(src); - if (src.is_complex() && torch_npu::utils::is_npu(src)) { - auto real_tensor = at::real(src); - auto imag_tensor = OpPreparation::apply_tensor_without_format(src); + auto maybe_outnames = at::namedinference::compute_broadcast_outnames(self, src); - if (src.is_conj()) { - auto tmp = at::imag(src); - tmp._set_neg(false); - imag_tensor = tmp.neg(); + if (torch_npu::utils::is_npu(self)) { + if (torch_npu::utils::is_npu(src)) { + copy_d2d_baseformat_opapi(self, src, non_blocking); } else { - imag_tensor = at::imag(src); + copy_h2d_baseformat_opapi(self, src, non_blocking); } - - auto outDtype = src.dtype(); - auto outputSize = op_infer::broadcast_ops_npu_output_size(real_tensor, imag_tensor); - result = OpPreparation::apply_tensor_without_format(outputSize, real_tensor.options().dtype(outDtype)); - EXEC_NPU_CMD(aclnnComplex, real_tensor, imag_tensor, result); - } else { - result = src; - if (src.is_neg()) { - src._set_neg(false); - result = src.neg(); + if (src.is_complex() && src.is_conj()) { + auto real_tensor = at::real(self); + auto imag_tensor = at::imag(self).neg(); + EXEC_NPU_CMD(aclnnComplex, real_tensor, imag_tensor, self); } - } - auto maybe_outnames = at::namedinference::compute_broadcast_outnames(self, result); - - if (torch_npu::utils::is_npu(self)) { - if (torch_npu::utils::is_npu(result)) { - copy_d2d_baseformat_opapi(self, result, non_blocking); - } else { - copy_h2d_baseformat_opapi(self, result, non_blocking); + if (src.is_neg()) { + self.neg_(); } } else { - if (torch_npu::utils::is_npu(result)) { - copy_d2h_baseformat_opapi(self, result, non_blocking); + if (torch_npu::utils::is_npu(src)) { + copy_d2h_baseformat_opapi(self, src, non_blocking); } } at::namedinference::propagate_names_if_nonempty(self, maybe_outnames); -- Gitee From d5399f76b9274673db5c830f6254b04e2e6be8fc Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 23 Sep 2024 10:43:41 +0000 Subject: [PATCH 09/96] !14841 Update op_plugin commit id Merge pull request !14841 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 3b738f2d6c..5952cf37f3 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 3b738f2d6c2aff15a77b94c37a71873038877df7 +Subproject commit 5952cf37f3c76ec37105ffe9dd01003101a1437c -- Gitee From d33726ef5eb664f02122897a7587fffcbb860466 Mon Sep 17 00:00:00 2001 From: wangqihui01 Date: Mon, 23 Sep 2024 12:40:42 +0000 Subject: [PATCH 10/96] !14818 check analyse_flat, schedule and on_trace_ready parameters Merge pull request !14818 from wangqihui01/v2.1.0-6.0.rc3 --- torch_npu/profiler/profiler.py | 10 
+++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/torch_npu/profiler/profiler.py b/torch_npu/profiler/profiler.py index dea02aaded..d15f7e1ec7 100644 --- a/torch_npu/profiler/profiler.py +++ b/torch_npu/profiler/profiler.py @@ -136,6 +136,9 @@ class _KinetoProfile: @no_exception_func() def tensorboard_trace_handler(dir_name: str = None, worker_name: str = None, analyse_flag: bool = True): ProfPathCreator().init(worker_name=worker_name, dir_name=dir_name) + if not isinstance(analyse_flag, bool): + print_warn_msg("analyse_flag is not bool, set by default.") + analyse_flag = True def handler_fn(prof_inst) -> None: if analyse_flag: @@ -162,13 +165,18 @@ class profile(_KinetoProfile): ): super().__init__() activities_set = set(activities) if activities else supported_activities() - if schedule: + if schedule and isinstance(schedule, Callable): self.schedule = schedule # add step markers into the trace and table view self.record_steps = True else: + if schedule: + print_warn_msg("schedule is not Callable, set by default.") self.schedule = _default_schedule_fn self.record_steps = False + if on_trace_ready and not isinstance(on_trace_ready, Callable): + print_warn_msg("on_trace_ready is not Callable, set by default.") + on_trace_ready = None self.prof_if = _ProfInterface( activities=activities_set, record_shapes=record_shapes, -- Gitee From d9d36cbf77a727df2e73526e949969a34dc211c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=98=89=E5=B7=8D?= <843972097@qq.com> Date: Mon, 23 Sep 2024 13:16:30 +0000 Subject: [PATCH 11/96] =?UTF-8?q?!14790=20[Fix]=20Fix=20public=20bindings.?= =?UTF-8?q?=20Merge=20pull=20request=20!14790=20from=20=E5=88=98=E5=98=89?= =?UTF-8?q?=E5=B7=8D/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/npu/test_public_bindings.py | 7 +++++++ test/torch_npu_schema.json | 21 --------------------- 2 files changed, 7 insertions(+), 21 deletions(-) diff --git a/test/npu/test_public_bindings.py b/test/npu/test_public_bindings.py index 1afd5ac8ad..3b5f8000c9 100644 --- a/test/npu/test_public_bindings.py +++ b/test/npu/test_public_bindings.py @@ -42,6 +42,13 @@ tempFilter = { "torch_npu.npu_masked_softmax_with_rel_pos_bias", "torch_npu.npu_moe_gating_top_k_softmax", "torch_npu.npu_moe_init_routing", + "torch_npu.npu_ifmr", + "torch_npu.npu_masked_fill_range", + "torch_npu.npu_normalize_batch", + "torch_npu.npu_rotated_box_decode", + "torch_npu.npu_rotated_box_encode", + "torch_npu.npu_scatter", + "torch_npu.npu_stride_add", "torch_npu.utils.collect_env.main", "torch_npu.utils.collect_env.namedtuple", "torch_npu.one_", diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index d2b4ba302f..f168547aa8 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2744,9 +2744,6 @@ "torch_npu.npu_gru": { "signature": "(inputs, hx, weight_input, weight_hidden, bias_input, bias_hidden, seq_length, has_biases, num_layers, dropout, train, bidirectional, batch_first)" }, - "torch_npu.npu_ifmr": { - "signature": "(data, data_min, data_max, cumsum, min_percentile, max_percentile, search_start, search_end, search_step, with_offset)" - }, "torch_npu.npu_incre_flash_attention": { "signature": "(self, query, key, value, padding_mask, atten_mask, pse_shift, actual_seq_lengths, antiquant_scale, antiquant_offset, block_table, num_heads, scale_value, input_layout, num_key_value_heads, block_size, inner_precise)" }, @@ -2765,9 +2762,6 @@ "torch_npu.npu_lstm": { "signature": "(inputs, 
weight, bias, seqMask, h, c, has_biases, num_layers, dropout, train, bidirectional, batch_first, flagSeq, direction)" }, - "torch_npu.npu_masked_fill_range": { - "signature": "(self, start, end, value, axis=-1)" - }, "torch_npu.npu_max": { "signature": "(self, dim, keepdim=False)" }, @@ -2798,9 +2792,6 @@ "torch_npu.npu_nms_with_mask": { "signature": "(inputs, iou_threshold)" }, - "torch_npu.npu_normalize_batch": { - "signature": "(self, seq_len, normalize_type=0)" - }, "torch_npu.npu_one_hot": { "signature": "(self, num_classes=-1, depth=1, on_value=1, off_value=0)" }, @@ -2840,12 +2831,6 @@ "torch_npu.npu_rotary_mul": { "signature": "(x, r1, r2)" }, - "torch_npu.npu_rotated_box_decode": { - "signature": "(self, deltas, weight)" - }, - "torch_npu.npu_rotated_box_encode": { - "signature": "(self, gt_bboxes, weight)" - }, "torch_npu.npu_rotated_iou": { "signature": "(self, query_boxes, trans=False, mode=0, is_cross=True, v_threshold=0.0, e_threshold=0.0)" }, @@ -2855,9 +2840,6 @@ "torch_npu.npu_scaled_masked_softmax": { "signature": "(x, mask, scale=1, fixed_triu_mask=False)" }, - "torch_npu.npu_scatter": { - "signature": "(self, indices, updates, dim)" - }, "torch_npu.npu_scatter_nd_update": { "signature": "(self, indices, updates)" }, @@ -2882,9 +2864,6 @@ "torch_npu.npu_sort_v2": { "signature": "(self, dim=-1, descending=False, out=None)" }, - "torch_npu.npu_stride_add": { - "signature": "(self, other, offset1, offset2, c1_len)" - }, "torch_npu.npu_stride_copy": { "signature": "(self, shape, stride, storage_offset, out=None)" }, -- Gitee From 033dd355b721d138d942ea12b3da45d318e59eaa Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 23 Sep 2024 13:43:44 +0000 Subject: [PATCH 12/96] !14849 Update op_plugin commit id Merge pull request !14849 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5952cf37f3..11287f9900 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5952cf37f3c76ec37105ffe9dd01003101a1437c +Subproject commit 11287f9900795dafab3d5fdce68cc6bc062f2e92 -- Gitee From 7c07e10a5ba22d026f81cf202e43765fa57e6634 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Mon, 23 Sep 2024 13:48:25 +0000 Subject: [PATCH 13/96] !14826 set default value disable for ALLOW_INTERNAL_FORMAT Merge pull request !14826 from huangyunlong/2.1rc3f --- torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 04176f3b58..679b2a262a 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -99,6 +99,17 @@ void GetAndSetDefaultJitCompileByAcl() ASCEND_LOGI("Get ACL JitCompile default value %s and set", value_str.c_str()); } +void SetDefaultAllowInternalFromatDisable() +{ + auto allow_internal_format = c10_npu::option::GetOption("ALLOW_INTERNAL_FORMAT"); + if (allow_internal_format.has_value() && allow_internal_format.value() != "") { + return; + } + + c10_npu::option::SetOption("ALLOW_INTERNAL_FORMAT", "disable"); + ASCEND_LOGI("Set ALLOW_INTERNAL_FORMAT default value disable."); +} + void SetHF32DefaultValue() { // The default value of the flag used to control whether HF32 is allowed on conv is True. 
// The default value of the flag used to control whether HF32 is allowed on matmul is True, @@ -238,6 +249,10 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) MakeCompileCacheDirAndSetOption(); // set default jit_Compile value from Get acl defalut value GetAndSetDefaultJitCompileByAcl(); + // set default allow_internal_format value + if (c10_npu::GetSocVersion() >= c10_npu::SocVersion::Ascend910_9391) { + SetDefaultAllowInternalFromatDisable(); + } SetHF32DefaultValue(); -- Gitee From 02bcb117b5e3688a0e40d891b1d59e5ec0ddd11f Mon Sep 17 00:00:00 2001 From: sunjiayang Date: Mon, 23 Sep 2024 14:28:56 +0000 Subject: [PATCH 14/96] !14762 mem uce bug fix Merge pull request !14762 from sunjiayang/mem_uce_210_rc3 --- .../csrc/core/npu/NPUCachingAllocator.cpp | 35 ++++++++++++------- torch_npu/csrc/core/npu/NPUException.cpp | 19 ++++------ torch_npu/csrc/core/npu/NPUException.h | 21 ++++++++--- torch_npu/csrc/npu/Module.cpp | 4 +-- 4 files changed, 47 insertions(+), 32 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 1486954eed..8be7af41c0 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -866,27 +866,36 @@ class DeviceCachingAllocator { bool checkUceInMemPool() { auto memUceInfo_ = c10_npu::get_mem_uce_info(); - auto info = memUceInfo_.info.data(); + auto info = memUceInfo_.info; const auto all_blocks = get_all_blocks(); for (int i = 0; i < memUceInfo_.retSize; ++i) { - size_t length = info[i].len; void* addr = info[i].addr; - for (int j = 0; j < length; ++j) { - bool found = false; - for (const Block* const head_block : all_blocks) { - if (head_block->ptr <= addr && addr < head_block->ptr + head_block->size) { - const_cast(head_block)->is_safe = false; + size_t length = info[i].len; + + // Calculate the start and end address for info[i] + void* addr_end = static_cast(addr) + length - 1; + + bool found = false; + + // Iterate through all blocks and check if there's an overlap with addr + for (const Block* const head_block : all_blocks) { + void* block_start = head_block->ptr; + void* block_end = static_cast(head_block->ptr) + head_block->size - 1; + + // If there is an overlap, mark the block as unsafe + if (addr <= block_end && addr_end >= block_start) { + const_cast(head_block)->is_safe = false; + found = true; + // Set the unsafe flag only once + if (c10_npu::get_npu_data_unsafe_flag() == false) { c10_npu::set_npu_data_unsafe_flag(true); - found = true; - break; } } + } - if (!found) { - return false; - } - addr += 1; + if (!found) { + return false; } } return true; diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index d0f9dbe48c..4cc680261b 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -97,10 +97,7 @@ MemUceInfo get_mem_uce_info() void clear_mem_uce_info() { std::lock_guard lock(memUceInfoMutex); - memUceInfo.device = 0; - memUceInfo.info.clear(); - memUceInfo.retSize = 0; - memUceInfo.mem_type = 0; + memUceInfo.clear(); } const char *c10_npu_get_error_message() @@ -116,17 +113,13 @@ bool checkUceErrAndRepair() TORCH_CHECK(false, "ERROR happend in GetDevice.", PTA_ERROR(ErrCode::ACL)) } - aclrtMemUceInfo info[MAX_MEM_UCE_INFO_ARRAY_SIZE]; - size_t retSize = 0; + MemUceInfo memUceInfo_; + memUceInfo_.device = device; - err = c10_npu::acl::AclrtGetMemUceInfo(device, info, sizeof(info) / sizeof(aclrtMemUceInfo), &retSize); + err = 
c10_npu::acl::AclrtGetMemUceInfo(device, memUceInfo_.info, sizeof(memUceInfo_.info) / sizeof(aclrtMemUceInfo), &memUceInfo_.retSize);
    if (err == ACL_ERROR_NONE) {
        if (memUceInfo_.retSize > 0) {
            ASCEND_LOGE("AclrtGetMemUceInfo get UCE ERROR, retSize is %d", memUceInfo_.retSize);
            set_mem_uce_info(memUceInfo_);
            return true;
        } else {
diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h
index bd2f49c5d6..eb4620a13c 100644
--- a/torch_npu/csrc/core/npu/NPUException.h
+++ b/torch_npu/csrc/core/npu/NPUException.h
@@ -233,10 +233,23 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args)
 namespace c10_npu {
 
 struct MemUceInfo {
-    int device = 0;
-    std::vector<aclrtMemUceInfo> info;
-    size_t retSize = 0;
-    int mem_type = 0;
+    int device;
+    aclrtMemUceInfo info[MAX_MEM_UCE_INFO_ARRAY_SIZE];
+    size_t retSize;
+    int mem_type;
+
+    MemUceInfo() : device(-1), retSize(0), mem_type(0)
+    {
+        std::memset(info, 0, sizeof(info));
+    }
+
+    void clear()
+    {
+        device = -1;
+        std::memset(info, 0, sizeof(info));
+        retSize = 0;
+        mem_type = 0;
+    }
 };
 
 C10_NPU_API const char *c10_npu_get_error_message();
diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp
index ef82be1221..d630aa10d8 100644
--- a/torch_npu/csrc/npu/Module.cpp
+++ b/torch_npu/csrc/npu/Module.cpp
@@ -375,8 +375,8 @@ PyObject* THNPModule_restart_device_wrap(PyObject* self, PyObject* arg)
     HANDLE_TH_ERRORS
     int device = THPUtils_unpackLong(arg);
     auto memUceInfo_ = c10_npu::get_mem_uce_info();
-    if (memUceInfo_.retSize > 0 && memUceInfo_.mem_type == 3) {
-        NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::acl::AclrtMemUceRepair(memUceInfo_.device, memUceInfo_.info.data(), memUceInfo_.retSize));
+    if (memUceInfo_.retSize > 0 && memUceInfo_.mem_type == 3) {
+        NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::acl::AclrtMemUceRepair(memUceInfo_.device, memUceInfo_.info, memUceInfo_.retSize));
     }
 
     c10_npu::clear_mem_uce_info();
-- Gitee

From 3881ab8ea0aaed6981dd0475c3a5fed44f5970cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=8F=B6=E5=AD=90=E5=87=A1?=
Date: Tue, 24 Sep 2024 03:44:29 +0000
Subject: [PATCH 15/96] =?UTF-8?q?!14741=20Add=20FA=20Flop=20Count=20Merge?=
 =?UTF-8?q?=20pull=20request=20!14741=20from=20=E5=8F=B6=E5=AD=90=E5=87=A1?=
 =?UTF-8?q?/v2.1.0=5FFLOPS?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 torch_npu/csrc/flopcount/FlopCounter.cpp | 226 +++++++++++++++++++++++
 torch_npu/csrc/flopcount/FlopCounter.h   |   7 +
 2 files changed, 233 insertions(+)

diff --git a/torch_npu/csrc/flopcount/FlopCounter.cpp b/torch_npu/csrc/flopcount/FlopCounter.cpp
index 4f8d53248e..3fa7feaa88 100644
--- a/torch_npu/csrc/flopcount/FlopCounter.cpp
+++ b/torch_npu/csrc/flopcount/FlopCounter.cpp
@@ -129,3 +129,229 @@ int64_t FlopCounter::conv_backward_flop(const at::Tensor &grad_output, const at:
 
     return flop_count;
 }
+
+std::vector<std::tuple<std::vector<int64_t>, std::vector<int64_t>, std::vector<int64_t>, std::vector<int64_t>>> _unpack_flash_attention_nested_shapes(std::vector<int64_t> query,
+    std::vector<int64_t> key, std::vector<int64_t> value, int64_t head_num, std::vector<int64_t> grad_out,
+    c10::ArrayRef<int64_t> cum_seq_q, c10::ArrayRef<int64_t> cum_seq_k, std::string input_layer_str)
+{
+    // Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
+    // GQA and MQA and TND
+
+    // for GQA and MQA, the dim 2 or 3 of kv should equal to q
+    // for general, shape should view to [B, N, S, D]
+    TORCH_CHECK(head_num != 0, "Divisor head_num may be 0, please check it.")
+    std::vector<std::tuple<std::vector<int64_t>, std::vector<int64_t>, std::vector<int64_t>, std::vector<int64_t>>> result;
+    int64_t q_1 = query[1];
+    int64_t q_2 = query[2];
+    int64_t k_1 = key[1];
+    int64_t k_2 = key[2];
+    int64_t v_1 = value[1];
+    int64_t v_2 = value[2];
+
+    // for GQA and MQA
+    if (input_layer_str == "SBH" || input_layer_str == "BSH" || input_layer_str == "BSND") {
+        if (q_2 != k_2 && q_2 != v_2) {
+            k_2 = q_2;
+            v_2 = q_2;
+        }
+    } else {
+        if (q_1 != k_1 && q_1 != v_1) {
+            k_1 = q_1;
+            v_1 = q_1;
+        }
+    }
+
+    if (input_layer_str == "BSH") {
+        std::vector<int64_t> new_query_shape = {query[0], head_num, q_1, q_2/head_num};
+        std::vector<int64_t> new_key_shape = {key[0], head_num, k_1, k_2/head_num};
+        std::vector<int64_t> new_value_shape = {value[0], head_num, v_1, v_2/head_num};
+        std::vector<int64_t> new_grad_out_shape;
+        if (!grad_out.empty()) {
+            new_grad_out_shape = new_query_shape;
+        }
+        result.emplace_back(new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape);
+    } else if (input_layer_str == "SBH") {
+        std::vector<int64_t> new_query_shape = {q_1, head_num, query[0], q_2/head_num};
+        std::vector<int64_t> new_key_shape = {k_1, head_num, key[0], k_2/head_num};
+        std::vector<int64_t> new_value_shape = {v_1, head_num, value[0], v_2/head_num};
+        std::vector<int64_t> new_grad_out_shape;
+        if (!grad_out.empty()) {
+            new_grad_out_shape = new_query_shape;
+        }
+        result.emplace_back(new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape);
+    } else if (input_layer_str == "BNSD") {
+        std::vector<int64_t> new_grad_out_shape;
+        if (!grad_out.empty()) {
+            new_grad_out_shape = query;
+        }
+        result.emplace_back(query, key, value, new_grad_out_shape);
+    } else if (input_layer_str == "BSND") {
+        std::vector<int64_t> new_query_shape = {query[0], q_2, q_1, query[3]};
+        std::vector<int64_t> new_key_shape = {key[0], k_2, k_1, key[3]};
+        std::vector<int64_t> new_value_shape = {value[0], v_2, v_1, value[3]};
+        std::vector<int64_t> new_grad_out_shape;
+        if (!grad_out.empty()) {
+            new_grad_out_shape = new_query_shape;
+        }
+        result.emplace_back(new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape);
+    } else if (input_layer_str == "TND") {
+        TORCH_CHECK(!cum_seq_q.empty(), "The actual_seq_qlen should not be empty when TND");
+        TORCH_CHECK(!cum_seq_k.empty(), "The actual_seq_kvlen should not be empty when TND");
+        TORCH_CHECK(cum_seq_q.size() == cum_seq_k.size(), "The size of actual_seq_qlen should be equal to actual_seq_kvlen when TND");
+
+        int64_t b = cum_seq_q.size();
+        TORCH_CHECK(b != 0, "Divisor b may be 0, please check it.")
+        std::vector<int64_t> new_query_shape = {b, q_1, query[0]/b, q_2};
+        std::vector<int64_t> new_key_shape = {b, k_1, key[0]/b, k_2};
+        std::vector<int64_t> new_value_shape = {b, v_1, value[0]/b, v_2};
+        std::vector<int64_t> new_grad_out_shape;
+        if (!grad_out.empty()) {
+            new_grad_out_shape = new_query_shape;
+        }
+        result.emplace_back(new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape);
+    }
+
+    return result;
+}
+
+int64_t sdpa_flop_count(const std::vector<int64_t> query_shape, const std::vector<int64_t> key_shape, const std::vector<int64_t> value_shape)
+{
+    int64_t b, h, s_q, d_q;
+    int64_t _b2, _h2, s_k, _d2;
+    int64_t _b3, _h3, _s3, d_v;
+
+    b = query_shape[0];
+    h = query_shape[1];
+    s_q = query_shape[2];
+    d_q = query_shape[3];
+
+    _b2 = key_shape[0];
+    _h2 = key_shape[1];
+    s_k = key_shape[2];
+    _d2 = key_shape[3];
+
+    _b3 = value_shape[0];
+    _h3 = value_shape[1];
+    _s3 = value_shape[2];
+    d_v = value_shape[3];
+
+    TORCH_CHECK(b == _b2 && b == _b3, "the dim of 0 is not equal between q and kv");
+    TORCH_CHECK(h == _h2 && h == _h3, "the dim of 1 is not equal between q and kv");
+    TORCH_CHECK(s_k == _s3, "the dim of 2 is not equal between k and v");
+    TORCH_CHECK(d_q == _d2, "the dim of 3 is not equal between q and k");
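+
+    // Added annotation: FlopCounter::bmm_flop counts a batched matmul [B, M, K] x [B, K, N]
+    // as 2*B*M*K*N FLOPs (one multiply and one add per multiply-accumulate), so the two
+    // calls below charge 2*b*h*s_q*d_q*s_k for scores = q @ k^T and 2*b*h*s_q*s_k*d_v
+    // for out = scores @ v.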
+
+    int64_t total_flops = 0;
+
+    // q: [b, h, s_q, d_q] @ k: [b, h, d_q, s_k] -> scores: [b, h, s_q, s_k]
+    const at::Tensor shape1 = at::empty({b * h, s_q, d_q}, at::kFloat);
+    const at::Tensor shape2 = at::empty({b * h, d_q, s_k}, at::kFloat);
+    total_flops += FlopCounter::bmm_flop(shape1, shape2);
+
+    // scores: [b, h, s_q, s_k] @ v: [b, h, s_k, d_v] -> out: [b, h, s_q, d_v]
+    const at::Tensor shape3 = at::empty({b * h, s_q, s_k}, at::kFloat);
+    const at::Tensor shape4 = at::empty({b * h, s_k, d_v}, at::kFloat);
+    total_flops += FlopCounter::bmm_flop(shape3, shape4);
+
+    return total_flops;
+}
+
+int64_t sdpa_backward_flop_count(const std::vector<int64_t> query_shape, const std::vector<int64_t> key_shape, const std::vector<int64_t> value_shape, const std::vector<int64_t> grad_out_shape)
+{
+    int64_t b, h, s_q, d_q;
+    int64_t _b2, _h2, s_k, _d2;
+    int64_t _b3, _h3, _s3, d_v;
+    int64_t _b4, _h4, _s4, d_4;
+
+    b = query_shape[0];
+    h = query_shape[1];
+    s_q = query_shape[2];
+    d_q = query_shape[3];
+
+    _b2 = key_shape[0];
+    _h2 = key_shape[1];
+    s_k = key_shape[2];
+    _d2 = key_shape[3];
+
+    _b3 = value_shape[0];
+    _h3 = value_shape[1];
+    _s3 = value_shape[2];
+    d_v = value_shape[3];
+
+    _b4 = grad_out_shape[0];
+    _h4 = grad_out_shape[1];
+    _s4 = grad_out_shape[2];
+    d_4 = grad_out_shape[3];
+
+    TORCH_CHECK(b == _b2 && b == _b3 && b == _b4, "the dim of 0 is not equal between qkv and grad");
+    TORCH_CHECK(h == _h2 && h == _h3 && h == _h4, "the dim of 1 is not equal between qkv and grad");
+    TORCH_CHECK(s_k == _s3, "the dim of 2 is not equal between k and v");
+    TORCH_CHECK(s_q == _s4, "the dim of 2 is not equal between q and grad");
+    TORCH_CHECK(d_q == _d2, "the dim of 3 is not equal between q and k");
+    TORCH_CHECK(d_v == d_4, "the dim of 3 is not equal between v and grad");
+
+    int64_t total_flops = 0;
+
+    // gradOut: [b, h, s_q, d_v] @ v: [b, h, d_v, s_k] -> gradScores: [b, h, s_q, s_k]
+    const at::Tensor shape1 = at::empty({b * h, s_q, d_v}, at::kFloat);
+    const at::Tensor shape2 = at::empty({b * h, d_v, s_k}, at::kFloat);
+    total_flops += FlopCounter::bmm_flop(shape1, shape2);
+
+    // scores: [b, h, s_k, s_q] @ gradOut: [b, h, s_q, d_v] -> gradV: [b, h, s_k, d_v]
+    const at::Tensor shape3 = at::empty({b * h, s_k, s_q}, at::kFloat);
+    const at::Tensor shape4 = at::empty({b * h, s_q, d_v}, at::kFloat);
+    total_flops += FlopCounter::bmm_flop(shape3, shape4);
+
+    // gradScores: [b, h, s_q, s_k] @ k: [b, h, s_k, d_q] -> gradQ: [b, h, s_q, d_q]
+    const at::Tensor shape5 = at::empty({b * h, s_q, s_k}, at::kFloat);
+    const at::Tensor shape6 = at::empty({b * h, s_k, d_q}, at::kFloat);
+    total_flops += FlopCounter::bmm_flop(shape5, shape6);
+
+    // q: [b, h, d_q, s_q] @ gradScores: [b, h, s_q, s_k] -> gradK: [b, h, d_q, s_k]
+    const at::Tensor shape7 = at::empty({b * h, d_q, s_q}, at::kFloat);
+    const at::Tensor shape8 = at::empty({b * h, s_q, s_k}, at::kFloat);
+    total_flops += FlopCounter::bmm_flop(shape7, shape8);
+
+    return total_flops;
+}
+
+int64_t FlopCounter::flash_attention_forward_flop(
+    const at::Tensor &query, const at::Tensor &key, const at::Tensor &value, int64_t head_num,
+    const std::string &input_layout, const c10::OptionalIntArrayRef &actual_seq_qlen,
+    const c10::OptionalIntArrayRef &actual_seq_kvlen)
+{
+    std::vector<int64_t> grad_out_shape;
+    std::vector<int64_t> query_shape(query.sizes().begin(), query.sizes().end());
+    std::vector<int64_t> key_shape(key.sizes().begin(), key.sizes().end());
+    std::vector<int64_t> value_shape(value.sizes().begin(), value.sizes().end());
+    auto ac_seq_qlen_tmp = actual_seq_qlen.value_or(c10::ArrayRef<int64_t>{});
+    auto ac_seq_kvlen_tmp = actual_seq_kvlen.value_or(c10::ArrayRef<int64_t>{});
+
+    auto sizes = _unpack_flash_attention_nested_shapes(query_shape, key_shape, value_shape, head_num, grad_out_shape, ac_seq_qlen_tmp, ac_seq_kvlen_tmp, input_layout);
+
+    int64_t total_flops = 0;
+    for (const auto& [query_shape_new, key_shape_new, value_shape_new, _] : sizes) {
+        total_flops += sdpa_flop_count(query_shape_new, key_shape_new, value_shape_new);
+    }
+    return total_flops;
+}
+
+int64_t FlopCounter::flash_attention_backward_flop(
+    const at::Tensor &query, const at::Tensor &key, const at::Tensor &value, const at::Tensor &dy, int64_t head_num,
+    const std::string &input_layout, const c10::OptionalIntArrayRef &actual_seq_qlen,
+    const c10::OptionalIntArrayRef &actual_seq_kvlen)
+{
+    std::vector<int64_t> dy_shape(query.sizes().begin(), query.sizes().end());
+    std::vector<int64_t> query_shape(query.sizes().begin(), query.sizes().end());
+    std::vector<int64_t> key_shape(key.sizes().begin(), key.sizes().end());
+    std::vector<int64_t> value_shape(value.sizes().begin(), value.sizes().end());
+    auto ac_seq_qlen_tmp = actual_seq_qlen.value_or(c10::ArrayRef<int64_t>{});
+    auto ac_seq_kvlen_tmp = actual_seq_kvlen.value_or(c10::ArrayRef<int64_t>{});
+
+    auto sizes = _unpack_flash_attention_nested_shapes(query_shape, key_shape, value_shape, head_num, dy_shape, ac_seq_qlen_tmp, ac_seq_kvlen_tmp, input_layout);
+
+    int64_t total_flops = 0;
+    for (const auto& [query_shape_new, key_shape_new, value_shape_new, grad_out_shape] : sizes) {
+        total_flops += sdpa_backward_flop_count(query_shape_new, key_shape_new, value_shape_new, grad_out_shape);
+    }
+    return total_flops;
+}
diff --git a/torch_npu/csrc/flopcount/FlopCounter.h b/torch_npu/csrc/flopcount/FlopCounter.h
index fdf829b5b7..43ee5fe04d 100644
--- a/torch_npu/csrc/flopcount/FlopCounter.h
+++ b/torch_npu/csrc/flopcount/FlopCounter.h
@@ -18,6 +18,13 @@ public:
     static int64_t conv_backward_flop(const at::Tensor &grad_output, const at::Tensor &input,
                                       const at::Tensor &weight, bool transposed, ::std::array output_mask,
                                       const at::Tensor &gradInput, const at::Tensor &gradeWeight);
+    static int64_t flash_attention_forward_flop(const at::Tensor &query, const at::Tensor &key, const at::Tensor &value,
+        int64_t head_num, const std::string &input_layout, const c10::OptionalIntArrayRef &actual_seq_qlen,
+        const c10::OptionalIntArrayRef &actual_seq_kvlen);
+    static int64_t flash_attention_backward_flop(const at::Tensor &query, const at::Tensor &key, const at::Tensor &value,
+        const at::Tensor &dy, int64_t head_num, const std::string &input_layout,
+        const c10::OptionalIntArrayRef &actual_seq_qlen,
+        const c10::OptionalIntArrayRef &actual_seq_kvlen);
 };
 
 #endif
-- Gitee

From 2c98088f9ca98e6059712ec9bfbf861c148307d2 Mon Sep 17 00:00:00 2001
From: pta-robot
Date: Tue, 24 Sep 2024 13:13:48 +0000
Subject: [PATCH 16/96] !14886 Update op_plugin commit id Merge pull request
 !14886 from pta-robot/v2.1.0-6.0.rc3
---
 third_party/op-plugin | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/op-plugin b/third_party/op-plugin
index 11287f9900..bba68c7744 160000
--- a/third_party/op-plugin
+++ b/third_party/op-plugin
@@ -1 +1 @@
-Subproject commit 11287f9900795dafab3d5fdce68cc6bc062f2e92
+Subproject commit bba68c77445f84c70c57aaf655ad0580ee3ee91b
-- Gitee

From 18de45d7e81ede0a9a643a3b233a9ff8330f70a8 Mon Sep
17 00:00:00 2001 From: pta-robot Date: Wed, 25 Sep 2024 02:43:48 +0000 Subject: [PATCH 17/96] !14908 Update op_plugin commit id Merge pull request !14908 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index bba68c7744..f463b37ad2 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit bba68c77445f84c70c57aaf655ad0580ee3ee91b +Subproject commit f463b37ad2d5926294f111d3b0bd689d70da1635 -- Gitee From 90ac55586e7582360510e5606bc884208f551082 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 25 Sep 2024 09:28:41 +0000 Subject: [PATCH 18/96] !14928 Update op_plugin commit id Merge pull request !14928 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index f463b37ad2..d0987b49a2 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit f463b37ad2d5926294f111d3b0bd689d70da1635 +Subproject commit d0987b49a28feb673196ac1375a14546195bf952 -- Gitee From f6c9fa2af83e477d4f195f5c5315fb83da1e2f95 Mon Sep 17 00:00:00 2001 From: Mrtutu Date: Wed, 25 Sep 2024 10:15:46 +0000 Subject: [PATCH 19/96] =?UTF-8?q?!14876=20=E3=80=90Bugfix=E3=80=91Fix=20pr?= =?UTF-8?q?ofiler=20task=5Fmanager=20sleep=20time=20on=20v2.1.0-6.0.rc3=20?= =?UTF-8?q?Merge=20pull=20request=20!14876=20from=20Mrtutu/task=5Fmgr=5Fv2?= =?UTF-8?q?.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/profiler/analysis/prof_common_func/_constant.py | 2 +- torch_npu/profiler/analysis/prof_common_func/_task_manager.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_common_func/_constant.py b/torch_npu/profiler/analysis/prof_common_func/_constant.py index a178c53070..edcfe328c0 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_constant.py +++ b/torch_npu/profiler/analysis/prof_common_func/_constant.py @@ -12,7 +12,7 @@ class Constant(object): INVALID_VALUE = -1 NULL_VALUE = 0 DEFAULT_PROCESS_NUMBER = os.cpu_count() // 2 - SLEEP_TIME = 0.5 + SLEEP_TIME = 0.1 # dir name FRAMEWORK_DIR = "FRAMEWORK" diff --git a/torch_npu/profiler/analysis/prof_common_func/_task_manager.py b/torch_npu/profiler/analysis/prof_common_func/_task_manager.py index cb800471b3..e652b996c4 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_task_manager.py +++ b/torch_npu/profiler/analysis/prof_common_func/_task_manager.py @@ -307,7 +307,7 @@ class ConcurrentTasksManager: need_exit = False break if need_exit: - time.sleep(Constant.SLEEP_TIME * 2) + time.sleep(Constant.SLEEP_TIME * 5) if all((task_info.task.is_non_blocking for task_info in self.listening_infos.values())): return True -- Gitee From c92557a83895d5aa570e8e85e18ab6f6dc7af90f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=85=E9=A3=9E=E8=A6=81?= <1332490378@qq.com> Date: Wed, 25 Sep 2024 10:16:48 +0000 Subject: [PATCH 20/96] =?UTF-8?q?!14859=20[PROF]=20fix=20mstx.range=5Fstar?= =?UTF-8?q?t=20err=20without=20input=20stream=20Merge=20pull=20request=20!?= =?UTF-8?q?14859=20from=20=E6=A2=85=E9=A3=9E=E8=A6=81/2.1=5Frc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/npu/mstx.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/torch_npu/npu/mstx.py b/torch_npu/npu/mstx.py index 
38dd465d38..2710d6aeec 100644
--- a/torch_npu/npu/mstx.py
+++ b/torch_npu/npu/mstx.py
@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import warnings

 import torch_npu._C

@@ -22,19 +23,22 @@ class mstx:
     @staticmethod
     def range_start(message: str, stream=None) -> int:
         if not message:
-            print(Warning, "Invalid message for mstx.range_start func. Please input valid message string.")
+            warnings.warn("Invalid message for mstx.range_start func. Please input valid message string.")
             return 0
-        if isinstance(stream, torch_npu.npu.streams.Stream):
-            stream = stream.npu_stream
+        if stream:
+            if isinstance(stream, torch_npu.npu.streams.Stream):
+                stream = stream.npu_stream
+                return torch_npu._C._mstx._range_start(message, stream)
+            else:
+                warnings.warn("Invalid stream for mstx.range_start func. Please input valid stream.")
+                return 0
         else:
-            print(Warning, 'Invalid type for stream argument, must be `torch_npu.npu.Stream`')
-            return 0
-        return torch_npu._C._mstx._range_start(message, stream)
+            return torch_npu._C._mstx._range_start_on_host(message)

     @staticmethod
     def range_end(range_id: int):
         if not isinstance(range_id, int):
-            print(Warning, "Invalid message for mstx.range_start func. Please input return value from mstx.range_start().")
+            warnings.warn("Invalid range_id for mstx.range_end func. Please input the return value from mstx.range_start.")
             return
         torch_npu._C._mstx._range_end(range_id)
-- Gitee
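With this change, range_start no longer rejects a missing stream: it records a host-side range instead. A minimal usage sketch (hedged: it assumes the mstx class is reachable as torch_npu.npu.mstx in a built wheel, which is how the module is normally exposed):

import torch_npu

stream = torch_npu.npu.current_stream()
rid = torch_npu.npu.mstx.range_start("forward_step", stream)  # range tied to the stream
# ... submit NPU work ...
torch_npu.npu.mstx.range_end(rid)

hid = torch_npu.npu.mstx.range_start("host_preprocessing")    # no stream: host-side range
torch_npu.npu.mstx.range_end(hid)

From 220c740152d4301be66e76fc054c37c891bcf037 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?=
Date: Wed, 25 Sep 2024 11:31:08 +0000
Subject: [PATCH 21/96] =?UTF-8?q?!14864=20Different=20device=20copying=20i?=
 =?UTF-8?q?s=20supported=20by=20delivering=20AclrtMemcpyAsync=20task=20Mer?=
 =?UTF-8?q?ge=20pull=20request=20!14864=20from=20=E9=97=AB=E9=B9=8F?=
 =?UTF-8?q?=E5=85=A8/v2.1.0-6.0.rc3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 torch_npu/csrc/aten/common/CopyKernel.cpp     | 56 ++++++++++---------
 .../csrc/aten/common/InnerNpuNativeFunction.h |  2 +
 .../csrc/aten/ops/op_api/CopyKernelOpApi.cpp  | 26 +++------
 3 files changed, 38 insertions(+), 46 deletions(-)

diff --git a/torch_npu/csrc/aten/common/CopyKernel.cpp b/torch_npu/csrc/aten/common/CopyKernel.cpp
index 803fb11e4e..1253c6b5f7 100644
--- a/torch_npu/csrc/aten/common/CopyKernel.cpp
+++ b/torch_npu/csrc/aten/common/CopyKernel.cpp
@@ -78,33 +78,6 @@ void copy_d2d_dtype_format(at::Tensor& self, const at::Tensor& src, bool non_blo
     copy_d2d_dtype_baseformat(self, src, non_blocking);
 }

-void copy_d2d(at::Tensor& self, const at::Tensor& src, bool non_blocking) {
-    c10_npu::NPUGuard guard(src.device());
-    // p2p enable and synchronize self stream
-    if (self.device().index() != src.device().index()) {
-        bool warning_flag = false;
-        bool p2p_enabled = NpuP2pCtrl::get_instance().get_p2p_access(src.device().index(), self.device().index(), warning_flag);
-        // In the same 'os', tensor can copy even if the enable fails
-        if (warning_flag) {
-            ASCEND_LOGW("p2p enable from %d to %d is fails", src.device().index(), self.device().index());
-        }
-        guard.set_device(self.device());
-        c10_npu::NPUStream dst_stream = c10_npu::getCurrentNPUStream(self.device().index());
-        NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeStreamWithTimeout(dst_stream));
-        guard.set_device(src.device());
-    }
-    if (self.dtype() != src.dtype()) {
-        custom_ops::npu_dtype_cast_(self, 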
src); // npu_dtype_cast_ will call copy function. - return; - } - copy_d2d_dtype(self, src, non_blocking); - // synchronize src stream for different devices copy - if (self.device().index() != src.device().index()) { - c10_npu::NPUStream copy_stream = c10_npu::getCurrentNPUStream(); - NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeStreamWithTimeout(copy_stream)); - } -} - // the format of dst and src is base format now // the dtype of dst and src is same // and src and dst are contiguous @@ -273,6 +246,35 @@ bool can_use_memcpy(at::Tensor& dst, const at::Tensor& src) { return false; } +void copy_d2d(at::Tensor& self, const at::Tensor& src, bool non_blocking) { + c10_npu::NPUGuard guard(src.device()); + // p2p enable and synchronize self stream + auto self_device_idx = self.device().index(); + auto src_device_idx = src.device().index(); + if (self_device_idx != src_device_idx) { + bool warning_flag = false; + NpuP2pCtrl::get_instance().get_p2p_access(src_device_idx, self_device_idx, warning_flag); + // In the same 'os', tensor can copy even if the enable fails + if (warning_flag) { + ASCEND_LOGW("p2p enable from %d to %d is fails", src_device_idx, self_device_idx); + } + guard.set_device(self.device()); + c10_npu::NPUStream dst_stream = c10_npu::getCurrentNPUStream(self_device_idx); + NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeStreamWithTimeout(dst_stream)); + guard.set_device(src.device()); + } + if (self.dtype() != src.dtype()) { + custom_ops::npu_dtype_cast_(self, src); // npu_dtype_cast_ will call copy function. + return; + } + copy_d2d_dtype(self, src, non_blocking); + // synchronize src stream for different devices copy + if (self_device_idx != src_device_idx) { + c10_npu::NPUStream copy_stream = c10_npu::getCurrentNPUStream(); + NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeStreamWithTimeout(copy_stream)); + } +} + at::Tensor copy_d2d_format_cast(at::Tensor& dst, const at::Tensor& src) { string srcFormat = FormatHelper::GetFormatName(src); diff --git a/torch_npu/csrc/aten/common/InnerNpuNativeFunction.h b/torch_npu/csrc/aten/common/InnerNpuNativeFunction.h index b3e8b21023..7a2173c755 100644 --- a/torch_npu/csrc/aten/common/InnerNpuNativeFunction.h +++ b/torch_npu/csrc/aten/common/InnerNpuNativeFunction.h @@ -7,6 +7,8 @@ namespace at_npu { namespace native { bool can_use_memcpy(at::Tensor& dst, const at::Tensor& src); +// Supports cross-chip copying of different devices +void copy_d2d(at::Tensor& self, const at::Tensor& src, bool non_blocking); void copy_d2d_by_memcpy(at::Tensor& dst, const at::Tensor& src, int64_t exceptSize = 0); void copy_d2d_dtype(at::Tensor& self, const at::Tensor& src, bool non_blocking); void copy_d2d_dtype_baseformat(at::Tensor& self, const at::Tensor& src, bool non_blocking); diff --git a/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp b/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp index 7baad2af45..5c71b8fa26 100644 --- a/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp +++ b/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp @@ -154,28 +154,16 @@ void copy_d2h_baseformat_opapi(at::Tensor& dst, const at::Tensor& src, bool non_ // the format of dst and src is baseformat now, copy d2d void copy_d2d_baseformat_opapi(at::Tensor& dst, const at::Tensor& src, bool non_blocking) { - c10_npu::NPUGuard guard(src.device()); if (dst.device().index() != src.device().index()) { - bool warning_flag = false; - bool p2p_enabled = NpuP2pCtrl::get_instance().get_p2p_access(src.device().index(), dst.device().index(), warning_flag); - // In the same 'os', tensor 
can copy even if the enable fails - if (warning_flag) { - ASCEND_LOGW("p2p enable from %d to %d is fails", src.device().index(), dst.device().index()); - } - guard.set_device(dst.device()); - c10_npu::NPUStream dst_stream = c10_npu::getCurrentNPUStream(dst.device().index()); - NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeStreamWithTimeout(dst_stream)); - guard.set_device(src.device()); - } else { - c10::SmallVector inputs = {src}; - c10::SmallVector outputs = {dst}; - CalcuOpUtil::CheckMemoryOverLaps(inputs, outputs); + return copy_d2d(dst, src, non_blocking); } + + c10_npu::NPUGuard guard(src.device()); + c10::SmallVector inputs = {src}; + c10::SmallVector outputs = {dst}; + CalcuOpUtil::CheckMemoryOverLaps(inputs, outputs); + EXEC_NPU_CMD(aclnnInplaceCopy, dst, src); - if (dst.device().index() != src.device().index()) { - c10_npu::NPUStream copy_stream = c10_npu::getCurrentNPUStream(); - NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeStreamWithTimeout(copy_stream)); - } } -- Gitee From 3d6e60f788ee54ca6cd8d0fa5c5616e8240346a1 Mon Sep 17 00:00:00 2001 From: zhangyuan Date: Wed, 25 Sep 2024 13:58:31 +0000 Subject: [PATCH 22/96] !14909 Update torchair commit id Merge pull request !14909 from zhangyuan/v2.1.0-6.0.rc3 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 485484ca71..5c269fba4c 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 485484ca7143cdf47415793ca76db9210cff8a4c +Subproject commit 5c269fba4c1ea53ef7e3812876756a36c1caf45c -- Gitee From 21ff9f2ed40d143c721b6bbd6c2c902d5e71046e Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 25 Sep 2024 13:58:44 +0000 Subject: [PATCH 23/96] !14945 Update op_plugin commit id Merge pull request !14945 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index d0987b49a2..891c8d10ec 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit d0987b49a28feb673196ac1375a14546195bf952 +Subproject commit 891c8d10ec3f5ea468ae9d94d6b7f0b4526a2190 -- Gitee From ea054a360bf590763c8fb00b19e923c3ced47f59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=9C=E9=87=91=E8=88=AA?= <15990042527@163.com> Date: Thu, 26 Sep 2024 02:38:15 +0000 Subject: [PATCH 24/96] =?UTF-8?q?!14934=20Fix=20lowercase=20issues=20Merge?= =?UTF-8?q?=20pull=20request=20!14934=20from=20=E6=9D=9C=E9=87=91=E8=88=AA?= =?UTF-8?q?/cherry-pick-1727254577?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 8be7af41c0..aae2ed8d01 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -729,7 +729,7 @@ size_t CachingAllocatorConfig::parseExpandableSegments( NPU_CHECK_ERROR(c10_npu::acl::AclrtReleaseMemAddress(ptr)); } else { NPU_CHECK_SUPPORTED_OR_ERROR(status, "aclrtReserveMemAddress"); - TORCH_NPU_WARN_ONCE("expandable_segments setting failure, now change to expandable_segments = false."); + TORCH_NPU_WARN_ONCE("expandable_segments setting failure, now change to `False`."); m_expandable_segments = false; } } @@ -773,12 +773,12 @@ void 
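The expandable_segments option amended above is one of several knobs parsed from the allocator configuration string, and it is mutually exclusive with max_split_size_mb and garbage_collection_threshold. A hedged usage sketch (assuming the config is supplied through the PYTORCH_NPU_ALLOC_CONF environment variable, the NPU counterpart of PYTORCH_CUDA_ALLOC_CONF):

import os

# Must be set before torch_npu initializes its caching allocator.
# Combining expandable_segments with max_split_size_mb or
# garbage_collection_threshold is rejected by parseArgs below.
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "expandable_segments:True"

import torch_npu  # noqa: E402  (import after setting the env var)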
CachingAllocatorConfig::parseArgs(const char* env) { if (set_expandable_segments_flag) { TORCH_CHECK(m_max_split_size == std::numeric_limits::max() && m_garbage_collection_threshold == 0, "`max_split_size_mb` or `garbage_collection_threshold`, cannot be enabled with " - "`expandable_segments`, please set `expandable_segments` to `false`.", + "`expandable_segments`, please set `expandable_segments` to `False`.", OPS_ERROR(ErrCode::PARAM)); } else if (m_max_split_size != std::numeric_limits::max() || m_garbage_collection_threshold != 0) { m_expandable_segments = false; TORCH_NPU_WARN_ONCE("`max_split_size_mb` or `garbage_collection_threshold` is enabled, and the " - "`expandable_segments` is changed to `false` by default."); + "`expandable_segments` is changed to `False` by default."); } } } -- Gitee From a96563c3ba252e7d5bbf462548a32eee9655d41b Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 26 Sep 2024 16:13:50 +0000 Subject: [PATCH 25/96] !14988 Update op_plugin commit id Merge pull request !14988 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 891c8d10ec..69ff4ba2e9 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 891c8d10ec3f5ea468ae9d94d6b7f0b4526a2190 +Subproject commit 69ff4ba2e9d797462846ba0004e9584ddd7f2fb5 -- Gitee From 100bc9e19a368aad7e72e871c86d12256414aa6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=96=9B=E9=B9=8F?= Date: Fri, 27 Sep 2024 09:21:46 +0000 Subject: [PATCH 26/96] =?UTF-8?q?!14991=20update=20torchair=20commit=20id?= =?UTF-8?q?=20Merge=20pull=20request=20!14991=20from=20=E8=96=9B=E9=B9=8F/?= =?UTF-8?q?v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 5c269fba4c..731150ebc5 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 5c269fba4c1ea53ef7e3812876756a36c1caf45c +Subproject commit 731150ebc561308c44fc7f699c7227551fc470b4 -- Gitee From 492910ad89071aea0fcca393cb327d98b0c52edd Mon Sep 17 00:00:00 2001 From: tangmengcheng Date: Fri, 27 Sep 2024 09:32:05 +0000 Subject: [PATCH 27/96] !15002 v2.1.0-6.0-rc3-bugfix Merge pull request !15002 from tangmengcheng/v2.1.0-6.0-rc3-buf-fix --- .../analysis/prof_common_func/_task_manager.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_common_func/_task_manager.py b/torch_npu/profiler/analysis/prof_common_func/_task_manager.py index e652b996c4..7de29238af 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_task_manager.py +++ b/torch_npu/profiler/analysis/prof_common_func/_task_manager.py @@ -6,6 +6,7 @@ import threading import multiprocessing import fcntl import pickle +import signal from enum import Enum from abc import ABC, abstractmethod from torch_npu.utils._error_code import ErrCode, prof_error @@ -137,6 +138,7 @@ class ConcurrentTasksManager: def run(self): try: + signal.signal(signal.SIGINT, self.finalize) if self.progress_bar: self.__start_print_progress_bar() @@ -149,13 +151,16 @@ class ConcurrentTasksManager: except Exception as e: print_error_msg(f"An error occurred: {e}") finally: - for task_info in self.task_infos.values(): - if task_info.status != TaskStatus.Succeed: - 
print_error_msg("Task %s has not run successfully." % task_info.task.name) - self.__stop_task(task_info) + self.finalize() - if self.progress_bar: - self.__stop_print_progress_bar() + def finalize(self): + for task_info in self.task_infos.values(): + if task_info.status != TaskStatus.Succeed: + print_error_msg("Task %s has not run successfully." % task_info.task.name) + self.__stop_task(task_info) + + if self.progress_bar: + self.__stop_print_progress_bar() def clear(self): for task_info in self.listening_infos.values(): -- Gitee From 752b29437638981d193dd459a53c4798b4ac3d35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Fri, 27 Sep 2024 09:53:56 +0000 Subject: [PATCH 28/96] =?UTF-8?q?!14924=20hcclAlltoAll=20put=20into=20task?= =?UTF-8?q?queue=20Merge=20pull=20request=20!14924=20from=20=E7=8E=8B?= =?UTF-8?q?=E8=B6=85/v2.1.0-6.0.rc3=5Ffix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../csrc/distributed/ProcessGroupHCCL.cpp | 47 +++++++++++++------ 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 61c018dda3..1ce574c3e1 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -2834,20 +2834,39 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall_base( HcclComm comm, c10_npu::NPUStream& stream, std::shared_ptr is_dispatched) { RECORD_FUNCTION("HcclAlltoAll", std::vector({input})); - torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAlltoAll", input_counts, getHcclDataType(input.scalar_type()), comm), - stream.stream(false)); - auto hccl_result = hcclAlltoAll( - input.data_ptr(), - input_counts, - getHcclDataType(input.scalar_type()), - output.data_ptr(), - output_counts, - getHcclDataType(output.scalar_type()), - comm, - stream.stream()); - *is_dispatched = true; - return hccl_result; + auto inputDataPtr = input.data_ptr(); + auto outputDataPtr = output.data_ptr(); + auto inputhcclDataType = getHcclDataType(input.scalar_type()); + auto outputhcclDataType = getHcclDataType(output.scalar_type()); + auto hccl_call = [inputDataPtr, + input_counts, + inputhcclDataType, + outputDataPtr, + output_counts, + outputhcclDataType, + comm, + stream, + is_dispatched]() -> int { + torch_npu::profiler::MstxRange range( + getMstxHcclMsg("HcclAlltoAll", input_counts, inputhcclDataType, comm), + stream.stream(false)); + auto hccl_result = hcclAlltoAll( + inputDataPtr, + input_counts, + inputhcclDataType, + outputDataPtr, + output_counts, + outputhcclDataType, + comm, + stream.stream(false)); + *is_dispatched = true; + return hccl_result; + }; + at_npu::native::OpCommand cmd; + cmd.Name("HcclAlltoAll"); + cmd.SetCustomHandler(hccl_call); + cmd.Run(); + return HCCL_SUCCESS; }, [&](std::vector&, c10::intrusive_ptr&) {}, [&](std::vector& hcclStreams, c10::intrusive_ptr& work) { -- Gitee From fc645f197592932b244e44bc39ab159ad04c8c8e Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 27 Sep 2024 10:58:47 +0000 Subject: [PATCH 29/96] !15029 Update op_plugin commit id Merge pull request !15029 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 69ff4ba2e9..1e6850a170 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 69ff4ba2e9d797462846ba0004e9584ddd7f2fb5 +Subproject commit 
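The rewrite above defers HcclAlltoAll through OpCommand::SetCustomHandler, so the collective is issued from the task queue rather than the calling thread. From Python this path is exercised by the standard collective API; a hedged sketch (assuming an initialized "hccl" process group with one NPU per rank):

import torch
import torch.distributed as dist
import torch_npu  # registers the NPU/HCCL backend

# dist.init_process_group("hccl", ...) is assumed to have run already.
world_size = dist.get_world_size()
x = torch.arange(4 * world_size, dtype=torch.float32).npu()
out = torch.empty_like(x)
dist.all_to_all_single(out, x)  # lowers to alltoall_base / HcclAlltoAll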
1e6850a1703e3d5cd89b0e5903d633de2ba388ff -- Gitee From 1678984808321762371f1a969c2b8cb25eea0f36 Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Fri, 27 Sep 2024 12:36:24 +0000 Subject: [PATCH 30/96] !14938 egg_info deprecation Merge pull request !14938 from dilililiwhy/cherry-pick-1727265326 --- setup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.py b/setup.py index cfed83ba12..6e44f86033 100644 --- a/setup.py +++ b/setup.py @@ -465,7 +465,6 @@ class BdistWheelBuild(bdist_wheel): dependencies = torch_dependencies + cann_dependencies + other_dependencies - self.run_command('egg_info') bdist_wheel.run(self) if is_manylinux: @@ -609,7 +608,6 @@ setup( 'build_ext': Build, 'build_py': PythonPackageBuild, 'bdist_wheel': BdistWheelBuild, - 'egg_info': EggInfoBuild, 'install': InstallCmd, 'clean': Clean }, -- Gitee From 9057ef3831bde8127721c6affe71c5b4d9134e84 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 27 Sep 2024 13:58:54 +0000 Subject: [PATCH 31/96] !15036 Update op_plugin commit id Merge pull request !15036 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 1e6850a170..e946a18555 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 1e6850a1703e3d5cd89b0e5903d633de2ba388ff +Subproject commit e946a185558e0dd0a0bc5d444f1703d4ede168dd -- Gitee From 228714b813a5a4b53e58c6b7e591b56b4541969b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8F=B6=E5=AD=90=E5=87=A1?= Date: Fri, 27 Sep 2024 18:45:22 +0000 Subject: [PATCH 32/96] =?UTF-8?q?!14954=20Fix=20codecheck:=20the=20type=20?= =?UTF-8?q?of=20variable=20is=20signed,=20while=20the=20type=20of=20value?= =?UTF-8?q?=20is=20unsigned.=20Merge=20pull=20request=20!14954=20from=20?= =?UTF-8?q?=E5=8F=B6=E5=AD=90=E5=87=A1/v2.1.0=5FFLOPS=5FRC3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/flopcount/FlopCounter.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/flopcount/FlopCounter.cpp b/torch_npu/csrc/flopcount/FlopCounter.cpp index 3fa7feaa88..0789441efd 100644 --- a/torch_npu/csrc/flopcount/FlopCounter.cpp +++ b/torch_npu/csrc/flopcount/FlopCounter.cpp @@ -199,7 +199,9 @@ std::vector, std::vector, std::vector(std::numeric_limits::max()), "cum_seq_q.size() is too large to be represented as an int64_t", OPS_ERROR(ErrCode::PARAM)); + int64_t b = static_cast(sizeValue); TORCH_CHECK(b != 0, "Divisor b may be 0, please check it.") std::vector new_query_shape = {b, q_1, query[0]/b, q_2}; std::vector new_key_shape = {b, k_1, key[0]/b, k_2}; -- Gitee From 7272f480f4b5bda68965944e5c2eb85969d7aa6c Mon Sep 17 00:00:00 2001 From: sunjiayang Date: Sat, 28 Sep 2024 06:15:10 +0000 Subject: [PATCH 33/96] !15013 stress detect in thread Merge pull request !15013 from sunjiayang/stess_926_210_rc3 --- torch_npu/csrc/InitNpuBindings.cpp | 3 + torch_npu/csrc/npu/Module.cpp | 40 +-------- torch_npu/csrc/npu/Stress_detect.cpp | 130 +++++++++++++++++++++++++++ torch_npu/csrc/npu/Stress_detect.h | 56 ++++++++++++ 4 files changed, 191 insertions(+), 38 deletions(-) create mode 100644 torch_npu/csrc/npu/Stress_detect.cpp create mode 100644 torch_npu/csrc/npu/Stress_detect.h diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp index 7401446956..938b628f15 100644 --- a/torch_npu/csrc/InitNpuBindings.cpp +++ b/torch_npu/csrc/InitNpuBindings.cpp @@ -14,6 +14,7 @@ #include 
"torch_npu/csrc/profiler/init.h" #include "torch_npu/csrc/flopcount/Init.h" #include "torch_npu/csrc/npu/Module.h" +#include "torch_npu/csrc/npu/Stress_detect.h" #include "torch_npu/csrc/utils/TensorType.h" #include "torch_npu/csrc/utils/AutocastMode.h" #include "torch_npu/csrc/profiler/python/combined_traceback.h" @@ -94,6 +95,8 @@ PyObject* THPModule_npu_shutdown_synchronize(PyObject* /* unused */) Py_RETURN_FALSE; } + StressDetector::stop_worker_thread(); + // Return aclrtSynchronizeDevice result. If sync device fails, release host // resources forcibly, only record WARN logs when acl interface of stream // or event fails. diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index d630aa10d8..99e3b1df7e 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -35,6 +35,7 @@ #include "torch_npu/csrc/npu/NPUPluggableAllocator.h" #include "torch_npu/csrc/npu/Stream.h" #include "torch_npu/csrc/npu/memory_snapshot.h" +#include "torch_npu/csrc/npu/Stress_detect.h" #include "torch_npu/csrc/aten/python_functions.h" #include "torch_npu/csrc/utils/LazyInit.h" #include "third_party/acl/inc/acl/acl.h" @@ -397,9 +398,6 @@ PyObject* THNPModule_getDevice_wrap(PyObject* self, PyObject* noargs) END_HANDLE_TH_ERRORS } -std::unordered_map> last_call_times; -const int interval_time = 3600; - PyObject* THNPModule_stressDetect_wrap(PyObject* self, PyObject* noargs) { HANDLE_TH_ERRORS @@ -408,41 +406,7 @@ PyObject* THNPModule_stressDetect_wrap(PyObject* self, PyObject* noargs) int device_id; NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device_id)); - auto current_time = std::chrono::steady_clock::now(); - - if (last_call_times.find(device_id) != last_call_times.end() && - std::chrono::duration_cast(current_time - last_call_times[device_id]).count() < interval_time) - { - // StressDetect can only be called once every hour for the given device_id, Return 1. - ASCEND_LOGW("StressDetect can only be called once every hour for the given device_id:{%d}, Return 1.", device_id); - return PyLong_FromLong(1); - } - last_call_times[device_id] = current_time; - - void* workspaceAddr = nullptr; - uint64_t size = 2; - size_t workspaceSize = size << 10 << 10 << 10; - if (workspaceSize > 0) { - auto ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); - if (ret != ACL_ERROR_NONE) { - c10_npu::NPUCachingAllocator::emptyCache(); - ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); - if (ret != ACL_ERROR_NONE) { - ASCEND_LOGW("call AclrtMallocAlign32 failed, ERROR : %d. 
Skip StressDetect.", ret);
-                return PyLong_FromLong(ACL_ERROR_NONE);
-            }
-        }
-    }
-
-    std::future<int> result = std::async(std::launch::async, c10_npu::acl::AclStressDetect, device_id, workspaceAddr, workspaceSize);
-    int ret = result.get();
-
-    aclrtFree(workspaceAddr);
-    if (ret == ACLNN_CLEAR_DEVICE_STATE_FAIL) {
-        ASCEND_LOGE("call AclStressDetect failed, ERROR : %d, voltage recovery fail.", ret);
-        NPU_CHECK_ERROR(ACLNN_CLEAR_DEVICE_STATE_FAIL, "StressDetect");
-    }
-
+    int ret = StressDetector::perform_stress_detect(device_id);
     return PyLong_FromLong(ret);
     END_HANDLE_TH_ERRORS
 }
diff --git a/torch_npu/csrc/npu/Stress_detect.cpp b/torch_npu/csrc/npu/Stress_detect.cpp
new file mode 100644
index 0000000000..bf35d583a6
--- /dev/null
+++ b/torch_npu/csrc/npu/Stress_detect.cpp
@@ -0,0 +1,130 @@
+#include "Stress_detect.h"
+#include "torch_npu/csrc/core/npu/NPUException.h"
+#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h"
+
+std::unordered_map<int, std::chrono::time_point<std::chrono::steady_clock>> StressDetector::last_call_times;
+std::atomic<bool> StressDetector::task_in_progress(false);
+std::atomic<bool> StressDetector::stop_thread(false);
+std::atomic<bool> StressDetector::new_task_submitted(false);
+std::atomic<bool> StressDetector::thread_initialized(false);
+std::promise<int> StressDetector::promise;
+std::future<int> StressDetector::current_task_future;
+std::thread StressDetector::stress_detect_thread;
+std::condition_variable StressDetector::cv;
+std::mutex StressDetector::mtx;
+
+int StressDetector::device_id;
+void* StressDetector::workspaceAddr = nullptr;
+size_t StressDetector::workspaceSize = 0;
+const int StressDetector::interval_time = 3600;
+
+// Persistent worker thread implementation
+void StressDetector::worker_thread()
+{
+    if (prctl(PR_SET_NAME, ("StressDetect_thread")) != 0) {
+        ASCEND_LOGE("set thread name failed!");
+    }
+
+    while (!stop_thread.load()) {
+        std::unique_lock<std::mutex> lock(mtx);
+
+        // Wait for new task submission or thread stop signal
+        cv.wait(lock, [] { return new_task_submitted.load() || stop_thread.load(); });
+
+        if (stop_thread.load()) {
+            return; // Exit thread
+        }
+
+        // Execute the task
+        int ret = c10_npu::acl::AclStressDetect(device_id, workspaceAddr, workspaceSize);
+
+        // Task complete, free memory
+        aclrtFree(workspaceAddr);
+
+        // Set task result and reset flags
+        task_in_progress.store(false);
+        promise.set_value(ret); // Pass the task execution result
+
+        // Reset task submission flag
+        new_task_submitted.store(false);
+    }
+}
+
+// Synchronous stress detection task execution
+int StressDetector::perform_stress_detect(int deviceid)
+{
+    auto current_time = std::chrono::steady_clock::now();
+    // Check the calling interval
+    if (last_call_times.find(deviceid) != last_call_times.end() &&
+        std::chrono::duration_cast<std::chrono::seconds>(current_time - last_call_times[deviceid]).count() < interval_time) {
+        ASCEND_LOGW("StressDetect can only be called once every hour for the given deviceid:{%d}, Return 1.", deviceid);
+        return 1;
+    }
+    last_call_times[deviceid] = current_time;
+
+    // If it's the first call, start the persistent thread
+    if (!thread_initialized.load()) {
+        std::lock_guard<std::mutex> lock(mtx); // Ensure thread safety
+        if (!thread_initialized.load()) { // Double check
+            stress_detect_thread = std::thread(worker_thread);
+            thread_initialized.store(true); // Mark thread as started
+        }
+    }
+
+    // Set task parameters
+    task_in_progress.store(true);
+
+    // Allocate workspace memory
+    workspaceAddr = nullptr;
+    uint64_t size = 2;
+    workspaceSize = size << 10 << 10 << 10; // Assume memory size
+    if (workspaceSize > 0) {
+        auto ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
+        if (ret != ACL_ERROR_NONE) {
+            c10_npu::NPUCachingAllocator::emptyCache();
+            ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
+            if (ret != ACL_ERROR_NONE) {
+                ASCEND_LOGW("call AclrtMallocAlign32 failed, ERROR : %d. Skip StressDetect.", ret);
+                task_in_progress.store(false); // Task ends
+                return ACL_ERROR_NONE;
+            }
+        }
+    }
+
+    {
+        std::lock_guard<std::mutex> lock(mtx);
+
+        // Prepare promise and future
+        promise = std::promise<int>();
+        current_task_future = promise.get_future();
+
+        // Update task-related information
+        StressDetector::device_id = deviceid;
+        StressDetector::workspaceAddr = workspaceAddr;
+        StressDetector::workspaceSize = workspaceSize;
+
+        // Mark new task submitted
+        new_task_submitted.store(true);
+    }
+
+    // Notify the persistent thread to start the task
+    cv.notify_one();
+
+    // Synchronously wait for the task to complete and get the result
+    int ret = current_task_future.get();
+
+    return ret;
+}
+
+// Stop the thread
+void StressDetector::stop_worker_thread()
+{
+    {
+        std::lock_guard<std::mutex> lock(mtx);
+        stop_thread.store(true);
+    }
+    cv.notify_one(); // Notify the thread to exit
+    if (stress_detect_thread.joinable()) {
+        stress_detect_thread.join(); // Wait for the thread to exit
+    }
+}
\ No newline at end of file
diff --git a/torch_npu/csrc/npu/Stress_detect.h b/torch_npu/csrc/npu/Stress_detect.h
new file mode 100644
index 0000000000..7edc71683d
--- /dev/null
+++ b/torch_npu/csrc/npu/Stress_detect.h
@@ -0,0 +1,56 @@
+#ifndef STRESS_DETECT_H
+#define STRESS_DETECT_H
+
+#include <thread>
+#include <atomic>
+#include <condition_variable>
+#include <future>
+#include <unordered_map>
+#include <sys/prctl.h>
+#include "torch_npu/csrc/core/npu/NPUMacros.h"
+
+class StressDetector {
+public:
+    TORCH_NPU_API static int perform_stress_detect(int deviceid);
+    TORCH_NPU_API static void stop_worker_thread();
+
+private:
+    static void worker_thread();
+
+    // Records the last call time for each device
+    static std::unordered_map<int, std::chrono::time_point<std::chrono::steady_clock>> last_call_times;
+
+    // Thread for handling the stress detection task
+    static std::thread stress_detect_thread;
+
+    // Condition variable and mutex to control the thread
+    static std::condition_variable cv;
+    static std::mutex mtx;
+
+    // Flag to indicate if a task is in progress
+    static std::atomic<bool> task_in_progress;
+
+    // Flag to signal the thread to stop
+    static std::atomic<bool> stop_thread;
+
+    // Flag to indicate if a new task has been submitted
+    static std::atomic<bool> new_task_submitted;
+
+    // Promise and future for the task, used for synchronizing task results
+    static std::promise<int> promise;
+    static std::future<int> current_task_future;
+
+    // Stores parameters related to the task
+    static int device_id;
+    static void* workspaceAddr;
+    static size_t workspaceSize;
+
+    // Interval between tasks
+    static const int interval_time;
+
+    // Flag to indicate if the thread has been initialized
+    static std::atomic<bool> thread_initialized;
+};
+
+#endif // STRESS_DETECT_H
\ No newline at end of file
-- Gitee
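The StressDetector above replaces the old per-call std::async with one long-lived worker guarded by a condition variable, so repeated detections reuse a single thread. The same submit-and-wait contract, sketched in Python purely for illustration (the real implementation is the C++ above; names here are not torch_npu APIs):

import threading
from concurrent.futures import Future

class PersistentWorker:
    def __init__(self):
        self._cv = threading.Condition()
        self._task = None  # (fn, args, Future)
        self._stop = False
        self._thread = threading.Thread(target=self._loop, daemon=True)
        self._thread.start()

    def _loop(self):
        while True:
            with self._cv:
                self._cv.wait_for(lambda: self._task is not None or self._stop)
                if self._stop:
                    return
                fn, args, fut = self._task
                self._task = None
            fut.set_result(fn(*args))  # run outside the lock

    def submit(self, fn, *args):
        fut = Future()
        with self._cv:
            self._task = (fn, args, fut)
            self._cv.notify()
        return fut  # fut.result() blocks, like current_task_future.get()

    def stop(self):
        with self._cv:
            self._stop = True
            self._cv.notify()
        self._thread.join()

From c919ad1f44f62a946f5203e6837df22d4d4594ca Mon Sep 17 00:00:00 2001
From: pta-robot
Date: Sat, 28 Sep 2024 10:43:54 +0000
Subject: [PATCH 34/96] !15063 Update op_plugin commit id

Merge pull request !15063 from pta-robot/v2.1.0-6.0.rc3
---
 third_party/op-plugin | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/op-plugin b/third_party/op-plugin
index e946a18555..a5ab151a48 160000
--- a/third_party/op-plugin
+++ b/third_party/op-plugin
@@ -1 +1 @@
-Subproject commit 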
e946a185558e0dd0a0bc5d444f1703d4ede168dd +Subproject commit a5ab151a489e571176afe16be976bcc179dd7a87 -- Gitee From 3cb9e12531fce8852873cdb5cf477d6f7edd8a4f Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 28 Sep 2024 10:43:54 +0000 Subject: [PATCH 35/96] !15063 Update op_plugin commit id Merge pull request !15063 from pta-robot/v2.1.0-6.0.rc3 -- Gitee From 0ea525168b52ad100eca9031ffdd8e7e8ebc89ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Sat, 28 Sep 2024 13:44:58 +0000 Subject: [PATCH 36/96] =?UTF-8?q?!15015=20remove=20redundant=20check=20in?= =?UTF-8?q?=20uce=20error=20check=20Merge=20pull=20request=20!15015=20from?= =?UTF-8?q?=20=E9=97=AB=E9=B9=8F=E5=85=A8/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUException.cpp | 12 ------------ torch_npu/csrc/core/npu/NPUException.h | 8 +------- torch_npu/csrc/core/npu/NPUQueue.cpp | 4 ---- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 2 -- torch_npu/csrc/npu/Module.cpp | 1 - 5 files changed, 1 insertion(+), 26 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index 4cc680261b..20d95c68e7 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -76,8 +76,6 @@ static std::string getCurrentTimestamp() namespace c10_npu { -bool has_throw_error = false; - MemUceInfo memUceInfo; std::mutex memUceInfoMutex; @@ -135,14 +133,4 @@ bool checkUceErrAndRepair() return false; } -bool get_has_throw_error() -{ - return has_throw_error; -} - -void set_has_throw_error(bool flag) -{ - has_throw_error = flag; -} - } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h index eb4620a13c..0f03ee1865 100644 --- a/torch_npu/csrc/core/npu/NPUException.h +++ b/torch_npu/csrc/core/npu/NPUException.h @@ -110,7 +110,6 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) Error_stop = stop_error; \ } \ if ((Error_stop) == ACL_ERROR_RT_DEVICE_TASK_ABORT) { \ - c10_npu::set_has_throw_error(true); \ TORCH_CHECK( \ false, \ __func__, \ @@ -129,9 +128,7 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) if ((uce_error) != ACL_ERROR_NONE) { \ Error_uce = uce_error; \ } \ - if ((Error_uce) == ACL_ERROR_RT_DEVICE_MEM_ERROR && \ - c10_npu::get_has_throw_error() == false && c10_npu::checkUceErrAndRepair()) { \ - c10_npu::set_has_throw_error(true); \ + if ((Error_uce) == ACL_ERROR_RT_DEVICE_MEM_ERROR && c10_npu::checkUceErrAndRepair()) { \ TORCH_CHECK( \ false, \ __func__, \ @@ -262,7 +259,4 @@ MemUceInfo get_mem_uce_info(); void clear_mem_uce_info(); -bool get_has_throw_error(); - -void set_has_throw_error(bool flag); } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index b5b762942c..8c950cde17 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -248,7 +248,6 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) if (GetStatus() == RepoStatus::STOP_EXIT) { ClearQueue(); - set_has_throw_error(true); if (check_error) { throw std::runtime_error("FORCE STOP." 
+ PTA_ERROR(ErrCode::ACL)); } else { @@ -266,7 +265,6 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) #endif read_idx.idx = write_idx.idx; if (call_ret == ACL_ERROR_RT_DEVICE_MEM_ERROR && checkUceErrAndRepair()) { - set_has_throw_error(true); call_ret = 0; if (check_error) { throw std::runtime_error("UCE ERROR." + PTA_ERROR(ErrCode::ACL)); @@ -378,7 +376,6 @@ void Repository::Enqueue(void* cur_paras) { return; } ClearQueue(); - set_has_throw_error(true); throw std::runtime_error("FORCE STOP." + PTA_ERROR(ErrCode::ACL)); } @@ -388,7 +385,6 @@ void Repository::Enqueue(void* cur_paras) { read_idx.idx = write_idx.idx; if (call_ret == ACL_ERROR_RT_DEVICE_MEM_ERROR && checkUceErrAndRepair()) { - set_has_throw_error(true); call_ret = 0; throw std::runtime_error("UCE ERROR" + PTA_ERROR(ErrCode::ACL)); } diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 1ce574c3e1..535179edb6 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -1487,14 +1487,12 @@ void ProcessGroupHCCL::workEnqueue(c10::intrusive_ptr Date: Sun, 29 Sep 2024 01:38:41 +0000 Subject: [PATCH 37/96] !14871 fix check uce in mem bug Merge pull request !14871 from sunjiayang/unsafe_210_rc3 --- .../csrc/core/npu/NPUCachingAllocator.cpp | 20 +++++++++++++++---- torch_npu/csrc/npu/Module.cpp | 2 +- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index aae2ed8d01..e4b764ae36 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -868,16 +868,18 @@ class DeviceCachingAllocator { auto memUceInfo_ = c10_npu::get_mem_uce_info(); auto info = memUceInfo_.info; const auto all_blocks = get_all_blocks(); + bool any_found = false; + aclrtMemUceInfo temp_info[memUceInfo_.retSize]; + size_t temp_retsize = 0; for (int i = 0; i < memUceInfo_.retSize; ++i) { void* addr = info[i].addr; size_t length = info[i].len; + bool found = false; // Calculate the start and end address for info[i] void* addr_end = static_cast(addr) + length - 1; - bool found = false; - // Iterate through all blocks and check if there's an overlap with addr for (const Block* const head_block : all_blocks) { void* block_start = head_block->ptr; @@ -887,6 +889,7 @@ class DeviceCachingAllocator { if (addr <= block_end && addr_end >= block_start) { const_cast(head_block)->is_safe = false; found = true; + any_found = true; // Set the unsafe flag only once if (c10_npu::get_npu_data_unsafe_flag() == false) { c10_npu::set_npu_data_unsafe_flag(true); @@ -894,10 +897,19 @@ class DeviceCachingAllocator { } } - if (!found) { - return false; + if (found) { + // update memuceinfo + temp_info[temp_retsize++] = info[i]; } } + + std::memcpy(memUceInfo_.info, temp_info, temp_retsize * sizeof(aclrtMemUceInfo)); + memUceInfo_.retSize = temp_retsize; + + c10_npu::set_mem_uce_info(memUceInfo_); + if (!any_found) { + return false; + } return true; } diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 6fbb01d166..7a9c655c12 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -376,7 +376,7 @@ PyObject* THNPModule_restart_device_wrap(PyObject* self, PyObject* arg) HANDLE_TH_ERRORS int device = THPUtils_unpackLong(arg); auto memUceInfo_ = c10_npu::get_mem_uce_info(); - if (memUceInfo_.retSize > 0 && memUceInfo_.mem_type == 3) { + if 
(memUceInfo_.retSize > 0) { NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::acl::AclrtMemUceRepair(memUceInfo_.device, memUceInfo_.info, memUceInfo_.retSize)); } -- Gitee From 2d6fb9619a27a785eac0e6d5187ee7f6d7d7837c Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sun, 29 Sep 2024 02:43:52 +0000 Subject: [PATCH 38/96] !15088 Update op_plugin commit id Merge pull request !15088 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index a5ab151a48..dd7b55c58a 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit a5ab151a489e571176afe16be976bcc179dd7a87 +Subproject commit dd7b55c58a5de27370a04e9025be853bb803ed78 -- Gitee From 5fffefb0a41f720ce1ee688904e58f7f11043af6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Sun, 29 Sep 2024 09:49:34 +0000 Subject: [PATCH 39/96] =?UTF-8?q?!15082=20uce=20bug=20fix:=20In=20order=20?= =?UTF-8?q?to=20prevent=20the=20dequeue=20thread=20from=20terminating,=20R?= =?UTF-8?q?eadQueue=20should=20set=20uce=20status.=20Merge=20pull=20reques?= =?UTF-8?q?t=20!15082=20from=20=E7=8E=8B=E8=B6=85/v2.1.0-6.0.rc3=5Fuce2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUQueue.cpp | 30 ++++++++++++++-------------- torch_npu/csrc/core/npu/NPUQueue.h | 4 ++-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 8c950cde17..edbd5a8655 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -246,6 +246,14 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) } } + if (GetStatus() == RepoStatus::UCE_EXIT) { + if (check_error) { + throw std::runtime_error("UCE ERROR." + PTA_ERROR(ErrCode::ACL)); + } else { + ASCEND_LOGE("UCE ERROR happend."); + } + } + if (GetStatus() == RepoStatus::STOP_EXIT) { ClearQueue(); if (check_error) { @@ -264,14 +272,6 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) } #endif read_idx.idx = write_idx.idx; - if (call_ret == ACL_ERROR_RT_DEVICE_MEM_ERROR && checkUceErrAndRepair()) { - call_ret = 0; - if (check_error) { - throw std::runtime_error("UCE ERROR." + PTA_ERROR(ErrCode::ACL)); - } else { - ASCEND_LOGE("UCE ERROR happend."); - } - } if (check_error) { throw std::runtime_error("The Inner error is reported as above. 
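Patches 36 through 39 change how a UCE (uncorrectable memory error) surfaces: the dequeue thread parks the repository in UCE_EXIT and the fault reaches Python as a RuntimeError containing "UCE ERROR". A hedged sketch of the recovery loop this enables (restart_device mirrors the THNPModule_restart_device_wrap binding above; the exact Python-side name is an assumption):

import torch_npu

def step_with_uce_retry(step, device_id):
    try:
        return step()
    except RuntimeError as err:
        if "UCE ERROR" not in str(err):
            raise
        # Repair the flagged memory regions and bring the device back
        # before retrying from the last known-good state (e.g., a checkpoint).
        torch_npu.npu.restart_device(device_id)
        return step()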
" @@ -338,14 +338,15 @@ bool Repository::ReadQueue() #endif if (ret != 0) { repo_error = get_func_error_msg(manager().getCurrentParams(datas, read_idx.idx)); - call_ret = ret; ASCEND_LOGE("---Thread---%llu: device = %d, write_idx = %u, read_idx = %u, status = %d, ret = %d", std::this_thread::get_id(), device_idx, write_idx.idx, read_idx.idx, GetStatus(), ret); while (!IsEmptyQueue()) { // ignore other tasks manager().Release(datas, read_idx.idx, releaseQueue); read_idx.idx = (read_idx.idx + 1) & (kQueueCapacity - 1); } - if (GetStatus() != STOP_EXIT) { + if (ret == ACL_ERROR_RT_DEVICE_MEM_ERROR && checkUceErrAndRepair()) { + SetStatus(UCE_EXIT); + } else if (GetStatus() != STOP_EXIT) { SetStatus(ERROR_EXIT); } read_idx.idx = write_idx.idx; @@ -369,6 +370,10 @@ void Repository::Enqueue(void* cur_paras) { return; } + if (GetStatus() == RepoStatus::UCE_EXIT) { + throw std::runtime_error("UCE ERROR" + PTA_ERROR(ErrCode::ACL)); + } + if (GetStatus() == RepoStatus::STOP_EXIT) { auto queueParam = static_cast(cur_paras); auto type = queueParam->paramType; @@ -384,11 +389,6 @@ void Repository::Enqueue(void* cur_paras) { SetStatus(CAN_EXIT); read_idx.idx = write_idx.idx; - if (call_ret == ACL_ERROR_RT_DEVICE_MEM_ERROR && checkUceErrAndRepair()) { - call_ret = 0; - throw std::runtime_error("UCE ERROR" + PTA_ERROR(ErrCode::ACL)); - } - throw std::runtime_error("The Inner error is reported as above. " "The process exits for this inner error, and " + repo_error + ".\n" + "Since the operator is called asynchronously, the stacktrace may be inaccurate. " diff --git a/torch_npu/csrc/core/npu/NPUQueue.h b/torch_npu/csrc/core/npu/NPUQueue.h index e2f2b64933..66e648069f 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.h +++ b/torch_npu/csrc/core/npu/NPUQueue.h @@ -22,7 +22,8 @@ enum RepoStatus { NEED_EXIT = 2, CAN_EXIT = 3, ERROR_EXIT = 4, - STOP_EXIT = 5, + UCE_EXIT = 5, + STOP_EXIT = 6, }; // c10::SmallVector max size @@ -115,7 +116,6 @@ private: c10::DeviceIndex device_idx; private: - int call_ret; sring_idx read_idx; sring_idx write_idx; std::atomic repo_status; -- Gitee From 5205b032dde29a992d8133a7bdfb7b6b76f90ed0 Mon Sep 17 00:00:00 2001 From: Mrtutu Date: Sun, 29 Sep 2024 10:37:41 +0000 Subject: [PATCH 40/96] !15093 [Bug] Fix profiler db small timeout value on v2.1.0-rc3 Merge pull request !15093 from Mrtutu/db_timeout_v2.1.0-6.0.rc3 --- torch_npu/profiler/analysis/prof_common_func/_db_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch_npu/profiler/analysis/prof_common_func/_db_manager.py b/torch_npu/profiler/analysis/prof_common_func/_db_manager.py index 74e52cbaae..4256823fc5 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_db_manager.py +++ b/torch_npu/profiler/analysis/prof_common_func/_db_manager.py @@ -1,4 +1,5 @@ import os +import sys import sqlite3 from ._constant import Constant, print_warn_msg, print_error_msg @@ -27,6 +28,7 @@ class DbManager: INSERT_SIZE = 10000 FETCH_SIZE = 10000 MAX_ROW_COUNT = 100000000 + MAX_TIMEOUT = int(sys.maxsize / 1000) @classmethod def create_connect_db(cls, db_path: str) -> tuple: @@ -36,7 +38,7 @@ class DbManager: if os.path.exists(db_path): FileManager.check_db_file_vaild(db_path) try: - conn = sqlite3.connect(db_path) + conn = sqlite3.connect(db_path, timeout=cls.MAX_TIMEOUT) except sqlite3.Error as err: return EmptyClass("emoty conn"), EmptyClass("empty curs") -- Gitee From 03f553f7c016307fcb9228ea70f81a7b7613a2e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=85=E9=A3=9E=E8=A6=81?= <1332490378@qq.com> Date: Sun, 29 
Sep 2024 10:38:24 +0000 Subject: [PATCH 41/96] =?UTF-8?q?!15073=20[PROF]update=20mstx=20data=20for?= =?UTF-8?q?mat=20Merge=20pull=20request=20!15073=20from=20=E6=A2=85?= =?UTF-8?q?=E9=A3=9E=E8=A6=81/comm=5F1.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 535179edb6..023445c1a8 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -1660,7 +1660,7 @@ std::string ProcessGroupHCCL::getMstxHcclMsg( if (!torch_npu::profiler::mstxEnable()) { return ""; } - std::string hccl_message_str = opName + "@"; + std::string hccl_message_str = "comm:" + opName + ","; auto nameIter = commNames.find(comm); if (nameIter == commNames.end()) { char commName[MAX_GROUP_NAME_LEN]; @@ -1671,13 +1671,13 @@ std::string ProcessGroupHCCL::getMstxHcclMsg( } else { hccl_message_str += nameIter->second; } - hccl_message_str += "@"; + hccl_message_str += ","; std::string data_type_str = "na"; auto iter = dataTypes.find(dataType); if (iter != dataTypes.end()) { data_type_str = iter->second; } - hccl_message_str = hccl_message_str + data_type_str + "@" + std::to_string(dataCnt); + hccl_message_str = hccl_message_str + data_type_str + "," + std::to_string(dataCnt); return hccl_message_str; } -- Gitee From 344f9e7258c0b174c4e43da2b6669a0398e9ac6c Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sun, 29 Sep 2024 10:51:07 +0000 Subject: [PATCH 42/96] !15121 Update op_plugin commit id Merge pull request !15121 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index dd7b55c58a..d21d3941a4 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit dd7b55c58a5de27370a04e9025be853bb803ed78 +Subproject commit d21d3941a4215db730666cc543259973b443ad3d -- Gitee From 290b07b1b20b4d86a84f05dd2765de3880473ef3 Mon Sep 17 00:00:00 2001 From: kevin_huang Date: Sun, 29 Sep 2024 11:26:14 +0000 Subject: [PATCH 43/96] !14724 [MoeFinalizeRouting] Modify the ONNX export parameters Merge pull request !14724 from kevin_huang/cherry-pick-1726737291 --- test/torch_npu_schema.json | 2 +- torch_npu/onnx/wrapper_onnx_ops.py | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index f168547aa8..5a36b1ea4b 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2778,7 +2778,7 @@ "signature": "(sorted_experts, num_experts=1)" }, "torch_npu.npu_moe_finalize_routing": { - "signature": "(expanded_permuted_rows, skip1, skip2_optional, bias, scales, expanded_src_to_dst_row, expert_for_source_row)" + "signature": "(expanded_permuted_rows, skip1, skip2, bias, scales, expanded_src_to_dst_row, export_for_source_row)" }, "torch_npu.npu_multi_head_attention": { "signature": "(query, key, value, query_weight, key_weight, value_weight, attn_mask, out_proj_weight, query_bias, key_bias, value_bias, out_proj_bias, dropout_mask, attn_head_num, attn_dim_per_head, src_len, tgt_len, dropout_prob, softmax_use_float)" diff --git a/torch_npu/onnx/wrapper_onnx_ops.py b/torch_npu/onnx/wrapper_onnx_ops.py index 98b515f149..f11ddb8421 100644 --- a/torch_npu/onnx/wrapper_onnx_ops.py +++ 
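With the getMstxHcclMsg change above, mstx communication markers switch from '@'-separated fields to a comma-separated record prefixed with "comm:". A hedged parsing sketch for tooling that consumes these markers (field values illustrative; "na" is the documented dtype fallback):

msg = "comm:HcclAlltoAll,group_name_0,int64,4096"  # opName, commName, dtype, count
kind, payload = msg.split(":", 1)
op_name, comm_name, dtype, count = payload.split(",")
assert kind == "comm"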
b/torch_npu/onnx/wrapper_onnx_ops.py @@ -922,12 +922,12 @@ class _NPUMoeFinalizeRoutingOP(torch.autograd.Function): return torch.ops.npu.npu_moe_finalize_routing(*args, **kwargs) @staticmethod - def symbolic(g, expanded_permuted_rows: Tensor, skip1: Tensor, skip2_optional: Optional[Tensor], bias: Tensor, - scales: Tensor, expanded_src_to_dst_row: Tensor, expert_for_source_row: Tensor): - if skip2_optional is None: - skip2_optional = g.op("Constant", value_t=torch.tensor([]).to(torch.float)) - return g.op("npu::NPUMoeFinalizeRouting", expanded_permuted_rows, skip1, skip2_optional, bias, - scales, expanded_src_to_dst_row, expert_for_source_row) + def symbolic(g, expanded_permuted_rows: Tensor, skip1: Tensor, skip2: Optional[Tensor], bias: Tensor, + scales: Tensor, expanded_src_to_dst_row: Tensor, export_for_source_row: Tensor): + if skip2 is None: + skip2 = g.op("Constant", value_t=torch.tensor([]).to(torch.float)) + return g.op("npu::NPUMoeFinalizeRouting", expanded_permuted_rows, skip1, skip2, bias, + scales, expanded_src_to_dst_row, export_for_source_row) class _NPUMoeGatingTopKSoftmaxOP(torch.autograd.Function): @@ -1266,10 +1266,10 @@ def _wrapper_npu_moe_compute_expert_tokens(sorted_experts, num_experts=1): return _NPUMoeComputeExpertTokensOP.apply(sorted_experts, num_experts) -def _wrapper_npu_moe_finalize_routing(expanded_permuted_rows, skip1, skip2_optional, bias, - scales, expanded_src_to_dst_row, expert_for_source_row): - return _NPUMoeFinalizeRoutingOP.apply(expanded_permuted_rows, skip1, skip2_optional, bias, - scales, expanded_src_to_dst_row, expert_for_source_row) +def _wrapper_npu_moe_finalize_routing(expanded_permuted_rows, skip1, skip2, bias, + scales, expanded_src_to_dst_row, export_for_source_row): + return _NPUMoeFinalizeRoutingOP.apply(expanded_permuted_rows, skip1, skip2, bias, + scales, expanded_src_to_dst_row, export_for_source_row) def _wrapper_npu_moe_gating_top_k_softmax(x, finished, k): -- Gitee From f723214813e7a197c6b31135c75f73dd5e64ac6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=B2=81=E5=8D=9A=E6=B4=8B?= Date: Sun, 29 Sep 2024 11:28:03 +0000 Subject: [PATCH 44/96] =?UTF-8?q?!15089=20update=20torchair=20commitid=20v?= =?UTF-8?q?2.1.0rc3=20Merge=20pull=20request=20!15089=20from=20=E9=B2=81?= =?UTF-8?q?=E5=8D=9A=E6=B4=8B/clamp=5Fv2.1.0rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 731150ebc5..9382e2f6a1 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 731150ebc561308c44fc7f699c7227551fc470b4 +Subproject commit 9382e2f6a1171502887c36d6556fa5fc1ab85b66 -- Gitee From bc5a963efa6bef284c7fdabd0371f590e0162878 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sun, 29 Sep 2024 13:36:19 +0000 Subject: [PATCH 45/96] !15134 Update op_plugin commit id Merge pull request !15134 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index d21d3941a4..d7523c27fb 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit d21d3941a4215db730666cc543259973b443ad3d +Subproject commit d7523c27fb09878bf4da7da8dbfcc5eea46290d2 -- Gitee From bdf7cee84ca4242f9c852093fe15429958d9ac1f Mon Sep 17 00:00:00 2001 From: wangjie Date: Mon, 30 Sep 2024 09:09:44 +0000 Subject: 
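The wrapper above follows the standard recipe for exporting custom NPU operators to ONNX: an autograd.Function whose forward dispatches to the registered op and whose symbolic emits a node in the custom npu:: domain. A condensed, hypothetical instance of the pattern (the op name and arguments are illustrative, not a real torch_npu operator):

import torch
from torch import Tensor

class _MyNpuOp(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x: Tensor, scale: float):
        return torch.ops.npu.my_npu_op(x, scale)  # hypothetical registered op

    @staticmethod
    def symbolic(g, x: Tensor, scale: float):
        # the "_f" suffix marks a float attribute in torch.onnx symbolic helpers
        return g.op("npu::MyNpuOp", x, scale_f=scale)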
[PATCH 46/96] !15123 [PROF] Proflier fix HOST_INFO and META_DATA table Merge pull request !15123 from wangjie/cherry-pick-1727604624 --- test/profiler/analysis/prof_common_func/test_host_info.py | 2 +- torch_npu/profiler/analysis/prof_common_func/_constant.py | 8 ++++++-- .../profiler/analysis/prof_common_func/_host_info.py | 2 +- .../analysis/prof_view/prof_db_parse/_db_parser.py | 6 ++++-- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/test/profiler/analysis/prof_common_func/test_host_info.py b/test/profiler/analysis/prof_common_func/test_host_info.py index d008d0a911..e508d208d5 100644 --- a/test/profiler/analysis/prof_common_func/test_host_info.py +++ b/test/profiler/analysis/prof_common_func/test_host_info.py @@ -7,7 +7,7 @@ class TestHostInfo(TestCase): def test_get_host_info(self): host_info = get_host_info() - self.assertNotEqual(0, host_info.get('host_uid')) + self.assertNotEqual('0', host_info.get('host_uid')) if __name__ == "__main__": diff --git a/torch_npu/profiler/analysis/prof_common_func/_constant.py b/torch_npu/profiler/analysis/prof_common_func/_constant.py index edcfe328c0..6cde5e6dcb 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_constant.py +++ b/torch_npu/profiler/analysis/prof_common_func/_constant.py @@ -291,7 +291,7 @@ class DbConstant(): TABLE_MEMORY_RECORD = "MEMORY_RECORD" TABLE_OPERATOR_MEMORY = "OP_MEMORY" TABLE_NPU_OP_MEM = "NPU_OP_MEM" - META_DATA_INFO = "META_DATA" + TABLE_META_DATA = "META_DATA" # rank device map table name TABLE_RANK_DEVICE_MAP = "RANK_DEVICE_MAP" @@ -434,9 +434,13 @@ class TableColumnsManager(): ("preparing", Constant.SQL_NUMERIC_TYPE) ], DbConstant.TABLE_HOST_INFO : [ - ('hostUid', Constant.SQL_INTEGER_TYPE), + ('hostUid', Constant.SQL_TEXT_TYPE), ('hostName', Constant.SQL_TEXT_TYPE) ], + DbConstant.TABLE_META_DATA : [ + ('name', Constant.SQL_TEXT_TYPE), + ('value', Constant.SQL_TEXT_TYPE) + ], DbConstant.TABLE_STEP_TIME : [ ("id", Constant.SQL_INTEGER_TYPE), ("startNs", Constant.SQL_INTEGER_TYPE), diff --git a/torch_npu/profiler/analysis/prof_common_func/_host_info.py b/torch_npu/profiler/analysis/prof_common_func/_host_info.py index 6ae9981400..4a04ae9db7 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_host_info.py +++ b/torch_npu/profiler/analysis/prof_common_func/_host_info.py @@ -21,7 +21,7 @@ __all__ = [] def get_host_info() -> dict: host_name = socket.gethostname() - host_uid = _get_host_uid() + host_uid = str(_get_host_uid()) return { 'host_name': host_name, 'host_uid': host_uid diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py index ada31ca301..297c3a878f 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py @@ -78,7 +78,8 @@ class DbParser(BaseParser): def save_env_vars_info_to_db(self): env_vars_dict = collect_env_vars() - DbManager.insert_data_into_table(self._conn, DbConstant.META_DATA_INFO, + DbManager.create_table_with_headers(self._conn, self._cur, DbConstant.TABLE_META_DATA, TableColumnsManager.TableColumns.get(DbConstant.TABLE_META_DATA)) + DbManager.insert_data_into_table(self._conn, DbConstant.TABLE_META_DATA, [['ENV_VARIABLES', json.dumps(env_vars_dict.get('ENV_VARIABLES'))]]) def save_profiler_metadata_to_db(self): @@ -94,4 +95,5 @@ class DbParser(BaseParser): data = [ [str(key), json.dumps(value)] for key, value in profiler_metadata.items() ] - 
DbManager.insert_data_into_table(self._conn, DbConstant.META_DATA_INFO, data) + DbManager.create_table_with_headers(self._conn, self._cur, DbConstant.TABLE_META_DATA, TableColumnsManager.TableColumns.get(DbConstant.TABLE_META_DATA)) + DbManager.insert_data_into_table(self._conn, DbConstant.TABLE_META_DATA, data) -- Gitee From e020c49fddd228ce9afe8100c39d9e2965a0de12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A9=B9=E6=98=8A?= Date: Mon, 30 Sep 2024 09:09:54 +0000 Subject: [PATCH 47/96] =?UTF-8?q?!15112=20foreach=20add=20compatibility=20?= =?UTF-8?q?check=20of=20cann=20version=20Merge=20pull=20request=20!15112?= =?UTF-8?q?=20from=20=E8=A9=B9=E6=98=8A/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/utils/_optim.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/torch_npu/utils/_optim.py b/torch_npu/utils/_optim.py index 287ff05976..0eaeb63a16 100644 --- a/torch_npu/utils/_optim.py +++ b/torch_npu/utils/_optim.py @@ -1,20 +1,33 @@ import torch import torch.optim.optimizer as opt import torch_npu +from torch_npu.utils.collect_env import get_cann_version _device_name = None +_cann_version = get_cann_version() +_foreach_black_list_for_cann_starts_with = ['8.0.RC1', '8.0.RC2'] +_foreach_black_list_for_cann_all = ['not known', '8.0.T1', '8.0.T2', '8.0.T3', '8.0.T37', '8.0.T5', '8.0.T6', '8.0.T7', + '8.0.T8', '8.0.T10', '8.0.T13', '8.0.T16', '8.0.T50', '8.0.T51', '8.0.T52'] def patch_supported_devices(): global _device_name - _device_name = (_device_name if _device_name is not None + _device_name = (_device_name if _device_name is not None else torch_npu.npu.get_device_name(torch_npu.npu.current_device())) + global _cann_version + if _cann_version is None or _cann_version < '8.0' or _cann_version in _foreach_black_list_for_cann_all: + return ["cuda", "xpu"] + + for ver in _foreach_black_list_for_cann_starts_with: + if _cann_version.startswith(ver): + return ["cuda", "xpu"] + if _device_name > "Ascend910B" and _device_name < "Ascend910PremiumA": return ["cuda", "xpu", torch._C._get_privateuse1_backend_name()] - - return ["cuda", "xpu"] + + return ["cuda", "xpu"] def add_optim_method(): -- Gitee From c486decf103e76bc9890c97b0f38cfb3ec962ebf Mon Sep 17 00:00:00 2001 From: liyou_b <2953090824@qq.com> Date: Mon, 30 Sep 2024 09:10:10 +0000 Subject: [PATCH 48/96] =?UTF-8?q?!15109=20=E3=80=90PROF=E3=80=91=E3=80=90B?= =?UTF-8?q?ug=E3=80=91V210rc3:=20Fix=20share=20memory=20resource=5Ftracker?= =?UTF-8?q?=20bug=20Merge=20pull=20request=20!15109=20from=20liyou=5Fb/v21?= =?UTF-8?q?0=5Frc3=5Fshm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../_dynamic_profiler/_dynamic_profiler_monitor_shm.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py index fccaa63a14..9284706d91 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py @@ -117,14 +117,16 @@ class DynamicProfilerShareMemory: def _create_shm_over_py38(self): """Create a json monitor process based on whether the SharedMemory is successfully created py38""" - from multiprocessing import shared_memory, resource_tracker + from unittest.mock import patch + from multiprocessing import shared_memory try_times = 10 while try_times: try: # 
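The gating added to _optim.py above combines an exact version blacklist with prefix matches before enabling foreach optimizers. Condensed into one predicate (constants abbreviated from the patch; the helper name is illustrative):

def _foreach_supported(cann_version, device_name):
    black_all = {'not known', '8.0.T1', '8.0.T2', '8.0.T3', '8.0.T37', '8.0.T5',
                 '8.0.T6', '8.0.T7', '8.0.T8', '8.0.T10', '8.0.T13', '8.0.T16',
                 '8.0.T50', '8.0.T51', '8.0.T52'}
    black_prefix = ('8.0.RC1', '8.0.RC2')
    if cann_version is None or cann_version < '8.0' or cann_version in black_all:
        return False
    if cann_version.startswith(black_prefix):
        return False
    return "Ascend910B" < device_name < "Ascend910PremiumA"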
Step 1: try to open shm file, first time shm not exists. - self.shm = shared_memory.SharedMemory(name=self.shm_path) + with patch("multiprocessing.resource_tracker.register", + lambda *args, **kwargs: None): + self.shm = shared_memory.SharedMemory(name=self.shm_path) self.is_create_process = False - resource_tracker.unregister(self.shm._name, 'shared_memory') logger.info("Rank %d shared memory is connected.", self._rank_id) break except FileNotFoundError: -- Gitee From 095e0c4c8a1f20b9f4853ae8597bc3b96cfc8ee5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Mon, 30 Sep 2024 09:14:39 +0000 Subject: [PATCH 49/96] =?UTF-8?q?!15052=20ranktable=20bug=20fix:=20global?= =?UTF-8?q?=20processgroup=20may=20not=20the=20first=20to=20be=20created?= =?UTF-8?q?=20Merge=20pull=20request=20!15052=20from=20=E7=8E=8B=E8=B6=85/?= =?UTF-8?q?v2.1.0-6.0.rc3=5Ffix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 15 ++++++--------- torch_npu/csrc/distributed/ProcessGroupHCCL.hpp | 2 -- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 023445c1a8..f66e215d3c 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -604,8 +604,6 @@ std::vector ProcessGroupHCCL::WorkHCCL::result() return *outputs_; } -static std::atomic process_group_id = 0; - ProcessGroupHCCL::ProcessGroupHCCL( const c10::intrusive_ptr& store, int rank, @@ -617,8 +615,7 @@ ProcessGroupHCCL::ProcessGroupHCCL( hcclCommCounter_(0), traceKeyStart_("HCCL_" + std::to_string(rank) + "_trace_start"), traceKeyEnd_("HCCL_" + std::to_string(rank) + "_trace_end"), - terminateProcessGroup_(false), - uid_(process_group_id++) + terminateProcessGroup_(false) { uint32_t hccl_event_timeout = c10_npu::option::OptionsManager::GetHCCLEventTimeout(); uint32_t hccl_exec_timeout = c10_npu::option::OptionsManager::GetHCCLExecTimeout(); @@ -719,7 +716,7 @@ ProcessGroupHCCL::ProcessGroupHCCL( global_hccl_id_ = group_ranks + "_" + std::to_string(group_ranks_map_[group_ranks]); } - if (options_->global_ranks_in_group.empty() && uid_ == 0) { + if (options_->global_ranks_in_group.empty()) { global_ = this; } } @@ -769,7 +766,7 @@ void ProcessGroupHCCL::abort(c10::optional abortReason) ProcessGroupHCCL::~ProcessGroupHCCL() { - if (options_->global_ranks_in_group.empty() && uid_ == 0) { + if (options_->global_ranks_in_group.empty()) { global_ = nullptr; } @@ -850,7 +847,7 @@ void ProcessGroupHCCL::logWorkEnd(WorkHCCL& work) const std::vector& ProcessGroupHCCL::groupRanks() const { - if (options_->global_ranks_in_group.empty() && uid_ == 0) { + if (options_->global_ranks_in_group.empty()) { static std::vector globalRanks(size_); std::iota(globalRanks.begin(), globalRanks.end(), 0); return globalRanks; @@ -1150,7 +1147,7 @@ void ProcessGroupHCCL::createHCCLComm(const std::vector& devices, std::to_string((int)commType) + DIST_ERROR(ErrCode::PARAM)); } - if (options_->global_ranks_in_group.empty() && uid_ == 0) { + if (options_->global_ranks_in_group.empty()) { global_hccl_comm_ = hcclComms[i]; } @@ -1172,7 +1169,7 @@ bool ProcessGroupHCCL::createHCCLCommEx(const std::vector& devices, } c10_npu::OptionalNPUGuard npuGuard; // global process group - if (options_->global_ranks_in_group.empty() && uid_ == 0) { + if (options_->global_ranks_in_group.empty()) { if (!hcclCommInitClusterInfoConfigExist()) 
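        // Editor's note: when the cluster-info init API is unavailable,
        // createHCCLCommEx returns false and the caller falls back to the
        // original HCCL communicator creation path.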
{ ASCEND_LOGI("The hcclCommInitClusterInfoConfig is not exist, switch to original interface."); return false; diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index 102cfc66ed..c09d255e17 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -604,8 +604,6 @@ protected: std::exception_ptr watchDogException_ = nullptr; - size_t uid_; - private: // Helper that encapsulates work shared across all collective communication // primitives. -- Gitee From 62c9d5359e8cdf754a89cf0c639cb7fd37a7b9b9 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Mon, 30 Sep 2024 09:46:34 +0000 Subject: [PATCH 50/96] !15083 fix torch.empty return random when use_deterministic_algorithms Merge pull request !15083 from huangyunlong/2.1rc3em --- test/npu/test_tensor.py | 8 ++++++++ torch_npu/csrc/aten/common/EmptyTensor.cpp | 19 +++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/test/npu/test_tensor.py b/test/npu/test_tensor.py index 98fd3c33d0..c4b2befd14 100644 --- a/test/npu/test_tensor.py +++ b/test/npu/test_tensor.py @@ -1,6 +1,7 @@ import itertools import torch from torch.testing import make_tensor +from torch.testing._internal.common_utils import DeterministicGuard import torch_npu from torch_npu.testing.testcase import TestCase, run_tests @@ -303,6 +304,13 @@ class TestTensor(TestCase): self.assertEqual(res1.to('cpu'), expected.to('cpu')) + def test_empty_with_deterministic(self): + with DeterministicGuard(True): + empty_tensor = torch.empty(2, 3, 4) + empty_strided_tensor = torch.empty_strided((2, 3, 4), (1, 1, 1)) + self.assertTrue(empty_tensor.isnan().all()) + self.assertTrue(empty_strided_tensor.isnan().all()) + if __name__ == '__main__': run_tests() diff --git a/torch_npu/csrc/aten/common/EmptyTensor.cpp b/torch_npu/csrc/aten/common/EmptyTensor.cpp index 0bfb7ed24f..b33ff954b7 100644 --- a/torch_npu/csrc/aten/common/EmptyTensor.cpp +++ b/torch_npu/csrc/aten/common/EmptyTensor.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "torch_npu/csrc/core/npu/THNPUCachingHostAllocator.h" @@ -88,13 +89,23 @@ at::TensorBase empty_strided_cpu( } at::Tensor empty_memory_format(c10::IntArrayRef size, c10::optional dtype_opt, c10::optional layout_opt, - c10::optional device_opt, c10::optional pin_memory_opt, c10::optional memory_format_opt) { - return empty_cpu(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); + c10::optional device_opt, c10::optional pin_memory_opt, c10::optional memory_format_opt) +{ + at::Tensor result = empty_cpu(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); + if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms())) { + at::native::fill_empty_deterministic_(result); + } + return result; } at::Tensor empty_strided(c10::IntArrayRef size, c10::IntArrayRef stride, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { - return empty_strided_cpu(size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); + c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) +{ + at::Tensor result = empty_strided_cpu(size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); + if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms())) { + at::native::fill_empty_deterministic_(result); + } + return result; } TORCH_LIBRARY_IMPL(aten, CPU, 
m) { -- Gitee From 5067d07a9ce0afe0ea36058286bb158c4ee7f5f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Mon, 7 Oct 2024 09:01:11 +0000 Subject: [PATCH 51/96] =?UTF-8?q?!15165=20add=20ranktable=20warning=20Merg?= =?UTF-8?q?e=20pull=20request=20!15165=20from=20=E7=8E=8B=E8=B6=85/v2.1.0-?= =?UTF-8?q?6.0.RC3=5Fwarn?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/register/OptionsManager.cpp | 7 +++++++ torch_npu/csrc/core/npu/register/OptionsManager.h | 1 + torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index cad80f086a..8503361020 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -101,6 +101,13 @@ int OptionsManager::GetBoolTypeOption(const char* env_str, int defaultVal) return (envFlag != 0) ? 1 : 0; } +uint32_t OptionsManager::GetHCCLConnectTimeout() +{ + char* env_val = std::getenv("HCCL_CONNECT_TIMEOUT"); + int64_t envFlag = (env_val != nullptr) ? strtol(env_val, nullptr, 10) : 0; + return static_cast(envFlag); +} + uint32_t OptionsManager::GetHCCLExecTimeout() { char* env_val = std::getenv("HCCL_EXEC_TIMEOUT"); diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index ba2cb5a198..98e8fd72dc 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -31,6 +31,7 @@ public: static bool CheckCombinedOptimizerEnable(); static bool CheckTriCombinedOptimizerEnable(); static bool CheckAclDumpDateEnable(); + static uint32_t GetHCCLConnectTimeout(); static uint32_t GetHCCLExecTimeout(); static uint32_t GetHCCLEventTimeout(); static std::string CheckDisableDynamicPath(); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index f66e215d3c..7960f02725 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -1167,6 +1167,10 @@ bool ProcessGroupHCCL::createHCCLCommEx(const std::vector& devices, ASCEND_LOGI("The rank_table_file is not available, switch to original interface."); return false; } + if (c10_npu::option::OptionsManager::GetHCCLConnectTimeout() < 300) { + TORCH_NPU_WARN_ONCE("When creating an HCCL process group using the RANK_TABLE_FILE method, the connection may time out. 
", + "It is recommended to set the timeout duration of HCCL_CONNECT_TIMEOUT to 300 seconds or more."); + } c10_npu::OptionalNPUGuard npuGuard; // global process group if (options_->global_ranks_in_group.empty()) { -- Gitee From 6818c87b39dc95a3b05b49496d95b45a41806076 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 8 Oct 2024 07:08:44 +0000 Subject: [PATCH 52/96] !15171 Update op_plugin commit id Merge pull request !15171 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index d7523c27fb..dfaad59f05 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit d7523c27fb09878bf4da7da8dbfcc5eea46290d2 +Subproject commit dfaad59f053d47f24dc4d2d8da095fcb39ecee5f -- Gitee From 3fdcbb34ec1dfb2b40d278d895741c546ec2a3c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=9C=E9=87=91=E8=88=AA?= <15990042527@163.com> Date: Tue, 8 Oct 2024 07:39:43 +0000 Subject: [PATCH 53/96] =?UTF-8?q?!15142=20add=20`base=5Faddr=5Faligned=5Fk?= =?UTF-8?q?b`=20configuration=20Merge=20pull=20request=20!15142=20from=20?= =?UTF-8?q?=E6=9D=9C=E9=87=91=E8=88=AA/cherry-pick-1727660032?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../csrc/core/npu/NPUCachingAllocator.cpp | 43 +++++++++++++++++-- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index e4b764ae36..c65a38b337 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -596,6 +596,11 @@ class CachingAllocatorConfig { return instance().m_expandable_segments; } + static size_t base_addr_aligned_size() + { + return instance().m_base_addr_aligned_size; + } + static CachingAllocatorConfig &instance() { static CachingAllocatorConfig *s_instance = ([]() { auto inst = new CachingAllocatorConfig(); @@ -614,11 +619,13 @@ class CachingAllocatorConfig { double m_garbage_collection_threshold; bool m_expandable_segments; bool set_expandable_segments_flag = false; + size_t m_base_addr_aligned_size = kAlignRoundLarge; CachingAllocatorConfig() : m_max_split_size(std::numeric_limits::max()), m_garbage_collection_threshold(0), - m_expandable_segments(true) + m_expandable_segments(true), + m_base_addr_aligned_size(kAlignRoundLarge) { void* ptr = nullptr; auto status = c10_npu::acl::AclrtReserveMemAddress(&ptr, 512, 0, NULL, 1); @@ -643,6 +650,9 @@ class CachingAllocatorConfig { size_t parseExpandableSegments( const std::vector& config, size_t i); + size_t parseAddrAlignSize( + const std::vector& config, + size_t i); }; void CachingAllocatorConfig::lexArgs( @@ -740,6 +750,28 @@ size_t CachingAllocatorConfig::parseExpandableSegments( return i; } +size_t CachingAllocatorConfig::parseAddrAlignSize( + const std::vector& config, + size_t i) +{ + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + size_t val = static_cast(stoi(config[i])); + TORCH_CHECK(config[i].length() == std::to_string(val).length(), + "CachingAllocator option base_addr_aligned_kb error, must be [0~16], dtype is int", + OPS_ERROR(ErrCode::VALUE)); + TORCH_CHECK(val >= 0, "CachingAllocator option base_addr_aligned_kb error, must be [0~16], dtype is int", + OPS_ERROR(ErrCode::VALUE)); + TORCH_CHECK(val <= kAlignRoundLarge / 1024, + "CachingAllocator option base_addr_aligned_kb error, must be [0~16], dtype is int", + OPS_ERROR(ErrCode::VALUE)); 
+ m_base_addr_aligned_size = val * 1024; + } else { + TORCH_CHECK(false, "Error, expecting base_addr_aligned_kb value", OPS_ERROR(ErrCode::VALUE)); + } + return i; +} + void CachingAllocatorConfig::parseArgs(const char* env) { // If empty, set the default values m_max_split_size = std::numeric_limits::max(); @@ -760,6 +792,8 @@ void CachingAllocatorConfig::parseArgs(const char* env) { } else if (config[i] == "expandable_segments") { set_expandable_segments_flag = true; i = parseExpandableSegments(config, i); + } else if (config[i] == "base_addr_aligned_kb") { + i = parseAddrAlignSize(config, i); } else { TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", config[i], OPS_ERROR(ErrCode::PARAM)); } @@ -1058,9 +1092,10 @@ class DeviceCachingAllocator { } int64_t ori_block_ptr = int64_t(params.block->ptr); - if (params.size() >= kRoundLarge && CachingAllocatorConfig::expandable_segments() && - ori_block_ptr % kAlignRoundLarge != 0) { - char* align_ptr = reinterpret_cast((ori_block_ptr + kAlignRoundLarge) - (ori_block_ptr % kAlignRoundLarge)); + size_t align_round = CachingAllocatorConfig::base_addr_aligned_size(); + if (params.size() >= kRoundLarge && CachingAllocatorConfig::expandable_segments() && align_round != 0 && + ori_block_ptr % align_round != 0) { + char* align_ptr = reinterpret_cast((ori_block_ptr + align_round) - (ori_block_ptr % align_round)); size_t offset_size = align_ptr - (char*)params.block->ptr; if (offset_size + params.size() <= params.block->size) { auto size = params.block->size; -- Gitee From a62d352ddf641153143635f8a477e0d2f8f29641 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E5=85=89=E6=B5=A9?= Date: Tue, 8 Oct 2024 07:45:19 +0000 Subject: [PATCH 54/96] =?UTF-8?q?!15175=20=E3=80=90bugfix=E3=80=91fix=20th?= =?UTF-8?q?e=20bug=20of=20intreactive=20Merge=20pull=20request=20!15175=20?= =?UTF-8?q?from=20=E9=83=AD=E5=85=89=E6=B5=A9/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index 50e220d87a..7a01246c9a 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -228,3 +228,8 @@ if 'TORCH_NPU_SANITIZER' in os.environ: apply_sanitizer_patch() csan.enable_npu_sanitizer() + +if hasattr(sys, 'ps1'): + os.environ["TASK_QUEUE_ENABLE"] = '0' + warnings.warn("On the interactive interface, the value of TASK_QUEUE_ENABLE is set to 0 by default. 
\ + Do not set it to 1 to prevent some unknown errors") \ No newline at end of file -- Gitee From d44417f7f9b0b2b0fc8fce1b507e1cf5bf3e764a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=9C=E9=87=91=E8=88=AA?= <15990042527@163.com> Date: Wed, 9 Oct 2024 02:44:29 +0000 Subject: [PATCH 55/96] =?UTF-8?q?!15186=20the=20expandable=5Fsegments=20fu?= =?UTF-8?q?nction=20defaults=20to=20false.=20Merge=20pull=20request=20!151?= =?UTF-8?q?86=20from=20=E6=9D=9C=E9=87=91=E8=88=AA/v2.1=5Fcleancode?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index c65a38b337..f9e4b723fe 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -624,18 +624,9 @@ class CachingAllocatorConfig { CachingAllocatorConfig() : m_max_split_size(std::numeric_limits::max()), m_garbage_collection_threshold(0), - m_expandable_segments(true), + m_expandable_segments(false), m_base_addr_aligned_size(kAlignRoundLarge) { - void* ptr = nullptr; - auto status = c10_npu::acl::AclrtReserveMemAddress(&ptr, 512, 0, NULL, 1); - if (status == ACL_ERROR_NONE) { - NPU_CHECK_ERROR(c10_npu::acl::AclrtReleaseMemAddress(ptr)); - } else { - TORCH_NPU_WARN_ONCE("expandable_segments feature is not supportted \ - and the possible cause is that driver and firmware packages do not match."); - m_expandable_segments = false; - } } void lexArgs(const char* env, std::vector& config); -- Gitee From 0dfad97f462d01fe1eed476debfe6e24c74e9514 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 9 Oct 2024 08:42:42 +0000 Subject: [PATCH 56/96] !15232 Update op_plugin commit id Merge pull request !15232 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index dfaad59f05..3911dbe7bd 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit dfaad59f053d47f24dc4d2d8da095fcb39ecee5f +Subproject commit 3911dbe7bd2daf515d7190b3cd01f0204687b3d3 -- Gitee From 6efd72ed7e588f586fe2b8d0ef5f97876615ef6c Mon Sep 17 00:00:00 2001 From: lilei zheng Date: Wed, 9 Oct 2024 10:36:17 +0000 Subject: [PATCH 57/96] !15192 Fix the accuracy issue in the aclop conv3d fp32 scenario Merge pull request !15192 from lilei zheng/cherry-pick-1728385829 --- torch_npu/utils/_module.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/torch_npu/utils/_module.py b/torch_npu/utils/_module.py index d51485b82c..92213c7ca9 100644 --- a/torch_npu/utils/_module.py +++ b/torch_npu/utils/_module.py @@ -24,11 +24,14 @@ from torch.nn.parallel.replicate import replicate import torch_npu from torch_npu.npu.amp.autocast_mode import autocast +from torch_npu.npu.utils import get_device_name from torch_npu.utils.syncbatchnorm import SyncBatchNorm as sync_batch_norm from torch_npu.utils._error_code import ErrCode, pta_error origin_mpdl_iter_init = _MultiProcessingDataLoaderIter.__init__ +CONV3D_SUPPORT_FP32_SOC_PREFIX = ["Ascend910B", "Ascend910_93"] + def npu(self, device=None): r"""Moves all model parameters and buffers to the npu. 
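Editor's note on the `cast_weight` hunk that follows (!15192): the fix keys the Conv3d weight cast off the SoC name prefix. A minimal, self-contained sketch of the predicate being introduced — plain Python, no NPU needed; the sample device names are hypothetical:

```python
# Prefixes copied from the patch. On matching SoCs the Conv3d weight is cast
# to the ACL_FRACTAL_Z_3D layout (format id 33) while staying fp32, skipping
# the .half() round-trip that caused the fp32 accuracy issue elsewhere.
CONV3D_SUPPORT_FP32_SOC_PREFIX = ["Ascend910B", "Ascend910_93"]

def conv3d_weight_keeps_fp32(device_name: str) -> bool:
    """True if this SoC keeps Conv3d weights in fp32 for FRACTAL_Z_3D."""
    return any(device_name.startswith(p) for p in CONV3D_SUPPORT_FP32_SOC_PREFIX)

assert conv3d_weight_keeps_fp32("Ascend910B2")      # hypothetical device name
assert not conv3d_weight_keeps_fp32("Ascend310P3")  # hypothetical device name
```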
@@ -117,6 +120,10 @@ def cast_weight(self, device): return if issubclass(class_name, torch.nn.Conv3d): module.weight.data = module.weight.data.to(device) + device_name = get_device_name() + if any(device_name.startswith(prefix) for prefix in CONV3D_SUPPORT_FP32_SOC_PREFIX): + module.weight.data = torch_npu.npu_format_cast(module.weight.data, 33) + return module.weight.data = torch_npu.npu_format_cast(module.weight.data.half(), 33).float() # ACL_FRACTAL_Z_3D if device is None or "npu" not in str(device): -- Gitee From c57ce8ae485fff5645ab2d45e6b5b26fa1ace282 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Wed, 9 Oct 2024 10:36:48 +0000 Subject: [PATCH 58/96] !15210 fix coredump when uncached_delete after Finalize. Merge pull request !15210 from huangyunlong/2.1rc3nome --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index f9e4b723fe..26613cf76b 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -2317,7 +2317,9 @@ class DeviceCachingAllocator { static void uncached_delete(void* ptr) { - c10_npu::npuSynchronizeDevice(false); + if (c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { + c10_npu::npuSynchronizeDevice(false); + } ASCEND_LOGD("Without NPUCachingAllocator, free by aclrtFree."); NPU_CHECK_ERROR(aclrtFree(ptr)); } -- Gitee From 48c9c346d7cfda603d69fa34b498fec569b3a76f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B2=88=E7=8F=88=E9=9D=93?= Date: Wed, 9 Oct 2024 10:37:09 +0000 Subject: [PATCH 59/96] =?UTF-8?q?!15220=20Update=20torchair=206.0.rc3=20Me?= =?UTF-8?q?rge=20pull=20request=20!15220=20from=20=E6=B2=88=E7=8F=88?= =?UTF-8?q?=E9=9D=93/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 9382e2f6a1..820f0378f4 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 9382e2f6a1171502887c36d6556fa5fc1ab85b66 +Subproject commit 820f0378f4591707969e1aa55935cff7b823b155 -- Gitee From 0619c1279f6f43ae88fcad8a70330e45fbf122d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Thu, 10 Oct 2024 09:42:27 +0000 Subject: [PATCH 60/96] =?UTF-8?q?!15217=20remove=20weakptr,=20use=20global?= =?UTF-8?q?=5F's=20get=20function=20Merge=20pull=20request=20!15217=20from?= =?UTF-8?q?=20=E7=8E=8B=E8=B6=85/v2.1.0-6.0.rc3=5Frank?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../csrc/distributed/ProcessGroupHCCL.cpp | 47 ++++++++++--------- .../csrc/distributed/ProcessGroupHCCL.hpp | 4 +- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 7960f02725..7410f0e363 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -234,7 +234,6 @@ const int64_t ProcessGroupHCCL::kProcessGroupHCCLOpTimeoutMillis = 10 * 1000; thread_local uint64_t ProcessGroupHCCL::hcclActiveGroupCounter_ = 0; const int64_t ProcessGroupHCCL::kWatchdogThreadSleepMillis = 1000; std::string ProcessGroupHCCL::perfdumppath = ""; -std::weak_ptr ProcessGroupHCCL::global_hccl_comm_; std::unordered_map 
ProcessGroupHCCL::group_ranks_map_; std::mutex ProcessGroupHCCL::group_ranks_map_mutex_; ProcessGroupHCCL* ProcessGroupHCCL::global_ = nullptr; @@ -786,6 +785,7 @@ ProcessGroupHCCL::~ProcessGroupHCCL() hcclComm->destroyHcclComm(); } } + devHCCLCommMap_.clear(); } } @@ -1147,10 +1147,6 @@ void ProcessGroupHCCL::createHCCLComm(const std::vector& devices, std::to_string((int)commType) + DIST_ERROR(ErrCode::PARAM)); } - if (options_->global_ranks_in_group.empty()) { - global_hccl_comm_ = hcclComms[i]; - } - // Creates the HCCL streams streamVal.push_back(c10_npu::getNPUStreamFromPool(devices[i].index())); } @@ -1171,13 +1167,13 @@ bool ProcessGroupHCCL::createHCCLCommEx(const std::vector& devices, TORCH_NPU_WARN_ONCE("When creating an HCCL process group using the RANK_TABLE_FILE method, the connection may time out. ", "It is recommended to set the timeout duration of HCCL_CONNECT_TIMEOUT to 300 seconds or more."); } + if (!hcclCommInitClusterInfoConfigExist()) { + ASCEND_LOGI("The hcclCommInitClusterInfoConfig is not exist, switch to original interface."); + return false; + } c10_npu::OptionalNPUGuard npuGuard; // global process group if (options_->global_ranks_in_group.empty()) { - if (!hcclCommInitClusterInfoConfigExist()) { - ASCEND_LOGI("The hcclCommInitClusterInfoConfig is not exist, switch to original interface."); - return false; - } auto startTime = std::chrono::steady_clock::now(); for (size_t i = 0; i < devices.size(); ++i) { int rank = getRank() * static_cast(devices.size()) + static_cast(i); @@ -1194,7 +1190,6 @@ bool ProcessGroupHCCL::createHCCLCommEx(const std::vector& devices, return false; } hcclComms[i] = comm; - global_hccl_comm_ = comm; // Creates the HCCL streams streamVal.push_back(c10_npu::getNPUStreamFromPool(devices[i].index())); } @@ -1209,20 +1204,17 @@ bool ProcessGroupHCCL::createHCCLCommEx(const std::vector& devices, ASCEND_LOGI("The hcclCreateSubCommConfig is not exist, switch to original interface."); return false; } - if (global_hccl_comm_.expired()) { - // only support create glabal process group by ranktable - if (global_ == nullptr || !hcclCommInitClusterInfoConfigExist()) { - ASCEND_LOGI("The hcclCommInitClusterInfoConfig is not exist, switch to original interface."); - return false; - } - try { - (void)global_->getHcclComm(global_->getRank()); - } catch (const std::exception& e) { - ASCEND_LOGI("create the global HCCL Communicator failed, the exception info is %s.", e.what()); - return false; - } + if (global_ == nullptr) { + ASCEND_LOGI("The global process group is not exist, switch to original interface."); + return false; + } + std::shared_ptr globalHcclComm = nullptr; + try { + globalHcclComm = global_->getHcclCommByRankid(devices); + } catch (const std::exception& e) { + ASCEND_LOGI("create the global HCCL Communicator failed, the exception info is %s.", e.what()); + return false; } - std::shared_ptr globalHcclComm = global_hccl_comm_.lock(); if (!globalHcclComm) { ASCEND_LOGI("Create sub hccl comm by hcclCreateSubCommConfig failed, globalHcclComm is nullptr."); return false; @@ -1520,6 +1512,15 @@ ProcessGroupHCCL::Options::Options(bool is_high_priority_stream) { } +std::shared_ptr ProcessGroupHCCL::getHcclCommByRankid(const std::vector& devices) +{ + const auto key = getKeyFromDevices(devices); + auto& hcclComms = getHCCLComm(key, devices); + TORCH_CHECK(hcclComms.size() == 1, "expect hcclComms.size() = 1, but hcclComms.size() = ", + hcclComms.size(), DIST_ERROR(ErrCode::VALUE)); + return hcclComms[0]; +} + int64_t ProcessGroupHCCL::getHcclComm(int 
rankid) { at::Device device = getDeviceForRank(rankid); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index c09d255e17..5961971721 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -407,6 +407,8 @@ public: // may indicate that there is some sort of collective desynchronization. uint64_t getSequenceNumberForGroup() override; + std::shared_ptr getHcclCommByRankid(const std::vector& devices); + int64_t getHcclComm(int rankid); void setHcclCommName(const std::string& hccl_comm_name); @@ -677,8 +679,6 @@ private: WatchdogStatus watchdogStatus; - static std::weak_ptr global_hccl_comm_; - static std::mutex group_ranks_map_mutex_; static std::unordered_map group_ranks_map_; std::string global_hccl_id_; -- Gitee From b9e848ec29aff11495c7e36ac7f2b371ba4bb18a Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 10 Oct 2024 10:51:31 +0000 Subject: [PATCH 61/96] !15278 Update op_plugin commit id Merge pull request !15278 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 3911dbe7bd..5b736ed12d 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 3911dbe7bd2daf515d7190b3cd01f0204687b3d3 +Subproject commit 5b736ed12db8423133dd24e2f0e81813d8f53d80 -- Gitee From 0a8106ce6d53a791da85c09c40256cacf9aba2dc Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 10 Oct 2024 13:59:44 +0000 Subject: [PATCH 62/96] !15292 Update op_plugin commit id Merge pull request !15292 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5b736ed12d..5dade4c396 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5b736ed12db8423133dd24e2f0e81813d8f53d80 +Subproject commit 5dade4c396054b9722fa94b0ff2ccbde5adacdfc -- Gitee From b14a1a84cb6630827d507eba0e3019a98edd5bc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=98=89=E5=B7=8D?= <843972097@qq.com> Date: Fri, 11 Oct 2024 02:06:27 +0000 Subject: [PATCH 63/96] =?UTF-8?q?!15237=20[Fix]=20Update=20README.=20Merge?= =?UTF-8?q?=20pull=20request=20!15237=20from=20=E5=88=98=E5=98=89=E5=B7=8D?= =?UTF-8?q?/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 +- README.zh.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f6332eff96..a6cbb5c10c 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ pip3 install torch-npu==2.1.0.post6 ### From Source -In some special scenarios, users may need to compile **torch-npu** by themselves.Select a branch in table [Ascend Auxiliary Software](#ascend-auxiliary-software) and a Python version in table [PyTorch and Python Version Matching Table](#pytorch-and-python-version-matching-table) first. 
The docker image is recommended for compiling torch-npu through the following steps(It is recommended to mount the working path only and avoid the system path to reduce security risks), the generated .whl file path is ./dist/: +In some special scenarios, users may need to compile **torch-npu** by themselves.Select a branch in table [Ascend Auxiliary Software](#ascend-auxiliary-software) and a Python version in table [PyTorch and Python Version Matching Table](#pytorch-and-python-version-matching-table) first. The docker image is recommended for compiling torch-npu through the following steps(It is recommended to mount the working path only and avoid the system path to reduce security risks), the generated .whl file path is ./dist/. Note that gcc version has the following constraints if you try to compile without using docker image: we recommend the use gcc 10.2 for ARM and gcc 9.3.1 for X86. 1. **Clone torch-npu** diff --git a/README.zh.md b/README.zh.md index 55a25dcab7..91701847e7 100644 --- a/README.zh.md +++ b/README.zh.md @@ -57,7 +57,7 @@ pip3 install torch-npu==2.1.0.post6 ### 使用源代码进行安装 -某些特殊场景下,用户可能需要自行编译**torch_npu**。可以根据[昇腾辅助软件表](#昇腾辅助软件)和[PyTorch与Python版本配套表](#PyTorch与Python版本配套表)选择合适的分支。推荐使用Docker镜像编译**torch_npu**,可以通过以下步骤获取(建议只挂载工作路径,并避开系统路径,以降低安全风险), 生成的.whl文件路径为./dist/: +某些特殊场景下,用户可能需要自行编译**torch_npu**。可以根据[昇腾辅助软件表](#昇腾辅助软件)和[PyTorch与Python版本配套表](#PyTorch与Python版本配套表)选择合适的分支。推荐使用Docker镜像编译**torch_npu**,可以通过以下步骤获取(建议只挂载工作路径,并避开系统路径,以降低安全风险), 生成的.whl文件路径为./dist/。如果不使用镜像,编译时请注意gcc版本遵循如下约束:ARM架构下推荐使用gcc 10.2版本, X86架构下推荐使用gcc 9.3.1 1. **克隆torch_npu代码仓** -- Gitee From 4aa987f573704248396c2803e75c886d028c3df8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Fri, 11 Oct 2024 02:22:40 +0000 Subject: [PATCH 64/96] =?UTF-8?q?!15234=20add=20event=20remove=20process?= =?UTF-8?q?=20for=20cachingAllocator=20when=20restart=20device=20Merge=20p?= =?UTF-8?q?ull=20request=20!15234=20from=20=E9=97=AB=E9=B9=8F=E5=85=A8/v2.?= =?UTF-8?q?1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../csrc/core/npu/NPUCachingAllocator.cpp | 24 ++++++++++++++ torch_npu/csrc/core/npu/NPUCachingAllocator.h | 6 ++++ torch_npu/csrc/core/npu/NPUException.h | 6 ++-- torch_npu/csrc/core/npu/NPUQueue.cpp | 33 ++++++++++++++----- torch_npu/csrc/npu/Module.cpp | 1 + torch_npu/csrc/npu/NPUPluggableAllocator.cpp | 7 ++++ torch_npu/csrc/npu/NPUPluggableAllocator.h | 1 + 7 files changed, 67 insertions(+), 11 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 26613cf76b..40932437d8 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -1346,6 +1346,23 @@ class DeviceCachingAllocator { release_cached_blocks(check_error, context); } + void release_and_free_events() + { + std::unique_lock lock(mutex); + std::shared_ptr context = maybeGatherContext(RecordContext::ALL); + for (auto& st : npu_events) { + for (auto& e : st.second) { + EventPool::Event event = std::move(e.first); + Block* block = e.second; + block->event_count--; + if (block->event_count == 0) { + free_block(block, context); + } + } + } + npu_events.clear(); + } + /** Retrieves info (total size + largest block) of the memory cache **/ void cacheInfo(size_t* total, size_t* largest) { std::lock_guard lock(mutex); @@ -2487,6 +2504,13 @@ class NpuCachingAllocator : public NPUAllocator { block->is_safe = true; } + void cleanEvent() override + { + 
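+        // Editor's note: iterate every per-device allocator and release its
+        // recorded NPU events, freeing blocks whose event count drops to zero
+        // (see release_and_free_events above).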
int count = static_cast(device_allocator.size()); + for (int i = 0; i < count; i++) + device_allocator[i]->release_and_free_events(); + } + void emptyCache(bool check_error) override { int count = static_cast(device_allocator.size()); diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index 46dc7ecc65..44f1d8a7f4 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -207,6 +207,7 @@ public: virtual bool checkBlockIsSafe(const c10::DataPtr& ptr) = 0; virtual void markAllBlockUnsafe(int device) = 0; virtual void updateBlockToSafe(const c10::DataPtr &ptr) = 0; + virtual void cleanEvent() = 0; }; // Allocator object, statically initialized @@ -342,5 +343,10 @@ inline void updateBlockToSafe(const c10::DataPtr& ptr) return get()->updateBlockToSafe(ptr); } +inline void cleanEvent() +{ + return get()->cleanEvent(); +} + } // namespace NPUCachingAllocator } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h index 0f03ee1865..98de3f2260 100644 --- a/torch_npu/csrc/core/npu/NPUException.h +++ b/torch_npu/csrc/core/npu/NPUException.h @@ -16,6 +16,7 @@ #include "torch_npu/csrc/core/npu/NPUMacros.h" #include "torch_npu/csrc/core/npu/interface/AclInterface.h" #include "torch_npu/csrc/core/npu/NPUErrorCodes.h" +#include "torch_npu/csrc/core/npu/npu_log.h" #define C10_NPU_SHOW_ERR_MSG() \ @@ -89,8 +90,8 @@ std::string formatErrorCode(SubModule submodule, ErrCode errorCode); #define GRAPH_ERROR(error) formatErrorCode(SubModule::GRAPH, error) #define PROF_ERROR(error) formatErrorCode(SubModule::PROF, error) -#define DEVICE_TASK_ABORT "107022" -#define DEVICE_MEM_ERROR "507053" +#define DEVICE_TASK_ABORT "reason=[device task abort]" +#define DEVICE_MEM_ERROR "reason=[device mem error]" inline const char* getErrorFunction(const char* msg) { @@ -110,6 +111,7 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) Error_stop = stop_error; \ } \ if ((Error_stop) == ACL_ERROR_RT_DEVICE_TASK_ABORT) { \ + ASCEND_LOGE("getRepoStopFlag in Run, throw FORCE STOP."); \ TORCH_CHECK( \ false, \ __func__, \ diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index edbd5a8655..99b5d48e7d 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -257,6 +257,7 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) if (GetStatus() == RepoStatus::STOP_EXIT) { ClearQueue(); if (check_error) { + ASCEND_LOGE("getRepoStopFlag in EmptyQueue, throw FORCE STOP."); throw std::runtime_error("FORCE STOP." + PTA_ERROR(ErrCode::ACL)); } else { ASCEND_LOGE("FORCE STOP happend."); @@ -296,17 +297,30 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) } bool Repository::WriteQueue(void* cur_paras) { - std::lock_guard lock(mu_enqueue); - if (IsFullQueue()) { - return false; - } + std::lock_guard lock(mu_enqueue); - __sync_synchronize(); - manager().Copy(datas, write_idx.idx, cur_paras); - __sync_synchronize(); + if (GetStatus() == RepoStatus::STOP_EXIT) { + auto queueParam = static_cast(cur_paras); + auto type = queueParam->paramType; + if (type == c10_npu::queue::LAZY_DESTROY_EVENT) { + return true; + } else { + ClearQueue(); + ASCEND_LOGE("getRepoStopFlag in WriteQueue, throw FORCE STOP."); + throw std::runtime_error("FORCE STOP." 
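+                // Editor's note: during STOP_EXIT only LAZY_DESTROY_EVENT
+                // params are tolerated (returned true above); any other task
+                // type aborts the enqueue with FORCE STOP.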
+ PTA_ERROR(ErrCode::ACL)); + } + } - write_idx.idx = (write_idx.idx + 1) & (kQueueCapacity - 1); - return true; + if (IsFullQueue()) { + return false; + } + + __sync_synchronize(); + manager().Copy(datas, write_idx.idx, cur_paras); + __sync_synchronize(); + + write_idx.idx = (write_idx.idx + 1) & (kQueueCapacity - 1); + return true; } bool Repository::ReadQueue() @@ -381,6 +395,7 @@ void Repository::Enqueue(void* cur_paras) { return; } ClearQueue(); + ASCEND_LOGE("getRepoStopFlag in Enqueue, throw FORCE STOP."); throw std::runtime_error("FORCE STOP." + PTA_ERROR(ErrCode::ACL)); } diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 7a9c655c12..d73b536e94 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -382,6 +382,7 @@ PyObject* THNPModule_restart_device_wrap(PyObject* self, PyObject* arg) c10_npu::clear_mem_uce_info(); setDefaultStreamsStatus(device, c10_npu::RepoStatus::INIT); + c10_npu::NPUCachingAllocator::cleanEvent(); Py_RETURN_NONE; END_HANDLE_TH_ERRORS diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp index 304b997bef..c7e43b23a8 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp @@ -316,6 +316,13 @@ void NPUPluggableAllocator::updateBlockToSafe(const c10::DataPtr& ptr) "If you need it, please file an issue describing your use case."); } +void NPUPluggableAllocator::cleanEvent() +{ + TORCH_NPU_WARN( + "NPUPluggableAllocator does not yet support cleanEvent. " + "If you need it, please file an issue describing your use case."); +} + std::shared_ptr current_custom_allocator; diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.h b/torch_npu/csrc/npu/NPUPluggableAllocator.h index cca1df8952..d84025ebb5 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.h +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.h @@ -81,6 +81,7 @@ struct NPUPluggableAllocator bool checkBlockIsSafe(const c10::DataPtr& ptr) override; void markAllBlockUnsafe(int device) override; void updateBlockToSafe(const c10::DataPtr &ptr) override; + void cleanEvent() override; protected: std::function alloc_fn_; -- Gitee From 6f64e7df4d03f4054562785f752a27e2d7973220 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 11 Oct 2024 02:34:09 +0000 Subject: [PATCH 65/96] !15300 Update op_plugin commit id Merge pull request !15300 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5dade4c396..de41acb424 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5dade4c396054b9722fa94b0ff2ccbde5adacdfc +Subproject commit de41acb424b1d3b4ef4b44e1eb999173a24d9ea9 -- Gitee From 8d55c035fca6d78142f6a7ff196123ac1e633ad6 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 11 Oct 2024 08:51:14 +0000 Subject: [PATCH 66/96] !15324 Update op_plugin commit id Merge pull request !15324 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index de41acb424..bc401c1eee 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit de41acb424b1d3b4ef4b44e1eb999173a24d9ea9 +Subproject commit bc401c1eee57604b5bfbee6e67cd293587bff13b -- Gitee From 6fffc905295ac74c0da4d0972a5d2195e5ba300a Mon Sep 17 00:00:00 2001 From: wangqihui01 Date: Fri, 11 Oct 2024 09:39:07 +0000 Subject: 
[PATCH 67/96] !15268 revise supported_export_type error Merge pull request !15268 from wangqihui01/v2.1.0-6.0.rc3 --- torch_npu/profiler/experimental_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch_npu/profiler/experimental_config.py b/torch_npu/profiler/experimental_config.py index 673c6ccfa0..2b3ff14563 100644 --- a/torch_npu/profiler/experimental_config.py +++ b/torch_npu/profiler/experimental_config.py @@ -15,17 +15,17 @@ __all__ = [ def supported_profiler_level(): - return set((ProfilerLevel.Level0, ProfilerLevel.Level1, ProfilerLevel.Level2)) + return set((ProfilerLevel.Level0, ProfilerLevel.Level1, ProfilerLevel.Level2, ProfilerLevel.Level_none)) def supported_ai_core_metrics(): - return set((AiCMetrics.PipeUtilization, AiCMetrics.ArithmeticUtilization, + return set((AiCMetrics.AiCoreNone, AiCMetrics.PipeUtilization, AiCMetrics.ArithmeticUtilization, AiCMetrics.Memory, AiCMetrics.MemoryL0, AiCMetrics.MemoryUB, AiCMetrics.ResourceConflictRatio, AiCMetrics.L2Cache)) def supported_export_type(): - return set(ExportType.__members__.values()) + return set((ExportType.Db, ExportType.Text)) class ProfilerLevel: -- Gitee From 2f0fc1b5b9643a73190ba14b48e72f318ea34c9d Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 11 Oct 2024 10:45:10 +0000 Subject: [PATCH 68/96] !15331 Update op_plugin commit id Merge pull request !15331 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index bc401c1eee..0da965ba04 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit bc401c1eee57604b5bfbee6e67cd293587bff13b +Subproject commit 0da965ba0437b2171a8a87ddd2ce3115f0aa8dda -- Gitee From effcf17b4ee32544d95bea34830b90f08ee980ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B5=E9=9D=9E=E5=87=A1?= Date: Fri, 11 Oct 2024 13:30:01 +0000 Subject: [PATCH 69/96] =?UTF-8?q?!15328=20delete=20unused=20info=20for=20a?= =?UTF-8?q?llreduce=20Merge=20pull=20request=20!15328=20from=20=E9=82=B5?= =?UTF-8?q?=E9=9D=9E=E5=87=A1/d=5Fcallback21rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 7410f0e363..2351d727a1 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -205,13 +205,6 @@ std::string getExceptionMsgFromExceptionPtr(const std::exception_ptr& exceptionP } } -// exit call back for allreduce error -void exceptionCallback(aclrtExceptionInfo* exceptionInfo) -{ - // notice: Do not raise error, otherwise we will get call stacks of the rts callback function. 
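Editor's aside on the whitelist style adopted in !15268 above: returning an explicit set keeps `supported_export_type()` stable even when reflective enumeration misbehaves (the commit title calls the previous `ExportType.__members__.values()` form an error; whether `ExportType` is a real `Enum` is not visible in this hunk). A minimal sketch with a hypothetical stand-in class, not the profiler's actual types:

```python
class ExportType:
    # plain namespace class: it has no Enum-style __members__ attribute,
    # so reflective enumeration would fail here
    Text = "text"
    Db = "db"

def supported_export_type():
    # explicit whitelist, mirroring the patched style
    return {ExportType.Db, ExportType.Text}

assert ExportType.Text in supported_export_type()
assert len(supported_export_type()) == 2
```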
- fprintf(stdout, "Inner error, see details in Ascend logs."); -} - void getP2PHcclCommCofig(HcclCommConfig* config) { HcclCommConfigInit(config); @@ -1911,8 +1904,6 @@ c10::intrusive_ptr ProcessGroupHCCL::allreduce( tensors_cp, tensors_cp, [&](at::Tensor& input, at::Tensor& output, HcclComm comm, c10_npu::NPUStream& stream, std::shared_ptr is_dispatched) { - aclrtSetExceptionInfoCallback(exceptionCallback); - auto hcclType = getHcclDataType(input.scalar_type()); checkSupportedDataType(hcclType, functionName); RECORD_FUNCTION("HcclAllreduce", std::vector({input})); -- Gitee From 86bb98d5dbfb25f39523e257245de20eba0fe51b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=85=E9=A3=9E=E8=A6=81?= <1332490378@qq.com> Date: Sat, 12 Oct 2024 01:06:24 +0000 Subject: [PATCH 70/96] =?UTF-8?q?!15303=20[PROF]update=20mstx=20func=20Mer?= =?UTF-8?q?ge=20pull=20request=20!15303=20from=20=E6=A2=85=E9=A3=9E?= =?UTF-8?q?=E8=A6=81/mark=5F1=5Frc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/npu/test_mstx.py | 67 ++++++++++++++++++++++++++++ test/torch_npu_schema.json | 4 +- torch_npu/csrc/profiler/mstx_mgr.cpp | 36 ++++++++++++++- torch_npu/csrc/profiler/mstx_mgr.h | 3 ++ torch_npu/npu/mstx.py | 3 +- 5 files changed, 109 insertions(+), 4 deletions(-) create mode 100644 test/npu/test_mstx.py diff --git a/test/npu/test_mstx.py b/test/npu/test_mstx.py new file mode 100644 index 0000000000..f2baf03b1b --- /dev/null +++ b/test/npu/test_mstx.py @@ -0,0 +1,67 @@ +import torch +import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests + + +class TestMstx(TestCase): + mark_msg = '' + range_msg = '' + range_id = 0 + + def setUp(self): + def stub_mark(message : str = ''): + self.mark_msg = message + + def stub_range_start_on_host(message : str) -> int: + self.range_msg = message + self.range_id += 1 + return self.range_id + + def stub_range_start(message : str, stream=None): + self.range_msg = message + self.range_id += 1 + return self.range_id + + def stub_range_end(range_id: int): + self.range_id = range_id + + torch_npu._C._mark = stub_mark + torch_npu._C._mstx._range_start = stub_range_start + torch_npu._C._mstx._range_start_on_host = stub_range_start_on_host + torch_npu._C._mstx._range_end = stub_range_end + + def test_mark(self): + torch_npu.npu.mstx.mark("test1") + self.assertEqual("test1", self.mark_msg) + torch_npu.npu.mstx().mark("test2") # Verify compatibility + self.assertEqual("test2", self.mark_msg) + + def test_range_start(self): + self.range_id = 0 + ret_id = torch_npu.npu.mstx.range_start("") + self.assertEqual(0, ret_id) + ret_id = torch_npu.npu.mstx.range_start("test1") + self.assertEqual(1, ret_id) + self.assertEqual("test1", self.range_msg) + ret_id = torch_npu.npu.mstx.range_start("test2", None) + self.assertEqual(2, ret_id) + self.assertEqual("test2", self.range_msg) + + torch.npu.set_device(0) + current_stream = torch.npu.current_stream() + ret_id = torch_npu.npu.mstx.range_start("test3", current_stream) + self.assertEqual(3, ret_id) + self.assertEqual("test3", self.range_msg) + ret_id = torch_npu.npu.mstx.range_start("test4", 'invalid_stream') + self.assertEqual(0, ret_id) + + def test_range_end(self): + self.range_id = 0 + torch_npu.npu.mstx.range_end('invalid_range_id') + self.assertEqual(0, self.range_id) + torch_npu.npu.mstx.range_end(1) + self.assertEqual(1, self.range_id) + + +if __name__ == '__main__': + run_tests() \ No newline at end of file diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index 
5a36b1ea4b..d23480e9a1 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -1212,7 +1212,7 @@ "signature": "()" }, "torch_npu.npu.mstx.mark": { - "signature": "(self, message: str = '')" + "signature": "(message: str = '')" }, "torch_npu.npu.preferred_linalg_library": { "signature": "(backend: Union[NoneType, str, torch._C._LinalgBackend] = None) -> torch._C._LinalgBackend" @@ -1566,7 +1566,7 @@ "signature": "()" }, "torch_npu.npu.mstx.mstx.mark": { - "signature": "(self, message: str = '')" + "signature": "(message: str = '')" }, "torch_npu.npu.npu_config.finalize_dump": { "signature": "()" diff --git a/torch_npu/csrc/profiler/mstx_mgr.cpp b/torch_npu/csrc/profiler/mstx_mgr.cpp index fac2b207ca..87c7af80fc 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.cpp +++ b/torch_npu/csrc/profiler/mstx_mgr.cpp @@ -5,6 +5,9 @@ #include "torch_npu/csrc/core/npu/npu_log.h" #include "torch_npu/csrc/framework/OpCommand.h" #include "torch_npu/csrc/profiler/profiler_mgr.h" +#include "torch_npu/csrc/toolkit/profiler/common/utils.h" + +#include namespace torch_npu { namespace profiler { @@ -90,9 +93,40 @@ int MstxMgr::getRangeId() return ptRangeId_++; } -bool MstxMgr::isMstxEnable() +bool MstxMgr::isProfTxEnable() { return ProfilerMgr::GetInstance()->GetNpuTrace().load() && ProfilerMgr::GetInstance()->GetMsprofTx().load(); } + +bool MstxMgr::isMsptiTxEnableImpl() +{ + bool ret = false; + const char* envVal = std::getenv("LD_PRELOAD"); + if (envVal == nullptr) { + return ret; + } + static const std::string soName = "libmspti.so"; + std::stringstream ss(envVal); + std::string path; + while (std::getline(ss, path, ':')) { + path = torch_npu::toolkit::profiler::Utils::RealPath(path); + if ((path.size() > soName.size()) && (path.substr(path.size() - soName.size()) == soName)) { + ret = true; + break; + } + } + return ret; +} + +bool MstxMgr::isMsptiTxEnable() +{ + static bool isEnable = isMsptiTxEnableImpl(); + return isEnable; +} + +bool MstxMgr::isMstxEnable() +{ + return isProfTxEnable() || isMsptiTxEnable(); +} } } \ No newline at end of file diff --git a/torch_npu/csrc/profiler/mstx_mgr.h b/torch_npu/csrc/profiler/mstx_mgr.h index cc91780ca0..883662cb4b 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.h +++ b/torch_npu/csrc/profiler/mstx_mgr.h @@ -26,6 +26,9 @@ private: explicit MstxMgr(MstxMgr &&obj) = delete; MstxMgr& operator=(MstxMgr &&obj) = delete; + bool isProfTxEnable(); + bool isMsptiTxEnable(); + bool isMsptiTxEnableImpl(); private: std::atomic ptRangeId_{1}; std::unordered_set ptRangeIdsWithStream_; diff --git a/torch_npu/npu/mstx.py b/torch_npu/npu/mstx.py index 2710d6aeec..0c33145b3a 100644 --- a/torch_npu/npu/mstx.py +++ b/torch_npu/npu/mstx.py @@ -17,7 +17,8 @@ import torch_npu._C class mstx: - def mark(self, message:str = ""): + @staticmethod + def mark(message:str = ""): torch_npu._C._mark(message) @staticmethod -- Gitee From 85eeba4f4c5d8cace22d7973b446154cfd9677de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Sat, 12 Oct 2024 01:23:20 +0000 Subject: [PATCH 71/96] =?UTF-8?q?!15311=20Update=20read=5Fidx=20only=20in?= =?UTF-8?q?=20Dequeue.=20Merge=20pull=20request=20!15311=20from=20?= =?UTF-8?q?=E7=8E=8B=E8=B6=85/v2.1.0-6.0.rc3=5Fforce?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUQueue.cpp | 14 +++++--------- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 7 ++----- torch_npu/csrc/distributed/ProcessGroupHCCL.hpp | 2 +- torch_npu/npu/_recovery.py | 3 +-- 4 files changed, 9 
insertions(+), 17 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 99b5d48e7d..39bb3514f1 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -255,7 +255,6 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) } if (GetStatus() == RepoStatus::STOP_EXIT) { - ClearQueue(); if (check_error) { ASCEND_LOGE("getRepoStopFlag in EmptyQueue, throw FORCE STOP."); throw std::runtime_error("FORCE STOP." + PTA_ERROR(ErrCode::ACL)); @@ -272,7 +271,6 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) PyEval_RestoreThread(gilState); } #endif - read_idx.idx = write_idx.idx; if (check_error) { throw std::runtime_error("The Inner error is reported as above. " @@ -305,7 +303,6 @@ bool Repository::WriteQueue(void* cur_paras) { if (type == c10_npu::queue::LAZY_DESTROY_EVENT) { return true; } else { - ClearQueue(); ASCEND_LOGE("getRepoStopFlag in WriteQueue, throw FORCE STOP."); throw std::runtime_error("FORCE STOP." + PTA_ERROR(ErrCode::ACL)); } @@ -363,10 +360,7 @@ bool Repository::ReadQueue() } else if (GetStatus() != STOP_EXIT) { SetStatus(ERROR_EXIT); } - read_idx.idx = write_idx.idx; - __sync_synchronize(); - eventfd_write(efd_empty, 1); - eventfd_write(efd_write, 1); + ClearQueue(); return false; } @@ -394,7 +388,6 @@ void Repository::Enqueue(void* cur_paras) { if (type == c10_npu::queue::LAZY_DESTROY_EVENT) { return; } - ClearQueue(); ASCEND_LOGE("getRepoStopFlag in Enqueue, throw FORCE STOP."); throw std::runtime_error("FORCE STOP." + PTA_ERROR(ErrCode::ACL)); } @@ -402,7 +395,6 @@ void Repository::Enqueue(void* cur_paras) { if (GetStatus() == RepoStatus::ERROR_EXIT) { // Avoid repeatedly throwing exceptions SetStatus(CAN_EXIT); - read_idx.idx = write_idx.idx; throw std::runtime_error("The Inner error is reported as above. 
" "The process exits for this inner error, and " + repo_error + ".\n" + @@ -491,6 +483,9 @@ void Repository::Dequeue() { SetReadWorking(true); while (ret == false && GetStatus() != RepoStatus::CAN_EXIT) { + if (GetStatus() == RepoStatus::STOP_EXIT) { + ClearQueue(); + } ret = ReadQueue(); if (ret == false) { if (GetStatus() == RepoStatus::NEED_EXIT) { @@ -566,6 +561,7 @@ void Repository::ReleaseResource() { void Repository::ClearQueue() { read_idx.idx = write_idx.idx; + __sync_synchronize(); eventfd_write(efd_empty, 1); eventfd_write(efd_write, 1); } diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 2351d727a1..704aca55e5 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -859,7 +859,6 @@ void ProcessGroupHCCL::workCleanupLoop() workMetaListCV_.wait_for(lock, std::chrono::milliseconds(kWatchdogThreadSleepMillis), [&]() -> bool { return terminateProcessGroup_.load(); }); if (watchdogStatus == WatchdogStatus::STOP) { - workMetaList_.clear(); continue; } @@ -1203,7 +1202,7 @@ bool ProcessGroupHCCL::createHCCLCommEx(const std::vector& devices, } std::shared_ptr globalHcclComm = nullptr; try { - globalHcclComm = global_->getHcclCommByRankid(devices); + globalHcclComm = global_->getHcclCommByDevices(devices); } catch (const std::exception& e) { ASCEND_LOGI("create the global HCCL Communicator failed, the exception info is %s.", e.what()); return false; @@ -1484,8 +1483,6 @@ void ProcessGroupHCCL::workEnqueue(c10::intrusive_ptr lock(workMetaListMutex_); - workMetaList_.clear(); return; } if (!terminateProcessGroup_.load()) { @@ -1505,7 +1502,7 @@ ProcessGroupHCCL::Options::Options(bool is_high_priority_stream) { } -std::shared_ptr ProcessGroupHCCL::getHcclCommByRankid(const std::vector& devices) +std::shared_ptr ProcessGroupHCCL::getHcclCommByDevices(const std::vector& devices) { const auto key = getKeyFromDevices(devices); auto& hcclComms = getHCCLComm(key, devices); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index 5961971721..f8cf3c3090 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -407,7 +407,7 @@ public: // may indicate that there is some sort of collective desynchronization. 
uint64_t getSequenceNumberForGroup() override; - std::shared_ptr getHcclCommByRankid(const std::vector& devices); + std::shared_ptr getHcclCommByDevices(const std::vector& devices); int64_t getHcclComm(int rankid); diff --git a/torch_npu/npu/_recovery.py b/torch_npu/npu/_recovery.py index 3203ce1594..e9238caa03 100644 --- a/torch_npu/npu/_recovery.py +++ b/torch_npu/npu/_recovery.py @@ -66,8 +66,8 @@ def restart_device(device_id: int, rebuild_all_resources: int = False): npu_device = torch.device('npu') for pg in _pg_map: if (npu_device in pg._device_types): - pg._get_backend(npu_device).set_watchdog_status(WATCHDOG_STATUS_RUN) pg._get_backend(npu_device).clear_workmeta_list() + pg._get_backend(npu_device).set_watchdog_status(WATCHDOG_STATUS_RUN) def stop_device(device_id): @@ -78,4 +78,3 @@ def stop_device(device_id): for pg in _pg_map: if (npu_device in pg._device_types): pg._get_backend(npu_device).set_watchdog_status(WATCHDOG_STATUS_STOP) - pg._get_backend(npu_device).clear_workmeta_list() -- Gitee From e0dedf889e45252cf1e938bbe0684b52efdffdc6 Mon Sep 17 00:00:00 2001 From: wangjie Date: Sat, 12 Oct 2024 07:20:40 +0000 Subject: [PATCH 72/96] !15245 [PROF] Profiler trace step table fix Merge pull request !15245 from wangjie/cherry-pick-1728471938 --- .../analysis/prof_view/_trace_step_time_parser.py | 8 +++----- .../prof_view/prof_db_parse/_step_info_db_parser.py | 9 +++------ 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py index d11208d9bf..f465cd97f8 100644 --- a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py @@ -79,13 +79,11 @@ class TraceStepTimeParser(BaseParser): def get_prepare_time(self, step, step_list): for cur_step in step_list: if cur_step[StepInfoIndex.ID.value] == step: - fwk_step_start_ts = cur_step[StepInfoIndex.FWK_START_TS.value] + first_task_start_ts = cur_step[StepInfoIndex.FIRST_TASK_TS.value] if step is None: first_fwk_op = FwkFileParser(self._profiler_path).get_first_fwk_op() - start_time = convert_ns2us_float(first_fwk_op.ts) if first_fwk_op else fwk_step_start_ts - else: - start_time = fwk_step_start_ts - return cur_step[StepInfoIndex.FIRST_TASK_TS.value] - start_time + return (first_task_start_ts - convert_ns2us_float(first_fwk_op.ts)) if first_fwk_op else 0 + return first_task_start_ts - cur_step[StepInfoIndex.FWK_START_TS.value] return 0 def create_step_file(self, output_path: str, json_str: list, file_name: str) -> None: diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py index 1905c3227b..fb8d6c980c 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py @@ -35,9 +35,7 @@ class StepInfoDbParser(BaseParser): try: self._db_path = deps_data.get(Constant.DB_PARSER, "") torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) - if not torch_op_node: - return Constant.SUCCESS, [] - step_range = self.get_step_range(torch_op_node[0]) + step_range = self.get_step_range(torch_op_node[0] if torch_op_node else None) except Exception: print_error_msg("Failed to get step info from db.") DbManager.destroy_db_connect(self.db_conn, self.db_curs) @@ -74,9 +72,8 @@ class StepInfoDbParser(BaseParser): def 
get_step_range(self, root_node: TorchOpNode) -> list: step_node_list = [] - for level1_node in root_node.child_node_list: - if level1_node.is_profiler_step(): - step_node_list.append(level1_node) + if root_node is not None: + step_node_list = [node for node in root_node.child_node_list if node.is_profiler_step()] conn, curs = DbManager.create_connect_db(self._db_path) if not (conn and curs): print_warn_msg(f"Failed to connect to db file: {self._db_path}") -- Gitee From 879346a2453f2b7b96079db796cb6a6ecdc340a2 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 14 Oct 2024 02:54:18 +0000 Subject: [PATCH 73/96] !15348 Update op_plugin commit id Merge pull request !15348 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 0da965ba04..919680509f 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 0da965ba0437b2171a8a87ddd2ce3115f0aa8dda +Subproject commit 919680509f869214d78b7b6d7a68c7d065394b68 -- Gitee From 1664ceeb08d6895efe02fd755cb35c566b818165 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E5=85=89=E6=B5=A9?= Date: Mon, 14 Oct 2024 11:05:08 +0000 Subject: [PATCH 74/96] =?UTF-8?q?!15355=20modify=20readme=20Merge=20pull?= =?UTF-8?q?=20request=20!15355=20from=20=E9=83=AD=E5=85=89=E6=B5=A9/v2.1.0?= =?UTF-8?q?-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 103 +++++++++++++++++++++++++++------------------------ README.zh.md | 102 +++++++++++++++++++++++++++----------------------- 2 files changed, 110 insertions(+), 95 deletions(-) diff --git a/README.md b/README.md index a6cbb5c10c..41304101e9 100644 --- a/README.md +++ b/README.md @@ -44,14 +44,17 @@ If the installation fails, use the download link or visit the [PyTorch official | x86 | Python3.8 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp38-cp38-linux_x86_64.whl#sha256=9e5cfd931a65b38d222755a45dabb53b836be31bc620532bc66fee77e3ff67dc) | | x86 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp39-cp39-linux_x86_64.whl#sha256=86cc28df491fa84738affe752f9870791026565342f69e4ab63e5b935f00a495) | | x86 | Python3.10 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=5077921fc2b54e69a534f3a9c0b98493c79a5547c49d46f5e77e42da3610e011) | +| x86 | Python3.11 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=5954924ce74bc7e6a6c811e3fa4bdda9936d9889f6369fd068420c444bfd1cae) | | aarch64 | Python3.8 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=761822761fffaa1c18a62c5deb13abaa780862577d3eadc428f1daa632536905) | | aarch64 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=de7d63c6ecece118684415a3dbd4805af4a4c1ee1490cccf7405d8c240a481b4) | | aarch64 | Python3.10 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=a04a0296d47f28960f51c18c5489a8c3472f624ec3b5bcc8e2096314df8c3342) | +| aarch64 | Python3.11 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=8132efb782cd181cc2dcca5e58effbe4217cdb2581206ac71466d535bf778867) | + 3. 
**Install torch-npu** ``` -pip3 install torch-npu==2.1.0.post6 +pip3 install torch-npu==2.1.0.post8 ``` ### From Source @@ -61,7 +64,7 @@ In some special scenarios, users may need to compile **torch-npu** by themselves 1. **Clone torch-npu** ``` - git clone https://github.com/ascend/pytorch.git -b v2.1.0-6.0.rc2 --depth 1 + git clone https://github.com/ascend/pytorch.git -b v2.1.0-6.0.rc3 --depth 1 ``` 2. **Build Docker Image** @@ -120,52 +123,56 @@ Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for m ## PyTorch and Python Version Matching Table | PyTorch Version | Python Version | -| ------------- | :----------------------------------------------------------- | -| PyTorch1.11.0 | Python3.7.x(>=3.7.5),Python3.8.x,Python3.9.x,Python3.10.x | -| PyTorch2.1.0 | Python3.8.x,Python3.9.x,Python3.10.x | -| PyTorch2.2.0 | Python3.8.x,Python3.9.x,Python3.10.x | -| PyTorch2.3.1 | Python3.8.x, Python3.9.x, Python3.10.x | - +|-----------------|:----------------------------------------------------------| +| PyTorch1.11.0 | Python3.7.x(>=3.7.5),Python3.8.x,Python3.9.x,Python3.10.x | +| PyTorch2.1.0 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | +| PyTorch2.2.0 | Python3.8.x,Python3.9.x,Python3.10.x | +| PyTorch2.3.1 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | +| PyTorch2.4.0 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | ## Ascend Auxiliary Software **PyTorch Extension** versions follow the naming convention `{PyTorch version}-{Ascend version}`, where the former represents the PyTorch version compatible with the **PyTorch Extension**, and the latter is used to match the CANN version. The detailed matching is as follows: -| CANN Version | Supported PyTorch Version | Supported Extension Version | Github Branch | AscendHub Image Version/Name([Link](https://ascendhub.huawei.com/#/detail/pytorch-modelzoo)) | -|----------------|--------------|-------------------|-------------------|----------------------| -| CANN 8.0.RC2 | 2.3.1 | 2.3.1 | v2.3.1-6.0.rc2 | - | -| | 2.2.0 | 2.2.0.post2 | v2.2.0-6.0.rc2 | - | -| | 2.1.0 | 2.1.0.post6 | v2.1.0-6.0.rc2 | - | -| | 1.11.0 | 1.11.0.post14 | v1.11.0-6.0.rc2 | - | -| CANN 8.0.RC1 | 2.2.0 | 2.2.0 | v2.2.0-6.0.rc1 | - | -| | 2.1.0 | 2.1.0.post3 | v2.1.0-6.0.rc1 | - | -| | 1.11.0 | 1.11.0.post11 | v1.11.0-6.0.rc1 | - | -| CANN 7.0.0 | 2.1.0 | 2.1.0 | v2.1.0-5.0.0 | - | -| | 2.0.1 | 2.0.1.post1 | v2.0.1-5.0.0 | - | -| | 1.11.0 | 1.11.0.post8 | v1.11.0-5.0.0 | - | -| CANN 7.0.RC1 | 2.1.0 | 2.1.0.rc1 | v2.1.0-5.0.rc3 | - | -| | 2.0.1 | 2.0.1 | v2.0.1-5.0.rc3 | - | -| | 1.11.0 | 1.11.0.post4 | v1.11.0-5.0.rc3 | - | -| CANN 6.3.RC3.1 | 1.11.0 | 1.11.0.post3 | v1.11.0-5.0.rc2.2 | - | -| CANN 6.3.RC3 | 1.11.0 | 1.11.0.post2 | v1.11.0-5.0.rc2.1 | - | -| CANN 6.3.RC2 | 2.0.1 | 2.0.1.rc1 | v2.0.1-5.0.rc2 | - | -| | 1.11.0 | 1.11.0.post1 | v1.11.0-5.0.rc2 | 23.0.RC1-1.11.0 | -| | 1.8.1 | 1.8.1.post2 | v1.8.1-5.0.rc2 | 23.0.RC1-1.8.1 | -| CANN 6.3.RC1 | 1.11.0 | 1.11.0 | v1.11.0-5.0.rc1 | - | -| | 1.8.1 | 1.8.1.post1 | v1.8.1-5.0.rc1 | - | -| CANN 6.0.1 | 1.5.0 | 1.5.0.post8 | v1.5.0-3.0.0 | 22.0.0 | -| | 1.8.1 | 1.8.1 | v1.8.1-3.0.0 | 22.0.0-1.8.1 | -| | 1.11.0 | 1.11.0.rc2(beta) | v1.11.0-3.0.0 | - | -| CANN 6.0.RC1 | 1.5.0 | 1.5.0.post7 | v1.5.0-3.0.rc3 | 22.0.RC3 | -| | 1.8.1 | 1.8.1.rc3 | v1.8.1-3.0.rc3 | 22.0.RC3-1.8.1 | -| | 1.11.0 | 1.11.0.rc1(beta) | v1.11.0-3.0.rc3 | - | -| CANN 5.1.RC2 | 1.5.0 | 1.5.0.post6 | v1.5.0-3.0.rc2 | 22.0.RC2 | -| | 1.8.1 | 1.8.1.rc2 | v1.8.1-3.0.rc2 | 22.0.RC2-1.8.1 | -| CANN 5.1.RC1 | 1.5.0 | 
1.5.0.post5 | v1.5.0-3.0.rc1 | 22.0.RC1 | -| | 1.8.1 | 1.8.1.rc1 | v1.8.1-3.0.rc1 | - | -| CANN 5.0.4 | 1.5.0 | 1.5.0.post4 | 2.0.4.tr5 | 21.0.4 | -| CANN 5.0.3 | 1.8.1 | 1.5.0.post3 | 2.0.3.tr5 | 21.0.3 | -| CANN 5.0.2 | 1.5.0 | 1.5.0.post2 | 2.0.2.tr5 | 21.0.2 | +| CANN Version | Supported PyTorch Version | Supported Extension Version | Github Branch | +|-----------------------|---------------------------|-----------------------------|-------------------| +| CANN 8.0.RC3 | 2.4.0 | 2.4.0 | v2.4.0-6.0.rc3 | +| | 2.3.1 | 2.3.1.post2 | v2.3.1-6.0.rc3 | +| | 2.1.0 | 2.1.0.post8 | v2.1.0-6.0.rc3 | +| CANN 8.0.RC2 | 2.3.1 | 2.3.1 | v2.3.1-6.0.rc2 | +| | 2.2.0 | 2.2.0.post2 | v2.2.0-6.0.rc2 | +| | 2.1.0 | 2.1.0.post6 | v2.1.0-6.0.rc2 | +| | 1.11.0 | 1.11.0.post14 | v1.11.0-6.0.rc2 | +| CANN 8.0.RC2.alpha002 | 2.3.1 | 2.3.1rc1 | v2.3.1 | +| CANN 8.0.RC1 | 2.2.0 | 2.2.0 | v2.2.0-6.0.rc1 | +| | 2.1.0 | 2.1.0.post4 | v2.1.0-6.0.rc1 | +| | 1.11.0 | 1.11.0.post11 | v1.11.0-6.0.rc1 | +| CANN 7.0.0 | 2.1.0 | 2.1.0 | v2.1.0-5.0.0 | +| | 2.0.1 | 2.0.1.post1 | v2.0.1-5.0.0 | +| | 1.11.0 | 1.11.0.post8 | v1.11.0-5.0.0 | +| CANN 7.0.RC1 | 2.1.0 | 2.1.0.rc1 | v2.1.0-5.0.rc3 | +| | 2.0.1 | 2.0.1 | v2.0.1-5.0.rc3 | +| | 1.11.0 | 1.11.0.post4 | v1.11.0-5.0.rc3 | +| CANN 6.3.RC3.1 | 1.11.0 | 1.11.0.post3 | v1.11.0-5.0.rc2.2 | +| CANN 6.3.RC3 | 1.11.0 | 1.11.0.post2 | v1.11.0-5.0.rc2.1 | +| CANN 6.3.RC2 | 2.0.1 | 2.0.1.rc1 | v2.0.1-5.0.rc2 | +| | 1.11.0 | 1.11.0.post1 | v1.11.0-5.0.rc2 | +| | 1.8.1 | 1.8.1.post2 | v1.8.1-5.0.rc2 | +| CANN 6.3.RC1 | 1.11.0 | 1.11.0 | v1.11.0-5.0.rc1 | +| | 1.8.1 | 1.8.1.post1 | v1.8.1-5.0.rc1 | +| CANN 6.0.1 | 1.5.0 | 1.5.0.post8 | v1.5.0-3.0.0 | +| | 1.8.1 | 1.8.1 | v1.8.1-3.0.0 | +| | 1.11.0 | 1.11.0.rc2(beta) | v1.11.0-3.0.0 | +| CANN 6.0.RC1 | 1.5.0 | 1.5.0.post7 | v1.5.0-3.0.rc3 | +| | 1.8.1 | 1.8.1.rc3 | v1.8.1-3.0.rc3 | +| | 1.11.0 | 1.11.0.rc1(beta) | v1.11.0-3.0.rc3 | +| CANN 5.1.RC2 | 1.5.0 | 1.5.0.post6 | v1.5.0-3.0.rc2 | +| | 1.8.1 | 1.8.1.rc2 | v1.8.1-3.0.rc2 | +| CANN 5.1.RC1 | 1.5.0 | 1.5.0.post5 | v1.5.0-3.0.rc1 | +| | 1.8.1 | 1.8.1.rc1 | v1.8.1-3.0.rc1 | +| CANN 5.0.4 | 1.5.0 | 1.5.0.post4 | 2.0.4.tr5 | +| CANN 5.0.3 | 1.8.1 | 1.5.0.post3 | 2.0.3.tr5 | +| CANN 5.0.2 | 1.5.0 | 1.5.0.post2 | 2.0.2.tr5 | ## Suggestions and Communication @@ -186,7 +193,7 @@ The version branches of AscendPyTorch have the following maintenance phases: | **PyTorch** | **Maintenance Policies** | **Status** | **Launch Date** | **Subsequent Status** | **EOL Date** | |-----------|--------------------|--------------|------------|-----------------|-----------| -| 2.4.0 | Planning | - | - | - | | +| 2.4.0 | Regular Release | Development | 2024/10/15 |Expected to enter maintenance status from March 15, 2025 | | | 2.3.1 | Regular Release | Development | 2024/06/06 | Expected to enter maintenance status from December 6, 2024 | | | 2.2.0 | Regular Release | Maintained | 2024/04/01 | Expected to enter maintenance free status from September 10th, 2025| | | 2.1.0 | Long Term Support | Development | 2023/10/15 | Expected to enter maintenance status from March 30, 2025 | | @@ -201,10 +208,10 @@ For more detailed information on installation guides, model migration, training/ | Document Name | Document Link | | -------------------------------- | ------------------------------------------------------------ | -| AscendPyTorch Installation Guide | [link](https://www.hiascend.com/document/detail/zh/Pytorch/60RC1/configandinstg/instg/insg_0001.html) | -| AscendPyTorch Network Model Migration and Training | 
[link](https://www.hiascend.com/document/detail/zh/Pytorch/60RC1/ptmoddevg/trainingmigrguide/PT_LMTMOG_0003.html) | -| AscendPyTorch Operator Adaptation | [link](https://www.hiascend.com/document/detail/zh/canncommercial/80RC1/developmentguide/opdevg/Ascendcopdevg/atlas_ascendc_10_0048.html) | -| AscendPyTorch API List (PyTorch and Custom Interfaces) | [link](https://www.hiascend.com/document/detail/zh/Pytorch/60RC1/apiref/apilist/ptaoplist_000002.html) | +| AscendPyTorch Installation Guide | [link](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/configandinstg/instg/insg_0001.html) | +| AscendPyTorch Network Model Migration and Training | [link](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/ptmoddevg/trainingmigrguide/PT_LMTMOG_0003.html) | +| AscendPyTorch Operator Adaptation | [link](https://www.hiascend.com/document/detail/zh/canncommercial/80RC3/developmentguide/opdevg/Ascendcopdevg/atlas_ascendc_10_0048.html) | +| AscendPyTorch API List (PyTorch and Custom Interfaces) | [link](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/apiref/apilist/ptaoplist_000002.html) | ## License diff --git a/README.zh.md b/README.zh.md index 91701847e7..2ab796b19e 100644 --- a/README.zh.md +++ b/README.zh.md @@ -35,9 +35,11 @@ pip3 install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu | x86 | Python3.8 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp38-cp38-linux_x86_64.whl#sha256=9e5cfd931a65b38d222755a45dabb53b836be31bc620532bc66fee77e3ff67dc) | | x86 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp39-cp39-linux_x86_64.whl#sha256=86cc28df491fa84738affe752f9870791026565342f69e4ab63e5b935f00a495) | | x86 | Python3.10 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=5077921fc2b54e69a534f3a9c0b98493c79a5547c49d46f5e77e42da3610e011) | +| x86 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=5954924ce74bc7e6a6c811e3fa4bdda9936d9889f6369fd068420c444bfd1cae) | | aarch64 | Python3.8 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=761822761fffaa1c18a62c5deb13abaa780862577d3eadc428f1daa632536905) | | aarch64 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=de7d63c6ecece118684415a3dbd4805af4a4c1ee1490cccf7405d8c240a481b4) | | aarch64 | Python3.10 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=a04a0296d47f28960f51c18c5489a8c3472f624ec3b5bcc8e2096314df8c3342) | +| aarch64 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=8132efb782cd181cc2dcca5e58effbe4217cdb2581206ac71466d535bf778867) | 2. **安装torch_npu依赖** @@ -51,7 +53,7 @@ pip3 install setuptools 3. **安装torch_npu** ``` -pip3 install torch-npu==2.1.0.post6 +pip3 install torch-npu==2.1.0.post8 ``` 如需要保存安装日志,可在pip3 install命令后面加上参数 `--log `,并对您指定的目录``做好权限管控。 @@ -62,7 +64,7 @@ pip3 install torch-npu==2.1.0.post6 1. **克隆torch_npu代码仓** ``` - git clone https://gitee.com/ascend/pytorch.git -b v2.1.0-6.0.rc2 --depth 1 + git clone https://gitee.com/ascend/pytorch.git -b v2.1.0-6.0.rc3 --depth 1 ``` 2. 
**构建镜像** @@ -128,53 +130,59 @@ print(z) ## PyTorch与Python版本配套表 -| PyTorch版本 | Python版本 | -| ------------- | :----------------------------------------------------------- | +## PyTorch与Python版本配套表 + +| PyTorch版本 | Python版本 | +|---------------|:-------------------------------------------------------------| | PyTorch1.11.0 | Python3.7.x(>=3.7.5), Python3.8.x, Python3.9.x, Python3.10.x | -| PyTorch2.1.0 | Python3.8.x, Python3.9.x, Python3.10.x | +| PyTorch2.1.0 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | | PyTorch2.2.0 | Python3.8.x, Python3.9.x, Python3.10.x | -| PyTorch2.3.1 | Python3.8.x, Python3.9.x, Python3.10.x | - +| PyTorch2.3.1 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | +| PyTorch2.4.0 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | ## 昇腾辅助软件 **PyTorch Extension**版本号采用`{PyTorch版本}-{昇腾版本}`命名规则,前者为**PyTorch Extension**匹配的PyTorch版本,后者用于匹配CANN版本,详细匹配如下: -| CANN版本 | 支持的PyTorch版本 | 支持的Extension版本 | Gitee分支 | AscendHub镜像版本/名称([链接](https://ascendhub.huawei.com/#/detail/pytorch-modelzoo)) | -|----------------|--------------|-------------------|-------------------|----------------------| -| CANN 8.0.RC2 | 2.3.1 | 2.3.1 | v2.3.1-6.0.rc2 | - | -| | 2.2.0 | 2.2.0.post2 | v2.2.0-6.0.rc2 | - | -| | 2.1.0 | 2.1.0.post6 | v2.1.0-6.0.rc2 | - | -| | 1.11.0 | 1.11.0.post14 | v1.11.0-6.0.rc2 | - | -| CANN 8.0.RC1 | 2.2.0 | 2.2.0 | v2.2.0-6.0.rc1 | - | -| | 2.1.0 | 2.1.0.post3 | v2.1.0-6.0.rc1 | - | -| | 1.11.0 | 1.11.0.post11 | v1.11.0-6.0.rc1 | - | -| CANN 7.0.0 | 2.1.0 | 2.1.0 | v2.1.0-5.0.0 | - | -| | 2.0.1 | 2.0.1.post1 | v2.0.1-5.0.0 | - | -| | 1.11.0 | 1.11.0.post8 | v1.11.0-5.0.0 | - | -| CANN 7.0.RC1 | 2.1.0 | 2.1.0.rc1 | v2.1.0-5.0.rc3 | - | -| | 2.0.1 | 2.0.1 | v2.0.1-5.0.rc3 | - | -| | 1.11.0 | 1.11.0.post4 | v1.11.0-5.0.rc3 | - | -| CANN 6.3.RC3.1 | 1.11.0 | 1.11.0.post3 | v1.11.0-5.0.rc2.2 | - | -| CANN 6.3.RC3 | 1.11.0 | 1.11.0.post2 | v1.11.0-5.0.rc2.1 | - | -| CANN 6.3.RC2 | 2.0.1 | 2.0.1.rc1 | v2.0.1-5.0.rc2 | - | -| | 1.11.0 | 1.11.0.post1 | v1.11.0-5.0.rc2 | 23.0.RC1-1.11.0 | -| | 1.8.1 | 1.8.1.post2 | v1.8.1-5.0.rc2 | 23.0.RC1-1.8.1 | -| CANN 6.3.RC1 | 1.11.0 | 1.11.0 | v1.11.0-5.0.rc1 | - | -| | 1.8.1 | 1.8.1.post1 | v1.8.1-5.0.rc1 | - | -| CANN 6.0.1 | 1.5.0 | 1.5.0.post8 | v1.5.0-3.0.0 | 22.0.0 | -| | 1.8.1 | 1.8.1 | v1.8.1-3.0.0 | 22.0.0-1.8.1 | -| | 1.11.0 | 1.11.0.rc2(beta) | v1.11.0-3.0.0 | - | -| CANN 6.0.RC1 | 1.5.0 | 1.5.0.post7 | v1.5.0-3.0.rc3 | 22.0.RC3 | -| | 1.8.1 | 1.8.1.rc3 | v1.8.1-3.0.rc3 | 22.0.RC3-1.8.1 | -| | 1.11.0 | 1.11.0.rc1(beta) | v1.11.0-3.0.rc3 | - | -| CANN 5.1.RC2 | 1.5.0 | 1.5.0.post6 | v1.5.0-3.0.rc2 | 22.0.RC2 | -| | 1.8.1 | 1.8.1.rc2 | v1.8.1-3.0.rc2 | 22.0.RC2-1.8.1 | -| CANN 5.1.RC1 | 1.5.0 | 1.5.0.post5 | v1.5.0-3.0.rc1 | 22.0.RC1 | -| | 1.8.1 | 1.8.1.rc1 | v1.8.1-3.0.rc1 | - | -| CANN 5.0.4 | 1.5.0 | 1.5.0.post4 | 2.0.4.tr5 | 21.0.4 | -| CANN 5.0.3 | 1.8.1 | 1.5.0.post3 | 2.0.3.tr5 | 21.0.3 | -| CANN 5.0.2 | 1.5.0 | 1.5.0.post2 | 2.0.2.tr5 | 21.0.2 | +| CANN版本 | 支持的PyTorch版本 | 支持的Extension版本 | Gitee分支 | +|-----------------------|--------------|------------------|-------------------| +| CANN 8.0.RC3 | 2.4.0 | 2.4.0 | v2.4.0-6.0.rc3 | +| | 2.3.1 | 2.3.1.post2 | v2.3.1-6.0.rc3 | +| | 2.1.0 | 2.1.0.post8 | v2.1.0-6.0.rc3 | +| CANN 8.0.RC2 | 2.3.1 | 2.3.1 | v2.3.1-6.0.rc2 | +| | 2.2.0 | 2.2.0.post2 | v2.2.0-6.0.rc2 | +| | 2.1.0 | 2.1.0.post6 | v2.1.0-6.0.rc2 | +| | 1.11.0 | 1.11.0.post14 | v1.11.0-6.0.rc2 | +| CANN 8.0.RC2.alpha002 | 2.3.1 | 2.3.1rc1 | v2.3.1 | +| CANN 8.0.RC1 | 2.2.0 | 2.2.0 | v2.2.0-6.0.rc1 | +| 
| 2.1.0 | 2.1.0.post4 | v2.1.0-6.0.rc1 | +| | 1.11.0 | 1.11.0.post11 | v1.11.0-6.0.rc1 | +| CANN 7.0.0 | 2.1.0 | 2.1.0 | v2.1.0-5.0.0 | +| | 2.0.1 | 2.0.1.post1 | v2.0.1-5.0.0 | +| | 1.11.0 | 1.11.0.post8 | v1.11.0-5.0.0 | +| CANN 7.0.RC1 | 2.1.0 | 2.1.0.rc1 | v2.1.0-5.0.rc3 | +| | 2.0.1 | 2.0.1 | v2.0.1-5.0.rc3 | +| | 1.11.0 | 1.11.0.post4 | v1.11.0-5.0.rc3 | +| CANN 6.3.RC3.1 | 1.11.0 | 1.11.0.post3 | v1.11.0-5.0.rc2.2 | +| CANN 6.3.RC3 | 1.11.0 | 1.11.0.post2 | v1.11.0-5.0.rc2.1 | +| CANN 6.3.RC2 | 2.0.1 | 2.0.1.rc1 | v2.0.1-5.0.rc2 | +| | 1.11.0 | 1.11.0.post1 | v1.11.0-5.0.rc2 | +| | 1.8.1 | 1.8.1.post2 | v1.8.1-5.0.rc2 | +| CANN 6.3.RC1 | 1.11.0 | 1.11.0 | v1.11.0-5.0.rc1 | +| | 1.8.1 | 1.8.1.post1 | v1.8.1-5.0.rc1 | +| CANN 6.0.1 | 1.5.0 | 1.5.0.post8 | v1.5.0-3.0.0 | +| | 1.8.1 | 1.8.1 | v1.8.1-3.0.0 | +| | 1.11.0 | 1.11.0.rc2(beta) | v1.11.0-3.0.0 | +| CANN 6.0.RC1 | 1.5.0 | 1.5.0.post7 | v1.5.0-3.0.rc3 | +| | 1.8.1 | 1.8.1.rc3 | v1.8.1-3.0.rc3 | +| | 1.11.0 | 1.11.0.rc1(beta) | v1.11.0-3.0.rc3 | +| CANN 5.1.RC2 | 1.5.0 | 1.5.0.post6 | v1.5.0-3.0.rc2 | +| | 1.8.1 | 1.8.1.rc2 | v1.8.1-3.0.rc2 | +| CANN 5.1.RC1 | 1.5.0 | 1.5.0.post5 | v1.5.0-3.0.rc1 | +| | 1.8.1 | 1.8.1.rc1 | v1.8.1-3.0.rc1 | +| CANN 5.0.4 | 1.5.0 | 1.5.0.post4 | 2.0.4.tr5 | +| CANN 5.0.3 | 1.8.1 | 1.5.0.post3 | 2.0.3.tr5 | +| CANN 5.0.2 | 1.5.0 | 1.5.0.post2 | 2.0.2.tr5 | ## 建议与交流 @@ -195,7 +203,7 @@ AscendPyTorch版本分支的维护阶段如下: | **PyTorch版本** | **维护策略** | **当前状态** | **发布时间** | **后续状态** | **EOL日期** | |-----------|-----------|--------|------------|-----------------------|-----------| -| 2.4.0 | 常规分支 | 计划 | - | - | - | | +| 2.4.0 | 常规分支 | 开发 | 2024/10/15 | 预计2025/03/15起进入维护状态 | - | | 2.3.1 | 常规分支 | 开发 | 2024/06/06 | 预计2024/12/06起进入维护状态 | | | 2.2.0 | 常规分支 | 维护 | 2024/04/01 | 预计2025/9/10起进入无维护状态 | | | 2.1.0 | 长期支持 | 开发 | 2023/10/15 | 预计2025/03/30起进入维护状态 | | @@ -215,10 +223,10 @@ AscendPyTorch版本分支的维护阶段如下: | 文档名称 | 文档链接 | | -------------------------- | ------------------------------------------------------------ | -| AscendPyTorch 安装指南 | [参考链接](https://www.hiascend.com/document/detail/zh/Pytorch/60RC1/configandinstg/instg/insg_0001.html) | -| AscendPyTorch 网络模型迁移和训练 | [参考链接](https://www.hiascend.com/document/detail/zh/Pytorch/60RC1/ptmoddevg/trainingmigrguide/PT_LMTMOG_0003.html) | -| AscendPyTorch 算子适配 | [参考链接](https://www.hiascend.com/document/detail/zh/canncommercial/80RC1/developmentguide/opdevg/Ascendcopdevg/atlas_ascendc_10_0048.html) | -| AscendPyTorch API清单(PyTorch原生接口与自定义接口) | [参考链接](https://www.hiascend.com/document/detail/zh/Pytorch/60RC1/apiref/apilist/ptaoplist_000002.html) | +| AscendPyTorch 安装指南 | [参考链接](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/configandinstg/instg/insg_0001.html) | +| AscendPyTorch 网络模型迁移和训练 | [参考链接](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/ptmoddevg/trainingmigrguide/PT_LMTMOG_0003.html) | +| AscendPyTorch 算子适配 | [参考链接](https://www.hiascend.com/document/detail/zh/canncommercial/80RC3/developmentguide/opdevg/Ascendcopdevg/atlas_ascendc_10_0048.html) | +| AscendPyTorch API清单(PyTorch原生接口与自定义接口) | [参考链接](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/apiref/apilist/ptaoplist_000002.html) | ## 许可证 -- Gitee From e57ee9803d1ed848cb3662886058c14053240727 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 14 Oct 2024 13:44:45 +0000 Subject: [PATCH 75/96] !15365 Update op_plugin commit id Merge pull request !15365 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/third_party/op-plugin b/third_party/op-plugin index 919680509f..ffd6a97674 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 919680509f869214d78b7b6d7a68c7d065394b68 +Subproject commit ffd6a976745747f4781bbf2231b6b8b254c46a0e -- Gitee From 30965319c1160b881c77577da611daec09c5c35b Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Tue, 15 Oct 2024 11:53:38 +0000 Subject: [PATCH 76/96] !15372 Update torchair commit id Merge pull request !15372 from torchair_robot/v2.1.0-6.0.rc3 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 820f0378f4..b79847a724 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 820f0378f4591707969e1aa55935cff7b823b155 +Subproject commit b79847a7243424badd59b18cccda3db7ad148c6c -- Gitee From f826e04a063da8ce885fddf56f69b6f41d717d93 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Wed, 16 Oct 2024 09:00:50 +0000 Subject: [PATCH 77/96] !15402 Update torchair commit id Merge pull request !15402 from torchair_robot/v2.1.0-6.0.rc3 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index b79847a724..dbf8c1fc68 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit b79847a7243424badd59b18cccda3db7ad148c6c +Subproject commit dbf8c1fc6855b53e374f332eb792999468011b12 -- Gitee From 80da101fd38c2ba50efecffa344d8990cf58e5d8 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Thu, 17 Oct 2024 07:51:37 +0000 Subject: [PATCH 78/96] !15423 Update torchair commit id Merge pull request !15423 from torchair_robot/v2.1.0-6.0.rc3 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index dbf8c1fc68..341bb795a6 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit dbf8c1fc6855b53e374f332eb792999468011b12 +Subproject commit 341bb795a69992114815f51ca9a51b99138ed20f -- Gitee From 147622e4252e9e6e9ee950d3cedf02f70f4ab53a Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Fri, 18 Oct 2024 06:30:37 +0000 Subject: [PATCH 79/96] !15411 update readme(add hardware support) Merge pull request !15411 from huangyunlong/2.1r3readme --- README.md | 20 ++++++++++++++++++++ README.zh.md | 21 +++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/README.md b/README.md index 41304101e9..90f968a4bc 100644 --- a/README.md +++ b/README.md @@ -174,6 +174,26 @@ Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for m | CANN 5.0.3 | 1.8.1 | 1.5.0.post3 | 2.0.3.tr5 | | CANN 5.0.2 | 1.5.0 | 1.5.0.post2 | 2.0.2.tr5 | +## Hardware support + +The Ascend training device includes the following models, all of which can be used as training environments for PyTorch models +| Product series | Product model | +|-----------------------|----------------------------------| +| Atlas Training series products | Atlas 800(model: 9000) | +| | Atlas 800(model:9010) | +| | Atlas 900 PoD(model:9000) | +| | Atlas 300T(model:9000) | +| | Atlas 300T Pro(model:9000) | +| Atlas A2 Training series products | Atlas 800T A2 | +| | Atlas 900 A2 PoD | +| | Atlas 200T A2 Box16 | +| | Atlas 300T A2 | + +The Ascend inference device includes the following models, all of 
which can be used as inference environments for large models +| Product series | Product model | +|-----------------------|----------------------------------| +| Atlas 800I A2 Inference product | Atlas 800I A2 | + ## Suggestions and Communication Everyone is welcome to contribute to the community. If you have any questions or suggestions, you can submit [Github Issues](https://github.com/Ascend/pytorch/issues). We will reply to you as soon as possible. Thank you very much. diff --git a/README.zh.md b/README.zh.md index 2ab796b19e..de86930a32 100644 --- a/README.zh.md +++ b/README.zh.md @@ -184,6 +184,27 @@ print(z) | CANN 5.0.3 | 1.8.1 | 1.5.0.post3 | 2.0.3.tr5 | | CANN 5.0.2 | 1.5.0 | 1.5.0.post2 | 2.0.2.tr5 | +## 硬件配套 + +昇腾训练设备包含以下型号,都可作为PyTorch模型的训练环境 +| 产品系列 | 产品型号 | +|-----------------------|----------------------------------| +| Atlas 训练系列产品 | Atlas 800 训练服务器(型号:9000) | +| | Atlas 800 训练服务器(型号:9010) | +| | Atlas 900 PoD(型号:9000) | +| | Atlas 300T 训练卡(型号:9000) | +| | Atlas 300T Pro 训练卡(型号:9000)| +| Atlas A2 训练系列产品 | Atlas 800T A2 训练服务器 | +| | Atlas 900 A2 PoD 集群基础单元 | +| | Atlas 200T A2 Box16 异构子框 | +| | Atlas 300T A2 训练卡 | + +昇腾推理设备包含以下型号,都可作为大模型的推理环境 +| 产品系列 | 产品型号 | +|-----------------------|----------------------------------| +| Atlas 800I A2推理产品 | Atlas 800I A2 推理服务器 | + + ## 建议与交流 欢迎大家为社区做贡献。如果有任何疑问或建议,请提交[gitee Issues](https://gitee.com/Ascend/pytorch/issues),我们会尽快回复。感谢您的支持。 -- Gitee From 6b77de56fff3d40c23f1d8f6b12feeab56530e5a Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 18 Oct 2024 13:43:28 +0000 Subject: [PATCH 80/96] !15455 Update op_plugin commit id Merge pull request !15455 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index ffd6a97674..070332e65b 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit ffd6a976745747f4781bbf2231b6b8b254c46a0e +Subproject commit 070332e65baff39923406c86d06eeb3e14047c6f -- Gitee From be57216f7d4295e3ef7a325a3cb21d0715256239 Mon Sep 17 00:00:00 2001 From: wangjie Date: Thu, 24 Oct 2024 12:45:04 +0000 Subject: [PATCH 81/96] !15496 [PROF] Profiler TraceStepTime table fix Merge pull request !15496 from wangjie/cherry-pick-1729677572 --- .../analysis/prof_common_func/_constant.py | 4 +- .../prof_view/_trace_step_time_parser.py | 2 +- .../_trace_step_time_db_parser.py | 49 ++++++++++++------- 3 files changed, 34 insertions(+), 21 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_common_func/_constant.py b/torch_npu/profiler/analysis/prof_common_func/_constant.py index 6cde5e6dcb..38493cc781 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_constant.py +++ b/torch_npu/profiler/analysis/prof_common_func/_constant.py @@ -280,8 +280,8 @@ class DbConstant(): TABLE_CANN_API = "CANN_API" # task table name TABLE_TASK = "TASK" - # communicate op table name - TABLE_COMMUNICATE_OP = "COMMUNICATE_OP" + # communication op table name + TABLE_COMMUNICATION_OP = "COMMUNICATION_OP" # compute task table name TABLE_COMPUTE_TASK_INFO = "COMPUTE_TASK_INFO" # communication task table name diff --git a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py index f465cd97f8..8cb1df91e3 100644 --- a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py @@ -48,7 +48,7 @@ class TraceStepTimeParser(BaseParser): start_time = 
float(start_time) duration = float(duration) for step in step_list: - if step[StepInfoIndex.START_TS.value] <= start_time <= step[StepInfoIndex.END_TS.value]: + if step[StepInfoIndex.START_TS.value] <= start_time < step[StepInfoIndex.END_TS.value]: cur_step = step[StepInfoIndex.ID.value] break for step in step_list: diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py index fd936e80bc..96eb06f802 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py @@ -14,6 +14,7 @@ # limitations under the License. import os +from enum import Enum from .._base_parser import BaseParser from ...prof_common_func._constant import Constant, print_error_msg, print_warn_msg from ...prof_common_func._constant import DbConstant, TableColumnsManager @@ -25,6 +26,12 @@ from ...prof_parse._fwk_file_parser import FwkFileParser __all__ = [] +class CommunicationOpIndex(Enum): + OP_NAME = 0 + START_NS = 1 + END_NS = 2 + + class TraceStepTimeDbParser(BaseParser): def __init__(self, name: str, param_dict: dict): @@ -32,7 +39,7 @@ class TraceStepTimeDbParser(BaseParser): self.step_range = [] self.string_id_map = {} self.compute_task_info = {} - self.communication_task_info = {} + self.communication_op_info = [] self.task_db_con = None self.task_db_curs = None self.analysis_db_con = None @@ -97,7 +104,8 @@ class TraceStepTimeDbParser(BaseParser): 'step': cur_step.get(Constant.STEP_ID), 'compute': 0, 'comunNotOverlp': 0, 'Overlp': 0, 'comun': 0, 'free': 0, 'stage': 0, 'bubble': 0, 'comunNotOverlpRec': 0, 'prepare': 0 } - origin_compute_data, origin_communication_data, bubble_data = self._get_task_data_in_step(cur_step) + origin_compute_data = self._get_compute_data_in_step(cur_step) + origin_communication_data, bubble_data = self._get_communication_data_in_step(cur_step) compute_data = RangeCaculator.merge_continuous_intervals(origin_compute_data) save_info['compute'] = sum(data.end_ts - data.start_ts for data in compute_data) communication_data = RangeCaculator.merge_continuous_intervals(origin_communication_data) @@ -125,7 +133,7 @@ class TraceStepTimeDbParser(BaseParser): def _init_step_range(self, deps_data: dict): self.step_range = deps_data.get(Constant.STEP_INFO_DB_PARSER, []) - + def _init_task_info_from_db(self): conn, curs = DbManager.create_connect_db(self.db_path) if not (conn and curs): @@ -141,28 +149,33 @@ class TraceStepTimeDbParser(BaseParser): sql = "select name, globalTaskId from {}".format(DbConstant.TABLE_COMPUTE_TASK_INFO) compute_task_data = DbManager.fetch_all_data(curs, sql) self.compute_task_info = {data[1]: data[0] for data in compute_task_data} - if DbManager.judge_table_exist(curs, DbConstant.TABLE_COMMUNICATION_TASK_INFO): - sql = "select name, globalTaskId from {}".format(DbConstant.TABLE_COMMUNICATION_TASK_INFO) - communication_task_data = DbManager.fetch_all_data(curs, sql) - self.communication_task_info = {data[1]: data[0] for data in communication_task_data} + if DbManager.judge_table_exist(curs, DbConstant.TABLE_COMMUNICATION_OP): + sql = "select opName, startNs, endNs from {}".format(DbConstant.TABLE_COMMUNICATION_OP) + self.communication_op_info = DbManager.fetch_all_data(curs, sql) DbManager.destroy_db_connect(conn, curs) - def _get_task_data_in_step(self, step_info): + def _get_compute_data_in_step(self, step_info): compute_data = [] - 
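The boundary change above (`<=` to `<` on the step end) makes step windows half-open, so an event that starts exactly on a step boundary is attributed to exactly one step instead of two. A small self-contained illustration (hypothetical helper, not profiler code):

```python
def step_of(start_ts, step_list):
    # step_list: (step_id, start_ns, end_ns) tuples; end is exclusive,
    # matching the half-open comparison in the hunk above.
    for step_id, start, end in step_list:
        if start <= start_ts < end:
            return step_id
    return None


steps = [(1, 0, 100), (2, 100, 200)]
assert step_of(99, steps) == 1
assert step_of(100, steps) == 2  # boundary event belongs to the next step only
```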
communication_data = [] - bubble_data = [] for task_id, task_info in step_info.get(Constant.TASK_INFO, {}).items(): if task_id in self.compute_task_info: compute_data.append( RangeCaculator.generate_time_range(task_info.get("startNs"), task_info.get("endNs"))) - if task_id in self.communication_task_info: - time_range = RangeCaculator.generate_time_range( - task_info.get("startNs"), task_info.get("endNs"), class_range=CommunicationTimeRange) - communication_data.append(time_range) - task_name = self.string_id_map.get(self.communication_task_info.get(task_id), '') - if task_name.startswith('hcom_receive'): - bubble_data.append(time_range) - return compute_data, communication_data, bubble_data + return compute_data + + def _get_communication_data_in_step(self, step_info): + communication_data = [] + bubble_data = [] + for op_info in self.communication_op_info: + op_start_time = op_info[CommunicationOpIndex.START_NS.value] + if not (step_info.get(Constant.START_TS) <= op_start_time < step_info.get(Constant.END_TS)): + continue + time_range = RangeCaculator.generate_time_range( + op_start_time, op_info[CommunicationOpIndex.END_NS.value], class_range=CommunicationTimeRange) + communication_data.append(time_range) + op_name = self.string_id_map.get(op_info[CommunicationOpIndex.OP_NAME.value], '') + if op_name.startswith('hcom_receive'): + bubble_data.append(time_range) + return communication_data, bubble_data def _get_first_device_task_ts(self, compute_task, communication_task): first_compute_task = compute_task[0] if compute_task else None -- Gitee From fae30301ba072711d8b2a35a2a99e04819ff9a42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=99=9E=E8=89=AF=E6=96=8C?= Date: Thu, 24 Oct 2024 13:03:41 +0000 Subject: [PATCH 82/96] =?UTF-8?q?!15460=20[fix]rectify=20the=20spelling=20?= =?UTF-8?q?error=20of=20words=20in=20screen=20logs=20when=20gc=20is=20coll?= =?UTF-8?q?ected=20using=20PyTorch=20API=20Merge=20pull=20request=20!15460?= =?UTF-8?q?=20from=20=E8=99=9E=E8=89=AF=E6=96=8C/bug=5Fv2.1.0=5F6.0.rc3=5F?= =?UTF-8?q?1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../_dynamic_profiler/_dynamic_profiler_monitor_shm.py | 8 +++++--- torch_npu/profiler/experimental_config.py | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py index 9284706d91..ec4f4429c9 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py @@ -9,6 +9,7 @@ import struct from datetime import datetime from ...utils.path_manager import PathManager +from ...utils._error_code import ErrCode, prof_error from ..analysis.prof_common_func._file_manager import FileManager from ._dynamic_profiler_log import logger @@ -84,9 +85,10 @@ class DynamicProfilerShareMemory: time_shm = os.stat(shm_path).st_ctime pid_time = self._get_pid_st_ctime(os.getpid()) eps = 60 - if pid_time - time_shm > eps: - logger.error("There maybe exist share memory before this task, if you kill last task, " - "dynamic profiler will not valid, please remove %s, and retry.", shm_path) + if pid_time is not None and pid_time - time_shm > eps: + raise RuntimeError(f"There may exist shared memory before this task. If you kill the last task, " + f"dynamic profiler will not be valid. Please remove: {shm_path}, and retry." 
+ + prof_error(ErrCode.VALUE)) from err def _create_prof_cfg(self): if not os.path.exists(self.config_path): diff --git a/torch_npu/profiler/experimental_config.py b/torch_npu/profiler/experimental_config.py index 2b3ff14563..2dc69aff4e 100644 --- a/torch_npu/profiler/experimental_config.py +++ b/torch_npu/profiler/experimental_config.py @@ -130,7 +130,7 @@ class _ExperimentalConfig: print_warn_msg("Invalid parameter op_attr, which must be of boolean type, reset it to False.") self._op_attr = False if self._export_type not in (ExportType.Text, ExportType.Db): - print_warn_msg("Invalid parameter type, reset it to text.") + print_warn_msg("Invalid parameter export_type, reset it to text.") self._export_type = ExportType.Text if self._op_attr and self._export_type != ExportType.Db: print_warn_msg("op_attr switch is invalid with export type set as text.") @@ -140,7 +140,7 @@ class _ExperimentalConfig: print_warn_msg("Parameter gc_detect_threshold is not int or float type, reset it to default.") self._gc_detect_threshold = None elif self._gc_detect_threshold < 0.0: - print_warn_msg("Parameter gc_detect_threshold can not be negetive, reset it to default.") + print_warn_msg("Parameter gc_detect_threshold can not be negative, reset it to default.") self._gc_detect_threshold = None elif self._gc_detect_threshold == 0.0: print_info_msg("Parameter gc_detect_threshold is set to 0, it will collect all gc events.") -- Gitee From fc0b97a170c099d2dcb3ea9608db333a6712237d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Thu, 24 Oct 2024 13:04:40 +0000 Subject: [PATCH 83/96] =?UTF-8?q?!15511=20Compatible=20old=20hccl=20versio?= =?UTF-8?q?n=20Merge=20pull=20request=20!15511=20from=20=E5=85=B3=E9=BE=99?= =?UTF-8?q?=E9=94=8B/cherry-pick-1729759095?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 704aca55e5..e8a9f5a283 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -209,6 +209,12 @@ void getP2PHcclCommCofig(HcclCommConfig* config) { HcclCommConfigInit(config); config->hcclBufferSize = c10_npu::option::OptionsManager::GetP2PBufferSize(); + // Compatible with the size check of the old version of HCCL, forcibly convert + // the config object to a size_t=32 object, and retain the N ± 2 version + if (!isHcclFeatureSupported(HcclCommConfigCapability::HCCL_COMM_CONFIG_COMM_NAME)) { + size_t *configSize = reinterpret_cast(config); + *configSize = 32; + } } void checkHcclCommConfigValid(const HcclCommConfig* config) -- Gitee From 1fa937ac8262dd378a5a538a02976d72a2ebbafd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A9=B9=E6=98=8A?= Date: Mon, 28 Oct 2024 09:26:11 +0000 Subject: [PATCH 84/96] =?UTF-8?q?!15544=20add=208.0.T37,8.0.T38,8.0.T39=20?= =?UTF-8?q?to=20foreach=20black=20list=20Merge=20pull=20request=20!15544?= =?UTF-8?q?=20from=20=E8=A9=B9=E6=98=8A/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/utils/_optim.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/utils/_optim.py b/torch_npu/utils/_optim.py index 0eaeb63a16..78cb1d7b05 100644 --- a/torch_npu/utils/_optim.py +++ b/torch_npu/utils/_optim.py @@ -7,8 +7,8 @@ from torch_npu.utils.collect_env import 
get_cann_version _device_name = None _cann_version = get_cann_version() _foreach_black_list_for_cann_starts_with = ['8.0.RC1', '8.0.RC2'] -_foreach_black_list_for_cann_all = ['not known', '8.0.T1', '8.0.T2', '8.0.T3', '8.0.T37', '8.0.T5', '8.0.T6', '8.0.T7', - '8.0.T8', '8.0.T10', '8.0.T13', '8.0.T16', '8.0.T50', '8.0.T51', '8.0.T52'] +_foreach_black_list_for_cann_all = ['not known', '8.0.T1', '8.0.T2', '8.0.T3', '8.0.T5', '8.0.T6', '8.0.T7', + '8.0.T8', '8.0.T10', '8.0.T13', '8.0.T16', '8.0.T37', '8.0.T38', '8.0.T39', '8.0.T50', '8.0.T51', '8.0.T52'] def patch_supported_devices(): -- Gitee From 21616d71e3096059a0cd129158ae93d28f312883 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Tue, 29 Oct 2024 08:35:59 +0000 Subject: [PATCH 85/96] !15578 Update torchair commit id Merge pull request !15578 from torchair_robot/v2.1.0-6.0.rc3 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 341bb795a6..549ff0f2bc 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 341bb795a69992114815f51ca9a51b99138ed20f +Subproject commit 549ff0f2bc5ff0308051043f56dfbbb9c8383529 -- Gitee From c3daabc9319c28d0646fa95f7ac73370db5f55b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Sat, 16 Nov 2024 03:35:17 +0000 Subject: [PATCH 86/96] =?UTF-8?q?!16018=20Implement=20recordDataPtrOnStrea?= =?UTF-8?q?m=20to=20ensure=20that=20cross-stream=20memory=20reuse=20is=20c?= =?UTF-8?q?orrect=20when=20backward.=20Merge=20pull=20request=20!16018=20f?= =?UTF-8?q?rom=20=E9=97=AB=E9=B9=8F=E5=85=A8/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/impl/NPUGuardImpl.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h index 1c3ba4e12e..4359db0136 100644 --- a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h +++ b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h @@ -5,6 +5,7 @@ #include #include "torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h" +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/core/npu/NPUStream.h" @@ -165,6 +166,12 @@ struct NPUGuardImpl final : public c10::impl::DeviceGuardImplInterface { NPU_CHECK_ERROR_WITHOUT_UCE(acl::AclQueryEventRecordedStatus(npu_event, &status)); return (status == acl::ACL_EVENT_RECORDED_STATUS_COMPLETE); } + + void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const c10::Stream& stream) const override + { + NPUStream npu_stream{stream}; + c10_npu::NPUCachingAllocator::recordStream(data_ptr, npu_stream); + } }; } // namespace impl -- Gitee From 16705c6d11ccdf0e59d5b348acca95efae48d6cd Mon Sep 17 00:00:00 2001 From: xudaohong Date: Sat, 23 Nov 2024 07:15:20 +0000 Subject: [PATCH 87/96] !16234 [feat] add optional arg offset for npu_prefetch Merge pull request !16234 from xudaohong/cherry-pick-1732195754 --- test/test_fake_tensor.py | 5 +++++ third_party/op-plugin | 2 +- third_party/torchair/torchair | 2 +- torch_npu/meta/_meta_registrations.py | 8 ++++++-- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py index c2ead43c03..a1f549a67f 100644 --- a/test/test_fake_tensor.py +++ b/test/test_fake_tensor.py @@ -1906,6 
+1906,11 @@ class TestNpuPrefetch(TestCase): exception = cm.exception self.assertEqual(str(exception), "The max_size should be greater than zero, but got -1.") + with self.assertRaises(RuntimeError) as cm: + torch_npu.npu_prefetch(input1, None, 10, -1) + exception = cm.exception + self.assertEqual(str(exception), "The offset should be nonnegative, but got -1.") + instantiate_parametrized_tests(FakeTensorTest) instantiate_device_type_tests(FakeTensorOpInfoTest, globals(), only_for="cpu") diff --git a/third_party/op-plugin b/third_party/op-plugin index 070332e65b..c518992967 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 070332e65baff39923406c86d06eeb3e14047c6f +Subproject commit c5189929673935f3d04414f15cc183b72fc16941 diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 549ff0f2bc..0389f1b30f 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 549ff0f2bc5ff0308051043f56dfbbb9c8383529 +Subproject commit 0389f1b30f50772f840e745b4c298017fe906e38 diff --git a/torch_npu/meta/_meta_registrations.py b/torch_npu/meta/_meta_registrations.py index 98cc126714..b99e9b6b98 100644 --- a/torch_npu/meta/_meta_registrations.py +++ b/torch_npu/meta/_meta_registrations.py @@ -946,8 +946,12 @@ has_side_effect(torch.ops.npu.npu_prefetch.default) @impl(m, "npu_prefetch") -def npu_prefetch_meta(self, dependency, max_size): +def npu_prefetch_meta(self, dependency, max_size, offset=0): torch._check( max_size > 0, lambda: f"The max_size should be greater than zero, but got {max_size}.", - ) \ No newline at end of file + ) + torch._check( + offset >= 0, + lambda: f"The offset should be nonnegative, but got {offset}.", + ) -- Gitee From 643e91be99c664379c5bd07d1a5ecffdce53b0b6 Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Fri, 29 Nov 2024 06:40:56 +0000 Subject: [PATCH 88/96] !16448 Release 6.0.RC3.1 Merge pull request !16448 from dilililiwhy/release_rc31_210 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6e44f86033..ec8a463b44 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ from wheel.bdist_wheel import bdist_wheel BASE_DIR = os.path.dirname(os.path.realpath(__file__)) THIRD_PARTY_PATH = os.path.join(BASE_DIR, "third_party") -VERSION = '2.1.0.post8' +VERSION = '2.1.0.post9' UNKNOWN = "Unknown" BUILD_PERMISSION = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP -- Gitee From 08e4654eb90d4b33b4b2fbc17f75f1ca905f2c4a Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 29 Nov 2024 09:58:50 +0000 Subject: [PATCH 89/96] !16478 Update op_plugin commit id Merge pull request !16478 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index c518992967..b99362e256 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit c5189929673935f3d04414f15cc183b72fc16941 +Subproject commit b99362e2563fb20b1512e89d618a5f5ce7f7e44c -- Gitee From 4ad973c47dd083fbe4409c98e61a7a1204577378 Mon Sep 17 00:00:00 2001 From: liyou_b <2953090824@qq.com> Date: Thu, 5 Dec 2024 02:24:41 +0000 Subject: [PATCH 90/96] =?UTF-8?q?!16614=20=E3=80=90PROF=E3=80=91=E3=80=90B?= =?UTF-8?q?UG=E3=80=91V2.1.0-6.0.0rc3:=20add=20start=20step=20for=20dynami?= =?UTF-8?q?c=20profiling=20Merge=20pull=20request=20!16614=20from=20liyou?= =?UTF-8?q?=5Fb/bug=5Ffixed=5F6.0rc3=5F210?= MIME-Version: 1.0 
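From the caller's side, the optional offset argument added in the npu_prefetch patch above is validated the same way as max_size; a short usage sketch (assumes a torch_npu build containing that patch and, as in the test, tensors that dispatch to the meta registration):

```python
import torch
import torch_npu  # assumed: a build that includes the offset argument

x = torch.randn(16, device="meta")       # meta tensors exercise the meta fn
torch_npu.npu_prefetch(x, None, 10)      # offset defaults to 0
torch_npu.npu_prefetch(x, None, 10, 4)   # any nonnegative offset is accepted

try:
    torch_npu.npu_prefetch(x, None, 10, -1)
except RuntimeError as e:
    print(e)  # "The offset should be nonnegative, but got -1."
```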
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/profiler/test_dynamic_profiler.py | 11 ++++++ .../profiler/_dynamic_profiler/__init__.py | 5 +-- .../_dynamic_profiler_config_context.py | 34 +++++++++---------- .../_dynamic_profiler_monitor.py | 6 ++-- .../_dynamic_profiler_monitor_shm.py | 5 +-- ...iler_log.py => _dynamic_profiler_utils.py} | 31 +++++++++++++++-- torch_npu/profiler/dynamic_profile.py | 16 ++++----- 7 files changed, 70 insertions(+), 38 deletions(-) rename torch_npu/profiler/_dynamic_profiler/{_dynamic_profiler_log.py => _dynamic_profiler_utils.py} (51%) diff --git a/test/profiler/test_dynamic_profiler.py b/test/profiler/test_dynamic_profiler.py index d8a7e8cb20..0b8b3729a8 100644 --- a/test/profiler/test_dynamic_profiler.py +++ b/test/profiler/test_dynamic_profiler.py @@ -55,6 +55,7 @@ class TestDynamicProfiler(TestCase): large_steps = 5 flags = os.O_WRONLY mode = stat.S_IRUSR | stat.S_IWUSR + start_step = 0 @classmethod def setUpClass(cls): @@ -67,6 +68,7 @@ class TestDynamicProfiler(TestCase): cls.active_rank_prof_dir = os.path.join(cls.results_path, "active_rank_prof_dir") cls.cfg_prof_dir = os.path.join(cls.results_path, "cfg_prof_dir") cls.cfg_path = os.path.join(cls.results_path, "profiler_config.json") + os.environ["RANK"] = "0" dp.init(cls.results_path) @classmethod @@ -451,13 +453,16 @@ class TestDynamicProfiler(TestCase): def test_dynamic_profiler_default(self): cfg_json = copy.deepcopy(self.json_sample) cfg_json['prof_dir'] = self.default_prof_dir + cfg_json['start_step'] = TestDynamicProfiler.start_step + 1 with os.fdopen(os.open(self.cfg_path, self.flags, self.mode), 'w') as f: time.sleep(1) json.dump(cfg_json, f, indent=4) time.sleep(3) dp.step() + TestDynamicProfiler.start_step += 1 self.model_train.train_one_step() dp.step() + TestDynamicProfiler.start_step += 1 has_prof = False if self.has_prof_dir(self.default_prof_dir): has_prof = True @@ -470,14 +475,17 @@ class TestDynamicProfiler(TestCase): cfg_json['prof_dir'] = self.rank_prof_dir cfg_json['is_rank'] = True cfg_json['rank_list'] = [0] + cfg_json['start_step'] = TestDynamicProfiler.start_step + 1 with os.fdopen(os.open(self.cfg_path, self.flags, self.mode), 'w') as f: time.sleep(1) json.dump(cfg_json, f, indent=4) time.sleep(3) dp.step() + TestDynamicProfiler.start_step += 1 self.model_train.train_one_step() dp.step() + TestDynamicProfiler.start_step += 1 has_prof = False if self.has_prof_dir(self.rank_prof_dir): has_prof = True @@ -490,14 +498,17 @@ class TestDynamicProfiler(TestCase): cfg_json['prof_dir'] = self.invalid_rank_prof_dir cfg_json['is_rank'] = True cfg_json['rank_list'] = [1] + cfg_json['start_step'] = TestDynamicProfiler.start_step + 1 with os.fdopen(os.open(self.cfg_path, self.flags, self.mode), 'w') as f: time.sleep(1) json.dump(cfg_json, f, indent=4) time.sleep(3) dp.step() + TestDynamicProfiler.start_step += 1 self.model_train.train_one_step() dp.step() + TestDynamicProfiler.start_step += 1 has_prof = False if self.has_prof_dir(self.invalid_rank_prof_dir): has_prof = True diff --git a/torch_npu/profiler/_dynamic_profiler/__init__.py b/torch_npu/profiler/_dynamic_profiler/__init__.py index 23852dd596..a9a2c5b3bb 100644 --- a/torch_npu/profiler/_dynamic_profiler/__init__.py +++ b/torch_npu/profiler/_dynamic_profiler/__init__.py @@ -1,4 +1 @@ -__all__ = ['logger', 'DynamicProfilerMonitor', 'init_logger'] - -from ._dynamic_profiler_log import logger, init_logger -from ._dynamic_profiler_monitor import DynamicProfilerMonitor +__all__ = [] diff --git 
a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py index dc12b47cf2..a2df15718e 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py @@ -1,13 +1,12 @@ -import os import json -import torch from torch_npu._C._profiler import ProfilerActivity from ..experimental_config import _ExperimentalConfig, ProfilerLevel, AiCMetrics -from ._dynamic_profiler_log import logger +from ._dynamic_profiler_utils import logger, _get_rank_id class ConfigContext: DEFAULT_ACTIVE_NUM = 1 + DEFAULT_START_STEP = 0 def __init__(self, json_data: dict): self.activity_set = set() @@ -22,9 +21,10 @@ class ConfigContext: self.rank_set = set() self.experimental_config = None self._active = 1 + self._start_step = 0 self.is_valid = False self._meta_data = {} - self._rank_id = self.get_rank_id() + self._rank_id = _get_rank_id() self.parse(json_data) def parse(self, json_data: dict): @@ -44,6 +44,12 @@ class ConfigContext: self.with_flops = json_data.get('with_flops', False) self.with_modules = json_data.get('with_modules', False) self._active = json_data.get('active', self.DEFAULT_ACTIVE_NUM) + self._start_step = json_data.get("start_step", self.DEFAULT_START_STEP) + if not isinstance(self._start_step, int) or self._start_step < 0: + logger.info(f"Start step is not valid, will be reset to {self.DEFAULT_START_STEP}.") + self._start_step = self.DEFAULT_START_STEP + else: + logger.info(f"Start step will be set to {self._start_step}.") exp_config = json_data.get('experimental_config') if not exp_config: self.experimental_config = None @@ -86,7 +92,7 @@ class ConfigContext: logger.warning("Set rank_list failed, rank_list must be list!") return for rank in ranks: - if isinstance(rank, int): + if isinstance(rank, int) and rank >= 0: self.rank_set.add(rank) def valid(self) -> bool: @@ -139,6 +145,9 @@ class ConfigContext: return self.DEFAULT_ACTIVE_NUM return self._active + def start_step(self) -> int: + return self._start_step + def experimental_config(self) -> _ExperimentalConfig: return self.experimental_config @@ -154,16 +163,5 @@ class ConfigContext: cfg_json = json.loads(cfg_json_str) return cfg_json - @staticmethod - def get_rank_id() -> int: - try: - rank_id = os.environ.get('RANK') - if rank_id is None and torch.distributed.is_available() and torch.distributed.is_initialized(): - rank_id = torch.distributed.get_rank() - if not isinstance(rank_id, int): - rank_id = int(rank_id) - except Exception as ex: - logger.warning("Get rank id %s, rank_id will be set to 0 !", str(ex)) - rank_id = 0 - - return rank_id + + diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py index 59ba639de7..c0703d517f 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py @@ -6,7 +6,7 @@ import json import struct import multiprocessing -from ._dynamic_profiler_log import logger, logger_monitor, init_logger +from ._dynamic_profiler_utils import logger, logger_monitor, init_logger, _get_rank_id from ._dynamic_profiler_config_context import ConfigContext from ._dynamic_profiler_monitor_shm import DynamicProfilerShareMemory @@ -19,7 +19,7 @@ class DynamicProfilerMonitor: poll_interval: int = 2 ): self._path = path - self._rank_id = ConfigContext.get_rank_id() + 
self._rank_id = _get_rank_id() self._buffer_size = buffer_size self._monitor_process = None self.prof_cfg_context = None @@ -110,7 +110,7 @@ def worker_func(params_dict): file_stat_time = params_dict.get("file_stat_time") mmap_path = params_dict.get("mmap_path") is_mmap = params_dict.get("is_mmap") - init_logger(logger_monitor, os.path.dirname(cfg_path), True) + init_logger(logger_monitor, os.path.dirname(cfg_path), is_monitor_process=True) mmap_obj = None if is_mmap and mmap_path is not None: diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py index ec4f4429c9..944c115f44 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py @@ -11,7 +11,7 @@ from datetime import datetime from ...utils.path_manager import PathManager from ...utils._error_code import ErrCode, prof_error from ..analysis.prof_common_func._file_manager import FileManager -from ._dynamic_profiler_log import logger +from ._dynamic_profiler_utils import logger class DynamicProfilerShareMemory: @@ -25,6 +25,7 @@ class DynamicProfilerShareMemory: "with_flops": False, "with_modules": False, "active": 1, + "start_step": 0, "is_rank": False, "rank_list": [], "experimental_config": { @@ -88,7 +89,7 @@ class DynamicProfilerShareMemory: if pid_time is not None and pid_time - time_shm > eps: raise RuntimeError(f"There may exist shared memory before this task. If you kill the last task, " f"dynamic profiler will not be valid. Please remove: {shm_path}, and retry." + - prof_error(ErrCode.VALUE)) from err + prof_error(ErrCode.VALUE)) def _create_prof_cfg(self): if not os.path.exists(self.config_path): diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_log.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_utils.py similarity index 51% rename from torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_log.py rename to torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_utils.py index 77a620c05b..5f21003f94 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_log.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_utils.py @@ -2,18 +2,19 @@ import os import socket import logging from logging.handlers import RotatingFileHandler +import torch from ...utils.path_manager import PathManager logger = logging.getLogger("DynamicProfiler") logger_monitor = logging.getLogger("DynamicProfilerMonitor") -def init_logger(logger_: logging.Logger, path: str, is_monitor_process=False): +def init_logger(logger_: logging.Logger, path: str, is_monitor_process: bool = False): path = os.path.join(path, 'log') if not os.path.exists(path): PathManager.make_dir_safety(path) worker_name = "{}".format(socket.gethostname()) - log_name = "dp_{}_{}.log".format(worker_name, os.getpid()) + log_name = "dp_{}_{}_rank_{}.log".format(worker_name, os.getpid(), _get_rank_id()) if is_monitor_process: log_name = "monitor_" + log_name log_file = os.path.join(path, log_name) @@ -24,3 +25,29 @@ def init_logger(logger_: logging.Logger, path: str, is_monitor_process=False): handler.setFormatter(formatter) logger_.setLevel(logging.DEBUG) logger_.addHandler(handler) + + +def _get_rank_id() -> int: + try: + rank_id = os.environ.get('RANK') + if rank_id is None and torch.distributed.is_available() and torch.distributed.is_initialized(): + rank_id = torch.distributed.get_rank() + if not isinstance(rank_id, int): + rank_id = 
int(rank_id) + except Exception as ex: + logger.warning("Get rank id %s, rank_id will be set to -1 !", str(ex)) + rank_id = -1 + + return rank_id + + +def _get_device_id() -> int: + try: + device_id = os.environ.get('LOCAL_RANK') + if not isinstance(device_id, int): + device_id = int(device_id) + except Exception as ex: + logger.warning("Get device id %s, device_id will be set to -1 !", str(ex)) + device_id = -1 + + return device_id diff --git a/torch_npu/profiler/dynamic_profile.py b/torch_npu/profiler/dynamic_profile.py index 1a8d21dec1..c0fea38e9b 100644 --- a/torch_npu/profiler/dynamic_profile.py +++ b/torch_npu/profiler/dynamic_profile.py @@ -13,7 +13,8 @@ from .analysis.prof_common_func._constant import print_warn_msg from .analysis.prof_common_func._constant import print_error_msg from .analysis.prof_common_func._utils import no_exception_func from .analysis.prof_common_func._file_manager import FileManager -from ._dynamic_profiler import logger, init_logger, DynamicProfilerMonitor +from ._dynamic_profiler._dynamic_profiler_utils import logger, init_logger +from ._dynamic_profiler._dynamic_profiler_monitor import DynamicProfilerMonitor from ._dynamic_profiler._dynamic_profiler_config_context import ConfigContext __all__ = [ @@ -58,13 +59,13 @@ class _DynamicProfile: def _dynamic_profiler_valid(self): prof_cfg_ctx = self._dynamic_monitor.shm_to_prof_conf_context() - if prof_cfg_ctx is None: - return None - else: - return prof_cfg_ctx + return prof_cfg_ctx def step(self): self.cur_step += 1 + cfg_ctx = self._dynamic_profiler_valid() + if cfg_ctx is not None: + self.cfg_ctx = cfg_ctx if self.cur_step == self.RECORD_TIME_STEP: self._step_record_time = time.time() elif self.cur_step - self.RECORD_TIME_STEP == 1: @@ -77,10 +78,7 @@ class _DynamicProfile: self.prof.stop() self.prof = None logger.info(f"Stop Dynamic Profiler at {self.cur_step} step.") - elif self.prof is None: - self.cfg_ctx = self._dynamic_profiler_valid() - if self.cfg_ctx is None: - return + elif self.prof is None and self.cfg_ctx is not None and self.cur_step == self.cfg_ctx.start_step(): self.step_num = self.cfg_ctx.active() self.enable_prof() self.cfg_ctx = None -- Gitee From c37b72f7ea16b0b102f0e62983dbfb2bf5b8de05 Mon Sep 17 00:00:00 2001 From: shaojieMike Date: Thu, 7 Nov 2024 01:18:39 +0000 Subject: [PATCH 91/96] !15717 Support fine-grained and custom CPU binding Merge pull request !15717 from shaojieMike/v2.1.0_PR_bindcore --- .../csrc/core/npu/NPUAffinityController.cpp | 291 ++++++++++++++++++ .../csrc/core/npu/NPUAffinityController.h | 35 +++ torch_npu/csrc/core/npu/NPUFunctions.cpp | 34 -- torch_npu/csrc/core/npu/NPUQueue.cpp | 32 +- torch_npu/csrc/core/npu/NPUQueue.h | 4 +- torch_npu/csrc/core/npu/impl/NPUGuardImpl.h | 2 + .../csrc/core/npu/register/OptionsManager.cpp | 9 +- .../csrc/core/npu/register/OptionsManager.h | 2 +- .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 5 + .../csrc/distributed/ProcessGroupHCCL.cpp | 7 +- torch_npu/csrc/npu/Module.cpp | 25 ++ torch_npu/utils/_module.py | 2 + 12 files changed, 391 insertions(+), 57 deletions(-) create mode 100644 torch_npu/csrc/core/npu/NPUAffinityController.cpp create mode 100644 torch_npu/csrc/core/npu/NPUAffinityController.h diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp new file mode 100644 index 0000000000..e7beafecd4 --- /dev/null +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -0,0 +1,291 @@ + +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" +#include 
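With start_step in place, delayed collection is driven purely by the JSON config the monitor watches; a minimal sketch mirroring the updated test (field values and paths are illustrative, and unspecified fields fall back to the shared-memory template defaults shown earlier):

```python
import json
import os
import stat

cfg = {
    "prof_dir": "./prof_result",  # illustrative output directory
    "active": 1,                  # profile one step once started
    "start_step": 10,             # start when the dp.step() count reaches 10
}
flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
mode = stat.S_IRUSR | stat.S_IWUSR
with os.fdopen(os.open("profiler_config.json", flags, mode), "w") as f:
    json.dump(cfg, f, indent=4)
# After dp.init(<results_path>), each dp.step() advances cur_step, and the
# profiler is enabled at the step equal to start_step, for `active` steps.
```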
"torch_npu/csrc/core/npu/NPUFunctions.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10_npu { + + static pthread_t mainthread_tid; + + const std::unordered_map threadTypeToNameMap = { + {releaseThread, "release_thread"}, + {aclThread, "acl_thread"}, + {mainThread, "main_thread"}, + {hcclCommWatchdogThread, "hcclComm_watchd"}, // thread name no more than 15 chars + {backwardThread, "backward_thread"}}; + + const std::unordered_map threadNameToTypeMap = { + {"release_thread", releaseThread}, + {"acl_thread", aclThread}, + {"main_thread", mainThread}, + {"hcclComm_watchd", hcclCommWatchdogThread}, + {"backward_thread", backwardThread}}; + + void RecordMainThreadTid() + { + mainthread_tid = pthread_self(); + } + + ThreadType getCurrentThreadType() + { + char thread_name[16]; + + if (prctl(PR_GET_NAME, thread_name, 0, 0, 0) == 0) { + std::string name(thread_name); + + auto it = threadNameToTypeMap.find(name); + if (it != threadNameToTypeMap.end()) { + return it->second; + } + } + return ThreadType::unknownThread; + } + + aclError SetThreadAffinity(coreIdRange core_range, pthread_t thread) + { + cpu_set_t mask; + CPU_ZERO(&mask); + + for (auto i = core_range.start; i <= core_range.end; i++) { + CPU_SET(i, &mask); + } + if (!pthread_setaffinity_np(thread, sizeof(mask), &mask)) { + ASCEND_LOGD("Set Thread Affinity to %d-%d", core_range.start, core_range.end); + return ACL_ERROR_NONE; + } + return ACL_ERROR_FEATURE_UNSUPPORTED; + } + + coreIdRange GetCPUDefaultRange(c10::DeviceIndex device_id) + { + int core_nums = sysconf(_SC_NPROCESSORS_ONLN); + int device_nums = device_count_ensure_non_zero(); + int block_size = (core_nums > 0 && device_nums > 0) ? (core_nums + device_nums - 1) / device_nums : 0; + return coreIdRange{static_cast(device_id * block_size), + static_cast(std::min((device_id + 1) * block_size, core_nums) - 1)}; + } + + inline bool has_set_pthread_affinity() + { + unsigned int core_nums = static_cast(sysconf(_SC_NPROCESSORS_ONLN)); + + cpu_set_t mask; + pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask); + for (unsigned int i = 0; i < core_nums; i++) { + if (!CPU_ISSET(i, &mask)) { + return true; + } + } + return false; + } + + std::string GetAffinityMapAsString(const std::unordered_map &threadToCoreidMap, c10::DeviceIndex device_id) + { + std::ostringstream oss; + oss << "threadToCoreidMap plan to bind device " << static_cast(device_id) << " to " + << " [" << threadToCoreidMap.at(unknownThread).start << "," << threadToCoreidMap.at(unknownThread).end << "]、" + << " [" << threadToCoreidMap.at(mainThread).start << "," << threadToCoreidMap.at(mainThread).end << "]、" + << " [" << threadToCoreidMap.at(backwardThread).start << "," << threadToCoreidMap.at(backwardThread).end << "]、" + << " [" << threadToCoreidMap.at(aclThread).start << "," << threadToCoreidMap.at(aclThread).end << "]、" + << " [" << threadToCoreidMap.at(releaseThread).start << "," << threadToCoreidMap.at(releaseThread).end << "]、" + << " [" << threadToCoreidMap.at(hcclCommWatchdogThread).start << "," << threadToCoreidMap.at(hcclCommWatchdogThread).end << "]"; + + return oss.str(); + } + + std::unordered_map GetCpuAffinityMap(c10::DeviceIndex device_id) + { + std::unordered_map threadToCoreidMap; + std::initializer_list thread_types = {unknownThread, mainThread, backwardThread, aclThread, + releaseThread, hcclCommWatchdogThread}; + + coreIdRange current_core_range = GetCPUDefaultRange(device_id); + coreId offset = current_core_range.start; + + // calculate 
env2 default map + coreId core_nums = current_core_range.end - current_core_range.start; + if (core_nums < thread_types.size()) { + ASCEND_LOGW("Available core numbers (%d) are insufficient for all %zu thread types. Binding available cores to all threads.", + core_nums, thread_types.size()); + for (auto thread_type : thread_types) { + threadToCoreidMap[thread_type] = current_core_range; + } + } else { + int remaining_type_count = thread_types.size() - 1; + int i = 0; + for (auto thread_type : thread_types) { + if (thread_type == ThreadType::unknownThread) { + threadToCoreidMap[ThreadType::unknownThread] = coreIdRange{current_core_range.start + remaining_type_count, current_core_range.end}; + } else { + threadToCoreidMap[thread_type] = coreIdRange{offset + i, offset + (i++)}; + } + } + } + + ASCEND_LOGD("Thread affinity map for device %d: %s", device_id, GetAffinityMapAsString(threadToCoreidMap, device_id).c_str()); + + return threadToCoreidMap; + } + + aclError SetThreadAffinity(c10::DeviceIndex device_id) + { + return SetThreadAffinity(device_id, getCurrentThreadType()); + } + + void printCoreRanges(const std::vector &ranges, uint32_t mode) + { + std::ostringstream oss; + oss << "Mode: " << mode << " "; + + for (size_t i = 0; i < ranges.size(); ++i) { + oss << "Device " << i << " Core Range: " << ranges[i].start << " - " << ranges[i].end << " "; + } + + ASCEND_LOGD("Core ranges: %s", oss.str().c_str()); + } + + bool isAllDigits(const std::string &str) + { + if (str.empty()) { + return false; + } + return std::all_of(str.begin(), str.end(), [](unsigned char c) { + return std::isdigit(c); + }); + } + + void parseCPUAffinityConf(uint32_t &mode, std::vector &ranges) + { + const char *input = c10_npu::option::OptionsManager::GetCpuAffinityConf(); + + if (input == nullptr || strlen(input) == 0) { + mode = 0; + return; + } + + mode = 0; + int device_nums = device_count_ensure_non_zero(); + ranges.clear(); + ranges.resize(device_nums); + + // init + for (int i = 0; i < device_nums; ++i) { + ranges[i] = GetCPUDefaultRange(i); + } + + std::string inputStr(input); + std::istringstream stream(inputStr); + std::string option; + + // Handle cases where only `mode` is provided, or `mode:` without value + if (isAllDigits(inputStr)) { + mode = static_cast(std::stoi(inputStr)); + return; // Return directly, `mode` has already been processed + } + + // Parse each option + while (std::getline(stream, option, ',')) { + // Split `option` based on colon + size_t colonPos = option.find(':'); + if (colonPos != std::string::npos) { + std::string key = option.substr(0, colonPos); + std::string value = option.substr(colonPos + 1); + + // Process `mode` + if (key == "mode") { + if (isAllDigits(value)) { + mode = static_cast(std::stoi(value)); + } else { + ASCEND_LOGW("mode is %s, should be all digits", value.c_str()); + } + } else if (key.rfind("npu", 0) == 0) { + // Handle NPU core binding range + if (isAllDigits(key.substr(3))) { + int device_id = std::stoi(key.substr(3)); // Parse NPU device ID + if (device_id < device_nums) { + size_t dashPos = value.find('-'); + if (dashPos != std::string::npos) { + std::string startStr = value.substr(0, dashPos); + std::string endStr = value.substr(dashPos + 1); + if (isAllDigits(startStr) && isAllDigits(endStr)) { + coreId start = static_cast(std::stoi(startStr)); + coreId end = static_cast(std::stoi(endStr)); + ranges[device_id] = {start, end}; + } else { + ASCEND_LOGW("core range is %s-%s, should be all digits", startStr.c_str(), endStr.c_str()); + } + } else { + if 
(isAllDigits(value)) { + coreId singleCore = static_cast(std::stoi(value)); + ranges[device_id] = {singleCore, singleCore}; + } else { + ASCEND_LOGW("core range is string : %s, should be all digits", value.c_str()); + } + } + } + } + } + } else if (isAllDigits(option)) { + // If no colon and the value is a number, use it directly as `mode` + mode = static_cast(std::stoi(option)); + } + } + } + + aclError SetThreadAffinity(c10::DeviceIndex device_id, ThreadType current_thread_type) + { + uint32_t bind_conf; + std::vector ranges; + parseCPUAffinityConf(bind_conf, ranges); + printCoreRanges(ranges, bind_conf); + + // bind_conf=1, bind cores averagely based on device_id + if (bind_conf == 1) { + static const bool set_pthread_affinity = has_set_pthread_affinity(); + if (!set_pthread_affinity) { + return SetThreadAffinity(ranges[device_id], pthread_self()); + } + } else if (bind_conf == 2) { + auto thread_core_map = GetCpuAffinityMap(device_id); + // When the PTA_init function runs on device 0, the main thread is initially assigned to this device 0. + // However, when the acl_thread is initialized, the target device ID(maybe 0-7) is determined. + // Therefore, the main thread should be rescheduled to the target device. + if (current_thread_type == ThreadType::aclThread) + SetThreadAffinity(thread_core_map.at(ThreadType::mainThread), mainthread_tid); + return SetThreadAffinity(thread_core_map.at(current_thread_type), pthread_self()); + } else { + ASCEND_LOGD("Thread affinity setting is disabled."); + } + return ACL_ERROR_NONE; + } + + void SetBackwardThreadName(c10::DeviceIndex device_id) + { + static thread_local bool seted = false; + if (!seted) { + seted = true; + if (syscall(SYS_gettid) != getpid()) { + SetThreadName(ThreadType::backwardThread); + SetThreadAffinity(device_id); + } + } + } + + void SetThreadName(ThreadType type) + { + // Ensure this is called at the start of the thread's execution to avoid frequent triggering of this function. + if (prctl(PR_SET_NAME, threadTypeToNameMap.at(type).c_str()) != 0) { + ASCEND_LOGW("set thread name failed!"); + } + } + +} \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.h b/torch_npu/csrc/core/npu/NPUAffinityController.h new file mode 100644 index 0000000000..2c1e92ddc7 --- /dev/null +++ b/torch_npu/csrc/core/npu/NPUAffinityController.h @@ -0,0 +1,35 @@ +#pragma once +#include "torch_npu/csrc/core/npu/npu_log.h" + +namespace c10_npu { + + typedef unsigned int coreId; + + struct coreIdRange { + coreId start; + coreId end; + }; + + enum ThreadType { + unknownThread = 0, // Mostly refers to threads in PyTorch's motorized sleep thread pool, which are not considered in PTA. + mainThread = 1, // 1st performance hotspot, responsible for operator dispatching during the forward phase. + backwardThread = 2, // 2nd performance hotspot, responsible for operator dispatching during the backward phase. + aclThread = 3, // 3rd performance hotspot in PTA, responsible for handling the task queue. + releaseThread = 4, // Thread responsible for resource release. + hcclCommWatchdogThread = 5 // Thread responsible for HCCL communication monitoring. + }; + + aclError SetThreadAffinity(c10::DeviceIndex device); + aclError SetThreadAffinity(c10::DeviceIndex device, ThreadType current_thread_type); + void SetThreadName(ThreadType type); + + // The main thread of PTA, which is also the main thread of PyTorch, handles multiple phases of tasks + // (e.g., first parallel checkpoint data loading, then transitioning to forward training). 
+ // Each phase may require different thread affinity settings. Therefore, we record the thread's TID + // to adjust its affinity later as needed. + void RecordMainThreadTid(); + + // Set backwardThread Name Once + void SetBackwardThreadName(c10::DeviceIndex device_id); + +} \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index 59456b3349..4b7a40ec11 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -66,20 +66,6 @@ aclError GetDevice(int32_t *device) return err; } -inline bool has_set_pthread_affinity() -{ - unsigned int core_nums = static_cast(sysconf(_SC_NPROCESSORS_ONLN)); - - cpu_set_t mask; - pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask); - for (unsigned int i = 0; i < core_nums; i++) { - if (!CPU_ISSET(i, &mask)) { - return true; - } - } - return false; -} - aclError SetDevice(c10::DeviceIndex device) { TORCH_CHECK(device >= 0, "device id must be positive!", PTA_ERROR(ErrCode::VALUE)); @@ -88,26 +74,6 @@ aclError SetDevice(c10::DeviceIndex device) return ACL_ERROR_NONE; } - static uint32_t bind_conf = c10_npu::option::OptionsManager::GetCpuAffinityConf(); - // bind_conf=1, bind cores averagely based on device_id - if (bind_conf == 1) { - static const bool set_pthread_affinity = has_set_pthread_affinity(); - if (!set_pthread_affinity) { - int core_nums = sysconf(_SC_NPROCESSORS_ONLN); - int device_nums = device_count_ensure_non_zero(); - int block_size = (core_nums + device_nums - 1) / device_nums; - unsigned int start_core = static_cast(device * block_size); - unsigned int end_core = static_cast(std::min((device + 1) * block_size, core_nums)); - - cpu_set_t mask; - CPU_ZERO(&mask); - for (unsigned int i = start_core; i < end_core; i++) { - CPU_SET(i, &mask); - } - pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask); - } - } - aclError err = aclrtSetDevice(device); if (err == ACL_ERROR_NONE) { local_device = device; diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 39bb3514f1..0ea9d98527 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -1,6 +1,7 @@ #include "torch_npu/csrc/core/npu/NPUQueue.h" #include "torch_npu/csrc/core/npu/NPUStream.h" #include "torch_npu/csrc/core/npu/npu_log.h" +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/framework/utils/NpuUtils.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/framework/OpParamMaker.h" @@ -15,7 +16,6 @@ #include #include #include -#include #include namespace c10_npu { @@ -587,9 +587,8 @@ bool Repository::CheckInit() const { } void StartConsume(Repository* repo, c10::DeviceIndex device_id) { - if (prctl(PR_SET_NAME, ("ACL_thread")) != 0) { - ASCEND_LOGE("set thread name failed!"); - } + SetThreadName(ThreadType::aclThread); + SetThreadAffinity(device_id); aclError ret = c10_npu::SetDevice(device_id); if (ret != 0) { @@ -619,7 +618,7 @@ void Repository::InitRepo(c10::DeviceIndex device_id) { std::thread cur_consumer(StartConsume, this, device_id); consumer = std::move(cur_consumer); - releaseQueue.InitReleaseQueue(); + releaseQueue.InitReleaseQueue(device_id); } std::string Repository::GetPara() @@ -697,17 +696,17 @@ void ReleaseQueue::PopFromReleaseQueue() { } void StartRelease(ReleaseQueue* releaseQue) { - if (prctl(PR_SET_NAME, ("Release_thread")) != 0) { - ASCEND_LOGE("set thread name failed!"); - } + 
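// The release thread now goes through the shared affinity controller: it
// names itself via SetThreadName() and binds to its per-device region via
// SetThreadAffinity(releaseQue->GetDeviceID()), where the device index is
// the one recorded by InitReleaseQueue(device_id) further down in this patch.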
SetThreadName(ThreadType::releaseThread); + SetThreadAffinity(releaseQue->GetDeviceID()); - while (releaseQue->GetStatus() != RepoStatus::CAN_EXIT) { - releaseQue->PopFromReleaseQueue(); - } - return; + while (releaseQue->GetStatus() != RepoStatus::CAN_EXIT) { + releaseQue->PopFromReleaseQueue(); + } + return; } -void ReleaseQueue::InitReleaseQueue() { +void ReleaseQueue::InitReleaseQueue(c10::DeviceIndex device_id) +{ if (datas == nullptr) { datas = releaseManager().Init(kReleaseQueueCapacity); } @@ -716,6 +715,7 @@ void ReleaseQueue::InitReleaseQueue() { SetStatus(INIT); std::thread cur_releaser(StartRelease, this); releaser = std::move(cur_releaser); + device_idx = device_id; } ReleaseQueue::~ReleaseQueue() { @@ -740,6 +740,12 @@ RepoStatus ReleaseQueue::GetStatus() const { return repo_status.load(); } +c10::DeviceIndex ReleaseQueue::GetDeviceID() const +{ + return device_idx; +} + + void ReleaseQueue::SetStatus(RepoStatus desired) { if (initialized == false) { ASCEND_LOGE("Release queue is not initialized, shouldn't call SetStatus(). !!"); diff --git a/torch_npu/csrc/core/npu/NPUQueue.h b/torch_npu/csrc/core/npu/NPUQueue.h index 66e648069f..2375ef945b 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.h +++ b/torch_npu/csrc/core/npu/NPUQueue.h @@ -38,8 +38,9 @@ public: ~ReleaseQueue(); void PushToReleaseQueue(void* cur_paras); void PopFromReleaseQueue(); - void InitReleaseQueue(); + void InitReleaseQueue(c10::DeviceIndex device_id); RepoStatus GetStatus() const; + c10::DeviceIndex GetDeviceID() const; private: inline bool IsEmptyQueue() {return read_idx.idx == write_idx.idx;}; @@ -52,6 +53,7 @@ private: private: void* datas = nullptr; std::thread releaser; + c10::DeviceIndex device_idx; private: sring_idx read_idx; diff --git a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h index 4359db0136..705e772799 100644 --- a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h +++ b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h @@ -8,6 +8,7 @@ #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/NPUStream.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" @@ -53,6 +54,7 @@ struct NPUGuardImpl final : public c10::impl::DeviceGuardImplInterface { uncheckedSetDevice(d); } void uncheckedSetDevice(c10::Device d) const noexcept override { + SetBackwardThreadName(d.index()); NPU_CHECK_WARN(c10_npu::SetDevice(d.index())); } c10::Stream getStream(c10::Device d) const noexcept override { diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index 8503361020..6a07e170c4 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -350,14 +350,9 @@ uint32_t OptionsManager::GetP2PBufferSize() return buf_size; } -uint32_t OptionsManager::GetCpuAffinityConf() +char* OptionsManager::GetCpuAffinityConf() { - const static uint32_t cpu_affinity_conf = []() -> uint32_t { - char* cpu_affinity_str = std::getenv("CPU_AFFINITY_CONF"); - int64_t cpu_affinity_conf = (cpu_affinity_str != nullptr) ? 
strtol(cpu_affinity_str, nullptr, 10) : 0; - return static_cast(cpu_affinity_conf); - }(); - return cpu_affinity_conf; + return std::getenv("CPU_AFFINITY_CONF"); } uint32_t OptionsManager::GetTaskQueueEnable() diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index 98e8fd72dc..65a9c38a4b 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -51,7 +51,7 @@ public: static std::pair GetSilenceSigmaThresh(); static uint32_t GetP2PBufferSize(); static uint32_t GetTaskQueueEnable(); - static uint32_t GetCpuAffinityConf(); + static char* GetCpuAffinityConf(); static bool CheckForceUncached(); static std::string GetOomSnapshotDumpPath(); static void IsOomSnapshotEnable(); diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 679b2a262a..bc1e1f9be3 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -13,6 +13,7 @@ #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h" #include "torch_npu/csrc/core/npu/NPUStream.h" +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/NpuVariables.h" #include "torch_npu/csrc/core/npu/register/OptionRegister.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" @@ -266,8 +267,12 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) const auto& in = iter.second; call_(in); } + lazy_fn_.clear(); + SetThreadAffinity(device_id_); + RecordMainThreadTid(); + init_flag_ = true; ASCEND_LOGD("Npu sys ctrl initialize successfully."); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index e8a9f5a283..fc9e268f07 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -25,6 +25,7 @@ #include "torch_npu/csrc/core/NPUStorageImpl.h" #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUGuard.h" +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/NPUStream.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" #include "torch_npu/csrc/distributed/HCCLUtils.hpp" @@ -791,6 +792,8 @@ ProcessGroupHCCL::~ProcessGroupHCCL() void ProcessGroupHCCL::hcclCommWatchdog() { try { + c10_npu::SetThreadName(c10_npu::ThreadType::hcclCommWatchdogThread); + VLOG(2) << "[Rank " << rank_ << "] HCCL watchdog thread started!"; workCleanupLoop(); VLOG(2) << "[Rank " << rank_ @@ -873,7 +876,9 @@ void ProcessGroupHCCL::workCleanupLoop() auto& work = *it; try { if (needSetDevice) { - NPU_CHECK_ERROR(c10_npu::SetDevice(static_cast(work.devices_[0].index()))); + c10::DeviceIndex device = static_cast(work.devices_[0].index()); + c10_npu::SetThreadAffinity(device); + NPU_CHECK_ERROR(c10_npu::SetDevice(device)); needSetDevice = false; } } catch (const std::exception& e) { diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index d73b536e94..948061f008 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -24,6 +24,7 @@ #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUStream.h" #include "torch_npu/csrc/core/npu/NPUQueue.h" +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/NPUGuard.h" 
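// NPUAffinityController.h (included just above) provides the SetThreadAffinity
// overloads used by the two Python bindings added at the end of this patch,
// _npu_set_threads_affinity and _npu_reset_threads_affinity, which re-pin the
// calling thread to the mainThread or unknownThread region of the current device.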
#include "torch_npu/csrc/core/npu/NpuVariables.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" @@ -1211,6 +1212,28 @@ PyObject* THNPModule_npu_support_silentClientV2(PyObject* self, PyObject* noargs END_HANDLE_TH_ERRORS } +PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* noargs) +{ + HANDLE_TH_ERRORS + int device_index; + NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device_index)); + c10::DeviceIndex device = static_cast(device_index); + c10_npu::SetThreadAffinity(device, c10_npu::ThreadType::mainThread); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THNPModule_npu_reset_thread_affinity(PyObject* self, PyObject* noargs) +{ + HANDLE_TH_ERRORS + int device_index; + NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device_index)); + c10::DeviceIndex device = static_cast(device_index); + c10_npu::SetThreadAffinity(device, c10_npu::ThreadType::unknownThread); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + static struct PyMethodDef THNPModule_methods[] = { {"_npu_init", (PyCFunction)THNPModule_initExtension, METH_NOARGS, nullptr}, {"_npu_set_run_yet_variable_to_false", (PyCFunction)THNPModule_set_run_yet_variable_to_false_wrap, METH_NOARGS, nullptr}, @@ -1260,6 +1283,8 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_set_call_state", (PyCFunction)THNPModule_npu_set_call_state, METH_O, nullptr}, {"_npu_set_module_train_state", (PyCFunction)THNPModule_npu_set_module_train_state, METH_O, nullptr}, {"_npu_support_silentClientV2", (PyCFunction)THNPModule_npu_support_silentClientV2, METH_NOARGS, nullptr}, + {"_npu_set_threads_affinity", (PyCFunction)THNPModule_npu_set_thread_affinity, METH_NOARGS, nullptr}, + {"_npu_reset_threads_affinity", (PyCFunction)THNPModule_npu_reset_thread_affinity, METH_NOARGS, nullptr}, {nullptr}}; TORCH_NPU_API PyMethodDef* THNPModule_get_methods() { diff --git a/torch_npu/utils/_module.py b/torch_npu/utils/_module.py index 92213c7ca9..4bd5bf55cd 100644 --- a/torch_npu/utils/_module.py +++ b/torch_npu/utils/_module.py @@ -362,7 +362,9 @@ def _mpdl_iter_init(self, *args, **kwargs): torch_npu.npu.synchronize() except: pass + torch_npu._C._npu_set_threads_affinity() origin_mpdl_iter_init(self, *args, **kwargs) + torch_npu._C._npu_reset_threads_affinity() def _parallel_apply( -- Gitee From 0ce7d8395118961df943f61e43c26aa94fd53473 Mon Sep 17 00:00:00 2001 From: shaojiemike Date: Thu, 28 Nov 2024 20:38:48 +0800 Subject: [PATCH 92/96] [feat]: bind remaining tasks when backward begin [feat]: Add linux platform compatibility check --- .../csrc/core/npu/NPUAffinityController.cpp | 128 ++++++++++++++++-- .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 4 +- 2 files changed, 120 insertions(+), 12 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index e7beafecd4..985c39ce37 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -11,10 +11,17 @@ #include #include #include +#include +#include +#include +#include +#include +#include namespace c10_npu { static pthread_t mainthread_tid; + static pid_t parentPid; const std::unordered_map threadTypeToNameMap = { {releaseThread, "release_thread"}, @@ -33,6 +40,7 @@ namespace c10_npu { void RecordMainThreadTid() { mainthread_tid = pthread_self(); + parentPid = getpid(); } ThreadType getCurrentThreadType() @@ -44,12 +52,31 @@ namespace c10_npu { auto it = threadNameToTypeMap.find(name); if (it != threadNameToTypeMap.end()) { - return it->second; + 
return std::get<1>(*it); } } return ThreadType::unknownThread; } + ThreadType getThreadType(pid_t tid) + { + char thread_name[16]; + std::string commFile = "/proc/" + std::to_string(tid) + "/comm"; // Path to thread name + + std::ifstream commStream(commFile); + if (commStream.is_open()) { + commStream.getline(thread_name, sizeof(thread_name)); + + std::string name(thread_name); + auto it = threadNameToTypeMap.find(name); + if (it != threadNameToTypeMap.end()) { + return it->second; + } + } + + return ThreadType::unknownThread; // Default if not found + } + aclError SetThreadAffinity(coreIdRange core_range, pthread_t thread) { cpu_set_t mask; @@ -59,12 +86,30 @@ namespace c10_npu { CPU_SET(i, &mask); } if (!pthread_setaffinity_np(thread, sizeof(mask), &mask)) { - ASCEND_LOGD("Set Thread Affinity to %d-%d", core_range.start, core_range.end); + ASCEND_LOGI("[affinity] Set Thread Affinity to %d-%d", core_range.start, core_range.end); return ACL_ERROR_NONE; + } else { + ASCEND_LOGW("[affinity] Set Thread Affinity to %d-%d failed", core_range.start, core_range.end); } return ACL_ERROR_FEATURE_UNSUPPORTED; } + void bindToCoreRange(pid_t pid, const coreIdRange &core_range) + { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + + for (int core = core_range.start; core <= core_range.end; ++core) { + CPU_SET(core, &cpuset); + } + + if (sched_setaffinity(pid, sizeof(cpu_set_t), &cpuset) == -1) { + ASCEND_LOGW("[affinity] sched_setaffinity failed"); + } else { + ASCEND_LOGI("[affinity] Set Thread %d Affinity to %d-%d", pid, core_range.start, core_range.end); + } + } + coreIdRange GetCPUDefaultRange(c10::DeviceIndex device_id) { int core_nums = sysconf(_SC_NPROCESSORS_ONLN); @@ -114,7 +159,7 @@ namespace c10_npu { // calculate env2 default map coreId core_nums = current_core_range.end - current_core_range.start; if (core_nums < thread_types.size()) { - ASCEND_LOGW("Available core numbers (%d) are insufficient for all %zu thread types. Binding available cores to all threads.", + ASCEND_LOGW("[affinity] Available core numbers (%d) are insufficient for all %zu thread types. 
Binding available cores to all threads.", core_nums, thread_types.size()); for (auto thread_type : thread_types) { threadToCoreidMap[thread_type] = current_core_range; @@ -131,7 +176,7 @@ namespace c10_npu { } } - ASCEND_LOGD("Thread affinity map for device %d: %s", device_id, GetAffinityMapAsString(threadToCoreidMap, device_id).c_str()); + ASCEND_LOGI("[affinity] Thread affinity map for device %d: %s", device_id, GetAffinityMapAsString(threadToCoreidMap, device_id).c_str()); return threadToCoreidMap; } @@ -150,7 +195,7 @@ namespace c10_npu { oss << "Device " << i << " Core Range: " << ranges[i].start << " - " << ranges[i].end << " "; } - ASCEND_LOGD("Core ranges: %s", oss.str().c_str()); + ASCEND_LOGI("[affinity] Core ranges: %s", oss.str().c_str()); } bool isAllDigits(const std::string &str) @@ -205,7 +250,7 @@ namespace c10_npu { if (isAllDigits(value)) { mode = static_cast(std::stoi(value)); } else { - ASCEND_LOGW("mode is %s, should be all digits", value.c_str()); + ASCEND_LOGW("[affinity] mode is %s, should be all digits", value.c_str()); } } else if (key.rfind("npu", 0) == 0) { // Handle NPU core binding range @@ -221,14 +266,14 @@ namespace c10_npu { coreId end = static_cast(std::stoi(endStr)); ranges[device_id] = {start, end}; } else { - ASCEND_LOGW("core range is %s-%s, should be all digits", startStr.c_str(), endStr.c_str()); + ASCEND_LOGW("[affinity] core range is %s-%s, should be all digits", startStr.c_str(), endStr.c_str()); } } else { if (isAllDigits(value)) { coreId singleCore = static_cast(std::stoi(value)); ranges[device_id] = {singleCore, singleCore}; } else { - ASCEND_LOGW("core range is string : %s, should be all digits", value.c_str()); + ASCEND_LOGW("[affinity] core range is string : %s, should be all digits", value.c_str()); } } } @@ -241,6 +286,64 @@ namespace c10_npu { } } + // Function to execute a shell command and capture its output + std::string executeCommand(const std::string &command) + { + std::array buffer; + std::string result; + std::shared_ptr pipe(popen(command.c_str(), "r"), pclose); + if (!pipe) { + ASCEND_LOGE("[affinity] Failed to execute %s.", command.c_str()); + } + while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { + result += buffer.data(); + } + return result; + } + + // Function to parse PIDs and TIDs from pstree output + std::vector parsePIDsFromPstree(const std::string &pstreeOutput) + { + std::vector pids; + std::regex pidRegex(R"(\((\d+)\))"); // Matches numbers inside parentheses + std::smatch match; + std::string::const_iterator searchStart(pstreeOutput.cbegin()); + while (std::regex_search(searchStart, pstreeOutput.cend(), match, pidRegex)) { + pids.push_back(std::stoi(match[1])); + searchStart = match.suffix().first; + } + return pids; + } + + void SetAffinityForRemainingTasks(coreIdRange core_range) + { + // Check if the platform is Linux +#ifdef __linux__ + // Check if pstree command exists + if (access("/usr/bin/pstree", F_OK) == 0) { + // Run pstree to get child processes and threads + std::string pstreeCommand = "/usr/bin/pstree -p " + std::to_string(parentPid) + " -t"; + std::string pstreeOutput = executeCommand(pstreeCommand); + + // Parse PIDs/TIDs from the pstree output + std::vector pids = parsePIDsFromPstree(pstreeOutput); + ASCEND_LOGI("[affinity] Parse %d PIDs/TIDs from the pstree output of parentPid %d", pids.size(), parentPid); + + // Bind each PID/TID to the core range + for (pid_t pid : pids) { + ThreadType type = getThreadType(pid); + if (type == ThreadType::unknownThread && pid != parentPid) { + 
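// Only stray children are swept onto the unknownThread region here: PIDs whose
// /proc/<tid>/comm matches a named PTA thread keep their dedicated ranges, and
// parentPid (the main thread) is excluded because it is re-pinned separately
// through mainthread_tid.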
bindToCoreRange(pid, core_range); + } + } + } else { + ASCEND_LOGW("[affinity] pstree not found. Please install pstree or check your PATH."); + } +#else + ASCEND_LOGW("[affinity] This function is only supported on Linux platforms."); +#endif + } + aclError SetThreadAffinity(c10::DeviceIndex device_id, ThreadType current_thread_type) { uint32_t bind_conf; @@ -261,9 +364,13 @@ namespace c10_npu { // Therefore, the main thread should be rescheduled to the target device. if (current_thread_type == ThreadType::aclThread) SetThreadAffinity(thread_core_map.at(ThreadType::mainThread), mainthread_tid); + // In addition to data-loading processes, users often have other hot threads and processes. + // To isolate interference, all such processes must be confined to separate regions before the dispatch phase. + if (current_thread_type == ThreadType::backwardThread || current_thread_type == ThreadType::unknownThread) + SetAffinityForRemainingTasks(thread_core_map.at(ThreadType::unknownThread)); return SetThreadAffinity(thread_core_map.at(current_thread_type), pthread_self()); } else { - ASCEND_LOGD("Thread affinity setting is disabled."); + ASCEND_LOGI("[affinity] Thread affinity setting is disabled."); } return ACL_ERROR_NONE; } @@ -274,6 +381,7 @@ namespace c10_npu { if (!seted) { seted = true; if (syscall(SYS_gettid) != getpid()) { + ASCEND_LOGI("[affinity] Set Backward Thread Name"); SetThreadName(ThreadType::backwardThread); SetThreadAffinity(device_id); } @@ -284,7 +392,7 @@ namespace c10_npu { { // Ensure this is called at the start of the thread's execution to avoid frequent triggering of this function. if (prctl(PR_SET_NAME, threadTypeToNameMap.at(type).c_str()) != 0) { - ASCEND_LOGW("set thread name failed!"); + ASCEND_LOGW("[affinity] set thread name failed!"); } } diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index bc1e1f9be3..d05f33168c 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -224,6 +224,8 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) ASCEND_LOGW("Npu device %d has been set before global init.", device_id_); } + RecordMainThreadTid(); + SetThreadAffinity(device_id_); if (c10_npu::option::OptionsManager::CheckAclDumpDateEnable()) { const char *aclConfigPath = "acl.json"; @@ -270,8 +272,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) lazy_fn_.clear(); - SetThreadAffinity(device_id_); - RecordMainThreadTid(); init_flag_ = true; ASCEND_LOGD("Npu sys ctrl initialize successfully."); -- Gitee From 00814eb840920d9a77ec584cf2d8a55b972b2eb5 Mon Sep 17 00:00:00 2001 From: shaojiemike Date: Mon, 2 Dec 2024 14:39:40 +0800 Subject: [PATCH 93/96] [feat] fix dataloader set affinity error --- torch_npu/utils/_module.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/utils/_module.py b/torch_npu/utils/_module.py index 4bd5bf55cd..50f8e6ad3b 100644 --- a/torch_npu/utils/_module.py +++ b/torch_npu/utils/_module.py @@ -362,9 +362,9 @@ def _mpdl_iter_init(self, *args, **kwargs): torch_npu.npu.synchronize() except: pass - torch_npu._C._npu_set_threads_affinity() - origin_mpdl_iter_init(self, *args, **kwargs) torch_npu._C._npu_reset_threads_affinity() + origin_mpdl_iter_init(self, *args, **kwargs) + torch_npu._C._npu_set_threads_affinity() def _parallel_apply( -- Gitee From 5800b6e413da985a1197da53ed42cc9be6dd255a Mon Sep 17 00:00:00 2001 From: shaojiemike <943648187@qq.com> Date: Sat, 28 Dec 2024 
16:25:34 +0800 Subject: [PATCH 94/96] [feat] support user-defined fine-grained bind core --- torch_npu/csrc/core/npu/NPUAffinityController.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index 985c39ce37..a2a321fb69 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -147,13 +147,12 @@ namespace c10_npu { return oss.str(); } - std::unordered_map GetCpuAffinityMap(c10::DeviceIndex device_id) + std::unordered_map GetCpuAffinityMap(c10::DeviceIndex device_id, coreIdRange current_core_range) { std::unordered_map threadToCoreidMap; std::initializer_list thread_types = {unknownThread, mainThread, backwardThread, aclThread, releaseThread, hcclCommWatchdogThread}; - coreIdRange current_core_range = GetCPUDefaultRange(device_id); coreId offset = current_core_range.start; // calculate env2 default map @@ -358,7 +357,7 @@ namespace c10_npu { return SetThreadAffinity(ranges[device_id], pthread_self()); } } else if (bind_conf == 2) { - auto thread_core_map = GetCpuAffinityMap(device_id); + auto thread_core_map = GetCpuAffinityMap(device_id, ranges[device_id]); // When the PTA_init function runs on device 0, the main thread is initially assigned to this device 0. // However, when the acl_thread is initialized, the target device ID(maybe 0-7) is determined. // Therefore, the main thread should be rescheduled to the target device. -- Gitee From 9900fd2cdfef7abfd70a41c4ce6f7b768c7af1af Mon Sep 17 00:00:00 2001 From: shaojiemike <943648187@qq.com> Date: Mon, 30 Dec 2024 19:43:31 +0800 Subject: [PATCH 95/96] [perf]: optimize main thread affinity with lazy set for minimal impact on non-dispatch phase --- torch_npu/csrc/core/npu/NPUAffinityController.cpp | 4 +++- torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index a2a321fb69..a66a0547a4 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -365,8 +365,10 @@ namespace c10_npu { SetThreadAffinity(thread_core_map.at(ThreadType::mainThread), mainthread_tid); // In addition to data-loading processes, users often have other hot threads and processes. // To isolate interference, all such processes must be confined to separate regions before the dispatch phase. 
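// Sequencing sketch for the hunk below (descriptive, not part of the diff):
// after this patch the main thread is no longer pinned during
// NpuSysCtrl::Initialize; it is re-pinned to its mainThread region here, at
// the first backward/unknown-thread binding request, alongside
// SetAffinityForRemainingTasks(). Work that precedes dispatch (e.g.
// multi-process checkpoint loading) therefore keeps the unrestricted mask.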
- if (current_thread_type == ThreadType::backwardThread || current_thread_type == ThreadType::unknownThread) + if (current_thread_type == ThreadType::backwardThread || current_thread_type == ThreadType::unknownThread) { + SetThreadAffinity(thread_core_map.at(ThreadType::mainThread), mainthread_tid); SetAffinityForRemainingTasks(thread_core_map.at(ThreadType::unknownThread)); + } return SetThreadAffinity(thread_core_map.at(current_thread_type), pthread_self()); } else { ASCEND_LOGI("[affinity] Thread affinity setting is disabled."); diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index d05f33168c..c7ff88e9cb 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -225,7 +225,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) } RecordMainThreadTid(); - SetThreadAffinity(device_id_); if (c10_npu::option::OptionsManager::CheckAclDumpDateEnable()) { const char *aclConfigPath = "acl.json"; -- Gitee From 40376366ec01e1d441f1dfb3c97cabe9d8921b77 Mon Sep 17 00:00:00 2001 From: shaojiemike <943648187@qq.com> Date: Tue, 31 Dec 2024 12:00:52 +0800 Subject: [PATCH 96/96] [feat] bind core based on original limited cores [fix] delete useless check [fix] codecheck [fix] compile error --- .../csrc/core/npu/NPUAffinityController.cpp | 80 ++++++++++++------- .../csrc/core/npu/NPUAffinityController.h | 2 +- .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 2 +- 3 files changed, 55 insertions(+), 29 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index a66a0547a4..6d951d6f6f 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -22,6 +22,7 @@ namespace c10_npu { static pthread_t mainthread_tid; static pid_t parentPid; + static coreIdRange originalRange; const std::unordered_map threadTypeToNameMap = { {releaseThread, "release_thread"}, @@ -37,10 +38,52 @@ namespace c10_npu { {"hcclComm_watchd", hcclCommWatchdogThread}, {"backward_thread", backwardThread}}; - void RecordMainThreadTid() + coreIdRange FindLongestCoreAffinityRange(pthread_t thread) + { + cpu_set_t mask; + CPU_ZERO(&mask); + + coreIdRange range = {-1, -1}; + int max_length = 0; + int current_start = -1; + int current_length = 0; + + if (pthread_getaffinity_np(thread, sizeof(mask), &mask) == 0) { + for (int i = 0; i < CPU_SETSIZE; i++) { + if (CPU_ISSET(i, &mask)) { + if (current_start == -1) { + current_start = i; + } + current_length++; + } else { + if (current_length > max_length) { + max_length = current_length; + range.start = current_start; + range.end = i - 1; + } + current_start = -1; + current_length = 0; + } + } + + if (current_length > max_length) { + max_length = current_length; + range.start = current_start; + range.end = CPU_SETSIZE - 1; + } + } else { + ASCEND_LOGW("[affinity] Failed to get thread affinity"); + } + + return range; + } + + void GetAffinityInfo() { mainthread_tid = pthread_self(); parentPid = getpid(); + originalRange = FindLongestCoreAffinityRange(mainthread_tid); + ASCEND_LOGI("[affinity] Original Affinity is %d-%d", originalRange.start, originalRange.end); } ThreadType getCurrentThreadType() @@ -70,7 +113,7 @@ namespace c10_npu { std::string name(thread_name); auto it = threadNameToTypeMap.find(name); if (it != threadNameToTypeMap.end()) { - return it->second; + return std::get<1>(*it); } } @@ -112,25 +155,12 @@ namespace c10_npu { 
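// Worked example for FindLongestCoreAffinityRange above (hypothetical mask,
// not taken from the patch): a process launched under `taskset -c 0-3,8-23`
// has two allowed runs, 0-3 (length 4) and 8-23 (length 16); the scan keeps
// the longer run, so originalRange becomes {8, 23} and later per-device
// partitioning stays within cores the scheduler can actually grant.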
coreIdRange GetCPUDefaultRange(c10::DeviceIndex device_id) { - int core_nums = sysconf(_SC_NPROCESSORS_ONLN); + int offset = originalRange.start; + int core_nums = originalRange.end - originalRange.start + 1; int device_nums = device_count_ensure_non_zero(); int block_size = (core_nums > 0 && device_nums > 0) ? (core_nums + device_nums - 1) / device_nums : 0; - return coreIdRange{static_cast(device_id * block_size), - static_cast(std::min((device_id + 1) * block_size, core_nums) - 1)}; - } - - inline bool has_set_pthread_affinity() - { - unsigned int core_nums = static_cast(sysconf(_SC_NPROCESSORS_ONLN)); - - cpu_set_t mask; - pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask); - for (unsigned int i = 0; i < core_nums; i++) { - if (!CPU_ISSET(i, &mask)) { - return true; - } - } - return false; + return coreIdRange{offset + static_cast(device_id * block_size), + offset + static_cast(std::min((device_id + 1) * block_size, core_nums) - 1)}; } std::string GetAffinityMapAsString(const std::unordered_map &threadToCoreidMap, c10::DeviceIndex device_id) @@ -286,13 +316,13 @@ namespace c10_npu { } // Function to execute a shell command and capture its output - std::string executeCommand(const std::string &command) + std::string executeCommand(const std::string &exe) { std::array buffer; std::string result; - std::shared_ptr pipe(popen(command.c_str(), "r"), pclose); + std::shared_ptr pipe(popen(exe.c_str(), "r"), pclose); if (!pipe) { - ASCEND_LOGE("[affinity] Failed to execute %s.", command.c_str()); + ASCEND_LOGE("[affinity] %s failed.", exe.c_str()); } while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { result += buffer.data(); @@ -352,10 +382,7 @@ namespace c10_npu { // bind_conf=1, bind cores averagely based on device_id if (bind_conf == 1) { - static const bool set_pthread_affinity = has_set_pthread_affinity(); - if (!set_pthread_affinity) { - return SetThreadAffinity(ranges[device_id], pthread_self()); - } + return SetThreadAffinity(ranges[device_id], pthread_self()); } else if (bind_conf == 2) { auto thread_core_map = GetCpuAffinityMap(device_id, ranges[device_id]); // When the PTA_init function runs on device 0, the main thread is initially assigned to this device 0. @@ -363,7 +390,6 @@ namespace c10_npu { // Therefore, the main thread should be rescheduled to the target device. if (current_thread_type == ThreadType::aclThread) SetThreadAffinity(thread_core_map.at(ThreadType::mainThread), mainthread_tid); - // In addition to data-loading processes, users often have other hot threads and processes. // To isolate interference, all such processes must be confined to separate regions before the dispatch phase. if (current_thread_type == ThreadType::backwardThread || current_thread_type == ThreadType::unknownThread) { SetThreadAffinity(thread_core_map.at(ThreadType::mainThread), mainthread_tid); diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.h b/torch_npu/csrc/core/npu/NPUAffinityController.h index 2c1e92ddc7..f2e78b69b6 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.h +++ b/torch_npu/csrc/core/npu/NPUAffinityController.h @@ -27,7 +27,7 @@ namespace c10_npu { // (e.g., first parallel checkpoint data loading, then transitioning to forward training). // Each phase may require different thread affinity settings. Therefore, we record the thread's TID // to adjust its affinity later as needed. 
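// Arithmetic sketch for the reworked GetCPUDefaultRange (illustrative numbers,
// not from the patch): with originalRange = 24-119 (96 usable cores inherited
// at startup, recorded by GetAffinityInfo below) and 8 visible NPUs,
// block_size = (96 + 8 - 1) / 8 = 12, so device 0 defaults to cores 24-35 and
// device 7 to cores 108-119. Every range stays inside the inherited mask
// instead of assuming cores 0..N as the earlier sysconf-based version did.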
- void RecordMainThreadTid(); + void GetAffinityInfo(); // Set backwardThread Name Once void SetBackwardThreadName(c10::DeviceIndex device_id); diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index c7ff88e9cb..9081b686c4 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -224,7 +224,7 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) ASCEND_LOGW("Npu device %d has been set before global init.", device_id_); } - RecordMainThreadTid(); + GetAffinityInfo(); if (c10_npu::option::OptionsManager::CheckAclDumpDateEnable()) { const char *aclConfigPath = "acl.json"; -- Gitee
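Taken together, patches 91-96 converge on the following CPU_AFFINITY_CONF grammar: either a bare mode (1 = bind each process evenly to its device's core block, 2 = additionally split that block per thread type) or a comma-separated option list such as mode:2,npu0:0-11,npu1:12-23. The standalone sketch below mirrors the parsing flow of parseCPUAffinityConf for illustration only; the sample value, the Range struct, and main() are ours, and the real implementation additionally validates digits via isAllDigits, falls back to GetCPUDefaultRange for devices left unspecified, and reports malformed input through ASCEND_LOGW.

#include <cstdlib>
#include <iostream>
#include <map>
#include <sstream>
#include <string>

struct Range { int start; int end; };   // mirrors coreIdRange from the series

int main() {
    // Fall back to a sample value when the variable is unset (illustrative only).
    const char* env = std::getenv("CPU_AFFINITY_CONF");
    std::string input = env ? env : "mode:2,npu0:0-11,npu1:12-23,npu2:24";

    unsigned mode = 0;
    std::map<int, Range> ranges;         // device id -> requested core range

    std::istringstream stream(input);
    std::string option;
    while (std::getline(stream, option, ',')) {
        const size_t colon = option.find(':');
        if (colon == std::string::npos) {            // bare "mode" form, e.g. "1"
            mode = std::stoul(option);
            continue;
        }
        const std::string key = option.substr(0, colon);
        const std::string value = option.substr(colon + 1);
        if (key == "mode") {
            mode = std::stoul(value);
        } else if (key.rfind("npu", 0) == 0) {       // "npuN:a-b" or "npuN:a"
            const int id = std::stoi(key.substr(3));
            const size_t dash = value.find('-');
            const int lo = std::stoi(value.substr(0, dash));
            const int hi = (dash == std::string::npos)
                               ? lo
                               : std::stoi(value.substr(dash + 1));
            ranges[id] = {lo, hi};
        }
    }

    std::cout << "mode=" << mode << '\n';
    for (const auto& [id, r] : ranges)
        std::cout << "npu" << id << " -> cores " << r.start << '-' << r.end << '\n';
    return 0;
}

Under mode 2, each device's resulting range is then subdivided by GetCpuAffinityMap into the six ThreadType regions, with unknownThread absorbing whatever remains after one core is reserved per named thread type.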