From 949a25bc996f52fd44016cfb14fa40f4df73ac83 Mon Sep 17 00:00:00 2001 From: Luyao <14720358+luyao003@user.noreply.gitee.com> Date: Tue, 10 Jun 2025 07:19:55 +0000 Subject: [PATCH 01/10] update torch_npu/csrc/core/npu/register/OptionsManager.h. GetAscendOpExecTimeout Signed-off-by: Luyao <14720358+luyao003@user.noreply.gitee.com> --- torch_npu/csrc/core/npu/register/OptionsManager.h | 1 + 1 file changed, 1 insertion(+) diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index 8fd9f0446d..754a276250 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -102,6 +102,7 @@ public: static bool CheckTriCombinedOptimizerEnable(); static bool CheckAclDumpDateEnable(); static uint32_t GetHCCLConnectTimeout(); + static int32_t GetAscendOpExecTimeout(); static int32_t GetHCCLExecTimeout(); static int32_t GetHCCLEventTimeout(); static std::string CheckDisableDynamicPath(); -- Gitee From 5e1b28fb546c0a16a1cd7e58b96d82d56a35bd52 Mon Sep 17 00:00:00 2001 From: Luyao <14720358+luyao003@user.noreply.gitee.com> Date: Tue, 10 Jun 2025 07:21:19 +0000 Subject: [PATCH 02/10] update torch_npu/csrc/core/npu/register/OptionsManager.cpp. Signed-off-by: Luyao <14720358+luyao003@user.noreply.gitee.com> --- torch_npu/csrc/core/npu/register/OptionsManager.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index d58b186ce2..4f14c206ff 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -146,6 +146,13 @@ uint32_t OptionsManager::GetHCCLConnectTimeout() return static_cast(envFlag); } +int32_t OptionsManager::GetAscendOpExecuteTimeout() +{ + char* env_val = std::getenv("ASCEND_OPEXEC_TIMEOUT"); + int64_t envFlag = (env_val != nullptr) ? strtol(env_val, nullptr, 547) : 0; + return static_cast(envFlag); +} + int32_t OptionsManager::GetHCCLExecTimeout() { char* env_val = std::getenv("HCCL_EXEC_TIMEOUT"); -- Gitee From 3c8c7e8ceaccb5c01c77a999c98d0f5ad8278f6b Mon Sep 17 00:00:00 2001 From: Luyao <14720358+luyao003@user.noreply.gitee.com> Date: Tue, 10 Jun 2025 09:34:24 +0000 Subject: [PATCH 03/10] update torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp. Signed-off-by: Luyao <14720358+luyao003@user.noreply.gitee.com> --- torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 42d80b0792..bab6927013 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -177,7 +177,9 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) } NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0)); - NPU_CHECK_SUPPORTED_OR_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(kMaxOpExecuteTimeOut)); + auto op_timeout = c10_npu::option::OptionsManager::GetAscendOpExecuteTimeout();// + NPU_CHECK_SUPPORTED_OR_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(op_timeout));// + ASCEND_LOGD("============= AclrtSetOpExecuteTimeOut set successfully."); // lazy call for the setoption for (const auto &iter: lazy_fn_) { -- Gitee From 8735e0479e0d6f9311d44b7efe0d1c85214efd90 Mon Sep 17 00:00:00 2001 From: Luyao <14720358+luyao003@user.noreply.gitee.com> Date: Tue, 10 Jun 2025 09:40:16 +0000 Subject: [PATCH 04/10] update torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp. Signed-off-by: Luyao <14720358+luyao003@user.noreply.gitee.com> --- torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index bab6927013..4a61a1555d 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -179,7 +179,7 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0)); auto op_timeout = c10_npu::option::OptionsManager::GetAscendOpExecuteTimeout();// NPU_CHECK_SUPPORTED_OR_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(op_timeout));// - ASCEND_LOGD("============= AclrtSetOpExecuteTimeOut set successfully."); + ASCEND_LOGD("AclrtSetOpExecuteTimeOut set successfully."); // lazy call for the setoption for (const auto &iter: lazy_fn_) { -- Gitee From 30d973f99ca5003932403a056f7ee67922d0aea0 Mon Sep 17 00:00:00 2001 From: Luyao <14720358+luyao003@user.noreply.gitee.com> Date: Tue, 10 Jun 2025 12:13:07 +0000 Subject: [PATCH 05/10] update torch_npu/csrc/core/npu/register/OptionsManager.h. Signed-off-by: Luyao <14720358+luyao003@user.noreply.gitee.com> --- torch_npu/csrc/core/npu/register/OptionsManager.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index 754a276250..ab6375e4b5 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -102,7 +102,7 @@ public: static bool CheckTriCombinedOptimizerEnable(); static bool CheckAclDumpDateEnable(); static uint32_t GetHCCLConnectTimeout(); - static int32_t GetAscendOpExecTimeout(); + static int32_t GetAscendOpExecuteTimeout(); static int32_t GetHCCLExecTimeout(); static int32_t GetHCCLEventTimeout(); static std::string CheckDisableDynamicPath(); -- Gitee From c1b9f58ffd9441aa23aa54711968af796c774e95 Mon Sep 17 00:00:00 2001 From: Luyao <14720358+luyao003@user.noreply.gitee.com> Date: Wed, 11 Jun 2025 05:57:10 +0000 Subject: [PATCH 06/10] update torch_npu/csrc/core/npu/register/OptionsManager.cpp. Signed-off-by: Luyao <14720358+luyao003@user.noreply.gitee.com> --- torch_npu/csrc/core/npu/register/OptionsManager.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index 4f14c206ff..49e8fec7e3 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -149,7 +149,7 @@ uint32_t OptionsManager::GetHCCLConnectTimeout() int32_t OptionsManager::GetAscendOpExecuteTimeout() { char* env_val = std::getenv("ASCEND_OPEXEC_TIMEOUT"); - int64_t envFlag = (env_val != nullptr) ? strtol(env_val, nullptr, 547) : 0; + int64_t envFlag = (env_val != nullptr) ? strtol(env_val, nullptr, 10) : 547; return static_cast(envFlag); } -- Gitee From 8fe14ed33b9faa65cc5e0c41e5db715a215171aa Mon Sep 17 00:00:00 2001 From: Luyao <14720358+luyao003@user.noreply.gitee.com> Date: Wed, 11 Jun 2025 05:58:52 +0000 Subject: [PATCH 07/10] update torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp. Signed-off-by: Luyao <14720358+luyao003@user.noreply.gitee.com> --- torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 4a61a1555d..51b05b4734 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -177,9 +177,9 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) } NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0)); - auto op_timeout = c10_npu::option::OptionsManager::GetAscendOpExecuteTimeout();// - NPU_CHECK_SUPPORTED_OR_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(op_timeout));// - ASCEND_LOGD("AclrtSetOpExecuteTimeOut set successfully."); + auto op_timeout = c10_npu::option::OptionsManager::GetAscendOpExecuteTimeout(); + NPU_CHECK_SUPPORTED_OR_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(op_timeout)); + ASCEND_LOGD("AclrtSetOpExecuteTimeOut set successfully. op_timeout= %d s",op_timeout); // lazy call for the setoption for (const auto &iter: lazy_fn_) { -- Gitee From bd56761d9cb6b64fb1c6353f31f2f929d971737f Mon Sep 17 00:00:00 2001 From: Luyao <14720358+luyao003@user.noreply.gitee.com> Date: Fri, 13 Jun 2025 03:07:28 +0000 Subject: [PATCH 08/10] update torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp. Signed-off-by: Luyao <14720358+luyao003@user.noreply.gitee.com> --- torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 51b05b4734..95dbf41779 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -177,9 +177,10 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) } NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0)); - auto op_timeout = c10_npu::option::OptionsManager::GetAscendOpExecuteTimeout(); - NPU_CHECK_SUPPORTED_OR_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(op_timeout)); - ASCEND_LOGD("AclrtSetOpExecuteTimeOut set successfully. op_timeout= %d s",op_timeout); + auto exec_timeout = c10_npu::option::OptionsManager::GetACLExecTimeout(); + exec_timeout = exec_timeout <=0 ? kMaxOpExecuteTimeOut : exec_timeout; + NPU_CHECK_SUPPORTED_OR_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(exec_timeout)); + ASCEND_LOGD("AclrtSetOpExecuteTimeOut set successfully. exec_timeout= %d s", exec_timeout); // lazy call for the setoption for (const auto &iter: lazy_fn_) { -- Gitee From 1de9740ff5bb40bed1c611f57e2395bb435a90d5 Mon Sep 17 00:00:00 2001 From: Luyao <14720358+luyao003@user.noreply.gitee.com> Date: Fri, 13 Jun 2025 03:08:17 +0000 Subject: [PATCH 09/10] update torch_npu/csrc/core/npu/register/OptionsManager.cpp. Signed-off-by: Luyao <14720358+luyao003@user.noreply.gitee.com> --- torch_npu/csrc/core/npu/register/OptionsManager.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index 49e8fec7e3..d58b186ce2 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -146,13 +146,6 @@ uint32_t OptionsManager::GetHCCLConnectTimeout() return static_cast(envFlag); } -int32_t OptionsManager::GetAscendOpExecuteTimeout() -{ - char* env_val = std::getenv("ASCEND_OPEXEC_TIMEOUT"); - int64_t envFlag = (env_val != nullptr) ? strtol(env_val, nullptr, 10) : 547; - return static_cast(envFlag); -} - int32_t OptionsManager::GetHCCLExecTimeout() { char* env_val = std::getenv("HCCL_EXEC_TIMEOUT"); -- Gitee From 63e8128a4fa7454c0f7ee07ad7ea5a30a3cb92ef Mon Sep 17 00:00:00 2001 From: Luyao <14720358+luyao003@user.noreply.gitee.com> Date: Fri, 13 Jun 2025 03:08:51 +0000 Subject: [PATCH 10/10] update torch_npu/csrc/core/npu/register/OptionsManager.h. Signed-off-by: Luyao <14720358+luyao003@user.noreply.gitee.com> --- torch_npu/csrc/core/npu/register/OptionsManager.h | 1 - 1 file changed, 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index ab6375e4b5..8fd9f0446d 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -102,7 +102,6 @@ public: static bool CheckTriCombinedOptimizerEnable(); static bool CheckAclDumpDateEnable(); static uint32_t GetHCCLConnectTimeout(); - static int32_t GetAscendOpExecuteTimeout(); static int32_t GetHCCLExecTimeout(); static int32_t GetHCCLEventTimeout(); static std::string CheckDisableDynamicPath(); -- Gitee