From a7ad16619ef15874d15809714ee0be45e451fd55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=98=89=E5=B7=8D?= <843972097@qq.com> Date: Thu, 19 Sep 2024 06:58:11 +0000 Subject: [PATCH 01/96] =?UTF-8?q?!14658=20Checkout=20branch=20to=20RC3.=20?= =?UTF-8?q?Merge=20pull=20request=20!14658=20from=20=E5=88=98=E5=98=89?= =?UTF-8?q?=E5=B7=8D/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitmodules | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitmodules b/.gitmodules index 8c21a3877b..ca763662de 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,12 +2,14 @@ path = third_party/op-plugin url = https://gitee.com/ascend/op-plugin.git ignore = dirty + branch = 6.0.rc3 [submodule "third_party/googletest"] path = third_party/googletest url = https://gitee.com/mirrors/googletest.git [submodule "third_party/torchair/torchair"] path = third_party/torchair/torchair url = https://gitee.com/ascend/torchair.git + branch = 6.0.rc3 [submodule "third_party/Tensorpipe"] path = third_party/Tensorpipe url = https://gitee.com/ascend/Tensorpipe.git -- Gitee From bf98f5c570998aaa60ac15884e41c4c35da65c45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E5=85=89=E6=B5=A9?= Date: Thu, 19 Sep 2024 09:20:37 +0000 Subject: [PATCH 02/96] =?UTF-8?q?!14705=20modify=20version=20Merge=20pull?= =?UTF-8?q?=20request=20!14705=20from=20=E9=83=AD=E5=85=89=E6=B5=A9/v2.1.0?= =?UTF-8?q?-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b2e74958dd..cfed83ba12 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ from wheel.bdist_wheel import bdist_wheel BASE_DIR = os.path.dirname(os.path.realpath(__file__)) THIRD_PARTY_PATH = os.path.join(BASE_DIR, "third_party") -VERSION = '2.1.0.post7' +VERSION = '2.1.0.post8' UNKNOWN = "Unknown" BUILD_PERMISSION = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP -- Gitee From 4b0c253d926b2a1067b29f02d696cac31018cb11 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 20 Sep 2024 08:43:55 +0000 Subject: [PATCH 03/96] !14785 Update op_plugin commit id Merge pull request !14785 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index cd3de98674..785ba5248e 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit cd3de98674aac44775448672d7193c8e1339fc7d +Subproject commit 785ba5248e2d1d51e13719177f7328c4aa38e836 -- Gitee From fddba1498bb422f1269d7ff7b9920dabd86e3714 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Fri, 20 Sep 2024 09:35:13 +0000 Subject: [PATCH 04/96] =?UTF-8?q?!14698=20silentCheckV2:=20filter=20models?= =?UTF-8?q?=20with=20fp16=20dtype=20Merge=20pull=20request=20!14698=20from?= =?UTF-8?q?=20=E7=8E=8B=E8=B6=85/v2.1.0-6.0.rc3=5Fsilent3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/utils/_step.py | 43 +++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/torch_npu/utils/_step.py b/torch_npu/utils/_step.py index 2b78813f30..960c0afff2 100644 --- a/torch_npu/utils/_step.py +++ b/torch_npu/utils/_step.py @@ -78,8 +78,9 @@ class SilentCheckState: def __init__(self): self.init_param() self.init_marks = {} - self.weight_hook_flags = {} - self.last_weight_hook_flags = 
{} + self.weight_hook_handles = {} + self.last_weight_hook_handles = {} + self.dtype_support = True def init_param(self): self.first_forward = True @@ -101,6 +102,18 @@ class SilentCheckState: else: torch_npu._C._npu_set_module_train_state("infer") + def check_tensor_dtype(self, tensor): + if not self.dtype_support: + return + if isinstance(tensor, torch.Tensor) and tensor.requires_grad and tensor.dtype == torch.float16: + self.dtype_support = False + + def check_dtype(self, module, *args): + for x in args: + self.check_tensor_dtype(x) + for param_name, param in module._parameters.items(): + self.check_tensor_dtype(param) + def search_first_weight(self, module): # Search the first weight if not self.init_marks.get(self.first_module_id, False) and self.first_weight is None: @@ -145,15 +158,15 @@ class SilentCheckState: if self.first_tensor_id != self.last_tensor_id: if self.last_tensor is not None: self.last_tensor.register_hook(output_hook) - if not self.last_weight_hook_flags.get(self.first_module_id, False): + if self.last_weight_hook_handles.get(self.first_module_id, None) is None: if self.last_weight is not None: - self.last_weight.register_hook(output_hook) - self.last_weight_hook_flags[self.first_module_id] = True - if not self.weight_hook_flags.get(self.first_module_id, False): + last_weight_handle = self.last_weight.register_hook(output_hook) + self.last_weight_hook_handles[self.first_module_id] = last_weight_handle + if self.weight_hook_handles.get(self.first_module_id, None) is None: if self.first_weight is not None: - self.first_weight.register_hook(input_hook("", asd_flag)) - self.weight_hook_flags[self.first_module_id] = True - self.init_marks[self.first_module_id] = True + first_weight_handle = self.first_weight.register_hook(input_hook("", asd_flag)) + self.weight_hook_handles[self.first_module_id] = first_weight_handle + self.init_marks[self.first_module_id] = True silent_check = SilentCheckState() @@ -275,6 +288,18 @@ def _custom_call(self, *args, **kwargs): silent_check.init_module_info(id(self), self.training) self.outer = True + if silent_check.is_training and not silent_check.init_marks.get(silent_check.first_module_id, False): + silent_check.check_dtype(self, *args) + if not silent_check.dtype_support: + for value in silent_check.weight_hook_handles.values(): + if value is not None: + value.remove() + for value in silent_check.last_weight_hook_handles.values(): + if value is not None: + value.remove() + asd_enable = 0 + warnings.warn(f"Warning: Module has unsupported dtype tensor, silent check will be closed.") + # Search the first tensor (if the first tensor is input) silent_check.register_input_hook_before_call(asd_enable, *args) -- Gitee From 37b954f0cd4208a8b185fbdb5bf629ba4c8f28ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Sat, 21 Sep 2024 03:33:27 +0000 Subject: [PATCH 05/96] =?UTF-8?q?!14802=20update=20torchair=20commitid=20M?= =?UTF-8?q?erge=20pull=20request=20!14802=20from=20=E5=85=B3=E9=BE=99?= =?UTF-8?q?=E9=94=8B/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 5143b41229..485484ca71 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 5143b41229264549c27510ebbd310169568e7758 +Subproject commit 485484ca7143cdf47415793ca76db9210cff8a4c -- Gitee 
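A note on PATCH 04/96 above (silentCheckV2): the rework replaces boolean "hook registered" flags with the RemovableHandle objects returned by Tensor.register_hook, which is what allows the new fp16 filter to unregister every silent-check hook once an unsupported dtype is detected. A minimal standalone sketch of that pattern in plain PyTorch (names invented for illustration, not code from this patch series):

    import torch

    handles = {}

    def grad_check_hook(grad):
        # a real check would inspect grad here; returning None leaves it unchanged
        return None

    w = torch.randn(4, 4, requires_grad=True)
    # register_hook returns a torch.utils.hooks.RemovableHandle; keep it for later removal
    handles[id(w)] = w.register_hook(grad_check_hook)

    w.sum().backward()  # the hook fires during backward

    # on detecting an unsupported dtype (e.g. torch.float16), detach all hooks again;
    # a bare True/False flag, as in the old code, could not undo the registration
    for handle in handles.values():
        if handle is not None:
            handle.remove()
    handles.clear()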
From cc375cc5804ac2c4c43eb5f60b593c6d1a61614e Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 23 Sep 2024 04:43:42 +0000 Subject: [PATCH 06/96] !14834 Update op_plugin commit id Merge pull request !14834 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 785ba5248e..3b738f2d6c 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 785ba5248e2d1d51e13719177f7328c4aa38e836 +Subproject commit 3b738f2d6c2aff15a77b94c37a71873038877df7 -- Gitee From 074ae4b7a1fe7e47e55e3b736d1ca69bd49c22dc Mon Sep 17 00:00:00 2001 From: sunjiayang Date: Mon, 23 Sep 2024 07:02:08 +0000 Subject: [PATCH 07/96] !14770 only last error Merge pull request !14770 from sunjiayang/last_error_210_rc3 --- third_party/acl/inc/acl/acl_base.h | 2 +- third_party/acl/inc/acl/acl_rt.h | 55 ++++++++++++++++++- third_party/acl/libs/acl.cpp | 3 + torch_npu/csrc/core/npu/NPUEventManager.cpp | 5 +- torch_npu/csrc/core/npu/NPUException.h | 20 ++++++- torch_npu/csrc/core/npu/NPUFunctions.cpp | 14 +++-- torch_npu/csrc/core/npu/NPUQueue.cpp | 4 +- torch_npu/csrc/core/npu/NPUStream.cpp | 6 +- .../core/npu/THNPUCachingHostAllocator.cpp | 8 +-- .../csrc/core/npu/interface/AclInterface.cpp | 28 ++++++++++ .../csrc/core/npu/interface/AclInterface.h | 4 ++ .../csrc/distributed/ProcessGroupHCCL.cpp | 7 ++- torch_npu/csrc/framework/OpParamMaker.cpp | 46 ++++++++++++---- 13 files changed, 168 insertions(+), 34 deletions(-) diff --git a/third_party/acl/inc/acl/acl_base.h b/third_party/acl/inc/acl/acl_base.h index 091e45aa23..6411f94794 100644 --- a/third_party/acl/inc/acl/acl_base.h +++ b/third_party/acl/inc/acl/acl_base.h @@ -132,7 +132,7 @@ static const int ACL_ERROR_GE_FAILURE = 500002; static const int ACL_ERROR_RT_FAILURE = 500003; static const int ACL_ERROR_DRV_FAILURE = 500004; static const int ACL_ERROR_PROFILING_FAILURE = 500005; -static const int ACL_ERROR_RT_DEVICE_MTE_ERROR = 507053; +static const int ACL_ERROR_RT_DEVICE_MEM_ERROR = 507053; #define ACL_TENSOR_SHAPE_RANGE_NUM 2 #define ACL_TENSOR_VALUE_RANGE_NUM 2 diff --git a/third_party/acl/inc/acl/acl_rt.h b/third_party/acl/inc/acl/acl_rt.h index 6a4add2c16..33052829bf 100644 --- a/third_party/acl/inc/acl/acl_rt.h +++ b/third_party/acl/inc/acl/acl_rt.h @@ -168,6 +168,10 @@ typedef enum aclrtCmoType { ACL_RT_CMO_TYPE_PREFETCH = 0, } aclrtCmoType; +typedef enum aclrtLastErrLevel { + ACL_RT_THREAD_LEVEL = 0, +} aclrtLastErrLevel; + typedef void* aclrtDrvMemHandle; typedef void (*aclrtCallback)(void *userData); @@ -1453,11 +1457,58 @@ ACL_FUNC_VISIBILITY aclError aclrtResetOverflowStatus(aclrtStream stream); */ ACL_FUNC_VISIBILITY aclError aclrtCmoAsync(void *src, size_t size, aclrtCmoType cmoType, aclrtStream stream); -ACL_FUNC_VISIBILITY aclError aclrtGetMemUceInfo(int32_t deviceId, aclrtMemUceInfo* memUceInfoArray, size_t arraySize, size_t *retSize); +/** + * @ingroup AscendCL + * @brief get the mem uce info + * @param [in] deviceId + * @param [in/out] memUceInfoArray + * @param [in] arraySize + * @param [out] retSize + * @retval ACL_SUCCESS The function is successfully executed. 
+ * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclrtGetMemUceInfo(int32_t deviceId, aclrtMemUceInfo *memUceInfoArray, size_t arraySize, size_t *retSize); +/** + * @ingroup AscendCL + * @brief stop the task on specified device + * @param [in] deviceId + * @param [in] timeout + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ ACL_FUNC_VISIBILITY aclError aclrtDeviceTaskAbort(int32_t deviceId, uint32_t timeout); -ACL_FUNC_VISIBILITY aclError aclrtMemUceRepair(int32_t deviceId, aclrtMemUceInfo* memUceInfoArray, size_t arraySize); +/** + * @ingroup AscendCL + * @brief repair the mem uce + * @param [in] deviceId + * @param [in/out] memUceInfoArray + * @param [in] arraySize + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclrtMemUceRepair(int32_t deviceId, aclrtMemUceInfo *memUceInfoArray, size_t arraySize); + +/** + * @ingroup AscendCL + * @brief peek at last error by level + * + * @param level [IN] error level + * + * @retval Runtime error code + */ +ACL_FUNC_VISIBILITY aclError aclrtPeekAtLastError(aclrtLastErrLevel level); + +/** + * @ingroup AscendCL + * @brief get last error by level + * + * @param level [IN] error level + * + * @retval Runtime error code + */ +ACL_FUNC_VISIBILITY aclError aclrtGetLastError(aclrtLastErrLevel level); #ifdef __cplusplus } diff --git a/third_party/acl/libs/acl.cpp b/third_party/acl/libs/acl.cpp index b8f598f163..4f24e6bf04 100644 --- a/third_party/acl/libs/acl.cpp +++ b/third_party/acl/libs/acl.cpp @@ -51,6 +51,9 @@ aclError aclrtGetMemInfo(aclrtMemAttr attr, size_t *free, size_t *total){return aclError aclrtGetMemUceInfo(int32_t deviceId, aclrtMemUceInfo* memUceInfoArray, size_t arraySize, size_t *retSize){return 0;} aclError aclrtMemUceRepair(int32_t deviceId, aclrtMemUceInfo* memUceInfoArray, size_t arraySize){return 0;} aclError aclrtCmoAsync(void *src, size_t size, aclrtCmoType cmoType, aclrtStream stream){return 0;} +aclError aclrtGetLastError(aclrtLastErrLevel flag){return 0;} +aclError aclrtPeekAtLastError(aclrtLastErrLevel flag){return 0;} + // op相关操作 aclopAttr *aclopCreateAttr(){return NULL;} diff --git a/torch_npu/csrc/core/npu/NPUEventManager.cpp b/torch_npu/csrc/core/npu/NPUEventManager.cpp index d69dd4622e..75cb33c240 100644 --- a/torch_npu/csrc/core/npu/NPUEventManager.cpp +++ b/torch_npu/csrc/core/npu/NPUEventManager.cpp @@ -64,8 +64,9 @@ aclError NPUEventManager::LazyDestroy(aclrtEvent npu_event) int err = aclrtDestroyEvent(npu_event); if (err == ACL_ERROR_NONE) { ASCEND_LOGI("Event: aclrtDestroyEvent is successfully executed, event=%p", npu_event); + } else { + CHECK_AND_THROW_FORCE_STOP(err); } - CHECK_AND_THROW_FORCE_STOP(err); return err; } std::lock_guard guard(event_queue_mutex_); @@ -88,8 +89,8 @@ void NPUEventManager::ClearEvent() } #endif auto err = aclrtDestroyEvent(event); - CHECK_AND_THROW_FORCE_STOP(err); if (err != ACL_ERROR_NONE) { + CHECK_AND_THROW_FORCE_STOP(err); NPU_CHECK_WARN(err); } else { ASCEND_LOGI("Event: aclrtDestroyEvent is successfully executed, event=%p", event); diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h index d6d09d443c..bd2f49c5d6 100644 --- a/torch_npu/csrc/core/npu/NPUException.h +++ b/torch_npu/csrc/core/npu/NPUException.h @@ -89,6 +89,9 @@ std::string formatErrorCode(SubModule submodule, ErrCode errorCode); #define GRAPH_ERROR(error) formatErrorCode(SubModule::GRAPH, error) #define PROF_ERROR(error) 
formatErrorCode(SubModule::PROF, error) +#define DEVICE_TASK_ABORT "107022" +#define DEVICE_MEM_ERROR "507053" + inline const char* getErrorFunction(const char* msg) { return msg; @@ -101,7 +104,12 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) } #define CHECK_AND_THROW_FORCE_STOP(err_code) \ - if ((err_code) == ACL_ERROR_RT_DEVICE_TASK_ABORT) { \ + auto Error_stop = (int)(err_code); \ + auto stop_error = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); \ + if ((stop_error) != ACL_ERROR_NONE) { \ + Error_stop = stop_error; \ + } \ + if ((Error_stop) == ACL_ERROR_RT_DEVICE_TASK_ABORT) { \ c10_npu::set_has_throw_error(true); \ TORCH_CHECK( \ false, \ @@ -111,12 +119,17 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) ":", \ __LINE__, \ " NPU function error: FORCE STOP.", \ - ", error code is ", err_code, \ + ", error code is ", Error_stop, \ PTA_ERROR(ErrCode::ACL)); \ } \ #define CHECK_AND_THROW_UCE_ERROR(err_code) \ - if ((err_code) == ACL_ERROR_RT_DEVICE_MTE_ERROR && \ + auto Error_uce = (int)(err_code); \ + auto uce_error = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); \ + if ((uce_error) != ACL_ERROR_NONE) { \ + Error_uce = uce_error; \ + } \ + if ((Error_uce) == ACL_ERROR_RT_DEVICE_MEM_ERROR && \ c10_npu::get_has_throw_error() == false && c10_npu::checkUceErrAndRepair()) { \ c10_npu::set_has_throw_error(true); \ TORCH_CHECK( \ @@ -127,6 +140,7 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) ":", \ __LINE__, \ " NPU function error: UCE ERROR.", \ + ", error code is ", Error_uce, \ PTA_ERROR(ErrCode::ACL)); \ } \ diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index f27bc24ed0..59456b3349 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -20,9 +20,9 @@ c10::DeviceIndex device_count() noexcept // initialize number of devices only once if (dev_count == 0) { aclError error = aclrtGetDeviceCount(&dev_count); - CHECK_AND_THROW_FORCE_STOP(error); - CHECK_AND_THROW_UCE_ERROR(error); if (error != ACL_ERROR_NONE) { + CHECK_AND_THROW_FORCE_STOP(error); + CHECK_AND_THROW_UCE_ERROR(error); ASCEND_LOGE("get device count of NPU failed"); return 0; } @@ -48,8 +48,10 @@ aclError GetDevice(int32_t *device) return ACL_ERROR_NONE; } aclError err = aclrtGetDevice(device); - CHECK_AND_THROW_FORCE_STOP(err); - CHECK_AND_THROW_UCE_ERROR(err); + if (err != ACL_ERROR_NONE) { + CHECK_AND_THROW_FORCE_STOP(err); + CHECK_AND_THROW_UCE_ERROR(err); + } if (err == ACL_ERROR_NONE) { local_device = *device; } else if (err == ACL_ERROR_RT_CONTEXT_NULL && aclrtSetDevice(0) == ACL_ERROR_NONE) { @@ -155,9 +157,9 @@ aclError SynchronizeUsedDevices() for (const auto it : used_devices) { NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(it.first)); aclError acl_ret = aclrtSynchronizeDevice(); - CHECK_AND_THROW_FORCE_STOP(acl_ret); - CHECK_AND_THROW_UCE_ERROR(acl_ret); if (acl_ret != ACL_ERROR_NONE) { + CHECK_AND_THROW_FORCE_STOP(acl_ret); + CHECK_AND_THROW_UCE_ERROR(acl_ret); return acl_ret; } } diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 0cdd29b6ef..b5b762942c 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -265,7 +265,7 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) } #endif read_idx.idx = write_idx.idx; - if (call_ret == ACL_ERROR_RT_DEVICE_MTE_ERROR && checkUceErrAndRepair()) { + if (call_ret == 
ACL_ERROR_RT_DEVICE_MEM_ERROR && checkUceErrAndRepair()) { set_has_throw_error(true); call_ret = 0; if (check_error) { @@ -387,7 +387,7 @@ void Repository::Enqueue(void* cur_paras) { SetStatus(CAN_EXIT); read_idx.idx = write_idx.idx; - if (call_ret == ACL_ERROR_RT_DEVICE_MTE_ERROR && checkUceErrAndRepair()) { + if (call_ret == ACL_ERROR_RT_DEVICE_MEM_ERROR && checkUceErrAndRepair()) { set_has_throw_error(true); call_ret = 0; throw std::runtime_error("UCE ERROR" + PTA_ERROR(ErrCode::ACL)); diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index cf9baf20d5..3df354ce50 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -429,8 +429,10 @@ bool npuSynchronizeDevice(bool check_error) } } auto acl_ret = aclrtSynchronizeDevice(); - CHECK_AND_THROW_FORCE_STOP(acl_ret); - CHECK_AND_THROW_UCE_ERROR(acl_ret); + if (acl_ret != ACL_ERROR_NONE) { + CHECK_AND_THROW_FORCE_STOP(acl_ret); + CHECK_AND_THROW_UCE_ERROR(acl_ret); + } #ifndef BUILD_LIBTORCH if (acl_ret == ACL_ERROR_NONE) { const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); diff --git a/torch_npu/csrc/core/npu/THNPUCachingHostAllocator.cpp b/torch_npu/csrc/core/npu/THNPUCachingHostAllocator.cpp index 8820c9f864..15645f966b 100644 --- a/torch_npu/csrc/core/npu/THNPUCachingHostAllocator.cpp +++ b/torch_npu/csrc/core/npu/THNPUCachingHostAllocator.cpp @@ -132,9 +132,9 @@ struct HostAllocator { // allocate a new block if no cached allocation is found err = aclrtMallocHost(ptr, size); - CHECK_AND_THROW_FORCE_STOP(err); - CHECK_AND_THROW_UCE_ERROR(err); if (err != ACL_ERROR_NONE) { + CHECK_AND_THROW_FORCE_STOP(err); + CHECK_AND_THROW_UCE_ERROR(err); return err; } @@ -161,9 +161,9 @@ struct HostAllocator { // insert npu events for each stream on which this block was used. 
This aclError err = insertEvents(block); - CHECK_AND_THROW_FORCE_STOP(err); - CHECK_AND_THROW_UCE_ERROR(err); if (err != ACL_ERROR_NONE) { + CHECK_AND_THROW_FORCE_STOP(err); + CHECK_AND_THROW_UCE_ERROR(err); return err; } diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 04bb398e76..dc31bd985b 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -65,6 +65,8 @@ LOAD_FUNCTION(aclrtGetMemUceInfo) LOAD_FUNCTION(aclrtDeviceTaskAbort) LOAD_FUNCTION(aclrtMemUceRepair) LOAD_FUNCTION(aclrtCmoAsync) +LOAD_FUNCTION(aclrtGetLastError) +LOAD_FUNCTION(aclrtPeekAtLastError) aclprofStepInfoPtr init_stepinfo() { typedef aclprofStepInfoPtr(*npdInitFunc)(); @@ -607,6 +609,32 @@ aclError AclrtCmoAsync(void* src, size_t size, aclrtCmoType cmoType, aclrtStream return func(src, size, cmoType, stream); } +aclError AclrtGetLastError(aclrtLastErrLevel flag) +{ + typedef aclError (*AclrtGetLastError)(aclrtLastErrLevel flag); + static AclrtGetLastError func = nullptr; + if (func == nullptr) { + func = (AclrtGetLastError) GET_FUNC(aclrtGetLastError); + } + if (func == nullptr) { + return ACL_ERROR_NONE; + } + return func(flag); +} + +aclError AclrtPeekAtLastError(aclrtLastErrLevel flag) +{ + typedef aclError (*AclrtPeekAtLastError)(aclrtLastErrLevel flag); + static AclrtPeekAtLastError func = nullptr; + if (func == nullptr) { + func = (AclrtPeekAtLastError) GET_FUNC(aclrtPeekAtLastError); + } + if (func == nullptr) { + return ACL_ERROR_NONE; + } + return func(flag); +} + aclError AclStressDetect(int32_t deviceId, void *workspace, size_t workspaceSize) { typedef aclError (*AclStressDetect)(int32_t, void*, size_t); diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index 30270d2aeb..d868d46423 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -168,6 +168,10 @@ aclError AclrtMemUceRepair(int32_t deviceId, aclrtMemUceInfo* memUceInfoArray, s aclError AclrtCmoAsync(void* src, size_t size, aclrtCmoType cmoType, aclrtStream stream); +aclError AclrtGetLastError(aclrtLastErrLevel flag); + +aclError AclrtPeekAtLastError(aclrtLastErrLevel flag); + aclError AclStressDetect(int32_t deviceId, void *workspace, size_t workspaceSize); } // namespace acl diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 0e5ac539b8..61c018dda3 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -1488,12 +1488,15 @@ void ProcessGroupHCCL::workEnqueue(c10::intrusive_ptrcustomHandler(); } catch (std::exception &e) { - if (std::string(e.what()).find("device task abort") != std::string::npos) { - ret = ACL_ERROR_RT_DEVICE_TASK_ABORT; + if (std::string(e.what()).find(DEVICE_TASK_ABORT) != std::string::npos || + std::string(e.what()).find(DEVICE_MEM_ERROR) != std::string::npos) { + ret =c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); } else { ret = ACL_ERROR_INVALID_PARAM; LOG(ERROR) << e.what(); } ASCEND_LOGE("Custom hand error:%s", e.what()); } - if (ret != ACL_ERROR_NONE && ret!= ACL_ERROR_RT_DEVICE_TASK_ABORT) { + if (ret != ACL_ERROR_NONE && ret != ACL_ERROR_RT_DEVICE_TASK_ABORT && ret != ACL_ERROR_RT_DEVICE_MEM_ERROR) { ASCEND_LOGE("Custom hand fail! 
name=%s, ret=0x%#x", cur_paras->opType, ret); C10_NPU_SHOW_ERR_MSG(); } @@ -304,7 +306,11 @@ int ExecFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) ACL_ENGINE_SYS, at_npu::native::aoe::aoe_manager().GetDumpGraphPath().c_str(), nullptr); - if (ret != ACL_ERROR_NONE && ret!= ACL_ERROR_RT_DEVICE_TASK_ABORT) { + if (ret != ACL_ERROR_NONE) { + auto ret_temp = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); + if (ret_temp != ACL_ERROR_NONE) { + ret = ret_temp; + } ASCEND_LOGE("In aoe mode, AclGenGraphAndDumpForOp failed!"); C10_NPU_SHOW_ERR_MSG(); return ret; @@ -327,7 +333,11 @@ int ExecFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) NPU_CHECK_ERROR_WITHOUT_UCE(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, "disable")); } - if (ret != ACL_ERROR_NONE && ret!= ACL_ERROR_RT_DEVICE_TASK_ABORT) { + if (ret != ACL_ERROR_NONE) { + auto ret_temp = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); + if (ret_temp != ACL_ERROR_NONE) { + ret = ret_temp; + } printErrorLog(cur_paras); C10_NPU_SHOW_ERR_MSG(); } @@ -340,7 +350,11 @@ int MemcopyAsyncFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) auto cur_paras = static_cast(in->paramVal); aclError ret = aclrtMemcpyAsync(cur_paras->dst, cur_paras->dstLen, cur_paras->src, cur_paras->srcLen, cur_paras->kind, stream); - if (ret != ACL_ERROR_NONE && ret!= ACL_ERROR_RT_DEVICE_TASK_ABORT) { + if (ret != ACL_ERROR_NONE) { + auto ret_temp = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); + if (ret_temp != ACL_ERROR_NONE) { + ret = ret_temp; + } ASCEND_LOGE( "aclrtMemcpyAsync error! ret = %d, dstLen = %zu, srcLen = %zu, kind = %d", ret, @@ -357,7 +371,11 @@ int RecordEventFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) auto cur_paras = static_cast(in->paramVal); aclError ret = aclrtRecordEvent(cur_paras->event, stream); - if (ret != ACL_ERROR_NONE && ret!= ACL_ERROR_RT_DEVICE_TASK_ABORT) { + if (ret != ACL_ERROR_NONE) { + auto ret_temp = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); + if (ret_temp != ACL_ERROR_NONE) { + ret = ret_temp; + } ASCEND_LOGE("aclrtRecordEvent error! ret = %d, eventAllocatorType = %d", ret, cur_paras->eventAllocatorType); C10_NPU_SHOW_ERR_MSG(); } @@ -374,7 +392,11 @@ int WaitEventFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) { auto cur_paras = static_cast(in->paramVal); aclError ret = aclrtStreamWaitEvent(stream, cur_paras->event); - if (ret != ACL_ERROR_NONE && ret!= ACL_ERROR_RT_DEVICE_TASK_ABORT) { + if (ret != ACL_ERROR_NONE) { + auto ret_temp = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); + if (ret_temp != ACL_ERROR_NONE) { + ret = ret_temp; + } ASCEND_LOGE( "aclrtStreamWaitEvent error! ret = %d, eventAllocatorType = %d", ret, @@ -392,7 +414,11 @@ int LazyDestroyEventFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) { auto cur_paras = static_cast(in->paramVal); aclError ret = c10_npu::NPUEventManager::GetInstance().LazyDestroy(cur_paras->event); - if (ret != ACL_ERROR_NONE && ret!= ACL_ERROR_RT_DEVICE_TASK_ABORT) { + if (ret != ACL_ERROR_NONE) { + auto ret_temp = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); + if (ret_temp != ACL_ERROR_NONE) { + ret = ret_temp; + } ASCEND_LOGE("LazyDestroy error! ret = %d, eventAllocatorType = %d", ret, cur_paras->eventAllocatorType); C10_NPU_SHOW_ERR_MSG(); } -- Gitee From 3a34bb7eccee08e762fa5232e22d91518ee761c6 Mon Sep 17 00:00:00 2001 From: will-devil Date: Mon, 23 Sep 2024 09:34:58 +0000 Subject: [PATCH 08/96] !14796 [Bugfix] Reduce unnecessary memory allocation. 
Merge pull request !14796 from will-devil/v2.1.0-6.0.rc3 --- .../csrc/aten/ops/op_api/CopyKernelOpApi.cpp | 43 ++++++------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp b/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp index 76d2a6a6b9..7baad2af45 100644 --- a/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp +++ b/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp @@ -182,45 +182,30 @@ void copy_d2d_baseformat_opapi(at::Tensor& dst, const at::Tensor& src, bool non_ at::Tensor& NPUNativeOpApiFunctions::copy_(at::Tensor& self, const at::Tensor& src, bool non_blocking) { DO_COMPATIBILITY(aclnnInplaceCopy, NPUNativeFunctions::copy_(self, src, non_blocking)); + if (self.numel() == 0) { return self; } - auto result = OpPreparation::apply_tensor_without_format(src); - if (src.is_complex() && torch_npu::utils::is_npu(src)) { - auto real_tensor = at::real(src); - auto imag_tensor = OpPreparation::apply_tensor_without_format(src); + auto maybe_outnames = at::namedinference::compute_broadcast_outnames(self, src); - if (src.is_conj()) { - auto tmp = at::imag(src); - tmp._set_neg(false); - imag_tensor = tmp.neg(); + if (torch_npu::utils::is_npu(self)) { + if (torch_npu::utils::is_npu(src)) { + copy_d2d_baseformat_opapi(self, src, non_blocking); } else { - imag_tensor = at::imag(src); + copy_h2d_baseformat_opapi(self, src, non_blocking); } - - auto outDtype = src.dtype(); - auto outputSize = op_infer::broadcast_ops_npu_output_size(real_tensor, imag_tensor); - result = OpPreparation::apply_tensor_without_format(outputSize, real_tensor.options().dtype(outDtype)); - EXEC_NPU_CMD(aclnnComplex, real_tensor, imag_tensor, result); - } else { - result = src; - if (src.is_neg()) { - src._set_neg(false); - result = src.neg(); + if (src.is_complex() && src.is_conj()) { + auto real_tensor = at::real(self); + auto imag_tensor = at::imag(self).neg(); + EXEC_NPU_CMD(aclnnComplex, real_tensor, imag_tensor, self); } - } - auto maybe_outnames = at::namedinference::compute_broadcast_outnames(self, result); - - if (torch_npu::utils::is_npu(self)) { - if (torch_npu::utils::is_npu(result)) { - copy_d2d_baseformat_opapi(self, result, non_blocking); - } else { - copy_h2d_baseformat_opapi(self, result, non_blocking); + if (src.is_neg()) { + self.neg_(); } } else { - if (torch_npu::utils::is_npu(result)) { - copy_d2h_baseformat_opapi(self, result, non_blocking); + if (torch_npu::utils::is_npu(src)) { + copy_d2h_baseformat_opapi(self, src, non_blocking); } } at::namedinference::propagate_names_if_nonempty(self, maybe_outnames); -- Gitee From d5399f76b9274673db5c830f6254b04e2e6be8fc Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 23 Sep 2024 10:43:41 +0000 Subject: [PATCH 09/96] !14841 Update op_plugin commit id Merge pull request !14841 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 3b738f2d6c..5952cf37f3 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 3b738f2d6c2aff15a77b94c37a71873038877df7 +Subproject commit 5952cf37f3c76ec37105ffe9dd01003101a1437c -- Gitee From d33726ef5eb664f02122897a7587fffcbb860466 Mon Sep 17 00:00:00 2001 From: wangqihui01 Date: Mon, 23 Sep 2024 12:40:42 +0000 Subject: [PATCH 10/96] !14818 check analyse_flat, schedule and on_trace_ready parameters Merge pull request !14818 from wangqihui01/v2.1.0-6.0.rc3 --- torch_npu/profiler/profiler.py | 10 
+++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/torch_npu/profiler/profiler.py b/torch_npu/profiler/profiler.py index dea02aaded..d15f7e1ec7 100644 --- a/torch_npu/profiler/profiler.py +++ b/torch_npu/profiler/profiler.py @@ -136,6 +136,9 @@ class _KinetoProfile: @no_exception_func() def tensorboard_trace_handler(dir_name: str = None, worker_name: str = None, analyse_flag: bool = True): ProfPathCreator().init(worker_name=worker_name, dir_name=dir_name) + if not isinstance(analyse_flag, bool): + print_warn_msg("analyse_flag is not bool, set by default.") + analyse_flag = True def handler_fn(prof_inst) -> None: if analyse_flag: @@ -162,13 +165,18 @@ class profile(_KinetoProfile): ): super().__init__() activities_set = set(activities) if activities else supported_activities() - if schedule: + if schedule and isinstance(schedule, Callable): self.schedule = schedule # add step markers into the trace and table view self.record_steps = True else: + if schedule: + print_warn_msg("schedule is not Callable, set by default.") self.schedule = _default_schedule_fn self.record_steps = False + if on_trace_ready and not isinstance(on_trace_ready, Callable): + print_warn_msg("on_trace_ready is not Callable, set by default.") + on_trace_ready = None self.prof_if = _ProfInterface( activities=activities_set, record_shapes=record_shapes, -- Gitee From d9d36cbf77a727df2e73526e949969a34dc211c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=98=89=E5=B7=8D?= <843972097@qq.com> Date: Mon, 23 Sep 2024 13:16:30 +0000 Subject: [PATCH 11/96] =?UTF-8?q?!14790=20[Fix]=20Fix=20public=20bindings.?= =?UTF-8?q?=20Merge=20pull=20request=20!14790=20from=20=E5=88=98=E5=98=89?= =?UTF-8?q?=E5=B7=8D/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/npu/test_public_bindings.py | 7 +++++++ test/torch_npu_schema.json | 21 --------------------- 2 files changed, 7 insertions(+), 21 deletions(-) diff --git a/test/npu/test_public_bindings.py b/test/npu/test_public_bindings.py index 1afd5ac8ad..3b5f8000c9 100644 --- a/test/npu/test_public_bindings.py +++ b/test/npu/test_public_bindings.py @@ -42,6 +42,13 @@ tempFilter = { "torch_npu.npu_masked_softmax_with_rel_pos_bias", "torch_npu.npu_moe_gating_top_k_softmax", "torch_npu.npu_moe_init_routing", + "torch_npu.npu_ifmr", + "torch_npu.npu_masked_fill_range", + "torch_npu.npu_normalize_batch", + "torch_npu.npu_rotated_box_decode", + "torch_npu.npu_rotated_box_encode", + "torch_npu.npu_scatter", + "torch_npu.npu_stride_add", "torch_npu.utils.collect_env.main", "torch_npu.utils.collect_env.namedtuple", "torch_npu.one_", diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index d2b4ba302f..f168547aa8 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2744,9 +2744,6 @@ "torch_npu.npu_gru": { "signature": "(inputs, hx, weight_input, weight_hidden, bias_input, bias_hidden, seq_length, has_biases, num_layers, dropout, train, bidirectional, batch_first)" }, - "torch_npu.npu_ifmr": { - "signature": "(data, data_min, data_max, cumsum, min_percentile, max_percentile, search_start, search_end, search_step, with_offset)" - }, "torch_npu.npu_incre_flash_attention": { "signature": "(self, query, key, value, padding_mask, atten_mask, pse_shift, actual_seq_lengths, antiquant_scale, antiquant_offset, block_table, num_heads, scale_value, input_layout, num_key_value_heads, block_size, inner_precise)" }, @@ -2765,9 +2762,6 @@ "torch_npu.npu_lstm": { "signature": "(inputs, 
weight, bias, seqMask, h, c, has_biases, num_layers, dropout, train, bidirectional, batch_first, flagSeq, direction)" }, - "torch_npu.npu_masked_fill_range": { - "signature": "(self, start, end, value, axis=-1)" - }, "torch_npu.npu_max": { "signature": "(self, dim, keepdim=False)" }, @@ -2798,9 +2792,6 @@ "torch_npu.npu_nms_with_mask": { "signature": "(inputs, iou_threshold)" }, - "torch_npu.npu_normalize_batch": { - "signature": "(self, seq_len, normalize_type=0)" - }, "torch_npu.npu_one_hot": { "signature": "(self, num_classes=-1, depth=1, on_value=1, off_value=0)" }, @@ -2840,12 +2831,6 @@ "torch_npu.npu_rotary_mul": { "signature": "(x, r1, r2)" }, - "torch_npu.npu_rotated_box_decode": { - "signature": "(self, deltas, weight)" - }, - "torch_npu.npu_rotated_box_encode": { - "signature": "(self, gt_bboxes, weight)" - }, "torch_npu.npu_rotated_iou": { "signature": "(self, query_boxes, trans=False, mode=0, is_cross=True, v_threshold=0.0, e_threshold=0.0)" }, @@ -2855,9 +2840,6 @@ "torch_npu.npu_scaled_masked_softmax": { "signature": "(x, mask, scale=1, fixed_triu_mask=False)" }, - "torch_npu.npu_scatter": { - "signature": "(self, indices, updates, dim)" - }, "torch_npu.npu_scatter_nd_update": { "signature": "(self, indices, updates)" }, @@ -2882,9 +2864,6 @@ "torch_npu.npu_sort_v2": { "signature": "(self, dim=-1, descending=False, out=None)" }, - "torch_npu.npu_stride_add": { - "signature": "(self, other, offset1, offset2, c1_len)" - }, "torch_npu.npu_stride_copy": { "signature": "(self, shape, stride, storage_offset, out=None)" }, -- Gitee From 033dd355b721d138d942ea12b3da45d318e59eaa Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 23 Sep 2024 13:43:44 +0000 Subject: [PATCH 12/96] !14849 Update op_plugin commit id Merge pull request !14849 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5952cf37f3..11287f9900 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5952cf37f3c76ec37105ffe9dd01003101a1437c +Subproject commit 11287f9900795dafab3d5fdce68cc6bc062f2e92 -- Gitee From 7c07e10a5ba22d026f81cf202e43765fa57e6634 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Mon, 23 Sep 2024 13:48:25 +0000 Subject: [PATCH 13/96] !14826 set default value disable for ALLOW_INTERNAL_FORMAT Merge pull request !14826 from huangyunlong/2.1rc3f --- torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 04176f3b58..679b2a262a 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -99,6 +99,17 @@ void GetAndSetDefaultJitCompileByAcl() ASCEND_LOGI("Get ACL JitCompile default value %s and set", value_str.c_str()); } +void SetDefaultAllowInternalFromatDisable() +{ + auto allow_internal_format = c10_npu::option::GetOption("ALLOW_INTERNAL_FORMAT"); + if (allow_internal_format.has_value() && allow_internal_format.value() != "") { + return; + } + + c10_npu::option::SetOption("ALLOW_INTERNAL_FORMAT", "disable"); + ASCEND_LOGI("Set ALLOW_INTERNAL_FORMAT default value disable."); +} + void SetHF32DefaultValue() { // The default value of the flag used to control whether HF32 is allowed on conv is True. 
// The default value of the flag used to control whether HF32 is allowed on matmul is True, @@ -238,6 +249,10 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) MakeCompileCacheDirAndSetOption(); // set default jit_Compile value from Get acl defalut value GetAndSetDefaultJitCompileByAcl(); + // set default allow_internal_format value + if (c10_npu::GetSocVersion() >= c10_npu::SocVersion::Ascend910_9391) { + SetDefaultAllowInternalFromatDisable(); + } SetHF32DefaultValue(); -- Gitee From 02bcb117b5e3688a0e40d891b1d59e5ec0ddd11f Mon Sep 17 00:00:00 2001 From: sunjiayang Date: Mon, 23 Sep 2024 14:28:56 +0000 Subject: [PATCH 14/96] !14762 mem uce bug fix Merge pull request !14762 from sunjiayang/mem_uce_210_rc3 --- .../csrc/core/npu/NPUCachingAllocator.cpp | 35 ++++++++++++------- torch_npu/csrc/core/npu/NPUException.cpp | 19 ++++------ torch_npu/csrc/core/npu/NPUException.h | 21 ++++++++--- torch_npu/csrc/npu/Module.cpp | 4 +-- 4 files changed, 47 insertions(+), 32 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 1486954eed..8be7af41c0 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -866,27 +866,36 @@ class DeviceCachingAllocator { bool checkUceInMemPool() { auto memUceInfo_ = c10_npu::get_mem_uce_info(); - auto info = memUceInfo_.info.data(); + auto info = memUceInfo_.info; const auto all_blocks = get_all_blocks(); for (int i = 0; i < memUceInfo_.retSize; ++i) { - size_t length = info[i].len; void* addr = info[i].addr; - for (int j = 0; j < length; ++j) { - bool found = false; - for (const Block* const head_block : all_blocks) { - if (head_block->ptr <= addr && addr < head_block->ptr + head_block->size) { - const_cast(head_block)->is_safe = false; + size_t length = info[i].len; + + // Calculate the start and end address for info[i] + void* addr_end = static_cast(addr) + length - 1; + + bool found = false; + + // Iterate through all blocks and check if there's an overlap with addr + for (const Block* const head_block : all_blocks) { + void* block_start = head_block->ptr; + void* block_end = static_cast(head_block->ptr) + head_block->size - 1; + + // If there is an overlap, mark the block as unsafe + if (addr <= block_end && addr_end >= block_start) { + const_cast(head_block)->is_safe = false; + found = true; + // Set the unsafe flag only once + if (c10_npu::get_npu_data_unsafe_flag() == false) { c10_npu::set_npu_data_unsafe_flag(true); - found = true; - break; } } + } - if (!found) { - return false; - } - addr += 1; + if (!found) { + return false; } } return true; diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index d0f9dbe48c..4cc680261b 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -97,10 +97,7 @@ MemUceInfo get_mem_uce_info() void clear_mem_uce_info() { std::lock_guard lock(memUceInfoMutex); - memUceInfo.device = 0; - memUceInfo.info.clear(); - memUceInfo.retSize = 0; - memUceInfo.mem_type = 0; + memUceInfo.clear(); } const char *c10_npu_get_error_message() @@ -116,17 +113,13 @@ bool checkUceErrAndRepair() TORCH_CHECK(false, "ERROR happend in GetDevice.", PTA_ERROR(ErrCode::ACL)) } - aclrtMemUceInfo info[MAX_MEM_UCE_INFO_ARRAY_SIZE]; - size_t retSize = 0; + MemUceInfo memUceInfo_; + memUceInfo_.device = device; - err = c10_npu::acl::AclrtGetMemUceInfo(device, info, sizeof(info) / sizeof(aclrtMemUceInfo), &retSize); + err = 
c10_npu::acl::AclrtGetMemUceInfo(device, memUceInfo_.info, sizeof(memUceInfo_.info) / sizeof(aclrtMemUceInfo), &memUceInfo_.retSize);
    if (err == ACL_ERROR_NONE) {
        if (memUceInfo_.retSize > 0) {
            ASCEND_LOGE("AclrtGetMemUceInfo get UCE ERROR, retSize is %d", memUceInfo_.retSize);
            set_mem_uce_info(memUceInfo_);
            return true;
        } else {
diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h
index bd2f49c5d6..eb4620a13c 100644
--- a/torch_npu/csrc/core/npu/NPUException.h
+++ b/torch_npu/csrc/core/npu/NPUException.h
@@ -233,10 +233,23 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args)
 namespace c10_npu {
 
 struct MemUceInfo {
-    int device = 0;
-    std::vector<aclrtMemUceInfo> info;
-    size_t retSize = 0;
-    int mem_type = 0;
+    int device;
+    aclrtMemUceInfo info[MAX_MEM_UCE_INFO_ARRAY_SIZE];
+    size_t retSize;
+    int mem_type;
+
+    MemUceInfo() : device(-1), retSize(0), mem_type(0)
+    {
+        std::memset(info, 0, sizeof(info));
+    }
+
+    void clear()
+    {
+        device = -1;
+        std::memset(info, 0, sizeof(info));
+        retSize = 0;
+        mem_type = 0;
+    }
 };
 
 C10_NPU_API const char *c10_npu_get_error_message();
diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp
index ef82be1221..d630aa10d8 100644
--- a/torch_npu/csrc/npu/Module.cpp
+++ b/torch_npu/csrc/npu/Module.cpp
@@ -375,8 +375,8 @@ PyObject* THNPModule_restart_device_wrap(PyObject* self, PyObject* arg)
     HANDLE_TH_ERRORS
     int device = THPUtils_unpackLong(arg);
     auto memUceInfo_ = c10_npu::get_mem_uce_info();
-    if (memUceInfo_.retSize > 0 && memUceInfo_.mem_type == 3) {
-        NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::acl::AclrtMemUceRepair(memUceInfo_.device, memUceInfo_.info.data(), memUceInfo_.retSize));
+    if (memUceInfo_.retSize > 0 && memUceInfo_.mem_type == 3) {
+        NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::acl::AclrtMemUceRepair(memUceInfo_.device, memUceInfo_.info, memUceInfo_.retSize));
     }
 
     c10_npu::clear_mem_uce_info();
-- Gitee

From 3881ab8ea0aaed6981dd0475c3a5fed44f5970cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=8F=B6=E5=AD=90=E5=87=A1?=
Date: Tue, 24 Sep 2024 03:44:29 +0000
Subject: [PATCH 15/96] =?UTF-8?q?!14741=20Add=20FA=20Flop=20Count=20Merge?=
 =?UTF-8?q?=20pull=20request=20!14741=20from=20=E5=8F=B6=E5=AD=90=E5=87=A1?=
 =?UTF-8?q?/v2.1.0=5FFLOPS?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 torch_npu/csrc/flopcount/FlopCounter.cpp | 226 +++++++++++++++++++++++
 torch_npu/csrc/flopcount/FlopCounter.h   |   7 +
 2 files changed, 233 insertions(+)

diff --git a/torch_npu/csrc/flopcount/FlopCounter.cpp b/torch_npu/csrc/flopcount/FlopCounter.cpp
index 4f8d53248e..3fa7feaa88 100644
--- a/torch_npu/csrc/flopcount/FlopCounter.cpp
+++ b/torch_npu/csrc/flopcount/FlopCounter.cpp
@@ -129,3 +129,229 @@ int64_t FlopCounter::conv_backward_flop(const at::Tensor &grad_output, const at:
 
     return flop_count;
 }
+
+std::vector<std::tuple<std::vector<int64_t>, std::vector<int64_t>, std::vector<int64_t>, std::vector<int64_t>>> _unpack_flash_attention_nested_shapes(std::vector<int64_t> query,
+    std::vector<int64_t> key, std::vector<int64_t> value, int64_t head_num, std::vector<int64_t> grad_out,
+    c10::ArrayRef<int64_t> cum_seq_q, c10::ArrayRef<int64_t> cum_seq_k, std::string input_layer_str)
+{
+    // Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
+    // GQA and MQA and TND
+
+    // for GQA and MQA, the dim 2 or 3 of kv should equal to q
+    // for general, shape should view to [B, N, S, D]
+    TORCH_CHECK(head_num != 0, "Divisor head_num may be 0, please check it.")
+    std::vector<std::tuple<std::vector<int64_t>, std::vector<int64_t>, std::vector<int64_t>, std::vector<int64_t>>> result;
+    int64_t q_1 = query[1];
+    int64_t q_2 = query[2];
+    int64_t k_1 = key[1];
+    int64_t k_2 = key[2];
+    int64_t v_1 = value[1];
+    int64_t v_2 = value[2];
+
+    // for GQA and MQA
+    if (input_layer_str == "SBH" || input_layer_str == "BSH" || input_layer_str == "BSND") {
+        if (q_2 != k_2 && q_2 != v_2) {
+            k_2 = q_2;
+            v_2 = q_2;
+        }
+    } else {
+        if (q_1 != k_1 && q_1 != v_1) {
+            k_1 = q_1;
+            v_1 = q_1;
+        }
+    }
+
+    if (input_layer_str == "BSH") {
+        std::vector<int64_t> new_query_shape = {query[0], head_num, q_1, q_2/head_num};
+        std::vector<int64_t> new_key_shape = {key[0], head_num, k_1, k_2/head_num};
+        std::vector<int64_t> new_value_shape = {value[0], head_num, v_1, v_2/head_num};
+        std::vector<int64_t> new_grad_out_shape;
+        if (!grad_out.empty()) {
+            new_grad_out_shape = new_query_shape;
+        }
+        result.emplace_back(new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape);
+    } else if (input_layer_str == "SBH") {
+        std::vector<int64_t> new_query_shape = {q_1, head_num, query[0], q_2/head_num};
+        std::vector<int64_t> new_key_shape = {k_1, head_num, key[0], k_2/head_num};
+        std::vector<int64_t> new_value_shape = {v_1, head_num, value[0], v_2/head_num};
+        std::vector<int64_t> new_grad_out_shape;
+        if (!grad_out.empty()) {
+            new_grad_out_shape = new_query_shape;
+        }
+        result.emplace_back(new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape);
+    } else if (input_layer_str == "BNSD") {
+        std::vector<int64_t> new_grad_out_shape;
+        if (!grad_out.empty()) {
+            new_grad_out_shape = query;
+        }
+        result.emplace_back(query, key, value, new_grad_out_shape);
+    } else if (input_layer_str == "BSND") {
+        std::vector<int64_t> new_query_shape = {query[0], q_2, q_1, query[3]};
+        std::vector<int64_t> new_key_shape = {key[0], k_2, k_1, key[3]};
+        std::vector<int64_t> new_value_shape = {value[0], v_2, v_1, value[3]};
+        std::vector<int64_t> new_grad_out_shape;
+        if (!grad_out.empty()) {
+            new_grad_out_shape = new_query_shape;
+        }
+        result.emplace_back(new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape);
+    } else if (input_layer_str == "TND") {
+        TORCH_CHECK(!cum_seq_q.empty(), "The actual_seq_qlen should not be empty when TND");
+        TORCH_CHECK(!cum_seq_k.empty(), "The actual_seq_kvlen should not be empty when TND");
+        TORCH_CHECK(cum_seq_q.size() == cum_seq_k.size(), "The size of actual_seq_qlen should be equal to actual_seq_kvlen when TND");
+
+        int64_t b = cum_seq_q.size();
+        TORCH_CHECK(b != 0, "Divisor b may be 0, please check it.")
+        std::vector<int64_t> new_query_shape = {b, q_1, query[0]/b, q_2};
+        std::vector<int64_t> new_key_shape = {b, k_1, key[0]/b, k_2};
+        std::vector<int64_t> new_value_shape = {b, v_1, value[0]/b, v_2};
+        std::vector<int64_t> new_grad_out_shape;
+        if (!grad_out.empty()) {
+            new_grad_out_shape = new_query_shape;
+        }
+        result.emplace_back(new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape);
+    }
+
+    return result;
+}
+
+int64_t sdpa_flop_count(const std::vector<int64_t> query_shape, const std::vector<int64_t> key_shape, const std::vector<int64_t> value_shape)
+{
+    int64_t b, h, s_q, d_q;
+    int64_t _b2, _h2, s_k, _d2;
+    int64_t _b3, _h3, _s3, d_v;
+
+    b = query_shape[0];
+    h = query_shape[1];
+    s_q = query_shape[2];
+    d_q = query_shape[3];
+
+    _b2 = key_shape[0];
+    _h2 = key_shape[1];
+    s_k = key_shape[2];
+    _d2 = key_shape[3];
+
+    _b3 = value_shape[0];
+    _h3 = value_shape[1];
+    _s3 = value_shape[2];
+    d_v = value_shape[3];
+
+    TORCH_CHECK(b == _b2 && b == _b3, "the dim of 0 is not equal between q and kv");
+    TORCH_CHECK(h == _h2 && h == _h3, "the dim of 1 is not equal between q and kv");
+    TORCH_CHECK(s_k == _s3, "the dim of 2 is not equal between k and v");
+    TORCH_CHECK(d_q == _d2, "the dim of 3 is not equal between q and k");
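+
+    // Added annotation: FlopCounter::bmm_flop counts a batched matmul [B, M, K] x [B, K, N]
+    // as 2*B*M*K*N FLOPs (one multiply and one add per multiply-accumulate), so the two
+    // calls below charge 2*b*h*s_q*d_q*s_k for scores = q @ k^T and 2*b*h*s_q*s_k*d_v
+    // for out = scores @ v.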
+
+    int64_t total_flops = 0;
+
+    // q: [b, h, s_q, d_q] @ k: [b, h, d_q, s_k] -> scores: [b, h, s_q, s_k]
+    const at::Tensor shape1 = at::empty({b * h, s_q, d_q}, at::kFloat);
+    const at::Tensor shape2 = at::empty({b * h, d_q, s_k}, at::kFloat);
+    total_flops += FlopCounter::bmm_flop(shape1, shape2);
+
+    // scores: [b, h, s_q, s_k] @ v: [b, h, s_k, d_v] -> out: [b, h, s_q, d_v]
+    const at::Tensor shape3 = at::empty({b * h, s_q, s_k}, at::kFloat);
+    const at::Tensor shape4 = at::empty({b * h, s_k, d_v}, at::kFloat);
+    total_flops += FlopCounter::bmm_flop(shape3, shape4);
+
+    return total_flops;
+}
+
+int64_t sdpa_backward_flop_count(const std::vector<int64_t> query_shape, const std::vector<int64_t> key_shape, const std::vector<int64_t> value_shape, const std::vector<int64_t> grad_out_shape)
+{
+    int64_t b, h, s_q, d_q;
+    int64_t _b2, _h2, s_k, _d2;
+    int64_t _b3, _h3, _s3, d_v;
+    int64_t _b4, _h4, _s4, d_4;
+
+    b = query_shape[0];
+    h = query_shape[1];
+    s_q = query_shape[2];
+    d_q = query_shape[3];
+
+    _b2 = key_shape[0];
+    _h2 = key_shape[1];
+    s_k = key_shape[2];
+    _d2 = key_shape[3];
+
+    _b3 = value_shape[0];
+    _h3 = value_shape[1];
+    _s3 = value_shape[2];
+    d_v = value_shape[3];
+
+    _b4 = grad_out_shape[0];
+    _h4 = grad_out_shape[1];
+    _s4 = grad_out_shape[2];
+    d_4 = grad_out_shape[3];
+
+    TORCH_CHECK(b == _b2 && b == _b3 && b == _b4, "the dim of 0 is not equal between qkv and grad");
+    TORCH_CHECK(h == _h2 && h == _h3 && h == _h4, "the dim of 1 is not equal between qkv and grad");
+    TORCH_CHECK(s_k == _s3, "the dim of 2 is not equal between k and v");
+    TORCH_CHECK(s_q == _s4, "the dim of 2 is not equal between q and grad");
+    TORCH_CHECK(d_q == _d2, "the dim of 3 is not equal between q and k");
+    TORCH_CHECK(d_v == d_4, "the dim of 3 is not equal between v and grad");
+
+    int64_t total_flops = 0;
+
+    // gradOut: [b, h, s_q, d_v] @ v: [b, h, d_v, s_k] -> gradScores: [b, h, s_q, s_k]
+    const at::Tensor shape1 = at::empty({b * h, s_q, d_v}, at::kFloat);
+    const at::Tensor shape2 = at::empty({b * h, d_v, s_k}, at::kFloat);
+    total_flops += FlopCounter::bmm_flop(shape1, shape2);
+
+    // scores: [b, h, s_k, s_q] @ gradOut: [b, h, s_q, d_v] -> gradV: [b, h, s_k, d_v]
+    const at::Tensor shape3 = at::empty({b * h, s_k, s_q}, at::kFloat);
+    const at::Tensor shape4 = at::empty({b * h, s_q, d_v}, at::kFloat);
+    total_flops += FlopCounter::bmm_flop(shape3, shape4);
+
+    // gradScores: [b, h, s_q, s_k] @ k: [b, h, s_k, d_q] -> gradQ: [b, h, s_q, d_q]
+    const at::Tensor shape5 = at::empty({b * h, s_q, s_k}, at::kFloat);
+    const at::Tensor shape6 = at::empty({b * h, s_k, d_q}, at::kFloat);
+    total_flops += FlopCounter::bmm_flop(shape5, shape6);
+
+    // q: [b, h, d_q, s_q] @ gradScores: [b, h, s_q, s_k] -> gradK: [b, h, d_q, s_k]
+    const at::Tensor shape7 = at::empty({b * h, d_q, s_q}, at::kFloat);
+    const at::Tensor shape8 = at::empty({b * h, s_q, s_k}, at::kFloat);
+    total_flops += FlopCounter::bmm_flop(shape7, shape8);
+
+    return total_flops;
+}
+
+int64_t FlopCounter::flash_attention_forward_flop(
+    const at::Tensor &query, const at::Tensor &key, const at::Tensor &value, int64_t head_num,
+    const std::string &input_layout, const c10::OptionalIntArrayRef &actual_seq_qlen,
+    const c10::OptionalIntArrayRef &actual_seq_kvlen)
+{
+    std::vector<int64_t> grad_out_shape;
+    std::vector<int64_t> query_shape(query.sizes().begin(), query.sizes().end());
+    std::vector<int64_t> key_shape(key.sizes().begin(), key.sizes().end());
+    std::vector<int64_t> value_shape(value.sizes().begin(), value.sizes().end());
+    auto ac_seq_qlen_tmp = actual_seq_qlen.value_or(c10::ArrayRef<int64_t>{});
+    auto ac_seq_kvlen_tmp = actual_seq_kvlen.value_or(c10::ArrayRef<int64_t>{});
+
+    auto sizes = _unpack_flash_attention_nested_shapes(query_shape, key_shape, value_shape, head_num, grad_out_shape, ac_seq_qlen_tmp, ac_seq_kvlen_tmp, input_layout);
+
+    int64_t total_flops = 0;
+    for (const auto& [query_shape_new, key_shape_new, value_shape_new, _] : sizes) {
+        total_flops += sdpa_flop_count(query_shape_new, key_shape_new, value_shape_new);
+    }
+    return total_flops;
+}
+
+int64_t FlopCounter::flash_attention_backward_flop(
+    const at::Tensor &query, const at::Tensor &key, const at::Tensor &value, const at::Tensor &dy, int64_t head_num,
+    const std::string &input_layout, const c10::OptionalIntArrayRef &actual_seq_qlen,
+    const c10::OptionalIntArrayRef &actual_seq_kvlen)
+{
+    std::vector<int64_t> dy_shape(query.sizes().begin(), query.sizes().end());
+    std::vector<int64_t> query_shape(query.sizes().begin(), query.sizes().end());
+    std::vector<int64_t> key_shape(key.sizes().begin(), key.sizes().end());
+    std::vector<int64_t> value_shape(value.sizes().begin(), value.sizes().end());
+    auto ac_seq_qlen_tmp = actual_seq_qlen.value_or(c10::ArrayRef<int64_t>{});
+    auto ac_seq_kvlen_tmp = actual_seq_kvlen.value_or(c10::ArrayRef<int64_t>{});
+
+    auto sizes = _unpack_flash_attention_nested_shapes(query_shape, key_shape, value_shape, head_num, dy_shape, ac_seq_qlen_tmp, ac_seq_kvlen_tmp, input_layout);
+
+    int64_t total_flops = 0;
+    for (const auto& [query_shape_new, key_shape_new, value_shape_new, grad_out_shape] : sizes) {
+        total_flops += sdpa_backward_flop_count(query_shape_new, key_shape_new, value_shape_new, grad_out_shape);
+    }
+    return total_flops;
+}
diff --git a/torch_npu/csrc/flopcount/FlopCounter.h b/torch_npu/csrc/flopcount/FlopCounter.h
index fdf829b5b7..43ee5fe04d 100644
--- a/torch_npu/csrc/flopcount/FlopCounter.h
+++ b/torch_npu/csrc/flopcount/FlopCounter.h
@@ -18,6 +18,13 @@ public:
     static int64_t conv_backward_flop(const at::Tensor &grad_output, const at::Tensor &input,
                                       const at::Tensor &weight, bool transposed, ::std::array output_mask,
                                       const at::Tensor &gradInput, const at::Tensor &gradeWeight);
+    static int64_t flash_attention_forward_flop(const at::Tensor &query, const at::Tensor &key, const at::Tensor &value,
+        int64_t head_num, const std::string &input_layout, const c10::OptionalIntArrayRef &actual_seq_qlen,
+        const c10::OptionalIntArrayRef &actual_seq_kvlen);
+    static int64_t flash_attention_backward_flop(const at::Tensor &query, const at::Tensor &key, const at::Tensor &value,
+        const at::Tensor &dy, int64_t head_num, const std::string &input_layout,
+        const c10::OptionalIntArrayRef &actual_seq_qlen,
+        const c10::OptionalIntArrayRef &actual_seq_kvlen);
 };
 
 #endif
-- Gitee

From 2c98088f9ca98e6059712ec9bfbf861c148307d2 Mon Sep 17 00:00:00 2001
From: pta-robot
Date: Tue, 24 Sep 2024 13:13:48 +0000
Subject: [PATCH 16/96] !14886 Update op_plugin commit id Merge pull request
 !14886 from pta-robot/v2.1.0-6.0.rc3
---
 third_party/op-plugin | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/op-plugin b/third_party/op-plugin
index 11287f9900..bba68c7744 160000
--- a/third_party/op-plugin
+++ b/third_party/op-plugin
@@ -1 +1 @@
-Subproject commit 11287f9900795dafab3d5fdce68cc6bc062f2e92
+Subproject commit bba68c77445f84c70c57aaf655ad0580ee3ee91b
-- Gitee

From 18de45d7e81ede0a9a643a3b233a9ff8330f70a8 Mon Sep
17 00:00:00 2001 From: pta-robot Date: Wed, 25 Sep 2024 02:43:48 +0000 Subject: [PATCH 17/96] !14908 Update op_plugin commit id Merge pull request !14908 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index bba68c7744..f463b37ad2 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit bba68c77445f84c70c57aaf655ad0580ee3ee91b +Subproject commit f463b37ad2d5926294f111d3b0bd689d70da1635 -- Gitee From 90ac55586e7582360510e5606bc884208f551082 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 25 Sep 2024 09:28:41 +0000 Subject: [PATCH 18/96] !14928 Update op_plugin commit id Merge pull request !14928 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index f463b37ad2..d0987b49a2 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit f463b37ad2d5926294f111d3b0bd689d70da1635 +Subproject commit d0987b49a28feb673196ac1375a14546195bf952 -- Gitee From f6c9fa2af83e477d4f195f5c5315fb83da1e2f95 Mon Sep 17 00:00:00 2001 From: Mrtutu Date: Wed, 25 Sep 2024 10:15:46 +0000 Subject: [PATCH 19/96] =?UTF-8?q?!14876=20=E3=80=90Bugfix=E3=80=91Fix=20pr?= =?UTF-8?q?ofiler=20task=5Fmanager=20sleep=20time=20on=20v2.1.0-6.0.rc3=20?= =?UTF-8?q?Merge=20pull=20request=20!14876=20from=20Mrtutu/task=5Fmgr=5Fv2?= =?UTF-8?q?.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/profiler/analysis/prof_common_func/_constant.py | 2 +- torch_npu/profiler/analysis/prof_common_func/_task_manager.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_common_func/_constant.py b/torch_npu/profiler/analysis/prof_common_func/_constant.py index a178c53070..edcfe328c0 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_constant.py +++ b/torch_npu/profiler/analysis/prof_common_func/_constant.py @@ -12,7 +12,7 @@ class Constant(object): INVALID_VALUE = -1 NULL_VALUE = 0 DEFAULT_PROCESS_NUMBER = os.cpu_count() // 2 - SLEEP_TIME = 0.5 + SLEEP_TIME = 0.1 # dir name FRAMEWORK_DIR = "FRAMEWORK" diff --git a/torch_npu/profiler/analysis/prof_common_func/_task_manager.py b/torch_npu/profiler/analysis/prof_common_func/_task_manager.py index cb800471b3..e652b996c4 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_task_manager.py +++ b/torch_npu/profiler/analysis/prof_common_func/_task_manager.py @@ -307,7 +307,7 @@ class ConcurrentTasksManager: need_exit = False break if need_exit: - time.sleep(Constant.SLEEP_TIME * 2) + time.sleep(Constant.SLEEP_TIME * 5) if all((task_info.task.is_non_blocking for task_info in self.listening_infos.values())): return True -- Gitee From c92557a83895d5aa570e8e85e18ab6f6dc7af90f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=85=E9=A3=9E=E8=A6=81?= <1332490378@qq.com> Date: Wed, 25 Sep 2024 10:16:48 +0000 Subject: [PATCH 20/96] =?UTF-8?q?!14859=20[PROF]=20fix=20mstx.range=5Fstar?= =?UTF-8?q?t=20err=20without=20input=20stream=20Merge=20pull=20request=20!?= =?UTF-8?q?14859=20from=20=E6=A2=85=E9=A3=9E=E8=A6=81/2.1=5Frc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/npu/mstx.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/torch_npu/npu/mstx.py b/torch_npu/npu/mstx.py index 
38dd465d38..2710d6aeec 100644
--- a/torch_npu/npu/mstx.py
+++ b/torch_npu/npu/mstx.py
@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import warnings

 import torch_npu._C

@@ -22,19 +23,22 @@ class mstx:
     @staticmethod
     def range_start(message: str, stream=None) -> int:
         if not message:
-            print(Warning, "Invalid message for mstx.range_start func. Please input valid message string.")
+            warnings.warn("Invalid message for mstx.range_start func. Please input valid message string.")
             return 0
-        if isinstance(stream, torch_npu.npu.streams.Stream):
-            stream = stream.npu_stream
+        if stream:
+            if isinstance(stream, torch_npu.npu.streams.Stream):
+                stream = stream.npu_stream
+                return torch_npu._C._mstx._range_start(message, stream)
+            else:
+                warnings.warn("Invalid stream for mstx.range_start func. Please input valid stream.")
+                return 0
         else:
-            print(Warning, 'Invalid type for stream argument, must be `torch_npu.npu.Stream`')
-            return 0
-        return torch_npu._C._mstx._range_start(message, stream)
+            return torch_npu._C._mstx._range_start_on_host(message)

     @staticmethod
     def range_end(range_id: int):
         if not isinstance(range_id, int):
-            print(Warning, "Invalid message for mstx.range_start func. Please input return value from mstx.range_start().")
+            warnings.warn("Invalid range_id for mstx.range_end func. Please input the return value from mstx.range_start.")
             return
         torch_npu._C._mstx._range_end(range_id)
-- Gitee
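With this change, range_start no longer rejects a missing stream: it records a host-side range instead. A minimal usage sketch (hedged: it assumes the mstx class is reachable as torch_npu.npu.mstx in a built wheel, which is how the module is normally exposed):

import torch_npu

stream = torch_npu.npu.current_stream()
rid = torch_npu.npu.mstx.range_start("forward_step", stream)  # range tied to the stream
# ... submit NPU work ...
torch_npu.npu.mstx.range_end(rid)

hid = torch_npu.npu.mstx.range_start("host_preprocessing")    # no stream: host-side range
torch_npu.npu.mstx.range_end(hid)

From 220c740152d4301be66e76fc054c37c891bcf037 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?=
Date: Wed, 25 Sep 2024 11:31:08 +0000
Subject: [PATCH 21/96] =?UTF-8?q?!14864=20Different=20device=20copying=20i?=
 =?UTF-8?q?s=20supported=20by=20delivering=20AclrtMemcpyAsync=20task=20Mer?=
 =?UTF-8?q?ge=20pull=20request=20!14864=20from=20=E9=97=AB=E9=B9=8F?=
 =?UTF-8?q?=E5=85=A8/v2.1.0-6.0.rc3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 torch_npu/csrc/aten/common/CopyKernel.cpp     | 56 ++++++++++---------
 .../csrc/aten/common/InnerNpuNativeFunction.h |  2 +
 .../csrc/aten/ops/op_api/CopyKernelOpApi.cpp  | 26 +++------
 3 files changed, 38 insertions(+), 46 deletions(-)

diff --git a/torch_npu/csrc/aten/common/CopyKernel.cpp b/torch_npu/csrc/aten/common/CopyKernel.cpp
index 803fb11e4e..1253c6b5f7 100644
--- a/torch_npu/csrc/aten/common/CopyKernel.cpp
+++ b/torch_npu/csrc/aten/common/CopyKernel.cpp
@@ -78,33 +78,6 @@ void copy_d2d_dtype_format(at::Tensor& self, const at::Tensor& src, bool non_blo
     copy_d2d_dtype_baseformat(self, src, non_blocking);
 }

-void copy_d2d(at::Tensor& self, const at::Tensor& src, bool non_blocking) {
-    c10_npu::NPUGuard guard(src.device());
-    // p2p enable and synchronize self stream
-    if (self.device().index() != src.device().index()) {
-        bool warning_flag = false;
-        bool p2p_enabled = NpuP2pCtrl::get_instance().get_p2p_access(src.device().index(), self.device().index(), warning_flag);
-        // In the same 'os', tensor can copy even if the enable fails
-        if (warning_flag) {
-            ASCEND_LOGW("p2p enable from %d to %d is fails", src.device().index(), self.device().index());
-        }
-        guard.set_device(self.device());
-        c10_npu::NPUStream dst_stream = c10_npu::getCurrentNPUStream(self.device().index());
-        NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeStreamWithTimeout(dst_stream));
-        guard.set_device(src.device());
-    }
-    if (self.dtype() != src.dtype()) {
-        custom_ops::npu_dtype_cast_(self, 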
src); // npu_dtype_cast_ will call copy function. - return; - } - copy_d2d_dtype(self, src, non_blocking); - // synchronize src stream for different devices copy - if (self.device().index() != src.device().index()) { - c10_npu::NPUStream copy_stream = c10_npu::getCurrentNPUStream(); - NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeStreamWithTimeout(copy_stream)); - } -} - // the format of dst and src is base format now // the dtype of dst and src is same // and src and dst are contiguous @@ -273,6 +246,35 @@ bool can_use_memcpy(at::Tensor& dst, const at::Tensor& src) { return false; } +void copy_d2d(at::Tensor& self, const at::Tensor& src, bool non_blocking) { + c10_npu::NPUGuard guard(src.device()); + // p2p enable and synchronize self stream + auto self_device_idx = self.device().index(); + auto src_device_idx = src.device().index(); + if (self_device_idx != src_device_idx) { + bool warning_flag = false; + NpuP2pCtrl::get_instance().get_p2p_access(src_device_idx, self_device_idx, warning_flag); + // In the same 'os', tensor can copy even if the enable fails + if (warning_flag) { + ASCEND_LOGW("p2p enable from %d to %d is fails", src_device_idx, self_device_idx); + } + guard.set_device(self.device()); + c10_npu::NPUStream dst_stream = c10_npu::getCurrentNPUStream(self_device_idx); + NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeStreamWithTimeout(dst_stream)); + guard.set_device(src.device()); + } + if (self.dtype() != src.dtype()) { + custom_ops::npu_dtype_cast_(self, src); // npu_dtype_cast_ will call copy function. + return; + } + copy_d2d_dtype(self, src, non_blocking); + // synchronize src stream for different devices copy + if (self_device_idx != src_device_idx) { + c10_npu::NPUStream copy_stream = c10_npu::getCurrentNPUStream(); + NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeStreamWithTimeout(copy_stream)); + } +} + at::Tensor copy_d2d_format_cast(at::Tensor& dst, const at::Tensor& src) { string srcFormat = FormatHelper::GetFormatName(src); diff --git a/torch_npu/csrc/aten/common/InnerNpuNativeFunction.h b/torch_npu/csrc/aten/common/InnerNpuNativeFunction.h index b3e8b21023..7a2173c755 100644 --- a/torch_npu/csrc/aten/common/InnerNpuNativeFunction.h +++ b/torch_npu/csrc/aten/common/InnerNpuNativeFunction.h @@ -7,6 +7,8 @@ namespace at_npu { namespace native { bool can_use_memcpy(at::Tensor& dst, const at::Tensor& src); +// Supports cross-chip copying of different devices +void copy_d2d(at::Tensor& self, const at::Tensor& src, bool non_blocking); void copy_d2d_by_memcpy(at::Tensor& dst, const at::Tensor& src, int64_t exceptSize = 0); void copy_d2d_dtype(at::Tensor& self, const at::Tensor& src, bool non_blocking); void copy_d2d_dtype_baseformat(at::Tensor& self, const at::Tensor& src, bool non_blocking); diff --git a/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp b/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp index 7baad2af45..5c71b8fa26 100644 --- a/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp +++ b/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp @@ -154,28 +154,16 @@ void copy_d2h_baseformat_opapi(at::Tensor& dst, const at::Tensor& src, bool non_ // the format of dst and src is baseformat now, copy d2d void copy_d2d_baseformat_opapi(at::Tensor& dst, const at::Tensor& src, bool non_blocking) { - c10_npu::NPUGuard guard(src.device()); if (dst.device().index() != src.device().index()) { - bool warning_flag = false; - bool p2p_enabled = NpuP2pCtrl::get_instance().get_p2p_access(src.device().index(), dst.device().index(), warning_flag); - // In the same 'os', tensor 
can copy even if the enable fails - if (warning_flag) { - ASCEND_LOGW("p2p enable from %d to %d is fails", src.device().index(), dst.device().index()); - } - guard.set_device(dst.device()); - c10_npu::NPUStream dst_stream = c10_npu::getCurrentNPUStream(dst.device().index()); - NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeStreamWithTimeout(dst_stream)); - guard.set_device(src.device()); - } else { - c10::SmallVector inputs = {src}; - c10::SmallVector outputs = {dst}; - CalcuOpUtil::CheckMemoryOverLaps(inputs, outputs); + return copy_d2d(dst, src, non_blocking); } + + c10_npu::NPUGuard guard(src.device()); + c10::SmallVector inputs = {src}; + c10::SmallVector outputs = {dst}; + CalcuOpUtil::CheckMemoryOverLaps(inputs, outputs); + EXEC_NPU_CMD(aclnnInplaceCopy, dst, src); - if (dst.device().index() != src.device().index()) { - c10_npu::NPUStream copy_stream = c10_npu::getCurrentNPUStream(); - NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeStreamWithTimeout(copy_stream)); - } } -- Gitee From 3d6e60f788ee54ca6cd8d0fa5c5616e8240346a1 Mon Sep 17 00:00:00 2001 From: zhangyuan Date: Wed, 25 Sep 2024 13:58:31 +0000 Subject: [PATCH 22/96] !14909 Update torchair commit id Merge pull request !14909 from zhangyuan/v2.1.0-6.0.rc3 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 485484ca71..5c269fba4c 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 485484ca7143cdf47415793ca76db9210cff8a4c +Subproject commit 5c269fba4c1ea53ef7e3812876756a36c1caf45c -- Gitee From 21ff9f2ed40d143c721b6bbd6c2c902d5e71046e Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 25 Sep 2024 13:58:44 +0000 Subject: [PATCH 23/96] !14945 Update op_plugin commit id Merge pull request !14945 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index d0987b49a2..891c8d10ec 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit d0987b49a28feb673196ac1375a14546195bf952 +Subproject commit 891c8d10ec3f5ea468ae9d94d6b7f0b4526a2190 -- Gitee From ea054a360bf590763c8fb00b19e923c3ced47f59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=9C=E9=87=91=E8=88=AA?= <15990042527@163.com> Date: Thu, 26 Sep 2024 02:38:15 +0000 Subject: [PATCH 24/96] =?UTF-8?q?!14934=20Fix=20lowercase=20issues=20Merge?= =?UTF-8?q?=20pull=20request=20!14934=20from=20=E6=9D=9C=E9=87=91=E8=88=AA?= =?UTF-8?q?/cherry-pick-1727254577?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 8be7af41c0..aae2ed8d01 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -729,7 +729,7 @@ size_t CachingAllocatorConfig::parseExpandableSegments( NPU_CHECK_ERROR(c10_npu::acl::AclrtReleaseMemAddress(ptr)); } else { NPU_CHECK_SUPPORTED_OR_ERROR(status, "aclrtReserveMemAddress"); - TORCH_NPU_WARN_ONCE("expandable_segments setting failure, now change to expandable_segments = false."); + TORCH_NPU_WARN_ONCE("expandable_segments setting failure, now change to `False`."); m_expandable_segments = false; } } @@ -773,12 +773,12 @@ void 
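The expandable_segments option amended above is one of several knobs parsed from the allocator configuration string, and it is mutually exclusive with max_split_size_mb and garbage_collection_threshold. A hedged usage sketch (assuming the config is supplied through the PYTORCH_NPU_ALLOC_CONF environment variable, the NPU counterpart of PYTORCH_CUDA_ALLOC_CONF):

import os

# Must be set before torch_npu initializes its caching allocator.
# Combining expandable_segments with max_split_size_mb or
# garbage_collection_threshold is rejected by parseArgs below.
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "expandable_segments:True"

import torch_npu  # noqa: E402  (import after setting the env var)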
CachingAllocatorConfig::parseArgs(const char* env) { if (set_expandable_segments_flag) { TORCH_CHECK(m_max_split_size == std::numeric_limits::max() && m_garbage_collection_threshold == 0, "`max_split_size_mb` or `garbage_collection_threshold`, cannot be enabled with " - "`expandable_segments`, please set `expandable_segments` to `false`.", + "`expandable_segments`, please set `expandable_segments` to `False`.", OPS_ERROR(ErrCode::PARAM)); } else if (m_max_split_size != std::numeric_limits::max() || m_garbage_collection_threshold != 0) { m_expandable_segments = false; TORCH_NPU_WARN_ONCE("`max_split_size_mb` or `garbage_collection_threshold` is enabled, and the " - "`expandable_segments` is changed to `false` by default."); + "`expandable_segments` is changed to `False` by default."); } } } -- Gitee From a96563c3ba252e7d5bbf462548a32eee9655d41b Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 26 Sep 2024 16:13:50 +0000 Subject: [PATCH 25/96] !14988 Update op_plugin commit id Merge pull request !14988 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 891c8d10ec..69ff4ba2e9 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 891c8d10ec3f5ea468ae9d94d6b7f0b4526a2190 +Subproject commit 69ff4ba2e9d797462846ba0004e9584ddd7f2fb5 -- Gitee From 100bc9e19a368aad7e72e871c86d12256414aa6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=96=9B=E9=B9=8F?= Date: Fri, 27 Sep 2024 09:21:46 +0000 Subject: [PATCH 26/96] =?UTF-8?q?!14991=20update=20torchair=20commit=20id?= =?UTF-8?q?=20Merge=20pull=20request=20!14991=20from=20=E8=96=9B=E9=B9=8F/?= =?UTF-8?q?v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 5c269fba4c..731150ebc5 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 5c269fba4c1ea53ef7e3812876756a36c1caf45c +Subproject commit 731150ebc561308c44fc7f699c7227551fc470b4 -- Gitee From 492910ad89071aea0fcca393cb327d98b0c52edd Mon Sep 17 00:00:00 2001 From: tangmengcheng Date: Fri, 27 Sep 2024 09:32:05 +0000 Subject: [PATCH 27/96] !15002 v2.1.0-6.0-rc3-bugfix Merge pull request !15002 from tangmengcheng/v2.1.0-6.0-rc3-buf-fix --- .../analysis/prof_common_func/_task_manager.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_common_func/_task_manager.py b/torch_npu/profiler/analysis/prof_common_func/_task_manager.py index e652b996c4..7de29238af 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_task_manager.py +++ b/torch_npu/profiler/analysis/prof_common_func/_task_manager.py @@ -6,6 +6,7 @@ import threading import multiprocessing import fcntl import pickle +import signal from enum import Enum from abc import ABC, abstractmethod from torch_npu.utils._error_code import ErrCode, prof_error @@ -137,6 +138,7 @@ class ConcurrentTasksManager: def run(self): try: + signal.signal(signal.SIGINT, self.finalize) if self.progress_bar: self.__start_print_progress_bar() @@ -149,13 +151,16 @@ class ConcurrentTasksManager: except Exception as e: print_error_msg(f"An error occurred: {e}") finally: - for task_info in self.task_infos.values(): - if task_info.status != TaskStatus.Succeed: - 
print_error_msg("Task %s has not run successfully." % task_info.task.name) - self.__stop_task(task_info) + self.finalize() - if self.progress_bar: - self.__stop_print_progress_bar() + def finalize(self): + for task_info in self.task_infos.values(): + if task_info.status != TaskStatus.Succeed: + print_error_msg("Task %s has not run successfully." % task_info.task.name) + self.__stop_task(task_info) + + if self.progress_bar: + self.__stop_print_progress_bar() def clear(self): for task_info in self.listening_infos.values(): -- Gitee From 752b29437638981d193dd459a53c4798b4ac3d35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Fri, 27 Sep 2024 09:53:56 +0000 Subject: [PATCH 28/96] =?UTF-8?q?!14924=20hcclAlltoAll=20put=20into=20task?= =?UTF-8?q?queue=20Merge=20pull=20request=20!14924=20from=20=E7=8E=8B?= =?UTF-8?q?=E8=B6=85/v2.1.0-6.0.rc3=5Ffix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../csrc/distributed/ProcessGroupHCCL.cpp | 47 +++++++++++++------ 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 61c018dda3..1ce574c3e1 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -2834,20 +2834,39 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall_base( HcclComm comm, c10_npu::NPUStream& stream, std::shared_ptr is_dispatched) { RECORD_FUNCTION("HcclAlltoAll", std::vector({input})); - torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAlltoAll", input_counts, getHcclDataType(input.scalar_type()), comm), - stream.stream(false)); - auto hccl_result = hcclAlltoAll( - input.data_ptr(), - input_counts, - getHcclDataType(input.scalar_type()), - output.data_ptr(), - output_counts, - getHcclDataType(output.scalar_type()), - comm, - stream.stream()); - *is_dispatched = true; - return hccl_result; + auto inputDataPtr = input.data_ptr(); + auto outputDataPtr = output.data_ptr(); + auto inputhcclDataType = getHcclDataType(input.scalar_type()); + auto outputhcclDataType = getHcclDataType(output.scalar_type()); + auto hccl_call = [inputDataPtr, + input_counts, + inputhcclDataType, + outputDataPtr, + output_counts, + outputhcclDataType, + comm, + stream, + is_dispatched]() -> int { + torch_npu::profiler::MstxRange range( + getMstxHcclMsg("HcclAlltoAll", input_counts, inputhcclDataType, comm), + stream.stream(false)); + auto hccl_result = hcclAlltoAll( + inputDataPtr, + input_counts, + inputhcclDataType, + outputDataPtr, + output_counts, + outputhcclDataType, + comm, + stream.stream(false)); + *is_dispatched = true; + return hccl_result; + }; + at_npu::native::OpCommand cmd; + cmd.Name("HcclAlltoAll"); + cmd.SetCustomHandler(hccl_call); + cmd.Run(); + return HCCL_SUCCESS; }, [&](std::vector&, c10::intrusive_ptr&) {}, [&](std::vector& hcclStreams, c10::intrusive_ptr& work) { -- Gitee From fc645f197592932b244e44bc39ab159ad04c8c8e Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 27 Sep 2024 10:58:47 +0000 Subject: [PATCH 29/96] !15029 Update op_plugin commit id Merge pull request !15029 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 69ff4ba2e9..1e6850a170 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 69ff4ba2e9d797462846ba0004e9584ddd7f2fb5 +Subproject commit 
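The rewrite above defers HcclAlltoAll through OpCommand::SetCustomHandler, so the collective is issued from the task queue rather than the calling thread. From Python this path is exercised by the standard collective API; a hedged sketch (assuming an initialized "hccl" process group with one NPU per rank):

import torch
import torch.distributed as dist
import torch_npu  # registers the NPU/HCCL backend

# dist.init_process_group("hccl", ...) is assumed to have run already.
world_size = dist.get_world_size()
x = torch.arange(4 * world_size, dtype=torch.float32).npu()
out = torch.empty_like(x)
dist.all_to_all_single(out, x)  # lowers to alltoall_base / HcclAlltoAll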
1e6850a1703e3d5cd89b0e5903d633de2ba388ff -- Gitee From 1678984808321762371f1a969c2b8cb25eea0f36 Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Fri, 27 Sep 2024 12:36:24 +0000 Subject: [PATCH 30/96] !14938 egg_info deprecation Merge pull request !14938 from dilililiwhy/cherry-pick-1727265326 --- setup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.py b/setup.py index cfed83ba12..6e44f86033 100644 --- a/setup.py +++ b/setup.py @@ -465,7 +465,6 @@ class BdistWheelBuild(bdist_wheel): dependencies = torch_dependencies + cann_dependencies + other_dependencies - self.run_command('egg_info') bdist_wheel.run(self) if is_manylinux: @@ -609,7 +608,6 @@ setup( 'build_ext': Build, 'build_py': PythonPackageBuild, 'bdist_wheel': BdistWheelBuild, - 'egg_info': EggInfoBuild, 'install': InstallCmd, 'clean': Clean }, -- Gitee From 9057ef3831bde8127721c6affe71c5b4d9134e84 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 27 Sep 2024 13:58:54 +0000 Subject: [PATCH 31/96] !15036 Update op_plugin commit id Merge pull request !15036 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 1e6850a170..e946a18555 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 1e6850a1703e3d5cd89b0e5903d633de2ba388ff +Subproject commit e946a185558e0dd0a0bc5d444f1703d4ede168dd -- Gitee From 228714b813a5a4b53e58c6b7e591b56b4541969b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8F=B6=E5=AD=90=E5=87=A1?= Date: Fri, 27 Sep 2024 18:45:22 +0000 Subject: [PATCH 32/96] =?UTF-8?q?!14954=20Fix=20codecheck:=20the=20type=20?= =?UTF-8?q?of=20variable=20is=20signed,=20while=20the=20type=20of=20value?= =?UTF-8?q?=20is=20unsigned.=20Merge=20pull=20request=20!14954=20from=20?= =?UTF-8?q?=E5=8F=B6=E5=AD=90=E5=87=A1/v2.1.0=5FFLOPS=5FRC3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/flopcount/FlopCounter.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/flopcount/FlopCounter.cpp b/torch_npu/csrc/flopcount/FlopCounter.cpp index 3fa7feaa88..0789441efd 100644 --- a/torch_npu/csrc/flopcount/FlopCounter.cpp +++ b/torch_npu/csrc/flopcount/FlopCounter.cpp @@ -199,7 +199,9 @@ std::vector, std::vector, std::vector(std::numeric_limits::max()), "cum_seq_q.size() is too large to be represented as an int64_t", OPS_ERROR(ErrCode::PARAM)); + int64_t b = static_cast(sizeValue); TORCH_CHECK(b != 0, "Divisor b may be 0, please check it.") std::vector new_query_shape = {b, q_1, query[0]/b, q_2}; std::vector new_key_shape = {b, k_1, key[0]/b, k_2}; -- Gitee From 7272f480f4b5bda68965944e5c2eb85969d7aa6c Mon Sep 17 00:00:00 2001 From: sunjiayang Date: Sat, 28 Sep 2024 06:15:10 +0000 Subject: [PATCH 33/96] !15013 stress detect in thread Merge pull request !15013 from sunjiayang/stess_926_210_rc3 --- torch_npu/csrc/InitNpuBindings.cpp | 3 + torch_npu/csrc/npu/Module.cpp | 40 +-------- torch_npu/csrc/npu/Stress_detect.cpp | 130 +++++++++++++++++++++++++++ torch_npu/csrc/npu/Stress_detect.h | 56 ++++++++++++ 4 files changed, 191 insertions(+), 38 deletions(-) create mode 100644 torch_npu/csrc/npu/Stress_detect.cpp create mode 100644 torch_npu/csrc/npu/Stress_detect.h diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp index 7401446956..938b628f15 100644 --- a/torch_npu/csrc/InitNpuBindings.cpp +++ b/torch_npu/csrc/InitNpuBindings.cpp @@ -14,6 +14,7 @@ #include 
"torch_npu/csrc/profiler/init.h" #include "torch_npu/csrc/flopcount/Init.h" #include "torch_npu/csrc/npu/Module.h" +#include "torch_npu/csrc/npu/Stress_detect.h" #include "torch_npu/csrc/utils/TensorType.h" #include "torch_npu/csrc/utils/AutocastMode.h" #include "torch_npu/csrc/profiler/python/combined_traceback.h" @@ -94,6 +95,8 @@ PyObject* THPModule_npu_shutdown_synchronize(PyObject* /* unused */) Py_RETURN_FALSE; } + StressDetector::stop_worker_thread(); + // Return aclrtSynchronizeDevice result. If sync device fails, release host // resources forcibly, only record WARN logs when acl interface of stream // or event fails. diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index d630aa10d8..99e3b1df7e 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -35,6 +35,7 @@ #include "torch_npu/csrc/npu/NPUPluggableAllocator.h" #include "torch_npu/csrc/npu/Stream.h" #include "torch_npu/csrc/npu/memory_snapshot.h" +#include "torch_npu/csrc/npu/Stress_detect.h" #include "torch_npu/csrc/aten/python_functions.h" #include "torch_npu/csrc/utils/LazyInit.h" #include "third_party/acl/inc/acl/acl.h" @@ -397,9 +398,6 @@ PyObject* THNPModule_getDevice_wrap(PyObject* self, PyObject* noargs) END_HANDLE_TH_ERRORS } -std::unordered_map> last_call_times; -const int interval_time = 3600; - PyObject* THNPModule_stressDetect_wrap(PyObject* self, PyObject* noargs) { HANDLE_TH_ERRORS @@ -408,41 +406,7 @@ PyObject* THNPModule_stressDetect_wrap(PyObject* self, PyObject* noargs) int device_id; NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device_id)); - auto current_time = std::chrono::steady_clock::now(); - - if (last_call_times.find(device_id) != last_call_times.end() && - std::chrono::duration_cast(current_time - last_call_times[device_id]).count() < interval_time) - { - // StressDetect can only be called once every hour for the given device_id, Return 1. - ASCEND_LOGW("StressDetect can only be called once every hour for the given device_id:{%d}, Return 1.", device_id); - return PyLong_FromLong(1); - } - last_call_times[device_id] = current_time; - - void* workspaceAddr = nullptr; - uint64_t size = 2; - size_t workspaceSize = size << 10 << 10 << 10; - if (workspaceSize > 0) { - auto ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); - if (ret != ACL_ERROR_NONE) { - c10_npu::NPUCachingAllocator::emptyCache(); - ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); - if (ret != ACL_ERROR_NONE) { - ASCEND_LOGW("call AclrtMallocAlign32 failed, ERROR : %d. 
Skip StressDetect.", ret);
-                return PyLong_FromLong(ACL_ERROR_NONE);
-            }
-        }
-    }
-
-    std::future<int> result = std::async(std::launch::async, c10_npu::acl::AclStressDetect, device_id, workspaceAddr, workspaceSize);
-    int ret = result.get();
-
-    aclrtFree(workspaceAddr);
-    if (ret == ACLNN_CLEAR_DEVICE_STATE_FAIL) {
-        ASCEND_LOGE("call AclStressDetect failed, ERROR : %d, voltage recovery fail.", ret);
-        NPU_CHECK_ERROR(ACLNN_CLEAR_DEVICE_STATE_FAIL, "StressDetect");
-    }
-
+    int ret = StressDetector::perform_stress_detect(device_id);
     return PyLong_FromLong(ret);
     END_HANDLE_TH_ERRORS
 }
diff --git a/torch_npu/csrc/npu/Stress_detect.cpp b/torch_npu/csrc/npu/Stress_detect.cpp
new file mode 100644
index 0000000000..bf35d583a6
--- /dev/null
+++ b/torch_npu/csrc/npu/Stress_detect.cpp
@@ -0,0 +1,130 @@
+#include "Stress_detect.h"
+#include "torch_npu/csrc/core/npu/NPUException.h"
+#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h"
+
+std::unordered_map<int, std::chrono::time_point<std::chrono::steady_clock>> StressDetector::last_call_times;
+std::atomic<bool> StressDetector::task_in_progress(false);
+std::atomic<bool> StressDetector::stop_thread(false);
+std::atomic<bool> StressDetector::new_task_submitted(false);
+std::atomic<bool> StressDetector::thread_initialized(false);
+std::promise<int> StressDetector::promise;
+std::future<int> StressDetector::current_task_future;
+std::thread StressDetector::stress_detect_thread;
+std::condition_variable StressDetector::cv;
+std::mutex StressDetector::mtx;
+
+int StressDetector::device_id;
+void* StressDetector::workspaceAddr = nullptr;
+size_t StressDetector::workspaceSize = 0;
+const int StressDetector::interval_time = 3600;
+
+// Persistent worker thread implementation
+void StressDetector::worker_thread()
+{
+    if (prctl(PR_SET_NAME, ("StressDetect_thread")) != 0) {
+        ASCEND_LOGE("set thread name failed!");
+    }
+
+    while (!stop_thread.load()) {
+        std::unique_lock<std::mutex> lock(mtx);
+
+        // Wait for new task submission or thread stop signal
+        cv.wait(lock, [] { return new_task_submitted.load() || stop_thread.load(); });
+
+        if (stop_thread.load()) {
+            return; // Exit thread
+        }
+
+        // Execute the task
+        int ret = c10_npu::acl::AclStressDetect(device_id, workspaceAddr, workspaceSize);
+
+        // Task complete, free memory
+        aclrtFree(workspaceAddr);
+
+        // Set task result and reset flags
+        task_in_progress.store(false);
+        promise.set_value(ret); // Pass the task execution result
+
+        // Reset task submission flag
+        new_task_submitted.store(false);
+    }
+}
+
+// Synchronous stress detection task execution
+int StressDetector::perform_stress_detect(int deviceid)
+{
+    auto current_time = std::chrono::steady_clock::now();
+    // Check the calling interval
+    if (last_call_times.find(deviceid) != last_call_times.end() &&
+        std::chrono::duration_cast<std::chrono::seconds>(current_time - last_call_times[deviceid]).count() < interval_time) {
+        ASCEND_LOGW("StressDetect can only be called once every hour for the given deviceid:{%d}, Return 1.", deviceid);
+        return 1;
+    }
+    last_call_times[deviceid] = current_time;
+
+    // If it's the first call, start the persistent thread
+    if (!thread_initialized.load()) {
+        std::lock_guard<std::mutex> lock(mtx); // Ensure thread safety
+        if (!thread_initialized.load()) { // Double check
+            stress_detect_thread = std::thread(worker_thread);
+            thread_initialized.store(true); // Mark thread as started
+        }
+    }
+
+    // Set task parameters
+    task_in_progress.store(true);
+
+    // Allocate workspace memory
+    workspaceAddr = nullptr;
+    uint64_t size = 2;
+    workspaceSize = size << 10 << 10 << 10; // Assume memory size
+    if (workspaceSize > 0) {
+        auto ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
+        if (ret != ACL_ERROR_NONE) {
+            c10_npu::NPUCachingAllocator::emptyCache();
+            ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
+            if (ret != ACL_ERROR_NONE) {
+                ASCEND_LOGW("call AclrtMallocAlign32 failed, ERROR : %d. Skip StressDetect.", ret);
+                task_in_progress.store(false); // Task ends
+                return ACL_ERROR_NONE;
+            }
+        }
+    }
+
+    {
+        std::lock_guard<std::mutex> lock(mtx);
+
+        // Prepare promise and future
+        promise = std::promise<int>();
+        current_task_future = promise.get_future();
+
+        // Update task-related information
+        StressDetector::device_id = deviceid;
+        StressDetector::workspaceAddr = workspaceAddr;
+        StressDetector::workspaceSize = workspaceSize;
+
+        // Mark new task submitted
+        new_task_submitted.store(true);
+    }
+
+    // Notify the persistent thread to start the task
+    cv.notify_one();
+
+    // Synchronously wait for the task to complete and get the result
+    int ret = current_task_future.get();
+
+    return ret;
+}
+
+// Stop the thread
+void StressDetector::stop_worker_thread()
+{
+    {
+        std::lock_guard<std::mutex> lock(mtx);
+        stop_thread.store(true);
+    }
+    cv.notify_one(); // Notify the thread to exit
+    if (stress_detect_thread.joinable()) {
+        stress_detect_thread.join(); // Wait for the thread to exit
+    }
+}
\ No newline at end of file
diff --git a/torch_npu/csrc/npu/Stress_detect.h b/torch_npu/csrc/npu/Stress_detect.h
new file mode 100644
index 0000000000..7edc71683d
--- /dev/null
+++ b/torch_npu/csrc/npu/Stress_detect.h
@@ -0,0 +1,56 @@
+#ifndef STRESS_DETECT_H
+#define STRESS_DETECT_H
+
+#include <thread>
+#include <atomic>
+#include <condition_variable>
+#include <future>
+#include <unordered_map>
+#include <sys/prctl.h>
+#include "torch_npu/csrc/core/npu/NPUMacros.h"
+
+class StressDetector {
+public:
+    TORCH_NPU_API static int perform_stress_detect(int deviceid);
+    TORCH_NPU_API static void stop_worker_thread();
+
+private:
+    static void worker_thread();
+
+    // Records the last call time for each device
+    static std::unordered_map<int, std::chrono::time_point<std::chrono::steady_clock>> last_call_times;
+
+    // Thread for handling the stress detection task
+    static std::thread stress_detect_thread;
+
+    // Condition variable and mutex to control the thread
+    static std::condition_variable cv;
+    static std::mutex mtx;
+
+    // Flag to indicate if a task is in progress
+    static std::atomic<bool> task_in_progress;
+
+    // Flag to signal the thread to stop
+    static std::atomic<bool> stop_thread;
+
+    // Flag to indicate if a new task has been submitted
+    static std::atomic<bool> new_task_submitted;
+
+    // Promise and future for the task, used for synchronizing task results
+    static std::promise<int> promise;
+    static std::future<int> current_task_future;
+
+    // Stores parameters related to the task
+    static int device_id;
+    static void* workspaceAddr;
+    static size_t workspaceSize;
+
+    // Interval between tasks
+    static const int interval_time;
+
+    // Flag to indicate if the thread has been initialized
+    static std::atomic<bool> thread_initialized;
+};
+
+#endif // STRESS_DETECT_H
\ No newline at end of file
-- Gitee
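The StressDetector above replaces the old per-call std::async with one long-lived worker guarded by a condition variable, so repeated detections reuse a single thread. The same submit-and-wait contract, sketched in Python purely for illustration (the real implementation is the C++ above; names here are not torch_npu APIs):

import threading
from concurrent.futures import Future

class PersistentWorker:
    def __init__(self):
        self._cv = threading.Condition()
        self._task = None  # (fn, args, Future)
        self._stop = False
        self._thread = threading.Thread(target=self._loop, daemon=True)
        self._thread.start()

    def _loop(self):
        while True:
            with self._cv:
                self._cv.wait_for(lambda: self._task is not None or self._stop)
                if self._stop:
                    return
                fn, args, fut = self._task
                self._task = None
            fut.set_result(fn(*args))  # run outside the lock

    def submit(self, fn, *args):
        fut = Future()
        with self._cv:
            self._task = (fn, args, fut)
            self._cv.notify()
        return fut  # fut.result() blocks, like current_task_future.get()

    def stop(self):
        with self._cv:
            self._stop = True
            self._cv.notify()
        self._thread.join()

From c919ad1f44f62a946f5203e6837df22d4d4594ca Mon Sep 17 00:00:00 2001
From: pta-robot
Date: Sat, 28 Sep 2024 10:43:54 +0000
Subject: [PATCH 34/96] !15063 Update op_plugin commit id

Merge pull request !15063 from pta-robot/v2.1.0-6.0.rc3
---
 third_party/op-plugin | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/op-plugin b/third_party/op-plugin
index e946a18555..a5ab151a48 160000
--- a/third_party/op-plugin
+++ b/third_party/op-plugin
@@ -1 +1 @@
-Subproject commit 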
e946a185558e0dd0a0bc5d444f1703d4ede168dd +Subproject commit a5ab151a489e571176afe16be976bcc179dd7a87 -- Gitee From 3cb9e12531fce8852873cdb5cf477d6f7edd8a4f Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 28 Sep 2024 10:43:54 +0000 Subject: [PATCH 35/96] !15063 Update op_plugin commit id Merge pull request !15063 from pta-robot/v2.1.0-6.0.rc3 -- Gitee From 0ea525168b52ad100eca9031ffdd8e7e8ebc89ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Sat, 28 Sep 2024 13:44:58 +0000 Subject: [PATCH 36/96] =?UTF-8?q?!15015=20remove=20redundant=20check=20in?= =?UTF-8?q?=20uce=20error=20check=20Merge=20pull=20request=20!15015=20from?= =?UTF-8?q?=20=E9=97=AB=E9=B9=8F=E5=85=A8/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUException.cpp | 12 ------------ torch_npu/csrc/core/npu/NPUException.h | 8 +------- torch_npu/csrc/core/npu/NPUQueue.cpp | 4 ---- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 2 -- torch_npu/csrc/npu/Module.cpp | 1 - 5 files changed, 1 insertion(+), 26 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index 4cc680261b..20d95c68e7 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -76,8 +76,6 @@ static std::string getCurrentTimestamp() namespace c10_npu { -bool has_throw_error = false; - MemUceInfo memUceInfo; std::mutex memUceInfoMutex; @@ -135,14 +133,4 @@ bool checkUceErrAndRepair() return false; } -bool get_has_throw_error() -{ - return has_throw_error; -} - -void set_has_throw_error(bool flag) -{ - has_throw_error = flag; -} - } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h index eb4620a13c..0f03ee1865 100644 --- a/torch_npu/csrc/core/npu/NPUException.h +++ b/torch_npu/csrc/core/npu/NPUException.h @@ -110,7 +110,6 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) Error_stop = stop_error; \ } \ if ((Error_stop) == ACL_ERROR_RT_DEVICE_TASK_ABORT) { \ - c10_npu::set_has_throw_error(true); \ TORCH_CHECK( \ false, \ __func__, \ @@ -129,9 +128,7 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) if ((uce_error) != ACL_ERROR_NONE) { \ Error_uce = uce_error; \ } \ - if ((Error_uce) == ACL_ERROR_RT_DEVICE_MEM_ERROR && \ - c10_npu::get_has_throw_error() == false && c10_npu::checkUceErrAndRepair()) { \ - c10_npu::set_has_throw_error(true); \ + if ((Error_uce) == ACL_ERROR_RT_DEVICE_MEM_ERROR && c10_npu::checkUceErrAndRepair()) { \ TORCH_CHECK( \ false, \ __func__, \ @@ -262,7 +259,4 @@ MemUceInfo get_mem_uce_info(); void clear_mem_uce_info(); -bool get_has_throw_error(); - -void set_has_throw_error(bool flag); } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index b5b762942c..8c950cde17 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -248,7 +248,6 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) if (GetStatus() == RepoStatus::STOP_EXIT) { ClearQueue(); - set_has_throw_error(true); if (check_error) { throw std::runtime_error("FORCE STOP." 
+ PTA_ERROR(ErrCode::ACL)); } else { @@ -266,7 +265,6 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) #endif read_idx.idx = write_idx.idx; if (call_ret == ACL_ERROR_RT_DEVICE_MEM_ERROR && checkUceErrAndRepair()) { - set_has_throw_error(true); call_ret = 0; if (check_error) { throw std::runtime_error("UCE ERROR." + PTA_ERROR(ErrCode::ACL)); @@ -378,7 +376,6 @@ void Repository::Enqueue(void* cur_paras) { return; } ClearQueue(); - set_has_throw_error(true); throw std::runtime_error("FORCE STOP." + PTA_ERROR(ErrCode::ACL)); } @@ -388,7 +385,6 @@ void Repository::Enqueue(void* cur_paras) { read_idx.idx = write_idx.idx; if (call_ret == ACL_ERROR_RT_DEVICE_MEM_ERROR && checkUceErrAndRepair()) { - set_has_throw_error(true); call_ret = 0; throw std::runtime_error("UCE ERROR" + PTA_ERROR(ErrCode::ACL)); } diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 1ce574c3e1..535179edb6 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -1487,14 +1487,12 @@ void ProcessGroupHCCL::workEnqueue(c10::intrusive_ptr Date: Sun, 29 Sep 2024 01:38:41 +0000 Subject: [PATCH 37/96] !14871 fix check uce in mem bug Merge pull request !14871 from sunjiayang/unsafe_210_rc3 --- .../csrc/core/npu/NPUCachingAllocator.cpp | 20 +++++++++++++++---- torch_npu/csrc/npu/Module.cpp | 2 +- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index aae2ed8d01..e4b764ae36 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -868,16 +868,18 @@ class DeviceCachingAllocator { auto memUceInfo_ = c10_npu::get_mem_uce_info(); auto info = memUceInfo_.info; const auto all_blocks = get_all_blocks(); + bool any_found = false; + aclrtMemUceInfo temp_info[memUceInfo_.retSize]; + size_t temp_retsize = 0; for (int i = 0; i < memUceInfo_.retSize; ++i) { void* addr = info[i].addr; size_t length = info[i].len; + bool found = false; // Calculate the start and end address for info[i] void* addr_end = static_cast(addr) + length - 1; - bool found = false; - // Iterate through all blocks and check if there's an overlap with addr for (const Block* const head_block : all_blocks) { void* block_start = head_block->ptr; @@ -887,6 +889,7 @@ class DeviceCachingAllocator { if (addr <= block_end && addr_end >= block_start) { const_cast(head_block)->is_safe = false; found = true; + any_found = true; // Set the unsafe flag only once if (c10_npu::get_npu_data_unsafe_flag() == false) { c10_npu::set_npu_data_unsafe_flag(true); @@ -894,10 +897,19 @@ class DeviceCachingAllocator { } } - if (!found) { - return false; + if (found) { + // update memuceinfo + temp_info[temp_retsize++] = info[i]; } } + + std::memcpy(memUceInfo_.info, temp_info, temp_retsize * sizeof(aclrtMemUceInfo)); + memUceInfo_.retSize = temp_retsize; + + c10_npu::set_mem_uce_info(memUceInfo_); + if (!any_found) { + return false; + } return true; } diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 6fbb01d166..7a9c655c12 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -376,7 +376,7 @@ PyObject* THNPModule_restart_device_wrap(PyObject* self, PyObject* arg) HANDLE_TH_ERRORS int device = THPUtils_unpackLong(arg); auto memUceInfo_ = c10_npu::get_mem_uce_info(); - if (memUceInfo_.retSize > 0 && memUceInfo_.mem_type == 3) { + if 
(memUceInfo_.retSize > 0) { NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::acl::AclrtMemUceRepair(memUceInfo_.device, memUceInfo_.info, memUceInfo_.retSize)); } -- Gitee From 2d6fb9619a27a785eac0e6d5187ee7f6d7d7837c Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sun, 29 Sep 2024 02:43:52 +0000 Subject: [PATCH 38/96] !15088 Update op_plugin commit id Merge pull request !15088 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index a5ab151a48..dd7b55c58a 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit a5ab151a489e571176afe16be976bcc179dd7a87 +Subproject commit dd7b55c58a5de27370a04e9025be853bb803ed78 -- Gitee From 5fffefb0a41f720ce1ee688904e58f7f11043af6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Sun, 29 Sep 2024 09:49:34 +0000 Subject: [PATCH 39/96] =?UTF-8?q?!15082=20uce=20bug=20fix:=20In=20order=20?= =?UTF-8?q?to=20prevent=20the=20dequeue=20thread=20from=20terminating,=20R?= =?UTF-8?q?eadQueue=20should=20set=20uce=20status.=20Merge=20pull=20reques?= =?UTF-8?q?t=20!15082=20from=20=E7=8E=8B=E8=B6=85/v2.1.0-6.0.rc3=5Fuce2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUQueue.cpp | 30 ++++++++++++++-------------- torch_npu/csrc/core/npu/NPUQueue.h | 4 ++-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 8c950cde17..edbd5a8655 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -246,6 +246,14 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) } } + if (GetStatus() == RepoStatus::UCE_EXIT) { + if (check_error) { + throw std::runtime_error("UCE ERROR." + PTA_ERROR(ErrCode::ACL)); + } else { + ASCEND_LOGE("UCE ERROR happend."); + } + } + if (GetStatus() == RepoStatus::STOP_EXIT) { ClearQueue(); if (check_error) { @@ -264,14 +272,6 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) } #endif read_idx.idx = write_idx.idx; - if (call_ret == ACL_ERROR_RT_DEVICE_MEM_ERROR && checkUceErrAndRepair()) { - call_ret = 0; - if (check_error) { - throw std::runtime_error("UCE ERROR." + PTA_ERROR(ErrCode::ACL)); - } else { - ASCEND_LOGE("UCE ERROR happend."); - } - } if (check_error) { throw std::runtime_error("The Inner error is reported as above. 
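Patches 36 through 39 change how a UCE (uncorrectable memory error) surfaces: the dequeue thread parks the repository in UCE_EXIT and the fault reaches Python as a RuntimeError containing "UCE ERROR". A hedged sketch of the recovery loop this enables (restart_device mirrors the THNPModule_restart_device_wrap binding above; the exact Python-side name is an assumption):

import torch_npu

def step_with_uce_retry(step, device_id):
    try:
        return step()
    except RuntimeError as err:
        if "UCE ERROR" not in str(err):
            raise
        # Repair the flagged memory regions and bring the device back
        # before retrying from the last known-good state (e.g., a checkpoint).
        torch_npu.npu.restart_device(device_id)
        return step()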
" @@ -338,14 +338,15 @@ bool Repository::ReadQueue() #endif if (ret != 0) { repo_error = get_func_error_msg(manager().getCurrentParams(datas, read_idx.idx)); - call_ret = ret; ASCEND_LOGE("---Thread---%llu: device = %d, write_idx = %u, read_idx = %u, status = %d, ret = %d", std::this_thread::get_id(), device_idx, write_idx.idx, read_idx.idx, GetStatus(), ret); while (!IsEmptyQueue()) { // ignore other tasks manager().Release(datas, read_idx.idx, releaseQueue); read_idx.idx = (read_idx.idx + 1) & (kQueueCapacity - 1); } - if (GetStatus() != STOP_EXIT) { + if (ret == ACL_ERROR_RT_DEVICE_MEM_ERROR && checkUceErrAndRepair()) { + SetStatus(UCE_EXIT); + } else if (GetStatus() != STOP_EXIT) { SetStatus(ERROR_EXIT); } read_idx.idx = write_idx.idx; @@ -369,6 +370,10 @@ void Repository::Enqueue(void* cur_paras) { return; } + if (GetStatus() == RepoStatus::UCE_EXIT) { + throw std::runtime_error("UCE ERROR" + PTA_ERROR(ErrCode::ACL)); + } + if (GetStatus() == RepoStatus::STOP_EXIT) { auto queueParam = static_cast(cur_paras); auto type = queueParam->paramType; @@ -384,11 +389,6 @@ void Repository::Enqueue(void* cur_paras) { SetStatus(CAN_EXIT); read_idx.idx = write_idx.idx; - if (call_ret == ACL_ERROR_RT_DEVICE_MEM_ERROR && checkUceErrAndRepair()) { - call_ret = 0; - throw std::runtime_error("UCE ERROR" + PTA_ERROR(ErrCode::ACL)); - } - throw std::runtime_error("The Inner error is reported as above. " "The process exits for this inner error, and " + repo_error + ".\n" + "Since the operator is called asynchronously, the stacktrace may be inaccurate. " diff --git a/torch_npu/csrc/core/npu/NPUQueue.h b/torch_npu/csrc/core/npu/NPUQueue.h index e2f2b64933..66e648069f 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.h +++ b/torch_npu/csrc/core/npu/NPUQueue.h @@ -22,7 +22,8 @@ enum RepoStatus { NEED_EXIT = 2, CAN_EXIT = 3, ERROR_EXIT = 4, - STOP_EXIT = 5, + UCE_EXIT = 5, + STOP_EXIT = 6, }; // c10::SmallVector max size @@ -115,7 +116,6 @@ private: c10::DeviceIndex device_idx; private: - int call_ret; sring_idx read_idx; sring_idx write_idx; std::atomic repo_status; -- Gitee From 5205b032dde29a992d8133a7bdfb7b6b76f90ed0 Mon Sep 17 00:00:00 2001 From: Mrtutu Date: Sun, 29 Sep 2024 10:37:41 +0000 Subject: [PATCH 40/96] !15093 [Bug] Fix profiler db small timeout value on v2.1.0-rc3 Merge pull request !15093 from Mrtutu/db_timeout_v2.1.0-6.0.rc3 --- torch_npu/profiler/analysis/prof_common_func/_db_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch_npu/profiler/analysis/prof_common_func/_db_manager.py b/torch_npu/profiler/analysis/prof_common_func/_db_manager.py index 74e52cbaae..4256823fc5 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_db_manager.py +++ b/torch_npu/profiler/analysis/prof_common_func/_db_manager.py @@ -1,4 +1,5 @@ import os +import sys import sqlite3 from ._constant import Constant, print_warn_msg, print_error_msg @@ -27,6 +28,7 @@ class DbManager: INSERT_SIZE = 10000 FETCH_SIZE = 10000 MAX_ROW_COUNT = 100000000 + MAX_TIMEOUT = int(sys.maxsize / 1000) @classmethod def create_connect_db(cls, db_path: str) -> tuple: @@ -36,7 +38,7 @@ class DbManager: if os.path.exists(db_path): FileManager.check_db_file_vaild(db_path) try: - conn = sqlite3.connect(db_path) + conn = sqlite3.connect(db_path, timeout=cls.MAX_TIMEOUT) except sqlite3.Error as err: return EmptyClass("emoty conn"), EmptyClass("empty curs") -- Gitee From 03f553f7c016307fcb9228ea70f81a7b7613a2e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=85=E9=A3=9E=E8=A6=81?= <1332490378@qq.com> Date: Sun, 29 
Sep 2024 10:38:24 +0000 Subject: [PATCH 41/96] =?UTF-8?q?!15073=20[PROF]update=20mstx=20data=20for?= =?UTF-8?q?mat=20Merge=20pull=20request=20!15073=20from=20=E6=A2=85?= =?UTF-8?q?=E9=A3=9E=E8=A6=81/comm=5F1.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 535179edb6..023445c1a8 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -1660,7 +1660,7 @@ std::string ProcessGroupHCCL::getMstxHcclMsg( if (!torch_npu::profiler::mstxEnable()) { return ""; } - std::string hccl_message_str = opName + "@"; + std::string hccl_message_str = "comm:" + opName + ","; auto nameIter = commNames.find(comm); if (nameIter == commNames.end()) { char commName[MAX_GROUP_NAME_LEN]; @@ -1671,13 +1671,13 @@ std::string ProcessGroupHCCL::getMstxHcclMsg( } else { hccl_message_str += nameIter->second; } - hccl_message_str += "@"; + hccl_message_str += ","; std::string data_type_str = "na"; auto iter = dataTypes.find(dataType); if (iter != dataTypes.end()) { data_type_str = iter->second; } - hccl_message_str = hccl_message_str + data_type_str + "@" + std::to_string(dataCnt); + hccl_message_str = hccl_message_str + data_type_str + "," + std::to_string(dataCnt); return hccl_message_str; } -- Gitee From 344f9e7258c0b174c4e43da2b6669a0398e9ac6c Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sun, 29 Sep 2024 10:51:07 +0000 Subject: [PATCH 42/96] !15121 Update op_plugin commit id Merge pull request !15121 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index dd7b55c58a..d21d3941a4 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit dd7b55c58a5de27370a04e9025be853bb803ed78 +Subproject commit d21d3941a4215db730666cc543259973b443ad3d -- Gitee From 290b07b1b20b4d86a84f05dd2765de3880473ef3 Mon Sep 17 00:00:00 2001 From: kevin_huang Date: Sun, 29 Sep 2024 11:26:14 +0000 Subject: [PATCH 43/96] !14724 [MoeFinalizeRouting] Modify the ONNX export parameters Merge pull request !14724 from kevin_huang/cherry-pick-1726737291 --- test/torch_npu_schema.json | 2 +- torch_npu/onnx/wrapper_onnx_ops.py | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index f168547aa8..5a36b1ea4b 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2778,7 +2778,7 @@ "signature": "(sorted_experts, num_experts=1)" }, "torch_npu.npu_moe_finalize_routing": { - "signature": "(expanded_permuted_rows, skip1, skip2_optional, bias, scales, expanded_src_to_dst_row, expert_for_source_row)" + "signature": "(expanded_permuted_rows, skip1, skip2, bias, scales, expanded_src_to_dst_row, export_for_source_row)" }, "torch_npu.npu_multi_head_attention": { "signature": "(query, key, value, query_weight, key_weight, value_weight, attn_mask, out_proj_weight, query_bias, key_bias, value_bias, out_proj_bias, dropout_mask, attn_head_num, attn_dim_per_head, src_len, tgt_len, dropout_prob, softmax_use_float)" diff --git a/torch_npu/onnx/wrapper_onnx_ops.py b/torch_npu/onnx/wrapper_onnx_ops.py index 98b515f149..f11ddb8421 100644 --- a/torch_npu/onnx/wrapper_onnx_ops.py +++ 
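With the getMstxHcclMsg change above, mstx communication markers switch from '@'-separated fields to a comma-separated record prefixed with "comm:". A hedged parsing sketch for tooling that consumes these markers (field values illustrative; "na" is the documented dtype fallback):

msg = "comm:HcclAlltoAll,group_name_0,int64,4096"  # opName, commName, dtype, count
kind, payload = msg.split(":", 1)
op_name, comm_name, dtype, count = payload.split(",")
assert kind == "comm"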
b/torch_npu/onnx/wrapper_onnx_ops.py @@ -922,12 +922,12 @@ class _NPUMoeFinalizeRoutingOP(torch.autograd.Function): return torch.ops.npu.npu_moe_finalize_routing(*args, **kwargs) @staticmethod - def symbolic(g, expanded_permuted_rows: Tensor, skip1: Tensor, skip2_optional: Optional[Tensor], bias: Tensor, - scales: Tensor, expanded_src_to_dst_row: Tensor, expert_for_source_row: Tensor): - if skip2_optional is None: - skip2_optional = g.op("Constant", value_t=torch.tensor([]).to(torch.float)) - return g.op("npu::NPUMoeFinalizeRouting", expanded_permuted_rows, skip1, skip2_optional, bias, - scales, expanded_src_to_dst_row, expert_for_source_row) + def symbolic(g, expanded_permuted_rows: Tensor, skip1: Tensor, skip2: Optional[Tensor], bias: Tensor, + scales: Tensor, expanded_src_to_dst_row: Tensor, export_for_source_row: Tensor): + if skip2 is None: + skip2 = g.op("Constant", value_t=torch.tensor([]).to(torch.float)) + return g.op("npu::NPUMoeFinalizeRouting", expanded_permuted_rows, skip1, skip2, bias, + scales, expanded_src_to_dst_row, export_for_source_row) class _NPUMoeGatingTopKSoftmaxOP(torch.autograd.Function): @@ -1266,10 +1266,10 @@ def _wrapper_npu_moe_compute_expert_tokens(sorted_experts, num_experts=1): return _NPUMoeComputeExpertTokensOP.apply(sorted_experts, num_experts) -def _wrapper_npu_moe_finalize_routing(expanded_permuted_rows, skip1, skip2_optional, bias, - scales, expanded_src_to_dst_row, expert_for_source_row): - return _NPUMoeFinalizeRoutingOP.apply(expanded_permuted_rows, skip1, skip2_optional, bias, - scales, expanded_src_to_dst_row, expert_for_source_row) +def _wrapper_npu_moe_finalize_routing(expanded_permuted_rows, skip1, skip2, bias, + scales, expanded_src_to_dst_row, export_for_source_row): + return _NPUMoeFinalizeRoutingOP.apply(expanded_permuted_rows, skip1, skip2, bias, + scales, expanded_src_to_dst_row, export_for_source_row) def _wrapper_npu_moe_gating_top_k_softmax(x, finished, k): -- Gitee From f723214813e7a197c6b31135c75f73dd5e64ac6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=B2=81=E5=8D=9A=E6=B4=8B?= Date: Sun, 29 Sep 2024 11:28:03 +0000 Subject: [PATCH 44/96] =?UTF-8?q?!15089=20update=20torchair=20commitid=20v?= =?UTF-8?q?2.1.0rc3=20Merge=20pull=20request=20!15089=20from=20=E9=B2=81?= =?UTF-8?q?=E5=8D=9A=E6=B4=8B/clamp=5Fv2.1.0rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 731150ebc5..9382e2f6a1 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 731150ebc561308c44fc7f699c7227551fc470b4 +Subproject commit 9382e2f6a1171502887c36d6556fa5fc1ab85b66 -- Gitee From bc5a963efa6bef284c7fdabd0371f590e0162878 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sun, 29 Sep 2024 13:36:19 +0000 Subject: [PATCH 45/96] !15134 Update op_plugin commit id Merge pull request !15134 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index d21d3941a4..d7523c27fb 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit d21d3941a4215db730666cc543259973b443ad3d +Subproject commit d7523c27fb09878bf4da7da8dbfcc5eea46290d2 -- Gitee From bdf7cee84ca4242f9c852093fe15429958d9ac1f Mon Sep 17 00:00:00 2001 From: wangjie Date: Mon, 30 Sep 2024 09:09:44 +0000 Subject: 
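The wrapper above follows the standard recipe for exporting custom NPU operators to ONNX: an autograd.Function whose forward dispatches to the registered op and whose symbolic emits a node in the custom npu:: domain. A condensed, hypothetical instance of the pattern (the op name and arguments are illustrative, not a real torch_npu operator):

import torch
from torch import Tensor

class _MyNpuOp(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x: Tensor, scale: float):
        return torch.ops.npu.my_npu_op(x, scale)  # hypothetical registered op

    @staticmethod
    def symbolic(g, x: Tensor, scale: float):
        # the "_f" suffix marks a float attribute in torch.onnx symbolic helpers
        return g.op("npu::MyNpuOp", x, scale_f=scale)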
[PATCH 46/96] !15123 [PROF] Proflier fix HOST_INFO and META_DATA table Merge pull request !15123 from wangjie/cherry-pick-1727604624 --- test/profiler/analysis/prof_common_func/test_host_info.py | 2 +- torch_npu/profiler/analysis/prof_common_func/_constant.py | 8 ++++++-- .../profiler/analysis/prof_common_func/_host_info.py | 2 +- .../analysis/prof_view/prof_db_parse/_db_parser.py | 6 ++++-- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/test/profiler/analysis/prof_common_func/test_host_info.py b/test/profiler/analysis/prof_common_func/test_host_info.py index d008d0a911..e508d208d5 100644 --- a/test/profiler/analysis/prof_common_func/test_host_info.py +++ b/test/profiler/analysis/prof_common_func/test_host_info.py @@ -7,7 +7,7 @@ class TestHostInfo(TestCase): def test_get_host_info(self): host_info = get_host_info() - self.assertNotEqual(0, host_info.get('host_uid')) + self.assertNotEqual('0', host_info.get('host_uid')) if __name__ == "__main__": diff --git a/torch_npu/profiler/analysis/prof_common_func/_constant.py b/torch_npu/profiler/analysis/prof_common_func/_constant.py index edcfe328c0..6cde5e6dcb 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_constant.py +++ b/torch_npu/profiler/analysis/prof_common_func/_constant.py @@ -291,7 +291,7 @@ class DbConstant(): TABLE_MEMORY_RECORD = "MEMORY_RECORD" TABLE_OPERATOR_MEMORY = "OP_MEMORY" TABLE_NPU_OP_MEM = "NPU_OP_MEM" - META_DATA_INFO = "META_DATA" + TABLE_META_DATA = "META_DATA" # rank device map table name TABLE_RANK_DEVICE_MAP = "RANK_DEVICE_MAP" @@ -434,9 +434,13 @@ class TableColumnsManager(): ("preparing", Constant.SQL_NUMERIC_TYPE) ], DbConstant.TABLE_HOST_INFO : [ - ('hostUid', Constant.SQL_INTEGER_TYPE), + ('hostUid', Constant.SQL_TEXT_TYPE), ('hostName', Constant.SQL_TEXT_TYPE) ], + DbConstant.TABLE_META_DATA : [ + ('name', Constant.SQL_TEXT_TYPE), + ('value', Constant.SQL_TEXT_TYPE) + ], DbConstant.TABLE_STEP_TIME : [ ("id", Constant.SQL_INTEGER_TYPE), ("startNs", Constant.SQL_INTEGER_TYPE), diff --git a/torch_npu/profiler/analysis/prof_common_func/_host_info.py b/torch_npu/profiler/analysis/prof_common_func/_host_info.py index 6ae9981400..4a04ae9db7 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_host_info.py +++ b/torch_npu/profiler/analysis/prof_common_func/_host_info.py @@ -21,7 +21,7 @@ __all__ = [] def get_host_info() -> dict: host_name = socket.gethostname() - host_uid = _get_host_uid() + host_uid = str(_get_host_uid()) return { 'host_name': host_name, 'host_uid': host_uid diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py index ada31ca301..297c3a878f 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py @@ -78,7 +78,8 @@ class DbParser(BaseParser): def save_env_vars_info_to_db(self): env_vars_dict = collect_env_vars() - DbManager.insert_data_into_table(self._conn, DbConstant.META_DATA_INFO, + DbManager.create_table_with_headers(self._conn, self._cur, DbConstant.TABLE_META_DATA, TableColumnsManager.TableColumns.get(DbConstant.TABLE_META_DATA)) + DbManager.insert_data_into_table(self._conn, DbConstant.TABLE_META_DATA, [['ENV_VARIABLES', json.dumps(env_vars_dict.get('ENV_VARIABLES'))]]) def save_profiler_metadata_to_db(self): @@ -94,4 +95,5 @@ class DbParser(BaseParser): data = [ [str(key), json.dumps(value)] for key, value in profiler_metadata.items() ] - 
DbManager.insert_data_into_table(self._conn, DbConstant.META_DATA_INFO, data) + DbManager.create_table_with_headers(self._conn, self._cur, DbConstant.TABLE_META_DATA, TableColumnsManager.TableColumns.get(DbConstant.TABLE_META_DATA)) + DbManager.insert_data_into_table(self._conn, DbConstant.TABLE_META_DATA, data) -- Gitee From e020c49fddd228ce9afe8100c39d9e2965a0de12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A9=B9=E6=98=8A?= Date: Mon, 30 Sep 2024 09:09:54 +0000 Subject: [PATCH 47/96] =?UTF-8?q?!15112=20foreach=20add=20compatibility=20?= =?UTF-8?q?check=20of=20cann=20version=20Merge=20pull=20request=20!15112?= =?UTF-8?q?=20from=20=E8=A9=B9=E6=98=8A/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/utils/_optim.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/torch_npu/utils/_optim.py b/torch_npu/utils/_optim.py index 287ff05976..0eaeb63a16 100644 --- a/torch_npu/utils/_optim.py +++ b/torch_npu/utils/_optim.py @@ -1,20 +1,33 @@ import torch import torch.optim.optimizer as opt import torch_npu +from torch_npu.utils.collect_env import get_cann_version _device_name = None +_cann_version = get_cann_version() +_foreach_black_list_for_cann_starts_with = ['8.0.RC1', '8.0.RC2'] +_foreach_black_list_for_cann_all = ['not known', '8.0.T1', '8.0.T2', '8.0.T3', '8.0.T37', '8.0.T5', '8.0.T6', '8.0.T7', + '8.0.T8', '8.0.T10', '8.0.T13', '8.0.T16', '8.0.T50', '8.0.T51', '8.0.T52'] def patch_supported_devices(): global _device_name - _device_name = (_device_name if _device_name is not None + _device_name = (_device_name if _device_name is not None else torch_npu.npu.get_device_name(torch_npu.npu.current_device())) + global _cann_version + if _cann_version is None or _cann_version < '8.0' or _cann_version in _foreach_black_list_for_cann_all: + return ["cuda", "xpu"] + + for ver in _foreach_black_list_for_cann_starts_with: + if _cann_version.startswith(ver): + return ["cuda", "xpu"] + if _device_name > "Ascend910B" and _device_name < "Ascend910PremiumA": return ["cuda", "xpu", torch._C._get_privateuse1_backend_name()] - - return ["cuda", "xpu"] + + return ["cuda", "xpu"] def add_optim_method(): -- Gitee From c486decf103e76bc9890c97b0f38cfb3ec962ebf Mon Sep 17 00:00:00 2001 From: liyou_b <2953090824@qq.com> Date: Mon, 30 Sep 2024 09:10:10 +0000 Subject: [PATCH 48/96] =?UTF-8?q?!15109=20=E3=80=90PROF=E3=80=91=E3=80=90B?= =?UTF-8?q?ug=E3=80=91V210rc3:=20Fix=20share=20memory=20resource=5Ftracker?= =?UTF-8?q?=20bug=20Merge=20pull=20request=20!15109=20from=20liyou=5Fb/v21?= =?UTF-8?q?0=5Frc3=5Fshm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../_dynamic_profiler/_dynamic_profiler_monitor_shm.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py index fccaa63a14..9284706d91 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py @@ -117,14 +117,16 @@ class DynamicProfilerShareMemory: def _create_shm_over_py38(self): """Create a json monitor process based on whether the SharedMemory is successfully created py38""" - from multiprocessing import shared_memory, resource_tracker + from unittest.mock import patch + from multiprocessing import shared_memory try_times = 10 while try_times: try: # 
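The gating added to _optim.py above combines an exact version blacklist with prefix matches before enabling foreach optimizers. Condensed into one predicate (constants abbreviated from the patch; the helper name is illustrative):

def _foreach_supported(cann_version, device_name):
    black_all = {'not known', '8.0.T1', '8.0.T2', '8.0.T3', '8.0.T37', '8.0.T5',
                 '8.0.T6', '8.0.T7', '8.0.T8', '8.0.T10', '8.0.T13', '8.0.T16',
                 '8.0.T50', '8.0.T51', '8.0.T52'}
    black_prefix = ('8.0.RC1', '8.0.RC2')
    if cann_version is None or cann_version < '8.0' or cann_version in black_all:
        return False
    if cann_version.startswith(black_prefix):
        return False
    return "Ascend910B" < device_name < "Ascend910PremiumA"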
Step 1: try to open shm file, first time shm not exists. - self.shm = shared_memory.SharedMemory(name=self.shm_path) + with patch("multiprocessing.resource_tracker.register", + lambda *args, **kwargs: None): + self.shm = shared_memory.SharedMemory(name=self.shm_path) self.is_create_process = False - resource_tracker.unregister(self.shm._name, 'shared_memory') logger.info("Rank %d shared memory is connected.", self._rank_id) break except FileNotFoundError: -- Gitee From 095e0c4c8a1f20b9f4853ae8597bc3b96cfc8ee5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Mon, 30 Sep 2024 09:14:39 +0000 Subject: [PATCH 49/96] =?UTF-8?q?!15052=20ranktable=20bug=20fix:=20global?= =?UTF-8?q?=20processgroup=20may=20not=20the=20first=20to=20be=20created?= =?UTF-8?q?=20Merge=20pull=20request=20!15052=20from=20=E7=8E=8B=E8=B6=85/?= =?UTF-8?q?v2.1.0-6.0.rc3=5Ffix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 15 ++++++--------- torch_npu/csrc/distributed/ProcessGroupHCCL.hpp | 2 -- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 023445c1a8..f66e215d3c 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -604,8 +604,6 @@ std::vector ProcessGroupHCCL::WorkHCCL::result() return *outputs_; } -static std::atomic process_group_id = 0; - ProcessGroupHCCL::ProcessGroupHCCL( const c10::intrusive_ptr& store, int rank, @@ -617,8 +615,7 @@ ProcessGroupHCCL::ProcessGroupHCCL( hcclCommCounter_(0), traceKeyStart_("HCCL_" + std::to_string(rank) + "_trace_start"), traceKeyEnd_("HCCL_" + std::to_string(rank) + "_trace_end"), - terminateProcessGroup_(false), - uid_(process_group_id++) + terminateProcessGroup_(false) { uint32_t hccl_event_timeout = c10_npu::option::OptionsManager::GetHCCLEventTimeout(); uint32_t hccl_exec_timeout = c10_npu::option::OptionsManager::GetHCCLExecTimeout(); @@ -719,7 +716,7 @@ ProcessGroupHCCL::ProcessGroupHCCL( global_hccl_id_ = group_ranks + "_" + std::to_string(group_ranks_map_[group_ranks]); } - if (options_->global_ranks_in_group.empty() && uid_ == 0) { + if (options_->global_ranks_in_group.empty()) { global_ = this; } } @@ -769,7 +766,7 @@ void ProcessGroupHCCL::abort(c10::optional abortReason) ProcessGroupHCCL::~ProcessGroupHCCL() { - if (options_->global_ranks_in_group.empty() && uid_ == 0) { + if (options_->global_ranks_in_group.empty()) { global_ = nullptr; } @@ -850,7 +847,7 @@ void ProcessGroupHCCL::logWorkEnd(WorkHCCL& work) const std::vector& ProcessGroupHCCL::groupRanks() const { - if (options_->global_ranks_in_group.empty() && uid_ == 0) { + if (options_->global_ranks_in_group.empty()) { static std::vector globalRanks(size_); std::iota(globalRanks.begin(), globalRanks.end(), 0); return globalRanks; @@ -1150,7 +1147,7 @@ void ProcessGroupHCCL::createHCCLComm(const std::vector& devices, std::to_string((int)commType) + DIST_ERROR(ErrCode::PARAM)); } - if (options_->global_ranks_in_group.empty() && uid_ == 0) { + if (options_->global_ranks_in_group.empty()) { global_hccl_comm_ = hcclComms[i]; } @@ -1172,7 +1169,7 @@ bool ProcessGroupHCCL::createHCCLCommEx(const std::vector& devices, } c10_npu::OptionalNPUGuard npuGuard; // global process group - if (options_->global_ranks_in_group.empty() && uid_ == 0) { + if (options_->global_ranks_in_group.empty()) { if (!hcclCommInitClusterInfoConfigExist()) 
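        // Editor's note: when the cluster-info init API is unavailable,
        // createHCCLCommEx returns false and the caller falls back to the
        // original HCCL communicator creation path.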
{ ASCEND_LOGI("The hcclCommInitClusterInfoConfig is not exist, switch to original interface."); return false; diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index 102cfc66ed..c09d255e17 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -604,8 +604,6 @@ protected: std::exception_ptr watchDogException_ = nullptr; - size_t uid_; - private: // Helper that encapsulates work shared across all collective communication // primitives. -- Gitee From 62c9d5359e8cdf754a89cf0c639cb7fd37a7b9b9 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Mon, 30 Sep 2024 09:46:34 +0000 Subject: [PATCH 50/96] !15083 fix torch.empty return random when use_deterministic_algorithms Merge pull request !15083 from huangyunlong/2.1rc3em --- test/npu/test_tensor.py | 8 ++++++++ torch_npu/csrc/aten/common/EmptyTensor.cpp | 19 +++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/test/npu/test_tensor.py b/test/npu/test_tensor.py index 98fd3c33d0..c4b2befd14 100644 --- a/test/npu/test_tensor.py +++ b/test/npu/test_tensor.py @@ -1,6 +1,7 @@ import itertools import torch from torch.testing import make_tensor +from torch.testing._internal.common_utils import DeterministicGuard import torch_npu from torch_npu.testing.testcase import TestCase, run_tests @@ -303,6 +304,13 @@ class TestTensor(TestCase): self.assertEqual(res1.to('cpu'), expected.to('cpu')) + def test_empty_with_deterministic(self): + with DeterministicGuard(True): + empty_tensor = torch.empty(2, 3, 4) + empty_strided_tensor = torch.empty_strided((2, 3, 4), (1, 1, 1)) + self.assertTrue(empty_tensor.isnan().all()) + self.assertTrue(empty_strided_tensor.isnan().all()) + if __name__ == '__main__': run_tests() diff --git a/torch_npu/csrc/aten/common/EmptyTensor.cpp b/torch_npu/csrc/aten/common/EmptyTensor.cpp index 0bfb7ed24f..b33ff954b7 100644 --- a/torch_npu/csrc/aten/common/EmptyTensor.cpp +++ b/torch_npu/csrc/aten/common/EmptyTensor.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "torch_npu/csrc/core/npu/THNPUCachingHostAllocator.h" @@ -88,13 +89,23 @@ at::TensorBase empty_strided_cpu( } at::Tensor empty_memory_format(c10::IntArrayRef size, c10::optional dtype_opt, c10::optional layout_opt, - c10::optional device_opt, c10::optional pin_memory_opt, c10::optional memory_format_opt) { - return empty_cpu(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); + c10::optional device_opt, c10::optional pin_memory_opt, c10::optional memory_format_opt) +{ + at::Tensor result = empty_cpu(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); + if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms())) { + at::native::fill_empty_deterministic_(result); + } + return result; } at::Tensor empty_strided(c10::IntArrayRef size, c10::IntArrayRef stride, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { - return empty_strided_cpu(size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); + c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) +{ + at::Tensor result = empty_strided_cpu(size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); + if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms())) { + at::native::fill_empty_deterministic_(result); + } + return result; } TORCH_LIBRARY_IMPL(aten, CPU, 
m) { -- Gitee From 5067d07a9ce0afe0ea36058286bb158c4ee7f5f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Mon, 7 Oct 2024 09:01:11 +0000 Subject: [PATCH 51/96] =?UTF-8?q?!15165=20add=20ranktable=20warning=20Merg?= =?UTF-8?q?e=20pull=20request=20!15165=20from=20=E7=8E=8B=E8=B6=85/v2.1.0-?= =?UTF-8?q?6.0.RC3=5Fwarn?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/register/OptionsManager.cpp | 7 +++++++ torch_npu/csrc/core/npu/register/OptionsManager.h | 1 + torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index cad80f086a..8503361020 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -101,6 +101,13 @@ int OptionsManager::GetBoolTypeOption(const char* env_str, int defaultVal) return (envFlag != 0) ? 1 : 0; } +uint32_t OptionsManager::GetHCCLConnectTimeout() +{ + char* env_val = std::getenv("HCCL_CONNECT_TIMEOUT"); + int64_t envFlag = (env_val != nullptr) ? strtol(env_val, nullptr, 10) : 0; + return static_cast(envFlag); +} + uint32_t OptionsManager::GetHCCLExecTimeout() { char* env_val = std::getenv("HCCL_EXEC_TIMEOUT"); diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index ba2cb5a198..98e8fd72dc 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -31,6 +31,7 @@ public: static bool CheckCombinedOptimizerEnable(); static bool CheckTriCombinedOptimizerEnable(); static bool CheckAclDumpDateEnable(); + static uint32_t GetHCCLConnectTimeout(); static uint32_t GetHCCLExecTimeout(); static uint32_t GetHCCLEventTimeout(); static std::string CheckDisableDynamicPath(); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index f66e215d3c..7960f02725 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -1167,6 +1167,10 @@ bool ProcessGroupHCCL::createHCCLCommEx(const std::vector& devices, ASCEND_LOGI("The rank_table_file is not available, switch to original interface."); return false; } + if (c10_npu::option::OptionsManager::GetHCCLConnectTimeout() < 300) { + TORCH_NPU_WARN_ONCE("When creating an HCCL process group using the RANK_TABLE_FILE method, the connection may time out. 
", + "It is recommended to set the timeout duration of HCCL_CONNECT_TIMEOUT to 300 seconds or more."); + } c10_npu::OptionalNPUGuard npuGuard; // global process group if (options_->global_ranks_in_group.empty()) { -- Gitee From 6818c87b39dc95a3b05b49496d95b45a41806076 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 8 Oct 2024 07:08:44 +0000 Subject: [PATCH 52/96] !15171 Update op_plugin commit id Merge pull request !15171 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index d7523c27fb..dfaad59f05 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit d7523c27fb09878bf4da7da8dbfcc5eea46290d2 +Subproject commit dfaad59f053d47f24dc4d2d8da095fcb39ecee5f -- Gitee From 3fdcbb34ec1dfb2b40d278d895741c546ec2a3c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=9C=E9=87=91=E8=88=AA?= <15990042527@163.com> Date: Tue, 8 Oct 2024 07:39:43 +0000 Subject: [PATCH 53/96] =?UTF-8?q?!15142=20add=20`base=5Faddr=5Faligned=5Fk?= =?UTF-8?q?b`=20configuration=20Merge=20pull=20request=20!15142=20from=20?= =?UTF-8?q?=E6=9D=9C=E9=87=91=E8=88=AA/cherry-pick-1727660032?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../csrc/core/npu/NPUCachingAllocator.cpp | 43 +++++++++++++++++-- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index e4b764ae36..c65a38b337 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -596,6 +596,11 @@ class CachingAllocatorConfig { return instance().m_expandable_segments; } + static size_t base_addr_aligned_size() + { + return instance().m_base_addr_aligned_size; + } + static CachingAllocatorConfig &instance() { static CachingAllocatorConfig *s_instance = ([]() { auto inst = new CachingAllocatorConfig(); @@ -614,11 +619,13 @@ class CachingAllocatorConfig { double m_garbage_collection_threshold; bool m_expandable_segments; bool set_expandable_segments_flag = false; + size_t m_base_addr_aligned_size = kAlignRoundLarge; CachingAllocatorConfig() : m_max_split_size(std::numeric_limits::max()), m_garbage_collection_threshold(0), - m_expandable_segments(true) + m_expandable_segments(true), + m_base_addr_aligned_size(kAlignRoundLarge) { void* ptr = nullptr; auto status = c10_npu::acl::AclrtReserveMemAddress(&ptr, 512, 0, NULL, 1); @@ -643,6 +650,9 @@ class CachingAllocatorConfig { size_t parseExpandableSegments( const std::vector& config, size_t i); + size_t parseAddrAlignSize( + const std::vector& config, + size_t i); }; void CachingAllocatorConfig::lexArgs( @@ -740,6 +750,28 @@ size_t CachingAllocatorConfig::parseExpandableSegments( return i; } +size_t CachingAllocatorConfig::parseAddrAlignSize( + const std::vector& config, + size_t i) +{ + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + size_t val = static_cast(stoi(config[i])); + TORCH_CHECK(config[i].length() == std::to_string(val).length(), + "CachingAllocator option base_addr_aligned_kb error, must be [0~16], dtype is int", + OPS_ERROR(ErrCode::VALUE)); + TORCH_CHECK(val >= 0, "CachingAllocator option base_addr_aligned_kb error, must be [0~16], dtype is int", + OPS_ERROR(ErrCode::VALUE)); + TORCH_CHECK(val <= kAlignRoundLarge / 1024, + "CachingAllocator option base_addr_aligned_kb error, must be [0~16], dtype is int", + OPS_ERROR(ErrCode::VALUE)); 
+ m_base_addr_aligned_size = val * 1024; + } else { + TORCH_CHECK(false, "Error, expecting base_addr_aligned_kb value", OPS_ERROR(ErrCode::VALUE)); + } + return i; +} + void CachingAllocatorConfig::parseArgs(const char* env) { // If empty, set the default values m_max_split_size = std::numeric_limits::max(); @@ -760,6 +792,8 @@ void CachingAllocatorConfig::parseArgs(const char* env) { } else if (config[i] == "expandable_segments") { set_expandable_segments_flag = true; i = parseExpandableSegments(config, i); + } else if (config[i] == "base_addr_aligned_kb") { + i = parseAddrAlignSize(config, i); } else { TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", config[i], OPS_ERROR(ErrCode::PARAM)); } @@ -1058,9 +1092,10 @@ class DeviceCachingAllocator { } int64_t ori_block_ptr = int64_t(params.block->ptr); - if (params.size() >= kRoundLarge && CachingAllocatorConfig::expandable_segments() && - ori_block_ptr % kAlignRoundLarge != 0) { - char* align_ptr = reinterpret_cast((ori_block_ptr + kAlignRoundLarge) - (ori_block_ptr % kAlignRoundLarge)); + size_t align_round = CachingAllocatorConfig::base_addr_aligned_size(); + if (params.size() >= kRoundLarge && CachingAllocatorConfig::expandable_segments() && align_round != 0 && + ori_block_ptr % align_round != 0) { + char* align_ptr = reinterpret_cast((ori_block_ptr + align_round) - (ori_block_ptr % align_round)); size_t offset_size = align_ptr - (char*)params.block->ptr; if (offset_size + params.size() <= params.block->size) { auto size = params.block->size; -- Gitee From a62d352ddf641153143635f8a477e0d2f8f29641 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E5=85=89=E6=B5=A9?= Date: Tue, 8 Oct 2024 07:45:19 +0000 Subject: [PATCH 54/96] =?UTF-8?q?!15175=20=E3=80=90bugfix=E3=80=91fix=20th?= =?UTF-8?q?e=20bug=20of=20intreactive=20Merge=20pull=20request=20!15175=20?= =?UTF-8?q?from=20=E9=83=AD=E5=85=89=E6=B5=A9/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index 50e220d87a..7a01246c9a 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -228,3 +228,8 @@ if 'TORCH_NPU_SANITIZER' in os.environ: apply_sanitizer_patch() csan.enable_npu_sanitizer() + +if hasattr(sys, 'ps1'): + os.environ["TASK_QUEUE_ENABLE"] = '0' + warnings.warn("On the interactive interface, the value of TASK_QUEUE_ENABLE is set to 0 by default. 
\ + Do not set it to 1 to prevent some unknown errors") \ No newline at end of file -- Gitee From d44417f7f9b0b2b0fc8fce1b507e1cf5bf3e764a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=9C=E9=87=91=E8=88=AA?= <15990042527@163.com> Date: Wed, 9 Oct 2024 02:44:29 +0000 Subject: [PATCH 55/96] =?UTF-8?q?!15186=20the=20expandable=5Fsegments=20fu?= =?UTF-8?q?nction=20defaults=20to=20false.=20Merge=20pull=20request=20!151?= =?UTF-8?q?86=20from=20=E6=9D=9C=E9=87=91=E8=88=AA/v2.1=5Fcleancode?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index c65a38b337..f9e4b723fe 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -624,18 +624,9 @@ class CachingAllocatorConfig { CachingAllocatorConfig() : m_max_split_size(std::numeric_limits::max()), m_garbage_collection_threshold(0), - m_expandable_segments(true), + m_expandable_segments(false), m_base_addr_aligned_size(kAlignRoundLarge) { - void* ptr = nullptr; - auto status = c10_npu::acl::AclrtReserveMemAddress(&ptr, 512, 0, NULL, 1); - if (status == ACL_ERROR_NONE) { - NPU_CHECK_ERROR(c10_npu::acl::AclrtReleaseMemAddress(ptr)); - } else { - TORCH_NPU_WARN_ONCE("expandable_segments feature is not supportted \ - and the possible cause is that driver and firmware packages do not match."); - m_expandable_segments = false; - } } void lexArgs(const char* env, std::vector& config); -- Gitee From 0dfad97f462d01fe1eed476debfe6e24c74e9514 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 9 Oct 2024 08:42:42 +0000 Subject: [PATCH 56/96] !15232 Update op_plugin commit id Merge pull request !15232 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index dfaad59f05..3911dbe7bd 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit dfaad59f053d47f24dc4d2d8da095fcb39ecee5f +Subproject commit 3911dbe7bd2daf515d7190b3cd01f0204687b3d3 -- Gitee From 6efd72ed7e588f586fe2b8d0ef5f97876615ef6c Mon Sep 17 00:00:00 2001 From: lilei zheng Date: Wed, 9 Oct 2024 10:36:17 +0000 Subject: [PATCH 57/96] !15192 Fix the accuracy issue in the aclop conv3d fp32 scenario Merge pull request !15192 from lilei zheng/cherry-pick-1728385829 --- torch_npu/utils/_module.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/torch_npu/utils/_module.py b/torch_npu/utils/_module.py index d51485b82c..92213c7ca9 100644 --- a/torch_npu/utils/_module.py +++ b/torch_npu/utils/_module.py @@ -24,11 +24,14 @@ from torch.nn.parallel.replicate import replicate import torch_npu from torch_npu.npu.amp.autocast_mode import autocast +from torch_npu.npu.utils import get_device_name from torch_npu.utils.syncbatchnorm import SyncBatchNorm as sync_batch_norm from torch_npu.utils._error_code import ErrCode, pta_error origin_mpdl_iter_init = _MultiProcessingDataLoaderIter.__init__ +CONV3D_SUPPORT_FP32_SOC_PREFIX = ["Ascend910B", "Ascend910_93"] + def npu(self, device=None): r"""Moves all model parameters and buffers to the npu. 
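Editor's note on the `cast_weight` hunk that follows (!15192): the fix keys the Conv3d weight cast off the SoC name prefix. A minimal, self-contained sketch of the predicate being introduced — plain Python, no NPU needed; the sample device names are hypothetical:

```python
# Prefixes copied from the patch. On matching SoCs the Conv3d weight is cast
# to the ACL_FRACTAL_Z_3D layout (format id 33) while staying fp32, skipping
# the .half() round-trip that caused the fp32 accuracy issue elsewhere.
CONV3D_SUPPORT_FP32_SOC_PREFIX = ["Ascend910B", "Ascend910_93"]

def conv3d_weight_keeps_fp32(device_name: str) -> bool:
    """True if this SoC keeps Conv3d weights in fp32 for FRACTAL_Z_3D."""
    return any(device_name.startswith(p) for p in CONV3D_SUPPORT_FP32_SOC_PREFIX)

assert conv3d_weight_keeps_fp32("Ascend910B2")      # hypothetical device name
assert not conv3d_weight_keeps_fp32("Ascend310P3")  # hypothetical device name
```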
@@ -117,6 +120,10 @@ def cast_weight(self, device): return if issubclass(class_name, torch.nn.Conv3d): module.weight.data = module.weight.data.to(device) + device_name = get_device_name() + if any(device_name.startswith(prefix) for prefix in CONV3D_SUPPORT_FP32_SOC_PREFIX): + module.weight.data = torch_npu.npu_format_cast(module.weight.data, 33) + return module.weight.data = torch_npu.npu_format_cast(module.weight.data.half(), 33).float() # ACL_FRACTAL_Z_3D if device is None or "npu" not in str(device): -- Gitee From c57ce8ae485fff5645ab2d45e6b5b26fa1ace282 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Wed, 9 Oct 2024 10:36:48 +0000 Subject: [PATCH 58/96] !15210 fix coredump when uncached_delete after Finalize. Merge pull request !15210 from huangyunlong/2.1rc3nome --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index f9e4b723fe..26613cf76b 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -2317,7 +2317,9 @@ class DeviceCachingAllocator { static void uncached_delete(void* ptr) { - c10_npu::npuSynchronizeDevice(false); + if (c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { + c10_npu::npuSynchronizeDevice(false); + } ASCEND_LOGD("Without NPUCachingAllocator, free by aclrtFree."); NPU_CHECK_ERROR(aclrtFree(ptr)); } -- Gitee From 48c9c346d7cfda603d69fa34b498fec569b3a76f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B2=88=E7=8F=88=E9=9D=93?= Date: Wed, 9 Oct 2024 10:37:09 +0000 Subject: [PATCH 59/96] =?UTF-8?q?!15220=20Update=20torchair=206.0.rc3=20Me?= =?UTF-8?q?rge=20pull=20request=20!15220=20from=20=E6=B2=88=E7=8F=88?= =?UTF-8?q?=E9=9D=93/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 9382e2f6a1..820f0378f4 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 9382e2f6a1171502887c36d6556fa5fc1ab85b66 +Subproject commit 820f0378f4591707969e1aa55935cff7b823b155 -- Gitee From 0619c1279f6f43ae88fcad8a70330e45fbf122d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Thu, 10 Oct 2024 09:42:27 +0000 Subject: [PATCH 60/96] =?UTF-8?q?!15217=20remove=20weakptr,=20use=20global?= =?UTF-8?q?=5F's=20get=20function=20Merge=20pull=20request=20!15217=20from?= =?UTF-8?q?=20=E7=8E=8B=E8=B6=85/v2.1.0-6.0.rc3=5Frank?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../csrc/distributed/ProcessGroupHCCL.cpp | 47 ++++++++++--------- .../csrc/distributed/ProcessGroupHCCL.hpp | 4 +- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 7960f02725..7410f0e363 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -234,7 +234,6 @@ const int64_t ProcessGroupHCCL::kProcessGroupHCCLOpTimeoutMillis = 10 * 1000; thread_local uint64_t ProcessGroupHCCL::hcclActiveGroupCounter_ = 0; const int64_t ProcessGroupHCCL::kWatchdogThreadSleepMillis = 1000; std::string ProcessGroupHCCL::perfdumppath = ""; -std::weak_ptr ProcessGroupHCCL::global_hccl_comm_; std::unordered_map 
ProcessGroupHCCL::group_ranks_map_; std::mutex ProcessGroupHCCL::group_ranks_map_mutex_; ProcessGroupHCCL* ProcessGroupHCCL::global_ = nullptr; @@ -786,6 +785,7 @@ ProcessGroupHCCL::~ProcessGroupHCCL() hcclComm->destroyHcclComm(); } } + devHCCLCommMap_.clear(); } } @@ -1147,10 +1147,6 @@ void ProcessGroupHCCL::createHCCLComm(const std::vector& devices, std::to_string((int)commType) + DIST_ERROR(ErrCode::PARAM)); } - if (options_->global_ranks_in_group.empty()) { - global_hccl_comm_ = hcclComms[i]; - } - // Creates the HCCL streams streamVal.push_back(c10_npu::getNPUStreamFromPool(devices[i].index())); } @@ -1171,13 +1167,13 @@ bool ProcessGroupHCCL::createHCCLCommEx(const std::vector& devices, TORCH_NPU_WARN_ONCE("When creating an HCCL process group using the RANK_TABLE_FILE method, the connection may time out. ", "It is recommended to set the timeout duration of HCCL_CONNECT_TIMEOUT to 300 seconds or more."); } + if (!hcclCommInitClusterInfoConfigExist()) { + ASCEND_LOGI("The hcclCommInitClusterInfoConfig is not exist, switch to original interface."); + return false; + } c10_npu::OptionalNPUGuard npuGuard; // global process group if (options_->global_ranks_in_group.empty()) { - if (!hcclCommInitClusterInfoConfigExist()) { - ASCEND_LOGI("The hcclCommInitClusterInfoConfig is not exist, switch to original interface."); - return false; - } auto startTime = std::chrono::steady_clock::now(); for (size_t i = 0; i < devices.size(); ++i) { int rank = getRank() * static_cast(devices.size()) + static_cast(i); @@ -1194,7 +1190,6 @@ bool ProcessGroupHCCL::createHCCLCommEx(const std::vector& devices, return false; } hcclComms[i] = comm; - global_hccl_comm_ = comm; // Creates the HCCL streams streamVal.push_back(c10_npu::getNPUStreamFromPool(devices[i].index())); } @@ -1209,20 +1204,17 @@ bool ProcessGroupHCCL::createHCCLCommEx(const std::vector& devices, ASCEND_LOGI("The hcclCreateSubCommConfig is not exist, switch to original interface."); return false; } - if (global_hccl_comm_.expired()) { - // only support create glabal process group by ranktable - if (global_ == nullptr || !hcclCommInitClusterInfoConfigExist()) { - ASCEND_LOGI("The hcclCommInitClusterInfoConfig is not exist, switch to original interface."); - return false; - } - try { - (void)global_->getHcclComm(global_->getRank()); - } catch (const std::exception& e) { - ASCEND_LOGI("create the global HCCL Communicator failed, the exception info is %s.", e.what()); - return false; - } + if (global_ == nullptr) { + ASCEND_LOGI("The global process group is not exist, switch to original interface."); + return false; + } + std::shared_ptr globalHcclComm = nullptr; + try { + globalHcclComm = global_->getHcclCommByRankid(devices); + } catch (const std::exception& e) { + ASCEND_LOGI("create the global HCCL Communicator failed, the exception info is %s.", e.what()); + return false; } - std::shared_ptr globalHcclComm = global_hccl_comm_.lock(); if (!globalHcclComm) { ASCEND_LOGI("Create sub hccl comm by hcclCreateSubCommConfig failed, globalHcclComm is nullptr."); return false; @@ -1520,6 +1512,15 @@ ProcessGroupHCCL::Options::Options(bool is_high_priority_stream) { } +std::shared_ptr ProcessGroupHCCL::getHcclCommByRankid(const std::vector& devices) +{ + const auto key = getKeyFromDevices(devices); + auto& hcclComms = getHCCLComm(key, devices); + TORCH_CHECK(hcclComms.size() == 1, "expect hcclComms.size() = 1, but hcclComms.size() = ", + hcclComms.size(), DIST_ERROR(ErrCode::VALUE)); + return hcclComms[0]; +} + int64_t ProcessGroupHCCL::getHcclComm(int 
rankid) { at::Device device = getDeviceForRank(rankid); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index c09d255e17..5961971721 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -407,6 +407,8 @@ public: // may indicate that there is some sort of collective desynchronization. uint64_t getSequenceNumberForGroup() override; + std::shared_ptr getHcclCommByRankid(const std::vector& devices); + int64_t getHcclComm(int rankid); void setHcclCommName(const std::string& hccl_comm_name); @@ -677,8 +679,6 @@ private: WatchdogStatus watchdogStatus; - static std::weak_ptr global_hccl_comm_; - static std::mutex group_ranks_map_mutex_; static std::unordered_map group_ranks_map_; std::string global_hccl_id_; -- Gitee From b9e848ec29aff11495c7e36ac7f2b371ba4bb18a Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 10 Oct 2024 10:51:31 +0000 Subject: [PATCH 61/96] !15278 Update op_plugin commit id Merge pull request !15278 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 3911dbe7bd..5b736ed12d 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 3911dbe7bd2daf515d7190b3cd01f0204687b3d3 +Subproject commit 5b736ed12db8423133dd24e2f0e81813d8f53d80 -- Gitee From 0a8106ce6d53a791da85c09c40256cacf9aba2dc Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 10 Oct 2024 13:59:44 +0000 Subject: [PATCH 62/96] !15292 Update op_plugin commit id Merge pull request !15292 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5b736ed12d..5dade4c396 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5b736ed12db8423133dd24e2f0e81813d8f53d80 +Subproject commit 5dade4c396054b9722fa94b0ff2ccbde5adacdfc -- Gitee From b14a1a84cb6630827d507eba0e3019a98edd5bc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=98=89=E5=B7=8D?= <843972097@qq.com> Date: Fri, 11 Oct 2024 02:06:27 +0000 Subject: [PATCH 63/96] =?UTF-8?q?!15237=20[Fix]=20Update=20README.=20Merge?= =?UTF-8?q?=20pull=20request=20!15237=20from=20=E5=88=98=E5=98=89=E5=B7=8D?= =?UTF-8?q?/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 +- README.zh.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f6332eff96..a6cbb5c10c 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ pip3 install torch-npu==2.1.0.post6 ### From Source -In some special scenarios, users may need to compile **torch-npu** by themselves.Select a branch in table [Ascend Auxiliary Software](#ascend-auxiliary-software) and a Python version in table [PyTorch and Python Version Matching Table](#pytorch-and-python-version-matching-table) first. 
The docker image is recommended for compiling torch-npu through the following steps(It is recommended to mount the working path only and avoid the system path to reduce security risks), the generated .whl file path is ./dist/: +In some special scenarios, users may need to compile **torch-npu** by themselves.Select a branch in table [Ascend Auxiliary Software](#ascend-auxiliary-software) and a Python version in table [PyTorch and Python Version Matching Table](#pytorch-and-python-version-matching-table) first. The docker image is recommended for compiling torch-npu through the following steps(It is recommended to mount the working path only and avoid the system path to reduce security risks), the generated .whl file path is ./dist/. Note that gcc version has the following constraints if you try to compile without using docker image: we recommend the use gcc 10.2 for ARM and gcc 9.3.1 for X86. 1. **Clone torch-npu** diff --git a/README.zh.md b/README.zh.md index 55a25dcab7..91701847e7 100644 --- a/README.zh.md +++ b/README.zh.md @@ -57,7 +57,7 @@ pip3 install torch-npu==2.1.0.post6 ### 使用源代码进行安装 -某些特殊场景下,用户可能需要自行编译**torch_npu**。可以根据[昇腾辅助软件表](#昇腾辅助软件)和[PyTorch与Python版本配套表](#PyTorch与Python版本配套表)选择合适的分支。推荐使用Docker镜像编译**torch_npu**,可以通过以下步骤获取(建议只挂载工作路径,并避开系统路径,以降低安全风险), 生成的.whl文件路径为./dist/: +某些特殊场景下,用户可能需要自行编译**torch_npu**。可以根据[昇腾辅助软件表](#昇腾辅助软件)和[PyTorch与Python版本配套表](#PyTorch与Python版本配套表)选择合适的分支。推荐使用Docker镜像编译**torch_npu**,可以通过以下步骤获取(建议只挂载工作路径,并避开系统路径,以降低安全风险), 生成的.whl文件路径为./dist/。如果不使用镜像,编译时请注意gcc版本遵循如下约束:ARM架构下推荐使用gcc 10.2版本, X86架构下推荐使用gcc 9.3.1 1. **克隆torch_npu代码仓** -- Gitee From 4aa987f573704248396c2803e75c886d028c3df8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Fri, 11 Oct 2024 02:22:40 +0000 Subject: [PATCH 64/96] =?UTF-8?q?!15234=20add=20event=20remove=20process?= =?UTF-8?q?=20for=20cachingAllocator=20when=20restart=20device=20Merge=20p?= =?UTF-8?q?ull=20request=20!15234=20from=20=E9=97=AB=E9=B9=8F=E5=85=A8/v2.?= =?UTF-8?q?1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../csrc/core/npu/NPUCachingAllocator.cpp | 24 ++++++++++++++ torch_npu/csrc/core/npu/NPUCachingAllocator.h | 6 ++++ torch_npu/csrc/core/npu/NPUException.h | 6 ++-- torch_npu/csrc/core/npu/NPUQueue.cpp | 33 ++++++++++++++----- torch_npu/csrc/npu/Module.cpp | 1 + torch_npu/csrc/npu/NPUPluggableAllocator.cpp | 7 ++++ torch_npu/csrc/npu/NPUPluggableAllocator.h | 1 + 7 files changed, 67 insertions(+), 11 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 26613cf76b..40932437d8 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -1346,6 +1346,23 @@ class DeviceCachingAllocator { release_cached_blocks(check_error, context); } + void release_and_free_events() + { + std::unique_lock lock(mutex); + std::shared_ptr context = maybeGatherContext(RecordContext::ALL); + for (auto& st : npu_events) { + for (auto& e : st.second) { + EventPool::Event event = std::move(e.first); + Block* block = e.second; + block->event_count--; + if (block->event_count == 0) { + free_block(block, context); + } + } + } + npu_events.clear(); + } + /** Retrieves info (total size + largest block) of the memory cache **/ void cacheInfo(size_t* total, size_t* largest) { std::lock_guard lock(mutex); @@ -2487,6 +2504,13 @@ class NpuCachingAllocator : public NPUAllocator { block->is_safe = true; } + void cleanEvent() override + { + 
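+        // Editor's note: iterate every per-device allocator and release its
+        // recorded NPU events, freeing blocks whose event count drops to zero
+        // (see release_and_free_events above).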
int count = static_cast(device_allocator.size()); + for (int i = 0; i < count; i++) + device_allocator[i]->release_and_free_events(); + } + void emptyCache(bool check_error) override { int count = static_cast(device_allocator.size()); diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index 46dc7ecc65..44f1d8a7f4 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -207,6 +207,7 @@ public: virtual bool checkBlockIsSafe(const c10::DataPtr& ptr) = 0; virtual void markAllBlockUnsafe(int device) = 0; virtual void updateBlockToSafe(const c10::DataPtr &ptr) = 0; + virtual void cleanEvent() = 0; }; // Allocator object, statically initialized @@ -342,5 +343,10 @@ inline void updateBlockToSafe(const c10::DataPtr& ptr) return get()->updateBlockToSafe(ptr); } +inline void cleanEvent() +{ + return get()->cleanEvent(); +} + } // namespace NPUCachingAllocator } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h index 0f03ee1865..98de3f2260 100644 --- a/torch_npu/csrc/core/npu/NPUException.h +++ b/torch_npu/csrc/core/npu/NPUException.h @@ -16,6 +16,7 @@ #include "torch_npu/csrc/core/npu/NPUMacros.h" #include "torch_npu/csrc/core/npu/interface/AclInterface.h" #include "torch_npu/csrc/core/npu/NPUErrorCodes.h" +#include "torch_npu/csrc/core/npu/npu_log.h" #define C10_NPU_SHOW_ERR_MSG() \ @@ -89,8 +90,8 @@ std::string formatErrorCode(SubModule submodule, ErrCode errorCode); #define GRAPH_ERROR(error) formatErrorCode(SubModule::GRAPH, error) #define PROF_ERROR(error) formatErrorCode(SubModule::PROF, error) -#define DEVICE_TASK_ABORT "107022" -#define DEVICE_MEM_ERROR "507053" +#define DEVICE_TASK_ABORT "reason=[device task abort]" +#define DEVICE_MEM_ERROR "reason=[device mem error]" inline const char* getErrorFunction(const char* msg) { @@ -110,6 +111,7 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) Error_stop = stop_error; \ } \ if ((Error_stop) == ACL_ERROR_RT_DEVICE_TASK_ABORT) { \ + ASCEND_LOGE("getRepoStopFlag in Run, throw FORCE STOP."); \ TORCH_CHECK( \ false, \ __func__, \ diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index edbd5a8655..99b5d48e7d 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -257,6 +257,7 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) if (GetStatus() == RepoStatus::STOP_EXIT) { ClearQueue(); if (check_error) { + ASCEND_LOGE("getRepoStopFlag in EmptyQueue, throw FORCE STOP."); throw std::runtime_error("FORCE STOP." + PTA_ERROR(ErrCode::ACL)); } else { ASCEND_LOGE("FORCE STOP happend."); @@ -296,17 +297,30 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) } bool Repository::WriteQueue(void* cur_paras) { - std::lock_guard lock(mu_enqueue); - if (IsFullQueue()) { - return false; - } + std::lock_guard lock(mu_enqueue); - __sync_synchronize(); - manager().Copy(datas, write_idx.idx, cur_paras); - __sync_synchronize(); + if (GetStatus() == RepoStatus::STOP_EXIT) { + auto queueParam = static_cast(cur_paras); + auto type = queueParam->paramType; + if (type == c10_npu::queue::LAZY_DESTROY_EVENT) { + return true; + } else { + ClearQueue(); + ASCEND_LOGE("getRepoStopFlag in WriteQueue, throw FORCE STOP."); + throw std::runtime_error("FORCE STOP." 
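+                // Editor's note: during STOP_EXIT only LAZY_DESTROY_EVENT
+                // params are tolerated (returned true above); any other task
+                // type aborts the enqueue with FORCE STOP.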
+ PTA_ERROR(ErrCode::ACL)); + } + } - write_idx.idx = (write_idx.idx + 1) & (kQueueCapacity - 1); - return true; + if (IsFullQueue()) { + return false; + } + + __sync_synchronize(); + manager().Copy(datas, write_idx.idx, cur_paras); + __sync_synchronize(); + + write_idx.idx = (write_idx.idx + 1) & (kQueueCapacity - 1); + return true; } bool Repository::ReadQueue() @@ -381,6 +395,7 @@ void Repository::Enqueue(void* cur_paras) { return; } ClearQueue(); + ASCEND_LOGE("getRepoStopFlag in Enqueue, throw FORCE STOP."); throw std::runtime_error("FORCE STOP." + PTA_ERROR(ErrCode::ACL)); } diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 7a9c655c12..d73b536e94 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -382,6 +382,7 @@ PyObject* THNPModule_restart_device_wrap(PyObject* self, PyObject* arg) c10_npu::clear_mem_uce_info(); setDefaultStreamsStatus(device, c10_npu::RepoStatus::INIT); + c10_npu::NPUCachingAllocator::cleanEvent(); Py_RETURN_NONE; END_HANDLE_TH_ERRORS diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp index 304b997bef..c7e43b23a8 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp @@ -316,6 +316,13 @@ void NPUPluggableAllocator::updateBlockToSafe(const c10::DataPtr& ptr) "If you need it, please file an issue describing your use case."); } +void NPUPluggableAllocator::cleanEvent() +{ + TORCH_NPU_WARN( + "NPUPluggableAllocator does not yet support cleanEvent. " + "If you need it, please file an issue describing your use case."); +} + std::shared_ptr current_custom_allocator; diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.h b/torch_npu/csrc/npu/NPUPluggableAllocator.h index cca1df8952..d84025ebb5 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.h +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.h @@ -81,6 +81,7 @@ struct NPUPluggableAllocator bool checkBlockIsSafe(const c10::DataPtr& ptr) override; void markAllBlockUnsafe(int device) override; void updateBlockToSafe(const c10::DataPtr &ptr) override; + void cleanEvent() override; protected: std::function alloc_fn_; -- Gitee From 6f64e7df4d03f4054562785f752a27e2d7973220 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 11 Oct 2024 02:34:09 +0000 Subject: [PATCH 65/96] !15300 Update op_plugin commit id Merge pull request !15300 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5dade4c396..de41acb424 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5dade4c396054b9722fa94b0ff2ccbde5adacdfc +Subproject commit de41acb424b1d3b4ef4b44e1eb999173a24d9ea9 -- Gitee From 8d55c035fca6d78142f6a7ff196123ac1e633ad6 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 11 Oct 2024 08:51:14 +0000 Subject: [PATCH 66/96] !15324 Update op_plugin commit id Merge pull request !15324 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index de41acb424..bc401c1eee 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit de41acb424b1d3b4ef4b44e1eb999173a24d9ea9 +Subproject commit bc401c1eee57604b5bfbee6e67cd293587bff13b -- Gitee From 6fffc905295ac74c0da4d0972a5d2195e5ba300a Mon Sep 17 00:00:00 2001 From: wangqihui01 Date: Fri, 11 Oct 2024 09:39:07 +0000 Subject: 
[PATCH 67/96] !15268 revise supported_export_type error Merge pull request !15268 from wangqihui01/v2.1.0-6.0.rc3 --- torch_npu/profiler/experimental_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch_npu/profiler/experimental_config.py b/torch_npu/profiler/experimental_config.py index 673c6ccfa0..2b3ff14563 100644 --- a/torch_npu/profiler/experimental_config.py +++ b/torch_npu/profiler/experimental_config.py @@ -15,17 +15,17 @@ __all__ = [ def supported_profiler_level(): - return set((ProfilerLevel.Level0, ProfilerLevel.Level1, ProfilerLevel.Level2)) + return set((ProfilerLevel.Level0, ProfilerLevel.Level1, ProfilerLevel.Level2, ProfilerLevel.Level_none)) def supported_ai_core_metrics(): - return set((AiCMetrics.PipeUtilization, AiCMetrics.ArithmeticUtilization, + return set((AiCMetrics.AiCoreNone, AiCMetrics.PipeUtilization, AiCMetrics.ArithmeticUtilization, AiCMetrics.Memory, AiCMetrics.MemoryL0, AiCMetrics.MemoryUB, AiCMetrics.ResourceConflictRatio, AiCMetrics.L2Cache)) def supported_export_type(): - return set(ExportType.__members__.values()) + return set((ExportType.Db, ExportType.Text)) class ProfilerLevel: -- Gitee From 2f0fc1b5b9643a73190ba14b48e72f318ea34c9d Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 11 Oct 2024 10:45:10 +0000 Subject: [PATCH 68/96] !15331 Update op_plugin commit id Merge pull request !15331 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index bc401c1eee..0da965ba04 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit bc401c1eee57604b5bfbee6e67cd293587bff13b +Subproject commit 0da965ba0437b2171a8a87ddd2ce3115f0aa8dda -- Gitee From effcf17b4ee32544d95bea34830b90f08ee980ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B5=E9=9D=9E=E5=87=A1?= Date: Fri, 11 Oct 2024 13:30:01 +0000 Subject: [PATCH 69/96] =?UTF-8?q?!15328=20delete=20unused=20info=20for=20a?= =?UTF-8?q?llreduce=20Merge=20pull=20request=20!15328=20from=20=E9=82=B5?= =?UTF-8?q?=E9=9D=9E=E5=87=A1/d=5Fcallback21rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 7410f0e363..2351d727a1 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -205,13 +205,6 @@ std::string getExceptionMsgFromExceptionPtr(const std::exception_ptr& exceptionP } } -// exit call back for allreduce error -void exceptionCallback(aclrtExceptionInfo* exceptionInfo) -{ - // notice: Do not raise error, otherwise we will get call stacks of the rts callback function. 
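Editor's aside on the whitelist style adopted in !15268 above: returning an explicit set keeps `supported_export_type()` stable even when reflective enumeration misbehaves (the commit title calls the previous `ExportType.__members__.values()` form an error; whether `ExportType` is a real `Enum` is not visible in this hunk). A minimal sketch with a hypothetical stand-in class, not the profiler's actual types:

```python
class ExportType:
    # plain namespace class: it has no Enum-style __members__ attribute,
    # so reflective enumeration would fail here
    Text = "text"
    Db = "db"

def supported_export_type():
    # explicit whitelist, mirroring the patched style
    return {ExportType.Db, ExportType.Text}

assert ExportType.Text in supported_export_type()
assert len(supported_export_type()) == 2
```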
- fprintf(stdout, "Inner error, see details in Ascend logs."); -} - void getP2PHcclCommCofig(HcclCommConfig* config) { HcclCommConfigInit(config); @@ -1911,8 +1904,6 @@ c10::intrusive_ptr ProcessGroupHCCL::allreduce( tensors_cp, tensors_cp, [&](at::Tensor& input, at::Tensor& output, HcclComm comm, c10_npu::NPUStream& stream, std::shared_ptr is_dispatched) { - aclrtSetExceptionInfoCallback(exceptionCallback); - auto hcclType = getHcclDataType(input.scalar_type()); checkSupportedDataType(hcclType, functionName); RECORD_FUNCTION("HcclAllreduce", std::vector({input})); -- Gitee From 86bb98d5dbfb25f39523e257245de20eba0fe51b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=85=E9=A3=9E=E8=A6=81?= <1332490378@qq.com> Date: Sat, 12 Oct 2024 01:06:24 +0000 Subject: [PATCH 70/96] =?UTF-8?q?!15303=20[PROF]update=20mstx=20func=20Mer?= =?UTF-8?q?ge=20pull=20request=20!15303=20from=20=E6=A2=85=E9=A3=9E?= =?UTF-8?q?=E8=A6=81/mark=5F1=5Frc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/npu/test_mstx.py | 67 ++++++++++++++++++++++++++++ test/torch_npu_schema.json | 4 +- torch_npu/csrc/profiler/mstx_mgr.cpp | 36 ++++++++++++++- torch_npu/csrc/profiler/mstx_mgr.h | 3 ++ torch_npu/npu/mstx.py | 3 +- 5 files changed, 109 insertions(+), 4 deletions(-) create mode 100644 test/npu/test_mstx.py diff --git a/test/npu/test_mstx.py b/test/npu/test_mstx.py new file mode 100644 index 0000000000..f2baf03b1b --- /dev/null +++ b/test/npu/test_mstx.py @@ -0,0 +1,67 @@ +import torch +import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests + + +class TestMstx(TestCase): + mark_msg = '' + range_msg = '' + range_id = 0 + + def setUp(self): + def stub_mark(message : str = ''): + self.mark_msg = message + + def stub_range_start_on_host(message : str) -> int: + self.range_msg = message + self.range_id += 1 + return self.range_id + + def stub_range_start(message : str, stream=None): + self.range_msg = message + self.range_id += 1 + return self.range_id + + def stub_range_end(range_id: int): + self.range_id = range_id + + torch_npu._C._mark = stub_mark + torch_npu._C._mstx._range_start = stub_range_start + torch_npu._C._mstx._range_start_on_host = stub_range_start_on_host + torch_npu._C._mstx._range_end = stub_range_end + + def test_mark(self): + torch_npu.npu.mstx.mark("test1") + self.assertEqual("test1", self.mark_msg) + torch_npu.npu.mstx().mark("test2") # Verify compatibility + self.assertEqual("test2", self.mark_msg) + + def test_range_start(self): + self.range_id = 0 + ret_id = torch_npu.npu.mstx.range_start("") + self.assertEqual(0, ret_id) + ret_id = torch_npu.npu.mstx.range_start("test1") + self.assertEqual(1, ret_id) + self.assertEqual("test1", self.range_msg) + ret_id = torch_npu.npu.mstx.range_start("test2", None) + self.assertEqual(2, ret_id) + self.assertEqual("test2", self.range_msg) + + torch.npu.set_device(0) + current_stream = torch.npu.current_stream() + ret_id = torch_npu.npu.mstx.range_start("test3", current_stream) + self.assertEqual(3, ret_id) + self.assertEqual("test3", self.range_msg) + ret_id = torch_npu.npu.mstx.range_start("test4", 'invalid_stream') + self.assertEqual(0, ret_id) + + def test_range_end(self): + self.range_id = 0 + torch_npu.npu.mstx.range_end('invalid_range_id') + self.assertEqual(0, self.range_id) + torch_npu.npu.mstx.range_end(1) + self.assertEqual(1, self.range_id) + + +if __name__ == '__main__': + run_tests() \ No newline at end of file diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index 
5a36b1ea4b..d23480e9a1 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -1212,7 +1212,7 @@ "signature": "()" }, "torch_npu.npu.mstx.mark": { - "signature": "(self, message: str = '')" + "signature": "(message: str = '')" }, "torch_npu.npu.preferred_linalg_library": { "signature": "(backend: Union[NoneType, str, torch._C._LinalgBackend] = None) -> torch._C._LinalgBackend" @@ -1566,7 +1566,7 @@ "signature": "()" }, "torch_npu.npu.mstx.mstx.mark": { - "signature": "(self, message: str = '')" + "signature": "(message: str = '')" }, "torch_npu.npu.npu_config.finalize_dump": { "signature": "()" diff --git a/torch_npu/csrc/profiler/mstx_mgr.cpp b/torch_npu/csrc/profiler/mstx_mgr.cpp index fac2b207ca..87c7af80fc 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.cpp +++ b/torch_npu/csrc/profiler/mstx_mgr.cpp @@ -5,6 +5,9 @@ #include "torch_npu/csrc/core/npu/npu_log.h" #include "torch_npu/csrc/framework/OpCommand.h" #include "torch_npu/csrc/profiler/profiler_mgr.h" +#include "torch_npu/csrc/toolkit/profiler/common/utils.h" + +#include namespace torch_npu { namespace profiler { @@ -90,9 +93,40 @@ int MstxMgr::getRangeId() return ptRangeId_++; } -bool MstxMgr::isMstxEnable() +bool MstxMgr::isProfTxEnable() { return ProfilerMgr::GetInstance()->GetNpuTrace().load() && ProfilerMgr::GetInstance()->GetMsprofTx().load(); } + +bool MstxMgr::isMsptiTxEnableImpl() +{ + bool ret = false; + const char* envVal = std::getenv("LD_PRELOAD"); + if (envVal == nullptr) { + return ret; + } + static const std::string soName = "libmspti.so"; + std::stringstream ss(envVal); + std::string path; + while (std::getline(ss, path, ':')) { + path = torch_npu::toolkit::profiler::Utils::RealPath(path); + if ((path.size() > soName.size()) && (path.substr(path.size() - soName.size()) == soName)) { + ret = true; + break; + } + } + return ret; +} + +bool MstxMgr::isMsptiTxEnable() +{ + static bool isEnable = isMsptiTxEnableImpl(); + return isEnable; +} + +bool MstxMgr::isMstxEnable() +{ + return isProfTxEnable() || isMsptiTxEnable(); +} } } \ No newline at end of file diff --git a/torch_npu/csrc/profiler/mstx_mgr.h b/torch_npu/csrc/profiler/mstx_mgr.h index cc91780ca0..883662cb4b 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.h +++ b/torch_npu/csrc/profiler/mstx_mgr.h @@ -26,6 +26,9 @@ private: explicit MstxMgr(MstxMgr &&obj) = delete; MstxMgr& operator=(MstxMgr &&obj) = delete; + bool isProfTxEnable(); + bool isMsptiTxEnable(); + bool isMsptiTxEnableImpl(); private: std::atomic ptRangeId_{1}; std::unordered_set ptRangeIdsWithStream_; diff --git a/torch_npu/npu/mstx.py b/torch_npu/npu/mstx.py index 2710d6aeec..0c33145b3a 100644 --- a/torch_npu/npu/mstx.py +++ b/torch_npu/npu/mstx.py @@ -17,7 +17,8 @@ import torch_npu._C class mstx: - def mark(self, message:str = ""): + @staticmethod + def mark(message:str = ""): torch_npu._C._mark(message) @staticmethod -- Gitee From 85eeba4f4c5d8cace22d7973b446154cfd9677de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Sat, 12 Oct 2024 01:23:20 +0000 Subject: [PATCH 71/96] =?UTF-8?q?!15311=20Update=20read=5Fidx=20only=20in?= =?UTF-8?q?=20Dequeue.=20Merge=20pull=20request=20!15311=20from=20?= =?UTF-8?q?=E7=8E=8B=E8=B6=85/v2.1.0-6.0.rc3=5Fforce?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUQueue.cpp | 14 +++++--------- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 7 ++----- torch_npu/csrc/distributed/ProcessGroupHCCL.hpp | 2 +- torch_npu/npu/_recovery.py | 3 +-- 4 files changed, 9 
insertions(+), 17 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 99b5d48e7d..39bb3514f1 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -255,7 +255,6 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) } if (GetStatus() == RepoStatus::STOP_EXIT) { - ClearQueue(); if (check_error) { ASCEND_LOGE("getRepoStopFlag in EmptyQueue, throw FORCE STOP."); throw std::runtime_error("FORCE STOP." + PTA_ERROR(ErrCode::ACL)); @@ -272,7 +271,6 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) PyEval_RestoreThread(gilState); } #endif - read_idx.idx = write_idx.idx; if (check_error) { throw std::runtime_error("The Inner error is reported as above. " @@ -305,7 +303,6 @@ bool Repository::WriteQueue(void* cur_paras) { if (type == c10_npu::queue::LAZY_DESTROY_EVENT) { return true; } else { - ClearQueue(); ASCEND_LOGE("getRepoStopFlag in WriteQueue, throw FORCE STOP."); throw std::runtime_error("FORCE STOP." + PTA_ERROR(ErrCode::ACL)); } @@ -363,10 +360,7 @@ bool Repository::ReadQueue() } else if (GetStatus() != STOP_EXIT) { SetStatus(ERROR_EXIT); } - read_idx.idx = write_idx.idx; - __sync_synchronize(); - eventfd_write(efd_empty, 1); - eventfd_write(efd_write, 1); + ClearQueue(); return false; } @@ -394,7 +388,6 @@ void Repository::Enqueue(void* cur_paras) { if (type == c10_npu::queue::LAZY_DESTROY_EVENT) { return; } - ClearQueue(); ASCEND_LOGE("getRepoStopFlag in Enqueue, throw FORCE STOP."); throw std::runtime_error("FORCE STOP." + PTA_ERROR(ErrCode::ACL)); } @@ -402,7 +395,6 @@ void Repository::Enqueue(void* cur_paras) { if (GetStatus() == RepoStatus::ERROR_EXIT) { // Avoid repeatedly throwing exceptions SetStatus(CAN_EXIT); - read_idx.idx = write_idx.idx; throw std::runtime_error("The Inner error is reported as above. 
" "The process exits for this inner error, and " + repo_error + ".\n" + @@ -491,6 +483,9 @@ void Repository::Dequeue() { SetReadWorking(true); while (ret == false && GetStatus() != RepoStatus::CAN_EXIT) { + if (GetStatus() == RepoStatus::STOP_EXIT) { + ClearQueue(); + } ret = ReadQueue(); if (ret == false) { if (GetStatus() == RepoStatus::NEED_EXIT) { @@ -566,6 +561,7 @@ void Repository::ReleaseResource() { void Repository::ClearQueue() { read_idx.idx = write_idx.idx; + __sync_synchronize(); eventfd_write(efd_empty, 1); eventfd_write(efd_write, 1); } diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 2351d727a1..704aca55e5 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -859,7 +859,6 @@ void ProcessGroupHCCL::workCleanupLoop() workMetaListCV_.wait_for(lock, std::chrono::milliseconds(kWatchdogThreadSleepMillis), [&]() -> bool { return terminateProcessGroup_.load(); }); if (watchdogStatus == WatchdogStatus::STOP) { - workMetaList_.clear(); continue; } @@ -1203,7 +1202,7 @@ bool ProcessGroupHCCL::createHCCLCommEx(const std::vector& devices, } std::shared_ptr globalHcclComm = nullptr; try { - globalHcclComm = global_->getHcclCommByRankid(devices); + globalHcclComm = global_->getHcclCommByDevices(devices); } catch (const std::exception& e) { ASCEND_LOGI("create the global HCCL Communicator failed, the exception info is %s.", e.what()); return false; @@ -1484,8 +1483,6 @@ void ProcessGroupHCCL::workEnqueue(c10::intrusive_ptr lock(workMetaListMutex_); - workMetaList_.clear(); return; } if (!terminateProcessGroup_.load()) { @@ -1505,7 +1502,7 @@ ProcessGroupHCCL::Options::Options(bool is_high_priority_stream) { } -std::shared_ptr ProcessGroupHCCL::getHcclCommByRankid(const std::vector& devices) +std::shared_ptr ProcessGroupHCCL::getHcclCommByDevices(const std::vector& devices) { const auto key = getKeyFromDevices(devices); auto& hcclComms = getHCCLComm(key, devices); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index 5961971721..f8cf3c3090 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -407,7 +407,7 @@ public: // may indicate that there is some sort of collective desynchronization. 
uint64_t getSequenceNumberForGroup() override; - std::shared_ptr getHcclCommByRankid(const std::vector& devices); + std::shared_ptr getHcclCommByDevices(const std::vector& devices); int64_t getHcclComm(int rankid); diff --git a/torch_npu/npu/_recovery.py b/torch_npu/npu/_recovery.py index 3203ce1594..e9238caa03 100644 --- a/torch_npu/npu/_recovery.py +++ b/torch_npu/npu/_recovery.py @@ -66,8 +66,8 @@ def restart_device(device_id: int, rebuild_all_resources: int = False): npu_device = torch.device('npu') for pg in _pg_map: if (npu_device in pg._device_types): - pg._get_backend(npu_device).set_watchdog_status(WATCHDOG_STATUS_RUN) pg._get_backend(npu_device).clear_workmeta_list() + pg._get_backend(npu_device).set_watchdog_status(WATCHDOG_STATUS_RUN) def stop_device(device_id): @@ -78,4 +78,3 @@ def stop_device(device_id): for pg in _pg_map: if (npu_device in pg._device_types): pg._get_backend(npu_device).set_watchdog_status(WATCHDOG_STATUS_STOP) - pg._get_backend(npu_device).clear_workmeta_list() -- Gitee From e0dedf889e45252cf1e938bbe0684b52efdffdc6 Mon Sep 17 00:00:00 2001 From: wangjie Date: Sat, 12 Oct 2024 07:20:40 +0000 Subject: [PATCH 72/96] !15245 [PROF] Profiler trace step table fix Merge pull request !15245 from wangjie/cherry-pick-1728471938 --- .../analysis/prof_view/_trace_step_time_parser.py | 8 +++----- .../prof_view/prof_db_parse/_step_info_db_parser.py | 9 +++------ 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py index d11208d9bf..f465cd97f8 100644 --- a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py @@ -79,13 +79,11 @@ class TraceStepTimeParser(BaseParser): def get_prepare_time(self, step, step_list): for cur_step in step_list: if cur_step[StepInfoIndex.ID.value] == step: - fwk_step_start_ts = cur_step[StepInfoIndex.FWK_START_TS.value] + first_task_start_ts = cur_step[StepInfoIndex.FIRST_TASK_TS.value] if step is None: first_fwk_op = FwkFileParser(self._profiler_path).get_first_fwk_op() - start_time = convert_ns2us_float(first_fwk_op.ts) if first_fwk_op else fwk_step_start_ts - else: - start_time = fwk_step_start_ts - return cur_step[StepInfoIndex.FIRST_TASK_TS.value] - start_time + return (first_task_start_ts - convert_ns2us_float(first_fwk_op.ts)) if first_fwk_op else 0 + return first_task_start_ts - cur_step[StepInfoIndex.FWK_START_TS.value] return 0 def create_step_file(self, output_path: str, json_str: list, file_name: str) -> None: diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py index 1905c3227b..fb8d6c980c 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py @@ -35,9 +35,7 @@ class StepInfoDbParser(BaseParser): try: self._db_path = deps_data.get(Constant.DB_PARSER, "") torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) - if not torch_op_node: - return Constant.SUCCESS, [] - step_range = self.get_step_range(torch_op_node[0]) + step_range = self.get_step_range(torch_op_node[0] if torch_op_node else None) except Exception: print_error_msg("Failed to get step info from db.") DbManager.destroy_db_connect(self.db_conn, self.db_curs) @@ -74,9 +72,8 @@ class StepInfoDbParser(BaseParser): def 
get_step_range(self, root_node: TorchOpNode) -> list: step_node_list = [] - for level1_node in root_node.child_node_list: - if level1_node.is_profiler_step(): - step_node_list.append(level1_node) + if root_node is not None: + step_node_list = [node for node in root_node.child_node_list if node.is_profiler_step()] conn, curs = DbManager.create_connect_db(self._db_path) if not (conn and curs): print_warn_msg(f"Failed to connect to db file: {self._db_path}") -- Gitee From 879346a2453f2b7b96079db796cb6a6ecdc340a2 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 14 Oct 2024 02:54:18 +0000 Subject: [PATCH 73/96] !15348 Update op_plugin commit id Merge pull request !15348 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 0da965ba04..919680509f 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 0da965ba0437b2171a8a87ddd2ce3115f0aa8dda +Subproject commit 919680509f869214d78b7b6d7a68c7d065394b68 -- Gitee From 1664ceeb08d6895efe02fd755cb35c566b818165 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E5=85=89=E6=B5=A9?= Date: Mon, 14 Oct 2024 11:05:08 +0000 Subject: [PATCH 74/96] =?UTF-8?q?!15355=20modify=20readme=20Merge=20pull?= =?UTF-8?q?=20request=20!15355=20from=20=E9=83=AD=E5=85=89=E6=B5=A9/v2.1.0?= =?UTF-8?q?-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 103 +++++++++++++++++++++++++++------------------------ README.zh.md | 102 +++++++++++++++++++++++++++----------------------- 2 files changed, 110 insertions(+), 95 deletions(-) diff --git a/README.md b/README.md index a6cbb5c10c..41304101e9 100644 --- a/README.md +++ b/README.md @@ -44,14 +44,17 @@ If the installation fails, use the download link or visit the [PyTorch official | x86 | Python3.8 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp38-cp38-linux_x86_64.whl#sha256=9e5cfd931a65b38d222755a45dabb53b836be31bc620532bc66fee77e3ff67dc) | | x86 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp39-cp39-linux_x86_64.whl#sha256=86cc28df491fa84738affe752f9870791026565342f69e4ab63e5b935f00a495) | | x86 | Python3.10 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=5077921fc2b54e69a534f3a9c0b98493c79a5547c49d46f5e77e42da3610e011) | +| x86 | Python3.11 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=5954924ce74bc7e6a6c811e3fa4bdda9936d9889f6369fd068420c444bfd1cae) | | aarch64 | Python3.8 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=761822761fffaa1c18a62c5deb13abaa780862577d3eadc428f1daa632536905) | | aarch64 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=de7d63c6ecece118684415a3dbd4805af4a4c1ee1490cccf7405d8c240a481b4) | | aarch64 | Python3.10 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=a04a0296d47f28960f51c18c5489a8c3472f624ec3b5bcc8e2096314df8c3342) | +| aarch64 | Python3.11 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=8132efb782cd181cc2dcca5e58effbe4217cdb2581206ac71466d535bf778867) | + 3. 
**Install torch-npu** ``` -pip3 install torch-npu==2.1.0.post6 +pip3 install torch-npu==2.1.0.post8 ``` ### From Source @@ -61,7 +64,7 @@ In some special scenarios, users may need to compile **torch-npu** by themselves 1. **Clone torch-npu** ``` - git clone https://github.com/ascend/pytorch.git -b v2.1.0-6.0.rc2 --depth 1 + git clone https://github.com/ascend/pytorch.git -b v2.1.0-6.0.rc3 --depth 1 ``` 2. **Build Docker Image** @@ -120,52 +123,56 @@ Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for m ## PyTorch and Python Version Matching Table | PyTorch Version | Python Version | -| ------------- | :----------------------------------------------------------- | -| PyTorch1.11.0 | Python3.7.x(>=3.7.5),Python3.8.x,Python3.9.x,Python3.10.x | -| PyTorch2.1.0 | Python3.8.x,Python3.9.x,Python3.10.x | -| PyTorch2.2.0 | Python3.8.x,Python3.9.x,Python3.10.x | -| PyTorch2.3.1 | Python3.8.x, Python3.9.x, Python3.10.x | - +|-----------------|:----------------------------------------------------------| +| PyTorch1.11.0 | Python3.7.x(>=3.7.5),Python3.8.x,Python3.9.x,Python3.10.x | +| PyTorch2.1.0 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | +| PyTorch2.2.0 | Python3.8.x,Python3.9.x,Python3.10.x | +| PyTorch2.3.1 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | +| PyTorch2.4.0 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | ## Ascend Auxiliary Software **PyTorch Extension** versions follow the naming convention `{PyTorch version}-{Ascend version}`, where the former represents the PyTorch version compatible with the **PyTorch Extension**, and the latter is used to match the CANN version. The detailed matching is as follows: -| CANN Version | Supported PyTorch Version | Supported Extension Version | Github Branch | AscendHub Image Version/Name([Link](https://ascendhub.huawei.com/#/detail/pytorch-modelzoo)) | -|----------------|--------------|-------------------|-------------------|----------------------| -| CANN 8.0.RC2 | 2.3.1 | 2.3.1 | v2.3.1-6.0.rc2 | - | -| | 2.2.0 | 2.2.0.post2 | v2.2.0-6.0.rc2 | - | -| | 2.1.0 | 2.1.0.post6 | v2.1.0-6.0.rc2 | - | -| | 1.11.0 | 1.11.0.post14 | v1.11.0-6.0.rc2 | - | -| CANN 8.0.RC1 | 2.2.0 | 2.2.0 | v2.2.0-6.0.rc1 | - | -| | 2.1.0 | 2.1.0.post3 | v2.1.0-6.0.rc1 | - | -| | 1.11.0 | 1.11.0.post11 | v1.11.0-6.0.rc1 | - | -| CANN 7.0.0 | 2.1.0 | 2.1.0 | v2.1.0-5.0.0 | - | -| | 2.0.1 | 2.0.1.post1 | v2.0.1-5.0.0 | - | -| | 1.11.0 | 1.11.0.post8 | v1.11.0-5.0.0 | - | -| CANN 7.0.RC1 | 2.1.0 | 2.1.0.rc1 | v2.1.0-5.0.rc3 | - | -| | 2.0.1 | 2.0.1 | v2.0.1-5.0.rc3 | - | -| | 1.11.0 | 1.11.0.post4 | v1.11.0-5.0.rc3 | - | -| CANN 6.3.RC3.1 | 1.11.0 | 1.11.0.post3 | v1.11.0-5.0.rc2.2 | - | -| CANN 6.3.RC3 | 1.11.0 | 1.11.0.post2 | v1.11.0-5.0.rc2.1 | - | -| CANN 6.3.RC2 | 2.0.1 | 2.0.1.rc1 | v2.0.1-5.0.rc2 | - | -| | 1.11.0 | 1.11.0.post1 | v1.11.0-5.0.rc2 | 23.0.RC1-1.11.0 | -| | 1.8.1 | 1.8.1.post2 | v1.8.1-5.0.rc2 | 23.0.RC1-1.8.1 | -| CANN 6.3.RC1 | 1.11.0 | 1.11.0 | v1.11.0-5.0.rc1 | - | -| | 1.8.1 | 1.8.1.post1 | v1.8.1-5.0.rc1 | - | -| CANN 6.0.1 | 1.5.0 | 1.5.0.post8 | v1.5.0-3.0.0 | 22.0.0 | -| | 1.8.1 | 1.8.1 | v1.8.1-3.0.0 | 22.0.0-1.8.1 | -| | 1.11.0 | 1.11.0.rc2(beta) | v1.11.0-3.0.0 | - | -| CANN 6.0.RC1 | 1.5.0 | 1.5.0.post7 | v1.5.0-3.0.rc3 | 22.0.RC3 | -| | 1.8.1 | 1.8.1.rc3 | v1.8.1-3.0.rc3 | 22.0.RC3-1.8.1 | -| | 1.11.0 | 1.11.0.rc1(beta) | v1.11.0-3.0.rc3 | - | -| CANN 5.1.RC2 | 1.5.0 | 1.5.0.post6 | v1.5.0-3.0.rc2 | 22.0.RC2 | -| | 1.8.1 | 1.8.1.rc2 | v1.8.1-3.0.rc2 | 22.0.RC2-1.8.1 | -| CANN 5.1.RC1 | 1.5.0 | 
1.5.0.post5 | v1.5.0-3.0.rc1 | 22.0.RC1 | -| | 1.8.1 | 1.8.1.rc1 | v1.8.1-3.0.rc1 | - | -| CANN 5.0.4 | 1.5.0 | 1.5.0.post4 | 2.0.4.tr5 | 21.0.4 | -| CANN 5.0.3 | 1.8.1 | 1.5.0.post3 | 2.0.3.tr5 | 21.0.3 | -| CANN 5.0.2 | 1.5.0 | 1.5.0.post2 | 2.0.2.tr5 | 21.0.2 | +| CANN Version | Supported PyTorch Version | Supported Extension Version | Github Branch | +|-----------------------|---------------------------|-----------------------------|-------------------| +| CANN 8.0.RC3 | 2.4.0 | 2.4.0 | v2.4.0-6.0.rc3 | +| | 2.3.1 | 2.3.1.post2 | v2.3.1-6.0.rc3 | +| | 2.1.0 | 2.1.0.post8 | v2.1.0-6.0.rc3 | +| CANN 8.0.RC2 | 2.3.1 | 2.3.1 | v2.3.1-6.0.rc2 | +| | 2.2.0 | 2.2.0.post2 | v2.2.0-6.0.rc2 | +| | 2.1.0 | 2.1.0.post6 | v2.1.0-6.0.rc2 | +| | 1.11.0 | 1.11.0.post14 | v1.11.0-6.0.rc2 | +| CANN 8.0.RC2.alpha002 | 2.3.1 | 2.3.1rc1 | v2.3.1 | +| CANN 8.0.RC1 | 2.2.0 | 2.2.0 | v2.2.0-6.0.rc1 | +| | 2.1.0 | 2.1.0.post4 | v2.1.0-6.0.rc1 | +| | 1.11.0 | 1.11.0.post11 | v1.11.0-6.0.rc1 | +| CANN 7.0.0 | 2.1.0 | 2.1.0 | v2.1.0-5.0.0 | +| | 2.0.1 | 2.0.1.post1 | v2.0.1-5.0.0 | +| | 1.11.0 | 1.11.0.post8 | v1.11.0-5.0.0 | +| CANN 7.0.RC1 | 2.1.0 | 2.1.0.rc1 | v2.1.0-5.0.rc3 | +| | 2.0.1 | 2.0.1 | v2.0.1-5.0.rc3 | +| | 1.11.0 | 1.11.0.post4 | v1.11.0-5.0.rc3 | +| CANN 6.3.RC3.1 | 1.11.0 | 1.11.0.post3 | v1.11.0-5.0.rc2.2 | +| CANN 6.3.RC3 | 1.11.0 | 1.11.0.post2 | v1.11.0-5.0.rc2.1 | +| CANN 6.3.RC2 | 2.0.1 | 2.0.1.rc1 | v2.0.1-5.0.rc2 | +| | 1.11.0 | 1.11.0.post1 | v1.11.0-5.0.rc2 | +| | 1.8.1 | 1.8.1.post2 | v1.8.1-5.0.rc2 | +| CANN 6.3.RC1 | 1.11.0 | 1.11.0 | v1.11.0-5.0.rc1 | +| | 1.8.1 | 1.8.1.post1 | v1.8.1-5.0.rc1 | +| CANN 6.0.1 | 1.5.0 | 1.5.0.post8 | v1.5.0-3.0.0 | +| | 1.8.1 | 1.8.1 | v1.8.1-3.0.0 | +| | 1.11.0 | 1.11.0.rc2(beta) | v1.11.0-3.0.0 | +| CANN 6.0.RC1 | 1.5.0 | 1.5.0.post7 | v1.5.0-3.0.rc3 | +| | 1.8.1 | 1.8.1.rc3 | v1.8.1-3.0.rc3 | +| | 1.11.0 | 1.11.0.rc1(beta) | v1.11.0-3.0.rc3 | +| CANN 5.1.RC2 | 1.5.0 | 1.5.0.post6 | v1.5.0-3.0.rc2 | +| | 1.8.1 | 1.8.1.rc2 | v1.8.1-3.0.rc2 | +| CANN 5.1.RC1 | 1.5.0 | 1.5.0.post5 | v1.5.0-3.0.rc1 | +| | 1.8.1 | 1.8.1.rc1 | v1.8.1-3.0.rc1 | +| CANN 5.0.4 | 1.5.0 | 1.5.0.post4 | 2.0.4.tr5 | +| CANN 5.0.3 | 1.8.1 | 1.5.0.post3 | 2.0.3.tr5 | +| CANN 5.0.2 | 1.5.0 | 1.5.0.post2 | 2.0.2.tr5 | ## Suggestions and Communication @@ -186,7 +193,7 @@ The version branches of AscendPyTorch have the following maintenance phases: | **PyTorch** | **Maintenance Policies** | **Status** | **Launch Date** | **Subsequent Status** | **EOL Date** | |-----------|--------------------|--------------|------------|-----------------|-----------| -| 2.4.0 | Planning | - | - | - | | +| 2.4.0 | Regular Release | Development | 2024/10/15 |Expected to enter maintenance status from March 15, 2025 | | | 2.3.1 | Regular Release | Development | 2024/06/06 | Expected to enter maintenance status from December 6, 2024 | | | 2.2.0 | Regular Release | Maintained | 2024/04/01 | Expected to enter maintenance free status from September 10th, 2025| | | 2.1.0 | Long Term Support | Development | 2023/10/15 | Expected to enter maintenance status from March 30, 2025 | | @@ -201,10 +208,10 @@ For more detailed information on installation guides, model migration, training/ | Document Name | Document Link | | -------------------------------- | ------------------------------------------------------------ | -| AscendPyTorch Installation Guide | [link](https://www.hiascend.com/document/detail/zh/Pytorch/60RC1/configandinstg/instg/insg_0001.html) | -| AscendPyTorch Network Model Migration and Training | 
[link](https://www.hiascend.com/document/detail/zh/Pytorch/60RC1/ptmoddevg/trainingmigrguide/PT_LMTMOG_0003.html) | -| AscendPyTorch Operator Adaptation | [link](https://www.hiascend.com/document/detail/zh/canncommercial/80RC1/developmentguide/opdevg/Ascendcopdevg/atlas_ascendc_10_0048.html) | -| AscendPyTorch API List (PyTorch and Custom Interfaces) | [link](https://www.hiascend.com/document/detail/zh/Pytorch/60RC1/apiref/apilist/ptaoplist_000002.html) | +| AscendPyTorch Installation Guide | [link](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/configandinstg/instg/insg_0001.html) | +| AscendPyTorch Network Model Migration and Training | [link](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/ptmoddevg/trainingmigrguide/PT_LMTMOG_0003.html) | +| AscendPyTorch Operator Adaptation | [link](https://www.hiascend.com/document/detail/zh/canncommercial/80RC3/developmentguide/opdevg/Ascendcopdevg/atlas_ascendc_10_0048.html) | +| AscendPyTorch API List (PyTorch and Custom Interfaces) | [link](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/apiref/apilist/ptaoplist_000002.html) | ## License diff --git a/README.zh.md b/README.zh.md index 91701847e7..2ab796b19e 100644 --- a/README.zh.md +++ b/README.zh.md @@ -35,9 +35,11 @@ pip3 install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu | x86 | Python3.8 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp38-cp38-linux_x86_64.whl#sha256=9e5cfd931a65b38d222755a45dabb53b836be31bc620532bc66fee77e3ff67dc) | | x86 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp39-cp39-linux_x86_64.whl#sha256=86cc28df491fa84738affe752f9870791026565342f69e4ab63e5b935f00a495) | | x86 | Python3.10 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=5077921fc2b54e69a534f3a9c0b98493c79a5547c49d46f5e77e42da3610e011) | +| x86 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=5954924ce74bc7e6a6c811e3fa4bdda9936d9889f6369fd068420c444bfd1cae) | | aarch64 | Python3.8 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=761822761fffaa1c18a62c5deb13abaa780862577d3eadc428f1daa632536905) | | aarch64 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=de7d63c6ecece118684415a3dbd4805af4a4c1ee1490cccf7405d8c240a481b4) | | aarch64 | Python3.10 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=a04a0296d47f28960f51c18c5489a8c3472f624ec3b5bcc8e2096314df8c3342) | +| aarch64 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=8132efb782cd181cc2dcca5e58effbe4217cdb2581206ac71466d535bf778867) | 2. **安装torch_npu依赖** @@ -51,7 +53,7 @@ pip3 install setuptools 3. **安装torch_npu** ``` -pip3 install torch-npu==2.1.0.post6 +pip3 install torch-npu==2.1.0.post8 ``` 如需要保存安装日志,可在pip3 install命令后面加上参数 `--log `,并对您指定的目录``做好权限管控。 @@ -62,7 +64,7 @@ pip3 install torch-npu==2.1.0.post6 1. **克隆torch_npu代码仓** ``` - git clone https://gitee.com/ascend/pytorch.git -b v2.1.0-6.0.rc2 --depth 1 + git clone https://gitee.com/ascend/pytorch.git -b v2.1.0-6.0.rc3 --depth 1 ``` 2. 
**构建镜像** @@ -128,53 +130,59 @@ print(z) ## PyTorch与Python版本配套表 -| PyTorch版本 | Python版本 | -| ------------- | :----------------------------------------------------------- | +## PyTorch与Python版本配套表 + +| PyTorch版本 | Python版本 | +|---------------|:-------------------------------------------------------------| | PyTorch1.11.0 | Python3.7.x(>=3.7.5), Python3.8.x, Python3.9.x, Python3.10.x | -| PyTorch2.1.0 | Python3.8.x, Python3.9.x, Python3.10.x | +| PyTorch2.1.0 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | | PyTorch2.2.0 | Python3.8.x, Python3.9.x, Python3.10.x | -| PyTorch2.3.1 | Python3.8.x, Python3.9.x, Python3.10.x | - +| PyTorch2.3.1 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | +| PyTorch2.4.0 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | ## 昇腾辅助软件 **PyTorch Extension**版本号采用`{PyTorch版本}-{昇腾版本}`命名规则,前者为**PyTorch Extension**匹配的PyTorch版本,后者用于匹配CANN版本,详细匹配如下: -| CANN版本 | 支持的PyTorch版本 | 支持的Extension版本 | Gitee分支 | AscendHub镜像版本/名称([链接](https://ascendhub.huawei.com/#/detail/pytorch-modelzoo)) | -|----------------|--------------|-------------------|-------------------|----------------------| -| CANN 8.0.RC2 | 2.3.1 | 2.3.1 | v2.3.1-6.0.rc2 | - | -| | 2.2.0 | 2.2.0.post2 | v2.2.0-6.0.rc2 | - | -| | 2.1.0 | 2.1.0.post6 | v2.1.0-6.0.rc2 | - | -| | 1.11.0 | 1.11.0.post14 | v1.11.0-6.0.rc2 | - | -| CANN 8.0.RC1 | 2.2.0 | 2.2.0 | v2.2.0-6.0.rc1 | - | -| | 2.1.0 | 2.1.0.post3 | v2.1.0-6.0.rc1 | - | -| | 1.11.0 | 1.11.0.post11 | v1.11.0-6.0.rc1 | - | -| CANN 7.0.0 | 2.1.0 | 2.1.0 | v2.1.0-5.0.0 | - | -| | 2.0.1 | 2.0.1.post1 | v2.0.1-5.0.0 | - | -| | 1.11.0 | 1.11.0.post8 | v1.11.0-5.0.0 | - | -| CANN 7.0.RC1 | 2.1.0 | 2.1.0.rc1 | v2.1.0-5.0.rc3 | - | -| | 2.0.1 | 2.0.1 | v2.0.1-5.0.rc3 | - | -| | 1.11.0 | 1.11.0.post4 | v1.11.0-5.0.rc3 | - | -| CANN 6.3.RC3.1 | 1.11.0 | 1.11.0.post3 | v1.11.0-5.0.rc2.2 | - | -| CANN 6.3.RC3 | 1.11.0 | 1.11.0.post2 | v1.11.0-5.0.rc2.1 | - | -| CANN 6.3.RC2 | 2.0.1 | 2.0.1.rc1 | v2.0.1-5.0.rc2 | - | -| | 1.11.0 | 1.11.0.post1 | v1.11.0-5.0.rc2 | 23.0.RC1-1.11.0 | -| | 1.8.1 | 1.8.1.post2 | v1.8.1-5.0.rc2 | 23.0.RC1-1.8.1 | -| CANN 6.3.RC1 | 1.11.0 | 1.11.0 | v1.11.0-5.0.rc1 | - | -| | 1.8.1 | 1.8.1.post1 | v1.8.1-5.0.rc1 | - | -| CANN 6.0.1 | 1.5.0 | 1.5.0.post8 | v1.5.0-3.0.0 | 22.0.0 | -| | 1.8.1 | 1.8.1 | v1.8.1-3.0.0 | 22.0.0-1.8.1 | -| | 1.11.0 | 1.11.0.rc2(beta) | v1.11.0-3.0.0 | - | -| CANN 6.0.RC1 | 1.5.0 | 1.5.0.post7 | v1.5.0-3.0.rc3 | 22.0.RC3 | -| | 1.8.1 | 1.8.1.rc3 | v1.8.1-3.0.rc3 | 22.0.RC3-1.8.1 | -| | 1.11.0 | 1.11.0.rc1(beta) | v1.11.0-3.0.rc3 | - | -| CANN 5.1.RC2 | 1.5.0 | 1.5.0.post6 | v1.5.0-3.0.rc2 | 22.0.RC2 | -| | 1.8.1 | 1.8.1.rc2 | v1.8.1-3.0.rc2 | 22.0.RC2-1.8.1 | -| CANN 5.1.RC1 | 1.5.0 | 1.5.0.post5 | v1.5.0-3.0.rc1 | 22.0.RC1 | -| | 1.8.1 | 1.8.1.rc1 | v1.8.1-3.0.rc1 | - | -| CANN 5.0.4 | 1.5.0 | 1.5.0.post4 | 2.0.4.tr5 | 21.0.4 | -| CANN 5.0.3 | 1.8.1 | 1.5.0.post3 | 2.0.3.tr5 | 21.0.3 | -| CANN 5.0.2 | 1.5.0 | 1.5.0.post2 | 2.0.2.tr5 | 21.0.2 | +| CANN版本 | 支持的PyTorch版本 | 支持的Extension版本 | Gitee分支 | +|-----------------------|--------------|------------------|-------------------| +| CANN 8.0.RC3 | 2.4.0 | 2.4.0 | v2.4.0-6.0.rc3 | +| | 2.3.1 | 2.3.1.post2 | v2.3.1-6.0.rc3 | +| | 2.1.0 | 2.1.0.post8 | v2.1.0-6.0.rc3 | +| CANN 8.0.RC2 | 2.3.1 | 2.3.1 | v2.3.1-6.0.rc2 | +| | 2.2.0 | 2.2.0.post2 | v2.2.0-6.0.rc2 | +| | 2.1.0 | 2.1.0.post6 | v2.1.0-6.0.rc2 | +| | 1.11.0 | 1.11.0.post14 | v1.11.0-6.0.rc2 | +| CANN 8.0.RC2.alpha002 | 2.3.1 | 2.3.1rc1 | v2.3.1 | +| CANN 8.0.RC1 | 2.2.0 | 2.2.0 | v2.2.0-6.0.rc1 | +| 
| 2.1.0 | 2.1.0.post4 | v2.1.0-6.0.rc1 | +| | 1.11.0 | 1.11.0.post11 | v1.11.0-6.0.rc1 | +| CANN 7.0.0 | 2.1.0 | 2.1.0 | v2.1.0-5.0.0 | +| | 2.0.1 | 2.0.1.post1 | v2.0.1-5.0.0 | +| | 1.11.0 | 1.11.0.post8 | v1.11.0-5.0.0 | +| CANN 7.0.RC1 | 2.1.0 | 2.1.0.rc1 | v2.1.0-5.0.rc3 | +| | 2.0.1 | 2.0.1 | v2.0.1-5.0.rc3 | +| | 1.11.0 | 1.11.0.post4 | v1.11.0-5.0.rc3 | +| CANN 6.3.RC3.1 | 1.11.0 | 1.11.0.post3 | v1.11.0-5.0.rc2.2 | +| CANN 6.3.RC3 | 1.11.0 | 1.11.0.post2 | v1.11.0-5.0.rc2.1 | +| CANN 6.3.RC2 | 2.0.1 | 2.0.1.rc1 | v2.0.1-5.0.rc2 | +| | 1.11.0 | 1.11.0.post1 | v1.11.0-5.0.rc2 | +| | 1.8.1 | 1.8.1.post2 | v1.8.1-5.0.rc2 | +| CANN 6.3.RC1 | 1.11.0 | 1.11.0 | v1.11.0-5.0.rc1 | +| | 1.8.1 | 1.8.1.post1 | v1.8.1-5.0.rc1 | +| CANN 6.0.1 | 1.5.0 | 1.5.0.post8 | v1.5.0-3.0.0 | +| | 1.8.1 | 1.8.1 | v1.8.1-3.0.0 | +| | 1.11.0 | 1.11.0.rc2(beta) | v1.11.0-3.0.0 | +| CANN 6.0.RC1 | 1.5.0 | 1.5.0.post7 | v1.5.0-3.0.rc3 | +| | 1.8.1 | 1.8.1.rc3 | v1.8.1-3.0.rc3 | +| | 1.11.0 | 1.11.0.rc1(beta) | v1.11.0-3.0.rc3 | +| CANN 5.1.RC2 | 1.5.0 | 1.5.0.post6 | v1.5.0-3.0.rc2 | +| | 1.8.1 | 1.8.1.rc2 | v1.8.1-3.0.rc2 | +| CANN 5.1.RC1 | 1.5.0 | 1.5.0.post5 | v1.5.0-3.0.rc1 | +| | 1.8.1 | 1.8.1.rc1 | v1.8.1-3.0.rc1 | +| CANN 5.0.4 | 1.5.0 | 1.5.0.post4 | 2.0.4.tr5 | +| CANN 5.0.3 | 1.8.1 | 1.5.0.post3 | 2.0.3.tr5 | +| CANN 5.0.2 | 1.5.0 | 1.5.0.post2 | 2.0.2.tr5 | ## 建议与交流 @@ -195,7 +203,7 @@ AscendPyTorch版本分支的维护阶段如下: | **PyTorch版本** | **维护策略** | **当前状态** | **发布时间** | **后续状态** | **EOL日期** | |-----------|-----------|--------|------------|-----------------------|-----------| -| 2.4.0 | 常规分支 | 计划 | - | - | - | | +| 2.4.0 | 常规分支 | 开发 | 2024/10/15 | 预计2025/03/15起进入维护状态 | - | | 2.3.1 | 常规分支 | 开发 | 2024/06/06 | 预计2024/12/06起进入维护状态 | | | 2.2.0 | 常规分支 | 维护 | 2024/04/01 | 预计2025/9/10起进入无维护状态 | | | 2.1.0 | 长期支持 | 开发 | 2023/10/15 | 预计2025/03/30起进入维护状态 | | @@ -215,10 +223,10 @@ AscendPyTorch版本分支的维护阶段如下: | 文档名称 | 文档链接 | | -------------------------- | ------------------------------------------------------------ | -| AscendPyTorch 安装指南 | [参考链接](https://www.hiascend.com/document/detail/zh/Pytorch/60RC1/configandinstg/instg/insg_0001.html) | -| AscendPyTorch 网络模型迁移和训练 | [参考链接](https://www.hiascend.com/document/detail/zh/Pytorch/60RC1/ptmoddevg/trainingmigrguide/PT_LMTMOG_0003.html) | -| AscendPyTorch 算子适配 | [参考链接](https://www.hiascend.com/document/detail/zh/canncommercial/80RC1/developmentguide/opdevg/Ascendcopdevg/atlas_ascendc_10_0048.html) | -| AscendPyTorch API清单(PyTorch原生接口与自定义接口) | [参考链接](https://www.hiascend.com/document/detail/zh/Pytorch/60RC1/apiref/apilist/ptaoplist_000002.html) | +| AscendPyTorch 安装指南 | [参考链接](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/configandinstg/instg/insg_0001.html) | +| AscendPyTorch 网络模型迁移和训练 | [参考链接](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/ptmoddevg/trainingmigrguide/PT_LMTMOG_0003.html) | +| AscendPyTorch 算子适配 | [参考链接](https://www.hiascend.com/document/detail/zh/canncommercial/80RC3/developmentguide/opdevg/Ascendcopdevg/atlas_ascendc_10_0048.html) | +| AscendPyTorch API清单(PyTorch原生接口与自定义接口) | [参考链接](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/apiref/apilist/ptaoplist_000002.html) | ## 许可证 -- Gitee From e57ee9803d1ed848cb3662886058c14053240727 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 14 Oct 2024 13:44:45 +0000 Subject: [PATCH 75/96] !15365 Update op_plugin commit id Merge pull request !15365 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/third_party/op-plugin b/third_party/op-plugin index 919680509f..ffd6a97674 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 919680509f869214d78b7b6d7a68c7d065394b68 +Subproject commit ffd6a976745747f4781bbf2231b6b8b254c46a0e -- Gitee From 30965319c1160b881c77577da611daec09c5c35b Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Tue, 15 Oct 2024 11:53:38 +0000 Subject: [PATCH 76/96] !15372 Update torchair commit id Merge pull request !15372 from torchair_robot/v2.1.0-6.0.rc3 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 820f0378f4..b79847a724 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 820f0378f4591707969e1aa55935cff7b823b155 +Subproject commit b79847a7243424badd59b18cccda3db7ad148c6c -- Gitee From f826e04a063da8ce885fddf56f69b6f41d717d93 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Wed, 16 Oct 2024 09:00:50 +0000 Subject: [PATCH 77/96] !15402 Update torchair commit id Merge pull request !15402 from torchair_robot/v2.1.0-6.0.rc3 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index b79847a724..dbf8c1fc68 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit b79847a7243424badd59b18cccda3db7ad148c6c +Subproject commit dbf8c1fc6855b53e374f332eb792999468011b12 -- Gitee From 80da101fd38c2ba50efecffa344d8990cf58e5d8 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Thu, 17 Oct 2024 07:51:37 +0000 Subject: [PATCH 78/96] !15423 Update torchair commit id Merge pull request !15423 from torchair_robot/v2.1.0-6.0.rc3 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index dbf8c1fc68..341bb795a6 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit dbf8c1fc6855b53e374f332eb792999468011b12 +Subproject commit 341bb795a69992114815f51ca9a51b99138ed20f -- Gitee From 147622e4252e9e6e9ee950d3cedf02f70f4ab53a Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Fri, 18 Oct 2024 06:30:37 +0000 Subject: [PATCH 79/96] !15411 update readme(add hardware support) Merge pull request !15411 from huangyunlong/2.1r3readme --- README.md | 20 ++++++++++++++++++++ README.zh.md | 21 +++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/README.md b/README.md index 41304101e9..90f968a4bc 100644 --- a/README.md +++ b/README.md @@ -174,6 +174,26 @@ Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for m | CANN 5.0.3 | 1.8.1 | 1.5.0.post3 | 2.0.3.tr5 | | CANN 5.0.2 | 1.5.0 | 1.5.0.post2 | 2.0.2.tr5 | +## Hardware support + +The Ascend training device includes the following models, all of which can be used as training environments for PyTorch models +| Product series | Product model | +|-----------------------|----------------------------------| +| Atlas Training series products | Atlas 800(model: 9000) | +| | Atlas 800(model:9010) | +| | Atlas 900 PoD(model:9000) | +| | Atlas 300T(model:9000) | +| | Atlas 300T Pro(model:9000) | +| Atlas A2 Training series products | Atlas 800T A2 | +| | Atlas 900 A2 PoD | +| | Atlas 200T A2 Box16 | +| | Atlas 300T A2 | + +The Ascend inference device includes the following models, all of 
which can be used as inference environments for large models +| Product series | Product model | +|-----------------------|----------------------------------| +| Atlas 800I A2 Inference product | Atlas 800I A2 | + ## Suggestions and Communication Everyone is welcome to contribute to the community. If you have any questions or suggestions, you can submit [Github Issues](https://github.com/Ascend/pytorch/issues). We will reply to you as soon as possible. Thank you very much. diff --git a/README.zh.md b/README.zh.md index 2ab796b19e..de86930a32 100644 --- a/README.zh.md +++ b/README.zh.md @@ -184,6 +184,27 @@ print(z) | CANN 5.0.3 | 1.8.1 | 1.5.0.post3 | 2.0.3.tr5 | | CANN 5.0.2 | 1.5.0 | 1.5.0.post2 | 2.0.2.tr5 | +## 硬件配套 + +昇腾训练设备包含以下型号,都可作为PyTorch模型的训练环境 +| 产品系列 | 产品型号 | +|-----------------------|----------------------------------| +| Atlas 训练系列产品 | Atlas 800 训练服务器(型号:9000) | +| | Atlas 800 训练服务器(型号:9010) | +| | Atlas 900 PoD(型号:9000) | +| | Atlas 300T 训练卡(型号:9000) | +| | Atlas 300T Pro 训练卡(型号:9000)| +| Atlas A2 训练系列产品 | Atlas 800T A2 训练服务器 | +| | Atlas 900 A2 PoD 集群基础单元 | +| | Atlas 200T A2 Box16 异构子框 | +| | Atlas 300T A2 训练卡 | + +昇腾推理设备包含以下型号,都可作为大模型的推理环境 +| 产品系列 | 产品型号 | +|-----------------------|----------------------------------| +| Atlas 800I A2推理产品 | Atlas 800I A2 推理服务器 | + + ## 建议与交流 欢迎大家为社区做贡献。如果有任何疑问或建议,请提交[gitee Issues](https://gitee.com/Ascend/pytorch/issues),我们会尽快回复。感谢您的支持。 -- Gitee From 6b77de56fff3d40c23f1d8f6b12feeab56530e5a Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 18 Oct 2024 13:43:28 +0000 Subject: [PATCH 80/96] !15455 Update op_plugin commit id Merge pull request !15455 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index ffd6a97674..070332e65b 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit ffd6a976745747f4781bbf2231b6b8b254c46a0e +Subproject commit 070332e65baff39923406c86d06eeb3e14047c6f -- Gitee From be57216f7d4295e3ef7a325a3cb21d0715256239 Mon Sep 17 00:00:00 2001 From: wangjie Date: Thu, 24 Oct 2024 12:45:04 +0000 Subject: [PATCH 81/96] !15496 [PROF] Profiler TraceStepTime table fix Merge pull request !15496 from wangjie/cherry-pick-1729677572 --- .../analysis/prof_common_func/_constant.py | 4 +- .../prof_view/_trace_step_time_parser.py | 2 +- .../_trace_step_time_db_parser.py | 49 ++++++++++++------- 3 files changed, 34 insertions(+), 21 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_common_func/_constant.py b/torch_npu/profiler/analysis/prof_common_func/_constant.py index 6cde5e6dcb..38493cc781 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_constant.py +++ b/torch_npu/profiler/analysis/prof_common_func/_constant.py @@ -280,8 +280,8 @@ class DbConstant(): TABLE_CANN_API = "CANN_API" # task table name TABLE_TASK = "TASK" - # communicate op table name - TABLE_COMMUNICATE_OP = "COMMUNICATE_OP" + # communication op table name + TABLE_COMMUNICATION_OP = "COMMUNICATION_OP" # compute task table name TABLE_COMPUTE_TASK_INFO = "COMPUTE_TASK_INFO" # communication task table name diff --git a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py index f465cd97f8..8cb1df91e3 100644 --- a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py @@ -48,7 +48,7 @@ class TraceStepTimeParser(BaseParser): start_time = 
float(start_time) duration = float(duration) for step in step_list: - if step[StepInfoIndex.START_TS.value] <= start_time <= step[StepInfoIndex.END_TS.value]: + if step[StepInfoIndex.START_TS.value] <= start_time < step[StepInfoIndex.END_TS.value]: cur_step = step[StepInfoIndex.ID.value] break for step in step_list: diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py index fd936e80bc..96eb06f802 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py @@ -14,6 +14,7 @@ # limitations under the License. import os +from enum import Enum from .._base_parser import BaseParser from ...prof_common_func._constant import Constant, print_error_msg, print_warn_msg from ...prof_common_func._constant import DbConstant, TableColumnsManager @@ -25,6 +26,12 @@ from ...prof_parse._fwk_file_parser import FwkFileParser __all__ = [] +class CommunicationOpIndex(Enum): + OP_NAME = 0 + START_NS = 1 + END_NS = 2 + + class TraceStepTimeDbParser(BaseParser): def __init__(self, name: str, param_dict: dict): @@ -32,7 +39,7 @@ class TraceStepTimeDbParser(BaseParser): self.step_range = [] self.string_id_map = {} self.compute_task_info = {} - self.communication_task_info = {} + self.communication_op_info = [] self.task_db_con = None self.task_db_curs = None self.analysis_db_con = None @@ -97,7 +104,8 @@ class TraceStepTimeDbParser(BaseParser): 'step': cur_step.get(Constant.STEP_ID), 'compute': 0, 'comunNotOverlp': 0, 'Overlp': 0, 'comun': 0, 'free': 0, 'stage': 0, 'bubble': 0, 'comunNotOverlpRec': 0, 'prepare': 0 } - origin_compute_data, origin_communication_data, bubble_data = self._get_task_data_in_step(cur_step) + origin_compute_data = self._get_compute_data_in_step(cur_step) + origin_communication_data, bubble_data = self._get_communication_data_in_step(cur_step) compute_data = RangeCaculator.merge_continuous_intervals(origin_compute_data) save_info['compute'] = sum(data.end_ts - data.start_ts for data in compute_data) communication_data = RangeCaculator.merge_continuous_intervals(origin_communication_data) @@ -125,7 +133,7 @@ class TraceStepTimeDbParser(BaseParser): def _init_step_range(self, deps_data: dict): self.step_range = deps_data.get(Constant.STEP_INFO_DB_PARSER, []) - + def _init_task_info_from_db(self): conn, curs = DbManager.create_connect_db(self.db_path) if not (conn and curs): @@ -141,28 +149,33 @@ class TraceStepTimeDbParser(BaseParser): sql = "select name, globalTaskId from {}".format(DbConstant.TABLE_COMPUTE_TASK_INFO) compute_task_data = DbManager.fetch_all_data(curs, sql) self.compute_task_info = {data[1]: data[0] for data in compute_task_data} - if DbManager.judge_table_exist(curs, DbConstant.TABLE_COMMUNICATION_TASK_INFO): - sql = "select name, globalTaskId from {}".format(DbConstant.TABLE_COMMUNICATION_TASK_INFO) - communication_task_data = DbManager.fetch_all_data(curs, sql) - self.communication_task_info = {data[1]: data[0] for data in communication_task_data} + if DbManager.judge_table_exist(curs, DbConstant.TABLE_COMMUNICATION_OP): + sql = "select opName, startNs, endNs from {}".format(DbConstant.TABLE_COMMUNICATION_OP) + self.communication_op_info = DbManager.fetch_all_data(curs, sql) DbManager.destroy_db_connect(conn, curs) - def _get_task_data_in_step(self, step_info): + def _get_compute_data_in_step(self, step_info): compute_data = [] - 
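The boundary change above (`<=` to `<` on the step end) makes step windows half-open, so an event that starts exactly on a step boundary is attributed to exactly one step instead of two. A small self-contained illustration (hypothetical helper, not profiler code):

```python
def step_of(start_ts, step_list):
    # step_list: (step_id, start_ns, end_ns) tuples; end is exclusive,
    # matching the half-open comparison in the hunk above.
    for step_id, start, end in step_list:
        if start <= start_ts < end:
            return step_id
    return None


steps = [(1, 0, 100), (2, 100, 200)]
assert step_of(99, steps) == 1
assert step_of(100, steps) == 2  # boundary event belongs to the next step only
```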
communication_data = [] - bubble_data = [] for task_id, task_info in step_info.get(Constant.TASK_INFO, {}).items(): if task_id in self.compute_task_info: compute_data.append( RangeCaculator.generate_time_range(task_info.get("startNs"), task_info.get("endNs"))) - if task_id in self.communication_task_info: - time_range = RangeCaculator.generate_time_range( - task_info.get("startNs"), task_info.get("endNs"), class_range=CommunicationTimeRange) - communication_data.append(time_range) - task_name = self.string_id_map.get(self.communication_task_info.get(task_id), '') - if task_name.startswith('hcom_receive'): - bubble_data.append(time_range) - return compute_data, communication_data, bubble_data + return compute_data + + def _get_communication_data_in_step(self, step_info): + communication_data = [] + bubble_data = [] + for op_info in self.communication_op_info: + op_start_time = op_info[CommunicationOpIndex.START_NS.value] + if not (step_info.get(Constant.START_TS) <= op_start_time < step_info.get(Constant.END_TS)): + continue + time_range = RangeCaculator.generate_time_range( + op_start_time, op_info[CommunicationOpIndex.END_NS.value], class_range=CommunicationTimeRange) + communication_data.append(time_range) + op_name = self.string_id_map.get(op_info[CommunicationOpIndex.OP_NAME.value], '') + if op_name.startswith('hcom_receive'): + bubble_data.append(time_range) + return communication_data, bubble_data def _get_first_device_task_ts(self, compute_task, communication_task): first_compute_task = compute_task[0] if compute_task else None -- Gitee From fae30301ba072711d8b2a35a2a99e04819ff9a42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=99=9E=E8=89=AF=E6=96=8C?= Date: Thu, 24 Oct 2024 13:03:41 +0000 Subject: [PATCH 82/96] =?UTF-8?q?!15460=20[fix]rectify=20the=20spelling=20?= =?UTF-8?q?error=20of=20words=20in=20screen=20logs=20when=20gc=20is=20coll?= =?UTF-8?q?ected=20using=20PyTorch=20API=20Merge=20pull=20request=20!15460?= =?UTF-8?q?=20from=20=E8=99=9E=E8=89=AF=E6=96=8C/bug=5Fv2.1.0=5F6.0.rc3=5F?= =?UTF-8?q?1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../_dynamic_profiler/_dynamic_profiler_monitor_shm.py | 8 +++++--- torch_npu/profiler/experimental_config.py | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py index 9284706d91..ec4f4429c9 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py @@ -9,6 +9,7 @@ import struct from datetime import datetime from ...utils.path_manager import PathManager +from ...utils._error_code import ErrCode, prof_error from ..analysis.prof_common_func._file_manager import FileManager from ._dynamic_profiler_log import logger @@ -84,9 +85,10 @@ class DynamicProfilerShareMemory: time_shm = os.stat(shm_path).st_ctime pid_time = self._get_pid_st_ctime(os.getpid()) eps = 60 - if pid_time - time_shm > eps: - logger.error("There maybe exist share memory before this task, if you kill last task, " - "dynamic profiler will not valid, please remove %s, and retry.", shm_path) + if pid_time is not None and pid_time - time_shm > eps: + raise RuntimeError(f"There may exist shared memory before this task. If you kill the last task, " + f"dynamic profiler will not be valid. Please remove: {shm_path}, and retry." 
+ + prof_error(ErrCode.VALUE)) from err def _create_prof_cfg(self): if not os.path.exists(self.config_path): diff --git a/torch_npu/profiler/experimental_config.py b/torch_npu/profiler/experimental_config.py index 2b3ff14563..2dc69aff4e 100644 --- a/torch_npu/profiler/experimental_config.py +++ b/torch_npu/profiler/experimental_config.py @@ -130,7 +130,7 @@ class _ExperimentalConfig: print_warn_msg("Invalid parameter op_attr, which must be of boolean type, reset it to False.") self._op_attr = False if self._export_type not in (ExportType.Text, ExportType.Db): - print_warn_msg("Invalid parameter type, reset it to text.") + print_warn_msg("Invalid parameter export_type, reset it to text.") self._export_type = ExportType.Text if self._op_attr and self._export_type != ExportType.Db: print_warn_msg("op_attr switch is invalid with export type set as text.") @@ -140,7 +140,7 @@ class _ExperimentalConfig: print_warn_msg("Parameter gc_detect_threshold is not int or float type, reset it to default.") self._gc_detect_threshold = None elif self._gc_detect_threshold < 0.0: - print_warn_msg("Parameter gc_detect_threshold can not be negetive, reset it to default.") + print_warn_msg("Parameter gc_detect_threshold can not be negative, reset it to default.") self._gc_detect_threshold = None elif self._gc_detect_threshold == 0.0: print_info_msg("Parameter gc_detect_threshold is set to 0, it will collect all gc events.") -- Gitee From fc0b97a170c099d2dcb3ea9608db333a6712237d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Thu, 24 Oct 2024 13:04:40 +0000 Subject: [PATCH 83/96] =?UTF-8?q?!15511=20Compatible=20old=20hccl=20versio?= =?UTF-8?q?n=20Merge=20pull=20request=20!15511=20from=20=E5=85=B3=E9=BE=99?= =?UTF-8?q?=E9=94=8B/cherry-pick-1729759095?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 704aca55e5..e8a9f5a283 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -209,6 +209,12 @@ void getP2PHcclCommCofig(HcclCommConfig* config) { HcclCommConfigInit(config); config->hcclBufferSize = c10_npu::option::OptionsManager::GetP2PBufferSize(); + // Compatible with the size check of the old version of HCCL, forcibly convert + // the config object to a size_t=32 object, and retain the N ± 2 version + if (!isHcclFeatureSupported(HcclCommConfigCapability::HCCL_COMM_CONFIG_COMM_NAME)) { + size_t *configSize = reinterpret_cast(config); + *configSize = 32; + } } void checkHcclCommConfigValid(const HcclCommConfig* config) -- Gitee From 1fa937ac8262dd378a5a538a02976d72a2ebbafd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A9=B9=E6=98=8A?= Date: Mon, 28 Oct 2024 09:26:11 +0000 Subject: [PATCH 84/96] =?UTF-8?q?!15544=20add=208.0.T37,8.0.T38,8.0.T39=20?= =?UTF-8?q?to=20foreach=20black=20list=20Merge=20pull=20request=20!15544?= =?UTF-8?q?=20from=20=E8=A9=B9=E6=98=8A/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/utils/_optim.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/utils/_optim.py b/torch_npu/utils/_optim.py index 0eaeb63a16..78cb1d7b05 100644 --- a/torch_npu/utils/_optim.py +++ b/torch_npu/utils/_optim.py @@ -7,8 +7,8 @@ from torch_npu.utils.collect_env import 
get_cann_version _device_name = None _cann_version = get_cann_version() _foreach_black_list_for_cann_starts_with = ['8.0.RC1', '8.0.RC2'] -_foreach_black_list_for_cann_all = ['not known', '8.0.T1', '8.0.T2', '8.0.T3', '8.0.T37', '8.0.T5', '8.0.T6', '8.0.T7', - '8.0.T8', '8.0.T10', '8.0.T13', '8.0.T16', '8.0.T50', '8.0.T51', '8.0.T52'] +_foreach_black_list_for_cann_all = ['not known', '8.0.T1', '8.0.T2', '8.0.T3', '8.0.T5', '8.0.T6', '8.0.T7', + '8.0.T8', '8.0.T10', '8.0.T13', '8.0.T16', '8.0.T37', '8.0.T38', '8.0.T39', '8.0.T50', '8.0.T51', '8.0.T52'] def patch_supported_devices(): -- Gitee From 21616d71e3096059a0cd129158ae93d28f312883 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Tue, 29 Oct 2024 08:35:59 +0000 Subject: [PATCH 85/96] !15578 Update torchair commit id Merge pull request !15578 from torchair_robot/v2.1.0-6.0.rc3 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 341bb795a6..549ff0f2bc 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 341bb795a69992114815f51ca9a51b99138ed20f +Subproject commit 549ff0f2bc5ff0308051043f56dfbbb9c8383529 -- Gitee From c3daabc9319c28d0646fa95f7ac73370db5f55b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Sat, 16 Nov 2024 03:35:17 +0000 Subject: [PATCH 86/96] =?UTF-8?q?!16018=20Implement=20recordDataPtrOnStrea?= =?UTF-8?q?m=20to=20ensure=20that=20cross-stream=20memory=20reuse=20is=20c?= =?UTF-8?q?orrect=20when=20backward.=20Merge=20pull=20request=20!16018=20f?= =?UTF-8?q?rom=20=E9=97=AB=E9=B9=8F=E5=85=A8/v2.1.0-6.0.rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/impl/NPUGuardImpl.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h index 1c3ba4e12e..4359db0136 100644 --- a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h +++ b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h @@ -5,6 +5,7 @@ #include #include "torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h" +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/core/npu/NPUStream.h" @@ -165,6 +166,12 @@ struct NPUGuardImpl final : public c10::impl::DeviceGuardImplInterface { NPU_CHECK_ERROR_WITHOUT_UCE(acl::AclQueryEventRecordedStatus(npu_event, &status)); return (status == acl::ACL_EVENT_RECORDED_STATUS_COMPLETE); } + + void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const c10::Stream& stream) const override + { + NPUStream npu_stream{stream}; + c10_npu::NPUCachingAllocator::recordStream(data_ptr, npu_stream); + } }; } // namespace impl -- Gitee From 16705c6d11ccdf0e59d5b348acca95efae48d6cd Mon Sep 17 00:00:00 2001 From: xudaohong Date: Sat, 23 Nov 2024 07:15:20 +0000 Subject: [PATCH 87/96] !16234 [feat] add optional arg offset for npu_prefetch Merge pull request !16234 from xudaohong/cherry-pick-1732195754 --- test/test_fake_tensor.py | 5 +++++ third_party/op-plugin | 2 +- third_party/torchair/torchair | 2 +- torch_npu/meta/_meta_registrations.py | 8 ++++++-- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py index c2ead43c03..a1f549a67f 100644 --- a/test/test_fake_tensor.py +++ b/test/test_fake_tensor.py @@ -1906,6 
+1906,11 @@ class TestNpuPrefetch(TestCase): exception = cm.exception self.assertEqual(str(exception), "The max_size should be greater than zero, but got -1.") + with self.assertRaises(RuntimeError) as cm: + torch_npu.npu_prefetch(input1, None, 10, -1) + exception = cm.exception + self.assertEqual(str(exception), "The offset should be nonnegative, but got -1.") + instantiate_parametrized_tests(FakeTensorTest) instantiate_device_type_tests(FakeTensorOpInfoTest, globals(), only_for="cpu") diff --git a/third_party/op-plugin b/third_party/op-plugin index 070332e65b..c518992967 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 070332e65baff39923406c86d06eeb3e14047c6f +Subproject commit c5189929673935f3d04414f15cc183b72fc16941 diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 549ff0f2bc..0389f1b30f 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 549ff0f2bc5ff0308051043f56dfbbb9c8383529 +Subproject commit 0389f1b30f50772f840e745b4c298017fe906e38 diff --git a/torch_npu/meta/_meta_registrations.py b/torch_npu/meta/_meta_registrations.py index 98cc126714..b99e9b6b98 100644 --- a/torch_npu/meta/_meta_registrations.py +++ b/torch_npu/meta/_meta_registrations.py @@ -946,8 +946,12 @@ has_side_effect(torch.ops.npu.npu_prefetch.default) @impl(m, "npu_prefetch") -def npu_prefetch_meta(self, dependency, max_size): +def npu_prefetch_meta(self, dependency, max_size, offset=0): torch._check( max_size > 0, lambda: f"The max_size should be greater than zero, but got {max_size}.", - ) \ No newline at end of file + ) + torch._check( + offset >= 0, + lambda: f"The offset should be nonnegative, but got {offset}.", + ) -- Gitee From 643e91be99c664379c5bd07d1a5ecffdce53b0b6 Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Fri, 29 Nov 2024 06:40:56 +0000 Subject: [PATCH 88/96] !16448 Release 6.0.RC3.1 Merge pull request !16448 from dilililiwhy/release_rc31_210 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6e44f86033..ec8a463b44 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ from wheel.bdist_wheel import bdist_wheel BASE_DIR = os.path.dirname(os.path.realpath(__file__)) THIRD_PARTY_PATH = os.path.join(BASE_DIR, "third_party") -VERSION = '2.1.0.post8' +VERSION = '2.1.0.post9' UNKNOWN = "Unknown" BUILD_PERMISSION = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP -- Gitee From 08e4654eb90d4b33b4b2fbc17f75f1ca905f2c4a Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 29 Nov 2024 09:58:50 +0000 Subject: [PATCH 89/96] !16478 Update op_plugin commit id Merge pull request !16478 from pta-robot/v2.1.0-6.0.rc3 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index c518992967..b99362e256 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit c5189929673935f3d04414f15cc183b72fc16941 +Subproject commit b99362e2563fb20b1512e89d618a5f5ce7f7e44c -- Gitee From 4ad973c47dd083fbe4409c98e61a7a1204577378 Mon Sep 17 00:00:00 2001 From: liyou_b <2953090824@qq.com> Date: Thu, 5 Dec 2024 02:24:41 +0000 Subject: [PATCH 90/96] =?UTF-8?q?!16614=20=E3=80=90PROF=E3=80=91=E3=80=90B?= =?UTF-8?q?UG=E3=80=91V2.1.0-6.0.0rc3:=20add=20start=20step=20for=20dynami?= =?UTF-8?q?c=20profiling=20Merge=20pull=20request=20!16614=20from=20liyou?= =?UTF-8?q?=5Fb/bug=5Ffixed=5F6.0rc3=5F210?= MIME-Version: 1.0 
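From the caller's side, the optional offset argument added in the npu_prefetch patch above is validated the same way as max_size; a short usage sketch (assumes a torch_npu build containing that patch and, as in the test, tensors that dispatch to the meta registration):

```python
import torch
import torch_npu  # assumed: a build that includes the offset argument

x = torch.randn(16, device="meta")       # meta tensors exercise the meta fn
torch_npu.npu_prefetch(x, None, 10)      # offset defaults to 0
torch_npu.npu_prefetch(x, None, 10, 4)   # any nonnegative offset is accepted

try:
    torch_npu.npu_prefetch(x, None, 10, -1)
except RuntimeError as e:
    print(e)  # "The offset should be nonnegative, but got -1."
```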
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/profiler/test_dynamic_profiler.py | 11 ++++++ .../profiler/_dynamic_profiler/__init__.py | 5 +-- .../_dynamic_profiler_config_context.py | 34 +++++++++---------- .../_dynamic_profiler_monitor.py | 6 ++-- .../_dynamic_profiler_monitor_shm.py | 5 +-- ...iler_log.py => _dynamic_profiler_utils.py} | 31 +++++++++++++++-- torch_npu/profiler/dynamic_profile.py | 16 ++++----- 7 files changed, 70 insertions(+), 38 deletions(-) rename torch_npu/profiler/_dynamic_profiler/{_dynamic_profiler_log.py => _dynamic_profiler_utils.py} (51%) diff --git a/test/profiler/test_dynamic_profiler.py b/test/profiler/test_dynamic_profiler.py index d8a7e8cb20..0b8b3729a8 100644 --- a/test/profiler/test_dynamic_profiler.py +++ b/test/profiler/test_dynamic_profiler.py @@ -55,6 +55,7 @@ class TestDynamicProfiler(TestCase): large_steps = 5 flags = os.O_WRONLY mode = stat.S_IRUSR | stat.S_IWUSR + start_step = 0 @classmethod def setUpClass(cls): @@ -67,6 +68,7 @@ class TestDynamicProfiler(TestCase): cls.active_rank_prof_dir = os.path.join(cls.results_path, "active_rank_prof_dir") cls.cfg_prof_dir = os.path.join(cls.results_path, "cfg_prof_dir") cls.cfg_path = os.path.join(cls.results_path, "profiler_config.json") + os.environ["RANK"] = "0" dp.init(cls.results_path) @classmethod @@ -451,13 +453,16 @@ class TestDynamicProfiler(TestCase): def test_dynamic_profiler_default(self): cfg_json = copy.deepcopy(self.json_sample) cfg_json['prof_dir'] = self.default_prof_dir + cfg_json['start_step'] = TestDynamicProfiler.start_step + 1 with os.fdopen(os.open(self.cfg_path, self.flags, self.mode), 'w') as f: time.sleep(1) json.dump(cfg_json, f, indent=4) time.sleep(3) dp.step() + TestDynamicProfiler.start_step += 1 self.model_train.train_one_step() dp.step() + TestDynamicProfiler.start_step += 1 has_prof = False if self.has_prof_dir(self.default_prof_dir): has_prof = True @@ -470,14 +475,17 @@ class TestDynamicProfiler(TestCase): cfg_json['prof_dir'] = self.rank_prof_dir cfg_json['is_rank'] = True cfg_json['rank_list'] = [0] + cfg_json['start_step'] = TestDynamicProfiler.start_step + 1 with os.fdopen(os.open(self.cfg_path, self.flags, self.mode), 'w') as f: time.sleep(1) json.dump(cfg_json, f, indent=4) time.sleep(3) dp.step() + TestDynamicProfiler.start_step += 1 self.model_train.train_one_step() dp.step() + TestDynamicProfiler.start_step += 1 has_prof = False if self.has_prof_dir(self.rank_prof_dir): has_prof = True @@ -490,14 +498,17 @@ class TestDynamicProfiler(TestCase): cfg_json['prof_dir'] = self.invalid_rank_prof_dir cfg_json['is_rank'] = True cfg_json['rank_list'] = [1] + cfg_json['start_step'] = TestDynamicProfiler.start_step + 1 with os.fdopen(os.open(self.cfg_path, self.flags, self.mode), 'w') as f: time.sleep(1) json.dump(cfg_json, f, indent=4) time.sleep(3) dp.step() + TestDynamicProfiler.start_step += 1 self.model_train.train_one_step() dp.step() + TestDynamicProfiler.start_step += 1 has_prof = False if self.has_prof_dir(self.invalid_rank_prof_dir): has_prof = True diff --git a/torch_npu/profiler/_dynamic_profiler/__init__.py b/torch_npu/profiler/_dynamic_profiler/__init__.py index 23852dd596..a9a2c5b3bb 100644 --- a/torch_npu/profiler/_dynamic_profiler/__init__.py +++ b/torch_npu/profiler/_dynamic_profiler/__init__.py @@ -1,4 +1 @@ -__all__ = ['logger', 'DynamicProfilerMonitor', 'init_logger'] - -from ._dynamic_profiler_log import logger, init_logger -from ._dynamic_profiler_monitor import DynamicProfilerMonitor +__all__ = [] diff --git 
a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py index dc12b47cf2..a2df15718e 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py @@ -1,13 +1,12 @@ -import os import json -import torch from torch_npu._C._profiler import ProfilerActivity from ..experimental_config import _ExperimentalConfig, ProfilerLevel, AiCMetrics -from ._dynamic_profiler_log import logger +from ._dynamic_profiler_utils import logger, _get_rank_id class ConfigContext: DEFAULT_ACTIVE_NUM = 1 + DEFAULT_START_STEP = 0 def __init__(self, json_data: dict): self.activity_set = set() @@ -22,9 +21,10 @@ class ConfigContext: self.rank_set = set() self.experimental_config = None self._active = 1 + self._start_step = 0 self.is_valid = False self._meta_data = {} - self._rank_id = self.get_rank_id() + self._rank_id = _get_rank_id() self.parse(json_data) def parse(self, json_data: dict): @@ -44,6 +44,12 @@ class ConfigContext: self.with_flops = json_data.get('with_flops', False) self.with_modules = json_data.get('with_modules', False) self._active = json_data.get('active', self.DEFAULT_ACTIVE_NUM) + self._start_step = json_data.get("start_step", self.DEFAULT_START_STEP) + if not isinstance(self._start_step, int) or self._start_step < 0: + logger.info(f"Start step is not valid, will be reset to {self.DEFAULT_START_STEP}.") + self._start_step = self.DEFAULT_START_STEP + else: + logger.info(f"Start step will be set to {self._start_step}.") exp_config = json_data.get('experimental_config') if not exp_config: self.experimental_config = None @@ -86,7 +92,7 @@ class ConfigContext: logger.warning("Set rank_list failed, rank_list must be list!") return for rank in ranks: - if isinstance(rank, int): + if isinstance(rank, int) and rank >= 0: self.rank_set.add(rank) def valid(self) -> bool: @@ -139,6 +145,9 @@ class ConfigContext: return self.DEFAULT_ACTIVE_NUM return self._active + def start_step(self) -> int: + return self._start_step + def experimental_config(self) -> _ExperimentalConfig: return self.experimental_config @@ -154,16 +163,5 @@ class ConfigContext: cfg_json = json.loads(cfg_json_str) return cfg_json - @staticmethod - def get_rank_id() -> int: - try: - rank_id = os.environ.get('RANK') - if rank_id is None and torch.distributed.is_available() and torch.distributed.is_initialized(): - rank_id = torch.distributed.get_rank() - if not isinstance(rank_id, int): - rank_id = int(rank_id) - except Exception as ex: - logger.warning("Get rank id %s, rank_id will be set to 0 !", str(ex)) - rank_id = 0 - - return rank_id + + diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py index 59ba639de7..c0703d517f 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py @@ -6,7 +6,7 @@ import json import struct import multiprocessing -from ._dynamic_profiler_log import logger, logger_monitor, init_logger +from ._dynamic_profiler_utils import logger, logger_monitor, init_logger, _get_rank_id from ._dynamic_profiler_config_context import ConfigContext from ._dynamic_profiler_monitor_shm import DynamicProfilerShareMemory @@ -19,7 +19,7 @@ class DynamicProfilerMonitor: poll_interval: int = 2 ): self._path = path - self._rank_id = ConfigContext.get_rank_id() + 
self._rank_id = _get_rank_id() self._buffer_size = buffer_size self._monitor_process = None self.prof_cfg_context = None @@ -110,7 +110,7 @@ def worker_func(params_dict): file_stat_time = params_dict.get("file_stat_time") mmap_path = params_dict.get("mmap_path") is_mmap = params_dict.get("is_mmap") - init_logger(logger_monitor, os.path.dirname(cfg_path), True) + init_logger(logger_monitor, os.path.dirname(cfg_path), is_monitor_process=True) mmap_obj = None if is_mmap and mmap_path is not None: diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py index ec4f4429c9..944c115f44 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py @@ -11,7 +11,7 @@ from datetime import datetime from ...utils.path_manager import PathManager from ...utils._error_code import ErrCode, prof_error from ..analysis.prof_common_func._file_manager import FileManager -from ._dynamic_profiler_log import logger +from ._dynamic_profiler_utils import logger class DynamicProfilerShareMemory: @@ -25,6 +25,7 @@ class DynamicProfilerShareMemory: "with_flops": False, "with_modules": False, "active": 1, + "start_step": 0, "is_rank": False, "rank_list": [], "experimental_config": { @@ -88,7 +89,7 @@ class DynamicProfilerShareMemory: if pid_time is not None and pid_time - time_shm > eps: raise RuntimeError(f"There may exist shared memory before this task. If you kill the last task, " f"dynamic profiler will not be valid. Please remove: {shm_path}, and retry." + - prof_error(ErrCode.VALUE)) from err + prof_error(ErrCode.VALUE)) def _create_prof_cfg(self): if not os.path.exists(self.config_path): diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_log.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_utils.py similarity index 51% rename from torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_log.py rename to torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_utils.py index 77a620c05b..5f21003f94 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_log.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_utils.py @@ -2,18 +2,19 @@ import os import socket import logging from logging.handlers import RotatingFileHandler +import torch from ...utils.path_manager import PathManager logger = logging.getLogger("DynamicProfiler") logger_monitor = logging.getLogger("DynamicProfilerMonitor") -def init_logger(logger_: logging.Logger, path: str, is_monitor_process=False): +def init_logger(logger_: logging.Logger, path: str, is_monitor_process: bool = False): path = os.path.join(path, 'log') if not os.path.exists(path): PathManager.make_dir_safety(path) worker_name = "{}".format(socket.gethostname()) - log_name = "dp_{}_{}.log".format(worker_name, os.getpid()) + log_name = "dp_{}_{}_rank_{}.log".format(worker_name, os.getpid(), _get_rank_id()) if is_monitor_process: log_name = "monitor_" + log_name log_file = os.path.join(path, log_name) @@ -24,3 +25,29 @@ def init_logger(logger_: logging.Logger, path: str, is_monitor_process=False): handler.setFormatter(formatter) logger_.setLevel(logging.DEBUG) logger_.addHandler(handler) + + +def _get_rank_id() -> int: + try: + rank_id = os.environ.get('RANK') + if rank_id is None and torch.distributed.is_available() and torch.distributed.is_initialized(): + rank_id = torch.distributed.get_rank() + if not isinstance(rank_id, int): + rank_id = 
int(rank_id) + except Exception as ex: + logger.warning("Get rank id %s, rank_id will be set to -1 !", str(ex)) + rank_id = -1 + + return rank_id + + +def _get_device_id() -> int: + try: + device_id = os.environ.get('LOCAL_RANK') + if not isinstance(device_id, int): + device_id = int(device_id) + except Exception as ex: + logger.warning("Get device id %s, device_id will be set to -1 !", str(ex)) + device_id = -1 + + return device_id diff --git a/torch_npu/profiler/dynamic_profile.py b/torch_npu/profiler/dynamic_profile.py index 1a8d21dec1..c0fea38e9b 100644 --- a/torch_npu/profiler/dynamic_profile.py +++ b/torch_npu/profiler/dynamic_profile.py @@ -13,7 +13,8 @@ from .analysis.prof_common_func._constant import print_warn_msg from .analysis.prof_common_func._constant import print_error_msg from .analysis.prof_common_func._utils import no_exception_func from .analysis.prof_common_func._file_manager import FileManager -from ._dynamic_profiler import logger, init_logger, DynamicProfilerMonitor +from ._dynamic_profiler._dynamic_profiler_utils import logger, init_logger +from ._dynamic_profiler._dynamic_profiler_monitor import DynamicProfilerMonitor from ._dynamic_profiler._dynamic_profiler_config_context import ConfigContext __all__ = [ @@ -58,13 +59,13 @@ class _DynamicProfile: def _dynamic_profiler_valid(self): prof_cfg_ctx = self._dynamic_monitor.shm_to_prof_conf_context() - if prof_cfg_ctx is None: - return None - else: - return prof_cfg_ctx + return prof_cfg_ctx def step(self): self.cur_step += 1 + cfg_ctx = self._dynamic_profiler_valid() + if cfg_ctx is not None: + self.cfg_ctx = cfg_ctx if self.cur_step == self.RECORD_TIME_STEP: self._step_record_time = time.time() elif self.cur_step - self.RECORD_TIME_STEP == 1: @@ -77,10 +78,7 @@ class _DynamicProfile: self.prof.stop() self.prof = None logger.info(f"Stop Dynamic Profiler at {self.cur_step} step.") - elif self.prof is None: - self.cfg_ctx = self._dynamic_profiler_valid() - if self.cfg_ctx is None: - return + elif self.prof is None and self.cfg_ctx is not None and self.cur_step == self.cfg_ctx.start_step(): self.step_num = self.cfg_ctx.active() self.enable_prof() self.cfg_ctx = None -- Gitee From c37b72f7ea16b0b102f0e62983dbfb2bf5b8de05 Mon Sep 17 00:00:00 2001 From: shaojieMike Date: Thu, 7 Nov 2024 01:18:39 +0000 Subject: [PATCH 91/96] !15717 Support fine-grained and custom CPU binding Merge pull request !15717 from shaojieMike/v2.1.0_PR_bindcore --- .../csrc/core/npu/NPUAffinityController.cpp | 291 ++++++++++++++++++ .../csrc/core/npu/NPUAffinityController.h | 35 +++ torch_npu/csrc/core/npu/NPUFunctions.cpp | 34 -- torch_npu/csrc/core/npu/NPUQueue.cpp | 32 +- torch_npu/csrc/core/npu/NPUQueue.h | 4 +- torch_npu/csrc/core/npu/impl/NPUGuardImpl.h | 2 + .../csrc/core/npu/register/OptionsManager.cpp | 9 +- .../csrc/core/npu/register/OptionsManager.h | 2 +- .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 5 + .../csrc/distributed/ProcessGroupHCCL.cpp | 7 +- torch_npu/csrc/npu/Module.cpp | 25 ++ torch_npu/utils/_module.py | 2 + 12 files changed, 391 insertions(+), 57 deletions(-) create mode 100644 torch_npu/csrc/core/npu/NPUAffinityController.cpp create mode 100644 torch_npu/csrc/core/npu/NPUAffinityController.h diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp new file mode 100644 index 0000000000..e7beafecd4 --- /dev/null +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -0,0 +1,291 @@ + +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" +#include 
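With start_step in place, delayed collection is driven purely by the JSON config the monitor watches; a minimal sketch mirroring the updated test (field values and paths are illustrative, and unspecified fields fall back to the shared-memory template defaults shown earlier):

```python
import json
import os
import stat

cfg = {
    "prof_dir": "./prof_result",  # illustrative output directory
    "active": 1,                  # profile one step once started
    "start_step": 10,             # start when the dp.step() count reaches 10
}
flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
mode = stat.S_IRUSR | stat.S_IWUSR
with os.fdopen(os.open("profiler_config.json", flags, mode), "w") as f:
    json.dump(cfg, f, indent=4)
# After dp.init(<results_path>), each dp.step() advances cur_step, and the
# profiler is enabled at the step equal to start_step, for `active` steps.
```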
"torch_npu/csrc/core/npu/NPUFunctions.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10_npu { + + static pthread_t mainthread_tid; + + const std::unordered_map threadTypeToNameMap = { + {releaseThread, "release_thread"}, + {aclThread, "acl_thread"}, + {mainThread, "main_thread"}, + {hcclCommWatchdogThread, "hcclComm_watchd"}, // thread name no more than 15 chars + {backwardThread, "backward_thread"}}; + + const std::unordered_map threadNameToTypeMap = { + {"release_thread", releaseThread}, + {"acl_thread", aclThread}, + {"main_thread", mainThread}, + {"hcclComm_watchd", hcclCommWatchdogThread}, + {"backward_thread", backwardThread}}; + + void RecordMainThreadTid() + { + mainthread_tid = pthread_self(); + } + + ThreadType getCurrentThreadType() + { + char thread_name[16]; + + if (prctl(PR_GET_NAME, thread_name, 0, 0, 0) == 0) { + std::string name(thread_name); + + auto it = threadNameToTypeMap.find(name); + if (it != threadNameToTypeMap.end()) { + return it->second; + } + } + return ThreadType::unknownThread; + } + + aclError SetThreadAffinity(coreIdRange core_range, pthread_t thread) + { + cpu_set_t mask; + CPU_ZERO(&mask); + + for (auto i = core_range.start; i <= core_range.end; i++) { + CPU_SET(i, &mask); + } + if (!pthread_setaffinity_np(thread, sizeof(mask), &mask)) { + ASCEND_LOGD("Set Thread Affinity to %d-%d", core_range.start, core_range.end); + return ACL_ERROR_NONE; + } + return ACL_ERROR_FEATURE_UNSUPPORTED; + } + + coreIdRange GetCPUDefaultRange(c10::DeviceIndex device_id) + { + int core_nums = sysconf(_SC_NPROCESSORS_ONLN); + int device_nums = device_count_ensure_non_zero(); + int block_size = (core_nums > 0 && device_nums > 0) ? (core_nums + device_nums - 1) / device_nums : 0; + return coreIdRange{static_cast(device_id * block_size), + static_cast(std::min((device_id + 1) * block_size, core_nums) - 1)}; + } + + inline bool has_set_pthread_affinity() + { + unsigned int core_nums = static_cast(sysconf(_SC_NPROCESSORS_ONLN)); + + cpu_set_t mask; + pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask); + for (unsigned int i = 0; i < core_nums; i++) { + if (!CPU_ISSET(i, &mask)) { + return true; + } + } + return false; + } + + std::string GetAffinityMapAsString(const std::unordered_map &threadToCoreidMap, c10::DeviceIndex device_id) + { + std::ostringstream oss; + oss << "threadToCoreidMap plan to bind device " << static_cast(device_id) << " to " + << " [" << threadToCoreidMap.at(unknownThread).start << "," << threadToCoreidMap.at(unknownThread).end << "]、" + << " [" << threadToCoreidMap.at(mainThread).start << "," << threadToCoreidMap.at(mainThread).end << "]、" + << " [" << threadToCoreidMap.at(backwardThread).start << "," << threadToCoreidMap.at(backwardThread).end << "]、" + << " [" << threadToCoreidMap.at(aclThread).start << "," << threadToCoreidMap.at(aclThread).end << "]、" + << " [" << threadToCoreidMap.at(releaseThread).start << "," << threadToCoreidMap.at(releaseThread).end << "]、" + << " [" << threadToCoreidMap.at(hcclCommWatchdogThread).start << "," << threadToCoreidMap.at(hcclCommWatchdogThread).end << "]"; + + return oss.str(); + } + + std::unordered_map GetCpuAffinityMap(c10::DeviceIndex device_id) + { + std::unordered_map threadToCoreidMap; + std::initializer_list thread_types = {unknownThread, mainThread, backwardThread, aclThread, + releaseThread, hcclCommWatchdogThread}; + + coreIdRange current_core_range = GetCPUDefaultRange(device_id); + coreId offset = current_core_range.start; + + // calculate 
env2 default map + coreId core_nums = current_core_range.end - current_core_range.start; + if (core_nums < thread_types.size()) { + ASCEND_LOGW("Available core numbers (%d) are insufficient for all %zu thread types. Binding available cores to all threads.", + core_nums, thread_types.size()); + for (auto thread_type : thread_types) { + threadToCoreidMap[thread_type] = current_core_range; + } + } else { + int remaining_type_count = thread_types.size() - 1; + int i = 0; + for (auto thread_type : thread_types) { + if (thread_type == ThreadType::unknownThread) { + threadToCoreidMap[ThreadType::unknownThread] = coreIdRange{current_core_range.start + remaining_type_count, current_core_range.end}; + } else { + threadToCoreidMap[thread_type] = coreIdRange{offset + i, offset + (i++)}; + } + } + } + + ASCEND_LOGD("Thread affinity map for device %d: %s", device_id, GetAffinityMapAsString(threadToCoreidMap, device_id).c_str()); + + return threadToCoreidMap; + } + + aclError SetThreadAffinity(c10::DeviceIndex device_id) + { + return SetThreadAffinity(device_id, getCurrentThreadType()); + } + + void printCoreRanges(const std::vector &ranges, uint32_t mode) + { + std::ostringstream oss; + oss << "Mode: " << mode << " "; + + for (size_t i = 0; i < ranges.size(); ++i) { + oss << "Device " << i << " Core Range: " << ranges[i].start << " - " << ranges[i].end << " "; + } + + ASCEND_LOGD("Core ranges: %s", oss.str().c_str()); + } + + bool isAllDigits(const std::string &str) + { + if (str.empty()) { + return false; + } + return std::all_of(str.begin(), str.end(), [](unsigned char c) { + return std::isdigit(c); + }); + } + + void parseCPUAffinityConf(uint32_t &mode, std::vector &ranges) + { + const char *input = c10_npu::option::OptionsManager::GetCpuAffinityConf(); + + if (input == nullptr || strlen(input) == 0) { + mode = 0; + return; + } + + mode = 0; + int device_nums = device_count_ensure_non_zero(); + ranges.clear(); + ranges.resize(device_nums); + + // init + for (int i = 0; i < device_nums; ++i) { + ranges[i] = GetCPUDefaultRange(i); + } + + std::string inputStr(input); + std::istringstream stream(inputStr); + std::string option; + + // Handle cases where only `mode` is provided, or `mode:` without value + if (isAllDigits(inputStr)) { + mode = static_cast(std::stoi(inputStr)); + return; // Return directly, `mode` has already been processed + } + + // Parse each option + while (std::getline(stream, option, ',')) { + // Split `option` based on colon + size_t colonPos = option.find(':'); + if (colonPos != std::string::npos) { + std::string key = option.substr(0, colonPos); + std::string value = option.substr(colonPos + 1); + + // Process `mode` + if (key == "mode") { + if (isAllDigits(value)) { + mode = static_cast(std::stoi(value)); + } else { + ASCEND_LOGW("mode is %s, should be all digits", value.c_str()); + } + } else if (key.rfind("npu", 0) == 0) { + // Handle NPU core binding range + if (isAllDigits(key.substr(3))) { + int device_id = std::stoi(key.substr(3)); // Parse NPU device ID + if (device_id < device_nums) { + size_t dashPos = value.find('-'); + if (dashPos != std::string::npos) { + std::string startStr = value.substr(0, dashPos); + std::string endStr = value.substr(dashPos + 1); + if (isAllDigits(startStr) && isAllDigits(endStr)) { + coreId start = static_cast(std::stoi(startStr)); + coreId end = static_cast(std::stoi(endStr)); + ranges[device_id] = {start, end}; + } else { + ASCEND_LOGW("core range is %s-%s, should be all digits", startStr.c_str(), endStr.c_str()); + } + } else { + if 
(isAllDigits(value)) { + coreId singleCore = static_cast(std::stoi(value)); + ranges[device_id] = {singleCore, singleCore}; + } else { + ASCEND_LOGW("core range is string : %s, should be all digits", value.c_str()); + } + } + } + } + } + } else if (isAllDigits(option)) { + // If no colon and the value is a number, use it directly as `mode` + mode = static_cast(std::stoi(option)); + } + } + } + + aclError SetThreadAffinity(c10::DeviceIndex device_id, ThreadType current_thread_type) + { + uint32_t bind_conf; + std::vector ranges; + parseCPUAffinityConf(bind_conf, ranges); + printCoreRanges(ranges, bind_conf); + + // bind_conf=1, bind cores averagely based on device_id + if (bind_conf == 1) { + static const bool set_pthread_affinity = has_set_pthread_affinity(); + if (!set_pthread_affinity) { + return SetThreadAffinity(ranges[device_id], pthread_self()); + } + } else if (bind_conf == 2) { + auto thread_core_map = GetCpuAffinityMap(device_id); + // When the PTA_init function runs on device 0, the main thread is initially assigned to this device 0. + // However, when the acl_thread is initialized, the target device ID(maybe 0-7) is determined. + // Therefore, the main thread should be rescheduled to the target device. + if (current_thread_type == ThreadType::aclThread) + SetThreadAffinity(thread_core_map.at(ThreadType::mainThread), mainthread_tid); + return SetThreadAffinity(thread_core_map.at(current_thread_type), pthread_self()); + } else { + ASCEND_LOGD("Thread affinity setting is disabled."); + } + return ACL_ERROR_NONE; + } + + void SetBackwardThreadName(c10::DeviceIndex device_id) + { + static thread_local bool seted = false; + if (!seted) { + seted = true; + if (syscall(SYS_gettid) != getpid()) { + SetThreadName(ThreadType::backwardThread); + SetThreadAffinity(device_id); + } + } + } + + void SetThreadName(ThreadType type) + { + // Ensure this is called at the start of the thread's execution to avoid frequent triggering of this function. + if (prctl(PR_SET_NAME, threadTypeToNameMap.at(type).c_str()) != 0) { + ASCEND_LOGW("set thread name failed!"); + } + } + +} \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.h b/torch_npu/csrc/core/npu/NPUAffinityController.h new file mode 100644 index 0000000000..2c1e92ddc7 --- /dev/null +++ b/torch_npu/csrc/core/npu/NPUAffinityController.h @@ -0,0 +1,35 @@ +#pragma once +#include "torch_npu/csrc/core/npu/npu_log.h" + +namespace c10_npu { + + typedef unsigned int coreId; + + struct coreIdRange { + coreId start; + coreId end; + }; + + enum ThreadType { + unknownThread = 0, // Mostly refers to threads in PyTorch's motorized sleep thread pool, which are not considered in PTA. + mainThread = 1, // 1st performance hotspot, responsible for operator dispatching during the forward phase. + backwardThread = 2, // 2nd performance hotspot, responsible for operator dispatching during the backward phase. + aclThread = 3, // 3rd performance hotspot in PTA, responsible for handling the task queue. + releaseThread = 4, // Thread responsible for resource release. + hcclCommWatchdogThread = 5 // Thread responsible for HCCL communication monitoring. + }; + + aclError SetThreadAffinity(c10::DeviceIndex device); + aclError SetThreadAffinity(c10::DeviceIndex device, ThreadType current_thread_type); + void SetThreadName(ThreadType type); + + // The main thread of PTA, which is also the main thread of PyTorch, handles multiple phases of tasks + // (e.g., first parallel checkpoint data loading, then transitioning to forward training). 
+ // Each phase may require different thread affinity settings. Therefore, we record the thread's TID + // to adjust its affinity later as needed. + void RecordMainThreadTid(); + + // Set backwardThread Name Once + void SetBackwardThreadName(c10::DeviceIndex device_id); + +} \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index 59456b3349..4b7a40ec11 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -66,20 +66,6 @@ aclError GetDevice(int32_t *device) return err; } -inline bool has_set_pthread_affinity() -{ - unsigned int core_nums = static_cast(sysconf(_SC_NPROCESSORS_ONLN)); - - cpu_set_t mask; - pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask); - for (unsigned int i = 0; i < core_nums; i++) { - if (!CPU_ISSET(i, &mask)) { - return true; - } - } - return false; -} - aclError SetDevice(c10::DeviceIndex device) { TORCH_CHECK(device >= 0, "device id must be positive!", PTA_ERROR(ErrCode::VALUE)); @@ -88,26 +74,6 @@ aclError SetDevice(c10::DeviceIndex device) return ACL_ERROR_NONE; } - static uint32_t bind_conf = c10_npu::option::OptionsManager::GetCpuAffinityConf(); - // bind_conf=1, bind cores averagely based on device_id - if (bind_conf == 1) { - static const bool set_pthread_affinity = has_set_pthread_affinity(); - if (!set_pthread_affinity) { - int core_nums = sysconf(_SC_NPROCESSORS_ONLN); - int device_nums = device_count_ensure_non_zero(); - int block_size = (core_nums + device_nums - 1) / device_nums; - unsigned int start_core = static_cast(device * block_size); - unsigned int end_core = static_cast(std::min((device + 1) * block_size, core_nums)); - - cpu_set_t mask; - CPU_ZERO(&mask); - for (unsigned int i = start_core; i < end_core; i++) { - CPU_SET(i, &mask); - } - pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask); - } - } - aclError err = aclrtSetDevice(device); if (err == ACL_ERROR_NONE) { local_device = device; diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 39bb3514f1..0ea9d98527 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -1,6 +1,7 @@ #include "torch_npu/csrc/core/npu/NPUQueue.h" #include "torch_npu/csrc/core/npu/NPUStream.h" #include "torch_npu/csrc/core/npu/npu_log.h" +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/framework/utils/NpuUtils.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/framework/OpParamMaker.h" @@ -15,7 +16,6 @@ #include #include #include -#include #include namespace c10_npu { @@ -587,9 +587,8 @@ bool Repository::CheckInit() const { } void StartConsume(Repository* repo, c10::DeviceIndex device_id) { - if (prctl(PR_SET_NAME, ("ACL_thread")) != 0) { - ASCEND_LOGE("set thread name failed!"); - } + SetThreadName(ThreadType::aclThread); + SetThreadAffinity(device_id); aclError ret = c10_npu::SetDevice(device_id); if (ret != 0) { @@ -619,7 +618,7 @@ void Repository::InitRepo(c10::DeviceIndex device_id) { std::thread cur_consumer(StartConsume, this, device_id); consumer = std::move(cur_consumer); - releaseQueue.InitReleaseQueue(); + releaseQueue.InitReleaseQueue(device_id); } std::string Repository::GetPara() @@ -697,17 +696,17 @@ void ReleaseQueue::PopFromReleaseQueue() { } void StartRelease(ReleaseQueue* releaseQue) { - if (prctl(PR_SET_NAME, ("Release_thread")) != 0) { - ASCEND_LOGE("set thread name failed!"); - } + 
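// The release thread now goes through the shared affinity controller: it
// names itself via SetThreadName() and binds to its per-device region via
// SetThreadAffinity(releaseQue->GetDeviceID()), where the device index is
// the one recorded by InitReleaseQueue(device_id) further down in this patch.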
SetThreadName(ThreadType::releaseThread); + SetThreadAffinity(releaseQue->GetDeviceID()); - while (releaseQue->GetStatus() != RepoStatus::CAN_EXIT) { - releaseQue->PopFromReleaseQueue(); - } - return; + while (releaseQue->GetStatus() != RepoStatus::CAN_EXIT) { + releaseQue->PopFromReleaseQueue(); + } + return; } -void ReleaseQueue::InitReleaseQueue() { +void ReleaseQueue::InitReleaseQueue(c10::DeviceIndex device_id) +{ if (datas == nullptr) { datas = releaseManager().Init(kReleaseQueueCapacity); } @@ -716,6 +715,7 @@ void ReleaseQueue::InitReleaseQueue() { SetStatus(INIT); std::thread cur_releaser(StartRelease, this); releaser = std::move(cur_releaser); + device_idx = device_id; } ReleaseQueue::~ReleaseQueue() { @@ -740,6 +740,12 @@ RepoStatus ReleaseQueue::GetStatus() const { return repo_status.load(); } +c10::DeviceIndex ReleaseQueue::GetDeviceID() const +{ + return device_idx; +} + + void ReleaseQueue::SetStatus(RepoStatus desired) { if (initialized == false) { ASCEND_LOGE("Release queue is not initialized, shouldn't call SetStatus(). !!"); diff --git a/torch_npu/csrc/core/npu/NPUQueue.h b/torch_npu/csrc/core/npu/NPUQueue.h index 66e648069f..2375ef945b 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.h +++ b/torch_npu/csrc/core/npu/NPUQueue.h @@ -38,8 +38,9 @@ public: ~ReleaseQueue(); void PushToReleaseQueue(void* cur_paras); void PopFromReleaseQueue(); - void InitReleaseQueue(); + void InitReleaseQueue(c10::DeviceIndex device_id); RepoStatus GetStatus() const; + c10::DeviceIndex GetDeviceID() const; private: inline bool IsEmptyQueue() {return read_idx.idx == write_idx.idx;}; @@ -52,6 +53,7 @@ private: private: void* datas = nullptr; std::thread releaser; + c10::DeviceIndex device_idx; private: sring_idx read_idx; diff --git a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h index 4359db0136..705e772799 100644 --- a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h +++ b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h @@ -8,6 +8,7 @@ #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/NPUStream.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" @@ -53,6 +54,7 @@ struct NPUGuardImpl final : public c10::impl::DeviceGuardImplInterface { uncheckedSetDevice(d); } void uncheckedSetDevice(c10::Device d) const noexcept override { + SetBackwardThreadName(d.index()); NPU_CHECK_WARN(c10_npu::SetDevice(d.index())); } c10::Stream getStream(c10::Device d) const noexcept override { diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index 8503361020..6a07e170c4 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -350,14 +350,9 @@ uint32_t OptionsManager::GetP2PBufferSize() return buf_size; } -uint32_t OptionsManager::GetCpuAffinityConf() +char* OptionsManager::GetCpuAffinityConf() { - const static uint32_t cpu_affinity_conf = []() -> uint32_t { - char* cpu_affinity_str = std::getenv("CPU_AFFINITY_CONF"); - int64_t cpu_affinity_conf = (cpu_affinity_str != nullptr) ? 
strtol(cpu_affinity_str, nullptr, 10) : 0; - return static_cast(cpu_affinity_conf); - }(); - return cpu_affinity_conf; + return std::getenv("CPU_AFFINITY_CONF"); } uint32_t OptionsManager::GetTaskQueueEnable() diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index 98e8fd72dc..65a9c38a4b 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -51,7 +51,7 @@ public: static std::pair GetSilenceSigmaThresh(); static uint32_t GetP2PBufferSize(); static uint32_t GetTaskQueueEnable(); - static uint32_t GetCpuAffinityConf(); + static char* GetCpuAffinityConf(); static bool CheckForceUncached(); static std::string GetOomSnapshotDumpPath(); static void IsOomSnapshotEnable(); diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 679b2a262a..bc1e1f9be3 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -13,6 +13,7 @@ #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h" #include "torch_npu/csrc/core/npu/NPUStream.h" +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/NpuVariables.h" #include "torch_npu/csrc/core/npu/register/OptionRegister.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" @@ -266,8 +267,12 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) const auto& in = iter.second; call_(in); } + lazy_fn_.clear(); + SetThreadAffinity(device_id_); + RecordMainThreadTid(); + init_flag_ = true; ASCEND_LOGD("Npu sys ctrl initialize successfully."); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index e8a9f5a283..fc9e268f07 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -25,6 +25,7 @@ #include "torch_npu/csrc/core/NPUStorageImpl.h" #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUGuard.h" +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/NPUStream.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" #include "torch_npu/csrc/distributed/HCCLUtils.hpp" @@ -791,6 +792,8 @@ ProcessGroupHCCL::~ProcessGroupHCCL() void ProcessGroupHCCL::hcclCommWatchdog() { try { + c10_npu::SetThreadName(c10_npu::ThreadType::hcclCommWatchdogThread); + VLOG(2) << "[Rank " << rank_ << "] HCCL watchdog thread started!"; workCleanupLoop(); VLOG(2) << "[Rank " << rank_ @@ -873,7 +876,9 @@ void ProcessGroupHCCL::workCleanupLoop() auto& work = *it; try { if (needSetDevice) { - NPU_CHECK_ERROR(c10_npu::SetDevice(static_cast(work.devices_[0].index()))); + c10::DeviceIndex device = static_cast(work.devices_[0].index()); + c10_npu::SetThreadAffinity(device); + NPU_CHECK_ERROR(c10_npu::SetDevice(device)); needSetDevice = false; } } catch (const std::exception& e) { diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index d73b536e94..948061f008 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -24,6 +24,7 @@ #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUStream.h" #include "torch_npu/csrc/core/npu/NPUQueue.h" +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/NPUGuard.h" 
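// NPUAffinityController.h (included just above) provides the SetThreadAffinity
// overloads used by the two Python bindings added at the end of this patch,
// _npu_set_threads_affinity and _npu_reset_threads_affinity, which re-pin the
// calling thread to the mainThread or unknownThread region of the current device.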
#include "torch_npu/csrc/core/npu/NpuVariables.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" @@ -1211,6 +1212,28 @@ PyObject* THNPModule_npu_support_silentClientV2(PyObject* self, PyObject* noargs END_HANDLE_TH_ERRORS } +PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* noargs) +{ + HANDLE_TH_ERRORS + int device_index; + NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device_index)); + c10::DeviceIndex device = static_cast(device_index); + c10_npu::SetThreadAffinity(device, c10_npu::ThreadType::mainThread); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THNPModule_npu_reset_thread_affinity(PyObject* self, PyObject* noargs) +{ + HANDLE_TH_ERRORS + int device_index; + NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device_index)); + c10::DeviceIndex device = static_cast(device_index); + c10_npu::SetThreadAffinity(device, c10_npu::ThreadType::unknownThread); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + static struct PyMethodDef THNPModule_methods[] = { {"_npu_init", (PyCFunction)THNPModule_initExtension, METH_NOARGS, nullptr}, {"_npu_set_run_yet_variable_to_false", (PyCFunction)THNPModule_set_run_yet_variable_to_false_wrap, METH_NOARGS, nullptr}, @@ -1260,6 +1283,8 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_set_call_state", (PyCFunction)THNPModule_npu_set_call_state, METH_O, nullptr}, {"_npu_set_module_train_state", (PyCFunction)THNPModule_npu_set_module_train_state, METH_O, nullptr}, {"_npu_support_silentClientV2", (PyCFunction)THNPModule_npu_support_silentClientV2, METH_NOARGS, nullptr}, + {"_npu_set_threads_affinity", (PyCFunction)THNPModule_npu_set_thread_affinity, METH_NOARGS, nullptr}, + {"_npu_reset_threads_affinity", (PyCFunction)THNPModule_npu_reset_thread_affinity, METH_NOARGS, nullptr}, {nullptr}}; TORCH_NPU_API PyMethodDef* THNPModule_get_methods() { diff --git a/torch_npu/utils/_module.py b/torch_npu/utils/_module.py index 92213c7ca9..4bd5bf55cd 100644 --- a/torch_npu/utils/_module.py +++ b/torch_npu/utils/_module.py @@ -362,7 +362,9 @@ def _mpdl_iter_init(self, *args, **kwargs): torch_npu.npu.synchronize() except: pass + torch_npu._C._npu_set_threads_affinity() origin_mpdl_iter_init(self, *args, **kwargs) + torch_npu._C._npu_reset_threads_affinity() def _parallel_apply( -- Gitee From 0ce7d8395118961df943f61e43c26aa94fd53473 Mon Sep 17 00:00:00 2001 From: shaojiemike Date: Thu, 28 Nov 2024 20:38:48 +0800 Subject: [PATCH 92/96] [feat]: bind remaining tasks when backward begin [feat]: Add linux platform compatibility check --- .../csrc/core/npu/NPUAffinityController.cpp | 128 ++++++++++++++++-- .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 4 +- 2 files changed, 120 insertions(+), 12 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index e7beafecd4..985c39ce37 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -11,10 +11,17 @@ #include #include #include +#include +#include +#include +#include +#include +#include namespace c10_npu { static pthread_t mainthread_tid; + static pid_t parentPid; const std::unordered_map threadTypeToNameMap = { {releaseThread, "release_thread"}, @@ -33,6 +40,7 @@ namespace c10_npu { void RecordMainThreadTid() { mainthread_tid = pthread_self(); + parentPid = getpid(); } ThreadType getCurrentThreadType() @@ -44,12 +52,31 @@ namespace c10_npu { auto it = threadNameToTypeMap.find(name); if (it != threadNameToTypeMap.end()) { - return it->second; + 
return std::get<1>(*it); } } return ThreadType::unknownThread; } + ThreadType getThreadType(pid_t tid) + { + char thread_name[16]; + std::string commFile = "/proc/" + std::to_string(tid) + "/comm"; // Path to thread name + + std::ifstream commStream(commFile); + if (commStream.is_open()) { + commStream.getline(thread_name, sizeof(thread_name)); + + std::string name(thread_name); + auto it = threadNameToTypeMap.find(name); + if (it != threadNameToTypeMap.end()) { + return it->second; + } + } + + return ThreadType::unknownThread; // Default if not found + } + aclError SetThreadAffinity(coreIdRange core_range, pthread_t thread) { cpu_set_t mask; @@ -59,12 +86,30 @@ namespace c10_npu { CPU_SET(i, &mask); } if (!pthread_setaffinity_np(thread, sizeof(mask), &mask)) { - ASCEND_LOGD("Set Thread Affinity to %d-%d", core_range.start, core_range.end); + ASCEND_LOGI("[affinity] Set Thread Affinity to %d-%d", core_range.start, core_range.end); return ACL_ERROR_NONE; + } else { + ASCEND_LOGW("[affinity] Set Thread Affinity to %d-%d failed", core_range.start, core_range.end); } return ACL_ERROR_FEATURE_UNSUPPORTED; } + void bindToCoreRange(pid_t pid, const coreIdRange &core_range) + { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + + for (int core = core_range.start; core <= core_range.end; ++core) { + CPU_SET(core, &cpuset); + } + + if (sched_setaffinity(pid, sizeof(cpu_set_t), &cpuset) == -1) { + ASCEND_LOGW("[affinity] sched_setaffinity failed"); + } else { + ASCEND_LOGI("[affinity] Set Thread %d Affinity to %d-%d", pid, core_range.start, core_range.end); + } + } + coreIdRange GetCPUDefaultRange(c10::DeviceIndex device_id) { int core_nums = sysconf(_SC_NPROCESSORS_ONLN); @@ -114,7 +159,7 @@ namespace c10_npu { // calculate env2 default map coreId core_nums = current_core_range.end - current_core_range.start; if (core_nums < thread_types.size()) { - ASCEND_LOGW("Available core numbers (%d) are insufficient for all %zu thread types. Binding available cores to all threads.", + ASCEND_LOGW("[affinity] Available core numbers (%d) are insufficient for all %zu thread types. 
Binding available cores to all threads.", core_nums, thread_types.size()); for (auto thread_type : thread_types) { threadToCoreidMap[thread_type] = current_core_range; @@ -131,7 +176,7 @@ namespace c10_npu { } } - ASCEND_LOGD("Thread affinity map for device %d: %s", device_id, GetAffinityMapAsString(threadToCoreidMap, device_id).c_str()); + ASCEND_LOGI("[affinity] Thread affinity map for device %d: %s", device_id, GetAffinityMapAsString(threadToCoreidMap, device_id).c_str()); return threadToCoreidMap; } @@ -150,7 +195,7 @@ namespace c10_npu { oss << "Device " << i << " Core Range: " << ranges[i].start << " - " << ranges[i].end << " "; } - ASCEND_LOGD("Core ranges: %s", oss.str().c_str()); + ASCEND_LOGI("[affinity] Core ranges: %s", oss.str().c_str()); } bool isAllDigits(const std::string &str) @@ -205,7 +250,7 @@ namespace c10_npu { if (isAllDigits(value)) { mode = static_cast(std::stoi(value)); } else { - ASCEND_LOGW("mode is %s, should be all digits", value.c_str()); + ASCEND_LOGW("[affinity] mode is %s, should be all digits", value.c_str()); } } else if (key.rfind("npu", 0) == 0) { // Handle NPU core binding range @@ -221,14 +266,14 @@ namespace c10_npu { coreId end = static_cast(std::stoi(endStr)); ranges[device_id] = {start, end}; } else { - ASCEND_LOGW("core range is %s-%s, should be all digits", startStr.c_str(), endStr.c_str()); + ASCEND_LOGW("[affinity] core range is %s-%s, should be all digits", startStr.c_str(), endStr.c_str()); } } else { if (isAllDigits(value)) { coreId singleCore = static_cast(std::stoi(value)); ranges[device_id] = {singleCore, singleCore}; } else { - ASCEND_LOGW("core range is string : %s, should be all digits", value.c_str()); + ASCEND_LOGW("[affinity] core range is string : %s, should be all digits", value.c_str()); } } } @@ -241,6 +286,64 @@ namespace c10_npu { } } + // Function to execute a shell command and capture its output + std::string executeCommand(const std::string &command) + { + std::array buffer; + std::string result; + std::shared_ptr pipe(popen(command.c_str(), "r"), pclose); + if (!pipe) { + ASCEND_LOGE("[affinity] Failed to execute %s.", command.c_str()); + } + while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { + result += buffer.data(); + } + return result; + } + + // Function to parse PIDs and TIDs from pstree output + std::vector parsePIDsFromPstree(const std::string &pstreeOutput) + { + std::vector pids; + std::regex pidRegex(R"(\((\d+)\))"); // Matches numbers inside parentheses + std::smatch match; + std::string::const_iterator searchStart(pstreeOutput.cbegin()); + while (std::regex_search(searchStart, pstreeOutput.cend(), match, pidRegex)) { + pids.push_back(std::stoi(match[1])); + searchStart = match.suffix().first; + } + return pids; + } + + void SetAffinityForRemainingTasks(coreIdRange core_range) + { + // Check if the platform is Linux +#ifdef __linux__ + // Check if pstree command exists + if (access("/usr/bin/pstree", F_OK) == 0) { + // Run pstree to get child processes and threads + std::string pstreeCommand = "/usr/bin/pstree -p " + std::to_string(parentPid) + " -t"; + std::string pstreeOutput = executeCommand(pstreeCommand); + + // Parse PIDs/TIDs from the pstree output + std::vector pids = parsePIDsFromPstree(pstreeOutput); + ASCEND_LOGI("[affinity] Parse %d PIDs/TIDs from the pstree output of parentPid %d", pids.size(), parentPid); + + // Bind each PID/TID to the core range + for (pid_t pid : pids) { + ThreadType type = getThreadType(pid); + if (type == ThreadType::unknownThread && pid != parentPid) { + 
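// Only stray children are swept onto the unknownThread region here: PIDs whose
// /proc/<tid>/comm matches a named PTA thread keep their dedicated ranges, and
// parentPid (the main thread) is excluded because it is re-pinned separately
// through mainthread_tid.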
bindToCoreRange(pid, core_range); + } + } + } else { + ASCEND_LOGW("[affinity] pstree not found. Please install pstree or check your PATH."); + } +#else + ASCEND_LOGW("[affinity] This function is only supported on Linux platforms."); +#endif + } + aclError SetThreadAffinity(c10::DeviceIndex device_id, ThreadType current_thread_type) { uint32_t bind_conf; @@ -261,9 +364,13 @@ namespace c10_npu { // Therefore, the main thread should be rescheduled to the target device. if (current_thread_type == ThreadType::aclThread) SetThreadAffinity(thread_core_map.at(ThreadType::mainThread), mainthread_tid); + // In addition to data-loading processes, users often have other hot threads and processes. + // To isolate interference, all such processes must be confined to separate regions before the dispatch phase. + if (current_thread_type == ThreadType::backwardThread || current_thread_type == ThreadType::unknownThread) + SetAffinityForRemainingTasks(thread_core_map.at(ThreadType::unknownThread)); return SetThreadAffinity(thread_core_map.at(current_thread_type), pthread_self()); } else { - ASCEND_LOGD("Thread affinity setting is disabled."); + ASCEND_LOGI("[affinity] Thread affinity setting is disabled."); } return ACL_ERROR_NONE; } @@ -274,6 +381,7 @@ namespace c10_npu { if (!seted) { seted = true; if (syscall(SYS_gettid) != getpid()) { + ASCEND_LOGI("[affinity] Set Backward Thread Name"); SetThreadName(ThreadType::backwardThread); SetThreadAffinity(device_id); } @@ -284,7 +392,7 @@ namespace c10_npu { { // Ensure this is called at the start of the thread's execution to avoid frequent triggering of this function. if (prctl(PR_SET_NAME, threadTypeToNameMap.at(type).c_str()) != 0) { - ASCEND_LOGW("set thread name failed!"); + ASCEND_LOGW("[affinity] set thread name failed!"); } } diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index bc1e1f9be3..d05f33168c 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -224,6 +224,8 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) ASCEND_LOGW("Npu device %d has been set before global init.", device_id_); } + RecordMainThreadTid(); + SetThreadAffinity(device_id_); if (c10_npu::option::OptionsManager::CheckAclDumpDateEnable()) { const char *aclConfigPath = "acl.json"; @@ -270,8 +272,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) lazy_fn_.clear(); - SetThreadAffinity(device_id_); - RecordMainThreadTid(); init_flag_ = true; ASCEND_LOGD("Npu sys ctrl initialize successfully."); -- Gitee From 00814eb840920d9a77ec584cf2d8a55b972b2eb5 Mon Sep 17 00:00:00 2001 From: shaojiemike Date: Mon, 2 Dec 2024 14:39:40 +0800 Subject: [PATCH 93/96] [feat] fix dataloader set affinity error --- torch_npu/utils/_module.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/utils/_module.py b/torch_npu/utils/_module.py index 4bd5bf55cd..50f8e6ad3b 100644 --- a/torch_npu/utils/_module.py +++ b/torch_npu/utils/_module.py @@ -362,9 +362,9 @@ def _mpdl_iter_init(self, *args, **kwargs): torch_npu.npu.synchronize() except: pass - torch_npu._C._npu_set_threads_affinity() - origin_mpdl_iter_init(self, *args, **kwargs) torch_npu._C._npu_reset_threads_affinity() + origin_mpdl_iter_init(self, *args, **kwargs) + torch_npu._C._npu_set_threads_affinity() def _parallel_apply( -- Gitee From 5800b6e413da985a1197da53ed42cc9be6dd255a Mon Sep 17 00:00:00 2001 From: shaojiemike <943648187@qq.com> Date: Sat, 28 Dec 2024 
16:25:34 +0800 Subject: [PATCH 94/96] [feat] support user-defined fine-grained bind core --- torch_npu/csrc/core/npu/NPUAffinityController.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index 985c39ce37..a2a321fb69 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -147,13 +147,12 @@ namespace c10_npu { return oss.str(); } - std::unordered_map GetCpuAffinityMap(c10::DeviceIndex device_id) + std::unordered_map GetCpuAffinityMap(c10::DeviceIndex device_id, coreIdRange current_core_range) { std::unordered_map threadToCoreidMap; std::initializer_list thread_types = {unknownThread, mainThread, backwardThread, aclThread, releaseThread, hcclCommWatchdogThread}; - coreIdRange current_core_range = GetCPUDefaultRange(device_id); coreId offset = current_core_range.start; // calculate env2 default map @@ -358,7 +357,7 @@ namespace c10_npu { return SetThreadAffinity(ranges[device_id], pthread_self()); } } else if (bind_conf == 2) { - auto thread_core_map = GetCpuAffinityMap(device_id); + auto thread_core_map = GetCpuAffinityMap(device_id, ranges[device_id]); // When the PTA_init function runs on device 0, the main thread is initially assigned to this device 0. // However, when the acl_thread is initialized, the target device ID(maybe 0-7) is determined. // Therefore, the main thread should be rescheduled to the target device. -- Gitee From 9900fd2cdfef7abfd70a41c4ce6f7b768c7af1af Mon Sep 17 00:00:00 2001 From: shaojiemike <943648187@qq.com> Date: Mon, 30 Dec 2024 19:43:31 +0800 Subject: [PATCH 95/96] [perf]: optimize main thread affinity with lazy set for minimal impact on non-dispatch phase --- torch_npu/csrc/core/npu/NPUAffinityController.cpp | 4 +++- torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index a2a321fb69..a66a0547a4 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -365,8 +365,10 @@ namespace c10_npu { SetThreadAffinity(thread_core_map.at(ThreadType::mainThread), mainthread_tid); // In addition to data-loading processes, users often have other hot threads and processes. // To isolate interference, all such processes must be confined to separate regions before the dispatch phase. 
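// Sequencing sketch for the hunk below (descriptive, not part of the diff):
// after this patch the main thread is no longer pinned during
// NpuSysCtrl::Initialize; it is re-pinned to its mainThread region here, at
// the first backward/unknown-thread binding request, alongside
// SetAffinityForRemainingTasks(). Work that precedes dispatch (e.g.
// multi-process checkpoint loading) therefore keeps the unrestricted mask.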
- if (current_thread_type == ThreadType::backwardThread || current_thread_type == ThreadType::unknownThread) + if (current_thread_type == ThreadType::backwardThread || current_thread_type == ThreadType::unknownThread) { + SetThreadAffinity(thread_core_map.at(ThreadType::mainThread), mainthread_tid); SetAffinityForRemainingTasks(thread_core_map.at(ThreadType::unknownThread)); + } return SetThreadAffinity(thread_core_map.at(current_thread_type), pthread_self()); } else { ASCEND_LOGI("[affinity] Thread affinity setting is disabled."); diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index d05f33168c..c7ff88e9cb 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -225,7 +225,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) } RecordMainThreadTid(); - SetThreadAffinity(device_id_); if (c10_npu::option::OptionsManager::CheckAclDumpDateEnable()) { const char *aclConfigPath = "acl.json"; -- Gitee From 40376366ec01e1d441f1dfb3c97cabe9d8921b77 Mon Sep 17 00:00:00 2001 From: shaojiemike <943648187@qq.com> Date: Tue, 31 Dec 2024 12:00:52 +0800 Subject: [PATCH 96/96] [feat] bind core based on original limited cores [fix] delete useless check [fix] codecheck [fix] compile error --- .../csrc/core/npu/NPUAffinityController.cpp | 80 ++++++++++++------- .../csrc/core/npu/NPUAffinityController.h | 2 +- .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 2 +- 3 files changed, 55 insertions(+), 29 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index a66a0547a4..6d951d6f6f 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -22,6 +22,7 @@ namespace c10_npu { static pthread_t mainthread_tid; static pid_t parentPid; + static coreIdRange originalRange; const std::unordered_map threadTypeToNameMap = { {releaseThread, "release_thread"}, @@ -37,10 +38,52 @@ namespace c10_npu { {"hcclComm_watchd", hcclCommWatchdogThread}, {"backward_thread", backwardThread}}; - void RecordMainThreadTid() + coreIdRange FindLongestCoreAffinityRange(pthread_t thread) + { + cpu_set_t mask; + CPU_ZERO(&mask); + + coreIdRange range = {-1, -1}; + int max_length = 0; + int current_start = -1; + int current_length = 0; + + if (pthread_getaffinity_np(thread, sizeof(mask), &mask) == 0) { + for (int i = 0; i < CPU_SETSIZE; i++) { + if (CPU_ISSET(i, &mask)) { + if (current_start == -1) { + current_start = i; + } + current_length++; + } else { + if (current_length > max_length) { + max_length = current_length; + range.start = current_start; + range.end = i - 1; + } + current_start = -1; + current_length = 0; + } + } + + if (current_length > max_length) { + max_length = current_length; + range.start = current_start; + range.end = CPU_SETSIZE - 1; + } + } else { + ASCEND_LOGW("[affinity] Failed to get thread affinity"); + } + + return range; + } + + void GetAffinityInfo() { mainthread_tid = pthread_self(); parentPid = getpid(); + originalRange = FindLongestCoreAffinityRange(mainthread_tid); + ASCEND_LOGI("[affinity] Original Affinity is %d-%d", originalRange.start, originalRange.end); } ThreadType getCurrentThreadType() @@ -70,7 +113,7 @@ namespace c10_npu { std::string name(thread_name); auto it = threadNameToTypeMap.find(name); if (it != threadNameToTypeMap.end()) { - return it->second; + return std::get<1>(*it); } } @@ -112,25 +155,12 @@ namespace c10_npu { 
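// Worked example for FindLongestCoreAffinityRange above (hypothetical mask,
// not taken from the patch): a process launched under `taskset -c 0-3,8-23`
// has two allowed runs, 0-3 (length 4) and 8-23 (length 16); the scan keeps
// the longer run, so originalRange becomes {8, 23} and later per-device
// partitioning stays within cores the scheduler can actually grant.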
coreIdRange GetCPUDefaultRange(c10::DeviceIndex device_id) { - int core_nums = sysconf(_SC_NPROCESSORS_ONLN); + int offset = originalRange.start; + int core_nums = originalRange.end - originalRange.start + 1; int device_nums = device_count_ensure_non_zero(); int block_size = (core_nums > 0 && device_nums > 0) ? (core_nums + device_nums - 1) / device_nums : 0; - return coreIdRange{static_cast(device_id * block_size), - static_cast(std::min((device_id + 1) * block_size, core_nums) - 1)}; - } - - inline bool has_set_pthread_affinity() - { - unsigned int core_nums = static_cast(sysconf(_SC_NPROCESSORS_ONLN)); - - cpu_set_t mask; - pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask); - for (unsigned int i = 0; i < core_nums; i++) { - if (!CPU_ISSET(i, &mask)) { - return true; - } - } - return false; + return coreIdRange{offset + static_cast(device_id * block_size), + offset + static_cast(std::min((device_id + 1) * block_size, core_nums) - 1)}; } std::string GetAffinityMapAsString(const std::unordered_map &threadToCoreidMap, c10::DeviceIndex device_id) @@ -286,13 +316,13 @@ namespace c10_npu { } // Function to execute a shell command and capture its output - std::string executeCommand(const std::string &command) + std::string executeCommand(const std::string &exe) { std::array buffer; std::string result; - std::shared_ptr pipe(popen(command.c_str(), "r"), pclose); + std::shared_ptr pipe(popen(exe.c_str(), "r"), pclose); if (!pipe) { - ASCEND_LOGE("[affinity] Failed to execute %s.", command.c_str()); + ASCEND_LOGE("[affinity] %s failed.", exe.c_str()); } while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { result += buffer.data(); @@ -352,10 +382,7 @@ namespace c10_npu { // bind_conf=1, bind cores averagely based on device_id if (bind_conf == 1) { - static const bool set_pthread_affinity = has_set_pthread_affinity(); - if (!set_pthread_affinity) { - return SetThreadAffinity(ranges[device_id], pthread_self()); - } + return SetThreadAffinity(ranges[device_id], pthread_self()); } else if (bind_conf == 2) { auto thread_core_map = GetCpuAffinityMap(device_id, ranges[device_id]); // When the PTA_init function runs on device 0, the main thread is initially assigned to this device 0. @@ -363,7 +390,6 @@ namespace c10_npu { // Therefore, the main thread should be rescheduled to the target device. if (current_thread_type == ThreadType::aclThread) SetThreadAffinity(thread_core_map.at(ThreadType::mainThread), mainthread_tid); - // In addition to data-loading processes, users often have other hot threads and processes. // To isolate interference, all such processes must be confined to separate regions before the dispatch phase. if (current_thread_type == ThreadType::backwardThread || current_thread_type == ThreadType::unknownThread) { SetThreadAffinity(thread_core_map.at(ThreadType::mainThread), mainthread_tid); diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.h b/torch_npu/csrc/core/npu/NPUAffinityController.h index 2c1e92ddc7..f2e78b69b6 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.h +++ b/torch_npu/csrc/core/npu/NPUAffinityController.h @@ -27,7 +27,7 @@ namespace c10_npu { // (e.g., first parallel checkpoint data loading, then transitioning to forward training). // Each phase may require different thread affinity settings. Therefore, we record the thread's TID // to adjust its affinity later as needed. 
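// Arithmetic sketch for the reworked GetCPUDefaultRange (illustrative numbers,
// not from the patch): with originalRange = 24-119 (96 usable cores inherited
// at startup, recorded by GetAffinityInfo below) and 8 visible NPUs,
// block_size = (96 + 8 - 1) / 8 = 12, so device 0 defaults to cores 24-35 and
// device 7 to cores 108-119. Every range stays inside the inherited mask
// instead of assuming cores 0..N as the earlier sysconf-based version did.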
- void RecordMainThreadTid(); + void GetAffinityInfo(); // Set backwardThread Name Once void SetBackwardThreadName(c10::DeviceIndex device_id); diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index c7ff88e9cb..9081b686c4 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -224,7 +224,7 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) ASCEND_LOGW("Npu device %d has been set before global init.", device_id_); } - RecordMainThreadTid(); + GetAffinityInfo(); if (c10_npu::option::OptionsManager::CheckAclDumpDateEnable()) { const char *aclConfigPath = "acl.json"; -- Gitee
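Taken together, patches 91-96 converge on the following CPU_AFFINITY_CONF grammar: either a bare mode (1 = bind each process evenly to its device's core block, 2 = additionally split that block per thread type) or a comma-separated option list such as mode:2,npu0:0-11,npu1:12-23. The standalone sketch below mirrors the parsing flow of parseCPUAffinityConf for illustration only; the sample value, the Range struct, and main() are ours, and the real implementation additionally validates digits via isAllDigits, falls back to GetCPUDefaultRange for devices left unspecified, and reports malformed input through ASCEND_LOGW.

#include <cstdlib>
#include <iostream>
#include <map>
#include <sstream>
#include <string>

struct Range { int start; int end; };   // mirrors coreIdRange from the series

int main() {
    // Fall back to a sample value when the variable is unset (illustrative only).
    const char* env = std::getenv("CPU_AFFINITY_CONF");
    std::string input = env ? env : "mode:2,npu0:0-11,npu1:12-23,npu2:24";

    unsigned mode = 0;
    std::map<int, Range> ranges;         // device id -> requested core range

    std::istringstream stream(input);
    std::string option;
    while (std::getline(stream, option, ',')) {
        const size_t colon = option.find(':');
        if (colon == std::string::npos) {            // bare "mode" form, e.g. "1"
            mode = std::stoul(option);
            continue;
        }
        const std::string key = option.substr(0, colon);
        const std::string value = option.substr(colon + 1);
        if (key == "mode") {
            mode = std::stoul(value);
        } else if (key.rfind("npu", 0) == 0) {       // "npuN:a-b" or "npuN:a"
            const int id = std::stoi(key.substr(3));
            const size_t dash = value.find('-');
            const int lo = std::stoi(value.substr(0, dash));
            const int hi = (dash == std::string::npos)
                               ? lo
                               : std::stoi(value.substr(dash + 1));
            ranges[id] = {lo, hi};
        }
    }

    std::cout << "mode=" << mode << '\n';
    for (const auto& [id, r] : ranges)
        std::cout << "npu" << id << " -> cores " << r.start << '-' << r.end << '\n';
    return 0;
}

Under mode 2, each device's resulting range is then subdivided by GetCpuAffinityMap into the six ThreadType regions, with unknownThread absorbing whatever remains after one core is reserved per named thread type.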