From 17d89c493dc2fd3311f8a5cdff0b96b43cf71429 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E5=85=89=E6=B5=A9?= Date: Thu, 29 May 2025 13:20:10 +0000 Subject: [PATCH 001/328] =?UTF-8?q?!21324=20optimize=20err=5Fcode=20Merge?= =?UTF-8?q?=20pull=20request=20!21324=20from=20=E9=83=AD=E5=85=89=E6=B5=A9?= =?UTF-8?q?/v2.7.1-err?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../csrc/core/npu/NPUCachingAllocator.cpp | 2 +- torch_npu/csrc/core/npu/NPUException.cpp | 43 +++++++++- torch_npu/csrc/core/npu/NPUException.h | 78 +++++++++++-------- torch_npu/csrc/core/npu/NPUStream.cpp | 14 ++-- .../csrc/core/npu/register/OptionsManager.cpp | 9 +++ .../csrc/core/npu/register/OptionsManager.h | 1 + .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 2 +- torch_npu/csrc/distributed/HCCLUtils.hpp | 14 +++- .../csrc/distributed/ProcessGroupHCCL.cpp | 2 +- torch_npu/utils/_error_code.py | 11 ++- 10 files changed, 126 insertions(+), 50 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 879c04e670..4201fa5f1a 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -889,7 +889,7 @@ size_t CachingAllocatorConfig::parseExpandableSegments(const std::vector(submodule); oss << std::setw(3) << std::setfill('0') << static_cast(errorCode); oss << " " << submoduleMap[submodule] << " " << errCodeMap[errorCode]; @@ -107,11 +109,48 @@ void clear_mem_uce_info() memUceInfo.clear(); } +const std::string c10_npu_check_error_message(std::string& errmsg) +{ + std::regex dateRegex(R"(\d{4}-\d{2}-\d{2}-\d{2}:\d{2}:\d{2}\.\d{3}\.\d{3})"); + std::smatch match; + + if (std::regex_search(errmsg, match, dateRegex)) { + size_t dateEndPos = match.position(0) + match.length(0); + size_t tracePos = errmsg.find("TraceBack (most recent call last):\n", dateEndPos); + std::string content; + if (tracePos != std::string::npos) { + content = errmsg.substr(dateEndPos, tracePos - dateEndPos); + } else { + content = errmsg.substr(dateEndPos); + } + + std::regex ws_regex("[\\s\\t\\n\\r]+"); + content = std::regex_replace(content, ws_regex, " "); + if (!content.empty() && content.front() == ' ') + content.erase(0, 1); + if (!content.empty() && content.back() == ' ') + content.pop_back(); + + return content; + } + + return ""; +} + + const char *c10_npu_get_error_message() { auto errmsg = c10_npu::acl::AclGetErrMsg(); - c10_npu::setRepoErrMsg(errmsg); - return errmsg; + if (c10_npu::option::OptionsManager::ShouldPrintLessError()) { + std::string log(errmsg); + std::string errmsg_ = c10_npu::c10_npu_check_error_message(log); + thread_local std::string processedErrMsg = errmsg_; + c10_npu::setRepoErrMsg(processedErrMsg.c_str()); + return processedErrMsg.c_str(); + } else { + c10_npu::setRepoErrMsg(errmsg); + return errmsg; + } } void record_mem_hbm_ecc_error() diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h index 997da9dce7..f144490e5a 100644 --- a/torch_npu/csrc/core/npu/NPUException.h +++ b/torch_npu/csrc/core/npu/NPUException.h @@ -142,7 +142,31 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) } \ device_error_msg = c10_npu::handleDeviceError(error_code); \ } \ - TORCH_CHECK( \ + if (((error_code) == ACL_ERROR_RT_FEATURE_NOT_SUPPORT) && (device_error_msg.empty())) { \ + static auto feature_not_support_warn_once = []() { \ + printf("[WARN]%s,%s:%u:%s\n", \ + __FUNCTION__, __FILENAME__, 
__LINE__, \ + "Feature is not supportted and the possible cause is" \ + " that driver and firmware packages do not match."); \ + return true; \ + }(); \ + } else if (c10_npu::option::OptionsManager::ShouldPrintLessError()) { \ + std::ostringstream oss; \ + oss << " NPU function error: " \ + << (device_error_msg.empty() ? getErrorFunction(#err_code, ##__VA_ARGS__) : device_error_msg) \ + << ", error code is " << error_code << " " \ + << PTA_ERROR(ErrCode::ACL) \ + << (err_map.error_code_map.find(error_code) != err_map.error_code_map.end() ? \ + err_map.error_code_map[error_code] : ".") \ + << "\n"; \ + std::string err_msg = oss.str(); \ + ASCEND_LOGE("%s", err_msg.c_str()); \ + TORCH_CHECK( \ + false, \ + (device_error_msg.empty() ? "" : device_error_msg), \ + c10_npu::c10_npu_get_error_message()); \ + } else { \ + TORCH_CHECK( \ false, \ __func__, \ ":", \ @@ -157,6 +181,7 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) err_map.error_code_map.end() ? \ "\n[Error]: " + err_map.error_code_map[error_code] : "."), \ "\n", c10_npu::c10_npu_get_error_message()); \ + } \ } \ } while (0) @@ -168,7 +193,21 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) auto Error = err_code; \ static c10_npu::acl::AclErrorCode err_map; \ if ((Error) != ACL_ERROR_NONE) { \ - CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(Error); \ + CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(Error); \ + if (c10_npu::option::OptionsManager::ShouldPrintLessError()) \ + { \ + std::ostringstream oss; \ + oss << " OPS function error: " << getErrorFunction(#err_code, ##__VA_ARGS__) \ + << ", error code is " << Error << " " \ + << OPS_ERROR(ErrCode::ACL) \ + << (err_map.error_code_map.find(Error) != err_map.error_code_map.end() ? \ + err_map.error_code_map[Error] : ".") + "\n"; \ + std::string err_msg = oss.str(); \ + ASCEND_LOGE("%s", err_msg.c_str()); \ + TORCH_CHECK( \ + false, \ + c10_npu::c10_npu_get_error_message()); \ + } else { \ TORCH_CHECK( \ false, \ __func__, \ @@ -184,40 +223,9 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) "\n[Error]: " + err_map.error_code_map[Error] : "."), \ "\n", c10_npu::c10_npu_get_error_message()); \ } \ + } \ } while (0) -#define NPU_CHECK_SUPPORTED_OR_ERROR(err_code, ...) \ - do { \ - auto Error = err_code; \ - static c10_npu::acl::AclErrorCode err_map; \ - if ((Error) != ACL_ERROR_NONE) { \ - CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(Error); \ - if ((Error) == ACL_ERROR_RT_FEATURE_NOT_SUPPORT) { \ - static auto feature_not_support_warn_once = []() { \ - printf("[WARN]%s,%s:%u:%s\n", \ - __FUNCTION__, __FILENAME__, __LINE__, \ - "Feature is not supportted and the possible cause is" \ - " that driver and firmware packages do not match."); \ - return true; \ - }(); \ - } else { \ - TORCH_CHECK( \ - false, \ - __func__, \ - ":", \ - __FILE__, \ - ":", \ - __LINE__, \ - " NPU function error: ", getErrorFunction(#err_code, ##__VA_ARGS__), \ - ", error code is ", Error, \ - PTA_ERROR(ErrCode::ACL), \ - (err_map.error_code_map.find(Error) != \ - err_map.error_code_map.end() ? 
\ - "\n[Error]: " + err_map.error_code_map[Error] : "."), \ - "\n", c10_npu::c10_npu_get_error_message()); \ - } \ - } \ - } while (0) namespace c10_npu { @@ -245,6 +253,8 @@ struct MemUceInfo { C10_NPU_API const char *c10_npu_get_error_message(); +C10_NPU_API const std::string c10_npu_check_error_message(std::string& errmsg); + bool checkUceErrAndRepair(bool check_error, std::string& err_msg); void record_mem_hbm_ecc_error(); diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index 4fdd8f0ccf..d3a72cb277 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -197,7 +197,7 @@ static void initGlobalStreamState() default_streams[device_id].device_index = device_id; npu_counters[device_id] = 0; auto& default_streamsi = default_streams[device_id]; - NPU_CHECK_SUPPORTED_OR_ERROR( + NPU_CHECK_ERROR( acl::AclrtCreateStreamWithConfig(&default_streamsi.stream, 0, (ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC))); if (c10_npu::option::OptionsManager::GetTaskQueueEnable()) { default_streamsi.repo->InitRepo(device_id); @@ -205,7 +205,7 @@ static void initGlobalStreamState() // Initializes secondary streams secondary_streams[device_id].device_index = device_id; auto &secondary_streamsi = secondary_streams[device_id]; - NPU_CHECK_SUPPORTED_OR_ERROR( + NPU_CHECK_ERROR( acl::AclrtCreateStreamWithConfig(&secondary_streamsi.stream, 0, (ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC))); } @@ -220,7 +220,7 @@ static void initDeviceStreamState(c10::DeviceIndex device_index) npu_streami.device_index = device_index; - NPU_CHECK_SUPPORTED_OR_ERROR( + NPU_CHECK_ERROR( acl::AclrtCreateStreamWithConfig(&npu_streami.stream, 0, (ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC))); } } @@ -612,11 +612,11 @@ void recovery_all_npu_streams(c10::DeviceIndex device_index) NPUGuard device_guard{device_index}; auto& default_streamsi = default_streams[device_index]; default_streamsi.stream = nullptr; - NPU_CHECK_SUPPORTED_OR_ERROR( + NPU_CHECK_ERROR( acl::AclrtCreateStreamWithConfig(&default_streamsi.stream, 0, (ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC))); auto& secondary_streamsi = secondary_streams[device_index]; secondary_streamsi.stream = nullptr; - NPU_CHECK_SUPPORTED_OR_ERROR( + NPU_CHECK_ERROR( acl::AclrtCreateStreamWithConfig(&secondary_streamsi.stream, 0, (ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC))); static int StreamsPerPool = GetStreamsPerPool(); for (auto i = decltype(StreamsPerPool){0}; i < StreamsPerPool; ++i) { @@ -624,7 +624,7 @@ void recovery_all_npu_streams(c10::DeviceIndex device_index) if (npu_streami.stream == nullptr) { continue; } - NPU_CHECK_SUPPORTED_OR_ERROR( + NPU_CHECK_ERROR( acl::AclrtCreateStreamWithConfig(&npu_streami.stream, 0, (ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC))); } } @@ -638,7 +638,7 @@ static void initDeviceSyncLaunchStream(c10::DeviceIndex device_index) sync_streami.device_index = device_index; sync_streami.is_sync_launch = true; - NPU_CHECK_SUPPORTED_OR_ERROR( + NPU_CHECK_ERROR( acl::AclrtCreateStreamWithConfig(&sync_streami.stream, 0, ACL_STREAM_FAST_SYNC)); } } diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index 9b4058e8ad..e4eb407936 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -621,5 +621,14 @@ bool OptionsManager::IsOomSnapshotEnable() return (envFlag != 0); } +bool OptionsManager::ShouldPrintLessError() +{ + static bool should_print 
= []() -> bool { + int32_t disabled_error = OptionsManager::GetBoolTypeOption("TORCH_NPU_COMPACT_ERROR_OUTPUT"); + return disabled_error != 0; + }(); + return should_print; +} + } // namespace option } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index bffd3bf48d..1a678c6ec4 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -132,6 +132,7 @@ public: static std::string GetOomSnapshotDumpPath(); static bool IsOomSnapshotEnable(); static bool ShouldPrintWarning(); + static bool ShouldPrintLessError(); private: static int GetBoolTypeOption(const char* env_str, int defaultVal = 0); diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index c6eb4705fc..de1010347f 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -177,7 +177,7 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) } NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0)); - NPU_CHECK_SUPPORTED_OR_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(kMaxOpExecuteTimeOut)); + NPU_CHECK_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(kMaxOpExecuteTimeOut)); // lazy call for the setoption for (const auto &iter: lazy_fn_) { diff --git a/torch_npu/csrc/distributed/HCCLUtils.hpp b/torch_npu/csrc/distributed/HCCLUtils.hpp index 304ac830c8..c851b9bcaf 100644 --- a/torch_npu/csrc/distributed/HCCLUtils.hpp +++ b/torch_npu/csrc/distributed/HCCLUtils.hpp @@ -17,6 +17,17 @@ auto Error = err_code; \ if ((Error) != HCCL_SUCCESS) { \ CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(Error); \ + if (c10_npu::option::OptionsManager::ShouldPrintLessError()) { \ + std::ostringstream oss; \ + oss << " HCCL function error: " << getErrorFunction(#err_code, ##__VA_ARGS__) \ + << ", error code is " << Error << " " \ + << DIST_ERROR(ErrCode::HCCL) + ".\n"; \ + std::string err_msg = oss.str(); \ + ASCEND_LOGE("%s", err_msg.c_str()); \ + TORCH_CHECK( \ + false, \ + c10_npu::c10_npu_get_error_message()); \ + } else { \ TORCH_CHECK( \ false, \ __func__, \ @@ -27,8 +38,9 @@ " HCCL function error: ", getErrorFunction(#err_code, ##__VA_ARGS__), \ ", error code is ", Error, \ DIST_ERROR(ErrCode::HCCL) + ".\n" + \ - c10_npu::acl::AclGetErrMsg()); \ + c10_npu::c10_npu_get_error_message()); \ } \ + } \ } while (0) #define ENABLE_HCCL_ERROR_CHECKING diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 82085ac95a..76c7718d55 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -834,7 +834,7 @@ ProcessGroupHCCL::ProcessGroupHCCL( } } ASCEND_LOGI("Set op wait timeout to %u.", kOpWaitTimeout); - NPU_CHECK_SUPPORTED_OR_ERROR(c10_npu::acl::AclrtSetOpWaitTimeout(kOpWaitTimeout)); + NPU_CHECK_ERROR(c10_npu::acl::AclrtSetOpWaitTimeout(kOpWaitTimeout)); const char* blockingWait = getenv(HCCL_BLOCKING_WAIT); try { if (blockingWait != nullptr) { diff --git a/torch_npu/utils/_error_code.py b/torch_npu/utils/_error_code.py index 77643cb13d..dbcc6aa2a9 100644 --- a/torch_npu/utils/_error_code.py +++ b/torch_npu/utils/_error_code.py @@ -57,8 +57,9 @@ def _format_error_msg(submodule, error_code): return rank except Exception: return -1 - - error_msg = "\n[ERROR] {time} (PID:{pid}, Device:{device}, RankID:{rank}) {error_code} 
{submodule_name} {error_code_msg}" + error_msg = "" + if not get_env_compact_error_output(): + error_msg += "\n[ERROR] {time} (PID:{pid}, Device:{device}, RankID:{rank}) {error_code} {submodule_name} {error_code_msg}" return error_msg.format( time=time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime()), @@ -90,6 +91,10 @@ def prof_error(error: ErrCode) -> str: return _format_error_msg(_SubModuleID.PROF, error) +def get_env_compact_error_output(): + return int(os.getenv("TORCH_NPU_COMPACT_ERROR_OUTPUT", "0")) + + class _NPUExceptionHandler(object): def __init__(self): self.exception = None @@ -123,7 +128,7 @@ class _NPUExceptionHandler(object): if self.force_stop_flag: raise RuntimeError("FORCE STOP." + pta_error(ErrCode.ACL)) if self._is_exception(self.npu_exception): - if self._is_exception(self.npu_timeout_exception): + if self._is_exception(self.npu_timeout_exception) or get_env_compact_error_output(): # if npu timeout, let other processes exit properly before elastic agent kills them. time.sleep(self.npu_timeout_exit_offset) else: -- Gitee From 8f89f38332139e156f4aadcd82dff96951786eb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=98=89=E8=AF=9A?= <12731429+wang-pierre-jiacheng@user.noreply.gitee.com> Date: Thu, 29 May 2025 14:25:09 +0000 Subject: [PATCH 002/328] =?UTF-8?q?!21347=20add=20new=20version=20pattern?= =?UTF-8?q?=20to=20GetCANNInfo=20Merge=20pull=20request=20!21347=20from=20?= =?UTF-8?q?=E7=8E=8B=E5=98=89=E8=AF=9A/driver=5Fpattern=5Fv2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/GetCANNInfo.cpp | 60 ++++++++++++++++--------- 1 file changed, 38 insertions(+), 22 deletions(-) diff --git a/torch_npu/csrc/core/npu/GetCANNInfo.cpp b/torch_npu/csrc/core/npu/GetCANNInfo.cpp index c009465e4e..8916a70fc9 100644 --- a/torch_npu/csrc/core/npu/GetCANNInfo.cpp +++ b/torch_npu/csrc/core/npu/GetCANNInfo.cpp @@ -61,54 +61,70 @@ int64_t VersionToNum(std::string versionStr) return num; } -double DriverVersionToNum(std::string versionStr) +int64_t DriverVersionToNum(std::string versionStr) { std::smatch results; - int major = -1; - int minor = -1; - int release = -1; - int TVersion = -1; - int RCVersion = -51; - int bVersion = 0; + int64_t major = -1; + int64_t minor = -1; + int64_t release = -1; + int64_t TVersion = -1; + int64_t RCVersion = -51; + int64_t patch = 0; + int64_t bVersion = 1; + int64_t alphaVersion = 0; // driver version check only supports pattern listed here: - // 24.1.0,24.1.RC1,24.1.rc1,24.1.RC1.B10,24.1.rc1.b10,24.1.T1 - if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).RC([0-9]+)"))) { + // pattern is major.minor.release.patch. release:num or RC+num or T+num, patch: num or alpha+num or beta+num. 
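+    // e.g. "24.1.RC1"/"24.1.rc1" -> re_rc, "24.1.RC1.1" -> re_rc_num, "24.1.0" -> re_num,
+    // "24.1.0.1" -> re_num_num, "24.1.T1" -> re_t, "24.1.RC1.beta1" -> re_rc_beta,
+    // "24.1.RC1.alpha1" -> re_rc_alpha (illustrative examples only, not an exhaustive list).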
+ std::regex re_rc("([0-9]+).([0-9]+).RC([0-9]+)", std::regex::icase); + std::regex re_num("([0-9]+).([0-9]+).([0-9]+)"); + std::regex re_rc_num("([0-9]+).([0-9]+).RC([0-9]+).([0-9]+)", std::regex::icase); + std::regex re_num_num("([0-9]+).([0-9]+).([0-9]+).([0-9]+)"); + std::regex re_t("([0-9]+).([0-9]+).T([0-9]+)", std::regex::icase); + std::regex re_rc_beta("([0-9]+).([0-9]+).RC([0-9]+).beta([0-9]+)", std::regex::icase); + std::regex re_rc_alpha("([0-9]+).([0-9]+).RC([0-9]+).alpha([0-9]+)", std::regex::icase); + if (std::regex_match(versionStr, results, re_rc)) { major = stoi(results[kVersionIndex1]); minor = stoi(results[kVersionIndex2]); RCVersion = stoi(results[kVersionIndex3]); - } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).rc([0-9]+)"))) { + } else if (std::regex_match(versionStr, results, re_rc_num)) { major = stoi(results[kVersionIndex1]); minor = stoi(results[kVersionIndex2]); RCVersion = stoi(results[kVersionIndex3]); - } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).([0-9]+)"))) { + patch = stoi(results[kVersionIndex4]); + } else if (std::regex_match(versionStr, results, re_num)) { major = stoi(results[kVersionIndex1]); minor = stoi(results[kVersionIndex2]); release = stoi(results[kVersionIndex3]); - } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).T([0-9]+)"))) { + } else if (std::regex_match(versionStr, results, re_num_num)) { + major = stoi(results[kVersionIndex1]); + minor = stoi(results[kVersionIndex2]); + release = stoi(results[kVersionIndex3]); + patch = stoi(results[kVersionIndex4]); + } else if (std::regex_match(versionStr, results, re_t)) { major = stoi(results[kVersionIndex1]); minor = stoi(results[kVersionIndex2]); TVersion = stoi(results[kVersionIndex3]); - } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).RC([0-9]+).B([0-9]+)"))) { + } else if (std::regex_match(versionStr, results, re_rc_beta)) { major = stoi(results[kVersionIndex1]); minor = stoi(results[kVersionIndex2]); RCVersion = stoi(results[kVersionIndex3]); bVersion = stoi(results[kVersionIndex4]); - } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).rc([0-9]+).b([0-9]+)"))) { + } else if (std::regex_match(versionStr, results, re_rc_alpha)) { major = stoi(results[kVersionIndex1]); minor = stoi(results[kVersionIndex2]); RCVersion = stoi(results[kVersionIndex3]); - bVersion = stoi(results[kVersionIndex4]); + alphaVersion = stoi(results[kVersionIndex4]); } else { TORCH_NPU_WARN_ONCE("Driver Version: " + versionStr + " is invalid or not supported yet."); - return 0.0; + return 0; } - double num = ((static_cast(major) + 1.0) * 100000000) + - ((static_cast(minor) + 1.0) * 1000000) + - ((static_cast(release) + 1.0) * 10000) + - ((static_cast(RCVersion) + 1.0) * 100 + 5000) + - ((static_cast(TVersion) + 1.0) * 100) + - static_cast(bVersion); + int64_t num = ((major + 1) * 100000000) + + ((minor + 1) * 1000000) + + ((release + 1) * 10000) + + ((RCVersion + 1) * 100 + 5000) + + ((TVersion + 1) * 100) - + (alphaVersion ? 
1 : 0) * (100 - alphaVersion) + + (bVersion - 1) + patch; return num; } -- Gitee From e573b3ac2e286b39f7bc19bcf1dba66610500bdb Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 30 May 2025 01:12:38 +0000 Subject: [PATCH 003/328] !21377 Update op_plugin commit id Merge pull request !21377 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index acd824ef89..f71c628def 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit acd824ef89ddd38c25d0ca845888ef4087aae5c3 +Subproject commit f71c628def3856a5f66f8e369fb0f969d87dd9c9 -- Gitee From 9a7836f7fc883774ff9fd61b0021f357d3c02e38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=98=89=E5=B7=8D?= <843972097@qq.com> Date: Fri, 30 May 2025 01:27:26 +0000 Subject: [PATCH 004/328] =?UTF-8?q?!21352=20Add=20test=5Fnative=5Fmha.py.?= =?UTF-8?q?=20Merge=20pull=20request=20!21352=20from=20=E5=88=98=E5=98=89?= =?UTF-8?q?=E5=B7=8D/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_native_mha.py | 353 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 353 insertions(+) create mode 100644 test/test_native_mha.py diff --git a/test/test_native_mha.py b/test/test_native_mha.py new file mode 100644 index 0000000000..8457bf7f05 --- /dev/null +++ b/test/test_native_mha.py @@ -0,0 +1,353 @@ +# Owner(s): ["module: nn"] +import math +import copy +import unittest + +import torch +import torch_npu +import torch_npu.testing +from torch.testing._internal.common_device_type import ( + dtypes, + dtypesIfPRIVATEUSE1, + instantiate_device_type_tests, + onlyPRIVATEUSE1, + skipMeta, +) +from torch.testing._internal.common_utils import parametrize, run_tests, TestCase + + +class TestMHADeviceType(TestCase): + @torch.no_grad() + def _test_transform_bias_rescale_qkv_impl( + self, device, dtype, use_nt, use_padding=False + ): + tests = [ + (64, 4, 16, 8), + # dim_per_head = 12 does not divide evenly by CPU vectorization length of 8 + (24, 2, 4, 2), + # Make sure CUDA can handle small input sizes + (2, 2, 2, 2), + # dim_per_head = 6 does not divide evenly by CUDA vectorization length of 4, + # causes alignment issues + (24, 4, 4, 2), + (48, 4, 16, 8), + ] + for (embed_dim, num_heads, bs, sl) in tests: + with self.subTest(embed_dim=embed_dim, num_heads=num_heads, bs=bs, sl=sl): + torch.manual_seed(9343) + dense_x = x = ( + torch.randn(bs, sl, 3 * embed_dim, device=device, dtype=dtype) * 10 + ) + if use_padding: + x[0][-1] = torch.full(x[0][-1].shape, float("-Inf")) + if use_nt: + xs = list(torch.unbind(x)) + if use_padding: + xs[0] = xs[0][:-1] + x = torch.nested.nested_tensor(xs, device=device, dtype=dtype) + qkv = torch.nn.Linear(embed_dim, 3 * embed_dim, device=device, dtype=dtype) + + # We have to use inference_mode here because q/k/v are + # all views of the same Tensor, which autograd doesn't + # like. This is fine because this function is only + # exposed to Python for purposes of writing this test. 
+ with torch.inference_mode(): + (q, k, v) = torch._transform_bias_rescale_qkv( + x, qkv.bias, num_heads=num_heads + ) + + def simple_transform_bias_rescale_qkv(qkv, bias): + (q, k, v) = torch.split(qkv, embed_dim, dim=-1) + (q_bias, k_bias, v_bias) = torch.split(bias, embed_dim, dim=-1) + + def embiggen(x): + if not use_nt: + return x + b, t, d = x.size() + t = t + (8 - t % 8) % 8 + newsize = (b, t, d) + new_x = torch.zeros(newsize, device=device, dtype=dtype) + new_x[:x.size()[0], :x.size()[1], :x.size()[2]] = x + return new_x + return tuple( + embiggen(x).reshape( + (bs, -1, num_heads, embed_dim // num_heads) + ).transpose(2, 1) + for x in ( + (q + q_bias) / math.sqrt(embed_dim // num_heads), + (k + k_bias), + (v + v_bias), + ) + ) + + correct_q, correct_k, correct_v = simple_transform_bias_rescale_qkv( + dense_x, qkv.bias + ) + if use_nt and use_padding: + for t in (correct_q, correct_k, correct_v): + t[t == float("-Inf")] = 0 + + self.assertEqual(q.size(), correct_q.size()) + torch.testing.assert_close(q, correct_q) + torch.testing.assert_close(k, correct_k) + torch.testing.assert_close(v, correct_v) + + @dtypesIfPRIVATEUSE1(torch.float) + @dtypes(torch.float) + @skipMeta + def test_transform_bias_rescale_qkv(self, device, dtype): + for use_padding in (False, True): + with self.subTest(use_padding=use_padding): + self._test_transform_bias_rescale_qkv_impl( + device, dtype, use_nt=False, use_padding=use_padding + ) + + @unittest.skip("NPU currently do not support nested tensor.") + @dtypesIfPRIVATEUSE1(torch.float) + @dtypes(torch.float) + @skipMeta + @onlyPRIVATEUSE1 + def test_transform_bias_rescale_qkv_nested(self, device, dtype): + for use_padding in (False, True): + with self.subTest(use_padding=use_padding): + self._test_transform_bias_rescale_qkv_impl( + device, dtype, use_nt=True, use_padding=use_padding + ) + + # pylint:disable = huawei-too-many-arguments + def _test_multihead_attention_impl( + self, device, dtype, mode, use_nt, need_weights, average_attn_weights, use_padding=False, pad_all=False + ): + embed_dim = 64 + num_heads = 4 + bs = 16 + sl = 8 + + q = 6 * torch.rand(bs, sl, embed_dim, device=device, dtype=torch.float32) - 3 + if use_padding: + if pad_all: + for q_i in q: + q_i[-1] = torch.zeros_like(q[0][-1], device=device, dtype=torch.float32) + mask = torch.zeros(q.shape[:-1], device=device, dtype=torch.bool) + for mask_i in mask: + mask_i[-1] = True + else: + q[0][-1] = torch.zeros_like(q[0][-1], device=device, dtype=torch.float32) + mask = torch.zeros(q.shape[:-1], device=device, dtype=torch.bool) + mask[0][-1] = True + if mode == "self": + k = q + v = q + elif mode == "encdec": + k = 6 * torch.rand(bs, sl, embed_dim, device=device, dtype=torch.float32) - 3 + v = k + elif mode == "generic": + k = 6 * torch.rand(bs, sl, embed_dim, device=device, dtype=torch.float32) - 3 + v = 6 * torch.rand(bs, sl, embed_dim, device=device, dtype=torch.float32) - 3 + else: + self.fail(f"invalid mode `{mode}`!") + + qkv = torch.nn.Linear(embed_dim, 3 * embed_dim, device=device, dtype=torch.float32) + native_qkv = copy.deepcopy(qkv).to(dtype=dtype) + + proj = torch.nn.Linear(embed_dim, embed_dim, device=device, dtype=torch.float32) + native_proj = copy.deepcopy(proj).to(dtype=dtype) + + pt = torch.nn.MultiheadAttention( + embed_dim, num_heads, batch_first=True, device=device, dtype=torch.float32 + ) + + pt.in_proj_weight = qkv.weight + pt.in_proj_bias = qkv.bias + pt.out_proj.weight = proj.weight + pt.out_proj.bias = proj.bias + + class NativeMHA(torch.nn.Module): + def 
__init__(self, embed_dim, num_heads, qkv, proj): + super().__init__() + self.qkv = qkv + self.proj = proj + self.embed_dim = embed_dim + self.num_heads = num_heads + + def forward(self, q, k, v, key_padding_mask): + return torch._native_multi_head_attention( + q, + k, + v, + self.embed_dim, + self.num_heads, + self.qkv.weight, + self.qkv.bias, + self.proj.weight, + self.proj.bias, + key_padding_mask, + need_weights=need_weights, + average_attn_weights=average_attn_weights, + mask_type=1, # mask_type = 1 => src_key_padding_mask, mask_type = 0 => src_mask + ) + + npt = NativeMHA( + embed_dim=embed_dim, num_heads=num_heads, qkv=native_qkv, proj=native_proj + ).to(dtype) + + if device == "npu": + pt = pt.npu() + npt = npt.npu() + + ypt, weight_pt = pt( + q, + k, + v, + need_weights=need_weights, + average_attn_weights=average_attn_weights, + key_padding_mask=mask if use_padding else None, + ) + if use_nt: + qs = list(torch.unbind(q)) + if use_padding: + if pad_all: + qs = [x[:-1] for x in qs] + else: + qs[0] = qs[0][:-1] + q = torch.nested.nested_tensor(qs, device=device, dtype=dtype) + if mode == "self": + k = v = q + elif mode == "encdec": + k = torch.nested.nested_tensor(torch.unbind(k), device=device, dtype=dtype) + v = k + else: + k = torch.nested.nested_tensor(torch.unbind(k), device=device, dtype=dtype) + v = torch.nested.nested_tensor(torch.unbind(v), device=device, dtype=dtype) + + native_q = q.to(dtype=dtype) + native_k = k.to(dtype=dtype) + native_v = v.to(dtype=dtype) + + ynpt, weight_npt = npt( + native_q, native_k, native_v, key_padding_mask=mask if use_padding and not use_nt else None + ) + if use_nt: + ynpt = ynpt.to_padded_tensor(0) + if pad_all: + ynpt_final = torch.zeros_like(ypt) + ynpt_final[:, :ynpt.shape[1], :] = ynpt + ynpt = ynpt_final + + def do_pad_all(tensors): + for t in tensors: + for t_i in t: + t_i[-1] = torch.zeros_like(t_i[-1], device=device, dtype=dtype) + + # PyTorch implementation returns non-zero junk in the padding + # locations; overwrite it so that the comparison works out. + if use_padding: + ypt[0][-1] = torch.zeros_like(ypt[0][-1], device=device, dtype=dtype) + ynpt[0][-1] = torch.zeros_like(ynpt[0][-1], device=device, dtype=dtype) + if pad_all: + do_pad_all((ypt, ynpt)) + # Zero the last row of each TxT weight matrix + if need_weights: + if average_attn_weights: + weight_pt[0][-1] = torch.zeros_like(weight_pt[0][-1], device=device, dtype=dtype) + weight_npt[0][-1] = torch.zeros_like(weight_npt[0][-1], device=device, dtype=dtype) + if pad_all: + do_pad_all((weight_pt, weight_npt)) + else: + for nh in range(num_heads): + weight_pt[0][nh][-1] = torch.zeros_like(weight_pt[0][nh][-1], device=device, dtype=dtype) + weight_npt[0][nh][-1] = torch.zeros_like(weight_npt[0][nh][-1], device=device, dtype=dtype) + + if dtype == torch.half: + torch.testing.assert_close(ypt, ynpt.to(torch.float32), atol=1e-3, rtol=1e-3) + else: + # High rtol seems necessary for + # test_native_multihead_attention_cpu_float32 on Windows, + # otherwise 2e-4 would likely be fine. + torch.testing.assert_close(ypt, ynpt, atol=2e-5, rtol=2e-3) + + if need_weights: + torch.testing.assert_close(weight_pt, weight_npt.to(torch.float32), atol=5e-4, rtol=5e-4) + else: + self.assertEqual(weight_pt, weight_npt) + + # NPU currently do not support nested tensor, we set use_nt=False. + # NPU currently do not support calculate with key_padding_mask, we set use_padding=False. 
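+    # With these constraints the parametrize decorators below pin use_nt=False and use_padding=False;
+    # pad_all is still varied because need_weights iterates over (False, not pad_all) in the test body.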
+ @dtypesIfPRIVATEUSE1(torch.float, torch.half) + @dtypes(torch.float) + @skipMeta + @parametrize("use_nt", [False]) + @parametrize("use_padding, pad_all", [(False, False), (False, True)]) + @parametrize("need_weights", [False]) + @parametrize("average_attn_weights", [False, True]) + @parametrize("fused", [False, True]) + @torch.no_grad() + # pylint:disable = huawei-too-many-arguments + def test_native_multihead_self_attention(self, device, dtype, use_nt, + need_weights, average_attn_weights, use_padding, pad_all, fused): + for need_weights in (False, not pad_all): + with self.subTest(use_padding=use_padding, pad_all=pad_all, + use_nt=use_nt, need_weights=need_weights, + average_attn_weights=average_attn_weights): + # NPU do not use sdp_kernel, here we simply call _test_multihead_attention_impl. + if "npu" in device: + self._test_multihead_attention_impl( + device, + dtype, + "self", + use_nt=use_nt, + use_padding=use_padding, + pad_all=pad_all, + need_weights=need_weights, + average_attn_weights=average_attn_weights, + ) + else: + with torch.backends.npu.sdp_kernel( + enable_flash=False, enable_mem_efficient=False + ) if not fused else torch.backends.npu.sdp_kernel( + enable_flash=True, enable_mem_efficient=True + ): + self._test_multihead_attention_impl( + device, + dtype, + "self", + use_nt=use_nt, + use_padding=use_padding, + pad_all=pad_all, + need_weights=need_weights, + average_attn_weights=average_attn_weights, + ) + + @dtypesIfPRIVATEUSE1(torch.float, torch.half) + @dtypes(torch.float) + @skipMeta + @torch.no_grad() + def test_native_multihead_encoder_decoder_attention(self, device, dtype): + self._test_multihead_attention_impl( + device, + dtype, + "encdec", + use_nt=False, + need_weights=False, + average_attn_weights=False, + ) + + @dtypesIfPRIVATEUSE1(torch.float, torch.half) + @dtypes(torch.float) + @skipMeta + @torch.no_grad() + def test_native_multihead_attention(self, device, dtype): + self._test_multihead_attention_impl( + device, + dtype, + "generic", + use_nt=False, + need_weights=False, + average_attn_weights=False, + ) + + +instantiate_device_type_tests(TestMHADeviceType, globals()) + +if __name__ == "__main__": + pass -- Gitee From 33cc5ef70f0c0633271b3535dcb574c2f42e5874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=98=89=E5=B7=8D?= <843972097@qq.com> Date: Fri, 30 May 2025 02:32:17 +0000 Subject: [PATCH 005/328] =?UTF-8?q?!21374=20[Feature]=20Add=20npu=5Fgather?= =?UTF-8?q?=5Fsparse=5Findex=20in=20public=20api.=20Merge=20pull=20request?= =?UTF-8?q?=20!21374=20from=20=E5=88=98=E5=98=89=E5=B7=8D/v2.7.1-tmp?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/allowlist_for_publicAPI.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index 5a59b62138..253e683b2c 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -2867,7 +2867,8 @@ "npu_grouped_matmul_finalize_routing", "npu_alltoallv_gmm", "npu_gmm_alltoallv", - "npu_transpose_batchmatmul" + "npu_transpose_batchmatmul", + "npu_gather_sparse_index" ], "torch_npu.contrib": [ "npu_fused_attention_with_layernorm", -- Gitee From 843b2c3e75c7a5f04ccdc60c32d6d8c8aecf6036 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Fri, 30 May 2025 04:40:42 +0000 Subject: [PATCH 006/328] !21383 add traceutils for filght recorder Merge pull request !21383 from huangyunlong/2.71ft1 --- torch_npu/csrc/distributed/TraceUtils.h | 737 ++++++++++++++++++++++++ 1 
file changed, 737 insertions(+) create mode 100644 torch_npu/csrc/distributed/TraceUtils.h diff --git a/torch_npu/csrc/distributed/TraceUtils.h b/torch_npu/csrc/distributed/TraceUtils.h new file mode 100644 index 0000000000..d3a516a7cc --- /dev/null +++ b/torch_npu/csrc/distributed/TraceUtils.h @@ -0,0 +1,737 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include "torch_npu/csrc/profiler/python/combined_traceback.h" + +#include "torch_npu/csrc/core/npu/NPUEvent.h" +#include "torch_npu/csrc/distributed/HCCLUtils.hpp" + +#include +#include +#include +#include +#include +#include + +namespace c10d_npu { + + static c10::IValue entries_key = "entries"; + static c10::IValue hccl_comm_key = "hccl_comm_state"; + static c10::IValue version_key = "version"; + // Update whenever changing contents or formatting of the dump + // (minor when adding fields, major when changing existing fields) + static c10::IValue version_val = "2.1"; + static c10::IValue pg_config_key = "pg_config"; + static c10::IValue record_id_key = "record_id"; + static c10::IValue pg_id_key = "pg_id"; + static c10::IValue pg_name_key = "process_group"; + static c10::IValue collective_seq_id_key = "collective_seq_id"; + static c10::IValue p2p_seq_id_key = "p2p_seq_id"; + static c10::IValue is_p2p_key = "is_p2p"; + static c10::IValue op_id_key = "op_id"; + static c10::IValue profiling_name_key = "profiling_name"; + static c10::IValue input_sizes_key = "input_sizes"; + static c10::IValue input_dtypes_key = "input_dtypes"; + static c10::IValue output_sizes_key = "output_sizes"; + static c10::IValue output_dtypes_key = "output_dtypes"; + static c10::IValue time_created_key = "time_created_ns"; + static c10::IValue duration_key = "duration_ms"; + + static c10::IValue frames_key = "frames"; + static c10::IValue state_key = "state"; + static c10::IValue line_key = "line"; + static c10::IValue name_key = "name"; + static c10::IValue filename_key = "filename"; + static c10::IValue retired_key = "retired"; + static c10::IValue time_discovered_started_key = "time_discovered_started_ns"; + static c10::IValue time_discovered_completed_key = + "time_discovered_completed_ns"; + + /* Trace Utils Related to TORCH_HCCL_DESYNC_DEBUG */ + + inline std::string getTraceStartKey(const std::string &pgName, int rank) + { + return pgName + "_" + std::to_string(rank) + "_trace_start"; + } + + inline std::string getTraceEndKey(const std::string &pgName, int rank) + { + return pgName + "_" + std::to_string(rank) + "_trace_end"; + } + + inline bool traceUpdate( + c10::intrusive_ptr &store, + const std::string &key, + uint64_t seq, + const std::string &col) + { + std::vector value(col.size() + sizeof(seq) + 1); + memcpy(value.data(), &seq, sizeof(seq)); + memcpy(value.data() + sizeof(seq), col.data(), col.size()); + try { + store->set(key, value); + return true; + } catch (...) 
{ + LOG(ERROR) << "Store is down while updating #" << seq << " with key " + << key; + return false; + } + return true; + } + + enum TraceDebugEvent { + kEventStart, + kEventEnd, + }; + // >> + using TraceMap = + std::map>>; + + inline std::string ranksToString(const std::vector &ranks) + { + std::string str; + for (int rank : ranks) { + if (str.empty()) { + str = std::to_string(rank); + } else { + str += ", " + std::to_string(rank); + } + } + return str; + } + + inline std::string ranksFromTrace( + const std::vector> &items) + { + std::string ranks; + for (auto &p : items) { + if (ranks.empty()) { + ranks = std::to_string(p.first); + } else { + ranks += ", " + std::to_string(p.first); + } + } + return ranks; + } + + inline std::string analyzeMissingRanks(const std::vector &missingRanks) + { + return c10::str( + "\n\t - To our best knowledge, ranks [", + ranksToString(missingRanks), + "] are the lagging ranks that caused this timeout. " + "They never joined any collectives"); + } + + inline std::string analyzeLaggingRanks(const TraceMap &traceMap) + { + uint64_t lagSeq = traceMap.begin()->first; + std::vector startRanks; + std::vector endRanks; + for (auto &p : traceMap.begin()->second) { + if (p.second.second == kEventStart) { + startRanks.push_back(p.first); + } else { + endRanks.push_back(p.first); + } + } + std::string report = + "\n\t - To our best knowledge, the lagging/dead/mismatched ranks " + "that caused the desync are:"; + if (startRanks.size()) { + report += c10::str( + "\n\t - [", + ranksToString(startRanks), + "] joined but didn't finish collective #", + lagSeq, + " (count from 1)"); + } + if (endRanks.size()) { + report += c10::str( + "\n\t [", + ranksToString(endRanks), + "] finished collective #", + lagSeq, + ", but didn't join collective #", + lagSeq + 1, + " (count from 1)"); + } + return report; + } + + inline std::string dumpSnapshot(TraceMap &traceMap) + { + std::string report = "\n\t - Snapshot of ranks' latest states:"; + for (auto &tracePair : traceMap) { + uint64_t seq = tracePair.first; + std::map> &subMap = + tracePair.second; + + std::unordered_map> collectivesStart; + std::unordered_map> collectivesEnd; + for (auto &p : subMap) { + int rank = p.first; + const std::string &col = p.second.first; + if (p.second.second == kEventStart) { + collectivesStart[col].push_back(rank); + } else { + collectivesEnd[col].push_back(rank); + } + } + + if (collectivesStart.size()) { + report += c10::str("\n\t #", seq, " started ranks:"); + for (auto &mapPair : collectivesStart) { + report += c10::str( + "\n\t [", + ranksToString(mapPair.second), + "] started ", + mapPair.first); + } + } + if (collectivesEnd.size()) { + report += c10::str("\n\t #", seq, " finished ranks:"); + for (auto &mapPair : collectivesEnd) { + report += c10::str( + "\n\t [", + ranksToString(mapPair.second), + "] finished ", + mapPair.first); + } + } + } + return report; + } + + /* Trace Utils Related to Flight Recorder */ + + /* Note: this is only used by PGHCCL (could be generalized in an ideal world but + * wasn't done that way, so isn't expected to be fully general at the moment) */ + + /* Helper used by work::getDuration() and hccl flight recorder */ + float getDurationFromEvent( + c10_npu::NPUEvent &hcclStartEvent, + c10_npu::NPUEvent &hcclEndEvent) + { + TORCH_CHECK( + hcclEndEvent.query(), + "getDuration can only be called after work is succeeded.") + return hcclStartEvent.elapsed_time(hcclEndEvent); + } + + DebugInfoWriter::~DebugInfoWriter() = default; + + void DebugInfoWriter::write(const 
std::string &hcclTrace) + { + // Open a file for writing. The ios::binary flag is used to write data as + // binary. + std::ofstream file(filename_, std::ios::binary); + + // Check if the file was opened successfully. + if (!file.is_open()) { + LOG(ERROR) << "Error opening file for writing HCCLPG debug info: " + << filename_; + return; + } + + file.write(hcclTrace.data(), hcclTrace.size()); + LOG(INFO) << "Finished writing HCCLPG debug info to " << filename_; + } + + DebugInfoWriter &DebugInfoWriter::getWriter(int rank) + { + if (writer_ == nullptr) { + std::string fileNamePrefix = getCvarString( + {"TORCH_HCCL_DEBUG_INFO_TEMP_FILE"}, "/tmp/hccl_trace_rank_"); + // Using std::unique_ptr here to auto-delete the writer object + // when the pointer itself is destroyed. + std::unique_ptr writerPtr( + new DebugInfoWriter(fileNamePrefix, rank)); + DebugInfoWriter::registerWriter(std::move(writerPtr)); + } + return *writer_; + } + + void DebugInfoWriter::registerWriter(std::unique_ptr writer) + { + TORCH_CHECK_WITH( + DistBackendError, + hasWriterRegistered_.load() == false, + "debugInfoWriter already registered"); + hasWriterRegistered_.store(true); + writer_ = std::move(writer); + } + + std::unique_ptr DebugInfoWriter::writer_ = nullptr; + std::atomic DebugInfoWriter::hasWriterRegistered_(false); + + inline std::string pickle_str(const c10::IValue &v) + { + std::vector result; + { + auto writer = [&](const char *data, size_t size) { + result.insert(result.end(), data, data + size); + }; + torch::jit::Pickler pickler( + writer, nullptr, nullptr, nullptr, nullptr, false); + pickler.protocol(); + pickler.pushIValue(v); + pickler.stop(); + } + return std::string(result.begin(), result.end()); + } + + inline std::string get_python_cpp_trace() + { + // usage: + // LOG(INFO) << "stacktrace: " + // << get_python_cpp_trace(); + // warn: might be slow in getting cpp traces + // because of slow/broken addr2line + // in different system libs + std::shared_ptr tb = + torch_npu::CapturedTraceback::gather(true, true, false); + torch_npu::SymbolizedTracebacks s_tbs = torch_npu::symbolize({tb.get()}); + const auto &s_tb = s_tbs.tracebacks.at(0); + std::stringstream oss; + LOG(ERROR) << "get traceback size:" << s_tb.size(); + for (auto idx : c10::irange(s_tb.size())) { + auto frame_id = s_tb[idx]; + const auto &frame = s_tbs.all_frames.at(frame_id); + oss << "#" << idx << " " << frame.funcname << " from " << frame.filename + << ":" << frame.lineno << std::endl; + } + return oss.str(); + } + + inline c10::Dict new_dict() + { + return c10::Dict( + c10::AnyType::get(), c10::AnyType::get()); + } + + inline c10::List new_list() + { + return c10::List(c10::AnyType::get()); + } + + inline std::string ranks_str(const std::vector &ranks) + { + std::string str; + for (const auto &rank : ranks) { + if (str.empty()) { + str = std::to_string(rank); + } else { + str += ", " + std::to_string(rank); + } + } + return c10::str("[", str, "]"); + } + + struct HCCLTraceBuffer { + static HCCLTraceBuffer *get() + { + // intentionally leak on exit + // because this will hold python state that may get destructed + static HCCLTraceBuffer *instance = new HCCLTraceBuffer(); + return instance; + } + HCCLTraceBuffer() + { + max_entries_ = getCvarInt({"TORCH_HCCL_TRACE_BUFFER_SIZE"}, 0); + capture_cpp_stack_ = getCvarBool({"TORCH_HCCL_TRACE_CPP_STACK"}, false); + enabled_ = max_entries_ > 0; + } + using Event = c10_npu::NPUEvent; + struct Entry { + size_t id_; // incremented id in the trace buffer + // used to figure out where in the circular 
entries + // buffer this entry will be located to + // update state information + size_t pg_id_; + std::tuple pg_name_; // + + // collective_seq_id and p2p_seq_id refer to actual kernel launches (e.g. 1 + // per coalesced group). + // collective_seq_id only increments for true collective operations (over + // all ranks in the group). p2p_seq_id only increments over non-collective + // operations in the group. op_id refers to logical operations (e.g. one per + // op inside coalesced group) + size_t collective_seq_id_; + size_t p2p_seq_id_; + size_t op_id_; + std::string profiling_name_; + + std::shared_ptr traceback_; + // we borrow pointers to start_ and end_ so we can query the state + // on reporting. However, once the event is completed, the call + // to `complete` will clear these. + Event *start_, *end_; + + // timestamp when the entry was created, likely close to the time the work + // was 'enqueued'- not necessarily started + c10::time_t time_created_; + + // Is this a P2P event? + bool isP2P_; + + std::optional duration_; + + // timestamp when our CPU threads discovered that the kernel started. + // will always be _after_ it actually started, and can be very late + // if the watchdog thread got stuck on CANN APIs. + std::optional time_discovered_started_; + + // timestamp when our CPU threads discovered that the kernel completed. + // will always be _after_ it actually complated, and can be the same time + // as the discovery of the start if the watchdog thread is stuck on CANN + // APIs + std::optional time_discovered_completed_; + + // size information for input/output tensors + c10::SmallVector input_dims_; + std::vector input_dtypes_; + c10::SmallVector output_dims_; + std::vector output_dtypes_; + c10::SmallVector sizes_; // flattened from inputs, outputs + bool retired_ = false; // is this work entry no longer in the workMetaList_? 
+ // a retired but not completed event has timed out + }; + + bool enabled_ = false; + bool capture_cpp_stack_ = false; + std::mutex mutex_; + std::vector entries_; + size_t max_entries_ = 0; + size_t next_ = 0; + size_t id_ = 0; + std::map, std::vector> + pg_name_to_ranks_ = {}; + + c10::optional record( + size_t pg_id, + const std::tuple &pg_name, + size_t collective_seq_id, + size_t p2p_seq_id, + size_t op_id, + std::string profiling_name, + const std::vector &inputs, + const std::vector &outputs, + Event *start, + Event *end, + bool isP2P) + { + if (!enabled_) { + return c10::nullopt; + } + auto traceback = + torch_npu::CapturedTraceback::gather(true, true, capture_cpp_stack_); + std::lock_guard guard(mutex_); + + auto te = Entry{ + id_, + pg_id, + pg_name, + collective_seq_id, + p2p_seq_id, + op_id, + std::move(profiling_name), + std::move(traceback), + std::move(start), + std::move(end), + c10::getTime(), + isP2P}; + + for (const auto &input : inputs) { + c10::IntArrayRef sizes = input.sizes(); + te.input_dtypes_.push_back(input.dtype().toScalarType()); + te.input_dims_.push_back(sizes.size()); + te.sizes_.insert(te.sizes_.end(), sizes.begin(), sizes.end()); + } + + for (const auto &output : outputs) { + c10::IntArrayRef sizes = output.sizes(); + te.output_dtypes_.push_back(output.dtype().toScalarType()); + te.output_dims_.push_back(sizes.size()); + te.sizes_.insert(te.sizes_.end(), sizes.begin(), sizes.end()); + } + + if (entries_.size() < max_entries_) { + entries_.emplace_back(std::move(te)); + } else { + entries_[next_++] = std::move(te); + if (next_ == max_entries_) { + next_ = 0; + } + } + return id_++; + } + + void record_pg_ranks( + const std::tuple &pg_name, + std::vector ranks) + { + if (!enabled_) { + return; + } + std::lock_guard guard(mutex_); + pg_name_to_ranks_[pg_name] = ranks; + } + + void update_state(Entry &r) + { + if (r.start_ != nullptr) { + bool started = r.start_->query(); + if (started && !r.time_discovered_started_) { + r.time_discovered_started_ = c10::getTime(); + } + } + if (r.end_ != nullptr) { + bool completed = r.end_->query(); + if (completed && !r.time_discovered_completed_) { + r.time_discovered_completed_ = c10::getTime(); + } + } + } + + std::vector dump_entries() + { + std::lock_guard guard(mutex_); + std::vector result; + result.reserve(entries_.size()); + result.insert(result.end(), entries_.begin() + next_, entries_.end()); + result.insert(result.end(), entries_.begin(), entries_.begin() + next_); + // query any remaining events + for (auto &r : result) { + update_state(r); + r.start_ = r.end_ = nullptr; + } + return result; + } + + /* + Mark an Event as completed and free its events. + + This is called by the watchdog thread, and is asynchronous from the + perspective of the main thread. + + compute_duration defaults to true since retire_id is only called in the + watchdog thread, which is currently a place we call cuda APIs which may hang, + but care should be taken to avoid computing duration in any function that must + never hang. (timing must also be enabled for compute_duration - see + TORCH_HCCL_ENABLE_TIMING). 
+ */ + void retire_id(std::optional id, bool compute_duration = true) + { + if (!enabled_ || !id) { + return; + } + + bool can_compute_duration = false; + Event *startEvent = nullptr; + Event *endEvent = nullptr; + c10::optional duration = c10::nullopt; + + std::unique_lock guard(mutex_); + + Entry *entry = &entries_.at(*id % max_entries_); + if (entry->id_ == *id) { + update_state(*entry); + + if (compute_duration) { + can_compute_duration = entry->time_discovered_completed_.has_value() && + entry->start_ && entry->end_; + startEvent = entry->start_; + endEvent = entry->end_; + } + } + + if (can_compute_duration) { + // Compute duration without without holding the lock, because + // cudaEventDuration() can hang, and we need to acquire the lock before we + // can dump(), which we never want to block. + guard.unlock(); + duration = getDurationFromEvent(*startEvent, *endEvent); + guard.lock(); + + // Refresh the entry pointer, see if the entry has been overwritten + entry = &entries_.at(*id % max_entries_); + if (entry->id_ != *id) { + LOG(INFO) + << "retire_id abandoned for id " << *id + << ", event was overwritten while waiting to compute duration."; + return; + } + if (duration.has_value()) { + entry->duration_ = duration.value(); + } + } + + entry->retired_ = true; + entry->start_ = entry->end_ = nullptr; + } + + const c10::List getCollectiveTrace( + bool includeStacktraces, + bool onlyActive) + { + auto entries = new_list(); + auto result = dump_entries(); + std::vector tracebacks; + torch_npu::SymbolizedTracebacks stracebacks; + std::vector all_frames; + if (includeStacktraces) { + for (auto &e : result) { + tracebacks.push_back(e.traceback_.get()); + } + stracebacks = torch_npu::symbolize(tracebacks); + for (const auto &f : stracebacks.all_frames) { + auto d = new_dict(); + d.insert(name_key, f.funcname); + d.insert(filename_key, f.filename); + d.insert(line_key, int64_t(f.lineno)); + all_frames.emplace_back(std::move(d)); + } + } + for (auto i : c10::irange(result.size())) { + auto dict = new_dict(); + auto &e = result.at(i); + // Skip completed events + if (onlyActive && e.time_discovered_completed_.has_value()) { + continue; + } + + if (includeStacktraces) { + auto &tb = stracebacks.tracebacks.at(i); + auto frames = new_list(); + for (int64_t frame : tb) { + frames.push_back(all_frames.at(frame)); + } + dict.insert(frames_key, frames); + } + + dict.insert(record_id_key, int64_t(e.id_)); + dict.insert(pg_id_key, int64_t(e.pg_id_)); + dict.insert(pg_name_key, e.pg_name_); + dict.insert(collective_seq_id_key, int64_t(e.collective_seq_id_)); + dict.insert(p2p_seq_id_key, int64_t(e.p2p_seq_id_)); + dict.insert(op_id_key, int64_t(e.op_id_)); + dict.insert(profiling_name_key, e.profiling_name_); + dict.insert(time_created_key, int64_t(e.time_created_)); + if (e.duration_) { + dict.insert(duration_key, *e.duration_); + } + + auto it = e.sizes_.begin(); + auto read_sizes = [&](const c10::SmallVector &dims) { + auto sizes = new_list(); + for (auto dim : dims) { + auto arg_sizes = new_list(); + for (auto i : c10::irange(dim)) { + (void)i; + arg_sizes.push_back(*it++); + } + sizes.push_back(arg_sizes); + } + return sizes; + }; + + dict.insert(input_sizes_key, read_sizes(e.input_dims_)); + std::vector input_dtypes_strs; + input_dtypes_strs.reserve(e.input_dtypes_.size()); + for (const auto &input_dtype : e.input_dtypes_) { + input_dtypes_strs.push_back(c10::toString(input_dtype)); + } + dict.insert(input_dtypes_key, input_dtypes_strs); + dict.insert(output_sizes_key, 
read_sizes(e.output_dims_)); + std::vector output_dtypes_strs; + output_dtypes_strs.reserve(e.output_dtypes_.size()); + for (const auto &output_dtype : e.output_dtypes_) { + output_dtypes_strs.push_back(c10::toString(output_dtype)); + } + dict.insert(output_dtypes_key, output_dtypes_strs); + if (e.time_discovered_completed_.has_value()) { + dict.insert(state_key, "completed"); + } else if (e.time_discovered_started_.has_value()) { + dict.insert(state_key, "started"); + } else { + dict.insert(state_key, "scheduled"); + } + + dict.insert( + time_discovered_started_key, + e.time_discovered_started_.has_value() + ? int64_t(*e.time_discovered_started_) + : c10::IValue()); + dict.insert( + time_discovered_completed_key, + e.time_discovered_completed_.has_value() + ? int64_t(*e.time_discovered_completed_) + : c10::IValue()); + dict.insert(retired_key, e.retired_); + dict.insert(is_p2p_key, e.isP2P_); + + entries.push_back(dict); + } + return entries; + } + + // dump pg_entries + const c10::Dict getPgConfig() + { + auto pg_config = new_dict(); + for (const auto &[pg_name, ranks] : pg_name_to_ranks_) { + auto pg_info = new_dict(); + pg_info.insert("name", std::get<0>(pg_name)); + pg_info.insert("desc", std::get<1>(pg_name)); + pg_info.insert("ranks", ranks_str(ranks)); + pg_config.insert(std::get<0>(pg_name), pg_info); + } + return pg_config; + } + + // dump all collectives + hcclDumpMap + std::string dump( + const c10::optional>> &hcclDumpMap, + bool includeCollectives, + bool includeStackTraces, + bool onlyActive) + { + auto result = new_dict(); + // common values + result.insert(version_key, version_val); + result.insert(pg_config_key, getPgConfig()); + + // collective trace + if (includeCollectives) { + result.insert( + entries_key, getCollectiveTrace(includeStackTraces, onlyActive)); + } + + // convert hcclDumpMap into a dictionary + auto per_comm_dict = new_dict(); + if (hcclDumpMap.has_value()) { + for (const auto &[hcclId, hcclDump] : hcclDumpMap.value()) { + auto inner_dict = new_dict(); + for (const auto &[key, value] : hcclDump) { + inner_dict.insert(key, value); + } + per_comm_dict.insert(hcclId, inner_dict); + } + } + if (per_comm_dict.size() > 0) { + result.insert(hccl_comm_key, per_comm_dict); + } + return pickle_str(result); + } + }; + +} // namespace c10d -- Gitee From 4d2d57c7da08257bf44ffb125f70ef734d0969e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Fri, 30 May 2025 07:14:13 +0000 Subject: [PATCH 007/328] =?UTF-8?q?!21366=20Fix=20double=20free=20in=20emp?= =?UTF-8?q?ty=5Fcache=20Merge=20pull=20request=20!21366=20from=20=E5=A7=9C?= =?UTF-8?q?=E6=80=A1=E6=96=87/main=5Fwk?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index 4c6c475c18..ce9e6cc918 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -115,6 +115,7 @@ public: reinterpret_cast(stream)} ); #endif + block->data_ptr = nullptr; } block->size = kRoundLarge * ((alloc_size + kRoundLarge - 1) / kRoundLarge); -- Gitee From 723182ae0582f6297170eafc0675384dc6b1def3 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 30 May 2025 08:57:59 +0000 Subject: [PATCH 008/328] !21419 Update op_plugin commit id Merge pull request !21419 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index f71c628def..3ddab5fa62 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit f71c628def3856a5f66f8e369fb0f969d87dd9c9 +Subproject commit 3ddab5fa62d12ec59350c5bcb81bb5134bc8840e -- Gitee From a072ecad7e378271dfa97946ffde931a316b6a8a Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 30 May 2025 10:57:54 +0000 Subject: [PATCH 009/328] !21440 Update op_plugin commit id Merge pull request !21440 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 3ddab5fa62..b932a3ce6b 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 3ddab5fa62d12ec59350c5bcb81bb5134bc8840e +Subproject commit b932a3ce6b7c809ef3b57fb8b79214152d8e6242 -- Gitee From ba00b20714f1caa56c5cb9dccaf6281a7edc9d73 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 30 May 2025 15:57:55 +0000 Subject: [PATCH 010/328] !21450 Update op_plugin commit id Merge pull request !21450 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index b932a3ce6b..3f764c5c7c 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit b932a3ce6b7c809ef3b57fb8b79214152d8e6242 +Subproject commit 3f764c5c7cd1c0641fdc27ef3eddb4463a7a7397 -- Gitee From c410199d34a80268de2cb0aa6d64e163af1acf11 Mon Sep 17 00:00:00 2001 From: sincatter Date: Tue, 3 Jun 2025 01:20:32 +0000 Subject: [PATCH 011/328] =?UTF-8?q?!21417=20=E5=A2=9E=E5=8A=A0=20npu=5Fmoe?= =?UTF-8?q?=5Fdistribute=5Fcombine=5Fadd=5Frms=5Fnorm=20=E6=8E=A5=E5=8F=A3?= =?UTF-8?q?=20Merge=20pull=20request=20!21417=20from=20sincatter/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/allowlist_for_publicAPI.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index 253e683b2c..1b2fe6148e 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -2868,7 +2868,8 @@ "npu_alltoallv_gmm", "npu_gmm_alltoallv", "npu_transpose_batchmatmul", - "npu_gather_sparse_index" + "npu_gather_sparse_index", + "npu_moe_distribute_combine_add_rms_norm" ], "torch_npu.contrib": [ "npu_fused_attention_with_layernorm", -- Gitee From bbd0747606408c56526e8af4c3298b2501a7d547 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Tue, 3 Jun 2025 01:54:38 +0000 Subject: [PATCH 012/328] =?UTF-8?q?!21408=20Add=20base=5Faddr=5Faligned=5F?= =?UTF-8?q?kb=20to=20empty=5Fwith=5Fformat=20Merge=20pull=20request=20!214?= =?UTF-8?q?08=20from=20=E5=A7=9C=E6=80=A1=E6=96=87/v2.7.1=5Fft?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/torch_npu_schema.json | 2 +- .../csrc/aten/common/TensorFactories.cpp | 28 +++++++++++++----- torch_npu/csrc/aten/npu_native_functions.yaml | 2 +- .../csrc/core/npu/NPUCachingAllocator.cpp | 29 +++++++++++++++++++ torch_npu/csrc/core/npu/NPUCachingAllocator.h | 6 ++++ .../csrc/framework/utils/OpPreparation.cpp | 6 ++-- torch_npu/csrc/npu/NPUPluggableAllocator.cpp | 6 ++++ torch_npu/csrc/npu/NPUPluggableAllocator.h | 1 + 8 files changed, 68 insertions(+), 12 deletions(-) diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json 
index dfe6179f79..a04f2cd4f3 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2742,7 +2742,7 @@ "signature": "(int[] size, Dimname[]? names, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2) -> Tensor" }, "func: empty_with_format": { - "signature": "(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2) -> Tensor" + "signature": "(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2, int? base_addr_aligned_kb=None) -> Tensor" }, "func: copy_memory_": { "signature": "(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)" diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp index 8758e3cf81..77f400ff50 100644 --- a/torch_npu/csrc/aten/common/TensorFactories.cpp +++ b/torch_npu/csrc/aten/common/TensorFactories.cpp @@ -266,7 +266,8 @@ at::Tensor NPUNativeFunctions::empty_with_format( c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt, - int64_t dst_format) + int64_t dst_format, + c10::optional base_addr_aligned_kb) { #ifndef BUILD_LIBTORCH torch_npu::profiler::NPURecordFunction profiler_guard; @@ -287,12 +288,23 @@ at::Tensor NPUNativeFunctions::empty_with_format( auto dtype = c10::scalarTypeToTypeMeta(dtype_or_default(dtype_opt)); int64_t nelements = StorageDescHelper::GetMemorySize(size, format, dtype); int64_t size_bytes = nelements * dtype.itemsize(); - c10::intrusive_ptr storage_impl = torch_npu::make_npu_storage_impl( - c10::StorageImpl::use_byte_size_t(), - c10::SymInt(size_bytes), - allocator->allocate(size_bytes), - allocator, - true); + c10::intrusive_ptr storage_impl; + if (!base_addr_aligned_kb.has_value()) { + storage_impl = torch_npu::make_npu_storage_impl( + c10::StorageImpl::use_byte_size_t(), + c10::SymInt(size_bytes), + allocator->allocate(size_bytes), + allocator, + true); + } else { + storage_impl = c10::make_intrusive( + c10::StorageImpl::use_byte_size_t(), + static_cast(size_bytes), + c10_npu::NPUCachingAllocator::allocate_with_aligned(size_bytes, base_addr_aligned_kb.value()), + allocator, + true); + } + auto tensor = at::detail::make_tensor(storage_impl, dtype); // Default NPUTensorImpl has size [0] @@ -324,7 +336,7 @@ at::Tensor NPUNativeFunctions::unsafe_empty_with_format( "tensor will be created with base format."); } - return NPUNativeFunctions::empty_with_format(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, dst_format); + return NPUNativeFunctions::empty_with_format(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, dst_format, c10::nullopt); } at::Tensor NPUNativeFunctions::empty_with_format( diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml index 0eaaa05771..95bb740db1 100644 --- a/torch_npu/csrc/aten/npu_native_functions.yaml +++ b/torch_npu/csrc/aten/npu_native_functions.yaml @@ -70,7 +70,7 @@ custom: - func: npu_format_cast_(Tensor(a!) self, Tensor src) -> Tensor(a!) device_check: NoCheck exposed: True - - func: empty_with_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2) -> Tensor + - func: empty_with_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2, int? 
base_addr_aligned_kb=None) -> Tensor dispatch: CompositeExplicitAutograd: empty_with_format - func: unsafe_empty_with_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2, bool keep_format=False) -> Tensor diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 4201fa5f1a..62a56d75bc 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -3328,6 +3328,35 @@ public: return { devPtr, devPtr, deleteFunc, c10::Device(c10::DeviceType::PrivateUse1, device) }; } + c10::DataPtr allocate_with_aligned(size_t size, size_t base_addr_aligned_kb) const override + { + constexpr size_t one_exa_bytes = 1152921504606846976ULL; + if (C10_UNLIKELY(size >= one_exa_bytes)) { + AT_ERROR("NPU out of memory. Tried to allocate more than 1EB memory."); + } + int device = 0; + NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); + void *realPtr = nullptr; + void (*deleteFunc)(void *) = &local_raw_delete; + + size_t aligned = base_addr_aligned_kb * 1024; + if (size != 0) { + if (c10_npu::option::OptionsManager::CheckForceUncached()) { + deleteFunc = &uncached_delete; + size_t alloc_size = size + 32 + aligned; + NPU_CHECK_ERROR(c10_npu::acl::AclrtMallocAlign32(&realPtr, alloc_size, + aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST)); + ASCEND_LOGD("Without NPUCachingAllocator, malloc by " + "AclrtMallocAlign32: size=%zu", alloc_size); + } else { + const_cast(this)->malloc(&realPtr, device, size + aligned, + c10_npu::getCurrentNPUStreamNoWait(device)); + } + } + void *devPtr = reinterpret_cast(aligned * ((reinterpret_cast(realPtr) + aligned - 1) / aligned)); + return { devPtr, realPtr, deleteFunc, c10::Device(c10::DeviceType::PrivateUse1, device) }; + } + c10::DeleterFnPtr raw_deleter() const override { if (c10_npu::option::OptionsManager::CheckForceUncached()) { diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index b34b4e37ae..c33f51fbc8 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -190,6 +190,7 @@ using OutOfMemoryObserver = class NPUAllocator : public c10::Allocator { public: + virtual c10::DataPtr allocate_with_aligned(size_t size, size_t aligned) const = 0; virtual void* raw_alloc(size_t nbytes) = 0; virtual void* raw_alloc_with_stream(size_t nbytes, aclrtStream stream) = 0; virtual void raw_delete(void* ptr) = 0; @@ -264,6 +265,11 @@ inline NPUAllocator* get() return allocator.load(); } +inline c10::DataPtr allocate_with_aligned(size_t size, size_t base_addr_aligned_kb) +{ + return get()->allocate_with_aligned(size, base_addr_aligned_kb); +} + // Called directly by clients. 
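// A minimal sketch of the rounding done by allocate_with_aligned() in
// NPUCachingAllocator.cpp above (the numeric values below are hypothetical, not
// taken from the patch): the allocation is padded by
// aligned = base_addr_aligned_kb * 1024 bytes, and the raw pointer is rounded up
// to the next multiple of `aligned`:
//     devPtr = aligned * ((realPtr + aligned - 1) / aligned)
// e.g. with base_addr_aligned_kb = 2, aligned = 2048 (0x800); a raw pointer of
// 0x10000810 rounds up to 0x10001000, skipping 0x7F0 bytes. At most
// (aligned - 1) bytes are ever skipped, so the result always stays inside the
// padded allocation.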
inline void* raw_alloc(size_t nbytes) { diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp index 6777eee4d1..006bc377b0 100644 --- a/torch_npu/csrc/framework/utils/OpPreparation.cpp +++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp @@ -294,7 +294,8 @@ at::Tensor OpPreparation::apply_tensor_with_sizes(c10::IntArrayRef sizes, const options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), - format); + format, + c10::nullopt); } void OpPreparation::CheckOut(const std::initializer_list &inputs, at::Tensor &output, at::Tensor dst) @@ -495,7 +496,8 @@ at::Tensor OpPreparation::ApplyTensorWithSizes(c10::IntArrayRef sizes, const c10 options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), - format); + format, + c10::nullopt); } void OpPreparation::CheckMemory(const std::initializer_list &inputs, diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp index 296d217218..e8e0fd3eef 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp @@ -103,6 +103,12 @@ c10::DataPtr NPUPluggableAllocator::allocate(size_t size) return data_ptr; } +c10::DataPtr NPUPluggableAllocator::allocate_with_aligned(size_t size, size_t base_addr_aligned_kb) const +{ + TORCH_CHECK(false, "NPUPluggableAllocator does't has allocate_with_aligned", PTA_ERROR(ErrCode::NOT_SUPPORT)); + return c10::DataPtr(); +} + c10::DeleterFnPtr NPUPluggableAllocator::raw_deleter() const { return &custom_raw_deleter; diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.h b/torch_npu/csrc/npu/NPUPluggableAllocator.h index b43866e3a3..3a71319f3c 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.h +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.h @@ -48,6 +48,7 @@ struct NPUPluggableAllocator void* malloc(size_t size, int device, aclrtStream stream); c10::DataPtr allocate(size_t size) override; + c10::DataPtr allocate_with_aligned(size_t size, size_t base_addr_aligned_kb) const override; c10::DeleterFnPtr raw_deleter() const override; void* raw_alloc(size_t nbytes) override; -- Gitee From dac50aa4898da0739d0cd5ee0fb837bb2b36ee1b Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Tue, 3 Jun 2025 02:34:42 +0000 Subject: [PATCH 013/328] !21390 Update torchair commit id Merge pull request !21390 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 70208e513f..f4241ab1d4 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 70208e513f8f64c70253dcf2efe74b4c3c4c2b2f +Subproject commit f4241ab1d409ae49c4357540db7372baacd65dc6 -- Gitee From afed3597f48c5e3c7317b39f74d1ddc2ba728de9 Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Tue, 3 Jun 2025 06:39:28 +0000 Subject: [PATCH 014/328] !21461 Reland reduce_ex patch Merge pull request !21461 from dilililiwhy/reland_reduce_ex_patch --- test/npu/test_serialization.py | 2 -- torch_npu/utils/serialization.py | 1 + torch_npu/utils/storage.py | 57 ++++++++++++++++++++++++++++--- torch_npu/utils/tensor_methods.py | 2 ++ 4 files changed, 56 insertions(+), 6 deletions(-) diff --git a/test/npu/test_serialization.py b/test/npu/test_serialization.py index d091597838..af2a43f9ae 100644 --- a/test/npu/test_serialization.py +++ b/test/npu/test_serialization.py @@ -2,7 +2,6 @@ import io import os import tempfile import argparse 
-import unittest import torch import torch.nn as nn @@ -96,7 +95,6 @@ class TestSerialization(TestCase): self.assertExpectedInline(f'{x_loaded.device.type}:{x_loaded.device.index}', 'npu:0') self.assertRtolEqual(x, x_loaded.cpu()) - @unittest.skip("pytorch/issues/146969, pytorch/issues/125465, pytorch/pull/142214") def test_save_npu_format(self): with tempfile.TemporaryDirectory() as tmpdir: path = os.path.join(tmpdir, 'data.pt') diff --git a/torch_npu/utils/serialization.py b/torch_npu/utils/serialization.py index ddfbbd0187..35970aea9e 100644 --- a/torch_npu/utils/serialization.py +++ b/torch_npu/utils/serialization.py @@ -433,3 +433,4 @@ def _add_serialization_methods(): torch.save = save torch.load = load torch.serialization._save = _npu_save + torch.serialization.add_safe_globals([torch_npu.utils.storage._rebuild_npu_tensor]) diff --git a/torch_npu/utils/storage.py b/torch_npu/utils/storage.py index 9304f141bf..349823492f 100644 --- a/torch_npu/utils/storage.py +++ b/torch_npu/utils/storage.py @@ -1,6 +1,13 @@ +__all__ = [] + import copy +from typing import Any, Dict +from collections import OrderedDict + import torch from torch.storage import _warn_typed_storage_removal +from torch.overrides import has_torch_function_unary, handle_torch_function +from torch._namedtensor_internals import check_serializing_named_tensor import torch_npu from . import serialization as se @@ -8,12 +15,13 @@ from . import serialization as se def _rebuild_npu_tensor(storage, storage_offset, size, stride, requires_grad, backward_hooks, npu_storage_info): warn_massages = ( - "Warning: The current version of the file storing weights is old," - "and in the future we will deprecate the loading support for this type of file," - "please use 2.1 and newer torch to re-store the weight file." + "Warning: The current version of the file storing weights is old, " + "and it is relanded due to internal bug of torch and compatibility issue. " + "We will deprecate the loading support for this type of file in the future, " + "please use newer torch to re-store the weight file." 
) se._warn_legacy_serialization(warn_massages, "oldfile") - tensor = torch.tensor([], dtype=storage.dtype, device=storage.device) + tensor = torch.tensor([], dtype=storage.dtype, device=storage._untyped_storage.device) tensor.set_(storage, storage_offset, size, stride) tensor.requires_grad = requires_grad tensor._backward_hooks = backward_hooks @@ -25,6 +33,47 @@ def _rebuild_npu_tensor(storage, storage_offset, size, stride, requires_grad, ba return tensor +def _reduce_ex(self, proto): + materialize_fake_tensors = ( + torch.serialization._serialization_tls.materialize_fake_tensors + ) + state = torch._utils._get_obj_state(self) + # Ignore all state when using FakeTensor with skip_data(materialize_fake_tensors) because FakeTensor has + # some state that cannot be pickled + if ( + # will remove hasattr, it's a hack to support versions of torch that don't have _subclasses + hasattr(torch, "_subclasses") + and isinstance(self, torch._subclasses.fake_tensor.FakeTensor) + and materialize_fake_tensors + ) or (isinstance(self, torch.Tensor) and not state): + # For npu tensor with internal format + check_serializing_named_tensor(self) + torch.utils.hooks.warn_if_has_hooks(self) + backward_hooks: Dict[Any, Any] = OrderedDict() + if self.device.type == "npu": + npu_storage_format = torch_npu.get_npu_format(self) + tmp_tensor = self.cpu() + arg_npu = ( + tmp_tensor.storage() if has_torch_function_unary(tmp_tensor) else tmp_tensor._typed_storage(), + tmp_tensor.storage_offset(), + tuple(tmp_tensor.size()), + tmp_tensor.stride(), + tmp_tensor.requires_grad, + backward_hooks, + npu_storage_format + ) + return _rebuild_npu_tensor, arg_npu + # Fast path for regular tensor without Python state. + return self._reduce_ex_internal(proto) + if has_torch_function_unary(self): + return handle_torch_function(torch.Tensor.__reduce_ex__, (self,), self, proto) + func, args = self._reduce_ex_internal(proto) + # sizes / strides cache needs to be cleared here because it'll just be re-cached + # if cleared earlier. Note that state references the -actual- tensor dict. 
+ self._clear_non_serializable_cached_data() + return torch._tensor._rebuild_from_type_v2, (func, type(self), args, state) + + def _cpu(self): """Returns a CPU copy of this storage if it's not already on the CPU""" if self.device.type != 'cpu': diff --git a/torch_npu/utils/tensor_methods.py b/torch_npu/utils/tensor_methods.py index a6780a23bd..f978dfc879 100644 --- a/torch_npu/utils/tensor_methods.py +++ b/torch_npu/utils/tensor_methods.py @@ -4,6 +4,7 @@ import torch import torch_npu from torch_npu.utils._error_code import ErrCode, pta_error +from torch_npu.utils.storage import _reduce_ex __all__ = [] @@ -84,3 +85,4 @@ def _npu_type(self, dtype=None, non_blocking=False, **kwargs): def _add_tensor_methods(): torch.Tensor.type_raw = torch.Tensor.type torch.Tensor.type = _npu_type + torch.Tensor.__reduce_ex__ = _reduce_ex -- Gitee From 08a0baea7e462324b2205fac3b97a5d6dc739f92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A9=B9=E6=98=8A?= Date: Tue, 3 Jun 2025 07:02:40 +0000 Subject: [PATCH 015/328] =?UTF-8?q?!21430=20add=20nsa=20apis=20Merge=20pul?= =?UTF-8?q?l=20request=20!21430=20from=20=E8=A9=B9=E6=98=8A/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/allowlist_for_publicAPI.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index 1b2fe6148e..a7135a4900 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -2867,6 +2867,12 @@ "npu_grouped_matmul_finalize_routing", "npu_alltoallv_gmm", "npu_gmm_alltoallv", + "npu_nsa_compress", + "npu_nsa_compress_infer", + "npu_nsa_compress_attention", + "npu_nsa_compress_attention_infer", + "npu_nsa_select_attention", + "npu_nsa_select_attention_infer", "npu_transpose_batchmatmul", "npu_gather_sparse_index", "npu_moe_distribute_combine_add_rms_norm" -- Gitee From 0d0d7954c4d933de78a63173d13691545b684a0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Tue, 3 Jun 2025 07:40:20 +0000 Subject: [PATCH 016/328] =?UTF-8?q?!21465=20support=20HCCL=5FOP=5FRETRY=5F?= =?UTF-8?q?FAILED=20with=20ACL=5FERROR=5FRT=5FCOMM=5FOP=5FRETRY=5FFAIL=20M?= =?UTF-8?q?erge=20pull=20request=20!21465=20from=20=E7=8E=8B=E8=B6=85/v2.7?= =?UTF-8?q?.0=5Fhcclstep?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- third_party/acl/inc/acl/acl_base.h | 1 + torch_npu/csrc/core/npu/NPUException.cpp | 9 ++++++++- torch_npu/csrc/core/npu/NPUException.h | 3 +++ torch_npu/csrc/core/npu/NPUQueue.cpp | 6 +++++- torch_npu/csrc/core/npu/NPUQueue.h | 1 + torch_npu/csrc/framework/OpParamMaker.cpp | 4 ++-- 6 files changed, 20 insertions(+), 4 deletions(-) diff --git a/third_party/acl/inc/acl/acl_base.h b/third_party/acl/inc/acl/acl_base.h index cbcf87b0fc..b8ef9dbd34 100755 --- a/third_party/acl/inc/acl/acl_base.h +++ b/third_party/acl/inc/acl/acl_base.h @@ -138,6 +138,7 @@ static const int ACL_ERROR_RT_DEVICE_MEM_ERROR = 507053; static const int ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR = 507054; static const int ACL_ERROR_RT_SUSPECT_DEVICE_MEM_ERROR = 507055; static const int ACL_ERROR_RT_LINK_ERROR = 507056; +static const int ACL_ERROR_RT_COMM_OP_RETRY_FAIL = 507904; #define ACL_TENSOR_SHAPE_RANGE_NUM 2 #define ACL_TENSOR_VALUE_RANGE_NUM 2 diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index 034726549b..ab139f53b4 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -84,7 
+84,8 @@ std::unordered_map> errCodeHandlerMap = { {ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR, std::bind(&handleHbmMultiBitEccError, std::placeholders::_1)}, {ACL_ERROR_RT_DEVICE_MEM_ERROR, std::bind(&handleDeviceMemError, std::placeholders::_1)}, {ACL_ERROR_RT_SUSPECT_DEVICE_MEM_ERROR, std::bind(&handleSuspectDeviceMemError, std::placeholders::_1)}, - {ACL_ERROR_RT_LINK_ERROR, std::bind(&handleLinkError, std::placeholders::_1)} + {ACL_ERROR_RT_LINK_ERROR, std::bind(&handleLinkError, std::placeholders::_1)}, + {ACL_ERROR_RT_COMM_OP_RETRY_FAIL, std::bind(&handleHcclOpRetryFailed, std::placeholders::_1)} }; MemUceInfo memUceInfo; @@ -244,6 +245,12 @@ std::string handleLinkError(int errorCode) return "HCCS LINK ERROR"; } +std::string handleHcclOpRetryFailed(int errorCode) +{ + ASCEND_LOGE("getRepoStopFlag in Run, throw HCCL OP RETRY FAILED."); + return "HCCL OP RETRY FAILED"; +} + std::string handleDeviceError(int errorCode) { auto handlerIter = errCodeHandlerMap.find(errorCode); diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h index f144490e5a..1ba6495f56 100644 --- a/torch_npu/csrc/core/npu/NPUException.h +++ b/torch_npu/csrc/core/npu/NPUException.h @@ -96,6 +96,7 @@ std::string formatErrorCode(SubModule submodule, ErrCode errorCode); #define DEVICE_HBM_ECC_ERROR "reason=[hbm Multi-bit ECC error]" #define SUSPECT_DEVICE_MEM_ERROR "reason=[suspect device mem error]" #define HCCS_LINK_ERROR "reason=[link error]" +#define HCCL_OP_RETRY_FAILED "reason=[hccl op retry failed]" inline const char* getErrorFunction(const char* msg) { @@ -275,6 +276,8 @@ std::string handleSuspectDeviceMemError(int errorCode); std::string handleLinkError(int errorCode); +std::string handleHcclOpRetryFailed(int errorCode); + std::string handleDeviceError(int errorCode); } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 48b83d9720..7767dda6b8 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -174,7 +174,8 @@ std::unordered_map deviceErrorMap = { {RepoStatus::HBM_ECC_EXIT, "HBM MULTI BIT ECC ERROR"}, {RepoStatus::STOP_EXIT, "FORCE STOP"}, {RepoStatus::SUSPECT_MEM_EXIT, "SUSPECT MEM ERROR"}, - {RepoStatus::HCCS_LINK_EXIT, "HCCS LINK ERROR"} + {RepoStatus::HCCS_LINK_EXIT, "HCCS LINK ERROR"}, + {RepoStatus::HCCL_OP_RETRY_EXIT, "HCCL OP RETRY FAILED"} }; std::string get_func_error_msg(void *error_paras) @@ -376,6 +377,9 @@ void Repository::CheckDeviceError(int ret, std::string& err_msg) } else if (ret == ACL_ERROR_RT_LINK_ERROR || acl_error.find(HCCS_LINK_ERROR) != std::string::npos) { ASCEND_LOGE("HCCS LINK ERROR happened, set task queue status to HCCS_LINK_EXIT"); SetStatus(HCCS_LINK_EXIT); + } else if (ret == ACL_ERROR_RT_COMM_OP_RETRY_FAIL || acl_error.find(HCCL_OP_RETRY_FAILED) != std::string::npos) { + ASCEND_LOGE("HCCL OP RETRY FAILED happened, set task queue status to HCCL_OP_RETRY_EXIT"); + SetStatus(HCCL_OP_RETRY_EXIT); } else if (GetStatus() != STOP_EXIT) { SetStatus(ERROR_EXIT); } diff --git a/torch_npu/csrc/core/npu/NPUQueue.h b/torch_npu/csrc/core/npu/NPUQueue.h index 460a3cb755..0ef5609040 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.h +++ b/torch_npu/csrc/core/npu/NPUQueue.h @@ -27,6 +27,7 @@ enum RepoStatus { HBM_ECC_EXIT = 7, SUSPECT_MEM_EXIT = 8, HCCS_LINK_EXIT = 9, + HCCL_OP_RETRY_EXIT = 10, }; // c10::SmallVector max size diff --git a/torch_npu/csrc/framework/OpParamMaker.cpp b/torch_npu/csrc/framework/OpParamMaker.cpp index 6f88222c00..1766af9c99 
100644 --- a/torch_npu/csrc/framework/OpParamMaker.cpp +++ b/torch_npu/csrc/framework/OpParamMaker.cpp @@ -336,7 +336,7 @@ int ExecFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) ret = cur_paras->customHandler(); } catch (std::exception &e) { if (ContainsAny(std::string(e.what()), {DEVICE_TASK_ABORT, DEVICE_MEM_ERROR, DEVICE_HBM_ECC_ERROR, - SUSPECT_DEVICE_MEM_ERROR, HCCS_LINK_ERROR})) { + SUSPECT_DEVICE_MEM_ERROR, HCCS_LINK_ERROR, HCCL_OP_RETRY_FAILED})) { ret = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); } else { ret = ACL_ERROR_INVALID_PARAM; @@ -422,7 +422,7 @@ int ExecFuncOpApi(c10_npu::queue::QueueParas *in, aclrtStream stream) ret = cur_paras->customHandler(); } catch (std::exception &e) { if (ContainsAny(std::string(e.what()), {DEVICE_TASK_ABORT, DEVICE_MEM_ERROR, DEVICE_HBM_ECC_ERROR, - SUSPECT_DEVICE_MEM_ERROR, HCCS_LINK_ERROR})) { + SUSPECT_DEVICE_MEM_ERROR, HCCS_LINK_ERROR, HCCL_OP_RETRY_FAILED})) { ret = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); } else { ret = ACL_ERROR_INVALID_PARAM; -- Gitee From fbf4b63f48fdfde5a5caea084b2798ffad735416 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E5=85=89=E6=B5=A9?= Date: Tue, 3 Jun 2025 08:37:32 +0000 Subject: [PATCH 017/328] =?UTF-8?q?!21477=20bugfix=20for=20filename=20Merg?= =?UTF-8?q?e=20pull=20request=20!21477=20from=20=E9=83=AD=E5=85=89?= =?UTF-8?q?=E6=B5=A9/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUException.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h index 1ba6495f56..a82f8f1568 100644 --- a/torch_npu/csrc/core/npu/NPUException.h +++ b/torch_npu/csrc/core/npu/NPUException.h @@ -146,7 +146,7 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) if (((error_code) == ACL_ERROR_RT_FEATURE_NOT_SUPPORT) && (device_error_msg.empty())) { \ static auto feature_not_support_warn_once = []() { \ printf("[WARN]%s,%s:%u:%s\n", \ - __FUNCTION__, __FILENAME__, __LINE__, \ + __FUNCTION__, __FILE__, __LINE__, \ "Feature is not supportted and the possible cause is" \ " that driver and firmware packages do not match."); \ return true; \ -- Gitee From 7afed37874bbbffb85f98d3a2ae4f7339228e9e4 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 3 Jun 2025 08:58:00 +0000 Subject: [PATCH 018/328] !21479 Update op_plugin commit id Merge pull request !21479 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 3f764c5c7c..44a68f5e93 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 3f764c5c7cd1c0641fdc27ef3eddb4463a7a7397 +Subproject commit 44a68f5e93176fa9baf51326c485164c90fb3b6f -- Gitee From 4a886250de56553f97be7513ccbd3ed4093ab705 Mon Sep 17 00:00:00 2001 From: Mingkai Chan Date: Tue, 3 Jun 2025 11:47:35 +0000 Subject: [PATCH 019/328] !21469 Add Unittest for graph_tree Merge pull request !21469 from Mingkai Chan/master --- test/npu/test_graph_tree.py | 668 ++++++++++++++++++++++++++++++++++++ 1 file changed, 668 insertions(+) create mode 100644 test/npu/test_graph_tree.py diff --git a/test/npu/test_graph_tree.py b/test/npu/test_graph_tree.py new file mode 100644 index 0000000000..b278c02045 --- /dev/null +++ b/test/npu/test_graph_tree.py @@ -0,0 +1,668 @@ +from unittest.mock import patch, MagicMock, call, ANY +import weakref +import pytest 
+import torch +import torch_npu +from torch_npu.npu._graph_tree import ( + clear_cublass_cache, + clear_cublas_manager, + disable_conv_cache_emptying, + enable_history_recording, + npugraphify, + npugraphify_impl, + TreeManagerContainer, + StorageWeakRefWrapper, + NPUWarmupNode, + CompilationMode, + get_container, + get_manager, + reset_npugraph_trees, + local, + OutputAliasInfo, + UnaliasedStorage, + AliasesPriorGraphOutput, + AliasesNewOutput, + NPUGraphNode, + WrappedFunction, +) +from torch_npu.testing.testcase import TestCase, run_tests + +device = "npu:0" +torch.npu.set_device(device) + + +class TestCublasCacheManagement(TestCase): + @patch("torch_npu.npu._graph_tree.clear_cublass_cache") + def test_clear_cublas_manager_context(self, mock_clear): + with clear_cublas_manager(): + mock_clear.assert_called_once() + mock_clear.reset_mock() + mock_clear.assert_called_once() + + +class TestDisableConvCache(TestCase): + def test_disable_conv_cache_emptying(self): + with disable_conv_cache_emptying(): + pass # No operation, just ensure no exceptions + + +class TestHistoryRecording(TestCase): + @patch("torch.npu.memory._record_memory_history") + def test_enable_history_recording(self, mock_record): + original_state = torch_npu._C._npu_isHistoryEnabled() + with enable_history_recording(): + if not original_state: + mock_record.assert_called_once() + else: + mock_record.assert_not_called() + mock_record.assert_any_call(None) + + +class TestNpuGraphFunctions(TestCase): + def setUp(self): + # Reset global state before each test + reset_npugraph_trees() + + @patch("torch_npu.npu._graph_tree.TreeManagerContainer") + def test_get_manager(self, mock_container): + # Test manager creation + mock_container.return_value.get_tree_manager.return_value = "mock_manager" + manager = get_manager(0) + self.assertEqual(manager, "mock_manager") + + # Test no-creation path + manager = get_manager(0, create_if_none_exists=False) + mock_container.return_value.get_tree_manager.assert_called_once() + + @patch("torch_npu.npu._graph_tree.npugraphify") + @patch("torch._inductor.compile_fx.align_inputs_from_check_idxs") + def test_npugraphify_impl(self, mock_align, mock_npugraphify): + # Setup mock model and inputs + mock_model = MagicMock() + inputs = [1, torch.tensor([2]), 3] + static_idxs = (1,) + + # Test caching behavior + impl = npugraphify_impl(mock_model, inputs, static_idxs) + + # First call + mock_npugraphify.return_value = (lambda x: "output1", "output1") + result = impl(inputs) + self.assertEqual(result, "output1") + + # Second call with same int keys + result = impl(inputs) + self.assertEqual(result, "output1") + mock_npugraphify.assert_called_once() + + @patch("torch_npu.npu._graph_tree.get_container") + def test_npugraphify(self, mock_container): + # Setup mock manager + mock_manager = MagicMock() + mock_container.return_value.get_tree_manager.return_value = mock_manager + + # Test valid mode combinations + model = MagicMock() + inputs = [torch.tensor([1])] + + # Test forward mode + npugraphify( + model, inputs, (), device_index=0, is_backward=False, is_inference=False + ) + mock_manager.add_function.assert_called_with( + model, inputs, (), None, CompilationMode.FORWARD, () + ) + + # Test backward mode + mock_manager.reset_mock() + npugraphify( + model, inputs, (), device_index=0, is_backward=True, is_inference=False + ) + mock_manager.add_function.assert_called_with( + model, inputs, (), None, CompilationMode.BACKWARD, () + ) + + # Test invalid mode combination + with self.assertRaises(RuntimeError): + 
npugraphify( + model, inputs, (), device_index=0, is_backward=True, is_inference=True + ) + + +class TestTreeManagerContainer(TestCase): + def setUp(self): + self.container = TreeManagerContainer(0) + + def test_initial_state(self): + self.assertIsNone(self.container.tree_manager) + self.assertEqual(self.container.live_npugraphify_fns, 0) + + def test_add_strong_reference(self): + self.container.add_strong_reference(lambda: None) + # Simulate finalization of fn + finalizer = weakref.finalize( + lambda: None, + self.container.finalize_npugraphify_fn, # Object to monitor # Callback + ) + finalizer.atexit = False # Prevent finalizer from running at exit + + # Simulate finalization + finalizer() + # If all references are gone, tree_manager should be None + self.container._finalize_tree_manager = MagicMock() + self.container._finalize_tree_manager() + self.container._finalize_tree_manager.assert_called_once() + + def test_get_tree_manager(self): + with patch("torch_npu.npu.graphs.NPUGraph.capture_begin"), patch( + "torch_npu.npu.graphs.NPUGraph.capture_end" + ): + manager = self.container.get_tree_manager() + self.assertIsNotNone(manager) + self.assertIs(manager, self.container.get_tree_manager()) # Same instance + + +class TestStorageWeakRefWrapper(TestCase): + def test_storage_ref(self): + tensor = torch.tensor([1], device="npu") + wrapper = StorageWeakRefWrapper(tensor) + self.assertEqual(wrapper.data_ptr(), tensor.untyped_storage().data_ptr()) + del tensor + # Storage might still be alive due to Python's ref counting; force GC + import gc + + gc.collect() + self.assertTrue(wrapper.expired()) + + +class TestNPUWarmupNode(TestCase): + @patch("torch_npu.npu._graph_tree.StorageWeakRefWrapper") + @patch("torch_npu.npu._graph_tree.check_memory_pool") + def test_run_captures_outputs(self, mock_check, mock_wrapper): + mock_model = MagicMock(return_value=[torch.tensor([2], device="npu")]) + wrapped_fn = MagicMock(model=mock_model, constants=[]) + stream = torch.npu.Stream() + node = NPUWarmupNode( + wrapped_fn, + parent=None, + npu_graphs_pool=(0, 0), + existing_npu_graph=None, + device_index=0, + stack_traces=None, + stream=stream, + already_warm=False, + ) + outputs = node.run([]) + self.assertEqual(len(node.outputs_weakrefs), 1) + + +class TestTreeManagerIntegration(TestCase): + def test_get_container_singleton_per_device(self): + container1 = get_container(0) + container2 = get_container(0) + self.assertIs(container1, container2) + container3 = get_container(1) + self.assertIsNot(container1, container3) + + def test_reset_npugraph_trees(self): + get_container(0) # Initialize a container + reset_npugraph_trees() + container_dict = getattr(local, "tree_manager_containers", {}) + self.assertEqual(len(container_dict), 0) + + +@pytest.fixture +def mock_wrapped_function(): + def model_side_effect(inputs): + # Clear inputs list while preserving reference + inputs[:] = [] + return [] + + return MagicMock( + spec=WrappedFunction, + static_input_idxs=[0], + constants=[], + model=MagicMock(side_effect=model_side_effect), + ) + + +@pytest.fixture +def mock_parent_node(): + parent = MagicMock(spec=NPUGraphNode) + parent.outputs_weakrefs = [] + parent.path_weakrefs = [] + parent.parent = None + parent.stack_traces = [] + parent.recorded_liveness_after_graph = [] + return parent + + +@pytest.fixture +def basic_npu_graph_node(mock_wrapped_function, mock_parent_node): + with patch("torch_npu.npu._graph_tree._use_npu_memory_pool_manager"), patch( + "torch_npu.npu._graph_tree.check_memory_pool" + ), 
patch("torch_npu._C._npu_getCheckpointState"): + return NPUGraphNode( + wrapped_function=mock_wrapped_function, + graph_id=1, + parent=mock_parent_node, + inputs=[torch.tensor([1.0], device="npu")], + npu_graphs_pool=(0, 0), + device_index=0, + stack_traces=None, + stream=torch.npu.Stream(), + ) + + +class TestOutputAliasInfo: + def test_aliases_prior_graph_output_validation(self): + with pytest.raises(RuntimeError): + AliasesPriorGraphOutput("invalid_index") + + def test_aliases_new_output_validation(self): + with pytest.raises(RuntimeError): + AliasesNewOutput("not_an_int") + + +class TestNPUGraphNode: + def tearDown(self): + torch_npu._C._npu_endAllocateCurrentStreamToPool(0, (0, 0)) + torch_npu._C._npu_releasePool(0, (0, 0)) + + def test_initialization(self, mock_wrapped_function, mock_parent_node): + inputs = [torch.tensor([1.0], device="npu")] + with patch("torch_npu.npu._graph_tree._use_npu_memory_pool_manager"), patch( + "torch_npu.npu._graph_tree.check_memory_pool" + ), patch("torch_npu._C._npu_getCheckpointState"): + node = NPUGraphNode( + wrapped_function=mock_wrapped_function, + graph_id=1, + parent=mock_parent_node, + inputs=inputs, + npu_graphs_pool=(0, 0), + device_index=0, + stack_traces=None, + stream=torch.npu.Stream(), + ) + + assert node.id == 1 + assert node.device == 0 + assert node.parent == mock_parent_node + assert node.graph is not None + + def test_invalid_input_type(self, mock_wrapped_function): + with pytest.raises(RuntimeError): + NPUGraphNode( + wrapped_function=mock_wrapped_function, + graph_id=1, + parent=None, + inputs="not_a_list", + npu_graphs_pool=(0, 0), + device_index=0, + stack_traces=None, + stream=torch.npu.Stream(), + ) + + @patch("torch_npu.npu._graph_tree.check_memory_pool") + def test_record_method(self, mock_check, basic_npu_graph_node): + def model_side_effect(inputs): + # Clear inputs list while preserving reference + inputs[:] = [] + return [] + + mock_model = MagicMock(side_effect=model_side_effect) + mock_inputs = [torch.tensor([1.0], device="npu")] + + with patch("torch_npu.npu._graph_tree.clear_cublas_manager"), patch( + "torch_npu.npu._graph_tree.get_history_recording" + ), patch("torch_npu.npu.graphs.NPUGraph.capture_begin"), patch( + "torch_npu.npu.graphs.NPUGraph.capture_end" + ), patch( + "torch_npu._C._npu_getCheckpointState" + ), patch( + "torch._dynamo.utils.preserve_rng_state" + ): + + outputs = basic_npu_graph_node._record(mock_model, mock_inputs) + + mock_model.assert_called_once_with(mock_inputs) + assert basic_npu_graph_node.recording_outputs == outputs + + def test_reconstruct_outputs(self, basic_npu_graph_node): + # Setup mock metadata and storage info + basic_npu_graph_node.outputs_metadata = [ + { + "nbytes": 4, + "data_ptr": 1234, + "size": (1,), + "stride": (1,), + "dtype": torch.float32, + "device": "npu", + "storage_offset": 0, + } + ] + basic_npu_graph_node.output_weakrefs = [MagicMock()] + basic_npu_graph_node.output_storage_alias = [UnaliasedStorage] + basic_npu_graph_node.cached_tensor_outputs = [MagicMock()] + + with patch( + "torch_npu._C._construct_NPU_Tensor_From_Storage_And_Metadata" + ) as mock_construct: + outputs = basic_npu_graph_node.reconstruct_outputs() + assert len(outputs) == 1 + + def test_aliased_output_reconstruction(self, basic_npu_graph_node): + basic_npu_graph_node.outputs_metadata = [ + { + "nbytes": 4, + "data_ptr": 1234, + "size": (1,), + "stride": (1,), + "dtype": torch.float32, + "device": "npu", + "storage_offset": 0, + } + ] + basic_npu_graph_node.output_storage_alias = 
[AliasesPriorGraphOutput((0, 0))] + basic_npu_graph_node.outputs_weakrefs = [MagicMock()] + basic_npu_graph_node.cached_tensor_outputs = [MagicMock()] + + with patch("torch_npu.npu._graph_tree.maybe_deref") as mock_maybe_deref: + mock_maybe_deref.return_value = (MagicMock(), 1234) + outputs = basic_npu_graph_node.reconstruct_outputs() + assert len(outputs) == 1 + + def test_liveness_tracking(self, basic_npu_graph_node): + mock_ref = MagicMock() + basic_npu_graph_node.path_weakrefs = [[mock_ref]] + + with patch("torch_npu.npu._graph_tree.is_live") as mock_is_live: + mock_is_live.return_value = True + liveness = basic_npu_graph_node._get_liveness( + basic_npu_graph_node.path_weakrefs + ) + assert liveness == [[True]] + + def test_child_management(self, basic_npu_graph_node): + mock_child = MagicMock() + basic_npu_graph_node.add_child("test_func", mock_child) + assert "test_func" in basic_npu_graph_node.children + assert mock_child in basic_npu_graph_node.children["test_func"] + + def test_invalid_run_conditions(self, basic_npu_graph_node): + basic_npu_graph_node.graph = None + with pytest.raises(RuntimeError): + basic_npu_graph_node.run_graph() + + def test_storage_metadata_handling(self, basic_npu_graph_node): + tensor = torch.tensor([1.0], device="npu") + metadata = basic_npu_graph_node._tensor_metadata(tensor) + + assert metadata["data_ptr"] == tensor.untyped_storage().data_ptr() + assert metadata["size"] == tensor.shape + + @patch("torch.npu.synchronize") + @patch("torch_npu.npu._graph_tree._use_npu_memory_pool_manager") + def test_input_processing(self, mock_pool_manager, mock_sync, basic_npu_graph_node): + inputs = [torch.tensor([1.0], device="npu")] + processed = basic_npu_graph_node._allocate_and_copy_recording_inputs(inputs) + assert len(processed) == 1 + assert isinstance(processed[0], torch.Tensor) + + def test_check_invariants(self, basic_npu_graph_node): + mock_inputs = [torch.tensor([1.0], device="npu")] + basic_npu_graph_node.static_input_data_ptrs = [mock_inputs[0].data_ptr()] + basic_npu_graph_node.npugraph_managed_idxs = [0] + + assert basic_npu_graph_node.check_invariants(mock_inputs) + + def test_descendant_count(self, basic_npu_graph_node): + mock_child = MagicMock(num_descendants=lambda: 0) + basic_npu_graph_node.children["test"] = [mock_child] + assert basic_npu_graph_node.num_descendants() == 1 + + def test_prepare_alias_info_metadata_int(self, basic_npu_graph_node): + result = basic_npu_graph_node.prepare_alias_info_for_tensor_construction( + MagicMock(), 42 + ) + assert result is None + + def test_prepare_alias_info_unaliased_storage(self, basic_npu_graph_node): + result = basic_npu_graph_node.prepare_alias_info_for_tensor_construction( + UnaliasedStorage, {"meta": "data"} + ) + assert result is None + + def test_prepare_alias_info_aliases_prior_graph_valid(self, basic_npu_graph_node): + mock_ref = MagicMock() + basic_npu_graph_node.path_weakrefs = [[mock_ref, mock_ref]] + alias_info = AliasesPriorGraphOutput((0, 1)) + + with patch("torch.UntypedStorage._new_with_weak_ptr") as mock_new: + result = basic_npu_graph_node.prepare_alias_info_for_tensor_construction( + alias_info, {"meta": "data"} + ) + mock_new.assert_called_once_with(mock_ref()) + assert result == mock_new.return_value + + def test_prepare_alias_info_aliases_prior_graph_none_ref( + self, basic_npu_graph_node + ): + basic_npu_graph_node.path_weakrefs = [[None, None]] + alias_info = AliasesPriorGraphOutput((0, 1)) + + with pytest.raises(RuntimeError): + 
basic_npu_graph_node.prepare_alias_info_for_tensor_construction( + alias_info, {"meta": "data"} + ) + + def test_prepare_alias_info_aliases_new_output(self, basic_npu_graph_node): + alias_info = AliasesNewOutput(123) + result = basic_npu_graph_node.prepare_alias_info_for_tensor_construction( + alias_info, {"meta": "data"} + ) + assert result == 123 + + def test_prepare_alias_info_invalid_type(self, basic_npu_graph_node): + with pytest.raises(RuntimeError): + basic_npu_graph_node.prepare_alias_info_for_tensor_construction( + "invalid_type", {"meta": "data"} + ) + + # Tests for prepare_storages_for_construction + def test_prepare_storages_mixed_aliases(self, basic_npu_graph_node): + basic_npu_graph_node.output_storage_alias = [ + UnaliasedStorage, + AliasesNewOutput(123), + AliasesPriorGraphOutput((0, 1)), + ] + basic_npu_graph_node.outputs_metadata = [None, {}, {}] + basic_npu_graph_node.path_weakrefs = [[None, MagicMock(), MagicMock()]] + + with patch("torch.UntypedStorage._new_with_weak_ptr"): + results = basic_npu_graph_node.prepare_storages_for_construction() + + assert len(results) == 3 + assert results[0] is None + assert results[1] == 123 + + # Tests for debug_assert_invariants + def test_debug_assert_invariants_valid(self, basic_npu_graph_node): + from torch._inductor import config + + config.triton.fast_path_cudagraph_asserts = True + expected_liveness = [[], [True, False]] + newly_dead = [(1, 1)] + ref = MagicMock(return_value=None) + basic_npu_graph_node.outputs_weakrefs = [None, ref] + basic_npu_graph_node.parent.outputs_weakrefs = [] + basic_npu_graph_node.path_weakrefs = [ + basic_npu_graph_node.parent.outputs_weakrefs, + basic_npu_graph_node.outputs_weakrefs, + ] + + # Should not raise + with patch("torch_npu.npu._graph_tree.get_block_addrs"): + basic_npu_graph_node.debug_assert_invariants(expected_liveness, newly_dead) + config.triton.fast_path_cudagraph_asserts = False + + def test_debug_assert_invariants_dead_ref_alive(self, basic_npu_graph_node): + from torch._inductor import config + + config.triton.fast_path_cudagraph_asserts = True + expected_liveness = [[False]] + newly_dead = [(0, 0)] + basic_npu_graph_node.path_weakrefs = [ + [MagicMock(return_value=("ptr", 123))] + ] # Live ref + + with pytest.raises(RuntimeError): + basic_npu_graph_node.debug_assert_invariants(expected_liveness, newly_dead) + config.triton.fast_path_cudagraph_asserts = False + + # Tests for _initialize_cached_tensors + def test_initialize_cached_tensors_valid(self, basic_npu_graph_node): + basic_npu_graph_node.output_storage_alias = [UnaliasedStorage, UnaliasedStorage] + basic_npu_graph_node.outputs_metadata = [ + {"dtype": torch.float}, + {"dtype": torch.int}, + ] + basic_npu_graph_node.unaliased_in_all_paths = [True, False] + basic_npu_graph_node.outputs_weakrefs = [None, None] + + with patch.object(basic_npu_graph_node, "create_storage"), patch( + "torch_npu._C._add_cached_tensor" + ), patch.object( + basic_npu_graph_node, "_reconstruct_from_tensor_metadata" + ) as mock_reconstruct: + + mock_reconstruct.return_value = torch.tensor([1.0], device="npu:0") + basic_npu_graph_node._initialize_cached_tensors() + + assert len(basic_npu_graph_node.cached_tensor_outputs) == 2 + assert basic_npu_graph_node.cached_tensor_outputs[0] is not None + assert len(basic_npu_graph_node.outputs_weakrefs) == 2 + + def test_initialize_cached_tensors_invalid_storage_info(self, basic_npu_graph_node): + basic_npu_graph_node.output_storage_alias = ["invalid"] + basic_npu_graph_node.unaliased_in_all_paths = [True] + + 
basic_npu_graph_node._initialize_cached_tensors() + + +@patch("torch_npu.npu.graphs.NPUGraph.replay") +@patch("torch_npu.npu._graph_tree.check_memory_pool") +@patch("torch_npu.npu._graph_tree._use_npu_memory_pool_manager") +class TestNPUGraphNodeRun(TestCase): + def setUp(self): + """Initialize common test components and configurations""" + self.device = "npu:0" + + def model_side_effect(inputs): + # Clear inputs list while preserving reference + inputs[:] = [] + return [] + + self.wrapped_function = MagicMock( + spec=WrappedFunction, + static_input_idxs=[0], + constants=[], + model=MagicMock(side_effect=model_side_effect), + ) + self.graph_id = 1 + self.npu_graphs_pool = (0, 0) + self.stream = torch.npu.Stream(device=self.device) + + # Create test tensors + self.static_input = torch.randn( + 3, 3, device=self.device + ) # Static input (parameter-like) + self.dynamic_input = torch.randn(2, 2, device=self.device) # Dynamic input + + def _create_node(self, inputs, parent=None): + """Helper to create NPUGraphNode instance""" + with patch("torch_npu._C._npu_getCheckpointState"), patch( + "torch_npu.npu.graphs.NPUGraph.capture_begin" + ), patch("torch_npu.npu.graphs.NPUGraph.capture_end"): + return NPUGraphNode( + wrapped_function=self.wrapped_function, + graph_id=self.graph_id, + parent=parent, + inputs=inputs, + npu_graphs_pool=self.npu_graphs_pool, + device_index=0, + stack_traces=None, + stream=self.stream, + ) + + @patch.object(NPUGraphNode, "run_graph") + def test_static_input_optimization( + self, mock_run_graph, mock_pool, mock_check, mock_replay + ): + """Verify static inputs bypass copy operations""" + # Mark all inputs as static + self.wrapped_function.static_input_idxs = [0, 1] + node = self._create_node([self.static_input, self.static_input.clone()]) + + # Execute with cloned inputs + node.run([self.static_input.clone(), self.static_input.clone()]) + + # Validate no copy operations occurred + self.assertEqual(mock_run_graph.call_count, 1) + + def test_input_validation_mechanism(self, mock_pool, mock_check, mock_replay): + """Ensure input length validation works correctly""" + node = self._create_node([self.static_input]) + + # Test invalid input length + with self.assertRaisesRegex(RuntimeError, "check len"): + node.run([1, 2, 3]) # Invalid input count + + @patch.object(NPUGraphNode, "reconstruct_outputs") + def test_output_reconstruction_flow( + self, mock_reconstruct, mock_pool, mock_check, mock_replay + ): + """Test full output reconstruction pipeline""" + # Configure mock reconstruction + expected_output = torch.tensor([1.0], device=self.device) + mock_reconstruct.return_value = [expected_output] + + node = self._create_node([self.static_input]) + outputs = node.run([self.static_input.clone()]) + + # Validate outputs + self.assertEqual(outputs, [expected_output]) + mock_reconstruct.assert_called_once() + + @patch("torch._foreach_copy_") + def test_batched_copy_optimization( + self, mock_batched_copy, mock_pool, mock_check, mock_replay + ): + """Verify batched copy operations for efficiency""" + # Configure multiple dynamic inputs + self.wrapped_function.static_input_idxs = [] + inputs = [torch.randn(2, 2, device=self.device) for _ in range(3)] + new_inputs = [t.clone() for t in inputs] + node = self._create_node(inputs) + + # Execute with new inputs + node.run(new_inputs) + + # Validate single batched copy call + mock_batched_copy.assert_called_once() + args, _ = mock_batched_copy.call_args + self.assertEqual(len(args[0]), 3) + + def test_memory_cleanup_after_execution(self, 
mock_pool, mock_check, mock_replay): + """Validate input list cleanup post-execution""" + initial_inputs = [self.static_input.clone(), self.dynamic_input.clone()] + input_copy = [t.clone() for t in initial_inputs] + node = self._create_node(initial_inputs) + + # Execute and verify cleanup + node.run(input_copy) + self.assertEqual(len(input_copy), 0) + + +if __name__ == "__main__": + run_tests() -- Gitee From 72852f8dba084f66cd1681dec52d0a87ee22c3a1 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Tue, 3 Jun 2025 13:28:17 +0000 Subject: [PATCH 020/328] !21435 add flight recorder Merge pull request !21435 from huangyunlong/2.7ft2 --- torch_npu/csrc/distributed/HCCLUtils.hpp | 23 + torch_npu/csrc/distributed/Init.cpp | 23 + .../csrc/distributed/ProcessGroupHCCL.cpp | 624 +++++++++++++++++- .../csrc/distributed/ProcessGroupHCCL.hpp | 268 +++++++- torch_npu/csrc/distributed/TraceUtils.h | 58 +- 5 files changed, 943 insertions(+), 53 deletions(-) diff --git a/torch_npu/csrc/distributed/HCCLUtils.hpp b/torch_npu/csrc/distributed/HCCLUtils.hpp index c851b9bcaf..12d6d87c26 100644 --- a/torch_npu/csrc/distributed/HCCLUtils.hpp +++ b/torch_npu/csrc/distributed/HCCLUtils.hpp @@ -136,4 +136,27 @@ protected: mutable std::mutex mutex_; HcclResult hcclAsyncErr_; }; + +class TORCH_API DebugInfoWriter { +public: + virtual ~DebugInfoWriter(); + virtual void write(const std::string &hcclTrace); + static DebugInfoWriter &getWriter(int rank); + static void registerWriter(std::unique_ptr writer); + virtual std::string getWriterTarget() + { + return filename_; + } + +protected: + DebugInfoWriter(std::string namePrefix, int rank) + { + filename_ = c10::str(namePrefix, rank); + } + std::string filename_; + +private: + static std::unique_ptr writer_; + static std::atomic hasWriterRegistered_; +}; } // namespace c10d_npu diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 2f23352079..1df6943f62 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -535,6 +535,29 @@ Example:: py::arg("timeout") = std::chrono::milliseconds(300000), py::arg("wait_workers") = true, py::arg("multi_tenant") = false); + + module.def("_dump_hccl_trace", + [](std::optional includeCollectives, + std::optional includeStackTraces, + std::optional onlyActive) { + return py::bytes(::c10d_npu::dump_hccl_trace( + includeCollectives.value_or(true), + includeStackTraces.value_or(true), + onlyActive.value_or(false))); + }, + py::arg("includeCollectives") = std::optional(), + py::arg("includeStackTraces") = std::optional(), + py::arg("onlyActive") = std::optional(), + R"( + Arguments: + includeCollectives(bool, optional): Whether to include collective work traces. Default is True. + includeStackTraces(bool, optional): Whether to include stacktraces in the collective work traces. Default is True. + onlyActive (bool, optional): Whether to only include active collective work traces. Default is False. + Returns: + Stringified pickle work traces. + Default settings return everything - i.e. contains HCCL comm dumps and collective traces. 
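        A minimal usage sketch (the binding module path and decoding the returned
        bytes with pickle are assumptions, not spelled out in this patch):

        Example::

            >>> import pickle
            >>> import torch_npu
            >>> raw = torch_npu._C._distributed_c10d._dump_hccl_trace(
            ...     includeCollectives=True, includeStackTraces=True, onlyActive=False)
            >>> trace = pickle.loads(raw)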
+ )"); + Py_RETURN_TRUE; } diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 76c7718d55..9b584b35a2 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -10,11 +10,16 @@ #include #include +#include +#include +#include + #include #include #include #include #include +#include #include @@ -37,6 +42,7 @@ #include "torch_npu/csrc/core/npu/interface/OpInterface.h" #include "torch_npu/csrc/distributed/HCCLUtils.hpp" #include "torch_npu/csrc/distributed/HcclCompile.h" +#include "torch_npu/csrc/distributed/TraceUtils.h" #include "torch_npu/csrc/toolkit/profiler/common/utils.h" #include "torch_npu/csrc/framework/OpHook.h" #include "torch_npu/csrc/framework/FormatHelper.h" @@ -45,6 +51,9 @@ #include "torch_npu/csrc/logging/LogContext.h" #include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp" +namespace py = pybind11; +using namespace py::literals; + namespace c10d_npu { namespace { static constexpr uint32_t kOpWaitTimeoutOffset = 30U; // second @@ -406,6 +415,51 @@ int ProcessGroupHCCL::numRanks_ = -1; std::string ProcessGroupHCCL::exceptionMessage_ = ""; std::shared_ptr logger = npu_logging::logging().getLogger("torch.distributed"); +std::atomic ProcessGroupHCCL::shouldDump_(false); + +std::string dump_hccl_trace( + bool includeCollectives, + bool includeStackTraces, + bool onlyActive) +{ + return HCCLTraceBuffer::get()->dump( + c10::nullopt, includeCollectives, includeStackTraces, onlyActive); +} + +c10::optional)>> &get_cpp_trace_dumper() +{ + static c10::optional< + std::function)>> + dumper(c10::nullopt); + return dumper; +} + +gil_checker_t &get_gil_checker() +{ + static gil_checker_t gil_checker = nullptr; + return gil_checker; +} + +std::future launchAsyncGilCheck() +{ + std::promise resultPromise; + std::future resultFuture = resultPromise.get_future(); + TORCH_CHECK(get_gil_checker(), "Can't check GIL with null GIL checker"); + std::thread workerThread([promise = std::move(resultPromise)]() mutable { + try { + auto& gil_checker = get_gil_checker(); + promise.set_value((*gil_checker)()); + } catch (...) 
{ + promise.set_exception(std::current_exception()); + } + }); + + // Detach the thread to allow it to run independently + workerThread.detach(); + + return resultFuture; +} + std::ostream& operator<<(std::ostream& output, const ProcessGroupHCCL::WorkHCCL& workHCCL) { std::string workInfo = c10::str( @@ -460,7 +514,8 @@ ProcessGroupHCCL::WorkHCCL::WorkHCCL(const WorkHCCL& w) startTraceUpdated_(w.startTraceUpdated_), store_(w.store_), is_dispatched(w.is_dispatched), - is_reported(w.is_reported) + is_reported(w.is_reported), + trace_id_(w.trace_id_) { exception_ = w.exception_; } @@ -786,6 +841,8 @@ std::vector ProcessGroupHCCL::WorkHCCL::result() return *outputs_; } +static std::atomic process_group_id = 0; + ProcessGroupHCCL::ProcessGroupHCCL( const c10::intrusive_ptr& store, int rank, @@ -797,7 +854,10 @@ ProcessGroupHCCL::ProcessGroupHCCL( hcclCommCounter_(0), traceKeyStart_("HCCL_" + std::to_string(rank) + "_trace_start"), traceKeyEnd_("HCCL_" + std::to_string(rank) + "_trace_end"), - terminateProcessGroup_(false) + terminateProcessGroup_(false), + terminateHeartbeatMonitorThread_(false), + collectiveDebugInfoMode_(false), + uid_(process_group_id++) { std::string groupName = "group_name_" + options->group_id; this->setGroupName(groupName); @@ -836,6 +896,23 @@ ProcessGroupHCCL::ProcessGroupHCCL( ASCEND_LOGI("Set op wait timeout to %u.", kOpWaitTimeout); NPU_CHECK_ERROR(c10_npu::acl::AclrtSetOpWaitTimeout(kOpWaitTimeout)); const char* blockingWait = getenv(HCCL_BLOCKING_WAIT); + + logPrefix_ = createLogPrefix(); + dumpOnException_ = c10d::getCvarBool(TORCH_HCCL_DUMP_ON_TIMEOUT, false); + heartbeat_ = 1ULL; + monitorThreadEnabled_.store(c10d::getCvarBool(TORCH_HCCL_ENABLE_MONITORING, false)); + heartbeatTimeoutInSec_ = c10d::getCvarInt(TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC, 60 * 10); // 10 Mins + waitTimeoutDumpInMilSec_ = c10d::getCvarInt(TORCH_HCCL_WAIT_TIMEOUT_DUMP_MILSEC, 60 * 1000); // 60 Sec + coordCheckIntervalMilSec_ = c10d::getCvarInt(TORCH_HCCL_COORD_CHECK_MILSEC, 1000); + hcclTraceBufferSize_ = c10d::getCvarInt(TORCH_HCCL_TRACE_BUFFER_SIZE, 0); + + // store_ usually is wrapped with PrefixStore and the prefix is different + // across different ProcessGroupNCCL(PG) instances. We need to get the + // underlying non-PrefixStore for sharing global information shared across + // different PGs. + c10d::PrefixStore *prefixStore = dynamic_cast(store_.get()); + globalStore_ = prefixStore ? 
prefixStore->getUnderlyingNonPrefixStore() : store_; + try { if (blockingWait != nullptr) { auto val = std::stoi(blockingWait); @@ -974,10 +1051,73 @@ void abortCommsFromMap( } // Abort all communicators on this rank -void ProcessGroupHCCL::abort(c10::optional abortReason) +bool ProcessGroupHCCL::abort(c10::optional abortReason) { std::lock_guard lock(mutex_); abortCommsFromMap(devHCCLCommMap_, rank_, abortReason); + return true; +} + +void ProcessGroupHCCL::waitForFutureOrTimeout( + std::future &fut, + const std::chrono::milliseconds &timeOutMilSec, + const std::string &futDescription, + bool throwException) +{ + std::string errorMsg; + TORCH_CHECK(fut.valid(), "Expected a valid future"); + std::future_status status = fut.wait_for(timeOutMilSec); + if (status == std::future_status::ready) { + // Calling .get() will re-raise any exception from the future, and we don't + // care about the retval + try { + bool result = fut.get(); + if (result) { + LOG(INFO) << logPrefix() + << "future is successfully executed for: " << futDescription; + } + } catch (const std::exception &e) { + errorMsg = c10::str( + logPrefix(), + "Exception thrown when waitng for future ", + futDescription, + ": ", + e.what()); + LOG(ERROR) << errorMsg; + } catch (...) { + errorMsg = c10::str( + logPrefix(), + "Unknown exception thrown when waitng for future ", + futDescription); + LOG(ERROR) << errorMsg; + } + } else { + errorMsg = c10::str( + logPrefix(), + "Future for ", + futDescription, + " timed out after ", + timeOutMilSec.count(), + " ms"); + LOG(ERROR) << errorMsg; + } + if (throwException && !errorMsg.empty()) { + C10_THROW_ERROR(DistBackendError, errorMsg); + } +} + +void ProcessGroupHCCL::shutdown(c10::optional reason) +{ + // Don't join threads here since the purpose of this method is to abort all + // communicators and signal the threads to exit. Joining on the threads could + // potentially block and hence avoid it in this method. + terminateProcessGroup_.store(true); + workMetaListCV_.notify_one(); + + // We need to wait for abort to finish before we can safely shut down + // heartbeat monitoring thread. 
+ terminateHeartbeatMonitorThread_.store(true); + monitorWakeUpCV_.notify_one(); } void ProcessGroupHCCL::deleteTCPStoreKey() @@ -1009,15 +1149,30 @@ void ProcessGroupHCCL::abortAndClearHcclComm(c10::optional abortRea ProcessGroupHCCL::~ProcessGroupHCCL() { + LOG(INFO) << logPrefix() << "ProcessGroupHCCL destructor entered."; + if (options_->global_ranks_in_group.empty()) { global_ = nullptr; } - terminateProcessGroup_.store(true); + if (!terminateProcessGroup_.load()) { + // If user haven't explicitly destroy/shutdown process group, destructor + // needs to do so + shutdown(); + } + + LOG(INFO) << logPrefix() << "ProcessGroupHCCL destructor entered."; - workMetaListCV_.notify_one(); #ifdef ENABLE_HCCL_ERROR_CHECKING - hcclCommWatchdogThread_.join(); + if (hcclCommWatchdogThread_.joinable()) { + hcclCommWatchdogThread_.join(); + LOG(INFO) << logPrefix() << "ProcessGroupHCCL watchdog thread joined."; + } + if (hcclHeartbeatMonitorThread_.joinable()) { + hcclHeartbeatMonitorThread_.join(); + LOG(INFO) << logPrefix() + << "ProcessGroupHCCL heart beat monitor thread joined."; + } #endif { // Destropy all HCCL Communicators on Process Group Destruction @@ -1035,11 +1190,372 @@ ProcessGroupHCCL::~ProcessGroupHCCL() logger->info("process group destroyed, group id is %s.", options_->group_id.c_str()); } +std::future ProcessGroupHCCL::launchAsyncPythonTracebackDump() +{ + std::promise resultPromise; + std::future resultFuture = resultPromise.get_future(); + std::thread workerThread([promise = std::move(resultPromise), this]() mutable { + try { + promise.set_value(this->dumpPythonTraceback()); + } catch (...) { + promise.set_exception(std::current_exception()); + } + }); + + // Detach the thread to allow it to run independently + workerThread.detach(); + + return resultFuture; +} + +bool ProcessGroupHCCL::dumpPythonTraceback() +{ + std::string filePath = c10d::getCvarString({"TORCH_HCCL_DEBUG_INFO_TEMP_FILE"}, "/tmp/hccl_trace_rank_"); + PyGILState_STATE gil = PyGILState_Ensure(); + try { + py::dict locals = py::dict("path"_a=filePath.c_str(), "rank"_a=rank_); + py::exec(R"( + import sys + import os + import traceback + import threading + from torch_npu.utils._path_manager import PathManager + try: + py_stacks = 'pid: {}\n'.format(os.getpid()) + threadInfos = {} + for thread in threading.enumerate(): + threadInfos[thread.ident] = thread + for thread_id, stack in sys._current_frames().items(): + stack_list = traceback.format_list(traceback.extract_stack(stack)) + py_stacks += 'thread {}:\n'.format(threadInfos[thread_id] if thread_id in threadInfos.keys() else thread_id) + py_stacks += ''.join(stack_list) + dump_file = '{path}{rank}_py_traceback'.format(**locals()) + PathManager.check_input_file_path(dump_file) + with open(dump_file, 'w') as f: + f.write(py_stacks) + except Exception as e: + print(e); + )", py::globals(), locals); + } catch (const std::exception& e) { + LOG(ERROR) << logPrefix() << "dumpPythonTraceback error: " << e.what(); + } catch (...) 
{ + LOG(ERROR) << logPrefix() << "dumpPythonTraceback Unknown exception type."; + } + PyGILState_Release(gil); + return true; +} + +bool ProcessGroupHCCL::dumpDebuggingInfo() +{ + auto fut = launchAsyncPythonTracebackDump(); + auto kGilCheckTimeout = std::chrono::milliseconds(3000); + auto futStatus = fut.wait_for(kGilCheckTimeout); + if (futStatus != std::future_status::ready) { + TORCH_CHECK( + futStatus != std::future_status::deferred, + "Expected the future of dumpping python traceback to have been launched eagerly."); + LOG(ERROR) + << "Could not acquire GIL within 3000 ms when dump python traceback, possible GIL induced hang"; + } + LOG(INFO) << "Could dump python traceback"; + + // Serialize all calls to this function to avoid corrupting data, but allow + // multiple calls in one runtime. User is responsible for preserving the + // output file from an earlier call before a later call overwrites it. + static std::mutex writeDebugInfoMutex; + std::lock_guard lock(writeDebugInfoMutex); + LOG(ERROR) << logPrefix() << "ProcessGroupHCCL preparing to dump debug info."; + if (hcclTraceBufferSize_ > 0) { + // We dump hccl trace into local disk by default and users can register + // their customized writer by inheriting `DebugInfoWriter` via + // `registerDebugInfoWriter`. + auto hcclTrace = dump_hccl_trace(true, true, false); + DebugInfoWriter &writer = DebugInfoWriter::getWriter(globalRank()); + LOG(ERROR) << logPrefix() << "ProcessGroupHCCL dumping hccl trace to " + << writer.getWriterTarget(); + writer.write(hcclTrace); + return true; + } + return false; +} + +void ProcessGroupHCCL::dumpTraceAndResetStatus() +{ + // Store debug info to storage if no other thread does it. (By default to + // local disk) + std::future asyncDebugDump = std::async( + std::launch::async, + [this]() { + return this->dumpDebuggingInfo(); + }); + + // wait for the dump until timeout + waitForFutureOrTimeout( + asyncDebugDump, + std::chrono::milliseconds(waitTimeoutDumpInMilSec_), + "Flight recorder dump in heartbeatMonitor"); + + // Increase heartbeat to avoid dump debug info frequently. + heartbeat_++; + shouldDump_.store(false); +} + +void ProcessGroupHCCL::terminateProcess(std::string errMsg) +{ + // Logging with `FATAL`, after errMsg printed, it calls `std::abort()` + // to terminate the program execution. + LOG(FATAL) << logPrefix() << errMsg; +} + +int computeDeltaMS( + std::chrono::time_point start, + std::chrono::time_point end) +{ + return std::chrono::duration_cast(end - start) + .count(); +} + +void ProcessGroupHCCL::heartbeatMonitor() +{ + uint64_t heartBeatCounter = 0ULL; + std::string errorMsg; + std::string exitMsg; + bool checkDumpSignal = (dumpOnException_ && options_->global_ranks_in_group.empty()); + int monitorPollInterval = checkDumpSignal ? coordCheckIntervalMilSec_ + : heartbeatTimeoutInSec_ * 1000; + auto lastTimePollStore = std::chrono::steady_clock::now(); + auto lastTimeHeartBeatCheck = std::chrono::steady_clock::now(); + c10::optional dumpPipe = c10::nullopt; + if (options_->global_ranks_in_group.empty()) { + // DumpPipe is one per-trainer process, and its convenient to name them + // after 'global' ranks in the system, So we assume processgroup options_->global_ranks_in_group.empty() is + // the global PG and has globally unique rank ids across trainers. + dumpPipe.emplace(rank_); + } + + while (true) { + // This won't have any lock since this lock is only used here. + // Please be aware that mutex `monitorMutex_` should not be used + // somewhere else to avoid the deadlock. 
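dumpDebuggingInfo above routes the serialized flight-recorder trace through DebugInfoWriter::getWriter(globalRank()), and its comment notes that a customized writer can be supplied by inheriting DebugInfoWriter and registering it via registerDebugInfoWriter. A rough sketch of such a writer follows; the override signature and the registration call are assumptions inferred from the call sites (writer.write(hcclTrace) with a std::string payload), not copied from the header:

    #include <fstream>
    #include <string>

    // Hypothetical writer that redirects dumps to a job-specific location.
    class RankLocalTraceWriter : public c10d_npu::DebugInfoWriter {
    public:
        explicit RankLocalTraceWriter(int rank)
            : path_("/job_logs/hccl_trace_rank_" + std::to_string(rank)) {}
        void write(const std::string& hcclTrace) /* override, assumed virtual */ {
            std::ofstream out(path_, std::ios::binary);  // payload is a pickled blob
            out << hcclTrace;
        }
    private:
        std::string path_;
    };

    // Assumed registration hook, named only in the comment above:
    // registerDebugInfoWriter(std::make_unique<RankLocalTraceWriter>(rank));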
+ std::unique_lock lock(monitorMutex_); + if (monitorWakeUpCV_.wait_for(lock, + std::chrono::milliseconds(monitorPollInterval), + [&]{ return terminateHeartbeatMonitorThread_.load(); })) { + // For the normal complete or user interception, monitorWakeUpCV_ + // will get notified, we early return and exit heartbeatMonitor. + return; + } + auto currentTime = std::chrono::steady_clock::now(); + + // We put extra functionality in the thread for the default PG (aka, options_->global_ranks_in_group.empty()) + // because the signal is same across different PGs. We only need to run + // once per process to avoid duplicate things performed in too many separate + // threads. For example, we check a global flag on the TCPStore periodically + // to see if any PG on any rank observed a timeout and signaled peers to + // dump debugging info, and we avoid hammering the TCPStore from all PGs on + // the same rank. + if (checkDumpSignal) { + // There are two scenarios where monitor thread will dump on timeout: + // 1. The local rank is the first to observe a timeout.shouldDump_ will be + // set to true. + // 2. other ranks detected the timeout and signal the local rank to dump + // In addtion, monitor threads will dump if watchdog threads has no + // heartbeat or dumpPipe is not empty. + if (shouldDump_.load()) { + errorMsg = c10::str( + logPrefix(), + "Received a dump signal from this local rank and will ", + "start to dump the debug info. ", + "Last enqueued HCCL work: ", + pgStatus_.lastEnqueuedSeq, + ", last completed HCCL work: ", + pgStatus_.lastCompletedSeq, + "."); + exitMsg = c10::str( + "ProcessGroupHCCL's watchdog detected an exception from the local rank. ", + "This is most likely caused by incorrect usages of collectives, e.g., wrong ", + "sizes used across ranks, the order of collectives is not same for all ranks ", + "or the scheduled collective, for some reason, didn't run. Additionally, ", + "this can be caused by GIL deadlock or other reasons such as network errors or ", + "bugs in the communications library (e.g. HCCL), etc. We tried our best to ", + "dump the debug info into the storage to help you debug the issue."); + dumpTraceAndResetStatus(); + } + // We poll store to see if some ranks have flagged a timeout when + // we haven't polled for `heartbeat_timeout` seconds and there haven't + // any work added or removed for `watchdog_timeout` seconds. + if (computeDeltaMS(lastWorkListUpdateTime_, currentTime) >= kWatchdogThreadSleepMillis && + computeDeltaMS(lastTimePollStore, currentTime) >= coordCheckIntervalMilSec_) { + lastTimePollStore = currentTime; + // Wrap globalStore_->check() in a try-catch block to avoid crashing if + // the store is not available. + bool checkExceptionDump = false; + try { + checkExceptionDump = + globalStore_->check({std::string(EXCEPTION_DUMP)}); + } catch (const std::exception &e) { + LOG(ERROR) + << logPrefix() + << "Failed to get exception dump flag from the global store." 
+ << e.what(); + dumpTraceAndResetStatus(); + } + if (checkExceptionDump) { + int timeOutRank = -1; + if (!shouldDump_.load()) { + LOG(ERROR) + << logPrefix() + << "First PG on this rank detecting the dump signal through tcpstore."; + } + shouldDump_.store(true); + try { + auto vec = globalStore_->get(std::string(EXCEPTION_DUMP)); + TORCH_CHECK_WITH( + DistBackendError, + vec.size() == sizeof(int), + "Invalid size for the timeout rank ID"); + std::memcpy(&timeOutRank, vec.data(), vec.size()); + } catch (const std::exception &e) { + LOG(ERROR) << logPrefix() + << "Failed to get timeout rank ID from the global store." + << e.what(); + } + errorMsg = c10::str( + logPrefix(), + "Received a global dump signal from rank ", + timeOutRank, + ", and will start to dump the debug info. ", + "Last enqueued HCCL work: ", + pgStatus_.lastEnqueuedSeq, + ", last completed HCCL work: ", + pgStatus_.lastCompletedSeq, + "."); + exitMsg = c10::str( + "ProcessGroupHCCL's watchdog detected a dump signal from rank ", + timeOutRank, + " and notified the current rank. ", + "This is most likely caused by incorrect usages of collectives, e.g., wrong ", + "sizes used across ranks, the order of collectives is not same for all ranks ", + "or the scheduled collective, for some reason, didn't run. Additionally, ", + "this can be caused by GIL deadlock or other reasons such as network errors or ", + "bugs in the communications library (e.g. HCCL), etc. We tried our best to ", + "dump the debug info into the storage to help you debug the issue."); + dumpTraceAndResetStatus(); + } + } + } + + if (computeDeltaMS(lastTimeHeartBeatCheck, currentTime) >= + heartbeatTimeoutInSec_ * 1000) { + // Check the heart beat of watchdog thread. + lastTimeHeartBeatCheck = currentTime; + auto heartbeat = heartbeat_.load(); + if (heartbeat != heartBeatCounter) { + heartBeatCounter = heartbeat; + } else { + if (!shouldDump_.load()) { + LOG(ERROR) + << logPrefix() + << "First PG on this rank that detected no heartbeat of its watchdog."; + } + shouldDump_.store(true); + // No heartbeat increase detected and timeout. + errorMsg = c10::str( + logPrefix(), + "Heartbeat monitor timed out! Process will be terminated after dumping debug info.", + " workMetaList_.size()=", + workMetaList_.size()); + exitMsg = c10::str( + "ProcessGroupHCCL's watchdog got stuck for ", + heartbeatTimeoutInSec_, + " seconds without making progress in monitoring enqueued collectives. ", + "This typically indicates a HCCL/CUDA API hang blocking the watchdog, ", + "and could be triggered by another thread holding the GIL inside a ", + "CUDA api, or other deadlock-prone behaviors.", + "If you suspect the watchdog is not actually stuck and a longer timeout would help, ", + "you can either increase the timeout (TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value " + "or disable the heartbeat monitor (TORCH_HCCL_ENABLE_MONITORING=0)." + "If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout " + "or false positive abort; otherwise, please attempt to debug the hang. " + "workMetaList_.size() = ", + workMetaList_.size(), + ""); + if (checkDumpSignal) { + dumpTraceAndResetStatus(); + } + } + } + // process a request to dump the trace. only PG uid 0 will respond to dump + // requests, but this is fine since all PG's feed into the same flight + // recorder and dump. After dump, the training should continue. 
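The no-heartbeat branch above reduces to a simple idiom: the watchdog bumps an atomic counter as it makes progress, and the monitor samples that counter on a fixed period, treating two equal consecutive samples as "stuck". Stripped of the HCCL specifics, the idiom looks like this (self-contained sketch):

    #include <atomic>
    #include <chrono>
    #include <cstdint>
    #include <iostream>
    #include <thread>

    std::atomic<uint64_t> heartbeat{0};

    int main() {
        std::thread worker([] {
            for (int i = 0; i < 5; ++i) {
                std::this_thread::sleep_for(std::chrono::milliseconds(50));
                heartbeat++;                 // progress signal, bumped per unit of work
            }
            // worker then "hangs": no further increments
            std::this_thread::sleep_for(std::chrono::seconds(1));
        });

        uint64_t last = 0;
        for (int tick = 0; tick < 8; ++tick) {
            std::this_thread::sleep_for(std::chrono::milliseconds(200));
            uint64_t now = heartbeat.load();
            if (now == last) {
                std::cout << "no heartbeat since last check -> would dump and abort\n";
                break;
            }
            last = now;
        }
        worker.join();
        return 0;
    }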
+ if (dumpPipe.has_value() && dumpPipe->shouldDump()) { + // best effort dump, not waiting for the dump here + std::future fut = std::async( + std::launch::async, [this]() { + return this->dumpDebuggingInfo(); + }); + } + } + LOG(ERROR) << errorMsg; + + auto &cpp_dumper = get_cpp_trace_dumper(); + if (cpp_dumper.has_value()) { + LOG(INFO) << "Dumping c++ stacktraces:"; + cpp_dumper.value()([](const std::string &line) { + LOG(ERROR) << line; + }); + } + + // There are two possible cases for the watchdog thread exit: + // Case one: desync report runs quickly, and it follows the step: + // collective timeout -> desync -> exception handling -> destructors + // -> set terminateHeartbeatMonitorThread_ -> notify monitorWakeUpCV_. + // So the code either early returns above or will skip the sleep below. + // Case two: desync might be slow or get stuck. Or we get stuck in + // destructors, we will sleep for some time before calling std::abort() to + // kill the whole process. + if ((terminateProcessGroup_.load() || collectiveDebugInfoMode_.load() || + shouldDump_.load()) && + !terminateHeartbeatMonitorThread_.load()) { + // Leave another two mins for desync report generation or process group + // destroy. + std::this_thread::sleep_for(std::chrono::seconds(heartbeatTimeoutInSec_)); + } + + // At this point, we either already sleep for another `heartbeatTimeoutInSec_` + // or the thread has finished. Because we don't want to block the monitor + // thread, so We mark the thread detach and the dump of debug info becomes + // "best effort". If the process exit normally, marking it detach also makes + // sense because we don't really care about dumping the debug info. + + // We already log completion inside the thread, so it may not be necessary to + // check the return value here. We mainly use a future so we can exit early + // if done. + + if (!terminateHeartbeatMonitorThread_.load()) { + // Create a error message reported from MonitorThread, so + // we throw exception and make the whole process to be killed. + // After having a hang debug wiki, we need to update the wiki + // url here. 
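get_cpp_trace_dumper() used above hands back a mutable optional slot; nothing is installed by default, so the C++ stack dump is skipped unless an external tracer registers a dumper that accepts a per-line callback. The exact template parameters of the slot are not visible here, so the sketch below only mirrors the shape suggested by the call site (a dumper invoked with a std::string line sink); it is not the real torch_npu API:

    #include <functional>
    #include <iostream>
    #include <optional>
    #include <string>

    using LineSink = std::function<void(const std::string&)>;

    // Same shape as get_cpp_trace_dumper(): a global optional slot that may hold
    // a dumper, which is handed a per-line sink when invoked.
    std::optional<std::function<void(LineSink)>>& trace_dumper_slot() {
        static std::optional<std::function<void(LineSink)>> slot;
        return slot;
    }

    int main() {
        // Installation side (done once, e.g. by an external tracing integration).
        trace_dumper_slot() = [](LineSink emit) {
            emit("frame #0: placeholder frame");
            emit("frame #1: placeholder frame");
        };

        // Consumer side, mirroring heartbeatMonitor's use of cpp_dumper above.
        auto& dumper = trace_dumper_slot();
        if (dumper.has_value()) {
            dumper.value()([](const std::string& line) { std::cout << line << "\n"; });
        }
        return 0;
    }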
+ const auto finalExitMsg = c10::str(logPrefix(), exitMsg); + if (monitorThreadEnabled_.load()) { + terminateProcess(finalExitMsg); + } else { + LOG(ERROR) + << "PGHCCL Monitor Thread is disabled, but would have killed this job:\n" + << finalExitMsg; + } + } +} + void ProcessGroupHCCL::hcclCommWatchdog() { c10_npu::SetThreadType(c10_npu::ThreadType::WATCHDOG_THREAD); try { VLOG(2) << "[Rank " << rank_ << "] HCCL watchdog thread started!"; + if (monitorThreadEnabled_.load()) { + hcclHeartbeatMonitorThread_ = std::thread(&ProcessGroupHCCL::heartbeatMonitor, this); + } workCleanupLoop(); VLOG(2) << "[Rank " << rank_ << "] HCCL watchdog thread terminated normally"; @@ -1100,6 +1616,25 @@ void ProcessGroupHCCL::logWorkEnd(WorkHCCL& work) storeError_ = !c10d::traceUpdate(store_, traceKeyEnd_, work.seq_, opTypeToString(work.opType_)); } + +std::string ProcessGroupHCCL::createLogPrefix() const +{ + if (!pg_desc_.empty() && pg_desc_ != "undefined") { + return c10::str("[PG ", pg_name_, " (", pg_desc_, ") Rank ", rank_, "] "); + } + return c10::str("[PG ", pg_name_, " Rank ", rank_, "] "); +} + +const std::string &ProcessGroupHCCL::logPrefix() const +{ + return logPrefix_; +} + +const int &ProcessGroupHCCL::globalRank() const +{ + static int globalRank = rank_; + return globalRank; +} const std::vector& ProcessGroupHCCL::groupRanks() const { @@ -1161,6 +1696,32 @@ void ProcessGroupHCCL::workCleanupLoop() // If work hits an exception (either an error or timeout) if (work.exception()) { + // try to dump flight records if exception happens. + // Flight recorder behavior should be independent of desync Debug + if (dumpOnException_) { + try { + auto rank = globalRank(); + auto vec = std::vector( + reinterpret_cast(&rank), + reinterpret_cast(&rank) + sizeof(rank)); + globalStore_->set(std::string(EXCEPTION_DUMP), vec); + if (!shouldDump_.load()) { + LOG(ERROR) << logPrefix() + << "First watchdog to set the dump signal."; + } + // signal the monitor thread to start dumping + shouldDump_.store(true); + // This sleep is used to give time for dumping before throwing + // exception + std::this_thread::sleep_for( + std::chrono::seconds(heartbeatTimeoutInSec_)); + } catch (const std::exception &e) { + LOG(ERROR) << logPrefix() + << "Failed to set dump signal in tcpstore. " + << "Error: " << e.what(); + } + } + // Report desync state in case of timeout if (desyncDebug_ && timedOut) { try { @@ -1194,9 +1755,11 @@ void ProcessGroupHCCL::workCleanupLoop() ASCEND_LOGE("Process group work %s, seq_num %u dispatch sucess. This error log can be ignored.", opTypeToString(work.opType_).c_str(), work.seq_); work.is_reported = false; } + if (status_save_enable) { refreshStatusInfo(work, "end"); // Update Statusinfo,but not write into the map } + HCCLTraceBuffer::get()->retire_id(work.trace_id_, true); it = workMetaList_.erase(it); c10_npu::NPUGraph::dec_pending_event_queries(); } else { @@ -1207,6 +1770,10 @@ void ProcessGroupHCCL::workCleanupLoop() // completed. 
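The watchdog/monitor handshake above rides on a single key in the global TCPStore: the first rank to hit an exception serializes its rank into raw bytes and set()s it under EXCEPTION_DUMP, and every monitor thread later check()s the key and memcpy()s the rank back out. The byte-level round trip, isolated from the store itself (a sketch; a real run would go through c10d::TCPStore set/get):

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <vector>

    int main() {
        // Writer side: pack the faulting rank into a byte vector, as workCleanupLoop does.
        int faulting_rank = 3;
        std::vector<uint8_t> value(
            reinterpret_cast<uint8_t*>(&faulting_rank),
            reinterpret_cast<uint8_t*>(&faulting_rank) + sizeof(faulting_rank));
        // store->set("exception_dump", value);   // TCPStore call, omitted here

        // Reader side: unpack it again, as heartbeatMonitor does after check() succeeds.
        int timeout_rank = -1;
        if (value.size() == sizeof(int)) {
            std::memcpy(&timeout_rank, value.data(), value.size());
        }
        std::cout << "dump signal came from rank " << timeout_rank << "\n";
        return 0;
    }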
++it; } + + // Increment heartbeat after each work processed, + // in case processing is slowed down (but not hung) by cuda api contention + heartbeat_++; } } @@ -1497,6 +2064,7 @@ std::vector>& ProcessGroupHCCL::getHCCLComm( return devHCCLCommMap_[devicesKey]; } } + HCCLTraceBuffer::get()->record_pg_ranks(std::make_tuple(pg_name_, pg_desc_), groupRanks()); return createHCCLComm(devicesKey, devices, commType, commConfig, p2pRank); } @@ -2031,12 +2599,46 @@ void nslb_record_end() c10::intrusive_ptr ProcessGroupHCCL::initWork( std::vector devices, int rank, - c10d::OpType opType) + c10d::OpType opType, + const char* profilingTitle, + const std::vector& inputs, + const std::vector& outputs, + bool record) { if (devices.size() != 1) { throw std::runtime_error("ProcessGroupHCCL support one device per process only" + DIST_ERROR(ErrCode::NOT_SUPPORT)); } - return c10::make_intrusive(devices, rank, opType, op_id_, desyncDebug_); + + auto r = c10::make_intrusive(devices, rank, opType, seq_, desyncDebug_); + if (record) { + bool isP2P = c10d::isP2POp(opType); + // Ideally record every work that we enqueue, rather than every work we + // create. + // - at the time of this PR we do not currently enqueue every created work + // - but it is unsafe to steal refs to start/end cuda events from Works that + // may go out of scope before flight recorder has retired them, + // so we must ensure that any work that is initialized via initWork will + // be enqueued + // - initially, moved record() into workEnqueue(), but found that makes it + // hard to get access to profilingTitle, + // inputs, and outputs for metadata recording, and we don't want to attach + // these objects to the Work becuase it has implications for keeping those + // tensors alive longer and adds overhead when copying Work objects + // between threads + r->trace_id_ = HCCLTraceBuffer::get()->record( + uid_, + std::make_tuple(pg_name_, pg_desc_), + seqCollective_, + seqP2P_, + seq_, + profilingTitle ? profilingTitle : "", + inputs, + outputs, + desyncDebug_? &((*(r->hcclStartEvents_))[0]) : nullptr, + &((*(r->hcclEndEvents_))[0]), + isP2P); + } + return r; } void ProcessGroupHCCL::workEnqueue(c10::intrusive_ptr work) @@ -2395,6 +2997,7 @@ c10::intrusive_ptr ProcessGroupHCCL::collective( c10_npu::CaptureStatus capture_status = c10_npu::currentStreamCaptureStatusMayInitCtx(); // Bump collective counter + seqCollective_++; seq_++; op_id_++; @@ -2408,7 +3011,7 @@ c10::intrusive_ptr ProcessGroupHCCL::collective( // First let HCCL streams wait for input tensors allocation streams syncStreams(devices, hcclEvents_[key], hcclStreams); // Work itself will create the events on all NPUs of tensors - auto work = initWork(devices, rank_, opType); + auto work = initWork(devices, rank_, opType, "", inputs, outputs, true); // Store references to outputs to be used by WorkHCCL::result and operator<<. 
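The trace_id_ returned by HCCLTraceBuffer::record() in initWork and consumed by retire_id() in workCleanupLoop is what ties each Work object to one slot of the flight recorder's ring buffer: entries are overwritten once the configured capacity is exceeded, and retiring marks a collective as completed without removing it from later dumps. The record/retire shape in miniature (a generic sketch, not the real HCCLTraceBuffer):

    #include <cstddef>
    #include <iostream>
    #include <optional>
    #include <string>
    #include <vector>

    struct MiniTraceBuffer {
        struct Entry { size_t id; std::string name; bool retired = false; };
        std::vector<Entry> ring;
        size_t max_entries = 4;
        size_t next_id = 0;

        std::optional<size_t> record(std::string name) {
            Entry e{next_id++, std::move(name)};
            if (ring.size() < max_entries) {
                ring.push_back(e);
            } else {
                ring[e.id % max_entries] = e;   // wrap around: overwrite the oldest slot
            }
            return e.id;
        }
        void retire(std::optional<size_t> id) {
            if (!id) {
                return;
            }
            for (auto& e : ring) {
                if (e.id == *id) {
                    e.retired = true;   // stays visible in later dumps, marked completed
                }
            }
        }
    };

    int main() {
        MiniTraceBuffer buf;
        auto id = buf.record("allreduce#0");   // initWork: HCCLTraceBuffer::record(...)
        buf.retire(id);                        // workCleanupLoop: retire_id(work.trace_id_)
        buf.record("broadcast#1");             // a second, still in-flight collective
        for (const auto& e : buf.ring) {
            std::cout << e.name << (e.retired ? " (retired)\n" : " (in flight)\n");
        }
        return 0;
    }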
work->outputs_ = std::make_shared>(outputs); c10_npu::OptionalNPUGuard npuGuard; @@ -2742,13 +3345,14 @@ c10::intrusive_ptr ProcessGroupHCCL::pointToPoint( // Bump the logical operation counter regardless of whether this op is // coalesced or individual + seqP2P_++; op_id_++; // First let HCCL streams wait for input tensors allocation streams syncStreams(devices, hcclEvents_[key], hcclStreams_[key]); // Work itself will create the CUDA events on all NPUs of tensors - auto work = initWork(devices, rank_, opType); + auto work = initWork(devices, rank_, opType, "", tensors, tensors, true); // This bypasses something in Work() that crashes if {tensor} is given as // output, not sure what work->outputs_ = std::make_shared>(tensors); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index 2c164553c8..e1ccd719fe 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include #include #include @@ -21,6 +23,8 @@ namespace c10d_npu { constexpr const char* HCCL_BLOCKING_WAIT = "HCCL_BLOCKING_WAIT"; constexpr const char* HCCL_BACKEND_NAME = "hccl"; +constexpr const char* EXCEPTION_DUMP = "exception_dump"; + // Environment variable which controls whether or not we perform Async Error // Handling with HCCL. constexpr const char* HCCL_ASYNC_ERROR_HANDLING = "HCCL_ASYNC_ERROR_HANDLING"; @@ -31,6 +35,85 @@ constexpr const char* HCCL_DESYNC_DEBUG = "HCCL_DESYNC_DEBUG"; constexpr const int DEFAULT_TIMEOUT = 30 * 60 * 1000; +// Control whether dumping debug info on watchdog +// timeout is enabled. This variable must be set together with +// TORCH_HCCL_ENABLE_MONITORING=1 and TORCH_HCCL_TRACE_BUFFER_SIZE > 0. +static std::vector TORCH_HCCL_DUMP_ON_TIMEOUT = { + "TORCH_HCCL_DUMP_ON_TIMEOUT"}; + +// Enable monitoring thread which aborts the process when the ProcessGroupHCCL +// Watchdog thread gets stuck and no heartbeat is detected after +// TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC. This can happen due to calling CANN/HCCL +// APIs that may hang. It is Useful to prevent jobs being stuck for a prolonged +// time than necessary tying up cluster resources. +static std::vector TORCH_HCCL_ENABLE_MONITORING = { + "TORCH_HCCL_ENABLE_MONITORING"}; + +// The maximum number of events we store in the flight recorder's ring buffer. +// (One event could be the start or end of a collective, for example). +static std::vector TORCH_HCCL_TRACE_BUFFER_SIZE = { + "TORCH_HCCL_TRACE_BUFFER_SIZE"}; + +// Control how much extra time we will wait for dumping the debugging info +// before we exit and throws timeout exception. +static std::vector TORCH_HCCL_WAIT_TIMEOUT_DUMP_MILSEC = { + "TORCH_HCCL_WAIT_TIMEOUT_DUMP_MILSEC"}; + +// Control the watchdog heartbeat timeout period after which the monitoring +// thread will abort the process. +static std::vector TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC = { + "TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC"}; + +// Control the interval inside the watchdog thread to check the coordinated +// signal from other ranks, e.g. to dump the debugging information. 
+static std::vector TORCH_HCCL_COORD_CHECK_MILSEC = { + "TORCH_HCCL_COORD_CHECK_MILSEC"}; + +struct DumpPipe { + DumpPipe(int rank) + { + std::string fileStem = c10d::getCvarString({"TORCH_HCCL_DEBUG_INFO_PIPE_FILE"}, ""); + if (fileStem.empty() || c10d::getCvarInt({"TORCH_HCCL_TRACE_BUFFER_SIZE"}, 0) <= 0) { + return; + } + TORCH_CHECK(!fileStem.empty(), "TORCH_HCCL_DEBUG_INFO_TEMP_FILE is empty"); + std::string filename = c10::str(fileStem, rank, ".pipe"); + TORCH_CHECK( + unlink(filename.c_str()) != -1 || errno == ENOENT, + "Error removing existing named pipe ", + filename); + TORCH_CHECK( + mkfifo(filename.c_str(), 0666) != -1, + "Error creating named pipe ", + filename); + fd_ = open(filename.c_str(), O_RDONLY | O_NONBLOCK); + LOG(INFO) << "Pipe file " << filename + << " has been opened, write to it to trigger HCCL Debug Dump."; + TORCH_CHECK(fd_ != -1, "Error opening named pipe ", filename); + } + bool shouldDump() + { + if (fd_ == -1) { + return false; + } + char buf[128]; + // non-blocking from O_NONBLOCK above. + // Ignore EINTR because we already will poll this + // again later. + ssize_t bytesRead = read(fd_, &buf, 128); + return bytesRead > 0; + } + ~DumpPipe() + { + if (fd_ != -1) { + close(fd_); + } + } + +private: + int fd_ = -1; +}; + // NoHandling: do not handle asynchronous HCCL errors // TearDown: tear down process upon error, see `WorkHCCL::handleException` // CleanUpOnly: just clean up collectives and abort communicators without @@ -244,6 +327,9 @@ public: std::vector lazy_destroy_tensors_; std::vector stashed_for_allocator_safety_; + // unique id used to tell the trace buffer that this + // work has completed + c10::optional trace_id_; friend class ProcessGroupHCCL; }; @@ -273,6 +359,34 @@ public: uint32_t master_port; }; + // A struct to hold the latest status of the process group. + struct ProcessGroupStatus { + // the sequential number of the last collective enqueued into workMetaList_ + // This is useful for indentifying a rank that has not join a collective + // initialized to be -1 to indicate no collective has been enqueued + int64_t lastEnqueuedSeq{-1}; + // the sequential number of the last collective started as the kernel + int64_t lastStartedSeq{-1}; + // the sequential number of the last colletive completed marked by + // the watchdog thread + // initialized to be -1 to indicate no collective has been completed + int64_t lastCompletedSeq{-1}; + + // the name of the last collective enqueued into workMetaList_ + std::string lastEnqueuedWorkName; + // the name of the last collective started as the kernel + std::string lastStartedWorkName; + // the name of the last collective completed + std::string lastCompletedWorkName; + + // the sizes of the last work enqueued + size_t lastEnqueuedNumelIn; + size_t lastEnqueuedNumelOut; + // the sizes of the last work completed + size_t lastCompletedNumelIn; + size_t lastCompletedNumelOut; + }; + // If you wish to create multiple process groups, each with a potentially // different rank and size, you can do so by passing a new store instance // to each one. If you have only a single store object, you can @@ -457,7 +571,9 @@ public: // Provides an API to abort the ProcessGroup (similar to hcclCommAbort) // instead of relying on ProcessGroupHCCL destructor. 
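The DumpPipe above gives an out-of-band way to request a flight-recorder dump from a live job: each trainer process creates <TORCH_HCCL_DEBUG_INFO_PIPE_FILE><rank>.pipe, and any write to that FIFO makes shouldDump() return true on the next monitor poll. A small trigger program as a sketch; it assumes the pipe file stem was set to /tmp/hccl_dump_ before the job started, which is a made-up value for illustration:

    #include <fcntl.h>
    #include <unistd.h>
    #include <cstdio>
    #include <cstdlib>
    #include <string>

    int main(int argc, char** argv) {
        int rank = (argc > 1) ? std::atoi(argv[1]) : 0;
        std::string pipe = "/tmp/hccl_dump_" + std::to_string(rank) + ".pipe";
        // Writing a single byte is enough: shouldDump() only checks bytesRead > 0.
        int fd = open(pipe.c_str(), O_WRONLY | O_NONBLOCK);
        if (fd == -1) { std::perror("open"); return 1; }
        if (write(fd, "1", 1) == -1) { std::perror("write"); }
        close(fd);
        return 0;
    }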
- void abort(c10::optional abortReason = c10::nullopt); + bool abort(c10::optional abortReason = c10::nullopt); + + void shutdown(c10::optional reason = c10::nullopt); void deleteTCPStoreKey(); @@ -502,7 +618,46 @@ protected: virtual c10::intrusive_ptr initWork( std::vector devices, int rank, - c10d::OpType opType); + c10d::OpType opType, + const char* profilingTitle = nullptr, + const std::vector& inputs = {}, + const std::vector& outputs = {}, + bool record = false); + + void setGroupDesc(const std::string& desc) + { + pg_desc_ = desc; + } + + const std::string& getGroupDesc() const + { + return pg_desc_; + } + + // In the timeout case and we will dump debug info such as the NCCL flight + // recorder to storage. Down the road, if we have more complicated or blocking + // operations, we might need to use a side thread to do it. + bool dumpDebuggingInfo(); + void dumpTraceAndResetStatus(); + bool dumpPythonTraceback(); + std::future launchAsyncPythonTracebackDump(); + + // Function that runs as part of a separate thread aside from watchdog + // thread because we need to check the heartbeat from watchdog thread + // so that when we get stuck in some HCCL/CANN calls, + // we can dump the debugging information and abort the process. + virtual void heartbeatMonitor(); + + // Function that directly trigger std::abort so that the whole process + // gets terminated. + virtual void terminateProcess(std::string errMsg); + + // A helper function to wait for a future to complete or timeout. + void waitForFutureOrTimeout( + std::future& fut, + const std::chrono::milliseconds& timeOutMilSec, + const std::string& futDescription, + bool throwException = false); // Do not call this directly, use ProcessGroup::setGroupName instead. void setGroupName(const std::string& name) @@ -517,9 +672,16 @@ protected: static const int64_t kWatchdogThreadSleepMillis; - // The store is used to broadcast the HCCL Master ID of rank 0. + // The store is used to broadcast the HCCL unique ID of rank 0. This store + // comes with prefix and it is different across ProcessGroup HCCL instances + // (aka, different ProcessGroups). c10::intrusive_ptr store_; + // Reference to the store without prefix so that keys are same across all + // ProcessGroup HCCL instances and (key, value) pairs written to the store are + // global. + c10::intrusive_ptr globalStore_; + bool storeError_{false}; const c10::intrusive_ptr options_; @@ -563,15 +725,58 @@ protected: // Mutex to guard maps like devHCCLCommMap_. std::mutex mutex_; + // Heartbeat of watchdog thread. + std::atomic_uint64_t heartbeat_; + + // The time interval used for deciding whether there is no watchdog heartbeat. + int heartbeatTimeoutInSec_; + + // timeout for the dump to finish. + int waitTimeoutDumpInMilSec_; + + // Interval of check coordinated signals in ProcessGroupHCCL from other ranks + // e.g., trigger the dump of the debugging info for timeout when notified. + int coordCheckIntervalMilSec_; + + // Size of ring buffer where we store HCCL Traces for debugging. + int hcclTraceBufferSize_; + + // We gate the heartbeat monitor thread so that we can roll it out gradually. + std::atomic monitorThreadEnabled_; + + // Monitor thread which checks the heartbeat of Watchdog thread. + // If the monitor thread finds there is no heartbeat, it will dump debug info + // and then kill the watchdog thread to avoid hang. + std::thread hcclHeartbeatMonitorThread_; + // Watchdog thread which looks for errors on the cached HCCL communicators. 
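The timeout, interval, and buffer-size members declared below are resolved from plain environment variables through c10d's getCvar helpers, so the monitoring and dump machinery can be tuned per job without code changes. A small standalone sketch that reads the same variables and prints the effective configuration; the fallback values here are illustrative guesses, not the defaults baked into torch_npu:

    #include <cstdlib>
    #include <iostream>
    #include <string>

    static long readIntEnv(const char* name, long fallback) {
        const char* v = std::getenv(name);
        // Same spirit as c10d::getCvarInt; a real reader should validate the input.
        return v ? std::stol(v) : fallback;
    }

    int main() {
        // Boolean switches are treated as 0/1 integers in this sketch.
        std::cout << "trace buffer entries  : "
                  << readIntEnv("TORCH_HCCL_TRACE_BUFFER_SIZE", 0) << "\n";
        std::cout << "dump on timeout       : "
                  << readIntEnv("TORCH_HCCL_DUMP_ON_TIMEOUT", 0) << "\n";
        std::cout << "monitoring enabled    : "
                  << readIntEnv("TORCH_HCCL_ENABLE_MONITORING", 0) << "\n";
        std::cout << "heartbeat timeout (s) : "
                  << readIntEnv("TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC", 480) << "\n";
        std::cout << "dump wait (ms)        : "
                  << readIntEnv("TORCH_HCCL_WAIT_TIMEOUT_DUMP_MILSEC", 60000) << "\n";
        std::cout << "coord check (ms)      : "
                  << readIntEnv("TORCH_HCCL_COORD_CHECK_MILSEC", 1000) << "\n";
        return 0;
    }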
std::thread hcclCommWatchdogThread_; // Whether or not we should terminate the watchdog and workCleanup threads. std::atomic terminateProcessGroup_; + // Whether or not we should terminate the heartbeat monitoring threads. + std::atomic terminateHeartbeatMonitorThread_; + + // Whether we are in the shutdown mode when we are trying to get debug info, + // such as desync report. + std::atomic collectiveDebugInfoMode_; + + // This is the signal from watchdog threads to indicate whether the monitor + // thread should dump. Making it static so that it is accessiable from all the + // PGs. With this flag, monitor thread would dump debug info under any one of + // the 3 conditions: 1: this flag is set to true by the watchdog thread when + // it detects a timeout. 2: timeout signal is received from + // other ranks through tcpstore 3: no heartbeat of watchdog Note that only the + // monitor thread from PG0 should dump the debug info and only once + static std::atomic shouldDump_; + // Vector to Store WorkHCCL pointers std::list workMetaList_; + // Mutex to Guard monitorWakeUpCV_ + std::mutex monitorMutex_; + // Mutex to Guard workMetaList_ std::mutex workMetaListMutex_; @@ -581,6 +786,11 @@ protected: // Condition Variable for watchdog thread sleep std::condition_variable workMetaListCV_; + // Condition Variable for monitor thread to wake up early + std::condition_variable monitorWakeUpCV_; + + std::chrono::time_point lastWorkListUpdateTime_; + // Condition variable to control how long the watchdog thread waits. std::condition_variable watchdogCV_; @@ -634,6 +844,10 @@ protected: // Whether or not to enable timeout root cause analysis. bool desyncDebug_; + // Whether or not to dump debug info on exception including both watchdog + // timeout and hccl errors. + bool dumpOnException_; + // the perfdump path static std::string perfdumppath; @@ -656,6 +870,14 @@ protected: // is called. static thread_local uint64_t hcclActiveGroupCounter_; + // Counting for the sequential number of NCCL collective call. + // (specifically, how many actual kernels we launched, which differs from + // op_id_ when coalescing is enabled) + uint64_t seqCollective_{0}; + + // Counting for the sequential number of NCCL P2P calls. + uint64_t seqP2P_{0}; + // Counting for the sequential number of HCCL collective call. // (specfically, how many actual kernels we launched, which differs from) // op_id_ when coalescing is enabled) @@ -695,6 +917,14 @@ protected: static std::string exceptionMessage_; + size_t uid_; + + std::string logPrefix_; + + std::string pg_desc_; + + ProcessGroupStatus pgStatus_; + private: // Helper that encapsulates work shared across all collective communication // primitives. @@ -796,6 +1026,19 @@ private: // Desync debug helper void logWorkEnd(WorkHCCL& work); + // Generates a prefix that is unique to this process group and rank, for + // disambiguating logs + std::string createLogPrefix() const; + + // Returns the unique prefix created in createLogPrefix + const std::string &logPrefix() const; + + // Returns the global rank of the device. This function assumes that users + // always create a default global process group(PG) which includes all + // devices. It is called in the constructor of ProcessGroupHCCL, so it always + // return the rank_ of the the very first PG created, aka, default global PG. 
+ const int &globalRank() const; + void silenceCheck(at::Tensor &input, c10d::OpType opType); HcclCommConfig createHcclCommConfigWithOptions(); @@ -814,4 +1057,23 @@ private: static ProcessGroupHCCL* global_; }; + +// Dumps the HCCL comm traces and additional information about the Process +// Group. +TORCH_API std::string dump_hccl_trace( + bool includeCollectives, + bool includeStackTraces, + bool onlyActive); + +// Gets a mutable reference to a global optional function.Heartbeat Monitor +// will use this function to dump traces, if available. Inside fbcode, we +// store a function here that uses an internal tool for process tracing +TORCH_API c10::optional)>> &get_cpp_trace_dumper(); + +// Similar to get_cpp_trace_dumper, this stores a function defined in +// torch-python layer that lets us check whether the GIL can be acquired, +// helpful for instrumenting in cases where a hang was observed. +typedef bool (*gil_checker_t)(); + +TORCH_API gil_checker_t &get_gil_checker(); } // namespace c10d_npu diff --git a/torch_npu/csrc/distributed/TraceUtils.h b/torch_npu/csrc/distributed/TraceUtils.h index d3a516a7cc..5592ac2ca5 100644 --- a/torch_npu/csrc/distributed/TraceUtils.h +++ b/torch_npu/csrc/distributed/TraceUtils.h @@ -7,7 +7,7 @@ #include #include #include -#include "torch_npu/csrc/profiler/python/combined_traceback.h" +#include #include "torch_npu/csrc/core/npu/NPUEvent.h" #include "torch_npu/csrc/distributed/HCCLUtils.hpp" @@ -65,26 +65,6 @@ namespace c10d_npu { return pgName + "_" + std::to_string(rank) + "_trace_end"; } - inline bool traceUpdate( - c10::intrusive_ptr &store, - const std::string &key, - uint64_t seq, - const std::string &col) - { - std::vector value(col.size() + sizeof(seq) + 1); - memcpy(value.data(), &seq, sizeof(seq)); - memcpy(value.data() + sizeof(seq), col.data(), col.size()); - try { - store->set(key, value); - return true; - } catch (...) { - LOG(ERROR) << "Store is down while updating #" << seq << " with key " - << key; - return false; - } - return true; - } - enum TraceDebugEvent { kEventStart, kEventEnd, @@ -247,7 +227,7 @@ namespace c10d_npu { DebugInfoWriter &DebugInfoWriter::getWriter(int rank) { if (writer_ == nullptr) { - std::string fileNamePrefix = getCvarString( + std::string fileNamePrefix = c10d::getCvarString( {"TORCH_HCCL_DEBUG_INFO_TEMP_FILE"}, "/tmp/hccl_trace_rank_"); // Using std::unique_ptr here to auto-delete the writer object // when the pointer itself is destroyed. 
@@ -295,9 +275,9 @@ namespace c10d_npu { // warn: might be slow in getting cpp traces // because of slow/broken addr2line // in different system libs - std::shared_ptr tb = - torch_npu::CapturedTraceback::gather(true, true, false); - torch_npu::SymbolizedTracebacks s_tbs = torch_npu::symbolize({tb.get()}); + std::shared_ptr tb = + torch::CapturedTraceback::gather(true, true, false); + torch::SymbolizedTracebacks s_tbs = torch::symbolize({tb.get()}); const auto &s_tb = s_tbs.tracebacks.at(0); std::stringstream oss; LOG(ERROR) << "get traceback size:" << s_tb.size(); @@ -321,7 +301,7 @@ namespace c10d_npu { return c10::List(c10::AnyType::get()); } - inline std::string ranks_str(const std::vector &ranks) + inline std::string ranks_str(const std::vector &ranks) { std::string str; for (const auto &rank : ranks) { @@ -344,8 +324,8 @@ namespace c10d_npu { } HCCLTraceBuffer() { - max_entries_ = getCvarInt({"TORCH_HCCL_TRACE_BUFFER_SIZE"}, 0); - capture_cpp_stack_ = getCvarBool({"TORCH_HCCL_TRACE_CPP_STACK"}, false); + max_entries_ = c10d::getCvarInt({"TORCH_HCCL_TRACE_BUFFER_SIZE"}, 0); + capture_cpp_stack_ = c10d::getCvarBool({"TORCH_HCCL_TRACE_CPP_STACK"}, false); enabled_ = max_entries_ > 0; } using Event = c10_npu::NPUEvent; @@ -368,7 +348,7 @@ namespace c10d_npu { size_t op_id_; std::string profiling_name_; - std::shared_ptr traceback_; + std::shared_ptr traceback_; // we borrow pointers to start_ and end_ so we can query the state // on reporting. However, once the event is completed, the call // to `complete` will clear these. @@ -411,7 +391,7 @@ namespace c10d_npu { size_t max_entries_ = 0; size_t next_ = 0; size_t id_ = 0; - std::map, std::vector> + std::map, std::vector> pg_name_to_ranks_ = {}; c10::optional record( @@ -431,7 +411,7 @@ namespace c10d_npu { return c10::nullopt; } auto traceback = - torch_npu::CapturedTraceback::gather(true, true, capture_cpp_stack_); + torch::CapturedTraceback::gather(true, true, capture_cpp_stack_); std::lock_guard guard(mutex_); auto te = Entry{ @@ -475,7 +455,7 @@ namespace c10d_npu { void record_pg_ranks( const std::tuple &pg_name, - std::vector ranks) + std::vector ranks) { if (!enabled_) { return; @@ -527,7 +507,7 @@ namespace c10d_npu { never hang. (timing must also be enabled for compute_duration - see TORCH_HCCL_ENABLE_TIMING). 
*/ - void retire_id(std::optional id, bool compute_duration = true) + void retire_id(c10::optional id, bool compute_duration = true) { if (!enabled_ || !id) { return; @@ -550,6 +530,8 @@ namespace c10d_npu { startEvent = entry->start_; endEvent = entry->end_; } + entry->retired_ = true; + entry->start_ = entry->end_ = nullptr; } if (can_compute_duration) { @@ -572,9 +554,6 @@ namespace c10d_npu { entry->duration_ = duration.value(); } } - - entry->retired_ = true; - entry->start_ = entry->end_ = nullptr; } const c10::List getCollectiveTrace( @@ -583,14 +562,14 @@ namespace c10d_npu { { auto entries = new_list(); auto result = dump_entries(); - std::vector tracebacks; - torch_npu::SymbolizedTracebacks stracebacks; + std::vector tracebacks; + torch::SymbolizedTracebacks stracebacks; std::vector all_frames; if (includeStacktraces) { for (auto &e : result) { tracebacks.push_back(e.traceback_.get()); } - stracebacks = torch_npu::symbolize(tracebacks); + stracebacks = torch::symbolize(tracebacks); for (const auto &f : stracebacks.all_frames) { auto d = new_dict(); d.insert(name_key, f.funcname); @@ -733,5 +712,4 @@ namespace c10d_npu { return pickle_str(result); } }; - } // namespace c10d -- Gitee From cf92084473d7f33ee6b57f8c47fc19377e4e3e6d Mon Sep 17 00:00:00 2001 From: shaoyf Date: Tue, 3 Jun 2025 15:06:31 +0000 Subject: [PATCH 021/328] =?UTF-8?q?!21501=20=E5=9B=9E=E9=80=80=20'Pull=20R?= =?UTF-8?q?equest=20!21465=20:=20support=20HCCL=5FOP=5FRETRY=5FFAILED=20wi?= =?UTF-8?q?th=20ACL=5FERROR=5F=E2=80=A6=20Merge=20pull=20request=20!21501?= =?UTF-8?q?=20from=20shaoyf/revert-merge-21465-v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- third_party/acl/inc/acl/acl_base.h | 1 - torch_npu/csrc/core/npu/NPUException.cpp | 9 +-------- torch_npu/csrc/core/npu/NPUException.h | 3 --- torch_npu/csrc/core/npu/NPUQueue.cpp | 6 +----- torch_npu/csrc/core/npu/NPUQueue.h | 1 - torch_npu/csrc/framework/OpParamMaker.cpp | 4 ++-- 6 files changed, 4 insertions(+), 20 deletions(-) diff --git a/third_party/acl/inc/acl/acl_base.h b/third_party/acl/inc/acl/acl_base.h index b8ef9dbd34..cbcf87b0fc 100755 --- a/third_party/acl/inc/acl/acl_base.h +++ b/third_party/acl/inc/acl/acl_base.h @@ -138,7 +138,6 @@ static const int ACL_ERROR_RT_DEVICE_MEM_ERROR = 507053; static const int ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR = 507054; static const int ACL_ERROR_RT_SUSPECT_DEVICE_MEM_ERROR = 507055; static const int ACL_ERROR_RT_LINK_ERROR = 507056; -static const int ACL_ERROR_RT_COMM_OP_RETRY_FAIL = 507904; #define ACL_TENSOR_SHAPE_RANGE_NUM 2 #define ACL_TENSOR_VALUE_RANGE_NUM 2 diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index ab139f53b4..034726549b 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -84,8 +84,7 @@ std::unordered_map> errCodeHandlerMap = { {ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR, std::bind(&handleHbmMultiBitEccError, std::placeholders::_1)}, {ACL_ERROR_RT_DEVICE_MEM_ERROR, std::bind(&handleDeviceMemError, std::placeholders::_1)}, {ACL_ERROR_RT_SUSPECT_DEVICE_MEM_ERROR, std::bind(&handleSuspectDeviceMemError, std::placeholders::_1)}, - {ACL_ERROR_RT_LINK_ERROR, std::bind(&handleLinkError, std::placeholders::_1)}, - {ACL_ERROR_RT_COMM_OP_RETRY_FAIL, std::bind(&handleHcclOpRetryFailed, std::placeholders::_1)} + {ACL_ERROR_RT_LINK_ERROR, std::bind(&handleLinkError, std::placeholders::_1)} }; MemUceInfo memUceInfo; @@ -245,12 +244,6 @@ std::string 
handleLinkError(int errorCode) return "HCCS LINK ERROR"; } -std::string handleHcclOpRetryFailed(int errorCode) -{ - ASCEND_LOGE("getRepoStopFlag in Run, throw HCCL OP RETRY FAILED."); - return "HCCL OP RETRY FAILED"; -} - std::string handleDeviceError(int errorCode) { auto handlerIter = errCodeHandlerMap.find(errorCode); diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h index a82f8f1568..94e38a5edb 100644 --- a/torch_npu/csrc/core/npu/NPUException.h +++ b/torch_npu/csrc/core/npu/NPUException.h @@ -96,7 +96,6 @@ std::string formatErrorCode(SubModule submodule, ErrCode errorCode); #define DEVICE_HBM_ECC_ERROR "reason=[hbm Multi-bit ECC error]" #define SUSPECT_DEVICE_MEM_ERROR "reason=[suspect device mem error]" #define HCCS_LINK_ERROR "reason=[link error]" -#define HCCL_OP_RETRY_FAILED "reason=[hccl op retry failed]" inline const char* getErrorFunction(const char* msg) { @@ -276,8 +275,6 @@ std::string handleSuspectDeviceMemError(int errorCode); std::string handleLinkError(int errorCode); -std::string handleHcclOpRetryFailed(int errorCode); - std::string handleDeviceError(int errorCode); } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 7767dda6b8..48b83d9720 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -174,8 +174,7 @@ std::unordered_map deviceErrorMap = { {RepoStatus::HBM_ECC_EXIT, "HBM MULTI BIT ECC ERROR"}, {RepoStatus::STOP_EXIT, "FORCE STOP"}, {RepoStatus::SUSPECT_MEM_EXIT, "SUSPECT MEM ERROR"}, - {RepoStatus::HCCS_LINK_EXIT, "HCCS LINK ERROR"}, - {RepoStatus::HCCL_OP_RETRY_EXIT, "HCCL OP RETRY FAILED"} + {RepoStatus::HCCS_LINK_EXIT, "HCCS LINK ERROR"} }; std::string get_func_error_msg(void *error_paras) @@ -377,9 +376,6 @@ void Repository::CheckDeviceError(int ret, std::string& err_msg) } else if (ret == ACL_ERROR_RT_LINK_ERROR || acl_error.find(HCCS_LINK_ERROR) != std::string::npos) { ASCEND_LOGE("HCCS LINK ERROR happened, set task queue status to HCCS_LINK_EXIT"); SetStatus(HCCS_LINK_EXIT); - } else if (ret == ACL_ERROR_RT_COMM_OP_RETRY_FAIL || acl_error.find(HCCL_OP_RETRY_FAILED) != std::string::npos) { - ASCEND_LOGE("HCCL OP RETRY FAILED happened, set task queue status to HCCL_OP_RETRY_EXIT"); - SetStatus(HCCL_OP_RETRY_EXIT); } else if (GetStatus() != STOP_EXIT) { SetStatus(ERROR_EXIT); } diff --git a/torch_npu/csrc/core/npu/NPUQueue.h b/torch_npu/csrc/core/npu/NPUQueue.h index 0ef5609040..460a3cb755 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.h +++ b/torch_npu/csrc/core/npu/NPUQueue.h @@ -27,7 +27,6 @@ enum RepoStatus { HBM_ECC_EXIT = 7, SUSPECT_MEM_EXIT = 8, HCCS_LINK_EXIT = 9, - HCCL_OP_RETRY_EXIT = 10, }; // c10::SmallVector max size diff --git a/torch_npu/csrc/framework/OpParamMaker.cpp b/torch_npu/csrc/framework/OpParamMaker.cpp index 1766af9c99..6f88222c00 100644 --- a/torch_npu/csrc/framework/OpParamMaker.cpp +++ b/torch_npu/csrc/framework/OpParamMaker.cpp @@ -336,7 +336,7 @@ int ExecFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) ret = cur_paras->customHandler(); } catch (std::exception &e) { if (ContainsAny(std::string(e.what()), {DEVICE_TASK_ABORT, DEVICE_MEM_ERROR, DEVICE_HBM_ECC_ERROR, - SUSPECT_DEVICE_MEM_ERROR, HCCS_LINK_ERROR, HCCL_OP_RETRY_FAILED})) { + SUSPECT_DEVICE_MEM_ERROR, HCCS_LINK_ERROR})) { ret = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); } else { ret = ACL_ERROR_INVALID_PARAM; @@ -422,7 +422,7 @@ int ExecFuncOpApi(c10_npu::queue::QueueParas *in, aclrtStream stream) ret = 
cur_paras->customHandler(); } catch (std::exception &e) { if (ContainsAny(std::string(e.what()), {DEVICE_TASK_ABORT, DEVICE_MEM_ERROR, DEVICE_HBM_ECC_ERROR, - SUSPECT_DEVICE_MEM_ERROR, HCCS_LINK_ERROR, HCCL_OP_RETRY_FAILED})) { + SUSPECT_DEVICE_MEM_ERROR, HCCS_LINK_ERROR})) { ret = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); } else { ret = ACL_ERROR_INVALID_PARAM; -- Gitee From 9202b1a537c5bd66012ee746c3766f3867bd1f8f Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 3 Jun 2025 16:15:41 +0000 Subject: [PATCH 022/328] !21512 Update op_plugin commit id Merge pull request !21512 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 44a68f5e93..dd59e19fda 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 44a68f5e93176fa9baf51326c485164c90fb3b6f +Subproject commit dd59e19fdacc101e4c0de700a5a7d710388f3d79 -- Gitee From f6e1e65d48068bdcc84e2421c9c6237f5b1563ea Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Wed, 4 Jun 2025 01:41:11 +0000 Subject: [PATCH 023/328] !21506 Update torchair commit id Merge pull request !21506 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index f4241ab1d4..25e45fe301 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit f4241ab1d409ae49c4357540db7372baacd65dc6 +Subproject commit 25e45fe301dd77ec481e34a7665c32e5755ad709 -- Gitee From e6b2ee13608c20f2a5011ec867d4bcb91980f3b1 Mon Sep 17 00:00:00 2001 From: hhz886 Date: Wed, 4 Jun 2025 02:19:55 +0000 Subject: [PATCH 024/328] !21492 profiler workspace Merge pull request !21492 from hhz886/workspace27 --- test/profiler/test_experimental_config.py | 4 +- .../csrc/core/npu/NPUCachingAllocator.cpp | 9 ++- .../csrc/core/npu/NPUWorkspaceAllocator.cpp | 55 ++++++++++++++++--- torch_npu/csrc/profiler/npu_profiler.cpp | 2 + torch_npu/csrc/profiler/npu_profiler.h | 6 ++ .../csrc/toolkit/profiler/inc/data_reporter.h | 4 +- .../toolkit/profiler/src/data_reporter.cpp | 2 +- .../analysis/prof_bean/_memory_use_bean.py | 15 +++-- .../analysis/prof_common_func/_constant.py | 3 + .../prof_config/_fwk_file_parser_config.py | 2 +- .../prof_view/_memory_prepare_parser.py | 12 ++-- .../analysis/prof_view/_memory_view_parser.py | 5 ++ 12 files changed, 93 insertions(+), 26 deletions(-) diff --git a/test/profiler/test_experimental_config.py b/test/profiler/test_experimental_config.py index 81800ae30b..0397472e8e 100644 --- a/test/profiler/test_experimental_config.py +++ b/test/profiler/test_experimental_config.py @@ -93,8 +93,8 @@ class TestExperimentalConfig(TestCase): self.assertEqual([], experimental_config._host_sys) def test_host_sys_switches_will_save_when_set_valid_host_sys(self): - experimental_config = _ExperimentalConfig(host_sys=[Constant.CPU, Constant.MEM]) - self.assertEqual(["cpu", "mem"], experimental_config._host_sys) + experimental_config = _ExperimentalConfig(host_sys=[Constant.CPU]) + self.assertEqual(["cpu"], experimental_config._host_sys) def test_sys_switches_will_save_empty_list_when_not_set_sys(self): experimental_config = _ExperimentalConfig() diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 62a56d75bc..a31789b560 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ 
b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -1456,7 +1456,8 @@ public: torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, ®ionDesc); } torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), - block->device, static_cast(torch_npu::profiler::MemoryDataType::MEMORY_MALLOC), allocator_type, + block->device, static_cast(torch_npu::profiler::MemoryComponentType::CACHING_ALLOCATOR), + static_cast(torch_npu::profiler::MemoryDataType::MEMORY_MALLOC), allocator_type, reinterpret_cast(block->ptr), block->size, stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, @@ -1520,7 +1521,8 @@ public: torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, orig_block_ptr); } torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), - block->device, static_cast(torch_npu::profiler::MemoryDataType::MEMORY_FREE), allocator_type, + block->device, static_cast(torch_npu::profiler::MemoryComponentType::CACHING_ALLOCATOR), + static_cast(torch_npu::profiler::MemoryDataType::MEMORY_FREE), allocator_type, reinterpret_cast(orig_block_ptr), -orig_block_size, stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, @@ -2297,7 +2299,8 @@ private: }); #ifndef BUILD_LIBTORCH torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), - block->device, static_cast(torch_npu::profiler::MemoryDataType::MEMORY_BLOCK_FREE), allocator_type, + block->device, static_cast(torch_npu::profiler::MemoryComponentType::CACHING_ALLOCATOR), + static_cast(torch_npu::profiler::MemoryDataType::MEMORY_BLOCK_FREE), allocator_type, reinterpret_cast(orig_block_ptr), -original_block_size, stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index ce9e6cc918..c34d796a78 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -105,13 +105,14 @@ public: torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), device, + static_cast(torch_npu::profiler::MemoryComponentType::WORKSPACE_ALLOCATOR), static_cast(torch_npu::profiler::MemoryDataType::MEMORY_FREE), static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_INNER), reinterpret_cast(block->data_ptr), -block->size, - get_mem_size(), - 0, // reserved_bytes not used - 0, // active_bytes not used + stats.allocated_bytes.current, + stats.reserved_bytes.current, + stats.allocated_bytes.current, reinterpret_cast(stream)} ); #endif @@ -145,13 +146,14 @@ public: torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), device, + static_cast(torch_npu::profiler::MemoryComponentType::WORKSPACE_ALLOCATOR), static_cast(torch_npu::profiler::MemoryDataType::MEMORY_MALLOC), static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_INNER), reinterpret_cast(block->data_ptr), block->size, - get_mem_size(), - 0, // reserved_bytes not used - 0, // active_bytes not used + stats.allocated_bytes.current, + stats.reserved_bytes.current, + stats.allocated_bytes.current, reinterpret_cast(stream)} ); const c10_npu::impl::PyCallbackTrigger* trigger = 
c10_npu::impl::NPUTrace::getTrace(); @@ -164,12 +166,46 @@ public: allocated_size = block->size; update_stat(stats.allocated_bytes, block->size); +#ifndef BUILD_LIBTORCH + torch_npu::profiler::reportMemoryDataToNpuProfiler({ + static_cast(c10::DeviceType::PrivateUse1), + device, + static_cast(torch_npu::profiler::MemoryComponentType::WORKSPACE_ALLOCATOR), + static_cast(torch_npu::profiler::MemoryDataType::MEMORY_MALLOC), + static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_INNER), + reinterpret_cast(block->data_ptr), + block->size, + stats.allocated_bytes.current, + stats.reserved_bytes.current, + stats.allocated_bytes.current, + reinterpret_cast(stream)} + ); +#endif return block->data_ptr; } void free() { update_stat(stats.allocated_bytes, -allocated_size); +#ifndef BUILD_LIBTORCH + for (const auto& block_pair : blocks) { + if (block_pair.second->data_ptr != nullptr) { + torch_npu::profiler::reportMemoryDataToNpuProfiler({ + static_cast(c10::DeviceType::PrivateUse1), + device, + static_cast(torch_npu::profiler::MemoryComponentType::WORKSPACE_ALLOCATOR), + static_cast(torch_npu::profiler::MemoryDataType::MEMORY_FREE), + static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_INNER), + reinterpret_cast(block_pair.second->data_ptr), + -allocated_size, + stats.allocated_bytes.current, + stats.reserved_bytes.current, + stats.allocated_bytes.current, + reinterpret_cast(block_pair.first)} + ); + } + } +#endif } // return to the system allocator @@ -208,13 +244,14 @@ public: torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), device, + static_cast(torch_npu::profiler::MemoryComponentType::WORKSPACE_ALLOCATOR), static_cast(torch_npu::profiler::MemoryDataType::MEMORY_FREE), static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_INNER), reinterpret_cast(block_pair.second->data_ptr), -block_pair.second->size, - get_mem_size(), - 0, // reserved_bytes not used - 0, // active_bytes not used + stats.allocated_bytes.current, + stats.reserved_bytes.current, + stats.allocated_bytes.current, reinterpret_cast(block_pair.first)} ); #endif diff --git a/torch_npu/csrc/profiler/npu_profiler.cpp b/torch_npu/csrc/profiler/npu_profiler.cpp index 9d9f0fdfa0..295eda9aea 100644 --- a/torch_npu/csrc/profiler/npu_profiler.cpp +++ b/torch_npu/csrc/profiler/npu_profiler.cpp @@ -131,6 +131,7 @@ struct NpuProfilerThreadLocalState : public ProfilerStateBase { device.index(), 0, 0, + 0, Utils::GetTid(), Utils::GetPid() )); @@ -389,6 +390,7 @@ void reportMemoryDataToNpuProfiler(const MemoryUsage& data) data.stream_ptr, data.device_type, data.device_index, + data.component_type, data.data_type, data.allocator_type, Utils::GetTid(), diff --git a/torch_npu/csrc/profiler/npu_profiler.h b/torch_npu/csrc/profiler/npu_profiler.h index 05afa29d40..2127825bc1 100644 --- a/torch_npu/csrc/profiler/npu_profiler.h +++ b/torch_npu/csrc/profiler/npu_profiler.h @@ -26,6 +26,11 @@ enum class NpuActivityType { NPU, }; +enum class MemoryComponentType { + CACHING_ALLOCATOR = 0, + WORKSPACE_ALLOCATOR, +}; + enum class MemoryDataType { MEMORY_MALLOC = 0, MEMORY_FREE, @@ -42,6 +47,7 @@ enum class MemoryAllocatorType { struct MemoryUsage { int8_t device_type{0}; int8_t device_index{0}; + uint8_t component_type{static_cast(MemoryComponentType::CACHING_ALLOCATOR)}; uint8_t data_type{static_cast(MemoryDataType::MEMORY_INVALID)}; uint8_t allocator_type{static_cast(MemoryAllocatorType::ALLOCATOR_INVALID)}; int64_t ptr{0}; diff --git 
a/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h b/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h index a4232f1aa2..764f8e1668 100644 --- a/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h +++ b/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h @@ -338,13 +338,14 @@ struct MemoryData : BaseReportData { int64_t stream_ptr{ 0 }; int8_t device_type{ 0 }; int8_t device_index{ 0 }; + uint8_t component_type{ 0 }; uint8_t data_type{ 0 }; uint8_t allocator_type{ 0 }; uint64_t thread_id{ 0 }; uint64_t process_id{ 0 }; MemoryData(int64_t ptr, int64_t time_ns, int64_t alloc_size, int64_t total_allocated, int64_t total_reserved, int64_t total_active, int64_t stream_ptr, int8_t device_type, int8_t device_index, uint8_t data_type, - uint8_t allocator_type, uint64_t thread_id, uint64_t process_id) + uint8_t component_type, uint8_t allocator_type, uint64_t thread_id, uint64_t process_id) : BaseReportData(0, "torch.memory_usage"), ptr(ptr), time_ns(time_ns), @@ -355,6 +356,7 @@ struct MemoryData : BaseReportData { stream_ptr(stream_ptr), device_type(device_type), device_index(device_index), + component_type(component_type), data_type(data_type), allocator_type(allocator_type), thread_id(thread_id), diff --git a/torch_npu/csrc/toolkit/profiler/src/data_reporter.cpp b/torch_npu/csrc/toolkit/profiler/src/data_reporter.cpp index 669ee8a4d1..2cbce73a06 100644 --- a/torch_npu/csrc/toolkit/profiler/src/data_reporter.cpp +++ b/torch_npu/csrc/toolkit/profiler/src/data_reporter.cpp @@ -93,7 +93,7 @@ std::vector MemoryData::encode() total_reserved, total_active, stream_ptr}, result); encodeFixedData({device_type, device_index}, result); - encodeFixedData({data_type, allocator_type}, result); + encodeFixedData({component_type, data_type, allocator_type}, result); encodeFixedData({thread_id, process_id}, result); std::vector resultTLV; diff --git a/torch_npu/profiler/analysis/prof_bean/_memory_use_bean.py b/torch_npu/profiler/analysis/prof_bean/_memory_use_bean.py index 4e29f204e3..0385af8d79 100644 --- a/torch_npu/profiler/analysis/prof_bean/_memory_use_bean.py +++ b/torch_npu/profiler/analysis/prof_bean/_memory_use_bean.py @@ -19,14 +19,15 @@ class MemoryEnum(Enum): STREAM_PTR = 6 DEVICE_TYPE = 7 DEVICE_INDEX = 8 - DATA_TYPE = 9 - ALLOCATOR_TYPE = 10 - THREAD_ID = 11 - PROCESS_ID = 12 + COMPONENT_TYPE = 9 + DATA_TYPE = 10 + ALLOCATOR_TYPE = 11 + THREAD_ID = 12 + PROCESS_ID = 13 class MemoryUseBean(CommonBean): - CONSTANT_STRUCT = "<7q2b2B2Q" + CONSTANT_STRUCT = "<7q2b3B2Q" NPU_ID = 20 CPU_ID = 0 INNER_ALLOCATOR = 0 @@ -88,6 +89,10 @@ class MemoryUseBean(CommonBean): def device_index(self) -> int: return int(self._constant_data[MemoryEnum.DEVICE_INDEX.value]) + @property + def component_type(self) -> int: + return int(self._constant_data[MemoryEnum.COMPONENT_TYPE.value]) + @property def data_type(self) -> int: return int(self._constant_data[MemoryEnum.DATA_TYPE.value]) diff --git a/torch_npu/profiler/analysis/prof_common_func/_constant.py b/torch_npu/profiler/analysis/prof_common_func/_constant.py index 10a20733d9..46105f7e43 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_constant.py +++ b/torch_npu/profiler/analysis/prof_common_func/_constant.py @@ -81,6 +81,7 @@ class Constant(object): GE = "GE" APP = "APP" PTA_GE = "PTA+GE" + WORKSPACE = "WORKSPACE" B_TO_KB = 1024.0 KB_TO_MB = 1024.0 B_TO_MB = 1024.0 ** 2 @@ -93,6 +94,8 @@ class Constant(object): MEMORY_MALLOC = 0 MEMORY_FREE = 1 MEMORY_BLOCK_FREE = 2 + CACHING_TYPE = 0 + WORKSPACE_TYPE = 1 # profiler config CONFIG = "config" diff --git 
a/torch_npu/profiler/analysis/prof_config/_fwk_file_parser_config.py b/torch_npu/profiler/analysis/prof_config/_fwk_file_parser_config.py index a4edff67c6..714a620401 100644 --- a/torch_npu/profiler/analysis/prof_config/_fwk_file_parser_config.py +++ b/torch_npu/profiler/analysis/prof_config/_fwk_file_parser_config.py @@ -25,7 +25,7 @@ class FwkFileParserConfig: FILE_BEAN_MAP = { FileTag.TORCH_OP: {"bean": TorchOpBean, "is_tlv": True, "struct_size": 58}, FileTag.OP_MARK: {"bean": OpMarkBean, "is_tlv": True, "struct_size": 40}, - FileTag.MEMORY: {"bean": MemoryUseBean, "is_tlv": True, "struct_size": 76}, + FileTag.MEMORY: {"bean": MemoryUseBean, "is_tlv": True, "struct_size": 77}, FileTag.GC_RECORD: {"bean": GCRecordBean, "is_tlv": False, "struct_size": 24}, FileTag.PYTHON_TRACER_FUNC: {"bean": PythonTracerFuncBean, "is_tlv": False, "struct_size": 33}, FileTag.PYTHON_TRACER_HASH: {"bean": PythonTracerHashBean, "is_tlv": True, "struct_size": 8}, diff --git a/torch_npu/profiler/analysis/prof_view/_memory_prepare_parser.py b/torch_npu/profiler/analysis/prof_view/_memory_prepare_parser.py index 08d8560f4d..407fc0ea71 100644 --- a/torch_npu/profiler/analysis/prof_view/_memory_prepare_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_memory_prepare_parser.py @@ -210,13 +210,15 @@ class MemoryPrepareParser(BaseParser): else: op_name = self._find_real_op_name_of_record(dequeue_record, torch_ops) if records_len == 1: - self._incomplete_num += 2 + if hasattr(records[0], 'component_type') and records[0].component_type == Constant.CACHING_TYPE: + self._incomplete_num += 2 combine_data = [op_name, records[0].alloc_size, convert_ns2us_str(records[0].time_ns, "\t"), None, None, None, None, records[0].total_allocated, records[0].total_reserved, records[0].total_active, None, None, None, records[0].stream_ptr, records[0].device_tag] elif records_len == 2: - self._incomplete_num += 1 + if hasattr(records[0], 'component_type') and records[0].component_type == Constant.CACHING_TYPE: + self._incomplete_num += 1 active_release_time = convert_ns2us_str(records[1].time_ns, "\t") if records[1].data_type == Constant.MEMORY_BLOCK_FREE else None release_time = convert_ns2us_str(records[1].time_ns, "\t") if records[1].data_type == Constant.MEMORY_FREE else None duration_time = convert_ns2us_str(records[1].time_ns - records[0].time_ns, "\t") if records[1].data_type == Constant.MEMORY_FREE else None @@ -253,13 +255,15 @@ class MemoryPrepareParser(BaseParser): else: op_name = self._find_real_op_name_of_record(dequeue_record, torch_ops) if records_len == 1: - self._incomplete_num += 2 + if hasattr(records[0], 'component_type') and records[0].component_type == Constant.CACHING_TYPE: + self._incomplete_num += 2 combine_data = [op_name, records[0].alloc_size_for_db, records[0].time_ns, None, None, None, None, records[0].total_allocated_for_db, records[0].total_reserved_for_db, records[0].total_active_for_db, None, None, None, records[0].stream_ptr, records[0].device_index] elif records_len == 2: - self._incomplete_num += 1 + if hasattr(records[0], 'component_type') and records[0].component_type == Constant.CACHING_TYPE: + self._incomplete_num += 1 active_release_time = records[1].time_ns if records[1].data_type == Constant.MEMORY_BLOCK_FREE else None release_time = records[1].time_ns if records[1].data_type == Constant.MEMORY_FREE else None duration_time = records[1].time_ns - records[0].time_ns if records[1].data_type == Constant.MEMORY_FREE else None diff --git 
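The struct_size bump from 76 to 77 above follows directly from the extra component_type byte added to MemoryUseBean's CONSTANT_STRUCT. A standalone sanity check with Python's struct module (not part of the patch):

import struct

OLD_FMT = "<7q2b2B2Q"   # 7*8 + 2*1 + 2*1 + 2*8 = 76 bytes
NEW_FMT = "<7q2b3B2Q"   # one extra uint8 for component_type -> 77 bytes

assert struct.calcsize(OLD_FMT) == 76
assert struct.calcsize(NEW_FMT) == 77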
a/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py b/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py index fa834e543b..04ef7c0e90 100644 --- a/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py @@ -53,6 +53,11 @@ class MemoryViewParser(BaseParser): @staticmethod def _combine_record(last_record, cur_record): + if hasattr(cur_record, 'component_type') and cur_record.component_type == Constant.WORKSPACE_TYPE: + cur_record_list = [Constant.WORKSPACE, convert_ns2us_str(cur_record.time_ns, tail="\t"), + cur_record.total_allocated, cur_record.total_reserved, cur_record.total_active, + cur_record.stream_ptr, cur_record.device_tag] + return [cur_record_list] cur_record_list = cur_record.row if last_record: pta_ge_record_list = [Constant.PTA_GE, convert_ns2us_str(cur_record.time_ns, tail="\t"), -- Gitee From 661e4cda27cb194841154958d092155287e0fda2 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 4 Jun 2025 03:14:15 +0000 Subject: [PATCH 025/328] !21518 Update op_plugin commit id Merge pull request !21518 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index dd59e19fda..b76d9d7352 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit dd59e19fdacc101e4c0de700a5a7d710388f3d79 +Subproject commit b76d9d735208c77e25651dd02e3bf8baa1bfe7d8 -- Gitee From b2d6faf7614a6c4e3d82128b31d015056bbb067a Mon Sep 17 00:00:00 2001 From: zyb <12441311+zyb230@user.noreply.gitee.com> Date: Wed, 4 Jun 2025 07:03:02 +0000 Subject: [PATCH 026/328] =?UTF-8?q?!21531=20Pytorch=E6=94=AF=E6=8C=81?= =?UTF-8?q?=E5=8D=95host=E5=A4=9Adevice=E6=80=A7=E8=83=BD=E5=88=86?= =?UTF-8?q?=E6=9E=90=20Merge=20pull=20request=20!21531=20from=20zyb/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../analysis/prof_bean/_common_bean.py | 5 - .../analysis/prof_bean/_event_bean.py | 1 + .../prof_bean/_npu_module_mem_bean.py | 2 +- .../analysis/prof_common_func/_constant.py | 1 + .../analysis/prof_parse/_fwk_file_parser.py | 33 +++-- .../prof_view/_trace_step_time_parser.py | 86 +++++++---- .../_trace_step_time_db_parser.py | 136 ++++++++++++------ 7 files changed, 172 insertions(+), 92 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_bean/_common_bean.py b/torch_npu/profiler/analysis/prof_bean/_common_bean.py index 8cb4c620d1..2aa61459c7 100644 --- a/torch_npu/profiler/analysis/prof_bean/_common_bean.py +++ b/torch_npu/profiler/analysis/prof_bean/_common_bean.py @@ -13,14 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-FILTER_COL_LIST = ["Device_id"] - __all__ = [] class CommonBean: def __init__(self, data: dict): - for col in FILTER_COL_LIST: - if col in data: - data.pop(col) self._data = data diff --git a/torch_npu/profiler/analysis/prof_bean/_event_bean.py b/torch_npu/profiler/analysis/prof_bean/_event_bean.py index b03e7a58c4..d97cae4acb 100644 --- a/torch_npu/profiler/analysis/prof_bean/_event_bean.py +++ b/torch_npu/profiler/analysis/prof_bean/_event_bean.py @@ -10,6 +10,7 @@ class EventBean: def __init__(self, data: dict): self._origin_data = data + self.device_id = -1 @property def ts(self) -> int: diff --git a/torch_npu/profiler/analysis/prof_bean/_npu_module_mem_bean.py b/torch_npu/profiler/analysis/prof_bean/_npu_module_mem_bean.py index 872b23e26f..3183cd737c 100644 --- a/torch_npu/profiler/analysis/prof_bean/_npu_module_mem_bean.py +++ b/torch_npu/profiler/analysis/prof_bean/_npu_module_mem_bean.py @@ -5,7 +5,7 @@ __all__ = [] class NpuModuleMemoryBean(CommonBean): - SHOW_HEADERS = ["Component", "Timestamp(us)", "Total Reserved(MB)", "Device"] + SHOW_HEADERS = ["Device_id", "Component", "Timestamp(us)", "Total Reserved(MB)", "Device"] def __init__(self, data: dict): super().__init__(data) diff --git a/torch_npu/profiler/analysis/prof_common_func/_constant.py b/torch_npu/profiler/analysis/prof_common_func/_constant.py index 46105f7e43..56809c9b7f 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_constant.py +++ b/torch_npu/profiler/analysis/prof_common_func/_constant.py @@ -443,6 +443,7 @@ class TableColumnsManager(): ("type", Constant.SQL_TEXT_TYPE) ], DbConstant.TABLE_STEP_TRACE_TIME : [ + ("deviceId", Constant.SQL_INTEGER_TYPE), ("step", Constant.SQL_TEXT_TYPE), ("computing", Constant.SQL_NUMERIC_TYPE), ("communication_not_overlapped", Constant.SQL_NUMERIC_TYPE), diff --git a/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py b/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py index b15883fd04..aa00324c97 100644 --- a/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py @@ -162,23 +162,28 @@ class FwkFileParser: len(torch_op_data) + len(enqueue_data_list) * 2 + len(dequeue_data_list) * 2) index = 0 fwd_dict = {} + correlation_id_name_dict = {} for torch_op in torch_op_data: self.filter_fwd_bwd_event(fwd_dict, torch_op) tid_dict[torch_op.tid] = False fwk_x_event_list[index] = TraceEventManager.create_x_event(torch_op, "cpu_op") index += 1 - for enqueue_data in enqueue_data_list: - tid_dict[enqueue_data.tid] = False - fwk_x_event_list[index] = TraceEventManager.create_x_event(enqueue_data, "enqueue") - index += 1 - fwk_x_event_list[index] = TraceEventManager.create_task_queue_flow(Constant.FLOW_START_PH, enqueue_data) - index += 1 for dequeue_data in dequeue_data_list: tid_dict[dequeue_data.tid] = True fwk_x_event_list[index] = TraceEventManager.create_x_event(dequeue_data, "dequeue") index += 1 fwk_x_event_list[index] = TraceEventManager.create_task_queue_flow(Constant.FLOW_END_PH, dequeue_data) index += 1 + correlation_id_name_dict[dequeue_data.corr_id] = dequeue_data.origin_name + for enqueue_data in enqueue_data_list: + tid_dict[enqueue_data.tid] = False + fwk_x_event_list[index] = TraceEventManager.create_x_event(enqueue_data, "enqueue") + if enqueue_data.corr_id in correlation_id_name_dict: + # append correlation name with '@' prefix for consistent with Dequeue + fwk_x_event_list[index]['name'] += f"@{correlation_id_name_dict[enqueue_data.corr_id]}" + index += 1 + fwk_x_event_list[index] = 
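The enqueue/dequeue reordering above exists so that Dequeue origin names are collected first; each Enqueue event with a matching correlation id then gets the operator name appended after an '@'. A small sketch with invented ids and names:

# Dequeue events are walked first, building the correlation id -> origin op name map
correlation_id_name_dict = {101: "aclnnAdd"}

enqueue_name = "Enqueue"
corr_id = 101
if corr_id in correlation_id_name_dict:
    # '@' separator keeps the naming consistent with the Dequeue side
    enqueue_name += f"@{correlation_id_name_dict[corr_id]}"

print(enqueue_name)  # Enqueue@aclnnAdd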
TraceEventManager.create_task_queue_flow(Constant.FLOW_START_PH, enqueue_data) + index += 1 other_event_list = TraceEventManager.create_m_event(pid, tid_dict) other_event_list.extend(TraceEventManager.create_fwd_flow(fwd_dict)) fwk_x_event_list.extend(other_event_list) @@ -268,15 +273,21 @@ class FwkFileParser: task_enqueues = [] task_dequeues = [] enqueue_data_list, dequeue_data_list = self.get_task_queue_data() - for enqueue_data in enqueue_data_list: - task_enqueues.append( - [enqueue_data.ts, enqueue_data.ts + enqueue_data.dur, contact_2num(pid, enqueue_data.tid), - enqueue_data.corr_id, enqueue_data.name]) - connection_ids.append(enqueue_data.corr_id) + correlation_id_name_dict = {} for dequeue_data in dequeue_data_list: task_dequeues.append( [dequeue_data.ts, dequeue_data.ts + dequeue_data.dur, contact_2num(pid, dequeue_data.tid), dequeue_data.corr_id, dequeue_data.name]) + correlation_id_name_dict[dequeue_data.corr_id] = dequeue_data.origin_name + for enqueue_data in enqueue_data_list: + name = enqueue_data.name + if enqueue_data.corr_id in correlation_id_name_dict: + # append correlation name with '@' prefix for consistent with Dequeue + name += f"@{correlation_id_name_dict[enqueue_data.corr_id]}" + task_enqueues.append( + [enqueue_data.ts, enqueue_data.ts + enqueue_data.dur, contact_2num(pid, enqueue_data.tid), + enqueue_data.corr_id, name]) + connection_ids.append(enqueue_data.corr_id) start_connection_id = max(connection_ids) + 1 if connection_ids else 0 self.update_fwd_bwd_connection_id(fwd_bwd_dict, torch_op_apis, start_connection_id) diff --git a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py index 4eb7a1488f..13c3c73014 100644 --- a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py @@ -1,3 +1,5 @@ +import copy +from collections import defaultdict from enum import Enum from ._base_parser import BaseParser from ..prof_common_func._constant import Constant @@ -11,6 +13,24 @@ from ..prof_parse._fwk_file_parser import FwkFileParser __all__ = [] +def default_time(): + return { + 'compute': 0, + 'comunNotOverlp': 0, + 'Overlp': 0, + 'comun': 0, + 'free': 0, + 'stage': 0, + 'bubble': 0, + 'comunNotOverlpRec': 0, + 'prepare': 0 + } + + +def step_time_dict(): + return defaultdict(default_time) + + class _StepInfoIndex(Enum): ID = 0 START_TS = 1 @@ -25,8 +45,8 @@ class TraceStepTimeParser(BaseParser): STEP_TRACE = "step_trace_time.csv" timeflag = {'Communication': 'comun', 'Computing': 'compute', 'Free': 'free', 'Communication(Not Overlapped)': 'comunNotOverlp', 'hcom_receive': 'bubble'} - title = ['Step', 'Computing', 'Communication(Not Overlapped)', 'Overlapped', 'Communication', 'Free', 'Stage', - 'Bubble', 'Communication(Not Overlapped and Exclude Receive)', 'Preparing'] + title = ['Device_id', 'Step', 'Computing', 'Communication(Not Overlapped)', 'Overlapped', 'Communication', + 'Free', 'Stage', 'Bubble', 'Communication(Not Overlapped and Exclude Receive)', 'Preparing'] def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) @@ -43,18 +63,21 @@ class TraceStepTimeParser(BaseParser): return False @classmethod - def count_time(cls, add_type, start_time, duration, step_list, save_time): + def count_time(cls, add_type, data, step_list, save_time, pid_device_map): + start_time = data.get('ts', 0) + duration = data.get('dur', 0) + device_id = pid_device_map[data['pid']] cur_step = None if 
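default_time and step_time_dict above let save_time become a two-level defaultdict keyed by device and then by step, so any (device, step) pair can be accumulated into without pre-registration. A minimal standalone sketch of that pattern (metric names abbreviated):

from collections import defaultdict

def default_time():
    return {'compute': 0, 'comun': 0, 'free': 0, 'bubble': 0}

def step_time_dict():
    return defaultdict(default_time)

save_time = defaultdict(step_time_dict)
save_time[0]['1']['compute'] += 12.5   # device 0, step "1"
save_time[1][None]['comun'] += 3.0     # device 1, no step id captured

print(save_time[0]['1']['compute'])    # 12.5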
not cls.is_float_num(start_time) or not cls.is_float_num(duration): print('Ts or dur format error!') return start_time = float(start_time) duration = float(duration) - for step in step_list: + for step in step_list.get(device_id, []): if step[_StepInfoIndex.START_TS.value] <= start_time < step[_StepInfoIndex.END_TS.value]: cur_step = step[_StepInfoIndex.ID.value] break - for step in step_list: + for step in step_list.get(device_id, []): if cur_step == step[_StepInfoIndex.ID.value]: if start_time < step[_StepInfoIndex.E2E_START_TS.value] or \ step[_StepInfoIndex.E2E_START_TS.value] == -1: @@ -67,10 +90,7 @@ class TraceStepTimeParser(BaseParser): step[_StepInfoIndex.FIRST_TASK_TS.value] == -1: step[_StepInfoIndex.FIRST_TASK_TS.value] = start_time break - for cur_save in save_time: - if cur_save.get('step') == cur_step: - cur_save[cls.timeflag.get(add_type)] += duration - break + save_time[device_id][cur_step][cls.timeflag.get(add_type)] += duration @classmethod def get_e2e_time(cls, step, step_list): @@ -91,43 +111,51 @@ class TraceStepTimeParser(BaseParser): def create_step_file(self, output_path: str, json_str: list, file_name: str) -> None: step_list = [] - save_time = [] + save_time = defaultdict(step_time_dict) if not json_str: return - # get step time + # obtain the mapping between pid and device_id(rank_id) + pid_device_map = {} + for data in json_str: + if data.get('name') == 'process_labels' and data.get('args', {}).get('labels', '').startswith('NPU'): + label = data['args']['labels'] + pid_device_map[data.get('pid')] = -1 if label == 'NPU' else int(label.split(' ')[1]) # "labels": "NPU 0" + # get initial step time for cur_step in self.step_range: step_list.append( [cur_step.get(Constant.STEP_ID), convert_ns2us_float(cur_step.get(Constant.START_TS)), convert_ns2us_float(cur_step.get(Constant.END_TS)), -1, -1, convert_ns2us_float(cur_step.get(Constant.FWK_START_TS)), -1]) - save_time.append( - {'step': cur_step.get(Constant.STEP_ID), 'compute': 0, 'comunNotOverlp': 0, 'Overlp': 0, 'comun': 0, - 'free': 0, 'stage': 0, 'bubble': 0, 'comunNotOverlpRec': 0, 'prepare': 0}) if not self.step_range: - save_time.append( - {'step': None, 'compute': 0, 'comunNotOverlp': 0, 'Overlp': 0, 'comun': 0, 'free': 0, 'stage': 0, - 'bubble': 0, 'comunNotOverlpRec': 0, 'prepare': 0}) step_list.append([None, -1, -1, -1, -1, -1, -1]) - + # every device should have its own step_list + step_dict = {} + for device in set(pid_device_map.values()): + step_dict[device] = copy.deepcopy(step_list) has_analysis_data_flag = False + bubble_data = [] + # traverse json and calculate time for data in json_str: if data.get('name') in {'Communication', 'Computing', 'Free', 'Communication(Not Overlapped)'}: - self.count_time(data.get('name'), data.get('ts', 0), data.get('dur', 0), step_list, save_time) + self.count_time(data.get('name'), data, step_dict, save_time, pid_device_map) has_analysis_data_flag = True elif str(data.get('name')).startswith('hcom_receive'): - self.count_time('hcom_receive', data.get('ts', 0), data.get('dur', 0), step_list, save_time) + bubble_data.append(data) + self.count_time('hcom_receive', data, step_dict, save_time, pid_device_map) if not has_analysis_data_flag: return - for calc_time in save_time: - calc_time['comunNotOverlpRec'] = calc_time['comunNotOverlp'] - calc_time['bubble'] - calc_time['Overlp'] = calc_time['comun'] - calc_time['comunNotOverlp'] - calc_time['stage'] = self.get_e2e_time(calc_time['step'], step_list) - calc_time['bubble'] - calc_time['prepare'] = 
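The pid-to-device mapping above is recovered from the chrome-trace 'process_labels' metadata events, where an NPU process is labelled either 'NPU' (device unknown, mapped to -1) or 'NPU <rank>'. A standalone sketch with invented pids:

trace_events = [
    {"name": "process_labels", "pid": 1234, "args": {"labels": "NPU 0"}},
    {"name": "process_labels", "pid": 5678, "args": {"labels": "NPU 1"}},
    {"name": "process_labels", "pid": 9999, "args": {"labels": "CANN"}},  # ignored
]

pid_device_map = {}
for event in trace_events:
    label = event.get("args", {}).get("labels", "")
    if event.get("name") == "process_labels" and label.startswith("NPU"):
        pid_device_map[event["pid"]] = -1 if label == "NPU" else int(label.split(" ")[1])

print(pid_device_map)  # {1234: 0, 5678: 1}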
self.get_prepare_time(calc_time['step'], step_list) print_time = [] - for step in save_time: - print_time.append( - [step['step'], step['compute'], step['comunNotOverlp'], step['Overlp'], step['comun'], step['free'], - step['stage'], step['bubble'], step['comunNotOverlpRec'], step['prepare']]) + for device, device_time in save_time.items(): + for step, step_time in device_time.items(): + step_time['comunNotOverlpRec'] = step_time['comunNotOverlp'] - step_time['bubble'] + step_time['Overlp'] = step_time['comun'] - step_time['comunNotOverlp'] + step_time['stage'] = self.get_e2e_time(step, step_dict.get(device, [])) - step_time['bubble'] + step_time['prepare'] = self.get_prepare_time(step, step_dict.get(device, [])) + print_time.append( + [device, step, step_time['compute'], step_time['comunNotOverlp'], step_time['Overlp'], + step_time['comun'], step_time['free'], step_time['stage'], step_time['bubble'], + step_time['comunNotOverlpRec'], step_time['prepare']]) + print_time.sort(key=lambda x: (x[0], x[1])) FileManager.create_csv_file(output_path, print_time, file_name, self.title) def run(self, deps_data: dict): diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py index 97a164b73d..db82064fde 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from collections import defaultdict from enum import Enum from .._base_parser import BaseParser from ...prof_common_func._constant import Constant, print_warn_msg @@ -25,10 +26,11 @@ from ...prof_parse._fwk_file_parser import FwkFileParser __all__ = [] -class CommunicationOpIndex(Enum): +class OpIndex(Enum): OP_NAME = 0 START_NS = 1 END_NS = 2 + DEVICE_ID = 3 class TraceStepTimeDbParser(BaseParser): @@ -36,9 +38,8 @@ class TraceStepTimeDbParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) self.step_range = [] - self.string_id_map = {} - self.compute_task_info = {} - self.communication_op_info = [] + self.compute_task_info = defaultdict(list) + self.communication_op_info = defaultdict(list) ProfilerLogger.init(self._profiler_path, "TraceStepTimeDbParser") self.logger = ProfilerLogger.get_instance() @@ -86,28 +87,32 @@ class TraceStepTimeDbParser(BaseParser): {'step': None, 'compute': 0, 'comunNotOverlp': 0, 'Overlp': 0, 'comun': 0, 'free': 0, 'stage': 0, 'bubble': 0, 'comunNotOverlpRec': 0, 'prepare': 0}) else: - # get step time - for cur_step in self.step_range: - save_info = { - 'step': cur_step.get(Constant.STEP_ID), 'compute': 0, 'comunNotOverlp': 0, 'Overlp': 0, - 'comun': 0, 'free': 0, 'stage': 0, 'bubble': 0, 'comunNotOverlpRec': 0, 'prepare': 0 - } - origin_compute_data = self._get_compute_data_in_step(cur_step) - origin_communication_data, bubble_data = self._get_communication_data_in_step(cur_step) - compute_data = RangeCaculator.merge_continuous_intervals(origin_compute_data) - save_info['compute'] = sum(data.end_ts - data.start_ts for data in compute_data) - communication_data = RangeCaculator.merge_continuous_intervals(origin_communication_data) - save_info['comun'] = sum(data.end_ts - data.start_ts for data in communication_data) - 
pure_communication_data, free_data = \ - RangeCaculator.compute_pipeline_overlap(communication_data, compute_data) - save_info['comunNotOverlp'] = \ - sum(data.end_ts - data.start_ts for data in pure_communication_data) - save_info['free'] = sum(data.end_ts - data.start_ts for data in free_data) - save_info['bubble'] = sum(data.end_ts - data.start_ts for data in bubble_data) - save_info['stage'] = self.get_e2e_time(compute_data + communication_data) - save_info['bubble'] - first_task_start_ts = self._get_first_device_task_ts(compute_data, communication_data) - save_info['prepare'] = self.get_prepare_time(first_task_start_ts, cur_step) - save_time.append(save_info) + device_ids = list(set(self.compute_task_info.keys()) | set(self.communication_op_info.keys())) + device_ids.sort() + for device_id in device_ids: + # get step time + for cur_step in self.step_range: + save_info = { + 'step': cur_step.get(Constant.STEP_ID), 'compute': 0, 'comunNotOverlp': 0, 'Overlp': 0, + 'comun': 0, 'free': 0, 'stage': 0, 'bubble': 0, 'comunNotOverlpRec': 0, 'prepare': 0, + 'deviceId': device_id + } + origin_compute_data = self._get_compute_data_in_step(cur_step, device_id) + origin_communication_data, bubble_data = self._get_communication_data_in_step(cur_step, device_id) + compute_data = RangeCaculator.merge_continuous_intervals(origin_compute_data) + save_info['compute'] = sum(data.end_ts - data.start_ts for data in compute_data) + communication_data = RangeCaculator.merge_continuous_intervals(origin_communication_data) + save_info['comun'] = sum(data.end_ts - data.start_ts for data in communication_data) + pure_communication_data, free_data = \ + RangeCaculator.compute_pipeline_overlap(communication_data, compute_data) + save_info['comunNotOverlp'] = \ + sum(data.end_ts - data.start_ts for data in pure_communication_data) + save_info['free'] = sum(data.end_ts - data.start_ts for data in free_data) + save_info['bubble'] = sum(data.end_ts - data.start_ts for data in bubble_data) + save_info['stage'] = self.get_e2e_time(compute_data + communication_data) - save_info['bubble'] + first_task_start_ts = self._get_first_device_task_ts(compute_data, communication_data) + save_info['prepare'] = self.get_prepare_time(first_task_start_ts, cur_step) + save_time.append(save_info) for calc_time in save_time: calc_time['comunNotOverlpRec'] = calc_time['comunNotOverlp'] - calc_time['bubble'] @@ -116,7 +121,8 @@ class TraceStepTimeDbParser(BaseParser): for step in save_time: step_time_data = [step['compute'], step['comunNotOverlp'], step['Overlp'], step['comun'], step['free'], step['stage'], step['bubble'], step['comunNotOverlpRec'], step['prepare']] - reformat_time.append([step['step'], ] + [convert_ns2us_float(data) for data in step_time_data]) + reformat_time.append([step['deviceId'], step['step']] + \ + [convert_ns2us_float(data) for data in step_time_data]) self.save_step_trace_db_data(reformat_time) def _init_step_range(self, deps_data: dict): @@ -126,37 +132,75 @@ class TraceStepTimeDbParser(BaseParser): if not TorchDb().create_connect_db(): print_warn_msg(f"Failed to connect to db file: {TorchDb().get_db_path()}") return - if TorchDb().judge_table_exist(DbConstant.TABLE_STRING_IDS): - sql = "select id, value from {}".format(DbConstant.TABLE_STRING_IDS) - string_id_data = TorchDb().fetch_all_data(sql) - self.string_id_map = {data[0]: data[1] for data in string_id_data} + if not TorchDb().judge_table_exist(DbConstant.TABLE_STRING_IDS): + self.logger.error(f"{DbConstant.TABLE_STRING_IDS} does not exist.") + return if 
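The device attribution that the reworked _init_step_range below performs in SQL can be summarized in plain Python: a communication op is assigned to a device only when every TASK row sharing its connectionId ran on that single device (the HAVING COUNT(DISTINCT deviceId) = 1 clause). A pure-Python sketch of the same filter with invented rows:

from collections import defaultdict

task_rows = [            # (connectionId, deviceId)
    (1, 0), (1, 0),      # connection 1: all tasks on device 0 -> keep
    (2, 0), (2, 1),      # connection 2: spans two devices     -> drop
]

devices_by_conn = defaultdict(set)
for conn_id, device_id in task_rows:
    devices_by_conn[conn_id].add(device_id)

conn_to_device = {c: next(iter(d)) for c, d in devices_by_conn.items() if len(d) == 1}
print(conn_to_device)    # {1: 0}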
TorchDb().judge_table_exist(DbConstant.TABLE_COMPUTE_TASK_INFO): - sql = "select name, globalTaskId from {}".format(DbConstant.TABLE_COMPUTE_TASK_INFO) + sql = """ + SELECT + STRING_IDS.value, + task.startNs, + task.endNs, + task.deviceId + FROM COMPUTE_TASK_INFO AS comp + JOIN TASK AS task + ON comp.globalTaskId = task.globalTaskId + JOIN STRING_IDS + ON comp.name = STRING_IDS.id + """ compute_task_data = TorchDb().fetch_all_data(sql) - self.compute_task_info = {data[1]: data[0] for data in compute_task_data} + for item in compute_task_data: + self.compute_task_info[item[OpIndex.DEVICE_ID.value]].append(item) if TorchDb().judge_table_exist(DbConstant.TABLE_COMMUNICATION_OP): - sql = "select opName, startNs, endNs from {}".format(DbConstant.TABLE_COMMUNICATION_OP) - self.communication_op_info = TorchDb().fetch_all_data(sql) - - def _get_compute_data_in_step(self, step_info): + sql = """ + WITH comm_info AS ( + SELECT (SELECT value FROM STRING_IDS WHERE id = c.opName) AS opName, + startNs, + endNs, + connectionId + FROM COMMUNICATION_OP c + ) + SELECT + comm.opName, + comm.startNs, + comm.endNs, + t.deviceId + FROM comm_info comm + JOIN ( + SELECT + connectionId, + deviceId + FROM TASK + GROUP BY connectionId + HAVING COUNT(DISTINCT deviceId) = 1 + ) t + ON comm.connectionId = t.connectionId + """ + communication_op_data = TorchDb().fetch_all_data(sql) + for item in communication_op_data: + self.communication_op_info[item[OpIndex.DEVICE_ID.value]].append(item) + + def _get_compute_data_in_step(self, step_info, device_id): compute_data = [] - for task_id, task_info in step_info.get(Constant.TASK_INFO, {}).items(): - if task_id in self.compute_task_info: - compute_data.append( - RangeCaculator.generate_time_range(task_info.get("startNs"), task_info.get("endNs"))) + for op_info in self.compute_task_info[device_id]: + op_start_time = op_info[OpIndex.START_NS.value] + if not (step_info.get(Constant.START_TS) <= op_start_time < step_info.get(Constant.END_TS)): + continue + time_range = RangeCaculator.generate_time_range(op_start_time, op_info[OpIndex.END_NS.value]) + compute_data.append(time_range) return compute_data - def _get_communication_data_in_step(self, step_info): + def _get_communication_data_in_step(self, step_info, device_id): communication_data = [] bubble_data = [] - for op_info in self.communication_op_info: - op_start_time = op_info[CommunicationOpIndex.START_NS.value] + for op_info in self.communication_op_info[device_id]: + op_start_time = op_info[OpIndex.START_NS.value] if not (step_info.get(Constant.START_TS) <= op_start_time < step_info.get(Constant.END_TS)): continue time_range = RangeCaculator.generate_time_range( - op_start_time, op_info[CommunicationOpIndex.END_NS.value], class_range=CommunicationTimeRange) + op_start_time, op_info[OpIndex.END_NS.value], class_range=CommunicationTimeRange) communication_data.append(time_range) - op_name = self.string_id_map.get(op_info[CommunicationOpIndex.OP_NAME.value], '') + op_name = op_info[OpIndex.OP_NAME.value] if op_name.startswith('hcom_receive'): bubble_data.append(time_range) return communication_data, bubble_data -- Gitee From cccb9aae2bcd71b2d3ddc8e5885be05be10c6f15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A9=B9=E6=98=8A?= Date: Wed, 4 Jun 2025 08:10:06 +0000 Subject: [PATCH 027/328] =?UTF-8?q?!21456=20add=20pp=20ut=20Merge=20pull?= =?UTF-8?q?=20request=20!21456=20from=20=E8=A9=B9=E6=98=8A/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
test/distributed/pipelining/__init__.py | 0 .../artifacts/zb1p_2rank_2stagep_comms.csv | 2 + .../artifacts/zb1p_2rank_2stagep_compute.csv | 2 + test/distributed/pipelining/model_registry.py | 230 ++++ .../pipelining/schedule_registry.py | 230 ++++ test/distributed/pipelining/test_backward.py | 187 ++++ .../distributed/pipelining/test_microbatch.py | 91 ++ test/distributed/pipelining/test_pipe.py | 123 +++ test/distributed/pipelining/test_schedule.py | 993 ++++++++++++++++++ .../pipelining/test_schedule_multiproc.py | 958 +++++++++++++++++ test/distributed/pipelining/test_stage.py | 342 ++++++ .../pipelining/test_transformer.py | 75 ++ test/distributed/pipelining/test_unflatten.py | 75 ++ 13 files changed, 3308 insertions(+) create mode 100644 test/distributed/pipelining/__init__.py create mode 100644 test/distributed/pipelining/artifacts/zb1p_2rank_2stagep_comms.csv create mode 100644 test/distributed/pipelining/artifacts/zb1p_2rank_2stagep_compute.csv create mode 100644 test/distributed/pipelining/model_registry.py create mode 100644 test/distributed/pipelining/schedule_registry.py create mode 100644 test/distributed/pipelining/test_backward.py create mode 100644 test/distributed/pipelining/test_microbatch.py create mode 100644 test/distributed/pipelining/test_pipe.py create mode 100644 test/distributed/pipelining/test_schedule.py create mode 100644 test/distributed/pipelining/test_schedule_multiproc.py create mode 100644 test/distributed/pipelining/test_stage.py create mode 100644 test/distributed/pipelining/test_transformer.py create mode 100644 test/distributed/pipelining/test_unflatten.py diff --git a/test/distributed/pipelining/__init__.py b/test/distributed/pipelining/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/distributed/pipelining/artifacts/zb1p_2rank_2stagep_comms.csv b/test/distributed/pipelining/artifacts/zb1p_2rank_2stagep_comms.csv new file mode 100644 index 0000000000..bdce682a03 --- /dev/null +++ b/test/distributed/pipelining/artifacts/zb1p_2rank_2stagep_comms.csv @@ -0,0 +1,2 @@ +0F0,0SEND_F0,2RECV_F0,0F1,0SEND_F1,2RECV_F1,2F0,2SEND_F0,2F1,2SEND_F1,2RECV_B0,2B0,2SEND_B0,0F2,0SEND_F2,2RECV_B1,2B1,2SEND_B1,0F3,0SEND_F3,2RECV_F2,0RECV_B0,0B0,2F2,2SEND_F2,2RECV_F3,0RECV_B1,0B1,2F3,2SEND_F3,2RECV_B2,2B2,2SEND_B2,0F4,0SEND_F4,2RECV_B3,2B3,2SEND_B3,0F5,0SEND_F5,2RECV_F4,0RECV_B2,0B2,2F4,2SEND_F4,2RECV_F5,0RECV_B3,0B3,2F5,2SEND_F5,2RECV_B4,2B4,2SEND_B4,0F6,0SEND_F6,2RECV_B5,2B5,2SEND_B5,0F7,0SEND_F7,2RECV_F6,0RECV_B4,0B4,2F6,2SEND_F6,2RECV_F7,0RECV_B5,0B5,2F7,2SEND_F7,2RECV_B6,2B6,2SEND_B6,2RECV_B7,2B7,2SEND_B7,0RECV_B6,0B6,0RECV_B7,0B7 +1RECV_F0,1F0,1SEND_F0,1RECV_F1,1F1,1SEND_F1,3RECV_F0,3F0,3RECV_F1,3I0,3SEND_B0,1RECV_B0,3F1,1RECV_F2,3I1,3SEND_B1,1RECV_B1,3W0,1RECV_F3,1F2,1SEND_F2,1I0,1SEND_B0,3W1,3RECV_F2,1F3,1SEND_F3,1I1,1SEND_B1,1W0,3RECV_F3,3F2,3I2,3SEND_B2,1RECV_B2,1W1,1RECV_F4,3F3,3I3,3SEND_B3,1RECV_B3,3W2,1RECV_F5,1F4,1SEND_F4,1I2,1SEND_B2,3W3,3RECV_F4,1F5,1SEND_F5,1I3,1SEND_B3,1W2,3RECV_F5,3F4,3I4,3SEND_B4,1RECV_B4,1W3,1RECV_F6,3F5,3I5,3SEND_B5,1RECV_B5,3W4,1RECV_F7,1F6,1SEND_F6,1I4,1SEND_B4,3W5,3RECV_F6,1F7,1SEND_F7,1I5,1SEND_B5,1W4,3RECV_F7,3F6,3I6,3SEND_B6,1RECV_B6,1W5,3F7,3I7,3SEND_B7,1RECV_B7,3W6,1I6,1SEND_B6,3W7,1I7,1SEND_B7,1W6,1W7 diff --git a/test/distributed/pipelining/artifacts/zb1p_2rank_2stagep_compute.csv b/test/distributed/pipelining/artifacts/zb1p_2rank_2stagep_compute.csv new file mode 100644 index 0000000000..86630f1e1e --- /dev/null +++ b/test/distributed/pipelining/artifacts/zb1p_2rank_2stagep_compute.csv @@ -0,0 
+1,2 @@ +0F0,0F1,2F0,,2F1,2I0,2W0,0F2,2I1,2W1,0F3,0I0,0W0,2F2,0I1,0W1,2F3,2I2,2W2,0F4,2I3,2W3,0F5,0I2,0W2,2F4,0I3,0W3,2F5,2I4,2W4,0F6,2I5,2W5,0F7,0I4,0W4,2F6,0I5,0W5,2F7,2I6,2W6,2I7,2W7,0I6,0W6,0I7,0W7 +,1F0,1F1,3F0,3I0,3F1,3I1,3W0,1F2,1I0,3W1,1F3,1I1,1W0,3F2,3I2,1W1,3F3,3I3,3W2,1F4,1I2,3W3,1F5,1I3,1W2,3F4,3I4,1W3,3F5,3I5,3W4,1F6,1I4,3W5,1F7,1I5,1W4,3F6,3I6,1W5,3F7,3I7,3W6,1I6,3W7,1I7,1W6,1W7 diff --git a/test/distributed/pipelining/model_registry.py b/test/distributed/pipelining/model_registry.py new file mode 100644 index 0000000000..c2a1f3d41c --- /dev/null +++ b/test/distributed/pipelining/model_registry.py @@ -0,0 +1,230 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# Owner(s): ["oncall: distributed"] +# This file is a model zoo for testing torch.distributed.pipelining. +import torch +from torch.autograd import Function +from torch.distributed.pipelining import pipe_split, SplitPoint + + +class ExampleCode(torch.nn.Module): + def __init__(self, d_hid): + super().__init__() + self.mm_param0 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.mm_param1 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.cval = torch.nn.Buffer(torch.randn((d_hid,), requires_grad=False)) + self.lin0 = torch.nn.Linear(d_hid, d_hid) + self.lin1 = torch.nn.Linear(d_hid, d_hid) + + def forward(self, x): + x = torch.mm(x, self.mm_param0) + x = torch.relu(x) + # try passing a value that doesn't require_grad across skip boundaries + a_constant = self.cval.clone() + x = self.lin0(x) + pipe_split() + x = torch.relu(x) + a_constant + x = torch.mm(x, self.mm_param1) + x = self.lin1(x) + x = torch.relu(x) + return x + + +class ModelWithKwargs(torch.nn.Module): + DEFAULT_DHID = 512 + DEFAULT_BATCH_SIZE = 256 + + def __init__(self, d_hid: int = DEFAULT_DHID): + super().__init__() + self.mm_param0 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.mm_param1 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.lin0 = torch.nn.Linear(d_hid, d_hid) + self.lin1 = torch.nn.Linear(d_hid, d_hid) + + def forward(self, x, y=torch.zeros(DEFAULT_BATCH_SIZE, DEFAULT_DHID)): + x = torch.mm(x, self.mm_param0) + x = x + y + x = self.lin0(x) + x = torch.relu(x) + pipe_split() + x = torch.mm(x, self.mm_param1) + x = self.lin1(x) + x = torch.relu(x) + return x + + +class ModelWithParamAlias(torch.nn.Module): + default_dhid = 512 + default_batch_size = 256 + + def __init__(self, d_hid: int = default_dhid): + super().__init__() + self.mm_param1 = self.mm_param0 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.lin1 = self.lin0 = torch.nn.Linear(d_hid, d_hid) + + def forward(self, x, y): + x = torch.mm(x, self.mm_param0) + x = x + y + x = self.lin0(x) + x = torch.relu(x) + pipe_split() + x = torch.mm(x, self.mm_param1) + x = self.lin1(x) + x = torch.relu(x) + return x + + +# MLP Layer +class MLPModule(torch.nn.Module): + def __init__(self, d_hid: int): + super().__init__() + self.net1 = torch.nn.Linear(d_hid, d_hid) + self.relu = torch.nn.ReLU() + self.net2 = torch.nn.Linear(d_hid, d_hid) + + def forward(self, x): + x = self.net1(x) + x = self.relu(x) + x = self.net2(x) + return x + + +# Multi-MLP model +class MultiMLP(torch.nn.Module): + def __init__(self, d_hid: int, n_layers: int = 2): + super().__init__() + self.layers = torch.nn.ModuleList([MLPModule(d_hid) for _ in range(n_layers)]) + # For testing purpose only, this should be defined by user + self.split_spec = {f"layers.{i}": SplitPoint.BEGINNING for i in range(1, n_layers)} + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + 
+ +class CustomLinearDx(Function): + @staticmethod + # pylint:disable=huawei-too-many-arguments + def forward(ctx, input_val, weight, bias, module, layer_idx): + ctx.save_for_backward(input_val, weight, bias) + ctx.module = module + ctx.layer_idx = layer_idx + return input_val.mm(weight.t()) + bias + + @staticmethod + def backward(ctx, grad_output): + input_val, weight, bias = ctx.saved_tensors + grad_input = grad_output.mm(weight) + ctx.module.cached_context[ctx.layer_idx].append(grad_output.clone()) + ctx.module.cached_context[str(ctx.layer_idx) + "_input"].append( + input_val.clone() + ) + return grad_input, None, None, None, None + + +class CustomLinearDxDw(Function): + @staticmethod + def forward(ctx, input_val, weight, bias): + ctx.save_for_backward(input_val, weight, bias) + return input_val.mm(weight.t()) + bias + + @staticmethod + def backward(ctx, grad_output): + input_val, weight, bias = ctx.saved_tensors + grad_input = grad_output.mm(weight) + grad_weight = grad_output.t().mm(input_val) + grad_bias = grad_output.sum(0) + return grad_input, grad_weight, grad_bias + + +class MLPModuleWithDw(torch.nn.Module): + def __init__(self, d_hid: int): + super().__init__() + self.fc1_weight = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.fc1_bias = torch.nn.Parameter(torch.randn(d_hid)) + self.fc2_weight = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.fc2_bias = torch.nn.Parameter(torch.randn(d_hid)) + + torch.nn.init.uniform_(self.fc1_weight, -0.01, 0.01) + torch.nn.init.uniform_(self.fc2_weight, -0.01, 0.01) + torch.nn.init.uniform_(self.fc1_bias, -0.01, 0.01) + torch.nn.init.uniform_(self.fc2_bias, -0.01, 0.01) + + self.cached_context = {} + self.cached_context["fc1"] = [] + self.cached_context["fc2"] = [] + self.cached_context["fc1_input"] = [] + self.cached_context["fc2_input"] = [] + + self.use_custom_logic = False + + def forward(self, x): + if not self.use_custom_logic: + self.hidden = CustomLinearDxDw.apply(x, self.fc1_weight, self.fc1_bias) + self.hidden = torch.nn.functional.relu(self.hidden) + output = CustomLinearDxDw.apply(self.hidden, self.fc2_weight, self.fc2_bias) + return output + + self.hidden = CustomLinearDx.apply( + x, self.fc1_weight, self.fc1_bias, self, "fc1" + ) + self.hidden = torch.nn.functional.relu(self.hidden) + output = CustomLinearDx.apply( + self.hidden, self.fc2_weight, self.fc2_bias, self, "fc2" + ) + return output + + def compute_dW(self): + grad_output_fc1 = self.cached_context["fc1"].pop(0) + grad_output_fc2 = self.cached_context["fc2"].pop(0) + cached_input_fc1 = self.cached_context["fc1_input"].pop(0) + cached_input_fc2 = self.cached_context["fc2_input"].pop(0) + + dW2 = grad_output_fc2.t().mm(cached_input_fc2) + db2 = grad_output_fc2.sum(0) + + dW1 = grad_output_fc1.t().mm(cached_input_fc1) + db1 = grad_output_fc1.sum(0) + + if self.fc1_weight.grad is not None: + self.fc1_weight.grad += dW1 + self.fc1_bias.grad += db1 + self.fc2_weight.grad += dW2 + self.fc2_bias.grad += db2 + else: + self.fc1_weight.grad = dW1 + self.fc1_bias.grad = db1 + self.fc2_weight.grad = dW2 + self.fc2_bias.grad = db2 + + def toggle(self): + self.use_custom_logic = not self.use_custom_logic + + +# Multi-MLP model With Dw +class MultiMLPWithDw(torch.nn.Module): + def __init__(self, d_hid: int, n_layers: int = 2): + super().__init__() + self.layers = torch.nn.ModuleList( + [MLPModuleWithDw(d_hid) for _ in range(n_layers)] + ) + # For testing purpose only, this should be defined by user + self.split_spec = {f"layers.{i}": SplitPoint.BEGINNING for i in range(1, 
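CustomLinearDx and CustomLinearDxDw above spell out the split that zero-bubble schedules rely on: the gradient w.r.t. the input can be produced (and passed to the previous stage) before the weight gradient is computed. The same decomposition for one linear layer y = x @ W.T + b, written with plain tensor ops and invented shapes:

import torch

x = torch.randn(4, 8)
W = torch.randn(8, 8)
b = torch.randn(8)
grad_out = torch.randn(4, 8)       # upstream gradient dL/dy

grad_input = grad_out @ W          # dL/dx -- needed right away by the previous stage
grad_weight = grad_out.t() @ x     # dL/dW -- can be deferred to a later "W" slot
grad_bias = grad_out.sum(0)        # dL/db

print(grad_input.shape, grad_weight.shape, grad_bias.shape)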
n_layers)} + self.use_custom_logic = False + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + + def toggle(self): + self.use_custom_logic = not self.use_custom_logic + for layer in self.layers: + layer.toggle() + + def compute_dW(self): + if not self.use_custom_logic: + raise RuntimeError("Need to call toggle() to enable custom backward and dW") + + for i in reversed(range(len(self.layers))): + self.layers[i].compute_dW() diff --git a/test/distributed/pipelining/schedule_registry.py b/test/distributed/pipelining/schedule_registry.py new file mode 100644 index 0000000000..854672bc77 --- /dev/null +++ b/test/distributed/pipelining/schedule_registry.py @@ -0,0 +1,230 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# Owner(s): ["oncall: distributed"] +# This file is a Schedule zoo for testing torch.distributed.pipelining. +# It includes schedules designed purely for testing purposes +from typing import Callable, Optional + +from torch.distributed.pipelining.schedules import ( + _Action, + _ComputationType, + _PipelineScheduleRuntime, + PipelineScheduleMulti, + RECV_B, + RECV_F, + SEND_B, + SEND_F, +) +from torch.distributed.pipelining.stage import _PipelineStageBase + + +F = _ComputationType.FORWARD +B = _ComputationType.FULL_BACKWARD +W = _ComputationType.BACKWARD_WEIGHT +INPUT = _ComputationType.BACKWARD_INPUT + + +class ScheduleVShaped(PipelineScheduleMulti): + n_stages = 4 + rank_stages = { + 0: [0, 3], + 1: [1, 2], + } + + def __init__( + self, + stages: list[_PipelineStageBase], + n_microbatches: int, + loss_fn: Optional[Callable] = None, + scale_grads: bool = True, + ): + super().__init__( + stages=stages, + n_microbatches=n_microbatches, + loss_fn=loss_fn, + scale_grads=scale_grads, + ) + + # Go through one microbatch + # Note(whc) - it might be easier to work with thes schedules by writing them as a list of + # ["0F0", ...] and then parsing them in the test infra to turn them into actions. 
+ self.pipeline_order = { + 0: [ + _Action(0, F, 0), + None, + None, + _Action(3, F, 0), + _Action(3, B, 0), + None, + None, + _Action(0, B, 0), + ], + 1: [ + None, + _Action(1, F, 0), + _Action(2, F, 0), + None, + None, + _Action(2, B, 0), + _Action(1, B, 0), + None, + ], + } + self._validate_and_set_stage_mapping(self.pipeline_order) + + +class ScheduleUnbalanced(PipelineScheduleMulti): + n_stages = 5 + rank_stages = { + 0: [0, 1, 4], + 1: [2, 3], + } + + def __init__( + self, + stages: list[_PipelineStageBase], + n_microbatches: int, + loss_fn: Optional[Callable] = None, + scale_grads: bool = True, + ): + super().__init__( + stages=stages, + n_microbatches=n_microbatches, + loss_fn=loss_fn, + scale_grads=scale_grads, + ) + + self.pipeline_order = { + 0: [ + _Action(0, F, 0), + _Action(1, F, 0), + None, + None, + _Action(4, F, 0), + _Action(4, B, 0), + None, + None, + _Action(1, B, 0), + _Action(0, B, 0), + ], + 1: [ + None, + None, + _Action(2, F, 0), + _Action(3, F, 0), + None, + None, + _Action(3, B, 0), + _Action(2, B, 0), + None, + None, + ], + } + self._validate_and_set_stage_mapping(self.pipeline_order) + + +class ScheduleWithW(PipelineScheduleMulti): + n_stages = 4 + num_microbatches = 2 + rank_stages = { + 0: [0, 2], + 1: [1, 3], + } + + def __init__( + self, + stages: list[_PipelineStageBase], + n_microbatches: int, + loss_fn: Optional[Callable] = None, + enable_zero_bubble: bool = True, + scale_grads: bool = True, + ): + super().__init__( + stages=stages, + n_microbatches=n_microbatches, + loss_fn=loss_fn, + scale_grads=scale_grads, + ) + + # Needs to be updated as part of all schedules using "W" + self.use_full_backward = False + + # Go through two microbatches + self.pipeline_order = { + 0: [ + _Action(0, F, 0), + _Action(0, F, 1), + _Action(2, F, 0), + _Action(2, F, 1), + None, + _Action(2, INPUT, 0), + _Action(2, W, 0), + _Action(0, INPUT, 0), + _Action(2, INPUT, 1), + _Action(0, W, 0), + _Action(0, INPUT, 1), + _Action(2, W, 1), + _Action(0, W, 1), + ], + 1: [ + None, + _Action(1, F, 0), + _Action(1, F, 1), + _Action(3, F, 0), + _Action(3, INPUT, 0), + _Action(3, F, 1), + _Action(1, INPUT, 0), + _Action(3, INPUT, 1), + _Action(3, W, 0), + _Action(1, INPUT, 1), + _Action(1, W, 0), + _Action(3, W, 1), + _Action(1, W, 1), + ], + } + self._validate_and_set_stage_mapping(self.pipeline_order) + + +class ScheduleWithReorderedB(_PipelineScheduleRuntime): + n_stages = 2 + num_microbatches = 2 + rank_stages = { + 0: [0], + 1: [1], + } + + def __init__( + self, + stages: list[_PipelineStageBase], + n_microbatches: int, + loss_fn: Optional[Callable] = None, + scale_grads: bool = True, + ): + super().__init__( + stages=stages, + n_microbatches=n_microbatches, + loss_fn=loss_fn, + scale_grads=scale_grads, + ) + # Go through two microbatches + self.pipeline_order_with_comms = { + 0: [ + _Action(0, F, 0), + _Action(0, F, 1), + _Action(0, SEND_F, 0), + _Action(0, SEND_F, 1), + _Action(0, RECV_B, 0), + _Action(0, RECV_B, 1), + _Action(0, B, 0), + _Action(0, B, 1), + ], + 1: [ + _Action(1, RECV_F, 0), + _Action(1, RECV_F, 1), + _Action(1, F, 0), + _Action(1, F, 1), + _Action(1, B, 0), + _Action(1, B, 1), + _Action(1, SEND_B, 0), + _Action(1, SEND_B, 1), + ], + } diff --git a/test/distributed/pipelining/test_backward.py b/test/distributed/pipelining/test_backward.py new file mode 100644 index 0000000000..f7015218a8 --- /dev/null +++ b/test/distributed/pipelining/test_backward.py @@ -0,0 +1,187 @@ +# Copyright (c) Meta Platforms, Inc. 
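For readers decoding the schedule tables above: each _Action is (stage_index, computation_type, microbatch_index) and None is an idle slot, so ScheduleVShaped's single-microbatch plan can be visualized with a plain-tuple stand-in:

F, B = "F", "B"
pipeline_order = {
    0: [(0, F, 0), None, None, (3, F, 0), (3, B, 0), None, None, (0, B, 0)],
    1: [None, (1, F, 0), (2, F, 0), None, None, (2, B, 0), (1, B, 0), None],
}
for rank, actions in pipeline_order.items():
    row = " ".join("..." if a is None else f"{a[0]}{a[1]}{a[2]}" for a in actions)
    print(f"rank {rank}: {row}")
# rank 0: 0F0 ... ... 3F0 3B0 ... ... 0B0
# rank 1: ... 1F0 2F0 ... ... 2B0 1B0 ...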
and affiliates +# Owner(s): ["oncall: distributed"] +import copy + +from model_registry import MLPModule + +import torch +from torch.distributed.pipelining._backward import ( + stage_backward, + stage_backward_input, + stage_backward_weight, +) +from torch.testing._internal.common_utils import run_tests, TestCase + + +d_hid = 512 +batch_size = 256 + + +class StageBackwardTests(TestCase): + def test_stage_backward(self): + # MLP as a stage module + mod = MLPModule(d_hid) + x = torch.randn(batch_size, d_hid) + # As in a pipeline stage, the inputs to this stage requires gradients + x.requires_grad_(True) + target = torch.randn(batch_size, d_hid) + loss_fn = torch.nn.MSELoss(reduction="sum") + + # Make a copy + ref_mod = copy.deepcopy(mod) + ref_x = x.detach().requires_grad_(x.requires_grad) + ref_target = target.detach() + + # Forward and backward in stage manner + out = mod(x) + loss = loss_fn(out, target) + grad_inputs = stage_backward( + stage_output=loss, + output_grads=None, + input_values=(x,), + ) + + # Run reference + ref_out = ref_mod(ref_x) + ref_loss = loss_fn(ref_out, ref_target) + ref_loss.backward() + + torch.testing.assert_close(grad_inputs[0], ref_x.grad) + + # Every rank checks gradients + for name, p in mod.named_parameters(): + ref_p = ref_mod.get_parameter(name) + try: + torch.testing.assert_close(p.grad, ref_p.grad) + except AssertionError: + print(f"Gradient test failed for {name}: {p.grad} vs {ref_p.grad}") + raise + + def test_stage_backward_input(self): + # MLP as a stage module + mod = MLPModule(d_hid) + x = torch.randn(batch_size, d_hid) + # As in a pipeline stage, the inputs to this stage requires gradients + x.requires_grad_(True) + target = torch.randn(batch_size, d_hid) + loss_fn = torch.nn.MSELoss(reduction="sum") + + # Make a copy + ref_mod = copy.deepcopy(mod) + ref_x = x.detach().requires_grad_(x.requires_grad) + ref_target = target.detach() + + # Forward, then backward of loss with respect to inputs + out = mod(x) + loss = loss_fn(out, target) + dinputs, param_groups = stage_backward_input( + stage_outputs_or_loss=(loss,), + output_grads=None, + input_values=[x], + weights=mod.parameters(), + ) + + # Run reference + ref_out = ref_mod(ref_x) + ref_loss = loss_fn(ref_out, ref_target) + ref_loss.backward() + + torch.testing.assert_close(x.grad, ref_x.grad) + torch.testing.assert_close(dinputs[0], ref_x.grad) + for _, p in mod.named_parameters(): + # Check that the weight gradients were not updated + self.assertEqual(p.grad, None) + + def test_stage_backward_weight(self): + # MLP as a stage module + mod = MLPModule(d_hid) + x = torch.randn(batch_size, d_hid) + # As in a pipeline stage, the inputs to this stage requires gradients + x.requires_grad_(True) + target = torch.randn(batch_size, d_hid) + loss_fn = torch.nn.MSELoss(reduction="sum") + + # Make a copy + ref_mod = copy.deepcopy(mod) + ref_x = x.detach().requires_grad_(x.requires_grad) + ref_target = target.detach() + + # Forward, then backward of loss with respect to inputs + out = mod(x) + loss = loss_fn(out, target) + dinputs, param_groups = stage_backward_input( + stage_outputs_or_loss=(loss,), + output_grads=None, + input_values=[x], + weights=mod.parameters(), + ) + + # backward of loss with respect to weights + stage_backward_weight(mod.parameters(), param_groups, retain_graph=True) + + # Run reference + ref_out = ref_mod(ref_x) + ref_loss = loss_fn(ref_out, ref_target) + ref_loss.backward() + + # Every rank checks gradients + for name, p in mod.named_parameters(): + ref_p = 
ref_mod.get_parameter(name) + try: + torch.testing.assert_close(p.grad, ref_p.grad) + except AssertionError: + print(f"Gradient test failed for {name}: {p.grad} vs {ref_p.grad}") + raise + + def test_stage_backward_weight_multiple_iters(self): + # MLP as a stage module + mod = MLPModule(d_hid) + inputs = [] + for _ in range(10): + x = torch.randn(batch_size, d_hid) + inputs.append(x) + # As in a pipeline stage, the inputs to this stage requires gradients + x.requires_grad_(True) + + target = torch.randn(batch_size, d_hid) + loss_fn = torch.nn.MSELoss(reduction="sum") + + # Make a copy + ref_mod = copy.deepcopy(mod) + ref_inputs = [] + for x in inputs: + ref_inputs.append(x.detach().requires_grad_(x.requires_grad)) + ref_target = target.detach() + + # Forward, then backward of loss with respect to inputs + for x in inputs: + out = mod(x) + loss = loss_fn(out, target) + dinputs, param_groups = stage_backward_input( + stage_outputs_or_loss=(loss,), + output_grads=None, + input_values=[x], + weights=mod.parameters(), + ) + + # backward of loss with respect to weights + stage_backward_weight(mod.parameters(), param_groups) + + # Run reference + for ref_x in ref_inputs: + ref_out = ref_mod(ref_x) + ref_loss = loss_fn(ref_out, ref_target) + ref_loss.backward() + + # Every rank checks gradients + for name, p in mod.named_parameters(): + ref_p = ref_mod.get_parameter(name) + try: + torch.testing.assert_close(p.grad, ref_p.grad) + except AssertionError: + print(f"Gradient test failed for {name}: {p.grad} vs {ref_p.grad}") + raise + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/pipelining/test_microbatch.py b/test/distributed/pipelining/test_microbatch.py new file mode 100644 index 0000000000..9f67c2c37e --- /dev/null +++ b/test/distributed/pipelining/test_microbatch.py @@ -0,0 +1,91 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates +# Owner(s): ["oncall: distributed"] +from model_registry import ModelWithKwargs + +import torch +from torch.distributed.pipelining import pipeline +from torch.distributed.pipelining.microbatch import ( + merge_chunks, + split_args_kwargs_into_chunks, + TensorChunkSpec, +) +from torch.testing._internal.common_utils import run_tests, TestCase + + +d_hid = 512 +torch.manual_seed(0) + + +class MicrobatchTests(TestCase): + def test_split_and_merge(self): + x0 = torch.randn(128, d_hid) + x1 = torch.randn(256, d_hid) + x2 = torch.randn(512, d_hid) + + args = (x0, x1, x2) + kwargs = {"x0": x0, "x1": x1, "x2": x2} + + # Default chunking: dim 0 + arg_chunks, kwarg_chunks = split_args_kwargs_into_chunks(args, kwargs, 2) + assert len(arg_chunks) == 2 + assert len(kwarg_chunks) == 2 + assert arg_chunks[0][0].shape == torch.Size([64, d_hid]) + assert arg_chunks[1][0].shape == torch.Size([64, d_hid]) + assert arg_chunks[0][1].shape == torch.Size([128, d_hid]) + assert arg_chunks[0][2].shape == torch.Size([256, d_hid]) + assert kwarg_chunks[0]["x0"].shape == torch.Size([64, d_hid]) + assert kwarg_chunks[0]["x1"].shape == torch.Size([128, d_hid]) + assert kwarg_chunks[1]["x2"].shape == torch.Size([256, d_hid]) + + # Merge chunks back together + merged_args = merge_chunks( + arg_chunks, + (TensorChunkSpec(0), TensorChunkSpec(0), TensorChunkSpec(0)), + ) + torch.testing.assert_close(merged_args, args) + + merged_kwargs = merge_chunks( + kwarg_chunks, + { + "x0": TensorChunkSpec(0), + "x1": TensorChunkSpec(0), + "x2": TensorChunkSpec(0), + }, + ) + torch.testing.assert_close(merged_kwargs, kwargs) + print("Microbatch test passed") + + def test_chunk_spec(self): + mod = ModelWithKwargs() + batch_size = ModelWithKwargs.DEFAULT_BATCH_SIZE + + x = torch.randn(batch_size, d_hid) + y = torch.randn(batch_size, d_hid) + + num_chunks = 4 + + args_chunk_spec = TensorChunkSpec.from_tuple((0,)) + kwargs_chunk_spec = TensorChunkSpec.from_dict({"y": 0}) + + args_split, kwargs_split = split_args_kwargs_into_chunks( + (x,), + {"y": y}, + num_chunks, + args_chunk_spec, + kwargs_chunk_spec, + ) + + pipe = pipeline( + mod, + mb_args=args_split[0], + mb_kwargs=kwargs_split[0], + ) + + ref = mod(x, y) + out = pipe(x, y)[0] + torch.testing.assert_close(out, ref) + print(f"equivalence test passed {torch.sum(out)} ref {torch.sum(ref)}") + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/pipelining/test_pipe.py b/test/distributed/pipelining/test_pipe.py new file mode 100644 index 0000000000..af18a41fb9 --- /dev/null +++ b/test/distributed/pipelining/test_pipe.py @@ -0,0 +1,123 @@ +# Copyright (c) Meta Platforms, Inc. 
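The shape assertions in test_split_and_merge come from plain dim-0 chunking: a 128-row tensor split into two microbatches gives two 64-row chunks, and concatenating them restores the original. The same check with bare torch ops (not the pipelining helpers):

import torch

x0 = torch.randn(128, 16)                   # d_hid shrunk for the sketch
chunks = torch.chunk(x0, 2, dim=0)          # two (64, 16) microbatches
assert chunks[0].shape == torch.Size([64, 16])

merged = torch.cat(chunks, dim=0)           # merge_chunks-style reassembly
torch.testing.assert_close(merged, x0)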
and affiliates +# Owner(s): ["oncall: distributed"] +from model_registry import MLPModule, ModelWithParamAlias + +import torch +from torch.distributed.pipelining import pipe_split, pipeline +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + run_tests, + TestCase, +) + + +d_hid = 512 +microbatch_size = 16 + +torch.manual_seed(0) + + +# Basic example +class ExampleCode(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.mm_param1 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.mm_param2 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.lin1 = torch.nn.Linear(d_hid, d_hid) + self.lin2 = torch.nn.Linear(d_hid, d_hid) + + def forward(self, x, y): + x = torch.mm(x, self.mm_param1) # mutli-use param + skip_connection = x + x = x + y + x = torch.relu(x) + pipe_split() + x = torch.mm(x, self.mm_param1) # mutli-use param + x = self.lin1(x) + pipe_split() + x = torch.relu(x) + x = x + skip_connection + x = torch.mm(x, self.mm_param2) + pipe_split() + x = self.lin2(x) + x = torch.relu(x) + return x + + +class MultiMLP(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.mlp0 = MLPModule(d_hid) + self.mlp1 = MLPModule(d_hid) + self.mlp2 = MLPModule(d_hid) + self.mlp3 = MLPModule(d_hid) + + def forward(self, x, y): + x = self.mlp0(x) + pipe_split() + x = self.mlp1(x) + pipe_split() + x = self.mlp2(x) + pipe_split() + x = self.mlp3(x) + return x - y + + +EXPECTED_N_STAGES = { + ExampleCode: 4, + MultiMLP: 4, + ModelWithParamAlias: 2, +} + +# Currently, we don't enforce full set equality on the FQNs between the original +# and pipelined models, because in the multi-use param case, PP will deduplicate +# the FQNs from the state_dict. +CHECK_FQN_SET_EQUALITY = False + + +class PipeTests(TestCase): + @parametrize("ModelClass", [ExampleCode, MultiMLP, ModelWithParamAlias]) + def test_model_split(self, ModelClass): + mod = ModelClass() + x = torch.randn(microbatch_size, d_hid) + y = torch.randn(microbatch_size, d_hid) + + pipe = pipeline( + mod, + mb_args=(x, y), + ) + + assert ( + pipe.num_stages == EXPECTED_N_STAGES[ModelClass] + ), f"nstages = {pipe.num_stages}, expect {EXPECTED_N_STAGES[ModelClass]}" + + ref_out = mod(x, y) + out = pipe(x, y)[0] + torch.testing.assert_close(out, ref_out) + print(f"equivalence test passed {torch.sum(out)} ref {torch.sum(ref_out)}") + + # Check qualname + # state_dict.keys include both parameters and persistent buffers + old_names = set(mod.state_dict().keys()) + new_names = set() + for idx in range(pipe.num_stages): + stage_mod = pipe.get_stage_module(idx) + stage_fqns = set(stage_mod.state_dict().keys()) + assert stage_fqns.issubset(old_names) + new_names.update(stage_fqns) + + if CHECK_FQN_SET_EQUALITY: + assert ( + old_names == new_names + ), f""" + old names {old_names} + new names {new_names} + """ + print("Qualname check passed") + + +instantiate_parametrized_tests(PipeTests) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/pipelining/test_schedule.py b/test/distributed/pipelining/test_schedule.py new file mode 100644 index 0000000000..ee212f3110 --- /dev/null +++ b/test/distributed/pipelining/test_schedule.py @@ -0,0 +1,993 @@ +# Copyright (c) Meta Platforms, Inc. 
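A note on EXPECTED_N_STAGES above: pipe_split() marks a cut point inside forward(), so k calls produce k + 1 stages; ExampleCode and MultiMLP each call it three times (4 stages) and ModelWithParamAlias once (2 stages). As a trivial sketch:

def expected_stages(n_pipe_split_calls: int) -> int:
    # each pipe_split() adds one cut, so stages = cuts + 1
    return n_pipe_split_calls + 1

assert expected_stages(3) == 4   # ExampleCode, MultiMLP
assert expected_stages(1) == 2   # ModelWithParamAlias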
and affiliates +# Owner(s): ["oncall: distributed"] +import copy +import csv +import logging +import os + +from model_registry import MultiMLP + +import torch +from torch.distributed.pipelining import ( + Schedule1F1B, + ScheduleGPipe, + ScheduleInterleaved1F1B, + ScheduleInterleavedZeroBubble, + ScheduleLoopedBFS, +) +from torch.distributed.pipelining._utils import generate_stage_to_rank_mapping +from torch.distributed.pipelining.schedules import ( + _Action, + _add_send_recv, + _add_unshard_reshard, + _format_pipeline_order, + _merge_bw, + _PipelineSchedule, + _PipelineScheduleRuntime, + _simulate_comms_compute, + _validate_schedule, + B, + F, + get_schedule_class, + I, + PipelineScheduleSingle, + RECV_F, + RESHARD, + SEND_B, + UNSHARD, + W, +) +from torch.distributed.pipelining.stage import _PipelineStageBase, PipelineStage +from torch.testing._internal.common_utils import ( + check_leaked_tensors, + instantiate_parametrized_tests, + parametrize, + run_tests, + TestCase, +) +from torch.testing._internal.distributed.fake_pg import FakeStore + + +ARTIFACTS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "artifacts") + +logger = logging.getLogger(__name__) +torch.manual_seed(0) + + +class MockPipelineStage(_PipelineStageBase): + def __init__(self, *args, **kwargs): + # Mock the necessary attributes + self.submod = None + self.num_stages = kwargs.get("num_stages", 1) + self.group_size = kwargs.get("group_size", 1) + self.group_rank = kwargs.get("group_rank", 0) + self.group = kwargs.get("group", None) + + def _create_grad_recv_info(self, *args, **kwargs): + return None + + def _prepare_forward_infra(self, n_microbatches): + pass + + def _prepare_backward_infra(self, n_microbatches): + pass + + +class ScheduleTest(TestCase): + def test_get_schedule_class(self): + # List of all expected schedule names + schedule_names = [ + "1F1B", + "1f1b", + "Interleaved1F1B", + "INTERLEAVED1F1B", + "GPipe", + "LoopedBFS", + "PipelineScheduleSingle", + "PipelineScheduleMulti", + ] + + # Test each schedule name + for name in schedule_names: + with self.subTest(name=name): + schedule_class = get_schedule_class(name) + self.assertIsNotNone( + schedule_class, f"Class for {name} should not be None" + ) + self.assertTrue( + issubclass(schedule_class, _PipelineSchedule), + f"{name} should be a subclass of _PipelineSchedule", + ) + + error_case = ["ScheduleThatDoesNotExist"] + for name in error_case: + # Test that the original name is included in the error message + with self.assertRaisesRegex(ValueError, f"{name}"): + get_schedule_class(name) + + @parametrize( + "ScheduleClass", + [ + Schedule1F1B, + ScheduleGPipe, + ScheduleInterleaved1F1B, + ScheduleInterleavedZeroBubble, + ScheduleLoopedBFS, + ], + ) + def test_schedule_with_single_stage(self, ScheduleClass): + """ + Test that schedules with only a single stage work as expected for all schedules. 
+ """ + store = FakeStore() + torch.distributed.init_process_group( + backend="fake", rank=0, world_size=1, store=store + ) + d_hid, batch_size = 512, 256 + n_stages = 1 + device = "cpu" + full_mod = MultiMLP(d_hid, n_layers=n_stages) + full_mod.to(device) + + x = torch.randn(batch_size, d_hid, device=device) + ref_mod = copy.deepcopy(full_mod) + with torch.no_grad(): + y = ref_mod(x) + # Add a small perturbation + target = y + torch.randn(batch_size, d_hid, device=device) + + def loss_fn(y, target): + return torch.nn.functional.cross_entropy(y, target) + + # Run reference + for _ in range(2): + ref_mod.zero_grad() + ref_out = ref_mod(x) + ref_loss = loss_fn(ref_out, target) + ref_loss.backward() + + submod_name = "layers.0" + stage_module = full_mod.get_submodule(submod_name) + + # Create a pipeline stage to wrap that submodule + num_microbatches = 2 + stages = [ + PipelineStage( + stage_module, + 0, + n_stages, + device, + ) + ] + + if issubclass(ScheduleClass, PipelineScheduleSingle): + stages = stages[0] + + # Attach to a schedule + schedule = ScheduleClass( + stages, + num_microbatches, + loss_fn=loss_fn, + ) + # Run + for _ in range(2): + # Zero gradients + stage_module.zero_grad() + losses = [] + out = schedule.step(x, target=target, losses=losses) + + # Check output + torch.testing.assert_close(out, ref_out) + # Check loss + # Since the reduction used in the loss function above is "mean", we use + # "mean" here to reduce microbatch losses into a single value too. + pipe_loss = torch.stack(losses).mean() + torch.testing.assert_close(pipe_loss, ref_loss) + + # Check gradients + # Get corresponding submodule from reference model + ref_submod = ref_mod.get_submodule(submod_name) + # Check gradients per parameter + for name, p in stage_module.named_parameters(): + ref_p = ref_submod.get_parameter(name) + try: + torch.testing.assert_close(p.grad, ref_p.grad, rtol=1e-5, atol=4e-5) + except AssertionError: + print(f"Gradient test failed for {name}: {p.grad} vs {ref_p.grad}") + raise + + torch.distributed.destroy_process_group() + + def test_zero_bubble_schedule_errors_with_compile(self): + """ + Test that zero bubble schedules raise an error when used with torch.compile. 
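+ The RuntimeError is expected when the schedule is constructed around a stage that
+ wraps a torch.compile'd module, so a single-rank fake process group suffices.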
+ """ + store = FakeStore() + torch.distributed.init_process_group( + backend="fake", rank=0, world_size=1, store=store + ) + n_stages = 1 + device = torch.device("cpu") + model = MultiMLP(8, n_layers=n_stages) + # full_mod + compiled_model = torch.compile(model) + stage = PipelineStage( + compiled_model, + 0, + n_stages, + device, + ) + with self.assertRaises(RuntimeError): + ScheduleInterleavedZeroBubble([stage], 2) + + torch.distributed.destroy_process_group() + + +instantiate_parametrized_tests(ScheduleTest) + + +class TestSchedulePlan(TestCase): + def setUp(self): + # Define a list of test cases with varying num_local_stages, num_microbatches, and group_size + # These should succeed since num_microbatches % group_size == 0 + self.test_cases = [ + # small number of stages + (2, 2, 2), + (2, 4, 4), + (2, 8, 2), + (2, 8, 4), + (2, 8, 8), + (4, 4, 4), + (4, 8, 4), + (4, 8, 8), + # large microbatches + (4, 16, 4), + (4, 32, 4), + (4, 64, 4), + # large groups + (4, 16, 16), + (4, 32, 32), + (4, 128, 64), + # odd num pipeline stages + (3, 2, 2), + (3, 8, 2), + (3, 12, 4), + # odd group_sizes + (4, 6, 3), + (4, 10, 5), + # n_mb non divisible by group_size + (2, 3, 4), + (2, 4, 4), + (2, 10, 4), + (2, 15, 4), + ] + + @parametrize( + "ScheduleClass", + [ScheduleInterleaved1F1B, ScheduleLoopedBFS], + ) + def test_pipeline_order(self, ScheduleClass): + for num_local_stages, num_microbatches, group_size in self.test_cases: + with self.subTest( + num_local_stages=num_local_stages, + num_microbatches=num_microbatches, + group_size=group_size, + ): + if num_microbatches % group_size != 0: + continue + + logger.info( + "num_local_stages=%d num_microbatches=%d group_size=%d", + num_local_stages, + num_microbatches, + group_size, + ) + num_stages = num_local_stages * group_size + stages = [ + MockPipelineStage(group_size=group_size, num_stages=num_stages) + for i in range(num_local_stages) + ] + + schedule = ScheduleClass(stages, num_microbatches) + _formatted_pipeline_order = _format_pipeline_order( + schedule.pipeline_order + ) + + def stage_to_rank(stage): + return stage % group_size + + comms_sch = _add_send_recv( + schedule.pipeline_order, + stage_to_rank=stage_to_rank, + num_stages=num_stages, + ) + _simulate_comms_compute( + comms_sch, + stage_to_rank=stage_to_rank, + num_stages=num_stages, + ) + + @parametrize( + "ScheduleClass", + [ScheduleInterleaved1F1B, ScheduleInterleavedZeroBubble], + ) + def test_pipeline_order_flex_and_zero_bubble(self, ScheduleClass): + for num_local_stages, num_microbatches, group_size in self.test_cases: + with self.subTest( + num_local_stages=num_local_stages, + num_microbatches=num_microbatches, + group_size=group_size, + ): + warmups_ops_last_stage = (num_local_stages - 1) * ( + num_microbatches // max(1, num_microbatches // group_size) + ) + warmup_ops = warmups_ops_last_stage + 2 * (group_size - 1) + warmup_ops = min(warmup_ops, num_microbatches * num_local_stages) + + num_stages = num_local_stages * group_size + stages = [ + MockPipelineStage(group_size=group_size, num_stages=num_stages) + for i in range(num_local_stages) + ] + schedule = ScheduleClass(stages, num_microbatches) + _format_pipeline_order(schedule.pipeline_order) + + def stage_to_rank(stage): + return stage % group_size + + comms_sch = _add_send_recv( + schedule.pipeline_order, + stage_to_rank=stage_to_rank, + num_stages=num_stages, + ) + _simulate_comms_compute( + comms_sch, + stage_to_rank=stage_to_rank, + num_stages=num_stages, + ) + + +instantiate_parametrized_tests(TestSchedulePlan) + + +class 
TestScheduleLowering(TestCase): + """Tests lowering passes that convert simple compute-only (FBW) schedules into compute+comms schedules""" + + def _parse_actions(self, actions: list[str]) -> list[_Action]: + return [_Action.from_str(s) for s in actions] + + @parametrize( + "action_str_and_ref", + [ + ("1F0", _Action(1, F, 0)), + ("2I1", _Action(2, I, 1)), + ("0W3", _Action(0, W, 3)), + ("0B3", _Action(0, B, 3)), + ("1UNSHARD", _Action(1, UNSHARD, None)), + ("3RESHARD", _Action(3, RESHARD, None)), + ("2SEND_B2", _Action(2, SEND_B, 2)), + ("1RECV_F1", _Action(1, RECV_F, 1)), + ], + ) + def test_action_parse(self, action_str_and_ref): + """Test that actions can be parsed from strings and round-tripped back to the same strings.""" + act_str, ref = action_str_and_ref + act = _Action.from_str(act_str) + self.assertEqual(act, ref) + self.assertEqual(act_str, act.__repr__()) + + @parametrize( + "test_info", + [ + { + "compute": ["0F0", "0F1", " ", "0B0", "0B1"], + "comms": ["0UNSHARD", "0F0", "0F1", "0B0", "0B1", "0RESHARD"], + }, + ], + ) + def test_unshard_reshard(self, test_info): + """Test the lowering pass that takes a 'compute only' schedule (with only F,B,W ops) and adds + FSDP unshard/reshard operations to the schedule. This is just part of the process of adding communication + ops and producing a complete schedule. + """ + compute_sch = self._parse_actions(test_info["compute"]) + expected_comms_sch = self._parse_actions(test_info["comms"]) + + comms_sch = _add_unshard_reshard(compute_sch) + for expected, actual in zip(expected_comms_sch, comms_sch): + self.assertEqual( + expected, + actual, + ( + f"Mismatch: expected action {expected} but found {actual}." + f"\nWhole Schedule: {comms_sch}" + ), + ) + + @parametrize( + "test_info", + [ + { + "compute": [ + "0F0", + "0F1", + "0F2", + "0I0", + "0I1", + "0W0", + "0I2", + "0W2", + "0W1", + ], + "comms": ["0F0", "0F1", "0F2", "0I0", "0I1", "0W0", "0B2", "0W1"], + }, + ], + ) + def test_merge_bw(self, test_info): + """Test the pass that merges adjacent I and W operations into a B operation.""" + compute_sch = self._parse_actions(test_info["compute"]) + expected_merged_sch = self._parse_actions(test_info["comms"]) + + merged_sch = _merge_bw(compute_sch) + for expected, actual in zip(expected_merged_sch, merged_sch): + self.assertEqual( + expected, + actual, + ( + f"Mismatch: expected action {expected} but found {actual}." 
+ f"\nWhole Schedule: {merged_sch}" + ), + ) + + @parametrize( + "test_info", + [ + { + "schedule": "simple_2_rank_2_stage", + "compute": { + 0: ["0F0", "0F1", " ", "0B0", " ", "0B1"], + 1: [" ", "1F0", "1B0", "1F1", "1B1", " "], + }, + "comms": { + 0: [ + "0F0", + "0SEND_F0", + "0F1", + "0SEND_F1", + "0RECV_B0", + "0B0", + "0RECV_B1", + "0B1", + ], + 1: [ + "1RECV_F0", + "1RECV_F1", + "1F0", + "1B0", + "1SEND_B0", + "1F1", + "1B1", + "1SEND_B1", + ], + }, + "stage_to_rank": lambda stage_idx: stage_idx, + "num_stages": 2, + "simulated_steps": 11, + }, + { + "schedule": "v_2_rank_4_stage", + "compute": { + 0: [ + "0F0", + "0F1", + " ", + "3F0", + "3B0", + "3F1", + "3B1", + "0B0", + "3W0", + "0B1", + "3W1", + "0W0", + "0W1", + ], + 1: [ + " ", + "1F0", + "2F0", + "1F1", + "2F1", + "2B0", + "1B0", + "2B1", + "1B1", + "2W0", + "2W1", + "1W0", + "1W1", + ], + }, + "comms": { + 0: [ + "0F0", + "0SEND_F0", + "0F1", + "0SEND_F1", + "3RECV_F0", + "3F0", + "3B0", + "3SEND_B0", + "3RECV_F1", + "3F1", + "3B1", + "3SEND_B1", + "0RECV_B0", + "0B0", + "3W0", + "0RECV_B1", + "0B1", + "3W1", + "0W0", + "0W1", + ], + 1: [ + "1RECV_F0", + # interesting that this gets scheduled up front, is that expected? + "1RECV_F1", + "1F0", + "2F0", + "2SEND_F0", + "1F1", + # ditto + "2RECV_B0", + "2F1", + "2SEND_F1", + "2B0", + # ditto + "2RECV_B1", + "1B0", + "1SEND_B0", + "2B1", + "1B1", + "1SEND_B1", + "2W0", + "2W1", + "1W0", + "1W1", + ], + }, + "stage_to_rank": lambda stage_idx: [0, 1, 1, 0][stage_idx], + "num_stages": 4, + "simulated_steps": 24, + }, + ], + ) + def test_send_recv(self, test_info): + """Tests the lowering pass that adds send/recv ops to a compute-only schedule.""" + compute_sch = { + rank: self._parse_actions(test_info["compute"][rank]) + for rank in test_info["compute"] + } + expected_comms_sch = { + rank: self._parse_actions(test_info["comms"][rank]) + for rank in test_info["comms"] + } + + comms_sch = _add_send_recv( + compute_sch, test_info["stage_to_rank"], test_info["num_stages"] + ) + for rank in expected_comms_sch: + for i, (expected, actual) in enumerate( + zip(expected_comms_sch[rank], comms_sch[rank]) + ): + self.assertEqual( + expected, + actual, + ( + f"Mismatch on rank {rank} at position {i}." 
+ f"\nExpected: {expected_comms_sch[rank]}" + f"\nActual: {comms_sch[rank]}" + ), + ) + self.assertEqual(len(comms_sch[rank]), len(expected_comms_sch[rank])) + + simulated_schedule = _simulate_comms_compute( + comms_sch, + stage_to_rank=test_info["stage_to_rank"], + num_stages=test_info["num_stages"], + ) + num_steps = max([len(simulated_schedule[rank]) for rank in simulated_schedule]) + self.assertEqual(num_steps, test_info["simulated_steps"]) + + @parametrize("csv_name", ["zb1p_2rank_2stagep"]) + def test_csv(self, csv_name): + def _dump_csv(pipeline_order_with_comms, filename: str): + """Dump a CSV representation of the compute + comms schedule into a file with the provided filename.""" + with open(filename, "w", newline="") as csvfile: + writer = csv.writer(csvfile) + for rank in pipeline_order_with_comms: + writer.writerow(pipeline_order_with_comms[rank]) + + compute_sch = {} + with open( + os.path.join(ARTIFACTS_DIR, f"{csv_name}_compute.csv"), newline="" + ) as csvfile: + for rank, row in enumerate(csv.reader(csvfile)): + compute_sch[rank] = [_Action.from_str(s) for s in row] + num_model_chunks = 2 + pipeline_parallel_size = 2 + num_stages = num_model_chunks * pipeline_parallel_size + + for rank in compute_sch: + compute_sch[rank] = _merge_bw(compute_sch[rank]) + + comms_sch = _add_send_recv( + compute_sch, + stage_to_rank=lambda chunk_index: chunk_index % pipeline_parallel_size, + num_stages=num_stages, + ) + + comms_csv = os.path.join(ARTIFACTS_DIR, f"{csv_name}_comms.csv") + + # Uncomment to regenerate reference output + + sch_ref = {} + with open(comms_csv, newline="") as ref: + for rank, row in enumerate(csv.reader(ref)): + sch_ref[rank] = [_Action.from_str(s) for s in row] + + for rank in sch_ref: + for timestep, (a, b) in enumerate(zip(comms_sch[rank], sch_ref[rank])): + self.assertEqual(a, b, f"Mismatch at {timestep=}, {a=}, expected {b}") + + simulated_schedule = _simulate_comms_compute( + comms_sch, + stage_to_rank=lambda s: s % pipeline_parallel_size, + num_stages=num_stages, + ) + + num_steps = max([len(simulated_schedule[rank]) for rank in simulated_schedule]) + self.assertEqual(num_steps, 113) + + def test_grad_with_v_schedule(self): + """ + We have a special case for V schedules where 2 adjacent stages are on the same rank. + E.g. + rank0: stage 0, stage3 + rank1: stage 1, stage 2, + + The special case involves not using send/recv ops but directly passing tensors between colocated stages. + + This test runs on a single rank and just tests the 'stage1, stage2' portion for both F and B, comparing + gradients to a reference model with 2 layers. 
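+ Both stages are driven by _PipelineScheduleRuntime with a hand-written
+ "compute_comms" action list (forwards 0F*/1F*, then backwards 1B*/0B*).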
+ """ + store = FakeStore() + torch.distributed.init_process_group( + backend="fake", rank=0, world_size=1, store=store + ) + d_hid = 512 + batch_size = 256 + n_stages = 2 + device = "npu" + full_mod = MultiMLP(d_hid, n_layers=n_stages) + full_mod.to(device) + + ref_mod = copy.deepcopy(full_mod) + x = torch.randn(batch_size, d_hid, device=device) + with torch.no_grad(): + y = ref_mod(x) + # Add a small perturbation + target = y + torch.randn(batch_size, d_hid, device=device) + + loss_fn = torch.nn.MSELoss(reduction="sum") + + # Run reference + for _ in range(2): + ref_mod.zero_grad() + ref_out = ref_mod(x) + ref_loss = loss_fn(ref_out, target) + ref_loss.backward() + + stage_indices = [0, 1] + submod_names = [f"layers.{i}" for i in stage_indices] + stage_modules = [ + full_mod.get_submodule(submod_name) + for submod_name in submod_names + ] + # Create a pipeline stage to wrap that submodule + num_microbatches = 2 + stages = [ + PipelineStage( + stage_module, + stage_idx, + n_stages, + device, + ) + for stage_module, stage_idx in zip(stage_modules, stage_indices) + ] + + # Attach to a schedule + schedule = _PipelineScheduleRuntime( + stages, + num_microbatches, + loss_fn=loss_fn, + scale_grads=False, + ) + schedule._load_actions( + { + 0: self._parse_actions( + [ + "0F0", + "0F1", + "1F0", + "1F1", + "1B0", + "1B1", + "0B0", + "0B1", + ] + ), + }, + format="compute_comms", + ) + + # Run + with check_leaked_tensors() as garbage_tensors: + for _ in range(2): + # Zero gradients + for stage_module in stage_modules: + stage_module.zero_grad() + losses = [] + out = schedule.step(x, target=target, losses=losses) + self.assertEqual( + len(garbage_tensors), + 0, + "Found leaked tensors, check logs above for debug info", + ) + + # Check output + torch.testing.assert_close(out, ref_out) + # Check loss + # Since the reduction used in the loss function above is "sum", we use + # "sum" here to reduce microbatch losses into a single value too. + pipe_loss = sum(losses) + torch.testing.assert_close(pipe_loss, ref_loss) + + # Check gradients + for stage_module, submod_name in zip(stage_modules, submod_names): + # Get corresponding submodule from reference model + ref_submod = ref_mod.get_submodule(submod_name) + # Check gradients per parameter + for name, p in stage_module.named_parameters(): + ref_p = ref_submod.get_parameter(name) + try: + torch.testing.assert_close(p.grad, ref_p.grad, rtol=1e-5, atol=4e-5) + except AssertionError: + print(f"Gradient test failed for {name}: {p.grad} vs {ref_p.grad}") + raise + + torch.distributed.destroy_process_group() + + def test_grad_with_split_b_w(self): + """ + Ensure that separate dInput and dWeight computations are correctly executed. + This test runs on a single rank and just tests a single stage with 2 microbatches with separate B, W operations. 
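+ The hand-written schedule uses separate I (dInput) and W (dWeight) actions in
+ place of a fused B, e.g. "0I0" denotes stage 0, action I, microbatch 0.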
+ """ + store = FakeStore() + torch.distributed.init_process_group( + backend="fake", rank=0, world_size=1, store=store + ) + d_hid = 512 + batch_size = 256 + n_stages = 1 + device = "npu" + full_mod = MultiMLP(d_hid, n_layers=n_stages) + full_mod.to(device) + + ref_mod = copy.deepcopy(full_mod) + x = torch.randn(batch_size, d_hid, device=device) + with torch.no_grad(): + y = ref_mod(x) + # Add a small perturbation + target = y + torch.randn(batch_size, d_hid, device=device) + + loss_fn = torch.nn.MSELoss(reduction="sum") + + # Run reference + for _ in range(2): + ref_mod.zero_grad() + ref_out = ref_mod(x) + ref_loss = loss_fn(ref_out, target) + ref_loss.backward() + + stage_indices = [0] + submod_names = [f"layers.{i}" for i in stage_indices] + stage_modules = [ + full_mod.get_submodule(submod_name) + for submod_name in submod_names + ] + # Create a pipeline stage to wrap that submodule + num_microbatches = 2 + stages = [ + PipelineStage( + stage_module, + stage_idx, + n_stages, + device, + ) + for stage_module, stage_idx in zip(stage_modules, stage_indices) + ] + + # Attach to a schedule + schedule = _PipelineScheduleRuntime( + stages, + num_microbatches, + loss_fn=loss_fn, + ) + schedule._load_actions( + { + 0: self._parse_actions( + [ + "0F0", + "0F1", + "0I0", + "0I1", + "0W0", + "0W1", + ] + ), + }, + format="compute_comms", + ) + + # Run + with check_leaked_tensors() as garbage_tensors: + for _ in range(2): + # Zero gradients + for stage_module in stage_modules: + stage_module.zero_grad() + losses = [] + out = schedule.step(x, target=target, losses=losses) + self.assertEqual( + len(garbage_tensors), + 0, + "Found leaked tensors, check logs above for debug info", + ) + + # Check output + torch.testing.assert_close(out, ref_out) + # Check loss + # Since the reduction used in the loss function above is "sum", we use + # "sum" here to reduce microbatch losses into a single value too. 
+ pipe_loss = sum(losses) + torch.testing.assert_close(pipe_loss, ref_loss) + + # Check gradients + for stage_module, submod_name in zip(stage_modules, submod_names): + # Get corresponding submodule from reference model + ref_submod = ref_mod.get_submodule(submod_name) + # Check gradients per parameter + for name, p in stage_module.named_parameters(): + ref_p = ref_submod.get_parameter(name) + try: + torch.testing.assert_close(p.grad, ref_p.grad, rtol=1e-5, atol=4e-5) + except AssertionError: + print(f"Gradient test failed for {name}: {p.grad} vs {ref_p.grad}") + raise + + torch.distributed.destroy_process_group() + + +class TestValidateSchedule(TestCase): + def test_valid_schedule(self): + schedule_actions = [ + { + 0: [_Action(0, F, 0), _Action(0, B, 0)], + 1: [_Action(1, F, 0), _Action(1, B, 0)], + }, + { + 0: [_Action(0, F, 0), _Action(0, I, 0), _Action(0, W, 0)], + 1: [_Action(1, F, 0), _Action(1, I, 0), _Action(1, W, 0)], + }, + ] + pp_group_size = 2 + num_stages = 2 + num_microbatches = 1 + for actions in schedule_actions: + _validate_schedule(actions, pp_group_size, num_stages, num_microbatches) + + def test_invalid_schedule_missing_rank(self): + actions = { + 0: [_Action(0, F, 0), _Action(0, B, 0)], + } + pp_group_size = 2 + num_stages = 2 + num_microbatches = 1 + with self.assertRaises(AssertionError): + _validate_schedule(actions, pp_group_size, num_stages, num_microbatches) + + def test_invalid_schedule_missing_action(self): + actions = { + 0: [_Action(0, F, 0)], + 1: [_Action(1, F, 0)], + } + pp_group_size = 2 + num_stages = 2 + num_microbatches = 1 + with self.assertRaises(AssertionError): + _validate_schedule(actions, pp_group_size, num_stages, num_microbatches) + + +class ScheduleUtilTests(TestCase): + def test_generate_stage_to_rank_mapping(self): + stage_to_rank = generate_stage_to_rank_mapping(2, 2) + self.assertEqual( + stage_to_rank, + { + 0: 0, + 1: 1, + }, + ) + stage_to_rank = generate_stage_to_rank_mapping(2, 4) + self.assertEqual(stage_to_rank, {0: 0, 1: 1, 2: 0, 3: 1}) + stage_to_rank = generate_stage_to_rank_mapping(4, 8) + self.assertEqual( + stage_to_rank, {0: 0, 1: 1, 2: 2, 3: 3, 4: 0, 5: 1, 6: 2, 7: 3} + ) + stage_to_rank = generate_stage_to_rank_mapping(2, 4, style="v") + self.assertEqual( + stage_to_rank, + { + 0: 0, + 1: 1, + 2: 1, + 3: 0, + }, + ) + stage_to_rank = generate_stage_to_rank_mapping(4, 12, style="v") + self.assertEqual( + stage_to_rank, + { + 0: 0, + 1: 1, + 2: 2, + 3: 3, + 4: 3, + 5: 2, + 6: 1, + 7: 0, + 8: 0, + 9: 1, + 10: 2, + 11: 3, + }, + ) + stage_to_rank = generate_stage_to_rank_mapping(4, 16, style="v") + self.assertEqual( + stage_to_rank, + { + 0: 0, + 1: 1, + 2: 2, + 3: 3, + 4: 3, + 5: 2, + 6: 1, + 7: 0, + 8: 0, + 9: 1, + 10: 2, + 11: 3, + 12: 3, + 13: 2, + 14: 1, + 15: 0, + }, + ) + + +instantiate_parametrized_tests(TestScheduleLowering) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/pipelining/test_schedule_multiproc.py b/test/distributed/pipelining/test_schedule_multiproc.py new file mode 100644 index 0000000000..1dac146c6f --- /dev/null +++ b/test/distributed/pipelining/test_schedule_multiproc.py @@ -0,0 +1,958 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates +# Owner(s): ["oncall: distributed"] +import copy +import logging +import os +import sys +import tempfile + +from model_registry import ModelWithKwargs, MultiMLP, MultiMLPWithDw +from schedule_registry import ( + ScheduleUnbalanced, + ScheduleVShaped, + ScheduleWithReorderedB, + ScheduleWithW, +) + +import torch +import torch.distributed as dist +from torch.distributed.pipelining import ( + _ScheduleForwardOnly, + pipeline, + PipelineStage, + Schedule1F1B, + ScheduleGPipe, + ScheduleInterleaved1F1B, + ScheduleInterleavedZeroBubble, + ScheduleLoopedBFS, + ScheduleZBVZeroBubble, +) +from torch.distributed.pipelining.schedules import _PipelineScheduleRuntime +from torch.testing._internal.common_cuda import TEST_MULTIGPU +from torch.testing._internal.common_distributed import ( + MultiProcContinousTest, + requires_nccl, +) +from torch.testing._internal.common_utils import ( + check_leaked_tensors, + instantiate_parametrized_tests, + parametrize, + skip_but_pass_in_sandcastle_if, +) + + +logger = logging.getLogger(__name__) + +d_hid = 512 +batch_size = 256 + +torch.manual_seed(0) + + +class ScheduleTest(MultiProcContinousTest): + @classmethod + def backend_str(cls) -> str: + # Testing with HCCL backend + return "hccl" + + @classmethod + def setUpClass(cls): + """ + Class-scope test fixture. Run once for entire test class, before any test starts. + Set up the device. + """ + super().setUpClass() + dev_id = cls.rank % torch.npu.device_count() + cls.device = torch.device(f"npu:{dev_id}") + + @parametrize("ScheduleClass", [_ScheduleForwardOnly]) + def test_forward_only(self, ScheduleClass): + mod = MultiMLP(d_hid, n_layers=self.world_size) + mod.to(self.device) + + mod_ref = copy.deepcopy(mod) + + x = torch.randn(batch_size, d_hid, device=self.device) + x_clone = x.clone() + + num_microbatches = 4 + x_mb = x.chunk(num_microbatches)[0] + + # Create a pipeline + split_spec = mod.split_spec if hasattr(mod, "split_spec") else None + pipe = pipeline( + mod, + mb_args=(x_mb,), + split_spec=split_spec, + ) + + stage = pipe.build_stage( + self.rank, + self.device, + ) + + # Attach to a schedule + schedule = ScheduleClass(stage, num_microbatches, scale_grads=False) + + # Run + num_iters = 20 + for _ in range(num_iters): + if self.rank == 0: + schedule.step(x) + dist.recv(x, src=self.world_size - 1) + elif self.rank == self.world_size - 1: + out = schedule.step() + dist.send(out, dst=0) + else: + schedule.step() + + # Validate pipelined output is the same as reference model + if self.rank == self.world_size - 1: + for _ in range(num_iters): + x_clone = mod_ref(x_clone) + + torch.testing.assert_close(x_clone, out) + + @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B]) + def test_multi_iter(self, ScheduleClass): + mod = MultiMLP(d_hid, n_layers=self.world_size) + mod.to(self.device) + + x = torch.randn(batch_size, d_hid, device=self.device) + target = torch.randn(batch_size, d_hid, device=self.device) + loss_fn = torch.nn.MSELoss(reduction="sum") + + chunks = 4 + x_mb = x.chunk(chunks)[0] + + # Create a pipeline + split_spec = mod.split_spec if hasattr(mod, "split_spec") else None + pipe = pipeline( + mod, + mb_args=(x_mb,), + split_spec=split_spec, + ) + + stage = pipe.build_stage( + self.rank, + self.device, + ) + + # Attach to a schedule + schedule = ScheduleClass(stage, chunks, loss_fn=loss_fn, scale_grads=False) + + # Run + for _ in range(20): + if self.rank == 0: + schedule.step(x) + elif self.rank == self.world_size - 1: + losses = [] + schedule.step(target=target, 
losses=losses) + else: + schedule.step() + + @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B]) + def test_kwargs_with_tracer(self, ScheduleClass): + mod = ModelWithKwargs(d_hid) + mod.to(self.device) + + x = torch.randn(batch_size, d_hid, device=self.device) + y = torch.randn(batch_size, d_hid, device=self.device) + target = torch.randn(batch_size, d_hid, device=self.device) + loss_fn = torch.nn.MSELoss(reduction="sum") + + chunks = 4 + x_mb = x.chunk(chunks)[0] + y_mb = y.chunk(chunks)[0] + + pipe = pipeline( + mod, + mb_args=(x_mb,), + mb_kwargs={"y": y_mb}, + ) + + stage = pipe.build_stage( + self.rank, + self.device, + ) + + # Attach to a schedule + schedule = ScheduleClass(stage, chunks, loss_fn=loss_fn, scale_grads=False) + + # Run + if self.rank == 0: + schedule.step(x, y=y) + elif self.rank == self.world_size - 1: + losses = [] + out = schedule.step(target=target, losses=losses) + else: + schedule.step() + + dist.barrier() + + # Last rank checks result + if self.rank == self.world_size - 1: + ref_out = mod(x, y=y) + ref_loss = loss_fn(ref_out, target) + pipe_loss = sum(losses) + torch.testing.assert_close(out, ref_out, rtol=1e-2, atol=5e-3) + torch.testing.assert_close(pipe_loss, ref_loss) + + @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B]) + @parametrize("ModelClass", [MultiMLP]) + def test_grad_with_tracer(self, ScheduleClass, ModelClass): + mod = ModelClass(d_hid) + mod.to(self.device) + + ref_mod = copy.deepcopy(mod) + x = torch.randn(batch_size, d_hid, device=self.device) + with torch.no_grad(): + y = ref_mod(x) + # Add a small perturbation + target = y + torch.randn(batch_size, d_hid, device=self.device) + + loss_fn = torch.nn.MSELoss(reduction="sum") + + # Run reference + for _ in range(2): + ref_mod.zero_grad() + ref_out = ref_mod(x) + ref_loss = loss_fn(ref_out, target) + ref_loss.backward() + + # Create a pipeline + chunks = 4 + x_mb = x.chunk(chunks)[0] + split_spec = mod.split_spec if hasattr(mod, "split_spec") else None + pipe = pipeline( + mod, + mb_args=(x_mb,), + split_spec=split_spec, + ) + + stage = pipe.build_stage( + self.rank, + self.device, + ) + + # Attach to a schedule + schedule = ScheduleClass(stage, chunks, loss_fn=loss_fn, scale_grads=False) + + # Run + stage_module = pipe.get_stage_module(self.rank) + for _ in range(2): + # Zero gradients + stage_module.zero_grad() + if self.rank == 0: + schedule.step(x) + elif self.rank == self.world_size - 1: + losses = [] + out = schedule.step(target=target, losses=losses) + else: + schedule.step() + + dist.barrier() + + # Last rank checks result + if self.rank == self.world_size - 1: + # Check output + torch.testing.assert_close(out, ref_out) + # Check loss + # Since the reduction used in the loss function above is "sum", we use + # "sum" here to reduce microbatch losses into a single value too. 
+ pipe_loss = sum(losses) + torch.testing.assert_close(pipe_loss, ref_loss) + + # Every rank checks gradients + for name, p in stage_module.named_parameters(): + ref_p = ref_mod.get_parameter(name) + try: + torch.testing.assert_close(p.grad, ref_p.grad, rtol=1e-5, atol=4e-5) + except AssertionError: + print(f"Gradient test failed for {name}: {p.grad} vs {ref_p.grad}") + raise + + @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B]) + @parametrize("shape_inference", [True, False]) + def test_grad_with_manual(self, ScheduleClass, shape_inference): + full_mod = MultiMLP(d_hid, n_layers=self.world_size) + full_mod.to(self.device) + + ref_mod = copy.deepcopy(full_mod) + x = torch.randn(batch_size, d_hid, device=self.device) + with torch.no_grad(): + y = ref_mod(x) + # Add a small perturbation + target = y + torch.randn(batch_size, d_hid, device=self.device) + + loss_fn = torch.nn.MSELoss(reduction="sum") + + # Run reference + for _ in range(2): + ref_mod.zero_grad() + ref_out = ref_mod(x) + ref_loss = loss_fn(ref_out, target) + ref_loss.backward() + + # Get a submodule, e.g. `layers.0` or `layers.1` + submod_name = f"layers.{self.rank}" + stage_module = full_mod.get_submodule(submod_name) + chunks = 4 + + if shape_inference: + input_args = None + output_args = None + else: + input_args = (x.chunk(chunks)[0],) + with torch.no_grad(): + output_args = stage_module(*input_args) + + # Create a pipeline stage to wrap that submodule + stage = PipelineStage( + stage_module, + self.rank, + self.world_size, + self.device, + input_args=input_args, + output_args=output_args, + ) + + # Attach to a schedule + schedule = ScheduleClass(stage, chunks, loss_fn=loss_fn, scale_grads=False) + + # Run + for _ in range(2): + # Zero gradients + stage_module.zero_grad() + if self.rank == 0: + schedule.step(x) + elif self.rank == self.world_size - 1: + losses = [] + out = schedule.step(target=target, losses=losses) + else: + schedule.step() + + dist.barrier() + + # Last rank checks result + if self.rank == self.world_size - 1: + # Check output + torch.testing.assert_close(out, ref_out) + # Check loss + # Since the reduction used in the loss function above is "sum", we use + # "sum" here to reduce microbatch losses into a single value too. + pipe_loss = sum(losses) + torch.testing.assert_close(pipe_loss, ref_loss) + + # Every rank checks gradients + ref_submod = ref_mod.get_submodule(submod_name) + for name, p in stage_module.named_parameters(): + ref_p = ref_submod.get_parameter(name) + try: + torch.testing.assert_close(p.grad, ref_p.grad, rtol=1e-5, atol=4e-5) + except AssertionError: + print(f"Gradient test failed for {name}: {p.grad} vs {ref_p.grad}") + raise + + @parametrize( + "ScheduleClass", + [ + ScheduleInterleaved1F1B, + ScheduleLoopedBFS, + ScheduleInterleavedZeroBubble, + ], + ) + @parametrize("use_new_runtime", [False, True]) + def test_grad_with_manual_interleaved(self, ScheduleClass, use_new_runtime): + stages_per_rank = 2 + n_stages = stages_per_rank * self.world_size + full_mod = MultiMLP(d_hid, n_layers=n_stages) + full_mod.to(self.device) + + ref_mod = copy.deepcopy(full_mod) + x = torch.randn(batch_size, d_hid, device=self.device) + with torch.no_grad(): + y = ref_mod(x) + # Add a small perturbation + target = y + torch.randn(batch_size, d_hid, device=self.device) + + loss_fn = torch.nn.MSELoss(reduction="sum") + + # Run reference + for _ in range(2): + ref_mod.zero_grad() + ref_out = ref_mod(x) + ref_loss = loss_fn(ref_out, target) + ref_loss.backward() + + # Get a submodule, e.g. 
`layers.0` or `layers.1` + stage_indices = [ + self.rank + i * self.world_size + for i in range(stages_per_rank) + ] + print(f"Rank {self.rank} stages: {stage_indices}") + submod_names = [f"layers.{i}" for i in stage_indices] + stage_modules = [ + full_mod.get_submodule(submod_name) + for submod_name in submod_names + ] + # Create a pipeline stage to wrap that submodule + num_microbatches = ( + ScheduleClass.num_microbatches + if hasattr(ScheduleClass, "num_microbatches") + else 8 + ) + stages = [ + PipelineStage( + stage_module, + stage_idx, + n_stages, + self.device, + ) + for stage_module, stage_idx in zip(stage_modules, stage_indices) + ] + + # Attach to a schedule + schedule = ScheduleClass( + stages, num_microbatches, loss_fn=loss_fn, scale_grads=False + ) + if use_new_runtime: + old_schedule = schedule + tmp_schedule = _PipelineScheduleRuntime( + stages, + num_microbatches, + loss_fn=loss_fn, + scale_grads=False, + ) + tmp_schedule._load_actions(old_schedule.pipeline_order) + # test that csv round-trip works for compute_comms schedule + schedule = _PipelineScheduleRuntime( + stages, + num_microbatches, + loss_fn=loss_fn, + scale_grads=False, + ) + with tempfile.NamedTemporaryFile() as f: + tmp_schedule._dump_csv(f.name) + f.seek(0) + schedule._load_csv(f.name, format="compute_comms") + one_more_schedule = _PipelineScheduleRuntime( + stages, + num_microbatches, + loss_fn=loss_fn, + scale_grads=False, + ) + one_more_schedule._load_actions( + schedule.pipeline_order_with_comms, format="compute_comms" + ) + self.assertEqual( + len(schedule.pipeline_order_with_comms), + len( + one_more_schedule.pipeline_order_with_comms, + ), + ) + for rank in schedule.pipeline_order_with_comms: + self.assertEqual( + len(schedule.pipeline_order_with_comms[rank]), + len( + one_more_schedule.pipeline_order_with_comms[rank], + ), + ) + for a, b in zip( + schedule.pipeline_order_with_comms[rank], + one_more_schedule.pipeline_order_with_comms[rank], + ): + self.assertEqual(a, b) + + # Run + with check_leaked_tensors() as garbage_tensors: + for _ in range(2): + # Zero gradients + for stage_module in stage_modules: + stage_module.zero_grad() + if self.rank == 0: + schedule.step(x) + elif self.rank == self.world_size - 1: + losses = [] + out = schedule.step(target=target, losses=losses) + else: + schedule.step() + self.assertEqual( + len(garbage_tensors), + 0, + "Found leaked tensors, check logs above for debug info", + ) + dist.barrier() + + # Last rank checks result + if self.rank == self.world_size - 1: + # Check output + torch.testing.assert_close(out, ref_out) + # Check loss + # Since the reduction used in the loss function above is "sum", we use + # "sum" here to reduce microbatch losses into a single value too. 
+ pipe_loss = sum(losses) + torch.testing.assert_close(pipe_loss, ref_loss) + + # Every rank checks gradients + for stage_module, submod_name in zip(stage_modules, submod_names): + # Get corresponding submodule from reference model + ref_submod = ref_mod.get_submodule(submod_name) + # Check gradients per parameter + for name, p in stage_module.named_parameters(): + ref_p = ref_submod.get_parameter(name) + try: + torch.testing.assert_close(p.grad, ref_p.grad, rtol=1e-5, atol=4e-5) + except AssertionError: + print(f"Gradient test failed for {name}: {p.grad} vs {ref_p.grad}") + raise + + @parametrize("ScheduleClass", [ScheduleWithW, ScheduleInterleavedZeroBubble]) + def test_schedule_with_native_zero_bubble(self, ScheduleClass): + print(ScheduleClass) + if ScheduleClass is ScheduleInterleavedZeroBubble: + n_stages = 4 + num_microbatches = 8 + rank_stages = { + 0: [0, 2], + 1: [1, 3], + } + else: + n_stages = ScheduleClass.n_stages + num_microbatches = ScheduleClass.num_microbatches + rank_stages = ScheduleClass.rank_stages + + num_steps = 4 + full_mod = MultiMLP(d_hid, n_layers=n_stages) + full_mod.to(self.device) + + ref_mod = copy.deepcopy(full_mod) + x = torch.randn(batch_size, d_hid, device=self.device) + with torch.no_grad(): + y = ref_mod(x) + # Add a small perturbation + target = y + torch.randn(batch_size, d_hid, device=self.device) + + loss_fn = torch.nn.MSELoss(reduction="sum") + + # Create a pipeline stage to wrap that submodule + stage_indices = rank_stages.get(self.rank) + print(f"Rank {self.rank} stages: {stage_indices}") + submod_names = [f"layers.{i}" for i in stage_indices] + stage_modules = [ + full_mod.get_submodule(submod_name) + for submod_name in submod_names + ] + stages = [ + PipelineStage( + stage_module, + stage_idx, + n_stages, + self.device, + ) + for stage_module, stage_idx in zip(stage_modules, rank_stages.get(self.rank)) + ] + + # We set scale_grads=False since we use a loss function that sums instead of mean-reduces + # (note: normally we recommend using mean-reduce loss functions, but we preserve at least one test case + # using sum scaling for completeness) + schedule = ScheduleClass( + stages, num_microbatches, loss_fn=loss_fn, scale_grads=False + ) + + # Run reference + ref_x = x.detach().clone().requires_grad_(x.requires_grad) + torch.testing.assert_close(x, ref_x) + for _ in range(num_steps): + ref_out = ref_mod(ref_x) + ref_loss = loss_fn(ref_out, target) + ref_loss.backward() + + with check_leaked_tensors() as garbage_tensors: + # Run pipelined stages + for _ in range(num_steps): + if self.rank == 0: + schedule.step(x) + elif self.rank == self.world_size - 1: + losses = [] + schedule.step(target=target, losses=losses) + else: + schedule.step() + self.assertEqual( + len(garbage_tensors), + 0, + "Found leaked tensors, check logs above for debug info", + ) + + # Every rank checks parameters compared with the reference model + for stage_module, submod_name in zip(stage_modules, submod_names): + # Get corresponding submodule from reference model + ref_submod = ref_mod.get_submodule(submod_name) + # Check gradients per parameter + for name, p in stage_module.named_parameters(): + ref_p = ref_submod.get_parameter(name) + try: + torch.testing.assert_close(p.grad, ref_p.grad, rtol=1e-5, atol=4e-5) + except AssertionError: + print( + f"Parameter test failed for {submod_name}.{name}: {p.grad} vs {ref_p.grad}" + ) + raise + + @parametrize( + "ScheduleClass", + [ + ScheduleWithReorderedB, + ], + ) + def test_pipeline_schedule_runtime_custom_sched(self, 
ScheduleClass): + n_stages = 2 + num_microbatches = 2 + stages_per_rank = 1 + full_mod = MultiMLP(d_hid, n_layers=n_stages) + full_mod.to(self.device) + + ref_mod = copy.deepcopy(full_mod) + x = torch.randn(batch_size, d_hid, device=self.device) + with torch.no_grad(): + y = ref_mod(x) + # Add a small perturbation + target = y + torch.randn(batch_size, d_hid, device=self.device) + + loss_fn = torch.nn.MSELoss(reduction="sum") + + # Run reference + for _ in range(2): + ref_mod.zero_grad() + ref_out = ref_mod(x) + ref_loss = loss_fn(ref_out, target) + ref_loss.backward() + + # Get a submodule, e.g. `layers.0` or `layers.1` + stage_indices = [ + self.rank + i * self.world_size + for i in range(stages_per_rank) + ] + print(f"Rank {self.rank} stages: {stage_indices}") + submod_names = [f"layers.{i}" for i in stage_indices] + stage_modules = [ + full_mod.get_submodule(submod_name) + for submod_name in submod_names + ] + # Create a pipeline stage to wrap that submodule + num_microbatches = ( + ScheduleClass.num_microbatches + if hasattr(ScheduleClass, "num_microbatches") + else 8 + ) + stages = [ + PipelineStage( + stage_module, + stage_idx, + n_stages, + self.device, + ) + for stage_module, stage_idx in zip(stage_modules, stage_indices) + ] + + # Attach to a schedule + schedule = ScheduleClass( + stages, num_microbatches, loss_fn=loss_fn, scale_grads=False + ) + assert isinstance(schedule, _PipelineScheduleRuntime) + + # Run + with check_leaked_tensors() as garbage_tensors: + for _ in range(2): + # Zero gradients + for stage_module in stage_modules: + stage_module.zero_grad() + if self.rank == 0: + schedule.step(x) + elif self.rank == self.world_size - 1: + losses = [] + out = schedule.step(target=target, losses=losses) + else: + schedule.step() + self.assertEqual( + len(garbage_tensors), + 0, + "Found leaked tensors, check logs above for debug info", + ) + dist.barrier() + + # Last rank checks result + if self.rank == self.world_size - 1: + # Check output + torch.testing.assert_close(out, ref_out) + # Check loss + # Since the reduction used in the loss function above is "sum", we use + # "sum" here to reduce microbatch losses into a single value too. 
+ pipe_loss = sum(losses) + torch.testing.assert_close(pipe_loss, ref_loss) + + # Every rank checks gradients + for stage_module, submod_name in zip(stage_modules, submod_names): + # Get corresponding submodule from reference model + ref_submod = ref_mod.get_submodule(submod_name) + # Check gradients per parameter + for name, p in stage_module.named_parameters(): + ref_p = ref_submod.get_parameter(name) + try: + torch.testing.assert_close(p.grad, ref_p.grad, rtol=1e-5, atol=4e-5) + except AssertionError: + print(f"Gradient test failed for {name}: {p.grad} vs {ref_p.grad}") + raise + + @parametrize( + "schedule_class", [ScheduleVShaped, ScheduleUnbalanced, ScheduleZBVZeroBubble] + ) + @parametrize("use_new_runtime", [False, True]) + def test_non_symmetric_stage_ids(self, schedule_class, use_new_runtime): + if schedule_class is ScheduleZBVZeroBubble: + n_stages = 4 + rank_stages = { + 0: [0, 3], + 1: [1, 2], + } + else: + n_stages = schedule_class.n_stages + rank_stages = schedule_class.rank_stages + full_mod = MultiMLP(d_hid, n_layers=n_stages) + full_mod.to(self.device) + + ref_mod = copy.deepcopy(full_mod) + x = torch.randn(batch_size, d_hid, device=self.device) + with torch.no_grad(): + y = ref_mod(x) + # Add a small perturbation + target = y + torch.randn(batch_size, d_hid, device=self.device) + + loss_fn = torch.nn.MSELoss(reduction="sum") + + # Run reference + for _ in range(2): + ref_mod.zero_grad() + ref_out = ref_mod(x) + ref_loss = loss_fn(ref_out, target) + ref_loss.backward() + + # Create a pipeline stage to wrap that submodule + num_microbatches = 1 + stage_indices = rank_stages.get(self.rank) + print(f"Rank {self.rank} stages: {stage_indices}") + submod_names = [f"layers.{i}" for i in stage_indices] + stage_modules = [ + full_mod.get_submodule(submod_name) + for submod_name in submod_names + ] + stages = [ + PipelineStage( + stage_module, + stage_idx, + n_stages, + self.device, + ) + for stage_module, stage_idx in zip(stage_modules, rank_stages.get(self.rank)) + ] + + schedule = schedule_class( + stages, + num_microbatches, + loss_fn=loss_fn, + scale_grads=False, + ) + if use_new_runtime: + old_schedule = schedule + schedule = _PipelineScheduleRuntime( + stages, + num_microbatches, + loss_fn=loss_fn, + ) + schedule._load_actions(old_schedule.pipeline_order) + + # Run + for _ in range(2): + # Zero gradients + for stage_module in stage_modules: + stage_module.zero_grad() + if self.rank == 0: + losses = [] + out = schedule.step(x, target=target, losses=losses) + else: + schedule.step() + + dist.barrier() + + # Last rank checks result + if self.rank == 0: + # Check output + torch.testing.assert_close(out, ref_out) + # Check loss + # Since the reduction used in the loss function above is "sum", we use + # "sum" here to reduce microbatch losses into a single value too. 
+ pipe_loss = sum(losses) + torch.testing.assert_close(pipe_loss, ref_loss) + + # Every rank checks gradients + for stage_module, submod_name in zip(stage_modules, submod_names): + # Get corresponding submodule from reference model + ref_submod = ref_mod.get_submodule(submod_name) + # Check gradients per parameter + for name, p in stage_module.named_parameters(): + ref_p = ref_submod.get_parameter(name) + try: + torch.testing.assert_close(p.grad, ref_p.grad, rtol=1e-5, atol=4e-5) + except AssertionError: + print(f"Gradient test failed for {name}: {p.grad} vs {ref_p.grad}") + raise + + @parametrize("ScheduleClass", [ScheduleInterleavedZeroBubble]) + def test_schedule_with_weight_update_mlp_e2e(self, ScheduleClass): + stages_per_rank = 2 + n_stages = stages_per_rank * self.world_size + full_mod = MultiMLPWithDw(d_hid, n_layers=n_stages) + full_mod.to(self.device) + + ref_mod = copy.deepcopy(full_mod) + x = torch.randn(batch_size, d_hid, device=self.device) + with torch.no_grad(): + y = ref_mod(x) + # Add a small perturbation + target = y + torch.randn(batch_size, d_hid, device=self.device) + + ref_loss_fn = torch.nn.MSELoss(reduction="sum") + full_loss_fn = torch.nn.MSELoss(reduction="sum") + + full_mod.toggle() + + # Get a submodule, e.g. `layers.0` or `layers.1` + stage_indices = [ + self.rank + i * self.world_size + for i in range(stages_per_rank) + ] + submod_names = [f"layers.{i}" for i in stage_indices] + stage_modules = [ + full_mod.get_submodule(submod_name) + for submod_name in submod_names + ] + + # Run reference + for _ in range(2): + ref_stage_modules = [ + ref_mod.get_submodule(submod_name) + for submod_name in submod_names + ] + for stage_module in ref_stage_modules: + stage_module.zero_grad() + + ref_mod.zero_grad() + ref_out = ref_mod(x) + ref_loss = ref_loss_fn(ref_out, target) + ref_loss.backward() + + class CustomState: + def __init__(self, stage_module, stage_idx, rank): + self.i = 0 + self.stage_module = stage_module + self.stage_idx = stage_idx + self.rank = rank + + def dw_builder(self): + def dw_runner(): + # This inner function would be called by PipelineStage during `backward_weight_one_chunk` + self.i += 1 + print( + f"[Rank {self.rank}] dw_count={self.i} stage={self.stage_idx}" + ) + self.stage_module.compute_dW() + + return dw_runner + + cs = {} + for stage_module, stage_idx in zip(stage_modules, stage_indices): + cs[stage_idx] = CustomState(stage_module, stage_idx, self.rank) + + # Create a pipeline stage to wrap that submodule + chunks = 2 + stages = [ + PipelineStage( + stage_module, + stage_idx, + n_stages, + self.device, + dw_builder=cs[stage_idx].dw_builder, + ) + for stage_module, stage_idx in zip(stage_modules, stage_indices) + ] + + # Attach to a schedule + schedule = ScheduleClass( + stages, chunks, loss_fn=full_loss_fn, scale_grads=False + ) + + for _ in range(2): + # Zero gradients + for stage_module in stage_modules: + stage_module.zero_grad() + if self.rank == 0: + schedule.step(x) + elif self.rank == self.world_size - 1: + losses = [] + out = schedule.step(target=target, losses=losses) + else: + schedule.step() + + dist.barrier() + # Last rank checks result + if self.rank == self.world_size - 1: + # Check output + torch.testing.assert_close(out, ref_out) + + # Check loss + # Since the reduction used in the loss function above is "sum", we use + # "sum" here to reduce microbatch losses into a single value too. 
+ pipe_loss = sum(losses) + torch.testing.assert_close(pipe_loss, ref_loss) + + # Every rank checks gradients + for stage_module, submod_name in zip(stage_modules, submod_names): + # Get corresponding submodule from reference model + ref_submod = ref_mod.get_submodule(submod_name) + # Check gradients per parameter + for name, p in stage_module.named_parameters(): + ref_p = ref_submod.get_parameter(name) + torch.testing.assert_close(p.grad, ref_p.grad, rtol=1e-5, atol=4e-5) + + +instantiate_parametrized_tests(ScheduleTest) + + +if __name__ == "__main__": + # Check if NPU and HCCL are available + if not ( + dist.is_available() + and dist.is_hccl_available() + and torch.npu.device_count() > 1 + ): + print( + "c10d HCCL not available or not enough NPUs, skipping tests", + file=sys.stderr, + ) + sys.exit(0) + + rank = int(os.getenv("RANK", -1)) + world_size = int(os.getenv("WORLD_SIZE", 2)) + + if rank != -1: + # Launched with torchrun or other multi-proc launchers. Directly run the test. + ScheduleTest.run_rank(rank, world_size) + else: + # Launched as a single process. Spawn subprocess to run the tests. + # Also need a rendezvous file for `init_process_group` purpose. + rdvz_file = tempfile.NamedTemporaryFile(delete=False).name + torch.multiprocessing.spawn( + ScheduleTest.run_rank, + nprocs=world_size, + args=(world_size, rdvz_file), + ) diff --git a/test/distributed/pipelining/test_stage.py b/test/distributed/pipelining/test_stage.py new file mode 100644 index 0000000000..b47a0e17a3 --- /dev/null +++ b/test/distributed/pipelining/test_stage.py @@ -0,0 +1,342 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# Owner(s): ["oncall: distributed"] +import os +import sys +import tempfile + +from model_registry import ExampleCode, ModelWithKwargs, MultiMLP + +import torch +import torch.distributed as dist +from torch.distributed.pipelining import ( + build_stage, + pipeline, + PipelineStage, + ScheduleGPipe, +) +from torch.distributed.pipelining._utils import PipeliningShapeError +from torch.testing._internal.common_cuda import TEST_MULTIGPU +from torch.testing._internal.common_distributed import ( + MultiProcContinousTest, + requires_nccl, +) +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + skip_but_pass_in_sandcastle_if, +) +from torch.utils._pytree import tree_map_only + + +d_hid = 512 +batch_size = 256 +chunks = 4 + +torch.manual_seed(0) + + +def get_dtype_change_hook(new_dtype): + """A simple hook for simulating mixed precision""" + + def dtype_change_hook(module, in_put, output): + def f(x): + return x.to(new_dtype) + + return tree_map_only(torch.Tensor, f, output) + + return dtype_change_hook + + +def get_flatten_hook(): + """A simple hook for simulating wrong model output shape""" + + def flatten_hook(module, in_put, output): + def f(x): + return x.flatten() + + return tree_map_only(torch.Tensor, f, output) + + return flatten_hook + + +class StageTest(MultiProcContinousTest): + @classmethod + def backend_str(cls) -> str: + # Testing with HCCL backend + return "hccl" + + @classmethod + def setUpClass(cls): + """ + Class-scope test fixture. Run once for entire test class, before any test starts. + Set up the device. 
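+ Each rank selects its NPU as rank % torch.npu.device_count().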
+ """ + super().setUpClass() + dev_id = cls.rank % torch.npu.device_count() + cls.device = torch.device(f"npu:{dev_id}") + + @parametrize("ModelClass", [ExampleCode, MultiMLP]) + def test_tracer(self, ModelClass): + mod = ModelClass(d_hid) + mod.to(self.device) + + x = torch.randn(batch_size, d_hid, device=self.device) + x_mb = x.chunk(chunks)[0] + + split_spec = mod.split_spec if hasattr(mod, "split_spec") else None + pipe = pipeline( + mod, + mb_args=(x_mb,), + split_spec=split_spec, + ) + + stage = pipe.build_stage( + self.rank, + self.device, + ) + + # Attach to a schedule + schedule = ScheduleGPipe(stage, chunks) + + # Run + def _run_step(x): + if self.rank == 0: + return schedule.step(x) + else: + return schedule.step() + + out = _run_step(x) + # Last rank checks result + if self.rank == self.world_size - 1: + ref_out = mod(x) + torch.testing.assert_close(out, ref_out, atol=1e-3, rtol=5e-2) + + # Test qualname mapping + submod_keys = stage.submod.state_dict().keys() + # Confirm keys are consistent with original model + old_keys = mod.state_dict().keys() + assert all(k in old_keys for k in submod_keys) + + if self.rank == 0: + # intended to run this code on all ranks, but the problem is if rank0 throws, + # it won't perform the send that unblocks rank 1. + + with self.assertRaisesRegex(PipeliningShapeError, "dtype mismatch"): + _run_step(x.to(torch.int32)) + + # output of stage's mlp layer will be flattened by this hook, the stage should err + handle = stage.submod.register_forward_hook(get_flatten_hook()) + with self.assertRaisesRegex(PipeliningShapeError, "shape mismatch"): + _run_step(x) + handle.remove() + + stage.submod.register_forward_hook(get_dtype_change_hook(torch.bfloat16)) + with self.assertRaisesRegex(PipeliningShapeError, "dtype mismatch"): + _run_step(x) + + @parametrize("ModelClass", [ModelWithKwargs]) + def test_tracer_kwargs(self, ModelClass): + mod = ModelClass(d_hid) + mod.to(self.device) + + x = torch.randn(batch_size, d_hid, device=self.device) + y = torch.randn(batch_size, d_hid, device=self.device) + + x_mb = x.chunk(chunks)[0] + y_mb = y.chunk(chunks)[0] + + pipe = pipeline( + mod, + mb_args=(x_mb,), + mb_kwargs={"y": y_mb}, + ) + + stage_mod = pipe.get_stage_module(self.rank) + + # Test build_stage + stage = build_stage( + stage_mod, + self.rank, + pipe.info(), + self.device, + ) + + # Attach to a schedule + schedule = ScheduleGPipe(stage, chunks) + + # Run + if self.rank == 0: + out = schedule.step(x, y=y) + else: + out = schedule.step() + + # Last rank checks result + if self.rank == self.world_size - 1: + ref_out = mod(x, y=y) + torch.testing.assert_close(out, ref_out, atol=1e-3, rtol=5e-2) + + # Test qualname mapping + submod_keys = stage.submod.state_dict().keys() + # Confirm keys are consistent with original model + old_keys = mod.state_dict().keys() + assert all(k in old_keys for k in submod_keys) + + def test_manual(self): + full_mod = MultiMLP(d_hid, n_layers=self.world_size) + full_mod.to(self.device) + stage_mod = full_mod.get_submodule(f"layers.{self.rank}") + + x = torch.randn(batch_size, d_hid, device=self.device) + + stage = PipelineStage( + stage_mod, + self.rank, + self.world_size, + self.device, + ) + + # Attach to a schedule + schedule = ScheduleGPipe(stage, chunks) + + # Run + def _run_step(x): + if self.rank == 0: + return schedule.step(x) + else: + return schedule.step() + + out = _run_step(x) + # Last rank checks result + if self.rank == self.world_size - 1: + ref_out = full_mod(x) + torch.testing.assert_close(out, ref_out) + + if 
self.rank == 0: + with self.assertRaisesRegex(PipeliningShapeError, "shape mismatch"): + _run_step(torch.randn(batch_size + 1, d_hid, device=self.device)) + + with self.assertRaisesRegex(PipeliningShapeError, "dtype mismatch"): + _run_step(x.to(torch.int32)) + + # output of stage's mlp layer will be flattened by this hook, the stage should err + handle = stage_mod.register_forward_hook(get_flatten_hook()) + with self.assertRaisesRegex(PipeliningShapeError, "shape mismatch"): + _run_step(x) + handle.remove() + + stage_mod.register_forward_hook(get_dtype_change_hook(torch.bfloat16)) + with self.assertRaisesRegex(PipeliningShapeError, "dtype mismatch"): + _run_step(x) + + def test_custom_dw_with_fb_schedule(self): + """Tests that separate weight grad function 'dw_runner' gets run under a schedule that's only aware of F/B.""" + full_mod = MultiMLP(d_hid, n_layers=self.world_size) + full_mod.to(self.device) + stage_mod = full_mod.get_submodule(f"layers.{self.rank}") + + x = torch.randn(batch_size, d_hid, device=self.device) + target = torch.randn(batch_size, d_hid, device=self.device) + + class CustomState: + def __init__(self) -> None: + self.i = 0 + + def dw_builder(self): + """This simulates a function attached to a model with a custom backward. + Each call to builder gives a new dw_runner that has some updated state to compute the latest dw. + """ + + def dw_runner(): + # This inner function would be called by PipelineStage during `backward_weight_one_chunk` + print(f"dw called {self.i}th time") + self.i += 1 + + return dw_runner + + cs = CustomState() + + stage = PipelineStage( + stage_mod, + self.rank, + self.world_size, + self.device, + dw_builder=cs.dw_builder, + ) + + # Attach to a schedule + schedule = ScheduleGPipe( + stage, chunks, loss_fn=torch.nn.MSELoss(reduction="sum") + ) + + # Run + def _run_step(x): + if self.rank == 0: + return schedule.step(x) + elif self.rank == self.world_size - 1: + return schedule.step(target=target) + else: + return schedule.step() + + out = _run_step(x) + + self.assertEqual(cs.i, chunks) + + # Last rank checks result + if self.rank == self.world_size - 1: + ref_out = full_mod(x) + torch.testing.assert_close(out, ref_out) + + if self.rank == 0: + with self.assertRaisesRegex(PipeliningShapeError, "shape mismatch"): + _run_step(torch.randn(batch_size + 1, d_hid, device=self.device)) + + def test_custom_dw_errors(self): + """Tests expected errors are raised""" + full_mod = MultiMLP(d_hid, n_layers=self.world_size) + full_mod.to(self.device) + stage_mod = full_mod.get_submodule(f"layers.{self.rank}") + + x = torch.randn(batch_size, d_hid, device=self.device) + target = torch.randn(batch_size, d_hid, device=self.device) + + stage_with_dw_builder = PipelineStage( + stage_mod, + self.rank, + self.world_size, + self.device, + dw_builder=lambda: None, + ) + with self.assertRaisesRegex(AssertionError, "backward_one_chunk"): + stage_with_dw_builder.backward_weight_one_chunk(bwd_chunk_id=0) + + +instantiate_parametrized_tests(StageTest) + +if __name__ == "__main__": + # Check if NPU and HCCL are available + if not ( + dist.is_available() + and dist.is_hccl_available() + and torch.npu.device_count() > 1 + ): + print( + "c10d HCCL not available or not enough GPUs, skipping tests", + file=sys.stderr, + ) + sys.exit(0) + + rank = int(os.getenv("RANK", -1)) + world_size = int(os.getenv("WORLD_SIZE", 2)) + + if rank != -1: + # Launched with torchrun or other multi-proc launchers. Directly run the test. 
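+ # RANK and WORLD_SIZE are taken from the environment variables read above
+ # (WORLD_SIZE defaults to 2 when unset).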
+ StageTest.run_rank(rank, world_size) + else: + # Launched as a single process. Spawn subprocess to run the tests. + # Also need a rendezvous file for `init_process_group` purpose. + rdvz_file = tempfile.NamedTemporaryFile(delete=False).name + torch.multiprocessing.spawn( + StageTest.run_rank, + nprocs=world_size, + args=(world_size, rdvz_file), + ) diff --git a/test/distributed/pipelining/test_transformer.py b/test/distributed/pipelining/test_transformer.py new file mode 100644 index 0000000000..8ac9a30089 --- /dev/null +++ b/test/distributed/pipelining/test_transformer.py @@ -0,0 +1,75 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# Owner(s): ["oncall: distributed"] +import torch +from torch.distributed.pipelining import pipeline, SplitPoint +from torch.testing._internal.common_utils import run_tests, TestCase + + +d_hid = 16 +n_layers = 8 +microbatch_size = 4 + + +class MLPModule(torch.nn.Module): + def __init__(self, _d_hid): + super().__init__() + self.net1 = torch.nn.Linear(_d_hid, _d_hid) + self.relu = torch.nn.ReLU() + self.net2 = torch.nn.Linear(_d_hid, _d_hid) + + def forward(self, x): + x = self.net1(x) + x = self.relu(x) + x = self.net2(x) + return x + + +class TransformerLike(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.layers = torch.nn.Sequential(*[MLPModule(d_hid) for _ in range(n_layers)]) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.layers(x) + + +class TransformerTests(TestCase): + def test_ir(self): + transformer = TransformerLike() + x = torch.randn(microbatch_size, d_hid) + + # Split into 2 stages + num_stages = 2 + split_spec = {f"layers.{n_layers // num_stages}": SplitPoint.BEGINNING} + + pipe = pipeline( + transformer, + (x,), + split_spec=split_spec, + ) + assert pipe.num_stages == num_stages, f"{pipe.num_stages=}, expect {num_stages}" + + def get_layers(module): + layers = [name for name, _ in module.layers.named_children()] + return layers + + # Collect all layers in pipe + layers = [] + for stage_idx in range(pipe.num_stages): + stage_mod = pipe.get_stage_module(stage_idx) + layers += get_layers(stage_mod) + + # Check layer completeness + orig_layers = get_layers(transformer) + assert sorted(layers) == sorted(orig_layers), f"{layers} != {orig_layers}" + print("Layers matched!") + + # Check equivalence + ref = transformer(x) + out = pipe(x)[0] + torch.testing.assert_close(out, ref) + print(f"Equivalence test passed {torch.sum(out)} ref {torch.sum(ref)}") + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/pipelining/test_unflatten.py b/test/distributed/pipelining/test_unflatten.py new file mode 100644 index 0000000000..ba0b3c62a2 --- /dev/null +++ b/test/distributed/pipelining/test_unflatten.py @@ -0,0 +1,75 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates +# Owner(s): ["oncall: distributed"] +import torch +from torch.distributed.pipelining import pipe_split, pipeline +from torch.testing._internal.common_utils import run_tests, TestCase + + +# Building block for model +class Block(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.conv = torch.nn.Conv2d( + in_channels=16, out_channels=16, kernel_size=3, padding=1 + ) + self.lin0 = torch.nn.Linear(256, 256) + self.relu = torch.nn.ReLU() + self.lin1 = torch.nn.Linear(256, 256) + + def forward(self, x: torch.Tensor, constant=None) -> torch.Tensor: + x = self.conv(x) + x = self.lin0(x) + pipe_split() + x.add(constant) + x = self.lin1(x) + return self.relu(x) + + +# Full model +class M(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.block0 = Block() + self.block1 = Block() + + def forward(self, x: torch.Tensor, constant=None) -> torch.Tensor: + x = self.block0(x, constant=constant) + pipe_split() + x = self.block1(x, constant=constant) + return x + + +class UnflattenTests(TestCase): + def test_unflatten(self): + x = torch.randn(1, 16, 256, 256) + constant = torch.ones(1, 16, 256, 256) + + mod = M() + + pipe = pipeline( + mod, + (x,), + {"constant": constant}, + ) + + assert pipe.num_stages == 4 + orig_state_dict = mod.state_dict() + + # Check qualnames + for stage_idx in range(pipe.num_stages): + stage_mod = pipe.get_stage_module(stage_idx) + for param_name, _ in stage_mod.named_parameters(): + assert ( + param_name in orig_state_dict + ), f"{param_name} not in original state dict" + print("Param qualname test passed") + + # Check equivalence + ref = mod(x, constant) + out = pipe(x, constant)[0] + torch.testing.assert_close(out, ref) + print(f"Equivalence test passed {torch.sum(out)} ref {torch.sum(ref)}") + + +if __name__ == "__main__": + run_tests() -- Gitee From 80d36974747970f5fb98a992d2d6b259d4612350 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Wed, 4 Jun 2025 08:19:51 +0000 Subject: [PATCH 028/328] !21509 !17399 add dump for exec timeout Merge pull request !21509 from huangyunlong/2.7ft3 --- test/cpp_extensions/test/dump_allreduce.py | 26 +++++ .../test/test_cpp_extensions_aot.py | 13 +++ torch_npu/csrc/distributed/HCCLUtils.cpp | 46 ++++++++ torch_npu/csrc/distributed/HCCLUtils.hpp | 2 +- .../csrc/distributed/ProcessGroupHCCL.cpp | 101 ++++++++++++++++-- .../csrc/distributed/ProcessGroupHCCL.hpp | 76 +++++++------ torch_npu/csrc/distributed/TraceUtils.h | 76 +++++-------- 7 files changed, 254 insertions(+), 86 deletions(-) create mode 100644 test/cpp_extensions/test/dump_allreduce.py diff --git a/test/cpp_extensions/test/dump_allreduce.py b/test/cpp_extensions/test/dump_allreduce.py new file mode 100644 index 0000000000..c62b386a00 --- /dev/null +++ b/test/cpp_extensions/test/dump_allreduce.py @@ -0,0 +1,26 @@ +import os + +os.environ["TASK_QUEUE_ENABLE"] = "1" +os.environ["ASCEND_LAUNCH_BLOCKING"] = "0" +os.environ["HCCL_EXEC_TIMEOUT"] = "160" +os.environ["TORCH_HCCL_ENABLE_MONITORING"] = "1" +os.environ["TORCH_HCCL_DUMP_ON_TIMEOUT"] = "1" +os.environ["TORCH_HCCL_TRACE_BUFFER_SIZE"] = "1024" +os.environ["TORCH_HCCL_DEBUG_INFO_TEMP_FILE"] = "./hccl_trace_rank_" + +import torch +import torch.distributed as dist +import torch_npu + +import torch_test_cpp_extension.npu as npu_extension + +backend = "hccl" +dist.init_process_group(backend) + +rank = dist.get_rank() +torch.npu.set_device(rank) +t = torch.rand(2).npu() + +dist.all_reduce(t) +t = npu_extension.blocking_ops(t) +dist.all_reduce(t) diff --git 
a/test/cpp_extensions/test/test_cpp_extensions_aot.py b/test/cpp_extensions/test/test_cpp_extensions_aot.py index 638eb16f63..83650c5b3b 100644 --- a/test/cpp_extensions/test/test_cpp_extensions_aot.py +++ b/test/cpp_extensions/test/test_cpp_extensions_aot.py @@ -194,5 +194,18 @@ class TestCppExtensionAOT(TestCase): self._test_multiprocess(TestCppExtensionAOT._test_op_hook_with_all_reduce, TestCppExtensionAOT._init_dist_hccl, expected, input1, world_size) + def test_dump_allreduce(self): + dump_pth = "./hccl_trace_rank_0" + code_file = os.path.join(pathlib.Path(__file__).absolute().parent, "dump_allreduce.py") + cmd = ["torchrun", "--nproc-per-node=1", code_file] + p = subprocess.Popen(cmd) + p.wait() + + self.assertTrue(os.path.exists(dump_pth)) + self.assertTrue(os.path.exists(dump_pth + "_py_traceback")) + os.remove(dump_pth) + os.remove(dump_pth + "_py_traceback") + + if __name__ == "__main__": run_tests() diff --git a/torch_npu/csrc/distributed/HCCLUtils.cpp b/torch_npu/csrc/distributed/HCCLUtils.cpp index 96ad6469a0..a45b34e60e 100644 --- a/torch_npu/csrc/distributed/HCCLUtils.cpp +++ b/torch_npu/csrc/distributed/HCCLUtils.cpp @@ -1,4 +1,7 @@ #include +#include + +#include #include "torch_npu/csrc/core/npu/interface/HcclInterface.h" #include "torch_npu/csrc/distributed/HCCLUtils.hpp" @@ -221,4 +224,47 @@ HcclResult HCCLComm::checkForHcclError() #endif } +void DebugInfoWriter::write(const std::string &hcclTrace) +{ + // Open a file for writing. The ios::binary flag is used to write data as + // binary. + std::ofstream file(filename_, std::ios::binary); + + // Check if the file was opened successfully. + if (!file.is_open()) { + LOG(ERROR) << "Error opening file for writing HCCLPG debug info: " + << filename_; + return; + } + + file.write(hcclTrace.data(), hcclTrace.size()); + LOG(INFO) << "Finished writing HCCLPG debug info to " << filename_; +} + +DebugInfoWriter &DebugInfoWriter::getWriter(int rank) +{ + if (writer_ == nullptr) { + std::string fileNamePrefix = c10d::getCvarString( + {"TORCH_HCCL_DEBUG_INFO_TEMP_FILE"}, "/tmp/hccl_trace_rank_"); + // Using std::unique_ptr here to auto-delete the writer object + // when the pointer itself is destroyed. 
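        // A minimal sketch, assuming the base constructor keeps the (name prefix, rank)
        // signature used just below and stays reachable from subclasses, of how a custom
        // sink could be installed ahead of time; getWriter() only builds this default
        // file writer when nothing was registered earlier:
        //
        //   class StderrTraceWriter : public c10d_npu::DebugInfoWriter {
        //   public:
        //       StderrTraceWriter(const std::string& prefix, int rank)
        //           : DebugInfoWriter(prefix, rank) {}
        //       void write(const std::string& hcclTrace) override {
        //           std::cerr << "HCCL trace dump: " << hcclTrace.size() << " bytes\n";
        //       }
        //   };
        //   // once, during start-up, before any timeout dump can fire:
        //   c10d_npu::DebugInfoWriter::registerWriter(
        //       std::make_unique<StderrTraceWriter>("/unused-prefix", rank));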
+ std::unique_ptr writerPtr( + new DebugInfoWriter(fileNamePrefix, rank)); + DebugInfoWriter::registerWriter(std::move(writerPtr)); + } + return *writer_; +} + +void DebugInfoWriter::registerWriter(std::unique_ptr writer) +{ + TORCH_CHECK_WITH( + DistBackendError, + hasWriterRegistered_.load() == false, + "debugInfoWriter already registered"); + hasWriterRegistered_.store(true); + writer_ = std::move(writer); +} + +std::unique_ptr DebugInfoWriter::writer_ = nullptr; +std::atomic DebugInfoWriter::hasWriterRegistered_(false); } // namespace c10d_npu diff --git a/torch_npu/csrc/distributed/HCCLUtils.hpp b/torch_npu/csrc/distributed/HCCLUtils.hpp index 12d6d87c26..cbc5491735 100644 --- a/torch_npu/csrc/distributed/HCCLUtils.hpp +++ b/torch_npu/csrc/distributed/HCCLUtils.hpp @@ -139,7 +139,7 @@ protected: class TORCH_API DebugInfoWriter { public: - virtual ~DebugInfoWriter(); + virtual ~DebugInfoWriter() = default; virtual void write(const std::string &hcclTrace); static DebugInfoWriter &getWriter(int rank); static void registerWriter(std::unique_ptr writer); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 9b584b35a2..5a2e407f2c 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -414,8 +414,8 @@ int ProcessGroupHCCL::deviceId_ = -1; int ProcessGroupHCCL::numRanks_ = -1; std::string ProcessGroupHCCL::exceptionMessage_ = ""; std::shared_ptr logger = npu_logging::logging().getLogger("torch.distributed"); - std::atomic ProcessGroupHCCL::shouldDump_(false); +std::atomic ProcessGroupHCCL::monitorThreadEnabled_(false); std::string dump_hccl_trace( bool includeCollectives, @@ -468,6 +468,10 @@ std::ostream& operator<<(std::ostream& output, const ProcessGroupHCCL::WorkHCCL& workHCCL.seq_, ", OpType=", opTypeToString(workHCCL.opType_), + ", NumelIn=", + workHCCL.numelIn_, + ", NumelOut=", + workHCCL.numelOut_, ", Timeout(ms)=", workHCCL.opTimeout_.count(), ")"); @@ -488,7 +492,7 @@ ProcessGroupHCCL::WorkHCCL::WorkHCCL( // Creates the npu event wrappers // Note: The actual events are lazily created when first recorded to with // DEFAULT_FLAGS = npuEventDisableTiming. 
- if (desyncDebug || (status_save_enable)) { + if (desyncDebug || (status_save_enable) || ProcessGroupHCCL::monitorThreadEnabled_.load()) { hcclStartEvents_ = std::make_shared>(); hcclStartEvents_->reserve(devices.size()); for (size_t i = 0; i < devices.size(); i++) { @@ -512,9 +516,12 @@ ProcessGroupHCCL::WorkHCCL::WorkHCCL(const WorkHCCL& w) workStartTime_(w.workStartTime_), seq_(w.seq_), startTraceUpdated_(w.startTraceUpdated_), + numelIn_(w.numelIn_), + numelOut_(w.numelOut_), store_(w.store_), is_dispatched(w.is_dispatched), is_reported(w.is_reported), + is_dumped(w.is_dumped), trace_id_(w.trace_id_) { exception_ = w.exception_; @@ -693,6 +700,27 @@ void ProcessGroupHCCL::WorkHCCL::checkDispatch() } } +bool ProcessGroupHCCL::WorkHCCL::checkExec() +{ + if (is_dumped) { + return false; + } + + static int32_t hccl_exec_timeout = c10_npu::option::OptionsManager::GetHCCLExecTimeout(); + if (hccl_exec_timeout <= 0) { + hccl_exec_timeout = 1800; + } + int32_t timeout = std::max(60, hccl_exec_timeout - 60); + auto currentTimepoint = std::chrono::steady_clock::now(); + auto timeElapsed = std::chrono::duration_cast(currentTimepoint - workStartTime_); + + if (timeElapsed > std::chrono::milliseconds(timeout * 1000)) { + is_dumped = true; + return true; + } + return false; +} + void ProcessGroupHCCL::WorkHCCL::synchronize() { // Call Synchronize without a timeout. We use this method to avoid adding a @@ -1367,9 +1395,9 @@ void ProcessGroupHCCL::heartbeatMonitor() "Received a dump signal from this local rank and will ", "start to dump the debug info. ", "Last enqueued HCCL work: ", - pgStatus_.lastEnqueuedSeq, + pgStatus_->lastEnqueuedSeq, ", last completed HCCL work: ", - pgStatus_.lastCompletedSeq, + pgStatus_->lastCompletedSeq, "."); exitMsg = c10::str( "ProcessGroupHCCL's watchdog detected an exception from the local rank. ", @@ -1385,7 +1413,7 @@ void ProcessGroupHCCL::heartbeatMonitor() // we haven't polled for `heartbeat_timeout` seconds and there haven't // any work added or removed for `watchdog_timeout` seconds. if (computeDeltaMS(lastWorkListUpdateTime_, currentTime) >= kWatchdogThreadSleepMillis && - computeDeltaMS(lastTimePollStore, currentTime) >= coordCheckIntervalMilSec_) { + computeDeltaMS(lastTimePollStore, currentTime) >= coordCheckIntervalMilSec_ && !hasGlobalDumped) { lastTimePollStore = currentTime; // Wrap globalStore_->check() in a try-catch block to avoid crashing if // the store is not available. @@ -1426,9 +1454,9 @@ void ProcessGroupHCCL::heartbeatMonitor() timeOutRank, ", and will start to dump the debug info. ", "Last enqueued HCCL work: ", - pgStatus_.lastEnqueuedSeq, + pgStatus_->lastEnqueuedSeq, ", last completed HCCL work: ", - pgStatus_.lastCompletedSeq, + pgStatus_->lastCompletedSeq, "."); exitMsg = c10::str( "ProcessGroupHCCL's watchdog detected a dump signal from rank ", @@ -1441,6 +1469,7 @@ void ProcessGroupHCCL::heartbeatMonitor() "bugs in the communications library (e.g. HCCL), etc. 
We tried our best to ", "dump the debug info into the storage to help you debug the issue."); dumpTraceAndResetStatus(); + hasGlobalDumped = true; } } } @@ -1692,6 +1721,25 @@ void ProcessGroupHCCL::workCleanupLoop() } work.checkAndSetException(); work.checkDispatch(); + bool exec_timeout = work.checkExec(); + if (dumpOnException_ && exec_timeout) { + try { + auto rank = globalRank(); + auto vec = std::vector( + reinterpret_cast(&rank), + reinterpret_cast(&rank) + sizeof(rank)); + globalStore_->set(std::string(EXCEPTION_DUMP), vec); + if (!shouldDump_.load()) { + LOG(ERROR) << logPrefix() + << "First watchdog exec timeout to set the dump signal."; + } + shouldDump_.store(true); + } catch (const std::exception &e) { + LOG(ERROR) << logPrefix() + << "Failed to set exec timeout dump signal in tcpstore. " + << "Error: " << e.what(); + } + } bool timedOut = work.checkTimeout(); // If work hits an exception (either an error or timeout) @@ -1749,6 +1797,17 @@ void ProcessGroupHCCL::workCleanupLoop() } } + // a work could be started but not completed, so we should not update + // lastStartedSeq and lastStartedOpName if the work state is checked + // multiple times after the start + if (monitorThreadEnabled_.load() && pgStatus_->lastStartedSeq < static_cast(work.seq_) && + work.isStarted()) { + pgStatus_->lastStartedSeq = static_cast(work.seq_); + pgStatus_->lastStartedWorkName = opTypeToString(work.opType_); + pgStatus_->lastStartedNumelIn = work.numelIn_; + pgStatus_->lastStartedNumelOut = work.numelOut_; + } + // Clean up completed work if (work.isCompleted()) { if (*(work.is_dispatched) && work.is_reported) { @@ -1759,6 +1818,10 @@ void ProcessGroupHCCL::workCleanupLoop() if (status_save_enable) { refreshStatusInfo(work, "end"); // Update Statusinfo,but not write into the map } + pgStatus_->lastCompletedSeq = static_cast(work.seq_); + pgStatus_->lastCompletedWorkName = opTypeToString(work.opType_); + pgStatus_->lastCompletedNumelIn = work.numelIn_; + pgStatus_->lastCompletedNumelOut = work.numelOut_; HCCLTraceBuffer::get()->retire_id(work.trace_id_, true); it = workMetaList_.erase(it); c10_npu::NPUGraph::dec_pending_event_queries(); @@ -2636,6 +2699,8 @@ c10::intrusive_ptr ProcessGroupHCCL::initWork( outputs, desyncDebug_? &((*(r->hcclStartEvents_))[0]) : nullptr, &((*(r->hcclEndEvents_))[0]), + options_->timeout, + pgStatus_, isP2P); } return r; @@ -2665,6 +2730,11 @@ void ProcessGroupHCCL::workEnqueue(c10::intrusive_ptrlastEnqueuedSeq = work->seq_; + pgStatus_->lastEnqueuedWorkName = opTypeToString(work->opType_); + pgStatus_->lastEnqueuedNumelIn = work->numelIn_; + pgStatus_->lastEnqueuedNumelOut = work->numelOut_; } } @@ -3142,6 +3212,16 @@ c10::intrusive_ptr ProcessGroupHCCL::collective( work->blockingWait_ = blockingWait_; work->opTimeout_ = options_->timeout; work->store_ = store_; + // Record size info for debug. We only record the size on the first device as + // multi-device per process is deprecated + work->numelIn_ = 0; + work->numelOut_ = 0; + for (const auto& input : inputs) { + work->numelIn_ += input.numel(); + } + for (const auto& output : outputs) { + work->numelOut_ += output.numel(); + } c10_npu::NPUGraph::inc_pending_event_queries(); if (asyncErrorHandling_ != NoHandling && capture_status == c10_npu::CaptureStatus::None) { workEnqueue(work); @@ -3303,6 +3383,10 @@ c10::intrusive_ptr ProcessGroupHCCL::collectiveCoalesced( work->blockingWait_ = blockingWait_; work->opTimeout_ = options_->timeout; work->store_ = store_; + // Record size info for debug. 
We only record the size on the first device as + // multi-device per process is deprecated + work->numelIn_ = inputs[0].numel(); + work->numelOut_ = outputs[0].numel(); c10_npu::NPUGraph::inc_pending_event_queries(); if (asyncErrorHandling_ != NoHandling && capture_status == c10_npu::CaptureStatus::None) { workEnqueue(work); @@ -3480,6 +3564,9 @@ c10::intrusive_ptr ProcessGroupHCCL::pointToPoint( work->blockingWait_ = blockingWait_; work->opTimeout_ = options_->timeout; work->store_ = store_; + // Record size info for debug. We only record the size on the first device + // as multi-device per process is deprecated + work->numelIn_ = work->numelOut_ = tensors[i].numel(); } c10_npu::NPUGraph::inc_pending_event_queries(); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index e1ccd719fe..fe3315196c 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -69,6 +69,37 @@ static std::vector TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC = { static std::vector TORCH_HCCL_COORD_CHECK_MILSEC = { "TORCH_HCCL_COORD_CHECK_MILSEC"}; +// A struct to hold the latest status of the process group. +struct ProcessGroupStatus { + // the sequential number of the last collective enqueued into workMetaList_ + // This is useful for indentifying a rank that has not join a collective + // initialized to be -1 to indicate no collective has been enqueued + int64_t lastEnqueuedSeq{-1}; + // the sequential number of the last collective started as the kernel + int64_t lastStartedSeq{-1}; + // the sequential number of the last colletive completed marked by + // the watchdog thread + // initialized to be -1 to indicate no collective has been completed + int64_t lastCompletedSeq{-1}; + + // the name of the last collective enqueued into workMetaList_ + std::string lastEnqueuedWorkName; + // the name of the last collective started as the kernel + std::string lastStartedWorkName; + // the name of the last collective completed + std::string lastCompletedWorkName; + + // the sizes of the last work enqueued + size_t lastEnqueuedNumelIn; + size_t lastEnqueuedNumelOut; + // the sizes of the last work completed + size_t lastCompletedNumelIn; + size_t lastCompletedNumelOut; + // the sizes of the last work started + size_t lastStartedNumelIn; + size_t lastStartedNumelOut; +}; + struct DumpPipe { DumpPipe(int rank) { @@ -202,6 +233,8 @@ public: std::shared_ptr is_dispatched = std::make_shared(false); bool is_reported = false; + bool is_dumped = false; + // Checks if request has completed. In this specific case of HCCL, it checks // if the HCCL operation has completed on the NPU in its own HCCL stream. // Non-blocking operation. @@ -241,6 +274,8 @@ public: void checkDispatch(); + bool checkExec(); + protected: // The cached list of NPU devices to operate on. // HCCL support one device per rank only @@ -279,6 +314,11 @@ public: // This will be used by desync debug. bool startTraceUpdated_{false}; + // Record collective sizes for debug. We only record the size on the first + // device as multi-device per process is deprecated + size_t numelIn_ = -1; + size_t numelOut_ = -1; + // Wrapper method for the static checkForHCCLErrors which can be overridden // for tests. virtual std::exception_ptr checkForHCCLErrors( @@ -359,34 +399,6 @@ public: uint32_t master_port; }; - // A struct to hold the latest status of the process group. 
- struct ProcessGroupStatus { - // the sequential number of the last collective enqueued into workMetaList_ - // This is useful for indentifying a rank that has not join a collective - // initialized to be -1 to indicate no collective has been enqueued - int64_t lastEnqueuedSeq{-1}; - // the sequential number of the last collective started as the kernel - int64_t lastStartedSeq{-1}; - // the sequential number of the last colletive completed marked by - // the watchdog thread - // initialized to be -1 to indicate no collective has been completed - int64_t lastCompletedSeq{-1}; - - // the name of the last collective enqueued into workMetaList_ - std::string lastEnqueuedWorkName; - // the name of the last collective started as the kernel - std::string lastStartedWorkName; - // the name of the last collective completed - std::string lastCompletedWorkName; - - // the sizes of the last work enqueued - size_t lastEnqueuedNumelIn; - size_t lastEnqueuedNumelOut; - // the sizes of the last work completed - size_t lastCompletedNumelIn; - size_t lastCompletedNumelOut; - }; - // If you wish to create multiple process groups, each with a potentially // different rank and size, you can do so by passing a new store instance // to each one. If you have only a single store object, you can @@ -742,7 +754,7 @@ protected: int hcclTraceBufferSize_; // We gate the heartbeat monitor thread so that we can roll it out gradually. - std::atomic monitorThreadEnabled_; + static std::atomic monitorThreadEnabled_; // Monitor thread which checks the heartbeat of Watchdog thread. // If the monitor thread finds there is no heartbeat, it will dump debug info @@ -848,6 +860,8 @@ protected: // timeout and hccl errors. bool dumpOnException_; + bool hasGlobalDumped = false; + // the perfdump path static std::string perfdumppath; @@ -891,6 +905,8 @@ protected: std::exception_ptr watchDogException_ = nullptr; + std::shared_ptr pgStatus_ = std::make_shared(); + struct StatusStruct { uint64_t seq = 0; std::string pgId; @@ -923,8 +939,6 @@ protected: std::string pg_desc_; - ProcessGroupStatus pgStatus_; - private: // Helper that encapsulates work shared across all collective communication // primitives. 
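With the settings used in dump_allreduce.py above (HCCL_EXEC_TIMEOUT=160), checkExec() arms the
dump signal after max(60, 160 - 60) = 100 seconds of elapsed work time, i.e. roughly one minute
before the executor itself times out. The file then written through DebugInfoWriter carries the
serialized flight-recorder payload; the sketch below is a rough post-mortem helper, assuming the
payload stays pickle-readable as in upstream PyTorch's flight recorder and that the path follows
TORCH_HCCL_DEBUG_INFO_TEMP_FILE plus the rank (both are assumptions, not part of this patch):

    import pickle

    def load_hccl_trace(path="./hccl_trace_rank_0"):
        # top-level keys visible in TraceUtils.h include "version", "pg_config"
        # and, with this change, "pg_status"
        with open(path, "rb") as f:
            return pickle.load(f)

    if __name__ == "__main__":
        trace = load_hccl_trace()
        for pg_id, status in trace.get("pg_status", {}).items():
            print(pg_id,
                  status["last_enqueued_collective"],
                  status["last_started_collective"],
                  status["last_completed_collective"])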
diff --git a/torch_npu/csrc/distributed/TraceUtils.h b/torch_npu/csrc/distributed/TraceUtils.h index 5592ac2ca5..ee7c20cdc7 100644 --- a/torch_npu/csrc/distributed/TraceUtils.h +++ b/torch_npu/csrc/distributed/TraceUtils.h @@ -11,6 +11,7 @@ #include "torch_npu/csrc/core/npu/NPUEvent.h" #include "torch_npu/csrc/distributed/HCCLUtils.hpp" +#include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp" #include #include @@ -28,6 +29,7 @@ namespace c10d_npu { // (minor when adding fields, major when changing existing fields) static c10::IValue version_val = "2.1"; static c10::IValue pg_config_key = "pg_config"; + static c10::IValue pg_status_key = "pg_status"; static c10::IValue record_id_key = "record_id"; static c10::IValue pg_id_key = "pg_id"; static c10::IValue pg_name_key = "process_group"; @@ -42,7 +44,7 @@ namespace c10d_npu { static c10::IValue output_dtypes_key = "output_dtypes"; static c10::IValue time_created_key = "time_created_ns"; static c10::IValue duration_key = "duration_ms"; - + static c10::IValue timeout_key = "timeout_ms"; static c10::IValue frames_key = "frames"; static c10::IValue state_key = "state"; static c10::IValue line_key = "line"; @@ -205,52 +207,6 @@ namespace c10d_npu { return hcclStartEvent.elapsed_time(hcclEndEvent); } - DebugInfoWriter::~DebugInfoWriter() = default; - - void DebugInfoWriter::write(const std::string &hcclTrace) - { - // Open a file for writing. The ios::binary flag is used to write data as - // binary. - std::ofstream file(filename_, std::ios::binary); - - // Check if the file was opened successfully. - if (!file.is_open()) { - LOG(ERROR) << "Error opening file for writing HCCLPG debug info: " - << filename_; - return; - } - - file.write(hcclTrace.data(), hcclTrace.size()); - LOG(INFO) << "Finished writing HCCLPG debug info to " << filename_; - } - - DebugInfoWriter &DebugInfoWriter::getWriter(int rank) - { - if (writer_ == nullptr) { - std::string fileNamePrefix = c10d::getCvarString( - {"TORCH_HCCL_DEBUG_INFO_TEMP_FILE"}, "/tmp/hccl_trace_rank_"); - // Using std::unique_ptr here to auto-delete the writer object - // when the pointer itself is destroyed. - std::unique_ptr writerPtr( - new DebugInfoWriter(fileNamePrefix, rank)); - DebugInfoWriter::registerWriter(std::move(writerPtr)); - } - return *writer_; - } - - void DebugInfoWriter::registerWriter(std::unique_ptr writer) - { - TORCH_CHECK_WITH( - DistBackendError, - hasWriterRegistered_.load() == false, - "debugInfoWriter already registered"); - hasWriterRegistered_.store(true); - writer_ = std::move(writer); - } - - std::unique_ptr DebugInfoWriter::writer_ = nullptr; - std::atomic DebugInfoWriter::hasWriterRegistered_(false); - inline std::string pickle_str(const c10::IValue &v) { std::vector result; @@ -358,6 +314,9 @@ namespace c10d_npu { // was 'enqueued'- not necessarily started c10::time_t time_created_; + // configured timeout for this entry + c10::time_t timeout_ms_; + // Is this a P2P event? bool isP2P_; @@ -391,6 +350,7 @@ namespace c10d_npu { size_t max_entries_ = 0; size_t next_ = 0; size_t id_ = 0; + std::map> all_pg_status_ = {}; std::map, std::vector> pg_name_to_ranks_ = {}; @@ -405,11 +365,17 @@ namespace c10d_npu { const std::vector &outputs, Event *start, Event *end, + std::chrono::milliseconds timeout_ms, + std::shared_ptr pg_status, bool isP2P) { if (!enabled_) { return c10::nullopt; } + if (all_pg_status_.find(pg_id) == all_pg_status_.end()) { + // Current pg_status is not in FR. 
+ all_pg_status_[pg_id] = std::move(pg_status); + } auto traceback = torch::CapturedTraceback::gather(true, true, capture_cpp_stack_); std::lock_guard guard(mutex_); @@ -426,6 +392,7 @@ namespace c10d_npu { std::move(start), std::move(end), c10::getTime(), + timeout_ms.count(), isP2P}; for (const auto &input : inputs) { @@ -654,6 +621,7 @@ namespace c10d_npu { ? int64_t(*e.time_discovered_completed_) : c10::IValue()); dict.insert(retired_key, e.retired_); + dict.insert(timeout_key, e.timeout_ms_); dict.insert(is_p2p_key, e.isP2P_); entries.push_back(dict); @@ -675,6 +643,19 @@ namespace c10d_npu { return pg_config; } + const c10::Dict getPgStatus() + { + auto all_pg_status = new_dict(); + for (const auto& [pg_id, status] : all_pg_status_) { + auto pg_status = new_dict(); + pg_status.insert("last_enqueued_collective", status->lastEnqueuedSeq); + pg_status.insert("last_started_collective", status->lastStartedSeq); + pg_status.insert("last_completed_collective", status->lastCompletedSeq); + all_pg_status.insert(std::to_string(pg_id), pg_status); + } + return all_pg_status; + } + // dump all collectives + hcclDumpMap std::string dump( const c10::optional Date: Wed, 4 Jun 2025 09:18:14 +0000 Subject: [PATCH 029/328] =?UTF-8?q?!21496=20[feature-v2.7.1]add=20profiler?= =?UTF-8?q?=20ut=20Merge=20pull=20request=20!21496=20from=20=E8=99=9E?= =?UTF-8?q?=E8=89=AF=E6=96=8C/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../prof_view/test_memory_timeline_parser.py | 184 ++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 test/profiler/analysis/prof_view/test_memory_timeline_parser.py diff --git a/test/profiler/analysis/prof_view/test_memory_timeline_parser.py b/test/profiler/analysis/prof_view/test_memory_timeline_parser.py new file mode 100644 index 0000000000..9732ed01a2 --- /dev/null +++ b/test/profiler/analysis/prof_view/test_memory_timeline_parser.py @@ -0,0 +1,184 @@ +import os +import random +import unittest +from unittest.mock import patch, MagicMock + +from torch_npu.profiler.analysis.prof_common_func._constant import Constant +from torch_npu.profiler.analysis.prof_parse._event_tree_parser import (_ProfilerEvent, + _DeviceType, _EventType) +from torch_npu.profiler.analysis.prof_view._memory_timeline_parser import ( + MemoryProfile, MemoryProfileTimeline, Storage, + DeviceKey, TensorKey, Category, Action, _CATEGORY_TO_COLORS +) + + +class TestMemoryProfile(unittest.TestCase): + def setUp(self): + self.memory_profile = MagicMock() + self.memory_profile._root_nodes = [] + self.memory_profile._categories = {} + + @patch("torch_npu.profiler.analysis.prof_view._memory_timeline_parser.EventTree") + def test_init_success(self, mock_event_tree): + mock_event_tree.return_value.sorted_events = [] + mock_event_tree.return_value.get_root_nodes.return_value = [] + mp = MemoryProfile("valid.prof") + self.assertIsNotNone(mp) + + def test_memory_history(self): + mock_event = MagicMock(spec=_ProfilerEvent) + mock_event.tag = _EventType.Allocation + mock_event.extra_fields = MagicMock() + mock_event.extra_fields.device_type = _DeviceType.CUDA + mock_event.extra_fields.device_index = 0 + mock_event.extra_fields.total_active = 100 + mock_event.extra_fields.total_allocated = 200 + mock_event.extra_fields.total_reserved = 300 + mock_event.children = [] + self.memory_profile._root_nodes = [mock_event] + self.memory_profile.memory_history = [(DeviceKey(_DeviceType.NPU, 0), 100, 200, 300)] + result = self.memory_profile.memory_history + expected 
= [(DeviceKey(_DeviceType.NPU, 0), 100, 200, 300)] + self.assertEqual(result, expected) + + def test_is_gradient(self): + mock_categories = MagicMock() + mock_categories.get.return_value = Category.GRADIENT + self.memory_profile._categories = mock_categories + self.assertTrue(self.memory_profile._is_gradient(TensorKey(1, 0, 1, "storage"), 0)) + + def test_set_gradients_and_temporaries(self): + mock_event = MagicMock(spec=_ProfilerEvent) + mock_event.tag = _EventType.PyCall + + mock_event.extra_fields = MagicMock() + mock_event.extra_fields.grads = [TensorKey(1, 0, 1, "storage")] + + self.assertEqual(mock_event.extra_fields.grads[0].id, 1) + self.assertEqual(mock_event.extra_fields.grads[0].storage, "storage") + + def test_set_optimizer_state(self): + mock_event = MagicMock(spec=_ProfilerEvent) + mock_event.tag = _EventType.PyCall + + mock_event.extra_fields = MagicMock() + mock_event.extra_fields.optimizer_parameters = [MagicMock()] + + random_data = [random.random() for _ in range(2)] + mock_event.extra_fields.optimizer_parameters[0].state = {"weight": random_data} + + self.memory_profile._root_nodes = [mock_event] + + with patch("torch_npu.profiler.analysis.prof_view._memory_timeline_parser.TensorKey.from_tensor", + return_value=TensorKey(1, 0, 1, "storage")): + self.memory_profile._set_optimizer_state() + self.assertEqual(self.memory_profile._categories.get(TensorKey(1, 0, 1, "storage"), 0), 0) + + +class TestMemoryProfileTimeline(unittest.TestCase): + + def setUp(self): + self.memory_profile = MagicMock() + self.mpt = MemoryProfileTimeline(self.memory_profile) + + def test_parse_device_cpu(self): + result = self.mpt._parse_device_info("cpu") + self.assertIsInstance(result, DeviceKey) + self.assertEqual(result.device_type, 0) + self.assertEqual(result.device_index, -1) + + def test_parse_device_npu(self): + result = self.mpt._parse_device_info("npu:0") + self.assertIsInstance(result, DeviceKey) + self.assertEqual(result.device_index, 0) + + def test_construct_timeline_empty(self): + self.memory_profile.timeline = [] + timestamps, sizes = self.mpt._construct_timeline("cpu") + self.assertEqual(len(timestamps), 0) + self.assertEqual(len(sizes), 0) + + def test_construct_timeline_filter_device(self): + key1 = TensorKey(0, 0, 0, Storage(0, 1)) + key2 = TensorKey(1, 1, 1, Storage(0, 1)) + self.memory_profile.timeline = [ + (1000000, Action.CREATE, (key1, 0), 1024), + (2000000, Action.CREATE, (key2, 0), 2048), + ] + timestamps, sizes = self.mpt._construct_timeline("cpu") + self.assertEqual(len(timestamps), 0) + + @patch('torch_npu.profiler.analysis.prof_common_func._file_manager.FileManager.create_json_file_by_path') + def test_export_json(self, mock_write): + self.memory_profile.timeline = [(1000000, Action.CREATE, (TensorKey(0, 0, 0, Storage(0, 1)), 0), 1024)] + self.mpt._construct_timeline = MagicMock(return_value=([1000], [[0, 1024]])) + self.mpt.export_memory_timeline_json("output.json", "cpu") + expected_path = os.path.abspath("output.json") + mock_write.assert_called_once_with(expected_path, [[1000], [[0, 1024]]]) + + +class TestMemoryTimelineParser(unittest.TestCase): + + @patch('torch_npu.profiler.analysis.prof_view._memory_timeline_parser.MemoryProfile') + @patch('torch_npu.profiler.analysis.prof_view._memory_timeline_parser.MemoryProfileTimeline') + def test_run_method(self, mock_timeline_class, mock_profile_class): + parser = mock_timeline_class() + parser._device = "npu" + parser.logger = MagicMock() + mock_profile_instance = mock_profile_class.return_value + 
mock_profile_instance.some_method_we_use.return_value = "mocked profile data" + mock_timeline_instance = mock_timeline_class.return_value + mock_timeline_instance.export_memory_timeline_html.return_value = None + parser.run.return_value = [Constant.SUCCESS] + result = parser.run(deps_data={}) + self.assertEqual(result[0], Constant.SUCCESS) + + @patch('torch_npu.profiler.analysis.prof_view._memory_timeline_parser.MemoryProfile') + @patch('torch_npu.profiler.analysis.prof_view._memory_timeline_parser.MemoryProfileTimeline') + def test_run_with_exception(self, mock_timeline_class, mock_profile_class): + parser = mock_timeline_class() + parser._device = "npu" + parser.logger = MagicMock() + mock_profile_class.side_effect = Exception("Mocked Initialization Error") + parser.run.return_value = [Constant.FAIL] + result = parser.run(deps_data={}) + self.assertEqual(result[0], Constant.FAIL) + + +class TestEdgeCases(unittest.TestCase): + + def test_category_handling(self): + mock_mem_profile = MagicMock() + mock_mem_profile.timeline = [] + mock_mem_profile.memory_history = [] + mock_mem_profile._categories = MagicMock() + + test_cases = [ + (Category.INPUT, "black"), + (Category.PARAMETER, "darkgreen"), + (None, "grey") + ] + + for category, expected_color in test_cases: + mock_mem_profile._categories.get.return_value = category + timeline = MemoryProfileTimeline(mock_mem_profile) + + idx = timeline._get_category_index(MagicMock(), 0) + self.assertEqual(_CATEGORY_TO_COLORS[category], expected_color) + + +def run_tests(): + loader = unittest.TestLoader() + + suite = unittest.TestSuite() + suite.addTests(loader.loadTestsFromTestCase(TestMemoryProfile)) + suite.addTests(loader.loadTestsFromTestCase(TestMemoryProfileTimeline)) + suite.addTests(loader.loadTestsFromTestCase(TestMemoryTimelineParser)) + suite.addTests(loader.loadTestsFromTestCase(TestEdgeCases)) + + runner = unittest.TextTestRunner(verbosity=2) + runner.run(suite) + + +if __name__ == "__main__": + run_tests() -- Gitee From 3257f18e73fd2f2d0a8aec9a3db6e4faad0d76f4 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Wed, 4 Jun 2025 11:52:17 +0000 Subject: [PATCH 030/328] !21546 update version Merge pull request !21546 from huangyunlong/2.7ft4 --- torch_npu/csrc/distributed/TraceUtils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/distributed/TraceUtils.h b/torch_npu/csrc/distributed/TraceUtils.h index ee7c20cdc7..e5e4ed7957 100644 --- a/torch_npu/csrc/distributed/TraceUtils.h +++ b/torch_npu/csrc/distributed/TraceUtils.h @@ -27,7 +27,7 @@ namespace c10d_npu { static c10::IValue version_key = "version"; // Update whenever changing contents or formatting of the dump // (minor when adding fields, major when changing existing fields) - static c10::IValue version_val = "2.1"; + static c10::IValue version_val = "2.4"; static c10::IValue pg_config_key = "pg_config"; static c10::IValue pg_status_key = "pg_status"; static c10::IValue record_id_key = "record_id"; -- Gitee From f82b5de1b7df0792259eb2961f15a9aa13e54c3f Mon Sep 17 00:00:00 2001 From: shaoyf Date: Wed, 4 Jun 2025 11:53:13 +0000 Subject: [PATCH 031/328] !21541 Modify the operator in the test_not_supported_ops Merge pull request !21541 from shaoyf/271_fix_op --- test/npu/test_fault_mode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/npu/test_fault_mode.py b/test/npu/test_fault_mode.py index a50c443438..88bc8cca19 100644 --- a/test/npu/test_fault_mode.py +++ b/test/npu/test_fault_mode.py @@ -102,7 +102,7 @@ class 
TestMode(TestCase): torch.Generator(device="cuda") def test_not_supported_ops(self): - command = ['python', '-c', 'import torch; import torch_npu; torch.rand(1, 3, 3).npu().logit()'] + command = ['python', '-c', 'import torch; import torch_npu; t = torch.rand(1, 3, 3).npu();t.fmax(t)'] process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) message = process.stderr.read() process.stderr.close() @@ -110,7 +110,7 @@ class TestMode(TestCase): process.terminate() process.wait() self.assertIn( - "CAUTION: The operator 'aten::logit' is not currently supported on the NPU backend and will fall back " + "CAUTION: The operator 'aten::fmax.out' is not currently supported on the NPU backend and will fall back " "to run on the CPU. This may have performance implications. (function npu_cpu_fallback)", message ) -- Gitee From 69221dde7c6541f60e2804c0f329e421712f2c4f Mon Sep 17 00:00:00 2001 From: wangzixuan <617225691@qq.com> Date: Wed, 4 Jun 2025 14:04:14 +0000 Subject: [PATCH 032/328] !21533 profiler support collecting multi-device Merge pull request !21533 from wangzixuan/v2.7.1 --- .../framework/interface/MsProfilerInterface.cpp | 14 ++++++++++++++ .../csrc/framework/interface/MsProfilerInterface.h | 2 ++ torch_npu/csrc/profiler/profiler_mgr.cpp | 8 ++++++++ 3 files changed, 24 insertions(+) diff --git a/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp b/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp index 8d7e6f179d..f2dcf69944 100644 --- a/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp +++ b/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp @@ -19,8 +19,22 @@ LOAD_FUNCTION(aclprofWarmup) LOAD_FUNCTION(aclprofSetConfig) LOAD_FUNCTION(aclprofGetSupportedFeatures) LOAD_FUNCTION(aclprofGetSupportedFeaturesV2) +LOAD_FUNCTION(aclprofRegisterDeviceCallback) LOAD_FUNCTION(aclprofMarkEx) +aclError AclProfilingRegisterDeviceCallback() +{ + typedef aclError (*AclProfRegisterDeviceCallbackFunc)(); + static AclProfRegisterDeviceCallbackFunc func = nullptr; + if (func == nullptr) { + func = (AclProfRegisterDeviceCallbackFunc)GET_FUNC(aclprofRegisterDeviceCallback); + if (func == nullptr) { + return ACL_ERROR_PROF_MODULES_UNSUPPORTED; + } + } + return func(); +} + aclError AclProfilingWarmup(const aclprofConfig *profilerConfig) { typedef aclError (*AclProfWarmupFunc)(const aclprofConfig *); diff --git a/torch_npu/csrc/framework/interface/MsProfilerInterface.h b/torch_npu/csrc/framework/interface/MsProfilerInterface.h index b06ca001e6..d049a05593 100644 --- a/torch_npu/csrc/framework/interface/MsProfilerInterface.h +++ b/torch_npu/csrc/framework/interface/MsProfilerInterface.h @@ -7,6 +7,8 @@ namespace at_npu { namespace native { +aclError AclProfilingRegisterDeviceCallback(); + aclError AclProfilingWarmup(const aclprofConfig *profilerConfig); aclError AclprofSetConfig(aclprofConfigType configType, const char* config, size_t configLength); diff --git a/torch_npu/csrc/profiler/profiler_mgr.cpp b/torch_npu/csrc/profiler/profiler_mgr.cpp index 21c6547209..eae7c9c5af 100644 --- a/torch_npu/csrc/profiler/profiler_mgr.cpp +++ b/torch_npu/csrc/profiler/profiler_mgr.cpp @@ -106,6 +106,14 @@ void ProfilerMgr::EnableMsProfiler(uint32_t *deviceIdList, uint32_t deviceNum, a ASCEND_LOGE("Profiling start failed."); return; } + + ASCEND_LOGI("Try to register set device callback function."); + ret = at_npu::native::AclProfilingRegisterDeviceCallback(); + if (ret == ACL_ERROR_PROF_MODULES_UNSUPPORTED) { + ASCEND_LOGW("Not support set device callback 
function."); + } else if (ret != ACL_SUCCESS) { + ASCEND_LOGE("Failed to register set device callback function."); + } } uint64_t ProfilerMgr::PrepareProfilerConfig(const NpuTraceConfig &npu_config) -- Gitee From 068a56aa647980b8ab4e3a5a89c542794bc1da14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=9C=E9=87=91=E8=88=AA?= <15990042527@163.com> Date: Wed, 4 Jun 2025 14:04:40 +0000 Subject: [PATCH 033/328] =?UTF-8?q?!21554=20Add=20a=20c++=20stack=20to=20a?= =?UTF-8?q?rm=20Merge=20pull=20request=20!21554=20from=20=E6=9D=9C?= =?UTF-8?q?=E9=87=91=E8=88=AA/v2.3.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/InitNpuBindings.cpp | 2 + torch_npu/csrc/npu/Module.cpp | 18 +- torch_npu/csrc/npu/memory_snapshot.cpp | 9 + torch_npu/csrc/profiler/CMakeLists.txt | 2 +- .../csrc/profiler/combined_traceback.cpp | 179 ++++++++++++++++++ torch_npu/csrc/profiler/combined_traceback.h | 59 ++++++ .../profiler/python/combined_traceback.cpp | 170 +++++++++++++++++ .../csrc/profiler/python/combined_traceback.h | 22 +++ torch_npu/csrc/profiler/unwind/unwind.cpp | 95 ++++++++++ torch_npu/csrc/profiler/unwind/unwind.h | 36 ++++ torch_npu/npu/memory.py | 4 - 11 files changed, 588 insertions(+), 8 deletions(-) create mode 100644 torch_npu/csrc/profiler/combined_traceback.cpp create mode 100644 torch_npu/csrc/profiler/combined_traceback.h create mode 100644 torch_npu/csrc/profiler/python/combined_traceback.cpp create mode 100644 torch_npu/csrc/profiler/python/combined_traceback.h create mode 100644 torch_npu/csrc/profiler/unwind/unwind.cpp create mode 100644 torch_npu/csrc/profiler/unwind/unwind.h diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp index 60ba472791..c8084af923 100644 --- a/torch_npu/csrc/InitNpuBindings.cpp +++ b/torch_npu/csrc/InitNpuBindings.cpp @@ -20,6 +20,7 @@ #include "torch_npu/csrc/utils/TensorType.h" #include "torch_npu/csrc/utils/AutocastMode.h" #include "torch_npu/csrc/core/npu/NPURecovery.h" +#include "torch_npu/csrc/profiler/python/combined_traceback.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/sanitizer/NPUTrace.h" #endif @@ -195,6 +196,7 @@ PyObject* initModule() #endif initCommMethods(); torch::installCapturedTracebackPython(); + torch_npu::installCapturedTracebackPython(); torch_npu::profiler::initMstx(module); return module; } diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 88cfe2e034..b0c77e9268 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -47,6 +47,8 @@ #include "torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h" #include "op_plugin/utils/custom_functions/opapi/FFTCommonOpApi.h" #include "torch_npu/csrc/aten/common/from_blob.h" +#include "torch_npu/csrc/profiler/combined_traceback.h" +#include "torch_npu/csrc/profiler/python/combined_traceback.h" struct NPUDeviceProp { std::string name; @@ -969,9 +971,15 @@ PyObject* THNPModule_resetPeakMemoryStats(PyObject *_unused, PyObject *arg) Py_RETURN_NONE; } -torch::CapturedTraceback* getFromContext(const std::shared_ptr& x) +#if defined(__x86_64__) + using CapturedTraceback = torch::CapturedTraceback; +#elif defined(__aarch64__) + using CapturedTraceback = torch_npu::CapturedTraceback; +#endif + +CapturedTraceback* getFromContext(const std::shared_ptr& x) { - if (torch::CapturedTraceback* sc = dynamic_cast(x.get())) { + if (CapturedTraceback* sc = dynamic_cast(x.get())) { return sc; } TORCH_CHECK(false, "attempting to gather stack context from the wrong StackContext 
type.", OPS_ERROR(ErrCode::NOT_FOUND)); @@ -1006,7 +1014,7 @@ PyObject* THNPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) py::str frames_s = "frames"; py::list empty_frames; - std::vector to_gather_frames; + std::vector to_gather_frames; std::vector to_gather_dest; auto add_frame_key = [&](const py::dict& d, const std::shared_ptr& ctx) { @@ -1137,7 +1145,11 @@ PyObject* THNPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) result["segments"] = segments; result["device_traces"] = traces; +#if defined(__x86_64__) auto frames = torch::py_symbolize(to_gather_frames); +#else + auto frames = torch_npu::py_symbolize(to_gather_frames); +#endif for (auto i : c10::irange(frames.size())) { to_gather_dest.at(i)[frames_s] = frames.at(i); } diff --git a/torch_npu/csrc/npu/memory_snapshot.cpp b/torch_npu/csrc/npu/memory_snapshot.cpp index 6cca3eaca6..47fbf4de6c 100644 --- a/torch_npu/csrc/npu/memory_snapshot.cpp +++ b/torch_npu/csrc/npu/memory_snapshot.cpp @@ -5,6 +5,7 @@ #include "torch_npu/csrc/utils/LazyInit.h" #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h" +#include "torch_npu/csrc/profiler/combined_traceback.h" #include "torch_npu/csrc/npu/memory_snapshot.h" using torch::jit::Pickler; @@ -20,7 +21,11 @@ std::shared_ptr gather() std::shared_ptr gather_with_cpp() { +#if defined(__x86_64__) return torch::CapturedTraceback::gather(true, true, true); +#else + return torch_npu::CapturedTraceback::gather(true, true, true); +#endif } static void checkOptionIn(const std::string& option, @@ -51,7 +56,11 @@ void _record_memory_history(c10::optional enabled, if (enabled && stacks == "all") { recorder = gather_with_cpp; // warm up C++ stack unwinding +#if defined(__x86_64__) torch::unwind::unwind(); +#else + torch_npu::unwind::unwind(); +#endif } max_entries = (enabled && *enabled == "all") ? 
max_entries : 1; auto when = c10_npu::NPUCachingAllocator::RecordContext::NEVER; diff --git a/torch_npu/csrc/profiler/CMakeLists.txt b/torch_npu/csrc/profiler/CMakeLists.txt index 9a39c2d4d8..81bbd22b1e 100644 --- a/torch_npu/csrc/profiler/CMakeLists.txt +++ b/torch_npu/csrc/profiler/CMakeLists.txt @@ -1,4 +1,4 @@ -FILE(GLOB _PROF_SRCS *.cpp) +FILE(GLOB _PROF_SRCS *.cpp unwind/*.cpp python/*.cpp) LIST(APPEND PROF_SRCS ${_PROF_SRCS}) diff --git a/torch_npu/csrc/profiler/combined_traceback.cpp b/torch_npu/csrc/profiler/combined_traceback.cpp new file mode 100644 index 0000000000..27fde26a7e --- /dev/null +++ b/torch_npu/csrc/profiler/combined_traceback.cpp @@ -0,0 +1,179 @@ +#include "combined_traceback.h" + +namespace torch_npu { + +static std::atomic python_support_ = nullptr; + +std::shared_ptr CapturedTraceback::gather(bool python, bool script, bool cpp) +{ + auto r = std::make_shared(); + if (python) { + auto p = python_support_.load(); + while (p && r->frames_.empty()) { + r->frames_ = p->gather(); + r->python_ = p; + p = p->next_; + } + } + if (script) { + r->script_frames_ = torch::jit::currentCallstack(); + } + if (cpp) { + r->cpp_frames_ = unwind::unwind(); + } + return r; +} + +int CapturedTraceback::traversePython(visitproc visit, void* arg) +{ + TORCH_INTERNAL_ASSERT(python_); + return python_->traverse(frames_, visit, arg); +} + +int CapturedTraceback::clearPython() +{ + TORCH_INTERNAL_ASSERT(python_); + return python_->clear(frames_); +} + +CapturedTraceback::~CapturedTraceback() +{ + if (!frames_.empty()) { + TORCH_INTERNAL_ASSERT(python_); + python_->release(frames_); + } +} + +struct PyFrameHash { + std::size_t operator()(const CapturedTraceback::PyFrame& f) const + { + return std::hash()(f.code) ^ std::hash()(f.lasti); + } +}; + +struct PyFrameEq { + std::size_t operator()(const CapturedTraceback::PyFrame& lhs, const CapturedTraceback::PyFrame& rhs) const + { + return lhs.code == rhs.code && lhs.lasti == rhs.lasti; + } +}; + +SymbolizedTracebacks symbolize(const std::vector& to_symbolize) +{ + SymbolizedTracebacks r; + + std::unordered_map ip_to_frame_offset; + std::unordered_map py_to_frame_offset; + std::vector all_cpp_ips; + + // dedup and collect any C++ frames that need symbols for + for (const auto& e : to_symbolize) { + for (void* f : e->cpp_frames_) { + if (!ip_to_frame_offset.count(f)) { + ip_to_frame_offset[f] = all_cpp_ips.size(); + all_cpp_ips.push_back(f); + } + } + } + // gather symbol names for C++ frames + if (!all_cpp_ips.empty()) { + r.all_frames = unwind::symbolize(all_cpp_ips); + } + + // batch symbolization requests so we dedup frame objects + // however, we might have to request from different python interpreters + // make sure we flush requests before switching interpreters; + CapturedTraceback::Python* cur_python = nullptr; + std::vector cur_py_frames; + size_t py_frames_size_ = 0; + + for (const auto& e : to_symbolize) { + if (e->python_) { + if (cur_python != e->python_ && !cur_py_frames.empty()) { + // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) + cur_python->appendSymbolized(cur_py_frames, r); + cur_py_frames.clear(); + } + cur_python = e->python_; + for (const auto& f : e->frames_) { + if (!py_to_frame_offset.count(f)) { + py_to_frame_offset[f] = py_frames_size_++; + cur_py_frames.push_back(f); + } + } + } + } + if (!cur_py_frames.empty()) { + // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) + cur_python->appendSymbolized(cur_py_frames, r); + cur_py_frames.clear(); + } + std::vector > python_frame_fragments = 
std::move(r.tracebacks); + r.tracebacks = {}; + + for (const auto& sc : to_symbolize) { + r.tracebacks.emplace_back(); + auto py_it = sc->frames_.begin(); + auto py_end = sc->frames_.end(); + + bool jit_appended = false; + + auto append_python = [&](const CapturedTraceback::PyFrame& f) { + const auto& fragment = python_frame_fragments.at(py_to_frame_offset.at(f)); + r.tracebacks.back().insert(r.tracebacks.back().end(), fragment.begin(), fragment.end()); + }; + + auto append_jit = [&]() { + if (jit_appended) { + return; + } + jit_appended = true; + for (const auto& f : sc->script_frames_) { + torch::unwind::Frame frame; + frame.funcname = f.filename; // sic: torchscript puts funcname in filename field + auto flc = f.range.file_line_col(); + if (flc) { + size_t col = 0; + std::tie(frame.filename, frame.lineno, col) = *flc; + } else { + frame.filename = "??"; + frame.lineno = 0; + } + r.tracebacks.back().push_back(r.all_frames.size()); + r.all_frames.emplace_back(std::move(frame)); + } + }; + + for (void* f : sc->cpp_frames_) { + uint64_t cpp_frame = ip_to_frame_offset.at(f); + const torch::unwind::Frame& uf = r.all_frames.at(cpp_frame); + if (uf.funcname.find("PyEval_EvalFrame") != std::string::npos) { + if (py_it != py_end) { + append_python(*py_it++); + } + } else if (uf.funcname.rfind("torch::jit::InterpreterStateImpl::run", 0) != std::string::npos) { + append_jit(); + } + r.tracebacks.back().push_back(cpp_frame); + } + + // add frames if we otherwise haven't seen the C++ frame indicating where + // it should go + append_jit(); + + for (; py_it != py_end; ++py_it) { + append_python(*py_it); + } + } + return r; +} + +void CapturedTraceback::addPythonUnwinder(CapturedTraceback::Python* p) +{ + CapturedTraceback::Python* old_unwinder = python_support_.load(); + do { + p->next_ = old_unwinder; + } while (!python_support_.compare_exchange_strong(old_unwinder, p)); +} + +} // namespace torch_npu diff --git a/torch_npu/csrc/profiler/combined_traceback.h b/torch_npu/csrc/profiler/combined_traceback.h new file mode 100644 index 0000000000..90e799a5b7 --- /dev/null +++ b/torch_npu/csrc/profiler/combined_traceback.h @@ -0,0 +1,59 @@ +#pragma once +#include +#include + +#include "torch_npu/csrc/core/npu/NPUMacros.h" +#include "unwind/unwind.h" + +using torch::SymbolizedTracebacks; + +namespace torch_npu { + +struct TORCH_NPU_API CapturedTraceback : public c10::GatheredContext { + struct PyFrame { + void* code; // PyCodeObject*, but python headers not present + int lasti; + }; + + static std::shared_ptr gather(bool python, bool script, bool cpp); + CapturedTraceback() = default; + CapturedTraceback(const CapturedTraceback&) = delete; + CapturedTraceback& operator=(const CapturedTraceback&) = delete; + CapturedTraceback(CapturedTraceback&&) noexcept = default; + CapturedTraceback& operator=(CapturedTraceback&&) noexcept = delete; + ~CapturedTraceback() override; + + using visitproc = int (*)(void* self, void* arg); + + struct Python { + virtual std::vector gather() = 0; + virtual void release(std::vector& frames) = 0; + virtual void appendSymbolized(const std::vector& to_symbolize, SymbolizedTracebacks& st) = 0; + // tp_traverse/tp_clear implementations + virtual int traverse(std::vector& frames, visitproc visit, void* arg) = 0; + virtual int clear(std::vector& frames) = 0; + virtual ~Python() = default; + Python* next_ = nullptr; + }; + // called once by each python interpreter to + // register python stack recording functionality + // p cannot be deleted once added. 
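    // For reference, the registration performed by the Python bindings added alongside
    // this header (python/combined_traceback.cpp) reduces to:
    //
    //   torch_npu::CapturedTraceback::addPythonUnwinder(new PythonTraceback());
    //
    // after which a mixed Python/TorchScript/C++ stack can be captured and resolved,
    // e.g. (a minimal sketch, assuming the caller links against this component):
    //
    //   auto tb = torch_npu::CapturedTraceback::gather(/*python=*/true, /*script=*/true, /*cpp=*/true);
    //   torch::SymbolizedTracebacks sym = torch_npu::symbolize({tb.get()});
    //   const torch::unwind::Frame& top = sym.all_frames.at(sym.tracebacks.front().front());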
+ static void addPythonUnwinder(Python* p); + + int traversePython(visitproc visit, void* arg); + int clearPython(); + +private: + std::vector frames_; + std::vector cpp_frames_; + std::vector script_frames_; + friend TORCH_NPU_API SymbolizedTracebacks symbolize(const std::vector& to_symbolize); + + // non-owning reference to one of the immortal Python* objects + // registered above. + Python* python_ = nullptr; +}; + +TORCH_NPU_API SymbolizedTracebacks symbolize(const std::vector& to_symbolize); + +} // namespace torch_npu diff --git a/torch_npu/csrc/profiler/python/combined_traceback.cpp b/torch_npu/csrc/profiler/python/combined_traceback.cpp new file mode 100644 index 0000000000..d209a21621 --- /dev/null +++ b/torch_npu/csrc/profiler/python/combined_traceback.cpp @@ -0,0 +1,170 @@ +#include +#include +#include + +#include "combined_traceback.h" + +namespace py = pybind11; + +namespace torch_npu { +// Locking: +// We need to free PyCodeObjects when ~StackContext runs, but +// CUDACachingAllocator may hold its device lock when ~StackContext runs. + +// Because the thread calling the allocator _may_ hold the GIL, +// attempting to lock the GIL in ~StackContext can deadlock: +// T0: GIL Lock -> Call Allocator ->| Waiting Device Lock +// T1: Call Allocator -> Device Lock ->| Waiting GIL Lock +// Instead the destructor defers freeing stack frames by putting them in +// to_free_frames. We still need a lock to manage this vector, but +// we can ensure an overall lock ordering of GIL -> device_lock -> +// to_free_frames_mutex because ::gather is called outside of the device lock. + +namespace { +static std::mutex to_free_frames_mutex; +static std::vector to_free_frames; +struct PythonTraceback : public CapturedTraceback::Python { + std::vector gather() override + { + if (!Py_IsInitialized()) { + return {}; + } + std::vector frames; + py::gil_scoped_acquire acquire; + { + std::lock_guard lock(to_free_frames_mutex); + for (CapturedTraceback::PyFrame f : to_free_frames) { + Py_XDECREF(f.code); + } + to_free_frames.clear(); + } + PyFrameObject* f = PyEval_GetFrame(); + Py_XINCREF(f); + while (f) { + frames.emplace_back(CapturedTraceback::PyFrame{PyFrame_GetCode(f), PyFrame_GetLasti(f)}); + auto f_back = PyFrame_GetBack(f); + Py_XDECREF(f); + f = f_back; + } + return frames; + } + void release(std::vector& frames) override + { + std::lock_guard lock(to_free_frames_mutex); + to_free_frames.insert(to_free_frames.end(), frames.begin(), frames.end()); + } + using void_visitproc = int (*)(void* self, void* arg); + int traverse(std::vector& frames, void_visitproc visit, void* arg) override + { + for (auto& f : frames) { + Py_VISIT(f.code); + } + return 0; + } + int clear(std::vector& frames) override + { + for (auto& f : frames) { + Py_CLEAR(f.code); + } + return 0; + } + void appendSymbolized(const std::vector& to_symbolize, + SymbolizedTracebacks& result) override + { + py::gil_scoped_acquire acquire; + py::str line_s = "line"; + py::str name_s = "name"; + py::str filename_s = "filename"; + + auto torch = py::module::import("torch"); + py::object stack_frames_for_code; + if (py::hasattr(torch, "_inductor")) { + py::object inductor = torch.attr("_inductor"); + if (py::hasattr(inductor, "codecache")) { + stack_frames_for_code = inductor.attr("codecache").attr("PyCodeCache").attr("stack_frames_for_code"); + } + } + for (const auto& f : to_symbolize) { + auto f_code = (PyCodeObject*)f.code; + py::handle filename = f_code->co_filename; + py::handle funcname = f_code->co_name; + auto lineno = 
PyCode_Addr2Line(f_code, f.lasti); + result.tracebacks.emplace_back(); + result.tracebacks.back().push_back(result.all_frames.size()); + result.all_frames.emplace_back( + torch::unwind::Frame{py::cast(filename), py::cast(funcname), (uint64_t)lineno}); + // find all the additional frames associated with inductor generated + // code + if (stack_frames_for_code.ptr()) { + py::object extra = stack_frames_for_code(filename, lineno); + if (!extra.is_none()) { + for (py::handle h : extra) { + result.tracebacks.back().push_back(result.all_frames.size()); + result.all_frames.emplace_back(torch::unwind::Frame{py::cast(h[filename_s]), + py::cast(h[name_s]), + py::cast(h[line_s])}); + } + } + } + } + } +}; + +} // namespace + +std::vector py_symbolize(std::vector& to_symbolize) +{ + // we dedup repeated to_symbolize objects to prevent + // creating a bunch of duplicated frame objects + std::unordered_map cached_frames; + std::vector unique_frames; + for (const auto& sc : to_symbolize) { + auto it = cached_frames.find(sc); + if (it == cached_frames.end()) { + cached_frames.insert({sc, unique_frames.size()}); + unique_frames.push_back(sc); + } + } + auto s = symbolize(unique_frames); + + py::str line_s = "line"; + py::str name_s = "name"; + py::str filename_s = "filename"; + std::vector all_frames; + for (const auto& f : s.all_frames) { + py::dict d; + d[name_s] = f.funcname; + d[filename_s] = f.filename; + d[line_s] = f.lineno; + all_frames.emplace_back(std::move(d)); + } + + std::vector py_unique_frames; + for (const auto& t : s.tracebacks) { + py::list l; + for (const auto& e : t) { + l.append(all_frames.at(e)); + } + py_unique_frames.push_back(std::move(l)); + } + + std::vector result; + result.reserve(to_symbolize.size()); + for (const auto& sc : to_symbolize) { + result.push_back(py_unique_frames.at(cached_frames.at(sc))); + } + return result; +} + +void freeDeadCapturedTracebackFrames() +{ + std::lock_guard lock(to_free_frames_mutex); + for (CapturedTraceback::PyFrame f : to_free_frames) { + Py_XDECREF(f.code); + } + to_free_frames.clear(); +} + +void installCapturedTracebackPython() { CapturedTraceback::addPythonUnwinder(new PythonTraceback()); } + +} // namespace torch_npu diff --git a/torch_npu/csrc/profiler/python/combined_traceback.h b/torch_npu/csrc/profiler/python/combined_traceback.h new file mode 100644 index 0000000000..35809b7281 --- /dev/null +++ b/torch_npu/csrc/profiler/python/combined_traceback.h @@ -0,0 +1,22 @@ +#include +#include + +#include "torch_npu/csrc/core/npu/NPUMacros.h" +#include "torch_npu/csrc/profiler/combined_traceback.h" + +namespace torch_npu { + +// symbolize combined traceback objects, converting them into lists of +// dictionaries that are easily consumed in python. + +// returns std::vector because one use is to call it with a batch of +// tracebacks that come from a larger datastructure (e.g. a memory snapshot) +// and then have more c++ code to put those objects in the right place. 
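// A minimal sketch of the expected call pattern (the GIL must be held, since the
// results are pybind11 objects):
//
//   std::vector<torch_npu::CapturedTraceback*> tbs = ...;  // e.g. contexts gathered from a memory snapshot
//   std::vector<pybind11::object> frames = torch_npu::py_symbolize(tbs);
//   // frames[i] is a Python list of {"filename", "name", "line"} dicts describing tbs[i]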
+std::vector py_symbolize(std::vector& to_symbolize); + +// requires GIL to be held, frees any pending free frames +void freeDeadCapturedTracebackFrames(); + +TORCH_NPU_API void installCapturedTracebackPython(); + +} // namespace torch_npu diff --git a/torch_npu/csrc/profiler/unwind/unwind.cpp b/torch_npu/csrc/profiler/unwind/unwind.cpp new file mode 100644 index 0000000000..453adcf770 --- /dev/null +++ b/torch_npu/csrc/profiler/unwind/unwind.cpp @@ -0,0 +1,95 @@ +#include +#include +#include +#include + +#include +#include "unwind.h" + +#if !defined(__linux__) || !defined(__x86_64__) || !defined(__has_include) || \ + !__has_include("ext/stdio_filebuf.h") +namespace torch_npu::unwind { +std::vector unwind() +{ + const int size = 200; + void* buffer[size]; + int nptrs = backtrace(buffer, size); + return std::vector(buffer, buffer + nptrs); +} + +c10::optional > libraryFor(void* addr) +{ + TORCH_CHECK( + false, + "record_context_cpp is not support on non-linux non-x86_64 platforms"); +} + +std::vector symbolize(const std::vector& frames) +{ + std::vector results; + for (const auto& addr : frames) { + torch::unwind::Frame frame; + Dl_info info; + frame.lineno = 0; + if (dladdr(addr, &info)) { + frame.filename = info.dli_fname ? info.dli_fname : "??"; + size_t last_pos = frame.filename.find_last_of('/'); + if (last_pos != std::string::npos) { + frame.filename = frame.filename.substr(last_pos + 1); + } + char* demangled = abi::__cxa_demangle(info.dli_sname, nullptr, nullptr, nullptr); + if (demangled) { + frame.funcname = demangled; + free(demangled); + } else { + frame.funcname = info.dli_sname ? info.dli_sname : "??"; + } + } else { + frame.filename = "??"; + frame.funcname = "??"; + } + if ((frame.filename == "python" && frame.filename.find("PyEval_EvalFrame") == std::string::npos) || + (frame.filename.find("libc.so") != std::string::npos)) { + frame.funcname = "__libc_start_main"; + } + results.push_back(frame); + } + return results; +} + +Stats stats() +{ + TORCH_CHECK( + false, + "record_context_cpp is not support on non-linux non-x86_64 platforms"); +} + +} // namespace torch_npu::unwind + +#else + +#include +#include +#include +#include + +namespace torch_npu::unwind { +std::vector unwind() +{ + TORCH_CHECK(false, "For the linux x86 platform, this function should call the torch function"); +} + +c10::optional > libraryFor(void* addr) +{ + TORCH_CHECK(false, "For the linux x86 platform, this function should call the torch function"); +} + +std::vector symbolize(const std::vector& frames) +{ + TORCH_CHECK(false, "For the linux x86 platform, this function should call the torch function"); +} + +Stats stats() { TORCH_CHECK(false, "For the linux x86 platform, this function should call the torch function"); } + +} // namespace torch_npu::unwind +#endif diff --git a/torch_npu/csrc/profiler/unwind/unwind.h b/torch_npu/csrc/profiler/unwind/unwind.h new file mode 100644 index 0000000000..5385d020a3 --- /dev/null +++ b/torch_npu/csrc/profiler/unwind/unwind.h @@ -0,0 +1,36 @@ +#pragma once +#include +#include +#include +#include +#include + +#include "torch_npu/csrc/core/npu/NPUMacros.h" + +namespace torch_npu { +namespace unwind { +// gather current stack, relatively fast. +// gets faster once the cache of program counter locations is warm. +TORCH_NPU_API std::vector unwind(); + +// note: symbolize is really slow +// it will launch an addr2line process that has to parse dwarf +// information from the libraries that frames point into. 
+// Callers should first batch up all the unique void* pointers +// across a number of unwind states and make a single call to +// symbolize. +TORCH_NPU_API std::vector symbolize(const std::vector& frames); + +// returns path to the library, and the offset of the addr inside the library +TORCH_NPU_API c10::optional > libraryFor(void* addr); + +struct Stats { + size_t hits = 0; + size_t misses = 0; + size_t unsupported = 0; + size_t resets = 0; +}; +Stats stats(); + +} // namespace unwind +} // namespace torch_npu diff --git a/torch_npu/npu/memory.py b/torch_npu/npu/memory.py index 0de7e4a578..7447578782 100644 --- a/torch_npu/npu/memory.py +++ b/torch_npu/npu/memory.py @@ -761,10 +761,6 @@ def _record_memory_history_impl( max_entries: int = sys.maxsize, device=None, ): - if platform.machine() == "aarch64" and stacks == "all": - warnings.warn("Currently 'aarch64' does not support the display of c++ stacks, " \ - "changed to display only python.") - stacks = "python" torch_npu.npu._lazy_init() torch_npu._C._npu_record_memory_history(enabled, context, stacks, max_entries) -- Gitee From e3872abf3deb90e7de4ad04ee8520c390c51b6f1 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 4 Jun 2025 14:59:09 +0000 Subject: [PATCH 034/328] !21574 Update op_plugin commit id Merge pull request !21574 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index b76d9d7352..5c0092f59a 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit b76d9d735208c77e25651dd02e3bf8baa1bfe7d8 +Subproject commit 5c0092f59a630c4fc2c8a066cad1186466e1a13b -- Gitee From 3482d83fa261028c5a783eaa84ab377043360852 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Wed, 4 Jun 2025 22:28:31 +0000 Subject: [PATCH 035/328] !21580 Update torchair commit id Merge pull request !21580 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 25e45fe301..9d3a02f674 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 25e45fe301dd77ec481e34a7665c32e5755ad709 +Subproject commit 9d3a02f6743134c3e17814cee85770956cf41bda -- Gitee From 97a30bb40c3be8e304a677fd60aee5f5c08311d7 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 5 Jun 2025 03:14:21 +0000 Subject: [PATCH 036/328] !21591 Update op_plugin commit id Merge pull request !21591 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5c0092f59a..be8e6ed93e 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5c0092f59a630c4fc2c8a066cad1186466e1a13b +Subproject commit be8e6ed93ed44772b92e5eeaa27c9f59cc26e4ca -- Gitee From dec08c045c6ab57cd926d517f018abb8174c0cdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Thu, 5 Jun 2025 03:20:52 +0000 Subject: [PATCH 037/328] =?UTF-8?q?!21577=20support=20matmul=20silent=20ch?= =?UTF-8?q?eck=20and=20checksum=20Merge=20pull=20request=20!21577=20from?= =?UTF-8?q?=20=E7=8E=8B=E8=B6=85/v2.7.0=5Fcheckv3fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/torch_npu_schema.json | 3 + torch_npu/__init__.py | 3 +- torch_npu/asd/_silent_fault_data.py | 7 - torch_npu/asd/asd.py | 654 +++++++++++++++++- 
torch_npu/asd/checksum.py | 51 ++ .../csrc/core/npu/interface/OpInterface.cpp | 6 - .../csrc/core/npu/interface/OpInterface.h | 5 - .../csrc/distributed/ProcessGroupHCCL.cpp | 39 +- torch_npu/csrc/npu/Module.cpp | 15 +- torch_npu/utils/_step.py | 343 ++++----- 10 files changed, 846 insertions(+), 280 deletions(-) create mode 100644 torch_npu/asd/checksum.py diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index a04f2cd4f3..b4ae0dd083 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2651,6 +2651,9 @@ "torch_npu.erase_stream": { "signature": "(tensor, stream)" }, + "torch_npu.matmul_checksum": { + "signature": "(a, b, c)" + }, "torch_npu.utils.FlopsCounter": { "signature": "()" }, diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index 2138a3a084..de75030fe5 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -1,4 +1,4 @@ -__all__ = ["erase_stream"] +__all__ = ["erase_stream", "matmul_checksum"] import os import sys @@ -86,6 +86,7 @@ from torch_npu.utils.exposed_api import public_npu_functions from torch_npu.npu.utils import _erase_stream as erase_stream from torch_npu.utils._error_code import ErrCode, pta_error, _except_handler from torch_npu.asd.asd import _asd_patch +from torch_npu.asd.checksum import _matmul_checksum as matmul_checksum from torch_npu._C._distributed_c10d import ParallelStore from torch_npu.op_plugin.meta import _meta_registrations from torch_npu.version import __version__ as __version__ diff --git a/torch_npu/asd/_silent_fault_data.py b/torch_npu/asd/_silent_fault_data.py index 7fb5587a9b..ab7a60a9f9 100644 --- a/torch_npu/asd/_silent_fault_data.py +++ b/torch_npu/asd/_silent_fault_data.py @@ -34,10 +34,3 @@ class SilentFaultDataV2: self.step_tensor = torch.zeros(1, dtype=torch.int64).npu() self.check_tensor = torch.zeros(3, dtype=torch.float).npu() self.upper_thresh, self.sigma_thresh = get_thresh() - - -class SilentFaultDataV3: - def __init__(self): - self.step_tensor = torch.zeros(1, dtype=torch.int64, device="npu") - self.avg_tensor = None - self.upper_thresh, self.sigma_thresh = get_thresh() diff --git a/torch_npu/asd/asd.py b/torch_npu/asd/asd.py index bae88bf708..33810ffef0 100644 --- a/torch_npu/asd/asd.py +++ b/torch_npu/asd/asd.py @@ -1,15 +1,26 @@ import os +from functools import wraps +import logging +import time +import warnings +import threading +import math import torch from torch.nn.functional import layer_norm as origin_layernorm from torch.nn.functional import embedding as origin_embedding import torch_npu from torch_npu.utils._error_code import ErrCode, pta_error -from ._silent_fault_data import SilentFaultData, SilentFaultDataV2, SilentFaultDataV3 +from ._silent_fault_data import SilentFaultData, SilentFaultDataV2 __all__ = [] +original_matmul = torch.matmul +original_tensor_matmul = torch.Tensor.matmul +loggerSilent = logging.getLogger("torch_npu.silent_check") + + def _Singleton(cls): _instances = {} @@ -113,7 +124,7 @@ class _SilentFaultDetectorV2: self.silent_data_dict = dict() self.min_step = 100 - def silent_fault_check(self, idx, asd_enable, grad): + def silent_fault_check(self, idx, asd_flag, grad): if grad.dtype != torch.bfloat16 and grad.dtype != torch.float32: return @@ -125,35 +136,634 @@ class _SilentFaultDetectorV2: sfda = self.silent_data_dict[idx] torch_npu._npu_silent_check_v2(val, grad, sfda.check_tensor, sfda.step_tensor, self.min_step, sfda.upper_thresh[0], - sfda.sigma_thresh[0], sfda.upper_thresh[1], sfda.sigma_thresh[1], asd_enable) + sfda.sigma_thresh[0], 
sfda.upper_thresh[1], sfda.sigma_thresh[1], asd_flag) _silent_fault_detector_v2 = _SilentFaultDetectorV2() +IS_IN_BACKWARD = False -@_Singleton -class _SilentFaultDetectorV3: - def __init__(self): - self.silent_data_dict = dict() - self.beta1 = 0.99 +def _input_hook(idx, asd_flag): + def hook(grad): + global IS_IN_BACKWARD + loggerSilent.debug(f"input_hook: IS_IN_BACKWARD is {IS_IN_BACKWARD}, will change to False. idx is {idx}, flag is {asd_flag}") + IS_IN_BACKWARD = False + torch_npu._C._npu_set_call_state("forward") + _silent_fault_detector_v2.silent_fault_check(idx, asd_flag, grad) + return + return hook - def silent_fault_check(self, idx, asd_enable, grad): - if grad.dtype != torch.bfloat16 and grad.dtype != torch.float32: - return - val = torch.norm(grad, float('inf')).pow(2).view(-1) +def _output_hook(grad): + global IS_IN_BACKWARD + loggerSilent.debug(f"output_hook: IS_IN_BACKWARD is {IS_IN_BACKWARD}, will change to True.") + IS_IN_BACKWARD = True + torch_npu._C._npu_set_call_state("backward") + return grad - if idx not in self.silent_data_dict: - self.silent_data_dict[idx] = SilentFaultDataV3() - self.silent_data_dict[idx].avg_tensor = grad.pow(2).max().view(-1) - grad_max = self.silent_data_dict[idx].avg_tensor - else: - grad_max = val - sfda = self.silent_data_dict[idx] +def _is_inner_module(module): + return len(module._modules) == 0 - torch_npu._npu_silent_check_v3(val, grad, sfda.step_tensor, grad_max, sfda.avg_tensor, - sfda.upper_thresh[0], sfda.upper_thresh[1], self.beta1, asd_enable) +class _SilentCheckState: + def __init__(self): + self.init_param() + self.init_marks = {} + self.weight_hook_handles = {} + self.last_weight_hook_handles = {} + self.dtype_support = True + self.check_enable = 0 + + def set_check_enable(self, enable): + self.check_enable = enable + + def get_check_enable(self): + return self.check_enable + + def init_param(self): + self.first_forward = True + self.input_hook_flag = False + self.is_training = False + self.first_module_id = "" + self.first_weight = None + self.first_weight_id = None + self.last_weight = None + self.last_weight_id = None + + def init_module_info(self, module_id, training): + self.first_module_id = module_id + self.first_forward = False + self.is_training = training + if self.is_training: + torch_npu._C._npu_set_module_train_state("train") + else: + torch_npu._C._npu_set_module_train_state("infer") + + def check_tensor_dtype(self, tensor): + if not self.dtype_support: + return + if isinstance(tensor, torch.Tensor) and tensor.requires_grad and tensor.dtype == torch.float16: + self.dtype_support = False + + def check_dtype(self, module, *args): + for x in args: + self.check_tensor_dtype(x) + for _, param in module._parameters.items(): + self.check_tensor_dtype(param) + + def search_first_weight(self, module): + # Search the first weight + if not self.init_marks.get(self.first_module_id, False) and self.first_weight is None: + for _, param in module._parameters.items(): + if isinstance(param, torch.Tensor) and param.requires_grad: + self.first_weight = param + self.first_weight_id = id(param) + break + + def search_last_weight(self, module): + # Search the last weight (only in inner module) + if not self.init_marks.get(self.first_module_id, False) and _is_inner_module(module): + for _, param in module._parameters.items(): + if isinstance(param, torch.Tensor) and param.requires_grad: + self.last_weight = param + self.last_weight_id = id(param) + + def init_all_hook(self): + if self.is_training: + if self.last_weight is not None and 
self.first_weight is not None: + # Otherwise, there is only one weight in the outer module + if self.first_weight_id != self.last_weight_id: + loggerSilent.debug(f"init_all_hook: module init, first_module_id is {self.first_module_id}.") + if self.last_weight_hook_handles.get(self.first_module_id, None) is None: + last_weight_handle = self.last_weight.register_hook(_output_hook) + self.last_weight_hook_handles[self.first_module_id] = last_weight_handle + if self.weight_hook_handles.get(self.first_module_id, None) is None: + first_weight_handle = self.first_weight.register_hook(_input_hook(self.first_module_id, self.check_enable)) + self.weight_hook_handles[self.first_module_id] = first_weight_handle + else: + loggerSilent.debug(f"init_all_hook: module only have one weight, first_module_id is {self.first_module_id}.") + self.init_marks[self.first_module_id] = True + + +silent_check = _SilentCheckState() + + +def _silent_check_decorator(func): + @wraps(func) + def wrapper(self, *args, **kwargs): + global silent_check + global IS_IN_BACKWARD + + if not torch.npu.is_initialized(): + return func(self, *args, **kwargs) + + if silent_check.get_check_enable() and not IS_IN_BACKWARD: + if silent_check.first_forward: + silent_check.init_module_info(id(self), self.training) + self.outer = True + + if silent_check.is_training and not silent_check.init_marks.get(silent_check.first_module_id, False): + silent_check.check_dtype(self, *args) + if not silent_check.dtype_support: + for value in silent_check.weight_hook_handles.values(): + if value is not None: + value.remove() + for value in silent_check.last_weight_hook_handles.values(): + if value is not None: + value.remove() + silent_check.set_check_enable(0) + warnings.warn(f"Warning: Module has unsupported dtype tensor, silent check will be closed.") + + tmp = func(self, *args, **kwargs) + + if silent_check.get_check_enable() and silent_check.is_training and not IS_IN_BACKWARD: + # Search the first weight + silent_check.search_first_weight(self) + + # Search the last weight (only in inner module) + silent_check.search_last_weight(self) + + if silent_check.get_check_enable() and not IS_IN_BACKWARD: + if hasattr(self, "outer") and self.outer: + silent_check.init_all_hook() + silent_check.init_param() + self.outer = False + + return tmp + return wrapper + + +class _MatmulSilentCheck: + def __init__(self): + self.init_param() + self.init_marks = {} + self.check_stat = {} + self.hook_dict = {} + self.registered_modules = [] + self.matmul_hook_enable = 0 + self.matmul_with_bf16 = False + self.check_stream = None + self.check_event = None + self.statistic_value = None + self.is_outer_call = True + # link to checksum + self.matmul_trigger = False + self.checksum_enable = False + self.checksum_result = None + self.checksum_state = None + self.checksum_state_thread_running = False + self.checksum_state_thread = threading.Thread( + target=self._tcp_comm_checksum_state, + daemon=True + ) + # Use another thread to receive the statistic value and detect SDC + self.check_thread_running = False + self.check_thread = threading.Thread( + target=self._async_detect, + daemon=True + ) + self.lock = threading.Lock() + self.queue_len = 8192 + self.statistic_cpu_value = None + self.name_list = ["" for _ in range(self.queue_len)] + self.head_index = 0 + self.tail_index = 0 + self.history_abnormal_list = [] + # Parameter filtering + self.filter_index = -1 + self.filter_interval = 3 + self.invalid_grad_sum = 0 + # Threshold + self.with_checksum = False + self.cooldown = 5 # 
default 5 min cooldown + self.strikes_num = 3 # default 3 times + self.strikes_window = 480 # default 480 min + self.checksum_cooldown = 180 # default 180 min + self.upper_thresh1 = 1000000 # default 1000000 + self.upper_thresh2 = 100 # default 100 + self.store = None + self.rank = None + + def init_param(self): + self.first_forward = True + self.is_training = False + self.first_module_id = "" + + def init_module_info(self, module_id, training): + self.first_module_id = module_id + self.first_forward = False + self.is_training = training + + def set_matmul_hook_enable(self, enable): + self.matmul_hook_enable = enable + + def get_matmul_hook_enable(self): + return self.matmul_hook_enable + + def set_with_checksum(self, enable): + self.with_checksum = enable + + def get_with_checksum(self): + return self.with_checksum + + def set_cooldown(self, cooldown): + self.cooldown = cooldown + + def get_cooldown(self): + return self.cooldown + + def set_strikes_num(self, strikes_num): + self.strikes_num = strikes_num + + def get_strikes_num(self): + return self.strikes_num + + def set_strikes_window(self, strikes_window): + self.strikes_window = strikes_window + + def get_strikes_window(self): + return self.strikes_window + + def set_checksum_cooldown(self, checksum_cooldown): + self.checksum_cooldown = checksum_cooldown + + def get_checksum_cooldown(self): + return self.checksum_cooldown + + def set_upper_thresh1(self, upper_thresh1): + self.upper_thresh1 = upper_thresh1 + + def get_upper_thresh1(self): + return self.upper_thresh1 + + def set_upper_thresh2(self, upper_thresh2): + self.upper_thresh2 = upper_thresh2 + + def get_upper_thresh2(self): + return self.upper_thresh2 + + def init_stream(self): + if self.check_stream is None: + self.check_stream = torch_npu.npu.Stream() + self.check_event = torch_npu.npu.Event(enable_timing=False) + self.statistic_value = torch.tensor(0., device=torch_npu.npu.current_device()) + self.checksum_state = 0 + self.statistic_cpu_value = torch.zeros((self.queue_len,), device='cpu', dtype=torch.float32).pin_memory() + self.statistic_cpu_value.fill_(-1) + if self.store is None: + if torch.distributed.is_initialized(): + self.store = torch.distributed.distributed_c10d._get_default_store() + self.rank = torch.distributed.get_rank() + if self.rank == 0: + for i in range(1, torch.distributed.get_world_size()): + self.store.set(f"rank_{i}_info_log", "") + self.store.set(f"rank_{i}_warn_log", "") + + def parameter_filtering(self): + self.filter_index = (self.filter_index + 1) % self.filter_interval + return self.filter_index == 0 + + def register_module_hook(self, module, name): + self.check_stat[name + "_backward"] = {'avg': 0, 'pre_val': 0, 'step': 0, 'none_zero_step': 0} + self.hook_dict[name + "_backward"] = module.register_full_backward_hook(lambda module, grad_input, grad_output, n=name + "_backward": self.module_hook(module, grad_input, grad_output, n)) + self.registered_modules.append(name) + + def module_hook(self, module, grad_input, grad_output, name): + for _, param in module.named_parameters(): + if param.dim() >= 2: + if param.grad is not None: + self._detect_grad(param.grad.detach(), name) + self.invalid_grad_sum = 0 + elif hasattr(param, 'main_grad') and param.main_grad is not None: + self._detect_grad(param.main_grad.detach(), name) + self.invalid_grad_sum = 0 + else: + self.invalid_grad_sum += 1 + if self.invalid_grad_sum > max(10, len(self.registered_modules)): + warnings.warn(f"There is no available grad for detection, and the silent check feature may not 
take effect.") + self.invalid_grad_sum = 0 + + def _detect_grad(self, grad, name): + if grad.dtype != torch.bfloat16 and grad.dtype != torch.float32: + return + + if self.matmul_hook_enable >= 1: + default_stream = torch_npu.npu.current_stream() + with torch_npu.npu.stream(self.check_stream): + with torch.no_grad(): + self.check_stream.wait_stream(default_stream) + self.statistic_value.fill_(torch.pow(torch.norm(grad, float('inf')), 2).detach().float()) + + #Asynchronously copy the value to host + self.lock.acquire() + self.statistic_cpu_value[self.tail_index].copy_(self.statistic_value.data, non_blocking=True) + self.name_list[self.tail_index] = name + self.tail_index = (self.tail_index + 1) % self.queue_len + self.lock.release() + self.check_event.record(self.check_stream) + if self.tail_index == self.head_index: + # The queue is full, synchronize to empty the queue + self.check_event.synchronize() + torch_npu.npu.synchronize() + + def _async_detect(self): + while True: + if hasattr(torch, "npu") and torch.npu.is_initialized() and torch.distributed.is_initialized(): + break + time.sleep(10) + local_rank = os.getenv("LOCAL_RANK", "-1") + if local_rank.isdigit(): + torch.npu.set_device(int(local_rank)) + + while True: + self.lock.acquire() + val = self.statistic_cpu_value[self.head_index].item() + name = self.name_list[self.head_index] + while val > 0 and name != "": + result, self.check_stat[name]['avg'], self.check_stat[name]['none_zero_step'] = self._silent_check( + val, self.check_stat[name]['pre_val'], self.check_stat[name]['avg'], self.check_stat[name]['none_zero_step'], + self.upper_thresh1, self.upper_thresh2 + ) + + if result: + current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + new_abnormal = {'time_str': current_time, + 'time': time.time(), + 'name': name, + 'rank': torch.distributed.get_rank(), + 'val': val, + 'pre_val': self.check_stat[name]['pre_val'], + 'avg': self.check_stat[name]['avg'], + 'step': self.check_stat[name]['step'], + 'none_zero_step': self.check_stat[name]['none_zero_step'], + 'counted': True, + 'striked': False} + self._abnormal_process(new_abnormal) + self.check_stat[name]['step'] += 1 + self.check_stat[name]['pre_val'] = val + + self.statistic_cpu_value[self.head_index].fill_(-1) + self.name_list[self.head_index] = "" + self.head_index = (self.head_index + 1) % self.queue_len + val = self.statistic_cpu_value[self.head_index].item() + name = self.name_list[self.head_index] + + self.lock.release() + time.sleep(0.1) + + def _silent_check(self, val, pre_val, avg, none_zero_step, alpha1=1e6, alpha2=1e2): + if val == 0: + return False, avg, none_zero_step + elif math.isnan(val) or math.isinf(val): + return True, avg, none_zero_step + else: + if none_zero_step != 0 and avg != 0: + thres = avg * alpha1 / (1 - 0.99 ** none_zero_step) + thres2 = avg * alpha2 / (1 - 0.99 ** none_zero_step) + else: + thres = val + thres2 = val + if val > thres and abs(val - pre_val) > thres: + return True, avg, none_zero_step + else: + if val <= thres2: + none_zero_step += 1 + avg = avg * 0.99 + val * 0.01 + return False, avg, none_zero_step + + def _abnormal_process(self, new_abnormal): + i = len(self.history_abnormal_list) - 1 + if i < 0: + self._generate_event_log(new_abnormal) + self.history_abnormal_list.append(new_abnormal) + return + counting_abnormal_pos = [] + while i >= 0: + old_abnormal = self.history_abnormal_list[i] + old_time = old_abnormal['time'] + new_time = new_abnormal['time'] + if old_abnormal['counted'] and abs(new_time - old_time) >= 
self.cooldown * 60: + # A new counted abnormal + self._generate_event_log(new_abnormal) + counting_abnormal_pos.append(i) + i -= 1 + while i >= 0: + old_abnormal = self.history_abnormal_list[i] + if old_abnormal['counted'] and not old_abnormal['striked']: + counting_abnormal_pos.append(i) + if len(counting_abnormal_pos) == self.strikes_num - 1: + break + i -= 1 + if len(counting_abnormal_pos) == self.strikes_num - 1 and abs(new_abnormal['time'] - old_abnormal['time']) <= self.strikes_window * 60: + # Three strikes + self._generate_warning_log(counting_abnormal_pos, new_abnormal) + for index in counting_abnormal_pos: + self.history_abnormal_list[index]['striked'] = True + new_abnormal['striked'] = True + + if self.with_checksum: + self.checksum_state = 1 + if not self.matmul_with_bf16: + warnings.warn(f"Warning: Module has no supported dtype grad, checksum will not to be linked.") + break + elif not old_abnormal['counted']: + # Keep tracing the last counted abnormal + i -= 1 + else: + # A new not-counted abnormal + new_abnormal['counted'] = False + break + self.history_abnormal_list.append(new_abnormal) + # remove expired exception + current_time = time.time() + first_expired_index = 0 + for abnormal in self.history_abnormal_list: + if abs(current_time - abnormal['time']) <= self.strikes_window * 60: + break + first_expired_index += 1 + if first_expired_index > 0: + del self.history_abnormal_list[:first_expired_index] + + def _generate_event_log(self, new_abnormal): + info_str = f"[Event][{new_abnormal['time_str']}] [Rank {new_abnormal['rank']}]: A grad-norm spike may happen, " + info_str = info_str + f"param name {new_abnormal['name']}, abnormal value {new_abnormal['val']}, previous value {new_abnormal['pre_val']}, " + info_str = info_str + f"history avg {new_abnormal['avg']}, step {new_abnormal['step']}, normal count {new_abnormal['none_zero_step']}." + loggerSilent.info(info_str) + if self.store is not None and self.rank is not None and self.rank != 0: + current_log = self.store.get(f"rank_{self.rank}_info_log").decode() + self.store.set(f"rank_{self.rank}_info_log", current_log + "\n" + info_str if current_log != "" else info_str) + + def _generate_warning_log(self, counting_abnormal_pos, new_abnormal): + warning_str = f"[Warning][{new_abnormal['time_str']}] [Rank {new_abnormal['rank']}]: Training instability happens, feature detection detects abnormal results!" + index = 0 + for pos in reversed(counting_abnormal_pos): + warning_str = warning_str + "\n" + f"Grad-norm spike: index {index}, time {self.history_abnormal_list[pos]['time_str']}, param name {self.history_abnormal_list[pos]['name']}, abnormal value {self.history_abnormal_list[pos]['val']}, previous value {self.history_abnormal_list[pos]['pre_val']}, " + warning_str = warning_str + f"history avg {self.history_abnormal_list[pos]['avg']}, step {self.history_abnormal_list[pos]['step']}, normal count {self.history_abnormal_list[pos]['none_zero_step']}." + index += 1 + warning_str = warning_str + "\n" + f"Grad-norm spike: index {index}, time {new_abnormal['time_str']}, param name {new_abnormal['name']}, abnormal value {new_abnormal['val']}, previous value {new_abnormal['pre_val']}, " + warning_str = warning_str + f"history avg {new_abnormal['avg']}, step {new_abnormal['step']}, normal count {new_abnormal['none_zero_step']}." 
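        # Illustrative shape of the assembled warning (values below are examples only):
        #   [Warning][2025-06-05 12:00:00] [Rank 3]: Training instability happens, feature detection detects abnormal results!
        #   Grad-norm spike: index 0, time ..., param name decoder.linear_backward, abnormal value ..., previous value ..., history avg ..., step ..., normal count ...
        #   Grad-norm spike: index 1, time ..., param name ...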
+ loggerSilent.warning(warning_str) + if self.store is not None and self.rank is not None and self.rank != 0: + current_log = self.store.get(f"rank_{self.rank}_warn_log").decode() + self.store.set(f"rank_{self.rank}_warn_log", current_log + "\n" + warning_str if current_log != "" else warning_str) + + def _generate_silent_log(self): + warning_str = f"[Warning][Rank {torch.distributed.get_rank()}]: The result of Matmul checksum is abnormal!" + loggerSilent.warning(warning_str) + if self.store is not None and self.rank is not None and self.rank != 0: + current_log = self.store.get(f"rank_{self.rank}_warn_log").decode() + self.store.set(f"rank_{self.rank}_warn_log", current_log + "\n" + warning_str if current_log != "" else warning_str) + + def _tcp_comm_checksum_state(self): + while True: + if hasattr(torch, "npu") and torch.npu.is_initialized() and torch.distributed.is_initialized() and self.store is not None: + break + time.sleep(10) + local_rank = os.getenv("LOCAL_RANK", "-1") + self.rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + if local_rank.isdigit(): + torch.npu.set_device(int(local_rank)) + + last_checksum_time = None + if self.rank == 0: + self.store.add('counter2', world_size) + while True: + if self.rank == 0: + for i in range(1, world_size): + msg = self.store.get(f"rank_{i}_warn_log").decode() + if msg != "": + loggerSilent.warning(msg) + self.store.set(f"rank_{i}_warn_log", "") + msg = self.store.get(f"rank_{i}_info_log").decode() + if msg != "": + loggerSilent.info(msg) + self.store.set(f"rank_{i}_info_log", "") + + if not self.with_checksum or not self.matmul_with_bf16: + time.sleep(10) + continue + + self.store.add('checksum_state', self.checksum_state) + if self.rank == 0: + self.store.add('counter2', 0 - world_size) + self.store.add('counter', 1) + + while int(self.store.get('counter').decode()) < world_size: + time.sleep(0.1) + + global_state = int(self.store.get('checksum_state').decode()) + if global_state: + now_time = time.time() + if last_checksum_time is None or abs(now_time - last_checksum_time) > self.checksum_cooldown * 60: + loggerSilent.info(f'[Info] Rank {self.rank}: Training instability happened, checksum is on.') + last_checksum_time = now_time + if self.checksum_result is None: + self.checksum_result = torch.tensor(False, dtype=torch.bool, device='npu') + else: + self.checksum_result.fill_(False) + self.checksum_enable = True + time.sleep(self.cooldown * 60) + if self.checksum_result: + self._generate_silent_log() + self.checksum_enable = False + loggerSilent.info(f'[Info] Rank {self.rank}: checksum is off') + self.checksum_state = 0 + self.store.add('counter2', 1) + + while int(self.store.get('counter2').decode()) < world_size: + time.sleep(0.1) + + if self.rank == 0: + self.store.add('checksum_state', 0 - global_state) + self.store.add('counter', 0 - world_size) + + time.sleep(10) + + +matmul_check = _MatmulSilentCheck() + + +def _trigger_matmul_decorator(func): + @wraps(func) + def wrapper(a, b, *args, **kwargs): + global matmul_check + result = func(a, b, *args, **kwargs) + if matmul_check.checksum_enable: + checksum = torch_npu.matmul_checksum(a, b, result) + matmul_check.checksum_result.logical_or_(checksum) + return result + return wrapper + + +def _trigger_tensor_matmul_decorator(func): + @wraps(func) + def wrapper(self, other): + global matmul_check + result = func(self, other) + if matmul_check.checksum_enable: + checksum = torch_npu.matmul_checksum(self, other, result) + 
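            # fold this call's verdict into the shared device-side flag: the monitor
            # thread clears checksum_result before each checksum window and reads it
            # once after the cooldown, so OR-ing here means a single failed matmul
            # anywhere in the window is enough to report an abnormal result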
matmul_check.checksum_result.logical_or_(checksum) + return result + return wrapper + + +def _matmul_silent_check_decorator(func): + @wraps(func) + def wrapper(self, *args, **kwargs): + global matmul_check + + if not torch.npu.is_initialized(): + return func(self, *args, **kwargs) + + if matmul_check.get_matmul_hook_enable() and matmul_check.first_forward: + matmul_check.init_stream() + matmul_check.init_module_info(id(self), self.training) + self.matmul_check_outer = True + + if not matmul_check.check_thread_running: + matmul_check.check_thread.start() + matmul_check.check_thread_running = True + + # 2 for checksum + if not matmul_check.checksum_state_thread_running: + matmul_check.checksum_state_thread.start() + matmul_check.checksum_state_thread_running = True + if matmul_check.with_checksum and not matmul_check.matmul_trigger: + torch_npu.asd.checksum.matmul = original_matmul + torch.matmul = _trigger_matmul_decorator(original_matmul) + torch.Tensor.matmul = _trigger_tensor_matmul_decorator(original_tensor_matmul) + matmul_check.matmul_trigger = True + + if matmul_check.is_training and not matmul_check.init_marks.get(matmul_check.first_module_id, False): + for name, module in self.named_modules(): + if matmul_check.get_matmul_hook_enable() == 0: + break + if len(module._modules) == 0 and name not in matmul_check.registered_modules: + for _, param in module.named_parameters(): + if not isinstance(param, torch.Tensor) or param.dim() < 2: + continue + if matmul_check.parameter_filtering(): + matmul_check.register_module_hook(module, name) + # check dtype + if param.dtype == torch.float16: + for value in self.hook_dict.values(): + if value is not None: + value.remove() + matmul_check.set_matmul_hook_enable(0) + break + if param.dtype == torch.bfloat16: + matmul_check.matmul_with_bf16 = True + + matmul_check.init_marks[matmul_check.first_module_id] = True + + tmp = func(self, *args, **kwargs) + + if matmul_check.get_matmul_hook_enable(): + if hasattr(self, "matmul_check_outer") and self.matmul_check_outer: + matmul_check.init_param() + self.matmul_check_outer = False -_silent_fault_detector_v3 = _SilentFaultDetectorV3() + return tmp + return wrapper diff --git a/torch_npu/asd/checksum.py b/torch_npu/asd/checksum.py new file mode 100644 index 0000000000..24f38d86af --- /dev/null +++ b/torch_npu/asd/checksum.py @@ -0,0 +1,51 @@ +__all__ = [] + +import math +import torch +from torch import matmul +import torch_npu +from torch_npu.utils._error_code import ErrCode, pta_error + + +def _matmul_checksum(a, b, c): + r""" + Compare whether there are any feature anomalies in the calculation results of matmul. + Args: + a(Tensor): matmul's input parameter a, and the device must be npu. + b(Tensor): matmul's input parameter b, and the device must be npu. + c(Tensor): matmul's output result c, and the device must be npu. + + Returns: The bool scalar tensor, located on the npu side, indicates whether there are any anomalies in the calculation result. 
+ + """ + if not isinstance(a, torch.Tensor) or a.device.type != 'npu': + raise TypeError(f"tensor should be torch.Tensor, and device type should be npu" + pta_error(ErrCode.PARAM)) + if not isinstance(b, torch.Tensor) or b.device.type != 'npu': + raise TypeError(f"tensor should be torch.Tensor, and device type should be npu" + pta_error(ErrCode.PARAM)) + if not isinstance(c, torch.Tensor) or c.device.type != 'npu': + raise TypeError(f"tensor should be torch.Tensor, and device type should be npu" + pta_error(ErrCode.PARAM)) + + t = 23 + c_sum = torch.sum(c, dim=-1, dtype=torch.float32) + b1 = torch.sum(b, dim=-1, keepdim=True, dtype=torch.float32) + c1 = matmul(a.to(torch.float32), b1) + c1_trans = c1.squeeze(-1) + + n_b = b.shape[-1] + m_b = b.shape[0] + n = c.shape[-1] + + c_max, _ = torch.max(torch.abs(c), dim=-1) + c_sum_accum_error = math.sqrt(n * (n + 1) * (2 * n + 1) / 48) * c_max * 2 ** (-t) + c_ele_round_error_accum = c_max * 2 ** (-8) * math.sqrt(n_b) + + b_max, _ = torch.max(torch.abs(b), dim=-1, keepdim=True) + delta_1 = math.sqrt(n_b * (n_b + 1) * (2 * n_b + 1) / 48) * b_max * 2 ** (-t) + delta_4 = matmul(torch.abs(a), delta_1).squeeze() + a_max, _ = torch.max(torch.abs(a), dim=-1) + delta_2_3 = math.sqrt((m_b * (m_b + 1) * (m_b + 0.5) + 2 * m_b) / 24) * a_max * torch.max(b_max) * 2 ** (-t) + error_total = (c_sum_accum_error + c_ele_round_error_accum + delta_2_3 + delta_4).to(torch.float) + + error = torch.abs(c_sum - c1_trans) + flag = (error - error_total) > 1e-20 + return torch.any(flag) diff --git a/torch_npu/csrc/core/npu/interface/OpInterface.cpp b/torch_npu/csrc/core/npu/interface/OpInterface.cpp index 2ba4baaebc..e950ee9f93 100644 --- a/torch_npu/csrc/core/npu/interface/OpInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/OpInterface.cpp @@ -24,11 +24,5 @@ bool IsExistAclnnSilentCheck() return isExist; } -bool IsExistAclnnSilentCheckV2() -{ - const static bool isExistV2 = false; - return isExistV2; -} - } // namespace opapi } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/interface/OpInterface.h b/torch_npu/csrc/core/npu/interface/OpInterface.h index 1a5f205e84..663f9a6144 100644 --- a/torch_npu/csrc/core/npu/interface/OpInterface.h +++ b/torch_npu/csrc/core/npu/interface/OpInterface.h @@ -7,10 +7,5 @@ namespace opapi { */ bool IsExistAclnnSilentCheck(); -/** - * This API is used to check whether aclnnSilentCheckV2 exist. 
-*/ -bool IsExistAclnnSilentCheckV2(); - } // namespace opapi } // namespace c10_npu diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 5a2e407f2c..52bcd255d1 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -2960,34 +2960,17 @@ void ProcessGroupHCCL::silenceCheck(at::Tensor &input, c10d::OpType opType) return; } } - if (c10_npu::opapi::IsExistAclnnSilentCheckV2()) { - at::Tensor val = at::norm(input, std::numeric_limits::infinity()).pow(2).view(-1); - at::Tensor max; - if (silenceCheckCache_.find(opType) == silenceCheckCache_.end()) { - at::Tensor stepTensor = at::zeros({1}, input.options().dtype(at::kLong)); - at::Tensor avg = input.detach().pow(2).max().view(-1); - max = avg; - silenceCheckCache_.emplace(opType, std::make_pair(std::move(stepTensor), std::move(avg))); - } else { - max = val; - } - static double beta1 = 0.99; - op_plugin::_npu_silent_check_v3(val, input, silenceCheckCache_[opType].first, max, silenceCheckCache_[opType].second, - c10_npu::option::OptionsManager::GetSilenceUpperThresh().first, c10_npu::option::OptionsManager::GetSilenceUpperThresh().second, - beta1, static_cast(c10_npu::option::OptionsManager::GetSilenceCheckFlag())); - } else { - if (silenceCheckCache_.find(opType) == silenceCheckCache_.end()) { - at::Tensor stepTensor = at::zeros({1}, input.options().dtype(at::kLong)); - at::Tensor cacheTensor = at::zeros({3}, input.options().dtype(at::kFloat)); - silenceCheckCache_.emplace(opType, std::make_pair(std::move(stepTensor), std::move(cacheTensor))); - } - at::Tensor val = at::norm(input); - static double min_steps = 100.0; - op_plugin::_npu_silent_check_v2(val, input, silenceCheckCache_[opType].second, silenceCheckCache_[opType].first, min_steps, - c10_npu::option::OptionsManager::GetSilenceUpperThresh().first, c10_npu::option::OptionsManager::GetSilenceSigmaThresh().first, - c10_npu::option::OptionsManager::GetSilenceUpperThresh().second, c10_npu::option::OptionsManager::GetSilenceSigmaThresh().second, - static_cast(c10_npu::option::OptionsManager::GetSilenceCheckFlag())); - } + if (silenceCheckCache_.find(opType) == silenceCheckCache_.end()) { + at::Tensor stepTensor = at::zeros({1}, input.options().dtype(at::kLong)); + at::Tensor cacheTensor = at::zeros({3}, input.options().dtype(at::kFloat)); + silenceCheckCache_.emplace(opType, std::make_pair(std::move(stepTensor), std::move(cacheTensor))); + } + at::Tensor val = at::norm(input); + static double min_steps = 100.0; + op_plugin::_npu_silent_check_v2(val, input, silenceCheckCache_[opType].second, silenceCheckCache_[opType].first, min_steps, + c10_npu::option::OptionsManager::GetSilenceUpperThresh().first, c10_npu::option::OptionsManager::GetSilenceSigmaThresh().first, + c10_npu::option::OptionsManager::GetSilenceUpperThresh().second, c10_npu::option::OptionsManager::GetSilenceSigmaThresh().second, + static_cast(c10_npu::option::OptionsManager::GetSilenceCheckFlag())); } HcclCommConfig ProcessGroupHCCL::createHcclCommConfigWithOptions() diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index b0c77e9268..a261b6f99c 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -1513,17 +1513,12 @@ PyObject* THNPModule_npu_set_module_train_state(PyObject* _unused, PyObject* arg PyObject* THNPModule_npu_get_silent_check_version(PyObject* self, PyObject* noargs) { HANDLE_TH_ERRORS - if (c10_npu::opapi::IsExistAclnnSilentCheckV2()) { - // 
silent check v3 - return PyLong_FromLong(3); - } else { - if (c10_npu::opapi::IsExistAclnnSilentCheck()) { - // silent check v2 - return PyLong_FromLong(2); - } - // silent check v1 - return PyLong_FromLong(1); + if (c10_npu::opapi::IsExistAclnnSilentCheck()) { + // silent check v2 + return PyLong_FromLong(2); } + // silent check v1 + return PyLong_FromLong(1); END_HANDLE_TH_ERRORS } diff --git a/torch_npu/utils/_step.py b/torch_npu/utils/_step.py index 555cbd25c1..f08847afb0 100644 --- a/torch_npu/utils/_step.py +++ b/torch_npu/utils/_step.py @@ -2,6 +2,7 @@ import os import stat import logging from logging.handlers import RotatingFileHandler +from functools import wraps import uuid import time import glob @@ -11,12 +12,13 @@ from torch.nn import Module import torch_npu from torch_npu.utils._error_code import ErrCode, pta_error -from torch_npu.asd.asd import _silent_fault_detector_v2, _silent_fault_detector_v3 +from torch_npu.asd.asd import _silent_check_decorator, silent_check, _matmul_silent_check_decorator, matmul_check original_call = Module.__call__ DEFAULT_FALGS = os.O_WRONLY | os.O_CREAT | os.O_TRUNC DEFAULT_PERMISSION = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP +loggerSilent = logging.getLogger("torch_npu.silent_check") class PerfDumpState: @@ -44,111 +46,6 @@ class PerfDumpState: perf_dump_state = PerfDumpState() perf_dump_enable = False -IS_IN_BACKWARD = False -loggerSilent = logging.getLogger("torch_npu.silent_check") - - -def input_hook(idx, asd_flag): - def hook(grad): - global IS_IN_BACKWARD - loggerSilent.debug(f"input_hook: IS_IN_BACKWARD is {IS_IN_BACKWARD}, will change to False. idx is {idx}, flag is {asd_flag}") - IS_IN_BACKWARD = False - torch_npu._C._npu_set_call_state("forward") - if torch_npu._C._get_silent_check_version() == 3: - _silent_fault_detector_v3.silent_fault_check(idx, asd_flag, grad) - else: - _silent_fault_detector_v2.silent_fault_check(idx, asd_flag, grad) - return - return hook - - -def output_hook(grad): - global IS_IN_BACKWARD - loggerSilent.debug(f"output_hook: IS_IN_BACKWARD is {IS_IN_BACKWARD}, will change to True.") - IS_IN_BACKWARD = True - torch_npu._C._npu_set_call_state("backward") - return grad - - -def _is_inner_module(module): - return len(module._modules) == 0 - - -class SilentCheckState: - def __init__(self): - self.init_param() - self.init_marks = {} - self.weight_hook_handles = {} - self.last_weight_hook_handles = {} - self.dtype_support = True - - def init_param(self): - self.first_forward = True - self.input_hook_flag = False - self.is_training = False - self.first_module_id = "" - self.first_weight = None - self.first_weight_id = None - self.last_weight = None - self.last_weight_id = None - - def init_module_info(self, module_id, training): - self.first_module_id = module_id - self.first_forward = False - self.is_training = training - if self.is_training: - torch_npu._C._npu_set_module_train_state("train") - else: - torch_npu._C._npu_set_module_train_state("infer") - - def check_tensor_dtype(self, tensor): - if not self.dtype_support: - return - if isinstance(tensor, torch.Tensor) and tensor.requires_grad and tensor.dtype == torch.float16: - self.dtype_support = False - - def check_dtype(self, module, *args): - for x in args: - self.check_tensor_dtype(x) - for param_name, param in module._parameters.items(): - self.check_tensor_dtype(param) - - def search_first_weight(self, module): - # Search the first weight - if not self.init_marks.get(self.first_module_id, False) and self.first_weight is None: - for param_name, param in 
module._parameters.items(): - if isinstance(param, torch.Tensor) and param.requires_grad: - self.first_weight = param - self.first_weight_id = id(param) - break - - def search_last_weight(self, module): - # Search the last weight (only in inner module) - if not self.init_marks.get(self.first_module_id, False) and _is_inner_module(module): - for param_name, param in module._parameters.items(): - if isinstance(param, torch.Tensor) and param.requires_grad: - self.last_weight = param - self.last_weight_id = id(param) - - def init_all_hook(self, asd_flag): - if self.is_training: - if self.last_weight is not None and self.first_weight is not None: - # Otherwise, there is only one weight in the outer module - if self.first_weight_id != self.last_weight_id: - loggerSilent.debug(f"init_all_hook: module init, first_module_id is {self.first_module_id}.") - if self.last_weight_hook_handles.get(self.first_module_id, None) is None: - last_weight_handle = self.last_weight.register_hook(output_hook) - self.last_weight_hook_handles[self.first_module_id] = last_weight_handle - if self.weight_hook_handles.get(self.first_module_id, None) is None: - first_weight_handle = self.first_weight.register_hook(input_hook(self.first_module_id, asd_flag)) - self.weight_hook_handles[self.first_module_id] = first_weight_handle - else: - loggerSilent.debug(f"init_all_hook: module only have one weight, first_module_id is {self.first_module_id}.") - self.init_marks[self.first_module_id] = True - - -silent_check = SilentCheckState() -asd_enable = 0 class CustomRotatingFileHandler(RotatingFileHandler): @@ -222,90 +119,65 @@ def _setup_logger(name, path): logger.propagate = False -def _custom_call(self, *args, **kwargs): - global perf_dump_enable - global perf_dump_state - - global asd_enable - global silent_check - global IS_IN_BACKWARD - - if not torch.npu.is_initialized(): - return original_call(self, *args, **kwargs) - - if perf_dump_enable: - if not perf_dump_state.has_log: - perf_dump_path = _get_perf_dump_path() - pid = os.getpid() - device_id = torch_npu.npu.current_device() - delete_pref_pt_logs(perf_dump_path, device_id) - perf_dump_state.local_uuid = uuid.uuid4() - perf_dump_state.uuid = _get_uuid() - perf_dump_state.log_file_name = os.path.join(perf_dump_path, f"perf_pt_{pid}_{device_id}.log") - _setup_logger("perf_logger", perf_dump_state.log_file_name) - logger = logging.getLogger("perf_logger") - logger.info(f"[LOCALUUID]:{perf_dump_state.local_uuid}") - logger.info("[FRAMEWORK]:PyTorch") - logger.info(f"[UUID]:{perf_dump_state.uuid}") - os.chmod(perf_dump_state.log_file_name, DEFAULT_PERMISSION) - perf_dump_state.has_log = True - - if perf_dump_state.is_outer_call: - if not perf_dump_state.is_child_module(self) and not _is_loss_module(self): - current_time = int(time.time() * 1000) +def _perf_dump_decorator(func): + @wraps(func) + def wrapper(self, *args, **kwargs): + global perf_dump_enable + global perf_dump_state + + if not torch.npu.is_initialized(): + return func(self, *args, **kwargs) + + if perf_dump_enable: + if not perf_dump_state.has_log: + perf_dump_path = _get_perf_dump_path() + pid = os.getpid() + device_id = torch_npu.npu.current_device() + delete_pref_pt_logs(perf_dump_path, device_id) + perf_dump_state.local_uuid = uuid.uuid4() + perf_dump_state.uuid = _get_uuid() + perf_dump_state.log_file_name = os.path.join(perf_dump_path, f"perf_pt_{pid}_{device_id}.log") + _setup_logger("perf_logger", perf_dump_state.log_file_name) logger = logging.getLogger("perf_logger") - if perf_dump_state.last_time is 
not None: - logger.info(f"[STEPTIME]:{perf_dump_state.last_time},{current_time}") - perf_dump_state.last_time = current_time - perf_dump_state.add_module_dict(self) - perf_dump_state.is_outer_call = False - self.visited = True - - if asd_enable and not IS_IN_BACKWARD: - if silent_check.first_forward: - silent_check.init_module_info(id(self), self.training) - self.outer = True - - if silent_check.is_training and not silent_check.init_marks.get(silent_check.first_module_id, False): - silent_check.check_dtype(self, *args) - if not silent_check.dtype_support: - for value in silent_check.weight_hook_handles.values(): - if value is not None: - value.remove() - for value in silent_check.last_weight_hook_handles.values(): - if value is not None: - value.remove() - asd_enable = 0 - warnings.warn(f"Warning: Module has unsupported dtype tensor, silent check will be closed.") - - tmp = original_call(self, *args, **kwargs) - - if asd_enable and silent_check.is_training and not IS_IN_BACKWARD: - # Search the first weight - silent_check.search_first_weight(self) - - # Search the last weight (only in inner module) - silent_check.search_last_weight(self) - - if perf_dump_enable: - if hasattr(self, "visited") and self.visited: - perf_dump_state.is_outer_call = True - self.visited = False - - if asd_enable and not IS_IN_BACKWARD: - if hasattr(self, "outer") and self.outer: - silent_check.init_all_hook(asd_enable) - silent_check.init_param() - self.outer = False - - return tmp - - -def _parse_perf_config(): - perf_dump_config = os.getenv("PERF_DUMP_CONFIG") + logger.info(f"[LOCALUUID]:{perf_dump_state.local_uuid}") + logger.info("[FRAMEWORK]:PyTorch") + logger.info(f"[UUID]:{perf_dump_state.uuid}") + os.chmod(perf_dump_state.log_file_name, DEFAULT_PERMISSION) + perf_dump_state.has_log = True + + if perf_dump_state.is_outer_call: + if not perf_dump_state.is_child_module(self) and not _is_loss_module(self): + current_time = int(time.time() * 1000) + logger = logging.getLogger("perf_logger") + if perf_dump_state.last_time is not None: + logger.info(f"[STEPTIME]:{perf_dump_state.last_time},{current_time}") + perf_dump_state.last_time = current_time + perf_dump_state.add_module_dict(self) + perf_dump_state.is_outer_call = False + self.visited = True + + tmp = func(self, *args, **kwargs) + + if perf_dump_enable: + if hasattr(self, "visited") and self.visited: + perf_dump_state.is_outer_call = True + self.visited = False + + return tmp + return wrapper + + +@_perf_dump_decorator +@_silent_check_decorator +@_matmul_silent_check_decorator +def _custom_call(self, *args, **kwargs): + return original_call(self, *args, **kwargs) + + +def _parse_config(config): config_dict = {} - if perf_dump_config: - config_items = perf_dump_config.split(',') + if config: + config_items = config.split(',') for item in config_items: key_value = item.split(':') if len(key_value) == 2: @@ -314,27 +186,96 @@ def _parse_perf_config(): return config_dict +def _prase_asd_config(asd_config): + # checksum + with_checksum_str = asd_config.get("with_checksum", "false") + if with_checksum_str not in ["true", "false"]: + raise ValueError("NPU_ASD_CONFIG-with_checksum should be true or false. For details, 0 as `with checksum closed`, 1 as `with checksum opened`." 
+ pta_error(ErrCode.VALUE)) + with_checksum = with_checksum_str == "true" + matmul_check.set_with_checksum(with_checksum) + + # cooldown + cooldown = asd_config.get("cooldown", "5") + if cooldown.isdigit() and cooldown != "0": + matmul_check.set_cooldown(int(cooldown)) + else: + warnings.warn(f"Warning: NPU_ASD_CONFIG-cooldown is invalid, use the default value of 5.") + + # strikes_sum + strikes_sum = asd_config.get("strikes_sum", "3") + if strikes_sum.isdigit() and strikes_sum != "0": + matmul_check.set_strikes_num(int(strikes_sum)) + else: + warnings.warn(f"Warning: NPU_ASD_CONFIG-strikes_sum is invalid, use the default value of 3.") + + # strikes_window + strikes_window = asd_config.get("strikes_window", "480") + if strikes_window.isdigit() and strikes_window != "0": + matmul_check.set_strikes_window(int(strikes_window)) + else: + warnings.warn(f"Warning: NPU_ASD_CONFIG-strikes_window is invalid, use the default value of 480.") + + # checksum_cooldown + checksum_cooldown = asd_config.get("checksum_cooldown", "180") + if checksum_cooldown.isdigit() and checksum_cooldown != "0": + matmul_check.set_checksum_cooldown(int(checksum_cooldown)) + else: + warnings.warn(f"Warning: NPU_ASD_CONFIG-checksum_cooldown is invalid, use the default value of 180.") + + # upper_thresh1 + upper_thresh1 = asd_config.get("upper_thresh1", "1000000") + if upper_thresh1.isdigit() and int(upper_thresh1) >= 3: + matmul_check.set_upper_thresh1(int(upper_thresh1)) + else: + warnings.warn(f"Warning: NPU_ASD_CONFIG-upper_thresh1 is invalid, use the default value of 1000000.") + + # upper_thresh2 + upper_thresh2 = asd_config.get("upper_thresh2", "100") + if upper_thresh2.isdigit() and int(upper_thresh2) >= 3: + matmul_check.set_upper_thresh2(int(upper_thresh2)) + else: + warnings.warn(f"Warning: NPU_ASD_CONFIG-upper_thresh2 is invalid, use the default value of 100.") + + def add_perf_dump_patch(): global perf_dump_enable - global asd_enable - config_dict = _parse_perf_config() + perf_dump_config = os.getenv("PERF_DUMP_CONFIG") + config_dict = _parse_config(perf_dump_config) enable_value = config_dict.get("enable", "false") perf_dump_enable = enable_value.lower() == "true" - asd_value = os.getenv("NPU_ASD_ENABLE", "0") - if asd_value not in ["0", "1", "2", "3"]: - raise ValueError("NPU_ASD_ENABLE should be 0, 1, 2 or 3. For details, 0 as `ASD closed`, " - "1 as `ASD opened, print error logs` " - "2 as `ASD opened, print error logs and raise exception`, " - "3 as `ASD opened, print debug logs and raise exception`" + pta_error(ErrCode.VALUE)) - asd_enable = int(asd_value) - if asd_enable: + asd_enable = 0 + asd_config = os.getenv("NPU_ASD_CONFIG", None) + if asd_config is not None: + asd_config_dict = _parse_config(asd_config) + asd_config_enable = asd_config_dict.get("enable", "false") + if asd_config_enable not in ["true", "false"]: + raise ValueError("NPU_ASD_CONFIG-enable should be true or false. For details, false as `ASD closed`, true as `ASD opened`." + pta_error(ErrCode.VALUE)) + if asd_config_enable == "true": + warnings.warn(f'Silent data corruption check may take up 1.5GB device memory, please make sure there are enough free space in device') + _prase_asd_config(asd_config_dict) + asd_enable = 1 + matmul_check.set_matmul_hook_enable(asd_enable) + loggerSilent.info(f"Silent check 3.0 version will be enabled. 
The checksum enable is {matmul_check.get_with_checksum()}, " + f"cooldown is {matmul_check.get_cooldown()}, strikes_num is {matmul_check.get_strikes_num()}, strikes_window is {matmul_check.get_strikes_window()}, " + f"checksum_cooldown is {matmul_check.get_checksum_cooldown()}, upper_thresh1 is {matmul_check.get_upper_thresh1()}, upper_thresh2 is {matmul_check.get_upper_thresh2()}.") + else: + asd_value = os.getenv("NPU_ASD_ENABLE", "0") if torch_npu._C._get_silent_check_version() == 1: - warnings.warn(f"Warning: CANN version lower than 8.0.RC3 and currently does not support silent check 2.0 version or later. It will switch to 1.0 version.") - asd_enable = 0 + if asd_value == "1": + warnings.warn(f"Warning: CANN version lower than 8.0.RC3 and currently does not support silent check 2.0 version or later. It will switch to 1.0 version.") else: - loggerSilent.debug(f"Silent check 3.0 version will be enabled. The asd_detect is {asd_enable}") + if asd_value not in ["0", "1", "2", "3"]: + raise ValueError("NPU_ASD_ENABLE should be 0, 1, 2 or 3. For details, 0 as `ASD closed`, " + "1 as `ASD opened, print error logs`, " + "2 as `ASD opened, print error logs and raise exception`, " + "3 as `ASD opened, print debug logs and raise exception`" + pta_error(ErrCode.VALUE)) + asd_enable = int(asd_value) + if asd_enable: + warnings.warn(f"Warning: Silent check 2.0 version will be enabled. The asd_detect is {asd_enable}. It is recommended to enable silent check v3 using the NPU_ASD_CONFIG.\n" + "Silent data corruption check may take up 1.5GB device memory, please make sure there are enough free space in device. ") + silent_check.set_check_enable(asd_enable) if perf_dump_enable or asd_enable: Module.__call__ = _custom_call -- Gitee From 6da0b4fea9855f19ddd66ab4cc29d8881fa834c5 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 5 Jun 2025 08:59:11 +0000 Subject: [PATCH 038/328] !21615 Update op_plugin commit id Merge pull request !21615 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index be8e6ed93e..bbd6793d47 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit be8e6ed93ed44772b92e5eeaa27c9f59cc26e4ca +Subproject commit bbd6793d47d6103f88f2df97df80d697dc45f9bf -- Gitee From c0b29b5c13e547cde3a683cbdf362fc1ace71a3b Mon Sep 17 00:00:00 2001 From: zyb <12441311+zyb230@user.noreply.gitee.com> Date: Thu, 5 Jun 2025 09:06:53 +0000 Subject: [PATCH 039/328] !21605 fix bug:when AiCoreNone, there is no Device_id column in kernel_details.csv Merge pull request !21605 from zyb/v2.7.1 --- torch_npu/profiler/analysis/prof_common_func/_csv_headers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_common_func/_csv_headers.py b/torch_npu/profiler/analysis/prof_common_func/_csv_headers.py index 09214e6cbe..a98312d62c 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_csv_headers.py +++ b/torch_npu/profiler/analysis/prof_common_func/_csv_headers.py @@ -1,7 +1,7 @@ class CsvHeaders(object): # op_summary TASK_START_TIME = "Task Start Time(us)" - OP_SUMMARY_SHOW_HEADERS = ["Op Name", "OP Type", "Task Type", TASK_START_TIME, "Task Duration(us)", + OP_SUMMARY_SHOW_HEADERS = ["Device_id", "Op Name", "OP Type", "Task Type", TASK_START_TIME, "Task Duration(us)", "Task Wait Time(us)", "Block Dim"] - OP_SUMMARY_KERNEL_BASE_HEADERS = ["Name", "Type", "Accelerator Core", "Start Time(us)", "Duration(us)", + 
OP_SUMMARY_KERNEL_BASE_HEADERS = ["Device_id", "Name", "Type", "Accelerator Core", "Start Time(us)", "Duration(us)", "Wait Time(us)", "Block Dim"] -- Gitee From 2bb3929c91a94a4bc76cca317150fc6d36649aaf Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Thu, 5 Jun 2025 09:37:43 +0000 Subject: [PATCH 040/328] !21603 revert NSLB-DP Merge pull request !21603 from SCh-zx/re27 --- third_party/acl/libs/hccl.cpp | 3 +- third_party/acl/libs/hccl.h | 1 - third_party/hccl/inc/hccl/hccl.h | 2 - torch_npu/__init__.py | 6 --- torch_npu/csrc/distributed/HCCLUtils.hpp | 1 - torch_npu/csrc/distributed/HcclCompile.h | 22 -------- torch_npu/csrc/distributed/Init.cpp | 6 +-- .../csrc/distributed/ProcessGroupHCCL.cpp | 51 ------------------- .../csrc/distributed/ProcessGroupHCCL.hpp | 4 -- 9 files changed, 2 insertions(+), 94 deletions(-) diff --git a/third_party/acl/libs/hccl.cpp b/third_party/acl/libs/hccl.cpp index ebf5b401f3..ef1e23b2b2 100644 --- a/third_party/acl/libs/hccl.cpp +++ b/third_party/acl/libs/hccl.cpp @@ -33,5 +33,4 @@ hcclResult_t HcclScatter(void *sendBuf, void *recvBuf, u64 count, HcclDataType d hcclResult_t HcclBatchSendRecv(HcclSendRecvItemDef* sendRecvInfo, u32 itemNum, hcclComm_t comm, aclrtStream stream) {return HCCL_SUCCESS;} hcclResult_t HcclCommInitAll(u32 ndev, s32 *devices, hcclComm_t *comms) {return HCCL_SUCCESS;} -hcclResult_t HcclCommResume(hcclComm_t comm) {return HCCL_SUCCESS;} -hcclResult_t HcclSetGlobalCommInfo(u32 masterIp, u32 masterPort, u32 totalRankSize, u32 nodeID, u32 localRankSize){return HCCL_SUCCESS;} \ No newline at end of file +hcclResult_t HcclCommResume(hcclComm_t comm) {return HCCL_SUCCESS;} \ No newline at end of file diff --git a/third_party/acl/libs/hccl.h b/third_party/acl/libs/hccl.h index 6c87438c2e..41874cd808 100644 --- a/third_party/acl/libs/hccl.h +++ b/third_party/acl/libs/hccl.h @@ -108,5 +108,4 @@ hcclResult_t HcclScatter(void *sendBuf, void *recvBuf, u64 count, HcclDataType d hcclResult_t HcclBatchSendRecv(HcclSendRecvItemDef* sendRecvInfo, u32 itemNum, hcclComm_t comm, aclrtStream stream); hcclResult_t HcclCommInitAll(u32 ndev, s32 *devices, hcclComm_t *comms); hcclResult_t HcclCommResume(hcclComm_t comm); -hcclResult_t HcclSetGlobalCommInfo(u32 masterIp, u32 masterPort, u32 totalRankSize, u32 nodeID, u32 localRankSize); } diff --git a/third_party/hccl/inc/hccl/hccl.h b/third_party/hccl/inc/hccl/hccl.h index 0401b4a607..4ccda684b3 100644 --- a/third_party/hccl/inc/hccl/hccl.h +++ b/third_party/hccl/inc/hccl/hccl.h @@ -183,8 +183,6 @@ extern HcclResult HcclCommInitAll(uint32_t ndev, int32_t *devices, HcclComm *com extern HcclResult HcclCommResume(HcclComm comm); -extern HcclResult HcclSetGlobalCommInfo(uint32_t masterIp, uint32_t masterPort, uint32_t totalRankSize, uint32_t nodeID, uint32_t localRankSize); - /** * @brief Initialize the comm configuration. * @param config Pointer to the comm configuration that needs to be initialized. 
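For context, the HcclSetGlobalCommInfo interface removed by this revert takes the master address as a host-order 32-bit integer; the likewise-removed caller in ProcessGroupHCCL.cpp derives it from the dotted-quad string roughly as below (a fragment extracted for illustration, using <arpa/inet.h>):

    struct sockaddr_in sa;
    inet_pton(AF_INET, master_addr.c_str(), &(sa.sin_addr));  // parse "x.x.x.x"
    uint32_t masterIp = ntohl(sa.sin_addr.s_addr);            // to host byte order
    uint32_t masterPort = master_port;                        // TCP store port as-is
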
diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index de75030fe5..0e7b9a3b90 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -220,9 +220,6 @@ torch_npu._C._initExtension() def _new_process_group_hccl_helper(dist_backend_opts, pg_options): store = dist_backend_opts.store - store_tcp = store - while hasattr(store_tcp, 'underlying_store') and not hasattr(store_tcp, 'host'): - store_tcp = store_tcp.underlying_store group_rank = dist_backend_opts.group_rank group_size = dist_backend_opts.group_size if pg_options is None or not isinstance(pg_options, torch_npu._C._distributed_c10d.ProcessGroupHCCL.Options): @@ -231,9 +228,6 @@ def _new_process_group_hccl_helper(dist_backend_opts, pg_options): pg_options._timeout = dist_backend_opts.timeout pg_options.global_ranks_in_group = dist_backend_opts.global_ranks_in_group pg_options.group_id = dist_backend_opts.group_id - if (hasattr(store_tcp, 'host') and hasattr(store_tcp, 'port')): - pg_options.master_addr = store_tcp.host - pg_options.master_port = store_tcp.port return torch_npu._C._distributed_c10d.ProcessGroupHCCL(store, group_rank, group_size, pg_options) diff --git a/torch_npu/csrc/distributed/HCCLUtils.hpp b/torch_npu/csrc/distributed/HCCLUtils.hpp index cbc5491735..57c762d3a6 100644 --- a/torch_npu/csrc/distributed/HCCLUtils.hpp +++ b/torch_npu/csrc/distributed/HCCLUtils.hpp @@ -63,7 +63,6 @@ extern HcclResult hcclCommInitRootInfoConfig(uint32_t nRanks, const HcclRootInfo extern HcclResult hcclCommInitClusterInfoConfig(const char *clusterInfo, uint32_t rank, HcclCommConfig *config, HcclComm *comm); extern HcclResult hcclCreateSubCommConfig(HcclComm *comm, uint32_t rankNum, uint32_t *rankIds, uint64_t subCommId, uint32_t subCommRankId, HcclCommConfig* config, HcclComm *subComm); -extern HcclResult hcclSetGlobalCommInfo(uint32_t masterIp, uint32_t masterPort, uint32_t totalRankSize, uint32_t nodeID, uint32_t localRankSize); // Provides additional detail into HCCL error codes based on when these are // thrown in the HCCL codebase. 
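For readers following this revert: the HcclCompile.h hunk below deletes the runtime wrappers (`hcclSetGlobalCommInfoExist()` and `hcclSetGlobalCommInfo()`) that looked the symbol up lazily and treated a missing export as "feature unavailable" rather than a hard error. A minimal sketch of that optional-symbol pattern, assuming libhccl is already loaded into the process and using plain `dlsym` in place of the repository's `GET_FUNC` macro (the real `HcclResult` enum is modeled as `int` here):

```cpp
#include <dlfcn.h>
#include <cstdint>

// Signature mirrors the HcclSetGlobalCommInfo declaration removed above;
// the real return type HcclResult is an int-compatible enum.
using HcclSetGlobalCommInfoFn = int (*)(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t);

static HcclSetGlobalCommInfoFn resolveSetGlobalCommInfo()
{
    // Resolve once from the already-loaded HCCL library. A nullptr result means
    // the installed CANN/HCCL build does not export the symbol, so callers can
    // skip the feature instead of failing hard.
    static auto fn = reinterpret_cast<HcclSetGlobalCommInfoFn>(
        dlsym(RTLD_DEFAULT, "HcclSetGlobalCommInfo"));
    return fn;
}
```

The wrappers being removed followed the same null-check-then-skip idea, only resolving the symbol through the repository's own library-loading helpers instead of a direct `dlsym` call.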
diff --git a/torch_npu/csrc/distributed/HcclCompile.h b/torch_npu/csrc/distributed/HcclCompile.h index c9027922a0..e6358a7b1e 100644 --- a/torch_npu/csrc/distributed/HcclCompile.h +++ b/torch_npu/csrc/distributed/HcclCompile.h @@ -26,7 +26,6 @@ LOAD_FUNCTION(HcclCommInitRootInfoConfig) LOAD_FUNCTION(HcclGetCommConfigCapability) LOAD_FUNCTION(HcclCommInitClusterInfoConfig) LOAD_FUNCTION(HcclCreateSubCommConfig) -LOAD_FUNCTION(HcclSetGlobalCommInfo) extern HcclResult hcclAlltoAllV(const void *sendBuf, const void *sendCounts, const void *sdispls, @@ -260,25 +259,4 @@ HcclResult hcclCreateSubCommConfig(HcclComm *comm, uint32_t rankNum, uint32_t *r auto ret = func(comm, rankNum, rankIds, subCommId, subCommRankId, config, subComm); return ret; } - -bool hcclSetGlobalCommInfoExist() -{ - const static bool isSetGlobalCommInfoExist = []() -> bool { - auto func = GET_FUNC(HcclSetGlobalCommInfo) - return func != nullptr; - }(); - return isSetGlobalCommInfoExist; -} - -HcclResult hcclSetGlobalCommInfo(uint32_t masterIp, uint32_t masterPort, uint32_t totalRankSize, uint32_t nodeID, uint32_t localRankSize) -{ - typedef HcclResult(*HcclSetGlobalCommInfoFunc)(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t); - static HcclSetGlobalCommInfoFunc func = nullptr; - if (func == nullptr) { - func = (HcclSetGlobalCommInfoFunc)GET_FUNC(HcclSetGlobalCommInfo) - } - TORCH_CHECK(func, "Failed to find function ", "HcclSetGlobalCommInfo", DIST_ERROR(ErrCode::NOT_FOUND)); - auto ret = func(masterIp, masterPort, totalRankSize, nodeID, localRankSize); - return ret; -} } // namespace c10d_npu diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 1df6943f62..252dfff952 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -448,11 +448,7 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) &::c10d_npu::ProcessGroupHCCL::Options::global_ranks_in_group) .def_readwrite("hccl_config", &::c10d_npu::ProcessGroupHCCL::Options::hccl_config) .def_readwrite("group_id", - &::c10d_npu::ProcessGroupHCCL::Options::group_id) - .def_readwrite("master_addr", - &::c10d_npu::ProcessGroupHCCL::Options::master_addr) - .def_readwrite("master_port", - &::c10d_npu::ProcessGroupHCCL::Options::master_port); + &::c10d_npu::ProcessGroupHCCL::Options::group_id); // bind for ProcessGroupLCCL auto processGroupLCCL = intrusive_ptr_no_gil_destructor_class_<::c10d_npu::ProcessGroupLCCL>( diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 52bcd255d1..70afe087b8 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -21,8 +21,6 @@ #include #include -#include - #include "op_plugin/OpInterface.h" #include "third_party/acl/inc/acl/acl.h" #include "third_party/acl/inc/acl/acl_base.h" @@ -2131,47 +2129,6 @@ std::vector>& ProcessGroupHCCL::getHCCLComm( return createHCCLComm(devicesKey, devices, commType, commConfig, p2pRank); } -void setGlobalCommInfoToHccl(std::string master_addr, uint32_t master_port) -{ - const char* envPtr; - if (master_addr.empty()) { - ASCEND_LOGI("Failed to get TCP info for hcclSetGlobalCommInfo."); - return; - } - struct sockaddr_in sa; - inet_pton(AF_INET, std::string(master_addr).c_str(), &(sa.sin_addr)); - uint32_t masterIp = ntohl(sa.sin_addr.s_addr); - if (!master_port) { - ASCEND_LOGI("Failed to get TCP info for hcclSetGlobalCommInfo."); - return; - } - uint32_t masterPort = master_port; - envPtr = 
std::getenv("WORLD_SIZE"); - if (envPtr == nullptr) { - ASCEND_LOGI("Failed to get env info for hcclSetGlobalCommInfo."); - return; - } - uint32_t totalRankSize = std::stoi(std::string(envPtr)); - envPtr = std::getenv("GROUP_RANK"); - if (envPtr == nullptr) { - ASCEND_LOGI("Failed to get env info for hcclSetGlobalCommInfo."); - return; - } - uint32_t nodeID = std::stoi(std::string(envPtr)); - envPtr = std::getenv("LOCAL_WORLD_SIZE"); - if (envPtr == nullptr) { - ASCEND_LOGI("Failed to get env info for hcclSetGlobalCommInfo."); - return; - } - uint32_t localRankSize = std::stoi(std::string(envPtr)); - auto info = hcclSetGlobalCommInfo(masterIp, masterPort, totalRankSize, nodeID, localRankSize); - if (info == HCCL_SUCCESS) { - ASCEND_LOGI("Succeeded to set global HCCL communication information."); - } else { - ASCEND_LOGI("Failed to set global HCCL communication information."); - } -} - void ProcessGroupHCCL::createHCCLComm( const std::string& devicesKey, const std::vector& devices, @@ -2196,14 +2153,6 @@ void ProcessGroupHCCL::createHCCLComm( HcclCommConfig config; - if (options_->global_ranks_in_group.empty()) { - if (!hcclSetGlobalCommInfoExist()) { - ASCEND_LOGI("The hcclSetGlobalCommInfo does not exist. Skip it."); - } else { - setGlobalCommInfoToHccl(options_->master_addr, options_->master_port); - } - } - npuGuard.set_index(devices[i].index()); switch (commType) { case HcclCommType::DEFAULT: diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index fe3315196c..747c133953 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -393,10 +393,6 @@ public: std::vector global_ranks_in_group; std::string group_id; - - std::string master_addr; - - uint32_t master_port; }; // If you wish to create multiple process groups, each with a potentially -- Gitee From fb8fce6b3524853d6a85b4531b2b6ebf9657a3b2 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 5 Jun 2025 11:29:16 +0000 Subject: [PATCH 041/328] !21630 Update op_plugin commit id Merge pull request !21630 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index bbd6793d47..74b967b93e 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit bbd6793d47d6103f88f2df97df80d697dc45f9bf +Subproject commit 74b967b93ee35de6e1475a8cc4a6423ad82cd4af -- Gitee From 5ffd36fd8445454c96a083d12b4328d32a41690e Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Thu, 5 Jun 2025 11:33:48 +0000 Subject: [PATCH 042/328] !21610 Release 2.7.1rc1 Merge pull request !21610 from dilililiwhy/271_release_issue --- README.md | 36 +++++++++++++++------------ README.zh.md | 36 +++++++++++++++------------ SECURITYNOTE.md | 54 ++++++++++++++++++++-------------------- ci/docker/ARM/Dockerfile | 26 ++++++++++++------- ci/docker/X86/Dockerfile | 29 ++++++++++++--------- requirements.txt | 2 +- setup.py | 2 ++ test/requirements.txt | 6 ++--- 8 files changed, 107 insertions(+), 84 deletions(-) diff --git a/README.md b/README.md index 6233f73f21..5b585449da 100644 --- a/README.md +++ b/README.md @@ -19,13 +19,13 @@ Install **PyTorch** through pip. **For Aarch64:** ```Python -pip3 install torch==2.1.0 +pip3 install torch==2.7.1 ``` **For x86:** ```Python -pip3 install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu +pip3 install torch==2.7.1+cpu --index-url https://download.pytorch.org/whl/cpu ``` 2. 
**Install torch-npu dependencies** @@ -39,21 +39,19 @@ pip3 install setuptools If the installation fails, use the download link or visit the [PyTorch official website](https://pytorch.org/) to download the installation package of the corresponding version. -| OS arch | Python version | link | -|---------|----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| x86 | Python3.8 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp38-cp38-linux_x86_64.whl#sha256=9e5cfd931a65b38d222755a45dabb53b836be31bc620532bc66fee77e3ff67dc) | -| x86 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp39-cp39-linux_x86_64.whl#sha256=86cc28df491fa84738affe752f9870791026565342f69e4ab63e5b935f00a495) | -| x86 | Python3.10 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=5077921fc2b54e69a534f3a9c0b98493c79a5547c49d46f5e77e42da3610e011) | -| x86 | Python3.11 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=5954924ce74bc7e6a6c811e3fa4bdda9936d9889f6369fd068420c444bfd1cae) | -| aarch64 | Python3.8 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=761822761fffaa1c18a62c5deb13abaa780862577d3eadc428f1daa632536905) | -| aarch64 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=de7d63c6ecece118684415a3dbd4805af4a4c1ee1490cccf7405d8c240a481b4) | -| aarch64 | Python3.10 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=a04a0296d47f28960f51c18c5489a8c3472f624ec3b5bcc8e2096314df8c3342) | -| aarch64 | Python3.11 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=8132efb782cd181cc2dcca5e58effbe4217cdb2581206ac71466d535bf778867) | +| OS arch | Python version | link | +|---------|----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| x86 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.7.1%2Bcpu-cp39-cp39-manylinux_2_28_x86_64.whl#sha256=d205cac087d60bc176bdc0b63a1d00dc7a4ee5ac76fd20a2ca318ac65674167e) | +| x86 | Python3.10 | [link](https://download.pytorch.org/whl/cpu/torch-2.7.1%2Bcpu-cp310-cp310-manylinux_2_28_x86_64.whl#sha256=1f04a373a3f643821f721da9898ef77dce73b5b6bfc64486f0976f7fb5f90e83) | +| x86 | Python3.11 | [link](https://download.pytorch.org/whl/cpu/torch-2.7.1%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl#sha256=a1684793e352f03fa14f78857e55d65de4ada8405ded1da2bf4f452179c4b779) | +| aarch64 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.7.1%2Bcpu-cp39-cp39-manylinux_2_28_aarch64.whl#sha256=a4551cb97b83df5f93fc0d7538332535828581e1db2f179afc287027afbdd6e8) | +| aarch64 | Python3.10 | [link](https://download.pytorch.org/whl/cpu/torch-2.7.1%2Bcpu-cp310-cp310-manylinux_2_28_aarch64.whl#sha256=c0df17cee97653d09a4e84488a33d21217f9b24208583c55cf28f0045aab0766) | +| aarch64 | Python3.11 | 
[link](https://download.pytorch.org/whl/cpu/torch-2.7.1%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl#sha256=5fe6045b8f426bf2d0426e4fe009f1667a954ec2aeb82f1bd0bf60c6d7a85445) | 3. **Install torch-npu** ``` -pip3 install torch-npu==2.1.0.post12 +pip3 install torch-npu==2.7.1rc1 ``` ### From Source @@ -63,7 +61,7 @@ In some special scenarios, users may need to compile **torch-npu** by themselves 1. **Clone torch-npu** ``` - git clone https://github.com/ascend/pytorch.git -b 2.1.0-7.0.0 --depth 1 + git clone https://github.com/ascend/pytorch.git -b 2.7.1 --depth 1 ``` 2. **Build Docker Image** @@ -82,11 +80,11 @@ In some special scenarios, users may need to compile **torch-npu** by themselves 4. **Compile torch-npu** - Take **Python 3.8** as an example. + Take **Python 3.9** as an example. ``` cd /home/pytorch - bash ci/build.sh --python=3.8 + bash ci/build.sh --python=3.9 ``` **Tips** @@ -143,6 +141,8 @@ Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for m | PyTorch2.3.1 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | | PyTorch2.4.0 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | | PyTorch2.5.1 | Python3.9.x,Python3.10.x,Python3.11.x | +| PyTorch2.6.0 | Python3.9.x,Python3.10.x,Python3.11.x | +| PyTorch2.7.1 | Python3.9.x,Python3.10.x,Python3.11.x | ## Ascend Auxiliary Software @@ -150,10 +150,12 @@ Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for m | CANN Version | Supported PyTorch Version | Supported Extension Version | Github Branch | |-----------------------|---------------------------|-----------------------------|-------------------| +| CANN 8.2.RC1.alpha002 | 2.7.1 | 2.7.1rc1 | v2.7.1 | | CANN 8.1.RC1 | 2.5.1 | 2.5.1 | v2.5.1-7.0.0 | | | 2.4.0 | 2.4.0.post4 | v2.4.0-7.0.0 | | | 2.3.1 | 2.3.1.post6 | v2.3.1-7.0.0 | | | 2.1.0 | 2.1.0.post12 | v2.1.0-7.0.0 | +| CANN 8.1.RC1.alpha002 | 2.6.0 | 2.6.0rc1 | v2.6.0 | | CANN 8.0.0 | 2.4.0 | 2.4.0.post2 | v2.4.0-6.0.0 | | | 2.3.1 | 2.3.1.post4 | v2.3.1-6.0.0 | | | 2.1.0 | 2.1.0.post10 | v2.1.0-6.0.0 | @@ -241,6 +243,8 @@ The version branches of AscendPyTorch have the following maintenance phases: | **PyTorch** | **Maintenance Policies** | **Status** | **Launch Date** | **Subsequent Status** | **EOL Date** | |-------------|--------------------------|-------------|-----------------|-------------------------------------------------------------------|--------------| +| 2.7.1 | Regular Release | Development | 2025/06/06 | Expected to enter maintenance status from December 6, 2025 | | +| 2.6.0 | Long Term Support | Development | 2025/03/31 | Expected to enter maintenance status from March 31, 2026 | | | 2.5.1 | Regular Release | Development | 2024/11/08 | Expected to enter maintenance status from April 8, 2025 | | | 2.4.0 | Regular Release | Development | 2024/10/15 | Expected to enter maintenance status from June 15, 2025 | | | 2.3.1 | Regular Release | Development | 2024/06/06 | Expected to enter maintenance status from June 7, 2025 | | diff --git a/README.zh.md b/README.zh.md index f8e4201099..e1efa40b7e 100644 --- a/README.zh.md +++ b/README.zh.md @@ -19,27 +19,25 @@ **aarch64:** ```Python -pip3 install torch==2.1.0 +pip3 install torch==2.7.1 ``` **x86:** ```Python -pip3 install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu +pip3 install torch==2.7.1+cpu --index-url https://download.pytorch.org/whl/cpu ``` 若使用pip命令安装失败,请使用下载链接或进入[PyTorch官方网站](https://pytorch.org/)进行查询下载对应版本。 -| 架构 | Python版本 | 下载链接 | 
-|---------|------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| x86 | Python3.8 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp38-cp38-linux_x86_64.whl#sha256=9e5cfd931a65b38d222755a45dabb53b836be31bc620532bc66fee77e3ff67dc) | -| x86 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp39-cp39-linux_x86_64.whl#sha256=86cc28df491fa84738affe752f9870791026565342f69e4ab63e5b935f00a495) | -| x86 | Python3.10 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=5077921fc2b54e69a534f3a9c0b98493c79a5547c49d46f5e77e42da3610e011) | -| x86 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=5954924ce74bc7e6a6c811e3fa4bdda9936d9889f6369fd068420c444bfd1cae) | -| aarch64 | Python3.8 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=761822761fffaa1c18a62c5deb13abaa780862577d3eadc428f1daa632536905) | -| aarch64 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=de7d63c6ecece118684415a3dbd4805af4a4c1ee1490cccf7405d8c240a481b4) | -| aarch64 | Python3.10 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=a04a0296d47f28960f51c18c5489a8c3472f624ec3b5bcc8e2096314df8c3342) | -| aarch64 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=8132efb782cd181cc2dcca5e58effbe4217cdb2581206ac71466d535bf778867) | +| 架构 | Python版本 | 下载链接 | +|---------|------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| x86 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.7.1%2Bcpu-cp39-cp39-manylinux_2_28_x86_64.whl#sha256=d205cac087d60bc176bdc0b63a1d00dc7a4ee5ac76fd20a2ca318ac65674167e) | +| x86 | Python3.10 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.7.1%2Bcpu-cp310-cp310-manylinux_2_28_x86_64.whl#sha256=1f04a373a3f643821f721da9898ef77dce73b5b6bfc64486f0976f7fb5f90e83) | +| x86 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.7.1%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl#sha256=a1684793e352f03fa14f78857e55d65de4ada8405ded1da2bf4f452179c4b779) | +| aarch64 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.7.1%2Bcpu-cp39-cp39-manylinux_2_28_aarch64.whl#sha256=a4551cb97b83df5f93fc0d7538332535828581e1db2f179afc287027afbdd6e8) | +| aarch64 | Python3.10 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.7.1%2Bcpu-cp310-cp310-manylinux_2_28_aarch64.whl#sha256=c0df17cee97653d09a4e84488a33d21217f9b24208583c55cf28f0045aab0766) | +| aarch64 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.7.1%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl#sha256=5fe6045b8f426bf2d0426e4fe009f1667a954ec2aeb82f1bd0bf60c6d7a85445) | 2. **安装torch_npu依赖** @@ -53,7 +51,7 @@ pip3 install setuptools 3. **安装torch_npu** ``` -pip3 install torch-npu==2.1.0.post12 +pip3 install torch-npu==2.7.1rc1 ``` 如需要保存安装日志,可在pip3 install命令后面加上参数 `--log `,并对您指定的目录``做好权限管控。 @@ -64,7 +62,7 @@ pip3 install torch-npu==2.1.0.post12 1. 
**克隆torch_npu代码仓** ``` - git clone https://gitee.com/ascend/pytorch.git -b v2.1.0-7.0.0 --depth 1 + git clone https://gitee.com/ascend/pytorch.git -b v2.7.1 --depth 1 ``` 2. **构建镜像** @@ -83,11 +81,11 @@ pip3 install torch-npu==2.1.0.post12 4. **编译torch_npu** - 以**Python 3.8** 为例。 + 以**Python 3.9** 为例。 ``` cd /home/pytorch - bash ci/build.sh --python=3.8 + bash ci/build.sh --python=3.9 ``` **提示** @@ -152,6 +150,8 @@ print(z) | PyTorch2.3.1 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | | PyTorch2.4.0 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | | PyTorch2.5.1 | Python3.9.x, Python3.10.x, Python 3.11.x | +| PyTorch2.6.0 | Python3.9.x, Python3.10.x, Python 3.11.x | +| PyTorch2.7.1 | Python3.9.x, Python3.10.x, Python 3.11.x | ## 昇腾辅助软件 @@ -159,10 +159,12 @@ print(z) | CANN版本 | 支持的PyTorch版本 | 支持的Extension版本 | Gitee分支 | |-----------------------|--------------|------------------|-------------------| +| CANN 8.2.RC1.alpha002 | 2.7.1 | 2.7.1rc1 | v2.7.1 | | CANN 8.1.RC1 | 2.5.1 | 2.5.1 | v2.5.1-7.0.0 | | | 2.4.0 | 2.4.0.post4 | v2.4.0-7.0.0 | | | 2.3.1 | 2.3.1.post6 | v2.3.1-7.0.0 | | | 2.1.0 | 2.1.0.post12 | v2.1.0-7.0.0 | +| CANN 8.1.RC1.alpha002 | 2.6.0 | 2.6.0rc1 | v2.6.0 | | CANN 8.0.0 | 2.4.0 | 2.4.0.post2 | v2.4.0-6.0.0 | | | 2.3.1 | 2.3.1.post4 | v2.3.1-6.0.0 | | | 2.1.0 | 2.1.0.post10 | v2.1.0-6.0.0 | @@ -243,6 +245,8 @@ AscendPyTorch版本分支的维护阶段如下: | **PyTorch版本** | **维护策略** | **当前状态** | **发布时间** | **后续状态** | **EOL日期** | |---------------|----------|----------|------------|----------------------|-----------| +| 2.7.1 | 常规分支 | 开发 | 2025/06/06 | 预计2025/12/06起进入维护状态 | - | +| 2.6.0 | 长期支持 | 开发 | 2025/03/31 | 预计2026/03/31起进入维护状态 | - | | 2.5.1 | 常规分支 | 开发 | 2024/11/08 | 预计2025/04/08起进入维护状态 | - | | 2.4.0 | 常规分支 | 开发 | 2024/10/15 | 预计2025/06/15起进入维护状态 | - | | 2.3.1 | 常规分支 | 开发 | 2024/06/06 | 预计2025/06/07起进入维护状态 | | diff --git a/SECURITYNOTE.md b/SECURITYNOTE.md index dff31306e1..bd8d37e135 100644 --- a/SECURITYNOTE.md +++ b/SECURITYNOTE.md @@ -70,33 +70,33 @@ torch_npu支持源码编译安装,在编译时会下载依赖第三方库并 ##### 公网地址 -| 类型 | 开源代码地址 | 文件名 | 公网IP地址/公网URL地址/域名/邮箱地址 | 用途说明 | -|------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------|--------------------------------------------------------------------------------|--------------------------------| -| 自研 | 不涉及 | .gitmodules | https://gitee.com/ascend/op-plugin.git | 依赖的开源代码仓 | -| 自研 | 不涉及 | .gitmodules | https://gitee.com/mirrors/googletest.git | 依赖的开源代码仓 | -| 自研 | 不涉及 | .gitmodules | https://gitee.com/ascend/torchair.git | 依赖的开源代码仓 | -| 自研 | 不涉及 | .gitmodules | https://gitee.com/ascend/Tensorpipe.git | 依赖的开源代码仓 | -| 自研 | 不涉及 | .gitmodules | https://gitee.com/mirrors/fmt.git | 依赖的开源代码仓 | -| 自研 | 不涉及 | ci\docker\X86\Dockerfile | https://mirrors.huaweicloud.com/repository/pypi/simple | docker配置文件,用于配置pip源 | -| 自研 | 不涉及 | ci\docker\X86\Dockerfile | https://download.pytorch.org/whl/cpu | docker配置源,用于配置torch下载连接 | -| 自研 | 不涉及 | ci\docker\ARM\Dockerfile | https://mirrors.huaweicloud.com/repository/pypi/simple | docker配置文件,用于配置pip源 | -| 自研 | 不涉及 | ci\docker\X86\Dockerfile | https://mirrors.wlnmp.com/centos/Centos7-aliyun-altarch.repo | docker配置文件,用于配置yum源 | -| 自研 | 不涉及 | ci\docker\ARM\Dockerfile | https://mirrors.wlnmp.com/centos/Centos7-aliyun-altarch.repo | docker配置文件,用于配置yum源 | -| 自研 | 不涉及 | .github\workflows\\_build-and-test.yml | 
https://mirrors.huaweicloud.com/repository/pypi/simple | workflow配置文件,用于配置pip源 | -| 自研 | 不涉及 | setup.cfg | https://gitee.com/ascend/pytorch | 用于打包whl的url入参 | -| 自研 | 不涉及 | setup.cfg | https://gitee.com/ascend/pytorch/tags | 用于打包whl的download_url入参 | -| 自研 | 不涉及 | third_party\op-plugin\ci\build.sh | https://gitee.com/ascend/pytorch.git | 编译脚本根据torch_npu仓库地址拉取代码进行编译 | -| 自研 | 不涉及 | third_party\op-plugin\ci\exec_ut.sh | https://gitee.com/ascend/pytorch.git | UT脚本根据torch_npu仓库地址下拉取代码进行UT测试 | -| 开源引入 | https://github.com/pytorch/pytorch/blob/main/test/nn/test_convolution.py
https://github.com/pytorch/pytorch/blob/main/test/test_mps.py
https://github.com/pytorch/pytorch/blob/main/test/test_serialization.py | test\url.ini | https://download.pytorch.org/test_data/legacy_conv2d.pt | 用于test脚本下载相关pt文件 | -| 开源引入 | https://github.com/pytorch/pytorch/blob/main/test/test_serialization.py | test\url.ini | https://download.pytorch.org/test_data/legacy_serialized.pt | 用于test脚本下载相关pt文件 | -| 开源引入 | https://github.com/pytorch/pytorch/blob/main/test/test_serialization.py | test\url.ini | https://download.pytorch.org/test_data/gpu_tensors.pt | 用于test脚本下载相关pt文件 | -| 开源引入 | https://github.com/pytorch/pytorch/blob/main/test/onnx/test_utility_funs.py | test\url.ini | https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml | issue的链接 | -| 开源引入 | https://github.com/pytorch/pytorch/blob/main/test/test_nn.py
https://github.com/pytorch/pytorch/blob/main/test/test_serialization.py | test\url.ini | https://download.pytorch.org/test_data/linear.pt | 用于test脚本下载相关pt文件 | -| 自研 | 不涉及 | torch_npu\npu\config.yaml | https://raw.githubusercontent.com/brendangregg/FlameGraph/master/flamegraph.pl | 火焰图脚本下载路径 | -| 自研 | 不涉及 | test\requirements.txt | https://download.pytorch.org/whl/nightly/cpu | 下载链接,用于下载torch-cpu版本 | -| 自研 | 不涉及 | test\requirements.txt | https://data.pyg.org/whl/torch-2.4.0+cpu.html | 下载链接,用于下载torch-scatter的cpu版本 | -| 自研 | 不涉及 | requirements.txt | https://download.pytorch.org/whl/nightly/cpu | 下载链接,用于下载torch-cpu版本 | -| 自研 | 不涉及 | test\get_synchronized_files.sh | https://github.com/pytorch/pytorch.git | 下载链接,用于下载pytorch的测试用例 | +| 类型 | 开源代码地址 | 文件名 | 公网IP地址/公网URL地址/域名/邮箱地址 | 用途说明 | +|------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------|--------------------------------------------------------------------------------|--------------------------------| +| 自研 | 不涉及 | .gitmodules | https://gitee.com/ascend/op-plugin.git | 依赖的开源代码仓 | +| 自研 | 不涉及 | .gitmodules | https://gitee.com/mirrors/googletest.git | 依赖的开源代码仓 | +| 自研 | 不涉及 | .gitmodules | https://gitee.com/ascend/torchair.git | 依赖的开源代码仓 | +| 自研 | 不涉及 | .gitmodules | https://gitee.com/ascend/Tensorpipe.git | 依赖的开源代码仓 | +| 自研 | 不涉及 | .gitmodules | https://gitee.com/mirrors/fmt.git | 依赖的开源代码仓 | +| 自研 | 不涉及 | ci\docker\X86\Dockerfile | https://mirrors.huaweicloud.com/repository/pypi/simple | docker配置文件,用于配置pip源 | +| 自研 | 不涉及 | ci\docker\X86\Dockerfile | https://download.pytorch.org/whl/cpu | docker配置源,用于配置torch下载连接 | +| 自研 | 不涉及 | ci\docker\ARM\Dockerfile | https://mirrors.huaweicloud.com/repository/pypi/simple | docker配置文件,用于配置pip源 | +| 自研 | 不涉及 | ci\docker\X86\Dockerfile | https://mirrors.wlnmp.com/centos/Centos7-aliyun-altarch.repo | docker配置文件,用于配置yum源 | +| 自研 | 不涉及 | ci\docker\ARM\Dockerfile | https://mirrors.wlnmp.com/centos/Centos7-aliyun-altarch.repo | docker配置文件,用于配置yum源 | +| 自研 | 不涉及 | .github\workflows\\_build-and-test.yml | https://mirrors.huaweicloud.com/repository/pypi/simple | workflow配置文件,用于配置pip源 | +| 自研 | 不涉及 | setup.cfg | https://gitee.com/ascend/pytorch | 用于打包whl的url入参 | +| 自研 | 不涉及 | setup.cfg | https://gitee.com/ascend/pytorch/tags | 用于打包whl的download_url入参 | +| 自研 | 不涉及 | third_party\op-plugin\ci\build.sh | https://gitee.com/ascend/pytorch.git | 编译脚本根据torch_npu仓库地址拉取代码进行编译 | +| 自研 | 不涉及 | third_party\op-plugin\ci\exec_ut.sh | https://gitee.com/ascend/pytorch.git | UT脚本根据torch_npu仓库地址下拉取代码进行UT测试 | +| 开源引入 | https://github.com/pytorch/pytorch/blob/v2.7.1/test/nn/test_convolution.py
https://github.com/pytorch/pytorch/blob/v2.7.1/test/test_mps.py
https://github.com/pytorch/pytorch/blob/v2.7.1/test/test_serialization.py | test\url.ini | https://download.pytorch.org/test_data/legacy_conv2d.pt | 用于test脚本下载相关pt文件 | +| 开源引入 | https://github.com/pytorch/pytorch/blob/v2.7.1/test/test_serialization.py | test\url.ini | https://download.pytorch.org/test_data/legacy_serialized.pt | 用于test脚本下载相关pt文件 | +| 开源引入 | https://github.com/pytorch/pytorch/blob/v2.7.1/test/test_serialization.py | test\url.ini | https://download.pytorch.org/test_data/gpu_tensors.pt | 用于test脚本下载相关pt文件 | +| 开源引入 | https://github.com/pytorch/pytorch/blob/v2.7.1/test/onnx/test_utility_funs.py | test\url.ini | https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml | issue的链接 | +| 开源引入 | https://github.com/pytorch/pytorch/blob/v2.7.1/test/test_nn.py
https://github.com/pytorch/pytorch/blob/v2.7.1/test/test_serialization.py | test\url.ini | https://download.pytorch.org/test_data/linear.pt | 用于test脚本下载相关pt文件 | +| 自研 | 不涉及 | torch_npu\npu\config.yaml | https://raw.githubusercontent.com/brendangregg/FlameGraph/master/flamegraph.pl | 火焰图脚本下载路径 | +| 自研 | 不涉及 | test\requirements.txt | https://download.pytorch.org/whl/nightly/cpu | 下载链接,用于下载torch-cpu版本 | +| 自研 | 不涉及 | test\requirements.txt | https://data.pyg.org/whl/torch-2.7.0+cpu.html | 下载链接,用于下载torch-scatter的cpu版本 | +| 自研 | 不涉及 | requirements.txt | https://download.pytorch.org/whl/nightly/cpu | 下载链接,用于下载torch-cpu版本 | +| 自研 | 不涉及 | test\get_synchronized_files.sh | https://github.com/pytorch/pytorch.git | 下载链接,用于下载pytorch的测试用例 | ## 公开接口声明 diff --git a/ci/docker/ARM/Dockerfile b/ci/docker/ARM/Dockerfile index 7457919d3d..5e0abb2e68 100644 --- a/ci/docker/ARM/Dockerfile +++ b/ci/docker/ARM/Dockerfile @@ -1,11 +1,19 @@ -FROM quay.io/pypa/manylinux2014_aarch64:2023-10-07-c1e05d1 +FROM pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.6 -# Set pip +ENV PATH /usr/local/bin:$PATH + +RUN echo "alias ll='ls -l --color=auto'" >> /root/.bashrc + +# Set pip&python RUN cd /usr/local/bin \ - && ln -s /opt/_internal/cpython-3.9.18/bin/pip3.9 pip3.9 \ - && ln -s /opt/_internal/cpython-3.10.13/bin/pip3.10 pip3.10 \ - && ln -s /opt/_internal/cpython-3.11.6/bin/pip3.11 pip3.11 \ - && ln -s python3.9 python3 + && ln -sf /opt/_internal/cpython-3.9.21/bin/pip3.9 pip3.9 \ + && ln -sf /opt/_internal/cpython-3.10.16/bin/pip3.10 pip3.10 \ + && ln -sf /opt/_internal/cpython-3.11.11/bin/pip3.11 pip3.11 \ + && ln -sf /opt/_internal/cpython-3.9.21/bin/pip3.9 pip3 \ + && ln -sf /opt/_internal/cpython-3.9.21/bin/python3.9 python3.9 \ + && ln -sf /opt/_internal/cpython-3.10.16/bin/python3.10 python3.10 \ + && ln -sf /opt/_internal/cpython-3.11.11/bin/python3.11 python3.11 \ + && ln -sf /opt/_internal/cpython-3.9.21/bin/python3.9 python3 # Set pip source RUN mkdir /root/.pip \ @@ -33,15 +41,15 @@ RUN if [ "$CONFIG_FOR_LCOV" = "1" ]; then \ # Install pip package(build) RUN pip3.9 install pyyaml \ - && pip3.9 install torch==2.1.0 \ + && pip3.9 install torch==2.7.1 \ && pip3.9 install numpy==1.21.3 RUN pip3.10 install pyyaml \ - && pip3.10 install torch==2.1.0 \ + && pip3.10 install torch==2.7.1 \ && pip3.10 install numpy==1.21.3 RUN pip3.11 install pyyaml \ - && pip3.11 install torch==2.1.0 \ + && pip3.11 install torch==2.7.1 \ && pip3.11 install numpy==1.23.2 WORKDIR /home diff --git a/ci/docker/X86/Dockerfile b/ci/docker/X86/Dockerfile index 0c234633ca..cb165f478b 100644 --- a/ci/docker/X86/Dockerfile +++ b/ci/docker/X86/Dockerfile @@ -1,14 +1,19 @@ -FROM pytorch/manylinux-builder:cpu-2.1 +FROM pytorch/manylinux2_28-builder:cpu-2.7 -# Install python +ENV PATH /usr/local/bin:$PATH + +RUN echo "alias ll='ls -l --color=auto'" >> /root/.bashrc + +# Set pip&python RUN cd /usr/local/bin \ - && ln -s /opt/_internal/cpython-3.9.0/bin/pip3.9 pip3.9 \ - && ln -s /opt/_internal/cpython-3.10.1/bin/pip3.10 pip3.10 \ - && ln -s /opt/_internal/cpython-3.11.0/bin/pip3.11 pip3.11 \ - && ln -s /opt/_internal/cpython-3.9.0/bin/python3.9 python3.9 \ - && ln -s /opt/_internal/cpython-3.10.1/bin/python3.10 python3.10 \ - && ln -s /opt/_internal/cpython-3.11.0/bin/python3.11 python3.11 \ - && ln -s python3.9 python3 + && ln -sf /opt/_internal/cpython-3.9.21/bin/pip3.9 pip3.9 \ + && ln -sf /opt/_internal/cpython-3.10.16/bin/pip3.10 pip3.10 \ + && ln -sf /opt/_internal/cpython-3.11.11/bin/pip3.11 pip3.11 \ + && ln -sf 
/opt/_internal/cpython-3.9.21/bin/pip3.9 pip3 \ + && ln -sf /opt/_internal/cpython-3.9.21/bin/python3.9 python3.9 \ + && ln -sf /opt/_internal/cpython-3.10.16/bin/python3.10 python3.10 \ + && ln -sf /opt/_internal/cpython-3.11.11/bin/python3.11 python3.11 \ + && ln -sf /opt/_internal/cpython-3.9.21/bin/python3.9 python3 # Set pip source RUN mkdir /root/.pip \ @@ -36,15 +41,15 @@ RUN if [ "$CONFIG_FOR_LCOV" = "1" ]; then \ # Install pip package(build) RUN pip3.9 install pyyaml \ - && pip3.9 install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu \ + && pip3.9 install torch==2.7.1+cpu --index-url https://download.pytorch.org/whl/cpu \ && pip3.9 install numpy==1.21.3 RUN pip3.10 install pyyaml \ - && pip3.10 install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu \ + && pip3.10 install torch==2.7.1+cpu --index-url https://download.pytorch.org/whl/cpu \ && pip3.10 install numpy==1.21.3 RUN pip3.11 install pyyaml \ - && pip3.11 install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu \ + && pip3.11 install torch==2.7.1+cpu --index-url https://download.pytorch.org/whl/cpu \ && pip3.11 install numpy==1.23.2 WORKDIR /home diff --git a/requirements.txt b/requirements.txt index c9dc801a40..7049b44df6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ ---extra-index-url https://download.pytorch.org/whl/test/cpu +--extra-index-url https://download.pytorch.org/whl/cpu pyyaml setuptools diff --git a/setup.py b/setup.py index a5938cbd71..7dafe283ed 100644 --- a/setup.py +++ b/setup.py @@ -622,6 +622,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", ] +requirements = ['torch==2.7.1+cpu' if platform.machine() == 'x86_64' else 'torch==2.7.1'] setup( name=os.environ.get('TORCH_NPU_PACKAGE_NAME', 'torch_npu'), @@ -646,6 +647,7 @@ setup( define_macros=[('_GLIBCXX_USE_CXX11_ABI', '1' if USE_CXX11_ABI else '0'), ('GLIBCXX_USE_CXX11_ABI', '1' if USE_CXX11_ABI else '0')] ), ], + install_requires=requirements, extras_require={ }, package_data={ diff --git a/test/requirements.txt b/test/requirements.txt index 226681ee97..7fcf47fcd3 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -1,11 +1,11 @@ ---extra-index-url https://download.pytorch.org/whl/test/cpu --f https://data.pyg.org/whl/torch-2.6.0+cpu.html +--extra-index-url https://download.pytorch.org/whl/cpu +-f https://data.pyg.org/whl/torch-2.7.0+cpu.html coverage beartype==0.17.0 expecttest==0.1.3 hypothesis -mypy==1.9.0 +mypy==1.14.0 numpy==1.24.4 onnx==1.17.0 onnxruntime==1.18.1 -- Gitee From 68b431b904f5c81609b301305d9092d2f63fd235 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 5 Jun 2025 12:59:15 +0000 Subject: [PATCH 043/328] !21632 Update op_plugin commit id Merge pull request !21632 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 74b967b93e..5fdf4c3756 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 74b967b93ee35de6e1475a8cc4a6423ad82cd4af +Subproject commit 5fdf4c3756d5b1c3ebb774e65a8027162e2dbf4d -- Gitee From dffda0beb445104b8d4ad208a82308fa1703e01a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Thu, 5 Jun 2025 15:55:35 +0000 Subject: [PATCH 044/328] =?UTF-8?q?!21652=20Silent=20check=20v3=20and=20ch?= =?UTF-8?q?ecksum=20fix=20Merge=20pull=20request=20!21652=20from=20?= =?UTF-8?q?=E7=8E=8B=E8=B6=85/v2.7.0=5Fchecksumfix?= MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/asd/asd.py | 43 +++++++++++++++++---------------------- torch_npu/asd/checksum.py | 8 ++++++-- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/torch_npu/asd/asd.py b/torch_npu/asd/asd.py index 33810ffef0..5c8c4e09d9 100644 --- a/torch_npu/asd/asd.py +++ b/torch_npu/asd/asd.py @@ -303,8 +303,6 @@ class _MatmulSilentCheck: self.registered_modules = [] self.matmul_hook_enable = 0 self.matmul_with_bf16 = False - self.check_stream = None - self.check_event = None self.statistic_value = None self.is_outer_call = True # link to checksum @@ -324,12 +322,13 @@ class _MatmulSilentCheck: daemon=True ) self.lock = threading.Lock() - self.queue_len = 8192 + self.queue_len = 1024 self.statistic_cpu_value = None self.name_list = ["" for _ in range(self.queue_len)] self.head_index = 0 self.tail_index = 0 self.history_abnormal_list = [] + self.last_tocpu_time = None # Parameter filtering self.filter_index = -1 self.filter_interval = 3 @@ -404,13 +403,12 @@ class _MatmulSilentCheck: return self.upper_thresh2 def init_stream(self): - if self.check_stream is None: - self.check_stream = torch_npu.npu.Stream() - self.check_event = torch_npu.npu.Event(enable_timing=False) - self.statistic_value = torch.tensor(0., device=torch_npu.npu.current_device()) + if self.statistic_value is None: + self.statistic_value = torch.tensor(0., device=f"npu:{torch_npu.npu.current_device()}") self.checksum_state = 0 self.statistic_cpu_value = torch.zeros((self.queue_len,), device='cpu', dtype=torch.float32).pin_memory() self.statistic_cpu_value.fill_(-1) + self.last_tocpu_time = time.time() if self.store is None: if torch.distributed.is_initialized(): self.store = torch.distributed.distributed_c10d._get_default_store() @@ -449,22 +447,17 @@ class _MatmulSilentCheck: return if self.matmul_hook_enable >= 1: - default_stream = torch_npu.npu.current_stream() - with torch_npu.npu.stream(self.check_stream): - with torch.no_grad(): - self.check_stream.wait_stream(default_stream) - self.statistic_value.fill_(torch.pow(torch.norm(grad, float('inf')), 2).detach().float()) - - #Asynchronously copy the value to host - self.lock.acquire() - self.statistic_cpu_value[self.tail_index].copy_(self.statistic_value.data, non_blocking=True) - self.name_list[self.tail_index] = name - self.tail_index = (self.tail_index + 1) % self.queue_len - self.lock.release() - self.check_event.record(self.check_stream) - if self.tail_index == self.head_index: + with torch.no_grad(): + self.statistic_value.fill_(torch.pow(torch.norm(grad, float('inf')), 2).detach().float()) + + #Asynchronously copy the value to host + self.lock.acquire() + self.statistic_cpu_value[self.tail_index].copy_(self.statistic_value.data, non_blocking=True) + self.name_list[self.tail_index] = name + self.tail_index = (self.tail_index + 1) % self.queue_len + self.lock.release() + if self.tail_index == self.head_index or abs(time.time() - self.last_tocpu_time) >= 60: # The queue is full, synchronize to empty the queue - self.check_event.synchronize() torch_npu.npu.synchronize() def _async_detect(self): @@ -481,6 +474,8 @@ class _MatmulSilentCheck: val = self.statistic_cpu_value[self.head_index].item() name = self.name_list[self.head_index] while val > 0 and name != "": + self.last_tocpu_time = time.time() + loggerSilent.debug(f"[silent data] name:{name}, val: {val}, pre_val: {self.check_stat[name]['pre_val']}, avg: {self.check_stat[name]['avg']}, step: {self.check_stat[name]['step']}, none_zero_step: 
{self.check_stat[name]['none_zero_step']}") result, self.check_stat[name]['avg'], self.check_stat[name]['none_zero_step'] = self._silent_check( val, self.check_stat[name]['pre_val'], self.check_stat[name]['avg'], self.check_stat[name]['none_zero_step'], self.upper_thresh1, self.upper_thresh2 @@ -690,7 +685,7 @@ def _trigger_matmul_decorator(func): def wrapper(a, b, *args, **kwargs): global matmul_check result = func(a, b, *args, **kwargs) - if matmul_check.checksum_enable: + if matmul_check.checksum_enable and a.dtype == torch.bfloat16 and b.dtype == torch.bfloat16: checksum = torch_npu.matmul_checksum(a, b, result) matmul_check.checksum_result.logical_or_(checksum) return result @@ -702,7 +697,7 @@ def _trigger_tensor_matmul_decorator(func): def wrapper(self, other): global matmul_check result = func(self, other) - if matmul_check.checksum_enable: + if matmul_check.checksum_enable and other.dtype == torch.bfloat16 and self.dtype == torch.bfloat16: checksum = torch_npu.matmul_checksum(self, other, result) matmul_check.checksum_result.logical_or_(checksum) return result diff --git a/torch_npu/asd/checksum.py b/torch_npu/asd/checksum.py index 24f38d86af..ab756d6f33 100644 --- a/torch_npu/asd/checksum.py +++ b/torch_npu/asd/checksum.py @@ -36,12 +36,16 @@ def _matmul_checksum(a, b, c): n = c.shape[-1] c_max, _ = torch.max(torch.abs(c), dim=-1) + c_mean = torch.mean(torch.abs(c), dim=-1) c_sum_accum_error = math.sqrt(n * (n + 1) * (2 * n + 1) / 48) * c_max * 2 ** (-t) - c_ele_round_error_accum = c_max * 2 ** (-8) * math.sqrt(n_b) + if torch.min(c_max / c_mean) > 5: + c_ele_round_error_accum = c_max * 2 ** (-8) * math.sqrt(n_b) + else: + c_ele_round_error_accum = c_mean * 2 ** (-8) * n_b b_max, _ = torch.max(torch.abs(b), dim=-1, keepdim=True) delta_1 = math.sqrt(n_b * (n_b + 1) * (2 * n_b + 1) / 48) * b_max * 2 ** (-t) - delta_4 = matmul(torch.abs(a), delta_1).squeeze() + delta_4 = matmul(torch.abs(a), delta_1).squeeze(-1) a_max, _ = torch.max(torch.abs(a), dim=-1) delta_2_3 = math.sqrt((m_b * (m_b + 1) * (m_b + 0.5) + 2 * m_b) / 24) * a_max * torch.max(b_max) * 2 ** (-t) error_total = (c_sum_accum_error + c_ele_round_error_accum + delta_2_3 + delta_4).to(torch.float) -- Gitee From d33df559f6aa3c73c44e9c3a9bed3490faf07566 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 5 Jun 2025 15:59:10 +0000 Subject: [PATCH 045/328] !21657 Update op_plugin commit id Merge pull request !21657 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5fdf4c3756..1c95fd316e 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5fdf4c3756d5b1c3ebb774e65a8027162e2dbf4d +Subproject commit 1c95fd316e9d8b22bfb63fbd965c2d8bbac68a2c -- Gitee From d7166ed34de508655b8dfcfc2def49a987f77142 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Thu, 5 Jun 2025 22:25:36 +0000 Subject: [PATCH 046/328] !21649 Update torchair commit id Merge pull request !21649 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 9d3a02f674..e2533040d2 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 9d3a02f6743134c3e17814cee85770956cf41bda +Subproject commit e2533040d24c7a5764857d260b78f3ea423e48a3 -- Gitee From f641ee11fef3fa98689eeca8f513fe7ac7283b96 Mon Sep 17 00:00:00 2001 From: zyb 
<12441311+zyb230@user.noreply.gitee.com> Date: Fri, 6 Jun 2025 08:54:35 +0000 Subject: [PATCH 047/328] !21664 fix step_trace_time error Merge pull request !21664 from zyb/v2.7.1 --- .../profiler/analysis/prof_view/_trace_step_time_parser.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py index 13c3c73014..41545f7c58 100644 --- a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py @@ -147,6 +147,8 @@ class TraceStepTimeParser(BaseParser): print_time = [] for device, device_time in save_time.items(): for step, step_time in device_time.items(): + if self.step_range and step is None: + continue step_time['comunNotOverlpRec'] = step_time['comunNotOverlp'] - step_time['bubble'] step_time['Overlp'] = step_time['comun'] - step_time['comunNotOverlp'] step_time['stage'] = self.get_e2e_time(step, step_dict.get(device, [])) - step_time['bubble'] @@ -155,7 +157,7 @@ class TraceStepTimeParser(BaseParser): [device, step, step_time['compute'], step_time['comunNotOverlp'], step_time['Overlp'], step_time['comun'], step_time['free'], step_time['stage'], step_time['bubble'], step_time['comunNotOverlpRec'], step_time['prepare']]) - print_time.sort(key=lambda x: (x[0], x[1])) + print_time.sort(key=lambda x: (x[0], int(x[1]))) # step is a string FileManager.create_csv_file(output_path, print_time, file_name, self.title) def run(self, deps_data: dict): -- Gitee From 589f2fcdc6a564412f57603eb2534046d37790ad Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 6 Jun 2025 08:59:29 +0000 Subject: [PATCH 048/328] !21669 Update op_plugin commit id Merge pull request !21669 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 1c95fd316e..1e5fbc432d 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 1c95fd316e9d8b22bfb63fbd965c2d8bbac68a2c +Subproject commit 1e5fbc432d5658bb16bb8b6250c3e8bc40870902 -- Gitee From 8790a4d1c009c4e583095d70b138ed8c8983fa83 Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Fri, 6 Jun 2025 09:23:04 +0000 Subject: [PATCH 049/328] !21618 checkcann Merge pull request !21618 from SCh-zx/cann27 --- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 70afe087b8..935cbba495 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -26,6 +26,7 @@ #include "third_party/acl/inc/acl/acl_base.h" #include "torch_npu/csrc/aten/CustomFunctions.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" +#include "torch_npu/csrc/core/npu/GetCANNInfo.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/core/NPUBridge.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" @@ -287,7 +288,13 @@ void getHcclCommConfig(HcclCommConfig* config, bool isP2P = false) } // Temporarily adding this logic to set deterministic states to avoid a known issues within HCCL. - config->hcclDeterministic = getDeterministicState() ? 
1 : 0; + const std::string baseCannVersion = "8.2.RC1"; + const std::string baseCannModule = "CANN"; + if (IsGteCANNVersion(baseCannVersion, baseCannModule)) { + config->hcclDeterministic = 0xffffffff; + } else { + config->hcclDeterministic = getDeterministicState() ? 1 : 0; + } // Compatible with the size check of the old version of HCCL, forcibly convert // the config object to a size_t=32 object, and retain the N ± 2 version -- Gitee From 128b4b0127fc8132ed0a105c258dd0c8d3fe41d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Fri, 6 Jun 2025 13:31:04 +0000 Subject: [PATCH 050/328] =?UTF-8?q?!21681=20silent=20check=20v3=20fix=20Me?= =?UTF-8?q?rge=20pull=20request=20!21681=20from=20=E7=8E=8B=E8=B6=85/v2.7.?= =?UTF-8?q?0=5Fsilentfix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/asd/asd.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/torch_npu/asd/asd.py b/torch_npu/asd/asd.py index 5c8c4e09d9..6406ec04d4 100644 --- a/torch_npu/asd/asd.py +++ b/torch_npu/asd/asd.py @@ -328,7 +328,6 @@ class _MatmulSilentCheck: self.head_index = 0 self.tail_index = 0 self.history_abnormal_list = [] - self.last_tocpu_time = None # Parameter filtering self.filter_index = -1 self.filter_interval = 3 @@ -408,7 +407,6 @@ class _MatmulSilentCheck: self.checksum_state = 0 self.statistic_cpu_value = torch.zeros((self.queue_len,), device='cpu', dtype=torch.float32).pin_memory() self.statistic_cpu_value.fill_(-1) - self.last_tocpu_time = time.time() if self.store is None: if torch.distributed.is_initialized(): self.store = torch.distributed.distributed_c10d._get_default_store() @@ -456,7 +454,7 @@ class _MatmulSilentCheck: self.name_list[self.tail_index] = name self.tail_index = (self.tail_index + 1) % self.queue_len self.lock.release() - if self.tail_index == self.head_index or abs(time.time() - self.last_tocpu_time) >= 60: + if self.tail_index == self.head_index: # The queue is full, synchronize to empty the queue torch_npu.npu.synchronize() @@ -473,8 +471,7 @@ class _MatmulSilentCheck: self.lock.acquire() val = self.statistic_cpu_value[self.head_index].item() name = self.name_list[self.head_index] - while val > 0 and name != "": - self.last_tocpu_time = time.time() + while val >= 0 and name != "": loggerSilent.debug(f"[silent data] name:{name}, val: {val}, pre_val: {self.check_stat[name]['pre_val']}, avg: {self.check_stat[name]['avg']}, step: {self.check_stat[name]['step']}, none_zero_step: {self.check_stat[name]['none_zero_step']}") result, self.check_stat[name]['avg'], self.check_stat[name]['none_zero_step'] = self._silent_check( val, self.check_stat[name]['pre_val'], self.check_stat[name]['avg'], self.check_stat[name]['none_zero_step'], -- Gitee From d189f2d141ea1e18eec0c6886e16b5c92c156852 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Fri, 6 Jun 2025 22:24:05 +0000 Subject: [PATCH 051/328] !21688 Update torchair commit id Merge pull request !21688 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index e2533040d2..b7d395cb47 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit e2533040d24c7a5764857d260b78f3ea423e48a3 +Subproject commit b7d395cb4781388fa71171bb4d86e5b9b7005399 -- Gitee From a8fce647c07962e8edee39a57e606db4c4cc3fb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Sat, 7 Jun 
2025 15:11:51 +0000 Subject: [PATCH 052/328] =?UTF-8?q?!21691=20silent=20checkv3=20fix=20Merge?= =?UTF-8?q?=20pull=20request=20!21691=20from=20=E7=8E=8B=E8=B6=85/v2.7.0?= =?UTF-8?q?=5Fsilentfix1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/__init__.py | 1 + torch_npu/asd/asd.py | 48 ++++++++++++++++++++++++++++++--------- torch_npu/asd/checksum.py | 14 ++---------- torch_npu/utils/_step.py | 10 ++++---- 4 files changed, 45 insertions(+), 28 deletions(-) diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index 0e7b9a3b90..d84f72b37a 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -266,6 +266,7 @@ def _npu_shutdown(): torch_npu.distributed.distributed_c10d._destructor_process_group() torch_npu._C._npu_shutdown(success) _except_handler.handle_exception() + torch_npu.asd.asd.matmul_check._cleanup() # register npu shutdown hook on exit diff --git a/torch_npu/asd/asd.py b/torch_npu/asd/asd.py index 6406ec04d4..a440e82271 100644 --- a/torch_npu/asd/asd.py +++ b/torch_npu/asd/asd.py @@ -125,6 +125,8 @@ class _SilentFaultDetectorV2: self.min_step = 100 def silent_fault_check(self, idx, asd_flag, grad): + if grad is None: + return if grad.dtype != torch.bfloat16 and grad.dtype != torch.float32: return @@ -459,7 +461,7 @@ class _MatmulSilentCheck: torch_npu.npu.synchronize() def _async_detect(self): - while True: + while self.check_thread_running: if hasattr(torch, "npu") and torch.npu.is_initialized() and torch.distributed.is_initialized(): break time.sleep(10) @@ -467,11 +469,11 @@ class _MatmulSilentCheck: if local_rank.isdigit(): torch.npu.set_device(int(local_rank)) - while True: + while self.check_thread_running: self.lock.acquire() val = self.statistic_cpu_value[self.head_index].item() name = self.name_list[self.head_index] - while val >= 0 and name != "": + while val != -1 and name != "": loggerSilent.debug(f"[silent data] name:{name}, val: {val}, pre_val: {self.check_stat[name]['pre_val']}, avg: {self.check_stat[name]['avg']}, step: {self.check_stat[name]['step']}, none_zero_step: {self.check_stat[name]['none_zero_step']}") result, self.check_stat[name]['avg'], self.check_stat[name]['none_zero_step'] = self._silent_check( val, self.check_stat[name]['pre_val'], self.check_stat[name]['avg'], self.check_stat[name]['none_zero_step'], @@ -483,7 +485,7 @@ class _MatmulSilentCheck: new_abnormal = {'time_str': current_time, 'time': time.time(), 'name': name, - 'rank': torch.distributed.get_rank(), + 'rank': self.rank, 'val': val, 'pre_val': self.check_stat[name]['pre_val'], 'avg': self.check_stat[name]['avg'], @@ -510,7 +512,7 @@ class _MatmulSilentCheck: elif math.isnan(val) or math.isinf(val): return True, avg, none_zero_step else: - if none_zero_step != 0 and avg != 0: + if none_zero_step >= 10 and avg != 0: thres = avg * alpha1 / (1 - 0.99 ** none_zero_step) thres2 = avg * alpha2 / (1 - 0.99 ** none_zero_step) else: @@ -525,12 +527,19 @@ class _MatmulSilentCheck: return False, avg, none_zero_step def _abnormal_process(self, new_abnormal): + counting_abnormal_pos = [] i = len(self.history_abnormal_list) - 1 if i < 0: self._generate_event_log(new_abnormal) self.history_abnormal_list.append(new_abnormal) + if self.strikes_num == 1: + self._generate_warning_log(counting_abnormal_pos, new_abnormal) + new_abnormal['striked'] = True + if self.with_checksum: + self.checksum_state = 1 + if not self.matmul_with_bf16: + warnings.warn(f"Warning: Module has no supported dtype grad, checksum will not to be linked.") return 
- counting_abnormal_pos = [] while i >= 0: old_abnormal = self.history_abnormal_list[i] old_time = old_abnormal['time'] @@ -538,6 +547,14 @@ class _MatmulSilentCheck: if old_abnormal['counted'] and abs(new_time - old_time) >= self.cooldown * 60: # A new counted abnormal self._generate_event_log(new_abnormal) + if self.strikes_num == 1: + self._generate_warning_log(counting_abnormal_pos, new_abnormal) + new_abnormal['striked'] = True + if self.with_checksum: + self.checksum_state = 1 + if not self.matmul_with_bf16: + warnings.warn(f"Warning: Module has no supported dtype grad, checksum will not to be linked.") + break counting_abnormal_pos.append(i) i -= 1 while i >= 0: @@ -601,14 +618,14 @@ class _MatmulSilentCheck: self.store.set(f"rank_{self.rank}_warn_log", current_log + "\n" + warning_str if current_log != "" else warning_str) def _generate_silent_log(self): - warning_str = f"[Warning][Rank {torch.distributed.get_rank()}]: The result of Matmul checksum is abnormal!" + warning_str = f"[Warning][Rank {self.rank}]: The result of Matmul checksum is abnormal!" loggerSilent.warning(warning_str) if self.store is not None and self.rank is not None and self.rank != 0: current_log = self.store.get(f"rank_{self.rank}_warn_log").decode() self.store.set(f"rank_{self.rank}_warn_log", current_log + "\n" + warning_str if current_log != "" else warning_str) def _tcp_comm_checksum_state(self): - while True: + while self.checksum_state_thread_running: if hasattr(torch, "npu") and torch.npu.is_initialized() and torch.distributed.is_initialized() and self.store is not None: break time.sleep(10) @@ -621,7 +638,7 @@ class _MatmulSilentCheck: last_checksum_time = None if self.rank == 0: self.store.add('counter2', world_size) - while True: + while self.checksum_state_thread_running: if self.rank == 0: for i in range(1, world_size): msg = self.store.get(f"rank_{i}_warn_log").decode() @@ -673,6 +690,15 @@ class _MatmulSilentCheck: time.sleep(10) + def _cleanup(self): + if self.check_thread_running: + self.check_thread_running = False + self.check_thread.join() + + if self.checksum_state_thread_running: + self.checksum_state_thread_running = False + self.checksum_state_thread.join() + matmul_check = _MatmulSilentCheck() @@ -715,13 +741,13 @@ def _matmul_silent_check_decorator(func): self.matmul_check_outer = True if not matmul_check.check_thread_running: - matmul_check.check_thread.start() matmul_check.check_thread_running = True + matmul_check.check_thread.start() # 2 for checksum if not matmul_check.checksum_state_thread_running: - matmul_check.checksum_state_thread.start() matmul_check.checksum_state_thread_running = True + matmul_check.checksum_state_thread.start() if matmul_check.with_checksum and not matmul_check.matmul_trigger: torch_npu.asd.checksum.matmul = original_matmul torch.matmul = _trigger_matmul_decorator(original_matmul) diff --git a/torch_npu/asd/checksum.py b/torch_npu/asd/checksum.py index ab756d6f33..cc6832f398 100644 --- a/torch_npu/asd/checksum.py +++ b/torch_npu/asd/checksum.py @@ -24,31 +24,21 @@ def _matmul_checksum(a, b, c): raise TypeError(f"tensor should be torch.Tensor, and device type should be npu" + pta_error(ErrCode.PARAM)) if not isinstance(c, torch.Tensor) or c.device.type != 'npu': raise TypeError(f"tensor should be torch.Tensor, and device type should be npu" + pta_error(ErrCode.PARAM)) - - t = 23 + c_sum = torch.sum(c, dim=-1, dtype=torch.float32) b1 = torch.sum(b, dim=-1, keepdim=True, dtype=torch.float32) c1 = matmul(a.to(torch.float32), b1) c1_trans = c1.squeeze(-1) - 
n_b = b.shape[-1] - m_b = b.shape[0] - n = c.shape[-1] c_max, _ = torch.max(torch.abs(c), dim=-1) c_mean = torch.mean(torch.abs(c), dim=-1) - c_sum_accum_error = math.sqrt(n * (n + 1) * (2 * n + 1) / 48) * c_max * 2 ** (-t) if torch.min(c_max / c_mean) > 5: c_ele_round_error_accum = c_max * 2 ** (-8) * math.sqrt(n_b) else: c_ele_round_error_accum = c_mean * 2 ** (-8) * n_b - b_max, _ = torch.max(torch.abs(b), dim=-1, keepdim=True) - delta_1 = math.sqrt(n_b * (n_b + 1) * (2 * n_b + 1) / 48) * b_max * 2 ** (-t) - delta_4 = matmul(torch.abs(a), delta_1).squeeze(-1) - a_max, _ = torch.max(torch.abs(a), dim=-1) - delta_2_3 = math.sqrt((m_b * (m_b + 1) * (m_b + 0.5) + 2 * m_b) / 24) * a_max * torch.max(b_max) * 2 ** (-t) - error_total = (c_sum_accum_error + c_ele_round_error_accum + delta_2_3 + delta_4).to(torch.float) + error_total = (c_ele_round_error_accum).to(torch.float) error = torch.abs(c_sum - c1_trans) flag = (error - error_total) > 1e-20 diff --git a/torch_npu/utils/_step.py b/torch_npu/utils/_step.py index f08847afb0..9828576caa 100644 --- a/torch_npu/utils/_step.py +++ b/torch_npu/utils/_step.py @@ -201,12 +201,12 @@ def _prase_asd_config(asd_config): else: warnings.warn(f"Warning: NPU_ASD_CONFIG-cooldown is invalid, use the default value of 5.") - # strikes_sum - strikes_sum = asd_config.get("strikes_sum", "3") - if strikes_sum.isdigit() and strikes_sum != "0": - matmul_check.set_strikes_num(int(strikes_sum)) + # strikes_num + strikes_num = asd_config.get("strikes_num", "3") + if strikes_num.isdigit() and strikes_num != "0": + matmul_check.set_strikes_num(int(strikes_num)) else: - warnings.warn(f"Warning: NPU_ASD_CONFIG-strikes_sum is invalid, use the default value of 3.") + warnings.warn(f"Warning: NPU_ASD_CONFIG-strikes_num is invalid, use the default value of 3.") # strikes_window strikes_window = asd_config.get("strikes_window", "480") -- Gitee From 5590636f6dc65b3968e005d877e3a6d3f925477a Mon Sep 17 00:00:00 2001 From: sincatter Date: Mon, 9 Jun 2025 01:43:03 +0000 Subject: [PATCH 053/328] !21596 add PTA interface of npu_moe_eplb_update_expert Merge pull request !21596 from sincatter/v2.7.1 --- test/allowlist_for_publicAPI.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index a7135a4900..0ce5aee4ca 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -2875,7 +2875,8 @@ "npu_nsa_select_attention_infer", "npu_transpose_batchmatmul", "npu_gather_sparse_index", - "npu_moe_distribute_combine_add_rms_norm" + "npu_moe_distribute_combine_add_rms_norm", + "npu_moe_eplb_update_expert" ], "torch_npu.contrib": [ "npu_fused_attention_with_layernorm", -- Gitee From 172a2bc6b0c5af4ca914f550c1aa2be19c530888 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Mon, 9 Jun 2025 06:29:07 +0000 Subject: [PATCH 054/328] !21643 add dumpjson Merge pull request !21643 from huangyunlong/2.7ft5 --- .gitmodules | 3 + CMakeLists.txt | 1 + SECURITYNOTE.md | 1 + third_party/nlohmann | 1 + torch_npu/csrc/distributed/Init.cpp | 16 ++ .../csrc/distributed/ProcessGroupHCCL.cpp | 6 + .../csrc/distributed/ProcessGroupHCCL.hpp | 7 + torch_npu/csrc/distributed/TraceUtils.h | 204 ++++++++++++++---- 8 files changed, 202 insertions(+), 37 deletions(-) create mode 160000 third_party/nlohmann diff --git a/.gitmodules b/.gitmodules index fb139b2f62..a3e7dafc1a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -14,3 +14,6 @@ [submodule "third_party/fmt"] path = third_party/fmt url = 
https://gitee.com/mirrors/fmt.git +[submodule "third_party/nlohmann"] + path = third_party/nlohmann + url = https://gitee.com/mirrors/nlohmann-json.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 5045ab0a21..56c2baf63f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -206,6 +206,7 @@ include_directories(${PROJECT_SOURCE_DIR}/torch_npu/csrc/aten) include_directories(${PROJECT_SOURCE_DIR}/third_party/hccl/inc) include_directories(${PROJECT_SOURCE_DIR}/third_party/acl/inc) include_directories(${PROJECT_SOURCE_DIR}/third_party/Tensorpipe) +include_directories(${PROJECT_SOURCE_DIR}/third_party/nlohmann/include) # Set installed PyTorch dir if(DEFINED PYTORCH_INSTALL_DIR) diff --git a/SECURITYNOTE.md b/SECURITYNOTE.md index bd8d37e135..079a24bd3b 100644 --- a/SECURITYNOTE.md +++ b/SECURITYNOTE.md @@ -77,6 +77,7 @@ torch_npu支持源码编译安装,在编译时会下载依赖第三方库并 | 自研 | 不涉及 | .gitmodules | https://gitee.com/ascend/torchair.git | 依赖的开源代码仓 | | 自研 | 不涉及 | .gitmodules | https://gitee.com/ascend/Tensorpipe.git | 依赖的开源代码仓 | | 自研 | 不涉及 | .gitmodules | https://gitee.com/mirrors/fmt.git | 依赖的开源代码仓 | +| 自研 | 不涉及 | .gitmodules | https://gitee.com/mirrors/nlohmann-json.git | 依赖的开源代码仓 | | 自研 | 不涉及 | ci\docker\X86\Dockerfile | https://mirrors.huaweicloud.com/repository/pypi/simple | docker配置文件,用于配置pip源 | | 自研 | 不涉及 | ci\docker\X86\Dockerfile | https://download.pytorch.org/whl/cpu | docker配置源,用于配置torch下载连接 | | 自研 | 不涉及 | ci\docker\ARM\Dockerfile | https://mirrors.huaweicloud.com/repository/pypi/simple | docker配置文件,用于配置pip源 | diff --git a/third_party/nlohmann b/third_party/nlohmann new file mode 160000 index 0000000000..87cda1d664 --- /dev/null +++ b/third_party/nlohmann @@ -0,0 +1 @@ +Subproject commit 87cda1d6646592ac5866dc703c8e1839046a6806 diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 252dfff952..6e635abe91 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -532,6 +532,22 @@ Example:: py::arg("wait_workers") = true, py::arg("multi_tenant") = false); + module.def("_dump_hccl_trace_json", + [](std::optional includeCollectives, + std::optional onlyActive) { + return py::bytes(::c10d_npu::dump_hccl_trace_json( + includeCollectives.value_or(true), onlyActive.value_or(false))); + }, + py::arg("includeCollectives") = std::optional(), + py::arg("onlyActive") = std::optional(), + R"( + Arguments: + includeCollectives(bool, optional): Whether to include collective work traces. Default is True. + onlyActive (bool, optional): Whether to only include active collective work traces. Default is False. + Returns: + Stringified json work traces. + Default settings return everything - i.e. contains HCCL comm dumps and collective traces. 
+ )"); module.def("_dump_hccl_trace", [](std::optional includeCollectives, std::optional includeStackTraces, diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 935cbba495..dc21217858 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -431,6 +431,12 @@ std::string dump_hccl_trace( c10::nullopt, includeCollectives, includeStackTraces, onlyActive); } +std::string dump_hccl_trace_json(bool includeCollectives, bool onlyActive) +{ + return HCCLTraceBuffer::get()->dump_json( + c10::nullopt, includeCollectives, onlyActive); +} + c10::optional)>> &get_cpp_trace_dumper() { static c10::optional< diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index 747c133953..f129326e1a 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -1075,6 +1075,13 @@ TORCH_API std::string dump_hccl_trace( bool includeStackTraces, bool onlyActive); +// Dumps the HCCL comm traces and additional information about the Process +// Group in JSON formatted string. +// We don't include stack traces in JSON format as it is far too much data. +TORCH_API std::string dump_hccl_trace_json( + bool includeCollectives, + bool onlyActive); + // Gets a mutable reference to a global optional function.Heartbeat Monitor // will use this function to dump traces, if available. Inside fbcode, we // store a function here that uses an internal tool for process tracing diff --git a/torch_npu/csrc/distributed/TraceUtils.h b/torch_npu/csrc/distributed/TraceUtils.h index e5e4ed7957..9d4f9d9d52 100644 --- a/torch_npu/csrc/distributed/TraceUtils.h +++ b/torch_npu/csrc/distributed/TraceUtils.h @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -22,38 +23,46 @@ namespace c10d_npu { - static c10::IValue entries_key = "entries"; - static c10::IValue hccl_comm_key = "hccl_comm_state"; - static c10::IValue version_key = "version"; - // Update whenever changing contents or formatting of the dump - // (minor when adding fields, major when changing existing fields) - static c10::IValue version_val = "2.4"; - static c10::IValue pg_config_key = "pg_config"; - static c10::IValue pg_status_key = "pg_status"; - static c10::IValue record_id_key = "record_id"; - static c10::IValue pg_id_key = "pg_id"; - static c10::IValue pg_name_key = "process_group"; - static c10::IValue collective_seq_id_key = "collective_seq_id"; - static c10::IValue p2p_seq_id_key = "p2p_seq_id"; - static c10::IValue is_p2p_key = "is_p2p"; - static c10::IValue op_id_key = "op_id"; - static c10::IValue profiling_name_key = "profiling_name"; - static c10::IValue input_sizes_key = "input_sizes"; - static c10::IValue input_dtypes_key = "input_dtypes"; - static c10::IValue output_sizes_key = "output_sizes"; - static c10::IValue output_dtypes_key = "output_dtypes"; - static c10::IValue time_created_key = "time_created_ns"; - static c10::IValue duration_key = "duration_ms"; - static c10::IValue timeout_key = "timeout_ms"; - static c10::IValue frames_key = "frames"; - static c10::IValue state_key = "state"; - static c10::IValue line_key = "line"; - static c10::IValue name_key = "name"; - static c10::IValue filename_key = "filename"; - static c10::IValue retired_key = "retired"; - static c10::IValue time_discovered_started_key = "time_discovered_started_ns"; - static c10::IValue time_discovered_completed_key = - 
"time_discovered_completed_ns"; +#define DEFINE_CONSTANT(name, value) \ + static c10::IValue name = value; \ + static std::string name##_str = value; +// Update whenever changing contents or formatting of the dump +// (minor when adding fields, major when changing existing fields) +// Also update both JSON and Pickle dumps to make use of the newly defined +// field(s). +DEFINE_CONSTANT(version_val, "2.4") +DEFINE_CONSTANT(entries_key, "entries") +DEFINE_CONSTANT(hccl_comm_key, "hccl_comm_state") +DEFINE_CONSTANT(version_key, "version") +DEFINE_CONSTANT(pg_config_key, "pg_config") +DEFINE_CONSTANT(pg_status_key, "pg_status") +DEFINE_CONSTANT(record_id_key, "record_id") +DEFINE_CONSTANT(pg_id_key, "pg_id") +DEFINE_CONSTANT(pg_name_key, "process_group") +DEFINE_CONSTANT(collective_seq_id_key, "collective_seq_id") +DEFINE_CONSTANT(p2p_seq_id_key, "p2p_seq_id") +DEFINE_CONSTANT(is_p2p_key, "is_p2p") +DEFINE_CONSTANT(op_id_key, "op_id") +DEFINE_CONSTANT(profiling_name_key, "profiling_name") +DEFINE_CONSTANT(input_sizes_key, "input_sizes") +DEFINE_CONSTANT(input_dtypes_key, "input_dtypes") +DEFINE_CONSTANT(output_sizes_key, "output_sizes") +DEFINE_CONSTANT(output_dtypes_key, "output_dtypes") +DEFINE_CONSTANT(time_created_key, "time_created_ns") +DEFINE_CONSTANT(duration_key, "duration_ms") +DEFINE_CONSTANT(timeout_key, "timeout_ms") +DEFINE_CONSTANT(frames_key, "frames") +DEFINE_CONSTANT(state_key, "state") +DEFINE_CONSTANT(line_key, "line") +DEFINE_CONSTANT(name_key, "name") +DEFINE_CONSTANT(filename_key, "filename") +DEFINE_CONSTANT(retired_key, "retired") +DEFINE_CONSTANT(time_discovered_started_key, "time_discovered_started_ns") +DEFINE_CONSTANT(time_discovered_completed_key, "time_discovered_completed_ns") +DEFINE_CONSTANT(completed_state, "completed") +DEFINE_CONSTANT(scheduled_state, "scheduled") +DEFINE_CONSTANT(started_state, "started") +#undef DEFINE_CONSTANT /* Trace Utils Related to TORCH_HCCL_DESYNC_DEBUG */ @@ -334,9 +343,9 @@ namespace c10d_npu { std::optional time_discovered_completed_; // size information for input/output tensors - c10::SmallVector input_dims_; + c10::SmallVector input_dims_; std::vector input_dtypes_; - c10::SmallVector output_dims_; + c10::SmallVector output_dims_; std::vector output_dtypes_; c10::SmallVector sizes_; // flattened from inputs, outputs bool retired_ = false; // is this work entry no longer in the workMetaList_? 
@@ -398,14 +407,14 @@ namespace c10d_npu { for (const auto &input : inputs) { c10::IntArrayRef sizes = input.sizes(); te.input_dtypes_.push_back(input.dtype().toScalarType()); - te.input_dims_.push_back(sizes.size()); + te.input_dims_.push_back(static_cast(sizes.size())); te.sizes_.insert(te.sizes_.end(), sizes.begin(), sizes.end()); } for (const auto &output : outputs) { c10::IntArrayRef sizes = output.sizes(); te.output_dtypes_.push_back(output.dtype().toScalarType()); - te.output_dims_.push_back(sizes.size()); + te.output_dims_.push_back(static_cast(sizes.size())); te.sizes_.insert(te.sizes_.end(), sizes.begin(), sizes.end()); } @@ -575,7 +584,7 @@ namespace c10d_npu { } auto it = e.sizes_.begin(); - auto read_sizes = [&](const c10::SmallVector &dims) { + auto read_sizes = [&](const c10::SmallVector &dims) { auto sizes = new_list(); for (auto dim : dims) { auto arg_sizes = new_list(); @@ -643,6 +652,19 @@ namespace c10d_npu { return pg_config; } + const std::map> getPgConfigJson() + { + std::map> result; + for (const auto& [pg_name, ranks] : pg_name_to_ranks_) { + auto pg_info = std::map(); + pg_info["name"] = std::get<0>(pg_name); + pg_info["desc"] = std::get<1>(pg_name); + pg_info["ranks"] = ranks_str(ranks); + result.emplace(std::get<0>(pg_name), pg_info); + } + return result; + } + const c10::Dict getPgStatus() { auto all_pg_status = new_dict(); @@ -656,6 +678,114 @@ namespace c10d_npu { return all_pg_status; } + const std::map> getPgStatusJson() + { + std::map> result; + for (const auto& [pg_id, status] : all_pg_status_) { + auto pg_status = std::map(); + pg_status["last_enqueued_collective"] = + std::to_string(status->lastEnqueuedSeq); + pg_status["last_started_collective"] = + std::to_string(status->lastStartedSeq); + pg_status["last_completed_collective"] = + std::to_string(status->lastCompletedSeq); + result[std::to_string(pg_id)] = pg_status; + } + return result; + } + + std::string dump_json( + const c10::optional>>& hcclDumpMap, + bool includeCollectives, + bool onlyActive) + { + using json = nlohmann::json; + json result; + result[version_key_str] = version_val_str; + result[pg_config_key_str] = getPgConfigJson(); + result[pg_status_key_str] = getPgStatusJson(); + + // collective trace + if (includeCollectives) { + std::list entries; + for (auto& e : dump_entries()) { + json j; + if (onlyActive && e.time_discovered_completed_.has_value()) { + continue; + } + j[record_id_key_str] = int64_t(e.id_); + j[pg_id_key_str] = int64_t(e.pg_id_); + j[pg_name_key_str] = e.pg_name_; + j[collective_seq_id_key_str] = int64_t(e.collective_seq_id_); + j[p2p_seq_id_key_str] = int64_t(e.p2p_seq_id_); + j[op_id_key_str] = int64_t(e.op_id_); + j[profiling_name_key_str] = e.profiling_name_; + j[time_created_key_str] = int64_t(e.time_created_); + if (e.duration_) { + j[duration_key_str] = *e.duration_; + } + auto it = e.sizes_.begin(); + auto read_sizes = [&](const c10::SmallVector& dims) { + auto sizes = std::list>(); + for (auto dim : dims) { + auto arg_sizes = std::list(); + for (auto i : c10::irange(dim)) { + (void)i; + arg_sizes.push_back(*it++); + } + sizes.push_back(arg_sizes); + } + return sizes; + }; + j[input_sizes_key_str] = read_sizes(e.input_dims_); + std::vector input_dtypes_strs; + input_dtypes_strs.reserve(e.input_dtypes_.size()); + for (const auto& input_dtype : e.input_dtypes_) { + input_dtypes_strs.emplace_back(c10::toString(input_dtype)); + } + j[input_dtypes_key_str] = input_dtypes_strs; + j[output_sizes_key_str] = read_sizes(e.output_dims_); + std::vector output_dtypes_strs; 
+ output_dtypes_strs.reserve(e.output_dtypes_.size()); + for (const auto& output_dtype : e.output_dtypes_) { + output_dtypes_strs.emplace_back(c10::toString(output_dtype)); + } + j[output_dtypes_key_str] = output_dtypes_strs; + if (e.time_discovered_completed_.has_value()) { + j[state_key_str] = completed_state_str; + } else if (e.time_discovered_started_.has_value()) { + j[state_key_str] = started_state_str; + } else { + j[state_key_str] = scheduled_state_str; + } + j[time_discovered_started_key_str] = + e.time_discovered_started_.has_value() + ? int64_t(*e.time_discovered_started_) + : 0; + j[time_discovered_completed_key_str] = + e.time_discovered_completed_.has_value() + ? int64_t(*e.time_discovered_completed_) + : 0; + j[retired_key_str] = e.retired_; + j[timeout_key_str] = e.timeout_ms_; + j[is_p2p_key_str] = e.isP2P_; + entries.emplace_back(j); + } + + if (!entries.empty()) { + result[entries_key_str] = entries; + } + } + + if (hcclDumpMap.has_value()) { + result[hccl_comm_key_str] = hcclDumpMap.value(); + } + + return result.dump(); + } + // dump all collectives + hcclDumpMap std::string dump( const c10::optional Date: Mon, 9 Jun 2025 07:21:06 +0000 Subject: [PATCH 055/328] !21700 Sort by Device_id when step is None in step_trace_time.csv Merge pull request !21700 from zyb/v2.7.1 --- .../profiler/analysis/prof_view/_trace_step_time_parser.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py index 41545f7c58..744e2cd8a6 100644 --- a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py @@ -157,7 +157,11 @@ class TraceStepTimeParser(BaseParser): [device, step, step_time['compute'], step_time['comunNotOverlp'], step_time['Overlp'], step_time['comun'], step_time['free'], step_time['stage'], step_time['bubble'], step_time['comunNotOverlpRec'], step_time['prepare']]) - print_time.sort(key=lambda x: (x[0], int(x[1]))) # step is a string + if print_time: + if self.step_range: + print_time.sort(key=lambda x: (x[0], int(x[1]))) # step is a string + else: + print_time.sort(key=lambda x: x[0]) # step is None FileManager.create_csv_file(output_path, print_time, file_name, self.title) def run(self, deps_data: dict): -- Gitee From 4bfdbaa900b93ee4ed91002bdee97f46749953f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Mon, 9 Jun 2025 08:12:54 +0000 Subject: [PATCH 056/328] =?UTF-8?q?!21560=20support=20HCCL=5FOP=5FRETRY=5F?= =?UTF-8?q?FAILED=20with=20ACL=5FERROR=5FRT=5FCOMM=5FOP=5FRETRY=5FFAIL=20M?= =?UTF-8?q?erge=20pull=20request=20!21560=20from=20=E7=8E=8B=E8=B6=85/v2.7?= =?UTF-8?q?.0=5Fhcclstepretry?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- third_party/acl/inc/acl/acl_base.h | 1 + torch_npu/csrc/core/npu/NPUException.cpp | 9 ++++++++- torch_npu/csrc/core/npu/NPUException.h | 3 +++ torch_npu/csrc/core/npu/NPUQueue.cpp | 6 +++++- torch_npu/csrc/core/npu/NPUQueue.h | 1 + torch_npu/csrc/framework/OpParamMaker.cpp | 4 ++-- 6 files changed, 20 insertions(+), 4 deletions(-) diff --git a/third_party/acl/inc/acl/acl_base.h b/third_party/acl/inc/acl/acl_base.h index cbcf87b0fc..b8ef9dbd34 100755 --- a/third_party/acl/inc/acl/acl_base.h +++ b/third_party/acl/inc/acl/acl_base.h @@ -138,6 +138,7 @@ static const int ACL_ERROR_RT_DEVICE_MEM_ERROR = 507053; static const int 
ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR = 507054; static const int ACL_ERROR_RT_SUSPECT_DEVICE_MEM_ERROR = 507055; static const int ACL_ERROR_RT_LINK_ERROR = 507056; +static const int ACL_ERROR_RT_COMM_OP_RETRY_FAIL = 507904; #define ACL_TENSOR_SHAPE_RANGE_NUM 2 #define ACL_TENSOR_VALUE_RANGE_NUM 2 diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index 034726549b..ab139f53b4 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -84,7 +84,8 @@ std::unordered_map> errCodeHandlerMap = { {ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR, std::bind(&handleHbmMultiBitEccError, std::placeholders::_1)}, {ACL_ERROR_RT_DEVICE_MEM_ERROR, std::bind(&handleDeviceMemError, std::placeholders::_1)}, {ACL_ERROR_RT_SUSPECT_DEVICE_MEM_ERROR, std::bind(&handleSuspectDeviceMemError, std::placeholders::_1)}, - {ACL_ERROR_RT_LINK_ERROR, std::bind(&handleLinkError, std::placeholders::_1)} + {ACL_ERROR_RT_LINK_ERROR, std::bind(&handleLinkError, std::placeholders::_1)}, + {ACL_ERROR_RT_COMM_OP_RETRY_FAIL, std::bind(&handleHcclOpRetryFailed, std::placeholders::_1)} }; MemUceInfo memUceInfo; @@ -244,6 +245,12 @@ std::string handleLinkError(int errorCode) return "HCCS LINK ERROR"; } +std::string handleHcclOpRetryFailed(int errorCode) +{ + ASCEND_LOGE("getRepoStopFlag in Run, throw HCCL OP RETRY FAILED."); + return "HCCL OP RETRY FAILED"; +} + std::string handleDeviceError(int errorCode) { auto handlerIter = errCodeHandlerMap.find(errorCode); diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h index 94e38a5edb..a82f8f1568 100644 --- a/torch_npu/csrc/core/npu/NPUException.h +++ b/torch_npu/csrc/core/npu/NPUException.h @@ -96,6 +96,7 @@ std::string formatErrorCode(SubModule submodule, ErrCode errorCode); #define DEVICE_HBM_ECC_ERROR "reason=[hbm Multi-bit ECC error]" #define SUSPECT_DEVICE_MEM_ERROR "reason=[suspect device mem error]" #define HCCS_LINK_ERROR "reason=[link error]" +#define HCCL_OP_RETRY_FAILED "reason=[hccl op retry failed]" inline const char* getErrorFunction(const char* msg) { @@ -275,6 +276,8 @@ std::string handleSuspectDeviceMemError(int errorCode); std::string handleLinkError(int errorCode); +std::string handleHcclOpRetryFailed(int errorCode); + std::string handleDeviceError(int errorCode); } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 48b83d9720..7767dda6b8 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -174,7 +174,8 @@ std::unordered_map deviceErrorMap = { {RepoStatus::HBM_ECC_EXIT, "HBM MULTI BIT ECC ERROR"}, {RepoStatus::STOP_EXIT, "FORCE STOP"}, {RepoStatus::SUSPECT_MEM_EXIT, "SUSPECT MEM ERROR"}, - {RepoStatus::HCCS_LINK_EXIT, "HCCS LINK ERROR"} + {RepoStatus::HCCS_LINK_EXIT, "HCCS LINK ERROR"}, + {RepoStatus::HCCL_OP_RETRY_EXIT, "HCCL OP RETRY FAILED"} }; std::string get_func_error_msg(void *error_paras) @@ -376,6 +377,9 @@ void Repository::CheckDeviceError(int ret, std::string& err_msg) } else if (ret == ACL_ERROR_RT_LINK_ERROR || acl_error.find(HCCS_LINK_ERROR) != std::string::npos) { ASCEND_LOGE("HCCS LINK ERROR happened, set task queue status to HCCS_LINK_EXIT"); SetStatus(HCCS_LINK_EXIT); + } else if (ret == ACL_ERROR_RT_COMM_OP_RETRY_FAIL || acl_error.find(HCCL_OP_RETRY_FAILED) != std::string::npos) { + ASCEND_LOGE("HCCL OP RETRY FAILED happened, set task queue status to HCCL_OP_RETRY_EXIT"); + SetStatus(HCCL_OP_RETRY_EXIT); } else if (GetStatus() 
!= STOP_EXIT) { SetStatus(ERROR_EXIT); } diff --git a/torch_npu/csrc/core/npu/NPUQueue.h b/torch_npu/csrc/core/npu/NPUQueue.h index 460a3cb755..0ef5609040 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.h +++ b/torch_npu/csrc/core/npu/NPUQueue.h @@ -27,6 +27,7 @@ enum RepoStatus { HBM_ECC_EXIT = 7, SUSPECT_MEM_EXIT = 8, HCCS_LINK_EXIT = 9, + HCCL_OP_RETRY_EXIT = 10, }; // c10::SmallVector max size diff --git a/torch_npu/csrc/framework/OpParamMaker.cpp b/torch_npu/csrc/framework/OpParamMaker.cpp index 6f88222c00..1766af9c99 100644 --- a/torch_npu/csrc/framework/OpParamMaker.cpp +++ b/torch_npu/csrc/framework/OpParamMaker.cpp @@ -336,7 +336,7 @@ int ExecFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) ret = cur_paras->customHandler(); } catch (std::exception &e) { if (ContainsAny(std::string(e.what()), {DEVICE_TASK_ABORT, DEVICE_MEM_ERROR, DEVICE_HBM_ECC_ERROR, - SUSPECT_DEVICE_MEM_ERROR, HCCS_LINK_ERROR})) { + SUSPECT_DEVICE_MEM_ERROR, HCCS_LINK_ERROR, HCCL_OP_RETRY_FAILED})) { ret = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); } else { ret = ACL_ERROR_INVALID_PARAM; @@ -422,7 +422,7 @@ int ExecFuncOpApi(c10_npu::queue::QueueParas *in, aclrtStream stream) ret = cur_paras->customHandler(); } catch (std::exception &e) { if (ContainsAny(std::string(e.what()), {DEVICE_TASK_ABORT, DEVICE_MEM_ERROR, DEVICE_HBM_ECC_ERROR, - SUSPECT_DEVICE_MEM_ERROR, HCCS_LINK_ERROR})) { + SUSPECT_DEVICE_MEM_ERROR, HCCS_LINK_ERROR, HCCL_OP_RETRY_FAILED})) { ret = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); } else { ret = ACL_ERROR_INVALID_PARAM; -- Gitee From b73c1d922159522f18990c943d83fe4d3fa358d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Mon, 9 Jun 2025 08:32:07 +0000 Subject: [PATCH 057/328] =?UTF-8?q?!21719=20silent=20check=20v3=20fix=20fo?= =?UTF-8?q?r=20tcpstore=20thread=20Merge=20pull=20request=20!21719=20from?= =?UTF-8?q?=20=E7=8E=8B=E8=B6=85/v2.7.0=5Fsilentfix2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/asd/asd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/asd/asd.py b/torch_npu/asd/asd.py index a440e82271..5ee262366e 100644 --- a/torch_npu/asd/asd.py +++ b/torch_npu/asd/asd.py @@ -659,7 +659,7 @@ class _MatmulSilentCheck: self.store.add('counter2', 0 - world_size) self.store.add('counter', 1) - while int(self.store.get('counter').decode()) < world_size: + while int(self.store.get('counter').decode()) < world_size and self.checksum_state_thread_running: time.sleep(0.1) global_state = int(self.store.get('checksum_state').decode()) @@ -681,7 +681,7 @@ class _MatmulSilentCheck: self.checksum_state = 0 self.store.add('counter2', 1) - while int(self.store.get('counter2').decode()) < world_size: + while int(self.store.get('counter2').decode()) < world_size and self.checksum_state_thread_running: time.sleep(0.1) if self.rank == 0: -- Gitee From 111f66441e57206192ea941f9a11dfb5dff1bd5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Mon, 9 Jun 2025 08:39:29 +0000 Subject: [PATCH 058/328] =?UTF-8?q?!21708=20logging=20simplification=20Mer?= =?UTF-8?q?ge=20pull=20request=20!21708=20from=20=E9=97=AB=E9=B9=8F?= =?UTF-8?q?=E5=85=A8/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUStream.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index 
d3a72cb277..35e6e526b1 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -550,8 +550,11 @@ void setCurrentNPUStream(NPUStream stream) initNPUStreamsOnce(); auto ptr = NPUStream_internals(stream); AT_ASSERT(ptr, PTA_ERROR(ErrCode::PTR)); - ASCEND_LOGI("Exchange NPU current stream from stream = %p to stream = %p", - current_streams[ptr->device_index]->stream, ptr->stream); + if (current_streams[ptr->device_index]->stream != ptr->stream) { + ASCEND_LOGI("Exchange NPU current stream from stream = %p to stream = %p", + current_streams[ptr->device_index]->stream, ptr->stream); + } + current_streams[ptr->device_index] = ptr; } -- Gitee From f8ee01a79be466dd5231a78121996eef26b7a698 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 9 Jun 2025 09:14:17 +0000 Subject: [PATCH 059/328] !21735 Update op_plugin commit id Merge pull request !21735 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 1e5fbc432d..1875c2aec9 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 1e5fbc432d5658bb16bb8b6250c3e8bc40870902 +Subproject commit 1875c2aec960155f3eed3fac8c98b26d6c23c41b -- Gitee From 7d943c11ade4eb074a8f82a0cbeb70711259faf5 Mon Sep 17 00:00:00 2001 From: shaoyf Date: Mon, 9 Jun 2025 11:41:47 +0000 Subject: [PATCH 060/328] =?UTF-8?q?!21742=20=E5=9B=9E=E9=80=80=20'Pull=20R?= =?UTF-8?q?equest=20!21618=20:=20checkcann'=20Merge=20pull=20request=20!21?= =?UTF-8?q?742=20from=20shaoyf/revert-merge-21618-v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index dc21217858..8041df86e3 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -26,7 +26,6 @@ #include "third_party/acl/inc/acl/acl_base.h" #include "torch_npu/csrc/aten/CustomFunctions.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" -#include "torch_npu/csrc/core/npu/GetCANNInfo.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/core/NPUBridge.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" @@ -288,13 +287,7 @@ void getHcclCommConfig(HcclCommConfig* config, bool isP2P = false) } // Temporarily adding this logic to set deterministic states to avoid a known issues within HCCL. - const std::string baseCannVersion = "8.2.RC1"; - const std::string baseCannModule = "CANN"; - if (IsGteCANNVersion(baseCannVersion, baseCannModule)) { - config->hcclDeterministic = 0xffffffff; - } else { - config->hcclDeterministic = getDeterministicState() ? 1 : 0; - } + config->hcclDeterministic = getDeterministicState() ? 
1 : 0; // Compatible with the size check of the old version of HCCL, forcibly convert // the config object to a size_t=32 object, and retain the N ± 2 version -- Gitee From 30604c20a719fc01643956a20a2d79722273af92 Mon Sep 17 00:00:00 2001 From: hhz886 Date: Mon, 9 Jun 2025 12:17:33 +0000 Subject: [PATCH 061/328] =?UTF-8?q?!21711=20=E3=80=90profiler=E3=80=91subp?= =?UTF-8?q?rocess=20log=20fix=20Merge=20pull=20request=20!21711=20from=20h?= =?UTF-8?q?hz886/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/profiler/analysis/prof_common_func/_log.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/torch_npu/profiler/analysis/prof_common_func/_log.py b/torch_npu/profiler/analysis/prof_common_func/_log.py index 15ba7a80f9..0bf0acad2b 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_log.py +++ b/torch_npu/profiler/analysis/prof_common_func/_log.py @@ -34,6 +34,7 @@ class ProfilerLogger: BACKUP_COUNT = 3 # logger instance _instance = None + _pid = None @classmethod def get_instance(cls) -> logging.Logger: @@ -54,7 +55,9 @@ class ProfilerLogger: RuntimeError: If logger initialization fails """ if cls._instance is not None: - return + if cls._pid == os.getpid(): + return + cls.destroy() # Create logs directory log_dir = os.path.join(output_dir, cls.DEFAULT_LOG_DIR) @@ -89,6 +92,7 @@ class ProfilerLogger: logger.addHandler(file_handler) cls._instance = logger + cls._pid = os.getpid() logger.info("Profiler logger initialized at: %s", log_file) @classmethod -- Gitee From 644624b816307e4d09b31f8d561ea7d7aba3e8b7 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Mon, 9 Jun 2025 12:27:59 +0000 Subject: [PATCH 062/328] !21730 add ut for flight recorder Merge pull request !21730 from huangyunlong/2.7ft6 --- test/distributed/test_flight_recorder.py | 220 +++++++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 test/distributed/test_flight_recorder.py diff --git a/test/distributed/test_flight_recorder.py b/test/distributed/test_flight_recorder.py new file mode 100644 index 0000000000..f476ebe9e3 --- /dev/null +++ b/test/distributed/test_flight_recorder.py @@ -0,0 +1,220 @@ +import os +import json +import pickle +import tempfile +import time +from datetime import datetime, timedelta +from unittest import mock + +import torch +import torch.distributed as c10d +import torch.distributed as dist +from torch.testing._internal.common_distributed import MultiProcessTestCase +from torch.testing._internal.common_utils import instantiate_parametrized_tests, parametrize, run_tests + +import torch_npu + + +class HCCLTraceTestBase(MultiProcessTestCase): + def setUp(self): + super().setUp() + os.environ[ + "TORCH_HCCL_ENABLE_TIMING" + ] = "0" # see 'timing_enabled' parametrized tests + os.environ["TORCH_HCCL_TRACE_BUFFER_SIZE"] = "1000" + os.environ["TORCH_HCCL_DUMP_ON_TIMEOUT"] = "1" + self.tempdir = tempfile.TemporaryDirectory() + os.environ["TORCH_HCCL_DEBUG_INFO_TEMP_FILE"] = self._trace_basename() + os.environ["TORCH_HCCL_DEBUG_INFO_PIPE_FILE"] = self._trace_basename() + self._spawn_processes() + + @classmethod + def _run( + cls, + parent_conn, + rank: int, + test_name: str, + file_name: str, + parent_pipe, + **kwargs, + ) -> None: + cls.parent = parent_conn + super()._run(rank, test_name, file_name, parent_pipe) + + @property + def local_device(self): + return torch.device("npu", self.rank_to_GPU[self.rank][0]) + + def _join_processes(self, fn): + # We need to patch sys.exit() as skip_if will use 
sys.exit() and + # the exit code from the this process will not be catched. + with mock.patch("sys.exit") as exit_mock: + fn() + super()._join_processes(fn) + + def _spawn_processes(self) -> None: + proc = torch.multiprocessing.get_context("spawn").Process + self.children_pipes = [] + parent_pipes = [] + for i in range(self.world_size): + parent_conn, child_conn = torch.multiprocessing.Pipe() + self.children_pipes.append(child_conn) + parent_pipes.append(parent_conn) + piter = iter(parent_pipes) + + def wrap(*positional, args, **kwargs): + args = (next(piter), *args) + return proc(*positional, args=args, **kwargs) + + self._start_processes(wrap) + + def _create_process_group_hccl(self): + store = dist.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + "hccl", world_size=self.world_size, rank=self.rank, store=store + ) + pg = c10d.distributed_c10d._get_default_group() + return pg + + def tearDown(self): + super().tearDown() + try: + os.remove(self.file_name) + except OSError: + pass + + @property + def world_size(self): + return 2 + + @property + def rank_to_GPU(self): + # return rank to GPU map + return {i: [i] for i in range(self.world_size)} + + def _trace_basename(self): + # we pass the base to the env, and the dump util will append rank + return os.path.join(self.tempdir.name, "trace_") + + def _trace_name(self, rank): + return self._trace_basename() + str(rank) + + def started_or_scheduled(self, timing_enabled): + return "started" if timing_enabled else "scheduled" + + +class HCCLTraceTest(HCCLTraceTestBase): + def _verify_trace(self, t, include_collectives, timing_enabled, is_json): + ver = t["version"] + self.assertEqual(ver, "2.4") + pg_config = t["pg_config"] + self.assertEqual(len(pg_config), 1) + default_pg_info = pg_config["group_name_0"] + self.assertIn("name", default_pg_info) + self.assertIn("desc", default_pg_info) + self.assertIn("ranks", default_pg_info) + pg_status = t["pg_status"] + self.assertEqual(len(pg_status), 1) + self.assertEqual(str(pg_status["0"]["last_enqueued_collective"]), "2") + self.assertEqual(str(pg_status["0"]["last_completed_collective"]), "2") + self.assertEqual( + str(pg_status["0"]["last_started_collective"]), + "2" if timing_enabled else "-1", + ) + global_ranks = pg_config["group_name_0"]["ranks"] + self.assertEqual(len(json.loads(global_ranks)), self.world_size) + if include_collectives: + self.assertEqual(len(t["entries"]), 2) + t = t["entries"] + last = t[-1] + self.assertEqual(last["process_group"], ("group_name_0", "")) + self.assertEqual(last["state"], "completed") + s = last["time_discovered_started_ns"] + f = last["time_discovered_completed_ns"] + self.assertEqual(last["record_id"], 1) + self.assertIsNotNone(f) + if timing_enabled: + self.assertIsNotNone(s) + self.assertTrue(s <= f) + # we don't collect stack traces in JSON at the moment + if not is_json: + self.assertIn("test_flight_recorder.py", str(last["frames"])) + self.assertEqual(last["input_sizes"], ((3, 4),)) + self.assertEqual(last["input_dtypes"], ["Float"]) + self.assertEqual(last["output_sizes"], ((3, 4),)) + self.assertEqual(last["output_dtypes"], ["Float"]) + self.assertEqual(last["collective_seq_id"], 2) + self.assertEqual(last["timeout_ms"], 3600000) + now = datetime.now() + event_created_time = datetime.fromtimestamp( + last["time_created_ns"] / 1000000000 + ) + before_test = now - timedelta(minutes=1) + self.assertTrue(before_test < event_created_time < now) + if timing_enabled: + # very loose bounds, measured 0.036 ms on devgpu + self.assertTrue(0 < 
last["duration_ms"] < 100) + else: + self.assertTrue("duration_ms" not in last) + else: + self.assertTrue("entries" not in t) + + @parametrize("timing_enabled", [False]) + @parametrize("include_collectives", [True, False]) + def test_short_json(self, timing_enabled, include_collectives): + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_hccl() + if timing_enabled: + pg._enable_collectives_timing() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + for i in range(2): + f = pg.allreduce(a) + f.wait() + torch.npu.synchronize(device=device) + # gah ok so now the duration_ms is populated best-effort since it can only happen outside "dump()" api + time.sleep(1) + t = json.loads( + torch_npu._C._distributed_c10d._dump_hccl_trace_json( + includeCollectives=include_collectives + ) + ) + self._verify_trace(t, include_collectives, timing_enabled, True) + dist.destroy_process_group() + + @parametrize("timing_enabled", [False]) + @parametrize("include_collectives", [True, False]) + def test_short_pickle(self, timing_enabled, include_collectives): + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_hccl() + if timing_enabled: + pg._enable_collectives_timing() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + for i in range(2): + f = pg.allreduce(a) + f.wait() + torch.npu.synchronize(device=device) + # gah ok so now the duration_ms is populated best-effort since it can only happen outside "dump()" api + time.sleep(1) + t = pickle.loads( + torch_npu._C._distributed_c10d._dump_hccl_trace( + includeCollectives=include_collectives + ) + ) + self._verify_trace( + t, + include_collectives=include_collectives, + timing_enabled=timing_enabled, + is_json=True, + ) + dist.destroy_process_group() + + +instantiate_parametrized_tests(HCCLTraceTest) + + +if __name__ == "__main__": + run_tests() -- Gitee From bc0bdd0acd4f27fd7bfc6712f8ded6ac261cc32a Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 9 Jun 2025 14:44:18 +0000 Subject: [PATCH 063/328] !21754 Update op_plugin commit id Merge pull request !21754 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 1875c2aec9..4fe6422411 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 1875c2aec960155f3eed3fac8c98b26d6c23c41b +Subproject commit 4fe6422411fa99b6f9caea8bbc4370b91ebe0534 -- Gitee From 09511c22190a3ca4320d14d88b75a71be46e1188 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Mon, 9 Jun 2025 15:07:09 +0000 Subject: [PATCH 064/328] =?UTF-8?q?!21749=20silent=20checkv3=20support=20s?= =?UTF-8?q?ample=20interval=20config=20Merge=20pull=20request=20!21749=20f?= =?UTF-8?q?rom=20=E7=8E=8B=E8=B6=85/v2.7.0=5Fsilentfix3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/asd/asd.py | 6 ++++++ torch_npu/utils/_step.py | 10 +++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/torch_npu/asd/asd.py b/torch_npu/asd/asd.py index 5ee262366e..651d4a5f40 100644 --- a/torch_npu/asd/asd.py +++ b/torch_npu/asd/asd.py @@ -402,6 +402,12 @@ class _MatmulSilentCheck: def get_upper_thresh2(self): return self.upper_thresh2 + + def set_grad_sample_interval(self, grad_sample_interval): + self.filter_interval = grad_sample_interval + + def get_grad_sample_interval(self): + return self.filter_interval def 
init_stream(self): if self.statistic_value is None: diff --git a/torch_npu/utils/_step.py b/torch_npu/utils/_step.py index 9828576caa..db99eda48c 100644 --- a/torch_npu/utils/_step.py +++ b/torch_npu/utils/_step.py @@ -236,6 +236,13 @@ def _prase_asd_config(asd_config): else: warnings.warn(f"Warning: NPU_ASD_CONFIG-upper_thresh2 is invalid, use the default value of 100.") + # grad_sample_interval + grad_sample_interval = asd_config.get("grad_sample_interval", "3") + if grad_sample_interval.isdigit() and grad_sample_interval != "0": + matmul_check.set_grad_sample_interval(int(grad_sample_interval)) + else: + warnings.warn(f"Warning: NPU_ASD_CONFIG-grad_sample_interval is invalid, use the default value of 3.") + def add_perf_dump_patch(): global perf_dump_enable @@ -259,7 +266,8 @@ def add_perf_dump_patch(): matmul_check.set_matmul_hook_enable(asd_enable) loggerSilent.info(f"Silent check 3.0 version will be enabled. The checksum enable is {matmul_check.get_with_checksum()}, " f"cooldown is {matmul_check.get_cooldown()}, strikes_num is {matmul_check.get_strikes_num()}, strikes_window is {matmul_check.get_strikes_window()}, " - f"checksum_cooldown is {matmul_check.get_checksum_cooldown()}, upper_thresh1 is {matmul_check.get_upper_thresh1()}, upper_thresh2 is {matmul_check.get_upper_thresh2()}.") + f"checksum_cooldown is {matmul_check.get_checksum_cooldown()}, upper_thresh1 is {matmul_check.get_upper_thresh1()}, " + f"upper_thresh2 is {matmul_check.get_upper_thresh2()}. grad_sample_interval is {matmul_check.get_grad_sample_interval()}.") else: asd_value = os.getenv("NPU_ASD_ENABLE", "0") if torch_npu._C._get_silent_check_version() == 1: -- Gitee From e1cfeffb95fa20e9192120c32c69775f84186252 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 9 Jun 2025 16:29:17 +0000 Subject: [PATCH 065/328] !21761 Update op_plugin commit id Merge pull request !21761 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 4fe6422411..1b4e741ef1 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 4fe6422411fa99b6f9caea8bbc4370b91ebe0534 +Subproject commit 1b4e741ef177b81c8b40756f9b5f307dd379266d -- Gitee From 59a7251111a4e48669e2b643ad45c9cf3386c7dd Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Mon, 9 Jun 2025 22:26:35 +0000 Subject: [PATCH 066/328] !21758 Update torchair commit id Merge pull request !21758 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index b7d395cb47..5328d48187 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit b7d395cb4781388fa71171bb4d86e5b9b7005399 +Subproject commit 5328d481876c804d70b5c101e15ca5db9b0ebb3c -- Gitee From 81ce9c1f8be5d3a7346377ddb3885aa2146f665e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Tue, 10 Jun 2025 01:27:41 +0000 Subject: [PATCH 067/328] =?UTF-8?q?!21676=20update=20event=20log=20Merge?= =?UTF-8?q?=20pull=20request=20!21676=20from=20=E5=85=B3=E9=BE=99=E9=94=8B?= =?UTF-8?q?/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUEventManager.cpp | 8 ++++---- .../csrc/core/npu/interface/AsyncTaskQueueInterface.cpp | 4 ++-- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 3 +-- 
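
Taken together with the earlier strikes_num rename, the silent-check options parsed from
NPU_ASD_CONFIG now cover the keys below. This is only a sketch of the dict handed to
_prase_asd_config (all values are strings; invalid entries fall back to the defaults
quoted in the warnings):

    asd_config = {
        "cooldown": "5",               # compared as cooldown * 60 seconds in the strike logic
        "strikes_num": "3",            # formerly strikes_sum
        "strikes_window": "480",
        "upper_thresh2": "100",
        "grad_sample_interval": "3",   # added by the sample-interval change above
    }
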
torch_npu/csrc/framework/OpParamMaker.cpp | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUEventManager.cpp b/torch_npu/csrc/core/npu/NPUEventManager.cpp index cbea3be79c..2371b9bc79 100644 --- a/torch_npu/csrc/core/npu/NPUEventManager.cpp +++ b/torch_npu/csrc/core/npu/NPUEventManager.cpp @@ -105,10 +105,10 @@ void NPUEventManager::IncreaseUnrecordedCount(aclrtEvent event) auto it = event_unrecorded_count_.find(event); if (it != event_unrecorded_count_.end()) { it->second++; - ASCEND_LOGI("Event: unrecorded count increase, now=%d.", it->second); + ASCEND_LOGD("Event: unrecorded count increase, now=%d.", it->second); } else { event_unrecorded_count_.insert(std::pair(event, 1)); - ASCEND_LOGI("Event: unrecorded count increase, now=%d.", 1); + ASCEND_LOGD("Event: unrecorded count increase, now=%d.", 1); } } @@ -123,10 +123,10 @@ void NPUEventManager::DecreaseUnrecordedCount(aclrtEvent event) (void *) event, PTA_ERROR(ErrCode::INTERNAL)); if (it->second == 1) { event_unrecorded_count_.erase(event); - ASCEND_LOGI("Event: unrecorded count decrease, now=%d.", 0); + ASCEND_LOGD("Event: unrecorded count decrease, now=%d.", 0); } else { it->second--; - ASCEND_LOGI("Event: unrecorded count decrease, now=%d.", it->second); + ASCEND_LOGD("Event: unrecorded count decrease, now=%d.", it->second); } } diff --git a/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp b/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp index b60117c61b..3b4827ed3b 100644 --- a/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp @@ -113,7 +113,7 @@ void EventTask::LaunchRecordTask(c10_npu::NPUStream npuStream) c10_npu::enCurrentNPUStream(¶ms); prof_correlation_id = params.correlation_id; } - ASCEND_LOGI("Event: LaunchRecordTask is successfully executed, event=%p", eventParam_.event); + ASCEND_LOGD("Event: LaunchRecordTask is successfully executed, event=%p", eventParam_.event); #ifndef BUILD_LIBTORCH at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[RECORD_EVENT], prof_correlation_id); @@ -189,7 +189,7 @@ void EventTask::LaunchLazyDestroyTask(c10::DeviceIndex device_index) #endif QueueParas params(LAZY_DESTROY_EVENT, sizeof(EventParas), &eventParam_); c10_npu::enCurrentNPUStream(¶ms, device_index); - ASCEND_LOGI("Event: LaunchLazyDestroyTask is successfully executed, event=%p", eventParam_.event); + ASCEND_LOGD("Event: LaunchLazyDestroyTask is successfully executed, event=%p", eventParam_.event); #ifndef BUILD_LIBTORCH at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[LAZY_DESTROY_EVENT], params.correlation_id); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 8041df86e3..0af737acdd 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -239,9 +239,8 @@ void syncStreams( c10_npu::NPUStream& hcclStream = hcclStreams[i]; c10_npu::NPUEvent& hcclEvent = hcclEvents[i]; hcclEvent.record(c10_npu::getCurrentNPUStream(devices[i].index())); - ASCEND_LOGI("Event: record hccl group is successfully executed, event=%p", hcclEvent.event()); hcclEvent.block(hcclStream); - ASCEND_LOGI("Event: block hccl group is successfully executed, event=%p", hcclEvent.event()); + ASCEND_LOGI("Event: record and block hccl group is successfully executed, event=%p", hcclEvent.event()); } } diff --git 
a/torch_npu/csrc/framework/OpParamMaker.cpp b/torch_npu/csrc/framework/OpParamMaker.cpp index 1766af9c99..ce8b906514 100644 --- a/torch_npu/csrc/framework/OpParamMaker.cpp +++ b/torch_npu/csrc/framework/OpParamMaker.cpp @@ -509,7 +509,7 @@ int LazyDestroyEventFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) } ASCEND_LOGE("LazyDestroy error! ret = %d, eventAllocatorType = %d", ret, cur_paras->eventAllocatorType); } - ASCEND_LOGI("Event: LazyDestroyEventFunc dequeue is successfully executed, event=%p", cur_paras->event); + ASCEND_LOGD("Event: LazyDestroyEventFunc dequeue is successfully executed, event=%p", cur_paras->event); return ret; } -- Gitee From ca65e1308e8ed0c20db1b895d27788885e7aecb6 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 10 Jun 2025 03:00:53 +0000 Subject: [PATCH 068/328] !21773 Update op_plugin commit id Merge pull request !21773 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 1b4e741ef1..91ed8a0050 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 1b4e741ef177b81c8b40756f9b5f307dd379266d +Subproject commit 91ed8a0050edc840ffa5091560f7dcf250f762d5 -- Gitee From 24d75e9f0131fc4ce0017b8d7bd898194d4e51d6 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Tue, 10 Jun 2025 03:39:03 +0000 Subject: [PATCH 069/328] !21777 skif flight recorder ut if device_count < 2 Merge pull request !21777 from huangyunlong/2.7ft7 --- test/distributed/test_flight_recorder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/distributed/test_flight_recorder.py b/test/distributed/test_flight_recorder.py index f476ebe9e3..a2cb58241a 100644 --- a/test/distributed/test_flight_recorder.py +++ b/test/distributed/test_flight_recorder.py @@ -217,4 +217,5 @@ instantiate_parametrized_tests(HCCLTraceTest) if __name__ == "__main__": - run_tests() + if torch.npu.is_available() and torch.npu.device_count() >= 2: + run_tests() -- Gitee From f149e99e862fb7c960d0a27e1472506cc6525462 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 10 Jun 2025 10:59:18 +0000 Subject: [PATCH 070/328] !21785 Update op_plugin commit id Merge pull request !21785 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 91ed8a0050..8bf3043ea4 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 91ed8a0050edc840ffa5091560f7dcf250f762d5 +Subproject commit 8bf3043ea4d8ceea0373a84639efc4e22d067e3c -- Gitee From 0f72be1b433a03b33065938acb158d828c1cde80 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Tue, 10 Jun 2025 22:32:00 +0000 Subject: [PATCH 071/328] !21800 Update torchair commit id Merge pull request !21800 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 5328d48187..c6b1e42194 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 5328d481876c804d70b5c101e15ca5db9b0ebb3c +Subproject commit c6b1e42194b3b303b582e2496ba088803c547aef -- Gitee From a143381e8900e340ae8be49d4e0b6e626c5c3dd1 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 11 Jun 2025 04:59:19 +0000 Subject: [PATCH 072/328] !21809 Update op_plugin commit id Merge pull request !21809 from pta-robot/v2.7.1 --- 
third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 8bf3043ea4..385c576049 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 8bf3043ea4d8ceea0373a84639efc4e22d067e3c +Subproject commit 385c576049a2b7be3f0d9415d5fb0c1ff252e9c0 -- Gitee From 165a219417f733a784c963470c88e1adf3824b01 Mon Sep 17 00:00:00 2001 From: chuboning Date: Wed, 11 Jun 2025 07:10:36 +0000 Subject: [PATCH 073/328] !21804 Skip autocast_fft_fftshift and autocast_fft_ifftshift Merge pull request !21804 from chuboning/v2.7.1 --- test/unsupported_test_cases/.pytorch-disabled-tests.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/unsupported_test_cases/.pytorch-disabled-tests.json b/test/unsupported_test_cases/.pytorch-disabled-tests.json index 8c60f31a1e..5060a71c22 100644 --- a/test/unsupported_test_cases/.pytorch-disabled-tests.json +++ b/test/unsupported_test_cases/.pytorch-disabled-tests.json @@ -31603,5 +31603,7 @@ "test_serialization_backwards_compat_safe (__main__.TestSerialization)": ["", [""]], "test_serialization_sparse_invalid (__main__.TestOldSerialization)": ["", [""]], "test_weights_only_error_unsafe_global_False (__main__.TestSerialization)": ["", [""]], - "test_use_pinned_memory_for_d2h (__main__.TestSerialization)": ["", [""]] + "test_use_pinned_memory_for_d2h (__main__.TestSerialization)": ["", [""]], + "test_fake_autocast_fft_fftshift_npu_float32 (__main__.TestFakeTensorPRIVATEUSE1)": ["", [""]], + "test_fake_autocast_fft_ifftshift_npu_float32 (__main__.TestFakeTensorPRIVATEUSE1)": ["", [""]] } -- Gitee From 4746fd6a0f010673637e698410a34c65a98431a1 Mon Sep 17 00:00:00 2001 From: Mrtutu Date: Wed, 11 Jun 2025 08:26:49 +0000 Subject: [PATCH 074/328] !21460 fix dynamic profie log on v2.7.1 Merge pull request !21460 from Mrtutu/fix_dynamic_log_v2.7.1 --- .../_dynamic_profiler/_dynamic_profiler_monitor_shm.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py index 29c609b800..e4ebdb84a4 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py @@ -185,8 +185,9 @@ class DynamicProfilerShareMemory: DynamicProfilerUtils.out_log("Rank {} unlink shm".format( self._rank_id), DynamicProfilerUtils.LoggerLevelEnum.INFO) except Exception as ex: - DynamicProfilerUtils.out_log("Rank {} unlink shm failed, may be removed, {} hs occur".format( - self._rank_id, str(ex)), DynamicProfilerUtils.LoggerLevelEnum.ERROR) + if self._rank_id != -1: + DynamicProfilerUtils.out_log("Rank {} unlink shm failed, may be removed, {} hs occur".format( + self._rank_id, str(ex)), DynamicProfilerUtils.LoggerLevelEnum.ERROR) self.shm = None def _clean_shm_py37(self): @@ -201,8 +202,9 @@ class DynamicProfilerShareMemory: DynamicProfilerUtils.out_log("Rank {} unlink shm".format( self._rank_id), DynamicProfilerUtils.LoggerLevelEnum.INFO) except Exception as ex: - DynamicProfilerUtils.out_log("Rank {} unlink shm failed, may be removed, {} has occur ".format( - self._rank_id, str(ex)), DynamicProfilerUtils.LoggerLevelEnum.ERROR) + if self._rank_id != -1: + DynamicProfilerUtils.out_log("Rank {} unlink shm failed, may be removed, {} has occur ".format( + self._rank_id, str(ex)), DynamicProfilerUtils.LoggerLevelEnum.ERROR) 
PathManager.remove_path_safety(os.path.dirname(self.shm_path)) self.shm = None -- Gitee From 59cf72d20c580ebf653be9a98143831adf0f74ea Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 11 Jun 2025 08:59:26 +0000 Subject: [PATCH 075/328] !21815 Update op_plugin commit id Merge pull request !21815 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 385c576049..ea276139ac 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 385c576049a2b7be3f0d9415d5fb0c1ff252e9c0 +Subproject commit ea276139acba8d8ac41f76931ade51c005479a57 -- Gitee From e4c8d0c0d6b2a7eff5140f3ee021d229680af616 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 11 Jun 2025 10:59:18 +0000 Subject: [PATCH 076/328] !21828 Update op_plugin commit id Merge pull request !21828 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index ea276139ac..a09c474979 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit ea276139acba8d8ac41f76931ade51c005479a57 +Subproject commit a09c474979312cf19b8e09e3ec4244d233562f5b -- Gitee From 03c16ad02f79c0dd494c4b9b4f35b888f929911e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Wed, 11 Jun 2025 12:13:46 +0000 Subject: [PATCH 077/328] =?UTF-8?q?!21832=20silent=20checkv3=20fix=20for?= =?UTF-8?q?=20recalculation=20Merge=20pull=20request=20!21832=20from=20?= =?UTF-8?q?=E7=8E=8B=E8=B6=85/v2.7.0=5Fsilentfix4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/asd/asd.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/torch_npu/asd/asd.py b/torch_npu/asd/asd.py index 651d4a5f40..41e84feded 100644 --- a/torch_npu/asd/asd.py +++ b/torch_npu/asd/asd.py @@ -303,6 +303,7 @@ class _MatmulSilentCheck: self.check_stat = {} self.hook_dict = {} self.registered_modules = [] + self.visited_modules_id = [] self.matmul_hook_enable = 0 self.matmul_with_bf16 = False self.statistic_value = None @@ -410,7 +411,7 @@ class _MatmulSilentCheck: return self.filter_interval def init_stream(self): - if self.statistic_value is None: + if self.statistic_cpu_value is None: self.statistic_value = torch.tensor(0., device=f"npu:{torch_npu.npu.current_device()}") self.checksum_state = 0 self.statistic_cpu_value = torch.zeros((self.queue_len,), device='cpu', dtype=torch.float32).pin_memory() @@ -764,7 +765,8 @@ def _matmul_silent_check_decorator(func): for name, module in self.named_modules(): if matmul_check.get_matmul_hook_enable() == 0: break - if len(module._modules) == 0 and name not in matmul_check.registered_modules: + if len(module._modules) == 0 and name not in matmul_check.registered_modules and id(module) not in matmul_check.visited_modules_id: + matmul_check.visited_modules_id.append(id(module)) for _, param in module.named_parameters(): if not isinstance(param, torch.Tensor) or param.dim() < 2: continue @@ -772,7 +774,7 @@ def _matmul_silent_check_decorator(func): matmul_check.register_module_hook(module, name) # check dtype if param.dtype == torch.float16: - for value in self.hook_dict.values(): + for value in matmul_check.hook_dict.values(): if value is not None: value.remove() matmul_check.set_matmul_hook_enable(0) -- Gitee From 5f66936ce4c2063aaf46a0fa4f30dd672eed914a Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Wed, 11 Jun 2025 12:15:20 +0000 Subject: [PATCH 078/328] =?UTF-8?q?!21403=20add=20get=5Fuce=5Faddr=20api?= =?UTF-8?q?=20Merge=20pull=20request=20!21403=20from=20=E7=8E=8B=E8=B6=85/?= =?UTF-8?q?v2.7.0=5Fuceaddr?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/torch_npu_schema.json | 3 +++ torch_npu/csrc/npu/Module.cpp | 17 +++++++++++++++++ torch_npu/npu/__init__.py | 3 ++- torch_npu/npu/utils.py | 7 ++++++- 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index b4ae0dd083..d5f484c445 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -1028,6 +1028,9 @@ "torch_npu.npu.check_uce_in_memory": { "signature": "(device_id)" }, + "torch_npu.npu.get_uce_addr": { + "signature": "()" + }, "torch_npu.npu.clear_npu_overflow_flag": { "signature": "()" }, diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index a261b6f99c..b7af82c2c3 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -597,6 +597,22 @@ PyObject* THNPModule_check_uce_in_memory_wrap(PyObject* self, PyObject* arg) END_HANDLE_TH_ERRORS } +PyObject* THNPModule_get_uce_addr_wrap(PyObject* self, PyObject* noargs) +{ + HANDLE_TH_ERRORS + auto memUceInfo_ = c10_npu::get_mem_uce_info(); + + py::list result; + for (size_t i = 0; i < memUceInfo_.retSize; ++i) { + py::dict data; + data["ptr"] = reinterpret_cast(memUceInfo_.info[i].addr); + data["size"] = memUceInfo_.info[i].len; + result.append(data); + } + return result.release().ptr(); + END_HANDLE_TH_ERRORS +} + PyObject* THNPModule_restart_device_wrap(PyObject* self, PyObject* arg) { HANDLE_TH_ERRORS @@ -1620,6 +1636,7 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_stopDevice", (PyCFunction)THNPModule_stopDevice_wrap, METH_O, nullptr}, {"_npu_restart_device", (PyCFunction)THNPModule_restart_device_wrap, METH_O, nullptr}, {"_npu_check_uce_in_memory", (PyCFunction)THNPModule_check_uce_in_memory_wrap, METH_O, nullptr}, + {"_npu_get_uce_addr", (PyCFunction)THNPModule_get_uce_addr_wrap, METH_NOARGS, nullptr}, {"_npu_stress_detect", (PyCFunction)THNPModule_stressDetect_wrap, METH_NOARGS, nullptr}, {"_npu_getLocalDevice", (PyCFunction)THNPModule_getLocalDevice_wrap, METH_NOARGS, nullptr}, {"_npu_getDeviceCount", (PyCFunction)THNPModule_getDeviceCount_wrap, METH_NOARGS, nullptr}, diff --git a/torch_npu/npu/__init__.py b/torch_npu/npu/__init__.py index b451ac72cf..25ec5977ca 100644 --- a/torch_npu/npu/__init__.py +++ b/torch_npu/npu/__init__.py @@ -98,6 +98,7 @@ __all__ = [ "stop_device", "restart_device", "check_uce_in_memory", + "get_uce_addr", "config", "matmul", "conv", @@ -133,7 +134,7 @@ from .utils import (synchronize, set_device, current_device, _get_device_index, device, device_of, StreamContext, stream, set_stream, current_stream, default_stream, set_sync_debug_mode, get_sync_debug_mode, init_dump, current_blas_handle, is_bf16_supported, finalize_dump, set_dump, get_npu_overflow_flag, clear_npu_overflow_flag, - check_uce_in_memory, stress_detect) + check_uce_in_memory, stress_detect, get_uce_addr) from ._recovery import restart_device, stop_device from .streams import Stream, Event, SyncLaunchStream, ExternalEvent from .mstx import mstx diff --git a/torch_npu/npu/utils.py b/torch_npu/npu/utils.py index 5d13aa3aba..8fbba766c8 100644 --- a/torch_npu/npu/utils.py +++ b/torch_npu/npu/utils.py @@ -17,7 +17,7 @@ __all__ = ["synchronize", "set_device", 
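
# Usage sketch for the helper added here (illustrative; assumes torch.npu has been
# initialized in this process): each entry describes one affected memory range,
# with "ptr" the start address of the range and "size" its length.
import torch_npu

for rng in torch_npu.npu.get_uce_addr():
    print(rng["ptr"], rng["size"])
# The list is empty when no UCE information has been recorded for the device.
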
"current_device", "device", "device_of", "stream", "set_stream", "current_stream", "default_stream", "set_sync_debug_mode", "get_sync_debug_mode", "init_dump", "set_dump", "finalize_dump", "is_support_inf_nan", "is_bf16_supported", "get_npu_overflow_flag", "npu_check_overflow", "clear_npu_overflow_flag", "current_blas_handle", - "check_uce_in_memory", "stress_detect", "get_cann_version"] + "check_uce_in_memory", "stress_detect", "get_cann_version", "get_uce_addr"] def get_cann_version(module="CANN"): @@ -386,6 +386,11 @@ def check_uce_in_memory(device_id): return torch_npu._C._npu_check_uce_in_memory(device_id) +def get_uce_addr(): + torch_npu.npu._lazy_init() + return torch_npu._C._npu_get_uce_addr() + + def _erase_stream(tensor, stream): r"""Remove the tags of the tensor that are used by this stream through the record_stream function. -- Gitee From d5e6f3e200a77bded96eefe0bb1119f0108e8a6d Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Wed, 11 Jun 2025 12:45:45 +0000 Subject: [PATCH 079/328] !21727 switchnic Merge pull request !21727 from SCh-zx/switch27 --- third_party/acl/libs/hccl.cpp | 3 +- third_party/acl/libs/hccl.h | 1 + third_party/hccl/inc/hccl/hccl.h | 2 + torch_npu/csrc/distributed/HCCLUtils.hpp | 4 + torch_npu/csrc/distributed/HcclCompile.h | 22 +++++ torch_npu/csrc/distributed/Init.cpp | 6 ++ .../csrc/distributed/ProcessGroupHCCL.cpp | 89 +++++++++++++++++++ .../csrc/distributed/ProcessGroupHCCL.hpp | 23 +++++ torch_npu/distributed/distributed_c10d.py | 12 +++ torch_npu/npu/__init__.py | 5 ++ 10 files changed, 166 insertions(+), 1 deletion(-) diff --git a/third_party/acl/libs/hccl.cpp b/third_party/acl/libs/hccl.cpp index ef1e23b2b2..7b562d8a45 100644 --- a/third_party/acl/libs/hccl.cpp +++ b/third_party/acl/libs/hccl.cpp @@ -33,4 +33,5 @@ hcclResult_t HcclScatter(void *sendBuf, void *recvBuf, u64 count, HcclDataType d hcclResult_t HcclBatchSendRecv(HcclSendRecvItemDef* sendRecvInfo, u32 itemNum, hcclComm_t comm, aclrtStream stream) {return HCCL_SUCCESS;} hcclResult_t HcclCommInitAll(u32 ndev, s32 *devices, hcclComm_t *comms) {return HCCL_SUCCESS;} -hcclResult_t HcclCommResume(hcclComm_t comm) {return HCCL_SUCCESS;} \ No newline at end of file +hcclResult_t HcclCommResume(hcclComm_t comm) {return HCCL_SUCCESS;} +hcclResult_t HcclCommWorkingDevNicSet(HcclComm comm, u32 *ranks, bool *useBackup, u32 nRanks){return HCCL_SUCCESS;} \ No newline at end of file diff --git a/third_party/acl/libs/hccl.h b/third_party/acl/libs/hccl.h index 41874cd808..439be6f63e 100644 --- a/third_party/acl/libs/hccl.h +++ b/third_party/acl/libs/hccl.h @@ -108,4 +108,5 @@ hcclResult_t HcclScatter(void *sendBuf, void *recvBuf, u64 count, HcclDataType d hcclResult_t HcclBatchSendRecv(HcclSendRecvItemDef* sendRecvInfo, u32 itemNum, hcclComm_t comm, aclrtStream stream); hcclResult_t HcclCommInitAll(u32 ndev, s32 *devices, hcclComm_t *comms); hcclResult_t HcclCommResume(hcclComm_t comm); +hcclResult_t HcclCommWorkingDevNicSet(HcclComm comm, u32 *ranks, bool *useBackup, u32 nRanks); } diff --git a/third_party/hccl/inc/hccl/hccl.h b/third_party/hccl/inc/hccl/hccl.h index 4ccda684b3..023914a348 100644 --- a/third_party/hccl/inc/hccl/hccl.h +++ b/third_party/hccl/inc/hccl/hccl.h @@ -183,6 +183,8 @@ extern HcclResult HcclCommInitAll(uint32_t ndev, int32_t *devices, HcclComm *com extern HcclResult HcclCommResume(HcclComm comm); +extern HcclResult HcclCommWorkingDevNicSet(HcclComm comm, uint32_t *ranks, bool *useBackup, uint32_t nRanks); + /** * @brief Initialize the comm configuration. 
* @param config Pointer to the comm configuration that needs to be initialized. diff --git a/torch_npu/csrc/distributed/HCCLUtils.hpp b/torch_npu/csrc/distributed/HCCLUtils.hpp index 57c762d3a6..e9ad7bbd6a 100644 --- a/torch_npu/csrc/distributed/HCCLUtils.hpp +++ b/torch_npu/csrc/distributed/HCCLUtils.hpp @@ -63,6 +63,7 @@ extern HcclResult hcclCommInitRootInfoConfig(uint32_t nRanks, const HcclRootInfo extern HcclResult hcclCommInitClusterInfoConfig(const char *clusterInfo, uint32_t rank, HcclCommConfig *config, HcclComm *comm); extern HcclResult hcclCreateSubCommConfig(HcclComm *comm, uint32_t rankNum, uint32_t *rankIds, uint64_t subCommId, uint32_t subCommRankId, HcclCommConfig* config, HcclComm *subComm); +extern HcclResult hcclCommWorkingDevNicSet(HcclComm comm, uint32_t *ranks, bool *useBackup, uint32_t nRanks); // Provides additional detail into HCCL error codes based on when these are // thrown in the HCCL codebase. @@ -111,6 +112,9 @@ public: uint32_t subCommRankId, HcclCommConfig* config); + int hcclCommType; + int p2pPeer; + // Must not be copyable HCCLComm(const HCCLComm&) = delete; HCCLComm& operator=(const HCCLComm&) = delete; diff --git a/torch_npu/csrc/distributed/HcclCompile.h b/torch_npu/csrc/distributed/HcclCompile.h index e6358a7b1e..de4a2ba619 100644 --- a/torch_npu/csrc/distributed/HcclCompile.h +++ b/torch_npu/csrc/distributed/HcclCompile.h @@ -26,6 +26,7 @@ LOAD_FUNCTION(HcclCommInitRootInfoConfig) LOAD_FUNCTION(HcclGetCommConfigCapability) LOAD_FUNCTION(HcclCommInitClusterInfoConfig) LOAD_FUNCTION(HcclCreateSubCommConfig) +LOAD_FUNCTION(HcclCommWorkingDevNicSet) extern HcclResult hcclAlltoAllV(const void *sendBuf, const void *sendCounts, const void *sdispls, @@ -259,4 +260,25 @@ HcclResult hcclCreateSubCommConfig(HcclComm *comm, uint32_t rankNum, uint32_t *r auto ret = func(comm, rankNum, rankIds, subCommId, subCommRankId, config, subComm); return ret; } + +bool hcclCommWorkingDevNicSetExist() +{ + const static bool isHcclCommWorkingDevNicSetExist = []() -> bool { + auto func = GET_FUNC(HcclCommWorkingDevNicSet) + return func != nullptr; + }(); + return isHcclCommWorkingDevNicSetExist; +} + +HcclResult hcclCommWorkingDevNicSet(HcclComm comm, uint32_t *ranks, bool *useBackup, uint32_t nRanks) +{ + using HcclCommWorkingDevNicSetFunc = HcclResult(*)(HcclComm, uint32_t *, bool *, uint32_t); + static HcclCommWorkingDevNicSetFunc func = nullptr; + if (func == nullptr) { + func = (HcclCommWorkingDevNicSetFunc)GET_FUNC(HcclCommWorkingDevNicSet) + } + TORCH_CHECK(func, "Failed to find function ", "HcclCommWorkingDevNicSet", DIST_ERROR(ErrCode::NOT_FOUND)); + auto ret = func(comm, ranks, useBackup, nRanks); + return ret; +} } // namespace c10d_npu diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 6e635abe91..99c6dc6f22 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -396,6 +396,12 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) .def("get_hccl_comm", &::c10d_npu::ProcessGroupHCCL::getHcclComm) .def("_set_hccl_comm_name", &::c10d_npu::ProcessGroupHCCL::setHcclCommName) .def("resume_hccl_comm", &::c10d_npu::ProcessGroupHCCL::resumeHcclComm) + .def("_set_switch_nic_comm", + &::c10d_npu::ProcessGroupHCCL::setSwitchNicComm, + py::arg("rankid"), + py::arg("nRanks"), + py::arg("ranks") = std::vector{}, + py::arg("useBackup") = std::vector{}) .def("abort_hccl_comm", &::c10d_npu::ProcessGroupHCCL::abortAndClearHcclComm) .def("_delete_tcpstore_key", 
&::c10d_npu::ProcessGroupHCCL::deleteTCPStoreKey) .def("set_watchdog_status", &::c10d_npu::ProcessGroupHCCL::setWatchdogStatus) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 0af737acdd..5bd55eeeb3 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -2741,6 +2741,94 @@ void ProcessGroupHCCL::resumeHcclComm(int device_id) ASCEND_LOGI("resumeHcclComm success, group id is %s.", options_->group_id.c_str()); } +bool ProcessGroupHCCL::setCommWorkingDevNic( + const HcclComm& comm, + int nranks, + std::vector& ranks, + std::vector& useBackup, + int rankid, + int hcclCommType, + int p2pPeer) +{ + HcclComm sendComm = comm; + uint32_t sendnRank = 0; + std::vector sendRanks; + std::vector sendUseBackup; + if (hcclCommType == 1) { + int p2pRank = rankid <= p2pPeer ? 0 : 1; + bool isSendRecvSelf = rank_ == p2pPeer; + int p2pTargetRank = isSendRecvSelf ? 0 : 1 - p2pRank; + for (int i = 0; i < nranks; i++) { + if (ranks[i] == rankid) { + sendRanks.push_back(p2pRank); + sendUseBackup.push_back(useBackup[i]); + sendnRank++; + } + if (ranks[i] == p2pTargetRank) { + sendRanks.push_back(p2pTargetRank); + sendUseBackup.push_back(useBackup[i]); + sendnRank++; + } + } + } else { + for (int i = 0; i < nranks; i++) { + uint32_t localrank = 0; + for (uint32_t val : groupRanks()) { + if (ranks[i] == val) { + sendRanks.push_back(localrank); + sendUseBackup.push_back(useBackup[i]); + sendnRank++; + break; + } + localrank++; + } + } + } + if (sendnRank == 0) { + return true; + } + bool useBackupArr[sendUseBackup.size()]; + uint32_t sendRanksArr[sendRanks.size()]; + for (size_t i = 0; i < sendnRank; i++) { + useBackupArr[i] = sendUseBackup[i]; + sendRanksArr[i] = sendRanks[i]; + } + auto ret = hcclCommWorkingDevNicSet(sendComm, sendRanksArr, useBackupArr, sendnRank); + if (ret != HCCL_SUCCESS) { + ASCEND_LOGI("Fail to hcclCommWorkingDevNicSet"); + return false; + } + return true; +} + +bool ProcessGroupHCCL::setSwitchNicComm(int rankid, int nranks, std::vector& ranks, std::vector& useBackup) +{ + if (!hcclCommWorkingDevNicSetExist()) { + ASCEND_LOGI("The hcclCommWorkingDevNicSet does not exist. Skip it."); + return true; + } + at::Device device = getDeviceForRank(rankid); + std::vector devices = {device}; + auto key = getKeyFromDevices(devices); + { + std::lock_guard lock(mutex_); + if (devHCCLCommMap_.find(key) != devHCCLCommMap_.end()) { + auto& hcclComms = devHCCLCommMap_[key]; + for (auto& hcclComm : hcclComms) { + HcclComm comm = hcclComm->getHcclComm(); + bool result = setCommWorkingDevNic(comm, nranks, ranks, useBackup, rankid, hcclComm->hcclCommType, hcclComm->p2pPeer); + if (!result) { + return false; + } + } + } else { + return true; + } + } + ASCEND_LOGI("Succeed to hcclCommWorkingDevNicSet"); + return true; +} + void ProcessGroupHCCL::setWatchdogStatus(int status) { watchdogStatus = WatchdogStatus(status); @@ -3357,6 +3445,7 @@ c10::intrusive_ptr ProcessGroupHCCL::pointToPoint( p2pRank = rank_ <= peer ? 0 : 1; isSendRecvSelf = rank_ == peer; p2pTargetRank = isSendRecvSelf ? 
0 : 1 - p2pRank; + setP2pPeer(peer); hcclComms = getHCCLComm(key, devices, HcclCommType::P2P, nullptr, p2pRank); } else { p2pTargetRank = peer; diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index f129326e1a..9085787951 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -571,6 +571,17 @@ public: void resumeHcclComm(int device_id); + bool setCommWorkingDevNic( + const HcclComm& comm, + int nranks, + std::vector& ranks, + std::vector& useBackup, + int rankid, + int hcclCommType, + int p2pPeer); + + bool setSwitchNicComm(int rankid, int nranks, std::vector& ranks, std::vector& useBackup); + void setWatchdogStatus(int status); void clearWorkMetaList(); @@ -642,6 +653,16 @@ protected: return pg_desc_; } + void setP2pPeer(int newPeer) + { + peer_ = newPeer; + } + + const int getP2pPeer() const + { + return peer_; + } + // In the timeout case and we will dump debug info such as the NCCL flight // recorder to storage. Down the road, if we have more complicated or blocking // operations, we might need to use a side thread to do it. @@ -899,6 +920,8 @@ protected: std::string pg_name_; + int peer_; + std::exception_ptr watchDogException_ = nullptr; std::shared_ptr pgStatus_ = std::make_shared(); diff --git a/torch_npu/distributed/distributed_c10d.py b/torch_npu/distributed/distributed_c10d.py index 6558856d50..53bbc0ba74 100644 --- a/torch_npu/distributed/distributed_c10d.py +++ b/torch_npu/distributed/distributed_c10d.py @@ -239,6 +239,18 @@ def reinit_process_group(group=None, rebuild_link=True): return group +def _comm_switch_nic(ranks, useBackup): + nRanks = len(ranks) + npu_device = torch.device('npu') + rankid = int(os.environ['RANK']) + result = True + for pg in _pg_map: + if (npu_device in pg._device_types): + presult = pg._get_backend(npu_device)._set_switch_nic_comm(rankid, nRanks, ranks, useBackup) + if not presult: + result = False + return result + def _reduce_scatter_tensor_uneven(output, input, input_split_sizes=None, op=dist.ReduceOp.SUM, group=None, async_op=False): if _rank_not_in_group(group): diff --git a/torch_npu/npu/__init__.py b/torch_npu/npu/__init__.py index 25ec5977ca..20a582e360 100644 --- a/torch_npu/npu/__init__.py +++ b/torch_npu/npu/__init__.py @@ -485,6 +485,11 @@ def _lazy_new(cls, *args, **kwargs): return super(_NPUBase, cls).__new__(cls, *args, **kwargs) +def _comm_switch_nic(ranks, useBackup): + torch_npu.npu.synchronize() + return torch_npu.distributed.distributed_c10d._comm_switch_nic(ranks, useBackup) + + class _NPUBase: is_npu = True is_sparse = False -- Gitee From c8cb852d057e14844b37bf3f7a7f6c3919624b1a Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Thu, 12 Jun 2025 02:00:29 +0000 Subject: [PATCH 080/328] !21793 Warning for situations where Getting dcmi affinity cpu info is not supported. 
Merge pull request !21793 from yuhaiyan/v2.7.1-dev1 --- torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp | 16 +++++++++++++--- .../csrc/core/npu/NPUAffinityController.cpp | 10 +++++----- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp index 26b4bd210b..bcb89e6c88 100644 --- a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp +++ b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp @@ -9,7 +9,7 @@ static int DcmiInit() { int ret = c10_npu::dcmi::DcmiInit(); if (ret != NPU_OK) { - TORCH_CHECK(false, "Failed to init dcmi.\n", PTA_ERROR(ErrCode::INTERNAL)); + TORCH_CHECK(false, "Failed to init dcmi. ", PTA_ERROR(ErrCode::INTERNAL)); } return ret; } @@ -23,7 +23,9 @@ std::string GetAffinityCPUBaseInfo(int card_id) int cpu_id = 0; ret = c10_npu::dcmi::DcmiGetDeviceIdInCard(card_id, &device_id_max, &mcu_id, &cpu_id); if (ret != NPU_OK) { - TORCH_CHECK(false, "dcmi get device id in card error code is " + std::to_string(ret), PTA_ERROR(ErrCode::INTERNAL)); + TORCH_NPU_WARN_ONCE("dcmi_get_device_id_in_card is not supported. " + "The npu_affine configuration of CPU_AFFINITY_CONF will be disabled."); + return ""; } device_id = std::max(0, device_id_max - 1); char affinity_cpu[TOPO_INFO_MAX_LENTH] = {0}; @@ -32,7 +34,9 @@ std::string GetAffinityCPUBaseInfo(int card_id) if (ret == NPU_OK) { return affinity_cpu; } - TORCH_CHECK(false, "dcmi get affinity cpu error code is " + std::to_string(ret), PTA_ERROR(ErrCode::INTERNAL)); + TORCH_NPU_WARN_ONCE("dcmi_get_affinity_cpu_info_by_device_id is not supported. " + "The npu_affine configuration of CPU_AFFINITY_CONF will be disabled."); + return ""; } std::unordered_map CardIdAffinityCPU; @@ -63,6 +67,9 @@ void GetExclusiveAffinityCPU() std::map CardIdAffinityCpuDefault; for (int i = 0; i < device_count; i++) { std::string affinity_cpu = GetAffinityCPUBaseInfo(i); + if (affinity_cpu.empty()) { + return; + } CardIdAffinityCpuDefault[i] = affinity_cpu; auto it = SameAffinityCpuNum.find(affinity_cpu); if (it != SameAffinityCpuNum.end()) { @@ -96,6 +103,9 @@ void GetExclusiveAffinityCPU() c10_npu::CoreIdRange GetAssignAffinityCPU(int card_id) { GetExclusiveAffinityCPU(); + if (CardIdAffinityCPU.empty()) { + return {0, 0}; + } auto it = CardIdAffinityCPU.find(card_id); if (it != CardIdAffinityCPU.end()) { return it->second; diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index 28b9d62993..a331439d9f 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -75,12 +75,12 @@ void parseCPUAffinityConf(uint32_t &mode, std::vector &ranges) if (std::regex_search(inputStr, match, pattern)) { int isAffinity = std::stoi(match[1].str()); if (isAffinity != 0) { - if (c10_npu::GetSocVersion() < c10_npu::SocVersion::Ascend910_9391) { - for (int i = 0; i < device_nums; i++) { - ranges[i] = GetAssignAffinityCPU(i); + for (int i = 0; i < device_nums; i++) { + CoreIdRange getRange = GetAssignAffinityCPU(i); + if (getRange.start == 0 && getRange.end == 0) { + break; } - } else { - TORCH_NPU_WARN_ONCE("The \"npu_affine\" option of the CPU_AFFINITY_CONF is disabled on this soc version."); + ranges[i] = getRange; } } } -- Gitee From cb2bfe12bde2dd86aa9a6c4e94c581a3df16118c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Thu, 12 Jun 2025 03:37:35 +0000 Subject: [PATCH 081/328] =?UTF-8?q?!21840=20add=20npugraph=5Ftree=20ut=20M?= 
=?UTF-8?q?erge=20pull=20request=20!21840=20from=20=E9=97=AB=E9=B9=8F?= =?UTF-8?q?=E5=85=A8/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/npu/test_graph_tree.py | 514 +++++++++++++++++++++++++++++++++++- 1 file changed, 502 insertions(+), 12 deletions(-) diff --git a/test/npu/test_graph_tree.py b/test/npu/test_graph_tree.py index b278c02045..94c29660f9 100644 --- a/test/npu/test_graph_tree.py +++ b/test/npu/test_graph_tree.py @@ -1,13 +1,19 @@ +import os + +os.environ["ASCEND_LAUNCH_BLOCKING"] = "0" + from unittest.mock import patch, MagicMock, call, ANY import weakref import pytest import torch import torch_npu from torch_npu.npu._graph_tree import ( + check_memory_pool, clear_cublass_cache, clear_cublas_manager, disable_conv_cache_emptying, enable_history_recording, + format_tb, npugraphify, npugraphify_impl, TreeManagerContainer, @@ -15,7 +21,9 @@ from torch_npu.npu._graph_tree import ( NPUWarmupNode, CompilationMode, get_container, + get_block_addrs, get_manager, + get_npugraph_segments, reset_npugraph_trees, local, OutputAliasInfo, @@ -24,9 +32,14 @@ from torch_npu.npu._graph_tree import ( AliasesNewOutput, NPUGraphNode, WrappedFunction, + NPUGraphTreeManager, + ExecutionState, + FunctionID, + GraphID, ) from torch_npu.testing.testcase import TestCase, run_tests + device = "npu:0" torch.npu.set_device(device) @@ -110,7 +123,7 @@ class TestNpuGraphFunctions(TestCase): model, inputs, (), device_index=0, is_backward=False, is_inference=False ) mock_manager.add_function.assert_called_with( - model, inputs, (), None, CompilationMode.FORWARD, () + model, inputs, (), None, CompilationMode.FORWARD, (), ) # Test backward mode @@ -119,7 +132,7 @@ class TestNpuGraphFunctions(TestCase): model, inputs, (), device_index=0, is_backward=True, is_inference=False ) mock_manager.add_function.assert_called_with( - model, inputs, (), None, CompilationMode.BACKWARD, () + model, inputs, (), None, CompilationMode.BACKWARD, (), ) # Test invalid mode combination @@ -254,7 +267,7 @@ def basic_npu_graph_node(mock_wrapped_function, mock_parent_node): ) -class TestOutputAliasInfo: +class TestOutputAliasInfo(TestCase): def test_aliases_prior_graph_output_validation(self): with pytest.raises(RuntimeError): AliasesPriorGraphOutput("invalid_index") @@ -610,14 +623,6 @@ class TestNPUGraphNodeRun(TestCase): # Validate no copy operations occurred self.assertEqual(mock_run_graph.call_count, 1) - def test_input_validation_mechanism(self, mock_pool, mock_check, mock_replay): - """Ensure input length validation works correctly""" - node = self._create_node([self.static_input]) - - # Test invalid input length - with self.assertRaisesRegex(RuntimeError, "check len"): - node.run([1, 2, 3]) # Invalid input count - @patch.object(NPUGraphNode, "reconstruct_outputs") def test_output_reconstruction_flow( self, mock_reconstruct, mock_pool, mock_check, mock_replay @@ -649,7 +654,6 @@ class TestNPUGraphNodeRun(TestCase): node.run(new_inputs) # Validate single batched copy call - mock_batched_copy.assert_called_once() args, _ = mock_batched_copy.call_args self.assertEqual(len(args[0]), 3) @@ -664,5 +668,491 @@ class TestNPUGraphNodeRun(TestCase): self.assertEqual(len(input_copy), 0) +class TestGetNpugraphSegments(TestCase): + @patch('torch.npu.memory_snapshot') + def test_get_npugraph_segments(self, mock_snapshot): + mock_snapshot.return_value = [ + {"segment_pool_id": (0, 1), "address": 1000, "blocks": []}, + {"segment_pool_id": (0, 0), "address": 2000, "blocks": []}, + 
{"segment_pool_id": (0, 1), "address": 3000, "blocks": []}, + ] + result = get_npugraph_segments((0, 1)) + self.assertEqual(len(result), 2) + mock_snapshot.assert_called_once_with() + + +class TestGetBlockAddrs(TestCase): + @patch('torch_npu.npu._graph_tree.get_npugraph_segments') + def test_get_block_addrs_live_only(self, mock_segments): + mock_segments.return_value = [ + { + "segment_pool_id": (0, 0), + "address": 1000, + "blocks": [ + {"state": "active_allocated", "size": 100}, + {"state": "inactivate", "size": 200}, + {"state": "active_allocated", "size": 300}, + ] + }, + { + "segment_pool_id": (0, 0), + "address": 2000, + "blocks": [ + {"state": "active_allocated", "size": 50}, + {"state": "inactivate", "size": 150}, + ] + } + ] + result = get_block_addrs((0, 0), live_only=True) + self.assertEqual(result, [1000, 1300, 2000]) + mock_segments.assert_called_once_with((0, 0)) + + @patch('torch_npu.npu._graph_tree.get_npugraph_segments') + def test_get_block_addrs_all_blocks(self, mock_segments): + mock_segments.return_value = [ + { + "segment_pool_id": (0, 0), + "address": 1000, + "blocks": [ + {"state": "active_allocated", "size": 100}, + {"state": "inactivate", "size": 200}, + ] + } + ] + result = get_block_addrs((0, 0), live_only=False) + self.assertEqual(result, [1000, 1100]) + mock_segments.assert_called_once_with((0, 0)) + + +class TestFormatTb(TestCase): + def test_format_tb(self): + frames = [ + {"filename": "/path/to/file.py", "line": 42, "name": "test_function"}, + {"filename": "/path/to/module.py", "line": 100, "name": "helper_method"}, + ] + result = format_tb(frames) + self.assertIn("/path/to/file.py", result) + self.assertIn("test_function", result) + self.assertIn("/path/to/module.py", result) + self.assertIn("helper_method", result) + self.assertIn("line 100", result) + + +class TestCheckMemoryPool(TestCase): + @patch('torch_npu._C._npu_checkPoolLiveAllocations') + def test_check_memory_pool_fast_path_pass(self, mock_check): + mock_check.return_value = True + + mock_storage1 = MagicMock(spec=StorageWeakRefWrapper) + mock_storage1.data_ptr.return_value = 1001 + mock_storage1.return_value = True + + mock_storage2 = MagicMock(spec=StorageWeakRefWrapper) + mock_storage2.data_ptr.return_value = 1002 + mock_storage2.return_value = True + + check_memory_pool("npu:0", (0, 0), [mock_storage1, mock_storage2]) + mock_check.assert_called_once_with( + "npu:0", (0, 0), {1001, 1002} + ) + + @patch('torch_npu._C._npu_checkPoolLiveAllocations') + @patch('torch_npu.npu._graph_tree.get_npugraph_segments') + @patch('torch_npu.npu._graph_tree.format_tb') + @patch('gc.collect') + def test_check_memory_pool_slow_path_unallocated_storage( + self, mock_gc, mock_format_tb, mock_segments, mock_check + ): + mock_check.return_value = False + mock_segments.return_value = [ + { + "segment_pool_id": (0, 0), + "address": 2000, + "blocks": [ + {"state": "active_allocated", "size": 100, "frames": []}, + ] + } + ] + mock_storage = MagicMock(spec=StorageWeakRefWrapper) + mock_storage.data_ptr.return_value = 1000 + mock_storage.return_value = True + with self.assertRaisesRegex( + RuntimeError, r"These storage data ptrs are not allocated in pool \(0, 0\) but should be \{1000\}" + ): + check_memory_pool("npu:0", (0, 0), [mock_storage]) + + @patch('torch_npu._C._npu_checkPoolLiveAllocations') + @patch('torch_npu.npu._graph_tree.get_npugraph_segments') + @patch('torch_npu.npu._graph_tree.format_tb') + @patch('gc.collect') + def test_check_memory_pool_slow_path_unaccounted_blocks( + self, mock_gc, mock_format_tb, 
mock_segments, mock_check + ): + mock_check.return_value = False + mock_segments.return_value = [ + { + "segment_pool_id": (0, 0), + "address": 1000, + "blocks": [ + {"state": "active_allocated", "size": 100, "frames": [ + {"filename": "/path/to/file.py", "line": 42, "name": "allocate_func"} + ]}, + ] + } + ] + live_storages = [] + mock_format_tb.return_value = "Formatted Traceback" + with self.assertRaisesRegex( + RuntimeError, "These live storage data ptrs are in the npugraph pool but not accounted for" + ): + check_memory_pool("npu:0", (0, 0), live_storages) + + def test_check_memory_pool_invalid_input(self): + invalid_storages = [1, 2, 3] + with self.assertRaisesRegex( + RuntimeError, r"check all\(isinstance\(elem, StorageWeakRefWrapper\) for elem in live_storages_ptrs\) fail" + ): + check_memory_pool("npu:0", (0, 0), invalid_storages) + + +class TestNPUGraphTreeManager: + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager._run') + def test_run_forward_mode(self, mock_run): + manager = NPUGraphTreeManager(0) + manager.id_to_mode[FunctionID(1)] = CompilationMode.FORWARD + result = manager.run([torch.tensor([1.0])], FunctionID(1)) + mock_run.assert_called_once_with([torch.tensor([1.0])], FunctionID(1)) + self.assertTrue(manager.running_forwards_with_pending_backwards) + self.assertTrue(result == mock_run.return_value) + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager._run') + def test_run_backward_mode(self, mock_run): + manager = NPUGraphTreeManager(0) + manager.id_to_mode[FunctionID(1)] = CompilationMode.BACKWARD + result = manager.run([torch.tensor([1.0])], FunctionID(1)) + mock_run.assert_called_once_with([torch.tensor([1.0])], FunctionID(1)) + self.assertFalse(manager.running_forwards_with_pending_backwards) + self.assertTrue(result == mock_run.return_value) + + def test_set_to_running_backward(self): + manager = NPUGraphTreeManager(0) + manager.running_forwards_with_pending_backwards = True + manager.set_to_running_backward() + self.assertFalse(manager.running_forwards_with_pending_backwards) + + def test_shutdown(self): + manager = NPUGraphTreeManager(0) + mock_node1 = MagicMock() + mock_node2 = MagicMock() + mock_node3 = MagicMock() + manager.roots = {FunctionID(1): [mock_node1]} + mock_node1.children = {FunctionID(2): [mock_node2]} + mock_node2.children = {FunctionID(3): [mock_node3]} + manager.shutdown() + mock_node1.remove_node_cached_tensors.assert_called_once_with() + mock_node2.remove_node_cached_tensors.assert_called_once_with() + mock_node3.remove_node_cached_tensors.assert_called_once_with() + assert mock_node1.graph is None + assert mock_node2.graph is None + assert mock_node3.graph is None + assert manager.graph is None + assert manager.roots is None + assert manager.current_node is None + + @patch('torch.npu.synchronize') + @patch('torch_npu.npu._graph_tree.NPUGraphNode') + def test_record_function(self, mock_node, mock_synchronize): + manager = NPUGraphTreeManager(0) + manager.ids_to_funcs[FunctionID(1)] = MagicMock() + manager.ids_to_stack_traces[FunctionID(1)] = "stack_trace" + manager.npu_graphs_thread_pool = "pool_handle" + manager.device_index = 0 + manager.stream = MagicMock() + + # 设置模拟返回值 + mock_node_instance = MagicMock() + mock_node.return_value = mock_node_instance + mock_node_instance.run_first_inputs.return_value = [torch.tensor([1.0])] + + # 执行测试 + result = manager.record_function([torch.tensor([1.0])], FunctionID(1)) + + # 验证调用 + mock_synchronize.assert_any_call() + mock_node.assert_called_once_with( + manager.ids_to_funcs[FunctionID(1)], + 
ANY, # graph_id + None, # parent + [torch.tensor([1.0])], + "pool_handle", + 0, + "stack_trace", + manager.stream + ) + assert isinstance(mock_node.call_args[0][1], GraphID) + assert manager.current_node == mock_node_instance + assert manager.path_state == ExecutionState.RECORDING + assert result == [torch.tensor([1.0])] + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.update_generation') + def test_execute_node(self, mock_update_gen): + manager = NPUGraphTreeManager(0) + mock_node = MagicMock() + mock_node.run.return_value = [torch.tensor([1.0])] + + # 执行测试 + result = manager.execute_node(mock_node, [torch.tensor([1.0])]) + + # 验证调用 + mock_update_gen.assert_called_once_with() + assert manager.current_node == mock_node + assert manager.path_state == ExecutionState.EXECUTION + assert result == [torch.tensor([1.0])] + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.update_generation') + @patch('torch_npu.npu._graph_tree.NPUWarmupNode') + def test_run_eager(self, mock_warmup_node, mock_update_gen): + manager = NPUGraphTreeManager(0) + manager.ids_to_funcs[FunctionID(1)] = MagicMock() + manager.ids_to_stack_traces[FunctionID(1)] = "stack_trace" + manager.npu_graphs_thread_pool = "pool_handle" + manager.graph = MagicMock() + manager.device_index = 0 + manager.stream = MagicMock() + + # 设置模拟返回值 + mock_node_instance = MagicMock() + mock_warmup_node.return_value = mock_node_instance + mock_node_instance.run.return_value = [torch.tensor([1.0])] + + # 执行测试 + result = manager.run_eager([torch.tensor([1.0])], FunctionID(1)) + + # 验证调用 + mock_update_gen.assert_called_once_with() + mock_warmup_node.assert_called_once_with( + manager.ids_to_funcs[FunctionID(1)], + None, + "pool_handle", + manager.graph, + 0, + "stack_trace", + manager.stream, + False, + ) + assert manager.current_node == mock_node_instance + assert manager.path_state == ExecutionState.WARMUP + assert result == [torch.tensor([1.0])] + + def test_new_graph_id(self): + manager = NPUGraphTreeManager(0) + id1 = manager.new_graph_id() + id2 = manager.new_graph_id() + assert isinstance(id1, GraphID) + assert isinstance(id2, GraphID) + assert id1 != id2 + + def test_new_func_id(self): + manager = NPUGraphTreeManager(0) + id1 = manager.new_func_id() + id2 = manager.new_func_id() + assert isinstance(id1, FunctionID) + assert isinstance(id2, FunctionID) + assert id1 != id2 + + def test_in_recording_property(self): + manager = NPUGraphTreeManager(0) + manager.path_state = ExecutionState.NONE + assert manager.in_recording is False + manager.path_state = ExecutionState.RECORDING + assert manager.in_recording is True + + def test_in_warmup_property(self): + manager = NPUGraphTreeManager(0) + manager.path_state = ExecutionState.NONE + assert manager.in_warmup is False + manager.path_state = ExecutionState.WARMUP + assert manager.in_warmup is True + + def test_get_roots(self): + manager = NPUGraphTreeManager(0) + mock_node1 = MagicMock() + mock_node2 = MagicMock() + manager.roots = { + FunctionID(1): [mock_node1], + FunctionID(2): [mock_node2] + } + roots = list(manager.get_roots()) + assert roots == [mock_node1, mock_node2] + + def test_current_node_property_and_setter(self): + manager = NPUGraphTreeManager(0) + assert manager.current_node is None + assert manager.path_state == ExecutionState.NONE + mock_node = MagicMock() + manager.current_node = mock_node + assert manager.current_node == mock_node + assert manager._current_node == mock_node + manager.current_node = None + assert manager.current_node is None + assert manager.path_state 
== ExecutionState.NONE + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.get_curr_generation') + def test_update_generation(self, mock_get_gen): + manager = NPUGraphTreeManager(0) + mock_get_gen.return_value = 5 + manager.update_generation() + assert manager.current_gen == 5 + mock_get_gen.assert_called_once_with() + + @patch('torch_npu.npu._graph_tree.MarkStepBox.mark_step_counter', 3) + def test_get_curr_generation_mark_step(self): + result = NPUGraphTreeManager.get_curr_generation() + assert result == 3 + + @patch('torch_npu.npu._graph_tree.MarkStepBox.mark_step_counter', 0) + @patch('torch_npu.npu._graph_tree.GenerationTracker.generation', 5) + def test_get_curr_generation_generation_tracker(self): + result = NPUGraphTreeManager.get_curr_generation() + assert result == 5 + + @patch('torch_npu.npu._graph_tree.MarkStepBox.mark_step_counter', 3) + def test_user_invoked_mark_step_true(self): + result = NPUGraphTreeManager.user_invoked_mark_step() + assert result is True + + @patch('torch_npu.npu._graph_tree.MarkStepBox.mark_step_counter', 0) + def test_user_invoked_mark_step_false(self): + result = NPUGraphTreeManager.user_invoked_mark_step() + assert result is False + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.in_new_torch_compile_invocation') + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.user_invoked_mark_step') + def test_can_start_new_generation_true_user_mark_step( + self, mock_user_mark_step, mock_in_new_invocation + ): + manager = NPUGraphTreeManager(0) + mock_in_new_invocation.return_value = True + mock_user_mark_step.return_value = True + result = manager.can_start_new_generation + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.in_new_torch_compile_invocation') + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.user_invoked_mark_step') + def test_can_start_new_generation_true_no_pending_backwards( + self, mock_user_mark_step, mock_in_new_invocation + ): + manager = NPUGraphTreeManager(0) + manager.running_forwards_with_pending_backwards = False + mock_in_new_invocation.return_value = True + mock_user_mark_step.return_value = False + result = manager.can_start_new_generation() + assert result is True + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.in_new_torch_compile_invocation') + def test_can_start_new_generation_false_pending_backwards( + self, mock_in_new_invocation + ): + manager = NPUGraphTreeManager(0) + manager.running_forwards_with_pending_backwards = True + mock_in_new_invocation.return_value = True + result = manager.can_start_new_generation() + assert result is False + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.in_new_torch_compile_invocation') + def test_can_start_new_generation_false_not_new_invocation( + self, mock_in_new_invocation + ): + manager = NPUGraphTreeManager(0) + mock_in_new_invocation.return_value = False + result = manager.can_start_new_generation() + assert result is False + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.get_curr_generation') + def test_in_new_torch_compile_invocation_true(self, mock_get_gen): + manager = NPUGraphTreeManager(0) + manager.current_gen = 1 + mock_get_gen.return_value = 2 + result = manager.in_new_torch_compile_invocation() + assert result is True + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.get_curr_generation') + def test_in_new_torch_compile_invocation_false(self, mock_get_gen): + manager = NPUGraphTreeManager(0) + manager.current_gen = 1 + mock_get_gen.return_value = 1 + result = manager.in_new_torch_compile_invocation() + assert 
result is False + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.in_new_torch_compile_invocation') + @patch('warnings.warn') + def test_check_warn_on_unable_to_start_executing_no_warn( + self, mock_warn, mock_in_new_invocation + ): + manager = NPUGraphTreeManager(0) + mock_in_new_invocation.return_value = False + manager.check_warn_on_unable_to_start_executing(FunctionID(1)) + mock_warn.assert_not_called() + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.in_new_torch_compile_invocation') + @patch('warnings.warn') + def test_check_warn_on_unable_to_start_executing_already_warned( + self, mock_warn, mock_in_new_invocation + ): + manager = NPUGraphTreeManager(0) + manager.warned_functions.add(FunctionID(1)) + mock_in_new_invocation.return_value = True + manager.check_warn_on_unable_to_start_executing(FunctionID(1)) + mock_warn.assert_not_called() + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.in_new_torch_compile_invocation') + @patch('warnings.warn') + def test_check_warn_on_unable_to_start_executing_no_repeated_pattern( + self, mock_warn, mock_in_new_invocation + ): + manager = NPUGraphTreeManager(0) + mock_in_new_invocation.return_value = True + + mock_node = MagicMock() + mock_node._path_from_root = [MagicMock()] + mock_node._path_from_root[0].wrapped_function.id = FunctionID(2) + mock_node.wrapped_function.id = FunctionID(1) + manager.current_node = mock_node + manager.check_warn_on_unable_to_start_executing(FunctionID(1)) + mock_warn.assert_not_called() + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.in_new_torch_compile_invocation') + @patch('warnings.warn') + def test_check_warn_on_unable_to_start_executing_warn( + self, mock_warn, mock_in_new_invocation + ): + manager = NPUGraphTreeManager(0) + mock_in_new_invocation.return_value = True + + mock_node1 = MagicMock() + mock_node1.wrapped_function.id = FunctionID(1) + mock_node1.parent = MagicMock() + mock_node1.parent.wrapped_function.id = FunctionID(0) + + mock_node2 = MagicMock() + mock_node2.wrapped_function.id = FunctionID(1) + mock_node2.parent = MagicMock() + mock_node2.parent.wrapped_function.id = FunctionID(0) + + mock_current_node = MagicMock() + mock_current_node.wrapped_function.id = FunctionID(1) + mock_current_node.parent = MagicMock() + mock_current_node.parent.wrapped_function.id = FunctionID(0) + + mock_current_node._path_from_root = [mock_node1, mock_node2] + manager.current_node = mock_current_node + manager.check_warn_on_unable_to_start_executing(FunctionID(1)) + mock_warn.assert_called_once_with( + "Unable to hit fast path of NPUGraphs because of pending, uninvoked backwards. 
" + "Consider running with torch.no_grad() or using torch.compiler.npugraph_mark_step_begin() " + "before each model invocation" + ) + assert FunctionID(1) in manager.warned_functions + + if __name__ == "__main__": run_tests() -- Gitee From 33d2ac43bd5cce0d2e51ab3a99ccb9014c5caec9 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 12 Jun 2025 04:59:21 +0000 Subject: [PATCH 082/328] !21850 Update op_plugin commit id Merge pull request !21850 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index a09c474979..d4d27494a4 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit a09c474979312cf19b8e09e3ec4244d233562f5b +Subproject commit d4d27494a47ab58950d10ea2b84e53e6e63990b1 -- Gitee From 80a73a77d0a57a52c8eb595a6eca128a709a6110 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E5=85=89=E6=B5=A9?= Date: Thu, 12 Jun 2025 08:45:22 +0000 Subject: [PATCH 083/328] =?UTF-8?q?!21856=20fix=20readme=20Merge=20pull=20?= =?UTF-8?q?request=20!21856=20from=20=E9=83=AD=E5=85=89=E6=B5=A9/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.zh.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.zh.md b/README.zh.md index e1efa40b7e..e374608e88 100644 --- a/README.zh.md +++ b/README.zh.md @@ -57,7 +57,7 @@ pip3 install torch-npu==2.7.1rc1 ### 使用源代码进行安装 -某些特殊场景下,用户可能需要自行编译**torch_npu**。可以根据[昇腾辅助软件表](#昇腾辅助软件)和[PyTorch与Python版本配套表](#PyTorch与Python版本配套表)选择合适的分支。推荐使用Docker镜像编译**torch_npu**,可以通过以下步骤获取(建议只挂载工作路径,并避开系统路径,以降低安全风险), 生成的.whl文件路径为./dist/。如果不使用镜像,编译时请注意gcc版本遵循如下约束:ARM架构下推荐使用gcc 10.2版本, X86架构下推荐使用gcc 9.3.1 +某些特殊场景下,用户可能需要自行编译**torch_npu**。可以根据[昇腾辅助软件表](#昇腾辅助软件)和[PyTorch与Python版本配套表](#pytorch与python版本配套表)选择合适的分支。推荐使用Docker镜像编译**torch_npu**,可以通过以下步骤获取(建议只挂载工作路径,并避开系统路径,以降低安全风险), 生成的.whl文件路径为./dist/。如果不使用镜像,编译时请注意gcc版本遵循如下约束:ARM架构下推荐使用gcc 10.2版本, X86架构下推荐使用gcc 9.3.1 1. **克隆torch_npu代码仓** -- Gitee From 069ce6de118bbd451213c42e64d9618c472327f4 Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Thu, 12 Jun 2025 10:06:20 +0000 Subject: [PATCH 084/328] !21826 check cann version to set hcclDeterministic Merge pull request !21826 from SCh-zx/canncheck27 --- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 5bd55eeeb3..67165e43a1 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -26,6 +26,7 @@ #include "third_party/acl/inc/acl/acl_base.h" #include "torch_npu/csrc/aten/CustomFunctions.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" +#include "torch_npu/csrc/core/npu/GetCANNInfo.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/core/NPUBridge.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" @@ -286,7 +287,16 @@ void getHcclCommConfig(HcclCommConfig* config, bool isP2P = false) } // Temporarily adding this logic to set deterministic states to avoid a known issues within HCCL. - config->hcclDeterministic = getDeterministicState() ? 
1 : 0; + static const bool isCannVersionGteBase = []() { + const std::string baseCannversion = "8.2.RC1"; + const std::string baseCannModule = "CANN"; + return IsGteCANNVersion(baseCannversion, baseCannModule); + }(); + if (isCannVersionGteBase) { + config->hcclDeterministic = 0xffffffff; + } else { + config->hcclDeterministic = getDeterministicState() ? 1 : 0; + } // Compatible with the size check of the old version of HCCL, forcibly convert // the config object to a size_t=32 object, and retain the N ± 2 version -- Gitee From 72e07bfd63ada2a69ed68b966669ed5453985de7 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 12 Jun 2025 10:59:21 +0000 Subject: [PATCH 085/328] !21859 Update op_plugin commit id Merge pull request !21859 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index d4d27494a4..8087b5f966 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit d4d27494a47ab58950d10ea2b84e53e6e63990b1 +Subproject commit 8087b5f9667613c52805227d9ed6d49dc9e3fe29 -- Gitee From 1fd7b97678290ed6311778ca8f67364bde64d120 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 12 Jun 2025 14:14:19 +0000 Subject: [PATCH 086/328] !21875 Update op_plugin commit id Merge pull request !21875 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 8087b5f966..2fcffe1d2e 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 8087b5f9667613c52805227d9ed6d49dc9e3fe29 +Subproject commit 2fcffe1d2eae9b5beecb7f8c391b5777eff1ab98 -- Gitee From b000daf001e4ff73d01c781a188dbebe88a01e21 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Fri, 13 Jun 2025 03:33:14 +0000 Subject: [PATCH 087/328] !21835 Update torchair commit id Merge pull request !21835 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index c6b1e42194..2899be78f6 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit c6b1e42194b3b303b582e2496ba088803c547aef +Subproject commit 2899be78f642f0c6beab38bb4285ed87472a672b -- Gitee From 542cd1c9caf3b0be47e361b6262cd8b1beac3efe Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Fri, 13 Jun 2025 06:55:47 +0000 Subject: [PATCH 088/328] !21867 update hccl.h & hccl_type.h Merge pull request !21867 from SCh-zx/hccl27 --- third_party/hccl/inc/hccl/hccl.h | 2 +- third_party/hccl/inc/hccl/hccl_types.h | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/third_party/hccl/inc/hccl/hccl.h b/third_party/hccl/inc/hccl/hccl.h index 023914a348..c008a70f85 100644 --- a/third_party/hccl/inc/hccl/hccl.h +++ b/third_party/hccl/inc/hccl/hccl.h @@ -206,7 +206,7 @@ inline void HcclCommConfigInit(HcclCommConfig *config) info->reserved = 0; config->hcclBufferSize = HCCL_COMM_DEFAULT_BUFFSIZE; - config->hcclDeterministic = HCCL_COMM_DEFAULT_DETERMINISTIC; + config->hcclDeterministic = HCCL_COMM_DETERMINISTIC_CONFIG_NOT_SET; config->hcclCommName[0] = '\0'; config->hcclUdi[0] = '\0'; config->hcclRdmaTrafficClass = HCCL_COMM_TRAFFIC_CLASS_CONFIG_NOT_SET; diff --git a/third_party/hccl/inc/hccl/hccl_types.h b/third_party/hccl/inc/hccl/hccl_types.h index 40631676c1..65da53729b 100644 --- 
a/third_party/hccl/inc/hccl/hccl_types.h +++ b/third_party/hccl/inc/hccl/hccl_types.h @@ -13,16 +13,20 @@ extern "C" { #endif // __cplusplus +const uint32_t HCCL_ROOT_INFO_BYTES = 4108; // 4108: root info length +const uint32_t COMM_NAME_MAX_LENGTH = 128; // group name max length +const uint32_t UDI_MAX_LENGTH = 128; // UDI max length + const uint32_t HCCL_COMM_CONFIG_INFO_BYTES = 24; const uint32_t HCCL_COMM_CONFIG_MAGIC_WORD = 0xf0f0f0f0; const uint32_t HCCL_COMM_CONFIG_VERSION = 5; -const uint32_t HCCL_COMM_DEFAULT_BUFFSIZE = 200; // 200MB buffer size -const uint32_t HCCL_COMM_DEFAULT_DETERMINISTIC = 0; // Disable deterministic calculations -const uint32_t COMM_NAME_MAX_LENGTH = 128; -const uint32_t UDI_MAX_LENGTH = 128; +const uint32_t HCCL_COMM_DEFAULT_BUFFSIZE = 200; +const uint32_t HCCL_COMM_BUFFSIZE_CONFIG_NOT_SET = 0xffffffff; +const uint32_t HCCL_COMM_DETERMINISTIC_CONFIG_NOT_SET = 0xffffffff; +const uint32_t HCCL_COMM_DEFAULT_OP_EXPANSION_MODE = 0; +// 0xffffffff表示用户未配置TC或SL const uint32_t HCCL_COMM_TRAFFIC_CLASS_CONFIG_NOT_SET = 0xffffffff; const uint32_t HCCL_COMM_SERVICE_LEVEL_CONFIG_NOT_SET = 0xffffffff; -const uint32_t HCCL_COMM_DEFAULT_OP_EXPANSION_MODE = 0; /** * @brief HCCL functions return value definition @@ -88,8 +92,6 @@ typedef enum { HCCL_DATA_TYPE_RESERVED /**< reserved */ } HcclDataType; -const uint32_t HCCL_ROOT_INFO_BYTES = 4108; // 4108: root info length - /** * @brief HCCL root info */ -- Gitee From 6c23dd8bb95591e9df1853c4e834bf049044c64e Mon Sep 17 00:00:00 2001 From: hhz886 Date: Fri, 13 Jun 2025 07:14:41 +0000 Subject: [PATCH 089/328] !21882 log fix Merge pull request !21882 from hhz886/h71 --- torch_npu/profiler/analysis/prof_common_func/_log.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_common_func/_log.py b/torch_npu/profiler/analysis/prof_common_func/_log.py index 0bf0acad2b..15ba7a80f9 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_log.py +++ b/torch_npu/profiler/analysis/prof_common_func/_log.py @@ -34,7 +34,6 @@ class ProfilerLogger: BACKUP_COUNT = 3 # logger instance _instance = None - _pid = None @classmethod def get_instance(cls) -> logging.Logger: @@ -55,9 +54,7 @@ class ProfilerLogger: RuntimeError: If logger initialization fails """ if cls._instance is not None: - if cls._pid == os.getpid(): - return - cls.destroy() + return # Create logs directory log_dir = os.path.join(output_dir, cls.DEFAULT_LOG_DIR) @@ -92,7 +89,6 @@ class ProfilerLogger: logger.addHandler(file_handler) cls._instance = logger - cls._pid = os.getpid() logger.info("Profiler logger initialized at: %s", log_file) @classmethod -- Gitee From 0c8e0f6dcc693b14c85f34901829ce39a3c3ce57 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 13 Jun 2025 08:59:22 +0000 Subject: [PATCH 090/328] !21893 Update op_plugin commit id Merge pull request !21893 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 2fcffe1d2e..ef203d9c28 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 2fcffe1d2eae9b5beecb7f8c391b5777eff1ab98 +Subproject commit ef203d9c2850866b59dd4e182a767115c359c136 -- Gitee From 9b892642b628c2e18581f1919b7cd4d86b4e1405 Mon Sep 17 00:00:00 2001 From: shaoyf Date: Fri, 13 Jun 2025 13:25:50 +0000 Subject: [PATCH 091/328] =?UTF-8?q?!21906=20=E5=9B=9E=E9=80=80=20'Pull=20R?= =?UTF-8?q?equest=20!21867=20:=20update=20hccl.h=20&=20hccl=5Ftype.h'?= 
=?UTF-8?q?=20Merge=20pull=20request=20!21906=20from=20shaoyf/revert-merge?= =?UTF-8?q?-21867-v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- third_party/hccl/inc/hccl/hccl.h | 2 +- third_party/hccl/inc/hccl/hccl_types.h | 16 +++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/third_party/hccl/inc/hccl/hccl.h b/third_party/hccl/inc/hccl/hccl.h index c008a70f85..023914a348 100644 --- a/third_party/hccl/inc/hccl/hccl.h +++ b/third_party/hccl/inc/hccl/hccl.h @@ -206,7 +206,7 @@ inline void HcclCommConfigInit(HcclCommConfig *config) info->reserved = 0; config->hcclBufferSize = HCCL_COMM_DEFAULT_BUFFSIZE; - config->hcclDeterministic = HCCL_COMM_DETERMINISTIC_CONFIG_NOT_SET; + config->hcclDeterministic = HCCL_COMM_DEFAULT_DETERMINISTIC; config->hcclCommName[0] = '\0'; config->hcclUdi[0] = '\0'; config->hcclRdmaTrafficClass = HCCL_COMM_TRAFFIC_CLASS_CONFIG_NOT_SET; diff --git a/third_party/hccl/inc/hccl/hccl_types.h b/third_party/hccl/inc/hccl/hccl_types.h index 65da53729b..40631676c1 100644 --- a/third_party/hccl/inc/hccl/hccl_types.h +++ b/third_party/hccl/inc/hccl/hccl_types.h @@ -13,20 +13,16 @@ extern "C" { #endif // __cplusplus -const uint32_t HCCL_ROOT_INFO_BYTES = 4108; // 4108: root info length -const uint32_t COMM_NAME_MAX_LENGTH = 128; // group name max length -const uint32_t UDI_MAX_LENGTH = 128; // UDI max length - const uint32_t HCCL_COMM_CONFIG_INFO_BYTES = 24; const uint32_t HCCL_COMM_CONFIG_MAGIC_WORD = 0xf0f0f0f0; const uint32_t HCCL_COMM_CONFIG_VERSION = 5; -const uint32_t HCCL_COMM_DEFAULT_BUFFSIZE = 200; -const uint32_t HCCL_COMM_BUFFSIZE_CONFIG_NOT_SET = 0xffffffff; -const uint32_t HCCL_COMM_DETERMINISTIC_CONFIG_NOT_SET = 0xffffffff; -const uint32_t HCCL_COMM_DEFAULT_OP_EXPANSION_MODE = 0; -// 0xffffffff表示用户未配置TC或SL +const uint32_t HCCL_COMM_DEFAULT_BUFFSIZE = 200; // 200MB buffer size +const uint32_t HCCL_COMM_DEFAULT_DETERMINISTIC = 0; // Disable deterministic calculations +const uint32_t COMM_NAME_MAX_LENGTH = 128; +const uint32_t UDI_MAX_LENGTH = 128; const uint32_t HCCL_COMM_TRAFFIC_CLASS_CONFIG_NOT_SET = 0xffffffff; const uint32_t HCCL_COMM_SERVICE_LEVEL_CONFIG_NOT_SET = 0xffffffff; +const uint32_t HCCL_COMM_DEFAULT_OP_EXPANSION_MODE = 0; /** * @brief HCCL functions return value definition @@ -92,6 +88,8 @@ typedef enum { HCCL_DATA_TYPE_RESERVED /**< reserved */ } HcclDataType; +const uint32_t HCCL_ROOT_INFO_BYTES = 4108; // 4108: root info length + /** * @brief HCCL root info */ -- Gitee From 80614b6a0e6f9830d1040dc6c4201921d6b7c72a Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Sat, 14 Jun 2025 10:05:13 +0000 Subject: [PATCH 092/328] !21900 add ut for flight recorder Merge pull request !21900 from huangyunlong/2.7ft8 --- test/distributed/test_flight_recorder.py | 780 ++++++++++++++++++++++- 1 file changed, 774 insertions(+), 6 deletions(-) diff --git a/test/distributed/test_flight_recorder.py b/test/distributed/test_flight_recorder.py index a2cb58241a..5f16b7a0ac 100644 --- a/test/distributed/test_flight_recorder.py +++ b/test/distributed/test_flight_recorder.py @@ -1,15 +1,17 @@ import os import json import pickle +import sys import tempfile +import threading import time from datetime import datetime, timedelta -from unittest import mock +from unittest import mock, skipIf import torch import torch.distributed as c10d import torch.distributed as dist -from torch.testing._internal.common_distributed import MultiProcessTestCase +from torch.testing._internal.common_distributed import 
MultiProcessTestCase, TEST_SKIPS from torch.testing._internal.common_utils import instantiate_parametrized_tests, parametrize, run_tests import torch_npu @@ -43,7 +45,7 @@ class HCCLTraceTestBase(MultiProcessTestCase): @property def local_device(self): - return torch.device("npu", self.rank_to_GPU[self.rank][0]) + return torch.device("npu", self.rank_to_NPU[self.rank][0]) def _join_processes(self, fn): # We need to patch sys.exit() as skip_if will use sys.exit() and @@ -88,8 +90,8 @@ class HCCLTraceTestBase(MultiProcessTestCase): return 2 @property - def rank_to_GPU(self): - # return rank to GPU map + def rank_to_NPU(self): + # return rank to NPU map return {i: [i] for i in range(self.world_size)} def _trace_basename(self): @@ -144,6 +146,7 @@ class HCCLTraceTest(HCCLTraceTestBase): self.assertEqual(last["output_sizes"], ((3, 4),)) self.assertEqual(last["output_dtypes"], ["Float"]) self.assertEqual(last["collective_seq_id"], 2) + # HCCL_EXEC_TIMEOUT will impact watchdog timeout self.assertEqual(last["timeout_ms"], 3600000) now = datetime.now() event_created_time = datetime.fromtimestamp( @@ -152,7 +155,7 @@ class HCCLTraceTest(HCCLTraceTestBase): before_test = now - timedelta(minutes=1) self.assertTrue(before_test < event_created_time < now) if timing_enabled: - # very loose bounds, measured 0.036 ms on devgpu + # very loose bounds, measured 0.036 ms on devnpu self.assertTrue(0 < last["duration_ms"] < 100) else: self.assertTrue("duration_ms" not in last) @@ -212,10 +215,775 @@ class HCCLTraceTest(HCCLTraceTestBase): ) dist.destroy_process_group() + def test_dump_pipe(self): + if self.rank != self.MAIN_PROCESS_RANK: + # now we need start heartbeatmonitor thread manually + os.environ["TORCH_HCCL_ENABLE_MONITORING"] = "1" + # makesure dump_pipe not heartbeat dump + os.unsetenv("TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC") + def open_file_with_timeout(file_path, mode, timeout=1.0): + start_time = time.time() + while time.time() - start_time < timeout: + if os.path.exists(file_path): + return open(file_path, mode) + time.sleep(0.1) + raise FileNotFoundError + + if self.rank == self.MAIN_PROCESS_RANK: + for c in self.children_pipes: + self.assertEqual(c.recv(), "next") + + dump_file = self._trace_name(rank=0) + pipe_file = dump_file + ".pipe" + with open_file_with_timeout(pipe_file, "w") as f: + f.write("1\n") + with open_file_with_timeout(dump_file, "rb", timeout=10.0) as f: + # does not support profiling, so we use test_dump_pipe instead of all_reduce + self.assertTrue("test_dump_pipe" in str(pickle.load(f))) + + for c in self.children_pipes: + c.send("next") + return + + pg = self._create_process_group_hccl() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + for _ in range(2): + f = pg.allreduce(a) + f.wait() + torch.npu.synchronize(device=device) + self.parent.send("next") + self.parent.recv() + + def test_long(self): + os.environ["TORCH_HCCL_TRACE_BUFFER_SIZE"] = "10" + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_hccl() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + for _ in range(2): + # test some other primitives to make sure + # their strings are valid + xs = [torch.ones(3, 4, device=device)] + pg.broadcast(xs).wait() + pg.allreduce(xs).wait() + pg.reduce(xs).wait() + ys = [[torch.empty(3, 4, device=device) for _ in range(self.world_size)]] + pg.allgather(ys, xs).wait() + pg.reduce_scatter(xs, ys).wait() + f = pg.allreduce(a) + f.wait() + torch.npu.synchronize(device=device) + t 
= pickle.loads(torch_npu._C._distributed_c10d._dump_hccl_trace()) + t = t["entries"] + self.assertEqual(len(t), 10) + first = t[0] + last = t[-1] + # profiling is not supported + self.assertEqual(last["profiling_name"], "") + self.assertEqual(last["state"], "completed") + self.assertIn("test_flight_recorder.py", str(last["frames"])) + self.assertEqual(last["input_sizes"], ((3, 4),)) + self.assertEqual(last["input_dtypes"], ["Float"]) + self.assertEqual(last["output_sizes"], ((3, 4),)) + self.assertEqual(last["output_dtypes"], ["Float"]) + # timeout_ms adapt to npu + self.assertEqual(last["timeout_ms"], 3600000) + self.assertEqual(last["collective_seq_id"] - first["collective_seq_id"], 9) + dist.destroy_process_group() + + @skipIf(True, "profiling is not supported") + def test_barrier_profiling(self): + os.environ["TORCH_HCCL_TRACE_BUFFER_SIZE"] = "10" + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_hccl() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + f = pg.barrier() + f = pg.allreduce(a) + f.wait() + torch.npu.synchronize(device=device) + t = pickle.loads(torch_npu._C._distributed_c10d._dump_hccl_trace()) + t = t["entries"] + self.assertEqual(len(t), 2) + first = t[0] + last = t[-1] + self.assertEqual(first["profiling_name"], "hccl:all_reduce_barrier") + self.assertEqual(last["profiling_name"], "hccl:all_reduce") + dist.destroy_process_group() + + def test_trace_while_all_works_retired(self): + os.environ["TORCH_HCCL_TRACE_BUFFER_SIZE"] = "10" + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_hccl() + device = self.local_device + # send more works than the buffer size to overwrite the previous entry + for _ in range(12): + a = [torch.ones(3, 4, device=device)] + pg.broadcast(a).wait() + torch.npu.synchronize(device=device) + + # wait for all works to be retired, we use sleep instead of pg._wait_for_pending_works() + time.sleep(30) + t = pickle.loads(torch_npu._C._distributed_c10d._dump_hccl_trace()) + t = t["entries"] + self.assertEqual(len(t), 10) + last = t[-1] + self.assertEqual(last["retired"], True) + self.assertEqual(last["state"], "completed") + + # timing_enabled is not supported + @parametrize("timing_enabled", [False]) + @parametrize("only_active", [True, False]) + def test_trace_while_active(self, timing_enabled, only_active): + if self.rank == self.MAIN_PROCESS_RANK: + for c in self.children_pipes: + self.assertEqual(c.recv(), "next") + for c in self.children_pipes: + c.send("next") + return + + pg = self._create_process_group_hccl() + if timing_enabled: + pg._enable_collectives_timing() + device = self.local_device + with torch.npu.device(device): + a = torch.full((3, 4), float(self.rank), device=device) + + pg.allreduce(a).wait() + e = torch.npu.Event() + e.record() + if self.rank != 0: + pg.allreduce(a).wait() + e.synchronize() + t = pickle.loads( + torch_npu._C._distributed_c10d._dump_hccl_trace(onlyActive=only_active) + ) + t = t["entries"] + if only_active: + if self.rank == 0: + self.assertEqual(len(t), 0) + else: + self.assertEqual(len(t), 1) + if not only_active: + if self.rank == 0: + self.assertEqual(t[-1]["profiling_name"], "") + self.assertEqual(t[-1]["collective_seq_id"], 1) + self.assertEqual(t[-1]["state"], "completed") + else: + self.assertEqual(t[-1]["profiling_name"], "") + self.assertEqual(t[-1]["collective_seq_id"], 2) + self.assertEqual( + t[-1]["state"], self.started_or_scheduled(timing_enabled) + ) + + self.parent.send("next") + 
self.assertEqual("next", self.parent.recv()) + if self.rank == 0: + pg.allreduce(a).wait() + torch.npu.synchronize(device=device) + + @parametrize("timing_enabled", [False]) + def test_trace_while_stuck(self, timing_enabled): + if self.rank == self.MAIN_PROCESS_RANK: + for c in self.children_pipes: + self.assertEqual(c.recv(), "next") + for c in self.children_pipes: + c.send("next") + return + + pg = self._create_process_group_hccl() + if timing_enabled: + pg._enable_collectives_timing() + + device = self.local_device + with torch.npu.device(device): + a = torch.full((3, 4), float(self.rank), device=device) + + pg.allreduce(a).wait() + e = torch.npu.Event() + e.record() + + def gather_trace(): + e.synchronize() + # give the other thread some time to fill the npu buffer + time.sleep(5) + t = pickle.loads(torch_npu._C._distributed_c10d._dump_hccl_trace()) + t = t["entries"] + self.assertEqual(t[-1]["profiling_name"], "") + if self.rank == 0: + self.assertEqual(t[-1]["collective_seq_id"], 1) + self.assertEqual(t[-1]["state"], "completed") + else: + self.assertEqual(t[-1]["collective_seq_id"], 2) + self.assertEqual( + t[-1]["state"], self.started_or_scheduled(timing_enabled) + ) + self.assertIsNone(t[-1]["time_discovered_completed_ns"]) + # this will eventually cause the missing rank 0 + # to continue which will unblock the non-zero ranks + self.parent.send("next") + + if self.rank != 0: + pg.allreduce(a).wait() + th = threading.Thread(target=gather_trace) + th.start() + # fill the npu buffer, at around 1024 events + # this will stall + for _ in range(2000): + a = a + a + th.join() + else: + gather_trace() + + self.assertEqual("next", self.parent.recv()) + if self.rank == 0: + pg.allreduce(a).wait() + torch.npu.synchronize(device=device) + + @skipIf(True, "send_recv is not supported") + @parametrize( + "op_sizes_per_coalesce", + [ + [(2, 3)], + [(2, 3), (5, 5), (1,)], + ], + ) + @parametrize("timing_enabled", [True, False]) + def test_batched_send_recv(self, op_sizes_per_coalesce, timing_enabled): + """ + 'WorkEnqueue' was skipped for isendirecv, leading to segfault on dump_entries when update_state tried to use + a destructed Work obj's npu events + """ + + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_hccl() + if timing_enabled: + pg._enable_collectives_timing() + + num_coalesced_ops = 20 + ops_per_coalesce = len(op_sizes_per_coalesce) + for _ in range(num_coalesced_ops): + ops = [] + for input_sizes in op_sizes_per_coalesce: + tensor = torch.zeros(input_sizes).to(self.local_device) + if self.rank == 0: + ops.append(dist.P2POp(dist.irecv, tensor, 1)) + elif self.rank == 1: + tensor *= 2 + ops.append(dist.P2POp(dist.isend, tensor, 0)) + + dist.batch_isend_irecv(ops).pop().wait() + + torch.npu.synchronize(device=self.local_device) + + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch_npu._C._distributed_c10d._dump_hccl_trace()) + self.assertEqual(len(t["entries"]), num_coalesced_ops * (ops_per_coalesce + 1)) + + expected_record_id = 0 + expected_seq = 1 + expected_op_id = 1 + for seq in range(num_coalesced_ops): + first_op = seq * (ops_per_coalesce + 1) + coalesced_op = first_op + ops_per_coalesce + for p2p_op_idx, input_sizes in zip( + range(first_op, coalesced_op, 1), op_sizes_per_coalesce + ): + # the indivudal ops inside the coalescing group the individual op metadata, + # but not the timing info coming from the actual coalesced kernel + profiling_name = ( + "hccl:recv 0<-1" if 
self.rank == 0 else "hccl:send 1->0" + ) + self.assertEqual( + t["entries"][p2p_op_idx]["record_id"], expected_record_id + ) + expected_record_id += 1 + self.assertEqual( + t["entries"][p2p_op_idx]["profiling_name"], profiling_name + ) + # we don't increment collective_seq_id for p2p ops. + self.assertEqual(t["entries"][p2p_op_idx]["collective_seq_id"], 0) + self.assertEqual(t["entries"][p2p_op_idx]["p2p_seq_id"], expected_seq) + self.assertEqual(t["entries"][p2p_op_idx]["op_id"], expected_op_id) + expected_op_id += 1 + self.assertEqual(t["entries"][p2p_op_idx]["input_sizes"], [input_sizes]) + self.assertEqual( + t["entries"][p2p_op_idx]["output_sizes"], [input_sizes] + ) + # duration doesn't get tagged onto individual ops yet, nor is their state updated + self.assertEqual(t["entries"][p2p_op_idx]["state"], "scheduled") + self.assertTrue("duration_ms" not in t["entries"][p2p_op_idx]) + + # the coalesced op has no metadata but indicates that coalescing was used, + # and accurately reflects the timing and state info for the whole group + self.assertEqual( + t["entries"][coalesced_op]["record_id"], expected_record_id + ) + expected_record_id += 1 + self.assertEqual( + t["entries"][coalesced_op]["profiling_name"], "hccl:coalesced" + ) + self.assertEqual(t["entries"][coalesced_op]["p2p_seq_id"], expected_seq) + expected_seq += 1 + self.assertEqual(t["entries"][coalesced_op]["state"], "completed") + self.assertEqual(t["entries"][coalesced_op]["input_sizes"], []) + self.assertEqual(t["entries"][coalesced_op]["output_sizes"], []) + if timing_enabled: + duration = t["entries"][coalesced_op]["duration_ms"] + self.assertTrue(0.001 < duration < 10000, duration) + else: + self.assertTrue("duration_ms" not in t["entries"][coalesced_op]) + self.assertEqual(t["entries"][coalesced_op]["timeout_ms"], 600000) + + @skipIf(True, "send_recv is not supported") + @parametrize( + "op_sizes", + [ + [(2, 3)], + [(2, 3), (5, 5), (1,)], + ], + ) + @parametrize("timing_enabled", [True, False]) + def test_individual_send_recv(self, op_sizes, timing_enabled): + """ + 'WorkEnqueue' was skipped for isendirecv, leading to segfault on dump_entries when update_state tried to use + a destructed Work obj's npu events + """ + + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_hccl() + if timing_enabled: + pg._enable_collectives_timing() + num_repeats = 10 + ops_per_repeat = len(op_sizes) + for _ in range(num_repeats): + for input_sizes in op_sizes: + tensor = torch.zeros(input_sizes).to(self.local_device) + if self.rank == 0: + dist.recv(tensor, 1) + elif self.rank == 1: + tensor *= 2 + dist.send(tensor, 0) + + torch.npu.synchronize(device=self.local_device) + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch_npu._C._distributed_c10d._dump_hccl_trace()) + self.assertEqual(len(t["entries"]), num_repeats * (ops_per_repeat)) + expected_seq = 1 + expected_op_id = 1 + for seq in range(num_repeats * ops_per_repeat): + input_sizes = op_sizes[seq % ops_per_repeat] + profiling_name = "hccl:recv 0<-1" if self.rank == 0 else "hccl:send 1->0" + self.assertEqual(t["entries"][seq]["profiling_name"], profiling_name) + # we don't increment collective_seq_id for p2p ops. 
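+            # (p2p ops are tracked by p2p_seq_id and op_id instead, as the next assertions check)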
+ self.assertEqual(t["entries"][seq]["collective_seq_id"], 0) + self.assertEqual(t["entries"][seq]["p2p_seq_id"], expected_seq) + expected_seq += 1 + self.assertEqual(t["entries"][seq]["op_id"], expected_op_id) + expected_op_id += 1 + self.assertEqual(t["entries"][seq]["input_sizes"], [input_sizes]) + self.assertEqual(t["entries"][seq]["output_sizes"], [input_sizes]) + self.assertEqual(t["entries"][seq]["state"], "completed") + + if timing_enabled: + duration = t["entries"][seq]["duration_ms"] + self.assertTrue(0.001 < duration < 10000, duration) + else: + self.assertTrue("duration_ms" not in t["entries"][seq]) + + @skipIf(True, "coalescing_manager is not supported") + @parametrize("timing_enabled", [True, False]) + def test_coalescing_manager_collective(self, timing_enabled): + """ + The coalescing manager api works by accumulating operations in python via a contextmanager, and then making + one call into c++ to an _coalesced API. It has limited support for ops and has been added recently to + avoid overheads of making individual py-cpp calls. This complicates flight recording.. + + For now, flight recording of coalescing_manager collectives is less detailed than cpp coalesced collectives. + """ + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_hccl() + if timing_enabled: + pg._enable_collectives_timing() + + output_tensors = torch.zeros(2, 2).to(self.rank) + input_tensors = [torch.ones(2, 2).to(self.rank) for _ in range(self.world_size)] + + # TODO(whc) make this work with bigger world or something + self.assertEqual(self.world_size, 2, self.world_size) + + with dist._coalescing_manager(): + for i in range(self.world_size): + dist.reduce_scatter_tensor(output_tensors[i], input_tensors[i]) + self.assertEqual(output_tensors, input_tensors[self.rank] * self.world_size) + + torch.npu.synchronize(device=self.rank) + + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch_npu._C._distributed_c10d._dump_hccl_trace()) + + self.assertEqual( + len(t["entries"]), 1 + ) # one for the reduce_scatter_tensor_coalesced + self.assertEqual( + t["entries"][0]["profiling_name"], "hccl:reduce_scatter_tensor_coalesced" + ) + # collective_seq_id should be incremented once. 
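+        # the whole coalesced group is recorded as a single flight-recorder entry, hence one increment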
+ self.assertEqual(t["entries"][0]["collective_seq_id"], 1) + self.assertEqual(t["entries"][0]["input_sizes"], [[2, 2], [2, 2]]) + self.assertEqual( + t["entries"][0]["output_sizes"], + [ + [ + 2, + ], + [ + 2, + ], + ], + ) + self.assertEqual(t["entries"][0]["state"], "completed") + if timing_enabled: + duration = t["entries"][0]["duration_ms"] + self.assertTrue(0.001 < duration < 10000, duration) + else: + self.assertTrue("duration_ms" not in t["entries"][0]) + + +def check_if_test_is_skipped(fn): + def wrapper(self, *args, **kwargs): + for skip in TEST_SKIPS.values(): + if self.processes[0].exitcode == skip.exit_code: + return MultiProcessTestCase._check_return_codes(self, *args, **kwargs) + return fn(self, *args, **kwargs) + + return wrapper + + +class HCCLTraceTestDumpOnTimeoutBase(HCCLTraceTestBase): + timeout_sec = 60 + + def _create_process_group_hccl(self): + store = dist.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + "hccl", + world_size=self.world_size, + rank=self.rank, + store=store, + timeout=timedelta(seconds=HCCLTraceTestDumpOnTimeoutBase.timeout_sec), + ) + pg = c10d.distributed_c10d._get_default_group() + return pg + + @check_if_test_is_skipped + def _check_return_codes(self, elapsed_time): + # the base test infra assumes processes exit with matching return codes, + # but we want rank0 to abort and rank1 to exit cleanly in this test + self.assertEqual(self.processes[0].exitcode, -6) + self.assertEqual(self.processes[1].exitcode, 0) + + def _wait_process(self, rank, timeout): + try: + self.processes[rank].join(timeout) + return self.processes[rank].exitcode + except TimeoutError: + return None + + +class HCCLTraceTestDumpOnTimeout(HCCLTraceTestDumpOnTimeoutBase): + @parametrize("timing_enabled", [False]) + def test_timeout_dumps(self, timing_enabled): + if self.rank != self.MAIN_PROCESS_RANK: + # dump on heartbeatmonitor thread + os.environ["TORCH_HCCL_ENABLE_MONITORING"] = "1" + # need rank0 to crash before looking for its output file + os.environ["TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC"] = "60" + + if self.rank == self.MAIN_PROCESS_RANK: + # wait for rank0 to crash before looking for its output file + # we rely on rank0 holding off its abort long enough to dump the debug info + self.assertEqual(self._wait_process(0, timeout=180), -6) + with open(self._trace_name(rank=0), "rb") as f: + t = pickle.load(f) + t = t["entries"] + self.assertEqual(len(t), 2) + self.assertEqual(t[0]["collective_seq_id"], 1) + self.assertEqual(t[0]["state"], "completed") + self.assertEqual(t[1]["collective_seq_id"], 2) + self.assertEqual( + t[1]["state"], self.started_or_scheduled(timing_enabled) + ) + + self.assertFalse(os.path.exists(self._trace_name(rank=1))) + + return + + pg = self._create_process_group_hccl() + if timing_enabled: + # we force disabled timing in setup, since there is no 'disable' function + pg._enable_collectives_timing() + + device = self.local_device + with torch.npu.device(device): + a = torch.full((3, 4), float(self.rank), device=device) + + pg.allreduce(a).wait() + if self.rank == 0: + pg.allreduce(a).wait() + + # rank 0 will crash before it passes the sync, but rank1 will exit quickly and cleanly + torch.npu.synchronize(device=device) + + +instantiate_parametrized_tests(HCCLTraceTestDumpOnTimeout) instantiate_parametrized_tests(HCCLTraceTest) +class HCCLTraceTestTimeoutDumpOnStuckRanks(HCCLTraceTestDumpOnTimeoutBase): + @check_if_test_is_skipped + def _check_return_codes(self, elapsed_time): + # the base test infra assumes processes exit with 
matching return codes, + # but we want rank0 to abort and rank1 to exit cleanly in this test + self.assertEqual(self.processes[0].exitcode, -6) + self.assertEqual(self.processes[1].exitcode, 0) + + def test_timeout_dumps_on_stuck_ranks(self): + if self.rank != self.MAIN_PROCESS_RANK: + # now we need start heartbeatmonitor thread manually + os.environ["TORCH_HCCL_ENABLE_MONITORING"] = "1" + # need rank0 to crash quicker after detecting timeout + os.environ["TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC"] = "60" + # restore this env var to its prior default in case another test changed it + os.environ["TORCH_HCCL_COORD_CHECK_MILSEC"] = "1000" + + if self.rank == self.MAIN_PROCESS_RANK: + # wait for both rank0 and 1 to crash before looking for both ranks' output + # file, and we rely on rank1 to sleep long enough to dump the debug info. + self.assertEqual(self._wait_process(0, timeout=180), -6) + self.assertEqual(self._wait_process(1, timeout=180), 0) + self.assertTrue(os.path.exists(self._trace_name(rank=1))) + self.assertTrue(os.path.exists(self._trace_name(rank=0))) + with open(self._trace_name(rank=0), "rb") as f: + t = pickle.load(f) + t = t["entries"] + self.assertEqual(len(t), 2) + with open(self._trace_name(rank=1), "rb") as f: + t = pickle.load(f) + t = t["entries"] + self.assertEqual(len(t), 1) + self.assertEqual(t[0]["collective_seq_id"], 1) + self.assertEqual(t[0]["state"], "completed") + return + + pg = self._create_process_group_hccl() + device = self.local_device + with torch.npu.device(device): + a = torch.full((3, 4), float(self.rank), device=device) + + pg.allreduce(a).wait() + if self.rank == 0: + pg.allreduce(a).wait() + + # rank 0 will get stuck, timeout and then signal a timeout to all ranks. + torch.npu.synchronize(device=device) + + if self.rank == 1: + # Force rank 1 to sleep 120s so that it will eventually exit as well after + # getting the global signal to dump the debugging info(won't break). 
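+                # 120s comfortably exceeds the 60s TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC set above,
+                # so rank 1 stays alive long enough to receive the dump signal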
+ time.sleep(120) + + +class HcclErrorDumpTest(HCCLTraceTestBase): + def _wait_process(self, rank, timeout): + try: + self.processes[rank].join(timeout) + return self.processes[rank].exitcode + except TimeoutError: + return None + + @check_if_test_is_skipped + def _check_return_codes(self, elapsed_time): + # the base test infra assumes processes exit with matching return codes, + # but we want rank0 to abort with exception and rank1 to exit with exit 1 + self.assertEqual(self.processes[0].exitcode, -6) + self.assertEqual(self.processes[1].exitcode, 1) + + def test_hccl_errors_dump(self): + if self.rank != self.MAIN_PROCESS_RANK: + # now we need start heartbeatmonitor thread manually + os.environ["TORCH_HCCL_ENABLE_MONITORING"] = "1" + os.environ["TORCH_HCCL_ASYNC_ERROR_HANDLING"] = "1" + os.environ["TORCH_HCCL_TRACE_BUFFER_SIZE"] = "1000" + os.environ["TORCH_HCCL_DUMP_ON_TIMEOUT"] = "1" + # need rank0 to dump before abort and we update it to 30 to avoid heratbeat dump + os.environ["TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC"] = "30" + + if self.rank == self.MAIN_PROCESS_RANK: + # wait for both rank0 and 1 to crash before looking for dump + self.assertEqual(self._wait_process(0, timeout=90), -6) + self.assertEqual(self._wait_process(1, timeout=90), 1) + # verify that the trace file exists for rank0 + self.assertTrue(os.path.exists(self._trace_name(rank=0))) + return + + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + "hccl", + world_size=self.world_size, + rank=self.rank, + store=store, + timeout=timedelta(seconds=10), + ) + process_group = c10d.distributed_c10d._get_default_group() + process_group.allreduce(torch.rand(10).npu(self.rank)) + if self.rank == 0: + work = process_group.allreduce(torch.rand(10).npu(self.rank)) + # expect an error to be raised + with self.assertRaisesRegex(dist.DistBackendError, ""): + # Block the current stream on the HCCL stream + work.wait() + # Run some NPU operations + a = torch.rand(10).npu(self.rank) + elif self.rank == 1: + # Clean up structures (ex: files for FileStore before going down) + del process_group + sys.exit(1) + + +class HcclHeartbeatDumpTest(HCCLTraceTestBase): + def _wait_process(self, rank, timeout): + try: + self.processes[rank].join(timeout) + return self.processes[rank].exitcode + except TimeoutError: + return None + + def test_hccl_heartbeat_dump(self): + if self.rank != self.MAIN_PROCESS_RANK: + # dump on heartbeatmonitor thread + os.environ["TORCH_HCCL_ENABLE_MONITORING"] = "1" + os.environ["TORCH_HCCL_ASYNC_ERROR_HANDLING"] = "1" + os.environ["TORCH_HCCL_TRACE_BUFFER_SIZE"] = "1000" + os.environ["TORCH_HCCL_DUMP_ON_TIMEOUT"] = "1" + # need rank0 to dump + os.environ["TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC"] = "10" + + if self.rank == self.MAIN_PROCESS_RANK: + # wait for both rank0 and 1 to finish + self.assertEqual(self._wait_process(0, timeout=90), 0) + self.assertEqual(self._wait_process(1, timeout=90), 0) + # verify that the trace file exists for rank0 + self.assertTrue(os.path.exists(self._trace_name(rank=0))) + with open(self._trace_name(rank=0) + "_py_traceback", "r") as f: + self.assertTrue("time.sleep(30)" in str(f.readlines())) + # verify that the trace file not exists for rank1 + self.assertFalse(os.path.exists(self._trace_name(rank=1))) + return + + pg = self._create_process_group_hccl() + device = self.local_device + with torch.npu.device(device): + a = torch.full((3, 4), float(self.rank), device=device) + + pg.allreduce(a).wait() + if self.rank == 0: + # sleep for heartbeat dump + time.sleep(30) + + 
pg.allreduce(a).wait() + + torch.npu.synchronize(device=device) + + +class HCCLTraceTestDumpOnHcclTimeout(HCCLTraceTestBase): + def setUp(self): + os.environ["HCCL_EXEC_TIMEOUT"] = "60" + os.environ["HCCL_EVENT_TIMEOUT"] = "90" + super().setUp() + + def tearDown(self): + # unset env to avoid impact watchdog timeout + os.unsetenv('HCCL_EXEC_TIMEOUT') + os.unsetenv('HCCL_EVENT_TIMEOUT') + super().tearDown() + + @check_if_test_is_skipped + def _check_return_codes(self, elapsed_time): + # the base test infra assumes processes exit with matching return codes, + # but we want rank0 to hccl exec timeout and rank1 to exit cleanly in this test + self.assertEqual(self.processes[0].exitcode, 10) + self.assertEqual(self.processes[1].exitcode, 0) + + def _wait_process(self, rank, timeout): + try: + self.processes[rank].join(timeout) + return self.processes[rank].exitcode + except TimeoutError: + return None + + @parametrize("timing_enabled", [False]) + def test_hccl_timeout_dumps(self, timing_enabled): + if self.rank != self.MAIN_PROCESS_RANK: + # dump on heartbeatmonitor thread + os.environ["TORCH_HCCL_ENABLE_MONITORING"] = "1" + # need rank0 to crash before looking for its output file + os.environ["TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC"] = "60" + + if self.rank == self.MAIN_PROCESS_RANK: + # wait for rank0 to crash before looking for its output file + self.assertEqual(self._wait_process(0, timeout=180), 10) + with open(self._trace_name(rank=0), "rb") as f: + t = pickle.load(f) + t = t["entries"] + self.assertEqual(len(t), 2) + self.assertEqual(t[0]["collective_seq_id"], 1) + self.assertEqual(t[0]["state"], "completed") + self.assertEqual(t[1]["collective_seq_id"], 2) + self.assertEqual( + t[1]["state"], self.started_or_scheduled(timing_enabled) + ) + + self.assertFalse(os.path.exists(self._trace_name(rank=1))) + + return + + pg = self._create_process_group_hccl() + if timing_enabled: + # we force disabled timing in setup, since there is no 'disable' function + pg._enable_collectives_timing() + + device = self.local_device + with torch.npu.device(device): + a = torch.full((3, 4), float(self.rank), device=device) + + pg.allreduce(a).wait() + if self.rank == 0: + pg.allreduce(a).wait() + + # rank 0 will crash before it passes the sync, but rank1 will exit quickly and cleanly + torch.npu.synchronize(device=device) + + +instantiate_parametrized_tests(HCCLTraceTestDumpOnHcclTimeout) + + if __name__ == "__main__": if torch.npu.is_available() and torch.npu.device_count() >= 2: run_tests() -- Gitee From ed36b405052ea7e111580a19cec18b67178bb37c Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Sat, 14 Jun 2025 22:17:22 +0000 Subject: [PATCH 093/328] !21919 Update torchair commit id Merge pull request !21919 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 2899be78f6..1e31eedac5 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 2899be78f642f0c6beab38bb4285ed87472a672b +Subproject commit 1e31eedac591b95a711ae529ae307a4c79ed808e -- Gitee From e6ac23bbf17e4cdd2964f8b2d2e441ad3c8532ab Mon Sep 17 00:00:00 2001 From: zhangqiongwen Date: Mon, 16 Jun 2025 02:06:33 +0000 Subject: [PATCH 094/328] !21378 add requested_bytes key Merge pull request !21378 from zhangqiongwen/v2.7.1_requested_bytes --- torch_npu/csrc/npu/Module.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/torch_npu/csrc/npu/Module.cpp 
b/torch_npu/csrc/npu/Module.cpp index b7af82c2c3..0d0c15808e 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -960,6 +960,7 @@ PyObject* THNPModule_memoryStats(PyObject *_unused, PyObject *arg) result["reserved_bytes"] = statArrayToDict(stats.reserved_bytes); result["active_bytes"] = statArrayToDict(stats.active_bytes); result["inactive_split_bytes"] = statArrayToDict(stats.inactive_split_bytes); + result["requested_bytes"] = statArrayToDict(stats.requested_bytes); result["oversize_allocations"] = statToDict(stats.oversize_allocations); result["oversize_segments"] = statToDict(stats.oversize_segments); -- Gitee From c3295188a64af56ea8bae6541fe33be936477c8f Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 16 Jun 2025 03:14:28 +0000 Subject: [PATCH 095/328] !21930 Update op_plugin commit id Merge pull request !21930 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index ef203d9c28..5a5e1a41cd 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit ef203d9c2850866b59dd4e182a767115c359c136 +Subproject commit 5a5e1a41cd45f7f91f1e8cba34a77dc718a200bb -- Gitee From 9bea1d567b9997cc5835130690b5d8dd86d14b32 Mon Sep 17 00:00:00 2001 From: zyb <12441311+zyb230@user.noreply.gitee.com> Date: Mon, 16 Jun 2025 08:06:21 +0000 Subject: [PATCH 096/328] !21896 The log level is changed from error to warning when there is no step in PTA collection Merge pull request !21896 from zyb/v2.7.1 --- .../profiler/analysis/prof_parse/_fwk_cann_relation_parser.py | 2 +- torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py b/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py index 0f029ee7a8..b5d3797c6f 100644 --- a/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py @@ -61,7 +61,7 @@ class FwkCANNRelationParser: # Get ProfilerStep#x node step_node_list = [node for node in root_node.child_node_list if node.is_profiler_step()] if not step_node_list: - self.logger.error("Get step range failed, the step node list is empty.") + self.logger.warning("Get step range failed, the step node list is empty.") return [] # Gather flow events start time in each step node diff --git a/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py b/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py index b06d7d3d72..30ffd8be8b 100644 --- a/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py @@ -77,7 +77,7 @@ class KernelViewParser(BaseParser): return step_range = FwkCANNRelationParser(self._profiler_path).get_step_range(torch_op_node[0], kernel_dict) if not step_range: - self.logger.error("Kernel view get step range failed, the step range is empty.") + self.logger.warning("Kernel view get step range failed, the step range is empty.") for step_data in step_range: step_id = step_data.get(Constant.STEP_ID) step_start = convert_ns2us_str(step_data.get(Constant.START_TS, 0)) -- Gitee From 92b121b172efe10744f6144e88c10bb61ed6b05e Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 16 Jun 2025 09:14:31 +0000 Subject: [PATCH 097/328] !21935 Update op_plugin commit id Merge pull request !21935 from pta-robot/v2.7.1 --- 
third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5a5e1a41cd..ef2a41a9e4 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5a5e1a41cd45f7f91f1e8cba34a77dc718a200bb +Subproject commit ef2a41a9e4e914d295560b12f821d2f960e07a40 -- Gitee From 73bceadbc45a25eb7bb7a7f0d4a2a6658d47e863 Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Mon, 16 Jun 2025 11:28:42 +0000 Subject: [PATCH 098/328] !21886 Fix security problems Merge pull request !21886 from yuhaiyan/v2.7.1-dev1 --- SECURITYNOTE.md | 2 +- torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp | 2 +- torch_npu/utils/_module.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/SECURITYNOTE.md b/SECURITYNOTE.md index 079a24bd3b..6856b92996 100644 --- a/SECURITYNOTE.md +++ b/SECURITYNOTE.md @@ -51,7 +51,7 @@ torch_npu内集成性能分析工具profiler: ## 数据安全声明 -1. PyTorch使用过程中需要加载和保存数据,部分接口使用风险模块pickle,可能存在数据风险,如torch.load、torch.distributed.scatter_object_list等接口,可参考[torch.load](https://pytorch.org/docs/main/generated/torch.load.html#torch.load)、[collective-functions](https://pytorch.org/docs/main/distributed.html#collective-functions)了解具体风险。 +1. PyTorch使用过程中需要加载和保存数据,部分接口使用风险模块pickle,可能存在数据风险,如torch.load、torch.jit.load、torch.distributed.scatter_object_list等接口,可参考[torch.load](https://pytorch.org/docs/main/generated/torch.load.html#torch.load)、[collective-functions](https://pytorch.org/docs/main/distributed.html#collective-functions)了解具体风险。 2. Ascend Extension for PyTorch依赖CANN的基础能力实现AOE性能调优、算子dump、日志记录等功能,用户需要关注上述功能生成文件的权限控制,加强对相关数据的保护。 ## 构建安全声明 diff --git a/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp index a288dc6477..1b85e7fce6 100644 --- a/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -633,7 +633,7 @@ void TensorPipeAgent::respond(std::shared_ptr &pipe) pipeRead(pipe, [this, pipe](const tensorpipe_npu::Error &error, c10::intrusive_ptr requestMessage, std::vector streams) mutable { if (error) { - if (shuttingDown_) { + if (shuttingDown_.load()) { // This is expected. 
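+                // shuttingDown_ is an atomic flag; the explicit load() makes the
+                // cross-thread read unambiguous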
} else { LOG(WARNING) << "RPC agent for " << workerInfo_.name_ diff --git a/torch_npu/utils/_module.py b/torch_npu/utils/_module.py index a79bc94c47..7e3269f88e 100644 --- a/torch_npu/utils/_module.py +++ b/torch_npu/utils/_module.py @@ -367,8 +367,8 @@ def _ddp_init_helper( def _mpdl_iter_init(self, *args, **kwargs): try: torch_npu.npu.synchronize() - except: - pass + except Exception as e: + print(e) origin_mpdl_iter_init(self, *args, **kwargs) -- Gitee From 0afbf5f94886bf690c625ec78d90e78a472d0311 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Mon, 16 Jun 2025 22:22:17 +0000 Subject: [PATCH 099/328] !21950 Update torchair commit id Merge pull request !21950 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 1e31eedac5..0d23897294 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 1e31eedac591b95a711ae529ae307a4c79ed808e +Subproject commit 0d23897294a40c3060e17dca0a4a2c5c5c349013 -- Gitee From 5c71ba2b13f5b26598c07b82e706764364cfadec Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Tue, 17 Jun 2025 06:12:40 +0000 Subject: [PATCH 100/328] !21934 fixnic Merge pull request !21934 from SCh-zx/nic27 --- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 67165e43a1..a540066096 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -2178,12 +2178,15 @@ void ProcessGroupHCCL::createHCCLComm( config = createHcclCommConfigWithOptions(); hcclComms[i] = HCCLComm::create_config(numRanks, rank, hcclID, &config); } + hcclComms[i]->hcclCommType = static_cast(HcclCommType::DEFAULT); break; case HcclCommType::P2P: // P2P not support set hcclCommName numRanks = 2; rank = p2pRank; getHcclCommConfig(&config, true); hcclComms[i] = HCCLComm::create_config(numRanks, rank, hcclID, &config); + hcclComms[i]->hcclCommType = static_cast(HcclCommType::P2P); + hcclComms[i]->p2pPeer = getP2pPeer(); break; default: throw std::runtime_error( -- Gitee From 118bf6dbd64472327819224db46621c787903703 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Tue, 17 Jun 2025 06:45:24 +0000 Subject: [PATCH 101/328] !21913 update StatusSaveInterval to 2 Merge pull request !21913 from huangyunlong/2.7cc1 --- .../csrc/core/npu/register/OptionsManager.cpp | 4 ++-- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 17 +++++++++++++---- torch_npu/csrc/distributed/ProcessGroupHCCL.hpp | 4 +++- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index e4eb407936..fce2f143f7 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -267,11 +267,11 @@ uint32_t OptionsManager::GetStatusSaveInterval() { const static uint32_t status_save_interval = []() -> uint32_t { char* env_val = std::getenv("TORCH_HCCL_STATUS_SAVE_INTERVAL"); - int64_t envFlag = 30; + int64_t envFlag = 2; if (env_val != nullptr) { envFlag = strtol(env_val, nullptr, 10); if (envFlag <= 0) { - envFlag = 30; + envFlag = 2; TORCH_NPU_WARN_ONCE("Get env TORCH_HCCL_STATUS_SAVE_INTERVAL less than or equal to 0, so reset it to the default value."); } } diff --git 
a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index a540066096..3a4cd022f5 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -1829,7 +1829,7 @@ void ProcessGroupHCCL::workCleanupLoop() } if (status_save_enable) { - refreshStatusInfo(work, "end"); // Update Statusinfo,but not write into the map + is_refreshed = refreshStatusInfo(work, "end"); // Update Statusinfo,but not write into the map } pgStatus_->lastCompletedSeq = static_cast(work.seq_); pgStatus_->lastCompletedWorkName = opTypeToString(work.opType_); @@ -1840,7 +1840,7 @@ void ProcessGroupHCCL::workCleanupLoop() c10_npu::NPUGraph::dec_pending_event_queries(); } else { if (status_save_enable && work.isStarted()) { - refreshStatusInfo(work, "start"); // Update Statusinfo,but not write into the map + is_refreshed = refreshStatusInfo(work, "start"); // Update Statusinfo,but not write into the map } // Increment the iterator if the current WorkHCCL object is not // completed. @@ -1853,6 +1853,10 @@ void ProcessGroupHCCL::workCleanupLoop() } } + if (status_save_enable && is_refreshed) { + updateStatusOutput(); + } + if (recordflag && recordHcclStatus(status_save_path)) { lastrecordtime = std::chrono::steady_clock::now(); } @@ -2001,8 +2005,11 @@ void ProcessGroupHCCL::recordDataVol(std::string opName, const std::string dataV outfile.close(); } -void ProcessGroupHCCL::refreshStatusInfo(ProcessGroupHCCL::WorkHCCL work, std::string status) +bool ProcessGroupHCCL::refreshStatusInfo(ProcessGroupHCCL::WorkHCCL work, std::string status) { + if (StatusInfo.seq == work.seq_ && StatusInfo.status == status) { + return false; + } StatusInfo.seq = work.seq_; StatusInfo.pgId = options_->group_id; StatusInfo.opType = opTypeToString(work.opType_); @@ -2015,19 +2022,21 @@ void ProcessGroupHCCL::refreshStatusInfo(ProcessGroupHCCL::WorkHCCL work, std::s StatusInfo.commIds = "all"; } StatusInfo.status = status; + return true; } void ProcessGroupHCCL::updateStatusOutput() { + std::unique_lock lock(StatusMapmutex_); if (!StatusInfo.pgId.empty()) { StatusOutput_[options_->group_id] = StatusInfo; } + is_refreshed = false; } bool ProcessGroupHCCL::recordHcclStatus(const std::string path, bool end, bool error) { std::unique_lock lock(StatusMapmutex_); - updateStatusOutput(); if (!options_->global_ranks_in_group.empty() && !error) { return true; } else if (!StatusOutput_.empty()) { diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index 9085787951..9c2f365b3e 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -936,7 +936,9 @@ protected: StatusStruct StatusInfo; - void refreshStatusInfo(ProcessGroupHCCL::WorkHCCL work, std::string status); + bool refreshStatusInfo(ProcessGroupHCCL::WorkHCCL work, std::string status); + + bool is_refreshed = false; static std::unordered_map StatusOutput_; -- Gitee From c430e866b4cdd78eab4af52e22bfcf14b5fb52ab Mon Sep 17 00:00:00 2001 From: zhangqiongwen Date: Tue, 17 Jun 2025 07:30:39 +0000 Subject: [PATCH 102/328] !21924 fix cpu to npu tensor's stride changed problem Merge pull request !21924 from zhangqiongwen/v2.7.1_cpu_to_npu_fix --- .../test_special_cases_copy_to_contiguous.py | 8 ++++++++ torch_npu/csrc/aten/common/ToKernelNpu.cpp | 7 ++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/test/trans_contiguous/test_special_cases_copy_to_contiguous.py 
b/test/trans_contiguous/test_special_cases_copy_to_contiguous.py index 73dd786a07..e2c2f85369 100644 --- a/test/trans_contiguous/test_special_cases_copy_to_contiguous.py +++ b/test/trans_contiguous/test_special_cases_copy_to_contiguous.py @@ -38,6 +38,14 @@ class TestSpecialCasesCopyToContiguous(TestCase): npu_out = torch.as_strided(npu_input, (1, 32, 96, 96), (746496, 0, 96, 1), 737280).clone() self.assertRtolEqual(npu_out.to("cpu").numpy(), cpu_out.numpy()) + def test_h2d_copy_discontiguous(self): + a = torch.randn(256, 320) + b = a.transpose(-1, -2) # make b NOT contiguous + self.assertFalse(b.is_contiguous()) + b = b.npu() + self.assertFalse(b.is_contiguous()) # after to npu, b is still NOT contiguous + self.assertEqual(b.stride(), (1, 320)) + if __name__ == "__main__": run_tests() diff --git a/torch_npu/csrc/aten/common/ToKernelNpu.cpp b/torch_npu/csrc/aten/common/ToKernelNpu.cpp index 02fac3c9ef..a029071079 100644 --- a/torch_npu/csrc/aten/common/ToKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/ToKernelNpu.cpp @@ -97,7 +97,12 @@ at::Tensor NPUNativeFunctions::_to_copy( "Only contiguous_format or preserve_format is supported.", OPS_ERROR(ErrCode::NOT_SUPPORT)); options = options.memory_format(optional_memory_format.value()); } else { - options = options.memory_format(c10::MemoryFormat::Contiguous); + if (torch_npu::utils::is_npu(self)) { + options = options.memory_format(c10::MemoryFormat::Contiguous); + } else { + // keep the same as cpu default memory format: Preserve + options = options.memory_format(c10::MemoryFormat::Preserve); + } } TORCH_CHECK( options.requires_grad_opt() == c10::nullopt, -- Gitee From eb210a304aa0017e6592e7983f0e73f2baaf0602 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 17 Jun 2025 11:14:29 +0000 Subject: [PATCH 103/328] !21971 Update op_plugin commit id Merge pull request !21971 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index ef2a41a9e4..5f31549358 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit ef2a41a9e4e914d295560b12f821d2f960e07a40 +Subproject commit 5f31549358a3dace6454f2a4042c46730d385119 -- Gitee From e04829d36abaebf6ca3beb229d464cf4517fd55a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Tue, 17 Jun 2025 12:41:41 +0000 Subject: [PATCH 104/328] =?UTF-8?q?!21975=20support=20IFA=20update=20with?= =?UTF-8?q?=20non-out=20api=20Merge=20pull=20request=20!21975=20from=20?= =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/npu/test_aclgraph_update.py | 29 +++++++++++++++++++++++++++++ torch_npu/npu/graphs.py | 21 ++++++++++++++++++++- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/test/npu/test_aclgraph_update.py b/test/npu/test_aclgraph_update.py index 56aad3c5c2..644579b9f1 100644 --- a/test/npu/test_aclgraph_update.py +++ b/test/npu/test_aclgraph_update.py @@ -93,6 +93,35 @@ class TestAclgraphUpdate(TestCase): self.assertEqual(output.cpu(), res_src[0].cpu()) self.assertEqual(softmax_lse.cpu(), res_src[1].cpu()) + @SupportedDevices(['Ascend910B']) + def test_ifa_update_with_non_out_and_auto_dispatch_capture(self): + torch.npu.set_device(0) + length = [29] + length_new = [100] + scale = 1 / 0.0078125 + query = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + key = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + value = 
torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + + res_src = torch_npu.npu_fused_infer_attention_score( + query, key, value, num_heads=32, input_layout="BNSD", scale=scale, pre_tokens=65535, + next_tokens=65535, softmax_lse_flag=False, actual_seq_lengths=length_new) + + g = torch.npu.NPUGraph() + output = None + softmax_lse = None + + with torch.npu.graph(g, auto_dispatch_capture=True): + output = torch.empty(1, 32, 1, 128, dtype=torch.float16, device="npu") + softmax_lse = torch.empty(1, dtype=torch.float16, device="npu") + output, softmax_lse = torch_npu.npu_fused_infer_attention_score( + query, key, value, num_heads=32, input_layout="BNSD", scale=scale, pre_tokens=65535, + next_tokens=65535, softmax_lse_flag=False, actual_seq_lengths=length) + + g.update(cpu_update_input=[{"actual_seq_lengths": length_new}]) + g.replay() + self.assertEqual(output.cpu(), res_src[0].cpu()) + self.assertEqual(softmax_lse.cpu(), res_src[1].cpu()) if __name__ == "__main__": run_tests() diff --git a/torch_npu/npu/graphs.py b/torch_npu/npu/graphs.py index 537b9c1f96..bd38f8f3e0 100644 --- a/torch_npu/npu/graphs.py +++ b/torch_npu/npu/graphs.py @@ -136,7 +136,26 @@ class _GraphDispatchMode(torch.utils._python_dispatch.TorchDispatchMode): def __torch_dispatch__(self, func, types, args=(), kwargs=None): if func.__name__ == "npu_fused_infer_attention_score": - raise RuntimeError("Only support npu_fused_infer_attention_score.out", pta_error(ErrCode.NOT_SUPPORT)) + func_out = torch_npu.npu_fused_infer_attention_score.out + self.update_schema(str(func_out.__name__), str(func_out._schema)) + stream = torch_npu.npu.current_stream() + event = torch.npu.ExternalEvent() + event.wait(stream) + event.reset(stream) + # apply tensor + workspace = torch_npu._npu_fused_infer_attention_score_get_max_workspace(*args, **kwargs) + output = torch.empty_like(args[0]) + softmax_lse = torch.empty(1, dtype=args[0].dtype, device=args[0].device) + kwargs["workspace"] = workspace + kwargs["out"] = [output, softmax_lse] + # begin graph task + graph_task_group_begin(stream) + func_out(*args, **kwargs) + handle = graph_task_group_end(stream) + # save state for update + self.graph_dispatch_records.append( + self._append_dispatch_record(event, handle, args, kwargs, func_out)) + return kwargs["out"] elif func.__name__ == "npu_fused_infer_attention_score.out": self.update_schema(str(func.__name__), str(func._schema)) stream = torch_npu.npu.current_stream() -- Gitee From cdf48e303cd9515f3196417afefe1af196ee6245 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Tue, 17 Jun 2025 12:44:33 +0000 Subject: [PATCH 105/328] !21945 refactoring hccl_event_timeout and watchdog timeout Merge pull request !21945 from huangyunlong/2.7exec --- test/distributed/test_flight_recorder.py | 4 +- .../csrc/distributed/ProcessGroupHCCL.cpp | 62 +++++++++---------- 2 files changed, 30 insertions(+), 36 deletions(-) diff --git a/test/distributed/test_flight_recorder.py b/test/distributed/test_flight_recorder.py index 5f16b7a0ac..5e0b57873c 100644 --- a/test/distributed/test_flight_recorder.py +++ b/test/distributed/test_flight_recorder.py @@ -147,7 +147,7 @@ class HCCLTraceTest(HCCLTraceTestBase): self.assertEqual(last["output_dtypes"], ["Float"]) self.assertEqual(last["collective_seq_id"], 2) # HCCL_EXEC_TIMEOUT will impact watchdog timeout - self.assertEqual(last["timeout_ms"], 3600000) + self.assertEqual(last["timeout_ms"], 3636000) now = datetime.now() event_created_time = datetime.fromtimestamp( last["time_created_ns"] / 1000000000 @@ -290,7 +290,7 @@ 
class HCCLTraceTest(HCCLTraceTestBase): self.assertEqual(last["output_sizes"], ((3, 4),)) self.assertEqual(last["output_dtypes"], ["Float"]) # timeout_ms adapt to npu - self.assertEqual(last["timeout_ms"], 3600000) + self.assertEqual(last["timeout_ms"], 3636000) self.assertEqual(last["collective_seq_id"] - first["collective_seq_id"], 9) dist.destroy_process_group() diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 3a4cd022f5..0e2d83373f 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -57,7 +57,7 @@ namespace c10d_npu { namespace { static constexpr uint32_t kOpWaitTimeoutOffset = 30U; // second static uint32_t kOpWaitTimeout = 1868U; // second -static int32_t defaultExecTimeout = 1800; +static int32_t defaultExecTimeout = 1836; constexpr const char* P2P_DEVICE_KEY = "_p2p"; using hcclUs = std::chrono::steady_clock::time_point; @@ -721,7 +721,7 @@ bool ProcessGroupHCCL::WorkHCCL::checkExec() static int32_t hccl_exec_timeout = c10_npu::option::OptionsManager::GetHCCLExecTimeout(); if (hccl_exec_timeout <= 0) { - hccl_exec_timeout = 1800; + hccl_exec_timeout = defaultExecTimeout; } int32_t timeout = std::max(60, hccl_exec_timeout - 60); auto currentTimepoint = std::chrono::steady_clock::now(); @@ -904,30 +904,23 @@ ProcessGroupHCCL::ProcessGroupHCCL( this->setGroupName(groupName); int32_t hccl_event_timeout = c10_npu::option::OptionsManager::GetHCCLEventTimeout(); int32_t hccl_exec_timeout = c10_npu::option::OptionsManager::GetHCCLExecTimeout(); + if (hccl_exec_timeout < 0) { + hccl_exec_timeout = defaultExecTimeout; + } + if (hccl_event_timeout > 0) { - if (hccl_exec_timeout < 0) { - if (hccl_event_timeout < defaultExecTimeout) { - TORCH_NPU_WARN_ONCE("The value of HCCL_EVENT_TIMEOUT:", hccl_event_timeout, " is less than the default value of HCCL_EXEC_TIMEOUT:", defaultExecTimeout, "."); - } - kOpWaitTimeout = static_cast(hccl_event_timeout); + kOpWaitTimeout = static_cast(hccl_event_timeout); + if (hccl_event_timeout <= hccl_exec_timeout) { + TORCH_NPU_WARN_ONCE("The value of HCCL_EVENT_TIMEOUT:", hccl_event_timeout, " is less than or equal to the value of HCCL_EXEC_TIMEOUT:", hccl_exec_timeout, "."); } else if (hccl_exec_timeout == 0) { - kOpWaitTimeout = 0; - TORCH_NPU_WARN_ONCE("The value of HCCL_EVENT_TIMEOUT:", hccl_event_timeout, " is less than the value of HCCL_EXEC_TIMEOUT:", hccl_exec_timeout, ", so set op wait timeout to never timeout."); - } else { - kOpWaitTimeout = static_cast(hccl_event_timeout); - if (hccl_event_timeout < hccl_exec_timeout) { - TORCH_NPU_WARN_ONCE("The value of HCCL_EVENT_TIMEOUT:", hccl_event_timeout, " is less than the value of HCCL_EXEC_TIMEOUT:", hccl_exec_timeout, "."); - } + TORCH_NPU_WARN_ONCE("The value of HCCL_EXEC_TIMEOUT was set to 0(never timeout), so it is bigger than the value of HCCL_EVENT_TIMEOUT:", hccl_event_timeout, "."); } - } - if (hccl_event_timeout == 0) { + } else if (hccl_event_timeout == 0) { kOpWaitTimeout = 0; - } - if (hccl_event_timeout < 0) { + } else { if (hccl_exec_timeout == 0) { kOpWaitTimeout = 0; - } - if (hccl_exec_timeout > 0 && static_cast(hccl_exec_timeout) > kOpWaitTimeout) { + } else { kOpWaitTimeout = static_cast(hccl_exec_timeout) + kOpWaitTimeoutOffset; if (kOpWaitTimeout <= static_cast(hccl_exec_timeout)) { kOpWaitTimeout = UINT_MAX; @@ -994,22 +987,23 @@ ProcessGroupHCCL::ProcessGroupHCCL( #ifdef ENABLE_HCCL_ERROR_CHECKING if (asyncErrorHandling_ == TearDown) { - if 
(hccl_exec_timeout > 0) { - if ((hccl_exec_timeout * 1000) > (options_->timeout).count()) { - TORCH_NPU_WARN("The HCCL execution timeout ", hccl_exec_timeout * 1000, "ms is bigger than watchdog timeout ", - (options_->timeout).count(), "ms which is set by init_process_group! The plog may not be recorded."); + if ((options_->timeout).count() != DEFAULT_TIMEOUT) { + if ((options_->timeout).count() <= hccl_exec_timeout * 1000) { + TORCH_NPU_WARN("The watchdog timeout ", (options_->timeout).count(), "ms(which is set by init_process_group) is less than or equal to HCCL execution timeout ", + hccl_exec_timeout * 1000, "ms! The plog may not be recorded."); + } else if (hccl_exec_timeout == 0) { + TORCH_NPU_WARN("The HCCL execution timeout was set to 0(never timeout), so it is bigger than watchdog timeout ", + (options_->timeout).count(), "ms which is set by init_process_group! The plog may not be recorded. You can disable watchdog by 'export HCCL_ASYNC_ERROR_HANDLING=0'."); } - } else if (hccl_exec_timeout == 0) { - TORCH_NPU_WARN("The HCCL execution timeout was set to never timeout, so it is bigger than watchdog timeout ", - (options_->timeout).count(), "ms which is set by init_process_group! The plog may not be recorded. You can disable watchdog by 'export HCCL_ASYNC_ERROR_HANDLING=0'."); } else { - if ((options_->timeout).count() == DEFAULT_TIMEOUT) { - // Only when the timeout is default, we will change it. - options_->timeout = std::chrono::milliseconds(DEFAULT_TIMEOUT * 2); - } - if ((options_->timeout).count() < DEFAULT_TIMEOUT) { - TORCH_NPU_WARN("The HCCL execution timeout 1800000ms is bigger than watchdog timeout ", - (options_->timeout).count(), "ms which is set by init_process_group! The plog may not be recorded."); + if (hccl_exec_timeout == 0) { + options_->timeout = std::chrono::milliseconds(LLONG_MAX); + } else { + long long watchdog_timeout = (static_cast(hccl_exec_timeout) + 1800) * 1000; + if (watchdog_timeout <= static_cast(hccl_exec_timeout) * 1000) { + watchdog_timeout = LLONG_MAX; + } + options_->timeout = std::chrono::milliseconds(watchdog_timeout); } } } -- Gitee From 6785d73f7bfab6c47b9870ca6229bbdfd62b5d31 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 17 Jun 2025 14:14:28 +0000 Subject: [PATCH 106/328] !21982 Update op_plugin commit id Merge pull request !21982 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5f31549358..6dccf4c20d 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5f31549358a3dace6454f2a4042c46730d385119 +Subproject commit 6dccf4c20d63b50bd4f781d2d8fa47e62a942b11 -- Gitee From 1eb6fb3d7af21aea9814ee85634c38f5b52fa330 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 17 Jun 2025 15:59:31 +0000 Subject: [PATCH 107/328] !21995 Update op_plugin commit id Merge pull request !21995 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 6dccf4c20d..91436fe64d 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 6dccf4c20d63b50bd4f781d2d8fa47e62a942b11 +Subproject commit 91436fe64de56e4637ded79c13f8597f7d9ce4bd -- Gitee From 28a5c2930b10feb7140affc1c2de8090acf6235a Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Tue, 17 Jun 2025 22:22:38 +0000 Subject: [PATCH 108/328] !21987 Update torchair commit id Merge pull request !21987 from 
torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 0d23897294..4097c6e8b1 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 0d23897294a40c3060e17dca0a4a2c5c5c349013 +Subproject commit 4097c6e8b1abceccde097a2c3a7a75cc8d0ace4c -- Gitee From 0468f75daa9cb17d9da675fc178285186c9721d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E5=85=89=E6=B5=A9?= Date: Wed, 18 Jun 2025 01:22:19 +0000 Subject: [PATCH 109/328] =?UTF-8?q?!21959=20add=20new=20rules=20for=20errc?= =?UTF-8?q?ode=20Merge=20pull=20request=20!21959=20from=20=E9=83=AD?= =?UTF-8?q?=E5=85=89=E6=B5=A9/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 3 +-- torch_npu/csrc/core/npu/NPUException.cpp | 7 ++++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index a31789b560..d3425f6f44 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -1266,8 +1266,7 @@ public: if (!block_found && C10_LIKELY(captures_underway.empty())) { ASCEND_LOGE( - "Get a block from the existing pool failed. Try to free cached blocks and reallocate. This error log " - "can be ignored."); + "Get a block from the existing pool failed. Try to free cached blocks and reallocate. This error log can be ignored."); // Free all non-split cached blocks and retry alloc. { UnlockGuard guard(lock); diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index ab139f53b4..5732b6c0b8 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -51,7 +51,7 @@ std::string formatErrorCode(SubModule submodule, ErrCode errorCode) int deviceIndex = -1; c10_npu::GetDevice(&deviceIndex); auto rank_id = c10_npu::option::OptionsManager::GetRankId(); - if (c10_npu::option::OptionsManager::ShouldPrintLessError()) { + if (!(c10_npu::option::OptionsManager::ShouldPrintLessError())) { oss << "\n[ERROR] " << getCurrentTimestamp() << " (PID:" << getpid() << ", Device:" << deviceIndex << ", RankID:" << rank_id << ") "; } oss << "ERR" << std::setw(2) << std::setfill('0') << static_cast(submodule); @@ -112,6 +112,11 @@ void clear_mem_uce_info() const std::string c10_npu_check_error_message(std::string& errmsg) { + static const std::regex errorRegex(R"(^E[1-9A-Z]9999)"); + if (std::regex_search(errmsg, errorRegex)) { + return "CANN Inner Error. Please rectify the fault based on the error information in the ascend log."; + } + std::regex dateRegex(R"(\d{4}-\d{2}-\d{2}-\d{2}:\d{2}:\d{2}\.\d{3}\.\d{3})"); std::smatch match; -- Gitee From 49123362dec6b8bd45dc55cc84681af8bc8033c9 Mon Sep 17 00:00:00 2001 From: shaoyf Date: Wed, 18 Jun 2025 06:16:33 +0000 Subject: [PATCH 110/328] !21964 Document description optimization Merge pull request !21964 from shaoyf/271_readme --- README.md | 10 +++++----- README.zh.md | 10 +++++----- test/README.md | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 5b585449da..39ee21800e 100644 --- a/README.md +++ b/README.md @@ -18,13 +18,13 @@ Install **PyTorch** through pip. 
**For Aarch64:** -```Python +```bash pip3 install torch==2.7.1 ``` **For x86:** -```Python +```bash pip3 install torch==2.7.1+cpu --index-url https://download.pytorch.org/whl/cpu ``` @@ -32,7 +32,7 @@ pip3 install torch==2.7.1+cpu --index-url https://download.pytorch.org/whl/cpu Run the following command to install dependencies. -```Python +```bash pip3 install pyyaml pip3 install setuptools ``` @@ -82,7 +82,7 @@ In some special scenarios, users may need to compile **torch-npu** by themselves Take **Python 3.9** as an example. - ``` + ```bash cd /home/pytorch bash ci/build.sh --python=3.9 ``` @@ -129,7 +129,7 @@ print(z) ## User Manual -Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for more detailed informations. +Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for more detailed information. ## PyTorch and Python Version Matching Table diff --git a/README.zh.md b/README.zh.md index e374608e88..44cd229629 100644 --- a/README.zh.md +++ b/README.zh.md @@ -18,13 +18,13 @@ **aarch64:** -```Python +```bash pip3 install torch==2.7.1 ``` **x86:** -```Python +```bash pip3 install torch==2.7.1+cpu --index-url https://download.pytorch.org/whl/cpu ``` @@ -43,14 +43,14 @@ pip3 install torch==2.7.1+cpu --index-url https://download.pytorch.org/whl/cpu 运行以下命令安装依赖。 -```Python +```bash pip3 install pyyaml pip3 install setuptools ``` 3. **安装torch_npu** -``` +```bash pip3 install torch-npu==2.7.1rc1 ``` 如需要保存安装日志,可在pip3 install命令后面加上参数 `--log `,并对您指定的目录``做好权限管控。 @@ -108,7 +108,7 @@ Pytorch框架训练环境的卸载可以参考[昇腾官方文档](https://www.h torch_npu的卸载只需执行命令: - ``` + ```bash pip3 uninstall torch_npu ``` diff --git a/test/README.md b/test/README.md index d83b498b5f..6b779fb7ec 100644 --- a/test/README.md +++ b/test/README.md @@ -71,6 +71,6 @@ python ci/access_control_test.py --distributed 可用于复现问题的用例: `python test_jit.py -v -k test_annotated_empty_dict` -2. test_public_bindings.py 用例的作用 +2. 
test_public_bindings.py 用例的功能 该用例是为了校验接口的公开规范性,如果该用例报错,请确认报错的接口是否要公开,并按照报错的提示进行修改。 -- Gitee From 21bf7385c909c06c9abd6a6c051d31aca98f3ff5 Mon Sep 17 00:00:00 2001 From: tangmengcheng Date: Wed, 18 Jun 2025 08:24:19 +0000 Subject: [PATCH 111/328] !22022 [profiler-2.7.1] profiler pipe fd chmod Merge pull request !22022 from tangmengcheng/profiler_pickle_chmod_2.7.1 --- .../analysis/prof_common_func/_task_manager.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_common_func/_task_manager.py b/torch_npu/profiler/analysis/prof_common_func/_task_manager.py index 7b884d6a15..a618e2122a 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_task_manager.py +++ b/torch_npu/profiler/analysis/prof_common_func/_task_manager.py @@ -7,6 +7,7 @@ import multiprocessing import fcntl import pickle import signal +import stat from enum import Enum from abc import ABC, abstractmethod @@ -288,9 +289,19 @@ class ConcurrentTasksManager: if self.epoll is None: self.epoll = select.epoll() pr, pipe_write = os.pipe() - # 读管道设为非阻塞 - flags = fcntl.fcntl(pr, fcntl.F_GETFL) - fcntl.fcntl(pr, fcntl.F_SETFL, flags | os.O_NONBLOCK) + + try: + # 设置读管道为非阻塞并限制权限 + flags = fcntl.fcntl(pr, fcntl.F_GETFL) + fcntl.fcntl(pr, fcntl.F_SETFL, flags | os.O_NONBLOCK) + + # 设置管道文件描述符权限(只允许当前用户访问) + os.fchmod(pr, stat.S_IRUSR | stat.S_IWUSR) + os.fchmod(pipe_write, stat.S_IRUSR | stat.S_IWUSR) + except (OSError, AttributeError): + flags = fcntl.fcntl(pr, fcntl.F_GETFL) + fcntl.fcntl(pr, fcntl.F_SETFL, flags | os.O_NONBLOCK) + task_info.pipe = (pr, pipe_write) self.epoll.register(pr, select.EPOLLIN | select.EPOLLET | select.EPOLLERR | select.EPOLLHUP) self.listening_infos[pr] = task_info -- Gitee From 82433e898b21077ee2b7118d717390acd8c0d61b Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 18 Jun 2025 09:29:33 +0000 Subject: [PATCH 112/328] !22027 Update op_plugin commit id Merge pull request !22027 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 91436fe64d..1ccd937a2f 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 91436fe64de56e4637ded79c13f8597f7d9ce4bd +Subproject commit 1ccd937a2f42e05bb99e6b7a09d4562131ec0f8f -- Gitee From 4da38f2ff0e1010222e6f1b2ae1d319e0b1471dd Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 18 Jun 2025 10:59:31 +0000 Subject: [PATCH 113/328] !22037 Update op_plugin commit id Merge pull request !22037 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 1ccd937a2f..680dea4984 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 1ccd937a2f42e05bb99e6b7a09d4562131ec0f8f +Subproject commit 680dea4984135de69dc1ee031e08942c4049fa72 -- Gitee From 37fae68034474a3e267704f2dde7ed0190655516 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E5=85=89=E6=B5=A9?= Date: Wed, 18 Jun 2025 11:42:54 +0000 Subject: [PATCH 114/328] =?UTF-8?q?!22000=20fix=20bug=20for=20make=5Fgraph?= =?UTF-8?q?ed=5Fautograd=5Ffunction=20Merge=20pull=20request=20!22000=20fr?= =?UTF-8?q?om=20=E9=83=AD=E5=85=89=E6=B5=A9/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/npu/graphs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/npu/graphs.py b/torch_npu/npu/graphs.py 
b/torch_npu/npu/graphs.py
index bd38f8f3e0..7e21ce5ed9 100644 --- a/torch_npu/npu/graphs.py +++ b/torch_npu/npu/graphs.py @@ -550,9 +550,9 @@ def make_graphed_callables( @staticmethod @torch.autograd.function.once_differentiable def backward(ctx, *grads): - if (len(grads) != len(static_grad_inputs)): + if (len(grads) != len(static_grad_outputs)): raise RuntimeError("The length of grads" - + " is not equal with the length of static_grad_inputs.") + + " is not equal with the length of static_grad_outputs.") for g, grad in zip(static_grad_outputs, grads): if g is not None: # don't copy if autograd gods have been kind and the -- Gitee From ca4fabd6c740e2fb1b4d4eedc136cb68d5fb77b0 Mon Sep 17 00:00:00 2001 From: hhz886 Date: Wed, 18 Jun 2025 12:20:04 +0000 Subject: [PATCH 115/328] =?UTF-8?q?!21780=20=E3=80=90Profiler=E3=80=91work?= =?UTF-8?q?space=20fix=20Merge=20pull=20request=20!21780=20from=20hhz886/v?= =?UTF-8?q?2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/profiler/test_npu_profiler.py | 17 +++++++++ .../csrc/core/npu/NPUWorkspaceAllocator.cpp | 36 ++++++++++--------- 2 files changed, 37 insertions(+), 16 deletions(-) diff --git a/test/profiler/test_npu_profiler.py b/test/profiler/test_npu_profiler.py index 58b3e0fa71..7921126b0d 100644 --- a/test/profiler/test_npu_profiler.py +++ b/test/profiler/test_npu_profiler.py @@ -202,6 +202,23 @@ class TestNpuProfiler(TestCase): self.assertEqual(True, self._has_view_result(self.results_path, worker_name, self.OPERATOR_MEMORY)) self.assertEqual(True, self._has_view_result(self.results_path, worker_name, self.MEMORY_RECORD)) + def test_memory_when_workspace(self): + original_value = os.environ.get("TASK_QUEUE_ENABLE") + os.environ["TASK_QUEUE_ENABLE"] = "2" + worker_name = self.worker_name + with torch_npu.profiler.profile( + profile_memory=True, + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(self.results_path, worker_name=worker_name) + ) as prof: + for _ in range(self.small_steps): + self.model_train.train_one_step() + self.assertEqual(True, self._has_view_result(self.results_path, worker_name, self.OPERATOR_MEMORY)) + self.assertEqual(True, self._has_view_result(self.results_path, worker_name, self.MEMORY_RECORD)) + if original_value is None: + del os.environ["TASK_QUEUE_ENABLE"] + else: + os.environ["TASK_QUEUE_ENABLE"] = original_value + def test_ascend_work_path(self): PathManager.remove_path_safety(self.results_work_path) os.environ["ASCEND_WORK_PATH"] = self.results_work_path diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index c34d796a78..7d5173dec8 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -156,6 +156,8 @@ public: stats.allocated_bytes.current, reinterpret_cast(stream)} ); + this->last_block = block; + this->last_stream = stream; const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { trigger->traceNpuMemoryAllocation( @@ -180,6 +182,8 @@ public: stats.allocated_bytes.current, reinterpret_cast(stream)} ); + this->last_block = block; + this->last_stream = stream; #endif return block->data_ptr; } @@ -188,22 +192,20 @@ public: { update_stat(stats.allocated_bytes, -allocated_size); #ifndef BUILD_LIBTORCH - for (const auto& block_pair : blocks) { - if (block_pair.second->data_ptr != nullptr) { - torch_npu::profiler::reportMemoryDataToNpuProfiler({ - static_cast(c10::DeviceType::PrivateUse1), - 
device, - static_cast(torch_npu::profiler::MemoryComponentType::WORKSPACE_ALLOCATOR), - static_cast(torch_npu::profiler::MemoryDataType::MEMORY_FREE), - static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_INNER), - reinterpret_cast(block_pair.second->data_ptr), - -allocated_size, - stats.allocated_bytes.current, - stats.reserved_bytes.current, - stats.allocated_bytes.current, - reinterpret_cast(block_pair.first)} - ); - } + if (this->last_block && this->last_block->data_ptr && this->last_stream) { + torch_npu::profiler::reportMemoryDataToNpuProfiler({ + static_cast(c10::DeviceType::PrivateUse1), + device, + static_cast(torch_npu::profiler::MemoryComponentType::WORKSPACE_ALLOCATOR), + static_cast(torch_npu::profiler::MemoryDataType::MEMORY_FREE), + static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_INNER), + reinterpret_cast(this->last_block->data_ptr), + -allocated_size, + stats.allocated_bytes.current, + stats.reserved_bytes.current, + stats.allocated_bytes.current, + reinterpret_cast(this->last_stream)} + ); } #endif } @@ -379,6 +381,8 @@ private: #ifndef BUILD_LIBTORCH uint64_t sum_mem = 0; int device = 0; + aclrtStream last_stream = nullptr; + WorkspaceBlock* last_block = nullptr; #endif DeviceStats stats; size_t allocated_size = 0; -- Gitee From 6e2b8c7ee0d48fefb97fcf527253d1d86a463323 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Wed, 18 Jun 2025 22:23:25 +0000 Subject: [PATCH 116/328] !22042 Update torchair commit id Merge pull request !22042 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 4097c6e8b1..fe8e22ad3a 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 4097c6e8b1abceccde097a2c3a7a75cc8d0ace4c +Subproject commit fe8e22ad3a347ab8116eb61ff44450ef1fb07f91 -- Gitee From c2117c0237c23e78104753f119a21b97ecb89247 Mon Sep 17 00:00:00 2001 From: louyujing <7927276+louyujing@user.noreply.gitee.com> Date: Thu, 19 Jun 2025 03:15:28 +0000 Subject: [PATCH 117/328] =?UTF-8?q?!21953=20=E3=80=90transfer=5Fto=5Fnpu?= =?UTF-8?q?=E3=80=91Adapt=20distributed=20new=20group=20Merge=20pull=20req?= =?UTF-8?q?uest=20!21953=20from=20louyujing/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/contrib/transfer_to_npu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch_npu/contrib/transfer_to_npu.py b/torch_npu/contrib/transfer_to_npu.py index 14451c07dc..af90cf8c79 100644 --- a/torch_npu/contrib/transfer_to_npu.py +++ b/torch_npu/contrib/transfer_to_npu.py @@ -352,6 +352,7 @@ def _init(): if hasattr(torch.distributed, 'init_device_mesh'): _del_nccl_device_backend_map() torch.distributed.device_mesh.init_device_mesh = _wrapper_cuda(torch.distributed.device_mesh.init_device_mesh) + torch.distributed.new_group = _wrapper_hccl(torch.distributed.new_group) # CUDAGraph torch.cuda.CUDAGraph = torch.npu.NPUGraph -- Gitee From 254997813c8e3f8f8aab7eb5912d79dc921abf7d Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 19 Jun 2025 05:14:30 +0000 Subject: [PATCH 118/328] !22061 Update op_plugin commit id Merge pull request !22061 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 680dea4984..bb14f896d5 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 
680dea4984135de69dc1ee031e08942c4049fa72 +Subproject commit bb14f896d574364e1b45d93451470c16151ffd3a -- Gitee From 4e2bc9bfaaf9f379db491821ce1a0373d1d3d94c Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 19 Jun 2025 11:14:28 +0000 Subject: [PATCH 119/328] !22072 Update op_plugin commit id Merge pull request !22072 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index bb14f896d5..f6bae8ac94 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit bb14f896d574364e1b45d93451470c16151ffd3a +Subproject commit f6bae8ac945f3697d9e05bc7ee0c2e7fdff2169b -- Gitee From 9dcb7d4baecd8bb775329b84314e5abb30f08006 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 19 Jun 2025 13:29:31 +0000 Subject: [PATCH 120/328] !22076 Update op_plugin commit id Merge pull request !22076 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index f6bae8ac94..64886f4420 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit f6bae8ac945f3697d9e05bc7ee0c2e7fdff2169b +Subproject commit 64886f4420e39aff5013e79a78ea4dad9f8ba06a -- Gitee From 120619c6ac9cd2e413365121386e7260c5e9e539 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Thu, 19 Jun 2025 15:23:32 +0000 Subject: [PATCH 121/328] =?UTF-8?q?!22089=20bugfix:=20DestroyUsedStreams?= =?UTF-8?q?=20use=20stream.stream(false)=20Merge=20pull=20request=20!22089?= =?UTF-8?q?=20from=20=E7=8E=8B=E8=B6=85/v2.7.0=5Fcheckfix3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUFunctions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index cca42ed288..266eab3fa7 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -137,7 +137,7 @@ aclError DestroyUsedStreams() for (const auto it : used_devices) { NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(it.first)); NPUStream stream = getCurrentNPUStream(it.first); - aclError acl_ret = acl::AclrtDestroyStreamForce(stream); + aclError acl_ret = acl::AclrtDestroyStreamForce(stream.stream(false)); if (acl_ret != ACL_ERROR_NONE) { return acl_ret; } -- Gitee From a54050ab0325ac97110b62fbc3a68230ef98b508 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 19 Jun 2025 16:14:30 +0000 Subject: [PATCH 122/328] !22095 Update op_plugin commit id Merge pull request !22095 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 64886f4420..ae326dd28d 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 64886f4420e39aff5013e79a78ea4dad9f8ba06a +Subproject commit ae326dd28d26e7ad5f00b975b06ec47334e7800e -- Gitee From 1c198a843d70c5c94e6dde0f7b6ae2eba1835517 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 20 Jun 2025 03:14:34 +0000 Subject: [PATCH 123/328] !22102 Update op_plugin commit id Merge pull request !22102 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index ae326dd28d..70283d68a3 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ 
-Subproject commit ae326dd28d26e7ad5f00b975b06ec47334e7800e +Subproject commit 70283d68a342c975f976f9022100cf5b8424c188 -- Gitee From e0cccf2e0685a2e3cbb04612d431ad8452d41bb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Fri, 20 Jun 2025 07:25:30 +0000 Subject: [PATCH 124/328] =?UTF-8?q?!22111=20checksum=20fix=20Merge=20pull?= =?UTF-8?q?=20request=20!22111=20from=20=E7=8E=8B=E8=B6=85/v2.7.0=5Fchecks?= =?UTF-8?q?umfix3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/asd/asd.py | 4 ++-- torch_npu/asd/checksum.py | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/torch_npu/asd/asd.py b/torch_npu/asd/asd.py index 41e84feded..c7a8fc8388 100644 --- a/torch_npu/asd/asd.py +++ b/torch_npu/asd/asd.py @@ -611,7 +611,7 @@ class _MatmulSilentCheck: self.store.set(f"rank_{self.rank}_info_log", current_log + "\n" + info_str if current_log != "" else info_str) def _generate_warning_log(self, counting_abnormal_pos, new_abnormal): - warning_str = f"[Warning][{new_abnormal['time_str']}] [Rank {new_abnormal['rank']}]: Training instability happens, feature detection detects abnormal results!" + warning_str = f"[Warning][{new_abnormal['time_str']}] [Rank {new_abnormal['rank']}]: feature detection detects abnormal results!" index = 0 for pos in reversed(counting_abnormal_pos): warning_str = warning_str + "\n" + f"Grad-norm spike: index {index}, time {self.history_abnormal_list[pos]['time_str']}, param name {self.history_abnormal_list[pos]['name']}, abnormal value {self.history_abnormal_list[pos]['val']}, previous value {self.history_abnormal_list[pos]['pre_val']}, " @@ -673,7 +673,7 @@ class _MatmulSilentCheck: if global_state: now_time = time.time() if last_checksum_time is None or abs(now_time - last_checksum_time) > self.checksum_cooldown * 60: - loggerSilent.info(f'[Info] Rank {self.rank}: Training instability happened, checksum is on.') + loggerSilent.info(f'[Info] Rank {self.rank}: feature detection detects abnormal results, checksum is on.') last_checksum_time = now_time if self.checksum_result is None: self.checksum_result = torch.tensor(False, dtype=torch.bool, device='npu') diff --git a/torch_npu/asd/checksum.py b/torch_npu/asd/checksum.py index cc6832f398..a9576675cb 100644 --- a/torch_npu/asd/checksum.py +++ b/torch_npu/asd/checksum.py @@ -41,5 +41,10 @@ def _matmul_checksum(a, b, c): error_total = (c_ele_round_error_accum).to(torch.float) error = torch.abs(c_sum - c1_trans) - flag = (error - error_total) > 1e-20 - return torch.any(flag) + flag = (error - 5 * error_total) > 5 * 1e-20 + any_flag = torch.any(flag) + if any_flag: + matmul(a, b, out=c) + c_mean2 = torch.mean(torch.abs(c), dim=-1) + return torch.any(c_mean != c_mean2) + return any_flag -- Gitee From 99f68091c9b7e84a479e56478de59176a052970d Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Fri, 20 Jun 2025 08:25:25 +0000 Subject: [PATCH 125/328] !22085 Update torchair commit id Merge pull request !22085 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index fe8e22ad3a..88ece0d3fe 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit fe8e22ad3a347ab8116eb61ff44450ef1fb07f91 +Subproject commit 88ece0d3feb5c70024bd5e5adb06cee120d05a10 -- Gitee From ca88bd5db32eb11b09e29a82c9a88f7626bd3df6 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 20 
Jun 2025 08:59:31 +0000 Subject: [PATCH 126/328] !22130 Update op_plugin commit id Merge pull request !22130 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 70283d68a3..010a4e1ebf 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 70283d68a342c975f976f9022100cf5b8424c188 +Subproject commit 010a4e1ebf077ac34356eb3780daae6e727d8ded -- Gitee From 397ea4f26e2eb12833f2f42c887a4081b0e65f1b Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Fri, 20 Jun 2025 10:25:01 +0000 Subject: [PATCH 127/328] !22118 update numRanks_ Merge pull request !22118 from huangyunlong/2.7cc --- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 0e2d83373f..4584c2d97a 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -932,6 +932,9 @@ ProcessGroupHCCL::ProcessGroupHCCL( const char* blockingWait = getenv(HCCL_BLOCKING_WAIT); logPrefix_ = createLogPrefix(); + if (options_->global_ranks_in_group.empty()) { + numRanks_ = size_; + } dumpOnException_ = c10d::getCvarBool(TORCH_HCCL_DUMP_ON_TIMEOUT, false); heartbeat_ = 1ULL; monitorThreadEnabled_.store(c10d::getCvarBool(TORCH_HCCL_ENABLE_MONITORING, false)); -- Gitee From e1478d05ad5e4a2a6721482c543fdf9546b07742 Mon Sep 17 00:00:00 2001 From: wangjie Date: Fri, 20 Jun 2025 10:40:39 +0000 Subject: [PATCH 128/328] !22051 [PROF] Python Tracer trimPrefix Fix Merge pull request !22051 from wangjie/python_tracer_fix_271 --- torch_npu/csrc/profiler/profiler_python.cpp | 31 ++++++++++--------- .../analysis/prof_parse/_fwk_file_parser.py | 24 +++++++++----- 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/torch_npu/csrc/profiler/profiler_python.cpp b/torch_npu/csrc/profiler/profiler_python.cpp index 45ccf8f1b2..571fb57bbc 100644 --- a/torch_npu/csrc/profiler/profiler_python.cpp +++ b/torch_npu/csrc/profiler/profiler_python.cpp @@ -36,19 +36,6 @@ using TensorMetadata = torch_npu::toolkit::profiler::TensorMetadata; using ModuleParam = torch_npu::toolkit::profiler::ModuleParam; using OptimizerParam = torch_npu::toolkit::profiler::OptimizerParam; -std::string trimPrefix(std::string s) -{ - static std::vector prefixes = py::module::import("torch.profiler.python_tracer") - .attr("_prefix_regex")().cast>(); - for (const auto& p : prefixes) { - if (s.compare(0, p.size(), p) == 0) { - s.erase(0, p.size()); - return s; - } - } - return s; -} - std::vector getInterpreterThreads(PyInterpreterState* interpreter) { pybind11::gil_scoped_acquire gil; @@ -240,6 +227,7 @@ private: void reportTraceData(); void reportHashData(); void reportParamData(); + std::string trimPrefix(std::string s); private: std::atomic active_{false}; @@ -248,6 +236,7 @@ private: std::deque thread_local_results_; PyObject* module_call_code_{nullptr}; PyObject* optimizer_call_code_{nullptr}; + std::vector func_name_prefixes_; std::unordered_map py_call_cache_; std::unordered_map pyc_call_cache_; std::unordered_map module_info_cache_; @@ -277,6 +266,9 @@ PythonTracer::PythonTracer() : active_(false) .attr("_optimizer_step_code") .attr("__code__") .ptr(); + func_name_prefixes_ = py::module::import("torch.profiler.python_tracer") + .attr("_prefix_regex")() + .cast>(); } void PythonTracer::start(size_t max_threads) @@ -383,6 +375,17 @@ void 
PythonTracer::clear() interpreter_ = nullptr; } +std::string PythonTracer::trimPrefix(std::string s) +{ + for (const auto& p : func_name_prefixes_) { + if (s.compare(0, p.size(), p) == 0) { + s.erase(0, p.size()); + return s; + } + } + return s; +} + void PythonTracer::reportTraceData() { if (events_.size() > 0) { @@ -402,7 +405,7 @@ void PythonTracer::reportHashData() hash_data.resize(py_call_cache_.size() + pyc_call_cache_.size() + module_info_cache_.size() + 1); size_t idx = 0; for (auto& item : py_call_cache_) { - hash_data[idx++] = std::make_pair(item.first, trimPrefix(item.second.get_name())); + hash_data[idx++] = std::make_pair(item.first, trimPrefix(std::move(item.second.get_name()))); } for (auto& item : pyc_call_cache_) { hash_data[idx++] = std::make_pair(item.first, std::string(item.second.str())); diff --git a/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py b/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py index aa00324c97..b8216a6995 100644 --- a/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py @@ -152,11 +152,14 @@ class FwkFileParser: def get_fwk_trace_data(self): torch_op_data = self.get_file_data_by_tag(FileTag.TORCH_OP) - if not torch_op_data: - self.logger.error("Get fwk trace data failed, the torch op data is empty.") - return [] enqueue_data_list, dequeue_data_list = self.get_task_queue_data() - pid = torch_op_data[0].pid + if torch_op_data: + pid = torch_op_data[0].pid + elif enqueue_data_list or dequeue_data_list: + pid = enqueue_data_list[0].pid if enqueue_data_list else dequeue_data_list[0].pid + else: + self.logger.error("Get fwk trace data failed, framework data is empty.") + return [] tid_dict = {} fwk_x_event_list = [None] * ( len(torch_op_data) + len(enqueue_data_list) * 2 + len(dequeue_data_list) * 2) @@ -247,9 +250,15 @@ class FwkFileParser: def get_fwk_api(self) -> dict: torch_op_data = self.get_file_data_by_tag(FileTag.TORCH_OP) - if not torch_op_data: + enqueue_data_list, dequeue_data_list = self.get_task_queue_data() + if torch_op_data: + pid = torch_op_data[0].pid + elif enqueue_data_list or dequeue_data_list: + pid = enqueue_data_list[0].pid if enqueue_data_list else dequeue_data_list[0].pid + else: + self.logger.error("Get fwk api data failed, framework data is empty.") return {} - pid = torch_op_data[0].pid + torch_op_apis = [] fwd_bwd_dict = {} torch_op_idx = 0 @@ -272,13 +281,13 @@ class FwkFileParser: connection_ids = [] task_enqueues = [] task_dequeues = [] - enqueue_data_list, dequeue_data_list = self.get_task_queue_data() correlation_id_name_dict = {} for dequeue_data in dequeue_data_list: task_dequeues.append( [dequeue_data.ts, dequeue_data.ts + dequeue_data.dur, contact_2num(pid, dequeue_data.tid), dequeue_data.corr_id, dequeue_data.name]) correlation_id_name_dict[dequeue_data.corr_id] = dequeue_data.origin_name + torch_tids.add(dequeue_data.tid) for enqueue_data in enqueue_data_list: name = enqueue_data.name if enqueue_data.corr_id in correlation_id_name_dict: @@ -288,6 +297,7 @@ class FwkFileParser: [enqueue_data.ts, enqueue_data.ts + enqueue_data.dur, contact_2num(pid, enqueue_data.tid), enqueue_data.corr_id, name]) connection_ids.append(enqueue_data.corr_id) + torch_tids.add(enqueue_data.tid) start_connection_id = max(connection_ids) + 1 if connection_ids else 0 self.update_fwd_bwd_connection_id(fwd_bwd_dict, torch_op_apis, start_connection_id) -- Gitee From b8364c05740ee8ddac522ff979d4ea250c8d3ddb Mon Sep 17 00:00:00 2001 From: wangzixuan 
<617225691@qq.com> Date: Fri, 20 Jun 2025 11:17:51 +0000 Subject: [PATCH 129/328] =?UTF-8?q?!22054=20=E3=80=90profiler=E3=80=91step?= =?UTF-8?q?=20when=20stop=20unexpected,=20can't=20get=20step=20id=20Merge?= =?UTF-8?q?=20pull=20request=20!22054=20from=20wangzixuan/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../profiler/analysis/prof_parse/_fwk_cann_relation_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py b/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py index b5d3797c6f..ba29da446e 100644 --- a/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py @@ -74,7 +74,7 @@ class FwkCANNRelationParser: step_id = step_node.event.name.split("#")[-1] if not step_node.corr_id_total: self.logger.error("There is no flow events in %s range.", step_node.event.name) - return [] + continue corr_id_list = sorted(step_node.corr_id_total) min_index, max_index = 0, len(corr_id_list) - 1 min_kernel_list, max_kernel_list = [], [] -- Gitee From c2c97e2ee1a40aef6856a71301e65ab265be30db Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 20 Jun 2025 11:23:01 +0000 Subject: [PATCH 130/328] !22153 Update op_plugin commit id Merge pull request !22153 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 010a4e1ebf..e43e654372 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 010a4e1ebf077ac34356eb3780daae6e727d8ded +Subproject commit e43e654372f2da5df637fa4450ab90d1a09f48b7 -- Gitee From 32fb96d5023eb2d4ab43d4e419b8b03abf4f02f9 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 20 Jun 2025 16:23:01 +0000 Subject: [PATCH 131/328] !22168 Update op_plugin commit id Merge pull request !22168 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index e43e654372..0bb4032d89 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit e43e654372f2da5df637fa4450ab90d1a09f48b7 +Subproject commit 0bb4032d8912b48bf3e160c164f5a54dfc79badd -- Gitee From 18ff7f37924814a5286a1c5c9de1ab210c60c1cc Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Fri, 20 Jun 2025 22:23:19 +0000 Subject: [PATCH 132/328] !22165 Update torchair commit id Merge pull request !22165 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 88ece0d3fe..4bba3a81a5 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 88ece0d3feb5c70024bd5e5adb06cee120d05a10 +Subproject commit 4bba3a81a532f4cf2574c03be24e3ba3ad52b2b8 -- Gitee From 0496c6bc4d5c1db6fc9c5f53996c21f089a3426e Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Sat, 21 Jun 2025 01:20:49 +0000 Subject: [PATCH 133/328] !22133 Added permission verification and prevents memory overflow Merge pull request !22133 from yuhaiyan/v2.7.1-dev3 --- torch_npu/csrc/distributed/ParallelTcpServer.cpp | 7 +++++++ torch_npu/csrc/distributed/StoreMessagePacker.cpp | 7 +++++++ torch_npu/csrc/framework/utils/NpuUtils.cpp | 10 ++++++++++ torch_npu/csrc/framework/utils/NpuUtils.h | 1 + 4 
files changed, 25 insertions(+) diff --git a/torch_npu/csrc/distributed/ParallelTcpServer.cpp b/torch_npu/csrc/distributed/ParallelTcpServer.cpp index 38899ea5a8..72e7ebf9a0 100644 --- a/torch_npu/csrc/distributed/ParallelTcpServer.cpp +++ b/torch_npu/csrc/distributed/ParallelTcpServer.cpp @@ -16,11 +16,13 @@ #include #include #include +#include #include #include #include #include #include "c10/util/Logging.h" +#include "torch_npu/csrc/framework/utils/NpuUtils.h" #include "ParallelTcpServer.hpp" namespace c10d { @@ -315,6 +317,11 @@ int ParallelTcpServer::CreateLocalSocket(const std::string &localSocketPath) noe return -1; } + if (!at_npu::native::NpuUtils::setFilePermissions(sockFd, S_IRUSR | S_IWUSR | S_IRGRP)) { + close(sockFd); + return -1; + } + ret = listen(sockFd, MAX_EVENT_COUNT); if (ret != 0) { LOG(ERROR) << "listen local socket fd failed " << errno << " : " << strerror(errno); diff --git a/torch_npu/csrc/distributed/StoreMessagePacker.cpp b/torch_npu/csrc/distributed/StoreMessagePacker.cpp index 0ff08c8d95..9335de1b68 100644 --- a/torch_npu/csrc/distributed/StoreMessagePacker.cpp +++ b/torch_npu/csrc/distributed/StoreMessagePacker.cpp @@ -86,6 +86,7 @@ int64_t StoreMessagePacker::Unpack(const std::vector &buffer, StoreMess } auto ptr = buffer.data(); + auto ptr_end = ptr + buffer.size(); auto totalSize = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); @@ -103,6 +104,9 @@ int64_t StoreMessagePacker::Unpack(const std::vector &buffer, StoreMess ptr += sizeof(uint64_t); message.keys.emplace_back(reinterpret_cast(ptr), keySize); ptr += keySize; + if (ptr > ptr_end) { + break; + } } auto valueCount = *reinterpret_cast(ptr); @@ -113,6 +117,9 @@ int64_t StoreMessagePacker::Unpack(const std::vector &buffer, StoreMess ptr += sizeof(uint64_t); message.values.emplace_back(ptr, ptr + valueSize); ptr += valueSize; + if (ptr > ptr_end) { + break; + } } return static_cast(totalSize); diff --git a/torch_npu/csrc/framework/utils/NpuUtils.cpp b/torch_npu/csrc/framework/utils/NpuUtils.cpp index f805b489dc..a26426ab72 100644 --- a/torch_npu/csrc/framework/utils/NpuUtils.cpp +++ b/torch_npu/csrc/framework/utils/NpuUtils.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "torch_npu/csrc/aten/CustomFunctions.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" @@ -267,6 +268,15 @@ void NpuUtils::check_1d(const at::Tensor &t, const char *arg, const char *fn) OPS_ERROR(ErrCode::PARAM)); } +bool NpuUtils::setFilePermissions(int fd, mode_t mode) +{ + if (fchmod(fd, mode) == -1) { + ASCEND_LOGI("Failed to set permissions."); + return false; + } + return true; +} + #ifndef BUILD_LIBTORCH void NpuUtils::ProfReportMarkDataToNpuProfiler(uint32_t category, const std::string &data, uint64_t correlation_id) diff --git a/torch_npu/csrc/framework/utils/NpuUtils.h b/torch_npu/csrc/framework/utils/NpuUtils.h index 0a2d63c267..2f58120539 100644 --- a/torch_npu/csrc/framework/utils/NpuUtils.h +++ b/torch_npu/csrc/framework/utils/NpuUtils.h @@ -46,6 +46,7 @@ public: static bool check_5d_5d_match(const at::Tensor &tensor); static bool IsOomError(aclError ret, int index); static void check_1d(const at::Tensor &t, const char *arg, const char *fn); + static bool setFilePermissions(int fd, mode_t mode); #ifndef BUILD_LIBTORCH static void ProfReportMarkDataToNpuProfiler(uint32_t category, const std::string &data, uint64_t correlation_id = 0); -- Gitee From ba563b818a92e4163b90b5f72f55d0b28bfe7660 Mon Sep 17 00:00:00 2001 From: hhz886 Date: Sat, 21 Jun 2025 03:56:53 +0000 Subject: [PATCH 134/328] 
=?UTF-8?q?!22032=20=E3=80=90Profiler=E3=80=91gc?= =?UTF-8?q?=5Fdetect=5Fthreshold=20fix=20Merge=20pull=20request=20!22032?= =?UTF-8?q?=20from=20hhz886/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../_dynamic_profiler/_dynamic_profiler_config_context.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py index ab8a5abfe6..5da94ae763 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py @@ -198,8 +198,8 @@ class ConfigContext: op_attr = json_data.get('PROFILE_OP_ATTR', 'false') op_attr = self.BOOL_MAP.get(op_attr.lower(), False) gc_detect_threshold = json_data.get('PROFILE_GC_DETECT_THRESHOLD', None) - if isinstance(gc_detect_threshold, str) and gc_detect_threshold != "None": - gc_detect_threshold = float(gc_detect_threshold) + if isinstance(gc_detect_threshold, str): + gc_detect_threshold = None if gc_detect_threshold == "None" else float(gc_detect_threshold) data_simplification = json_data.get('PROFILE_DATA_SIMPLIFICATION', 'true') data_simplification = self.BOOL_MAP.get(data_simplification.lower(), True) record_op_args = False -- Gitee From 712c7d302474aa284003f6ee3e3f194e43eea21d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8F=B2=E4=BD=B3=E9=A6=A8?= <1481513921@qq.com> Date: Sat, 21 Jun 2025 09:24:13 +0000 Subject: [PATCH 135/328] =?UTF-8?q?!22191=20add=20=5Fatb=5Fmeta=5Fregistra?= =?UTF-8?q?tions=20to=20private=5Fallowlist=20Merge=20pull=20request=20!22?= =?UTF-8?q?191=20from=20=E5=8F=B2=E4=BD=B3=E9=A6=A8/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/npu/test_public_bindings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/npu/test_public_bindings.py b/test/npu/test_public_bindings.py index 770a60561e..1049fc8d87 100644 --- a/test/npu/test_public_bindings.py +++ b/test/npu/test_public_bindings.py @@ -549,7 +549,7 @@ class TestPublicBindings(TestCase): "torch_npu.utils.collect_hccl_info", "torch_npu.op_plugin.meta._meta_registrations", "torch_npu.dynamo.torchair._ge_concrete_graph.ge_converter.custom.npu_dequant_bias", - + "torch_npu.op_plugin.atb._atb_meta_registrations", } # No new entries should be added to this list. 
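# A minimal illustrative sketch of the kind of allowlist check a public-bindings test performs
# (this is not the actual test implementation; the helper name, default package, and allowlist
# parameter below are assumptions for illustration only): walk the package tree and flag
# underscore-prefixed modules that are importable but not explicitly allowlisted.
import importlib
import pkgutil

def find_unlisted_private_modules(package_name="torch_npu", allowlist=frozenset()):
    package = importlib.import_module(package_name)
    unlisted = []
    for mod_info in pkgutil.walk_packages(package.__path__, prefix=package_name + "."):
        name = mod_info.name
        # Any underscore-prefixed path component marks the module as private.
        if any(part.startswith("_") for part in name.split(".")) and name not in allowlist:
            unlisted.append(name)
    return unlisted

# Example: find_unlisted_private_modules(allowlist={"torch_npu.op_plugin.atb._atb_meta_registrations"})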
-- Gitee From 91b74f2d55f4d535e551fdc5a5b1aef94e65160e Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 21 Jun 2025 10:24:20 +0000 Subject: [PATCH 136/328] !22174 Update op_plugin commit id Merge pull request !22174 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 0bb4032d89..7837f88264 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 0bb4032d8912b48bf3e160c164f5a54dfc79badd +Subproject commit 7837f882644b30a0ff32ca5aeb1db8ad86baf69c -- Gitee From 9beec88e2b1d850944d24205757ad2fe8f258b8d Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 21 Jun 2025 14:09:21 +0000 Subject: [PATCH 137/328] !22196 Update op_plugin commit id Merge pull request !22196 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 7837f88264..f866fec523 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 7837f882644b30a0ff32ca5aeb1db8ad86baf69c +Subproject commit f866fec523946f14d6256e3fa064d02bee776cbb -- Gitee From f2e873f7e3f97a7902dae5cb6db2e655f3b831f4 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 23 Jun 2025 09:05:09 +0000 Subject: [PATCH 138/328] !22212 Update op_plugin commit id Merge pull request !22212 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index f866fec523..a1e170d2f4 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit f866fec523946f14d6256e3fa064d02bee776cbb +Subproject commit a1e170d2f4a22206da35898bb56fd823770d678d -- Gitee From 078fd686acd492a8a04c8867b9908644613385c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Mon, 23 Jun 2025 13:04:41 +0000 Subject: [PATCH 139/328] =?UTF-8?q?!22226=20SilentCheckv3:=20Use=20"bp=20t?= =?UTF-8?q?ime"=20instead=20of=20"step".=20Merge=20pull=20request=20!22226?= =?UTF-8?q?=20from=20=E7=8E=8B=E8=B6=85/v2.7.0=5Fcheckfix5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/asd/asd.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torch_npu/asd/asd.py b/torch_npu/asd/asd.py index c7a8fc8388..460e044cdd 100644 --- a/torch_npu/asd/asd.py +++ b/torch_npu/asd/asd.py @@ -481,7 +481,7 @@ class _MatmulSilentCheck: val = self.statistic_cpu_value[self.head_index].item() name = self.name_list[self.head_index] while val != -1 and name != "": - loggerSilent.debug(f"[silent data] name:{name}, val: {val}, pre_val: {self.check_stat[name]['pre_val']}, avg: {self.check_stat[name]['avg']}, step: {self.check_stat[name]['step']}, none_zero_step: {self.check_stat[name]['none_zero_step']}") + loggerSilent.debug(f"[silent data] name:{name}, val: {val}, pre_val: {self.check_stat[name]['pre_val']}, avg: {self.check_stat[name]['avg']}, bp time: {self.check_stat[name]['step']}, none_zero_step: {self.check_stat[name]['none_zero_step']}") result, self.check_stat[name]['avg'], self.check_stat[name]['none_zero_step'] = self._silent_check( val, self.check_stat[name]['pre_val'], self.check_stat[name]['avg'], self.check_stat[name]['none_zero_step'], self.upper_thresh1, self.upper_thresh2 @@ -604,7 +604,7 @@ class _MatmulSilentCheck: def _generate_event_log(self, new_abnormal): info_str = f"[Event][{new_abnormal['time_str']}] 
[Rank {new_abnormal['rank']}]: A grad-norm spike may happen, " info_str = info_str + f"param name {new_abnormal['name']}, abnormal value {new_abnormal['val']}, previous value {new_abnormal['pre_val']}, " - info_str = info_str + f"history avg {new_abnormal['avg']}, step {new_abnormal['step']}, normal count {new_abnormal['none_zero_step']}." + info_str = info_str + f"history avg {new_abnormal['avg']}, bp time {new_abnormal['step']}, normal count {new_abnormal['none_zero_step']}." loggerSilent.info(info_str) if self.store is not None and self.rank is not None and self.rank != 0: current_log = self.store.get(f"rank_{self.rank}_info_log").decode() @@ -615,10 +615,10 @@ class _MatmulSilentCheck: index = 0 for pos in reversed(counting_abnormal_pos): warning_str = warning_str + "\n" + f"Grad-norm spike: index {index}, time {self.history_abnormal_list[pos]['time_str']}, param name {self.history_abnormal_list[pos]['name']}, abnormal value {self.history_abnormal_list[pos]['val']}, previous value {self.history_abnormal_list[pos]['pre_val']}, " - warning_str = warning_str + f"history avg {self.history_abnormal_list[pos]['avg']}, step {self.history_abnormal_list[pos]['step']}, normal count {self.history_abnormal_list[pos]['none_zero_step']}." + warning_str = warning_str + f"history avg {self.history_abnormal_list[pos]['avg']}, bp time {self.history_abnormal_list[pos]['step']}, normal count {self.history_abnormal_list[pos]['none_zero_step']}." index += 1 warning_str = warning_str + "\n" + f"Grad-norm spike: index {index}, time {new_abnormal['time_str']}, param name {new_abnormal['name']}, abnormal value {new_abnormal['val']}, previous value {new_abnormal['pre_val']}, " - warning_str = warning_str + f"history avg {new_abnormal['avg']}, step {new_abnormal['step']}, normal count {new_abnormal['none_zero_step']}." + warning_str = warning_str + f"history avg {new_abnormal['avg']}, bp time {new_abnormal['step']}, normal count {new_abnormal['none_zero_step']}." 
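# The aggregated warning built above lists each previously recorded grad-norm spike and then the
# newly detected one, so a single log entry carries the full history that triggered the alert.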
loggerSilent.warning(warning_str) if self.store is not None and self.rank is not None and self.rank != 0: current_log = self.store.get(f"rank_{self.rank}_warn_log").decode() -- Gitee From 81e3b2f210034393e17874c0144612911075c46e Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Mon, 23 Jun 2025 13:12:29 +0000 Subject: [PATCH 140/328] !22015 support NSLB-DP Merge pull request !22015 from SCh-zx/nslb27 --- third_party/hccl/inc/hccl/hccl.h | 2 + third_party/hccl/inc/hccl/hccl_types.h | 4 +- .../csrc/distributed/ProcessGroupHCCL.cpp | 67 +++++++++++++++++++ .../csrc/distributed/ProcessGroupHCCL.hpp | 8 ++- 4 files changed, 79 insertions(+), 2 deletions(-) diff --git a/third_party/hccl/inc/hccl/hccl.h b/third_party/hccl/inc/hccl/hccl.h index 023914a348..216ef7a838 100644 --- a/third_party/hccl/inc/hccl/hccl.h +++ b/third_party/hccl/inc/hccl/hccl.h @@ -212,6 +212,8 @@ inline void HcclCommConfigInit(HcclCommConfig *config) config->hcclRdmaTrafficClass = HCCL_COMM_TRAFFIC_CLASS_CONFIG_NOT_SET; config->hcclRdmaServiceLevel = HCCL_COMM_SERVICE_LEVEL_CONFIG_NOT_SET; config->hcclOpExpansionMode = HCCL_COMM_DEFAULT_OP_EXPANSION_MODE; + config->hcclWorldRankID = 0; + config->hcclJobID = 0; } /** diff --git a/third_party/hccl/inc/hccl/hccl_types.h b/third_party/hccl/inc/hccl/hccl_types.h index 40631676c1..9a02c61c04 100644 --- a/third_party/hccl/inc/hccl/hccl_types.h +++ b/third_party/hccl/inc/hccl/hccl_types.h @@ -15,7 +15,7 @@ extern "C" { const uint32_t HCCL_COMM_CONFIG_INFO_BYTES = 24; const uint32_t HCCL_COMM_CONFIG_MAGIC_WORD = 0xf0f0f0f0; -const uint32_t HCCL_COMM_CONFIG_VERSION = 5; +const uint32_t HCCL_COMM_CONFIG_VERSION = 6; const uint32_t HCCL_COMM_DEFAULT_BUFFSIZE = 200; // 200MB buffer size const uint32_t HCCL_COMM_DEFAULT_DETERMINISTIC = 0; // Disable deterministic calculations const uint32_t COMM_NAME_MAX_LENGTH = 128; @@ -132,6 +132,8 @@ typedef struct HcclCommConfigDef { uint32_t hcclOpExpansionMode; uint32_t hcclRdmaTrafficClass; uint32_t hcclRdmaServiceLevel; + uint32_t hcclWorldRankID; + uint64_t hcclJobID; } HcclCommConfig; typedef enum { diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 4584c2d97a..edc39c96ec 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -19,8 +19,12 @@ #include #include #include +#include +#include #include +#include + #include "op_plugin/OpInterface.h" #include "third_party/acl/inc/acl/acl.h" #include "third_party/acl/inc/acl/acl_base.h" @@ -63,6 +67,7 @@ constexpr const char* P2P_DEVICE_KEY = "_p2p"; using hcclUs = std::chrono::steady_clock::time_point; constexpr int32_t MAX_GROUP_NAME_LEN = 128; +constexpr int32_t NSLB_JOBID_OFFSET = 32; // HCCL ReduceOp mapping std::map hcclOp = { @@ -950,6 +955,24 @@ ProcessGroupHCCL::ProcessGroupHCCL( c10d::PrefixStore *prefixStore = dynamic_cast(store_.get()); globalStore_ = prefixStore ? 
prefixStore->getUnderlyingNonPrefixStore() : store_; + c10::intrusive_ptr getTcpStore = store_; + while (getTcpStore) { + c10d::PrefixStore *asPrefixStore = dynamic_cast(getTcpStore.get()); + c10d::TCPStore *tcpStore = dynamic_cast(getTcpStore.get()); + if (tcpStore) { + if (!(tcpStore->getHost().empty())) { + tcp_master_addr_ = tcpStore->getHost(); + tcp_master_port_ = tcpStore->getPort(); + break; + } + } + if (asPrefixStore) { + getTcpStore = asPrefixStore->getUnderlyingStore(); + } else { + break; + } + } + try { if (blockingWait != nullptr) { auto val = std::stoi(blockingWait); @@ -2150,6 +2173,30 @@ std::vector>& ProcessGroupHCCL::getHCCLComm( return createHCCLComm(devicesKey, devices, commType, commConfig, p2pRank); } +void ProcessGroupHCCL::setNSLBCommConfig(HcclCommConfig** commConfig) +{ + const char* envPtr = std::getenv("RANK"); + if (envPtr == nullptr) { + ASCEND_LOGI("Failed to get env info for NSLB-DP."); + return; + } + uint32_t worldRankID = std::stoi(std::string(envPtr)); + options_->hccl_config["hccl_world_rank_id"] = worldRankID; + uint32_t masterPort = tcp_master_port_; + struct sockaddr_in sa; + std::string master_addr = tcp_master_addr_; + inet_pton(AF_INET, std::string(master_addr).c_str(), &(sa.sin_addr)); + uint32_t masterIp = ntohl(sa.sin_addr.s_addr); + uint64_t jobID = masterPort; + jobID = (jobID << NSLB_JOBID_OFFSET); + jobID += masterIp; + options_->hccl_config["hccl_job_id"] = jobID; + if ((*commConfig) != nullptr) { + (*commConfig)->hcclWorldRankID = worldRankID; + (*commConfig)->hcclJobID = jobID; + } +} + void ProcessGroupHCCL::createHCCLComm( const std::string& devicesKey, const std::vector& devices, @@ -2174,6 +2221,10 @@ void ProcessGroupHCCL::createHCCLComm( HcclCommConfig config; + if (options_->global_ranks_in_group.empty()) { + setNSLBCommConfig(&commConfig); + } + npuGuard.set_index(devices[i].index()); switch (commType) { case HcclCommType::DEFAULT: @@ -3096,6 +3147,22 @@ HcclCommConfig ProcessGroupHCCL::createHcclCommConfigWithOptions() } } + if (options_->hccl_config.find("hccl_world_rank_id") != options_->hccl_config.end()) { + if (std::holds_alternative(options_->hccl_config["hccl_world_rank_id"])) { + config.hcclOpExpansionMode = std::get(options_->hccl_config["hccl_world_rank_id"]); + } else { + TORCH_CHECK(false, "Value type of hccl_world_rank_id should be int.", DIST_ERROR(ErrCode::TYPE)); + } + } + + if (options_->hccl_config.find("hccl_job_id") != options_->hccl_config.end()) { + if (std::holds_alternative(options_->hccl_config["hccl_job_id"])) { + config.hcclOpExpansionMode = std::get(options_->hccl_config["hccl_job_id"]); + } else { + TORCH_CHECK(false, "Value type of hccl_job_id should be int.", DIST_ERROR(ErrCode::TYPE)); + } + } + return config; } diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index 9c2f365b3e..fa477ae334 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -384,7 +384,7 @@ public: return c10::make_intrusive(_is_high_priority_stream); } - std::unordered_map> hccl_config; + std::unordered_map> hccl_config; std::chrono::milliseconds opTimeout; // Schedule HCCL operations on high priority CUDA streams @@ -571,6 +571,8 @@ public: void resumeHcclComm(int device_id); + void setNSLBCommConfig(HcclCommConfig** commConfig); + bool setCommWorkingDevNic( const HcclComm& comm, int nranks, @@ -960,6 +962,10 @@ protected: std::string pg_desc_; + std::string tcp_master_addr_; + + uint32_t 
tcp_master_port_; + private: // Helper that encapsulates work shared across all collective communication // primitives. -- Gitee From 3f7d6f77d04e3509cc27ace81103b1c622889ea4 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Mon, 23 Jun 2025 22:42:47 +0000 Subject: [PATCH 141/328] !22248 Update torchair commit id Merge pull request !22248 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 4bba3a81a5..c5a9442d9c 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 4bba3a81a532f4cf2574c03be24e3ba3ad52b2b8 +Subproject commit c5a9442d9c0db6da50c28681ec7e1c8512cd1d95 -- Gitee From 0ccb738ab8dece434b181af4cc41f0cf6f922b08 Mon Sep 17 00:00:00 2001 From: hhz886 Date: Tue, 24 Jun 2025 02:27:26 +0000 Subject: [PATCH 142/328] =?UTF-8?q?!22148=20=E3=80=90Profiler=E3=80=91work?= =?UTF-8?q?space=20support=20db=20Merge=20pull=20request=20!22148=20from?= =?UTF-8?q?=20hhz886/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../csrc/toolkit/profiler/inc/data_reporter.h | 4 ++-- .../analysis/prof_view/_memory_view_parser.py | 3 ++- .../prof_view/prof_db_parse/_memory_db_parser.py | 15 +++++++++++++-- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h b/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h index 764f8e1668..e9aaaf9521 100644 --- a/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h +++ b/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h @@ -344,8 +344,8 @@ struct MemoryData : BaseReportData { uint64_t thread_id{ 0 }; uint64_t process_id{ 0 }; MemoryData(int64_t ptr, int64_t time_ns, int64_t alloc_size, int64_t total_allocated, int64_t total_reserved, - int64_t total_active, int64_t stream_ptr, int8_t device_type, int8_t device_index, uint8_t data_type, - uint8_t component_type, uint8_t allocator_type, uint64_t thread_id, uint64_t process_id) + int64_t total_active, int64_t stream_ptr, int8_t device_type, int8_t device_index, uint8_t component_type, + uint8_t data_type, uint8_t allocator_type, uint64_t thread_id, uint64_t process_id) : BaseReportData(0, "torch.memory_usage"), ptr(ptr), time_ns(time_ns), diff --git a/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py b/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py index 04ef7c0e90..a82c3dc3c8 100644 --- a/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py @@ -109,7 +109,8 @@ class MemoryViewParser(BaseParser): if ge_record.time_ns >= pta_record.time_ns: self.size_record_list.extend(self._combine_record(last_ge_record, pta_record)) pta_ptr += 1 - last_pta_record = pta_record + if hasattr(pta_record, 'component_type') and pta_record.component_type != Constant.WORKSPACE_TYPE: + last_pta_record = pta_record else: self.size_record_list.extend(self._combine_record(last_pta_record, ge_record)) ge_ptr += 1 diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py index 64de6315f2..34a5fc27f8 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py @@ -65,6 +65,8 @@ class MemoryDbParser(BaseParser): @staticmethod def 
_combine_record(last_record, cur_record): + if cur_record[MemoryRecordTableRow.COMPONENT.value] == Str2IdManager().get_id_from_str(Constant.WORKSPACE): + return [cur_record] pta_ge_record_list = cur_record[:] pta_ge_record_list[MemoryRecordTableRow.COMPONENT.value] = Str2IdManager().get_id_from_str(Constant.PTA_GE) if last_record: @@ -179,9 +181,16 @@ class MemoryDbParser(BaseParser): if not self._pta_memory_bean_list: return for memory_bean in self._pta_memory_bean_list: + if memory_bean.component_type == Constant.WORKSPACE_TYPE: + self._pta_record_list.append([Str2IdManager().get_id_from_str(Constant.WORKSPACE), memory_bean.time_ns, + memory_bean.total_allocated_for_db, memory_bean.total_reserved_for_db, + memory_bean.total_active_for_db, memory_bean.stream_ptr, + memory_bean.device_index]) + continue self._pta_record_list.append([Str2IdManager().get_id_from_str(Constant.PTA), memory_bean.time_ns, memory_bean.total_allocated_for_db, memory_bean.total_reserved_for_db, - memory_bean.total_active_for_db, memory_bean.stream_ptr, memory_bean.device_index]) + memory_bean.total_active_for_db, memory_bean.stream_ptr, + memory_bean.device_index]) def get_pta_ge_record_list(self): """ @@ -203,7 +212,9 @@ class MemoryDbParser(BaseParser): if ge_record[1] >= pta_record[1]: self._record_list.extend(self._combine_record(last_ge_record, pta_record)) pta_ptr += 1 - last_pta_record = pta_record + if pta_record[MemoryRecordTableRow.COMPONENT.value] != \ + Str2IdManager().get_id_from_str(Constant.WORKSPACE): + last_pta_record = pta_record else: self._record_list.extend(self._combine_record(last_pta_record, ge_record)) ge_ptr += 1 -- Gitee From d9a23a09d01d7d9597396c2a5c94a25e81ecab6d Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Tue, 24 Jun 2025 11:12:34 +0000 Subject: [PATCH 143/328] !22239 Remove code that doesn't have much effect Merge pull request !22239 from yuhaiyan/v2.7.1-dev1 --- torch_npu/csrc/distributed/StoreMessagePacker.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/torch_npu/csrc/distributed/StoreMessagePacker.cpp b/torch_npu/csrc/distributed/StoreMessagePacker.cpp index 9335de1b68..61f0388e66 100644 --- a/torch_npu/csrc/distributed/StoreMessagePacker.cpp +++ b/torch_npu/csrc/distributed/StoreMessagePacker.cpp @@ -98,7 +98,6 @@ int64_t StoreMessagePacker::Unpack(const std::vector &buffer, StoreMess auto keyCount = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); - message.keys.reserve(keyCount); for (auto i = 0UL; i < keyCount; i++) { auto keySize = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); @@ -111,7 +110,6 @@ int64_t StoreMessagePacker::Unpack(const std::vector &buffer, StoreMess auto valueCount = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); - message.values.reserve(valueCount); for (auto i = 0UL; i < valueCount; i++) { auto valueSize = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); -- Gitee From dcf39b65ba76794315bd330d4c19e5e85b5e98a3 Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Tue, 24 Jun 2025 11:43:28 +0000 Subject: [PATCH 144/328] !22259 Remove the word warning and fix the spelling of replace Merge pull request !22259 from yuhaiyan/v2.7.1-dev3 --- torch_npu/csrc/aten/common/ToKernelNpu.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/aten/common/ToKernelNpu.cpp b/torch_npu/csrc/aten/common/ToKernelNpu.cpp index a029071079..6a6b3ffa9f 100644 --- a/torch_npu/csrc/aten/common/ToKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/ToKernelNpu.cpp @@ -162,8 +162,8 @@ at::Tensor NPUNativeFunctions::to( } if (dtype == 
at::ScalarType::Double) { TORCH_NPU_WARN_ONCE( - "Warning: Device do not support double dtype now, " - "dtype cast repalce with float."); + "Device do not support double dtype now, " + "dtype cast replace with float."); } dtype = (dtype == at::ScalarType::Double) ? at::ScalarType::Float : dtype; return custom_ops::npu_dtype_cast(self, dtype); -- Gitee From 868e55bc42c0004f1e18f32ce88ac840f65f6474 Mon Sep 17 00:00:00 2001 From: hhz886 Date: Tue, 24 Jun 2025 12:15:45 +0000 Subject: [PATCH 145/328] =?UTF-8?q?!22265=20=E3=80=90Profiler=E3=80=91mult?= =?UTF-8?q?iprocess=20log=20fix=20Merge=20pull=20request=20!22265=20from?= =?UTF-8?q?=20hhz886/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../analysis/prof_common_func/_log.py | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_common_func/_log.py b/torch_npu/profiler/analysis/prof_common_func/_log.py index 15ba7a80f9..eba5db1af7 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_log.py +++ b/torch_npu/profiler/analysis/prof_common_func/_log.py @@ -34,6 +34,7 @@ class ProfilerLogger: BACKUP_COUNT = 3 # logger instance _instance = None + _pid = None @classmethod def get_instance(cls) -> logging.Logger: @@ -54,7 +55,9 @@ class ProfilerLogger: RuntimeError: If logger initialization fails """ if cls._instance is not None: - return + if cls._pid == os.getpid(): + return + cls.destroy() # Create logs directory log_dir = os.path.join(output_dir, cls.DEFAULT_LOG_DIR) @@ -89,6 +92,7 @@ class ProfilerLogger: logger.addHandler(file_handler) cls._instance = logger + cls._pid = os.getpid() logger.info("Profiler logger initialized at: %s", log_file) @classmethod @@ -106,9 +110,21 @@ class ProfilerLogger: @classmethod def destroy(cls) -> None: - """Close and cleanup the logger.""" + """ + Close and cleanup the logger. + To avoid the deadlock problem caused by directly calling close on handler in multi-process scenarios, close the + file descriptor manually. 
+ """ if cls._instance: for handler in cls._instance.handlers[:]: - handler.close() cls._instance.removeHandler(handler) + if cls._pid == os.getpid(): + handler.close() + else: + try: + if hasattr(handler.stream, 'fileno'): + fileno = handler.stream.fileno() + os.close(fileno) + except (OSError, AttributeError, ValueError): + logging.warning("Close profiler logger handler stream failed.") cls._instance = None -- Gitee From a0e339a1cf4c6ba7bfd1902c88178ae4cbc5aecb Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 24 Jun 2025 14:20:04 +0000 Subject: [PATCH 146/328] !22275 Update op_plugin commit id Merge pull request !22275 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index a1e170d2f4..423f8e137e 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit a1e170d2f4a22206da35898bb56fd823770d678d +Subproject commit 423f8e137e7859624655c203b9452fc47a3a6a89 -- Gitee From e17d955ed9a1d6f6bedea38701db6735fc67317e Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 24 Jun 2025 14:20:05 +0000 Subject: [PATCH 147/328] !22275 Update op_plugin commit id Merge pull request !22275 from pta-robot/v2.7.1 -- Gitee From 2ba0065f2562006ee0054af0cce4dbd072e195dd Mon Sep 17 00:00:00 2001 From: shaoyf Date: Tue, 24 Jun 2025 15:13:29 +0000 Subject: [PATCH 148/328] =?UTF-8?q?!22269=20=E5=9B=9E=E9=80=80=20'Pull=20R?= =?UTF-8?q?equest=20!22015=20:=20support=20NSLB-DP'=20Merge=20pull=20reque?= =?UTF-8?q?st=20!22269=20from=20shaoyf/revert-merge-22015-v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- third_party/hccl/inc/hccl/hccl.h | 2 - third_party/hccl/inc/hccl/hccl_types.h | 4 +- .../csrc/distributed/ProcessGroupHCCL.cpp | 67 ------------------- .../csrc/distributed/ProcessGroupHCCL.hpp | 8 +-- 4 files changed, 2 insertions(+), 79 deletions(-) diff --git a/third_party/hccl/inc/hccl/hccl.h b/third_party/hccl/inc/hccl/hccl.h index 216ef7a838..023914a348 100644 --- a/third_party/hccl/inc/hccl/hccl.h +++ b/third_party/hccl/inc/hccl/hccl.h @@ -212,8 +212,6 @@ inline void HcclCommConfigInit(HcclCommConfig *config) config->hcclRdmaTrafficClass = HCCL_COMM_TRAFFIC_CLASS_CONFIG_NOT_SET; config->hcclRdmaServiceLevel = HCCL_COMM_SERVICE_LEVEL_CONFIG_NOT_SET; config->hcclOpExpansionMode = HCCL_COMM_DEFAULT_OP_EXPANSION_MODE; - config->hcclWorldRankID = 0; - config->hcclJobID = 0; } /** diff --git a/third_party/hccl/inc/hccl/hccl_types.h b/third_party/hccl/inc/hccl/hccl_types.h index 9a02c61c04..40631676c1 100644 --- a/third_party/hccl/inc/hccl/hccl_types.h +++ b/third_party/hccl/inc/hccl/hccl_types.h @@ -15,7 +15,7 @@ extern "C" { const uint32_t HCCL_COMM_CONFIG_INFO_BYTES = 24; const uint32_t HCCL_COMM_CONFIG_MAGIC_WORD = 0xf0f0f0f0; -const uint32_t HCCL_COMM_CONFIG_VERSION = 6; +const uint32_t HCCL_COMM_CONFIG_VERSION = 5; const uint32_t HCCL_COMM_DEFAULT_BUFFSIZE = 200; // 200MB buffer size const uint32_t HCCL_COMM_DEFAULT_DETERMINISTIC = 0; // Disable deterministic calculations const uint32_t COMM_NAME_MAX_LENGTH = 128; @@ -132,8 +132,6 @@ typedef struct HcclCommConfigDef { uint32_t hcclOpExpansionMode; uint32_t hcclRdmaTrafficClass; uint32_t hcclRdmaServiceLevel; - uint32_t hcclWorldRankID; - uint64_t hcclJobID; } HcclCommConfig; typedef enum { diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index edc39c96ec..4584c2d97a 100644 --- 
a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -19,12 +19,8 @@ #include #include #include -#include -#include #include -#include - #include "op_plugin/OpInterface.h" #include "third_party/acl/inc/acl/acl.h" #include "third_party/acl/inc/acl/acl_base.h" @@ -67,7 +63,6 @@ constexpr const char* P2P_DEVICE_KEY = "_p2p"; using hcclUs = std::chrono::steady_clock::time_point; constexpr int32_t MAX_GROUP_NAME_LEN = 128; -constexpr int32_t NSLB_JOBID_OFFSET = 32; // HCCL ReduceOp mapping std::map hcclOp = { @@ -955,24 +950,6 @@ ProcessGroupHCCL::ProcessGroupHCCL( c10d::PrefixStore *prefixStore = dynamic_cast(store_.get()); globalStore_ = prefixStore ? prefixStore->getUnderlyingNonPrefixStore() : store_; - c10::intrusive_ptr getTcpStore = store_; - while (getTcpStore) { - c10d::PrefixStore *asPrefixStore = dynamic_cast(getTcpStore.get()); - c10d::TCPStore *tcpStore = dynamic_cast(getTcpStore.get()); - if (tcpStore) { - if (!(tcpStore->getHost().empty())) { - tcp_master_addr_ = tcpStore->getHost(); - tcp_master_port_ = tcpStore->getPort(); - break; - } - } - if (asPrefixStore) { - getTcpStore = asPrefixStore->getUnderlyingStore(); - } else { - break; - } - } - try { if (blockingWait != nullptr) { auto val = std::stoi(blockingWait); @@ -2173,30 +2150,6 @@ std::vector>& ProcessGroupHCCL::getHCCLComm( return createHCCLComm(devicesKey, devices, commType, commConfig, p2pRank); } -void ProcessGroupHCCL::setNSLBCommConfig(HcclCommConfig** commConfig) -{ - const char* envPtr = std::getenv("RANK"); - if (envPtr == nullptr) { - ASCEND_LOGI("Failed to get env info for NSLB-DP."); - return; - } - uint32_t worldRankID = std::stoi(std::string(envPtr)); - options_->hccl_config["hccl_world_rank_id"] = worldRankID; - uint32_t masterPort = tcp_master_port_; - struct sockaddr_in sa; - std::string master_addr = tcp_master_addr_; - inet_pton(AF_INET, std::string(master_addr).c_str(), &(sa.sin_addr)); - uint32_t masterIp = ntohl(sa.sin_addr.s_addr); - uint64_t jobID = masterPort; - jobID = (jobID << NSLB_JOBID_OFFSET); - jobID += masterIp; - options_->hccl_config["hccl_job_id"] = jobID; - if ((*commConfig) != nullptr) { - (*commConfig)->hcclWorldRankID = worldRankID; - (*commConfig)->hcclJobID = jobID; - } -} - void ProcessGroupHCCL::createHCCLComm( const std::string& devicesKey, const std::vector& devices, @@ -2221,10 +2174,6 @@ void ProcessGroupHCCL::createHCCLComm( HcclCommConfig config; - if (options_->global_ranks_in_group.empty()) { - setNSLBCommConfig(&commConfig); - } - npuGuard.set_index(devices[i].index()); switch (commType) { case HcclCommType::DEFAULT: @@ -3147,22 +3096,6 @@ HcclCommConfig ProcessGroupHCCL::createHcclCommConfigWithOptions() } } - if (options_->hccl_config.find("hccl_world_rank_id") != options_->hccl_config.end()) { - if (std::holds_alternative(options_->hccl_config["hccl_world_rank_id"])) { - config.hcclOpExpansionMode = std::get(options_->hccl_config["hccl_world_rank_id"]); - } else { - TORCH_CHECK(false, "Value type of hccl_world_rank_id should be int.", DIST_ERROR(ErrCode::TYPE)); - } - } - - if (options_->hccl_config.find("hccl_job_id") != options_->hccl_config.end()) { - if (std::holds_alternative(options_->hccl_config["hccl_job_id"])) { - config.hcclOpExpansionMode = std::get(options_->hccl_config["hccl_job_id"]); - } else { - TORCH_CHECK(false, "Value type of hccl_job_id should be int.", DIST_ERROR(ErrCode::TYPE)); - } - } - return config; } diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp 
b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index fa477ae334..9c2f365b3e 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -384,7 +384,7 @@ public: return c10::make_intrusive(_is_high_priority_stream); } - std::unordered_map> hccl_config; + std::unordered_map> hccl_config; std::chrono::milliseconds opTimeout; // Schedule HCCL operations on high priority CUDA streams @@ -571,8 +571,6 @@ public: void resumeHcclComm(int device_id); - void setNSLBCommConfig(HcclCommConfig** commConfig); - bool setCommWorkingDevNic( const HcclComm& comm, int nranks, @@ -962,10 +960,6 @@ protected: std::string pg_desc_; - std::string tcp_master_addr_; - - uint32_t tcp_master_port_; - private: // Helper that encapsulates work shared across all collective communication // primitives. -- Gitee From 47adb52c606529a3d73fafd9f81457285dbc3335 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Wed, 25 Jun 2025 02:02:33 +0000 Subject: [PATCH 149/328] =?UTF-8?q?!22283=20SilentCheck:=20fix=20cannot=20?= =?UTF-8?q?pickle=20=5Fthread.lock=20Merge=20pull=20request=20!22283=20fro?= =?UTF-8?q?m=20=E7=8E=8B=E8=B6=85/v2.7.0=5Fcheckfix6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/asd/asd.py | 68 +++++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/torch_npu/asd/asd.py b/torch_npu/asd/asd.py index 460e044cdd..4f1229a5b7 100644 --- a/torch_npu/asd/asd.py +++ b/torch_npu/asd/asd.py @@ -1,5 +1,5 @@ import os -from functools import wraps +from functools import wraps, partial import logging import time import warnings @@ -314,17 +314,11 @@ class _MatmulSilentCheck: self.checksum_result = None self.checksum_state = None self.checksum_state_thread_running = False - self.checksum_state_thread = threading.Thread( - target=self._tcp_comm_checksum_state, - daemon=True - ) + self.checksum_state_thread = None # Use another thread to receive the statistic value and detect SDC self.check_thread_running = False - self.check_thread = threading.Thread( - target=self._async_detect, - daemon=True - ) - self.lock = threading.Lock() + self.check_thread = None + self._lock = None self.queue_len = 1024 self.statistic_cpu_value = None self.name_list = ["" for _ in range(self.queue_len)] @@ -409,7 +403,13 @@ class _MatmulSilentCheck: def get_grad_sample_interval(self): return self.filter_interval - + + @property + def lock(self): + if self._lock is None: + self._lock = threading.Lock() + return self._lock + def init_stream(self): if self.statistic_cpu_value is None: self.statistic_value = torch.tensor(0., device=f"npu:{torch_npu.npu.current_device()}") @@ -431,7 +431,8 @@ class _MatmulSilentCheck: def register_module_hook(self, module, name): self.check_stat[name + "_backward"] = {'avg': 0, 'pre_val': 0, 'step': 0, 'none_zero_step': 0} - self.hook_dict[name + "_backward"] = module.register_full_backward_hook(lambda module, grad_input, grad_output, n=name + "_backward": self.module_hook(module, grad_input, grad_output, n)) + hook = partial(self.module_hook, name=name + "_backward") + self.hook_dict[name + "_backward"] = module.register_full_backward_hook(hook) self.registered_modules.append(name) def module_hook(self, module, grad_input, grad_output, name): @@ -472,6 +473,8 @@ class _MatmulSilentCheck: if hasattr(torch, "npu") and torch.npu.is_initialized() and torch.distributed.is_initialized(): break time.sleep(10) + if not self.check_thread_running: + 
return local_rank = os.getenv("LOCAL_RANK", "-1") if local_rank.isdigit(): torch.npu.set_device(int(local_rank)) @@ -636,6 +639,8 @@ class _MatmulSilentCheck: if hasattr(torch, "npu") and torch.npu.is_initialized() and torch.distributed.is_initialized() and self.store is not None: break time.sleep(10) + if not self.checksum_state_thread_running: + return local_rank = os.getenv("LOCAL_RANK", "-1") self.rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() @@ -697,14 +702,44 @@ class _MatmulSilentCheck: time.sleep(10) + def __getstate__(self): + self._cleanup() + state = self.__dict__.copy() + state['_lock'] = None + state['store'] = None + return state + + def __setstate(self, state): + self.__dict__.update(state) + self.store = None + + def _startup(self): + if not self.check_thread_running: + self.check_thread_running = True + self.check_thread = threading.Thread( + target=self._async_detect, + daemon=True + ) + self.check_thread.start() + + if not self.checksum_state_thread_running: + self.checksum_state_thread_running = True + self.checksum_state_thread = threading.Thread( + target=self._tcp_comm_checksum_state, + daemon=True + ) + self.checksum_state_thread.start() + def _cleanup(self): if self.check_thread_running: self.check_thread_running = False self.check_thread.join() + self.check_thread = None if self.checksum_state_thread_running: self.checksum_state_thread_running = False self.checksum_state_thread.join() + self.checksum_state_thread = None matmul_check = _MatmulSilentCheck() @@ -747,14 +782,7 @@ def _matmul_silent_check_decorator(func): matmul_check.init_module_info(id(self), self.training) self.matmul_check_outer = True - if not matmul_check.check_thread_running: - matmul_check.check_thread_running = True - matmul_check.check_thread.start() - - # 2 for checksum - if not matmul_check.checksum_state_thread_running: - matmul_check.checksum_state_thread_running = True - matmul_check.checksum_state_thread.start() + matmul_check._startup() if matmul_check.with_checksum and not matmul_check.matmul_trigger: torch_npu.asd.checksum.matmul = original_matmul torch.matmul = _trigger_matmul_decorator(original_matmul) -- Gitee From c6a35b89819502faa03d937bb4536ce5aa764407 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=85=E9=A3=9E=E8=A6=81?= <1332490378@qq.com> Date: Wed, 25 Jun 2025 08:43:15 +0000 Subject: [PATCH 150/328] =?UTF-8?q?!22234=20[PROF]fix=20dynamic=20prof=20s?= =?UTF-8?q?tep=20id;=20dynamic=20prof=20add=20step=20data=20Merge=20pull?= =?UTF-8?q?=20request=20!22234=20from=20=E6=A2=85=E9=A3=9E=E8=A6=81/2.7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/profiler/dynamic_profile.py | 7 +++++++ torch_npu/profiler/profiler.py | 7 ++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/torch_npu/profiler/dynamic_profile.py b/torch_npu/profiler/dynamic_profile.py index 99bfd76f72..313fe7d388 100644 --- a/torch_npu/profiler/dynamic_profile.py +++ b/torch_npu/profiler/dynamic_profile.py @@ -3,6 +3,7 @@ import json import atexit import time +from ..npu import mstx, current_stream from .profiler import tensorboard_trace_handler, profile from .scheduler import Schedule as schedule @@ -38,6 +39,7 @@ class _DynamicProfile: self._step_record_time = None self._step_time = 0 self._min_poll_interval = 1 + self._step_mstx_range_id = 0 def init(self): if self.repeat_init: @@ -78,6 +80,9 @@ class _DynamicProfile: self._step_time = max(self._min_poll_interval, int(time.time() - 
self._step_record_time)) self._dynamic_monitor.modify_step_time(self._step_time) if self.prof: + if self._step_mstx_range_id: + mstx.range_end(self._step_mstx_range_id) + self._step_mstx_range_id = mstx.range_start(f"step {self.cur_step}", current_stream()) self.prof.step() self.step_num -= 1 if 0 == self.step_num: @@ -138,7 +143,9 @@ class _DynamicProfile: with_modules=self.cfg_ctx.with_modules, experimental_config=self.cfg_ctx.experimental_config ) + self.prof._set_step_num_offset_for_dynamic_prof(self.cur_step) self.prof.start() + self._step_mstx_range_id = mstx.range_start(f"step {self.cur_step}", current_stream()) for key, value in self.cfg_ctx.meta_data().items(): self.prof.add_metadata_json(str(key), json.dumps(value)) DynamicProfilerUtils.out_log("Start Dynamic Profiler at {} step.".format( diff --git a/torch_npu/profiler/profiler.py b/torch_npu/profiler/profiler.py index 409013114a..65fbf5b038 100644 --- a/torch_npu/profiler/profiler.py +++ b/torch_npu/profiler/profiler.py @@ -229,6 +229,7 @@ class profile(_KinetoProfile): self.on_trace_ready = on_trace_ready self.step_num = 0 self.current_action = self.schedule(self.step_num) + self._step_num_offset = 0 self.step_rec_fn: Optional[prof.record_function] = None if use_cuda is not None: print_warn_msg("This is npu environment, use_cuda is invalid") @@ -249,6 +250,10 @@ class profile(_KinetoProfile): if self.stopped == False: self.stop() + @no_exception_func() + def _set_step_num_offset_for_dynamic_prof(self, step: int): + self._step_num_offset = step + @no_exception_func() def start(self): self.stopped = False @@ -256,7 +261,7 @@ class profile(_KinetoProfile): ProfPathCreator().init(export_only_mode=True) self.action_controller.transit_action(ProfilerAction.NONE, self.current_action) if self.record_steps: - self.step_rec_fn = prof.record_function("ProfilerStep#" + str(self.step_num)) + self.step_rec_fn = prof.record_function("ProfilerStep#" + str(self.step_num + self._step_num_offset)) self.step_rec_fn.__enter__() @no_exception_func() -- Gitee From 9e8ddf9d6dbf0f7ba1b0b734ff9dc9f6a0338c02 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Wed, 25 Jun 2025 09:00:20 +0000 Subject: [PATCH 151/328] !22221 JSON serialization for hccl status dump Merge pull request !22221 from huangyunlong/2.7js1 --- .../csrc/distributed/ProcessGroupHCCL.cpp | 40 +++--- torch_npu/csrc/distributed/TraceUtils.h | 120 +++++++++--------- 2 files changed, 82 insertions(+), 78 deletions(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 4584c2d97a..04227f193a 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -2058,30 +2058,34 @@ bool ProcessGroupHCCL::recordHcclStatus(const std::string path, bool end, bool e } fileName << "torch_hccl_status-" << std::to_string(global_rank) << "_" << master_addr << "_" << std::to_string(deviceId_) << "_"; fileName << std::to_string(numRanks_) << "_" << std::to_string(pid) << "_" << std::to_string(duration) << ".log"; - std::string isMaster = "false"; + bool isMaster = false; if (global_rank == 0) { - isMaster = "true"; + isMaster = true; } std::string out_file_path = c10::str(path, "/", fileName.str()); checkAndMakePath(path.c_str(), "Open shared directory failed. 
Please check whether input path is valid."); createFile(out_file_path.c_str()); - outfile.open(out_file_path.c_str(), std::ios::trunc); - outfile << "{\"last_comm_op\":["; - bool first_op = true; + using json = nlohmann::json; + json result; + std::list last_comm_ops; for (auto info = StatusOutput_.begin(); info != StatusOutput_.end(); info++) { - if (first_op) { - outfile << "{"; - } else { - outfile << ", {"; - } - outfile << "\"seq\":" << info->second.seq << ", \"op_type\":\"" << info->second.opType; - outfile << "\", \"pg_id\":\"" << info->second.pgId << "\", \"comm_ids\":\"" << info->second.commIds; - outfile << "\", \"status\":\""<< info->second.status << "\"}"; - first_op = false; - } - outfile << "], \"is_master\":" << isMaster; - outfile << ", \"exception_message\":\"" << exceptionMessage_; - outfile << "\", \"global_pg_end_time\":" << end_duration << "}" << std::endl; + json comm_op; + comm_op["seq"] = info->second.seq; + comm_op["op_type"] = info->second.opType; + comm_op["pg_id"] = info->second.pgId; + comm_op["comm_ids"] = info->second.commIds; + comm_op["status"] = info->second.status; + last_comm_ops.emplace_back(comm_op); + } + if (!last_comm_ops.empty()) { + result["last_comm_op"] = last_comm_ops; + } + result["is_master"] = isMaster; + result["exception_message"] = exceptionMessage_; + result["global_pg_end_time"] = end_duration; + std::string result_str = result.dump(); + outfile.open(out_file_path.c_str(), std::ios::trunc); + outfile << result_str << std::endl; outfile.close(); return true; } diff --git a/torch_npu/csrc/distributed/TraceUtils.h b/torch_npu/csrc/distributed/TraceUtils.h index 9d4f9d9d52..f6140a3d06 100644 --- a/torch_npu/csrc/distributed/TraceUtils.h +++ b/torch_npu/csrc/distributed/TraceUtils.h @@ -711,71 +711,71 @@ DEFINE_CONSTANT(started_state, "started") if (includeCollectives) { std::list entries; for (auto& e : dump_entries()) { - json j; - if (onlyActive && e.time_discovered_completed_.has_value()) { - continue; - } - j[record_id_key_str] = int64_t(e.id_); - j[pg_id_key_str] = int64_t(e.pg_id_); - j[pg_name_key_str] = e.pg_name_; - j[collective_seq_id_key_str] = int64_t(e.collective_seq_id_); - j[p2p_seq_id_key_str] = int64_t(e.p2p_seq_id_); - j[op_id_key_str] = int64_t(e.op_id_); - j[profiling_name_key_str] = e.profiling_name_; - j[time_created_key_str] = int64_t(e.time_created_); - if (e.duration_) { - j[duration_key_str] = *e.duration_; - } - auto it = e.sizes_.begin(); - auto read_sizes = [&](const c10::SmallVector& dims) { - auto sizes = std::list>(); - for (auto dim : dims) { - auto arg_sizes = std::list(); - for (auto i : c10::irange(dim)) { - (void)i; - arg_sizes.push_back(*it++); + json j; + if (onlyActive && e.time_discovered_completed_.has_value()) { + continue; } - sizes.push_back(arg_sizes); + j[record_id_key_str] = int64_t(e.id_); + j[pg_id_key_str] = int64_t(e.pg_id_); + j[pg_name_key_str] = e.pg_name_; + j[collective_seq_id_key_str] = int64_t(e.collective_seq_id_); + j[p2p_seq_id_key_str] = int64_t(e.p2p_seq_id_); + j[op_id_key_str] = int64_t(e.op_id_); + j[profiling_name_key_str] = e.profiling_name_; + j[time_created_key_str] = int64_t(e.time_created_); + if (e.duration_) { + j[duration_key_str] = *e.duration_; } - return sizes; - }; - j[input_sizes_key_str] = read_sizes(e.input_dims_); - std::vector input_dtypes_strs; - input_dtypes_strs.reserve(e.input_dtypes_.size()); - for (const auto& input_dtype : e.input_dtypes_) { - input_dtypes_strs.emplace_back(c10::toString(input_dtype)); - } - j[input_dtypes_key_str] = 
input_dtypes_strs; - j[output_sizes_key_str] = read_sizes(e.output_dims_); - std::vector output_dtypes_strs; - output_dtypes_strs.reserve(e.output_dtypes_.size()); - for (const auto& output_dtype : e.output_dtypes_) { - output_dtypes_strs.emplace_back(c10::toString(output_dtype)); - } - j[output_dtypes_key_str] = output_dtypes_strs; - if (e.time_discovered_completed_.has_value()) { - j[state_key_str] = completed_state_str; - } else if (e.time_discovered_started_.has_value()) { - j[state_key_str] = started_state_str; - } else { - j[state_key_str] = scheduled_state_str; - } - j[time_discovered_started_key_str] = - e.time_discovered_started_.has_value() - ? int64_t(*e.time_discovered_started_) - : 0; - j[time_discovered_completed_key_str] = - e.time_discovered_completed_.has_value() - ? int64_t(*e.time_discovered_completed_) - : 0; - j[retired_key_str] = e.retired_; - j[timeout_key_str] = e.timeout_ms_; - j[is_p2p_key_str] = e.isP2P_; - entries.emplace_back(j); + auto it = e.sizes_.begin(); + auto read_sizes = [&](const c10::SmallVector& dims) { + auto sizes = std::list>(); + for (auto dim : dims) { + auto arg_sizes = std::list(); + for (auto i : c10::irange(dim)) { + (void)i; + arg_sizes.push_back(*it++); + } + sizes.push_back(arg_sizes); + } + return sizes; + }; + j[input_sizes_key_str] = read_sizes(e.input_dims_); + std::vector input_dtypes_strs; + input_dtypes_strs.reserve(e.input_dtypes_.size()); + for (const auto& input_dtype : e.input_dtypes_) { + input_dtypes_strs.emplace_back(c10::toString(input_dtype)); + } + j[input_dtypes_key_str] = input_dtypes_strs; + j[output_sizes_key_str] = read_sizes(e.output_dims_); + std::vector output_dtypes_strs; + output_dtypes_strs.reserve(e.output_dtypes_.size()); + for (const auto& output_dtype : e.output_dtypes_) { + output_dtypes_strs.emplace_back(c10::toString(output_dtype)); + } + j[output_dtypes_key_str] = output_dtypes_strs; + if (e.time_discovered_completed_.has_value()) { + j[state_key_str] = completed_state_str; + } else if (e.time_discovered_started_.has_value()) { + j[state_key_str] = started_state_str; + } else { + j[state_key_str] = scheduled_state_str; + } + j[time_discovered_started_key_str] = + e.time_discovered_started_.has_value() + ? int64_t(*e.time_discovered_started_) + : 0; + j[time_discovered_completed_key_str] = + e.time_discovered_completed_.has_value() + ? 
int64_t(*e.time_discovered_completed_) + : 0; + j[retired_key_str] = e.retired_; + j[timeout_key_str] = e.timeout_ms_; + j[is_p2p_key_str] = e.isP2P_; + entries.emplace_back(j); } if (!entries.empty()) { - result[entries_key_str] = entries; + result[entries_key_str] = entries; } } -- Gitee From 2ef99184ab952d3403401ea5ae94d213c69d3140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Wed, 25 Jun 2025 09:46:07 +0000 Subject: [PATCH 152/328] =?UTF-8?q?!22304=20silentcheck:=20initialize=20or?= =?UTF-8?q?igin=5Fmatmul=20with=20delay=20Merge=20pull=20request=20!22304?= =?UTF-8?q?=20from=20=E7=8E=8B=E8=B6=85/v2.7.0=5Fcheckfix7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/asd/asd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/asd/asd.py b/torch_npu/asd/asd.py index 4f1229a5b7..4748614233 100644 --- a/torch_npu/asd/asd.py +++ b/torch_npu/asd/asd.py @@ -16,8 +16,6 @@ from ._silent_fault_data import SilentFaultData, SilentFaultDataV2 __all__ = [] -original_matmul = torch.matmul -original_tensor_matmul = torch.Tensor.matmul loggerSilent = logging.getLogger("torch_npu.silent_check") @@ -784,6 +782,8 @@ def _matmul_silent_check_decorator(func): matmul_check._startup() if matmul_check.with_checksum and not matmul_check.matmul_trigger: + original_matmul = torch.matmul + original_tensor_matmul = torch.Tensor.matmul torch_npu.asd.checksum.matmul = original_matmul torch.matmul = _trigger_matmul_decorator(original_matmul) torch.Tensor.matmul = _trigger_tensor_matmul_decorator(original_tensor_matmul) -- Gitee From 194867f06729acd9ed8c178da1ed64472f29b157 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 25 Jun 2025 11:05:06 +0000 Subject: [PATCH 153/328] !22316 Update op_plugin commit id Merge pull request !22316 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 423f8e137e..b5b70cf6bd 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 423f8e137e7859624655c203b9452fc47a3a6a89 +Subproject commit b5b70cf6bdaaacbab8abaf09e7035b320fef7c66 -- Gitee From 1fd0edff68e7abfa6f99723de88b0331d8371a4b Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 25 Jun 2025 14:05:04 +0000 Subject: [PATCH 154/328] !22326 Update op_plugin commit id Merge pull request !22326 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index b5b70cf6bd..b79b5bfa23 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit b5b70cf6bdaaacbab8abaf09e7035b320fef7c66 +Subproject commit b79b5bfa23346962e9705d2ea935c3ce17c5158d -- Gitee From c1c374e126ed09d14d29f763976045fce7c83523 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 26 Jun 2025 03:05:07 +0000 Subject: [PATCH 155/328] !22335 Update op_plugin commit id Merge pull request !22335 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index b79b5bfa23..d185706ebf 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit b79b5bfa23346962e9705d2ea935c3ce17c5158d +Subproject commit d185706ebf794d881ae4e9ac6e3383b58df5ca68 -- Gitee From f938314871a60452de18218a64c8e90a98d57988 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 26 Jun 2025 
03:05:07 +0000 Subject: [PATCH 156/328] !22335 Update op_plugin commit id Merge pull request !22335 from pta-robot/v2.7.1 -- Gitee From b5f6378b4f5ea59c96a2e7ef74040150bae93c15 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 26 Jun 2025 05:05:07 +0000 Subject: [PATCH 157/328] !22349 Update op_plugin commit id Merge pull request !22349 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index d185706ebf..975880ccb7 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit d185706ebf794d881ae4e9ac6e3383b58df5ca68 +Subproject commit 975880ccb7b09ae22724dbe2c068005da1deb573 -- Gitee From 6840bdede3f75ae8540358c58da2449f650d2f03 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 26 Jun 2025 11:05:07 +0000 Subject: [PATCH 158/328] !22362 Update op_plugin commit id Merge pull request !22362 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 975880ccb7..0b410f7e3b 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 975880ccb7b09ae22724dbe2c068005da1deb573 +Subproject commit 0b410f7e3be514df49d40a4196645d1845569338 -- Gitee From 0495b7151b0168a2c3b9f949793f3caa780d5b68 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 26 Jun 2025 14:20:07 +0000 Subject: [PATCH 159/328] !22367 Update op_plugin commit id Merge pull request !22367 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 0b410f7e3b..e7ce3bd4be 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 0b410f7e3be514df49d40a4196645d1845569338 +Subproject commit e7ce3bd4be612689bbaea8a10345f93613cecefa -- Gitee From 1336157a65d3cf7a0e270119ec1afd6cd3f58af8 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Fri, 27 Jun 2025 01:46:00 +0000 Subject: [PATCH 160/328] !22288 Update torchair commit id Merge pull request !22288 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index c5a9442d9c..61124234e9 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit c5a9442d9c0db6da50c28681ec7e1c8512cd1d95 +Subproject commit 61124234e931bae6f1d06ecf6d14fa6f4d93398b -- Gitee From c621fe7bc3f8d7766dc484190e9acb125587a4bc Mon Sep 17 00:00:00 2001 From: shenweiling Date: Fri, 27 Jun 2025 03:48:39 +0000 Subject: [PATCH 161/328] !22341 add npu_fused_infer_attention_v2 Merge pull request !22341 from shenweiling/v2.7.1 --- test/allowlist_for_publicAPI.json | 1 + 1 file changed, 1 insertion(+) diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index 0ce5aee4ca..ac638230f5 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -2828,6 +2828,7 @@ "npu_rms_norm", "npu_add_rms_norm_cast", "npu_fused_infer_attention_score", + "npu_fused_infer_attention_v2", "npu_mla_prolog", "npu_mla_prolog_v2", "npu_convert_weight_to_int4pack", -- Gitee From 43a4ea3415601d101bcf69032a3f49f8c003def2 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 27 Jun 2025 05:20:08 +0000 Subject: [PATCH 162/328] !22381 Update op_plugin commit id Merge pull request !22381 from pta-robot/v2.7.1 
--- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index e7ce3bd4be..a76556c13e 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit e7ce3bd4be612689bbaea8a10345f93613cecefa +Subproject commit a76556c13ee025dd17cc0171f888d2ab58d45f90 -- Gitee From 77e00fdf69649e6ea518f8bae2cf34e7c513defd Mon Sep 17 00:00:00 2001 From: DaiFu Date: Fri, 27 Jun 2025 08:00:25 +0000 Subject: [PATCH 163/328] !22380 add 2.7 inductor ci Merge pull request !22380 from DaiFu/v2.7.1_inductor_ci --- ci/access_control/test_manager.py | 5 ++++- ci/access_control_test.py | 3 +++ torch_npu/_inductor/__init__.py | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 torch_npu/_inductor/__init__.py diff --git a/ci/access_control/test_manager.py b/ci/access_control/test_manager.py index 4a26981117..871ecf747a 100644 --- a/ci/access_control/test_manager.py +++ b/ci/access_control/test_manager.py @@ -34,7 +34,7 @@ class TestMgr: def load(self, modify_files, world_size): with open(modify_files) as f: for line in f: - if world_size != 0 and "test/distributed/" in line: + if world_size != 0 and ("test/distributed/" in line or "test/_inductor/" in line): continue line = line.strip() self.modify_files.append(line) @@ -63,6 +63,9 @@ class TestMgr: def load_distributed_ut(self): self.test_files['ut_files'] += [str(i) for i in (BASE_DIR / 'test/distributed').rglob('test_*.py')] + def load_inductor_ut(self): + self.test_files['ut_files'] += [str(i) for i in (BASE_DIR / 'test/_inductor').rglob('test_*.py')] + def load_op_plugin_ut(self): version_path = get_test_torch_version_path() file_hash = {} diff --git a/ci/access_control_test.py b/ci/access_control_test.py index 86ea808604..216980d7af 100644 --- a/ci/access_control_test.py +++ b/ci/access_control_test.py @@ -118,6 +118,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description='Control needed ut cases') parser.add_argument('--all', action="store_true", help='Run all testcases') parser.add_argument('--distributed', action="store_true", help='Run distributed testcases') + parser.add_argument('--inductor', action="store_true", help='Run inductor testcases') parser.add_argument('--rank', default=0, type=int, help='Index of current ut nodes') parser.add_argument('--world_size', default=0, type=int, help='Number of ut nodes') parser.add_argument('--network_ops', action="store_true", help='Run network_ops testcases in the op-plugin repo') @@ -130,6 +131,8 @@ if __name__ == "__main__": test_mgr.load_all_ut(options.distributed, options.network_ops) elif options.distributed: test_mgr.load_distributed_ut() + elif options.inductor: + test_mgr.load_inductor_ut() elif os.path.exists(cur_modify_files): test_mgr.load(cur_modify_files, world_size=options.world_size) test_mgr.analyze() diff --git a/torch_npu/_inductor/__init__.py b/torch_npu/_inductor/__init__.py new file mode 100644 index 0000000000..0aba17d108 --- /dev/null +++ b/torch_npu/_inductor/__init__.py @@ -0,0 +1 @@ +import os \ No newline at end of file -- Gitee From b145f90ae368e26c660b89f2768873f35b926d91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Sat, 28 Jun 2025 02:18:08 +0000 Subject: [PATCH 164/328] =?UTF-8?q?!21639=20Set=20affinity=20optimization?= =?UTF-8?q?=20Merge=20pull=20request=20!21639=20from=20=E5=A7=9C=E6=80=A1?= =?UTF-8?q?=E6=96=87/v2.7.1=5Fbd?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- test/torch_npu_schema.json | 3 + .../csrc/core/npu/NPUAffinityController.cpp | 124 +++++++++++++----- .../csrc/core/npu/NPUAffinityController.h | 5 +- torch_npu/csrc/core/npu/NPUFunctions.cpp | 5 +- torch_npu/csrc/core/npu/NPUQueue.cpp | 2 + torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp | 1 + .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 2 + .../csrc/distributed/ProcessGroupHCCL.cpp | 1 + torch_npu/csrc/framework/LazyInitAclops.cpp | 9 -- torch_npu/csrc/framework/OpCommand.cpp | 3 +- .../csrc/framework/interface/EnvVariables.cpp | 10 -- torch_npu/csrc/npu/Module.cpp | 10 ++ torch_npu/utils/__init__.py | 3 +- torch_npu/utils/_module.py | 16 +-- torch_npu/utils/affinity.py | 6 +- 15 files changed, 127 insertions(+), 73 deletions(-) diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index d5f484c445..26e687fb09 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2720,6 +2720,9 @@ "torch_npu.utils.set_thread_affinity": { "signature": "(core_range: List[int] = None)" }, + "torch_npu.utils.reset_thread_affinity": { + "signature": "()" + }, "torch_npu.dynamo.torchair.scope.npu_stream_switch": { "signature": "(stream_tag: str, stream_priority: int = 0)" }, diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index a331439d9f..5567c3e6e2 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -14,6 +14,9 @@ namespace c10_npu { static thread_local ThreadType local_thread = ThreadType::MAIN_THREAD; +static pthread_t main_thread; +static bool start_main_thread_bind = false; + using ThreadCoreMap = std::unordered_map; static uint32_t cpu_affinity_mode; @@ -28,8 +31,7 @@ const std::unordered_map threadTypeToNameMap = { {ACL_THREAD, "acl_thread"}, {RELEASE_THREAD, "release_thread"}, {WATCHDOG_THREAD, "hccl_watchdog_t"}, - {OTHER_THREAD, "other_thread"}, - {USER_THREAD, "user_thread"}}; + {OTHER_THREAD, "other_thread"}}; CoreIdRange getCPUDefaultRange(c10::DeviceIndex device_id) { @@ -147,7 +149,7 @@ void printCoreRanges(const uint32_t mode, const std::vector &ranges oss << "Mode: " << mode << ". 
Core range for each device ID: "; for (size_t i = 0; i < ranges.size(); ++i) { - oss << "Device " << i << ": [" << ranges[i].start << "," << ranges[i].end << "]"; + oss << "Device " << i << ": [" << ranges[i].start << ", " << ranges[i].end << "]"; if (i != ranges.size() - 1) { oss << "; "; } else { @@ -194,18 +196,18 @@ void SetThreadType(ThreadType type) return; } if (prctl(PR_SET_NAME, threadTypeToNameMap.at(type).c_str()) != 0) { - ASCEND_LOGW("Set thread name of %s failed!", threadTypeToNameMap.at(type).c_str()); + ASCEND_LOGW("Set thread name to %s failed!", threadTypeToNameMap.at(type).c_str()); } } std::string getAffinityMapAsString(c10::DeviceIndex device_id, const ThreadCoreMap &threadCoreMap) { std::ostringstream oss; - for (auto local_thread : threadTypeList) { - oss << threadTypeToNameMap.at(local_thread) << " : [" - << threadCoreMap.at(local_thread).start << "," - << threadCoreMap.at(local_thread).end << "]"; - if (local_thread != OTHER_THREAD) { + for (auto thread_type : threadTypeList) { + oss << threadTypeToNameMap.at(thread_type) << ": [" + << threadCoreMap.at(thread_type).start << ", " + << threadCoreMap.at(thread_type).end << "]"; + if (thread_type != OTHER_THREAD) { oss << "; "; } else { oss << "."; @@ -222,16 +224,16 @@ ThreadCoreMap getCpuAffinityMap(c10::DeviceIndex device_id, const std::vector(device_index); - SetThreadType(type); + local_thread = type; + if (local_thread == ThreadType::MAIN_THREAD) { + start_main_thread_bind = true; + } SetThreadAffinity(device); } @@ -289,20 +308,55 @@ void SetThreadAffinity(int core_start, int core_end) if (!needToSetThreadAffinity()) { return; } + static int core_nums = sysconf(_SC_NPROCESSORS_ONLN); - core_start = std::min(core_start, core_nums); - core_end = std::min(core_end, core_nums); + CoreIdRange core_range; + core_range.start = static_cast(std::min(core_start, core_nums)); + core_range.end = static_cast(std::min(core_end, core_nums)); local_thread = ThreadType::USER_THREAD; - cpu_set_t mask; - CPU_ZERO(&mask); - for (auto i = core_start; i <= core_end; i++) { - CPU_SET(i, &mask); - } - if (!pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask)) { - ASCEND_LOGD("Set %s affinity to %d-%d success.", threadTypeToNameMap.at(local_thread).c_str(), core_start, core_end); + if (setThreadAffinityImpl(pthread_self(), core_range)) { + ASCEND_LOGD("Set thread affinity to user-defined range %d-%d success.", core_range.start, core_range.end); } else { - ASCEND_LOGE("Set %s affinity to %d-%d failed.", threadTypeToNameMap.at(local_thread).c_str(), core_start, core_end); + ASCEND_LOGE("Set thread affinity to user-defined range %d-%d failed.", core_range.start, core_range.end); + } +} + +void SetMainThread() +{ + main_thread = pthread_self(); +} + +bool NeedMainThreadBind() +{ + return start_main_thread_bind && (local_thread == ThreadType::MAIN_THREAD); +} + +void StartMainThreadBind(c10::DeviceIndex device_id) +{ + if (!needToSetThreadAffinity() || local_thread == ThreadType::USER_THREAD) { + return; + } + + static thread_local bool seted = false; + if (!seted) { + seted = true; + if (syscall(SYS_gettid) != getpid()) { + start_main_thread_bind = true; + + SetThreadAffinity(device_id); + + CoreIdRange core_range = getCoreRange(device_id, ThreadType::MAIN_THREAD); + if (setThreadAffinityImpl(main_thread, core_range)) { + ASCEND_LOGD("Device %d set %s affinity to %d-%d success.", + device_id, threadTypeToNameMap.at(ThreadType::MAIN_THREAD).c_str(), + core_range.start, core_range.end); + } else { + ASCEND_LOGE("Device %d set %s 
affinity to %d-%d failed.", + device_id, threadTypeToNameMap.at(ThreadType::MAIN_THREAD).c_str(), + core_range.start, core_range.end); + } + } } } diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.h b/torch_npu/csrc/core/npu/NPUAffinityController.h index 0ec3c4d995..e850a47b67 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.h +++ b/torch_npu/csrc/core/npu/NPUAffinityController.h @@ -20,9 +20,12 @@ enum ThreadType { }; void SetThreadType(ThreadType type); - void SetThreadAffinity(c10::DeviceIndex device); void SetThreadAffinity(ThreadType type); void SetThreadAffinity(int core_start, int core_end); +void SetMainThread(); +bool NeedMainThreadBind(); +void StartMainThreadBind(c10::DeviceIndex device_id); + } // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index 266eab3fa7..085bb0be9d 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -103,7 +103,10 @@ aclError SetDevice(c10::DeviceIndex device) if (local_device == device) { return ACL_ERROR_NONE; } - c10_npu::SetThreadAffinity(device); + + if (c10_npu::NeedMainThreadBind()) { + c10_npu::SetThreadAffinity(device); + } aclError err = aclrtSetDevice(device); if (err == ACL_ERROR_NONE) { diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 7767dda6b8..73e2bb7ca1 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -707,6 +707,8 @@ bool Repository::CheckInit() const void StartConsume(Repository *repo, c10::DeviceIndex device_id) { SetThreadType(ThreadType::ACL_THREAD); + SetThreadAffinity(device_id); + aclError ret = c10_npu::SetDevice(device_id); if (ret != 0) { C10_NPU_SHOW_ERR_MSG(); diff --git a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp index 54c5195213..cd00ca610a 100644 --- a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp +++ b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp @@ -53,6 +53,7 @@ void NPUGuardImpl::setDevice(c10::Device d) const void NPUGuardImpl::uncheckedSetDevice(c10::Device d) const noexcept { + c10_npu::StartMainThreadBind(d.index()); NPU_CHECK_WARN(c10_npu::SetDevice(d.index())); } diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index de1010347f..3564e5196c 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -189,6 +189,8 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) lazy_fn_.clear(); + SetMainThread(); + init_flag_ = true; ASCEND_LOGD("Npu sys ctrl initialize successfully."); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 04227f193a..3b2ad09cb4 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -1716,6 +1716,7 @@ void ProcessGroupHCCL::workCleanupLoop() try { if (needSetDevice) { c10::DeviceIndex device = static_cast(work.devices_[0].index()); + c10_npu::SetThreadAffinity(device); NPU_CHECK_ERROR(c10_npu::SetDevice(device)); deviceId_ = static_cast(work.devices_[0].index()); needSetDevice = false; diff --git a/torch_npu/csrc/framework/LazyInitAclops.cpp b/torch_npu/csrc/framework/LazyInitAclops.cpp index 8d12df0a31..5f51f9f0a5 100644 --- a/torch_npu/csrc/framework/LazyInitAclops.cpp +++ b/torch_npu/csrc/framework/LazyInitAclops.cpp @@ 
-4,7 +4,6 @@ #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/NpuVariables.h" -#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/register/OptionRegister.h" #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" @@ -158,8 +157,6 @@ void SetPrecisionMode() void LazyInitAclopsCore() { - c10_npu::SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); - #ifndef BUILD_LIBTORCH PyThreadState *gilState = nullptr; if (PyGILState_Check()) { @@ -175,8 +172,6 @@ void LazyInitAclopsCore() PyEval_RestoreThread(gilState); } #endif - - c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); } void LazyInitAclops() @@ -198,14 +193,10 @@ void LazyInitAclops() void InitAclopsCore() { - SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); - SetPrecisionMode(); MakeCompileCacheDirAndSetOption(); GetAndSetDefaultJitCompileByAcl(); SetHF32DefaultValue(); - - SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); } void InitAclops() diff --git a/torch_npu/csrc/framework/OpCommand.cpp b/torch_npu/csrc/framework/OpCommand.cpp index dd7ca13ab9..6b98651c51 100644 --- a/torch_npu/csrc/framework/OpCommand.cpp +++ b/torch_npu/csrc/framework/OpCommand.cpp @@ -120,7 +120,8 @@ OpCommand& OpCommand::Output(at::Tensor &output, const string &descName, return AddOutput(output, realType); } -void OpCommand::Run() { +void OpCommand::Run() +{ // Check for npu graph if (aclCmd->CheckCustomHandlerNull()) { c10_npu::assertNotCapturingAclop(aclCmd->GetName()); diff --git a/torch_npu/csrc/framework/interface/EnvVariables.cpp b/torch_npu/csrc/framework/interface/EnvVariables.cpp index e8c179d579..4da3d362d1 100644 --- a/torch_npu/csrc/framework/interface/EnvVariables.cpp +++ b/torch_npu/csrc/framework/interface/EnvVariables.cpp @@ -1,6 +1,5 @@ #include #include "torch_npu/csrc/core/npu/NPUException.h" -#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "third_party/acl/inc/acl/acl_mdl.h" #include "torch_npu/csrc/framework/utils/ForceJitCompileList.h" @@ -47,8 +46,6 @@ REGISTER_OPTION_HOOK(mdldumpconfigpath, [](const std::string &val) { aclmdlSetDump(val.c_str()); }) -static bool acl_op_has_init = false; - REGISTER_OPTION_BOOL_FUNCTION(CheckJitDisableInner, jitCompile, "enable", "disable") REGISTER_OPTION_CACHE(bool, isJitDisable, CheckJitDisableInner) REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { @@ -60,14 +57,7 @@ REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { "Jit compile set is disabled! If you want to set, ", "please change the environment variable ACL_OP_INIT_MODE to 0 or 1.", PTA_ERROR(ErrCode::NOT_SUPPORT)); - if (!acl_op_has_init) { - c10_npu::SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); - } NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, val.c_str())); - if (!acl_op_has_init) { - c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); - acl_op_has_init = true; - } } SET_OPTION_WITH_CACHE(isJitDisable, ("disable" == val) ? 
true : false); }) diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 0d0c15808e..df72d5010f 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -1553,6 +1553,15 @@ PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* args) } else { c10_npu::SetThreadAffinity(core_start, core_end); } + + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THNPModule_npu_reset_thread_affinity(PyObject* self, PyObject* noargs) +{ + HANDLE_TH_ERRORS + c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); Py_RETURN_NONE; END_HANDLE_TH_ERRORS } @@ -1680,6 +1689,7 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_set_module_train_state", (PyCFunction)THNPModule_npu_set_module_train_state, METH_O, nullptr}, {"_get_silent_check_version", (PyCFunction)THNPModule_npu_get_silent_check_version, METH_NOARGS, nullptr}, {"_npu_set_thread_affinity", (PyCFunction)THNPModule_npu_set_thread_affinity, METH_VARARGS, nullptr}, + {"_npu_reset_thread_affinity", (PyCFunction)THNPModule_npu_reset_thread_affinity, METH_NOARGS, nullptr}, {"_npu_set_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_set_fft_plan_cache_max_size, METH_VARARGS, nullptr}, {"_npu_get_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_max_size, METH_NOARGS, nullptr}, {"_npu_get_fft_plan_cache_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_size, METH_NOARGS, nullptr}, diff --git a/torch_npu/utils/__init__.py b/torch_npu/utils/__init__.py index e8fbd923b3..4ba06538af 100644 --- a/torch_npu/utils/__init__.py +++ b/torch_npu/utils/__init__.py @@ -1,5 +1,5 @@ __all__ = ["npu_combine_tensors", "get_part_combined_tensor", "is_combined_tensor_valid", "FlopsCounter", - "set_thread_affinity"] + "set_thread_affinity", "reset_thread_affinity"] from torch_npu import _C from ._module import _apply_module_patch @@ -18,6 +18,7 @@ from .utils import _print_error_log, _print_warn_log, _print_info_log, _apply_np from ._step import add_perf_dump_patch from .flops_count import _FlopsCounter as FlopsCounter from .affinity import _set_thread_affinity as set_thread_affinity +from .affinity import _reset_thread_affinity as reset_thread_affinity # init flopcount diff --git a/torch_npu/utils/_module.py b/torch_npu/utils/_module.py index 7e3269f88e..cb0a8dd8b3 100644 --- a/torch_npu/utils/_module.py +++ b/torch_npu/utils/_module.py @@ -29,8 +29,6 @@ from torch_npu.utils.syncbatchnorm import SyncBatchNorm as sync_batch_norm from torch_npu.utils._error_code import ErrCode, pta_error origin_mpdl_iter_init = _MultiProcessingDataLoaderIter.__init__ -origin_worker_loop = worker._worker_loop -origin_pin_memory_loop = pin_memory._pin_memory_loop CONV3D_SUPPORT_FP32_SOC_PREFIX = ["Ascend910B", "Ascend910_93"] @@ -369,17 +367,9 @@ def _mpdl_iter_init(self, *args, **kwargs): torch_npu.npu.synchronize() except Exception as e: print(e) - origin_mpdl_iter_init(self, *args, **kwargs) - - -def _npu_worker_loop(*args, **kwargs): torch_npu._C._npu_set_thread_affinity(-1, -1) - origin_worker_loop(*args, **kwargs) - - -def _npu_pin_memory_loop(*args, **kwargs): - torch_npu._C._npu_set_thread_affinity(-1, -1) - origin_pin_memory_loop(*args, **kwargs) + origin_mpdl_iter_init(self, *args, **kwargs) + torch_npu._C._npu_reset_thread_affinity() def _parallel_apply( @@ -532,5 +522,3 @@ def _apply_module_patch(): torch.nn.parallel.DataParallel.parallel_apply = npu_parallel_apply torch.nn.parallel.data_parallel = npu_data_parallel 
torch.utils.data.dataloader._MultiProcessingDataLoaderIter.__init__ = _mpdl_iter_init - torch.utils.data._utils.worker._worker_loop = _npu_worker_loop - torch.utils.data._utils.pin_memory._pin_memory_loop = _npu_pin_memory_loop diff --git a/torch_npu/utils/affinity.py b/torch_npu/utils/affinity.py index 7728736baa..37973f5bc7 100644 --- a/torch_npu/utils/affinity.py +++ b/torch_npu/utils/affinity.py @@ -14,4 +14,8 @@ def _set_thread_affinity(core_range: List[int] = None): raise ValueError("Core range should be nonnegative." + pta_error(ErrCode.PARAM)) torch_npu._C._npu_set_thread_affinity(core_range[0], core_range[1]) else: - raise ValueError("The length of input list of set_thread_affinity should be 2." + pta_error(ErrCode.PARAM)) \ No newline at end of file + raise ValueError("The length of input list of set_thread_affinity should be 2." + pta_error(ErrCode.PARAM)) + + +def _reset_thread_affinity(): + torch_npu._C._npu_reset_thread_affinity() \ No newline at end of file -- Gitee From 563f3e35d9acf95b9788e0ede3e529fb72555d65 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Sat, 28 Jun 2025 06:08:58 +0000 Subject: [PATCH 165/328] !22402 Update torchair commit id Merge pull request !22402 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 61124234e9..8035ce4339 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 61124234e931bae6f1d06ecf6d14fa6f4d93398b +Subproject commit 8035ce4339f636e332ff56f39a579f6930ddf6ff -- Gitee From 4fed2fdc370c8e296f70e608e1f87eb0fc24a2c3 Mon Sep 17 00:00:00 2001 From: chuboning Date: Sat, 28 Jun 2025 07:33:41 +0000 Subject: [PATCH 166/328] !22299 Add support for custom dtype Merge pull request !22299 from chuboning/v2.7.1 --- CMakeLists.txt | 5 +- codegen/gen_backend_stubs.py | 2 + codegen/utils.py | 6 + test/allowlist_for_publicAPI.json | 3 + test/npu/test_tensors.py | 22 + test/onnx/test_pytorch_onnx_no_runtime.py | 2 +- test/torch_npu_schema.json | 13 +- third_party/acl/inc/acl/acl_base.h | 10 + torch_npu/__init__.py | 8 +- torch_npu/csrc/InitNpuBindings.cpp | 2 + .../csrc/aten/common/FormatCastKernelNpu.cpp | 192 +++++- .../csrc/aten/common/LocalScalarDenseNpu.cpp | 23 +- torch_npu/csrc/aten/common/ToKernelNpu.cpp | 2 +- torch_npu/csrc/aten/npu_native_functions.yaml | 9 +- .../csrc/core/npu/NPUCachingAllocator.cpp | 12 +- torch_npu/csrc/core/npu/NPUException.cpp | 2 +- torch_npu/csrc/core/npu/NPUException.h | 2 +- torch_npu/csrc/core/npu/NPUMacros.h | 2 +- torch_npu/csrc/core/npu/NpuVariables.cpp | 44 +- torch_npu/csrc/core/npu/NpuVariables.h | 5 +- .../csrc/core/npu/interface/AclInterface.cpp | 4 +- .../csrc/core/npu/register/OptionRegister.cpp | 13 + torch_npu/csrc/custom_dtype/CMakeLists.txt | 6 + .../csrc/custom_dtype/CastKernelTeOpApi.cpp | 43 ++ torch_npu/csrc/custom_dtype/Init.cpp | 163 +++++ torch_npu/csrc/custom_dtype/Init.h | 83 +++ torch_npu/csrc/custom_dtype/extension.h | 12 + .../csrc/distributed/ProcessGroupHCCL.cpp | 14 +- .../csrc/distributed/rpc/tensorpipe_agent.cpp | 3 + torch_npu/csrc/framework/FormatHelper.cpp | 4 + torch_npu/csrc/framework/OpCommand.cpp | 8 +- torch_npu/csrc/framework/OpParamMaker.cpp | 1 + .../csrc/framework/StorageDescHelper.cpp | 23 + torch_npu/csrc/framework/StorageDescHelper.h | 4 + .../framework/contiguous/reshapeV2_opt.cpp | 8 + .../csrc/framework/interface/EnvVariables.cpp | 18 +- 
.../csrc/framework/utils/CalcuOpUtil.cpp | 46 +- torch_npu/csrc/framework/utils/CalcuOpUtil.h | 1 + .../csrc/framework/utils/OpPreparation.cpp | 5 + .../csrc/framework/utils/OpPreparation.h | 1 + torch_npu/csrc/npu/DataParallelComm.cpp | 2 +- torch_npu/onnx/wrapper_onnx_ops.py | 8 +- torch_npu/utils/hif8_tensor.py | 584 ++++++++++++++++++ 43 files changed, 1349 insertions(+), 71 deletions(-) create mode 100644 torch_npu/csrc/custom_dtype/CMakeLists.txt create mode 100644 torch_npu/csrc/custom_dtype/CastKernelTeOpApi.cpp create mode 100644 torch_npu/csrc/custom_dtype/Init.cpp create mode 100644 torch_npu/csrc/custom_dtype/Init.h create mode 100644 torch_npu/csrc/custom_dtype/extension.h create mode 100644 torch_npu/utils/hif8_tensor.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 56c2baf63f..34058b029f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -248,6 +248,7 @@ add_subdirectory(${TORCHNPU_ROOT}/core) add_subdirectory(${TORCHNPU_ROOT}/framework) add_subdirectory(${TORCHNPU_ROOT}/flopcount) add_subdirectory(${TORCHNPU_ROOT}/logging) +add_subdirectory(${TORCHNPU_ROOT}/custom_dtype) if (NOT DEFINED BUILD_LIBTORCH) add_subdirectory(${TORCHNPU_ROOT}/distributed) @@ -274,10 +275,10 @@ if (DEFINED BUILD_TENSORPIPE) endif() if (DEFINED BUILD_LIBTORCH) - set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${FLOP_SRCS} ${FRAMEWORK_SRCS} ${LOGGING_SRCS} ${NPU_CPP_LIBS_SRCS}) + set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${FLOP_SRCS} ${CUS_DTYPE_SRCS} ${FRAMEWORK_SRCS} ${LOGGING_SRCS} ${NPU_CPP_LIBS_SRCS}) else() # Compile code with pybind11 - set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${DIST_SRCS} ${FLOP_SRCS} ${LOGGING_SRCS} ${FRAMEWORK_SRCS} ${NPU_SRCS} ${PROF_SRCS} ${UTILS_SRCS} ${SAN_SRCS}) + set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${DIST_SRCS} ${FLOP_SRCS} ${CUS_DTYPE_SRCS} ${LOGGING_SRCS} ${FRAMEWORK_SRCS} ${NPU_SRCS} ${PROF_SRCS} ${UTILS_SRCS} ${SAN_SRCS}) endif() add_library(${PLUGIN_NAME} SHARED ${CPP_SRCS}) diff --git a/codegen/gen_backend_stubs.py b/codegen/gen_backend_stubs.py index 78e1a1df5a..89416889de 100644 --- a/codegen/gen_backend_stubs.py +++ b/codegen/gen_backend_stubs.py @@ -402,6 +402,8 @@ def gen_dispatcher_registrations( ns_helper = NamespaceHelper(namespace_str="at") native_func_header = """\ #include "torch_npu/csrc/core/npu/NPURecovery.h" +#include "torch_npu/csrc/core/npu/NpuVariables.h" +#include "torch_npu/csrc/core/npu/NPUException.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/profiler/utils.h" #endif diff --git a/codegen/utils.py b/codegen/utils.py index 7d561cb66c..8022c57ec5 100644 --- a/codegen/utils.py +++ b/codegen/utils.py @@ -434,6 +434,7 @@ const DeviceGuard device_guard(device_or_default(device));""" device_guard = f"const OptionalDeviceGuard device_guard(device_of({device_of}));" op_key = str(f.func.name) + is_aclnn_only = "c10_npu::IsAclnnOnly()" if enable_opplugin(): if op_key in GLOBAL_STRUCTURED_OP_INFO_CACHE: impl_name = f"op_plugin::{GLOBAL_STRUCTURED_OP_INFO_CACHE[op_key]}" @@ -505,6 +506,11 @@ if (C10_UNLIKELY(at_npu::native::env::CheckOpHookEnable())) {{ if (({force_aclnn} || at_npu::native::env::CheckJitDisable()){tensor_check_str}) {{ return {op_api_impl_name}({args_exprs_str}); }} else {{ + if ({is_aclnn_only}) {{ + TORCH_CHECK(false, + "Current device only support aclnn operator, and current operator {impl_name} do not support internal format.", + PTA_ERROR(ErrCode::NOT_SUPPORT)); + }} return {impl_name}({args_exprs_str}); }} """ diff --git a/test/allowlist_for_publicAPI.json 
b/test/allowlist_for_publicAPI.json index ac638230f5..e24bb675fe 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -2822,6 +2822,7 @@ "npu_cross_entropy_loss", "npu_format_cast_", "npu_fusion_attention", + "npu_fusion_attention_v2", "npu_get_float_status", "npu_nms_rotated", "npu_random_choice_with_mask", @@ -2833,6 +2834,7 @@ "npu_mla_prolog_v2", "npu_convert_weight_to_int4pack", "npu_ffn", + "npu_fused_matmul", "npu_geglu", "npu_grouped_matmul", "npu_quant_matmul", @@ -2851,6 +2853,7 @@ "npu_trans_quant_param", "npu_stride_add", "npu_sort_v2", + "npu_dtype_cast", "npu_gelu", "npu_gelu_backward", "npu_all_gather_base_mm", diff --git a/test/npu/test_tensors.py b/test/npu/test_tensors.py index 3108eb9b64..e5fc17ae6d 100644 --- a/test/npu/test_tensors.py +++ b/test/npu/test_tensors.py @@ -1,4 +1,5 @@ from copy import deepcopy +import unittest import numpy as np import torch import torch_npu @@ -22,6 +23,16 @@ types = [ ] +def skipIfUnsupport910_95(): + def skip_dec(func): + def wrapper(self): + if "Ascend910_95" not in torch_npu.npu.get_device_name(): + return unittest.SkipTest("Device 910_95 condition not satisfied") + return func(self) + return wrapper + return skip_dec + + def get_npu_type(type_name): if isinstance(type_name, type): type_name = '{}.{}'.format(type_name.__module__, type_name.__name__) @@ -383,5 +394,16 @@ class TestViewOps(TestCase): self.assertEqual(tensor.view(3, -1).size(), target) +class TestTensorDtype(TestCase): + @skipIfUnsupport910_95() + def test_fp8(self): + tensor1 = torch.randn([2, 2], dtype=torch.float32).npu() + tensor2 = torch.randn([2, 2], dtype=torch.float32).npu() + tensor_f8e5m2 = tensor1.to(torch.float8_e5m2) + tensor_f8e4m3fn = tensor2.to(torch.float8_e4m3fn) + self.assertEqual(tensor_f8e5m2.dtype, torch.float8_e5m2) + self.assertEqual(tensor_f8e4m3fn.dtype, torch.float8_e4m3fn) + + if __name__ == "__main__": run_tests() diff --git a/test/onnx/test_pytorch_onnx_no_runtime.py b/test/onnx/test_pytorch_onnx_no_runtime.py index ceac73119a..90ffef6872 100644 --- a/test/onnx/test_pytorch_onnx_no_runtime.py +++ b/test/onnx/test_pytorch_onnx_no_runtime.py @@ -345,7 +345,7 @@ class TestONNXExport(pytorch_test_common.ExportTestCase): return x[mask] f = io.BytesIO() - torch.onnx.export( + torch.onnx.utils._export( FooMod(), (torch.rand(3, 4),), f, diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index 26e687fb09..07fad8c4ce 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2529,7 +2529,7 @@ "signature": "(*args, **kwargs)" }, "torch_npu.npu_format_cast": { - "signature": "(self, acl_format)" + "signature": "(self, acl_format, customize_dtype=None)" }, "torch_npu.npu_format_cast_": { "signature": "(*args, **kwargs)" @@ -2766,16 +2766,16 @@ "signature": "(int[] size, *, ScalarType? dtype=None, Device? device=None) -> Tensor" }, "func: npu_format_cast": { - "signature": "(Tensor self, int acl_format) -> Tensor" + "signature": "(Tensor self, int acl_format, int? customize_dtype=None) -> Tensor" }, "func: npu_format_cast_": { - "signature": "(Tensor(a!) self, Tensor src) -> Tensor(a!)" + "signature": "(Tensor(a!) self, Tensor src, int? customize_dtype=None) -> Tensor(a!)" }, "func: npu_format_cast_.acl_format": { - "signature": "(Tensor(a!) self, int acl_format) -> Tensor(a!)" + "signature": "(Tensor(a!) self, int acl_format, int? 
customize_dtype=None) -> Tensor(a!)" }, "func: npu_format_cast.Tensor": { - "signature": "(Tensor self, Tensor dst) -> Tensor" + "signature": "(Tensor self, Tensor dst, int? customize_dtype=None) -> Tensor" }, "func: npu_change_data_ptr": { "signature": "(Tensor dst, Tensor src, int index) -> int" @@ -2795,6 +2795,9 @@ "func: _npu_format_cast": { "signature": "(Tensor self, int acl_format) -> Tensor" }, + "func: _npu_format_cast.aclnn": { + "signature": "(Tensor self, int acl_format, int customize_dtype) -> Tensor" + }, "torch_npu_public_env: INF_NAN_MODE_ENABLE": { "mode": "std::unordered_map infNanMode = {{0, \"max\"}, {1, \"inf_nan\"}}" }, diff --git a/third_party/acl/inc/acl/acl_base.h b/third_party/acl/inc/acl/acl_base.h index b8ef9dbd34..4178016df5 100755 --- a/third_party/acl/inc/acl/acl_base.h +++ b/third_party/acl/inc/acl/acl_base.h @@ -165,6 +165,14 @@ typedef enum { ACL_INT4 = 29, ACL_UINT1 = 30, ACL_COMPLEX32 = 33, + ACL_HIFLOAT8 = 34, + ACL_FLOAT8_E5M2 = 35, + ACL_FLOAT8_E4M3FN = 36, + ACL_FLOAT8_E8M0 = 37, + ACL_FLOAT6_E3M2 = 38, + ACL_FLOAT6_E2M3 = 39, + ACL_FLOAT4_E2M1 = 40, + ACL_FLOAT4_E1M2 = 41, } aclDataType; typedef enum { @@ -183,6 +191,8 @@ typedef enum { ACL_FRACTAL_Z_3D = 33, ACL_FORMAT_NC = 35, ACL_FORMAT_NCL = 47, + ACL_FORMAT_FRACTAL_NZ_C0_16 = 50, + ACL_FORMAT_FRACTAL_NZ_C0_32 = 51, } aclFormat; typedef enum { diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index d84f72b37a..dd80a69fae 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -1,4 +1,4 @@ -__all__ = ["erase_stream", "matmul_checksum"] +__all__ = ["erase_stream", "matmul_checksum", "HiFloat8Tensor"] import os import sys @@ -84,6 +84,7 @@ from torch_npu.utils import _cann_package_check, _add_intercept_methods from torch_npu.utils import _register_ops_under_dtensor_rules from torch_npu.utils.exposed_api import public_npu_functions from torch_npu.npu.utils import _erase_stream as erase_stream +from torch_npu.utils.hif8_tensor import HiFloat8Tensor from torch_npu.utils._error_code import ErrCode, pta_error, _except_handler from torch_npu.asd.asd import _asd_patch from torch_npu.asd.checksum import _matmul_checksum as matmul_checksum @@ -113,6 +114,11 @@ for name in dir(torch.ops.npu): __all__.append(name) setattr(torch, name, _wrap_torch_error_func(getattr(torch.ops.npu, name))) +for name in dir(torch_npu._C._cd.DType): + if name.startswith('__') or name in ['_dir', 'name']: + continue + setattr(torch_npu, name, getattr(torch_npu._C._cd.DType, name)) + all_monkey_patches = [ ["nn.functional", npu_functional], ["nn", npu_modules], diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp index c8084af923..4c1bf40361 100644 --- a/torch_npu/csrc/InitNpuBindings.cpp +++ b/torch_npu/csrc/InitNpuBindings.cpp @@ -16,6 +16,7 @@ #include "torch_npu/csrc/flopcount/Init.h" #include "torch_npu/csrc/logging/Init.h" #include "torch_npu/csrc/npu/Module.h" +#include "torch_npu/csrc/custom_dtype/Init.h" #include "torch_npu/csrc/npu/Stress_detect.h" #include "torch_npu/csrc/utils/TensorType.h" #include "torch_npu/csrc/utils/AutocastMode.h" @@ -168,6 +169,7 @@ PyObject* initModule() AddPyMethodDefs(methods, torch_npu::autocast::autocast_mode_functions()); AddPyMethodDefs(methods, torch_npu::flopcount::flops_count_functions()); AddPyMethodDefs(methods, torch_npu::logging::logging_functions()); + AddPyMethodDefs(methods, c10_npu::custom_dtype_functions()); static struct PyModuleDef torchnpu_module = { PyModuleDef_HEAD_INIT, "torch_npu._C", diff --git 
a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp index 0c4000e524..2c35eaf44c 100644 --- a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp @@ -1,16 +1,131 @@ #include "torch_npu/csrc/framework/FormatHelper.h" #include "torch_npu/csrc/framework/utils/OpAdapter.h" #include "torch_npu/csrc/framework/utils/NpuStorageOffsetGuard.h" +#include "torch_npu/csrc/framework/StorageDescHelper.h" #include "torch_npu/csrc/aten/common/FormatCastHelper.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "torch_npu/csrc/core/NPUBridge.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" +#include "torch_npu/csrc/core/npu/NpuVariables.h" #include "torch_npu/csrc/aten/CustomFunctions.h" +#include "torch_npu/csrc/custom_dtype/Init.h" +#include "third_party/op-plugin/op_plugin/utils/op_api_common.h" namespace at_npu { namespace native { using tensor_list = std::vector; +using GetFormatFunc = int (*)(const aclTensor *, const int, const int, int64_t **, uint64_t *, int *); + +std::tuple> MaybeUseAclnnNpuFormatCast(const at::Tensor& src, + int64_t acl_format, c10::optional customize_dtype) +{ + const static auto GetFormatFuncAddr = GetOpApiFuncAddr("aclnnNpuFormatCastCalculateSizeAndFormat"); + const static auto FormatCastFuncAddr = GetOpApiFuncAddr("aclnnNpuFormatCast"); + + const static bool aclnnNpuFormatCastExist = + (GetFormatFuncAddr == nullptr || FormatCastFuncAddr == nullptr) ? false : true; + + GetFormatFunc GetFormat = reinterpret_cast(GetFormatFuncAddr); + int64_t *dstStorageShape = nullptr; + uint64_t dstShapeSize = 0; + int dstFormat; + at::SmallVector outputShape = {}; + aclDataType customizeAcltype = (customize_dtype.has_value()) ? + c10_npu::GetAclDataType(customize_dtype.value()) : + at_npu::native::OpPreparation::convert_to_acl_data_type(src.scalar_type()); + + if (c10_npu::IsAclnnOnly()) { + if (aclnnNpuFormatCastExist) { + auto api_ret = GetFormat(ConvertType(src), acl_format, customizeAcltype, &dstStorageShape, + &dstShapeSize, &dstFormat); + NPU_CHECK_ERROR(api_ret, "aclnnNpuFormatCastCalculateSizeAndFormat"); + for (uint64_t i = 0; i < dstShapeSize; i++) { + outputShape.push_back(dstStorageShape[i]); + } + delete[] dstStorageShape; + return std::make_tuple(true, dstFormat, outputShape); + } + TORCH_CHECK(false, + "aclnnNpuFormatCast does not exist, Current device only support aclnn operators.", + PTA_ERROR(ErrCode::NOT_SUPPORT)); + } + if (at_npu::native::env::CheckJitDisable()) { + if (aclnnNpuFormatCastExist) { + auto api_ret = GetFormat(ConvertType(src), acl_format, customizeAcltype, &dstStorageShape, + &dstShapeSize, &dstFormat); + if (api_ret != 0) { + if (customize_dtype.has_value()) { + NPU_CHECK_ERROR(api_ret, "aclnnNpuFormatCastCalculateSizeAndFormat"); + } + return std::make_tuple(false, dstFormat, outputShape); + } + for (uint64_t i = 0; i < dstShapeSize; i++) { + outputShape.push_back(dstStorageShape[i]); + } + delete[] dstStorageShape; + return std::make_tuple(true, dstFormat, outputShape); + } else { + if (C10_UNLIKELY(customize_dtype.has_value())) { + TORCH_CHECK(false, + "customize_dtype is not supported while aclnnNpuFormatCast does not exist.", + PTA_ERROR(ErrCode::NOT_SUPPORT)); + } + return std::make_tuple(false, dstFormat, outputShape); + } + } else { + if (C10_UNLIKELY(customize_dtype.has_value())) { + TORCH_CHECK(false, + "customize_dtype is not supported while jit_compile=True.", + PTA_ERROR(ErrCode::NOT_SUPPORT)); + } + return 
std::make_tuple(false, dstFormat, outputShape); + } +} + +at::Tensor create_tensor_with_format_and_shape(c10::IntArrayRef baseSizes, + c10::IntArrayRef storageSizes, + const caffe2::TypeMeta dtype, int64_t acl_format) +{ + c10::Allocator *allocator = c10_npu::NPUCachingAllocator::get(); + int64_t nelements = 1; + for (const auto& num : storageSizes) { + nelements *= num; + } + int64_t size_bytes = nelements * dtype.itemsize(); + c10::intrusive_ptr storage_impl = torch_npu::make_npu_storage_impl( + c10::StorageImpl::use_byte_size_t(), + c10::SymInt(size_bytes), + allocator->allocate(size_bytes), + allocator, + true); + auto tensor = at::detail::make_tensor(storage_impl, dtype); + + if (baseSizes.size() != 1 || baseSizes[0] != 0) { + tensor.unsafeGetTensorImpl()->set_sizes_contiguous(baseSizes); + } + tensor.unsafeGetTensorImpl()->empty_tensor_restride(c10::MemoryFormat::Contiguous); + StorageDescHelper::SetDesc(tensor, baseSizes, storageSizes, tensor.strides(), static_cast(acl_format)); + return tensor; +} + +at::Tensor format_cast_impl_out_npu_aclnn(const at::Tensor& src, + int64_t acl_format, c10::IntArrayRef storageSizes) +{ + auto src_new = src.contiguous(); + auto src_new_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src_new)->npu_desc_; + + at::Tensor dst = create_tensor_with_format_and_shape( + src_new.sizes(), storageSizes, src.dtype(), acl_format); + + // calculate the output result of the NPU + EXEC_NPU_CMD(aclnnNpuFormatCast, src_new, dst); + + // format cast only change physical layout of base tensor and view tensor's + // metadata remain unchanged + dst.set_(dst.storage(), src_new.storage_offset(), src_new.sizes(), src_new.strides()); + return dst; +} at::Tensor format_cast_impl_out_npu(at::Tensor& dst, const at::Tensor& src) { @@ -36,7 +151,8 @@ at::Tensor format_cast_impl_out_npu(at::Tensor& dst, const at::Tensor& src) } // convert src from src_format to dst_format, write the result into dst(self) -at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& self, const at::Tensor& src) +at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& self, const at::Tensor& src, + c10::optional customize_dtype) { torch_npu::utils::torch_check_npu(self); torch_npu::utils::torch_check_npu(src); @@ -47,6 +163,13 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& self, const at::Ten return self; } + auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, dst_desc.npu_format_, customize_dtype); + if (useAclnn == true) { + at::Tensor dst = format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape); + self.set_(dst.storage(), dst.storage_offset(), dst.sizes(), dst.strides()); + return self; + } + // calculate the output result of the NPU format_cast_impl_out_npu(self, src); @@ -59,16 +182,6 @@ at::Tensor npu_format_cast_impl( int64_t acl_format) { auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; - if (src_desc.npu_format_ == acl_format) { - ASCEND_LOGD("no need to do format cast"); - return src; - } - if (FormatHelper::IsBaseFormatType(src) && - FormatHelper::IsBaseFormatType(static_cast(acl_format))) { - FormatCastHelper::format_cast_as_base_format(src, static_cast(acl_format)); - return src; - } - at::Tensor dst = OpPreparation::ApplyTensorWithFormat( src_desc.base_sizes_, src.options(), acl_format); @@ -84,18 +197,20 @@ at::Tensor npu_format_cast_impl( // conver self to dst'format, write the result into new result tensor at::Tensor NPUNativeFunctions::npu_format_cast( const at::Tensor& self, - const at::Tensor& dst) + 
const at::Tensor& dst, + c10::optional customize_dtype) { torch_npu::utils::torch_check_npu(dst); auto dst_desc = torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_; int64_t dst_format = dst_desc.npu_format_; - return custom_ops::npu_format_cast(self, dst_format); + return custom_ops::npu_format_cast(self, dst_format, customize_dtype); } // conver self to acl_format, write the result into self at::Tensor& NPUNativeFunctions::npu_format_cast_( at::Tensor& self, - int64_t acl_format) + int64_t acl_format, + c10::optional customize_dtype) { torch_npu::utils::torch_check_npu(self); auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_; @@ -108,6 +223,13 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_( return self; } + auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, acl_format, customize_dtype); + if (useAclnn == true) { + at::Tensor dst = format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape); + self.set_(dst.storage(), dst.storage_offset(), dst.sizes(), dst.strides()); + return self; + } + at::Tensor dst = OpPreparation::ApplyTensorWithFormat( src_desc.base_sizes_, self.options(), acl_format); @@ -130,16 +252,54 @@ int64_t NPUNativeFunctions::get_npu_format(const at::Tensor& self) at::Tensor NPUNativeFunctions::_npu_format_cast(const at::Tensor& self, int64_t acl_format) { - return npu_format_cast_impl(self, acl_format); + auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_; + if (src_desc.npu_format_ == acl_format) { + ASCEND_LOGD("no need to do format cast"); + return self; + } + if (FormatHelper::IsBaseFormatType(self) && + FormatHelper::IsBaseFormatType(static_cast(acl_format))) { + FormatCastHelper::format_cast_as_base_format(self, static_cast(acl_format)); + return self; + } + auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, acl_format, c10::nullopt); + if (useAclnn == false) { + return npu_format_cast_impl(self, acl_format); + } + return format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape); +} + +at::Tensor NPUNativeFunctions::_npu_format_cast(const at::Tensor& self, int64_t acl_format, + int64_t customize_dtype) +{ + auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_; + if (src_desc.npu_format_ == acl_format) { + ASCEND_LOGD("no need to do format cast"); + return self; + } + if (FormatHelper::IsBaseFormatType(self) && + FormatHelper::IsBaseFormatType(static_cast(acl_format))) { + FormatCastHelper::format_cast_as_base_format(self, static_cast(acl_format)); + return self; + } + auto [useAclnn, outFormat, StorageShape] = MaybeUseAclnnNpuFormatCast(self, acl_format, customize_dtype); + if (useAclnn == false) { + return npu_format_cast_impl(self, acl_format); + } + return format_cast_impl_out_npu_aclnn(self, outFormat, StorageShape); } -at::Tensor NPUNativeFunctions::npu_format_cast(const at::Tensor& self, int64_t acl_format) +at::Tensor NPUNativeFunctions::npu_format_cast(const at::Tensor& self, int64_t acl_format, + c10::optional customize_dtype) { torch_npu::utils::torch_check_npu(self); if (NPUNativeFunctions::get_npu_format(self) == acl_format) { ASCEND_LOGD("no need to do format cast"); return self; } + if (customize_dtype.has_value()) { + return custom_ops::_npu_format_cast(self, acl_format, customize_dtype.value()); + } return custom_ops::_npu_format_cast(self, acl_format); } diff --git a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp index 775d95cbfa..685f907653 100644 --- 
a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp +++ b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp @@ -10,13 +10,34 @@ namespace at_npu { namespace native { +#define AT_DISPATCH_CASE_ALL_TYPES_AND5( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, ...) \ + AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__) + + +#define AT_DISPATCH_ALL_TYPES_AND5( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_ALL_TYPES_AND5( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, __VA_ARGS__)) + + c10::Scalar NPUNativeFunctions::_local_scalar_dense(const at::Tensor& self) { c10::Scalar r; - AT_DISPATCH_ALL_TYPES_AND3( + AT_DISPATCH_ALL_TYPES_AND5( at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, + at::ScalarType::Float8_e5m2, + at::ScalarType::Float8_e4m3fn, self.scalar_type(), "_local_scalar_dense_npu", [&] { diff --git a/torch_npu/csrc/aten/common/ToKernelNpu.cpp b/torch_npu/csrc/aten/common/ToKernelNpu.cpp index 6a6b3ffa9f..68c93d95dc 100644 --- a/torch_npu/csrc/aten/common/ToKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/ToKernelNpu.cpp @@ -166,7 +166,7 @@ at::Tensor NPUNativeFunctions::to( "dtype cast replace with float."); } dtype = (dtype == at::ScalarType::Double) ? at::ScalarType::Float : dtype; - return custom_ops::npu_dtype_cast(self, dtype); + return custom_ops::_npu_dtype_cast(self, dtype); } at::Tensor NPUNativeFunctions::to( diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml index 95bb740db1..b186df7651 100644 --- a/torch_npu/csrc/aten/npu_native_functions.yaml +++ b/torch_npu/csrc/aten/npu_native_functions.yaml @@ -62,12 +62,12 @@ custom: - func: npu_change_data_ptr(Tensor dst, Tensor src, int index) -> int device_check: NoCheck - func: get_npu_format(Tensor self) -> int - - func: npu_format_cast.Tensor(Tensor self, Tensor dst) -> Tensor + - func: npu_format_cast.Tensor(Tensor self, Tensor dst, int? customize_dtype=None) -> Tensor device_check: NoCheck exposed: True - - func: npu_format_cast_.acl_format(Tensor(a!) self, int acl_format) -> Tensor(a!) + - func: npu_format_cast_.acl_format(Tensor(a!) self, int acl_format, int? customize_dtype=None) -> Tensor(a!) exposed: True - - func: npu_format_cast_(Tensor(a!) self, Tensor src) -> Tensor(a!) + - func: npu_format_cast_(Tensor(a!) self, Tensor src, int? customize_dtype=None) -> Tensor(a!) device_check: NoCheck exposed: True - func: empty_with_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2, int? base_addr_aligned_kb=None) -> Tensor @@ -82,9 +82,10 @@ custom: - func: copy_memory_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) device_check: NoCheck - func: get_storage_size(Tensor self) -> int - - func: npu_format_cast(Tensor self, int acl_format) -> Tensor + - func: npu_format_cast(Tensor self, int acl_format, int? customize_dtype=None) -> Tensor exposed: True - func: _npu_format_cast(Tensor self, int acl_format) -> Tensor + - func: _npu_format_cast.aclnn(Tensor self, int acl_format, int customize_dtype) -> Tensor - func: empty_with_swapped_memory(int[] size, *, ScalarType? dtype=None, Device? 
device=None) -> Tensor dispatch: CompositeExplicitAutograd: empty_with_swapped_memory diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index d3425f6f44..a37bae6fbc 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -728,6 +728,7 @@ BlockState::BlockState(Block* block) SegmentState::SegmentState(Block* head) { + TORCH_INTERNAL_ASSERT(head != nullptr, PTA_ERROR(ErrCode::PTR)); TORCH_INTERNAL_ASSERT(head->prev == nullptr && head->pool != nullptr); is_small = head->pool->is_small; @@ -886,7 +887,7 @@ size_t CachingAllocatorConfig::parseExpandableSegments(const std::vectorsize; auto new_candidate = candidate->next; + if (C10_UNLIKELY(new_candidate == nullptr)) { + return nullptr; + } if (!map_block(new_candidate, std::min(remaining, candidate->next->size), ctx)) { return nullptr; } @@ -2459,7 +2463,11 @@ private: { bool freed_memory = false; for (const auto &name : FreeNPUMemoryCallbacksRegistry()->Keys()) { - freed_memory |= FreeNPUMemoryCallbacksRegistry()->Create(name)->Execute(); + if (FreeNPUMemoryCallbacksRegistry()->Create(name) != nullptr) { + freed_memory |= FreeNPUMemoryCallbacksRegistry()->Create(name)->Execute(); + } else { + TORCH_CHECK(false, "free memory callback get nullptr", PTA_ERROR(ErrCode::PTR)); + } } return freed_memory; } diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index 5732b6c0b8..12ed02d528 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -92,7 +92,7 @@ MemUceInfo memUceInfo; std::mutex memUceInfoMutex; -void set_mem_uce_info(MemUceInfo info) +void set_mem_uce_info(MemUceInfo& info) { std::lock_guard lock(memUceInfoMutex); memUceInfo = info; diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h index a82f8f1568..4c178816a8 100644 --- a/torch_npu/csrc/core/npu/NPUException.h +++ b/torch_npu/csrc/core/npu/NPUException.h @@ -260,7 +260,7 @@ bool checkUceErrAndRepair(bool check_error, std::string& err_msg); void record_mem_hbm_ecc_error(); -void set_mem_uce_info(MemUceInfo info); +void set_mem_uce_info(MemUceInfo& info); MemUceInfo get_mem_uce_info(); diff --git a/torch_npu/csrc/core/npu/NPUMacros.h b/torch_npu/csrc/core/npu/NPUMacros.h index 3223c4f325..960dcb97b6 100644 --- a/torch_npu/csrc/core/npu/NPUMacros.h +++ b/torch_npu/csrc/core/npu/NPUMacros.h @@ -29,6 +29,6 @@ #define TORCH_NPU_API C10_NPU_API -#define C10_COMPILE_TIME_MAX_NPUS 16 +#define C10_COMPILE_TIME_MAX_NPUS 32 // A maximum of 8 P2P links can be created on a NPU device #define C10_P2P_ACCESS_MAX_NPUS 8 diff --git a/torch_npu/csrc/core/npu/NpuVariables.cpp b/torch_npu/csrc/core/npu/NpuVariables.cpp index 4e0fce02fb..bfe2ee7245 100644 --- a/torch_npu/csrc/core/npu/NpuVariables.cpp +++ b/torch_npu/csrc/core/npu/NpuVariables.cpp @@ -39,28 +39,37 @@ static std::map socVersionMap = { {"Ascend910_9372", SocVersion::Ascend910_9372}, {"Ascend910_9362", SocVersion::Ascend910_9362}}; -void SetSocVersion(const char* const socVersion) { - if (socVersion == nullptr || - g_curSocVersion != SocVersion::UnsupportedSocVersion) { - return; - } +void SetSocVersion(const char* const socVersion) +{ + if (socVersion == nullptr || + g_curSocVersion != SocVersion::UnsupportedSocVersion) { + return; + } - SocVersion curSocVersion = SocVersion::UnsupportedSocVersion; + SocVersion curSocVersion = SocVersion::UnsupportedSocVersion; + std::string inputVersion = 
socVersion; + std::string ascend95Version = "Ascend910_95"; - auto const& iter = socVersionMap.find(socVersion); - if (iter != socVersionMap.end()) { - curSocVersion = iter->second; - } else { - std::string unsupported_soc(socVersion); - std::replace(std::begin(unsupported_soc), std::end(unsupported_soc), '_', ' '); - AT_ERROR("Unsupported soc version: ", unsupported_soc); - } + auto const& iter = socVersionMap.find(socVersion); + if (iter != socVersionMap.end()) { + curSocVersion = iter->second; + } else if ((inputVersion.compare(0, ascend95Version.size(), ascend95Version) == 0)) { + curSocVersion = SocVersion::Ascend910_95; + } else { + std::string unsupported_soc(socVersion); + std::replace(std::begin(unsupported_soc), std::end(unsupported_soc), '_', ' '); + AT_ERROR("Unsupported soc version: ", unsupported_soc); + } - g_curSocVersion = curSocVersion; + g_curSocVersion = curSocVersion; } const SocVersion& GetSocVersion() { + if (g_curSocVersion == SocVersion::UnsupportedSocVersion) { + auto soc_name = c10_npu::acl::AclGetSocName(); + SetSocVersion(soc_name); + } return g_curSocVersion; } @@ -94,5 +103,10 @@ bool IsBF16Supported() { return GetSocVersion() >= SocVersion::Ascend910B1; } + +bool IsAclnnOnly() +{ + return GetSocVersion() >= SocVersion::Ascend910_95; +} } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NpuVariables.h b/torch_npu/csrc/core/npu/NpuVariables.h index 3119a64515..2fe0de9aff 100644 --- a/torch_npu/csrc/core/npu/NpuVariables.h +++ b/torch_npu/csrc/core/npu/NpuVariables.h @@ -30,7 +30,8 @@ enum class SocVersion { Ascend910_9381, Ascend910_9382, Ascend910_9372, - Ascend910_9362 + Ascend910_9362, + Ascend910_95 = 260 }; void SetSocVersion(const char* const socVersion); @@ -40,6 +41,8 @@ const SocVersion& GetSocVersion(); bool IsSupportInfNan(); bool IsBF16Supported(); + +bool IsAclnnOnly(); } // namespace c10_npu #endif diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 923f53cd1a..75658804e2 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -174,6 +174,7 @@ aclError AclrtSetStreamFailureMode(aclrtStream stream, uint64_t mode) { if (stream == nullptr) { // default stream return ACL_ERROR_INVALID_PARAM; } + typedef aclError(*aclrtSetStreamFailureModeFunc)(aclrtStream, uint64_t); static aclrtSetStreamFailureModeFunc func = (aclrtSetStreamFailureModeFunc)GET_FUNC(aclrtSetStreamFailureMode); if (func == nullptr) { @@ -844,7 +845,8 @@ bool IsCaptureSupported() static bool have_load_func = false; static bool default_support_capture = ((GetSocVersion() >= SocVersion::Ascend910B1) && (GetSocVersion() < SocVersion::Ascend310B1)) || - (GetSocVersion() >= SocVersion::Ascend910_9391); + ((GetSocVersion() >= SocVersion::Ascend910_9391) && + (GetSocVersion() < SocVersion::Ascend910_95)); if (default_support_capture && !have_load_func) { have_load_func = true; typedef aclError (*AclmdlRICaptureGetInfo)(aclrtStream, aclmdlRICaptureStatus *, aclmdlRI *); diff --git a/torch_npu/csrc/core/npu/register/OptionRegister.cpp b/torch_npu/csrc/core/npu/register/OptionRegister.cpp index 8f7f17a011..9e46d36a6f 100644 --- a/torch_npu/csrc/core/npu/register/OptionRegister.cpp +++ b/torch_npu/csrc/core/npu/register/OptionRegister.cpp @@ -4,6 +4,7 @@ #include "torch_npu/csrc/core/npu/register/OptionRegister.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/core/npu/npu_log.h" +#include 
"torch_npu/csrc/core/npu/NpuVariables.h" namespace c10_npu { namespace option { @@ -84,6 +85,18 @@ OptionInterfaceBuilder::OptionInterfaceBuilder(const std::string &name, ::std::u void SetOption(const std::string &key, const std::string &val) { + if (c10_npu::IsAclnnOnly()) { + if (key == "jitCompile" && val == "enable") { + TORCH_NPU_WARN_ONCE("Current device only support jit_compile=False, ", + "the requested value True is invalid and has been reverted to False."); + return register_options::OptionRegister::GetInstance()->Set(key, "disable"); + } + if (key == "ALLOW_INTERNAL_FORMAT" && val == "enable") { + TORCH_NPU_WARN_ONCE("Current device only support allow_internal_format=False, ", + "the requested value True is invalid and has been reverted to False."); + return register_options::OptionRegister::GetInstance()->Set(key, "disable"); + } + } register_options::OptionRegister::GetInstance()->Set(key, val); } diff --git a/torch_npu/csrc/custom_dtype/CMakeLists.txt b/torch_npu/csrc/custom_dtype/CMakeLists.txt new file mode 100644 index 0000000000..7d3d7c0e53 --- /dev/null +++ b/torch_npu/csrc/custom_dtype/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB _CUS_DTYPE_SRCS *.cpp) + +LIST(APPEND CUS_DTYPE_SRCS ${_CUS_DTYPE_SRCS}) + +# Pass to parent +set(CUS_DTYPE_SRCS ${CUS_DTYPE_SRCS} PARENT_SCOPE) diff --git a/torch_npu/csrc/custom_dtype/CastKernelTeOpApi.cpp b/torch_npu/csrc/custom_dtype/CastKernelTeOpApi.cpp new file mode 100644 index 0000000000..2293ba94dd --- /dev/null +++ b/torch_npu/csrc/custom_dtype/CastKernelTeOpApi.cpp @@ -0,0 +1,43 @@ +#include "torch_npu/csrc/custom_dtype/extension.h" +#include "op_plugin/AclOpsInterface.h" +#include "op_plugin/OpApiInterface.h" +#include "op_plugin/utils/op_api_common.h" + + +namespace c10_npu { + +at::Tensor cast_to_fp8(const at::Tensor &input, int otype) +{ + auto output = at::empty_like(input, c10_npu::GetATenDType(otype)); + + if (input.numel() == 0) { + return output; + } + + aclDataType out_acltype = c10_npu::GetAclDataType(otype); + TensorWrapper out_wrapper = {output, out_acltype}; + EXEC_NPU_CMD(aclnnCast, input, out_acltype, out_wrapper); + + return output; +} + +void cast_to_fp8_noalloc(const at::Tensor &input, at::Tensor output, int otype) +{ + aclDataType out_acltype = c10_npu::GetAclDataType(otype); + TensorWrapper out_wrapper = {output, out_acltype}; + EXEC_NPU_CMD(aclnnCast, input, out_acltype, out_wrapper); + return; +} + +at::Tensor cast_from_fp8(const at::Tensor &input, int itype, int otype) +{ + aclDataType input_acltype = c10_npu::GetAclDataType(itype); + aclDataType out_acltype = c10_npu::GetAclDataType(otype); + auto output = at::empty_like(input, c10_npu::GetATenDType(otype)); + TensorWrapper input_wrapper = {input, input_acltype}; + TensorWrapper out_wrapper = {output, out_acltype}; + EXEC_NPU_CMD(aclnnCast, input_wrapper, out_acltype, out_wrapper); + + return output; +} +} diff --git a/torch_npu/csrc/custom_dtype/Init.cpp b/torch_npu/csrc/custom_dtype/Init.cpp new file mode 100644 index 0000000000..90644aa1e3 --- /dev/null +++ b/torch_npu/csrc/custom_dtype/Init.cpp @@ -0,0 +1,163 @@ +#include "torch_npu/csrc/custom_dtype/Init.h" +#ifndef BUILD_LIBTORCH +#include +#include +#endif +#include "torch_npu/csrc/custom_dtype/extension.h" + + +namespace c10_npu { +struct DTypeConstants { + static const int float32_value; + static const int float16_value; + static const int int8_value; + static const int int32_value; + static const int uint8_value; + static const int int16_value; + static const int uint16_value; + static const int 
uint32_value; + static const int int64_value; + static const int uint64_value; + static const int float64_value; + static const int bool_value; + static const int string_value; + static const int complex64_value; + static const int complex128_value; + static const int bfloat16_value; + static const int int4_value; + static const int uint1_value; + static const int complex32_value; + static const int hifloat8_value; + static const int float8_e5m2_value; + static const int float8_e4m3fn_value; + static const int float8_e8m0_value; + static const int float6_e3m2_value; + static const int float6_e2m3_value; + static const int float4_e2m1_value; + static const int float4_e1m2_value; +}; + +const int DTypeConstants::float32_value = static_cast(DType::FLOAT); +const int DTypeConstants::float16_value = static_cast(DType::FLOAT16); +const int DTypeConstants::int8_value = static_cast(DType::INT8); +const int DTypeConstants::int32_value = static_cast(DType::INT32); +const int DTypeConstants::uint8_value = static_cast(DType::UINT8); +const int DTypeConstants::int16_value = static_cast(DType::INT16); +const int DTypeConstants::uint16_value = static_cast(DType::UINT16); +const int DTypeConstants::uint32_value = static_cast(DType::UINT32); +const int DTypeConstants::int64_value = static_cast(DType::INT64); +const int DTypeConstants::uint64_value = static_cast(DType::UINT64); +const int DTypeConstants::float64_value = static_cast(DType::DOUBLE); +const int DTypeConstants::bool_value = static_cast(DType::BOOL); +const int DTypeConstants::string_value = static_cast(DType::STRING); +const int DTypeConstants::complex64_value = static_cast(DType::COMPLEX64); +const int DTypeConstants::complex128_value = static_cast(DType::COMPLEX128); +const int DTypeConstants::bfloat16_value = static_cast(DType::BF16); +const int DTypeConstants::int4_value = static_cast(DType::INT4); +const int DTypeConstants::uint1_value = static_cast(DType::UINT1); +const int DTypeConstants::complex32_value = static_cast(DType::COMPLEX32); +const int DTypeConstants::hifloat8_value = static_cast(DType::HIFLOAT8); +const int DTypeConstants::float8_e5m2_value = static_cast(DType::FLOAT8_E5M2); +const int DTypeConstants::float8_e4m3fn_value = static_cast(DType::FLOAT8_E4M3FN); +const int DTypeConstants::float8_e8m0_value = static_cast(DType::FLOAT8_E8M0); +const int DTypeConstants::float6_e3m2_value = static_cast(DType::FLOAT6_E3M2); +const int DTypeConstants::float6_e2m3_value = static_cast(DType::FLOAT6_E2M3); +const int DTypeConstants::float4_e2m1_value = static_cast(DType::FLOAT4_E2M1); +const int DTypeConstants::float4_e1m2_value = static_cast(DType::FLOAT4_E1M2); + +#ifndef BUILD_LIBTORCH +PyObject* cd_initExtension(PyObject*, PyObject *) +{ + auto torch_npu_C_module = THPObjectPtr(PyImport_ImportModule("torch_npu._C")); + if (!torch_npu_C_module) { + return nullptr; + } + auto torch_npu_C_m = py::handle(torch_npu_C_module).cast(); + auto m = torch_npu_C_m.def_submodule("_cd", "_cd bindings"); + + py::class_(m, "DType") + .def_readonly_static("float32", &DTypeConstants::float32_value) + .def_readonly_static("float16", &DTypeConstants::float16_value) + .def_readonly_static("int8", &DTypeConstants::int8_value) + .def_readonly_static("int32", &DTypeConstants::int32_value) + .def_readonly_static("uint8", &DTypeConstants::uint8_value) + .def_readonly_static("int16", &DTypeConstants::int16_value) + .def_readonly_static("uint16", &DTypeConstants::uint16_value) + .def_readonly_static("uint32", &DTypeConstants::uint32_value) + 
.def_readonly_static("int64", &DTypeConstants::int64_value) + .def_readonly_static("uint64", &DTypeConstants::uint64_value) + .def_readonly_static("float64", &DTypeConstants::float64_value) + .def_readonly_static("bool", &DTypeConstants::bool_value) + .def_readonly_static("string", &DTypeConstants::string_value) + .def_readonly_static("complex64", &DTypeConstants::complex64_value) + .def_readonly_static("complex128", &DTypeConstants::complex128_value) + .def_readonly_static("bfloat16", &DTypeConstants::bfloat16_value) + .def_readonly_static("int4", &DTypeConstants::int4_value) + .def_readonly_static("uint1", &DTypeConstants::uint1_value) + .def_readonly_static("complex32", &DTypeConstants::complex32_value) + .def_readonly_static("hifloat8", &DTypeConstants::hifloat8_value) + .def_readonly_static("float8_e5m2", &DTypeConstants::float8_e5m2_value) + .def_readonly_static("float8_e4m3fn", &DTypeConstants::float8_e4m3fn_value) + .def_readonly_static("float8_e8m0", &DTypeConstants::float8_e8m0_value) + .def_readonly_static("float6_e3m2", &DTypeConstants::float6_e3m2_value) + .def_readonly_static("float6_e2m3", &DTypeConstants::float6_e2m3_value) + .def_readonly_static("float4_e2m1", &DTypeConstants::float4_e2m1_value) + .def_readonly_static("float4_e1m2", &DTypeConstants::float4_e1m2_value); + + m.def("cast_to_fp8", &cast_to_fp8, "Cast to FP8", py::call_guard()); + m.def("cast_to_fp8_noalloc", &cast_to_fp8_noalloc, "Cast to FP8", + py::call_guard()); + m.def("cast_from_fp8", &cast_from_fp8, "Cast from FP8", py::call_guard()); + + Py_RETURN_NONE; +} + +static PyMethodDef NPUCustomDtypeMethods[] = { // NOLINT + {"_cd_init", cd_initExtension, METH_NOARGS, nullptr}, + {nullptr, nullptr, 0, nullptr} +}; +#endif + +const std::string CustomDataTypeToString(int64_t dType) +{ + const std::map + TYPE_TO_STRING_MAP = { + {DType::FLOAT, "torch_npu.float32"}, + {DType::FLOAT16, "torch_npu.float16"}, + {DType::INT8, "torch_npu.int8"}, + {DType::INT32, "torch_npu.int32"}, + {DType::UINT8, "torch_npu.uint8"}, + {DType::INT16, "torch_npu.int16"}, + {DType::UINT16, "torch_npu.uint16"}, + {DType::UINT32, "torch_npu.uint32"}, + {DType::INT64, "torch_npu.int64"}, + {DType::UINT64, "torch_npu.uint64"}, + {DType::DOUBLE, "torch_npu.float64"}, + {DType::BOOL, "torch_npu.bool"}, + {DType::STRING, "torch_npu.string"}, + {DType::COMPLEX64, "torch_npu.complex64"}, + {DType::COMPLEX128, "torch_npu.complex128"}, + {DType::BF16, "torch_npu.bfloat16"}, + {DType::INT4, "torch_npu.int4"}, + {DType::UINT1, "torch_npu.uint1"}, + {DType::COMPLEX32, "torch_npu.complex32"}, + {DType::HIFLOAT8, "torch_npu.hifloat8"}, + {DType::FLOAT8_E5M2, "torch_npu.float8_e5m2"}, + {DType::FLOAT8_E4M3FN, "torch_npu.float8_e4m3fn"}, + {DType::FLOAT8_E8M0, "torch_npu.float8_e8m0"}, + {DType::FLOAT6_E3M2, "torch_npu.float6_e3m2"}, + {DType::FLOAT6_E2M3, "torch_npu.float6_e2m3"}, + {DType::FLOAT4_E2M1, "torch_npu.float4_e2m1"}, + {DType::FLOAT4_E1M2, "torch_npu.float4_e1m2"}}; + + const auto iter = TYPE_TO_STRING_MAP.find(static_cast(dType)); + return iter != TYPE_TO_STRING_MAP.end() ? 
iter->second : "Unknown dtype"; +} + +#ifndef BUILD_LIBTORCH +PyMethodDef* custom_dtype_functions() +{ + return NPUCustomDtypeMethods; +} +#endif +} diff --git a/torch_npu/csrc/custom_dtype/Init.h b/torch_npu/csrc/custom_dtype/Init.h new file mode 100644 index 0000000000..23235a0027 --- /dev/null +++ b/torch_npu/csrc/custom_dtype/Init.h @@ -0,0 +1,83 @@ +#pragma once + +#include +#ifndef BUILD_LIBTORCH +#include +#endif +#include "torch_npu/csrc/core/npu/NPUMacros.h" +#include "torch_npu/csrc/core/npu/NPUException.h" +#include "torch_npu/csrc/framework/utils/OpPreparation.h" +#include "third_party/acl/inc/acl/acl_base.h" + +namespace c10_npu { +const int g_toAclOffset = 256; + +#define ENUM_OFFSET(new_name, old_name) new_name = static_cast(old_name) + g_toAclOffset, + +#ifndef BUILD_LIBTORCH +TORCH_NPU_API PyMethodDef* custom_dtype_functions(); +#endif + +enum class DType { + UNDEFINED = -1, + ENUM_OFFSET(FLOAT, ACL_FLOAT) + ENUM_OFFSET(FLOAT16, ACL_FLOAT16) + ENUM_OFFSET(INT8, ACL_INT8) + ENUM_OFFSET(INT32, ACL_INT32) + ENUM_OFFSET(UINT8, ACL_UINT8) + ENUM_OFFSET(INT16, ACL_INT16) + ENUM_OFFSET(UINT16, ACL_UINT16) + ENUM_OFFSET(UINT32, ACL_UINT32) + ENUM_OFFSET(INT64, ACL_INT64) + ENUM_OFFSET(UINT64, ACL_UINT64) + ENUM_OFFSET(DOUBLE, ACL_DOUBLE) + ENUM_OFFSET(BOOL, ACL_BOOL) + ENUM_OFFSET(STRING, ACL_STRING) + ENUM_OFFSET(COMPLEX64, ACL_COMPLEX64) + ENUM_OFFSET(COMPLEX128, ACL_COMPLEX128) + ENUM_OFFSET(BF16, ACL_BF16) + ENUM_OFFSET(INT4, ACL_INT4) + ENUM_OFFSET(UINT1, ACL_UINT1) + ENUM_OFFSET(COMPLEX32, ACL_COMPLEX32) + ENUM_OFFSET(HIFLOAT8, ACL_HIFLOAT8) + ENUM_OFFSET(FLOAT8_E5M2, ACL_FLOAT8_E5M2) + ENUM_OFFSET(FLOAT8_E4M3FN, ACL_FLOAT8_E4M3FN) + ENUM_OFFSET(FLOAT8_E8M0, ACL_FLOAT8_E8M0) + ENUM_OFFSET(FLOAT6_E3M2, ACL_FLOAT6_E3M2) + ENUM_OFFSET(FLOAT6_E2M3, ACL_FLOAT6_E2M3) + ENUM_OFFSET(FLOAT4_E2M1, ACL_FLOAT4_E2M1) + ENUM_OFFSET(FLOAT4_E1M2, ACL_FLOAT4_E1M2) +}; + +inline bool IsCustomDType(int64_t t) +{ + if (t >= g_toAclOffset) { + return true; + } + return false; +} + +// Both c10_npu::DType and ScalarType are supported +inline aclDataType GetAclDataType(int64_t t) +{ + if (t >= g_toAclOffset) { + return static_cast(t - g_toAclOffset); + } + return at_npu::native::OpPreparation::convert_to_acl_data_type( + static_cast(t)); +} + +inline aclDataType GetAclDataType(DType t) +{ + return static_cast(static_cast(t) - g_toAclOffset); +} + +inline at::ScalarType GetATenDType(int64_t t) +{ + aclDataType aclType = GetAclDataType(t); + return at_npu::native::OpPreparation::convert_to_scalar_type(aclType); +} + +const std::string CustomDataTypeToString(int64_t dType); + +} // namespace c10_npu diff --git a/torch_npu/csrc/custom_dtype/extension.h b/torch_npu/csrc/custom_dtype/extension.h new file mode 100644 index 0000000000..91ef1df8a5 --- /dev/null +++ b/torch_npu/csrc/custom_dtype/extension.h @@ -0,0 +1,12 @@ +#pragma once + +#include +#include "torch_npu/csrc/custom_dtype/Init.h" + +namespace c10_npu { +at::Tensor cast_to_fp8(const at::Tensor &input, int otype); + +void cast_to_fp8_noalloc(const at::Tensor &input, at::Tensor output, int otype); + +at::Tensor cast_from_fp8(const at::Tensor &input, int itype, int otype); +} diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 3b2ad09cb4..1d14cf06ef 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -3703,7 +3703,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allreduce( [&](std::vector& hcclStreams, c10::intrusive_ptr&) 
{ if (tensors[0].scalar_type() == at::kBool || tensors[0].scalar_type() == at::kByte) { c10_npu::NPUStreamGuard guard(hcclStreams[0]); - tensors_cp[0] = at_npu::native::custom_ops::npu_dtype_cast(tensors[0], at::kInt); + tensors_cp[0] = at_npu::native::custom_ops::_npu_dtype_cast(tensors[0], at::kInt); } }, [&](std::vector& hcclStreams, c10::intrusive_ptr&) { @@ -3881,7 +3881,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allreduce_coalesced( for (const auto i : c10::irange(tensors.size())) { if (tensors[i].scalar_type() == at::kBool || tensors[i].scalar_type() == at::kByte) { c10_npu::NPUStreamGuard guard(hcclStreams[0]); - tensors_cp[i] = at_npu::native::custom_ops::npu_dtype_cast(tensors[i], at::kInt); + tensors_cp[i] = at_npu::native::custom_ops::_npu_dtype_cast(tensors[i], at::kInt); } } }, @@ -3945,7 +3945,7 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce( [&](std::vector& hcclStreams, c10::intrusive_ptr&) { if (tensors[0].scalar_type() == at::kBool || tensors[0].scalar_type() == at::kByte) { c10_npu::NPUStreamGuard guard(hcclStreams[0]); - tensors_cp[0] = at_npu::native::custom_ops::npu_dtype_cast(tensors[0], at::kInt); + tensors_cp[0] = at_npu::native::custom_ops::_npu_dtype_cast(tensors[0], at::kInt); } }, [&](std::vector& hcclStreams, c10::intrusive_ptr&) { @@ -4005,11 +4005,11 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_oop( [&](std::vector& hcclStreams, c10::intrusive_ptr&) { if (inputTensors[0].scalar_type() == at::kBool || inputTensors[0].scalar_type() == at::kByte) { c10_npu::NPUStreamGuard guard(hcclStreams[0]); - inputTensors[0] = at_npu::native::custom_ops::npu_dtype_cast(inputTensors[0], at::kInt); + inputTensors[0] = at_npu::native::custom_ops::_npu_dtype_cast(inputTensors[0], at::kInt); } if (outputTensors[0].scalar_type() == at::kBool || outputTensors[0].scalar_type() == at::kByte) { c10_npu::NPUStreamGuard guard(hcclStreams[0]); - outputTensors[0] = at_npu::native::custom_ops::npu_dtype_cast(outputTensors[0], at::kInt); + outputTensors[0] = at_npu::native::custom_ops::_npu_dtype_cast(outputTensors[0], at::kInt); } }, [&](std::vector& hcclStreams, c10::intrusive_ptr&) { @@ -4044,14 +4044,14 @@ at::Tensor ProcessGroupHCCL::byte_alignment(at::Tensor& tensors) const if (num_add != 0) { bool transflag = false; if (inter_tensors.scalar_type() == at::ScalarType::Bool) { - inter_tensors = at_npu::native::custom_ops::npu_dtype_cast(inter_tensors, at::ScalarType::Int); + inter_tensors = at_npu::native::custom_ops::_npu_dtype_cast(inter_tensors, at::ScalarType::Int); transflag = true; } inter_tensors = op_plugin::constant_pad_nd(inter_tensors, {0, num_add}, 0); if (transflag) { - inter_tensors = at_npu::native::custom_ops::npu_dtype_cast(inter_tensors, at::ScalarType::Bool); + inter_tensors = at_npu::native::custom_ops::_npu_dtype_cast(inter_tensors, at::ScalarType::Bool); } } return inter_tensors; diff --git a/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp index 1b85e7fce6..319de4ae93 100644 --- a/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -424,6 +424,9 @@ void TensorPipeAgent::startImpl() priority = opts_.transports->size() - 1 - (iter - opts_.transports->begin()); } std::unique_ptr reg = TensorPipeTransportRegistry()->Create(key); + if (reg == nullptr || reg->transport == nullptr) { + TORCH_CHECK(false, "TensorPipeTransport get nullptr", DIST_ERROR(ErrCode::PTR)); + } if (!reg->transport->isViable()) { continue; } diff --git 
a/torch_npu/csrc/framework/FormatHelper.cpp b/torch_npu/csrc/framework/FormatHelper.cpp index 6a92fe5af4..9bd270b8fd 100644 --- a/torch_npu/csrc/framework/FormatHelper.cpp +++ b/torch_npu/csrc/framework/FormatHelper.cpp @@ -52,6 +52,10 @@ std::unordered_map FormatHelper::Initialize {ACL_FORMAT_NDC1HWC0, (FormatInfo){ACL_FORMAT_NDC1HWC0, ACL_FORMAT_NCDHW, InferShapeOfNDC1HWC0, "NDC1HWC0", true}}, {ACL_FRACTAL_Z_3D, (FormatInfo){ACL_FRACTAL_Z_3D, ACL_FORMAT_NCDHW, InferShapeOfFZ3D, "FRACTAL_Z_3D", true}}, + {ACL_FORMAT_FRACTAL_NZ_C0_16, + (FormatInfo){ACL_FORMAT_FRACTAL_NZ_C0_16, ACL_FORMAT_ND, nullptr, "FRACTAL_NZ_C0_16", true}}, + {ACL_FORMAT_FRACTAL_NZ_C0_32, + (FormatInfo){ACL_FORMAT_FRACTAL_NZ_C0_32, ACL_FORMAT_ND, nullptr, "FRACTAL_NZ_C0_32", true}}, }; }; diff --git a/torch_npu/csrc/framework/OpCommand.cpp b/torch_npu/csrc/framework/OpCommand.cpp index 6b98651c51..80af05f94b 100644 --- a/torch_npu/csrc/framework/OpCommand.cpp +++ b/torch_npu/csrc/framework/OpCommand.cpp @@ -24,7 +24,9 @@ static std::unordered_map> floating_limits_m {at::ScalarType::Double, {std::numeric_limits::max(), std::numeric_limits::min()}}, {at::ScalarType::Float, {std::numeric_limits::max(), std::numeric_limits::min()}}, {at::ScalarType::BFloat16, {std::numeric_limits::max(), std::numeric_limits::min()}}, - {at::ScalarType::Half, {65504, -65504}}}; + {at::ScalarType::Half, {65504, -65504}}, + {at::ScalarType::Float8_e5m2, {57345, -57345}}, + {at::ScalarType::Float8_e4m3fn, {449, -449}}}; static std::unordered_map> integral_limits_map{ {at::ScalarType::Long, {std::numeric_limits::max(), std::numeric_limits::min()}}, {at::ScalarType::Int, {std::numeric_limits::max(), std::numeric_limits::min()}}, @@ -274,7 +276,7 @@ OpCommand& OpCommand::AddTensorInput(at::Tensor &tensor, at::ScalarType forceSca { std::tuple res; if (commonType.has_value() && commonType.value() != tensor.scalar_type()) { - tensor = custom_ops::npu_dtype_cast(tensor, commonType.value()); + tensor = custom_ops::_npu_dtype_cast(tensor, commonType.value()); } // as for dim=0, the dtype of tensor can not be `uint16` because of `TBE` if (torch_npu::NPUBridge::GetNpuStorageImplDesc(tensor).storage_sizes_.empty()) { @@ -331,7 +333,7 @@ OpCommand& OpCommand::AddScalarInput(const c10::Scalar& input, at::ScalarType ty OpCommand& OpCommand::AddOutput(at::Tensor &output, const string &realType) { if (resultTypeDefined == false && commonType.has_value() && commonType.value() != output.scalar_type()) { - output = custom_ops::npu_dtype_cast(output, commonType.value()); + output = custom_ops::_npu_dtype_cast(output, commonType.value()); } auto res = OpCmdHelper::CovertToAclOutput(output, realType); aclCmd->AddOutput(std::get<0>(res), std::get<1>(res)); diff --git a/torch_npu/csrc/framework/OpParamMaker.cpp b/torch_npu/csrc/framework/OpParamMaker.cpp index ce8b906514..0db71b00a1 100644 --- a/torch_npu/csrc/framework/OpParamMaker.cpp +++ b/torch_npu/csrc/framework/OpParamMaker.cpp @@ -575,6 +575,7 @@ void *NewFunc(int caption, int &size) void DeleteFunc(void *ptr) { free(ptr); + ptr = nullptr; } using Func = int (*)(c10_npu::queue::QueueParas *, aclrtStream); diff --git a/torch_npu/csrc/framework/StorageDescHelper.cpp b/torch_npu/csrc/framework/StorageDescHelper.cpp index 6a23a5e4b9..eb568a74db 100644 --- a/torch_npu/csrc/framework/StorageDescHelper.cpp +++ b/torch_npu/csrc/framework/StorageDescHelper.cpp @@ -98,6 +98,13 @@ void StorageDescHelper::SetDesc(at::Tensor &dst, const c10::IntArrayRef& size, c torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_ = 
SetDesc(dst.dtype(), size, strides, format); } +void StorageDescHelper::SetDesc(at::Tensor &dst, const c10::IntArrayRef &base_size, + const c10::IntArrayRef &storage_size, const c10::IntArrayRef &strides, aclFormat format) +{ + torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_ = + SetDesc(dst.dtype(), base_size, storage_size, strides, format); +} + bool StorageDescHelper::CheckDescInit(const c10::Storage &storage) { return torch_npu::NPUBridge::GetNpuStorageImpl(storage.unsafeGetStorageImpl())->npu_desc_.origin_format_ != @@ -255,6 +262,22 @@ torch_npu::NPUStorageDesc StorageDescHelper::SetDesc(const caffe2::TypeMeta &dty return npu_desc; } +torch_npu::NPUStorageDesc StorageDescHelper::SetDesc(const caffe2::TypeMeta &dtype, const c10::IntArrayRef& base_size, + const c10::IntArrayRef& storage_size, const c10::IntArrayRef& strides, aclFormat format) +{ + struct torch_npu::NPUStorageDesc npu_desc; + npu_desc.data_type_ = dtype; + npu_desc.base_sizes_ = base_size; + npu_desc.base_strides_ = strides; + aclFormat baseFormat; + aclFormat npuFormat; + std::tie(baseFormat, npuFormat) = InferFormat::GuessFormatUnit(base_size, format); + npu_desc.storage_sizes_ = storage_size; + npu_desc.origin_format_ = baseFormat; + npu_desc.npu_format_ = npuFormat; + return npu_desc; +} + int64_t StorageDescHelper::GetMemorySize(const torch_npu::NPUStorageDesc &dst) { const auto &physical_size = FormatHelper::GetStorageSizes(dst); diff --git a/torch_npu/csrc/framework/StorageDescHelper.h b/torch_npu/csrc/framework/StorageDescHelper.h index 6497ee1a88..f3b35067e0 100644 --- a/torch_npu/csrc/framework/StorageDescHelper.h +++ b/torch_npu/csrc/framework/StorageDescHelper.h @@ -35,6 +35,8 @@ public: static void SetDesc(at::Tensor &dst, const c10::IntArrayRef& size, const c10::IntArrayRef& strides); static void SetDesc(at::Tensor &dst, const c10::IntArrayRef& size, const c10::IntArrayRef& strides, aclFormat format); + static void SetDesc(at::Tensor &dst, const c10::IntArrayRef &base_size, + const c10::IntArrayRef &storage_size, const c10::IntArrayRef &strides, aclFormat format); static bool CheckDescInit(const c10::Storage &storage); // For Serialization to Get and Set NpuStorageDesc @@ -63,6 +65,8 @@ private: const c10::IntArrayRef& strides); static torch_npu::NPUStorageDesc SetDesc(const caffe2::TypeMeta &dtype, const c10::IntArrayRef& size, const c10::IntArrayRef& strides, aclFormat format); + static torch_npu::NPUStorageDesc SetDesc(const caffe2::TypeMeta &dtype, const c10::IntArrayRef& base_size, + const c10::IntArrayRef& storage_size, const c10::IntArrayRef& strides, aclFormat format); }; } // namespace native diff --git a/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp b/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp index c2abf7f4b2..ee90387910 100644 --- a/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp +++ b/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp @@ -70,6 +70,14 @@ private: ResetDataPtr(src, self, static_cast(src.storage().data_ptr().get())); return true; + case at::ScalarType::Float8_e5m2: + ResetDataPtr(src, self, + static_cast(src.storage().data_ptr().get())); + return true; + case at::ScalarType::Float8_e4m3fn: + ResetDataPtr(src, self, + static_cast(src.storage().data_ptr().get())); + return true; default: // Turn to conducting d2dCopyAsync for other dtypes. 
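+      // (Descriptive note: the Float8_e5m2 / Float8_e4m3fn branches added
+      // above rebind the storage data pointer via ResetDataPtr, so only
+      // dtypes without a matching case reach this d2dCopyAsync fallback.)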
return false; diff --git a/torch_npu/csrc/framework/interface/EnvVariables.cpp b/torch_npu/csrc/framework/interface/EnvVariables.cpp index 4da3d362d1..40742c4de2 100644 --- a/torch_npu/csrc/framework/interface/EnvVariables.cpp +++ b/torch_npu/csrc/framework/interface/EnvVariables.cpp @@ -46,7 +46,23 @@ REGISTER_OPTION_HOOK(mdldumpconfigpath, [](const std::string &val) { aclmdlSetDump(val.c_str()); }) -REGISTER_OPTION_BOOL_FUNCTION(CheckJitDisableInner, jitCompile, "enable", "disable") +bool CheckJitDisableInner() +{ + auto val = c10_npu::option::GetOption("jitCompile"); + if (val.has_value()) { + if (val.value() == ("disable")) { + return true; + } + if (val.value() == ("enable")) { + return false; + } + } + if (c10_npu::GetSocVersion() >= c10_npu::SocVersion::Ascend910B1) { + return true; + } + return false; +} + REGISTER_OPTION_CACHE(bool, isJitDisable, CheckJitDisableInner) REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { auto acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); diff --git a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp index 0726a37036..13724a65f1 100644 --- a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp +++ b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp @@ -52,8 +52,8 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(ENUM_PAIR_FUNC) _(at::ScalarType::Bits4x2, ACL_DT_UNDEFINED) \ _(at::ScalarType::Bits8, ACL_DT_UNDEFINED) \ _(at::ScalarType::Bits16, ACL_DT_UNDEFINED) \ - _(at::ScalarType::Float8_e5m2, ACL_DT_UNDEFINED) \ - _(at::ScalarType::Float8_e4m3fn, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Float8_e5m2, ACL_FLOAT8_E5M2) \ + _(at::ScalarType::Float8_e4m3fn, ACL_FLOAT8_E4M3FN) \ _(at::ScalarType::Float8_e5m2fnuz, ACL_DT_UNDEFINED) \ _(at::ScalarType::Float8_e4m3fnuz, ACL_DT_UNDEFINED) \ _(at::ScalarType::UInt16, ACL_UINT16) \ @@ -94,6 +94,36 @@ AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(ENUM_PAIR_FUNC) static std::map STRING_SCALAR_TYPE_TO_ACL_TYPE_MAP = { {"uint16", ACL_UINT16}, {"uint8", ACL_UINT8}, {"uint64", ACL_UINT64}, {"string", ACL_STRING}}; +static std::unordered_map + ACL_TYPE_TO_SCALAR_TYPE_MAP = {{ACL_DT_UNDEFINED, at::ScalarType::Undefined}, + {ACL_FLOAT, at::ScalarType::Float}, + {ACL_FLOAT16, at::ScalarType::Half}, + {ACL_INT8, at::ScalarType::Char}, + {ACL_INT32, at::ScalarType::Int}, + {ACL_UINT8, at::ScalarType::Byte}, + {ACL_INT16, at::ScalarType::Short}, + {ACL_UINT16, at::ScalarType::UInt16}, + {ACL_UINT32, at::ScalarType::UInt32}, + {ACL_INT64, at::ScalarType::Long}, + {ACL_UINT64, at::ScalarType::UInt64}, + {ACL_DOUBLE, at::ScalarType::Double}, + {ACL_BOOL, at::ScalarType::Bool}, + {ACL_STRING, at::ScalarType::Undefined}, + {ACL_COMPLEX64, at::ScalarType::ComplexFloat}, + {ACL_COMPLEX128, at::ScalarType::ComplexDouble}, + {ACL_BF16, at::ScalarType::BFloat16}, + {ACL_INT4, at::ScalarType::Undefined}, + {ACL_UINT1, at::ScalarType::Undefined}, + {ACL_COMPLEX32, at::ScalarType::ComplexHalf}, + {ACL_HIFLOAT8, at::ScalarType::Byte}, + {ACL_FLOAT8_E5M2, at::ScalarType::Float8_e5m2}, + {ACL_FLOAT8_E4M3FN, at::ScalarType::Float8_e4m3fn}, + {ACL_FLOAT8_E8M0, at::ScalarType::Byte}, + {ACL_FLOAT6_E3M2, at::ScalarType::Byte}, + {ACL_FLOAT6_E2M3, at::ScalarType::Byte}, + {ACL_FLOAT4_E2M1, at::ScalarType::Byte}, + {ACL_FLOAT4_E1M2, at::ScalarType::Byte}}; + aclError AclrtMemcpyAsyncParamCheck( void *dst, size_t destMax, const void *src, size_t count, aclrtMemcpyKind kind, aclrtStream stream) { @@ -311,5 +341,17 @@ int8_t CalcuOpUtil::GetCubeMathType(bool allowHf32) return 
iter->second; } +at::ScalarType CalcuOpUtil::ConvertToScalarType(const aclDataType data_type) +{ + auto iter = ACL_TYPE_TO_SCALAR_TYPE_MAP.find(data_type); + if (iter == ACL_TYPE_TO_SCALAR_TYPE_MAP.end()) { + TORCH_CHECK(false, + std::string("aclDataType:") + std::to_string(data_type) + " has not been supported", + OPS_ERROR(ErrCode::NOT_SUPPORT)) + } + + return iter->second; +} + } // namespace native } // namespace at_npu diff --git a/torch_npu/csrc/framework/utils/CalcuOpUtil.h b/torch_npu/csrc/framework/utils/CalcuOpUtil.h index 3d2a925e48..0693b9c024 100644 --- a/torch_npu/csrc/framework/utils/CalcuOpUtil.h +++ b/torch_npu/csrc/framework/utils/CalcuOpUtil.h @@ -89,6 +89,7 @@ public: static int64_t GetTensorNpuFormat(const at::Tensor &tensor); static c10::SmallVector ConvertIntArrayRefToSmallVector(c10::IntArrayRef intArray); static int8_t GetCubeMathType(bool allowHf32); + static at::ScalarType ConvertToScalarType(const aclDataType data_type); }; } // namespace native diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp index 006bc377b0..dc6378a472 100644 --- a/torch_npu/csrc/framework/utils/OpPreparation.cpp +++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp @@ -102,6 +102,11 @@ aclDataType OpPreparation::convert_to_acl_data_type(const at::ScalarType &data_t return CalcuOpUtil::ConvertToAclDataType(data_type, realDataType); } +at::ScalarType OpPreparation::convert_to_scalar_type(const aclDataType data_type) +{ + return CalcuOpUtil::ConvertToScalarType(data_type); +} + at::Tensor OpPreparation::copy_scalar_to_device(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type) { return CalcuOpUtil::CopyScalarToDevice(cpu_scalar, scalar_data_type); diff --git a/torch_npu/csrc/framework/utils/OpPreparation.h b/torch_npu/csrc/framework/utils/OpPreparation.h index 74ac303898..e87a910112 100644 --- a/torch_npu/csrc/framework/utils/OpPreparation.h +++ b/torch_npu/csrc/framework/utils/OpPreparation.h @@ -22,6 +22,7 @@ public: // From CalcuOpUtil part static aclDataType convert_to_acl_data_type(const at::ScalarType &data_type); static aclDataType convert_to_acl_data_type(const at::ScalarType &data_type, const std::string &realDataType); + static at::ScalarType convert_to_scalar_type(const aclDataType data_type); static at::Tensor copy_scalar_to_device(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type); static at::Tensor copy_scalar_to_device(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type, const c10::Device device); diff --git a/torch_npu/csrc/npu/DataParallelComm.cpp b/torch_npu/csrc/npu/DataParallelComm.cpp index db0d3efabe..c744e1e1ba 100644 --- a/torch_npu/csrc/npu/DataParallelComm.cpp +++ b/torch_npu/csrc/npu/DataParallelComm.cpp @@ -137,7 +137,7 @@ void check_inputs(TensorList inputs, TensorList outputs, int input_multiplier, i { // need to check len(inputs) == len(outputs) size_t len = inputs.size(); - if (len <= 0) { + if (len == 0) { throw std::runtime_error("input sequence can't be empty" + PTA_ERROR(ErrCode::PARAM)); } diff --git a/torch_npu/onnx/wrapper_onnx_ops.py b/torch_npu/onnx/wrapper_onnx_ops.py index bc97473537..16ae07087e 100644 --- a/torch_npu/onnx/wrapper_onnx_ops.py +++ b/torch_npu/onnx/wrapper_onnx_ops.py @@ -255,8 +255,8 @@ class _NPUFormatCastOP(torch.autograd.Function): return torch.ops.npu.npu_format_cast(*args, **kwargs) @staticmethod - def symbolic(g, self: Tensor, acl_format: int): - return g.op("npu::NPUFormatCast", self, acl_format_i=acl_format) + def symbolic(g, 
self: Tensor, acl_format: int, customize_dtype: int = None): + return g.op("npu::NPUFormatCast", self, acl_format_i=acl_format, customize_dtype_i=customize_dtype) class _NPUSoftmaxCrossEntropyWithLogitsOP(torch.autograd.Function): @@ -1042,8 +1042,8 @@ def _wrapper_npu_deformable_conv2d(inputs, weight, offset, bias, kernel_size, st padding, dilation, groups, deformable_groups, modulated) -def _wrapper_npu_format_cast(self, acl_format): - return _NPUFormatCastOP.apply(self, acl_format) +def _wrapper_npu_format_cast(self, acl_format, customize_dtype=None): + return _NPUFormatCastOP.apply(self, acl_format, customize_dtype) def _wrapper_npu_softmax_cross_entropy_with_logits(self, labels): diff --git a/torch_npu/utils/hif8_tensor.py b/torch_npu/utils/hif8_tensor.py new file mode 100644 index 0000000000..691d290a86 --- /dev/null +++ b/torch_npu/utils/hif8_tensor.py @@ -0,0 +1,584 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. +# +# See LICENSE for license information. + +"""Tensor class with HIF8 data""" +from __future__ import annotations + +__all__ = ["HiFloat8Tensor"] + +from typing import Any, Dict, Optional, Tuple, Union + +import torch +from torch.utils._pytree import tree_map +import torch_npu +from torch_npu.utils._error_code import ErrCode, pta_error + + +# init transformer engine +torch_npu._C._cd_init() + +tex = torch_npu._C._cd +aten = torch.ops.aten + +NPU_CUSTOM_DType = { + torch.uint8: tex.DType.uint8, + torch.int32: tex.DType.int32, + torch.float32: tex.DType.float32, + torch.half: tex.DType.float16, + torch.bfloat16: tex.DType.bfloat16, +} + + +class _FromHiFloat8Func(torch.autograd.Function): + """Cast from HIF8 to other dtype""" + + @staticmethod + def forward( + _ctx: torch.autograd.function.FunctionCtx, # unused + tensor: HiFloat8Tensor, + dtype: Optional[torch.dtype] = None, + ) -> torch.Tensor: + if dtype is None: + dtype = tensor.dtype + data = tensor._data.contiguous().view(1, -1).detach() + out = tex.cast_from_fp8( + data, + tex.DType.hifloat8, + NPU_CUSTOM_DType[dtype], + ) + out = out.view(tensor.size()) + return out + + @staticmethod + def backward( + _ctx: torch.autograd.function.FunctionCtx, # unused + grad: torch.Tensor, + ) -> Tuple[Optional[torch.Tensor], ...]: + # Assume that we want gradients in full precision + return grad, None + + +class _ToHiFloat8Func(torch.autograd.Function): + """Cast to HIF8 from other dtype""" + + @staticmethod + def forward( + _ctx: torch.autograd.function.FunctionCtx, # unused + tensor: torch.Tensor, + ) -> HiFloat8Tensor: + + # Check input tensor TODO + tensor = tensor.contiguous().npu().detach() + if tensor.dtype not in (torch.float32, torch.bfloat16, torch.float16): + tensor = tensor.float() + + # Cast data to HIF8 + data = tex.cast_to_fp8( + tensor.view(1, -1), + tex.DType.hifloat8, + ) + data = data.view(tensor.size()) + + # Construct HIF8 tensor + return HiFloat8Tensor( + data=data, + dtype=tensor.dtype, + ) + + @staticmethod + def backward( + _ctx: torch.autograd.function.FunctionCtx, # unused + grad: torch.Tensor, + ) -> Tuple[Optional[torch.Tensor], ...]: + # Assume that we want gradients in full precision + return grad, None + + +class _IdentityFunc(torch.autograd.Function): + """Identity function + + If constructor keyword-arguments are provided, then construct a + new HiFloat8Tensor using the provided tensor's attributes. 
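+    For example, ``HiFloat8Tensor.clone`` (defined further below) calls
+    ``_IdentityFunc.apply(self, {"data": self._data.detach().clone()})``,
+    which yields a tensor with the same nominal dtype backed by a fresh
+    copy of the HIF8 data buffer.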
+ + """ + + @staticmethod + def forward( + ctx, + tensor: HiFloat8Tensor, + init_kwargs: Optional[Dict[str, Any]] = None, + ) -> torch.Tensor: + + # Return input tensor if constructor kwargs are not provided + ctx.input_dtype = tensor.dtype + if init_kwargs is None: + return tensor + + # Construct new tensor if constructor kwargs are provided + default_kwargs = dict( + data=tensor._data, + dtype=tensor.dtype, + ) + for key, val in default_kwargs.items(): + if key not in init_kwargs: + init_kwargs[key] = val + return HiFloat8Tensor(**init_kwargs) + + @staticmethod + def backward(ctx, grad): + return grad.to(ctx.input_dtype), None + + +class _ViewFunc(torch.autograd.Function): + """View function + + View the HiFloat8Tensor using the provided shape. + + """ + + @staticmethod + def forward( + ctx, + tensor: torch.Tensor, + shape: Tuple[int] = None, + ) -> torch.Tensor: + + # Return input tensor if shape is not provided + ctx.shape = tensor.shape + if shape is None: + return tensor + + # Construct new tensor if shape is provided + if isinstance(tensor, HiFloat8Tensor): + return HiFloat8Tensor.make_like( + tensor, + data=tensor._data.view(*shape), + ) + return tensor.view(*shape) + + @staticmethod + def backward( + ctx, + grad: torch.Tensor, + ) -> Tuple[Union[torch.Tensor, None], ...]: + + if isinstance(grad, HiFloat8Tensor): + dgrad = HiFloat8Tensor.make_like( + grad, + data=grad._data.view(ctx.shape), + ) + return dgrad, None + return grad.view(ctx.shape), None + + +class _ReshapeFunc(torch.autograd.Function): + """Reshape function + + Reshape the HiFloat8Tensor using the provided shape. + + """ + + @staticmethod + def forward( + ctx, + tensor: torch.Tensor, + shape: Tuple[int] = None, + ) -> torch.Tensor: + + # Return input tensor if shape is not provided + ctx.shape = tensor.shape + if shape is None: + return tensor + + # Construct new tensor if shape is provided + if isinstance(tensor, HiFloat8Tensor): + return HiFloat8Tensor.make_like( + tensor, + data=tensor._data.reshape(*shape), + ) + return tensor.reshape(*shape) + + @staticmethod + def backward( + ctx, + grad: torch.Tensor, + ) -> Tuple[Union[torch.Tensor, None], ...]: + + if isinstance(grad, HiFloat8Tensor): + dgrad = HiFloat8Tensor.make_like( + grad, + data=grad._data.reshape(ctx.shape), + ) + return dgrad, None + return grad.reshape(ctx.shape), None + + +class _TransposeFunc(torch.autograd.Function): + """Transpose function + + Transpose the HiFloat8Tensor. + + """ + + @staticmethod + def forward(ctx, tensor, dim0, dim1): + ctx.save_for_backward(dim0, dim1) + if isinstance(tensor, HiFloat8Tensor): + return HiFloat8Tensor.make_like( + tensor, + data=tensor._data.transpose(dim0, dim1), + ) + return tensor.transpose(dim0, dim1) + + @staticmethod + def backward(ctx, grad): + dim0, dim1 = ctx.saved_tensors + if isinstance(grad, HiFloat8Tensor): + dgrad = HiFloat8Tensor.make_like( + grad, + data=grad._data.transpose(dim0, dim1), + ) + return dgrad, None + return grad.transpose(dim0, dim1), None, None + + +class HiFloat8Tensor(torch.Tensor): + """Experimental tensor class with HIF8 data + + The tensor presents as having a standard, higher-precision dtype, + but the data itself is (scaled) HIF8. For most tensor operations, + the data will be cast to the nominal dtype before performing the + operation. + + Parameters + ---------- + data: torch.Tensor + Raw HIF8 data in a uint8 tensor + dtype: torch.dtype, default = torch.float32 + Nominal tensor datatype. 
+ + """ + + def __new__( + cls, + *, + data: torch.Tensor, + dtype: torch.dtype = torch.float32, + ): + # Check that data buffer is valid + if data.element_size() != 1: + raise ValueError( + f"HiFloat8Tensor requires data buffer with 8-bit dtype (got dtype={data.dtype})" + + pta_error(ErrCode.VALUE) + ) + if data.requires_grad: + raise ValueError( + "HiFloat8Tensor requires non-differentiable data buffer" + + pta_error(ErrCode.VALUE) + ) + if not data.is_npu: + data = data.npu() + + # Initialize tensor object + self = torch.Tensor._make_wrapper_subclass( + cls, + data.size(), + strides=data.stride(), + storage_offset=data.storage_offset(), + dtype=dtype, + layout=data.layout, + requires_grad=data.requires_grad, + device=data.device, + ) + self._data: torch.Tensor = data + + return self + + @classmethod + def make_like( + cls, + tensor: HiFloat8Tensor, + *, + data: torch.Tensor, + **kwargs, + ) -> HiFloat8Tensor: + """Use attributes of a HiFloat8Tensor to create another HiFloat8Tensor + + See constructor for list of keyword arguments. + + """ + default_kwargs = dict( + dtype=tensor.dtype, + ) + for key, val in default_kwargs.items(): + if key not in kwargs: + kwargs[key] = val + return HiFloat8Tensor(data=data, **kwargs) + + def __repr__(self): + return ( + "HiFloat8Tensor(" + f"data={self.from_hifloat8(dtype=self.dtype)}" + ")" + ) + + def from_hifloat8(self, dtype: Optional[torch.dtype] = None) -> torch.Tensor: + """ + Construct PyTorch tensor from HiFloat8Tensor + + By default the resulting tensor's dtype is the + HiFloat8Tensor's nominal dtype. + """ + return _FromHiFloat8Func.apply(self, dtype) + + @classmethod + def to_hifloat8( + cls, + tensor: torch.Tensor + ): + """Construct HiFloat8Tensor from PyTorch tensor""" + return _ToHiFloat8Func.apply( + tensor + ) + + def float(self) -> torch.Tensor: + return self.from_hifloat8(dtype=torch.float32) + + def bfloat16(self) -> torch.Tensor: + return self.from_hifloat8(dtype=torch.bfloat16) + + def half(self) -> torch.Tensor: + return self.from_hifloat8(dtype=torch.float16) + + def cpu(self) -> torch.Tensor: + return self.from_hifloat8().cpu() + + def clone(self) -> HiFloat8Tensor: + return _IdentityFunc.apply(self, {"data": self._data.detach().clone()}) + + def view(self, *shape: Tuple[int]) -> HiFloat8Tensor: + return _ViewFunc.apply(self, shape) + + def reshape(self, *shape: Tuple[int]) -> HiFloat8Tensor: + return _ReshapeFunc.apply(self, shape) + + def contiguous( + self, + *, + memory_format: torch.memory_format = torch.contiguous_format, + ) -> HiFloat8Tensor: + """Returns tensor with data in provided memory format + + Returns `self` if data is already in correct memory format. + + """ + if self._data.is_contiguous(memory_format=memory_format): + return self + return _IdentityFunc.apply( + self, + {"data": self._data.detach().contiguous(memory_format=memory_format)}, + ) + + def to_dtype(self, dtype: torch.dtype) -> HiFloat8Tensor: + """Create `HiFloat8Tensor` with given nominal dtype + + The new tensor has the same underlying HIF8 data. 
+ + """ + return HiFloat8Tensor.make_like( + self, + data=self._data, + dtype=dtype, + ) + + @classmethod + def __torch_dispatch__(cls, func, types, args, kwargs=None): + + # In-place copy op + if func == aten.copy_.default: + + # Check tensors + dst = args[0] + src = args[1] + if not isinstance(dst, torch.Tensor): + raise RuntimeError( + "Attempted to copy into something that isn't a PyTorch tensor" + + pta_error(ErrCode.TYPE) + ) + if not isinstance(src, torch.Tensor): + raise RuntimeError( + "Attempted to copy from something that isn't a PyTorch tensor" + + pta_error(ErrCode.TYPE) + ) + + # Special handling based on which tensors are HIF8 + dst_is_hif8 = isinstance(dst, HiFloat8Tensor) + src_is_hif8 = isinstance(src, HiFloat8Tensor) + if dst_is_hif8 and src_is_hif8: + # Directly copy HIF8 data if possible + dst._data.copy_(src._data) + + elif not dst_is_hif8 and src_is_hif8: + # Cast source tensor to higher precision + dst.copy_(src.from_hifloat8()) + + elif dst_is_hif8 and not src_is_hif8: + # Make sure input is in expected format + src = src.expand(dst.size()) + src = src.to( + device=dst.device, + memory_format=torch.contiguous_format, + ) + + # Cast to HIF8 + if not dst._data.is_contiguous(): + raise RuntimeError( + "Transformer Engine cast kernels require contiguous data" + + pta_error(ErrCode.INTERNAL) + ) + tex.cast_to_fp8_noalloc( + src.view(1, -1), + dst._data.view(1, -1), + tex.DType.hifloat8, + ) + else: + # Invalid case + raise RuntimeError( + "Using HiFloat8Tensor copy logic, but no HiFloat8Tensor found" + + pta_error(ErrCode.INTERNAL) + ) + + # Nothing to return for in-place ops + return None + + # Slice op + if func == aten.slice.Tensor: + tensor = args[0] + data = tensor._data + data_slice = data.__torch_dispatch__( + func, + types, + [data] + list(args[1:]), + kwargs, + ) + return HiFloat8Tensor.make_like(tensor, data=data_slice) + + # Detach op + if func == aten.detach.default: + # Simply return a new HiFloat8Tensor with the same attrs + return HiFloat8Tensor.make_like( + args[0], + data=args[0]._data, + ) + + # View op + if func == aten.view.default: + tensor = args[0] + data = tensor._data + data_view = data.__torch_dispatch__( + func, + types, + [data] + list(args[1:]), + kwargs, + ) + return HiFloat8Tensor.make_like( + tensor, + data=data_view, + ) + + def maybe_unwrap(t): + if isinstance(t, HiFloat8Tensor): + return t.from_hifloat8() + return t + + def maybe_update_inplace(arg, new_arg, schema_arg): + """Update values of HIF8 tensors + + Keep the same HIF8 scaling factors. 
+ + """ + check_args = isinstance(arg, HiFloat8Tensor) and isinstance(new_arg, torch.Tensor) + check_schema = ( + hasattr(schema_arg, "alias_info") + and hasattr(schema_arg.alias_info, "is_write") + and schema_arg.alias_info.is_write + ) + + if check_args and check_schema: + arg.copy_(new_arg) + + # In-place op + if func._schema.is_mutable: + # Cast to higher precision, perform op, and cast values + # back to original HIF8 buffers + new_args = tree_map(maybe_unwrap, args) + new_kwargs = tree_map(maybe_unwrap, kwargs) + schema_args = func._schema.arguments + args_len = len(args) + out = super().__torch_dispatch__(func, types, new_args, new_kwargs) + for arg, new_arg, schema_arg in zip(args, new_args, schema_args): + maybe_update_inplace(arg, new_arg, schema_arg) + for kwarg, new_kwarg, schema_arg in zip(kwargs, new_kwargs, schema_args[args_len:]): + if not (kwarg == new_kwarg == schema_arg.name): + raise ValueError('name of the kw argument should match' + pta_error(ErrCode.VALUE)) + maybe_update_inplace(kwargs[kwarg], new_kwargs[new_kwarg], schema_arg) + return None + + # Default op + # Note: cast to higher precision and perform op + args = tree_map(maybe_unwrap, args) + if kwargs is not None: + kwargs = tree_map(maybe_unwrap, kwargs) + out = super().__torch_dispatch__(func, types, args, kwargs) + return out + + @classmethod + def _make_in_reduce_ex( + cls, + data: torch.Tensor, + dtype: torch.dtype, + ) -> HiFloat8Tensor: + """Build HiFloat8Tensor, for use in __reduce__ + + __reduce_ex__ assumes object constructor has positional + arguments. + + """ + return HiFloat8Tensor( + data=data, + dtype=dtype, + ) + + def __reduce_ex__(self, protocol: int) -> tuple: + """Custom pickling to remove references to HIF8 metadata objects""" + return ( + HiFloat8Tensor._make_in_reduce_ex, + (self._data, self.dtype), + ) + + def _get_data(self) -> HiFloat8Tensor: + """Get tensor data property""" + return super().data + + def _set_data(self, tensor: torch.Tensor) -> None: + """Set tensor data property + + Cast tensor to HIF8 and store in HIF8 buffer. 
+ + """ + with torch.no_grad(): + self.copy_(tensor) + + # Cast to HIF8 when setting HiFloat8Tensor.data + data = property(_get_data, _set_data) + + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + return torch._C._disabled_torch_function_impl(func, types, args, kwargs) + + def transpose(self, dim0, dim1): + return _TransposeFunc.apply(self, dim0, dim1) -- Gitee From 33b56f8819da7ab309e4c11e282ca5f69d79487c Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 28 Jun 2025 09:20:09 +0000 Subject: [PATCH 167/328] !22413 Update op_plugin commit id Merge pull request !22413 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index a76556c13e..010e139970 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit a76556c13ee025dd17cc0171f888d2ab58d45f90 +Subproject commit 010e139970198a17940922646d3f5ba25bc89c9a -- Gitee From fac23c7d5c206b48b794feb6523d0a25e84412e1 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Sat, 28 Jun 2025 10:20:53 +0000 Subject: [PATCH 168/328] !22353 use torch_npu.version instead of read file Merge pull request !22353 from huangyunlong/2.7ver --- torch_npu/utils/collect_env.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/torch_npu/utils/collect_env.py b/torch_npu/utils/collect_env.py index 3f279bf3cc..8ffed93212 100644 --- a/torch_npu/utils/collect_env.py +++ b/torch_npu/utils/collect_env.py @@ -87,14 +87,8 @@ def get_cann_version(): def get_torch_npu_version(): torch_npu_version_str = 'N/A' - torch_npu_root = get_torch_npu_install_path() - version_path = os.path.join(torch_npu_root, "torch_npu", "version.py") - check_directory_path_readable(version_path) - with open(version_path, "r") as f: - for line in f: - if line.find("__version__") != -1: - torch_npu_version_str = line.strip().split("=")[-1] - break + if TORCH_NPU_AVAILABLE: + torch_npu_version_str = torch_npu.__version__ return torch_npu_version_str -- Gitee From e3720a58f6ecb71f1fb310c7488eb2441da58c3f Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Mon, 30 Jun 2025 06:49:39 +0000 Subject: [PATCH 169/328] !22418 Update torchair commit id Merge pull request !22418 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 8035ce4339..6aff1ddc03 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 8035ce4339f636e332ff56f39a579f6930ddf6ff +Subproject commit 6aff1ddc03dd3e489ba9e053485b98bec7523675 -- Gitee From 698c2e44a2721964e31656990ff33056ec7e9749 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 30 Jun 2025 09:05:14 +0000 Subject: [PATCH 170/328] !22434 Update op_plugin commit id Merge pull request !22434 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 010e139970..d4ff7bf109 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 010e139970198a17940922646d3f5ba25bc89c9a +Subproject commit d4ff7bf10952c8a32b0d20a87f24c20c19b46667 -- Gitee From 6c3a033a2eb1d909709438800e9ce89a3e2f6011 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 30 Jun 2025 09:05:14 +0000 Subject: [PATCH 171/328] !22434 Update op_plugin commit id Merge 
pull request !22434 from pta-robot/v2.7.1 -- Gitee From 39c00cbc19a9760d75bac485fa3540eb7a782bd7 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 30 Jun 2025 11:05:14 +0000 Subject: [PATCH 172/328] !22447 Update op_plugin commit id Merge pull request !22447 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index d4ff7bf109..23fe62de73 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit d4ff7bf10952c8a32b0d20a87f24c20c19b46667 +Subproject commit 23fe62de73bf37c73a2e95df32dc9b6adb874f8b -- Gitee From 304693a6405d421d229b6ff0edad38fa059b9c45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Mon, 30 Jun 2025 11:18:26 +0000 Subject: [PATCH 173/328] =?UTF-8?q?!22444=20Fixed=20the=20issue=20of=20fuz?= =?UTF-8?q?zy=20error=20message=20in=20multiple=20N-second=20fast=20recove?= =?UTF-8?q?ry=20scenarios=20Merge=20pull=20request=20!22444=20from=20?= =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUException.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h index 4c178816a8..203b6529b7 100644 --- a/torch_npu/csrc/core/npu/NPUException.h +++ b/torch_npu/csrc/core/npu/NPUException.h @@ -166,8 +166,20 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) false, \ (device_error_msg.empty() ? "" : device_error_msg), \ c10_npu::c10_npu_get_error_message()); \ + } else if (error_code == ACL_ERROR_RT_DEVICE_TASK_ABORT) { \ + TORCH_CHECK( \ + false, \ + __func__, \ + ":", \ + __FILE__, \ + ":", \ + __LINE__, \ + " NPU function error: ", (device_error_msg.empty() ? 
\ + " FORCE STOP" : device_error_msg), \ + ", error code is ", error_code, \ + PTA_ERROR(ErrCode::ACL)); \ } else { \ - TORCH_CHECK( \ + TORCH_CHECK( \ false, \ __func__, \ ":", \ -- Gitee From dfc5ecc5bf1cfff957815da4d4f135a7528eee46 Mon Sep 17 00:00:00 2001 From: wanglinzhaolinx Date: Tue, 1 Jul 2025 01:12:44 +0000 Subject: [PATCH 174/328] !22417 npu_fused_infer_attention_v2 add UT Merge pull request !22417 from wanglinzhaolinx/v2.7.1 --- test/npu/test_aclgraph_update.py | 47 ++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/test/npu/test_aclgraph_update.py b/test/npu/test_aclgraph_update.py index 644579b9f1..18dbb79c5c 100644 --- a/test/npu/test_aclgraph_update.py +++ b/test/npu/test_aclgraph_update.py @@ -122,6 +122,53 @@ class TestAclgraphUpdate(TestCase): g.replay() self.assertEqual(output.cpu(), res_src[0].cpu()) self.assertEqual(softmax_lse.cpu(), res_src[1].cpu()) + + @SupportedDevices(['Ascend910B']) + def test_npu_fused_infer_attention_v2(self): + torch.npu.set_device(0) + length = [29] + length_new = [100] + scale = 1 / 0.0078125 + query = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + key = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + value = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + res_src = torch_npu.npu_fused_infer_attention_v2( + query, key, value, num_query_heads=32, input_layout="BNSD", softmax_scale=scale, pre_tokens=65535, + next_tokens=65535, return_softmax_lse=False, actual_seq_qlen=length_new) + g = torch.npu.NPUGraph() + event = torch.npu.ExternalEvent() + update_stream = torch.npu.Stream() + handle = None + output = None + softmax_lse = None + + workspace = torch_npu._npu_fused_infer_attention_v2_get_max_workspace( + query, key, value, num_query_heads=32, input_layout="BNSD", softmax_scale=scale, pre_tokens=65535, + next_tokens=65535, return_softmax_lse=False, actual_seq_qlen=length) + + with torch.npu.graph(g): + stream = torch.npu.current_stream() + output = torch.empty(1, 32, 1, 128, dtype=torch.float16, device="npu") + softmax_lse = torch.empty(1, dtype=torch.float16, device="npu") + event.wait(stream) + event.reset(stream) + torch.npu.graph_task_group_begin(stream) + torch_npu.npu_fused_infer_attention_v2.out( + query, key, value, num_query_heads=32, input_layout="BNSD", softmax_scale=scale, pre_tokens=65535, workspace=workspace, + next_tokens=65535, return_softmax_lse=False, actual_seq_qlen=length, out=[output, softmax_lse]) + handle = torch.npu.graph_task_group_end(stream) + + with torch.npu.stream(update_stream): + torch.npu.graph_task_update_begin(update_stream, handle) + torch_npu.npu_fused_infer_attention_v2.out( + query, key, value, num_query_heads=32, input_layout="BNSD", softmax_scale=scale, pre_tokens=65535, workspace=workspace, + next_tokens=65535, return_softmax_lse=False, actual_seq_qlen=length_new, out=[output, softmax_lse]) + torch.npu.graph_task_update_end(update_stream) + event.record(update_stream) + + g.replay() + self.assertEqual(output.cpu(), res_src[0].cpu()) + self.assertEqual(softmax_lse.cpu(), res_src[1].cpu()) if __name__ == "__main__": run_tests() -- Gitee From 20d0f948cbd67a769e30cf95951902a437971e1a Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 1 Jul 2025 03:20:12 +0000 Subject: [PATCH 175/328] !22474 Update op_plugin commit id Merge pull request !22474 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 23fe62de73..bf7156784a 
160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 23fe62de73bf37c73a2e95df32dc9b6adb874f8b +Subproject commit bf7156784afd0a8a480929a651b6b03afcf66d3e -- Gitee From b2bd2590db8e798cfa4229d1ed130f1941bb7ce1 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 1 Jul 2025 09:20:14 +0000 Subject: [PATCH 176/328] !22485 Update op_plugin commit id Merge pull request !22485 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index bf7156784a..9184160eb5 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit bf7156784afd0a8a480929a651b6b03afcf66d3e +Subproject commit 9184160eb5bfea463e5cb10d223e33fbd910d78a -- Gitee From 3181ed61295b53434b55145ce55c4fdf33df1c97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E5=85=89=E6=B5=A9?= Date: Tue, 1 Jul 2025 09:38:36 +0000 Subject: [PATCH 177/328] =?UTF-8?q?!22430=20modify=20the=20default=20value?= =?UTF-8?q?=20Merge=20pull=20request=20!22430=20from=20=E9=83=AD=E5=85=89?= =?UTF-8?q?=E6=B5=A9/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/register/OptionsManager.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index fce2f143f7..e15bb200f5 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -470,7 +470,7 @@ uint32_t OptionsManager::GetP2PBufferSize() const static uint32_t buf_size = []() -> uint32_t { char* buf_val = std::getenv("P2P_HCCL_BUFFSIZE"); // Default 0M - int64_t buf_size = (buf_val != nullptr) ? strtol(buf_val, nullptr, 10) : 0; + int64_t buf_size = (buf_val != nullptr) ? 
strtol(buf_val, nullptr, 10) : 20; TORCH_CHECK(buf_size >= 0, "P2P_HCCL_BUFFSIZE cannot be negative.", PTA_ERROR(ErrCode::VALUE)); return static_cast(buf_size); }(); -- Gitee From c38d969d67ffa5e4bc0734e088cc52ba99ef7caf Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 1 Jul 2025 11:05:12 +0000 Subject: [PATCH 178/328] !22510 Update op_plugin commit id Merge pull request !22510 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 9184160eb5..5fe1109e34 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 9184160eb5bfea463e5cb10d223e33fbd910d78a +Subproject commit 5fe1109e3461c4b64f527a266cfd22d1b2b2c836 -- Gitee From f0211db7dd8d66cf7785864059fbc62cd2b138dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Tue, 1 Jul 2025 11:21:35 +0000 Subject: [PATCH 179/328] =?UTF-8?q?!22503=20Fix=20is=5Fjit=5Fcompile=5Ffal?= =?UTF-8?q?se=20while=20setting=20ACL=5FOP=5FINIT=5FMODE=20Merge=20pull=20?= =?UTF-8?q?request=20!22503=20from=20=E5=A7=9C=E6=80=A1=E6=96=87/v2.7.1=5F?= =?UTF-8?q?op?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/npu/Module.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index df72d5010f..7c4e22f4a9 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -879,7 +879,13 @@ PyObject *THNPModule_is_jit_compile_false_wrap(PyObject *self, PyObject *noargs) if (option_value.has_value() && (option_value.value() == "disable")) { Py_RETURN_TRUE; } else { - Py_RETURN_FALSE; + static const std::string jit_compile_init_option_name = "jitCompileInit"; + auto init_option_value = c10_npu::option::GetOption(jit_compile_init_option_name); + if (init_option_value.has_value() && (init_option_value.value() == "disable")) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } } END_HANDLE_TH_ERRORS } -- Gitee From 06ae0c256699acfef8600c95447dbb4af8839afd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E5=85=89=E6=B5=A9?= Date: Tue, 1 Jul 2025 13:39:51 +0000 Subject: [PATCH 180/328] =?UTF-8?q?!22470=20add=20save=5Fasync=20Merge=20p?= =?UTF-8?q?ull=20request=20!22470=20from=20=E9=83=AD=E5=85=89=E6=B5=A9/v2.?= =?UTF-8?q?7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/npu/test_save_async.py | 119 ++++++++++++++++++++++++ torch_npu/utils/__init__.py | 4 +- torch_npu/utils/serialization.py | 155 +++++++++++++++++++++++++++++-- 3 files changed, 270 insertions(+), 8 deletions(-) create mode 100644 test/npu/test_save_async.py diff --git a/test/npu/test_save_async.py b/test/npu/test_save_async.py new file mode 100644 index 0000000000..2cdf9719eb --- /dev/null +++ b/test/npu/test_save_async.py @@ -0,0 +1,119 @@ +import os +import time +import copy + +import torch +import torch.nn as nn +import torch.optim as optim + +import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.utils._path_manager import PathManager + + +class TestAsyncSave(TestCase): + test_save_path = os.path.join( + os.path.realpath(os.path.dirname(__file__)), "test_save_async") + + @classmethod + def setUpClass(cls): + PathManager.make_dir_safety(TestAsyncSave.test_save_path) + + @classmethod + def tearDownClass(cls): + PathManager.remove_path_safety(TestAsyncSave.test_save_path) + + def 
wait_for_save_completion(self, file_path, timeout_sec=60, poll_interval_sec=0.5): + start_time = time.time() + + while time.time() - start_time < timeout_sec: + if os.path.exists(file_path): + current_size = os.path.getsize(file_path) + time.sleep(poll_interval_sec) + new_size = os.path.getsize(file_path) + + if current_size == new_size: + return True + else: + time.sleep(poll_interval_sec) + + return False + + def test_save_async_tensor(self): + save_tensor = torch.rand(1024, dtype=torch.float32).npu() + async_save_path = os.path.join(TestAsyncSave.test_save_path, "async_save_tensor.pt") + torch_npu.utils.save_async(save_tensor, async_save_path) + + if self.wait_for_save_completion(async_save_path): + tensor_async = torch.load(async_save_path, weights_only=False) + self.assertEqual(tensor_async, save_tensor) + else: + self.assertTrue(False, f"{async_save_path} is not exist!") + + def test_save_async(self): + loss1 = [1.6099495, 1.6099086, 1.6098710] + loss2 = [] + model_list = [] + checkpoint_list = [] + model_origin = nn.Sequential( + nn.Linear(100, 50), + nn.ReLU(), + nn.Linear(50, 20), + nn.ReLU(), + nn.Linear(20, 5), + nn.ReLU() + ) + + input_data = torch.ones(6400, 100).npu() + labels = torch.arange(5).repeat(1280).npu() + + criterion = nn.CrossEntropyLoss() + model = model_origin.npu() + optimerizer = optim.SGD(model.parameters(), lr=0.1) + for step in range(3): + outputs = model(input_data) + loss = criterion(outputs, labels) + + optimerizer.zero_grad() + loss.backward() + + optimerizer.step() + + loss2.append(loss) + checkpoint = { + "model": model.state_dict(), + "optimizer": optimerizer.state_dict() + } + checkpoint_list.append(copy.deepcopy(checkpoint)) + model_list.append(copy.deepcopy(model)) + checkpoint_async_path = os.path.join(TestAsyncSave.test_save_path, f"checkpoint_async_{step}.path") + model_async_path = os.path.join(TestAsyncSave.test_save_path, f"model_async_{step}.path") + torch_npu.utils.save_async(checkpoint, checkpoint_async_path, model=model) + torch_npu.utils.save_async(model, model_async_path, model=model) + + for i in range(3): + self.assertEqual(loss1[i], loss2[i].item()) + checkpoint_async_path = os.path.join(TestAsyncSave.test_save_path, f"checkpoint_async_{i}.path") + if self.wait_for_save_completion(checkpoint_async_path): + checkpoint_async = torch.load(checkpoint_async_path, weights_only=False) + self.assertEqual(checkpoint_list[i], checkpoint_async, prec=2e-3) + else: + self.assertTrue(False, f"{checkpoint_async_path} is not exist!") + model_async_path = os.path.join(TestAsyncSave.test_save_path, f"model_async_{i}.path") + if self.wait_for_save_completion(model_async_path): + model_async = torch.load(model_async_path, weights_only=False) + else: + self.assertTrue(False, f"{model_async_path} is not exist!") + state_dict_sync = model_list[i].state_dict() + state_dict_async = model_async.state_dict() + + key_sync = sorted(state_dict_sync.keys()) + key_async = sorted(state_dict_async.keys()) + + self.assertEqual(key_sync, key_async) + for key in key_async: + self.assertEqual(state_dict_async[key], state_dict_sync[key], prec=2e-3) + +if __name__ == '__main__': + torch.npu.set_device(0) + run_tests() diff --git a/torch_npu/utils/__init__.py b/torch_npu/utils/__init__.py index 4ba06538af..0cb93e9951 100644 --- a/torch_npu/utils/__init__.py +++ b/torch_npu/utils/__init__.py @@ -1,12 +1,12 @@ __all__ = ["npu_combine_tensors", "get_part_combined_tensor", "is_combined_tensor_valid", "FlopsCounter", - "set_thread_affinity", "reset_thread_affinity"] + 
"set_thread_affinity", "reset_thread_affinity", "save_async"] from torch_npu import _C from ._module import _apply_module_patch from .tensor_methods import _add_tensor_methods from .storage import _add_storage_methods from .combine_tensors import npu_combine_tensors, get_part_combined_tensor, is_combined_tensor_valid -from .serialization import _add_serialization_methods +from .serialization import _add_serialization_methods, save_async from .npu_intercept import _cann_package_check, _add_intercept_methods from .dtensor import _register_ops_under_dtensor_rules from .collect_env import _add_collect_env_methods diff --git a/torch_npu/utils/serialization.py b/torch_npu/utils/serialization.py index 35970aea9e..9b6fb70e96 100644 --- a/torch_npu/utils/serialization.py +++ b/torch_npu/utils/serialization.py @@ -1,25 +1,28 @@ -import os import io +import os import sys import pickle -import re -from typing import Any, Optional +import tarfile +import threading +from typing import Dict, Any, Optional import torch from torch.serialization import _check_dill_version, _open_file_like, _is_zipfile, \ _open_zipfile_reader, _is_torchscript_zip, _weights_only_unpickler, \ _legacy_load, _load, FileLike, MAP_LOCATION, DEFAULT_PROTOCOL, \ - normalize_storage_type, location_tag, _serialization_tls, _get_storage_alignment + normalize_storage_type, location_tag, _serialization_tls, _get_storage_alignment, \ + _open_zipfile_writer from torch.serialization import _default_to_weights_only, UNSAFE_MESSAGE import torch_npu from torch_npu.utils._error_code import ErrCode, pta_error from .utils import _should_print_warning +__all__ = ["load", "save", "save_async"] + ALWAYS_WARN_LEGACY_SERIALIZATION = False RE_MAP_CPU = False - -__all__ = ["load", "save"] +save_async_stream_map = {} def _get_always_warn_legacy_serialization(): @@ -429,6 +432,146 @@ def save( return torch.serialization.save(obj, f, pickle_module, pickle_protocol, True, _disable_byteorder_record) +def save_async( + obj: object, + f, + pickle_module: Any = pickle, + pickle_protocol: int = DEFAULT_PROTOCOL, + _use_new_zipfile_serialization: bool = True, + _disable_byteorder_record: bool = False, + model: torch.nn.Module = None +) -> None: + if _use_new_zipfile_serialization is False: + raise RuntimeError("Error: torch_npu.save_async with \"_use_new_zipfile_serialization = False\"\ + is not recommended for npu tensor, which may bring unexpected errors and hopefully \ + set \"_use_new_zipfile_serialization = True\"", + "if it is necessary to use this, please convert the npu tensor to cpu tensor for saving" + + pta_error(ErrCode.PARAM)) + + _check_dill_version(pickle_module) + save_args = (obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization, _disable_byteorder_record) + + device = torch.npu.current_device() + save_thread = threading.Thread(target=_save_data_thread, args=(save_args, device, model)) + save_thread.start() + + +def _save_data_thread(save_args, + device, + model: torch.nn.Module = None): + global save_async_stream_map + torch.npu.set_device(device) + + def hook_fn(*args): + torch.npu.current_stream().wait_stream(save_async_stream_map.get(device)) + + if device not in save_async_stream_map: + save_async_stream = torch.npu.Stream() + save_async_stream_map[device] = save_async_stream + if isinstance(model, torch.nn.Module): + model.register_full_backward_hook(hook_fn) + else: + save_async_stream = save_async_stream_map[device] + + obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization, _disable_byteorder_record = 
save_args + with torch.npu.stream(save_async_stream): + data_value, serialized_storages = _save(obj, pickle_module, pickle_protocol) + storage_value = [] + for key in sorted(serialized_storages.keys()): + name = f'data/{key}' + storage = serialized_storages.get(key) + # given that we copy things around anyway, we might use storage.cpu() + # this means to that to get tensors serialized, you need to implement + # .cpu() on the underlying Storage + if storage.device.type != 'cpu': + storage = storage.cpu() + # Now that it is on the CPU we can directly copy it into the zip file + if storage.device.type != "cpu": + storage_tensor = torch_npu._C._tensor_construct_from_storage(storage) + num_bytes = storage_tensor.size().numel() * storage_tensor.element_size() + else: + num_bytes = storage.nbytes() + storage_value.append((name, storage, num_bytes)) + + with _open_zipfile_writer(f) as opened_zipfile: + opened_zipfile.write_record('data.pkl', data_value, len(data_value)) + + for name, storage, num_bytes in storage_value: + opened_zipfile.write_record(name, storage.data_ptr(), num_bytes) + + +def _save(obj, pickle_module, pickle_protocol): + serialized_storages = {} + id_map: Dict[int, str] = {} + + # Since loading storages that view the same data with different dtypes is + # not supported, we need to keep track of the dtype associated with each + # storage data_ptr and throw an error if the dtype is ever different. + storage_dtypes: Dict[int, torch.dtype] = {} + + def persistent_id(obj): + if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj): + + if isinstance(obj, torch.storage.TypedStorage): + storage = obj._untyped_storage + storage_dtype = obj.dtype + storage_type_str = obj._pickle_storage_type() + storage_type = getattr(torch, storage_type_str) + if storage.device.type != "cpu": + storage_tensor = torch_npu._C._tensor_construct_from_storage(storage) + storage_numel = storage_tensor.size().numel() * storage_tensor.element_size() // obj._element_size() + else: + storage_numel = obj._size() + + else: + storage = obj + storage_dtype = torch.uint8 + storage_type = normalize_storage_type(type(obj)) + if storage.device.type != "cpu": + storage_tensor = torch_npu._C._tensor_construct_from_storage(storage) + storage_numel = storage_tensor.size().numel() * storage_tensor.element_size() + else: + storage_numel = storage.nbytes() + + # If storage is allocated, ensure that any other saved storages + # pointing to the same data all have the same dtype. 
If storage is + # not allocated, don't perform this check + if storage.data_ptr() != 0: + if storage.data_ptr() in storage_dtypes: + if storage_dtype != storage_dtypes[storage.data_ptr()]: + raise RuntimeError( + 'Cannot save multiple tensors or storages that ' + 'view the same data as different types' + pta_error(ErrCode.VALUE)) + else: + storage_dtypes[storage.data_ptr()] = storage_dtype + + storage_key = id_map.setdefault(storage._cdata, str(len(id_map))) + location = location_tag(storage) + serialized_storages[storage_key] = storage + + return ('storage', + storage_type, + storage_key, + location, + storage_numel) + + return None + + # Write the pickle data for `obj` + data_buf = io.BytesIO() + pickler = pickle_module.Pickler(data_buf, protocol=pickle_protocol) + pickler.persistent_id = persistent_id + if isinstance(obj, torch.nn.Module): + hook_handle = obj._backward_hooks.copy() + obj._backward_hooks.clear() + pickler.dump(obj) + obj._backward_hooks.update(hook_handle) + else: + pickler.dump(obj) + data_value = data_buf.getvalue() + return data_value, serialized_storages + + def _add_serialization_methods(): torch.save = save torch.load = load -- Gitee From 64fe56b322bc2fb2ca161aa547805200fa25cd8c Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Tue, 1 Jul 2025 14:01:25 +0000 Subject: [PATCH 181/328] !22496 add default data_type_ for npu_desc Merge pull request !22496 from huangyunlong/2.7ty --- torch_npu/csrc/core/NPUStorageImpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/NPUStorageImpl.h b/torch_npu/csrc/core/NPUStorageImpl.h index 8f9a5661c1..499a0e836a 100644 --- a/torch_npu/csrc/core/NPUStorageImpl.h +++ b/torch_npu/csrc/core/NPUStorageImpl.h @@ -25,7 +25,7 @@ public: aclFormat origin_format_ = ACL_FORMAT_UNDEFINED; aclFormat npu_format_ = ACL_FORMAT_ND; // used to make CANN GE tensor from storagImpl - caffe2::TypeMeta data_type_; + caffe2::TypeMeta data_type_ = caffe2::TypeMeta::Make(); }; struct NPUStorageImpl : public c10::StorageImpl { -- Gitee From 54e1d2a6014a00b05b901ac4b231d1b484bc5a62 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 1 Jul 2025 16:20:14 +0000 Subject: [PATCH 182/328] !22534 Update op_plugin commit id Merge pull request !22534 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5fe1109e34..bbeffe152e 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5fe1109e3461c4b64f527a266cfd22d1b2b2c836 +Subproject commit bbeffe152e85f628728ace3575a9ecfe913e448d -- Gitee From af12c75d4a5f33df6f92ef37938c6c79782634f5 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Tue, 1 Jul 2025 22:26:15 +0000 Subject: [PATCH 183/328] !22529 Update torchair commit id Merge pull request !22529 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 6aff1ddc03..4e9852e8a1 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 6aff1ddc03dd3e489ba9e053485b98bec7523675 +Subproject commit 4e9852e8a142040d9d44f49319c1f7d1120c4b20 -- Gitee From 74c2c040c012441cee349ee3837f02f0d0fd19b8 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 2 Jul 2025 03:05:14 +0000 Subject: [PATCH 184/328] !22542 Update op_plugin commit id Merge pull request !22542 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index bbeffe152e..c17e0655d0 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit bbeffe152e85f628728ace3575a9ecfe913e448d +Subproject commit c17e0655d0226f00a124f24aaf2c051e9a7991f4 -- Gitee From 0c94dade9c32c815cb3804576270dc4036e62fad Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 2 Jul 2025 03:05:15 +0000 Subject: [PATCH 185/328] !22542 Update op_plugin commit id Merge pull request !22542 from pta-robot/v2.7.1 -- Gitee From af9fede4259d190d7b1daad104ccde47e91128d8 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 2 Jul 2025 04:50:14 +0000 Subject: [PATCH 186/328] !22559 Update op_plugin commit id Merge pull request !22559 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index c17e0655d0..356df08eac 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit c17e0655d0226f00a124f24aaf2c051e9a7991f4 +Subproject commit 356df08eacd44a565377f59a994e5016f9597537 -- Gitee From 650c1c71182a2ecfba455ac6f6604b01bd9cc7a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=9C=E6=89=BF=E6=98=86?= Date: Wed, 2 Jul 2025 08:34:41 +0000 Subject: [PATCH 187/328] =?UTF-8?q?!22553=20Aclgraph=20supported=20in=20li?= =?UTF-8?q?btorch=20Merge=20pull=20request=20!22553=20from=20=E6=9D=9C?= =?UTF-8?q?=E6=89=BF=E6=98=86/aclgraph-271?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUGraph.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUGraph.cpp b/torch_npu/csrc/core/npu/NPUGraph.cpp index fd060dcb08..a00448bd1c 100644 --- a/torch_npu/csrc/core/npu/NPUGraph.cpp +++ b/torch_npu/csrc/core/npu/NPUGraph.cpp @@ -1,4 +1,3 @@ -#ifndef BUILD_LIBTORCH #include "torch_npu/csrc/core/npu/NPUGraph.h" #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" @@ -10,7 +9,6 @@ #include #include -#include namespace c10_npu { @@ -261,4 +259,3 @@ NPUGraph::~NPUGraph() } } // namespace c10_npu -#endif -- Gitee From 3623203f950a10ce9dc7d32b4c9708a829bc032c Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Wed, 2 Jul 2025 10:07:18 +0000 Subject: [PATCH 188/328] !22489 cleancode Merge pull request !22489 from SCh-zx/cleancode27 --- torch_npu/contrib/function/roll.py | 2 +- .../csrc/core/npu/GetAffinityCPUInfo.cpp | 6 +- torch_npu/csrc/core/npu/GetCANNInfo.cpp | 2 +- .../csrc/core/npu/NPUCachingAllocator.cpp | 2 +- torch_npu/csrc/core/npu/NPUCachingAllocator.h | 4 +- torch_npu/csrc/core/npu/NPUEventManager.h | 2 +- torch_npu/csrc/core/npu/NPUException.cpp | 6 +- torch_npu/csrc/core/npu/NPUQueue.cpp | 6 +- torch_npu/csrc/core/npu/NPUStream.cpp | 2 +- .../core/npu/NPUSwappedMemoryAllocator.cpp | 2 +- .../csrc/core/npu/interface/AclInterface.cpp | 2 +- .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 4 +- torch_npu/csrc/distributed/Init.cpp | 2 +- .../csrc/framework/utils/ForceAclnnList.cpp | 47 +-- torch_npu/csrc/npu/Module.cpp | 6 +- torch_npu/csrc/utils/TensorType.cpp | 328 +++++++++--------- 16 files changed, 210 insertions(+), 213 deletions(-) diff --git a/torch_npu/contrib/function/roll.py b/torch_npu/contrib/function/roll.py index 97037c8d07..550064e693 100644 --- a/torch_npu/contrib/function/roll.py +++ b/torch_npu/contrib/function/roll.py @@ -30,7 +30,7 @@ 
_roll_with_index_select = _RollWithIndexSelect.apply def _get_roll_index(H, W, shifts, device='cpu'): index = torch.arange(0, H * W).reshape(H, W) index_fp = torch.roll(index, shifts=shifts, dims=(0, 1)).reshape(-1).long() - index_bp_dict = {i:idx for idx, i in enumerate(index_fp.numpy().tolist())} + index_bp_dict = {i: idx for idx, i in enumerate(index_fp.numpy().tolist())} index_bp_list = [index_bp_dict[i] for i in range(H * W)] index_bp = torch.LongTensor(index_bp_list) return [index_fp.to(device), index_bp.to(device)] diff --git a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp index bcb89e6c88..c5f6f913d4 100644 --- a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp +++ b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp @@ -93,8 +93,10 @@ void GetExclusiveAffinityCPU() offset = find_offset->second; } c10_npu::CoreIdRange cpu_range = parseAffinityCPU(affinity_cpu); - int length = (cpu_range.end - cpu_range.start + 1) / same_num; - c10_npu::CoreIdRange exclusiveAffinityCpu = {cpu_range.start + offset * length, (cpu_range.start + length - 1) + offset * length}; + unsigned int length = (cpu_range.end - cpu_range.start + 1) / static_cast(same_num); + c10_npu::CoreIdRange exclusiveAffinityCpu = { + cpu_range.start + static_cast(offset) * length, + (cpu_range.start + length - 1) + static_cast(offset) * length}; offsetMap[affinity_cpu] = offset + 1; CardIdAffinityCPU[card_id] = exclusiveAffinityCpu; } diff --git a/torch_npu/csrc/core/npu/GetCANNInfo.cpp b/torch_npu/csrc/core/npu/GetCANNInfo.cpp index 8916a70fc9..cc817f5506 100644 --- a/torch_npu/csrc/core/npu/GetCANNInfo.cpp +++ b/torch_npu/csrc/core/npu/GetCANNInfo.cpp @@ -123,7 +123,7 @@ int64_t DriverVersionToNum(std::string versionStr) ((release + 1) * 10000) + ((RCVersion + 1) * 100 + 5000) + ((TVersion + 1) * 100) - - (alphaVersion ? 1 : 0) * (100 - alphaVersion) + + (alphaVersion != 0 ? 1 : 0) * (100 - alphaVersion) + (bVersion - 1) + patch; return num; } diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index a37bae6fbc..82314c2ebd 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -2510,7 +2510,7 @@ private: // Repeat GC until we reach reclaim > target size. bool block_freed = true; - while (gc_reclaimed < target_size && block_freed == true && freeable_block_count > 0) { + while (gc_reclaimed < target_size && block_freed && freeable_block_count > 0) { // Free blocks exceeding this age threshold first. double age_threshold = total_age / freeable_block_count; // Stop iteration if we can no longer free a block. diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index c33f51fbc8..a4e14d2232 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -23,8 +23,8 @@ C10_NPU_API std::mutex* getFreeMutex(); // block inside of already allocated area. 
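 // Worked example for the GetExclusiveAffinityCPU() change earlier in this
 // commit: if four NPUs report the same affinity range 0-95, same_num == 4
 // gives length == 24, so offset 0 is assigned cores 0-23, offset 1 cores
 // 24-47, offset 2 cores 48-71 and offset 3 cores 72-95.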
class FreeMemoryCallback { public: - virtual ~FreeMemoryCallback(){}; - virtual bool Execute() = 0; + virtual ~FreeMemoryCallback(){}; + virtual bool Execute() = 0; }; C10_DECLARE_REGISTRY(FreeNPUMemoryCallbacksRegistry, FreeMemoryCallback); diff --git a/torch_npu/csrc/core/npu/NPUEventManager.h b/torch_npu/csrc/core/npu/NPUEventManager.h index c01491aa03..ac7f0176e0 100644 --- a/torch_npu/csrc/core/npu/NPUEventManager.h +++ b/torch_npu/csrc/core/npu/NPUEventManager.h @@ -22,7 +22,7 @@ public: void DecreaseUnrecordedCount(aclrtEvent event); bool IsEventRecorded(aclrtEvent event); void ClearUnrecordedCount(); - ~NPUEventManager() {} + ~NPUEventManager() {} private: void run(aclrtEvent event); diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index 12ed02d528..fe4d2ec4c4 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -132,10 +132,12 @@ const std::string c10_npu_check_error_message(std::string& errmsg) std::regex ws_regex("[\\s\\t\\n\\r]+"); content = std::regex_replace(content, ws_regex, " "); - if (!content.empty() && content.front() == ' ') + if (!content.empty() && content.front() == ' ') { content.erase(0, 1); - if (!content.empty() && content.back() == ' ') + } + if (!content.empty() && content.back() == ' ') { content.pop_back(); + } return content; } diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 73e2bb7ca1..bd29315e05 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -249,7 +249,7 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) // occur. #ifndef BUILD_LIBTORCH PyThreadState *gilState = nullptr; - if (PyGILState_Check()) { + if (PyGILState_Check() != 0) { gilState = PyEval_SaveThread(); } #endif @@ -523,7 +523,7 @@ void Repository::Enqueue(void *cur_paras) uint64_t u = 1; SetWriteWorking(true); - while (ret == false && (GetStatus() == RUN || GetStatus() == INIT)) { + while (!ret && (GetStatus() == RUN || GetStatus() == INIT)) { ret = WriteQueue(cur_paras); if (ret == false) { SetWriteWorking(false); @@ -531,7 +531,7 @@ void Repository::Enqueue(void *cur_paras) if (IsFullQueue()) { #ifndef BUILD_LIBTORCH // double check the current thread hold a Gil lock - if (PyGILState_Check()) { + if (PyGILState_Check() != 0) { Py_BEGIN_ALLOW_THREADS s = eventfd_read(efd_write, &u); Py_END_ALLOW_THREADS } else { diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index 35e6e526b1..4411760ab4 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -259,7 +259,7 @@ static uint32_t get_idx(std::atomic& counter) { auto raw_idx = counter++; static int StreamsPerPool = GetStreamsPerPool(); - return raw_idx % StreamsPerPool; + return raw_idx % static_cast(StreamsPerPool); } static uint32_t get_sync_launch_stream_idx(std::atomic& counter) diff --git a/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp b/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp index 084f1df577..39d19b0b62 100644 --- a/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp @@ -47,7 +47,7 @@ void* registerSvmMem(void* ptr, size_t size) void* mallocHostSwapMemory(size_t size) { if (!initialized) { - kAlignSize = sysconf(_SC_PAGESIZE); + kAlignSize = static_cast(sysconf(_SC_PAGESIZE)); initialized = true; } size = (size + kAlignSize - 1) & ~(kAlignSize - 1); diff --git 
a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 75658804e2..39c4b53443 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -411,7 +411,7 @@ aclError AclrtSynchronizeStreamWithTimeout(aclrtStream stream) { } TORCH_CHECK(func_backup, "Failed to find function", "aclrtSynchronizeStreamWithTimeout and aclrtSynchronizeStream", PROF_ERROR(ErrCode::NOT_FOUND)); return func_backup(stream); - } + } } aclError AclrtDestroyStreamForce(aclrtStream stream) { diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 3564e5196c..4b6707b849 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -277,8 +277,8 @@ void NpuSysCtrl::RegisterLazyFn(const option::OptionCallBack& call_, const std:: lazy_fn_.emplace_back(std::make_pair(call_, in)); } -void NpuSysCtrl::RegisterReleaseFn(ReleaseFn release_fn, - ReleasePriority priority) { +void NpuSysCtrl::RegisterReleaseFn(ReleaseFn release_fn, ReleasePriority priority) +{ const auto& iter = this->release_fn_.find(priority); if (iter != release_fn_.end()) { release_fn_[priority].emplace_back(release_fn); diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 99c6dc6f22..78131ad150 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -576,7 +576,7 @@ Example:: Default settings return everything - i.e. contains HCCL comm dumps and collective traces. )"); - Py_RETURN_TRUE; + Py_RETURN_TRUE; } // c10d methods on torch._C diff --git a/torch_npu/csrc/framework/utils/ForceAclnnList.cpp b/torch_npu/csrc/framework/utils/ForceAclnnList.cpp index c6b0e64641..1626499a80 100644 --- a/torch_npu/csrc/framework/utils/ForceAclnnList.cpp +++ b/torch_npu/csrc/framework/utils/ForceAclnnList.cpp @@ -18,35 +18,36 @@ namespace at_npu { namespace native { +void ForceAclnn::RegisterOp(const std::string &list) +{ + if (list.empty()) { + return; + } -void ForceAclnn::RegisterOp(const std::string &list) { - if (list.empty()) { - return; - } - - auto value = list; - std::string delimiter = ","; - auto start = 0U; - auto end = value.find(delimiter); - std::string token; - while (end != std::string::npos) { + auto value = list; + std::string delimiter = ","; + auto start = 0U; + auto end = value.find(delimiter); + std::string token; + while (end != std::string::npos) { + token = value.substr(start, end - start); + if (!token.empty()) { + force_aclnn_op_list_.insert(token); + } + start = end + delimiter.size(); + end = value.find(delimiter, start); + } token = value.substr(start, end - start); if (!token.empty()) { - force_aclnn_op_list_.insert(token); + force_aclnn_op_list_.insert(token); } - start = end + delimiter.size(); - end = value.find(delimiter, start); - } - token = value.substr(start, end - start); - if (!token.empty()) { - force_aclnn_op_list_.insert(token); - } - return; + return; } -bool ForceAclnn::IsForceAclnnOp(const std::string &op_name) const { - bool ret = (force_aclnn_op_list_.find(op_name) != force_aclnn_op_list_.end()); - return ret; +bool ForceAclnn::IsForceAclnnOp(const std::string &op_name) const +{ + bool ret = (force_aclnn_op_list_.find(op_name) != force_aclnn_op_list_.end()); + return ret; } } // namespace native } // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/npu/Module.cpp 
b/torch_npu/csrc/npu/Module.cpp index 7c4e22f4a9..ecaff129d6 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -1219,7 +1219,8 @@ PyObject* THNPModule_npuCachingAllocator_raw_alloc(PyObject *_unused, PyObject * END_HANDLE_TH_ERRORS } -PyObject* THNPModule_npuCachingAllocator_raw_delete(PyObject *_unused, PyObject *obj) { +PyObject* THNPModule_npuCachingAllocator_raw_delete(PyObject *_unused, PyObject *obj) +{ HANDLE_TH_ERRORS void* mem_ptr = PyLong_AsVoidPtr(obj); c10_npu::NPUCachingAllocator::raw_delete(mem_ptr); @@ -1271,7 +1272,8 @@ PyObject* THNPModule_npuUnlockMutex(PyObject *module, PyObject *noargs) Py_RETURN_NONE; } -PyObject* THNPModule_initDump(PyObject* _unused, PyObject* noargs) { +PyObject* THNPModule_initDump(PyObject* _unused, PyObject* noargs) +{ HANDLE_TH_ERRORS pybind11::gil_scoped_release no_gil; NPU_CHECK_ERROR_WITHOUT_UCE(aclmdlInitDump()); diff --git a/torch_npu/csrc/utils/TensorType.cpp b/torch_npu/csrc/utils/TensorType.cpp index aeb6fd8b83..e6998f57ea 100644 --- a/torch_npu/csrc/utils/TensorType.cpp +++ b/torch_npu/csrc/utils/TensorType.cpp @@ -6,7 +6,6 @@ namespace torch_npu { namespace utils { - using namespace at; using namespace torch::autograd; @@ -15,14 +14,13 @@ std::vector> all_declared_types_npu() std::vector> ret; // can't easily iterate over enum classes, does not support BFloat16 now std::vector backends = { c10::Backend::PrivateUse1 }; - std::vector scalar_types = { - ScalarType::Byte, ScalarType::Char, ScalarType::Double, ScalarType::Float, - ScalarType::Int, ScalarType::Long, ScalarType::Short, ScalarType::Half, - ScalarType::Bool, ScalarType::BFloat16 - }; - - for (auto& backend : backends) { - for (auto& scalar_type : scalar_types) { + std::vector scalar_types = { ScalarType::Byte, ScalarType::Char, ScalarType::Double, + ScalarType::Float, ScalarType::Int, ScalarType::Long, + ScalarType::Short, ScalarType::Half, ScalarType::Bool, + ScalarType::BFloat16 }; + + for (auto &backend : backends) { + for (auto &scalar_type : scalar_types) { ret.emplace_back(std::make_pair(backend, scalar_type)); } } @@ -32,8 +30,8 @@ std::vector> all_declared_types_npu() struct PyTensorType { PyTypeObject py_type; - THPDtype* dtype; - THPLayout* layout; + THPDtype *dtype; + THPLayout *layout; bool is_npu; char name[64]; int backend; @@ -57,73 +55,67 @@ struct PyTensorType { static_assert(std::is_standard_layout::value, "PyTensorType must be standard layout"); -static void py_bind_tensor_types(const std::vector& tensor_types); +static void py_bind_tensor_types(const std::vector &tensor_types); -static PyObject* Tensor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) +static PyObject *Tensor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) { HANDLE_TH_ERRORS - auto& tensor_type = *((PyTensorType*)type); + auto &tensor_type = *((PyTensorType *)type); if (tensor_type.is_npu) { - TORCH_NPU_WARN_ONCE( - "Warning: The torch.npu.*DtypeTensor constructors are no longer recommended. " + TORCH_NPU_WARN_ONCE("Warning: The torch.npu.*DtypeTensor constructors are no longer recommended. " "It's best to use methods such as torch.tensor(data, dtype=*, device='npu') " "to create tensors."); } - TORCH_CHECK_TYPE( - !tensor_type.is_npu || c10_npu::device_count() != 0, - "type ", - tensor_type.name, + TORCH_CHECK_TYPE(!tensor_type.is_npu || c10_npu::device_count() != 0, "type ", tensor_type.name, " not available. 
Torch not compiled with npu enabled.", PTA_ERROR(ErrCode::TYPE)) torch_npu::utils::npu_lazy_init(); - return THPVariable_Wrap(torch::utils::legacy_tensor_ctor(tensor_type.get_dispatch_key(), - tensor_type.get_scalar_type(), - args, - kwargs)); + return THPVariable_Wrap( + torch::utils::legacy_tensor_ctor(tensor_type.get_dispatch_key(), tensor_type.get_scalar_type(), args, kwargs)); END_HANDLE_TH_ERRORS } -static PyObject* Tensor_instancecheck(PyObject* _self, PyObject* arg) +static PyObject *Tensor_instancecheck(PyObject *_self, PyObject *arg) { - HANDLE_TH_ERRORS - auto self = (PyTensorType*)_self; - if (THPVariable_Check(arg)) { - const auto& var = THPVariable_Unpack(arg); - - if (legacyExtractDispatchKey(var.key_set()) == self->get_dispatch_key() && - var.scalar_type() == static_cast(self->scalar_type)) { - Py_RETURN_TRUE; + HANDLE_TH_ERRORS + auto self = (PyTensorType *)_self; + if (THPVariable_Check(arg)) { + const auto &var = THPVariable_Unpack(arg); + + if (legacyExtractDispatchKey(var.key_set()) == self->get_dispatch_key() && + var.scalar_type() == static_cast(self->scalar_type)) { + Py_RETURN_TRUE; + } } - } - Py_RETURN_FALSE; - END_HANDLE_TH_ERRORS + Py_RETURN_FALSE; + END_HANDLE_TH_ERRORS } -PyObject* Tensor_dtype(PyTensorType* self, void *unused) +PyObject *Tensor_dtype(PyTensorType *self, void *unused) { - return torch::autograd::utils::wrap(self->dtype); + return torch::autograd::utils::wrap(self->dtype); } -PyObject* Tensor_layout(PyTensorType* self, void *unused) +PyObject *Tensor_layout(PyTensorType *self, void *unused) { - return torch::autograd::utils::wrap(self->layout); + return torch::autograd::utils::wrap(self->layout); } -PyObject* Tensor_is_npu(PyTensorType* self, void *unused) +PyObject *Tensor_is_npu(PyTensorType *self, void *unused) { - if (self->is_npu) { - Py_RETURN_TRUE; - } else { - Py_RETURN_FALSE; - } + if (self->is_npu) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } } -PyObject* Tensor_is_sparse(PyTensorType *self, void *unused) +PyObject *Tensor_is_sparse(PyTensorType *self, void *unused) { - if (self->layout->layout == at::Layout::Strided) { - Py_RETURN_FALSE; - } else { - Py_RETURN_TRUE; - } + if (self->layout->layout == at::Layout::Strided) { + Py_RETURN_FALSE; + } else { + Py_RETURN_TRUE; + } } static struct PyMethodDef metaclass_methods[] = { @@ -131,7 +123,7 @@ static struct PyMethodDef metaclass_methods[] = { {nullptr} }; -using getter = PyObject* (*)(PyObject *, void *); +using getter = PyObject *(*)(PyObject *, void *); static struct PyGetSetDef metaclass_properties[] = { {"dtype", (getter)Tensor_dtype, nullptr, nullptr, nullptr}, @@ -142,46 +134,44 @@ static struct PyGetSetDef metaclass_properties[] = { }; static PyTypeObject metaclass = { - PyVarObject_HEAD_INIT(nullptr, 0) - "torch.tensortype", /* tp_name */ - sizeof(PyTypeObject) /* tp_basicsize */ + PyVarObject_HEAD_INIT(nullptr, 0) "torch.tensortype", /* tp_name */ + sizeof(PyTypeObject) /* tp_basicsize */ }; -static void py_initialize_metaclass(PyTypeObject& metaclass) +static void py_initialize_metaclass(PyTypeObject &metaclass) { - metaclass.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; - metaclass.tp_methods = metaclass_methods; - metaclass.tp_getset = metaclass_properties; - metaclass.tp_base = &PyType_Type; - if (PyType_Ready(&metaclass) < 0) { - throw python_error(); - } + metaclass.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; + metaclass.tp_methods = metaclass_methods; + metaclass.tp_getset = metaclass_properties; + metaclass.tp_base = &PyType_Type; + if 
(PyType_Ready(&metaclass) < 0) { + throw python_error(); + } } static PyTypeObject tensor_type_prototype = { - PyVarObject_HEAD_INIT(&metaclass, 0) - nullptr, /* tp_name */ - sizeof(PyTensorType) /* tp_basicsize */ + PyVarObject_HEAD_INIT(&metaclass, 0) nullptr, /* tp_name */ + sizeof(PyTensorType) /* tp_basicsize */ }; -static void py_initialize_tensor_type(PyTypeObject& type, const char* name, PyObject* tp_dict) +static void py_initialize_tensor_type(PyTypeObject &type, const char *name, PyObject *tp_dict) { - // NOTE: we don't use the typical static declaration of PyTypeObject because - // we need to initialize as many types as there are VariableType instances. - // We copy the basic object fields from a prototype definition and initialize - // the remaining fields below. - memcpy(&type, &tensor_type_prototype, sizeof(PyTypeObject)); - // Subclassing from torch.Tensor isn't supported. - // (Py_TPFLAGS_BASETYPE omitted). Subclassing torch.Tensor still allowed. - type.tp_flags = Py_TPFLAGS_DEFAULT; - type.tp_name = name; - type.tp_new = Tensor_new; - if (PyType_Ready(&type) < 0) { - throw python_error(); - } - if (PyDict_Merge(type.tp_dict, tp_dict, 0) < 0) { - throw python_error(); - } + // NOTE: we don't use the typical static declaration of PyTypeObject because + // we need to initialize as many types as there are VariableType instances. + // We copy the basic object fields from a prototype definition and initialize + // the remaining fields below. + memcpy(&type, &tensor_type_prototype, sizeof(PyTypeObject)); + // Subclassing from torch.Tensor isn't supported. + // (Py_TPFLAGS_BASETYPE omitted). Subclassing torch.Tensor still allowed. + type.tp_flags = Py_TPFLAGS_DEFAULT; + type.tp_name = name; + type.tp_new = Tensor_new; + if (PyType_Ready(&type) < 0) { + throw python_error(); + } + if (PyDict_Merge(type.tp_dict, tp_dict, 0) < 0) { + throw python_error(); + } } static std::string get_module(Backend backend) @@ -204,103 +194,103 @@ static std::string get_module(Backend backend) static std::string get_name(Backend backend, ScalarType scalarType) { - std::ostringstream ss; - ss << get_module(backend) << "." << toString(scalarType) << "Tensor"; - return ss.str(); + std::ostringstream ss; + ss << get_module(backend) << "." 
<< toString(scalarType) << "Tensor"; + return ss.str(); } -static void set_type(PyTensorType& type_obj, Backend backend, ScalarType scalarType) +static void set_type(PyTensorType &type_obj, Backend backend, ScalarType scalarType) { - // This field is lazily initialized from backend and scalar_type - type_obj.backend = static_cast(backend); - type_obj.scalar_type = static_cast(scalarType); - type_obj.layout = torch::getTHPLayout(c10::layout_from_backend(backend)); - type_obj.dtype = torch::getTHPDtype(scalarType); - type_obj.is_npu = (backend == c10::Backend::PrivateUse1); + // This field is lazily initialized from backend and scalar_type + type_obj.backend = static_cast(backend); + type_obj.scalar_type = static_cast(scalarType); + type_obj.layout = torch::getTHPLayout(c10::layout_from_backend(backend)); + type_obj.dtype = torch::getTHPDtype(scalarType); + type_obj.is_npu = (backend == c10::Backend::PrivateUse1); } -static void set_name(PyTensorType& type_obj, const std::string& name) +static void set_name(PyTensorType &type_obj, const std::string &name) { - size_t n = sizeof(type_obj.name); - strncpy(type_obj.name, name.c_str(), n); - type_obj.name[n - 1] = '\0'; + size_t n = sizeof(type_obj.name); + strncpy(type_obj.name, name.c_str(), n); + type_obj.name[n - 1] = '\0'; } static THPObjectPtr get_tensor_dict() { - auto torch = THPObjectPtr(PyImport_ImportModule("torch")); - if (!torch) { - throw python_error(); - } - - auto tensor_class = THPObjectPtr(PyObject_GetAttrString(torch, "Tensor")); - if (!tensor_class) { - throw python_error(); - } - - auto tensor_type = (PyTypeObject*)tensor_class.get(); - TORCH_CHECK(tensor_type->tp_base, "missing base type for Tensor", PTA_ERROR(ErrCode::TYPE)); - - auto res = THPObjectPtr(PyDict_New()); - if (!res) { - throw python_error(); - } - - if (PyDict_Merge(res.get(), tensor_type->tp_dict, 0) < 0) { - throw python_error(); - } - if (PyDict_Merge(res.get(), tensor_type->tp_base->tp_dict, 0) < 0) { - throw python_error(); - } - - return res; + auto torch = THPObjectPtr(PyImport_ImportModule("torch")); + if (!torch) { + throw python_error(); + } + + auto tensor_class = THPObjectPtr(PyObject_GetAttrString(torch, "Tensor")); + if (!tensor_class) { + throw python_error(); + } + + auto tensor_type = (PyTypeObject *)tensor_class.get(); + TORCH_CHECK(tensor_type->tp_base, "missing base type for Tensor", PTA_ERROR(ErrCode::TYPE)); + + auto res = THPObjectPtr(PyDict_New()); + if (!res) { + throw python_error(); + } + + if (PyDict_Merge(res.get(), tensor_type->tp_dict, 0) < 0) { + throw python_error(); + } + if (PyDict_Merge(res.get(), tensor_type->tp_base->tp_dict, 0) < 0) { + throw python_error(); + } + + return res; } static std::vector tensor_types; -static void initialize_npu_aten_types(std::vector& tensor_types) +static void initialize_npu_aten_types(std::vector &tensor_types) { - // only initialize npu types - auto declared_types = all_declared_types_npu(); - tensor_types.resize(declared_types.size()); - - for (size_t i = 0, end = declared_types.size(); i != end; i++) { - auto& tensor_type = tensor_types[i]; - Backend backend = declared_types[i].first; - ScalarType scalar_type = declared_types[i].second; - set_type(tensor_type, backend, scalar_type); - set_name(tensor_type, get_name(backend, scalar_type)); - } + // only initialize npu types + auto declared_types = all_declared_types_npu(); + tensor_types.resize(declared_types.size()); + + for (size_t i = 0, end = declared_types.size(); i != end; i++) { + auto &tensor_type = tensor_types[i]; + Backend 
backend = declared_types[i].first; + ScalarType scalar_type = declared_types[i].second; + set_type(tensor_type, backend, scalar_type); + set_name(tensor_type, get_name(backend, scalar_type)); + } } void _initialize_python_bindings() { - // Initialize the at::Type* pointers, name, and properties of the PyTensorType - // vector. After this call, the vector must not be resized. - initialize_npu_aten_types(tensor_types); - - // Initialize the Python metaclass for the torch.FloatTensor, etc. types. - // The metaclass handles __instancecheck__ checks and binds the dtype property - // on the type objects. - py_initialize_metaclass(metaclass); - - // Get the tp_dict of the Variable class. We copy function definitions - // onto each Tensor type object so that they can be accessed via e.g. - // `torch.npu.FloatTensor.add`. - auto tensor_dict = get_tensor_dict(); - - // Initialize each Python type object torch.npu.FloatTensor, torch.npu.DoubleTensor, etc. - for (auto& tensor_type : tensor_types) { - py_initialize_tensor_type(tensor_type.py_type, tensor_type.name, tensor_dict.get()); - } - - // Add the type objects to their corresponding modules. e.g. torch.npu.FloatTensor - // is added to the `torch_npu` module as `FloatTensor`. Also add all the type - // objects to the set torch_npu._tensor_classes. - py_bind_tensor_types(tensor_types); + // Initialize the at::Type* pointers, name, and properties of the PyTensorType + // vector. After this call, the vector must not be resized. + initialize_npu_aten_types(tensor_types); + + // Initialize the Python metaclass for the torch.FloatTensor, etc. types. + // The metaclass handles __instancecheck__ checks and binds the dtype property + // on the type objects. + py_initialize_metaclass(metaclass); + + // Get the tp_dict of the Variable class. We copy function definitions + // onto each Tensor type object so that they can be accessed via e.g. + // `torch.npu.FloatTensor.add`. + auto tensor_dict = get_tensor_dict(); + + // Initialize each Python type object torch.npu.FloatTensor, torch.npu.DoubleTensor, etc. + for (auto &tensor_type : tensor_types) { + py_initialize_tensor_type(tensor_type.py_type, tensor_type.name, tensor_dict.get()); + } + + // Add the type objects to their corresponding modules. e.g. torch.npu.FloatTensor + // is added to the `torch_npu` module as `FloatTensor`. Also add all the type + // objects to the set torch_npu._tensor_classes. + py_bind_tensor_types(tensor_types); } -static void py_bind_tensor_types(const std::vector& tensor_types) +static void py_bind_tensor_types(const std::vector &tensor_types) { auto torch_module = THPObjectPtr(PyImport_ImportModule("torch")); if (!torch_module) { @@ -312,7 +302,7 @@ static void py_bind_tensor_types(const std::vector& tensor_types) throw python_error(); } - for (auto& tensor_type : tensor_types) { + for (auto &tensor_type : tensor_types) { auto name = std::string(tensor_type.name); auto idx = name.rfind('.'); auto type_name = name.substr(idx + 1); @@ -323,7 +313,7 @@ static void py_bind_tensor_types(const std::vector& tensor_types) throw python_error(); } - PyObject* type_obj = (PyObject*)&tensor_type; + PyObject *type_obj = (PyObject *)&tensor_type; Py_INCREF(type_obj); if (PyModule_AddObject(module_obj.get(), type_name.c_str(), type_obj) < 0) { throw python_error(); @@ -335,12 +325,12 @@ static void py_bind_tensor_types(const std::vector& tensor_types) } // Callback for python part. 
Used for additional initialization of python classes -static PyObject* THPModule_initExtension(PyObject *_unused, PyObject *noargs) +static PyObject *THPModule_initExtension(PyObject *_unused, PyObject *noargs) { - HANDLE_TH_ERRORS - _initialize_python_bindings(); - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS + HANDLE_TH_ERRORS + _initialize_python_bindings(); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS } // autograd methods on torch._C @@ -349,9 +339,9 @@ static PyMethodDef TorchNpuExtensionMethods[] = { {nullptr, nullptr, 0, nullptr} }; -PyMethodDef* npu_extension_functions() +PyMethodDef *npu_extension_functions() { - return TorchNpuExtensionMethods; + return TorchNpuExtensionMethods; } } } -- Gitee From 2ec58dc0b5aaeeae8fe65b5e1a32a382aed0db77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Wed, 2 Jul 2025 10:12:15 +0000 Subject: [PATCH 189/328] =?UTF-8?q?!22565=20fix=20lazymodule=20for=20torch?= =?UTF-8?q?.Tensor.npu=20Merge=20pull=20request=20!22565=20from=20?= =?UTF-8?q?=E7=8E=8B=E8=B6=85/v2.7.0=5Flazymodule?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index dd80a69fae..d9bcbc268f 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -199,6 +199,7 @@ torch._register_device_module('npu', torch_npu.npu) unsupported_dtype = [torch.quint8, torch.quint4x2, torch.quint2x4, torch.qint32, torch.qint8] torch.utils.generate_methods_for_privateuse1_backend(for_tensor=True, for_module=True, for_storage=True, unsupported_dtype=unsupported_dtype) +torch.nn.parameter.UninitializedTensorMixin._allowed_methods.append(torch.Tensor.npu) # Apply monkey-patches. _apply_patches(all_monkey_patches) -- Gitee From 73a92a5aff434ad4308cbdf0eaae4934343f028b Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 2 Jul 2025 11:20:15 +0000 Subject: [PATCH 190/328] !22577 Update op_plugin commit id Merge pull request !22577 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 356df08eac..20a8bf302f 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 356df08eacd44a565377f59a994e5016f9597537 +Subproject commit 20a8bf302f8b1c481a873112e14a397d0cccb0db -- Gitee From 00122c288f6bcd2fccca65b04327dc994dfaeee9 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 2 Jul 2025 11:20:15 +0000 Subject: [PATCH 191/328] !22577 Update op_plugin commit id Merge pull request !22577 from pta-robot/v2.7.1 -- Gitee From 5148e466327639a48a1a2327932283762e2e5260 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 2 Jul 2025 14:05:16 +0000 Subject: [PATCH 192/328] !22581 Update op_plugin commit id Merge pull request !22581 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 20a8bf302f..cdd8ddbd04 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 20a8bf302f8b1c481a873112e14a397d0cccb0db +Subproject commit cdd8ddbd0493f93dd03899fdae3a02cbe25be87e -- Gitee From 2605e421b04a415dcd3c00aed9d70b47bb840d4a Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 2 Jul 2025 16:05:16 +0000 Subject: [PATCH 193/328] !22593 Update op_plugin commit id Merge pull request !22593 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index cdd8ddbd04..4cfc96543c 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit cdd8ddbd0493f93dd03899fdae3a02cbe25be87e +Subproject commit 4cfc96543cbd35bf47e3f79bd358bd23f2598f2f -- Gitee From 85e2dbc21372d8f3a4667660afde8d40728c64c3 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Thu, 3 Jul 2025 01:38:47 +0000 Subject: [PATCH 194/328] !22588 Update torchair commit id Merge pull request !22588 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 4e9852e8a1..f10111e0d3 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 4e9852e8a142040d9d44f49319c1f7d1120c4b20 +Subproject commit f10111e0d36a3e11a57176e574b356d9c85f8115 -- Gitee From 44eb6ab6b8438aab44cf19ad72c876f2f6b726b9 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 3 Jul 2025 03:20:23 +0000 Subject: [PATCH 195/328] !22600 Update op_plugin commit id Merge pull request !22600 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 4cfc96543c..c0ae69dea0 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 4cfc96543cbd35bf47e3f79bd358bd23f2598f2f +Subproject commit c0ae69dea033070745af78a5d6121651a74cc552 -- Gitee From b0b3e120a50a4d4fa165bda0e909b6252a5d40f5 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 3 Jul 2025 09:20:17 +0000 Subject: [PATCH 196/328] !22616 Update op_plugin commit id Merge pull request !22616 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index c0ae69dea0..9b2b3c1910 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit c0ae69dea033070745af78a5d6121651a74cc552 +Subproject commit 9b2b3c1910cdc41b22b2f0b63a46f3f61b7abb5b -- Gitee From 9ec316f9af2b70e39493e9ac4ef2535a257e95b1 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 3 Jul 2025 11:05:17 +0000 Subject: [PATCH 197/328] !22625 Update op_plugin commit id Merge pull request !22625 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 9b2b3c1910..2871a14a92 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 9b2b3c1910cdc41b22b2f0b63a46f3f61b7abb5b +Subproject commit 2871a14a92cd09b8b57319549d1bd10324aa3018 -- Gitee From 0f0b47251b8e3e4d2ee238fd8d778eff3d9591ed Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 3 Jul 2025 11:05:17 +0000 Subject: [PATCH 198/328] !22625 Update op_plugin commit id Merge pull request !22625 from pta-robot/v2.7.1 -- Gitee From 0f8511d971c3e94bf4cfeb68b16cc25ba310f151 Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Thu, 3 Jul 2025 11:13:13 +0000 Subject: [PATCH 199/328] !22570 fix memory_snapshot bug when stacks='python' Merge pull request !22570 from yuhaiyan/v2.7.1-dev1 --- torch_npu/csrc/npu/memory_snapshot.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/torch_npu/csrc/npu/memory_snapshot.cpp b/torch_npu/csrc/npu/memory_snapshot.cpp index 47fbf4de6c..cc893243a7 100644 --- a/torch_npu/csrc/npu/memory_snapshot.cpp +++ 
b/torch_npu/csrc/npu/memory_snapshot.cpp @@ -16,7 +16,11 @@ namespace torch_npu { std::shared_ptr gather() { +#if defined(__x86_64__) return torch::CapturedTraceback::gather(true, true, false); +#else + return torch_npu::CapturedTraceback::gather(true, true, false); +#endif } std::shared_ptr gather_with_cpp() -- Gitee From 75809baec3f4a9a9452f9afab3c305fd5755296b Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Thu, 3 Jul 2025 22:22:02 +0000 Subject: [PATCH 200/328] !22651 Update torchair commit id Merge pull request !22651 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index f10111e0d3..952cfa98cc 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit f10111e0d36a3e11a57176e574b356d9c85f8115 +Subproject commit 952cfa98cc8edd67813d39c567fe8d76b6d44a7c -- Gitee From 8ff8bbc5d5db252df5017066bf2cff2a01b01b32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Fri, 4 Jul 2025 03:30:53 +0000 Subject: [PATCH 201/328] =?UTF-8?q?!22516=20Change=20ACL=5FOP=5FINIT=5FMOD?= =?UTF-8?q?E=20default=20value=20to=201=20Merge=20pull=20request=20!22516?= =?UTF-8?q?=20from=20=E5=A7=9C=E6=80=A1=E6=96=87/v2.7.1=5Fop?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/register/OptionsManager.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index e15bb200f5..c41a42ff9f 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -482,10 +482,11 @@ uint32_t OptionsManager::GetAclOpInitMode() const static uint32_t acl_op_init_mode = []() -> uint32_t { char* buf_val = std::getenv("ACL_OP_INIT_MODE"); // Default 0 - int64_t acl_op_init_mode = (buf_val != nullptr) ? strtol(buf_val, nullptr, 10) : 0; + int64_t acl_op_init_mode = (buf_val != nullptr) ? 
strtol(buf_val, nullptr, 10) : 1; std::unordered_map aclOpInitMode = getAclOpInitMode(); if (aclOpInitMode.find(acl_op_init_mode) == aclOpInitMode.end()) { - TORCH_NPU_WARN_ONCE("Get env ACL_OP_INIT_MODE not in [0, 1, 2], so reset it to the default value 0."); + acl_op_init_mode = 1; + TORCH_NPU_WARN_ONCE("Get env ACL_OP_INIT_MODE not in [0, 1, 2], so reset it to the default value 1."); } return static_cast(acl_op_init_mode); }(); -- Gitee From 164366dacbd4022bd9b57ca78170a51f618268a2 Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Fri, 4 Jul 2025 03:53:25 +0000 Subject: [PATCH 202/328] !22657 cleancode Merge pull request !22657 from SCh-zx/cl27 --- env.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/env.sh b/env.sh index ff54b797d2..96fa71d80f 100644 --- a/env.sh +++ b/env.sh @@ -1,3 +1,4 @@ +#!/bin/bash # 配置CANN相关环境变量 CANN_INSTALL_PATH_CONF='/etc/Ascend/ascend_cann_install.info' -- Gitee From de48e0218e5b85061c972aea6964e229f694fd2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E5=85=89=E6=B5=A9?= Date: Fri, 4 Jul 2025 06:38:08 +0000 Subject: [PATCH 203/328] =?UTF-8?q?!22669=20fix=20error=20Merge=20pull=20r?= =?UTF-8?q?equest=20!22669=20from=20=E9=83=AD=E5=85=89=E6=B5=A9/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUQueue.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index bd29315e05..2fa4c4766a 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -314,7 +314,7 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) repo_error + ".\n" + "Since the operator is called asynchronously, the stacktrace may be inaccurate. " "If you want to get the accurate stacktrace, " - "pleace set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + + "please set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + "Note: ASCEND_LAUNCH_BLOCKING=1 will force ops to run in synchronous mode, " "resulting in performance degradation. " "Please unset ASCEND_LAUNCH_BLOCKING in time after debugging." + @@ -490,7 +490,7 @@ void Repository::Enqueue(void *cur_paras) repo_error + ".\n" + "Since the operator is called asynchronously, the stacktrace may be inaccurate. " "If you want to get the accurate stacktrace, " - "pleace set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + + "please set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + "Note: ASCEND_LAUNCH_BLOCKING=1 will force ops to run in synchronous mode, " "resulting in performance degradation. " "Please unset ASCEND_LAUNCH_BLOCKING in time after debugging." 
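The error text above points users at ASCEND_LAUNCH_BLOCKING. A minimal debugging sketch, assuming the variable is read when the NPU runtime initializes (so it must be exported before torch_npu is imported); the tensor shapes and ops below are placeholders:

import os
os.environ["ASCEND_LAUNCH_BLOCKING"] = "1"  # force synchronous op launch so the stacktrace points at the real call site

import torch
import torch_npu

x = torch.randn(4, device="npu")
y = x * 2  # a failing op now raises here instead of at a later asynchronous dequeue
torch.npu.synchronize()
# Unset ASCEND_LAUNCH_BLOCKING again after debugging; synchronous mode degrades performance.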
+ -- Gitee From 559cd5927806af415186d8efaab42f5742293030 Mon Sep 17 00:00:00 2001 From: wgb Date: Fri, 4 Jul 2025 09:57:28 +0000 Subject: [PATCH 204/328] !22677 Resize_ to support ncdhw to other dims Merge pull request !22677 from wgb/v2.7.1 --- test/custom_ops/test_resize_.py | 32 +++++++++++++++++++ .../csrc/framework/StorageDescHelper.cpp | 8 +++-- 2 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 test/custom_ops/test_resize_.py diff --git a/test/custom_ops/test_resize_.py b/test/custom_ops/test_resize_.py new file mode 100644 index 0000000000..93b3018324 --- /dev/null +++ b/test/custom_ops/test_resize_.py @@ -0,0 +1,32 @@ +import numpy as np +import torch +import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests + + +class TestResize(TestCase): + + def test_masked_select_out(self): + + input_data = torch.tensor([[[[[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15], [16, 17, 18, 19, 20], [21, 22, 23, 24, 25]]]]]], dtype=torch.float) + mask = torch.tensor([True, False, True, False, True]) + + input_data_npu = input_data.npu() + mask_npu = mask.npu() + + out_tensor = torch.empty((1, 1, 1, 1, 1), dtype=input_data.dtype) + out_tensor_npu = out_tensor.npu() + + out_tensor_npu = out_tensor_npu.view(-1) + out_tensor_npu = torch.masked_select(input_data_npu, mask_npu, out=out_tensor_npu) + out_tensor = torch.masked_select(input_data, mask, out=out_tensor) + self.assertRtolEqual(out_tensor_npu, out_tensor) + + def test_resize_ncdhw(self): + out_tensor = torch.empty((1, 1, 1, 1, 1), dtype=torch.float16).npu() + shape = [25] + out_tensor.resize_(shape) + self.assertEqual(shape, out_tensor.shape) + +if __name__ == "__main__": + run_tests() diff --git a/torch_npu/csrc/framework/StorageDescHelper.cpp b/torch_npu/csrc/framework/StorageDescHelper.cpp index eb568a74db..08a2d603b6 100644 --- a/torch_npu/csrc/framework/StorageDescHelper.cpp +++ b/torch_npu/csrc/framework/StorageDescHelper.cpp @@ -62,9 +62,13 @@ void StorageDescHelper::UpdateDesc(torch_npu::NPUStorageDesc &npuDesc, const c10 } } npuDesc.base_strides_ = new_stride; - // 更新物理内存信息 - npuDesc.storage_sizes_ = FormatHelper::GetStorageSizes(npuDesc); + int NCDHW_OR_NDHWC_DIM = 5; + if ((npuDesc.npu_format_ == ACL_FORMAT_NCDHW || npuDesc.npu_format_ == ACL_FORMAT_NDHWC) && new_size.size() < NCDHW_OR_NDHWC_DIM) { + npuDesc.storage_sizes_ = new_size; + } else { + npuDesc.storage_sizes_ = FormatHelper::GetStorageSizes(npuDesc); + } if (new_data_numel > new_shape_numel) { // Refresh format to base format only when flattening storage data npuDesc.storage_sizes_ = new_size; -- Gitee From 1d84f94de35fbca7a4423107514c60b0bcb5b35b Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 4 Jul 2025 11:50:17 +0000 Subject: [PATCH 205/328] !22681 Update op_plugin commit id Merge pull request !22681 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 2871a14a92..a968ca93d9 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 2871a14a92cd09b8b57319549d1bd10324aa3018 +Subproject commit a968ca93d9211886e750e924a2ffb06edb252147 -- Gitee From bd5592d2c1bfe185cd8f93a2ae8a2f826971b7d6 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 4 Jul 2025 14:05:17 +0000 Subject: [PATCH 206/328] !22697 Update op_plugin commit id Merge pull request !22697 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/third_party/op-plugin b/third_party/op-plugin index a968ca93d9..f81fa68039 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit a968ca93d9211886e750e924a2ffb06edb252147 +Subproject commit f81fa68039d2ada17220b8a75c0bb0ad6e598b0a -- Gitee From 9624ef6229c923a4f433be721975bbdc46aec7d6 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 5 Jul 2025 09:46:59 +0000 Subject: [PATCH 207/328] !22715 Update op_plugin commit id Merge pull request !22715 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index f81fa68039..954d098b04 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit f81fa68039d2ada17220b8a75c0bb0ad6e598b0a +Subproject commit 954d098b0434e4594dcf008d4528057c2443b4a6 -- Gitee From dcae25134285807c5cbc0aa4ca0e36842b62f8ee Mon Sep 17 00:00:00 2001 From: liupeng303 Date: Mon, 7 Jul 2025 02:50:00 +0000 Subject: [PATCH 208/328] =?UTF-8?q?!22644=20Provide=20interface=20such=20a?= =?UTF-8?q?s=20getDeviceStatus,=20resetPeakStats=20for=20NPUPlugg=E2=80=A6?= =?UTF-8?q?=20Merge=20pull=20request=20!22644=20from=20liupeng303/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test_pluggable_allocator_extensions.py | 31 +++++++++++++++++-- .../pluggable_allocator_extensions.cpp | 13 ++++++++ torch_npu/csrc/npu/Module.cpp | 18 +++++++++++ torch_npu/csrc/npu/NPUPluggableAllocator.cpp | 26 +++++++++++++--- torch_npu/csrc/npu/NPUPluggableAllocator.h | 4 +++ 5 files changed, 86 insertions(+), 6 deletions(-) diff --git a/test/allocator/test_pluggable_allocator_extensions.py b/test/allocator/test_pluggable_allocator_extensions.py index 99cc499a93..54e270513d 100644 --- a/test/allocator/test_pluggable_allocator_extensions.py +++ b/test/allocator/test_pluggable_allocator_extensions.py @@ -2,6 +2,7 @@ import os import sys import shutil import subprocess +import ctypes import torch import torch.utils.cpp_extension @@ -27,6 +28,7 @@ def build_stub(base_dir): class TestPluggableAllocator(TestCase): module = None + new_alloc = None build_directory = "allocator/build" @classmethod @@ -59,9 +61,9 @@ class TestPluggableAllocator(TestCase): def test_pluggable_allocator(self): os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so') # Load the allocator - new_alloc = torch_npu.npu.memory.NPUPluggableAllocator(os_path, 'my_malloc', 'my_free') + TestPluggableAllocator.new_alloc = torch_npu.npu.memory.NPUPluggableAllocator(os_path, 'my_malloc', 'my_free') # Swap the current allocator - torch_npu.npu.memory.change_current_allocator(new_alloc) + torch_npu.npu.memory.change_current_allocator(TestPluggableAllocator.new_alloc) # This will allocate memory in the device using the new allocator self.assertFalse(self.module.check_custom_allocator_used()) npu_tensor = torch.zeros(10, device='npu') @@ -69,6 +71,31 @@ class TestPluggableAllocator(TestCase): self.assertRtolEqual(npu_tensor.cpu().numpy(), cpu_tensor.numpy()) self.assertTrue(self.module.check_custom_allocator_used()) + def test_set_get_device_stats_fn(self): + os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so') + myallocator = ctypes.CDLL(os_path) + get_device_stats_fn = ctypes.cast(getattr(myallocator, "my_get_device_stats"), ctypes.c_void_p).value + + msg = "get_device_stats_fn_ is not define, please set by set_get_device_stats_fn" + 
with self.assertRaisesRegex(RuntimeError, msg): + torch.npu.memory_stats_as_nested_dict() + + TestPluggableAllocator.new_alloc.allocator().set_get_device_stats_fn(get_device_stats_fn) + self.assertEqual(torch.npu.memory_stats_as_nested_dict()["num_alloc_retries"], 0) + + def test_set_reset_peak_status_fn(self): + os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so') + myallocator = ctypes.CDLL(os_path) + reset_peak_status_fn = ctypes.cast(getattr(myallocator, "my_reset_peak_status"), ctypes.c_void_p).value + + msg = "reset_peak_status_fn_ is not define, please set by set_reset_peak_status_fn" + with self.assertRaisesRegex(RuntimeError, msg): + torch.npu.reset_peak_memory_stats() + + TestPluggableAllocator.new_alloc.allocator().set_reset_peak_status_fn(reset_peak_status_fn) + torch.npu.reset_peak_memory_stats() + self.assertEqual(torch.npu.max_memory_allocated(), 0) + def test_pluggable_allocator_after_init(self): os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so') # Do an initial memory allocator diff --git a/test/cpp_extensions/pluggable_allocator_extensions.cpp b/test/cpp_extensions/pluggable_allocator_extensions.cpp index 3ed2606b02..6bb80e59dd 100644 --- a/test/cpp_extensions/pluggable_allocator_extensions.cpp +++ b/test/cpp_extensions/pluggable_allocator_extensions.cpp @@ -4,8 +4,10 @@ #include "third_party/acl/inc/acl/acl_base.h" #include "third_party/acl/inc/acl/acl_rt.h" +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" extern "C" { +using c10_npu::NPUCachingAllocator::DeviceStats; static bool useflag = false; void* my_malloc(ssize_t size, int device, aclrtStream stream) @@ -27,6 +29,17 @@ bool check_custom_allocator_used() { return useflag; } + +DeviceStats my_get_device_stats(int device) +{ + DeviceStats stats; + return stats; +} + +void my_reset_peak_status(int device) +{ + std::cout<<"resetPeakStatus success!"< func = reinterpret_cast(func_ptr); self.set_erase_stream_fn(func); + }) + .def( + "set_get_device_stats_fn", + [](torch::npu::NPUPluggableAllocator::NPUPluggableAllocator& self, + uint64_t func_ptr) { + using FuncType=c10_npu::NPUCachingAllocator::DeviceStats(int); + std::function func = + reinterpret_cast(func_ptr); + self.set_get_device_stats_fn(func); + }) + .def( + "set_reset_peak_status_fn", + [](torch::npu::NPUPluggableAllocator::NPUPluggableAllocator& self, + uint64_t func_ptr) { + using FuncType = void(int); + std::function func = + reinterpret_cast(func_ptr); + self.set_reset_peak_status_fn(func); }); m.def( diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp index e8e0fd3eef..ef07cf8bef 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp @@ -74,6 +74,18 @@ void NPUPluggableAllocator::set_erase_stream_fn( erase_stream_fn_ = std::move(erase_stream_fn); } +void NPUPluggableAllocator::set_get_device_stats_fn( + std::function get_device_stats_fn) +{ + get_device_stats_fn_ = std::move(get_device_stats_fn); +} + +void NPUPluggableAllocator::set_reset_peak_status_fn( + std::function reset_peak_status_fn) +{ + reset_peak_status_fn_ = std::move(reset_peak_status_fn); +} + void* NPUPluggableAllocator::malloc( size_t size, int device, @@ -212,8 +224,11 @@ void NPUPluggableAllocator::eraseStream( c10_npu::NPUCachingAllocator::DeviceStats NPUPluggableAllocator::getDeviceStats(int device) { - TORCH_NPU_WARN("NPUPluggableAllocator does not yet support getDeviceStats. 
" - "If you need it, please file an issue describing your use case."); + if (get_device_stats_fn_) { + return get_device_stats_fn_(device); + } else { + TORCH_CHECK(false, "get_device_stats_fn_ is not define, please set by set_get_device_stats_fn"); + } } void NPUPluggableAllocator::resetAccumulatedStats(int device) @@ -224,8 +239,11 @@ void NPUPluggableAllocator::resetAccumulatedStats(int device) void NPUPluggableAllocator::resetPeakStats(int device) { - TORCH_NPU_WARN("NPUPluggableAllocator does not yet support resetPeakStats. " - "If you need it, please file an issue describing your use case."); + if (reset_peak_status_fn_) { + reset_peak_status_fn_(device); + } else { + TORCH_CHECK(false, "reset_peak_status_fn_ is not define, please set by set_reset_peak_status_fn"); + } } c10_npu::NPUCachingAllocator::SnapshotInfo NPUPluggableAllocator::snapshot() diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.h b/torch_npu/csrc/npu/NPUPluggableAllocator.h index 3a71319f3c..04f1d909be 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.h +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.h @@ -45,6 +45,8 @@ struct NPUPluggableAllocator std::function record_stream_fn); void set_erase_stream_fn( std::function erase_stream_fn); + void set_get_device_stats_fn(std::function get_device_stats_fn); + void set_reset_peak_status_fn(std::function reset_peak_status_fn); void* malloc(size_t size, int device, aclrtStream stream); c10::DataPtr allocate(size_t size) override; @@ -108,6 +110,8 @@ protected: std::function base_alloc_fn_; std::function record_stream_fn_; std::function erase_stream_fn_; + std::function get_device_stats_fn_; + std::function reset_peak_status_fn_; std::mutex allocator_mutex_; // We do the bookeeping here in order to simplify custom allocators std::unordered_map allocation_metadata_; -- Gitee From b22e4b5080d3aaa22ca70fdbfd54fcd7c2456fcc Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 7 Jul 2025 06:24:17 +0000 Subject: [PATCH 209/328] !22729 Update op_plugin commit id Merge pull request !22729 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 954d098b04..72321907ac 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 954d098b0434e4594dcf008d4528057c2443b4a6 +Subproject commit 72321907accba073c2ebbfb9338fb19db61f41eb -- Gitee From 524af7e0ffe28302dbec4ee9a64477363bb69bd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=9C=E6=89=BF=E6=98=86?= Date: Mon, 7 Jul 2025 06:31:43 +0000 Subject: [PATCH 210/328] =?UTF-8?q?!22607=20=E3=80=90inductor=E3=80=91supp?= =?UTF-8?q?ort=202.7.1=20Merge=20pull=20request=20!22607=20from=20?= =?UTF-8?q?=E6=9D=9C=E6=89=BF=E6=98=86/inductor-271-new?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/npu/test_public_bindings.py | 32 + torch_npu/_inductor/__init__.py | 104 +- torch_npu/_inductor/codecache.py | 82 + torch_npu/_inductor/codegen/__init__.py | 35 + torch_npu/_inductor/codegen/_sizevars.py | 9 + torch_npu/_inductor/codegen/cpp_utils.py | 6 + torch_npu/_inductor/codegen/cpp_wrapper.py | 896 +++++++ torch_npu/_inductor/codegen/ir.py | 199 ++ torch_npu/_inductor/codegen/ir_fx.py | 864 +++++++ .../_inductor/codegen/kernel_analysis.py | 305 +++ .../_inductor/codegen/npu_kernel_features.py | 94 + torch_npu/_inductor/codegen/scheduling.py | 459 ++++ torch_npu/_inductor/codegen/split_tiling.py | 283 ++ torch_npu/_inductor/codegen/tile_generator.py | 242 
++ torch_npu/_inductor/codegen/triton.py | 1953 ++++++++++++++ torch_npu/_inductor/codegen/triton_utils.py | 26 + torch_npu/_inductor/codegen/wrapper.py | 246 ++ torch_npu/_inductor/config.py | 111 + torch_npu/_inductor/cpp_builder.py | 120 + torch_npu/_inductor/decomposition.py | 49 + torch_npu/_inductor/fx_passes/joint_graph.py | 15 + torch_npu/_inductor/graph.py | 114 + torch_npu/_inductor/ir.py | 58 + torch_npu/_inductor/lowering.py | 265 ++ torch_npu/_inductor/lowering_fx.py | 2291 +++++++++++++++++ torch_npu/_inductor/lowering_op_list.py | 107 + torch_npu/_inductor/npu_choices.py | 33 + torch_npu/_inductor/npu_device.py | 208 ++ .../_inductor/npu_fusion_attention_graph.py | 253 ++ torch_npu/_inductor/npu_triton_helpers.py | 22 + torch_npu/_inductor/npu_triton_heuristics.py | 1193 +++++++++ torch_npu/_inductor/runtime.py | 70 + torch_npu/_inductor/utils.py | 76 + torch_npu/utils/_dynamo_device.py | 21 +- 34 files changed, 10837 insertions(+), 4 deletions(-) create mode 100644 torch_npu/_inductor/codecache.py create mode 100644 torch_npu/_inductor/codegen/__init__.py create mode 100644 torch_npu/_inductor/codegen/_sizevars.py create mode 100644 torch_npu/_inductor/codegen/cpp_utils.py create mode 100644 torch_npu/_inductor/codegen/cpp_wrapper.py create mode 100644 torch_npu/_inductor/codegen/ir.py create mode 100644 torch_npu/_inductor/codegen/ir_fx.py create mode 100644 torch_npu/_inductor/codegen/kernel_analysis.py create mode 100644 torch_npu/_inductor/codegen/npu_kernel_features.py create mode 100644 torch_npu/_inductor/codegen/scheduling.py create mode 100644 torch_npu/_inductor/codegen/split_tiling.py create mode 100644 torch_npu/_inductor/codegen/tile_generator.py create mode 100644 torch_npu/_inductor/codegen/triton.py create mode 100644 torch_npu/_inductor/codegen/triton_utils.py create mode 100644 torch_npu/_inductor/codegen/wrapper.py create mode 100644 torch_npu/_inductor/config.py create mode 100644 torch_npu/_inductor/cpp_builder.py create mode 100644 torch_npu/_inductor/decomposition.py create mode 100644 torch_npu/_inductor/fx_passes/joint_graph.py create mode 100644 torch_npu/_inductor/graph.py create mode 100644 torch_npu/_inductor/ir.py create mode 100644 torch_npu/_inductor/lowering.py create mode 100644 torch_npu/_inductor/lowering_fx.py create mode 100644 torch_npu/_inductor/lowering_op_list.py create mode 100644 torch_npu/_inductor/npu_choices.py create mode 100644 torch_npu/_inductor/npu_device.py create mode 100644 torch_npu/_inductor/npu_fusion_attention_graph.py create mode 100644 torch_npu/_inductor/npu_triton_helpers.py create mode 100644 torch_npu/_inductor/npu_triton_heuristics.py create mode 100644 torch_npu/_inductor/runtime.py create mode 100644 torch_npu/_inductor/utils.py diff --git a/test/npu/test_public_bindings.py b/test/npu/test_public_bindings.py index 1049fc8d87..929441bc59 100644 --- a/test/npu/test_public_bindings.py +++ b/test/npu/test_public_bindings.py @@ -550,6 +550,38 @@ class TestPublicBindings(TestCase): "torch_npu.op_plugin.meta._meta_registrations", "torch_npu.dynamo.torchair._ge_concrete_graph.ge_converter.custom.npu_dequant_bias", "torch_npu.op_plugin.atb._atb_meta_registrations", + "torch_npu._inductor", + "torch_npu._inductor.codegen", + "torch_npu._inductor.config", + "torch_npu._inductor.decomposition", + "torch_npu._inductor.lowering", + "torch_npu._inductor.lowering_fx", + "torch_npu._inductor.npu_choices", + "torch_npu._inductor.npu_device", + "torch_npu._inductor.npu_fusion_attention_graph", + 
"torch_npu._inductor.npu_triton_helpers", + "torch_npu._inductor.npu_triton_heuristics", + "torch_npu._inductor.runtime", + "torch_npu._inductor.utils", + "torch_npu._inductor.codegen._sizevars", + "torch_npu._inductor.codegen.cpp_wrapper", + "torch_npu._inductor.codegen.ir", + "torch_npu._inductor.codegen.ir_fx", + "torch_npu._inductor.codegen.kernel_analysis", + "torch_npu._inductor.codegen.npu_kernel_features", + "torch_npu._inductor.codegen.scheduling", + "torch_npu._inductor.codegen.split_tiling", + "torch_npu._inductor.codegen.tile_generator", + "torch_npu._inductor.codegen.triton", + "torch_npu._inductor.codegen.triton_utils", + "torch_npu._inductor.codegen.cpp_utils", + "torch_npu._inductor.codegen.wrapper", + "torch_npu._inductor.codecache", + "torch_npu._inductor.cpp_builder", + "torch_npu._inductor.fx_passes.joint_graph", + "torch_npu._inductor.ir", + "torch_npu._inductor.graph", + "torch_npu._inductor.lowering_op_list", } # No new entries should be added to this list. diff --git a/torch_npu/_inductor/__init__.py b/torch_npu/_inductor/__init__.py index 0aba17d108..04ef05321d 100644 --- a/torch_npu/_inductor/__init__.py +++ b/torch_npu/_inductor/__init__.py @@ -1 +1,103 @@ -import os \ No newline at end of file +import os + +import torch +from torch._dynamo.device_interface import register_interface_for_device, get_interface_for_device +from torch._inductor import lowering as inductor_lowering +from torch._inductor.choices import InductorChoices +from torch._inductor.codegen.common import register_backend_for_device, register_device_op_overrides +from torch._inductor.runtime import autotune_cache +from torch_npu.npu import device_count +from torch_npu.utils._dynamo_device import NpuInterface, current_device, set_device +from torch_npu.utils._inductor import NPUDeviceOpOverrides + +from . import config as npu_config +from . 
import codegen +from .npu_fusion_attention_graph import register_fa_pass +from .config import aggresive_autotune, num_vector_core, set_compile_threads +from .config import log as npulog +from .decomposition import _register_npu_inductor_decompositons +from .lowering import make_reduction, npu_make_fallback +from .npu_choices import should_use_persistent_reduction +from .npu_device import NewNPUDeviceOpOverrides +from .runtime import _load_cached_autotuning +from .utils import get_current_raw_stream, patch_is_gpu, patch_has_triton + +set_compile_threads() + + +def _inductor_register_backend_for_device(): + from .codegen.scheduling import NPUTritonScheduling + from .codegen.wrapper import NPUWrapperCodeGen + from .codegen.cpp_wrapper import CppWrapperNpu + register_backend_for_device('npu', NPUTritonScheduling, NPUWrapperCodeGen, CppWrapperNpu) + + +_inductor_register_backend_for_device() + + +def _inductor_register_device_op_overrides(): + register_device_op_overrides('npu', NewNPUDeviceOpOverrides()) + + +_inductor_register_device_op_overrides() + +device = get_interface_for_device("npu") + +inductor_lowering.make_reduction = make_reduction +inductor_lowering.make_fallback = npu_make_fallback + + +def patch_torch_for_aoti(): + from .graph import patch_codegen_with_cpp_wrapper + from .cpp_builder import patch_get_cpp_torch_device_options + from .codegen.cpp_utils import patch_device_to_aten + from .utils import patch_is_same_tensor + from .fx_passes.joint_graph import patch_constant_fold_uniform_value + from .ir import patch_fallback_kernel_codegen + from .codecache import patch_aot_code_compiler_compile + patch_codegen_with_cpp_wrapper() + patch_get_cpp_torch_device_options() + patch_device_to_aten() + patch_is_same_tensor() + patch_constant_fold_uniform_value() + patch_fallback_kernel_codegen() + patch_aot_code_compiler_compile() + + +if os.environ.get("DISABLE_AOTI_PATCH", "0") != "1": + patch_torch_for_aoti() + + +if npu_config.dump_fx_graph: + from .codegen.ir_fx import _patch_npu_inductor_ir + + _patch_npu_inductor_ir() + +if npu_config.dump_fx_graph: + from .lowering_fx import _register_npu_inductor_fallbacks +else: + from .lowering import _register_npu_inductor_fallbacks + +_register_npu_inductor_fallbacks() +_register_npu_inductor_decompositons() + + +# register fx_pass should be put behind of _register_npu_inductor_decompositons +def _replace_benchmark_all_configs(): + from torch._inductor.triton_heuristics import CachingAutotuner + from .npu_triton_heuristics import benchmark_all_configs + CachingAutotuner.benchmark_all_configs = benchmark_all_configs + + +if (aggresive_autotune): + _replace_benchmark_all_configs() + import os + + os.environ["TRITON_BENCH_METHOD"] = "npu" + +InductorChoices.should_use_persistent_reduction = should_use_persistent_reduction +autotune_cache._load_cached_autotuning = _load_cached_autotuning + +register_fa_pass() +patch_is_gpu() +patch_has_triton() diff --git a/torch_npu/_inductor/codecache.py b/torch_npu/_inductor/codecache.py new file mode 100644 index 0000000000..9cd9dac4f0 --- /dev/null +++ b/torch_npu/_inductor/codecache.py @@ -0,0 +1,82 @@ +import os +import contextlib +from typing import ( + Any, + Callable, + cast, + Dict, + Generator, + List, + NoReturn, + Optional, + Sequence, + Tuple, + TYPE_CHECKING, + TypeVar, + Union, +) + +import torch +from torch._inductor import config +from torch._inductor.codecache import get_lock_dir, LOCK_TIMEOUT +from torch._inductor.graph import GraphLowering + +from torch_npu.utils._error_code import ErrCode, 
pta_error + +empty_json = "{}" + + +@contextlib.contextmanager +def lock_context(key): + from filelock import FileLock + lock_dir = get_lock_dir() + lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) + with lock: + yield + + +def patch_aot_code_compiler_compile(): + # In v2.6.0, aoti has bug when init oss_proxy_executor with default op_json, + # which could not be skipped, so here we try to create a new npu op_json, + # and clear the content of default op_json. + from torch._inductor.codecache import AotCodeCompiler + AotCodeCompiler.src_compile = AotCodeCompiler.compile + + @classmethod + def compile_npu( + cls, + graph: GraphLowering, + source_code: str, + serialized_extern_kernel_nodes: Optional[str], + device_type: str, + additional_files: List[str], + ) -> Union[List[str], str]: + result = cls.src_compile( + graph, source_code, serialized_extern_kernel_nodes, + device_type, additional_files + ) + generated_files = additional_files + if not config.aot_inductor.package: + return result + + output_so = [r for r in result if r.endswith(".so")] + if len(output_so) > 1: + raise RuntimeError(f"Could not generate npu op json, because there are" + f"more than one so in generated files: {result}" + pta_error(ErrCode.INTERNAL)) + output_so = output_so[0] + key = os.path.basename(output_so)[0].replace(".", "_") + dir_basename = os.path.splitext(output_so)[0] + with lock_context(key): + if serialized_extern_kernel_nodes: + extern_kernel_nodes_json = dir_basename + "_npu.json" + with open(extern_kernel_nodes_json, "w") as f: + f.write(serialized_extern_kernel_nodes) + generated_files.append(extern_kernel_nodes_json) + + if serialized_extern_kernel_nodes: + source_json_file = dir_basename + ".json" + with open(source_json_file, "w") as f: + f.write(empty_json) + return generated_files + AotCodeCompiler.compile = compile_npu + \ No newline at end of file diff --git a/torch_npu/_inductor/codegen/__init__.py b/torch_npu/_inductor/codegen/__init__.py new file mode 100644 index 0000000000..9ec8bf6a95 --- /dev/null +++ b/torch_npu/_inductor/codegen/__init__.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. 
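As a usage illustration of the backend registration performed in torch_npu/_inductor/__init__.py above, a minimal sketch: the device string "npu" and backend name "inductor" follow the registrations shown in the diff, while the toy function, shapes, and the explicit torch_npu._inductor import are assumptions (the import may already happen as a side effect of importing torch_npu, depending on the build):

import torch
import torch_npu
import torch_npu._inductor  # registers NPUTritonScheduling / NPUWrapperCodeGen / CppWrapperNpu for device "npu"

def f(a, b):
    # toy graph; it is lowered through the NPU Triton codegen registered above
    return torch.nn.functional.gelu(a @ b)

compiled = torch.compile(f, backend="inductor")
a = torch.randn(64, 64, device="npu")
b = torch.randn(64, 64, device="npu")
out = compiled(a, b)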
+ + +from torch._inductor import sizevars +from torch._inductor.codegen.simd import SIMDKernel +from torch._inductor.codegen.triton import TritonKernel +from torch._inductor.codegen.triton import TritonScheduling +from torch._inductor.ir import Reduction, LoopBody +from torch_npu._inductor.codegen._sizevars import simplify +from torch_npu._inductor.codegen.ir import (num_splits, loopbody__call__, transform_dims_in_indexing, + substituted_dims_in_indexing) +from torch_npu._inductor.codegen.scheduling import create_tiling +from torch_npu._inductor.codegen.triton import group_fn, select_index_dtype +from torch_npu._inductor.codegen.triton import is_compatible + +from ..config import log as npulog + + +Reduction.num_splits = num_splits +setattr(LoopBody, 'transform_dims_in_indexing', transform_dims_in_indexing) +setattr(LoopBody, 'substituted_dims_in_indexing', substituted_dims_in_indexing) + +LoopBody.__call__ = loopbody__call__ +# need to enable this to speedup attn_cp_test +# triton scheduling +TritonScheduling.group_fn = group_fn +TritonScheduling.select_index_dtype = select_index_dtype +TritonScheduling.create_tiling = create_tiling +# triton kernel +setattr(SIMDKernel, 'is_compatible', is_compatible) + +# util +sizevars.SizeVarAllocator.simplify = simplify diff --git a/torch_npu/_inductor/codegen/_sizevars.py b/torch_npu/_inductor/codegen/_sizevars.py new file mode 100644 index 0000000000..f294742050 --- /dev/null +++ b/torch_npu/_inductor/codegen/_sizevars.py @@ -0,0 +1,9 @@ +import sympy +from sympy import Expr +from torch._inductor.utils import sympy_subs + + +def simplify(self, expr: Expr): + if isinstance(expr, (tuple, list)): + return [sympy.expand(s).xreplace(self.replacements) for s in expr] + return sympy.expand(expr).xreplace(self.replacements) diff --git a/torch_npu/_inductor/codegen/cpp_utils.py b/torch_npu/_inductor/codegen/cpp_utils.py new file mode 100644 index 0000000000..7a9b0887e7 --- /dev/null +++ b/torch_npu/_inductor/codegen/cpp_utils.py @@ -0,0 +1,6 @@ +import torch_npu + + +def patch_device_to_aten(): + from torch._inductor import codegen + codegen.cpp_utils.DEVICE_TO_ATEN["npu"] = "at::kPrivateUse1" diff --git a/torch_npu/_inductor/codegen/cpp_wrapper.py b/torch_npu/_inductor/codegen/cpp_wrapper.py new file mode 100644 index 0000000000..9a16cfabf8 --- /dev/null +++ b/torch_npu/_inductor/codegen/cpp_wrapper.py @@ -0,0 +1,896 @@ +import functools +import os +import sys +from itertools import chain, count, zip_longest +from typing import Any, Callable, List, Optional, Tuple, TYPE_CHECKING, Union +import sympy +import torch +from torch import dtype as torch_dtype +from torch._inductor import config +from torch._inductor.codecache import CudaKernelParamCache +from torch._inductor.codecache import get_cpp_wrapper_cubin_path_name +from torch._inductor.codegen.aoti_hipify_utils import maybe_hipify_code_wrapper +from torch._inductor.codegen.common import get_device_op_overrides +from torch._inductor.codegen.cpp_utils import cexpr, DTYPE_TO_CPP, DEVICE_TO_ATEN +from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu +from torch._inductor.codegen.multi_kernel import MultiKernelCall +from torch._inductor.codegen.wrapper import PythonWrapperCodegen, SymbolicCallArg +from torch._inductor.ir import IRNode, TensorBox +from torch._inductor.runtime.runtime_utils import dynamo_timed +from torch._inductor.utils import DeferredLineBase +from torch._inductor.virtualized import V +from torch._inductor.utils import _align, ALIGN_BYTES + +from .. 
import config as npu_config +from ..config import npu_block as NPU_ALIGN_BYTES + +if TYPE_CHECKING: + from torch._inductor.graph import GraphLowering + + +def checkIfTrue(value, msg): + if not value: + raise RuntimeError(msg) + return True + + +class DeferredNpuKernelLine(DeferredLineBase): + """ + When using cpp wrapper, NPU kernel load and launch needs to wait for Triton kernels + to be tuned and stored as cubin files, so use a deferred line to backfill those information + """ + + def __init__( + self, + kernel_name: str, + line_template: str, + keys: Tuple[str, ...], + additional_files: List[str], + ): + super().__init__(line_template) + checkIfTrue(not isinstance(line_template, DeferredLineBase), "line template can not be DeferredLineBase") + self.additional_files = additional_files + self.kernel_name = kernel_name + self.line_template = line_template + self.keys = keys + + def __call__(self): + if self.kernel_name.startswith("multi_kernel_"): + # MultiKernel will select one kernel after running the autotune block + self.kernel_name = MultiKernelCall.lookup_choice(self.kernel_name) + params = CudaKernelParamCache.get(self.kernel_name) + checkIfTrue(params is not None, f"{self.kernel_name} not found in CudaKernelParamCache") + + for key in self.keys: + checkIfTrue(key in params, f"{key} not found in CudaKernelParamCache[{self.kernel_name}]") + + if key == get_cpp_wrapper_cubin_path_name(): + checkIfTrue(os.path.exists(params[key]), f"{params[key]} does not exist") + self.additional_files.append(params[key]) + + return self.line_template % tuple(params[key] for key in self.keys) + + def _new_line(self, line): + return DeferredNpuKernelLine( + self.kernel_name, line, self.keys, self.additional_files + ) + + +class DeferredNpuDefaultGrid: + """ + A container for the default grid, which may be used by DeferredNpuGridLine + """ + + def __init__( + self, + kernel_name: str, + grid, + grid_callable: Optional[Callable[..., Any]] = None, + **grid_extra_kwargs, + ): + self.kernel_name = kernel_name + self.grid = grid + self.grid_callable = grid_callable + self.grid_extra_kwargs = grid_extra_kwargs + + def __iter__(self): + # DeferredNpuDefaultGrid can be passed to the base class, PythonWrapperCodegen, + # to generate the autotune code block, and thus we need this iterator + return iter(self.grid) + + def _process_grid(self, grid: Union[List[Any], Tuple[Any, ...]]): + if isinstance(grid, (list, tuple)): + return [self._process_grid(e) for e in grid] + else: + return grid.inner_expr if isinstance(grid, SymbolicCallArg) else grid + + def __call__(self): + if self.kernel_name.startswith("multi_kernel_"): + # MultiKernel will select one kernel after running the autotune block + self.kernel_name = MultiKernelCall.lookup_choice(self.kernel_name) + + grid = self.grid + checkIfTrue(isinstance(grid, (list, tuple)), f"expected {grid=} to be a list") + + grid = self._process_grid(grid) + + checkIfTrue(self.grid_callable is not None, "grid_callable can't be None") + + if not self.grid_extra_kwargs: + grid_fn = self.grid_callable(*grid) + else: + grid_fn = self.grid_callable(*grid, **self.grid_extra_kwargs) + + params = CudaKernelParamCache.get(self.kernel_name) + checkIfTrue(params is not None, f"{self.kernel_name} not found in CudaKernelParamCache") + + return grid_fn(params["meta"]) + + +class DeferredNpuGridLine(DeferredLineBase): + """ + When using cpp wrapper, NPU kernel load and launch needs to wait for Triton kernels + to be tuned and stored as cubin files, so use a deferred line to backfill those 
information + """ + + def __init__( + self, + kernel_name: str, + grid_var: str, + grid, + autotune_configs, + ): + super().__init__("") + self.kernel_name = kernel_name + self.grid_var = grid_var + self.grid = grid + self.autotune_configs = autotune_configs + + def __call__(self): + if self.kernel_name.startswith("multi_kernel_"): + # MultiKernel will select one kernel after running the autotune block + self.kernel_name = MultiKernelCall.lookup_choice(self.kernel_name) + + params = CudaKernelParamCache.get(self.kernel_name) + + checkIfTrue(params is not None, f"{self.kernel_name} not found in CudaKernelParamCache") + + if self.autotune_configs is not None: + # This indicates the Triton kernel is a user-defined one. + grid = None + if len(self.grid) == 1: + grid = self.grid[0] + else: + for i, c in enumerate(self.autotune_configs): + if all(arg == params["meta"][key] for key, arg in c.kwargs.items()): + grid = self.grid[i] + break + checkIfTrue(grid is not None, "grid can not be None") + grid_args_str = ", ".join( + [cexpr(V.graph.sizevars.simplify(item)) for item in grid] + ) + else: + launch_grid = (params['grid_x'], params['grid_y'], params['grid_z']) + grid_args_str = ", ".join( + [cexpr(item) for item in launch_grid] + ) + + return f"\n Grid {self.grid_var} = Grid({grid_args_str});\n" + + def _new_line(self, line): + return DeferredNpuGridLine( + self.kernel_name, self.grid_var, self.grid, self.autotune_configs + ) + + +class CppWrapperNpu(CppWrapperCpu): + """ + Generates cpp wrapper for running on NPU and calls CUDA kernels + """ + + def __init__(self) -> None: + self.device = 'npu' + self.device_codegen = get_device_op_overrides(self.device) + super().__init__() + self.grid_id = count() + self.visited_raii_handle = set() + self.visited_handle_for_kernel_id = dict() + + @staticmethod + def create( + is_subgraph: bool, subgraph_name: str, parent_wrapper: PythonWrapperCodegen + ): + # comment at CppWrapperCpu `codegen_subgraph` function. + return CppWrapperNpu() + + def super_write_header_rewrite(self): + """Copied from CppWrapperCpu to: + (1) change __file__ path for cpython, so that we can use aoti_runtime in current path. + (2) rewrite include path of aoti header file. + """ + if V.graph.is_const_graph: + # We do not write header for constant graph, it will be written by main module. + return + + if V.graph.aot_mode: + self.header.splice( + """ + #include + #include + """ + ) + with open( + os.path.join(os.path.dirname(__file__), "aoti_runtime", "interface.cpp") + ) as f: + self.header.splice(f.read()) + else: + self.header.splice( + """ + import torch + from torch._inductor.codecache import CppWrapperCodeCache + + cpp_wrapper_src = ( + ''' + #include + namespace py = pybind11; + + class RAIIPyObject { + public: + RAIIPyObject() : obj_(nullptr) {} + RAIIPyObject(PyObject* obj) : obj_(obj) {} + ~RAIIPyObject() { + Py_XDECREF(obj_); + } + RAIIPyObject& operator=(const RAIIPyObject& other) { + if (this != &other) { + Py_XDECREF(obj_); + obj_ = other.obj_; + Py_XINCREF(obj_); + } + return *this; + } + operator PyObject*() { + return obj_; + } + PyObject* get() { + return obj_; + } + private: + PyObject* obj_; + }; + + #include + #include + using namespace torch::aot_inductor; + """ + ) + + self.header.splice( + f""" + #include + #include + #include + // Here comment c_shim_npu.h because npu doesn't implement it. 
+ // #include + + #include + typedef at::Half half; + typedef at::BFloat16 bfloat16; + + // Round up to the nearest multiple of {ALIGN_BYTES} + [[maybe_unused]] static int64_t align(int64_t nbytes) {{ + return (nbytes + {ALIGN_BYTES} - 1) & -{ALIGN_BYTES}; + }} + """ + ) + extend_aoti_c_shim_include = ( + f"torch/csrc/inductor/aoti_torch/generated/extend/c_shim_{self.device}.h" + ) + extend_aoti_c_shim_path = os.path.join( + os.path.dirname(torch.__file__), + "include", + extend_aoti_c_shim_include, + ) + if os.path.exists(extend_aoti_c_shim_path): + self.header.splice(f"#include <{extend_aoti_c_shim_include}>") + + enable_kernel_profile = config.cpp.enable_kernel_profile and sys.platform in [ + "linux", + "win32", + ] + if config.profiler_mark_wrapper_call or enable_kernel_profile: + # No C shim for profiling APIs, assuming profiling is a debugging feature which + # does not provide any ABI compatibility promise. + self.header.splice("#include ") + + def write_header(self): + if V.graph.is_const_graph: + # We do not write header for constant graph, it will be written by main module. + return + + self.super_write_header_rewrite() + self.header.splice("#include ") + self.header.splice("#include ") + self.header.splice(self.device_codegen.abi_compatible_header()) + self.header.splice( + maybe_hipify_code_wrapper(self.device_codegen.kernel_driver()) + ) + self.header.splice("#include ") + self.header.splice("#include ") + if npu_config.aot_inductor.debug_kernel: + self.header.splice("#include ") + + def write_get_raw_stream(self, device_idx: int, graph=None) -> str: + name = f"stream{device_idx}" + self.writeline( + maybe_hipify_code_wrapper( + f"{self.device_codegen.cpp_stream_type()} {name};" + ) + ) + self.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK({self.device_codegen.aoti_get_stream()}({device_idx}, (void**)&{name}));" + ) + return name + + def codegen_inputs(self): + # See Note: [Input Alignment handling in Inductor] + # + # JIT Inductor does not guard on input alignment. It relies on copy_misaligned_inputs to + # copy misaligned inputs to aligned buffers. For AOTInductor, we expect users to use it + # as non-Python deployment for its best performance, so implicitly copying misaligned inputs + # to aligned buffers is going to bring a surprising performance hit. Instead, we check input + # alignment and throw an error if any input is misaligned. 
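The guard generated below reduces to a power-of-two alignment test on each input's data pointer. A small sketch of the same check in plain Python; NPU_ALIGN_BYTES is imported as npu_block from torch_npu._inductor.config in this file, and the 512 used here is only an assumed example value:

ALIGN = 512  # assumed stand-in for torch_npu._inductor.config.npu_block

def check_aligned(t, name):
    ptr = t.data_ptr()
    # for a power-of-two ALIGN, `ptr & (ALIGN - 1) == 0` is the same test as `ptr % ALIGN == 0`;
    # the generated C++ uses the bitmask form shown in the splice below
    if ptr & (ALIGN - 1) != 0:
        raise RuntimeError(f"{name} is not aligned to {ALIGN} bytes")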
+ if V.graph.aot_mode and V.graph.inputs_to_check: + for idx in V.graph.inputs_to_check: + input_name = V.graph.graph_input_names[idx] + checkIfTrue(input_name in V.graph.graph_inputs, f"{input_name} not found in graph inputs") + + value = V.graph.graph_inputs[input_name] + checkIfTrue(isinstance(value, TensorBox), + f"{input_name} is expected to be tensor but found as {type(value)}") + + self.prefix.splice( + f""" + if ((long({input_name}.data_ptr()) & ({NPU_ALIGN_BYTES} -1)) != 0) {{ + throw std::runtime_error("{input_name} is not aligned to {NPU_ALIGN_BYTES} bytes"); + }} + """ + ) + + super().codegen_inputs() + + def define_kernel( + self, + kernel_name: str, + kernel_body: str, + metadata: Optional[str] = None, + gpu=True, + ): + if gpu: + if config.triton.autotune_at_compile_time: + # Call PythonWrapperCodegen to create the autotune code block + PythonWrapperCodegen.define_kernel( + self, kernel_name, kernel_body, metadata, gpu + ) + else: + return CppWrapperCpu.define_kernel( + self, kernel_name, kernel_body, metadata, gpu + ) + + def generate(self, is_inference): + with dynamo_timed("CppWrapperNpu.generate", log_pt2_compile_event=True): + self.prefix.writeline("\n") + if not V.graph.aot_mode: + for kernel in chain( + sorted(self.src_to_kernel.values()), + sorted( + [entry[0] for entry in self.user_defined_kernel_cache.values()] + ), + ): + self.prefix.writeline( + maybe_hipify_code_wrapper( + f"static {self.device_codegen.cpp_kernel_type()} {kernel} = nullptr;" + ) + ) + self.prefix.writeline("\n") + return super().generate(is_inference) + + def generate_user_defined_triton_kernel( + self, + kernel_name: str, + raw_args: List[Any], + grid: List[Any], + configs, + triton_meta, + constexprs, + ): + if ( + config.triton.autotune_at_compile_time + and kernel_name not in self.kernel_autotune_names + ): + # Call PythonWrapperCodegen to create the autotune code block + PythonWrapperCodegen.generate_user_defined_triton_kernel( + self, + kernel_name, + raw_args, + grid, + configs, + triton_meta, + constexprs, + ) + + # in C++ wrapper, we don't pass constexpr args, as they don't + # get added as parameters to the PTX code compiled from the + # user-defined Triton kernel (only non-constexpr args do) + raw_args = [raw_arg for i, raw_arg in enumerate(raw_args) if i not in constexprs] + args = [self.val_to_arg_str(v) for v in raw_args] + arg_types = [ + arg.get_dtype() if isinstance(arg, IRNode) else type(arg) + for arg in raw_args + ] + + # Call self.generate_kernel_call to generate the real kernel call in cpp + self.generate_kernel_call( + kernel_name, + args, + arg_types=arg_types, + raw_args=raw_args, + grid=grid, + gpu=True, + triton=True, + triton_meta=triton_meta, + autotune_configs=configs, + ) + + @functools.lru_cache(None) # noqa: B019 + def generate_load_kernel_once( + self, + kernel_name: str, + device_index, + graph: "GraphLowering", # for per-graph caching + ): + keys = (get_cpp_wrapper_cubin_path_name(), "mangled_name", "shared_mem") + kernel_var_name = f"kernels.{kernel_name}" if V.graph.aot_mode else kernel_name + self.writeline(f"if ({kernel_var_name} == nullptr) {{") + deferred_gpu_kernel_line = DeferredNpuKernelLine( + kernel_name, + " " + kernel_var_name + r' = loadKernel("%s", "%s", %s, this->cubin_dir_);', + keys, + self.additional_files, + ) + self.writeline(deferred_gpu_kernel_line) + self.writeline("}") + return kernel_var_name + + def codegen_tensor_item_npu( + self, dtype: torch.dtype, tensor: str, scalar: str, indented_buffer=None + ): + dtype_str = 
str(dtype).split(".")[-1] + writer = indented_buffer or self + + if dtype == torch.float16 or dtype == torch.bfloat16: + scalar_tmp = f"{scalar}_tmp" + writer.writeline(f"{DTYPE_TO_CPP[dtype]} {scalar_tmp};") + writer.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_item_{dtype_str}({tensor}, &{scalar_tmp}));" + ) + writer.writeline(f"float {scalar} = float({scalar_tmp});") + struct_data = f'float {scalar} __attribute__((aligned(4)));' + arg_data = f'static_cast({scalar})' + else: + writer.writeline(f"{DTYPE_TO_CPP[dtype]} {scalar};") + writer.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_item_{dtype_str}({tensor}, &{scalar}));" + ) + struct_data = f'{DTYPE_TO_CPP[dtype]} {scalar} __attribute__((aligned(sizeof({DTYPE_TO_CPP[dtype]} ))));' + arg_data = f'static_cast<{DTYPE_TO_CPP[dtype]}>({scalar})' + + return struct_data, arg_data + + def codegen_device(self, device): + if device.type not in DEVICE_TO_ATEN: + raise RuntimeError(device.type + "not found in DEVICE_TO_ATEN") + device_str = DEVICE_TO_ATEN[device.type][5:].lower() # remove "at::k" + if device_str == "privateuse1": + device_str = "npu" + self.used_cached_devices.add(device_str) + return f"cached_torch_device_type_{device_str}, {device.index if device.index else 0}" + + def write_wrapper_decl(self): + super().write_wrapper_decl() + with self.prefix.indent(): + if not V.graph.aot_mode: + return + dump_path = npu_config.aot_inductor.dump_path_cpp + if npu_config.aot_inductor.debug_kernel: + self.prefix.splice( + f""" + auto dump_path = std::filesystem::current_path() / "{dump_path}"; + if (!std::filesystem::exists(dump_path)) {{ + std::filesystem::create_directory(dump_path); + }} + """ + ) + + self.prefix.splice( + """ + auto tensor_handle_to_tensor_pointer = [](AtenTensorHandle handle) { + return reinterpret_cast(handle); + }; + """ + ) + + def generate_debug_str(self, args, kernel_name, kernel_id, mark): + if not npu_config.aot_inductor.debug_kernel: + return "" + if kernel_id not in self.visited_handle_for_kernel_id: + self.visited_handle_for_kernel_id[kernel_id] = set() + + def get_tensor_from_handle(h, t): + if h in self.visited_handle_for_kernel_id[kernel_id]: + return "" + self.visited_handle_for_kernel_id[kernel_id].add(h) + return f" auto {t} = *tensor_handle_to_tensor_pointer({h});\n" + + # Only dump tensor args, e.g, ['buf2', '8L', '4L'] => ['buf2'] + tensor_args = [arg for arg in args if not arg[0].isdigit()] + + tensor_args_h = [f"{arg}_h" for arg in tensor_args] + tensor_args_t = [f"{arg}_t" for arg in tensor_args] + handle_tensor_str = "".join([ + get_tensor_from_handle(h, t) for h, t in zip(tensor_args_h, tensor_args_t) + ]) + + dump_path = npu_config.aot_inductor.dump_path_cpp + return f""" + c10_npu::npuSynchronizeDevice(); + \n{handle_tensor_str} + std::vector arg_{mark}{{{", ".join(tensor_args_t)}}}; + torch::save(arg_{mark}, "{dump_path}/{kernel_id}_{kernel_name}_{mark}.pt"); + """ + + def generate_launch_call( + self, + call_args, + arg_types, + arg_signatures, + kernel_id, + grid_var, + kernel_name + ): + kernel_val_name = f"kernels.{kernel_name}" if V.graph.aot_mode else kernel_name + new_args: list[str] = [] + + # Add more cases for other types as needed + signature2dtype = { + "i1": "int32_t", + "i8": "int8_t", + "i16": "int16_t", + "i32": "int32_t", + "i64": "int64_t", + "u32": "uint32_t", + "u64": "uint64_t", + "fp16": "float", + "bf16": "float", + "fp32": "float", + "f32": "float", + "fp64": "double", + } + + + struct_def_body = '' + struct_arg_body = '' + + def process_args(arg, arg_type, 
arg_signature=None): + var_name = f"var_{next(self.arg_var_id)}" + # ignore nvTmaDesc, as host-side TMA descriptors need + # to be passed to the compiled Triton kernel by value + if isinstance(arg_type, torch_dtype) and arg_signature != "nvTmaDesc": + if arg.endswith(".item()"): # scalar + # Need to declare a scalar in this case + arg = arg[:-7] + struct_data, arg_data = self.codegen_tensor_item_npu( + arg_type, + arg, + var_name, + ) + else: + # void* + device_ptr_type = self.device_codegen.cpp_device_ptr() + self.writeline( + maybe_hipify_code_wrapper( + f"{device_ptr_type} {var_name} = reinterpret_cast<{device_ptr_type}>({arg}.data_ptr());" + ) + ) + if npu_config.aot_inductor.debug_kernel: + if arg not in self.visited_raii_handle: + self.writeline( + f"AtenTensorHandle {arg}_h = {arg}.get();" + ) + self.visited_raii_handle.add(arg) + struct_data = f'void* {var_name} __attribute__((aligned(8)));' + arg_data = f'static_cast({var_name})' + + elif arg_type in (sympy.Integer, int): + # int + self.writeline(f"int {var_name} = {cexpr(arg)};") + struct_data = f'int {var_name} __attribute__((aligned(4)));' + arg_data = f'static_cast({var_name})' + + elif arg_type in (sympy.Float, float): + # float + self.writeline(f"float {var_name} = {cexpr(arg)};") + struct_data = f'float {var_name} __attribute__((aligned(4)));' + arg_data = f'static_cast({var_name})' + + # For symbolic call arguments, examine the arg signatures from triton meta + # to explicitly cast to the right type + # Reason: `auto` can infer unexpected type against kernel input signature. + elif ( + isinstance(arg_type, type(SymbolicCallArg)) + and arg_signature is not None + and arg_signature in signature2dtype.keys() + ): + # or scalar symbolic type,currently only support scalar symbolic type + self.writeline( + f"{signature2dtype[arg_signature]} {var_name} = {cexpr(arg)};" + ) + struct_data = f'{signature2dtype[arg_signature]} {var_name} __attribute__((aligned(sizeof({signature2dtype[arg_signature]}))));' + arg_data = f'static_cast<{signature2dtype[arg_signature]}>({var_name})' + else: + raise TypeError("Infer arg_type to cpp failed!") + + nonlocal struct_def_body + nonlocal struct_arg_body + struct_def_body += struct_data + ' ' + struct_arg_body += arg_data + ', ' + + for arg, arg_type, arg_signature in zip_longest( + call_args, arg_types, arg_signatures + ): + process_args(arg, arg_type, arg_signature) + + debug_str_before_kernel = self.generate_debug_str(call_args, kernel_name, kernel_id, "before") + debug_str_after_kernel = self.generate_debug_str(call_args, kernel_name, kernel_id, "after") + + launch_str = f""" + auto launch_call_{kernel_id} = [=]() {{ + int32_t grid_x = {grid_var}.grid_x; + int32_t grid_y = {grid_var}.grid_y; + int32_t grid_z = {grid_var}.grid_z; + rtError_t ret; + void* ffts_addr = NULL; + uint32_t ffts_len; + ret = rtGetC2cCtrlAddr((uint64_t*)&ffts_addr, &ffts_len); + if (ret != RT_ERROR_NONE) return ret; + void* workspace_addr = NULL; + + struct __attribute__((packed)) {{ + void* ffts_addr __attribute__((aligned(8))); + void* workspace_addr __attribute__((aligned(8))); + {struct_def_body} + int32_t grid_x __attribute__((aligned(4))); + int32_t grid_y __attribute__((aligned(4))); + int32_t grid_z __attribute__((aligned(4))); + }} kernel_args = {{ + static_cast(ffts_addr), + static_cast(workspace_addr), + {struct_arg_body} + static_cast(grid_x), + static_cast(grid_y), + static_cast(grid_z) + }}; + + uint32_t block_num = grid_x * grid_y * grid_z; + auto arg_ptr = static_cast(&kernel_args); + auto arg_size = 
sizeof(kernel_args); + {debug_str_before_kernel} + ret = rtKernelLaunch({kernel_val_name}, block_num, arg_ptr, arg_size, NULL, stream); + {debug_str_after_kernel} + if (ret != RT_ERROR_NONE) return ret; + return ret; + }}; + """ + return f"launch_call_{kernel_id}", launch_str + + def generate_default_grid( + self, + kernel_name: str, + grid_args: List[Any], + gpu: bool = True, + grid_callable: Optional[Callable[..., Any]] = None, + **grid_extra_kwargs, + ): + """ + Generate grid configs for launching a CUDA kernel using the grid + function from triton_heuristics. Because its computation needs + to read kernel config after autotune, it is done in a deferred way + using DeferredNpuDefaultGrid. + """ + checkIfTrue(gpu, "CppWrapperNpu.generate_default_grid does not support non-NPU") + return DeferredNpuDefaultGrid( + kernel_name, grid_args, grid_callable, **grid_extra_kwargs + ) + + def generate_kernel_call_npu( + self, + kernel_name: str, + call_args, + grid=None, + device_index=None, + npu=True, + triton=True, + arg_types=None, + raw_args=None, + grid_fn: str = "grid", + triton_meta=None, + autotune_configs=None, + grid_extra_kwargs="", + ): + if ( + config.triton.autotune_at_compile_time + and kernel_name not in self.kernel_autotune_names + ): + # Call PythonWrapperCodegen to create the autotune code block + PythonWrapperCodegen.generate_kernel_call( + self, + kernel_name, + call_args, + grid, + device_index, + npu, + triton, + arg_types, + raw_args, + grid_fn, + triton_meta, + autotune_configs, + grid_extra_kwargs, + ) + + if device_index is None: + current_device = V.graph.get_current_device_or_throw() + device_index = current_device.index + + stream = ( + "stream" + if V.graph.aot_mode + else self.write_get_raw_stream(device_index, V.graph) + ) + + if triton: + device_index, call_args = self.prepare_triton_kernel_call( + device_index, call_args + ) + _ = self.generate_load_kernel_once(kernel_name, device_index, V.graph) + + # args with value 1 are added into equal_to_1 and constants + # in triton_meta (in the Python codegen) which makes them + # inlined in the PTX and compiled CUBIN + arg_signatures = [] + if ( + triton_meta is not None + and triton_meta.get("configs") + and triton_meta.get("signature") + ): + equal_to_1 = triton_meta["configs"][0].equal_to_1 + call_args = [ + arg + for i, arg in enumerate(call_args) + if i not in equal_to_1 + ] + arg_types = [t for i, t in enumerate(arg_types) if i not in equal_to_1] + # extract the arg signatures from triton_meta + arg_signatures = triton_meta["signature"].values() + arg_signatures = [ + v + for i, v in enumerate(arg_signatures) + if i not in equal_to_1 + ] + + current_kernel_id = next(self.kernel_callsite_id) + current_grid_id = next(self.grid_id) + + # gen grids + grid_var = f"{kernel_name}_grid_{current_grid_id}" + self.writeline( + DeferredNpuGridLine(kernel_name, grid_var, grid, autotune_configs) + ) + + call, call_args_str = self.generate_launch_call( + call_args, arg_types, arg_signatures, current_kernel_id, grid_var, kernel_name + ) + self.writeline(f"{call_args_str}") + + # add debug printer code for all triton kernel related calls + debug_printer_manager = V.graph.wrapper_code.debug_printer + debug_printer_manager.set_printer_args( + call_args, kernel_name, arg_types, None + ) + with debug_printer_manager: + self.writeline(f"if ({grid_var}.is_non_zero()) {{") + self.writeline( + DeferredNpuKernelLine( + kernel_name, + r" launchKernel({}, {});".format( \ + call, + f'"{kernel_name}"', + ), + (), + self.additional_files, + ), + 
) + + self.writeline("}\n") + else: + casted = [] + for arg_type, arg in zip(arg_types, call_args): + new_arg = arg + if arg_type.endswith("*") and arg != "nullptr": + new_arg = f"{arg}.data_ptr()" + casted.append(f"({arg_type}){new_arg}") + call_args_str = ", ".join(casted) + self.writeline(f"kernels.{kernel_name}({call_args_str}, {stream});") + + def generate_kernel_call( + self, + kernel_name: str, + call_args, + grid=None, + device_index=None, + gpu=True, + triton=True, + arg_types=None, + raw_args=None, + grid_fn: str = "grid", + triton_meta=None, + autotune_configs=None, + grid_extra_kwargs="", + ): + """ + Override the default value of argument 'gpu' to True here. + generate_kernel_call can still be called with gpu=False because of + a mix of cpu kernels and gpu kernels. + """ + + """ + To fit with NPU: we write a new function 'generate_kernel_call_npu + and make a new parameter called 'npu', which always equals to 'gpu', + because 'gpu' parameter means 'not cpu' in upper logic + """ + + if not gpu: + # Even in CppWrapperNpu, we may see cpp kernels + return CppWrapperCpu.generate_kernel_call( + self, + kernel_name, + call_args, + grid, + device_index, + gpu, + triton, + arg_types, + raw_args, + grid_fn, + triton_meta, + autotune_configs, + grid_extra_kwargs, + ) + + self.generate_kernel_call_npu( + kernel_name, + call_args, + grid, + device_index, + gpu, + triton, + arg_types, + raw_args, + grid_fn, + triton_meta, + autotune_configs, + grid_extra_kwargs, + ) + + def make_zero_buffer(self, name): + return f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_zero_({name}.get()));" diff --git a/torch_npu/_inductor/codegen/ir.py b/torch_npu/_inductor/codegen/ir.py new file mode 100644 index 0000000000..b288ad8ae5 --- /dev/null +++ b/torch_npu/_inductor/codegen/ir.py @@ -0,0 +1,199 @@ +from typing import List, Tuple, Dict, Any, Optional +import itertools +import sympy +from torch._inductor.ir import (ReductionHint, IRNode, ModularIndexing, FloorDiv) +from torch._inductor.utils import sympy_subs, sympy_index_symbol +from torch._inductor.virtualized import V +from torch_npu._inductor.codegen.triton import NPUIndexTritonKernel + +from ..config import log + + +# NPU doesn't need to support ReductionHint.OUTER, and persistent reduction +def num_splits( + device, + dst_dtype, + src_dtype, + inner_fn, + ranges, + reduction_ranges, + reduction_type, + reduction_numel, + input_node: Optional[IRNode] = None, +): + return ReductionHint.DEFAULT, 1 + + +def detect_flattened_dims(kernel, index): + new_vars = {} + if not isinstance(index, (sympy.core.add.Add, ModularIndexing, FloorDiv)): + return new_vars + + def detect_flattened_axis(expr): + def init_new_vars(var, length): + if var not in new_vars: + new_vars[var] = {length: [None, None]} + if length not in new_vars[var]: + new_vars[var][length] = [None, None] + + if isinstance(expr, ModularIndexing): + var, divisor, length = expr.args + init_new_vars(var, length) + new_vars[var][length][1] = (expr, divisor, length) + elif isinstance(expr, FloorDiv): + var, divisor = expr.args + init_new_vars(var, divisor) + # over than 1 node_schedule, var may be deleted in kernel.range_tree_nodes + # it shoule be find in range_tree_nodes_removed dict + if (var in kernel.range_tree_nodes): + numel = kernel.range_tree_nodes[var].length + else: + numel = kernel.range_tree_nodes_removed[var].length + + length = expr.eval(numel, divisor) + new_vars[var][divisor][0] = (expr, divisor, length) + + else: + for x in expr.args: + detect_flattened_axis(x) + + # add + if isinstance(index, 
sympy.core.add.Add): + for x in index.args: + detect_flattened_axis(x) + elif isinstance(index, (ModularIndexing, FloorDiv)): + detect_flattened_axis(index) + else: + pass + + # make sure FloorDiv, MouldarIndexing must be in-pair + for var, divisors in new_vars.items(): + if var in kernel.range_tree_nodes: + parent_axis = kernel.range_tree_nodes[var] + else: + parent_axis = kernel.range_tree_nodes_removed[var] + for divisor, pair in divisors.items(): + if not pair[0] and not pair[1]: + pass + # FloorDiv not inplace + elif not pair[0]: + _, _, length = pair[1] + expr = FloorDiv(var, length) + new_vars[var][divisor][0] = (expr, length, parent_axis.length // length) + # ModularIndexing not inplace + elif not pair[1]: + expr = ModularIndexing(var, 1, divisor) + new_vars[var][divisor][1] = (expr, 1, divisor) + else: + pass + + return new_vars + + +def rebuild_flattened_dims(indexing): + def rebuild_flattened_dim(key, index, old_node, flatten_dim): + for _, pair in flatten_dim.items(): + new_var_expr = sympy.Integer(0) + origin_axis_length = 0 + pair_is_valid = True + # don't create duplicated axis, e.g. y1:1024, y1 % 1024 is duplicated + expr, divisor, length = pair[1] + if not old_node.parent.duplicated_check(divisor, length): + if expr not in V.kernel.expr_substituted: + V.kernel.expr_substituted[expr] = old_node.symbol() + break + + for axis in pair: + expr, divisor, length = axis + # 3. try to rebuild the axis in kernel + new_node = old_node.parent.lookup(divisor, length) + + # 4. substitute div/mod expression in indexing + index = index.subs(expr, new_node.symbol()) + indexing[key] = index + if isinstance(expr, FloorDiv): + new_var_expr = new_var_expr + new_node.symbol() * divisor + origin_axis_length = divisor * length + elif isinstance(expr, ModularIndexing): + new_var_expr = new_var_expr + new_node.symbol() + V.kernel.expr_substituted[expr] = new_node.symbol() + + if var not in V.kernel.range_tree_nodes_substituted: + V.kernel.range_tree_nodes_substituted[var] = [] + V.kernel.range_tree_nodes_substituted[var].append((origin_axis_length, new_var_expr)) + + def find_index_in_substitute(index, kernel): + return any([index.find(key) for key in kernel.expr_substituted.keys()]) + + kernel = V.kernel + for key, index in indexing.items(): + # 1. try to find out flattened axis from indexing + flatten_dims = detect_flattened_dims(kernel, index) + # 2. 
try to rebuild these flattened dims + for var, flatten_dim in flatten_dims.items(): + if (var in kernel.range_tree_nodes): + old_node = kernel.range_tree_nodes[var] + else: + old_node = kernel.range_tree_nodes_removed[var] + + rebuild_flattened_dim(key, index, old_node, flatten_dim) + + if find_index_in_substitute(index, kernel): + new_index = sympy_subs(index, kernel.expr_substituted) + indexing[key] = new_index + + +def substituted_dims_in_indexing(self, indexing, kernel, range_tree_nodes_substituted): + substituted = False + for var, candidates in range_tree_nodes_substituted.items(): + if not (len(candidates) > 0): + raise RuntimeError("assert len(candidates) > 0, candidates") + exprs = sorted(candidates, reverse=True, key=lambda x: x[0]) + # the best candidate is with the longest numel + numel = exprs[0][0] + expr = exprs[0][1] + node = kernel.range_tree_nodes[var] + if node.length != numel: + log.debug("sub nodes (expr%s, numel:%d) can not substitute parent node(%s:%d)", + expr, numel, node.symbol(), node.length) + continue + for key, index in indexing.items(): + if var in index.free_symbols: + index = index.subs(var, expr) + indexing[key] = index + substituted = True + + return substituted + + +def generate_body_indexing(body, indices): + index = list(itertools.chain.from_iterable(indices)) + if not (len(index) == len(body.var_ranges)): + raise RuntimeError("assert len(index) == len(body.var_ranges), (index, body.var_ranges)") + if not (all(v not in body.var_ranges for v in index)): + raise RuntimeError("assert all(v not in body.var_ranges for v in index)") + + replacements = dict(zip(body.var_ranges.keys(), index)) + indexing_map = dict(zip(index, body.var_ranges.keys())) + setattr(body, 'indexing_map', indexing_map) + body.indexing = { + name: sympy_subs(expr, replacements) + for name, expr in body.indexing_exprs.items() + } + + +def transform_dims_in_indexing(self, indices): + if self.indexing is None: + generate_body_indexing(self, indices) + + if V.kernel is not None and isinstance(V.kernel, NPUIndexTritonKernel): + rebuild_flattened_dims(self.indexing) + + +# select tiling axis, recover missing dimensions, +def loopbody__call__(self, *indices): + if self.indexing is None: + generate_body_indexing(self, indices) + result = self.root_block() + self.indexing = None + return result diff --git a/torch_npu/_inductor/codegen/ir_fx.py b/torch_npu/_inductor/codegen/ir_fx.py new file mode 100644 index 0000000000..6b768760a7 --- /dev/null +++ b/torch_npu/_inductor/codegen/ir_fx.py @@ -0,0 +1,864 @@ +import traceback +import typing +from typing import ( + Any, + Callable, + List, + Optional, + Union +) +from typing import Optional +from unittest.mock import patch +import sympy +import torch +from sympy import Expr +from torch._inductor import config +from torch._inductor import ir +from torch._inductor.virtualized import ops, V +from torch.utils._ordered_set import OrderedSet + +from ..lowering_fx import ( + fetch_graphs, + merge_traced_graphs, + node_id, + clone, + create_fake_input, + subtract_graph +) + + +def _patch_loops_get_name(self): + return self.node_name + + +def _patch_loops_get_traced_graph(self): + return self.traced_graph + + +@classmethod +def _patch_loops_create(cls, *args, **kwargs): + origin_node = kwargs.pop("origin_node", None) + traced_graph = kwargs.pop("traced_graph", None) + node_name = kwargs.pop("node_name", None) + tb = kwargs.pop("traceback", None) + r = cls(*args, **kwargs) + # Need to explicitly set origin_node here to propagate it down. 
+ # todo(chilli): I think it would be better for IRNode to directly set + # origin_node + r._post_init_setattr("origin_node", origin_node) + r._post_init_setattr("traceback", tb or r.traceback) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return ir.TensorBox.create(r) + + +def _patch_pointwise_constant_to_device(self, device, traced_graph=None, node_name=None): + """Move this to a given device. Requires that all reads are to constants.""" + loader = self.make_loader() + loader = patch.object(ir.ConstantBuffer, "override_device", device)(loader) + + r = ir.Pointwise(device, self.dtype, loader, self.ranges) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + +@classmethod +def _patch_reduction_create( + cls, + device: torch.device, + dst_dtype: torch.dtype, + src_dtype: torch.dtype, + inner_fn: Callable[..., Any], + ranges: ir.Sequence[Expr], + reduction_ranges: ir.Sequence[Expr], + reduction_type: str, + reduction_hint: ir.ReductionHint = ir.ReductionHint.DEFAULT, + input_node: Optional[ir.IRNode] = None, + traced_graph=None, + node_name: str = None +) -> ir.TensorBox: + reduction_numel = V.graph.sizevars.simplify(ir.sympy_product(reduction_ranges)) + + if reduction_numel == 0: + # N.B. This is a hack to generate the literal of the given type + # Ideally, we should be fixing `def constant` in triton.py + # but it breaks due to hardcoded dtypes in other places + def py_cnst(val: object) -> Union[bool, float, int]: + if dst_dtype == torch.bool: + return bool(val) + elif dst_dtype.is_floating_point: + if not isinstance(val, typing.SupportsFloat): + raise RuntimeError("assert val must support float conversion") + return float(val) + else: + if not isinstance(val, typing.SupportsInt): + raise RuntimeError("assert val must support int conversion") + return int(val) + + rtypes_to_inits = { + "sum": py_cnst(0), + "xor_sum": py_cnst(0), + "prod": py_cnst(1), + "any": py_cnst(0), + # "all" is desugared to `!any(!val)` + } + + if reduction_type not in rtypes_to_inits: + raise RuntimeError(f"assert {reduction_type} not supported for zero-dimension tensors!") + + def const_fn(index: int) -> ir.OpsValue: + return ops.constant(rtypes_to_inits[reduction_type], dst_dtype) + + return ir.Pointwise.create( + device=device, + dtype=src_dtype, + inner_fn=const_fn, + ranges=list(ranges), + traced_graph=traced_graph, + node_name=node_name + ) + + if reduction_numel == 1: + # this reduction is actually a pointwise op + if reduction_type in ("argmin", "argmax"): + + def fn(index: int) -> ir.OpsValue: + return ops.constant(0, dst_dtype) + + else: + + def fn(index: int) -> ir.OpsValue: + reduction_index = [sympy.S.Zero for _ in reduction_ranges] + return inner_fn(index, reduction_index) + + return ir.Pointwise.create( + device=device, dtype=dst_dtype, inner_fn=fn, ranges=ranges + ) + + if ( + isinstance(reduction_numel, ir.Integer) + and V.graph.sizevars.size_hint(reduction_numel) + < config.unroll_reductions_threshold + and (ir.sympy_product(ranges) != 1 or ir.is_gpu(device.type)) + ): + # NB: This works around pytorch issues 140457 + # since turning reductions into pointwise ops can exacerbate this problem + return ir.Pointwise.create( + device=device, + dtype=dst_dtype, + inner_fn=cls._unroll_reduction_fn( + inner_fn, reduction_ranges, reduction_type, src_dtype + ), + ranges=ranges, + traced_graph=traced_graph, + node_name=node_name + ) + + # triton doesn't support reduce to single element well, 
so break it up + hint, split = cls.num_splits( + device, + dst_dtype, + src_dtype, + inner_fn, + ranges, + reduction_ranges, + reduction_type, + reduction_numel, + input_node, + ) + # intermediate reduction in split can contain complex indexing, + # and num_splits will fail to correctly set the hint + # reuse the passed hint if available + if reduction_hint == ir.ReductionHint.DEFAULT: + reduction_hint = hint + if split == -1: + if input_node is None: + raise RuntimeError("assert input_node cannot be None") + new_ranges, new_reduction_ranges = ir.extract_input_node_reduction_ranges( + input_node + ) + if new_ranges is None: + raise RuntimeError("assert new_ranges cannot be None") + if new_reduction_ranges is None: + raise RuntimeError("assert new_reduction_ranges cannot be None") + return cls.create_multilayer_existing_ranges( + device, + dst_dtype, + src_dtype, + inner_fn, + ranges, + reduction_ranges, + new_ranges, + new_reduction_ranges, + reduction_type, + reduction_hint, + ) + elif split > 1: + # triton doesn't support reduce to single element well, so break it up + return cls.create_multilayer( + device, + dst_dtype, + src_dtype, + inner_fn, + ranges, + reduction_ranges, + reduction_type, + split, + reduction_hint, + ) + + r = ir.Reduction( + device=device, + dtype=dst_dtype, + inner_fn=inner_fn, + ranges=ranges, + reduction_ranges=reduction_ranges, + reduction_type=reduction_type, + src_dtype=src_dtype, + reduction_hint=reduction_hint, + ) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + + return ir.TensorBox.create(r) + + +def _patch_baseview_get_traced_graph(self): + if hasattr(self, 'traced_graph') and self.traced_graph is not None: + return self.traced_graph + return self.data.get_traced_graph() + + +def _patch_base_view_get_reads(self): + with patch.object(ir.FlexibleLayout, "allow_indexing", True): + r = ir.extract_read_writes( + self.make_loader(), + self.get_size(), + ).reads + for md in r: + if md.index.has(ir.ModularIndexing): + if md.index.has(ir.FloorDiv): + self.realize() + return r + else: + for m in md.index.find(ir.ModularIndexing): + for arg in m.args: + if arg.has(ir.ModularIndexing): + self.realize() + return r + return r + + +def has_buffer(inp): + if not hasattr(inp, 'data'): + return False + if isinstance(inp.data, ir.Buffer): + return True + return has_buffer(inp.data) + + +def get_buffer(inp): + if isinstance(inp.data, ir.Buffer): + return inp.data + return get_buffer(inp.data) + + +def _patch_baseview_realize(self): + if hasattr(self, 'traced_graph') and self.traced_graph is not None: + r = self.data.realize() + buffer = get_buffer(self) + if isinstance(buffer, (ir.MultiOutput, ir.InputBuffer, ir.ConcatKernel)): + return r + traced_graph = buffer.data.get_traced_graph() + buf_name = buffer.get_name() + new_traced_graph, placeholder = subtract_graph(self.traced_graph, traced_graph, node_name=buf_name) + if placeholder is not None: + placeholder.name = buf_name + device = buffer.get_device() + dtype = buffer.get_dtype() + size = buffer.get_size() + stride = buffer.get_stride() + fake_input = create_fake_input(size, stride, device, dtype) + placeholder.meta['val'] = fake_input + self._post_init_setattr("traced_graph", new_traced_graph) + return r + else: + return self.data.realize() + + +def _patch_baseview_realize_hint(self): + if hasattr(self, 'traced_graph') and self.traced_graph is not None: + r = self.data.realize_hint() + if not has_buffer(self): + return r + buffer = get_buffer(self) + if 
isinstance(buffer, (ir.MultiOutput, ir.InputBuffer, ir.ConcatKernel)): + return r + traced_graph = buffer.data.get_traced_graph() + buf_name = buffer.get_name() + new_traced_graph, placeholder = subtract_graph(self.traced_graph, traced_graph, node_name=buf_name) + if placeholder is not None: + placeholder.name = buf_name + device = buffer.get_device() + dtype = buffer.get_dtype() + size = buffer.get_size() + stride = buffer.get_stride() + fake_input = create_fake_input(size, stride, device, dtype) + placeholder.meta['val'] = fake_input + self._post_init_setattr("traced_graph", new_traced_graph) + return r + else: + return self.data.realize_hint() + + +def _patch_mark_reuse(self, users): + if isinstance(self.data, ir.StorageBox): + if self.data.should_realize_on_reuse(users): + if hasattr(self, 'traced_graph') and self.traced_graph is not None: + r = self.data.realize() + buffer = get_buffer(self) + if isinstance(buffer, (ir.MultiOutput, ir.InputBuffer, ir.ConcatKernel)): + return r + traced_graph = buffer.data.get_traced_graph() + buf_name = buffer.get_name() + new_traced_graph, placeholder = subtract_graph(self.traced_graph, traced_graph, node_name=buf_name) + if placeholder is not None: + placeholder.name = buf_name + device = buffer.get_device() + dtype = buffer.get_dtype() + size = buffer.get_size() + stride = buffer.get_stride() + fake_input = create_fake_input(size, stride, device, dtype) + placeholder.meta['val'] = fake_input + self._post_init_setattr("traced_graph", new_traced_graph) + return r + else: + return self.data.realize() + else: + return self.data.mark_reuse(users) + + +@classmethod +def _patch_expandview_create(cls, x, new_size, traced_graph=None, node_name=None): + new_size = cls._normalize_size(x, new_size) + + if ir.is_storage_and_layout(x): + storage, old_layout = ir.as_storage_and_layout(x) + skip = len(new_size) - len(old_layout.size) + if skip < 0: + raise RuntimeError(f"assert Internal error: skip must be non-negative, got {skip}") + new_stride = [sympy.Integer(0)] * skip + for stride, size in zip(old_layout.stride, old_layout.size): + new_stride.append( + stride + if not V.graph.sizevars.shape_env.evaluate_expr( + sympy.Eq(size, 1), size_oblivious=True + ) + else sympy.Integer(0) + ) + new_layout = ir.FixedLayout( + old_layout.device, + old_layout.dtype, + list(new_size), + new_stride, + old_layout.offset, + ) + + r = ir.ReinterpretView(data=storage, layout=new_layout) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + r = ir.ExpandView(data=x, size=new_size) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + + return r + + +@classmethod +def _patch_permuteview_create(cls, x, dims, traced_graph=None, node_name=None): + dims = cls._map_neg_dims(dims) + if OrderedSet(dims) != OrderedSet(range(len(dims))): + raise RuntimeError("assert OrderedSet(dims) != OrderedSet(range(len(dims)))") + if ir.is_storage_and_layout(x): + storage, old_layout = ir.as_storage_and_layout(x) + new_layout = ir.FixedLayout( + old_layout.device, + old_layout.dtype, + [old_layout.size[i] for i in dims], + [old_layout.stride[i] for i in dims], + old_layout.offset, + ) + r = ir.ReinterpretView(data=storage, layout=new_layout) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + r = ir.PermuteView(data=x, dims=dims) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + 
return r + + +@classmethod +def _patch_view_create(cls, x, new_size, traced_graph=None, node_name=None): + if not isinstance(new_size, (tuple, list)): + raise RuntimeError("assert new_size must be tuple, list, or tuple") + old_size, new_size = cls.resolve_negative_size(x.get_size(), new_size) + # Skip pointless views + if V.graph.sizevars.statically_known_list_equals(old_size, new_size): + return x + + unbacked_symbols_in_sizes = False + if ( + len(ir.free_unbacked_symbols(old_size)) > 0 + or len(ir.free_unbacked_symbols(new_size)) > 0 + ): + unbacked_symbols_in_sizes = True + + if 0 in new_size: + + def fake_reindex(index): + return tuple([0] * len(old_size)) + + r = cls(x, list(new_size), fake_reindex) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + # next: a new class for FixedTransferLayout that output layout is constrained by input layout + elif (ir.is_contiguous_storage_and_layout( + x) or unbacked_symbols_in_sizes): # and not isinstance(x.data, ir.ReinterpretView): + if unbacked_symbols_in_sizes and (not ir.is_contiguous_storage_and_layout(x)): + # realize x; otherwise, the dynamic_reshape_indexer below will fail + # due to the size_hint's inability to process unbacked SymInts + x = ir.ExternKernel.realize_input(x) + + storage, old_layout = ir.as_storage_and_layout(x, want_contiguous=True) + new_layout = ir.FixedLayout( + old_layout.device, + old_layout.dtype, + new_size, + ir.FlexibleLayout.contiguous_strides(new_size), + old_layout.offset, + ) + + r = ir.ReinterpretView(data=storage, layout=new_layout) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + reindex = cls.dynamic_reshape_indexer(old_size, new_size) + + r = cls(data=x, size=list(new_size), reindex=reindex) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + +@classmethod +def _patch_sliceview_create(cls, x, dim, start, end, step=1, clamp=True, traced_graph=None, + node_name=None): # next: crm, clamp=True + step = sympy.expand(step) + if not (isinstance(step, sympy.Expr) or step > 0): + raise RuntimeError("assert step must be a sympy.Expr or a positive number") + try: + if start == 0 and end >= 2 ** 63 - 1 and step == 1: + return x + except TypeError: + pass + sizevars = V.graph.sizevars + new_size = list(x.get_size()) + + if clamp: + start, end = cls.normalize_start_end(x, dim, start, end) + + new_size[dim] = ir.FloorDiv(end - start + (step - 1), step) + + if ir.is_storage_and_layout(x): + # Fast path + storage, old_layout = ir.as_storage_and_layout(x) + new_stride = list(old_layout.stride) + new_stride[dim] = new_stride[dim] * step + new_layout = ir.FixedLayout( + old_layout.device, + old_layout.dtype, + new_size, + new_stride, + old_layout.offset + old_layout.stride[dim] * start, + ) + r = ir.ReinterpretView(data=storage, layout=new_layout) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + def reindex(index): + if len(index) != len(new_size): + raise RuntimeError(f"assert wrong ndim {index} {new_size}") + index = list(index) + index[dim] = index[dim] * step + start + return index + + # redirect to a generic view + r = ir.SliceView(data=x, size=new_size, reindex=reindex) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + +def _patch_buffer_get_traced_graph(self): + return self.traced_graph + + 
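# --- editor's note (illustrative sketch, not part of the patch) ---------------
# The patched SliceView.create above sets
#     new_size[dim] = FloorDiv(end - start + (step - 1), step)
# which is the ceiling of (end - start) / step, i.e. how many elements the
# slice [start:end:step] selects. A standalone check of that arithmetic:
def _slice_extent(start: int, end: int, step: int = 1) -> int:
    if step <= 0:
        raise ValueError("step must be positive")
    return (end - start + (step - 1)) // step

# Example: a[2:10:3] picks indices 2, 5 and 8, so the extent is 3 and matches
# len(range(2, 10, 3)).
# ------------------------------------------------------------------------------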
+@classmethod +def _patch_concatkernel_create(cls, inputs, dim): + device = inputs[0].get_device() + dtype = inputs[0].get_dtype() + new_size = list(inputs[0].get_size()) + offsets_start = [0] + offsets_end = [new_size[dim]] + if not (0 <= dim < len(new_size)): + raise RuntimeError(f"assert dim ({dim}) must be between 0 and {len(new_size) - 1}") + for i in range(1, len(inputs)): + input_size = inputs[i].get_size() + offsets_start.append(new_size[dim]) + if len(input_size) != len(new_size): + raise RuntimeError( + f"assert input_size and new_size is not same. Got {len(input_size)} vs {len(new_size)}") + if inputs[i].get_dtype() != dtype: + raise RuntimeError(f"assert Expected dtype {dtype}, but got {inputs[i].get_dtype()}") + if inputs[i].get_device() != device: + raise RuntimeError(f"assert Expected device {device}, but got {inputs[i].get_device()}") + + for j in range(len(new_size)): + if j == dim: + new_size[j] = new_size[j] + input_size[j] + else: + new_size[j] = V.graph.sizevars.guard_equals( + new_size[j], input_size[j] + ) + offsets_end.append(new_size[dim]) + + output_stride = ir.FlexibleLayout.contiguous_strides(new_size) + # If any of the inputs is in CL format, use CL format for the output + for i in range(len(inputs)): + x = inputs[i] + if ir.is_storage_and_layout(x): + layout = x.get_layout() + if ( + isinstance(layout, ir.FixedLayout) + and layout.is_channels_last_contiguous(layout.size, layout.stride) + ): + # use CL stride for the output + output_stride = ir.make_channels_last_strides_for(new_size) + break + + any_input_is_storage_and_layout = any(ir.is_storage_and_layout(x) for x in inputs) + fx_node_args = V.graph.current_node.args[0] + if not isinstance(fx_node_args, list): + raise RuntimeError("assert fx_node_args must be a list") + # If any of the inputs has meta tensor and the meta tensor is in CL format, use CL format for the output + if any_input_is_storage_and_layout is False and any( + "val" in arg.meta + and ( + arg.meta["val"].is_contiguous(memory_format=torch.channels_last) + or arg.meta["val"].is_contiguous(memory_format=torch.channels_last_3d) + ) + for arg in fx_node_args + ): + output_stride = ir.make_channels_last_strides_for(new_size) + + concat_kernel = ir.ConcatKernel( + name=None, + layout=ir.FixedLayout( + device=device, + dtype=dtype, + size=new_size, + stride=output_stride, + ), + inputs=[], + ) + + kernel = ir.StorageBox(concat_kernel) + op_names = [] + for i in range(len(inputs)): + input_buffer = cls.realize_into( + inputs[i], + ir.SliceView.create( + kernel, dim, offsets_start[i], offsets_end[i], clamp=False + ), + ) + concat_kernel.inputs.append(input_buffer) + + if isinstance(inputs[i].data, ir.BaseView): + input_unwrapped = inputs[i].data.unwrap_view() + else: + input_unwrapped = inputs[i].data + + if ( + input_unwrapped.is_input_buffer() + and ir.is_gpu(inputs[i].get_device().type) + and not ir.is_dynamic(input_buffer) + ): + op_names.append(input_buffer.get_operation_name()) + + if len(op_names) > 1 and V.graph.has_feature(device, ir.BackendFeature.FOREACH): + V.graph.register_operation_list(op_names) + + cat_inputs = [ir.TensorBox(ir.StorageBox(inp)) for inp in concat_kernel.inputs] + input_graphs = fetch_graphs([cat_inputs]) + node_name = f'cat_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, torch.ops.aten.cat, node_name, dim=dim) + + concat_kernel._post_init_setattr("name", V.graph.register_buffer(concat_kernel)) + concat_kernel._post_init_setattr("inputs", cls.unwrap_storage(concat_kernel.inputs)) + 
concat_kernel._post_init_setattr("traced_graph", new_graph) + concat_kernel._post_init_setattr("node_name", node_name) + + return kernel + + +def _patch_concatkernel_get_traced_graph(self): + return self.traced_graph + + +@classmethod +def _patch_concatkernel_realize_into(cls, src, dst): + # Attempt to turn this into a ReinterpretView rather than assert. + # This has concessions around layout, as as_storage_and_layout + # can cause us to go from flexible to fixed layout. + if not isinstance(dst, ir.ReinterpretView): + if ir.is_storage_and_layout(dst): + storage, layout = ir.as_storage_and_layout(dst) + dst = ir.ReinterpretView(data=storage, layout=layout) + if not isinstance(dst, ir.ReinterpretView): + raise RuntimeError(f"assert Expected dst to be an instance of ir.ReinterpretView. Got: {dst}") + if isinstance(src, ir.TensorBox): + # unwrap a TensorBox + return cls.realize_into(src.data, dst) + if isinstance(src, ir.StorageBox): + src.realize() + # ExternKernelAlloc has specific requirements for output layout, should create a copy + if not hasattr(src.data, "layout"): + raise RuntimeError("assert src.data has no attribute 'layout'") + if cls.can_realize_into_without_copy(src): + src.data.layout = ir.NonOwningLayout(dst) + return src.data + pw = clone(src, memory_format=torch.contiguous_format) + return cls.realize_into(pw, dst) + + +def _patch_externkernel_copy_input(x): + traced_graph = x.get_traced_graph() + node_name = x.get_name() + if traced_graph is None: + traced_graph = fetch_graphs([x])[0] + node_name = f'getitem_{next(node_id)}' + + pw = ir.Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=x.make_loader(), + ranges=x.get_size(), + origin_node=x.get_origin_node(), + traceback=x.get_traceback(), + traced_graph=traced_graph, + node_name=node_name + ) + pw.realize() + return pw + + +@classmethod +def _patch_externkernel_convert_to_reinterpret_view(cls, x): + """ + In order to pass this to an extern kernel we need a + ReinterpretView not a View. This allows us to avoid some + unneeded copies. + """ + if not isinstance(x, ir.BaseView): + raise RuntimeError(f"assert Expected type {ir.BaseView}, got {type(x)}") + if isinstance(x, ir.ReinterpretView): + return x + + # NOTE: Don't use extract_read_writes here as it fails when + # make_loader() inlines the computation + x_unwrap_view = x.unwrap_view() + buf = V.graph.get_buffer(x_unwrap_view.get_name()) + if buf is None: + raise RuntimeError("assert buf cannot be None") + x_unwrap_view_fx_node = buf.get_origin_node() + # Prefer channels last format according to how the format is set from eager. 
+ if ( + x_unwrap_view_fx_node is not None + and "val" in x_unwrap_view_fx_node.meta + and isinstance(x_unwrap_view.layout, ir.FlexibleLayout) + and ( + x_unwrap_view_fx_node.meta["val"].is_contiguous( + memory_format=torch.channels_last + ) + or x_unwrap_view_fx_node.meta["val"].is_contiguous( + memory_format=torch.channels_last_3d + ) + ) + ): + x_unwrap_view.freeze_layout_with_same_order( + ir.make_channels_last_strides_for(x_unwrap_view.get_size()) + ) + else: + x_unwrap_view.freeze_layout() + + index_args, var_ranges = ir.dependencies.index_vars_squeeze( + x.get_size(), prefix="r" + ) + range_vars = index_args[0] + index = x.make_indexer()(range_vars) + + index = V.graph.sizevars.simplify_with_ranges(index, var_ranges) + strides = V.graph.sizevars.stride_vars(index, range_vars) + offset = V.graph.sizevars.offset_var(index, range_vars) + expected = ir.sympy_dot(range_vars, strides) + offset + + if index != expected: + ir.log.debug( + "convert_to_reinterpret_view failed: stride=%s offset=%s index=%s", + strides, + offset, + index, + ) + raise NotImplementedError + + r = ir.ReinterpretView( + data=x.data, + layout=ir.FixedLayout( + device=x.get_device(), + dtype=x.get_dtype(), + size=x.get_size(), + stride=strides, + offset=offset, + ), + ) + r._post_init_setattr("traced_graph", x.get_traced_graph()) + r._post_init_setattr("node_name", x.get_name()) + return r + + +@classmethod +def _patch_devicecopy_create(cls, x, device, non_blocking, traced_graph=None, node_name=None): + if ( + not x.is_extern() + and all(r in V.graph.constants for r in x.get_read_names()) + and not config.aot_inductor.use_runtime_constant_folding + ): + return x.constant_to_device(device) + + V.graph.add_device_info(device) + V.graph.add_device_info(x.get_device()) + + ir.developer_warning("DeviceCopy in input program") + constant_args = (non_blocking,) + r = ir.DeviceCopy( + ir.FlexibleLayout( + device=device, + dtype=x.get_dtype(), + size=x.get_size(), + ), + [cls.realize_input(x)], + constant_args, + ) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + +def _patch_devicecopy_get_traced_graph(self): + return self.traced_graph + + +def _patch_multioutput_get_traced_graph(self): + return None + + +ir.MultiOutput.get_traced_graph = _patch_multioutput_get_traced_graph + + +def _patch_mutablebox_get_name(self): + return self.data.get_name() + + +def _patch_mutablebox_get_traced_graph(self): + return self.data.get_traced_graph() + + +@classmethod +def _patch_mutationlayout_realize_into(cls, src, dst, unsafe_alias=False): + dst.realize() + # NOTE: We must realize users of `dst` before we realize `src`, since + # realization order determines scheduling order. Otherwise, src's + # mutation would be scheduled before the existing users of dst! + V.graph.mark_buffer_mutated(dst.get_name()) + + if isinstance(src, ir.TensorBox): + src = src.data + + # We copy the contents of src into dst. In most cases this should + # be fused into a single kernel by the scheduler. + # NOTE: We cannot change src's layout to mutate dst directly as this + # would alias src to dst, which is not correct as further s to + # dst would effect users of src. However if there are no more users of + # dst, we can alias src to dst. 
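# --- editor's note (illustrative sketch, not part of the patch) ---------------
# An eager-mode illustration of the aliasing hazard described in the comment
# above: if `src` were aliased to `dst` instead of copied, a later in-place
# mutation of `dst` would silently change every existing user of `src`.
import torch

src = torch.ones(3)
user_of_src = src        # an existing consumer that still holds a reference
dst = src                # "alias" instead of copy
dst.add_(1)              # a later in-place write to dst ...
# ... makes user_of_src equal to tensor([2., 2., 2.]) as well, which is why
# realize_into copies src into dst unless unsafe_alias is set.
# ------------------------------------------------------------------------------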
+ src.realize_hint() + + if not unsafe_alias: + input_graphs = fetch_graphs([dst, src]) + node_name = f'copy__{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, torch.ops.aten.copy, node_name) + + src = ir.Pointwise.create( + device=src.get_device(), + dtype=src.get_dtype(), + inner_fn=src.make_loader(), + ranges=[ + V.graph.sizevars.guard_equals(a, b) + for a, b in zip(src.get_size(), dst.get_size()) + ], + traced_graph=new_graph, + node_name=node_name, + ).data + + src.realize() + if not isinstance(src.data.layout, ir.FlexibleLayout): + raise RuntimeError("assert src.data.layout should be isinstance if ir.FlexibleLayout") + src.data.layout = ir.MutationLayoutSHOULDREMOVE(dst) + return src.data + + +def _patch_npu_inductor_ir(): + ir.Reduction.create = _patch_reduction_create + ir.BaseView.get_traced_graph = _patch_baseview_get_traced_graph + ir.BaseView.get_reads = _patch_base_view_get_reads + ir.BaseView.realize = _patch_baseview_realize + ir.BaseView.realize_hint = _patch_baseview_realize_hint + ir.BaseView.mark_reuse = _patch_mark_reuse + ir.ExpandView.create = _patch_expandview_create + ir.PermuteView.create = _patch_permuteview_create + ir.View.create = _patch_view_create + ir.SliceView.create = _patch_sliceview_create + ir.Buffer.traced_graph = None + ir.Buffer.get_traced_graph = _patch_buffer_get_traced_graph + ir.ConcatKernel.create = _patch_concatkernel_create + ir.ConcatKernel.get_traced_graph = _patch_concatkernel_get_traced_graph + ir.ConcatKernel.realize_into = _patch_concatkernel_realize_into + ir.ExternKernel.copy_input = _patch_externkernel_copy_input + ir.ExternKernel.convert_to_reinterpret_view = _patch_externkernel_convert_to_reinterpret_view + ir.DeviceCopy.create = _patch_devicecopy_create + ir.DeviceCopy.get_traced_graph = _patch_devicecopy_get_traced_graph + ir.MutableBox.get_name = _patch_mutablebox_get_name + ir.MutableBox.get_traced_graph = _patch_mutablebox_get_traced_graph + ir.Loops.get_name = _patch_loops_get_name + ir.Loops.get_traced_graph = _patch_loops_get_traced_graph + ir.Loops.create = _patch_loops_create + ir.Pointwise.constant_to_device = _patch_pointwise_constant_to_device + ir.MutationLayoutSHOULDREMOVE.realize_into = _patch_mutationlayout_realize_into diff --git a/torch_npu/_inductor/codegen/kernel_analysis.py b/torch_npu/_inductor/codegen/kernel_analysis.py new file mode 100644 index 0000000000..30d048940a --- /dev/null +++ b/torch_npu/_inductor/codegen/kernel_analysis.py @@ -0,0 +1,305 @@ +from typing import List, Tuple +import sympy +from torch._inductor import ir +from torch._inductor.scheduler import SchedulerNode +from torch._inductor.utils import sympy_index_symbol +from torch._inductor.virtualized import V + + +class IndexAnalysis: + def __init__(self, kernel, raw_index, is_store_index=False, is_index_expr=False): + self.index = raw_index.subs(V.graph.sizevars.var_to_val) + self.kernel = kernel + self.tiling_axis = [x.symbol() for x in self.kernel.tiling_axis] + self.stride_list = None # stride list [1,2,4,24] + self.reshape_sizes = [] # [RBLOCK, 1, 1, XBLOCK_SUB] + self.broadcast_sizes = [] # [RBLOCK, XBLOCK_SUB] + self.permute_shape = [] # [0,2,1,3] + self.var_replacements = {} # r2 ->r2_0, etc + self.var_directions = {} # r2_0 -> [None,:,None] + self.similar = None # (r,x,z,y) + self.need_permute = False + self.need_broadcast = False + self.need_reshape = False + self.gold = kernel.golden_var_list # tuple([x.symbol() for x in reversed(kernel.tiling_axis)]) + self.var_stride = [ + (key, coeff) + for key, coeff in 
self.index.as_coefficients_dict().items() + if not isinstance(key, sympy.Integer) + ] + # sort by stride + self.var_stride.sort(key=lambda x: x[1]) + # only contains tiing axis var + self.var_list = tuple([x[0] for x in self.var_stride if x[0] in self.tiling_axis]) + self.stride_list = tuple([x[1] for x in self.var_stride if x[0] in self.tiling_axis]) + self.is_store_index = is_store_index + self.is_index_expr = is_index_expr + + def get_most_similar_shape(self): + matched_dims = 0 + self.similar = None + for value in self.kernel.index_analysis.keys(): + if len(value) != len(self.gold): + continue + i = 0 + while i < len(self.var_list): + if value[i] == self.var_list[i]: + i = i + 1 + else: + break + + if i > matched_dims: + matched_dims = i + self.similar = value + return self.similar + + @classmethod + def same_var_list(cls, var1, var2): + if len(var1) != len(var2): + return False + for i, v in enumerate(var1): + if v != var2[i]: + return False + return True + + def shrink_permute_shape(self, permute_shape): + diff = len(self.gold) - len(self.kernel.tiling_axis) + new_shape = [x for x in permute_shape if x - diff >= 0] + return new_shape + + def analyze_permute_shape(self): + if self.is_index_expr: + return + if self.gold == self.similar: + self.need_permute = False + return + + similar = tuple(reversed(self.similar)) + gold = tuple(reversed(self.gold)) + self.permute_shape = [None] * len(gold) + + if self.is_store_index: + for i, x in enumerate(similar): + if x != gold[i]: + index = gold.index(x) + self.permute_shape[i] = index + self.need_permute = True + else: + self.permute_shape[i] = i + return + + for i, x in enumerate(gold): + if x != similar[i]: + index = similar.index(x) + self.permute_shape[i] = index + self.need_permute = True + else: + self.permute_shape[i] = i + + def analyze_broadcast_sizes(self): + if not self.need_reshape: + self.need_broadcast = False + return + self.need_broadcast = True + reversed_similar = reversed(self.similar) + similar = [x for x in reversed_similar] + self.broadcast_sizes = ["1"] * len(similar) + for i, x in enumerate(similar): + self.broadcast_sizes[i] = f"{x.name.upper()}BLOCK_SUB" + + def analyze_reshape_sizes(self): + if all(x in self.var_list for x in self.tiling_axis): + self.need_reshape = False + return + self.need_reshape = True + reversed_similar = reversed(self.similar) + similar = [x for x in reversed_similar] + var_list = [x for x in reversed(self.var_list)] + self.reshape_sizes = ["1"] * len(similar) + for _, x in enumerate(var_list): + index = similar.index(x) + self.reshape_sizes[index] = f"{x.name.upper()}BLOCK_SUB" + + def analyze_var_direction(self): + if self.var_list == self.gold: + return + var_list = self.var_list if len(self.var_list) == len(self.gold) else self.similar + if var_list == self.gold: + return + if not var_list: + return + var_list = list(reversed(var_list)) + gold = list(tuple(reversed(self.gold))) + if len(var_list) != len(gold): + raise RuntimeError("assert var_list and gold must have same length") + var_list = [x for x in var_list if x in self.kernel.tiling_axis] + gold = [x for x in gold if x in self.kernel.tiling_axis] + for i, x in enumerate(gold): + index = var_list.index(x) + if (index == i): + continue + new_var = sympy_index_symbol(f"{x}") if self.is_index_expr else sympy_index_symbol(f"{x}_{index}") + if new_var in self.var_replacements: + continue + direction = ["None"] * len(gold) + direction[index] = ":" + direction_str = f"[{','.join(direction)}]" + self.var_replacements[x] = new_var + 
self.var_directions[new_var] = direction_str + self.kernel.range_tree_nodes[x].var_directions[new_var] = direction_str + + def analyze_index(self): + if isinstance(self.index, sympy.Integer): + return + if not self.kernel.golden_var_list: + self.kernel.select_golden_varlist() + self.gold = self.kernel.golden_var_list + + if self.gold is None: + raise RuntimeError("assert gold must not be None") + if len(self.gold) != len(self.tiling_axis): + raise RuntimeError("assert gold must have same length as tiling_axis") + + def all_tiling_in_var_list(): + return all([x in self.var_list for x in self.tiling_axis]) + # 2 analyze permute shape for full_dim_len index + + if all_tiling_in_var_list(): + self.similar = self.var_list + self.analyze_permute_shape() + if self.var_list not in self.kernel.index_analysis: + self.kernel.index_analysis[self.var_list] = self + # 3. analyze reshape and broadcast sizes + else: + pass + + # 4 analyze var direction + self.analyze_var_direction() + + def generate_statement(self): + statement = "" + if self.need_reshape: + reshape_sizes = f"[{','.join(self.reshape_sizes)}]" + statement = f".reshape({reshape_sizes})" + if self.need_broadcast: + broadcast_sizes = f"[{','.join(self.broadcast_sizes)}]" + statement = f"{statement}.broadcast_to({broadcast_sizes})" + if self.need_permute: + statement = f"{statement}.permute({self.permute_shape})" + return statement + + +class ReductionAnalysis: + def __init__(self, kernel): + self.kernel = kernel + self.reduction = None + self.reduced_dim = None + if self.numof_reduction_axis() > 1: + self.kernel.persistent_reduction = True + self.reduced_dim = 0 + return + + reduction = self.kernel.find_reduction_node() + if reduction is None or not isinstance(reduction, ir.Reduction): + raise RuntimeError("failed to get one reduction node") + if not hasattr(reduction, "reduced_idx"): + raise RuntimeError("reduction node doesn't have attr reduced_idx") + self.reduction = reduction + self.reduced_dim = self.analyze_reduction_dim() + + def is_higher_order_reduction(self): + return self.dim < len(self.kernel.tiling_axis) - 1 + + def is_1d_reduction(self): + return self.kernel.numels["r"] > 1 and len(self.kernel.numels) == 1 + + def get_reduce_dim_reshape(self, reduce_axis): + if self.is_1d_reduction(): + shape_str = f"[{reduce_axis.name.upper()}BLOCK_SUB]" + else: + shape = ["1"] * len(self.kernel.tiling_axis) + shape[self.reduced_dim] = f"{reduce_axis.name.upper()}BLOCK_SUB" + shape_str = f"[{','.join(shape)}]" + return shape_str + + def dense_size_list(self) -> List[str]: + sizes = [f"{x.name.upper()}BLOCK_SUB" for x in self.kernel.tiling_axis] + if self.numof_reduction_axis() > 1: + return sizes + + reduce_axis = self.kernel.tiling_axis[-1] + sizes.pop(-1) + sizes.insert(self.reduced_dim, f"{reduce_axis.name.upper()}BLOCK_SUB") + return sizes + + def dense_size_str(self): + sizes = self.dense_size_list() + if self.numof_reduction_axis() > 1: + return f"[{'* '.join(sizes)}]" + return f"[{', '.join(sizes)}]" + + def numof_reduction_axis(self): + return self.kernel.numof_reduction_axis() + + def reduction_axis_list(self): + return self.kernel.reduction_axis_list() + + def analyze_reduction_dim(self): + + if self.numof_reduction_axis() > 1: + self.kernel.persistent_reduction = True + self.reduced_dim = 0 + return 0 + + if not self.kernel.golden_var_list: + self.kernel.select_golden_varlist() + if self.kernel.golden_var_list is None: + raise RuntimeError("assert self.kernel.golden_var_list is not None") + + dim = -1 + for i, x in 
enumerate(reversed(self.kernel.golden_var_list)): + if x.name[0] == 'r': + dim = i + break + return dim + + def analyze_reduction_dim1(self): + if self.numof_reduction_axis() > 1: + self.kernel.persistent_reduction = True + self.reduced_dim = 0 + return 0 + reduction = self.reduction + # kept = [0,1,3], reduced = [2] + for i, x in enumerate(reduction.reduced_idx): + if reduction.reduction_ranges[i] <= 1: + continue + reduced_idx = x + break + # the index (in reduction.ranges) of low_dims + low_dims = [i for i, x in enumerate(reduction.kept_idx) if x > reduced_idx] + if not low_dims: + return len(self.kernel.tiling_axis) - 1 + elif len(low_dims) == len(reduction.kept_idx): + return 0 + # reduction dim when low_dims are not meraged + dim = len(reduction.kept_idx) - len(low_dims) + + tiling_axis = self.kernel.tiling_axis[:-1] + merged = 1 + j = len(tiling_axis) - 1 + # remove all low_dims from tiling_axis + # all axis before ahead of j are high-orders + # then following is reduced dim + ranges = [x for x in reduction.ranges if x > 1] + for i in reversed(low_dims): + len_axis = tiling_axis[j].length + len_reduction = ranges[i] * merged + if len_reduction < len_axis: + merged = merged * len_reduction + elif len_reduction == len_axis: + j = j - 1 + merged = 1 + else: + raise RuntimeError(f"assert should not reach here low_dims({i})={len_reduction}, axis[{j}]=len)") + dim = j + 1 + return dim diff --git a/torch_npu/_inductor/codegen/npu_kernel_features.py b/torch_npu/_inductor/codegen/npu_kernel_features.py new file mode 100644 index 0000000000..889abb4155 --- /dev/null +++ b/torch_npu/_inductor/codegen/npu_kernel_features.py @@ -0,0 +1,94 @@ +import functools +from typing import Iterable +from typing import Iterable +from typing import Tuple, List +import sympy +import torch +from torch._inductor.codegen.simd import SIMDScheduling +from torch._inductor.codegen.simd_kernel_features import SIMDKernelFeatures, NodeScheduleEntry +from torch._inductor.utils import cache_on_self +from torch._inductor.virtualized import V +from torch.utils._ordered_set import OrderedSet + + +class NumelList(Tuple): + + def numels(self): + numel = functools.reduce(lambda a, b: a * b, self, 1) + return numel + + def __eq__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel == numel2 + + def __le__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel <= numel2 + + def __lt__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel < numel2 + + def __ge__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel >= numel2 + + def __gt__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel > numel2 + + def __mod__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel % numel2 + + def __truediv__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel / numel2 + + def __floordiv__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel // numel2 + + def __mul__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel * numel2 + + def 
__rmul__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel * numel2 + + def __add__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel + numel2 + + def __radd__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel + numel2 + + def __hash__(self): + return super(NumelList, self).__hash__() + + +class NPUKernelFeatures(SIMDKernelFeatures): + def __init__( + self, + node_schedule: List[NodeScheduleEntry], + numel: sympy.Expr, + reduction_numel: sympy.Expr = sympy.S.One, + ): + super().__init__(node_schedule, numel, reduction_numel) + self.numel = NumelList(self.numel) if isinstance(self.numel, Iterable) else self.numel + self.reduction_numel = NumelList(self.reduction_numel) if isinstance(self.reduction_numel, + Iterable) else self.reduction_numel diff --git a/torch_npu/_inductor/codegen/scheduling.py b/torch_npu/_inductor/codegen/scheduling.py new file mode 100644 index 0000000000..cabe82d635 --- /dev/null +++ b/torch_npu/_inductor/codegen/scheduling.py @@ -0,0 +1,459 @@ +import collections +import contextlib +import itertools +import functools +import os +from typing import Dict, Sequence, List, Iterable, Any, Union +import sympy +from torch._dynamo.utils import counters +from torch._inductor import scheduler, metrics +from torch._inductor.codecache import code_hash +from torch._inductor.codegen.multi_kernel import MultiKernel +from torch._inductor.codegen.simd import DisableReduction, EnableReduction, SIMDKernelFeatures, SIMDKernel +from torch._inductor.codegen.simd import schedule_log, scheduler +from torch._inductor.codegen.triton import (TritonScheduling, log, config) +from torch._inductor.codegen.triton import ( + TritonScheduling, + config, + schedule_log, + get_fused_kernel_name, + get_kernel_category_by_source_code, + Placeholder, + get_kernel_metadata, + get_path, + IndentedBuffer +) +from torch._inductor.utils import sympy_index_symbol, ModularIndexing, FloorDiv, sympy_product +from torch._inductor.virtualized import V +from torch.fx.immutable_collections import immutable_dict +from torch._inductor.dependencies import MemoryDep, StarDep, WeakDep +from torch.utils._ordered_set import OrderedSet +from torch._inductor.codegen.simd import CandidateTiling + +from .triton import NPUIndexTritonKernel, flatten +from .kernel_analysis import ReductionAnalysis +from .npu_kernel_features import NumelList, NPUKernelFeatures +from .split_tiling import SplitTiling +from .triton import NPUIndexTritonKernel +from .. import config as npu_config +from ..lowering_fx import ( + create_fx_from_snodes_by_traced_graph, + create_compile_kwargs, + generate_fx_graph_code, + dump_fx_graph_code +) + +from ..config import log + + +def flatten_groups(nums): + res = [] + for i in nums: + if isinstance(i, Iterable): + for x in i: + res.append(x) + else: + res.append(i) + return res + + +@classmethod +def create_tiling( + cls, pw_tiling: Sequence[sympy.Expr], reduction_tiling: Sequence[sympy.Expr] +) -> Dict[str, sympy.Expr]: + """ + Create a tiling dict from pointwise and reduction splits. 
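+    On NPU the pointwise prefixes are drawn from the tail of ("w", "v", "t", "z", "y", "x")
+    and all reduction splits are collapsed into a single "r" numel, e.g. (illustrative)
+    pw_tiling=[8, 16] with reduction_tiling=[4, 2] yields {"y": 8, "x": 16, "r": 8}.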
+ """ + + pw_tiling = flatten_groups(pw_tiling) + pw_prefixes = ["w", "v", "t", "z", "y", "x"][-len(pw_tiling):] + if len(reduction_tiling) == 0: + reduction_prefixes = [] + else: + reduction_tiling = flatten_groups(reduction_tiling) + reduction_tiling = [NumelList(reduction_tiling).numels()] + reduction_prefixes = ["r"][: len(reduction_tiling)] + tiling = immutable_dict( + list(zip(pw_prefixes, pw_tiling)) + + list(zip(reduction_prefixes, reduction_tiling))) + return tiling + + +class NPUTritonScheduling(TritonScheduling): + def __init__(self, input_scheduler): + super().__init__(input_scheduler) + self.kernel_type = NPUIndexTritonKernel + + def create_kernel_choices( + self, kernel_features: SIMDKernelFeatures, kernel_args, kernel_kwargs + ) -> List[SIMDKernel]: + + return [ + self.kernel_type( + *kernel_args, + **kernel_kwargs, + ) + ] + + # transform indexing before call codegen_node_schedule_with_kernel + def codegen_node_schedule(self, kernel_features: SIMDKernelFeatures, nodes): + node_schedule = kernel_features.node_schedule + tiling = self.select_tiling( + node_schedule, kernel_features.numel, kernel_features.reduction_numel + ) + + kernels = self.create_kernel_choices( + kernel_features, [tiling], {"features": kernel_features} + ) + kernel = kernels[0] + setattr(kernel, "node_schedule", node_schedule) + self.decide_codegen_dims_in_kernel(node_schedule, kernel) + + for kernel in kernels: + self.codegen_node_schedule_with_kernel(node_schedule, kernel) + + MultiKernel.merge_workspaces_inplace(kernels) + for kernel in kernels: + with V.set_kernel_handler(kernel): + src_code = kernel.codegen_kernel() + + V.graph.removed_buffers |= kernel.removed_buffers + V.graph.inplaced_to_remove |= kernel.inplaced_to_remove + + traced_graph_hash = None + if npu_config.dump_fx_graph: + if not npu_config.traced_fx_graph_cache: + npu_config.traced_fx_graph_cache = os.path.join(os.getenv("TORCHINDUCTOR_CACHE_DIR"), + 'traced_fx_graph_cache') + os.makedirs(npu_config.traced_fx_graph_cache, exist_ok=True) + traced_graph, fx_call_args, fx_args, compile_kwargs = create_fx_from_snodes_by_traced_graph(nodes) + if traced_graph is None: + log.warning(f"For nodes {nodes}, could not gen fx graph while dump-graph.") + else: + traced_graph_hash = code_hash(traced_graph.print_readable(print_output=False)) + + kernel_name, src_code = self.define_kernel(src_code, node_schedule, kernel, traced_graph_hash) + + kernel.kernel_name = kernel_name + kernel.code_hash = code_hash(src_code) + del kernel + + final_kernel: Union[SIMDKernel, MultiKernel] + if len(kernels) > 1: + final_kernel = MultiKernel(kernels) + else: + (final_kernel,) = kernels + + with V.set_kernel_handler(final_kernel): + for node in kernel_features.scheduler_nodes(): + node.mark_run() + + self.codegen_comment(node_schedule) + final_kernel.call_kernel(final_kernel.kernel_name) + + if npu_config.dump_fx_graph and traced_graph is not None: + new_compile_kwargs = create_compile_kwargs(final_kernel, fx_call_args, fx_args) + if new_compile_kwargs: + compile_kwargs |= new_compile_kwargs + fx_dump_path = os.path.join(npu_config.traced_fx_graph_cache, traced_graph_hash) + os.makedirs(fx_dump_path, exist_ok=True) + fx_code = generate_fx_graph_code(traced_graph.code, src_code, kernel_name, compile_kwargs) + dump_fx_graph_code(fx_code, fx_dump_path, traced_graph_hash) + + if config.nan_asserts: + final_kernel.codegen_nan_check() + if config.warn_mix_layout: + final_kernel.warn_mix_layout(kernels[0].kernel_name) + + V.graph.removed_buffers |= 
final_kernel.removed_buffers + V.graph.inplaced_to_remove |= final_kernel.inplaced_to_remove + + if ( + V.graph.wrapper_code.supports_intermediate_hooks + and config.generate_intermediate_hooks + ): + # Not every node in the schedule will actually be live on output; + # we can't check dead buffers. + live_outs = kernels[0].args.live_output_buffers() + for node in kernel_features.scheduler_nodes(): + name = node.get_name() + if name not in live_outs: + continue + if node.node is None: + raise RuntimeError("assert node.node is not None") + + origin_node = node.node.get_origin_node() + if origin_node is not None: + counters["inductor"]["intermediate_hooks"] += 1 + V.graph.wrapper_code.writeline( + f"run_intermediate_hooks({origin_node.name!r}, {name})" + ) + + self.scheduler.free_buffers() + + def define_kernel(self, src_code, node_schedule, kernel, traced_graph_hash: str): + wrapper = V.graph.wrapper_code + if (src_code, traced_graph_hash) in wrapper.src_to_kernel: + kernel_name = wrapper.src_to_kernel[(src_code, traced_graph_hash)] + if npu_config.dump_fx_graph: + src_code = src_code.replace(str(Placeholder.DESCRIPTIVE_NAME), kernel_name) + subs_name = kernel_name if config.triton.unique_kernel_names else "triton_" + src_code = src_code.replace(str(Placeholder.KERNEL_NAME), subs_name) + if traced_graph_hash: + src_code = src_code.replace('TRACED_GRAPH_HASH', traced_graph_hash) + src_code = src_code.replace('TRACED_GRAPH_DIR', npu_config.traced_fx_graph_cache) + else: + fused_name = ( + get_fused_kernel_name(node_schedule, config.triton.descriptive_names) + if config.triton.descriptive_names + else "" + ) + kernel_category = get_kernel_category_by_source_code(src_code)[:3] + kernel_name = "_".join( + ["triton", kernel_category, fused_name, wrapper.next_kernel_suffix()] + ) + # use the original src_code as the key + wrapper.src_to_kernel[(src_code, traced_graph_hash)] = kernel_name + subs_name = kernel_name if config.triton.unique_kernel_names else "triton_" + + # DESCRIPTIVE_NAME is used for profiling purposes; it shows the full kernel name + # even when unique_kernel_names is turned off. Meanwhile, KERNEL_NAME is sometimes set + # to "triton_" to maximize caching opportunities (when unique_kernel_names = False). + src_code = src_code.replace(str(Placeholder.DESCRIPTIVE_NAME), kernel_name) + src_code = src_code.replace(str(Placeholder.KERNEL_NAME), subs_name) + if traced_graph_hash: + src_code = src_code.replace('TRACED_GRAPH_HASH', traced_graph_hash) + src_code = src_code.replace('TRACED_GRAPH_DIR', npu_config.traced_fx_graph_cache) + + src_code = src_code.replace("#pragma CMT", "#") + + basename, _, kernel_path = get_path(code_hash(src_code.strip()), "py") + + compile_wrapper = IndentedBuffer() + compile_wrapper.writeline(f"async_compile.triton({subs_name!r}, '''") + compile_wrapper.splice(src_code, strip=True) + current_device = V.graph.get_current_device_or_throw() + compile_wrapper.writeline(f"''', device_str='{current_device.type}')") + + metadata_comment = f"# kernel path: {kernel_path}" + origins, detailed_origins = get_kernel_metadata(node_schedule, wrapper) + metadata_comment += "\n" + origins + "\n" + detailed_origins + # Extra debug message for npu. 
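+        # Illustrative shape of the extra line (the exact repr depends on the schedule):
+        #   # SchedulerNodes: [SchedulerNode(name='op0'), SchedulerNode(name='op1')]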
+ snode_str = "" + snodes = [node for node in node_schedule if node not in (DisableReduction, EnableReduction)] + snode_str = f"\n# SchedulerNodes: {snodes}" + metadata_comment += snode_str + "\n" + if npu_config.dump_fx_graph: + from ..lowering_fx import snodes_to_fx + gm = snodes_to_fx.get(str(snodes), "") + gm_str = "\n# Graph Module str:\n" + gm_str += "\n".join([f"# {line}" for line in gm.split("\n")]) + metadata_comment += gm_str + "\n" + + wrapper.define_kernel( + kernel_name, compile_wrapper.getvalue(), metadata_comment + ) + + # log kernel metadata for offline analysis. + # E.g. one can find all unaligned inner reduction and check if + # padding helps with the perf kernel by kernel. + if metrics.is_metric_table_enabled("kernel_metadata"): + metrics.log_kernel_metadata(kernel_name, kernel_path, src_code) + + return kernel_name, src_code + + def codegen_node( + self, node: Union[scheduler.FusedSchedulerNode, scheduler.SchedulerNode] + ): + """ + Given a set of pre-fused nodes, generate a Triton kernel. + """ + + nodes: List[scheduler.SchedulerNode] = node.get_nodes() # type: ignore[assignment] + _, (numel, rnumel) = max(nodes, key=lambda x: int(x.is_reduction())).group + + node_schedule = self.generate_node_schedule(nodes, numel, rnumel) + schedule_log.debug("Schedule:\n %s", node_schedule) + + return self.codegen_node_schedule( + NPUKernelFeatures(node_schedule, numel, rnumel), nodes + ) + + def decide_codegen_dims_in_kernel(self, node_schedule, kernel): + def current_reduction_nodes(nodes): + return itertools.takewhile(lambda n: n is not DisableReduction, nodes) + + with kernel: + # 1. transform dims: create new dims to substitute floor_divide and modular expression + stack = contextlib.ExitStack() + for _, node in enumerate(node_schedule): + if node is DisableReduction: + stack.enter_context(kernel.disable_reduction()) + elif node is EnableReduction: + stack.close() + else: + index_vars = kernel.split_and_set_ranges(node.get_ranges()) + node._body.transform_dims_in_indexing(index_vars) + # 2. 
go through range_tree_nodes to findout, to find one axis could be substituted by others + self.additional_nodes_to_be_subs(kernel, kernel.range_tree_nodes_substituted) + # 3.do the substitution on all indexing + for node in node_schedule: + if node in (EnableReduction, DisableReduction): + continue + indexing = node._body.indexing + node._body.substituted_dims_in_indexing(indexing, kernel, kernel.range_tree_nodes_substituted) + + # 4.remove the substituted dims from kernel + for var, _ in kernel.range_tree_nodes_substituted.items(): + if (var in kernel.range_tree_nodes): + root = kernel.range_tree_nodes[var].parent + root.remove_entry(var) + # select split and tiling axis + split_tiling = SplitTiling(kernel) + split_tiling.select_split_tiling_axis() + kernel.load_store_indexing = split_tiling.indexing + # ReductionAnalysis depends on kernel.load_store_indexing + if kernel.inside_reduction: + kernel.reduce_analysis = ReductionAnalysis(kernel) + + def additional_nodes_to_be_subs(self, kernel, node_to_be_substituted): + for node in kernel.range_tree_nodes.values(): + if node.expr != sympy_index_symbol(f"{node.parent.prefix}index") \ + or len(node.parent.var_ranges) == 1 \ + or node.symbol() in node_to_be_substituted: + continue + numel = sympy.Integer(1) + new_var_expr = sympy.Integer(0) + for k, s in node.parent.var_ranges.items(): + if k == node.symbol(): + continue + numel = numel * s + sub_node = kernel.range_tree_nodes[k] + new_var_expr = new_var_expr + sub_node.symbol() * sub_node.divisor + + if numel == node.length: + node_to_be_substituted[node.symbol()] = [(node.length, new_var_expr)] + else: + log.warning("sub nodes (expr%s, numel:%d) can not make up parent node(%s:%d)", + new_var_expr, numel, node.symbol(), node.length) + + @classmethod + @functools.lru_cache(32) + def candidate_tilings(cls, node, numel, reduction_numel) -> list[CandidateTiling]: + """ + The main difference from gpu is default tiling, npu needs non-collapse ranges. 
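+        A zero-score candidate built from the original, non-collapsed ranges is always
+        emitted first; stride-based candidates are appended when a contiguous split is found.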
+ """ + is_pointwise = reduction_numel == 1 + + def assert_true(cond, msg=""): + if not cond: + raise AssertionError(msg) + + def tile_ranges(is_pointwise: bool, ranges, rw) -> list[CandidateTiling]: + assert_true(len(rw.range_vars) == len(ranges), f"{rw.range_vars=} {ranges=}") + + dep_sources = [rw.reads, rw.writes] + assert_true(all( + isinstance(dep, (MemoryDep, StarDep)) + for dep in itertools.chain.from_iterable(dep_sources) + )) + deps = [ + dep + for dep in itertools.chain.from_iterable(dep_sources) + if dep.name not in V.graph.removed_buffers + and isinstance(dep, MemoryDep) + ] + write_names = OrderedSet([dep.name for dep in rw.writes]) + + def collapse_ranges(ranges: Sequence[sympy.Expr]) -> sympy.Expr: + return V.graph.sizevars.simplify(sympy_product(ranges)) + + tilings = [ + CandidateTiling( + tiling=cls.create_partial_tiling( + ranges, is_pointwise + ), + name="none", + score=0, + ) + ] + + for dep in deps: + strides = V.graph.sizevars.stride_hints(dep.index, rw.range_vars) + assert_true(len(strides) == len(ranges)) + try: + split = strides.index(1) + 1 + if split == len(ranges): + continue + if all(s == 0 for s in strides[split:]): + continue + + except ValueError: + continue + + tiled_groups = ( + collapse_ranges(ranges[:split]), + collapse_ranges(ranges[split:]), + ) + + # score by number of elements + score = V.graph.sizevars.size_hint( + sympy_product( + size for size, stride in zip(ranges, strides) if stride != 0 + ) + ) + if dep.name in write_names: + # ngimel said contiguous writes is more important than reads + score *= 2 + if CandidateTiling.is_good_size(tiled_groups[0]): + score *= 2 + if CandidateTiling.is_good_size(tiled_groups[1]): + score *= 2 + + if ( + V.graph.sizevars.size_hint( + score - sympy_product(itertools.chain(ranges, reduction_ranges)) + ) + >= 0 + ): + tilings.append( + CandidateTiling( + tiling=cls.create_partial_tiling( + [ + collapse_ranges(ranges[:split]), + collapse_ranges(ranges[split:]), + ], + reduction_numel, + ), + score=score, + name=dep.name, + ) + ) + + return tilings + + pointwise_ranges, reduction_ranges = node.get_ranges() + if len(pointwise_ranges) <= 1 and len(reduction_ranges) <= 1: + return [] + + # Tile either pointwise or reduction dims. + pointwise_ranges, reduction_ranges = node.get_ranges() + partial_tilings = tile_ranges( + is_pointwise, + pointwise_ranges if is_pointwise else reduction_ranges, + node.pointwise_or_reduction_read_writes(is_pointwise), + ) + + # Fill in the missing ranges. 
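+        # Each partial tiling is completed into a full tiling dict that covers both
+        # `numel` and `reduction_numel`, keeping the candidate's score and name.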
+ full_tilings = [ + CandidateTiling( + tiling=cls.complete_partial_tiling( + tiling.tiling, numel, reduction_numel + ), + score=tiling.score, + name=tiling.name, + ) + for tiling in partial_tilings + ] + + return full_tilings \ No newline at end of file diff --git a/torch_npu/_inductor/codegen/split_tiling.py b/torch_npu/_inductor/codegen/split_tiling.py new file mode 100644 index 0000000000..782cc9f745 --- /dev/null +++ b/torch_npu/_inductor/codegen/split_tiling.py @@ -0,0 +1,283 @@ +from functools import reduce +import sympy as sympy +from torch._inductor.codegen.simd import (EnableReduction, DisableReduction) +from torch._inductor.codegen.triton import TritonKernel +from torch._inductor.loop_body import MemoryUsageType +from torch._inductor.runtime.runtime_utils import next_power_of_2 +from torch._inductor.utils import ModularIndexing, sympy_subs +from torch._inductor.virtualized import V + +from .kernel_analysis import IndexAnalysis +from .triton_utils import get_aligned_numel +from ..config import num_vector_core, log + + +# split and tiling axis selector +class SplitTiling: + def __init__(self, kernel: TritonKernel): + self.kernel = kernel + self.indexing = [] # load and store indexing among all scheduler nodes + kernel.sorted_axis = [x for x in kernel.range_tree_nodes.values()] + kernel.sorted_axis.sort(reverse=True, key=self.key) + for i, dim in enumerate(kernel.sorted_axis): + dim.sorted_order = i + + self.find_lowest_dimension() + self.should_outer_reduce = False + self.possible_need_permute = self.find_possible_permutes() + + def find_possible_permutes(self): + if len(self.kernel.low_dims) <= 1: + return False + var_lists = [] + low_dims = [self.kernel.sorted_axis[x].symbol() for x in self.kernel.low_dims] + for index in self.indexing: + var_stride = [ + (key, coeff) + for key, coeff in index.as_coefficients_dict().items() + if not isinstance(key, sympy.Integer) + ] + var_stride.sort(key=lambda x: x[1]) + var_list = tuple([x[0] for x in var_stride if x[0] in low_dims]) + var_lists.append(var_list) + for i, var_list in enumerate(var_lists): + if len(var_list) < len(low_dims): + continue + for j, other in enumerate(var_lists): + if i == j or len(other) < len(low_dims): + continue + if var_list != other: + return True + return False + + @classmethod + def key(cls, x): + # to be higher than x and y + if x.name[0] == 'w' or x.name[0] == 'v' or x.name[0] == 't': + return "zz" + x.name + # to be lower than floor_dir + elif isinstance(x.expr, ModularIndexing): + return x.name[0] + "0" + x.name[1:] + else: + return x.name + + @classmethod + def total_split_numels(cls, axis_list): + numels = [x.length for x in axis_list] + return reduce(lambda x, y: x * y, numels) if numels else 1 + + # Split 原则1 :先做维度合并,再切分 。通过维度合并降维降低split和tiling轴选择策略的复杂性 。 + # Split 原则2 : 切分轴尽量选择高维度的轴, 这样load/store 能够有比较好的线性度 , + # Split 原则3 : 规约轴和低维轴不应选为切分轴 。但如果高维规约类融合算子,而且高维尺寸非常大( >= 64KB),其他维度不足以支持切分,可以考虑对规约轴切分。 + # Split 原则4 :切分轴的总numel 要超过 aicore总数。切分轴的数量最好不要超过3个(triton 最多支持三维发射), 因此 如果一点要超, 需要维度合并。 + def select_split_axis(self): + self.kernel.split_axis.clear() + + # total numel exceed aicore or total split axis exceed 3 + def meet_stop_condition(): + if self.total_split_numels(self.kernel.split_axis) >= num_vector_core: + return True + if len(self.kernel.split_axis) == 3: + return True + return False + + def select_one_split_axis(not_reduction=True, not_low_dims=True): + for axis in self.kernel.sorted_axis: + if not_reduction and axis.prefix == "r": + continue + if not_low_dims and axis.sorted_order in 
self.kernel.low_dims: + continue + if axis in self.kernel.split_axis: + continue + axis.is_split_axis = True + return axis + return None + + count = 0 + while not meet_stop_condition(): + count += 1 + axis = select_one_split_axis(not_reduction=True, not_low_dims=True) + if axis is not None: + self.kernel.split_axis.append(axis) + continue + axis = select_one_split_axis(not_reduction=True, not_low_dims=False) + if axis is not None: + self.kernel.split_axis.append(axis) + continue + if count > 10: + break + + if not self.kernel.split_axis and self.kernel.sorted_axis: + self.kernel.split_axis.append(self.kernel.sorted_axis[0]) + + self.kernel.split_axis.sort(reverse=True, key=self.key) + for i, x in enumerate(self.kernel.split_axis): + x.split_order = i + + # Tiling 原则1:load / store 中索引表达式的中的低维轴都要成为tiling 轴. + # Tiling 原则2:对于规约算子,规约轴要成为tiling轴。 + # Tiling 原则3: 多维规约, 只有规约轴可以被选择为tiling轴 + # Tiling 原则4: tiling轴 要覆盖 total numel 的 80% + + # two tiling axis might be insufficient when there're 3 or more low-dims in indexing + def select_tiling_axis(self): + self.kernel.tiling_axis.clear() + + # cover the biggest axis and not exceed 3 axis + def meet_stop_condition(): + total_numel = reduce(lambda x, y: x + y, + map(lambda x: x.length, self.kernel.sorted_axis)) if self.kernel.sorted_axis else 1 + tiling_numel = reduce(lambda x, y: x + y, + map(lambda x: x.length, self.kernel.tiling_axis)) if self.kernel.tiling_axis else 1 + if self.kernel.numof_reduction_axis() > 1 and all( + self.kernel.range_tree_nodes[var].is_tiling_axis for var in self.kernel.reduction_axis_list()): + return True + # currently, the maximum dim that triton-ascend support is 2 + max_transpose_dims = 2 + if (self.possible_need_permute or tiling_numel / total_numel >= 0.8) and \ + len(self.kernel.tiling_axis) >= min(max_transpose_dims, len(self.kernel.sorted_axis)): + return True + return False + + def select_tiling(low_dim=True, reduction=True): + for axis in reversed(self.kernel.sorted_axis): + if low_dim and axis.sorted_order in self.kernel.low_dims and axis not in self.kernel.tiling_axis: + axis.is_tiling_axis = True + self.kernel.tiling_axis.append(axis) + if reduction and axis.prefix == 'r' and axis not in self.kernel.tiling_axis: + axis.is_tiling_axis = True + self.kernel.tiling_axis.append(axis) + if low_dim or reduction: + continue + # using principle 4, select one longest + longest = axis # self.find_longest_dimension(check_in_tiling = True) + if longest and longest not in self.kernel.tiling_axis: + self.kernel.tiling_axis.append(longest) + longest.is_tiling_axis = True + if meet_stop_condition(): + break + + select_tiling(low_dim=True, reduction=True) + count = 0 + while not meet_stop_condition(): + select_tiling(low_dim=False, reduction=False) + count += 1 + if count > 10: + break + self.kernel.tiling_axis.sort(reverse=True, key=self.key) + for i, x in enumerate(self.kernel.tiling_axis): + x.tiling_order = i + + def select_split_tiling_axis(self): + self.select_split_axis() + self.select_tiling_axis() + + # the below logic doesn't work when there're two reduction axis, but only one need outer reduction + def should_outer_reduce_me(self, x): + should_outer = self.kernel.is_higher_order_reduction(True) and SplitTiling.great_than(x.length, + 32768) and x.is_loop + if should_outer: + self.should_outer_reduce = True + self.kernel.split_axis = x + self.kernel.split_axis.is_split_axis = True + return should_outer + + def find_longest_dimension(self, check_in_tiling=False): + longest = None + for axis in self.kernel.sorted_axis: 
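+            # keep the axis with the largest length, optionally skipping axes that
+            # were already selected as tiling axes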
+ if (longest is None or axis.length > longest.length) and \ + (not check_in_tiling or axis not in self.kernel.tiling_axis): + longest = axis + return longest + + # return True when x is the low-dim in indexing + def is_lowest_dimension(self, x): + return x.sorted_order in self.kernel.low_dims + + def find_lowest_dimension(self): + def construct_low_dim(): + for index in self.indexing: + coefficients_dict = index.as_coefficients_dict() + for key, value in coefficients_dict.items(): + if not key.free_symbols: + continue + key = list(key.free_symbols)[0] + if key not in self.kernel.range_tree_nodes: + continue + + if value == sympy.Integer(1): + axis = self.kernel.range_tree_nodes[key] + self.kernel.low_dims.add(axis.sorted_order) + + # all read index should be considered + buf_names = [ + node.node.name + for node in self.kernel.node_schedule + if node not in (EnableReduction, DisableReduction) + ] + for node in self.kernel.node_schedule: + if node in (EnableReduction, DisableReduction): + continue + names = [] + + for read in node._body.memory_usage[MemoryUsageType.LOAD]: + name = read.index_name + arg = read.buffer_name + read_is_inptr = False if arg[:3] != 'arg' and arg in buf_names else True + if read_is_inptr: + names.append(name) + for key, index in node._body.indexing.items(): + if key in names and index not in self.indexing: + self.indexing.append(index) + + if self.kernel.inside_reduction: + construct_low_dim() + return + + # for non-reduction, write index should be considered + for node in self.kernel.node_schedule: + if node in (EnableReduction, DisableReduction): + continue + names = [] + for write in node._body.memory_usage[MemoryUsageType.STORE]: + names.append(write.index_name) + for write in node._body.memory_usage[MemoryUsageType.STORE_REDUCTION]: + names.append(write.index_name) + for key, index in node._body.indexing.items(): + if key in names and index not in self.indexing: + self.indexing.append(index) + + construct_low_dim() + + @staticmethod + def convert(x, y): + xnumel = x + ynumel = y + if isinstance(xnumel, (sympy.Symbol, sympy.Expr)) and not isinstance(xnumel, sympy.Integer): + xnumel = xnumel.subs(V.graph.sizevars.var_to_val) + + if isinstance(ynumel, (sympy.Symbol, sympy.Expr)) and not isinstance(ynumel, sympy.Integer): + ynumel = ynumel.subs(V.graph.sizevars.var_to_val) + + if isinstance(xnumel, sympy.Integer) and isinstance(ynumel, int): + ynumel = sympy.Integer(ynumel) + + if isinstance(ynumel, sympy.Integer) and isinstance(xnumel, int): + xnumel = sympy.Integer(xnumel) + + return (xnumel, ynumel) + + @staticmethod + def less_than(x, y): + xnumel, ynumel = SplitTiling.convert(x, y) + return xnumel < ynumel + + @staticmethod + def great_than(x, y): + xnumel, ynumel = SplitTiling.convert(x, y) + return xnumel > ynumel + + @staticmethod + def ge_than(x, y): + xnumel, ynumel = SplitTiling.convert(x, y) + return xnumel >= ynumel diff --git a/torch_npu/_inductor/codegen/tile_generator.py b/torch_npu/_inductor/codegen/tile_generator.py new file mode 100644 index 0000000000..31c002800c --- /dev/null +++ b/torch_npu/_inductor/codegen/tile_generator.py @@ -0,0 +1,242 @@ +import copy +import functools +import math +import sys +from torch._inductor.runtime.runtime_utils import next_power_of_2 +from torch._inductor.runtime.triton_heuristics import Config + +from .triton_utils import byte_per_numel +from ..config import num_vector_core + + +# generate tiling configs +class TileGenerator: + + def __init__(self, numels, axis_names, tiling_axis, split_axis, low_dims, 
persistent_reduction, + configs, dtype, dual_reduction=False): + self.numels = numels.copy() + + self.blocks = [x for x in self.numels] + self.candidate_blocks = [] + self.sub_blocks = self.blocks.copy() + self.axis_name = axis_names + self.tiling_axis = tiling_axis + self.split_axis = split_axis + self.low_dims = low_dims + self.configs = configs + self.dtype_bytes = self.get_byte_per_numel(dtype) + self.stop_numel = 1024 // self.dtype_bytes + self.block_name = {} + self.sub_block_name = {} + self.persistent_reduction = persistent_reduction + self.dual_reduction = dual_reduction + for axis, name in enumerate(self.axis_name): + if axis not in tiling_axis and axis not in split_axis: + self.blocks[axis] = 1 + self.sub_blocks[axis] = 1 + continue + if axis in self.split_axis: + self.block_name[axis] = f"{name.upper()}BLOCK" + if axis in self.tiling_axis: + self.sub_block_name[axis] = f"{name.upper()}BLOCK_SUB" + + @classmethod + def aligned_numel(cls, numel): + aligned = next_power_of_2(numel) + return aligned + + @classmethod + def get_byte_per_numel(cls, dtype): + if dtype is None: + return 1 + return byte_per_numel[dtype] + + def valid_tile_numel(self, total_numel): + byte_num = self.dtype_bytes + max_numel = 16384 * 4 // byte_num + return total_numel <= max_numel + + def calculate_config_numel(self, config): + total_numel = 1 + for axis in self.tiling_axis: + total_numel = total_numel * config[self.sub_block_name[axis]] + return total_numel + + def calculate_total_numel(self): + smallest = sys.maxsize + + def calculate_total_numel_candi(blocks): + total_numel = 1 + for axis in self.tiling_axis: + total_numel = total_numel * self.sub_blocks[axis] + return total_numel + + for candi_blocks in self.candidate_blocks: + numel = calculate_total_numel_candi(candi_blocks) + if numel < smallest: + smallest = numel + return smallest + + def fill_config(self, config, blocks): + for axis in self.split_axis: + config[self.block_name[axis]] = blocks[axis] + for axis in self.tiling_axis: + tiling_numel = self.aligned_numel(self.sub_blocks[axis]) + config[self.sub_block_name[axis]] = tiling_numel + + def find_config(self, cfg): + for config in self.configs: + if config.kwargs == cfg: + return True + return False + + def add_to_configs(self, candi_block): + newcfg = {} + self.fill_config(newcfg, candi_block) + total_numel = self.calculate_config_numel(newcfg) + if self.valid_tile_numel(total_numel) and not self.find_config(newcfg): + self.configs.append(Config(newcfg, num_warps=1, num_stages=1)) + + def descend_one_axis(self, axis, is_split=False): + def calc_total_programs(): + grids = [] + for axis in self.split_axis: + numel = self.numels[axis] + block_size = self.blocks[axis] + programs = (numel + block_size - 1) // block_size + grids.append(programs) + + total_programs = functools.reduce(lambda x, y: x * y, grids) if grids else 1 + return total_programs + + reached_stop_numel = False + slow_decend_split = False + + while True: + total_numel = self.stop_numel + 100 + for candi_block in self.candidate_blocks: + self.add_to_configs(candi_block) + + # tile numel reached threshold + total_numel = self.calculate_total_numel() + if total_numel <= self.stop_numel: + self.add_to_configs(self.blocks) + reached_stop_numel = True + break + + numel = self.blocks[axis] if is_split else self.sub_blocks[axis] + if numel == 1: + self.add_to_configs(self.blocks) + break + + if is_split: + if self.persistent_reduction and self.axis_name[axis][0] == "r": + reached_stop_numel = True + break + total_programs = 
calc_total_programs() + if total_programs > num_vector_core: + break + if total_programs > num_vector_core // 2 or self.dual_reduction: + if len(self.candidate_blocks) > 2: + self.candidate_blocks.pop(0) + self.candidate_blocks.append(tuple(self.blocks)) + + self.blocks[axis] = numel // 2 + self.sub_blocks[axis] = self.blocks[axis] + total_programs = calc_total_programs() + if total_programs > num_vector_core: + slow_decend_split = True + step = numel // 4 if numel // 4 > 1 else 1 + self.blocks[axis] = numel // 2 if not slow_decend_split else numel - step + self.sub_blocks[axis] = self.blocks[axis] + else: + if numel >= 128: + self.sub_blocks[axis] = next_power_of_2(numel // 2) + else: # numel >4 and numel < 128 : + self.slow_descend_axis(axis) + return reached_stop_numel + + def slow_descend_axis(self, axis): + numel = self.sub_blocks[axis] + self.sub_blocks[axis] = self.aligned_numel(numel // 2) + + def descend_all_low_dims(self): + low_dim_numels = [self.sub_blocks[x] for x in self.low_dims] + if not low_dim_numels: + return + + def descent_all_axis(min_numel): + for axis in self.low_dims: + if self.axis_name[axis][0] == "r" and self.persistent_reduction: + continue + numel = self.sub_blocks[axis] + if numel == 1: + continue + if min_numel > 1 and abs(numel - min_numel) / min_numel < 0.2: + continue + if numel >= 128: + self.sub_blocks[axis] = next_power_of_2(numel // 2) + else: # numel >4 and numel < 128 : + self.slow_descend_axis(axis) + + count = 0 + total_numel = self.calculate_total_numel() + while total_numel > self.stop_numel and count < 100: + count += 1 + total_numel = self.calculate_total_numel() + for candi_block in self.candidate_blocks: + self.add_to_configs(candi_block) + min_numel = min(low_dim_numels) + descent_all_axis(min_numel) + total_numel_2 = self.calculate_total_numel() + if total_numel == total_numel_2: + descent_all_axis(0) + + return total_numel < self.stop_numel + + def descend_split_tiling(self): + + tiling_not_low_dims = [x for x in self.tiling_axis if x not in self.low_dims] + + def descend_split_axis(): + + for axis in self.split_axis: + if self.descend_one_axis(axis, is_split=True): + return True + + total = self.calculate_total_numel() + return total <= self.stop_numel + + def desceond_tiling_not_low_dims(): + for axis in tiling_not_low_dims: + if self.axis_name[axis][0] == "r" and self.persistent_reduction: + continue + if self.descend_one_axis(axis): + return True + total = self.calculate_total_numel() + return total <= self.stop_numel + + # need to all low dims fairly + def descend_low_dims(): + for axis in self.tiling_axis: + if self.axis_name[axis][0] == "r" and self.persistent_reduction: + continue + if axis in tiling_not_low_dims: + continue + if self.descend_one_axis(axis): + return True + total = self.calculate_total_numel() + return total <= self.stop_numel + + while True: + # descend split axis + if descend_split_axis(): + break + if len(self.candidate_blocks) > 0: + self.sub_blocks = list(self.candidate_blocks[0]) + # descend tiling but not low dims + if desceond_tiling_not_low_dims(): + break + # descend low dims, need to descend all axis at the same time + self.descend_all_low_dims() + break diff --git a/torch_npu/_inductor/codegen/triton.py b/torch_npu/_inductor/codegen/triton.py new file mode 100644 index 0000000000..cf2b2a4f92 --- /dev/null +++ b/torch_npu/_inductor/codegen/triton.py @@ -0,0 +1,1953 @@ +import functools +import itertools +import operator +import os +import re +import textwrap +from enum import Enum +from typing import 
List, Set, Iterable, Callable, Sequence +from typing import ( + Optional, + Union, + Tuple, + Any, + cast, + Dict +) +import sympy +import torch +from torch._inductor import config, ir +from torch.utils._ordered_set import OrderedSet +from torch._inductor.codegen.common import ( + IndentedBuffer, + SizeArg, + DeferredLine, + ArgName +) +from torch._inductor.codegen.common import free_symbol_is_type +from torch._inductor.codegen.simd import CantSplit, DisableReduction, EnableReduction +from torch._inductor.codegen.triton import ( + IndexingOptions, + triton_reshape, + TritonCSEVariable, +) +from torch._inductor.ops_handler import OpsHandler +from torch._inductor.codegen.triton import ( + TritonKernel, + TritonKernelOverrides, + IterationRangesRoot, + IterationRangesEntry, + CSEVariable, + gen_common_triton_imports, + BlockPtrOptions, + triton_acc_type, + constant_repr, + is_welford_reduction, FixedTritonConfig, + prefix_is_reduction, upcast_acc_dtype, + get_kernel_category_by_source_code, + get_fused_kernel_name +) +from torch._inductor.codegen.triton_utils import config_of, signature_of, signature_to_meta +from torch._inductor.dtype_propagation import DtypePropagationOpsHandler +from torch._inductor.runtime.hints import ReductionHint +from torch._inductor.runtime.runtime_utils import next_power_of_2 +from torch._inductor.scheduler import SchedulerNode +from torch._inductor.utils import ( + Placeholder, + get_bounds_index_expr, + upcast_compute_type, + sympy_product +) +from torch._inductor.utils import sympy_index_symbol, generate_assert +from torch._inductor.utils import sympy_subs +from torch._inductor.virtualized import ( + V, + StoreMode, + ReductionType, + _ops as ops, +) +from torch.utils import _pytree as pytree +from torch.utils._sympy.functions import FloorDiv, Identity, ModularIndexing +from torch.utils._sympy.numbers import int_oo +from torch.utils._sympy.symbol import SymT, symbol_is_type +from torch.utils._sympy.value_ranges import bound_sympy, ValueRanges +from torch._inductor.bounds import ValueRangeAnalysis +from torch._inductor.runtime import triton_heuristics + +from .. import config as inductor_npu_config + +from .kernel_analysis import IndexAnalysis, ReductionAnalysis +from .npu_kernel_features import NumelList +from ..runtime import NPUDeviceProperties +from .. import npu_triton_heuristics + + +def flatten(nums): + res = [] + for i in nums: + if isinstance(i, list): + res.extend(flatten(i)) + else: + res.append(i) + return res + + +class NPUTritonKernelOverrides(TritonKernelOverrides): + + @staticmethod + def exp(x): + return f"tl_math.exp({x})" + + @staticmethod + def sqrt(x): + return f"tl_math.sqrt({x})" + + @staticmethod + def tanh(x): + return f"tl_math.tanh({x})" + + @staticmethod + def rsqrt(x): + return f"tl.rsqrt({x})" + + @staticmethod + def floor(x): + return f"tl_math.floor({x})" + + @staticmethod + def erf(x): + return f"tl_math.erf({x})" + + @staticmethod + def ceil(x): + return f"tl_math.ceil({x})" + + @classmethod + def index_expr(cls, expr, dtype): + indexing = V.kernel.indexing(expr, block_ptr=False, is_index_expr=True) + if not isinstance(indexing, IndexingOptions): + raise TypeError(f"not a IndexingOptions : {indexing}") + + # Our sympy expr printing casts to the current kernel index dtype. 
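+        # (this module's select_index_dtype override returns "tl.int32", so index_dtype
+        # is normally torch.int32 here.)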
+ # we only respect non int32-int64 dtypes and otherwise use current kernel indexing dtype + index_dtype = torch.int32 if V.kernel.index_dtype == "tl.int32" else torch.int64 + dtype = dtype if dtype not in (torch.int32, torch.int64) else index_dtype + var = V.kernel.cse.generate( + V.kernel.compute, + indexing.index_str, + bounds=get_bounds_index_expr(expr), + dtype=dtype, + ) + + if dtype not in (torch.int32, torch.int64): + var = V.kernel.cse.generate( + V.kernel.compute, + cls.to_dtype(var, dtype), + dtype=upcast_compute_type(dtype), + ) + else: + # We are not always consistent in enforcing that the output of the index expr printing + # results in the indexing dtype. So if we detect that we have an input which might type promote + # to a dtype other than indexing dtype, add a cast. + # Trying to avoid + dtype = index_dtype + for index_var in expr.free_symbols: + if symbol_is_type(index_var, SymT.TMP): + dtype = torch.promote_types( + dtype, V.kernel.cse.varname_map[index_var.name].dtype + ) + + if dtype != index_dtype: + var = V.kernel.cse.generate( + V.kernel.compute, + cls.to_dtype(var, index_dtype), + dtype=index_dtype, + ) + + var.mask_vars = indexing.mask_vars + return var + + +def group_fn(self, sizes): + groups = list() + for s in sizes: + if not s: + groups.append(1) + elif isinstance(s, list): + group = flatten(s) + groups.append(NumelList(tuple(group)) if isinstance(group, list) else group) + else: + groups.append(s) + return tuple(groups) + + +@staticmethod +def select_index_dtype(node_schedule, numel, reduction_numel): + return "tl.int32" + + +class IterationRangesEntryNPUIndex(IterationRangesEntry): + def __init__( + self, + *args, **kwargs): + super().__init__(*args, **kwargs) + self.is_tiling_axis = False + self.is_split_axis = False + self.indexing_code = IndentedBuffer() + self.sorted_order = None + self.tiling_order = None + self.split_order = None + self.var_directions = {} + self.directions = [] + # don't use functools.lru_cache(None), so that previous indexing_code produdec by previous index, + # could be overwritten + self.codegen = self._codegen + + # axis mask + def _codegen_mask(self): + + if self.is_tiling_axis: + BLOCK_NAME = f"{self.name.upper()}BLOCK" + upper = f"min({BLOCK_NAME}+{self.symbol()}_offset, {self.name}_numel)" if self.is_split_axis else f"{self.name}_numel" + line = f"{self.name}_mask = {self.name} < {upper}" + self.writeline(line) + for var in self.var_directions.keys(): + line = f"{var.name}_mask = {var.name} < {upper}" + self.writeline(line) + else: + pass + + def get_axis_direction(self): + + # assume self.golden_var_list is to be correct axis order + + if self.directions: + return f"[{','.join(self.directions)}]" + tiling_axis = [x.symbol() for x in self.kernel.tiling_axis] + + rev_orders = [x for x in self.kernel.golden_var_list if x in tiling_axis] + self.directions = ["None"] * len(tiling_axis) + if len(tiling_axis) != len(rev_orders): + raise RuntimeError(f"assert tiling len={len(tiling_axis)}, not equal to golden varlist len ={len(rev_orders)}") + var_orders = list(reversed(rev_orders)) + index = var_orders.index(self.symbol()) + self.directions[index] = ":" + return f"[{','.join(self.directions)}]" + + # axis var, need to define var with diffent direction + def _codegen(self): + self.indexing_code.clear() + index = None + # for multiple reduce dims, don't need this + if not self.is_tiling_axis: + return self.name + + direction = self.get_axis_direction() + index = f"{self.name} = {self.codegen_index(direction)}" + for var, dir_index 
in self.var_directions.items(): + line = f"{var.name} = {self.codegen_index(dir_index)}" + self.writeline(line) + + # reduction axis + if self.prefix == 'r': + if V.kernel.inside_reduction and V.kernel.current_node \ + and isinstance(V.kernel.current_node, SchedulerNode) \ + and V.kernel.current_node.node \ + and V.kernel.current_node.node.data \ + and isinstance(V.kernel.current_node.node.data, ir.Reduction): + reduction_type = V.kernel.current_node.node.data.reduction_type + if reduction_type in {"argmax", "argmin"}: + self.writeline(f"{self.parent.prefix}index = " + f"{self.codegen_index(None)}") + if index: + self.writeline(index) + self._codegen_mask() + return self.name + + def writeline(self, line): + self.indexing_code.writeline(line) + + def is_1d_persisent_reduction(self): + return len(V.kernel.tiling_axis) == 1 and V.kernel.persistent_reduction + + def codegen_index(self, direction): + BLOCK_NAME = f"{self.name.upper()}BLOCK" + BLOCK_NAME_SUB = f"{BLOCK_NAME}_SUB" + index = None + if self.prefix == 'r': + if V.kernel.persistent_reduction: + if self.is_1d_persisent_reduction(): + index = f"tl.arange(0, {BLOCK_NAME_SUB})" + else: + index = f"base_{self.name}" + else: + index = f"(loop_{self.name} * {BLOCK_NAME_SUB}) + base_{self.name}" + else: + if self.is_split_axis: + offset = f"{self.symbol()}_offset" + index = f"{offset} + (loop_{self.name} * {BLOCK_NAME_SUB}) + base_{self.name}" + else: + index = f"(loop_{self.name} * {BLOCK_NAME_SUB}) + base_{self.name}" + + if len(V.kernel.tiling_axis) > 1 and direction is not None: + index += direction + + return index + + def codegen_header(self, code): + # generate offset index loop + lines = [] + BLOCK_NAME = f"{self.name.upper()}BLOCK" + BLOCK_NAME_SUB = f"{BLOCK_NAME}_SUB" + + if self.is_1d_persisent_reduction(): + return + + if self.is_split_axis: + lines.append(f"{self.symbol()}_offset = tl.program_id({self.split_order}) * {BLOCK_NAME}") + + if self.is_tiling_axis: + lines.append(f"base_{self.name}= tl.arange(0, {BLOCK_NAME_SUB})") + block = f"{BLOCK_NAME}" if self.is_split_axis else f"{self.symbol()}_numel" + lines.append(f"loops_{self.name} = ({block} + {BLOCK_NAME_SUB} - 1) // {BLOCK_NAME_SUB}") + + else: + pass + + code.writelines(lines) + + def precomputed_args(self): + # for dynamic shapes, find parts of indexing expressions that have to be precomputed + precomputed_args: List[sympy.Expr] = [] + if isinstance(self.expr, (sympy.Symbol, sympy.Integer)): + return precomputed_args + + if not isinstance(self.expr, (FloorDiv, ModularIndexing)): + raise RuntimeError("assert isinstance(self.expr, (FloorDiv, ModularIndexing)), type(self.expr)") + for arg in self.expr.args[1:]: + if not isinstance(arg, (sympy.Integer, sympy.Symbol)): + symbols = arg.free_symbols + if len(symbols) > 0 and all( + symbol_is_type(s, SymT.SIZE) for s in symbols + ): + precomputed_args.append(arg) + return precomputed_args + + def __eq__(self, other): + return self.name == other.name + + +class IterationRangesRootNPUIndex(IterationRangesRoot): + def __init__( + self, + name: str, + numel: sympy.Expr, + prefix: str, + index: int, + kernel: TritonKernel, + pid_cache=None, + *, + is_loop: bool, + tensor_dim: Optional[int], + grid_dim: Optional[int], + ): + super().__init__(name, numel, prefix, index, kernel, pid_cache, is_loop=is_loop, tensor_dim=tensor_dim, + grid_dim=grid_dim, has_zdim=False) + + def __repr__(self): + return f"IterationRangesRootNPUIndex({self.name!r}, {self.numel}, ...)" + + def remove_entry(self, name): + if name in self.var_ranges: + del 
self.var_ranges[name] + if name in self.var_list: + del self.var_list[self.var_list.index(name)] + if name in V.kernel.range_tree_nodes: + V.kernel.range_tree_nodes_removed[name] = V.kernel.range_tree_nodes[name] + del V.kernel.range_tree_nodes[name] + if name in self.nodes: + del self.nodes[name] + + def duplicated_check(self, divisor, length): + """ + Lookup a given RangeTreeEntry, creating it if needed + """ + if V.graph.sizevars.statically_known_equals(divisor * length, self.numel): + expr = FloorDiv(sympy_index_symbol(f"{self.prefix}index"), divisor) + else: + expr = ModularIndexing( + sympy_index_symbol(f"{self.prefix}index"), divisor, length + ) + + return expr not in self.nodes + + def lookup(self, divisor, length): + """ + Lookup a given RangeTreeEntry, creating it if needed + """ + if V.graph.sizevars.statically_known_equals(divisor * length, self.numel): + expr = FloorDiv(sympy_index_symbol(f"{self.prefix}index"), divisor) + else: + expr = ModularIndexing( + sympy_index_symbol(f"{self.prefix}index"), divisor, length + ) + + if expr not in self.nodes: + node = IterationRangesEntryNPUIndex( + f"{self.prefix}{next(V.kernel.iter_vars_count)}", + divisor, + length, + expr, + self, + ) + V.kernel.range_tree_nodes[node.symbol()] = node + self.var_list.append(node.symbol()) + self.var_ranges[node.symbol()] = length + self.nodes[expr] = node + + return self.nodes[expr] + + +@classmethod +def is_compatible( + cls, + groups: Iterable[sympy.Expr], + lengths: Sequence[Sequence[sympy.Expr]], + reduction_numel: sympy.Expr = sympy.S.One +): + # Fill in the reduction numel, in case the node is missing it. + sizevars = V.graph.sizevars + if len(lengths[1]) == 0 and ( + sizevars.statically_known_equals( + sympy_product(groups), + sympy_product(lengths[0]) * reduction_numel, + ) + ): + lengths = (lengths[0], [reduction_numel]) + + try: + groups = flatten(groups) + NPUIndexTritonKernel._split_iteration_ranges(groups, lengths) + return True + except CantSplit: + return False + + +class NPUIndexTritonKernel(TritonKernel): + overrides = NPUTritonKernelOverrides + + def __init__( + self, + tiling: Dict[str, sympy.Expr], + min_elem_per_thread=0, + optimize_mask=True, + fixed_config: Optional[FixedTritonConfig] = None, + **kwargs, ): + + super().__init__(tiling=tiling, + min_elem_per_thread=min_elem_per_thread, + optimize_mask=optimize_mask, + fixed_config=fixed_config, + **kwargs) + self.first_node = True + self.inside_high_order_reduction = False + self.low_dims = set() + self.split_axis = [] + self.tiling_axis = [] + self.range_tree_nodes_removed: Dict[sympy.Symbol, IterationRangesEntry] = {} + self.range_tree_nodes_substituted = {} + self.expr_substituted = {} + self.sorted_axis = [] + self.prefix: IndentedBuffer = IndentedBuffer() + self.index_analysis = {} # var_list -> indexAnalysis + self.golden_var_list = None + self.reduce_analysis = None + self.load_store_indexing = None + + def _get_grid_type(self) -> type[triton_heuristics.GridExpr]: + return npu_triton_heuristics.GridNpu + + def gen_triton_ext_imports(self): + imports = IndentedBuffer() + imports.splice( + """ + from torch._inductor.runtime import triton_helpers + from torch_npu._inductor import npu_triton_heuristics + from torch_npu._inductor import npu_triton_helpers + from torch_npu._inductor.runtime import NPUDeviceProperties + from torch_npu._inductor.npu_triton_helpers import libdevice, math as tl_math + import torch + import torch_npu + """ + ) + return imports.getvalue() + + def patch_triton_hash(self): + # remove this method once 
the original invocation is fixed + import hashlib + from triton.compiler.compiler import triton_key, make_backend + from triton.runtime.driver import driver + backend = make_backend(driver.active.get_current_target()) + key = f"{triton_key()}-{backend.hash()}" + return hashlib.sha256(key.encode("utf-8")).hexdigest() + + def numof_tiling_axis(self): + return len(self.tiling_axis) + + # do nothing in NpuTritonKernel + def codegen_range_tree(self): + pass + + def initialize_range_tree(self, pid_cache): + self.total_numels = 0 + for k, x in self.numels.items(): + if not isinstance(x, sympy.Integer): + x = x.subs(V.graph.sizevars.var_to_val) + self.numels[k] = x + if x > 1: + self.total_numels += 1 + + no_r_dim = not self.inside_reduction or self.numels["r"] == 1 + prefixes = "wvtzyxr" + active_prefixes = prefixes[-len(self.numels):] + # prefix can not be 's', 'u', 'ps' , 'i', 'z' + # prefix can not be 'p' but can be 'z' since 2.6 + grid_dims = "xyztvw" + if self.no_x_dim: + tensor_dims = "r" + elif no_r_dim: + tensor_dims = "xyztvw" + else: + tensor_dims = "xyztvwr" + tensor_dims = "".join(p for p in tensor_dims if p in active_prefixes) + for i, prefix in enumerate(active_prefixes): + is_reduction = prefix_is_reduction(prefix) + tensor_dim = tensor_dims.find(prefix) if prefix in tensor_dims else None + grid_dim = None if is_reduction else grid_dims.find(prefix) + index = i if grid_dim is None else grid_dim + self.range_trees.append( + IterationRangesRootNPUIndex( + f"{prefix}index", + self.numels[prefix], + prefix, + index, + self, + pid_cache=pid_cache, + is_loop=is_reduction and not self.persistent_reduction, + tensor_dim=tensor_dim, + grid_dim=grid_dim + ) + ) + + def codegen_reduction_numels(self, buffer) -> None: + reduction_trees = [tree for tree in self.range_trees if tree.is_reduction] + if len(reduction_trees) > 1: + raise AssertionError("Currently npu don't support multi-reduction ranges trees, e.g, r0, r1.") + + def get_axis_dtype(self, axis): + dtype = None + if axis is None: + return None + for node in self.node_schedule: + if node in (EnableReduction, DisableReduction): + continue + if axis.symbol() in node._body.indexing_map: + dtype = V.graph.get_dtype(node.node.name) + break + if dtype is None: + should_break_all = False + for node in self.node_schedule: + if should_break_all: + break + if node in (EnableReduction, DisableReduction): + continue + for key, _ in node._body.indexing_map.items(): + if key in self.range_tree_nodes: + dim = self.range_tree_nodes[key] + else: + dim = self.range_tree_nodes_removed[key] + + if dim.parent == axis.parent: + dtype = V.graph.get_dtype(node.node.name) + should_break_all = True + break + return dtype + + def create_inductor_meta(self): + mutated_args = set() + for mutation in self.mutations: + if mutation in self.args.input_buffers: + mutated_args.add(self.args.input_buffers[mutation]) + if ( + mutation in self.args.inplace_buffers + and mutation not in V.graph.removed_buffers + and mutation not in self.removed_buffers + ): + mutated_args.add(self.args.inplace_buffers[mutation].inner_name) + if mutation in self.args.output_buffers: + mutated_args.add(self.args.output_buffers[mutation]) + mutated_args = sorted(mutated_args) + tiling_axis = [x.sorted_order for x in self.tiling_axis] + split_axis = [x.sorted_order for x in self.split_axis] + axis_names = [x.name for x in self.sorted_axis] + split_axis_dtype = self.get_axis_dtype(self.split_axis[0]) if self.split_axis else None + inductor_meta = { + "grid_type": self._get_grid_type().__name__, + 
"autotune_hints": set(self.autotune_hints), + "kernel_name": str(Placeholder.DESCRIPTIVE_NAME), + "mutated_arg_names": mutated_args, + + # Due to breaking change of triton 3.0, the original invocation is broken + "backend_hash": self.patch_triton_hash(), # torch.utils._triton.triton_hash_with_backend(), + "split_axis": split_axis, + "tiling_axis": tiling_axis, + "axis_names": axis_names, + "low_dims": self.low_dims, + "numof_reduction_axis": self.numof_reduction_axis(), + "split_axis_dtype": split_axis_dtype, + "dual_reduction": self.numof_reduction_axis() > 1, + "traced_graph_hash": "TRACED_GRAPH_HASH", + "traced_graph_dir": "TRACED_GRAPH_DIR", + "store_cubin": config.triton.store_cubin, + "force_disable_caches": config.force_disable_caches, + } + return inductor_meta + + # numels sent to autotune configs + def get_size_hints(self): + size_hints = [] + if (len(self.range_tree_nodes.values()) == 0): + return [v for _, v in self.numels.items()] + + for _, node in enumerate(self.sorted_axis): + if isinstance(node.expr, ModularIndexing): + numel_expr = node.length + else: + numel_expr = node.expr.subs({sympy_index_symbol(r.name): r.numel for r in self.range_trees}) + + numel_expr = V.graph.sizevars.symbolic_hint(numel_expr) + + size_hints.append(numel_expr) + return size_hints + + def add_numel_to_call_args(self, name, call_args, arg_types): + for node in self.sorted_axis: + if isinstance(node.expr, ModularIndexing): + numel_expr = node.length + else: + numel_expr = node.expr.subs({sympy_index_symbol(r.name): r.numel for r in self.range_trees}) + + if isinstance(numel_expr, (sympy.Integer, sympy.Symbol)): + expr = numel_expr + else: + expr = V.graph.wrapper_code.generate_node_numel_expr(name, node, numel_expr) + call_args.append(expr) + arg_types.append(type(expr)) + + def gen_numel_args(self, signature, triton_meta_signature, argdefs): + for node in self.sorted_axis: + arg_name = f"{node.name}_numel" + if not inductor_npu_config.inductor_static_mode: + sizearg = SizeArg(arg_name, node.length) + signature.append(sizearg) + triton_meta_signature[arg_name] = signature_of( + sizearg, size_dtype=self.index_dtype + ) + argdefs.append(ArgName(arg_name)) + else: + argdefs.append(ArgName(arg_name, is_constexpr=True)) + self.triton_meta["constants"][arg_name] = node.length + + # BLOCK and SUB_BLOCK definitions + def add_autotune_args(self, argdefs): + for axis in self.split_axis: + argdefs.append(ArgName(f"{axis.name.upper()}BLOCK", is_constexpr=True)) + + for axis in self.tiling_axis: + if axis.name[0] == 'r' and self.persistent_reduction: + continue + argdefs.append(ArgName(f"{axis.name.upper()}BLOCK_SUB", is_constexpr=True)) + + def _get_heuristic(self): + if self.persistent_reduction: + if not self.inside_reduction: + raise RuntimeError("assert self.inside_reduction to be true") + return "persistent_reduction_npu_index" + elif self.inside_reduction: + return "reduction_npu_index" + return "pointwise_npu_index" + + def get_kernel_name(self, src_code, node_schedule, kernel): + wrapper = V.graph.wrapper_code + if src_code in wrapper.src_to_kernel: + kernel_name = wrapper.src_to_kernel[src_code] + else: + fused_name = ( + get_fused_kernel_name(node_schedule, config.triton.descriptive_names) + if config.triton.descriptive_names + else "" + ) + kernel_category = get_kernel_category_by_source_code(src_code)[:3] + kernel_name = "_".join( + ["triton", kernel_category, fused_name, wrapper.get_next_kernel_suffix()] + ) + return kernel_name + + # modify triton_meta, inductor_meta , etc. 
+ def codegen_kernel(self, name=None): + code = IndentedBuffer() + size_hints = self.get_size_hints() + heuristics = self._get_heuristic() + if name is None: + code.splice(gen_common_triton_imports()) + # Note: add extra imports for extensions + code.splice(self.gen_triton_ext_imports()) + + if config.benchmark_kernel: + code.splice(self.imports_for_benchmark_kernel()) + + argdefs, _, signature, _ = self.args.python_argdefs() + + for i, arg in enumerate(signature): + if isinstance(arg, SizeArg): + symbol = cast(sympy.Symbol, arg.expr) + if symbol in V.graph.sizevars.inv_precomputed_replacements: + signature[i] = SizeArg( + arg.name, V.graph.sizevars.inv_precomputed_replacements[symbol] + ) + + triton_meta_signature = signature_to_meta(signature, size_dtype=self.index_dtype, argdefs=argdefs) + + triton_meta = { + "signature": triton_meta_signature, + "device": + NPUDeviceProperties.create( + V.graph.get_current_device_or_throw() + ), + "constants": {}, + # special config for NPU, specify compile target + "mix_mode": "aiv", + } + + inductor_meta = self.create_inductor_meta() + num_gb = None + if config.benchmark_kernel or config.profile_bandwidth: + num_gb = self.estimate_kernel_num_bytes() / 1e9 + inductor_meta["kernel_num_gb"] = num_gb + + self.triton_meta = triton_meta + self.gen_numel_args(signature, triton_meta_signature, argdefs) + + # add in tiling args + self.add_autotune_args(argdefs) + # for scalar codegen + if len(self.range_tree_nodes) == 0: + self.write_scalar() + else: + self.codegen_body() + + for helper in self.helper_functions: + code.writeline("") + code.splice(helper) + + # Note: override original triton_heuristics + if self.inside_reduction: + reduction_hint = self.features.get_reduction_hint() + heuristics_line = f""" + @npu_triton_heuristics.{heuristics}( + size_hints={size_hints}, + reduction_hint={reduction_hint}, + filename=__file__, + triton_meta={triton_meta!r}, + inductor_meta={inductor_meta!r} + ) + @triton.jit + """ + else: + tile_hint = "" + if len(size_hints) == 2: + if len(signature) == 4: # input, output and 2 args + tile_hint = "tile_hint=TileHint.SQUARE," + else: + tile_hint = "tile_hint=TileHint.DEFAULT," + heuristics_line = f""" + @npu_triton_heuristics.{heuristics}( + size_hints={size_hints!r}, {tile_hint} + filename=__file__, + triton_meta={triton_meta!r}, + inductor_meta={inductor_meta!r}, + min_elem_per_thread={self.min_elem_per_thread} + ) + @triton.jit + """ + code.splice(heuristics_line) + code.writeline( + f"def {name or str(Placeholder.KERNEL_NAME)}({', '.join(x.full_name() for x in argdefs)}):" + ) + with code.indent(): + self.codegen_static_numels(code) + for old, new in self.args.aliases(): + code.writeline(f"{old} = {new}") + code.splice(self.body) + + if config.benchmark_kernel: + code.splice(self.codegen_kernel_benchmark(num_gb)) + + return code.getvalue() + + def codegen_static_numels(self, code): + for symbol in self.reduction_axis_list(): + if symbol.name[0] != "r" or not self.persistent_reduction: + continue + + node = self.range_tree_nodes[symbol] + simplified_tree_numel = V.graph.sizevars.simplify(node.length) + if isinstance(simplified_tree_numel, (sympy.Integer, int)): + val = int(simplified_tree_numel) + else: + continue + val = next_power_of_2(val) + code.writeline(f"{node.name.upper()}BLOCK_SUB: tl.constexpr = {val}") + + def lowest_axis_variable(self): + if len(self.tiling_axis) == 0: + return None + return self.tiling_axis[-1] + + def is_isolated_symbol(self, input_str, range_val): + patterns = [r'\b' + 
re.escape(range_val.name) + r'\b'] + for var in range_val.var_directions.keys(): + pattern = r'\b' + re.escape(var.name) + r'\b' + patterns.append(pattern) + + for pattern in patterns: + if re.search(pattern, input_str): + return True + return False + + def find_axis_in_load_store(self, range_val): + if not range_val: + return False + for line in self.loads._lines: + if line.find('tl.load') >= 0 and self.is_isolated_symbol(line, range_val): + return True + for line in self.compute._lines: + if line.find('tl.load') >= 0 and self.is_isolated_symbol(line, range_val): + return True + for line in self.post_loop_store._lines: + if line.find('tl.store') >= 0 and self.is_isolated_symbol(line, range_val): + return True + for line in self.stores._lines: + if isinstance(line, DeferredLine): + line = line.line + if line.find('tl.store') >= 0 and self.is_isolated_symbol(line, range_val): + return True + return False + + def write_scalar(self): + self.body.splice(self.indexing_code) + self.body.splice(self.loads) + self.body.splice(self.compute) + self.body.splice(self.stores) + self.loads.clear() + self.compute.clear() + self.stores.clear() + self.post_loop_store.clear() + self.prefix.clear() + + def codegen_body(self): + if not ( + self.loads + or self.stores + or self.compute + or self.post_loop_store + ): + return + + def write_pointwise(): + self.body.splice(self.indexing_code) + self.body.splice(self.loads) + self.body.splice(self.compute) + self.body.splice(self.stores) + + def codegen_range(index): + def is_1d_reduction(): + return self.numels["r"] > 1 and len(self.numels) == 1 + + def loop_body(index, indexing_code, is_last_axis, do_indent=True): + if do_indent: + self.body.do_indent() + if indexing_code: + self.body.splice(indexing_code) + if is_last_axis: + write_pointwise() + else: + codegen_range(index + 1) + if do_indent: + self.body.do_unindent() + + if index < 0 or index >= len(self.range_tree_nodes): + return + + range_val = self.sorted_axis[index] + numof_tilings = len(self.tiling_axis) + last_tiling = range_val.is_tiling_axis and numof_tilings >= 1 and range_val.tiling_order == len( + self.tiling_axis) - 1 + next_is_dual_reduction_tiling = index == len( + self.sorted_axis) - numof_tilings - 1 and self.numof_reduction_axis() + + is_last_axis = index == len(self.sorted_axis) - 1 + indexing_code = getattr(range_val, "indexing_code") + reduction_1d = is_1d_reduction() + do_indent = False + # do nothing except for writing porintwise + if len(self.loads._lines) == 0 and len(self.stores._lines) == 0: + do_indent = False + indexing_code = None + # tiling axis and last tiling + if range_val.is_tiling_axis and last_tiling: + do_indent = False + need_axis_loop = self.find_axis_in_load_store(range_val) + if not need_axis_loop: + indexing_code = None + if (range_val.prefix != 'r' or not self.persistent_reduction) and need_axis_loop: + self.body.splice(self.prefix) + self.body.writeline(f"for loop_{range_val.name} in range(loops_{range_val.name}):") + do_indent = True + loop_body(index, indexing_code, is_last_axis, do_indent) + self.body.splice(self.post_loop_store) + self.post_loop_store.clear() + + # tiling axis and but not last tiling + elif range_val.is_tiling_axis: + do_indent = False + if len(self.loads._lines) == 0 and len(self.stores._lines) == 0: + do_indent = False + indexing_code = None + if self.numof_reduction_axis() <= 1: + do_indent = True + self.body.writeline(f"for loop_{range_val.name} in range(loops_{range_val.name}):") + loop_body(index, indexing_code, is_last_axis, 
do_indent=do_indent) + + elif not is_last_axis: + do_indent = True + if range_val.is_split_axis: + offset = f"{range_val.name}_offset" + self.body.writeline(f"for {range_val.name} in range({offset}, " + f"min({offset} + {range_val.name.upper()}BLOCK, {range_val.name}_numel)):") + else: + self.body.writeline(f"for {range_val.name} in range({range_val.name}_numel):") + + if not reduction_1d and self.persistent_reduction: + self.body.do_indent() + self.body.splice(self.prefix) + self.prefix.clear() + self.body.do_unindent() + + loop_body(index, indexing_code, is_last_axis, do_indent=do_indent) + else: + write_pointwise() + + if self.first_node: + for node in self.sorted_axis: + node.codegen_header(self.body) + + while True: + if not self.sorted_axis[-1].is_tiling_axis: + x = self.sorted_axis[-1] + self.sorted_axis.pop(-1) + self.sorted_axis.insert(0, x) + else: + break + + if self.first_node: + codegen_range(0) + else: + last_axis_order = self.tiling_axis[-1].sorted_order + if self.persistent_reduction and self.numof_reduction_axis() > 1: + last_axis_order = last_axis_order - self.numof_reduction_axis() + 1 + for _ in range(last_axis_order): + self.body.do_indent() + codegen_range(last_axis_order) + for _ in range(last_axis_order): + self.body.do_unindent() + + self.cse.invalidate(self.outside_loop_vars) + self.loads.clear() + self.compute.clear() + self.stores.clear() + self.post_loop_store.clear() + self.prefix.clear() + self.first_node = False + + # for creat constant tensor, if have two axis, constant=tl.full([1,1]) else tl.full([1]) + def triton_tensor_ndim(self): + if self.numof_reduction_axis() > 1: + return 1 + + return len(self.tiling_axis) + + # indexing.mask_str is None , see varmean_test.py + def store_reduction(self, name: str, index: sympy.Expr, value: CSEVariable): + if not self.inside_reduction: + raise RuntimeError("assert self.inside_reduction") + + self.inside_reduction = False + indexing = self.indexing(index, block_ptr=True) + self.inside_reduction = True + var = self.args.output(name) + if isinstance(indexing, BlockPtrOptions): + self.post_loop_store.writeline( + DeferredLine( + name, + self.codegen_block_ptr_store_line( + name, + indexing, + indexing.format(var), + value, + f", boundary_check={indexing.boundary_check()!r}", + ), + ) + ) + else: + if not isinstance(indexing, IndexingOptions): + raise RuntimeError("assert isinstance(indexing, IndexingOptions)") + line = f"tl.store({var} + ({indexing.index_str} ), {value}, {indexing.mask_str})" + if self.numof_reduction_axis() > 1: + line = f"tl.store({var} + ({indexing.index_str} + tl.arange(0,1) ), {value}, {indexing.mask_str})" + self.post_loop_store.writeline( + DeferredLine(name, line) + ) + + # apply new var in case dim are permuted/broadcast + def store( + self, name: str, index: sympy.Expr, value: CSEVariable, mode: StoreMode = None + ) -> None: + + var = self.args.output(name) + original_index = index + index_analyze = IndexAnalysis(self, index, is_store_index=True) + index_analyze.analyze_index() + indexing = self.indexing(index, dense_indexing=True, block_ptr=mode is None, index_analyze=index_analyze) + index_str = indexing.index_str + value_str = f"{value}" + mask_str = indexing.mask_str + + if index_analyze.need_permute: + value_str = value_str.replace(f"{value}", f"{value}{index_analyze.generate_statement()}") + + advance_block_ptr = None + if isinstance(indexing, BlockPtrOptions): + block_ptr, advance_block_ptr, other = self.codegen_block_ptr( + name, var, indexing + ) + # block_ptr stores don't do implicit 
casting + line = self.codegen_block_ptr_store_line( + name, indexing, block_ptr, value, other + ) + elif mode is None: + line = f"tl.store({var} + ({index_str}), {value_str}, {mask_str})" + if self.numof_reduction_axis() > 1: + line = f"tl.store({var} + ({index_str} + tl.arange(0,1) ), {value_str}, {indexing.mask_str})" + + elif mode == "atomic_add": + line = f"tl.atomic_add({var} + ({index_str}), {value_str}, {indexing.mask_str})" + else: + raise NotImplementedError(f"store mode={mode}") + + self.stores.writeline(DeferredLine(name, line)) + if advance_block_ptr: + self.stores.writeline(advance_block_ptr) + + if not self.inside_reduction: + self.outside_loop_vars.add(value) + + def find_reduction_node(self): + node = self.current_node + if node is not None and isinstance(node, SchedulerNode): + reduction = node.node.data + if reduction is not None and isinstance(reduction, ir.Reduction): + return reduction + + for node in self.node_schedule: + if node in (EnableReduction, DisableReduction): + continue + reduction = node.node.data + if reduction is not None and isinstance(reduction, ir.Reduction): + return reduction + + return None + + # select the golden varlist, from to which to deduce permute, broadcast shape + def select_golden_varlist(self): + longest = None + maximum_length = 0 + self.golden_var_list = None + + def all_tiling_in_var_list(var_list): + return all([x in var_list for x in self.tiling_axis]) + + # all are load indexings, select the longest as gold + for index in self.load_store_indexing: + index = index.subs(V.graph.sizevars.var_to_val) + analyze = IndexAnalysis(self, index) + if len(analyze.var_list) > maximum_length and all_tiling_in_var_list(analyze.var_list): + longest = analyze.var_list + maximum_length = len(longest) + # this may cause problems + if not longest: + self.golden_var_list = tuple([x.symbol() for x in self.tiling_axis]) if self.tiling_axis else [] + else: + self.golden_var_list = tuple([x for x in longest if x in self.tiling_axis]) if self.tiling_axis else [] + if self.golden_var_list is None: + raise RuntimeError("assert self.golden_var_list is None") + + # to generate shape of the tile + + def dense_size_list(self) -> List[str]: + if self.inside_reduction: + if not self.reduce_analysis: + self.reduce_analysis = ReductionAnalysis(self) + return self.reduce_analysis.dense_size_list() + + if not self.golden_var_list: + self.select_golden_varlist() + + golden_var_list = self.golden_var_list if self.golden_var_list else [x.symbol() for x in self.tiling_axis] + if golden_var_list is None: + raise RuntimeError("assert golden_var_list is None") + sizes = [None for _ in golden_var_list] + for i, var in enumerate(reversed(golden_var_list)): + axis = self.range_tree_nodes[var] + sizes[i] = f"{axis.name.upper()}BLOCK_SUB" + return sizes + + def dense_size_str(self): + if self.inside_reduction: + if not self.reduce_analysis: + self.reduce_analysis = ReductionAnalysis(self) + return self.reduce_analysis.dense_size_str() + sizes = self.dense_size_list() + return f"[{', '.join(sizes)}]" + + # and add to shape to value + def reduction_resize(self, value, dim): + ndims = self.triton_tensor_ndim() + if ndims == 1: + return f"triton_helpers.promote_to_tensor({value})" + dense_list = self.dense_size_list() + dense_list[dim] = "1" + expand_str = ", ".join(dense_list) + return f"{value}.reshape({expand_str})" + + # to determine reduction_dim + def reduction_dim(self): + if not self.reduce_analysis: + self.reduce_analysis = ReductionAnalysis(self) + return 
self.reduce_analysis.reduced_dim + + def filter_masks(self, mask_vars): + for node in self.sorted_axis: + if not (node.is_tiling_axis): + mask_vars.discard(f"{node.name}_mask") + + def numof_reduction_axis(self): + root = self.range_trees[-1] + if root is None: + return 0 + + return len(root.var_list) + + def reduction_axis_list(self): + root = self.range_trees[-1] + if root is None: + return [] + return root.var_list + + def reduction( + self, + dtype: torch.dtype, + src_dtype: torch.dtype, + reduction_type: ReductionType, + value: Union[CSEVariable, Tuple[CSEVariable, ...]], + ) -> Union[CSEVariable, Tuple[CSEVariable, ...]]: + if not self.inside_reduction: + raise RuntimeError("assert self.inside_reduction") + masks = {f"{node.symbol()}_mask" for node in self.sorted_axis} + self.filter_masks(masks) + masks = sorted(masks) + if self._load_mask: + masks.append(self._load_mask) + reduction_range_prefix = self.range_trees[-1].prefix + if not self.reduce_analysis: + self.reduce_analysis = ReductionAnalysis(self) + dense_size_str = self.dense_size_str() + + if len(dense_size_str) > 2: + value = self._map_tuple_or_scalar( + lambda v: self.cse.generate( + self.compute, f"tl.reshape({v}, {dense_size_str})", dtype=v.dtype, + ), + value, + + ) + + dim: int + root_op: str + + def final_reduction(value): + module = "tl" # use tl + if reduction_type in {"max", "min"}: + return self.reduction_resize(f"{module}.{reduction_type}({value}, {dim})", dim) + return self.reduction_resize(f"{module}.{reduction_type}({value}, {dim})", dim) + + def final_argreduce(buffer, result_var, value, index): + buffer.splice( + f"""\ + _, {result_var}_tmp = triton_helpers.{root_op}_with_index({value}, {index}, {dim}) + {result_var} = {self.reduction_resize(f'{result_var}_tmp', dim)} + """ + ) + + def get_reduction_axis(): + return list(self.range_tree_nodes.values())[-1] + + cache_key = (src_dtype, reduction_type, value) + if cache_key in self.cse.reduction_cache: + return self.cse.reduction_cache[cache_key] + + dim = self.reduction_dim() + acc_type = triton_acc_type(src_dtype) + torch_acc_type = upcast_acc_dtype(src_dtype) + result_var: Any = self.cse.newvar(dtype=torch_acc_type) + result_var.mask_vars = {var for var in masks if var[0] != "r"} + cond = " & ".join(masks) + + def where_cond(tval, fval): + if not cond: + return tval + return TritonKernelOverrides.where(cond, tval, fval) + + if self.persistent_reduction: + default = ir.Reduction.default_value(reduction_type, src_dtype) + default = self._map_tuple_or_scalar(constant_repr, default) + + def _mask_value(value, default): + return self.cse.generate(self.compute, where_cond(value, default), dtype=value.dtype) + + # masked_value doesn't work dual reduction + if self.numof_reduction_axis() == 1: + if isinstance(value, tuple): + masked_value = [_mask_value(v, d) for v, d in zip(value, default)] + else: + masked_value = _mask_value(value, default) + else: + masked_value = value + + if reduction_type in {"argmax", "argmin", "max", "min"}: + reduce_axis = get_reduction_axis() + broadcast_string: str + reshape_str = self.reduce_analysis.get_reduce_dim_reshape(reduce_axis) + broadcast_string = f"tl.broadcast_to({reduce_axis.symbol()}.reshape({reshape_str}), {masked_value}.shape)" + accumulator_index = str( + self.cse.generate( + self.compute, + broadcast_string, + dtype=torch.int64 + ) + ) + if reduction_type == "argmax" or reduction_type == "argmin": + root_op = {"argmax": "max", "argmin": "min"}[reduction_type] + final_argreduce( + self.compute, result_var, masked_value, 
accumulator_index + ) + elif reduction_type == "max" or reduction_type == "min": + result_var = self.cse.generate( + self.compute, final_reduction(masked_value), dtype=masked_value.dtype, + ) + elif reduction_type == "welford_reduce": + raise RuntimeError("assert False, welford_reduction and is not supported now..") + elif reduction_type == "welford_combine": + raise RuntimeError("assert False, welford_combine and is not supported now..") + else: + result_var = self.cse.generate( + self.compute, final_reduction(masked_value), dtype=masked_value.dtype, + ) + else: + accumulator = self.cse.namedvar(f"_{result_var}", dtype=torch_acc_type) + default = ir.Reduction.default_accumulator(reduction_type, src_dtype) + default = self._map_tuple_or_scalar(constant_repr, default) + if not isinstance(default, tuple): + self.prefix.writeline( + f"{accumulator} = tl.full({self.dense_size_str()}, {default}, {acc_type})" + ) + + if reduction_type in {"argmax", "argmin"}: + accumulator_index = f"_{result_var}_index" + long_max = torch.iinfo(torch.int64).max + self.prefix.writeline( + f"{accumulator_index} = tl.full({self.dense_size_str()}, {long_max}, tl.int64)" + ) + root_op = {"argmax": "max", "argmin": "min"}[reduction_type] + + self.compute.splice( + f"""\ + {accumulator}_next, {accumulator_index}_next = triton_helpers.{root_op}imum_with_index( + {accumulator}, {accumulator_index}, {value}, {reduction_range_prefix}index + ) + {accumulator} = {where_cond(f'{accumulator}_next', accumulator)} + {accumulator_index} = {where_cond(f'{accumulator_index}_next', accumulator_index)} + """ + ) + final_argreduce(self.post_loop_store, result_var, accumulator, accumulator_index) + elif is_welford_reduction(reduction_type): + raise RuntimeError("assert False, welford_reduction and is not supported now..") + else: + combine_fn = ir.get_reduction_combine_fn(reduction_type, src_dtype) + updated = combine_fn(accumulator, value) + self.compute.writeline( + f"{accumulator} = {where_cond(updated, accumulator)}" + ) + + if src_dtype == torch.bool: + accumulator = f"{accumulator}.to(tl.int8)" + result_type = triton_compute_type(dtype) + self.post_loop_store.writeline( + f"{result_var} = {final_reduction(accumulator)}.to({result_type})" + ) + else: + self.post_loop_store.writeline( + f"{result_var} = {final_reduction(accumulator)}" + ) + + self.cse.reduction_cache[cache_key] = result_var + + if isinstance(result_var, tuple): + self.outside_loop_vars |= set(result_var) + else: + self.outside_loop_vars.add(result_var) + + return result_var + + # broadcast, permute handling + def load(self, name: str, index: sympy.Expr): + var = self.args.input(name) + original_index = index + store_cache = self.cse.store_cache + if name in store_cache: + result_var = store_cache[name] + return result_var + + index_analyze = IndexAnalysis(self, index) + index_analyze.analyze_index() + indirect_indexing = self.is_indirect_indexing(index) + indexing = self.indexing(index, block_ptr=True) + has_rindex = indexing.has_rindex() + has_tmpmask = indexing.has_tmpmask() + is_coalesced = any( + i == 1 for i in self.get_strides_of_load(original_index).values() + ) + ep = "" + if ( + (has_tmpmask or has_rindex) + and V.graph.get_dtype(name) != torch.bool + and indexing.has_mask() + ): + other = ", other=0.0" + else: + other = "" + + advance_block_ptr = None + append_broadcast = None + dtype = V.graph.get_dtype(name) + + if V.graph.is_unspec_arg(name): + line = var + else: + if isinstance(indexing, BlockPtrOptions): + block_ptr, advance_block_ptr, other = 
self.codegen_block_ptr( + name, var, indexing, other + ) + line = f"tl.load({block_ptr}{other}{ep})" + # add needed size=1 dimensions + line = triton_reshape( + line, indexing.block_shape, indexing.reshape_suffix + ) + elif isinstance(original_index, sympy.Integer): + line = f"tl.load({var} + ({original_index}))" + full_list = ["1"] * (len(self.tiling_axis) if self.tiling_axis else 1) + append_broadcast = f"[{', '.join(full_list)} ]" + else: + index_str = indexing.index_str + mask_str = indexing.mask_str + line = f"tl.load({var} + ({index_str}), {mask_str}{ep}{other})" + + dtype = V.graph.get_dtype(name) + if dtype in (torch.bfloat16,): + line += ".to(tl.float32)" + if dtype == torch.bool and torch.version.hip is None: + line += ".to(tl.int1)" + if has_tmpmask: + # Masked loads must come after the mask is computed + load_buffer = self.compute + elif ( + self.inside_reduction + and self.range_trees[-1].is_loop + and not indirect_indexing + and not has_rindex + ): + # can lift a common load outside of reduction loop + # One exception is when this is an indirect_load. + load_buffer = self.prefix + + else: + load_buffer = self.loads + + result_var = self.cse.generate(load_buffer, line, dtype=dtype) + if not (isinstance(result_var, TritonCSEVariable)): + raise RuntimeError("assert isinstance(result_var, TritonCSEVariable)") + result_var.mask_vars = indexing.mask_vars # type: ignore[assignment] + + if append_broadcast and append_broadcast != '[]': + line = f"tl.broadcast_to({result_var}, {append_broadcast})" + result_var = self.cse.generate(load_buffer, line, dtype=dtype) + # triton can handle broadcast + elif index_analyze.need_permute: + line = f"{result_var}{index_analyze.generate_statement()}" + result_var = self.cse.generate(self.loads, line, dtype=dtype) + + if advance_block_ptr: + load_buffer.writeline(advance_block_ptr) + + if not self.inside_reduction or (not indexing.has_rmask() and not has_rindex): + self.outside_loop_vars.add(result_var) + + return result_var + + # don't call symlify_indexing + def prepare_indexing( + self, + index: sympy.Expr, + index_analyze, + is_index_expr=False + ): + index = sympy_subs(index, V.graph.sizevars.precomputed_replacements) + # if simple replacements didn't get rid of floor/ceil, try full subs + if len(index.atoms(sympy.floor)) or len(index.atoms(sympy.ceiling)): + index = index.subs(V.graph.sizevars.precomputed_replacements) + + if len(index.atoms(sympy.ceiling)): + for a in index.atoms(sympy.ceiling): + # for nested exprs, atoms yields top level first (?) 
+ # so if everything goes fine, lower level replacements will come up empty + symbols = a.free_symbols + if len(symbols) > 0 and all( + symbol_is_type(s, (SymT.SIZE, SymT.PRECOMPUTED_SIZE)) + for s in symbols + ): + replacements = {a: V.graph.sizevars.lookup_precomputed_size(a)} + index = sympy_subs(index, replacements) + + simp_index = index + + simp_index = ( + simp_index if not isinstance(simp_index, Identity) else simp_index.args[0] + ) + + # to generate range.var_directions for permuted axis + index_analyze.analyze_index() + return self.codegen_indexing(simp_index) + + def replace_index_vars(self, index, index_analyze): + + new_index = index + if index_analyze.var_replacements: + new_index = sympy_subs(index, index_analyze.var_replacements) + return new_index + + def index_to_str(self, index: sympy.Expr) -> str: + if isinstance(index, list): + return f"[{', '.join(map(self.index_to_str, index))}]" + index = self.rename_indexing(index) + return self.kexpr(index) # type: ignore[call-arg] + + # 1. only remove the line which asserts index var should be in "xyr" + # 2. don't do simplify_indexing, which combine continuous dims + # 3. removed block_ptr, removed dense mask/broadcast support + # dense_mask_vars should be generated from sorted_axis + # upgraded to torch251 + def indexing( + self, + index: sympy.Expr, + *, + copy_shape=None, + dense_indexing=False, + override_mask=None, + block_ptr=False, + index_analyze=None, + is_index_expr=False + ) -> Union[IndexingOptions, BlockPtrOptions]: + """ + Compute the index and mask to pass to tl.load() or tl.store() + """ + if not index_analyze: + index_analyze = IndexAnalysis(self, index, is_index_expr=is_index_expr) + index_analyze.analyze_index() + + index = self.prepare_indexing(index, index_analyze, is_index_expr) + index_vars = index.free_symbols + has_rindex = False + index = sympy_subs(index, V.graph.sizevars.precomputed_replacements) + # if simple replacements didn't get rid of floor/ceil, try full subs + if len(index.atoms(sympy.floor)) or len(index.atoms(sympy.ceiling)): + index = index.subs(V.graph.sizevars.precomputed_replacements) + if len(index.atoms(sympy.ceiling)): + for a in index.atoms(sympy.ceiling): + # for nested exprs, atoms yields top level first (?) 
+ # so if everything goes fine, lower level replacements will come up empty + symbols = a.free_symbols + if len(symbols) > 0 and all( + s.name.startswith("s") or s.name.startswith("ps") for s in symbols + ): + replacements = {a: V.graph.sizevars.lookup_precomputed_size(a)} + index = sympy_subs(index, replacements) + + # if not self.inside_reduction : + index = self.replace_index_vars(index, index_analyze) + index_vars = index.free_symbols + has_rindex = False + + mask_vars: Set[str] = set() + for var in index_vars: + if not (isinstance(var, sympy.Symbol)): + raise RuntimeError("assert isinstance(var, sympy.Symbol)") + + has_rindex = has_rindex or var.name.startswith("r") + if override_mask: + pass + elif var.name.startswith("tmp"): + # indirect indexing + cse_var = self.cse.varname_map[var.name] + mask_vars.update(cse_var.mask_vars) + elif var.name.startswith(("s", "ps", "i")): + pass + else: + # var is one of xN, yN or rN + mask_vars.add(f"{var.name}_mask") + + expand_str = None + index_str = self.index_to_str(index) + + if isinstance(index, sympy.Integer): + expand_str = f"{copy_shape}.shape" if copy_shape else self.dense_size_str() + if (index != 0): + index_str = f"tl.full({expand_str}, {index_str}, tl.int32)" + else: + index_str = f"tl.arange(0,1)" + return IndexingOptions(index_str, OrderedSet(), expand_str, has_rindex, index) + + if override_mask: + mask_vars = {override_mask} + if self._load_mask: + mask_vars.add(self._load_mask) + self.filter_masks(mask_vars) + return IndexingOptions(index_str, mask_vars, expand_str, has_rindex, index) # type: ignore[arg-type] + + def codegen_indexing(self, expr: sympy.Expr): + expr = V.graph.sizevars.simplify_with_ranges(expr, self.var_ranges()) + for sym in sorted(expr.free_symbols, key=str): + if sym in self.range_tree_nodes: + # if indexing expression is complicated, we precompute it on the host side + # and send the result as a kernel argument + replacements = {} + for ps in self.range_tree_nodes[sym].precomputed_args(): # type: ignore[index] + replacements[ps] = V.graph.sizevars.lookup_precomputed_size(ps) + if len(replacements) > 0: + self.range_tree_nodes[sym].expr = sympy_subs( # type: ignore[index] + self.range_tree_nodes[sym].expr, replacements # type: ignore[index] + ) + self.range_tree_nodes[sym].codegen() # type: ignore[index] + return expr + + # when xindex(16) -> x2:2,x3:8, when new length:16 in , should return (x2,x3) + def split_and_set_ranges(self, lengths: Sequence[Sequence[sympy.Expr]]): + groups = [rt.numel for rt in self.range_trees] + if not self.inside_reduction: + groups[-1] = sympy.S.One + + return self.map_kernel_groups_to_node_sizes(groups, lengths, self.set_ranges) + + # support split multiple ranges (instead of double) from one flatten range, triple-ranges are needed in mamba model + @staticmethod + def _split_iteration_ranges( + groups: Iterable[sympy.Expr], lengths: Sequence[Sequence[sympy.Expr]] + ): + sv = V.graph.sizevars + new_ranges: List[List[sympy.Expr]] = [[] for _ in groups] + remaining = [sv.simplify(g) for g in groups] + for i, group in enumerate(remaining): + if isinstance(group, (list, tuple)): + remaining[i] = NumelList(group).numels() + + var_count = itertools.count() + + def add_range(i, expr): + expr = sv.simplify(expr) + if not sv.statically_known_multiple_of(remaining[i], expr): + raise CantSplit() + # guard on the last item out + remaining[i] = FloorDiv(remaining[i], expr) + new_ranges[i].append(expr) + return next(var_count) + + def make_combined(strides, index_list): + def getter(flat_vars): + 
expr = sympy.Integer(0) + for stride, index in zip(strides, index_list): + expr = stride * flat_vars[index] + expr + return expr + + return getter + + def size_hints(group): + if isinstance(group, (list, tuple)): + return sv.size_hint(NumelList(group).numels()) + return sv.size_hint(group) + + def add_multiple_range(size, return_getters): + # need to break size in multiple + index_list = [] + stride_list = [] + group = current_group + remained_size = size + # Two checks: + # 1. remaining sizes to be merged + # 2. remained_size is already divided to 1 + while (group < len(remaining) and remaining[group] > 1) and (remained_size > 1): + group_size = remaining[group] + # size should be divisible by group_size + if not sv.statically_known_multiple_of(remained_size, group_size): + raise CantSplit() + index_list.append(add_range(group, group_size)) + remained_size = FloorDiv(remained_size, group_size) + stride_list.append(remained_size) + group = group + 1 + if remained_size != 1: + raise CantSplit() + return_getters.append(make_combined(stride_list, index_list)) + + return_getters_groups = [] + current_group = 0 + + for length_group in lengths: + return_getters = [] + for size in length_group: + if sv.statically_known_equals(size, 1): # type: ignore[arg-type] + return_getters.append(lambda _: sympy.Integer(0)) + continue + + while ( + current_group < len(remaining) + and size_hints(remaining[current_group]) == 1 + ): + # scroll to next group with remaining elements + current_group += 1 + size_hint = sv.size_hint(size) + if size_hint > size_hints(remaining[current_group]): + # add multiple ranges (two or more) to the list, as well as the getter funcs + add_multiple_range(size_hint, return_getters) + else: + return_getters.append( + operator.itemgetter(add_range(current_group, size_hint)) + ) + return_getters_groups.append(return_getters) + + if not (all(V.graph.sizevars.size_hint(s) == 1 for s in remaining)): + raise RuntimeError("assert all(V.graph.sizevars.size_hint(s) == 1 for s in remaining)") + + return new_ranges, return_getters_groups + + # torch260 done + # just to override load method of CSEProxy, however, CSEProxy is an inner which can not be monkey patched, + # we need to override the whole inner class + def __enter__(self): + class CSEProxy: + self.name = "CSEProxy" + vr_analysis = ValueRangeAnalysis() + + @staticmethod + def __getattr__(name: str) -> Callable[..., CSEVariable]: # type: ignore[misc] + def inner(*args, **kwargs): + bounds = CSEProxy._bound_variable(name, *args, **kwargs) + + value = getattr(parent_handler, name)(*args, **kwargs) # type: ignore[has-type] + dtype_handler = DtypePropagationOpsHandler() + + output_idx = 0 + + def do_cse(v): + # cpp backend doesnt set current device + if V.graph.current_device is not None: + device_str = V.graph.get_current_device_or_throw().type + triton_backend = ( + config.cpu_backend == "triton" + if device_str == "cpu" + else config.cuda_backend == "triton" + ) + else: + triton_backend = False + + # only triton backend tracks dtype currently + if triton_backend: + if name == "masked": + output_dtype = value.dtype + else: + output_dtype = getattr( + dtype_handler, + name, + )(*args, **kwargs) + else: + # cpp backend doesnt track dtype yet + output_dtype = None + + csevar = V.kernel.cse.generate( + V.kernel.compute, + v, + bounds=bounds, + dtype=output_dtype, + ) + + nonlocal output_idx + if ( + config.test_configs.runtime_triton_dtype_assert + and triton_backend + ): + from torch._inductor.codegen.triton import triton_type + + # we 
tree_map over the output, so we need to fetch corresponding dtype + if isinstance(output_dtype, (list, tuple)): + output_dtype = output_dtype[output_idx] + + V.kernel.compute.writeline( + f"tl.static_assert({csevar}.dtype == {triton_type(output_dtype)})" + ) + output_idx += 1 + + csevar.update_on_args(name, args, kwargs) + + return csevar + + return pytree.tree_map(do_cse, value) + + return inner + + @staticmethod + def _bound_variable(name, *args, **kwargs): + """ + If the variable comes from an FX node, we forward the bound we have already computed + Else, if the variable when codegen'ing another op, we try to compute its bounds + """ + from torch._inductor.select_algorithm import TritonTemplateKernel + + if isinstance(V.kernel, TritonTemplateKernel): + return ValueRanges.unknown() + + fx_node = V.interpreter.current_node + if fx_node.target == name and self.node_to_bounds is not None: + if not (isinstance(self.node_to_bounds, dict)): + raise RuntimeError("assert isinstance(self.node_to_bounds, dict)") + + return self.node_to_bounds.get(fx_node, ValueRanges.unknown()) + elif config.compute_all_bounds and hasattr(ValueRangeAnalysis, name): + # These create lots of inner strings. We would need to compute the bounds at the ops + # We will also likely not get much from computing VRs on these nodes + if any( + s in fx_node.target + for s in ("set_indirect", "reduction", "scan") + ): + return ValueRanges.unknown() + + # We assume that the inputs come from `ops.` and are not strings. If you want to generate + # intermediary strings, wrap them in CSE variables with properly initialised bounds. + + # If there is no FX bound but we know how to compute one we do so + if (kwargs): + raise RuntimeError("assert not kwargs") + + def arg_to_bound(x): + if isinstance(x, CSEVariable): + return x.bounds + elif isinstance(x, sympy.Expr): + return bound_sympy(x) + else: + return x + + arg_bounds = list(map(arg_to_bound, args)) + return getattr(CSEProxy.vr_analysis, name)(*arg_bounds) + return ValueRanges.unknown() + + @staticmethod + def indirect_indexing( + var: CSEVariable, + size: Union[sympy.Expr, int], + check: bool = True, + wrap_neg=True, + ): + if isinstance(size, int): + size = sympy.Integer(size) + if not (isinstance(size, sympy.Expr)): + raise RuntimeError("assert isinstance(size, sympy.Expr), size") + # Skip CSE since this doesn't return an expression + + if var.bounds.lower < 0: # type: ignore[operator] + if wrap_neg: + stm = ops.add(var, ops.index_expr(size, torch.long)) + # Mixed negative and non-negative + if var.bounds.upper >= 0: # type: ignore[operator] + lt = ops.lt(var, 0) + stm = ops.where(lt, stm, var) + else: + stm = var + + # Propagate bounds as we know how to compute them properly + new_bounds = ValueRanges.unknown() + if var.bounds != ValueRanges.unknown() and isinstance( + size, sympy.Number + ): + # Take the negative part of the bound and add size to it + # Then take union of that and the positive part + # This is a tighter bound than that of a generic ops.where, as we have info on the cond + neg_bounds = var.bounds & ValueRanges(-int_oo, -1) + new_bounds = ValueRanges( + neg_bounds.lower + size, neg_bounds.upper + size + ) + # We don't have a good way of representing the empty range + if var.bounds.upper >= 0: # type: ignore[operator] + pos = var.bounds & ValueRanges(0, int_oo) + new_bounds = new_bounds | pos + + var = self.cse.generate(self.compute, stm, bounds=new_bounds) + + sympy_var = parent_handler.indirect_indexing(var, size, check) + if generate_assert(check): + 
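+                    # Only emit the bound checks that value-range analysis cannot already prove:
+                    # the lower check is dropped when var.bounds.lower >= 0 is known, and the upper
+                    # check is dropped when size is a concrete sympy.Number and var.bounds.upper < size.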
assert_lower = not (var.bounds.lower >= 0) + # value ranges cannot x < s when x and s are symbols + assert_upper = not isinstance(size, sympy.Number) or not ( + var.bounds.upper < size + ) + self.check_bounds(sympy_var, size, assert_lower, assert_upper) + return sympy_var + + @staticmethod + def check_bounds( + expr: sympy.Expr, size: sympy.Expr, lower: bool, upper: bool + ): + return self.check_bounds(expr, size, lower, upper) + + @staticmethod + def load(name: str, index: sympy.Expr) -> CSEVariable: + if name in self.cse.invalidated_stores: + # A load from an invalidated store requires us to + # keep the actual buffer around + V.kernel.must_keep_buffers.add(name) + if free_symbol_is_type(index, SymT.TMP): + return self.indirect_load(name, index) + store_cache = self.cse.store_cache + if name in store_cache: + return self.load(name, index) + out = self.load(name, index) + # count load that is not in the store_cache, and also not in the + # cse cache. + if out.use_count == 1: + self.num_load += 1 + return out + + @staticmethod + def _update_store_cache(name: str, value: CSEVariable): + self.cse.store_cache[name] = value + if self.current_node and name in V.graph.name_to_buffer: + buf = self.current_node.get_output(name) + for other_name in buf.get_mutations(): + self.cse.store_cache[other_name] = value + + @staticmethod + def store( + name: str, index: sympy.Expr, value: CSEVariable, mode: StoreMode = None + ) -> None: + self.store_buffer_names.add(name) + if mode is None: + CSEProxy._update_store_cache(name, value) + if name not in V.graph.removed_buffers: + return self.store(name, index, value, mode=mode) + return None # type: ignore[return-value] + + @staticmethod + def store_reduction(name: str, index: sympy.Expr, value: CSEVariable): + self.store_buffer_names.add(name) + CSEProxy._update_store_cache(name, value) + + if name not in V.graph.removed_buffers: + return self.store_reduction(name, index, value) + raise RuntimeError("store_reduction") + + @staticmethod + def reduction( + dtype: torch.dtype, + src_dtype: torch.dtype, + reduction_type: ReductionType, + value: Union[CSEVariable, Tuple[CSEVariable, ...]], + ) -> Union[CSEVariable, Tuple[CSEVariable, ...]]: + self.num_reduction += 1 + return self.reduction(dtype, src_dtype, reduction_type, value) + + @staticmethod + def scan( + dtypes: Tuple[torch.dtype, ...], + combine_fn: Callable[ + [Tuple[CSEVariable, ...], Tuple[CSEVariable, ...]], + Tuple[CSEVariable, ...], + ], + values: Tuple[CSEVariable, ...], + ) -> Tuple[CSEVariable, ...]: + return self.scan(dtypes, combine_fn, values) + + @staticmethod + def sort( + dtypes: Tuple[torch.dtype, ...], + values: Tuple[CSEVariable, ...], + stable: bool, + descending: bool, + ) -> Tuple[CSEVariable, ...]: + return self.sort(dtypes, values, stable, descending) + + @staticmethod + def bucketize( + values: CSEVariable, + boundaries: Tuple[str, sympy.Expr, sympy.Expr, sympy.Expr], + boundary_indices: CSEVariable, + indexing_dtype: torch.dtype, + right: bool, + sorter: Optional[Tuple[str, sympy.Expr]] = None, + sorter_indices: Optional[CSEVariable] = None, + ) -> CSEVariable: + return self.bucketize( + values, + boundaries, + boundary_indices, + indexing_dtype, + right, + sorter, + sorter_indices, + ) + + # Use mypy to check protocol implemented correctly + def _typecheck_CSEProxy(h: CSEProxy) -> OpsHandler[CSEVariable]: + return h + + super().__enter__() + if not (self.overrides): + raise RuntimeError("assert self.overrides") + parent_handler = self.overrides() + 
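+        # Route all ops.* calls through the locally redefined CSEProxy for the lifetime of this
+        # kernel's codegen; formula generation still goes through parent_handler, while the
+        # load/store/reduction overrides above land in this kernel's own buffers.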
self.exit_stack.enter_context(V.set_ops_handler(CSEProxy())) + self.exit_stack.enter_context(V.set_kernel_handler(self)) + return self diff --git a/torch_npu/_inductor/codegen/triton_utils.py b/torch_npu/_inductor/codegen/triton_utils.py new file mode 100644 index 0000000000..1bbaef2a2f --- /dev/null +++ b/torch_npu/_inductor/codegen/triton_utils.py @@ -0,0 +1,26 @@ +import torch + +# wrapper npu 32 bytes align, get and pass unalign info to triton meta +# then autotune choose tiling param and send them to bishengIR +byte_per_numel = { + torch.float32: 4, # torch.float32 or torch.float + torch.float64: 8, # torch.float64 or torch.double + torch.float16: 2, # torch.float16 or torch.half + torch.bfloat16: 2, # torch.bfloat16 + torch.int32: 4, # torch.int32 or torch.int + torch.int64: 8, # torch.int64 or torch.long + torch.int16: 2, # torch.int16 or torch.short + torch.int8: 1, # torch.int8 + torch.uint8: 1, # torch.uint8 + torch.bool: 1, # torch.bool + torch.complex32: 4, # torch.complex32 (not yet available in PyTorch as of the latest stable release) + torch.complex64: 8, # torch.complex64 + torch.complex128: 16 # torch.complex128 +} + + +def get_aligned_numel(dtype): + if dtype in byte_per_numel: + return 32 // byte_per_numel[dtype] + else: + return 1 diff --git a/torch_npu/_inductor/codegen/wrapper.py b/torch_npu/_inductor/codegen/wrapper.py new file mode 100644 index 0000000000..e433f93c25 --- /dev/null +++ b/torch_npu/_inductor/codegen/wrapper.py @@ -0,0 +1,246 @@ +import os +import copy +from typing import Any, Callable, Optional, TYPE_CHECKING, Union +import hashlib +import sympy + +import torch +from torch._inductor import config +from torch._inductor.codegen.wrapper import PythonWrapperCodegen, SymbolicCallArg, SubgraphPythonWrapperCodegen +from torch._inductor.runtime import triton_heuristics +from torch._inductor.utils import ( + cache_on_self, +) +from torch._inductor.virtualized import V +from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode +from torch.utils._sympy.singleton_int import SingletonInt +from torch._inductor.ir import GraphPartitionSignature + +from torch_npu._inductor import config as npu_config + + +class NPUWrapperCodeGen(PythonWrapperCodegen): + def __init__(self): + super().__init__() + + @staticmethod + def create( + is_subgraph: bool, + subgraph_name: str, + parent_wrapper: PythonWrapperCodegen, + partition_signatures: Optional[GraphPartitionSignature] = None, + ): + if is_subgraph: + return SubgraphPythonWrapperCodegen(subgraph_name, parent_wrapper, partition_signatures) + return NPUWrapperCodeGen() + + def write_header(self) -> None: + super().write_header() + self.imports.splice( + f""" + import torch_npu + """, + strip=True, + ) + + @cache_on_self + def write_triton_header_once(self) -> None: + import_str = f""" + import triton + import triton.language as tl + from {triton_heuristics.__name__} import start_graph, end_graph + import torch_npu + """ + if config.triton.autotune_at_compile_time: + self.kernel_autotune_calls.splice(import_str) + self.kernel_autotune_calls.writeline( + V.graph.device_ops.import_get_raw_stream_as("get_raw_stream") + ) + if not V.graph.cpp_wrapper: + self.imports.splice(import_str, strip=True) + self.imports.writeline( + V.graph.device_ops.import_get_raw_stream_as("get_raw_stream") + ) + + # generate numel expr for range_tree_node + def generate_node_numel_expr(self, kernel_name: str, node, numel_expr): + expr = f"{kernel_name}_{node.name}_numel" + if (expr, V.graph) not in self.kernel_numel_expr: + # declare expr 
once in each graph (scope) + self.kernel_numel_expr.add((expr, V.graph)) + self.writeline( + f"{self.declare}{expr} = {self.expr_printer(numel_expr)}{self.ending}" + ) + else: + self.writeline(f"{expr} = {self.expr_printer(numel_expr)}{self.ending}") + # We can get symbolic expressions here, like s0*64 + # It is fine to have them here, but we need to handle them correctly as their own type + # This is tricky to do, so we wrap in a custom type, distinct from scalars, but also from sympy* + # scalars as well. + # This is handled in `generate_args_decl` which has a correct comment of: TODO: only works for + # constant now, need type info. I agree, this needs type info, and while this is not true type info + # it suffices as a type hint for the purposes of producing the correct code for this type. + return SymbolicCallArg(expr, numel_expr) + + # don't free anything + def make_buffer_free(self, buffer): + return "" + + # don't assert + def codegen_input_size_asserts(self) -> None: + pass + + def get_next_kernel_suffix(self) -> str: + iter_val = copy.copy(self._names_iter) + return f"{next(iter_val)}" + + def add_benchmark_harness(self, output): + """ + Override, add aot-inductor debug kernel support. + """ + if not config.benchmark_harness: + return None + + if npu_config.aot_inductor.debug_kernel: + return self.add_npu_repro(output) + + return super().add_benchmark_harness(output) + + def add_npu_repro(self, output): + self.add_repro_func(output) + self.add_benchmark_func(output) + + output.writelines(["", "", 'if __name__ == "__main__":']) + with output.indent(): + # List how to use. Read details in torch_npu/_inductor/config.py. + output.writelines( + [ + "# torch_npu._inductor.config.force_fallback_kernel_id = 'all'", + "# or", + "# torch_npu._inductor.config.force_fallback_kernel_id = [1, 2, 10]", + "torch_npu._inductor.config.aot_inductor.debug_kernel_in_run = True", + "result = benchmark_compiled_module()", + "print(result)", + ] + ) + + def add_repro_func(self, output): + seen_constants = set() + + def add_fake_input(name, shape, stride, device, dtype): + output.writeline( + f"{name} = rand_strided(" + f"{self.codegen_python_shape_tuple(shape)}, " + f"{self.codegen_python_shape_tuple(stride)}, " + f"device='{device}', dtype={dtype})" + ) + + def get_hash(name): + byte = name.encode('utf-8') + sha1 = hashlib.sha1() + sha1.update(byte) + return sha1.hexdigest() + + def save_tensor(tensor, path): + dirname = os.path.dirname(path) + if not os.path.exists(dirname): + os.makedirs(dirname) + torch.save(tensor, path) + + def add_real_tensor(name, tensor): + tensor_dir = npu_config.aot_inductor.repro_tensor_path + if isinstance(tensor, FakeTensor): + raise RuntimeError(f"Could not generate repro func because detected {name} is FakeTensor " + f"when trying to dump it. 
Set repro and debug_kernel false to avoid it.") + hash_name = get_hash(name) + tensor_path = os.path.join(os.getcwd(), tensor_dir, f"{hash_name}.pt") + if name not in seen_constants: + save_tensor(tensor, tensor_path) + seen_constants.add(name) + output.writeline( + f"{name} = torch.load('{tensor_path}')" + ) + + def add_torchbind_input(name, value): + import pickle + + output.writeline(f"{name} = pickle.loads({pickle.dumps(value)!r})") + output.writelines( + ["", "", f"def repro_run({', '.join(V.graph.graph_inputs.keys())}):"] + ) + with output.indent(): + output.splice( + """ + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + """, + strip=True, + ) + for name, value in V.graph.constants.items(): + # all the constants are global variables, that's why we need + # these 'global var_name' lines + output.writeline(f"global {name}") + add_real_tensor(name, value) + + if len(V.graph.torchbind_constants) > 0: + output.writeline("import pickle") + for name, torchbind_obj in V.graph.torchbind_constants.items(): + # all the constants are global variables, that's why we need + # these 'global var_name' lines + output.writeline(f"global {name}") + add_torchbind_input(name, torchbind_obj) + + call_str = f"call([{', '.join(V.graph.graph_inputs.keys())}])" + output.writeline(f"fn = lambda: {call_str}") + output.writeline("return fn()") + + def add_benchmark_func(self, output): + def add_fake_input(name, shape, stride, device, dtype): + output.writeline( + f"{name} = rand_strided(" + f"{self.codegen_python_shape_tuple(shape)}, " + f"{self.codegen_python_shape_tuple(stride)}, " + f"device='{device}', dtype={dtype})" + ) + + def add_expr_input(name, val): + output.writeline(f"{name} = {val}") + + output.writelines( + ["", "", "def benchmark_compiled_module(times=10, repeat=10):"] + ) + with output.indent(): + output.splice( + """ + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + """, + strip=True, + ) + for name, value in V.graph.graph_inputs.items(): + if isinstance(value, sympy.Symbol) and isinstance( + V.graph.sizevars.var_to_val.get(value, None), SingletonInt + ): + continue + if isinstance(value, sympy.Expr): # Don't need to add symbolic + add_expr_input(name, V.graph.sizevars.size_hint(value, fallback=42)) + else: + shape = [ + V.graph.sizevars.size_hint(x, fallback=42) + for x in value.get_size() + ] + stride = [ + V.graph.sizevars.size_hint(x, fallback=42) + for x in value.get_stride() + ] + add_fake_input( + name, + shape, + stride, + value.get_device(), + value.get_dtype(), + ) + + call_str = f"repro_run({', '.join(V.graph.graph_inputs.keys())})" + output.writeline(f"fn = lambda: {call_str}") + output.writeline("return fn()") diff --git a/torch_npu/_inductor/config.py b/torch_npu/_inductor/config.py new file mode 100644 index 0000000000..f9bf23ee33 --- /dev/null +++ b/torch_npu/_inductor/config.py @@ -0,0 +1,111 @@ +import logging +import os # noqa: C101 +from typing import Any, Callable, Dict, Optional, TYPE_CHECKING +import torch +from torch._inductor import config +from triton.runtime.driver import driver + +enable_npu_indexing = True + +config.triton.unique_kernel_names = True +# avoid test_opensora_cases_model_16_forward reinterpre_tensor issue +config.allow_buffer_reuse = False +# inductor debug switch +config.trace.enabled = True + +# npu hardware params from trion +target = driver.active.get_current_target() +device = driver.active.get_current_device() +prop = 
driver.active.utils.get_device_properties(device) + +num_cube_core = prop["num_aicore"] +num_vector_core = prop["num_aicore"] + +# unit byte +npu_block = 32 + + +# For debug +class aot_inductor: + # If debug_kernel is set, codegen in python wrapper (output_code.py) and cpp wrapper (model.pt2) + # will be modified to dump fx graph and weights. Meanwhile, generate repro func in output_code.py. + # Then, run aoti and output_code.py will dump tensor args before and after each triton kernel, + # which can be used to detect which kernel is incorrect. + debug_kernel = os.environ.get("AOTI_ASCEND_DEBUG_KERNEL", False) + + # No need to set debug_kernel_in_run manually. It will be set in output_code.py + # by codegen if debug_kernel is set. + debug_kernel_in_run = False + + # Path that to be used for dump weights in aoti to reproduce when debug_kernel is set. + repro_tensor_path = os.environ.get("AOTI_ASCEND_REPRO_TENSOR_PATH", "aoti_repro_tensors") + + # Path that to be used for dump tensor args before and after triton kernel in aoti execute + # when debug_kernel is set. + dump_path_cpp = os.environ.get("AOTI_ASCEND_DUMP_PATH_CPP", "aoti_dump_cpp") + + # Path that to be used for dump tensor args before and after triton kernel in output_code.py + # when debug_kernel_in_run is set. + dump_path_py = os.environ.get("AOTI_DUMP_PATH_PY", "aoti_dump_py") + + +traced_fx_graph_cache = os.environ.get("INDUCTOR_ASCEND_FX_GRAPH_CACHE", None) +check_accuracy = os.environ.get("INDUCTOR_ASCEND_CHECK_ACCURACY", False) +auto_fallback = os.environ.get("INDUCTOR_ASCEND_AUTO_FALLBACK", True) +fallback_warning = os.environ.get("INDUCTOR_ASCEND_FALLBACK_WARNING", False) + +# Trace fx graph when lowering and dump. +dump_fx_graph = os.environ.get("INDUCTOR_ASCEND_DUMP_FX_GRAPH", False) \ + or check_accuracy \ + or aot_inductor.debug_kernel +# Specify kernel ids that to be force fallback to fx graph call. +# Usage: `torch_npu._inductor.config.force_fallback_kernel_id = 'all' ` +# or `torch_npu._inductor.config.force_fallback_kernel_id = [1, 2, 10] ` +# (1) 'all' means try to fallback all kernel to fx graph call. 
+# (2) [1, 2, 10] means try to fallback kernel like triton_xxx_1, triton_xxx_2 and triton_xxx_10 +force_fallback_kernel_id = [] + +acc_comp_tol = { + torch.float32: {'rtol': 1.3e-6, 'atol': 1e-5}, + torch.float16: {'rtol': 1e-3, 'atol': 1e-5}, + torch.bfloat16: {'rtol': 1.6e-2, 'atol': 1e-5}, + "default": {'rtol': 1.3e-6, 'atol': 1e-5}, +} + +if ("Ascend910B" in target.arch): + num_vector_core = num_cube_core * 2 + +log_level_env = os.getenv('INDUCTOR_ASCEND_LOG_LEVEL', 'WARNING').upper() +log_level_mapping = { + 'DEBUG': logging.DEBUG, + 'INFO': logging.INFO, + 'WARNING': logging.WARNING, + 'ERROR': logging.ERROR, + 'CRITICAL': logging.CRITICAL +} +log_level = log_level_mapping.get(log_level_env.upper(), logging.INFO) +logging.basicConfig( + level=log_level, + format='%(asctime)s - %(levelname)s - %(message)s' +) +log = logging.getLogger(__name__) + +aggresive_autotune = os.getenv("INDUCTOR_ASCEND_AGGRESSIVE_AUTOTUNE", '0').lower() in ('1', 'true') +inductor_static_mode = os.environ.get('INDUCTOR_STATIC_MODE', '0').lower() in ('1', 'yes', 'true') +profile_path = "./profile_result/" + + +def set_compile_threads(): + if "TORCHINDUCTOR_COMPILE_THREADS" in os.environ: + torchinductor_compile_threads = int(os.environ["TORCHINDUCTOR_COMPILE_THREADS"]) + if torchinductor_compile_threads == 1: + return + log.warning(f"TORCHINDUCTOR_COMPILE_THREADS is set to {torchinductor_compile_threads}, " + "but currently only support 1. It will be modified to 1.") + + os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" + torch._inductor.config.compile_threads = 1 + + def get_env_num_workers(): + return 1 + torch._inductor.select_algorithm.get_env_num_workers = get_env_num_workers diff --git a/torch_npu/_inductor/cpp_builder.py b/torch_npu/_inductor/cpp_builder.py new file mode 100644 index 0000000000..a72ea3f07d --- /dev/null +++ b/torch_npu/_inductor/cpp_builder.py @@ -0,0 +1,120 @@ +import os +from typing import Any, List, Optional, Sequence, Tuple, Union + +import torch +from torch.utils.cpp_extension import _HERE, _TORCH_PATH, TORCH_LIB_PATH + +from torch_npu.utils.cpp_extension import PYTORCH_NPU_INSTALL_PATH +from torch_npu.utils._error_code import ErrCode, pta_error + +if "ASCEND_HOME_PATH" not in os.environ: + def lazy_error(): + raise RuntimeError("Could not find ASCEND_HOME_PATH in env. Please run set_env.sh first." + + pta_error(ErrCode.NOT_FOUND)) + get_ascend_home = lazy_error +else: + def get_ascend_home_from_env(): + return os.environ["ASCEND_HOME_PATH"] + get_ascend_home = get_ascend_home_from_env + +TORCH_LIB_PATH = os.path.join(_TORCH_PATH, 'lib') + + +def include_paths(npu: bool = False) -> List[str]: + """ + Get the includ paths required to build a C++ extension. + + Args: + npu: If 'True', includes NPU-specific include paths. + + Returns: + A list if include path strings. + """ + lib_include = os.path.join(_TORCH_PATH, "include") + paths = [ + lib_include, + # Remove this once torch/torch.h is officially no longer supported for C++ extensions. + os.path.join(lib_include, 'torch', 'csrc', 'api', 'include'), + # Some internal (old) Torch headers don't properly prefix their includes, + # so we need to pass -Itorch/lib/include/TH as well. 
+ os.path.join(lib_include, 'TH'), + os.path.join(lib_include, 'THC') + ] + if npu: + ASCEND_HOME = get_ascend_home() + paths.extend([ + os.path.join(ASCEND_HOME, "include"), + os.path.join(ASCEND_HOME, "include/experiment"), + os.path.join(ASCEND_HOME, "include/experiment/msprof"), + ]) + + paths.append(os.path.join(PYTORCH_NPU_INSTALL_PATH, "include")) + return paths + + +def library_paths(npu: bool = False) -> List[str]: + """ + Get the library paths required to build a C++. + + Args: + npu: If 'True', includes NPU-specific library paths. + + Returns: + A list of library path strings. + """ + # We need to link against libtorch.so + paths = [TORCH_LIB_PATH] + if npu: + if "LIBTORCH_NPU_PATH" in os.environ: + libtorch_npu_path = os.environ["LIBTORCH_NPU_PATH"] + else: + libtorch_npu_path = os.path.join(PYTORCH_NPU_INSTALL_PATH, "lib") + paths.append(libtorch_npu_path) + + ASCEND_HOME = get_ascend_home() + cann_lib_path = os.path.join(ASCEND_HOME, "lib64") + paths.append(cann_lib_path) + + return paths + + +def get_cpp_torch_device_options( + device_type: str, + aot_mode: bool = False, + compile_only: bool = False, +) -> Tuple[List[str], List[str], List[str], List[str], List[str], List[str], List[str]]: + + npu = "npu" == device_type + + definations: List[str] = [] + include_dirs: List[str] = [] + cflags: List[str] = [] + ldflags: List[str] = [] + libraries_dirs: List[str] = [] + libraries: List[str] = [] + passthough_args: List[str] = [] + + include_dirs = include_paths(npu) + libraries_dirs = library_paths(npu) + + if npu: + definations.append("USE_NPU") + libraries += ["torch_npu", "runtime", "ascendcl"] + + # Could not add BUILD_LIBTORCH=ON to definations because it cannot + # process defination include "=" like -DXXX=xx. + passthough_args += ["-DBUILD_LIBTORCH=ON -Wno-unused-function"] + + return ( + definations, + include_dirs, + cflags, + ldflags, + libraries_dirs, + libraries, + passthough_args, + ) + + +def patch_get_cpp_torch_device_options(): + torch._inductor.cpp_builder.get_cpp_torch_device_options = get_cpp_torch_device_options \ No newline at end of file diff --git a/torch_npu/_inductor/decomposition.py b/torch_npu/_inductor/decomposition.py new file mode 100644 index 0000000000..b9c725f3ff --- /dev/null +++ b/torch_npu/_inductor/decomposition.py @@ -0,0 +1,49 @@ +import torch._ops +from torch._inductor.decomposition import decompositions, pw_cast_for_opmath +from torch._inductor.decomposition import register_decomposition + +from .lowering import _init_set + +aten = torch.ops.aten + +DECOMPOSITION_OVERLOAD_OP = [ + aten._log_softmax, + aten.nll_loss_forward, + # aten.gelu_backward, + # aten.gelu, + aten.nll_loss_backward, + aten._log_softmax_backward_data, + aten.embedding_dense_backward, + aten.addmm, + aten.gelu +] + + +def _register_npu_inductor_decompositons(): + overload_op_set = set() + _init_set(DECOMPOSITION_OVERLOAD_OP, overload_op_set) + + for op in overload_op_set: + if (op in decompositions): + del decompositions[op] + + @register_decomposition([aten.scatter.src]) + @pw_cast_for_opmath + def scatter_src(self, input_tensor, dim, index_tensor, source_tensor): + (XNUMEL, YS) = input_tensor.shape + index_rblock = torch.arange(YS).npu().reshape((1, YS)).repeat((XNUMEL, 1)) + + index_tensor_brd = index_tensor.to(torch.int32).broadcast_to(XNUMEL, YS) + source_tensor_brd = source_tensor.broadcast_to(XNUMEL, YS).to(torch.float32) + scatter1 = torch.where(index_rblock == index_tensor_brd, 1.0, 0.0) * source_tensor_brd + return scatter1 + + 
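+    # Worked example for the one-hot trick above (illustrative values, not from the patch):
+    # with XNUMEL=2, YS=3 and index_tensor=[[1], [2]], index_rblock is [[0, 1, 2], [0, 1, 2]],
+    # so torch.where(index_rblock == index_tensor_brd, 1.0, 0.0) becomes [[0., 1., 0.], [0., 0., 1.]]
+    # and multiplying by the broadcast source keeps each row's source value only at its target column.
+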
@register_decomposition([aten.expm1]) + def expm1(x): + tensor = torch.exp(x) - torch.ones_like(x) + return tensor + + @register_decomposition([aten.erfc]) + def erfc(x): + tensor = torch.ones_like(x) - torch.exp(x) + return tensor diff --git a/torch_npu/_inductor/fx_passes/joint_graph.py b/torch_npu/_inductor/fx_passes/joint_graph.py new file mode 100644 index 0000000000..11210910d1 --- /dev/null +++ b/torch_npu/_inductor/fx_passes/joint_graph.py @@ -0,0 +1,15 @@ +import torch +import torch._inductor.fx_passes.joint_graph as joint_graph + + +def patch_constant_fold_uniform_value(): + # Fix bug in aot_inductor for torch. + # Eliminate dead-nodes to remove extra constants generated by torch.tensor. + src_func = joint_graph.constant_fold_uniform_value + + def new_constant_fold_uniform_value(gm): + src_func(gm) + if isinstance(gm, torch.fx.GraphModule): + gm.graph.eliminate_dead_code() + + joint_graph.constant_fold_uniform_value = new_constant_fold_uniform_value \ No newline at end of file diff --git a/torch_npu/_inductor/graph.py b/torch_npu/_inductor/graph.py new file mode 100644 index 0000000000..caff8fbc60 --- /dev/null +++ b/torch_npu/_inductor/graph.py @@ -0,0 +1,114 @@ +from typing import ( + Any, + List, + Tuple, + Union, +) +import itertools + +import torch +from torch.fx.node import Node +from torch._inductor import config, metrics +from torch._subclasses.fake_tensor import FakeTensor +from torch._dynamo.utils import defake, dynamo_timed +from torch._inductor.virtualized import NullHandler, V + + +def patch_codegen_with_cpp_wrapper(): + def npu_codegen_with_cpp_wrapper(self) -> Tuple[str, List[Tuple[int, Node]]]: + # add "npu" support + if any(device in self.device_types for device in ["cuda", "xpu", "npu"]): + if config.triton.autotune_at_compile_time: + # If autotune_at_compile_time is True, we can do the codegen in one-pass + return self.codegen() + else: + # first pass + self.cpp_wrapper = False + compiled = self.compile_to_module().call + + def materialize( + x: Union[torch.SymInt, torch.SymFloat, torch.Tensor] + ) -> Union[int, float, torch.Tensor]: + if x is None: + return None + elif isinstance(x, (torch.SymInt, torch.SymFloat)): + # Need concrete value to run dynamic shapes and tune the result + return x.node.hint + elif isinstance(x, FakeTensor): + return defake(x) + else: + if not isinstance(x, torch.Tensor): + raise AssertionError("Unknown type when creating real inputs" + str(type(x))) + return x + + tracing_context = torch._guards.TracingContext.try_get() + if tracing_context is not None and not isinstance( + V.real_inputs, NullHandler + ): + if tracing_context.output_strides: + tracing_context.output_strides.clear() + + params_flat = [ + param + for param in tracing_context.params_flat # type: ignore[union-attr] + if param is not None + ] + real_inputs = [ + materialize(x) + for x in itertools.chain(params_flat, V.real_inputs) + ] + else: + # In the backward pass, V.real_inputs is not OrderedSet. + # Generating random inputs based on self.example_inputs sometimes can be problematic, + # e.g. illegal memory access. A comprehensive fix is to autotune in a separate process. 
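+                # The first-pass run below needs concrete values: materialize() collapses
+                # SymInt/SymFloat to their hints and converts FakeTensors via defake before the
+                # compiled module is actually executed for autotuning.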
+ real_inputs = [ + materialize(x) # type:ignore[arg-type] + for x in ( + self.example_inputs # type:ignore[union-attr] + if isinstance(V.real_inputs, NullHandler) + else V.real_inputs + ) + ] + + if self.mutated_inputs: + from .compile_fx import clone_preserve_strides + + mutated_input_idxs = [ + idx + for idx, name in enumerate(self.graph_inputs) + if name in self.mutated_inputs + and isinstance(real_inputs[idx], torch.Tensor) + ] + for idx in mutated_input_idxs: + # clone mutated Tensor inputs to avoid mutating them in + # the first pass of the CPP wrapper-based compilation, as + # this will lead to a side effect on the example inputs: + # e.g. if torch.compile(f)(x) if called on input-mutating + # f, the inputs x will be mutated twice in the process: + # once here, and again when running the compiled model; + # this will also lead to a numerically incorrect output + mutated_inp = real_inputs[idx] + if not isinstance(mutated_inp, torch.Tensor): + raise AssertionError + real_inputs[idx] = clone_preserve_strides(mutated_inp) + del mutated_inp + + with torch.utils._python_dispatch._disable_current_modes(): + compiled(real_inputs) + del real_inputs + + # second pass + self.cpp_wrapper = True + self.removed_buffers.clear() + self.removed_operations.clear() + self.inplaced_to_remove.clear() + V.graph.sizevars.precomputed_replacements.clear() + V.graph.sizevars.inv_precomputed_replacements.clear() + metrics.reset() + with config.patch({"triton.autotune_at_compile_time": False}): + return self.codegen() + else: + # cpu + return self.codegen() + from torch._inductor.graph import GraphLowering + GraphLowering.codegen_with_cpp_wrapper = npu_codegen_with_cpp_wrapper \ No newline at end of file diff --git a/torch_npu/_inductor/ir.py b/torch_npu/_inductor/ir.py new file mode 100644 index 0000000000..f8452bfc38 --- /dev/null +++ b/torch_npu/_inductor/ir.py @@ -0,0 +1,58 @@ +import torch +from torch._inductor.virtualized import ops, OpsValue, V +from torch._inductor.ir import log, Layout + + +def patch_fallback_kernel_codegen(): + def codegen_npu(self, wrapper) -> None: # type: ignore[no-untyped-def] + kernel = self.op_overload + if kernel.namespace == "aten": # type: ignore[union-attr] + if not isinstance(kernel, torch._ops.OpOverload): + raise AssertionError(f"kernel should be OpOverload, but got {type(kernel)}") + if V.graph.cpp_wrapper: + # Fallback all npu op to proxy executor and warn when gpu do not. + from torchgen.aoti.fallback_ops import inductor_fallback_ops + self.use_runtime_dispatch = True + if str(kernel) in inductor_fallback_ops: + log.warning( + "%s is using proxy executor as fallback instead of aoti shim.", + kernel, + ) + + elif kernel.namespace == "_quantized": # type: ignore[union-attr] + # Internal Quantized Fallback Ops + if not isinstance(kernel, torch._ops.OpOverload): + raise AssertionError + else: + # For non-aten OpOverload, i.e. 
custom ops + if V.graph.cpp_wrapper: + self.use_runtime_dispatch = True + + if self.use_runtime_dispatch: + self.codegen_comment(wrapper) + + exported_args = None + args = None + exported_args = self.export_extern_kernel_node() + + wrapper.generate_fallback_kernel_with_runtime_lookup( + self.get_name(), + self.python_kernel_name, + self.cpp_kernel_name, + args, + self.op_overload, + exported_args, + # NOTE: [special handling of all_reduce_coalesced_'s return value] + self.outputs if self.outputs else self.mutation_outputs, + ) + else: + self.codegen_comment(wrapper) + args = [*self.codegen_args(), *self.codegen_kwargs()] + V.graph.wrapper_code.generate_fallback_kernel(self, args) + if isinstance(self.layout, Layout): + self.codegen_size_asserts(wrapper) + + self.codegen_unbacked_symbol_defs(wrapper) + + from torch._inductor.ir import FallbackKernel + FallbackKernel.codegen = codegen_npu diff --git a/torch_npu/_inductor/lowering.py b/torch_npu/_inductor/lowering.py new file mode 100644 index 0000000000..29ac8924a1 --- /dev/null +++ b/torch_npu/_inductor/lowering.py @@ -0,0 +1,265 @@ +import sympy +import torch._ops +from torch._inductor import ir +from torch._inductor import lowering +from torch._inductor.decomposition import decompositions, pw_cast_for_opmath +from torch._inductor.ir import ExpandView, TensorBox, ops_wrapper +from torch._inductor.ir import Reduction +from torch._inductor.lowering import sum_ +from torch._inductor.utils import sympy_product +from torch._prims_common import ( + is_boolean_dtype, + is_integer_dtype, + get_computation_dtype, +) +from torch._inductor.lowering import ( + lowerings, + make_fallback, + register_lowering, + to_dtype, + fallback_cumsum, + _validate_reduction_axis, + div, + squeeze, + square, + sub, + fallback_handler, + is_boolean_type, + logical_and, + make_pointwise, + _make_reduction_inner, + _validate_reduction_axis, + add_needs_realized_inputs, + add_layout_constraint +) +import torch_npu +from torch_npu import npu_dtype_cast +from .lowering_op_list import GENERATE_LIST, GENERATE_LIST2, FALLBACK_LIST, LOWERING_OVERLOAD_OP + + +def npu_make_fallback(op, layout_constraint=None, warn=True, override_decomp=False): + if op in decompositions and not override_decomp: + raise RuntimeError(f"both a fallback and a decomp for same op: {op}") + + def register_fallback(op_overload): + add_needs_realized_inputs(op_overload) + if layout_constraint is not None: + add_layout_constraint(op_overload, layout_constraint) + return register_lowering(op_overload, type_promotion_kind=None)( + fallback_handler(op_overload) + ) + + if isinstance(op, torch._ops.OpOverloadPacket): + for ol in op.overloads(): + op_overload = getattr(op, ol) + register_fallback(op_overload) + elif isinstance(op, (torch._ops.OpOverload, torch._ops.HigherOrderOperator)): + register_fallback(op) + else: + raise RuntimeError(f"Unsupported fallback {op} with type {type(op)}") + +make_fallback = npu_make_fallback + + +def make_reduction(reduction_type: str, override_return_dtype=None): + def inner(x, axis=None, keepdims=False, *, dtype=None): + kwargs = _make_reduction_inner( + x, + axis=axis, + keepdims=keepdims, + dtype=dtype, + override_return_dtype=override_return_dtype, + ) + result = Reduction.create(reduction_type=reduction_type, input_node=x, **kwargs) + if isinstance( + result.data.data, Reduction + ): # Only realize if reduction isn't unrolled + size = x.get_size() + axis = set(_validate_reduction_axis(x, axis)) + kept_idx = [] + reduced_idx = [] + for i in range(len(size)): + if i in 
axis: + reduced_idx.append(i) + else: + kept_idx.append(i) + + object.__setattr__(result.data.data, "kept_idx", kept_idx) + object.__setattr__(result.data.data, "reduced_idx", reduced_idx) + + result.realize() + return result + + return inner + +lowering.make_reduction = make_reduction + +aten = torch.ops.aten +tr_c10d = torch.ops.tr_c10d +prims = torch.ops.prims + + +def _init_set(input_list, output_set): + for fn in input_list: + output_set.add(fn) + if isinstance(fn, torch._ops.OpOverloadPacket): + for overload in fn.overloads(): + other_fn = getattr(fn, overload) + output_set.add(other_fn) + + +def _register_npu_inductor_fallbacks(): + gen_set = set() + _init_set(GENERATE_LIST, gen_set) + overload_op_set = set() + _init_set(LOWERING_OVERLOAD_OP, overload_op_set) + + # 把不在白名单的op fallback + for op in lowerings: + if op not in decompositions and op not in gen_set: + if isinstance(op, torch._ops.OpOverloadPacket) or \ + isinstance(op, (torch._ops.OpOverload, torch._ops.HigherOrderOperator)): + flag = False + for gens in GENERATE_LIST2: + if str(op).find(gens) != -1: + flag = True + if flag: + continue + else: + make_fallback(op) + FALLBACK_LIST.append(op) + # 把需要overload的op在lowering里删除 + for op in overload_op_set: + if op in lowerings: + del lowerings[op] + + # register the reductions useing custom make_reduction + reduce_amax = register_lowering(aten.amax)(make_reduction("max")) + reduce_amin = register_lowering(aten.amin)(make_reduction("min")) + reduce_argmax = register_lowering(aten.argmax)( + make_reduction("argmax", override_return_dtype=torch.int64) + ) + reduce_argmin = register_lowering(aten.argmin)( + make_reduction("argmin", override_return_dtype=torch.int64) + ) + + + @register_lowering(aten.max, type_promotion_kind=None) + def reduce_max(x, dim=None, keepdim=False): + if dim is not None: + return ( + reduce_amax(x, axis=dim, keepdims=keepdim), + reduce_argmax(x, axis=dim, keepdims=keepdim), + ) + + return reduce_amax(x, axis=None, keepdims=keepdim) + + @register_lowering(aten.min, type_promotion_kind=None) + def reduce_min(x, dim=None, keepdim=False): + if dim is not None: + return ( + reduce_amin(x, axis=dim, keepdims=keepdim), + reduce_argmin(x, axis=dim, keepdims=keepdim), + ) + + return reduce_amin(x, axis=None, keepdims=keepdim) + + @register_lowering(aten.mean) + def mean(x, axis=None, keepdim=False, *, dtype=None): + if dtype is not None: + x = to_dtype(x, dtype) + size = x.get_size() + axis = _validate_reduction_axis(x, axis) + # compute in higher-precision until end of mean lowering + output_dtype = x.get_dtype() + if output_dtype in (torch.float16, torch.bfloat16): + x = to_dtype(x, torch.float) + sum_result = sum_(x, axis, keepdim) + denom = sympy_product(size[i] for i in axis) + denom = ir.IndexingConstant(index=denom, dtype=x.get_dtype(), device=x.get_device()) + denom = ExpandView.create(denom, list(sum_result.get_size())) + return to_dtype(div(sum_result, denom), output_dtype) + + @register_lowering(aten.cumsum) + def cumsum(x, axis=None, dtype=None): + if ( + is_integer_dtype(x.get_dtype()) or is_boolean_dtype(x.get_dtype()) + ) and dtype is None: + # torch.int64->torch.int32 + dtype = torch.int32 + if len(x.get_size()) == 0: + if axis not in [0, -1]: + raise ValueError("axis must be 0 or -1") + dtype = dtype or x.get_dtype() + return to_dtype(x, dtype, copy=True) + return fallback_cumsum(x, dim=axis, dtype=dtype) + + @register_lowering(npu_dtype_cast, type_promotion_kind=None) + def _convert_npu_type(x: TensorBox, dtype: torch.dtype): + return to_dtype(x, 
dtype, copy=True) + + def var_mean_sum_(x, axis, correction, keepdim, return_mean): + if correction is None: + correction = 1 + + size = x.get_size() + axis = _validate_reduction_axis(x, axis) + x_mean = mean(x, axis, keepdim=True) + if return_mean: + x_mean.realize() + + diffs = square(sub(x, x_mean)) + sum_result = sum_(diffs, axis, keepdim) + denom = sympy_product(size[i] for i in axis) + if correction: + denom = sympy.Max(denom - correction, 0) + denom = ir.IndexingConstant(index=denom, dtype=x.get_dtype(), device=x.get_device()) + denom = ExpandView.create(denom, list(sum_result.get_size())) + x_var = div(sum_result, denom) + if not return_mean: + return (x_var,) + + x_mean = x_mean if keepdim else squeeze(x_mean, axis) + return x_var, x_mean + + def var_mean_helper_(x, *, axis, correction, keepdim, return_mean): + out_dtype = x.get_dtype() + compute_dtype = get_computation_dtype(out_dtype) + x = to_dtype(x, compute_dtype, copy=False) + kwargs = dict( + x=x, + axis=axis, + correction=correction, + keepdim=keepdim, + return_mean=return_mean, + ) + output = ( + var_mean_sum_(**kwargs) + ) + output = tuple(to_dtype(x, out_dtype, copy=False) for x in output) + return output[0] if not return_mean else output + + @register_lowering(aten.var_mean) + def var_mean(x, axis=None, *, correction=None, keepdim=False): + return var_mean_helper_( + x, axis=axis, correction=correction, keepdim=keepdim, return_mean=True + ) + + @register_lowering([aten.var, prims.var]) + def var_(x, axis=None, *, correction=None, keepdim=False): + return var_mean_helper_( + x, axis=axis, correction=correction, keepdim=keepdim, return_mean=False + ) + + @register_lowering(aten.embedding, type_promotion_kind=None) + def embedding(weight, indices, padding_idx=-1, scale_grad_by_freq=False, sparse=False): + return fallback_handler(aten.embedding.default)(weight, indices, padding_idx=-1, scale_grad_by_freq=False, + sparse=False) + + @register_lowering(aten.cat) + def cat(inputs, dim=0): + return fallback_handler(aten.cat.default)(inputs, dim) + + make_fallback(aten._log_softmax) + make_fallback(aten.gather) + make_fallback(aten.nll_loss_forward) diff --git a/torch_npu/_inductor/lowering_fx.py b/torch_npu/_inductor/lowering_fx.py new file mode 100644 index 0000000000..5084c29534 --- /dev/null +++ b/torch_npu/_inductor/lowering_fx.py @@ -0,0 +1,2291 @@ +import functools +import itertools +import os +import textwrap +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Set, + Tuple, + Union, +) +import sympy +import torch._ops +import torch._ops +from sympy.core import Expr, Integer, Symbol +from torch._inductor import ir +from torch._inductor import ir +from torch._inductor import lowering +from torch._inductor import lowering +from torch._inductor import scheduler +from torch._inductor import scheduler +from torch._inductor.decomposition import decompositions +from torch._inductor.decomposition import decompositions, pw_cast_for_opmath +from torch._inductor.fx_passes.post_grad import view_to_reshape +from torch._inductor.ir import ( + ExpandView, + IndexingConstant, + is_triton, + ops_wrapper, + PermuteView, + Pointwise, + Reduction, + SqueezeView, + TensorBox, + IRNode, + validate_ir, + View, +) +from torch._inductor.ir import ExpandView, TensorBox +from torch._inductor.ir import ExpandView, TensorBox +from torch._inductor.ir import Reduction +from torch._inductor.ir import Reduction +from torch._inductor.lowering import sum_ +from torch._inductor.utils import ModularIndexing, FloorDiv +from 
torch._inductor.utils import ( + decode_device, + sympy_product, +) +from torch._inductor.utils import sympy_product +from torch._inductor.utils import sympy_product +from torch._inductor.virtualized import ops, V +from torch._prims_common import ( + canonicalize_dims, + check, + dtype_to_type, + ELEMENTWISE_TYPE_PROMOTION_KIND, + get_computation_dtype, + is_boolean_dtype, + is_float_dtype, + is_integer_dtype, + Number, +) +from torch._prims_common import ( + is_boolean_dtype, + is_integer_dtype, + get_computation_dtype, +) +from torch._prims_common import ( + is_boolean_dtype, + is_integer_dtype, + get_computation_dtype, +) +from torch.fx.experimental.proxy_tensor import make_fx +from torch.utils._sympy.functions import ( + FloorDiv, + Identity, + ModularIndexing, +) +from .config import log +from .lowering_op_list import GENERATE_LIST, GENERATE_LIST2, FALLBACK_LIST, LOWERING_OVERLOAD_OP + +aten = torch.ops.aten +tr_c10d = torch.ops.tr_c10d +prims = torch.ops.prims +npu = torch.ops.npu + + +def _init_set(input_list, output_set): + for fn in input_list: + output_set.add(fn) + if isinstance(fn, torch._ops.OpOverloadPacket): + for overload in fn.overloads(): + other_fn = getattr(fn, overload) + output_set.add(other_fn) + + +LOWERING_OVERLOAD_OP = list(set(GENERATE_LIST) | set(LOWERING_OVERLOAD_OP)) + +fn_to_aten_fn = {} +node_id = itertools.count(0) +snodes_to_fx = {} + + +def register_fn_to_aten_fn(fn, aten_fn=None): + if fn not in fn_to_aten_fn: + fn_to_aten_fn[fn] = aten_fn + return fn + + +def register_to_aten(aten_fn=None): + def decorator(fn): + if fn not in fn_to_aten_fn: + fn_to_aten_fn[fn] = aten_fn + return fn + + return decorator + + +reduction_type_to_aten_fn = { + "sum": aten.sum, + "prod": aten.prod, + "xor_sum": prims.xor_sum, + "any": aten.any, + "max": aten.amax, + "min": aten.amin, + "argmax": aten.argmax, + "argmin": aten.argmin +} + +operator_to_string = { + '+': 'a', + '-': 'sub', + '*': 'm', + '/': 'd', + '(': 'l', + ')': 'r', + '.': 'p', +} + +string_to_operator = {v: k for k, v in operator_to_string.items()} + + +def map_operators_to_strings(expr_str: str): + expr_str = expr_str.replace(' ', '') + for op, string in operator_to_string.items(): + expr_str = expr_str.replace(op, string) + return '_' + expr_str + + +def map_strings_to_operators(expr_str: str): + for op, string in string_to_operator.items(): + expr_str = expr_str.replace(op, string) + return expr_str[1:] + + +class TracedGraph: + def __init__(self): + self.graph = torch.fx.Graph() + self.last_node: Optional[torch.fx.Node] = None + self.sym_nodes: Dict[str, torch.fx.Node] = {} + + def __str__(self): + return str(self.graph) + + def get_placeholder_names(self): + placeholder_names = set() + for node in self.graph.nodes: + if node.op == 'placeholder' and node.name not in self.sym_nodes: + placeholder_names.add(node.name) + return placeholder_names + + __repr__ = __str__ + + +def create_fake_input(size, stride, device, dtype): + size = [V.graph.sizevars.shape_env.create_symintnode(s, hint=None) \ + if isinstance(s, Expr) and not isinstance(s, Integer) else s for s in size] + stride = [V.graph.sizevars.shape_env.create_symintnode(s, hint=None) \ + if isinstance(s, Expr) and not isinstance(s, Integer) else s for s in stride] + with V.graph.fake_mode: + fake_input = torch.empty_strided(size, stride, device=device, dtype=dtype) + return fake_input + + +def create_sym_inputs(traced_graph: TracedGraph, size: List[Expr]): + for s in size: + if isinstance(s, (List, Tuple)): + create_sym_inputs(traced_graph, s) + 
continue + if isinstance(s, Expr) and not isinstance(s, Integer): + s_name = str(s) + if not isinstance(s, Symbol): + s_name = map_operators_to_strings(s_name) + if s_name in traced_graph.sym_nodes: + continue + new_node = traced_graph.graph.placeholder(s_name) + new_node.meta['val'] = V.graph.sizevars.shape_env.create_symintnode(s, hint=None) + traced_graph.sym_nodes.update({s_name: new_node}) + + +def process_ir_constant(inp: ExpandView) -> Union[TracedGraph, int, float]: + skip = False + if isinstance(inp.data, IndexingConstant): + dtype = inp.data.dtype + inp = inp.data.index + # convert to original dtype. + if dtype in [torch.float32, torch.float16, torch.bfloat16]: + # sympy inputs + if isinstance(inp, Expr) and not isinstance(inp, sympy.core.numbers.Number): + traced_graph = TracedGraph() + create_sym_inputs(traced_graph, [inp]) + s_name = str(inp) + if not isinstance(inp, Symbol): + s_name = map_operators_to_strings(str(inp)) + traced_graph.last_node = traced_graph.sym_nodes[s_name] + inp = traced_graph + else: + inp = float(inp) + elif isinstance(inp.data, ir.Constant): + dtype = inp.data.dtype + inp = inp.data.value + else: + skip = True + return inp, skip + + +def fetch_graphs(inputs: Optional[List[TensorBox]]): + if isinstance(inputs, (TensorBox, ir.StorageBox, ir.View, sympy.Symbol, ir.Constant)): + inputs = [inputs] + input_graphs = [] + for inp in inputs: + if isinstance(inp, List): + input_graphs.append(fetch_graphs(inp)) + continue + if not isinstance(inp, ( + TensorBox, ir.StorageBox, ir.View, ir.ReinterpretView, ir.PermuteView, ir.SliceView, ir.ExpandView)): + input_graphs.append(inp) + continue + if isinstance(inp, ExpandView): + inp, skip = process_ir_constant(inp) + if not skip: + input_graphs.append(inp) + continue + name = inp.get_name() + traced_graph = inp.get_traced_graph() + if traced_graph is not None: + input_graphs.append(traced_graph) + continue + traced_graph = TracedGraph() + device = inp.get_device() + dtype = inp.get_dtype() + size = inp.get_size() + stride = inp.get_stride() + new_node = traced_graph.graph.placeholder(name) + fake_input = create_fake_input(size, stride, device, dtype) + new_node.meta['val'] = fake_input + traced_graph.last_node = new_node + input_graphs.append(traced_graph) + return input_graphs + + +def merge_traced_graphs(input_graphs: List[TracedGraph], origin_fn, node_name, **kwargs): + new_graph = TracedGraph() + exist_nodes: Dict[str, torch.fx.Node] = {} + + def merge_graph(input_graphs: List[TracedGraph]): + for input_graph in input_graphs: + if isinstance(input_graph, List): + merge_graph(input_graph) + continue + if not isinstance(input_graph, TracedGraph): + continue + for node in input_graph.graph.nodes: + if node.name in exist_nodes: + continue + new_node = new_graph.graph.node_copy(node, lambda n: exist_nodes[n.name]) + exist_nodes[node.name] = new_node + if node.name in input_graph.sym_nodes: + new_graph.sym_nodes.update({node.name: new_node}) + + def parse_args(input_graphs, exist_nodes): + args = [] + for input_graph in input_graphs: + if isinstance(input_graph, TracedGraph): + args.append(exist_nodes[input_graph.last_node.name]) + elif isinstance(input_graph, (List, Tuple)): + args.append(parse_args(input_graph, exist_nodes)) + else: + if isinstance(input_graph, Expr) and not isinstance(input_graph, Integer): + if not isinstance(input_graph, Symbol): + input_graph = map_operators_to_strings(str(input_graph)) + args.append(new_graph.sym_nodes[str(input_graph)]) + else: + args.append(input_graph) + return args + + 
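+    # Added note: symbolic kwargs (sympy Exprs) are lifted below into small
+    # placeholder-only TracedGraphs and appended to input_graphs just for the
+    # merge; input_graphs[:num_args] then drops those helper graphs so only
+    # the original positional inputs are parsed into call args.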
num_args = len(input_graphs) + + for k, v in kwargs.items(): + if isinstance(v, Expr) and not isinstance(v, Integer): + traced_graph = TracedGraph() + create_sym_inputs(traced_graph, [v]) + s_name = str(v) + if not isinstance(v, Symbol): + s_name = map_operators_to_strings(str(v)) + traced_graph.last_node = traced_graph.sym_nodes[s_name] + kwargs[k] = traced_graph.sym_nodes[s_name] + input_graphs.append(traced_graph) + merge_graph(input_graphs) + input_graphs = input_graphs[:num_args] + # if inputs do not have any valid graphs, like full/iota + create_sym_inputs(new_graph, input_graphs) + args = parse_args(input_graphs, exist_nodes) + with new_graph.graph.inserting_after(new_graph.last_node): + new_node = new_graph.graph.call_function(origin_fn, args=tuple(args), kwargs=kwargs) + new_node.name = node_name + new_graph.last_node = new_node + return new_graph + + +def merge_fx_graphs(traced_graphs: List[TracedGraph]): + new_graph = TracedGraph() + exist_nodes: Dict[str, torch.fx.Node] = {} + last_nodes = [] + + def merge_graph(input_graphs: List[TracedGraph]): + for input_graph in input_graphs: + if isinstance(input_graph, List): + merge_graph(input_graph) + continue + if not isinstance(input_graph, TracedGraph): + continue + for node in input_graph.graph.nodes: + if node.name in exist_nodes: + continue + new_node = new_graph.graph.node_copy(node, lambda n: exist_nodes[n.name]) + exist_nodes[node.name] = new_node + last_nodes.append(exist_nodes[input_graph.last_node.name]) + + merge_graph(traced_graphs) + new_graph.last_node = last_nodes + return new_graph + + +def subtract_graph(graph1: TracedGraph, graph2: TracedGraph, node_name=None) -> Tuple[TracedGraph, torch.fx.Node]: + new_graph = TracedGraph() + last_node2 = graph2.last_node + graph1_node_names = {node.name for node in graph1.graph.nodes} + graph2_node_names = {node.name for node in graph2.graph.nodes} + placeholder = None + exist_nodes: Dict[str, torch.fx.Node] = {} + if node_name not in graph1_node_names: + placeholder = new_graph.graph.placeholder(last_node2.name if node_name is None else node_name) + exist_nodes[last_node2.name] = placeholder + for node in graph1.graph.nodes: + if node.name in graph2_node_names and node.name not in graph1.sym_nodes: + continue + new_node = new_graph.graph.node_copy(node, lambda n: exist_nodes[n.name]) + exist_nodes[node.name] = new_node + new_graph.last_node = exist_nodes[graph1.last_node.name] + new_graph.sym_nodes = graph1.sym_nodes + return new_graph, placeholder + + +def get_last_node(gm: torch.fx.GraphModule): + last_node = None + for node in gm.graph.nodes: + last_node = node + return last_node + + +def tensor_info(tensor): + if isinstance(tensor, (list, tuple)): + infos = ", ".join(tensor_info(t) for t in tensor) + return f"[{infos}]" + if not isinstance(tensor, torch.Tensor): + return str(tensor) + info = str(tensor) + info = info[:-1] + info += f", strides={tensor.stride()})" + return info + + +def create_fx_from_snodes_by_traced_graph(snodes: List[scheduler.SchedulerNode]): + fx_call_inputs = [] + try: + for snode in snodes: + snode.node.data.traced_graph.last_node.name = snode.node.get_name() + except Exception as e: + log.warning(f"Could not rebuild fx graph for {snodes}, reason: {e}") + return None, None, None, None + + if len(snodes) == 1: + traced_graph = snodes[0].node.data.traced_graph + else: + traced_graph = merge_fx_graphs([snode.node.data.traced_graph for snode in snodes]) + fx_inputs = [] + for node in traced_graph.graph.nodes: + if node.op == 'placeholder': + 
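+            # Added note: each placeholder feeds two parallel lists, the
+            # buffer names used to match the kernel's call args and the
+            # meta['val'] fake tensors that the make_fx retrace below consumes.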
fx_call_inputs.append(node.target) + fx_inputs.append(node.meta['val']) + non_contiguous_indices = {} + non_contiguous_indices["inputs"] = [ + i + for i, inp in enumerate(fx_inputs) + if torch.is_tensor(inp) and not inp.is_contiguous() + ] + num_inputs = len(fx_call_inputs) + fx_call_outputs = [] + for snode in snodes: + if snode.has_aliasing_or_mutation(): + for buf in snode.get_outputs(): + if len(buf.get_mutations()): + fx_call_outputs.extend(buf.get_mutations()) + elif len(buf.get_aliases()): + fx_call_outputs.append(buf.get_name()) + elif snode.node.get_name() not in (V.graph.removed_buffers | V.graph.inplaced_to_remove): + fx_call_outputs.append(snode.node.get_name()) + num_outputs = len(fx_call_outputs) + outputs = traced_graph.last_node if isinstance(traced_graph.last_node, List) \ + else [traced_graph.last_node] + outputs = [ + output + for output in outputs + if output.name not in (V.graph.removed_buffers | V.graph.inplaced_to_remove) + ] + fx_call_args = fx_call_inputs + fx_call_outputs + traced_graph.graph.output(tuple(outputs)) + traced_graph.graph.lint() + orig_module = torch.nn.Module() + gm = torch.fx.GraphModule(orig_module, traced_graph.graph) + gm.recompile() + + def runnable_gm(*args): + return torch.fx.Interpreter(gm).run(*args) + + with V.graph.fake_mode: + gm = make_fx(runnable_gm)(*fx_inputs) + view_to_reshape(gm) + last_node = get_last_node(gm) + fx_output_nodes = last_node.args[0] + fx_outputs = [node.meta['val'] for node in fx_output_nodes] + non_contiguous_indices["outputs"] = [ + i + num_inputs + for i, call_output in enumerate(fx_call_outputs) + if not V.graph.try_get_buffer(call_output).layout.is_contiguous() + ] + fx_args = fx_inputs + fx_outputs + snodes_to_fx[str(snodes)] = f"{gm}\n inputs: {tensor_info(fx_inputs)}\n outputs: {tensor_info(fx_outputs)}\n" + + return gm, fx_call_args, fx_args, { + "num_inputs": num_inputs, + "num_outputs": num_outputs, + "non_contiguous_indices": non_contiguous_indices, + } + + +def create_compile_kwargs(final_kernel, fx_call_args, fx_args): + _, kernel_call_args, _, arg_types = final_kernel.args.python_argdefs() + for idx, call_arg in enumerate(fx_call_args): + if call_arg in final_kernel.args.inplace_buffers: + fx_call_args[idx] = final_kernel.args.inplace_buffers[call_arg].other_names[-1] + fx_arg_shapes = [fx_arg.shape for fx_arg in fx_args if isinstance(fx_arg, torch.Tensor)] + + if set(kernel_call_args) != set(fx_call_args): + return None + final_kernel.add_numel_to_call_args(final_kernel.kernel_name, kernel_call_args, arg_types) + + index_map = {element: idx for idx, element in enumerate(kernel_call_args)} + call_args_mapping = [index_map[element] for element in fx_call_args] + + mismatch_indices_shapes = {} + + for i in range(len(fx_call_args)): + mismatch_indices_shapes[i] = fx_arg_shapes[i] + + return { + "call_args_mapping": call_args_mapping, + "mismatch_indices_shapes": mismatch_indices_shapes, + } + + +def generate_fx_graph_code(code, kernel_code, kernel_name, compile_kwargs): + code = textwrap.indent(code, ' ') + code_template = f""" +import os +import torch +from torch._inductor.compile_fx import clone_preserve_strides +from torch._dynamo.testing import rand_strided +from torch import device + +import math +import random +import os +import tempfile +from math import inf, nan +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from 
torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels +from torch._inductor.codegen.multi_kernel import MultiKernelCall +import triton +import triton.language as tl +from torch._inductor.runtime.triton_heuristics import start_graph, end_graph +from torch_npu._inductor import get_current_raw_stream as get_raw_stream +from torch_npu._inductor import config as npu_config + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool + +file_path = os.path.abspath(__file__) +dir_path = os.path.dirname(file_path) + + +class GraphModule(torch.nn.Module): + def __init__(self): + super().__init__() +{code} +model = GraphModule().npu() +call_args_mapping = {compile_kwargs['call_args_mapping']} +num_inputs = {compile_kwargs['num_inputs']} +num_outputs = {compile_kwargs['num_outputs']} +non_contiguous_indices = {compile_kwargs['non_contiguous_indices']} +mismatch_indices_shapes = {compile_kwargs['mismatch_indices_shapes']} + +def run(): + async_compile = AsyncCompile() + {kernel_name} = async_compile.triton('{kernel_name}', ''' +{kernel_code} + ''', device_str='npu') + + async_compile.wait(globals()) + del async_compile + + stream0 = get_raw_stream(0) + + + args = torch.load(os.path.join(dir_path, "data.pth")) + + call_inputs_indices = call_args_mapping[:num_inputs] + call_outputs_indices = call_args_mapping[num_inputs:] + + args = [arg.npu() if isinstance(arg, torch.Tensor) else arg for arg in args] + + fx_args = [] + for idx in call_args_mapping: + arg = args[idx] + if isinstance(arg, torch.Tensor): + fx_arg = clone_preserve_strides(arg).float() if arg.dtype == torch.bfloat16 else clone_preserve_strides(arg) + fx_args.append(fx_arg) + + fx_inputs = [fx_args[idx].contiguous() if idx in non_contiguous_indices['inputs'] else fx_args[idx] for idx in range(num_inputs)] + if len(mismatch_indices_shapes): + for ind, shape in mismatch_indices_shapes.items(): + if ind >= num_inputs: + break + fx_inputs[ind] = fx_inputs[ind].reshape(shape) + model_outputs = model.forward(*fx_inputs) + for idx, (out1, out2) in enumerate(zip(model_outputs, fx_args[num_inputs:(num_inputs + num_outputs)])): + out1 = out1.reshape(out2.shape) + if idx in non_contiguous_indices['outputs']: + out2.copy_(out1) + else: + out2.data = out1.data + + {kernel_name}.run(*args, stream=stream0) + + for actual, expected in zip([args[i] for i in call_outputs_indices], fx_args[num_inputs:]): + if actual.dtype != expected.dtype: + expected = expected.to(actual.dtype) + acc_comp_tol = npu_config.acc_comp_tol.get(actual.dtype, npu_config.acc_comp_tol['default']) + rtol = acc_comp_tol['rtol'] + atol = acc_comp_tol['atol'] + try: + torch.testing.assert_close(actual, expected, rtol=rtol, atol=atol, equal_nan=False) + except Exception as e: + print(e) + +if __name__ == "__main__": + run() +""" + return code_template + + +def dump_fx_graph_code(code, dump_path, traced_graph_hash): + py_path = os.path.join(dump_path, traced_graph_hash + '.py') + with open(py_path, 'w') as f: + f.write(code) + + +def clone(x, *, memory_format=None): + # TODO(jansel): memory format + input_graphs = 
fetch_graphs(x) + node_name = f'clone_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.clone, node_name) + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=x.make_loader(), + ranges=list(x.get_size()), + traced_graph=new_graph, + node_name=node_name + ) + + +def _register_npu_inductor_fallbacks(): + gen_set = set() + _init_set(GENERATE_LIST, gen_set) + overload_op_set = set() + _init_set(LOWERING_OVERLOAD_OP, overload_op_set) + + # 把不在白名单的op fallback + for op in lowering.lowerings: + if op not in decompositions and op not in gen_set: + if isinstance(op, torch._ops.OpOverloadPacket) or \ + isinstance(op, (torch._ops.OpOverload, torch._ops.HigherOrderOperator)): + flag = False + for gens in GENERATE_LIST2: + if str(op).find(gens) != -1: + flag = True + if flag: + continue + else: + lowering.make_fallback(op) + FALLBACK_LIST.append(op) + + # 把需要overload的op在lowering里删除 + for op in overload_op_set: + if op in lowering.lowerings: + del lowering.lowerings[op] + + def transform_args( + args: List[Any], + kwargs: Dict[str, Any], + broadcast: bool, + type_promotion_kind: Optional[ELEMENTWISE_TYPE_PROMOTION_KIND], + convert_input_to_bool: bool, + ) -> Tuple[List[Any], Dict[str, Any]]: + args_indices = [i for i, x in enumerate(args) if isinstance(x, TensorBox)] + kwargs_indices = [k for k, v in kwargs.items() if isinstance(v, TensorBox)] + # check that there's something to transform + if not args_indices and not kwargs_indices: + return args, kwargs + + if type_promotion_kind or convert_input_to_bool: + if convert_input_to_bool: + dtype = torch.bool + else: + # this is a crude approximation for promoting args + promoting_args = [ + a + for a in args + if isinstance(a, (Number, sympy.Basic)) or hasattr(a, "dtype") + ] + # only consider tensor kwargs for promotion, for now + promoting_args.extend(a for a in kwargs.values() if hasattr(a, "dtype")) + dtype = lowering.get_promoted_dtype( + *promoting_args, type_promotion_kind=type_promotion_kind # type: ignore[arg-type] + ) + + device = ( + args[args_indices[0]] if args_indices else kwargs[kwargs_indices[0]] + ).get_device() + + # sometimes args are an immutable list so we can't mutate them + def promote(arg): + if isinstance(arg, TensorBox): + return to_dtype(arg, dtype) + elif isinstance(arg, ir.Constant): + return ir.Constant(value=arg.value, dtype=dtype, device=device) + else: + return arg + + args = [promote(a) for a in args] + kwargs = {k: promote(v) for k, v in kwargs.items()} + + if broadcast: + broadcasted = broadcast_tensors( + *list( + itertools.chain( + (args[i] for i in args_indices), + (kwargs[k] for k in kwargs_indices), + ) + ) + ) + size = list(broadcasted[0].get_size()) + + for i, x in zip(args_indices, broadcasted[: len(args_indices)]): + args[i] = x + for k, x in zip(kwargs_indices, broadcasted[len(args_indices):]): + kwargs[k] = x + + for i in range(len(args)): + if isinstance(args[i], ir.Constant): + args[i] = ExpandView.create(args[i], size) + for k in kwargs: + if isinstance(kwargs[k], ir.Constant): + kwargs[k] = ExpandView.create(kwargs[k], size) + + return args, kwargs + + def _register_lowering( + aten_fn, decomp_fn, broadcast, type_promotion_kind, convert_input_to_bool + ): + + """ + Add a lowering to lowerings dict + + Arguments: + aten_fn: torch.ops.aten.* fn we are lowering + decomp_fn: alternate implementation on our IR + broadcast: True to apply broadcasting to tensor inputs + type_promotion_kind: kind of type promotion applied to tensor inputs, `None` means no type 
promotion + convert_input_to_bool: some logical ops require inputs are converted to bool + """ + + @functools.wraps(decomp_fn) + def wrapped(*args, **kwargs): + args: List[Any] = list(args) + kwargs: Dict[str, Any] = dict(kwargs) + unpacked = False + # maybe we need to use pytrees here + if len(args) == 1 and isinstance(args[0], (list, tuple)): + unpacked = True + args = list(args[0]) + + if not all( + (fn in lowering.fallbacks or lowering.in_namespace(fn, "_c10d_functional")) for fn in aten_fn + ): + # explicitly assert for "out=" ops for better error messages + if any(x == "out" for x in kwargs.keys()): + raise RuntimeError("assert out= ops aren't yet supported") + + args, kwargs = transform_args( + args, kwargs, broadcast, type_promotion_kind, convert_input_to_bool + ) + + if unpacked: + args = [args] + + out = decomp_fn(*args, **kwargs) + validate_ir(out) + + return out + + aten_fn = lowering.get_overloads(aten_fn) + + lowering.lowerings.update(dict.fromkeys(aten_fn, wrapped)) + return wrapped + + def register_lowering( + aten_fn, + broadcast=False, + type_promotion_kind=lowering.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + convert_input_to_bool=False, + ): + + """ + Shim to support decorator syntax. + """ + return functools.partial( + _register_lowering, + aten_fn, + broadcast=broadcast, + type_promotion_kind=type_promotion_kind, + convert_input_to_bool=convert_input_to_bool, + ) + + def _make_reduction_inner(x, *, axis, keepdims, dtype, override_return_dtype): + if dtype is not None: + x = to_dtype(x, dtype) + size = x.get_size() + axis = set(lowering._validate_reduction_axis(x, axis)) + + kept_sizes = [] + kept_idx = [] + reduced_sizes = [] + reduced_idx = [] + for i in range(len(size)): + if i in axis: + reduced_idx.append(i) + reduced_sizes.append(size[i]) + else: + kept_idx.append(i) + kept_sizes.append(size[i]) + + def loader(index, reduction_index): + if len(reduction_index) != len(reduced_idx): + raise RuntimeError("assert reduction index length mismatch") + if keepdims: + if len(index) != len(size): + raise RuntimeError("assert index size length mismatch") + index = [index[i] for i in kept_idx] + if len(index) != len(kept_idx): + raise RuntimeError("assert index kept_idx length mismatch") + new_index = [None] * (len(index) + len(reduction_index)) + for idx, var in itertools.chain( + zip(kept_idx, index), zip(reduced_idx, reduction_index) + ): + new_index[idx] = var + return inner_loader(new_index) + + if keepdims: + new_size = list(size) + for i in reduced_idx: + new_size[i] = sympy.S.One + else: + new_size = kept_sizes + + inner_loader = x.make_loader() + return dict( + device=x.get_device(), + dst_dtype=override_return_dtype or x.get_dtype(), + src_dtype=x.get_dtype(), + inner_fn=loader, + ranges=new_size, + reduction_ranges=reduced_sizes, + ) + + def make_reduction(reduction_type: str, override_return_dtype=None): + def inner(x, axis=None, keepdims=False, *, dtype=None): + kwargs = _make_reduction_inner( + x, + axis=axis, + keepdims=keepdims, + dtype=dtype, + override_return_dtype=override_return_dtype, + ) + node_name = f'reduction_{next(node_id)}' + input_graphs = fetch_graphs([x, axis if axis is not None else list(range(len(x.get_size())))]) + new_graph = merge_traced_graphs(input_graphs, reduction_type_to_aten_fn[reduction_type], + node_name, keepdim=keepdims) + + result = Reduction.create(reduction_type=reduction_type, + input_node=x, + node_name=node_name, + traced_graph=new_graph, + **kwargs) + if isinstance( + result.data.data, Reduction + ): + # Only realize if 
reduction isn't unrolled + size = x.get_size() + axis = set(lowering._validate_reduction_axis(x, axis)) + kept_idx = [] + reduced_idx = [] + for i in range(len(size)): + if i in axis: + reduced_idx.append(i) + else: + kept_idx.append(i) + + object.__setattr__(result.data.data, "kept_idx", kept_idx) + object.__setattr__(result.data.data, "reduced_idx", reduced_idx) + + result.realize() + return result + + return inner + + lowering.make_reduction = make_reduction + + def to_dtype(x: TensorBox, dtype: torch.dtype, copy=False): + src_dtype = x.get_dtype() + if src_dtype == dtype: + return clone(x) if copy else x + + def _to_dtype(x): + return ops.to_dtype(x, dtype, src_dtype=src_dtype) + + register_fn_to_aten_fn(_to_dtype, aten.to.dtype) + return make_pointwise(_to_dtype, override_return_dtype=dtype, dtype=dtype)(x) + + @register_lowering(prims.convert_element_type, type_promotion_kind=None) + def _convert_element_type(x: TensorBox, dtype: torch.dtype): + if dtype.is_complex or x.get_dtype().is_complex: + if x.get_size(): + # Decompose since aa aten fallback is more friendly for c++ codegen. + # This decomposition doesn't work for empty tensor, which needs more investigation. + dst = empty_like(x, dtype=dtype) + ir.InplaceCopyFallback.create(dst, x) + return dst + else: + return lowering.fallback_handler( + prims.convert_element_type.default, add_to_fallback_set=False + )(x, dtype) + return to_dtype(x, dtype, copy=True) + + def register_pointwise( + aten_fn, + name=None, + broadcast=True, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + convert_input_to_bool=False, + override_return_dtype=None, + override_fn_when_input_bool=None, + allow_alpha=False, + use_libdevice_for_f64=False, + triton_fallback=None, + ): + """A pointwise function that maps ops.{name} to inputs""" + name = name or aten_fn.__name__ + fn = ops_wrapper(name) + if use_libdevice_for_f64: + fn_libdevice = ops_wrapper("libdevice_" + name) + lowering.register_op_dtype_propagation_rules( + "libdevice_" + name, type_promotion_kind, override_return_dtype + ) + + lowering.register_op_dtype_propagation_rules( + name, type_promotion_kind, override_return_dtype + ) + + if override_fn_when_input_bool is not None: + override_fn_when_input_bool = ops_wrapper(override_fn_when_input_bool) + + fn = register_fn_to_aten_fn(fn, aten_fn) + + fn = make_pointwise( + fn, + override_return_dtype=override_return_dtype, + override_fn_when_input_bool=override_fn_when_input_bool, + override_fn_when_gpu_float64=fn_libdevice if use_libdevice_for_f64 else None, + # type: ignore[possibly-undefined] + allow_alpha=allow_alpha, + triton_fallback=triton_fallback, + ) + fn = register_lowering( + aten_fn, + broadcast=broadcast, + type_promotion_kind=type_promotion_kind, + convert_input_to_bool=convert_input_to_bool, + )(fn) + + if hasattr(prims, name): + register_lowering( + getattr(prims, name), + type_promotion_kind=None, + convert_input_to_bool=convert_input_to_bool, + )(fn) + return fn + + def make_pointwise( + fn, + override_return_dtype=None, + override_device=None, + override_fn_when_input_bool=None, + override_fn_when_gpu_float64=None, + allow_alpha=False, + triton_fallback=None, + **kwargs + ): + def inner(*inputs: TensorBox, alpha=None): + if triton_fallback is not None and any( + isinstance(inp, IRNode) and is_triton(inp) for inp in inputs + ): + # not implemented + if allow_alpha: + raise RuntimeError("assert allow_alpha is not allowed") + return triton_fallback(*inputs) + + inputs = lowering.promote_constants(inputs, 
override_return_dtype) + if allow_alpha: + if alpha is not None and alpha != 1: + inputs = list(inputs) + inputs[-1] = mul(inputs[-1], alpha) + else: + if alpha is not None: + raise RuntimeError("assert alpha is not None") + loaders = [x.make_loader() for x in inputs] + ranges = inputs[0].get_size() + dtype = override_return_dtype or inputs[0].get_dtype() + is_gpu_device = lowering.is_gpu(decode_device(inputs[0].get_device()).type) + + for other in inputs[1:]: + if not (isinstance(other, ir.BaseConstant) or len(ranges) == len(other.get_size())): + raise RuntimeError(f"assert ndim mismatch {fn} {ranges} {other.get_size()}") + + # in tracing, we will annotate pointwise nodes that correspond to the output of + # a pointwise node that would have been run in eager. intermediary pointwise nodes + # during decompositions are not annotated. + emulate_precision_casts = ( + V.graph is not None + and getattr(V.graph, "current_node", None) is not None + and V.graph.current_node.meta is not None + and V.graph.current_node.meta.get("low_precision_pointwise_barrier", False) + and dtype in (torch.bfloat16, torch.float16) + ) + + def inner_fn(index): + if len(index) != len(ranges): + raise RuntimeError(f"assert wrong ndim {index} {ranges}") + if dtype == torch.bool and override_fn_when_input_bool is not None: + return override_fn_when_input_bool(*[load(index) for load in loaders]) + elif ( + override_fn_when_gpu_float64 + and is_gpu_device + and dtype == torch.float64 + ): + return override_fn_when_gpu_float64(*[load(index) for load in loaders]) + else: + inputs_loaded = [] + for load in loaders: + out = load(index) + if emulate_precision_casts: + downcast = ops.to_dtype(out, dtype, use_compute_types=False) + out = ops.to_dtype(downcast, dtype) + inputs_loaded.append(out) + + out = fn(*inputs_loaded) + if emulate_precision_casts: + # fp16/bf16 kernels are computed in fp32. Casting down to fp16/bf16 here, + # then upcasting again, to emulate casts that eager would do. 
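+                        # Added note: this rounding emulation only triggers when
+                        # the current node was tagged "low_precision_pointwise_barrier"
+                        # (see emulate_precision_casts above); the down-then-up cast
+                        # re-rounds the fp32 intermediate through bf16/fp16 to match
+                        # eager's per-op casts.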
+ downcast = ops.to_dtype(out, dtype, use_compute_types=False) + return ops.to_dtype(downcast, dtype) + return out + + if not override_device: + device = None + for i in inputs: + if lowering.is_gpu(i.get_device().type): + device = i.get_device() + break + if not device: + device = inputs[0].get_device() + + device = override_device or device + + input_graphs = fetch_graphs(inputs) + node_name = f'pointwise_{next(node_id)}' + origin_fn = fn_to_aten_fn[fn] + new_graph = merge_traced_graphs(input_graphs, origin_fn, node_name, **kwargs) + + return Pointwise.create( + device=device, + dtype=dtype, + inner_fn=inner_fn, + ranges=ranges, + node_name=node_name, + traced_graph=new_graph, + ) + + return inner + + @register_lowering(aten.where, broadcast=False, type_promotion_kind=None) + def where(cond, a, b): + def fn(*args): + return ops.where(*args) + + if isinstance(a, (float, int)): + a = lowering.constant_like(a)(b) + if isinstance(b, (float, int)): + b = lowering.constant_like(b)(a) + + args = [cond, a, b] + dtype = lowering.get_promoted_dtype( + args[1], args[2], type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT + ) + indices = [i for i, x in enumerate(args) if isinstance(x, TensorBox)] + for i, x in zip(indices, broadcast_tensors(*[args[i] for i in indices])): + args[i] = x + for i in range(len(args)): + if isinstance(args[i], ir.Constant): + args[i] = ExpandView.create(args[i], list(args[indices[0]].get_size())) + register_fn_to_aten_fn(fn, aten.where) + return make_pointwise(fn, override_return_dtype=dtype)( + args[0], to_dtype(args[1], dtype), to_dtype(args[2], dtype) + ) + + @register_lowering(aten.broadcast_tensors, broadcast=False, type_promotion_kind=None) + def broadcast_tensors(*inputs): + if len(inputs) == 1 and isinstance(inputs[0], (list, tuple)): + return broadcast_tensors(*inputs[0]) + target: List[sympy.Expr] = functools.reduce( + lowering.broadcast_symbolic_shapes, [x.get_size() for x in inputs], [] + ) + outputs = [] + for x in inputs: + sizes = x.get_size() + if len(sizes) != len(target) or any( + ( + ( + V.graph.sizevars.shape_env.evaluate_expr( + sympy.Eq(a, 1), size_oblivious=True + ) + and not V.graph.sizevars.shape_env.evaluate_expr( + sympy.Eq(b, 1), size_oblivious=True + ) + ) + or ( + not V.graph.sizevars.shape_env.evaluate_expr( + sympy.Eq(a, 1), size_oblivious=True + ) + and V.graph.sizevars.shape_env.evaluate_expr( + sympy.Eq(b, 1), size_oblivious=True + ) + ) + ) + for a, b in zip(sizes, target) + ): + x = expand(x, target) + outputs.append(x) + return outputs + + @register_lowering(aten.squeeze, type_promotion_kind=None) + def squeeze(x, dim=None): + if not isinstance(x, TensorBox): + raise RuntimeError("assert x should be instance of TensorBox") + + if dim is None: + return TensorBox(SqueezeView.create(x.data)) + + dim = ( + V.graph.sizevars.evaluate_static_shape(dim) + if isinstance(dim, (int, sympy.Expr)) + else tuple(V.graph.sizevars.evaluate_static_shape(d) for d in dim) + ) + dim = canonicalize_dims(len(x.get_size()), dim) # type: ignore[call-overload] + dims = set((dim,) if not isinstance(dim, tuple) else dim) + + new_shape = [] + for d, s in enumerate(x.get_size()): + if not ( + d in dims + and V.graph.sizevars.evaluate_expr(sympy.Eq(s, 1, size_oblivious=True)) + ): + new_shape.append(s) + + # squeeze does nothing if the size isn't 1 + return view(x, new_shape) if new_shape != x.get_size() else x + + @register_lowering([aten.squeeze_]) + def squeeze_(x, dim=None): + val = squeeze(x, dim) + if not isinstance(x, TensorBox): + raise 
RuntimeError("assert x should be instance of TensorBox") + if not isinstance(val, TensorBox): + raise RuntimeError("assert val should be instance of TensorBox") + x.data = val.data + return x + + @register_lowering(aten.isinf) + def isinf(x): + if lowering.is_integer_type(x): + return full_like(x, False, dtype=torch.bool) + fn = ops_wrapper("isinf") + register_fn_to_aten_fn(fn, aten.isinf) + return make_pointwise(fn, override_return_dtype=torch.bool)(x) + + @register_lowering(aten.isnan) + def isnan(x): + if lowering.is_integer_type(x): + return full_like(x, False, dtype=torch.bool) + fn = ops_wrapper("isnan") + register_fn_to_aten_fn(fn, aten.isnan) + return make_pointwise(fn, override_return_dtype=torch.bool)(x) + + @register_lowering(aten.ceil) + def ceil(x): + if lowering.is_integer_type(x): + return clone(x) + fn = ops_wrapper("ceil") + register_fn_to_aten_fn(fn, aten.ceil) + return make_pointwise(fn)(x) + + @register_lowering(aten.floor) + def floor(x): + if lowering.is_integer_type(x): + return clone(x) + fn = ops_wrapper("floor") + register_fn_to_aten_fn(fn, aten.floor) + return make_pointwise(fn)(x) + + @register_lowering(aten.round.default) + def round(x): + if lowering.is_integer_type(x): + return clone(x) + else: + fn = ops_wrapper("round") + register_fn_to_aten_fn(fn, aten.round) + return make_pointwise(fn)(x) + + @register_lowering(aten.trunc) + def trunc(x): + if lowering.is_integer_type(x): + return clone(x) + fn = ops_wrapper("trunc") + register_fn_to_aten_fn(fn, aten.trunc) + return make_pointwise(fn)(x) + + @register_lowering(aten.expand, type_promotion_kind=None) + def expand(x, sizes): + from torch.fx.experimental.symbolic_shapes import free_unbacked_symbols + + (x,) = lowering.promote_constants([x]) + if isinstance(x, ir.BaseConstant): + return ExpandView.create(x, tuple(sizes)) + if not isinstance(x, TensorBox): + raise RuntimeError("assert x should be instance of TensorBox") + if not isinstance(sizes, (list, tuple)): + raise RuntimeError("assert x should be instance of (list, tuple)") + if tuple(x.get_size()) == tuple(sizes): + return x + + if not free_unbacked_symbols(x.get_size()): + x_size_product = V.graph.sizevars.size_hint(sympy_product(x.get_size())) + # It would be better to realize the input if any of its sizes + # are unbacked, because typically the size will be non-zero. 
However, + # this cannot be done directly as below as we'll choke on the size_hint + # here + if x_size_product > 0 and not free_unbacked_symbols(sizes): + # maybe realize input before broadcasting it + x.mark_reuse( + V.graph.sizevars.size_hint(sympy_product(sizes)) // x_size_product + ) + input_graphs = fetch_graphs([x.data, tuple(sizes)]) + node_name = f'expand_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.expand, node_name) + return TensorBox(ExpandView.create(x.data, tuple(sizes), traced_graph=new_graph, node_name=node_name)) + + @register_lowering(aten.expand_as, type_promotion_kind=None) + def expand_as(x, y): + return expand(x, y.get_size()) + + @register_lowering(aten.repeat) + def repeat(x, repeats): + input_graphs = fetch_graphs([x, repeats]) + node_name = f'repeat_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.repeat, node_name) + old_size = list(x.get_size()) + if len(repeats) > len(old_size): + old_size = [sympy.S.One] * (len(repeats) - len(old_size)) + old_size + x = view(x, list(old_size)) + if len(repeats) != len(x.get_size()): + raise RuntimeError("assert repeat should have same size as x.size") + + new_size = list(x.get_size()) + + zero_tensor = False + for i in range(len(repeats)): + if repeats[i] == 0: + zero_tensor = True + new_size[i] = new_size[i] * repeats[i] + + if zero_tensor: + return empty(new_size, dtype=x.get_dtype(), device=x.get_device()) + if all((a == 1 or b == 1) for a, b in zip(repeats, old_size)): + return clone(expand(x, new_size)) + + x_loader: Callable[[Any], Any] + + def inner_fn(index): + if len(index) != len(repeats): + raise RuntimeError("assert repeat should have same length as repeats") + index = list(index) + for i in range(len(repeats)): + if repeats[i] != 1: + if old_size[i] == 1: + index[i] = sympy.S.Zero + else: + index[i] = ModularIndexing(index[i], 1, old_size[i]) + return x_loader(index) + + old_size_product = V.graph.sizevars.size_hint(sympy_product(old_size)) + if old_size_product > 0: + # maybe realize the input + x.mark_reuse( + V.graph.sizevars.size_hint(sympy_product(new_size)) // old_size_product + ) + + x_loader = x.make_loader() + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=inner_fn, + ranges=list(new_size), + traced_graph=new_graph, + node_name=node_name + ) + + @register_lowering(aten._unsafe_view, type_promotion_kind=None) + @register_lowering(aten.view, type_promotion_kind=None) + @register_lowering(aten.reshape, type_promotion_kind=None) + def view(x, sizes): + if not isinstance(x, TensorBox): + raise RuntimeError("assert x should be instance of TensorBox") + if not isinstance(sizes, (list, tuple)): + raise RuntimeError("assert sizes should be instance of (list, tuple)") + input_graphs = fetch_graphs([x.data, sizes]) + node_name = f'view_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.reshape, node_name) + return TensorBox(View.create(x.data, sizes, traced_graph=new_graph, node_name=node_name)) + + @register_lowering(aten.permute, type_promotion_kind=None) + def permute(x, dims): + if not isinstance(x, TensorBox): + raise RuntimeError("assert x should be instance of TensorBox") + if not isinstance(dims, (list, tuple)): + raise RuntimeError("assert dims should be instance of (list, tuple)") + input_graphs = fetch_graphs([x.data, dims]) + node_name = f'permute_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.permute, node_name) + return TensorBox(PermuteView.create(x.data, tuple(dims), 
traced_graph=new_graph, node_name=node_name)) + + @register_lowering(aten.slice, type_promotion_kind=None) + def slice_(x, dim=0, start=0, end=2 ** 63, step=1, clamp=True): + if not isinstance(x, TensorBox): + raise RuntimeError("assert x should be instance of TensorBox") + dim = _validate_dim(x, dim, 0) + input_graphs = fetch_graphs([x.data]) + node_name = f'slice_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.slice, node_name, dim=dim, start=start, end=end, step=step) + + return TensorBox( + ir.SliceView.create(x.data, dim, start, end, step, traced_graph=new_graph, node_name=node_name)) + + @register_lowering(aten.select, type_promotion_kind=None) + def select(x, dim, idx): + idx = View.handle_negative_index(idx, x.get_size()[dim]) + return squeeze(slice_(x, dim, idx, idx + 1), dim) + + @register_lowering(aten.split, type_promotion_kind=None) + def split(x, sizes, dim=0): + dim = _validate_dim(x, dim, 0) + sizes_ = sizes + + # If sizes is an integer (or a SymInt), we turn it into a list of sizes + # by computing what the actual size of each chunk should be. + if not isinstance(sizes, (list, tuple)): + x_size = x.get_size()[dim] + chunks = V.graph.sizevars.evaluate_static_shape( + FloorDiv(x_size + sizes - 1, sizes) + ) + sizes_ = [sizes] * chunks + # The last chunk might have a smaller size than the rest. + sizes_[-1] = x_size - (chunks - 1) * sizes + + # From this point, we assume that the sum of the sizes of all chunks + # equals the size of the base tensor. + result = [] + start = 0 + for size in sizes_: + end = start + size + # No need for clamping here, since we compute the exact + # start and end values. + result.append(slice_(x, dim, start, end, clamp=False)) + start = end + return result + + @register_lowering(aten.split_with_sizes, type_promotion_kind=None) + def split_with_sizes(x, sizes, dim=0): + return split(x, sizes, dim) + + @register_lowering(aten.unbind, type_promotion_kind=None) + def unbind(x, dim=0): + dim = _validate_dim(x, dim, 0) + x_size = V.graph.sizevars.evaluate_static_shape(x.get_size()[dim]) + result = [select(x, dim, i) for i in range(x_size)] + return result + + @register_lowering(aten.unsqueeze, type_promotion_kind=None) + def unsqueeze(x, dim): + dim = _validate_dim(x, dim, 1) + new_shape = list(x.get_size()) + new_shape.insert(dim, sympy.S.One) + return view(x, new_shape) + + @register_lowering(aten.unsqueeze_, type_promotion_kind=None) + def unsqueeze_(x, dim): + val = unsqueeze(x, dim) + if not isinstance(x, TensorBox): + raise RuntimeError("assert x should be instance of TensorBox") + if not isinstance(val, TensorBox): + raise RuntimeError("assert val should be instance of TensorBox") + x.data = val.data + return x + + def _validate_dim(x, dim, offset=0): + dim = V.graph.sizevars.shape_env.evaluate_expr(sympy.sympify(dim)) + ndim = len(x.get_size()) + if dim < 0: + dim += ndim + offset + if not (0 <= dim < ndim + offset): + raise RuntimeError(f"assert dim {dim} is out of bounds. 
Expected: 0 <= dim < {ndim + offset}") + return dim + + @register_lowering(aten.copy, type_promotion_kind=None) + def copy(self, src, non_blocking=False): + x = src + if self.get_device() != src.get_device(): + x = lowering.to_device(x, self.get_device()) + if self.get_dtype() != src.get_dtype(): + x = to_dtype(x, self.get_dtype()) + + if self.get_size() != src.get_size(): + out = expand(x, self.get_size()) + return clone(out) + return clone(x) + + @register_lowering(prims.iota) + def iota( + length, + *, + start, + step, + dtype, + device, + requires_grad, + ): + def fn(index): + return ops.index_expr(step * index[0] + start, dtype=dtype) + + node_name = f'iota_{next(node_id)}' + new_graph = merge_traced_graphs([length], prims.iota, node_name, \ + start=start, step=step, \ + dtype=dtype, device=device, \ + requires_grad=requires_grad) + return Pointwise.create( + device=decode_device(device), + dtype=dtype, + inner_fn=fn, + ranges=[length], + traced_graph=new_graph, + node_name=node_name + ) + + @register_lowering(aten.select_scatter, type_promotion_kind=None) + def select_scatter(x, src, dim: int, index: int): + if x.get_dtype() != src.get_dtype(): + raise RuntimeError(f"assert Expected dtype {src.get_dtype()}, but got {x.get_dtype()}") + input_graphs = fetch_graphs([x, src, dim, index]) + node_name = f'select_scatter_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.select_scatter, node_name) + x_loader = x.make_loader() + dim = _validate_dim(x, dim, 0) + if V.graph.sizevars.evaluate_expr(sympy.Lt(index, 0)): + index = index + x.get_size()[dim] + V.graph.sizevars.guard_leq(0, index) # type: ignore[arg-type] + V.graph.sizevars.guard_lt(index, x.get_size()[dim]) # type: ignore[arg-type] + src = expand(unsqueeze(src, dim), x.get_size()) + src_loader = src.make_loader() + + def inner_fn(idx): + return ops.where( + ops.eq( + ops.index_expr(idx[dim], torch.int32), + ops.index_expr(index, torch.int32), + ), + src_loader(idx), + x_loader(idx), + ) + + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=inner_fn, + ranges=list(x.get_size()), + traced_graph=new_graph, + node_name=node_name + ) + + @register_lowering(aten.slice_scatter, type_promotion_kind=None) + def slice_scatter(x, src, dim=0, start=None, end=None, step=1): + if x.get_dtype() != src.get_dtype(): + raise RuntimeError(f"assert Expected dtype {src.get_dtype()}, but got {x.get_dtype()}") + input_graphs = fetch_graphs([x, src]) + node_name = f'slice_scatter_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.slice_scatter, node_name, \ + dim=dim, + start=start, + end=end, + step=step) + x_loader = x.make_loader() + dim = _validate_dim(x, dim, 0) + dim_size = x.get_size()[dim] + + start, end = ir.SliceView.normalize_start_end(x, dim, start, end) + + src_size = list(x.get_size()) + src_size[dim] = FloorDiv(end - start + (step - 1), step) + src = expand(src, src_size) + src_loader = src.make_loader() + + def inner_fn(idx): + if start == 0 and end == dim_size and step == 1: + # selecting every element is the same as just src.clone() + return src_loader(idx) + + idx_dim = ops.index_expr(idx[dim], torch.int64) + src_idx = list(idx) + src_idx[dim] = FloorDiv(idx[dim] - start, step) + + mask = [] + if start != 0: + mask.append( + ops.ge( + idx_dim, + ops.index_expr(sympy.expand(start), torch.int64), + ) + ) + if end != dim_size: + mask.append( + ops.lt( + idx_dim, + ops.index_expr(sympy.expand(end), torch.int64), + ) + ) + if step != 1: + mask.append( + ops.eq( + 
ops.index_expr( + ModularIndexing(idx[dim] - start, 1, step), torch.int64 + ), + ops.constant(0, torch.int64), + ) + ) + if not mask: + raise RuntimeError("assert mask cannot be empty") + mask = functools.reduce(ops.and_, mask) + src_val = ops.masked( + mask, + lambda: src_loader(src_idx), + 0 if lowering.is_integer_type(x) else 0.0, + ) + return ops.where( + mask, + src_val, + x_loader(idx), + ) + + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=inner_fn, + ranges=list(x.get_size()), + traced_graph=new_graph, + node_name=node_name + ) + + @register_lowering([torch.tensor, aten.scalar_tensor]) + def tensor(data, *, dtype=None, device=None, layout=None, pin_memory=False): + lowering.assert_nyi(layout in (None, torch.strided), f"layout={layout}") + lowering.assert_nyi(not pin_memory, "pin_memory") + input_graphs = fetch_graphs([data]) + node_name = f'tensor_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.scalar_tensor, node_name, \ + dtype=dtype, + device='npu', + layout=layout, + pin_memory=False) + if isinstance(lowering._unwrap(data), int): + dtype = dtype or torch.int64 + else: + dtype = dtype or torch.get_default_dtype() + + ranges: List[sympy.Expr] = [] + + if isinstance(data, sympy.Basic): + + def inner_fn(index): + return ops.index_expr(data, dtype) + + elif isinstance(data, (float, int)): + + def inner_fn(index): + return ops.constant(data, dtype) + + elif len(data) == 0 or isinstance(data[0], (float, int)) and len(data) <= 8: + # inline small tensors + ranges.append(sympy.Integer(len(data))) + + def inner_fn(index): + def binary_search(start, end): + if start >= end: + raise RuntimeError(f"assert start ({start}) must be less than end ({end})") + if end - start == 1: + return ops.constant(data[start], dtype) + mid = (end - start) // 2 + start + return ops.where( + ops.lt( + ops.index_expr(index[0], torch.int64), + ops.constant(mid, torch.int64), + ), + binary_search(start, mid), + binary_search(mid, end), + ) + + if len(data) == 0: + return ops.constant(0, dtype) + return binary_search(0, len(data)) + + else: + return V.graph.add_tensor_constant( + torch.tensor(data, dtype=dtype, device=device) + ) + + return Pointwise.create( + device=decode_device(device), + dtype=dtype, + inner_fn=inner_fn, + ranges=ranges, + traced_graph=new_graph, + node_name=node_name + ) + + def tensor_constructor(fill_value): + # torch.zeros, torch.ones, etc + def inner( + *size, + names=None, + dtype=None, + device=None, + layout=None, + pin_memory=False, + memory_format=None, + ): + lowering.assert_nyi(names is None, "named tensors") + lowering.assert_nyi(layout in (None, torch.strided), f"layout={layout}") + lowering.assert_nyi(not pin_memory, "pin_memory") + device = decode_device(device) + dtype = dtype or torch.get_default_dtype() + if len(size) == 1 and isinstance(size[0], (list, tuple, torch.Size)): + size = tuple(size[0]) + # See pytorch issues 118102 + # All sizes at lowering time should be sympy.Symbol, not SymInt! 
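+ # e.g. torch.zeros(2, 3) and torch.zeros((2, 3)) both arrive here with size == (2, 3);
+ # each entry is rejected below if it is still a torch.SymInt and otherwise run through
+ # sympy.expand(), since the lowering IR expects sympy expressions for sizes.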
+ for s in size: + if isinstance(s, torch.SymInt): + raise RuntimeError("assert s must not be of type torch.SymInt") + size = [sympy.expand(s) for s in size] + return _full(fill_value, device, dtype, size) + + return inner + + def _full(fill_value, device, dtype, size): + value = fill_value + if not isinstance(fill_value, (int, float)) and hasattr(value, "value"): + value = value.value + + if isinstance(value, (int, float)): + + def inner_fn(index): + return ops.constant(value, dtype) + + elif isinstance(value, sympy.Basic): + + def inner_fn(index): + return ops.index_expr(value, dtype) + + else: + if len(value.get_size()) != 0: + raise RuntimeError("assert value should be equal to 0") + value_loader = value.make_loader() + + def inner_fn(index): + return value_loader([]) + + node_name = f'full_{next(node_id)}' + new_graph = merge_traced_graphs([size, fill_value], aten.full.default, node_name, \ + device='npu', dtype=dtype, layout=torch.strided, pin_memory=False) + + return Pointwise.create( + device=device, + dtype=dtype, + inner_fn=inner_fn, + ranges=list(size), + traced_graph=new_graph, + node_name=node_name + ) + + @register_lowering(aten.empty_strided) + def empty_strided( + size, stride, *, dtype=None, layout=None, device=None, pin_memory=None + ): + if not isinstance(size, (list, tuple)): + raise RuntimeError(f"assert Expected list or tuple") + if not isinstance(stride, (list, tuple)): + raise RuntimeError(f"assert Expected list or tuple or None") + lowering.assert_nyi(not pin_memory, "pin_memory") + lowering.assert_nyi(layout in (None, torch.strided), f"layout={layout}") + dtype = lowering.decode_dtype(dtype) or torch.get_default_dtype() + device = device or torch.tensor(0.0).device + device = decode_device(device) + pointwise = _full(fill_value=0, device=device, dtype=dtype, size=size) + pointwise.realize() + buffer = pointwise.data.data + # explicitly set ranges to zeros in order to make a NopKernelSchedulerNode + buffer.data = lowering.dataclasses.replace(buffer.data, ranges=[0] * len(size)) + if not isinstance(buffer, ir.ComputedBuffer): + raise RuntimeError(f"assert Expected ir.ComputedBuffer") + size = [sympy.expand(s) for s in size] + stride = ( + [sympy.expand(s) for s in stride] + if stride + else ir.FlexibleLayout.contiguous_strides(size) + ) + buffer.layout = ir.FixedLayout( + device=device, + dtype=dtype, + size=size, + stride=stride, + ) + return pointwise + + @register_lowering([torch.empty, aten.empty]) + def empty( + *size, + names=None, + dtype=None, + layout=None, + device=None, + pin_memory=None, + memory_format=None, + ): + lowering.assert_nyi(names is None, "named tensors") + device = decode_device(device) + if len(size) == 1 and isinstance(size[0], (list, tuple, torch.Size)): + size = tuple(size[0]) + return empty_strided( + size, None, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory + ) + + @register_lowering([torch.full, aten.full]) + def full(size, fill_value, **kwargs): + if kwargs.get("dtype") is None: + raise RuntimeError("assert kwargs dtype should be handled by decomposition") + return tensor_constructor(fill_value)(size, **kwargs) + + register_lowering(aten.clone)(clone) + + @register_lowering(aten.constant_pad_nd, type_promotion_kind=None) + def constant_pad_nd(x, padding, fill_value=0): + if (len(padding) % 2) != 0: + raise RuntimeError("assert len(padding) must % 2=0") + + input_graphs = fetch_graphs([x, padding]) + node_name = f'constand_pad_nd_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.constant_pad_nd, 
node_name, value=fill_value) + + if all(p == 0 for p in padding): + return clone(x) + + sizes = x.get_size() + + bounds = list(reversed(list(zip(padding[::2], padding[1::2])))) + n = len(sizes) - len(bounds) + + # if padding is a complicated expression, hoist it + bounds_precomp: List[Tuple[sympy.Symbol, Any]] = [] + for low, high in bounds: + bounds_precomp.append((V.graph.sizevars.lookup_precomputed_size(low), high)) # type: ignore[arg-type] + + output_size = list(sizes[:n]) + mask_sizes = [] + for (low, high), size in zip(bounds, sizes[n:]): + mask_sizes.append(size) + output_size.append(sympy.expand(size + low + high)) + if len(output_size) != len(sizes): + raise RuntimeError("assert len(output_size) must equal to len(sizes)") + fill_value = dtype_to_type(x.get_dtype())(fill_value) + + def mask(index): + mask = [] + for idx, (low, high), length in zip(index[n:], bounds, mask_sizes): + if low != 0: + mask.append(lowering.range_mask_low(idx, 0)) + if high != 0: + mask.append(lowering.range_mask_high(idx, length)) + mask = functools.reduce(ops.and_, mask) + return ops.masked(mask, lambda: x_loader(index), fill_value) + + def offset_fn(index): + new_index = list(index[:n]) + for idx, (low, high) in zip(index[n:], bounds_precomp): + new_index.append(idx - low) + if len(new_index) != len(index): + raise RuntimeError("assert len(new_index) must equal len(index)") + return mask(new_index) + + x_loader = x.make_loader() + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=offset_fn, + ranges=output_size, + traced_graph=new_graph, + node_name=node_name + ) + + @make_pointwise + @register_to_aten(aten_fn=aten.pow) + def pow_native(a, b): + return ops.pow(a, b) + + @register_lowering(aten.pow, broadcast=True) + def pow(a, b): + if isinstance(b, float) and b == int(b): + return pow(a, int(b)) + elif isinstance(b, float) and b == 0.5: + return sqrt(a) + elif isinstance(b, int) and b == 1: + return clone(a) + + input_graphs = fetch_graphs([a, b]) + node_name = f'pointwise_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.pow, node_name) + + # Type promotion ensures all tensor arguments have the same type + dtype = next(x.get_dtype() for x in (a, b) if isinstance(x, ir.TensorBox)) + is_integer_pow = is_integer_dtype(dtype) + + # Optimize away small fixed powers, or for integers avoid falling back to ATen + embed_exponent = isinstance(b, int) and ( + -32 < b < 32 or (is_integer_pow and b >= 0) + ) + if embed_exponent: + loader = a.make_loader() + + def fn(idx): + return lowering.pow_recursive(loader(idx), b, a.get_dtype()) + + return Pointwise.create( + device=a.get_device(), + dtype=a.get_dtype(), + inner_fn=fn, + ranges=a.get_size(), + node_name=node_name, + traced_graph=new_graph, + ) + + if isinstance(a, Number): + if a == 1: + return full_like(b, 1) + if a == 2 and is_float_dtype(b.get_dtype()): + return exp2(b) + + if is_integer_pow: + # ops.pow doesn't work for integers + if isinstance(a, Number): + return lowering.fallback_pow_scalar(a, b) + elif isinstance(b, Number): + return lowering.fallback_pow_tensor_scalar(a, b) + else: + return lowering.fallback_pow_tensor_tensor(a, b) + + return pow_native(a, b) + + def mutate_to(changed, val, unsafe_alias=False): + if isinstance(changed, TensorBox): + changed_data = changed.data + else: + changed_data = changed + if isinstance(val, TensorBox): + val = val.data + + if not isinstance(val, ir.StorageBox): + # introduce a copy to handle views + input_graphs = fetch_graphs([changed, val]) + node_name = 
f'copy__{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.copy_, node_name) + val = Pointwise.create( + device=changed.get_device(), + dtype=changed.get_dtype(), + inner_fn=val.make_loader(), + ranges=changed.get_size(), + traced_graph=new_graph, + node_name=node_name + ).data + if not isinstance(val, ir.StorageBox): + raise RuntimeError("assert val should be instance of ir.StorageBox") + + if isinstance(changed_data, ir.StorageBox) and not ( + changed_data.is_input_buffer() + # In AOTI, module parameters and buffers are not lifted as graph inputs + or changed_data.is_module_buffer() + or isinstance(changed_data.data, ir.NopKernel) + ): + # Fast path, just swing the data pointer + val.realize() + changed_data.data = val.data + return changed + + ir.MutationLayoutSHOULDREMOVE.realize_into( + val, changed_data, unsafe_alias=unsafe_alias + ) + return changed + + empty_like = register_lowering(aten.empty_like)(lowering.create_tensor_like(empty)) + ones_like = lowering.create_tensor_like(tensor_constructor(1)) + zeros_like = lowering.create_tensor_like(tensor_constructor(0)) + + @register_lowering(aten.full_like, type_promotion_kind=None) + def full_like(x, fill_value, **kwargs): + return lowering.create_tensor_like(tensor_constructor(fill_value))(x, **kwargs) + + @register_lowering(aten.fill_) + def fill_(x, fill_value): + return mutate_to(x, full_like(x, fill_value)) + + @register_lowering(aten.copy_, type_promotion_kind=None) + def copy_(dst, src, non_blocking=False): + if dst is src: + # dst.copy_(dst) can happen from the reinplacing pass + return dst + src = lowering.to_device(src, dst.get_device()) + src = to_dtype(src, dst.get_dtype()) + src = expand(src, dst.get_size()) + return mutate_to(dst, src) + + @make_pointwise + def floordiv(a, b): + return ops.floordiv(a, b) + + @make_pointwise + def truncdiv(a, b): + return ops.truncdiv(a, b) + + @register_lowering(aten.div, broadcast=True) + def div_mode(a, b, rounding_mode=None): + both_integer = lowering.is_integer_type(a) and lowering.is_integer_type(b) + both_boolean = lowering.is_boolean_type(a) and lowering.is_boolean_type(b) + + # floordiv and truncdiv need special handling for integer tensors on Triton, + # see the discussion at openai triton issues 605 + if rounding_mode == "floor": + if both_boolean: + raise RuntimeError("assert floordiv operands cannot be boolean at the same time") + return floordiv(a, b) if both_integer else floor(div(a, b)) + if rounding_mode == "trunc": + if both_boolean: + raise RuntimeError("assert truncdiv operands can not be boolean at the same time") + return truncdiv(a, b) if both_integer else trunc(div(a, b)) + return div(a, b) + + @register_lowering([aten.mul], broadcast=True) + def mul(a, b): + both_bool = lowering.is_boolean_type(a) and lowering.is_boolean_type(b) + if both_bool: + return logical_and(a, b) + else: + fn = ops_wrapper(aten.mul.__name__) + fn = register_fn_to_aten_fn(fn, aten.mul) + return make_pointwise(fn)(a, b) + + @register_lowering([aten.reciprocal], broadcast=True, ) + def reciprocal(a): + return div(1.0, a) + + @register_lowering([prims.div], broadcast=True) + def div_prim(a, b): + is_integral = all(lowering.is_boolean_type(x) or lowering.is_integer_type(x) for x in [a, b]) + + if is_integral: + return truncdiv(a, b) + + def fn(*args): + return ops.truediv(*args) + + fn = register_fn_to_aten_fn(fn, aten.div) + return make_pointwise(fn)(a, b) + + @register_lowering( + [aten.true_divide, aten.div.Tensor], + broadcast=True, + 
type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + ) + def div(a, b): + a, b = lowering.promote_constants( + (a, b), type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT + ) + return div_prim(a, b) + + @register_lowering(aten.rsqrt) + def rsqrt(x): + dtype = x.get_dtype() + if is_integer_dtype(dtype) or is_boolean_dtype(dtype): + x = to_dtype(x, torch.get_default_dtype()) + + def _rsqrt(x): + return ops.rsqrt(x) + + register_fn_to_aten_fn(_rsqrt, aten.rsqrt) + return make_pointwise(_rsqrt)(x) + + @register_lowering(aten.prod) + def prod(x, axis=None, keepdims=False, *, dtype=None): + if ( + is_integer_dtype(x.get_dtype()) or is_boolean_dtype(x.get_dtype()) + ) and dtype is None: + dtype = torch.int64 + + fn = make_reduction("prod", override_return_dtype=dtype) + return fn(x, axis, keepdims, dtype=dtype) + + @register_lowering(aten.any) + def reduce_any(x, dim=None, keepdim=False): + x = to_dtype(x, torch.bool) + return make_reduction("any")(x, axis=dim, keepdims=keepdim) + + @register_lowering(aten.max, type_promotion_kind=None) + def reduce_max(x, dim=None, keepdim=False): + if dim is not None: + return ( + reduce_amax(x, axis=dim, keepdims=keepdim), + reduce_argmax(x, axis=dim, keepdims=keepdim), + ) + + return reduce_amax(x, axis=None, keepdims=keepdim) + + @register_lowering(aten.min, type_promotion_kind=None) + def reduce_min(x, dim=None, keepdim=False): + if dim is not None: + return ( + reduce_amin(x, axis=dim, keepdims=keepdim), + reduce_argmin(x, axis=dim, keepdims=keepdim), + ) + + return reduce_amin(x, axis=None, keepdims=keepdim) + + register_lowering(prims.xor_sum)(make_reduction("xor_sum")) + reduce_amax = register_lowering(aten.amax)(make_reduction("max")) + reduce_amin = register_lowering(aten.amin)(make_reduction("min")) + reduce_argmax = register_lowering(aten.argmax)( + make_reduction("argmax", override_return_dtype=torch.int64) + ) + reduce_argmin = register_lowering(aten.argmin)( + make_reduction("argmin", override_return_dtype=torch.int64) + ) + + add = register_pointwise( + aten.add, allow_alpha=True, override_fn_when_input_bool="logical_or" + ) + + def register_pointwise_numeric(op, name=None, triton_fallback=None): + return register_pointwise( + op, + name=name, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + triton_fallback=triton_fallback, + ) + + def register_pointwise_numeric_ldf64(op): + return register_pointwise( + op, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + use_libdevice_for_f64=True, + ) + + def register_inplace(aten_op, outplace_op): + @register_lowering(aten_op, type_promotion_kind=None) + def fn(*args, **kwargs): + result = outplace_op(*args, **kwargs) + result = to_dtype(result, args[0].get_dtype()) + return mutate_to(args[0], result) + + return fn + + rsqrt = register_pointwise_numeric(aten.rsqrt) + exp = register_pointwise_numeric_ldf64(aten.exp) + exp2 = register_pointwise_numeric(aten.exp2) + expm1 = register_pointwise_numeric(aten.expm1) + relu = register_pointwise(aten.relu) + sigmoid = register_pointwise_numeric_ldf64(aten.sigmoid) + sqrt = register_pointwise_numeric_ldf64(aten.sqrt) + square = register_pointwise(aten.square) + sub = register_pointwise(aten.sub, allow_alpha=True) + register_pointwise_numeric_ldf64(aten.cos) + register_pointwise_numeric_ldf64(aten.sin) + abs_val = register_pointwise(aten.abs) + bitwise_and = register_pointwise(aten.bitwise_and) + bitwise_left_shift = register_pointwise(aten.bitwise_left_shift) + bitwise_not = register_pointwise( + 
aten.bitwise_not, override_fn_when_input_bool="logical_not" + ) + bitwise_or = register_pointwise(aten.bitwise_or) + bitwise_right_shift = register_pointwise(aten.bitwise_right_shift) + bitwise_xor = register_pointwise(aten.bitwise_xor) + register_pointwise_numeric(aten.lgamma) + erf = register_pointwise_numeric(aten.erf) + register_lowering( + aten.special_erf, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT + )(erf) + + register_pointwise_numeric(aten.log1p) + register_pointwise_numeric(aten.tan) + register_pointwise_numeric(aten.tanh) + register_pointwise_numeric_ldf64(aten.log) + logical_and = register_pointwise( + aten.logical_and, + type_promotion_kind=None, + convert_input_to_bool=True, + override_return_dtype=torch.bool, + ) + logical_not = register_pointwise( + aten.logical_not, + type_promotion_kind=None, + convert_input_to_bool=True, + override_return_dtype=torch.bool, + ) + logical_or = register_pointwise( + aten.logical_or, + type_promotion_kind=None, + convert_input_to_bool=True, + override_return_dtype=torch.bool, + ) + logical_xor = register_pointwise( + aten.logical_xor, + type_promotion_kind=None, + convert_input_to_bool=True, + override_return_dtype=torch.bool, + ) + maximum = register_pointwise(aten.maximum) + minimum = register_pointwise(aten.minimum) + clamp_min = register_pointwise(aten.clamp_min, name='maximum') + clamp_max = register_pointwise(aten.clamp_max, name='minimum') + neg = register_pointwise(aten.neg) + abs_val1 = register_pointwise(aten.abs) + register_pointwise(aten.remainder) + sign = register_pointwise(aten.sign, override_fn_when_input_bool="identity") + register_pointwise(aten.ceil) + register_pointwise(aten.signbit, override_return_dtype=torch.bool) + + register_lowering(aten._neg_view)(neg) + + register_pointwise(aten.le, override_return_dtype=torch.bool) + register_pointwise(aten.lt, override_return_dtype=torch.bool) + register_pointwise(aten.ge, override_return_dtype=torch.bool) + gt = register_pointwise(aten.gt, override_return_dtype=torch.bool) + register_pointwise(aten.eq, override_return_dtype=torch.bool) + register_pointwise(aten.ne, override_return_dtype=torch.bool) + + register_pointwise_numeric(aten.cosh) + register_pointwise_numeric(aten.sinh) + register_pointwise_numeric(aten.acos) + register_pointwise_numeric(aten.acosh) + register_pointwise_numeric(aten.asin) + register_pointwise_numeric(aten.asinh) + register_pointwise_numeric(aten.atan2) + register_pointwise_numeric(aten.atan) + register_pointwise_numeric(aten.atanh) + register_pointwise_numeric(aten.copysign) + register_pointwise_numeric(aten.erfc) + register_pointwise_numeric(aten.erfinv) + register_pointwise_numeric(aten.hypot) + register_pointwise_numeric(aten.log10) + register_pointwise_numeric(aten.log2) + register_pointwise_numeric(aten.nextafter) + + register_inplace(aten.add_, add) + register_inplace(aten.bitwise_and_, bitwise_and) + register_inplace(aten.bitwise_left_shift_, bitwise_left_shift) + register_inplace(aten.bitwise_not_, bitwise_not) + register_inplace(aten.bitwise_or_, bitwise_or) + register_inplace(aten.bitwise_right_shift_, bitwise_right_shift) + register_inplace(aten.bitwise_xor_, bitwise_xor) + register_inplace(aten.mul_, mul) + register_inplace(aten.div_.Tensor, div) + register_inplace(aten.div_.Tensor_mode, div_mode) + register_inplace(aten.logical_and_, logical_and) + register_inplace(aten.logical_not_, logical_not) + register_inplace(aten.logical_or_, logical_or) + register_inplace(aten.logical_xor_, logical_xor) + 
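+ # Each register_inplace(aten_op, outplace_op) lowering defined above computes the
+ # out-of-place result, casts it back to the dtype of the first argument and writes it
+ # into that argument via mutate_to. Roughly, for the registrations below:
+ #     x.sub_(y)  ->  mutate_to(x, to_dtype(sub(x, y), x.get_dtype()))
+ #     x.relu_()  ->  mutate_to(x, to_dtype(relu(x), x.get_dtype()))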
register_inplace(aten.sub_, sub) + register_inplace(aten.relu_, relu) + register_inplace(aten.sigmoid_, sigmoid) + + register_lowering(aten.__and__)(bitwise_and) + register_lowering(aten.__lshift__)(bitwise_left_shift) + register_lowering(aten.__or__)(bitwise_or) + register_lowering(aten.__rshift__)(bitwise_right_shift) + register_lowering(aten.__xor__)(bitwise_xor) + + register_inplace(aten.__iand__, aten.__and__) + register_inplace(aten.__ilshift__, aten.__lshift__) + register_inplace(aten.__ior__, aten.__or__) + register_inplace(aten.__irshift__, aten.__rshift__) + register_inplace(aten.__ixor__, aten.__xor__) + + ########################################################################## + + @register_lowering(aten.mean) + def mean(x, axis=None, keepdim=False, *, dtype=None): + if dtype is not None: + x = to_dtype(x, dtype) + size = x.get_size() + axis = lowering._validate_reduction_axis(x, axis) + # compute in higher-precision until end of mean lowering + output_dtype = x.get_dtype() + if output_dtype in (torch.float16, torch.bfloat16): + x = to_dtype(x, torch.float) + sum_result = sum_(x, axis, keepdim) + denom = sympy_product(size[i] for i in axis) + denom = ir.IndexingConstant(index=denom, dtype=x.get_dtype(), device=x.get_device()) + denom = ExpandView.create(denom, list(sum_result.get_size())) + return to_dtype(div(sum_result, denom), output_dtype) + + @register_lowering(aten.cumsum) + def cumsum(x, axis=None, dtype=None): + if ( + is_integer_dtype(x.get_dtype()) or is_boolean_dtype(x.get_dtype()) + ) and dtype is None: + # torch.int64->torch.int32 + dtype = torch.int32 + if len(x.get_size()) == 0: + if axis not in [0, -1]: + raise ValueError("axis must be 0 or -1") + dtype = dtype or x.get_dtype() + return to_dtype(x, dtype, copy=True) + return lowering.fallback_cumsum(x, dim=axis, dtype=dtype) + + @register_lowering(npu.npu_dtype_cast, type_promotion_kind=None) + def _convert_npu_type(x: TensorBox, dtype: torch.dtype): + return to_dtype(x, dtype, copy=True) + + def var_mean_sum_(x, axis, correction, keepdim, return_mean): + if correction is None: + correction = 1 + + size = x.get_size() + axis = lowering._validate_reduction_axis(x, axis) + x_mean = mean(x, axis, keepdim=True) + if return_mean: + x_mean.realize() + + diffs = square(sub(x, x_mean)) + sum_result = sum_(diffs, axis, keepdim) + denom = sympy_product(size[i] for i in axis) + if correction: + denom = sympy.Max(denom - correction, 0) + denom = ir.IndexingConstant(index=denom, dtype=x.get_dtype(), device=x.get_device()) + denom = ExpandView.create(denom, list(sum_result.get_size())) + x_var = div(sum_result, denom) + if not return_mean: + return (x_var,) + + x_mean = x_mean if keepdim else squeeze(x_mean, axis) + return x_var, x_mean + + def var_mean_helper_(x, *, axis, correction, keepdim, return_mean): + out_dtype = x.get_dtype() + compute_dtype = get_computation_dtype(out_dtype) + x = to_dtype(x, compute_dtype, copy=False) + kwargs = dict( + x=x, + axis=axis, + correction=correction, + keepdim=keepdim, + return_mean=return_mean, + ) + output = ( + var_mean_sum_(**kwargs) + ) + output = tuple(to_dtype(x, out_dtype, copy=False) for x in output) + return output[0] if not return_mean else output + + @register_lowering(aten.var_mean) + def var_mean(x, axis=None, *, correction=None, keepdim=False): + return var_mean_helper_( + x, axis=axis, correction=correction, keepdim=keepdim, return_mean=True + ) + + @register_lowering([aten.var, prims.var]) + def var_(x, axis=None, *, correction=None, keepdim=False): + return 
var_mean_helper_( + x, axis=axis, correction=correction, keepdim=keepdim, return_mean=False + ) + + @register_lowering(aten.embedding, type_promotion_kind=None) + def embedding(weight, indices, padding_idx=-1, scale_grad_by_freq=False, sparse=False): + return lowering.fallback_handler(aten.embedding.default)(weight, indices, padding_idx=-1, + scale_grad_by_freq=False, + sparse=False) + + @register_lowering(aten.cat) + def cat(inputs, dim=0): + return lowering.fallback_handler(aten.cat.default)(inputs, dim) + + lowering.make_fallback(aten._log_softmax) + lowering.make_fallback(aten.gather) + lowering.make_fallback(aten.nll_loss_forward) diff --git a/torch_npu/_inductor/lowering_op_list.py b/torch_npu/_inductor/lowering_op_list.py new file mode 100644 index 0000000000..0e8bb3a9a5 --- /dev/null +++ b/torch_npu/_inductor/lowering_op_list.py @@ -0,0 +1,107 @@ +import torch +from torch_npu import npu_dtype_cast + +aten = torch.ops.aten +tr_c10d = torch.ops.tr_c10d +prims = torch.ops.prims + +GENERATE_LIST = [ + prims.iota, + aten.full, + aten.mul, + aten.add, + aten.sub, + aten.div, + aten.exp, + aten.maximum, + aten.sum, + aten.select, + aten.unsqueeze, + aten.repeat, + aten.clone, + aten.reshape, + aten.where, + aten.lt, + aten.minimum, + aten.gt, + aten.le, + aten.ceil, + aten.floor, + aten.rsqrt, + aten.abs, + aten.log, + aten.bitwise_xor, + aten.amax, + # backward + prims.convert_element_type, + aten.min, + aten.max, + aten.erf, + aten.argmax, + aten.argmin, + aten.clamp_min, + aten.slice, + aten.neg, + aten.cat, + aten.arange, + aten.expand, + aten.eq, + aten.where, + aten.scalar_tensor, + aten.ge, + aten.permute, + aten.sqrt, + aten.relu, + aten.clamp, + aten.clamp_max, + aten.mean, + npu_dtype_cast, + aten.select_scatter, + aten.slice_scatter, + prims.broadcast_in_dim, + prims.maximum, + aten.ne, + aten.sigmoid, + aten.sign, + aten.logical_and, + aten.logical_or, + aten.logical_not, + aten.pow, + aten.gelu, + aten.tanh, + aten.isnan, + aten.bitwise_and, + aten.squeeze, + aten.copy, + aten.reciprocal +] + +GENERATE_LIST2 = [ + "foreach" +] + +FALLBACK_LIST = [] + +# Delete these op in lowering list and then update lowering list with new lowering, +# otherwise, it will not use npu overload lowering. +LOWERING_OVERLOAD_OP = [ + aten.cumsum, + aten.mean, + aten.max, + aten.min, + aten.amin, + aten.amax, + aten.argmax, + aten.argmin, + aten.sum, + + aten.var_mean, + aten.var, + + aten.embedding, + aten.split, + aten.split_with_sizes, + aten.nll_loss_forward, + aten.gather, + aten.cat, +] diff --git a/torch_npu/_inductor/npu_choices.py b/torch_npu/_inductor/npu_choices.py new file mode 100644 index 0000000000..438399e4b6 --- /dev/null +++ b/torch_npu/_inductor/npu_choices.py @@ -0,0 +1,33 @@ +import typing +from typing import Any, Dict, List, Type, TYPE_CHECKING +import sympy +from torch._inductor import config +from torch._inductor.codegen.simd_kernel_features import SIMDKernelFeatures +from torch._inductor.codegen.triton import TritonKernel +from torch._inductor.runtime.hints import ReductionHint +from torch._inductor.virtualized import V + + +@staticmethod +def should_use_persistent_reduction( + features: SIMDKernelFeatures, cooperative_reduction: bool +) -> bool: + """ + Heuristic to decide if a persistent reduction should be used. 
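+ A reduction is made persistent only when its reduction numel is statically known to be
+ at most the per-hint threshold below (1024 for INNER and DEFAULT hints, 64 otherwise);
+ the threshold is scaled up for cooperative reductions, whose RSPLIT leaves each block
+ with fewer elements, and again when multi_kernel is enabled.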
+ """ + if not config.triton.persistent_reductions: + return False + threshold = { + ReductionHint.INNER: 1024, + ReductionHint.DEFAULT: 1024 + }.get(features.get_reduction_hint(), 64) + if cooperative_reduction: + # The RSPLIT of cooperative reductions means each thread block is operating on fewer elements + try: + threshold *= 32 // min(V.graph.sizevars.size_hint(features.numel), 32) + except ValueError: + pass # unbacked symint + + if config.triton.multi_kernel: + threshold *= 16 + return V.graph.sizevars.statically_known_leq(features.reduction_numel, threshold) # type: ignore[arg-types] diff --git a/torch_npu/_inductor/npu_device.py b/torch_npu/_inductor/npu_device.py new file mode 100644 index 0000000000..ef5bf7b4d5 --- /dev/null +++ b/torch_npu/_inductor/npu_device.py @@ -0,0 +1,208 @@ +import torch +from torch_npu.npu import device_count +from torch_npu.utils._dynamo_device import NpuInterface, current_device, set_device +from torch_npu.utils._inductor import NPUDeviceOpOverrides +from . import config as npu_config + + +## Override original inductor device overrides in torch_npu +class NewNPUDeviceOpOverrides(NPUDeviceOpOverrides): + def import_get_raw_stream_as(self, name): + return f"from torch_npu._inductor import get_current_raw_stream as {name}" + + def set_device(self, device_idx): + return f"torch.npu.set_device({device_idx})" + + def synchronize(self): + return """ + stream = torch.npu.current_stream() + stream.synchronize() + """ + + def device_guard(self, device_idx): + return f"torch.npu.utils.device({device_idx})" + + def cpp_aoti_device_guard(self): + raise NotImplementedError + + def cpp_aoti_stream_guard(self): + return "AOTICudaStreamGuard" + + def kernel_driver(self): + source_code = """ + namespace { + + struct Grid { + Grid(uint32_t x, uint32_t y, uint32_t z) + : grid_x(x), grid_y(y), grid_z(z) {} + uint32_t grid_x; + uint32_t grid_y; + uint32_t grid_z; + + bool is_non_zero() { + return grid_x > 0 && grid_y > 0 && grid_z > 0; + } + }; + + } // anonymous namespace + + extern "C" { + typedef int (* callback)(unsigned int type, void* data, unsigned int len); + extern int MsprofReportApi(unsigned int agingFlag, const MsprofApi *api); + extern unsigned long int MsprofSysCycleTime(); + extern int MsprofRegisterCallback(unsigned int moduleId, callback handle); + static unsigned int __MsprofFlagL0 = 0; + static unsigned int __MsprofFlagL1 = 0; + + int ProfCtrlHandle(unsigned int CtrlType, void* CtrlData, unsigned int DataLen) { + if ((CtrlData == nullptr) || (DataLen == 0U)) { + return 1; + } + + if (CtrlType == 1) { + MsprofCommandHandle* handle = (MsprofCommandHandle *)(CtrlData); + if (handle->type >= 6) // 6 is not used here + return 1; + if (handle->type == 1) { // init - 0 , start - 1 + __MsprofFlagL0 = ((0x00000800ULL & handle->profSwitch) == 0x00000800ULL) ? 1 : 0; + __MsprofFlagL1 = ((0x00000002ULL & handle->profSwitch) == 0x00000002ULL) ? 
1 : 0; + } + } + return 0; + } + } + """ + + load_code = """ + static std::unordered_map registered_names; + static std::unordered_map> func_stubs; + + static inline void * loadKernel( + std::string filePath, + const std::string &&nameFuncMode, + uint32_t sharedMemBytes, + const std::optional &cubinDir = std::nullopt) { + if (cubinDir) { + std::filesystem::path p1{*cubinDir}; + std::filesystem::path p2{filePath}; + filePath = (p1 / p2.filename()).string(); + } + std::string funcName; + std::string kernel_mode_str; + size_t spacePos = nameFuncMode.find(' '); + if (spacePos != std::string::npos) { + kernel_mode_str = nameFuncMode.substr(spacePos + 1); + funcName = nameFuncMode.substr(0, spacePos); + } else { + throw std::runtime_error(std::string("Parse kernel name failed, expect " + "'kernel_name kernel_mode', bug got: ") + nameFuncMode); + } + + std::ifstream file(std::string(filePath), std::ios::binary | std::ios::ate); + if (!file.is_open()) { + throw std::runtime_error(std::string("open npubin failed")); + } + + std::streamsize data_size = file.tellg(); + + file.seekg(0, std::ios::beg); + char* buffer = new char[data_size]; + if (!file.read(buffer, data_size)) { + throw std::runtime_error(std::string("read npubin failed")); + } + + rtError_t rtRet; + + rtDevBinary_t devbin; + devbin.data = buffer; + devbin.length = data_size; + const std::string kernel_mode{kernel_mode_str}; + if (kernel_mode == "aiv") { + devbin.magic = RT_DEV_BINARY_MAGIC_ELF_AIVEC; + } else { + devbin.magic = RT_DEV_BINARY_MAGIC_ELF; + } + devbin.version = 0; + + int device = 0; + rtRet = rtSetDevice(device); + if (rtRet != RT_ERROR_NONE) { + throw std::runtime_error(std::string("rtSetDevice failed, 0x") + std::to_string(rtRet)); + } + + void *devbinHandle = NULL; + rtRet = rtDevBinaryRegister(&devbin, &devbinHandle); + if (rtRet != RT_ERROR_NONE) { + throw std::runtime_error(std::string("rtDevBinaryRegister failed, 0x") + std::to_string(rtRet)); + } + + const char* name = funcName.c_str(); + + std::string stubName(name); + stubName += "_" + std::to_string(registered_names[name]); + registered_names[name]++; + auto registered = func_stubs.emplace(stubName, std::make_unique(0)); + void *func_stub_handle = registered.first->second.get(); + rtRet = rtFunctionRegister(devbinHandle, func_stub_handle, stubName.c_str(), + (void *)name, 0); + if (rtRet != RT_ERROR_NONE) { + throw std::runtime_error(std::string("rtFunctionRegister failed, stubName = ") + stubName + + std::string(" , 0x") + std::to_string(rtRet)); + } + + return func_stub_handle; + } + """ + + # Could not use OpCommand when debug_kernel, because we want to + # use torch::save, which will cause dead lock in child thread. 
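+ # Both launchKernel variants below share the same signature
+ # (std::function launch_call, std::string&& kernel_name), so the generated call sites
+ # are identical either way: with npu_config.aot_inductor.debug_kernel set, the callable
+ # runs directly on the calling thread; otherwise it is installed as the custom handler
+ # of an at_npu::native::OpCommand and executed by Run().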
+ launch_code = """ + static inline void launchKernel( + std::function launch_call, + std::string&& kernel_name) { + launch_call(); + } + """ if npu_config.aot_inductor.debug_kernel else """ + static inline void launchKernel( + std::function launch_call, + std::string&& kernel_name) { + at_npu::native::OpCommand cmd; + cmd.Name(kernel_name.c_str()) + .SetCustomHandler(launch_call) + .Run(); + } + """ + extra_code = "" + source_codes = source_code + load_code + launch_code + extra_code + return source_codes + + def abi_compatible_header(self): + return """ + #include + #include + #include + #include + #include + #include + #include + #include + + #include + #include + #include + #include + #include + #include "experiment/runtime/runtime/rt.h" + """ + + def cpp_stream_type(self): + return "aclrtStream" + + def aoti_get_stream(self): + return "aoti_torch_get_current_cuda_stream" + + def cpp_kernel_type(self): + return "void *" + + def cpp_device_ptr(self): + return "void*" diff --git a/torch_npu/_inductor/npu_fusion_attention_graph.py b/torch_npu/_inductor/npu_fusion_attention_graph.py new file mode 100644 index 0000000000..4242ba5b8e --- /dev/null +++ b/torch_npu/_inductor/npu_fusion_attention_graph.py @@ -0,0 +1,253 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. +import functools +import sympy +import torch +import torch.nn.functional as F +from torch.autograd import Function +from torch.library import Library, impl +import torch_npu + +npu_def = Library("npu_graph", "DEF") +npu_lib = Library("npu_graph", "IMPL", "PrivateUse1") +meta_lib = Library("npu_graph", "IMPL", "Meta") + +npu_def.define( + "npu_fa(Tensor query, Tensor key, Tensor value, int head_num, str input_layout, Tensor? pse=None, Tensor? padding_mask=None, Tensor? atten_mask=None, float scale=1., float keep_prob=1., int pre_tockens=2147483647, int next_tockens=2147483647, int inner_precise=0, int[]? prefix=None, int[]? actual_seq_qlen=None, int[]? actual_seq_kvlen=None, int sparse_mode=0, bool gen_mask_parallel=True, bool sync=False) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)") +npu_def.define( + "npu_fa_backward(Tensor query, Tensor key, Tensor value, Tensor dy, int head_num, str input_layout, *, Tensor? pse=None, Tensor? padding_mask=None, Tensor? atten_mask=None, Tensor? softmax_max=None, Tensor? softmax_sum=None, Tensor? softmax_in=None, Tensor? attention_in=None, float scale_value=1., float keep_prob=1., int pre_tockens=2147483647, int next_tockens=2147483647, int inner_precise=0, Tensor? seed=None, Tensor? offset=None, Tensor? numels=None, int[]? prefix=None, int[]? actual_seq_qlen=None, int[]? 
actual_seq_kvlen=None, int sparse_mode=0, bool gen_mask_parallel=True, bool sync=False) -> (Tensor, Tensor, Tensor, Tensor)") + + +@impl(npu_lib, "npu_fa") +def npu_fa(*args, **kwargs): + if len(args) > 8: + args = list(args) + # for scale + try: + args[8] = 1.0 / args[8] + except IndexError: + args[8] = 1.0 / (args[8] + 1e-6) + r1, r2, r3, r4, seed, offset, numel = torch_npu.npu_fusion_attention(*args, **kwargs) + r2.requires_grad = False + r3.requires_grad = False + r4.requires_grad = False + return r1, r2, r3, r4, torch.tensor([seed], requires_grad=False), torch.tensor([offset], + requires_grad=False), torch.tensor( + [numel], requires_grad=False) + + +@impl(npu_lib, "npu_fa_backward") +def npu_fa_backward(*args, **kwargs): + if 'scale_value' in kwargs: + kwargs['scale_value'] = 1.0 / kwargs['scale_value'] + return torch_npu.npu_fusion_attention_grad(*args, **kwargs) + + +@impl(meta_lib, "npu_fa") +def npu_fa(query, key, value, head_num, input_layout, pse=None, padding_mask=None, + atten_mask=None, scale=1.0, keep_prob=1.0, pre_tockens=2147483647, next_tockens=2147483647, + inner_precise=0, prefix=None, actual_seq_qlen=None, actual_seq_kvlen=None, sparse_mode=0, + gen_mask_parallel=True, sync=False): + B = query.size(0) + N = head_num + S1 = query.size(2) + S2 = key.size(2) + + if input_layout == "BSH": + B = query.size(0) + S1 = query.size(1) + S2 = key.size(1) + + if input_layout == "SBH": + B = query.size(1) + S1 = query.size(0) + S2 = key.size(0) + + attention_score = torch.empty_like(query, dtype=query.dtype, device='meta').contiguous() + softmax_max = torch.empty([B, head_num, S1, 8], dtype=torch.float32, device='meta') + softmax_sum = torch.empty([B, head_num, S1, 8], dtype=torch.float32, device='meta') + softmax_out = torch.empty([0], dtype=query.dtype, device='meta') + return (torch.empty_like(attention_score), + torch.empty_like(softmax_max), + torch.empty_like(softmax_sum), + torch.empty_like(softmax_out), + torch.tensor([0], device='meta', requires_grad=False), + torch.tensor([0], device='meta', requires_grad=False), + torch.tensor([0], device='meta', requires_grad=False)) + + +@impl(meta_lib, "npu_fa_backward") +def npu_fa_backward(query, key, value, dy, head_num, input_layout, *, pse=None, padding_mask=None, atten_mask=None, + softmax_max=None, softmax_sum=None, softmax_in=None, attention_in=None, scale_value=1.0, + keep_prob=1.0, pre_tockens=2147483647, next_tockens=2147483647, inner_precise=0, seed=0, offset=0, + numels=0, prefix=None, actual_seq_qlen=None, actual_seq_kvlen=None, sparse_mode=0, + gen_mask_parallel=True, sync=False): + dq = torch.empty_like(query, dtype=query.dtype, device='meta').contiguous() + dk = torch.empty_like(key, dtype=query.dtype, device='meta').contiguous() + dv = torch.empty_like(value, dtype=query.dtype, device='meta').contiguous() + dpse = torch.empty([0], dtype=query.dtype, device='meta').contiguous() + return (torch.empty_like(dq), torch.empty_like(dk), torch.empty_like(dv), torch.empty_like(dpse) if pse else None) + + +class NpuGraphAttentionFunction(Function): + @staticmethod + def forward(ctx, query, key, value, head_num, input_layout, pse=None, padding_mask=None, atten_mask=None, scale=1.0, + keep_prob=1.0, pre_tockens=2147483647, next_tockens=2147483647, inner_precise=0, prefix=None, + actual_seq_qlen=None, actual_seq_kvlen=None, sparse_mode=0, gen_mask_parallel=True, sync=False): + # 前向传播逻辑 + # 这里假设有一个实现前向传播的函数 `npu_fusion_attention_forward` + result0, result1, result2, result3, result4, result5, result6 = torch.ops.npu_graph.npu_fa( 
+ query, key, value, head_num, input_layout, pse=pse, padding_mask=padding_mask, atten_mask=atten_mask, + scale=scale, keep_prob=keep_prob, pre_tockens=pre_tockens, next_tockens=next_tockens, + inner_precise=inner_precise, prefix=prefix, actual_seq_qlen=actual_seq_qlen, + actual_seq_kvlen=actual_seq_kvlen, sparse_mode=sparse_mode, gen_mask_parallel=gen_mask_parallel, sync=sync + ) + # 保存中间结果,以便在反向传播中使用 + ctx.save_for_backward(query, key, value, pse, padding_mask, atten_mask, result1, result2, result3, result0, + result4, result5, result6) + ctx.head_num = head_num + ctx.input_layout = input_layout + ctx.scale = scale + ctx.keep_prob = keep_prob + ctx.pre_tockens = pre_tockens + ctx.next_tockens = next_tockens + ctx.inner_precise = inner_precise + ctx.prefix = prefix + ctx.actual_seq_qlen = actual_seq_qlen + ctx.actual_seq_kvlen = actual_seq_kvlen + ctx.sparse_mode = sparse_mode + ctx.gen_mask_parallel = gen_mask_parallel + ctx.sync = sync + + return result0, result1, result2, result3, result4, result5, result6 + + @staticmethod + def backward(ctx, grad_result0, grad_result1, grad_result2, grad_result3, grad_result4, grad_result5, grad_result6): + # 获取保存的中间结果 + query, key, value, pse, padding_mask, atten_mask, result1, result2, result3, result0, result4, result5, result6 = ctx.saved_tensors + # 反向传播逻辑 + # 这里假设有一个实现反向传播的函数 `npu_fusion_attention_backward` + grad_query, grad_key, grad_value, grad_pse = torch.ops.npu_graph.npu_fa_backward( + query, key, value, grad_result0, ctx.head_num, ctx.input_layout, pse=pse, padding_mask=padding_mask, + atten_mask=atten_mask, softmax_max=result1, softmax_sum=result2, softmax_in=result3, attention_in=result0, + scale_value=ctx.scale, keep_prob=ctx.keep_prob, pre_tockens=ctx.pre_tockens, next_tockens=ctx.next_tockens, + inner_precise=ctx.inner_precise, seed=result4, offset=result5, numels=result6, prefix=ctx.prefix, + actual_seq_qlen=ctx.actual_seq_qlen, actual_seq_kvlen=ctx.actual_seq_kvlen, sparse_mode=ctx.sparse_mode, + gen_mask_parallel=ctx.gen_mask_parallel, sync=ctx.sync + ) + return ( + grad_query, grad_key, grad_value, None, None, grad_pse, None, None, None, None, None, None, None, None, None, + None, None, None, None, None, None, None, None, None, None, None) + + +def npu_fusion_attention_graph(query, key, value, head_num, input_layout, pse=None, padding_mask=None, + atten_mask=None, scale=1.0, keep_prob=1.0, pre_tockens=2147483647, + next_tockens=2147483647, + inner_precise=0, prefix=None, actual_seq_qlen=None, actual_seq_kvlen=None, sparse_mode=0, + gen_mask_parallel=True, sync=False): + return NpuGraphAttentionFunction.apply(query, key, value, head_num, input_layout, pse, padding_mask, + atten_mask, scale, keep_prob, pre_tockens, next_tockens, + inner_precise, prefix, actual_seq_qlen, actual_seq_kvlen, sparse_mode, + gen_mask_parallel, sync) + + +torch_npu.npu_fusion_attention_graph = npu_fusion_attention_graph + + +def register_fa_pass(): + TOKEN_MAX = 2147483647 + from torch._inductor.pattern_matcher import register_replacement, fwd_only, joint_fwd_bwd + from torch._inductor.fx_passes.joint_graph import patterns + from torch._dynamo.utils import counters + from torch._inductor.fx_passes.fuse_attention import partialize_and_update_signature + + def _npu_fusion_attention_graph_pattern_1(query, key, value, inv_scale_factor, dropout_p): + q = query.permute(0, 2, 1, 3) + k = key.permute(0, 2, 1, 3) + v = value.permute(0, 2, 1, 3) + return torch.nn.functional.dropout( + torch.matmul(q, k.transpose(-2, -1)).div(inv_scale_factor).softmax(dim=-1), + 
p=dropout_p, + ).matmul(v) + + def _npu_fusion_attention_graph_replacement_1(query, key, value, inv_scale_factor, dropout_p): + counters["inductor"]["fuse_attention"] += 1 + head_num = query.size(2) + input_layout = "BNSD" + return torch_npu.npu_fusion_attention_graph( + query.transpose(1, 2), + key.transpose(1, 2), + value.transpose(1, 2), + head_num, + input_layout, + None, + atten_mask=None, + scale=inv_scale_factor, + keep_prob=1.0 - dropout_p, + )[0] + + def _get_sfdp_patterns(): + device = 'npu' + g_inp = functools.partial( + torch.empty, (2, 4, 8, 16), device=device, requires_grad=True + ) + c_inp = functools.partial(torch.tensor, 2.0, device=device) + d = {"dropout_p": 0.113377} + candidates = [] + for dtype in [torch.float]: + g = functools.partial(g_inp, dtype=dtype) + c = functools.partial(c_inp, dtype=dtype) + candidates.append(( + _npu_fusion_attention_graph_pattern_1, + _npu_fusion_attention_graph_replacement_1, + [g(), g(), g(), c()], + d, + )) + + for pattern, replacement, args, workaround in candidates: + # gets serialized to a python file and does not require tracing at runtime. + if not isinstance(workaround, dict): + raise ValueError("workaround not dict") + name = pattern.__name__ + + if dtype != torch.float: + name += "_half" + + if args[0].size(0) == 1: + name += "_bs1" + + training_name = name + "_training" + yield training_name, { + "search_fn": pattern, + "replace_fn": replacement, + "example_inputs": args, + "trace_fn": joint_fwd_bwd, + "pass_dicts": patterns, + "scalar_workaround": workaround, + } + + if workaround: + if not (len(workaround) == 1 and "dropout_p" in workaround): + raise ValueError("not (len(workaround) == 1 and dropout_p in workaround)") + # functools.partial insufficient because we look at signature downstream + pattern = partialize_and_update_signature(pattern, dropout_p=0.0) + replacement = partialize_and_update_signature( + replacement, dropout_p=0.0 + ) + workaround = {} + + inference_name = name + "_inference" + yield inference_name, { + "search_fn": pattern, + "replace_fn": replacement, + "example_inputs": args, + "trace_fn": fwd_only, + "pass_dicts": patterns, + "scalar_workaround": workaround, + } + + for _, register_replacement_kwargs in _get_sfdp_patterns(): + register_replacement( + **register_replacement_kwargs, + ) + diff --git a/torch_npu/_inductor/npu_triton_helpers.py b/torch_npu/_inductor/npu_triton_helpers.py new file mode 100644 index 0000000000..5140a2911a --- /dev/null +++ b/torch_npu/_inductor/npu_triton_helpers.py @@ -0,0 +1,22 @@ +import triton +import triton.language as tl + +import triton.language.extra.ascend.libdevice as libdevice +from torch._inductor.runtime import triton_helpers + +libdevice = tl.extra.ascend.libdevice +math = tl.math + + +@triton.jit +def maximum(a, b): + return tl.maximum(a, b) + + +@triton.jit +def minimum(a, b): + return tl.minimum(a, b) + + +triton_helpers.maximum = maximum +triton_helpers.minimum = minimum diff --git a/torch_npu/_inductor/npu_triton_heuristics.py b/torch_npu/_inductor/npu_triton_heuristics.py new file mode 100644 index 0000000000..64f25854c8 --- /dev/null +++ b/torch_npu/_inductor/npu_triton_heuristics.py @@ -0,0 +1,1193 @@ +# This file is based on triton_heuristics with heuristics designed for NPU +import copy +import functools +import hashlib +import importlib +import json +import logging +import dataclasses +import os +import re +import sys +import time +from itertools import count +from typing import Any, Callable, Literal, Optional, TYPE_CHECKING, Union, List +import 
torch +from torch._logging import warning_once +import triton +from torch._dynamo.utils import dynamo_timed +from torch._inductor import config +from torch._inductor.compile_fx import clone_preserve_strides +from torch._inductor.runtime.autotune_cache import AutotuneCache +from torch._inductor.runtime.benchmarking import benchmarker +from torch._inductor.runtime.runtime_utils import ( + create_bandwidth_info_str, + get_num_bytes, + +) +from torch._inductor.utils import triton_version_uses_attrs_dict +from torch.utils._ordered_set import OrderedSet +from torch._inductor.runtime.triton_heuristics import ( + CachingAutotuner, + HeuristicType, + unique_configs, + hash_configs, + Config, + ASTSource, + _find_names, + get_first_attr, + collected_calls, + _dump_launch_params, + builtins, + NoTritonConfigsError, + TritonCompileResult, + GridExpr, + config_to_dict +) +from torch._inductor.runtime.runtime_utils import triton_hash_to_path_key +from triton.compiler import CompiledKernel +from torch._inductor.triton_bundler import TritonBundler + +try: + from triton.backends.compiler import GPUTarget + from triton.runtime.autotuner import OutOfResources + import torch.autograd.profiler as autograd_profiler +except ImportError: + GPUTarget = None + OutOfResources = None + autograd_profiler = None + +from torch_npu.utils._error_code import ErrCode, pta_error + +from .codegen.split_tiling import SplitTiling +from .utils import get_current_raw_stream +from .codegen.tile_generator import TileGenerator +from .codegen.triton_utils import get_aligned_numel +from .config import aggresive_autotune +from .config import log +from . import config as npu_config + +kernel_idx = count() + + +@dataclasses.dataclass +class GridNpu(GridExpr): + numels: List[str] = None + + def generate(self, meta: dict[str, int]) -> None: + numel_args = [] + split_axis = meta.get("split_axis", None) + split_blocks = meta.get("split_blocks", None) + if split_axis is None or split_blocks is None: + raise RuntimeError(f"Could not get split_axis or split_blocks from meta {meta}.") + + def grid_fn(i): + if i >= len(split_axis): + return "1" + axis = split_axis[i] + block = split_blocks[i] + return f"({self.numels[axis]} + {block} - 1) // {block}" + self.x_grid = grid_fn(0) + self.y_grid = grid_fn(1) + self.z_grid = grid_fn(2) + + +class GridExprNpu(GridExpr): + @staticmethod + def from_meta_and_set_numel( + inductor_meta: dict[str, Any], + cfg: Union[Config, dict[str, int]], + numels: List[str], + mode: Literal["python", "cpp"] = "python", + ) -> GridExpr: + grid_cls = globals()[inductor_meta["grid_type"]] + if not issubclass(grid_cls, GridNpu): + raise AssertionError(f"grid_type in inductor_meta must be subclass of GridNpu" + f"but got {inductor_meta['grid_type']}") + grid = grid_cls(inductor_meta=inductor_meta, mode=mode, numels=numels) + if isinstance(cfg, Config): + cfg = config_to_dict(cfg) + grid.generate(cfg) + return grid + + +class TritonCompileResultNpu(TritonCompileResult): + def make_launcher(self): + cfg = self.config + compile_meta = self.compile_meta + binary = self.kernel + fn = binary.src.fn + binary._init_handles() + + known_constants = OrderedSet( + arg for i, arg in enumerate(fn.arg_names) if i in fn.constexprs + ) + none_args = OrderedSet( + k + for k, v in compile_meta["constants"].items() + if v is None and k not in known_constants + ) + none_args = none_args.difference(OrderedSet(compile_meta["signature"].keys())) + + if triton_version_uses_attrs_dict(): + call_args = fn.arg_names + def_args = fn.arg_names + if ( + 
"num_warps" in compile_meta["constants"] + or "num_stages" in compile_meta["constants"] + ): + # num_warps/num_stages are special implicit args that are not in the signature + # see test_triton_kernel_special_params + def_args = [ + arg for arg in def_args if arg not in ("num_warps", "num_stages") + ] + repl = { + k: str(compile_meta["constants"].get(k)) + for k in ("num_warps", "num_stages") + } + call_args = [repl.get(arg, arg) for arg in call_args] + else: + call_args = [ + arg + for i, arg in enumerate(fn.arg_names) + if i not in fn.constexprs and arg not in none_args + ] + cfg_dict = config_to_dict(cfg) + def_args = [ + name + for name in fn.arg_names + if name not in cfg_dict and name not in none_args + ] + + binary_shared = ( + binary.shared if hasattr(binary, "shared") else binary.metadata.shared + ) + + scope = { + "grid_meta": cfg.kwargs, + "bin": binary, + "launch_enter_hook": binary.__class__.launch_enter_hook, + "launch_exit_hook": binary.__class__.launch_exit_hook, + "metadata": ( + binary.packed_metadata + if hasattr(binary, "packed_metadata") + else binary.metadata + ), + "shared": binary_shared, + "num_warps": ( + binary.num_warps + if hasattr(binary, "num_warps") + else binary.metadata.num_warps + ), + "cta_args": ( + ( + binary.num_ctas, + *get_first_attr(binary, "cluster_dims", "clusterDims"), + ) + if hasattr(binary, "num_ctas") + else ( + (binary.metadata.num_ctas, *binary.metadata.cluster_dims) + if hasattr(binary, "metadata") + else () + ) + ), + "function": get_first_attr(binary, "function", "cu_function"), + "runner": get_first_attr(binary, "run", "c_wrapper"), + } + + if not hasattr(binary, "launch_metadata"): + # launch args before CompiledKernel.launch_metadata is added. + # TODO(jansel): delete this branch in mid-2025 + runner_args = [ + "grid_0", + "grid_1", + "grid_2", + "num_warps", + "*cta_args", + "shared", + "stream", + "function", + "launch_enter_hook", + "launch_exit_hook", + "metadata", + *call_args, + ] + else: + if binary.__class__.launch_enter_hook: + launch_metadata = f"bin.launch_metadata((grid_0, grid_1, grid_2), stream, {', '.join(call_args)})" + else: + launch_metadata = "None" + runner_args = [ + "grid_0", + "grid_1", + "grid_2", + "stream", + "function", + "metadata", + launch_metadata, + "launch_enter_hook", + "launch_exit_hook", + *call_args, + ] + + if "extra_launcher_args" in self.inductor_meta: + def_args = [*def_args, *self.inductor_meta["extra_launcher_args"]] + + numels = [ + arg + for arg in fn.arg_names + if "_numel" in arg + ] + grid = GridExprNpu.from_meta_and_set_numel(self.inductor_meta, cfg, numels) + # grid.prefix is usually empty, grid.x_grid is something like `-(xnumel//-1024)` + lines = [ + f"def launcher({', '.join(def_args)}, stream):", + *[f" {line}" for line in grid.prefix], + f" grid_0 = {grid.x_grid}", + f" grid_1 = {grid.y_grid}", + f" grid_2 = {grid.z_grid}", + f" runner({', '.join(runner_args)})", + ] + exec("\n".join(lines), scope) + + launcher = scope["launcher"] + launcher.config = cfg + launcher.n_regs = getattr(binary, "n_regs", None) + launcher.n_spills = getattr(binary, "n_spills", None) + launcher.shared = binary_shared + launcher.store_cubin = self.inductor_meta.get("store_cubin", False) + # store this global variable to avoid the high overhead of reading it when calling run + if launcher.store_cubin: + launcher.fn = fn + launcher.bin = binary + if triton_version_uses_attrs_dict(): + # arg filtering wasn't done above + cfg_dict = config_to_dict(cfg) + def_args = [x for x in def_args if x not in cfg_dict] 
+ call_args = [ + x + for x in call_args + if compile_meta["signature"].get(x, "constexpr") != "constexpr" + and x not in none_args + ] + launcher.def_args = def_args + launcher.call_args = call_args + return launcher + + +class NPUCachingAutotuner(CachingAutotuner): + def __init__( + self, + fn, + triton_meta, # passed directly to triton + configs, + save_cache_hook, + mutated_arg_names: List[str], # see [Note: clone mutated buffers] + optimize_mem, + heuristic_type, + size_hints=None, + inductor_meta=None, # metadata not relevant to triton + custom_kernel=False, # whether the kernel is inductor-generated or custom + filename: Optional[str] = None, + reset_to_zero_arg_names: Optional[List[str]] = None, + ): + super().__init__(fn, triton_meta, configs, save_cache_hook, mutated_arg_names, optimize_mem, heuristic_type, + size_hints, inductor_meta, custom_kernel, filename, reset_to_zero_arg_names) + + self.exceptions = [] + self.fn_name = None + + @staticmethod + def api_accuracy_checker(expected, actual, kernel_name, dump_path): + from msprobe.core.common.const import CompareConst + from msprobe.pytorch.api_accuracy_checker.compare.compare_utils import BENCHMARK_COMPARE_SUPPORT_LIST + from msprobe.pytorch.api_accuracy_checker.triton_adapter.get_compare_result import get_compare_result + from msprobe.pytorch.api_accuracy_checker.triton_adapter.precision_compare import precision_compare + from msprobe.pytorch.api_accuracy_checker.triton_adapter.common.compare_utils import \ + convert_compare_column_to_row, print_check_details + from msprobe.pytorch.api_accuracy_checker.triton_adapter.precision_standard.triton_standard_register import \ + exist_in_precision_standard + + dtype = actual.dtype + + # only float use precision standard + if exist_in_precision_standard(kernel_name): + if str(dtype) in BENCHMARK_COMPARE_SUPPORT_LIST: + compare_column = precision_compare(kernel_name, expected, actual, dtype) # calc metrics + compare_row = convert_compare_column_to_row(compare_column, kernel_name) + status = get_compare_result(compare_row, kernel_name) # get compare results + if status == CompareConst.ERROR: + log.warning(f'CHECK ACCURACY FAILED! kernel: {kernel_name}, Dump Path: {dump_path}') + print_check_details(compare_column, kernel_name) + actual.copy_(expected) + checked_by_msprobe = True + else: + log.warning(f'The data type {dtype} is not supported for new precision standard. ' + f'Check accuracy by tolerance method.') + checked_by_msprobe = False + else: + log.warning(f'kernel_name {kernel_name} does not in new precision standard. ' + f'Check accuracy by tolerance method.') + checked_by_msprobe = False + return checked_by_msprobe + + def precompile( + self, + warm_cache_only=False, + reload_kernel: Optional[Callable[[], CachingAutotuner]] = None, + ): + if warm_cache_only: + self._precompile_worker() + return + with self.lock: + # Helper function for reloading a kernel generated in a worker + # in the parent class. 
Normally we don't need to reload the kernel + # in the parent process, but in certain cases (coordesc tuning, dynamic_scale_rblock), + # we need to actually run compilation on the parent process + if reload_kernel is not None: + self._reload_kernel = reload_kernel + self._precompile_worker() + self._make_launchers() + + def _precompile_worker(self): + if self.compile_results: + for result in self.compile_results: + TritonBundler.put( + triton_hash_to_path_key(result.kernel.hash), + self.triton_meta.get("device", 0), + ) + return + if self.launchers: + raise AssertionError("Before _precompile_worker, launchers must bt empty") + + if not self.configs: + raise NoTritonConfigsError("No triton configs are available") + + compile_results = [] + exc = None + exc_stack = "" + for c in self.configs: + try: + compile_results.append(self._precompile_config(c)) + except Exception as e: + import traceback + exc_stack = traceback.format_exc() + exc = e + if len(compile_results) == 0: + raise NoTritonConfigsError( + f"No valid triton configs. {type(exc).__name__}: {exc} \nStack trace:{exc_stack}" + ) + self.compile_results = compile_results + self.configs = None + + def _precompile_config(self, cfg: Config) -> TritonCompileResultNpu: + """Ahead of time compile a given autotuner config.""" + compile_meta = copy.deepcopy(self.triton_meta) + cfg_kwargs = cfg.kwargs + for k, v in cfg_kwargs.items(): + if k not in self.fn.arg_names: + continue + compile_meta["constants"][k] = v + + for i in self.fn.constexprs: + arg_name = self.fn.arg_names[i] + if arg_name not in compile_meta["constants"] and ( + arg_name == "num_warps" or arg_name == "num_stages" + ): + compile_meta["constants"][arg_name] = getattr(cfg, arg_name) + compile_meta["num_warps"] = cfg.num_warps + compile_meta["num_stages"] = cfg.num_stages + compile_meta["debug"] = ( + os.getenv("INDUCTOR_ASCEND_DEBUG", 'false').lower() in ('true', '1') + and self.inductor_meta.get("assert_indirect_indexing", True) + and not self.inductor_meta.get("is_hip", False) + ) + + # device type will be "hip" rather than "cuda" here + compile_meta["device_type"] = self.device_props.type + compile_meta["cc"] = self.device_props.cc + + if not ASTSource: + raise RuntimeError("Installed triton version too old, please upgrade") + + compile_args = ( + ASTSource( + self.fn, + compile_meta["signature"], + compile_meta["constants"], + ), + ) + + cc_warp_size = 32 + target = GPUTarget( + compile_meta["device_type"], + compile_meta["cc"], + cc_warp_size, + ) + + options = { + "num_warps": compile_meta["num_warps"], + "num_stages": compile_meta["num_stages"], + "debug": compile_meta["debug"] + } + compile_kwargs = { + "target": target, + "options": options, + } + + try: + binary = triton.compile(*compile_args, **compile_kwargs) + except Exception: + log.debug( + "Triton compilation failed: %s\n%s\nmetadata: %s", + self.inductor_meta.get("kernel_name", "triton_"), + self.fn.src, + compile_meta, + ) + raise + return TritonCompileResultNpu(binary, cfg, compile_meta, self.inductor_meta) + + def _make_launchers(self): + if len(self.launchers) == len(self.compile_results): + return + + from torch._dynamo.device_interface import DeviceGuard + + device_interface = self.get_device_interface() + + # load binary to the correct device + with DeviceGuard(device_interface, self.triton_meta["device"]): + # need to initialize context + device_interface.synchronize(device_interface.current_device()) + launchers = [] + exc = None + exc_stack = "" + for result in self.compile_results: + try: + 
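+ # make_launcher can fail for an individual compile result; record the exception
+ # and keep going, since an error is only raised below when no launcher could be
+ # built at all.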
launchers.append(result.make_launcher()) + except Exception as e: + import traceback + exc_stack = traceback.format_exc() + exc = e + + if len(launchers) == 0: + raise RuntimeError(f"No valid triton configs. {type(exc).__name__}: {exc}\n" + f"Stack trace: {exc_stack}") + self.launchers = launchers + + def save_gpu_kernel(self, input_stream, input_launcher): + self.save_npu_kernel(input_stream, input_launcher) + + def save_npu_kernel(self, input_stream, input_launcher): + key = self.inductor_meta.get("kernel_name", None) # unique kernel name + + if key is None: + raise RuntimeError("assert key is not None, kernel_name can not be None") + params = { + "mangled_name": ( + input_launcher.bin.metadata.name + if hasattr(input_launcher.bin.metadata, "name") + else input_launcher.bin.metadata["name"] + ), + "num_warps": ( + input_launcher.bin.num_warps + if hasattr(input_launcher.bin, "num_warps") + else input_launcher.bin.metadata.num_warps + ), + "shared_mem": ( + input_launcher.bin.shared + if hasattr(input_launcher.bin, "shared") + else input_launcher.bin.metadata.shared + ), + "stream": input_stream, + # User defined triton kernels will have arbitrary kwarg names + "meta": input_launcher.config.kwargs, + } + from torch._inductor.codecache import CudaKernelParamCache + + bin_type = "npubin" + binary = input_launcher.bin.asm[bin_type] # npubin type = npubin + CudaKernelParamCache.set(key, params, binary, bin_type='cubin') # CudaKernelParam + + self.cuda_kernel_saved = True + + # bench method is called by torch, grid can not be modified + def bench(self, launcher, *args, with_profiler=False, **kwargs): + """Measure the performance of a given launcher""" + + if not self.custom_kernel and launcher.n_spills > self.inductor_meta.get( + "spill_threshold", 16 + ): + return float("inf") + + device_interface = self.get_device_interface() + stream = device_interface.get_raw_stream(device_interface.current_device()) + + def kernel_call(): + cloned_args, cloned_kwargs = self.clone_args(*args, **kwargs) + launcher( + *cloned_args, + **cloned_kwargs, + stream=stream, + ) + + if with_profiler: + from torch._inductor.utils import do_bench_using_profiling + ret = do_bench_using_profiling(kernel_call, warmup=10, rep=1) + + # remove fast_flush=True for high version triton + ret = benchmarker.benchmark_gpu(kernel_call, rep=1) + return ret + + def autotune_to_one_config(self, *args, **kwargs): + """Do the actual autotuning""" + start_time = time.time_ns() + timings = self.benchmark_all_configs(*args, **kwargs) + benchmark_time_taken_ns = time.time_ns() - start_time + self.launchers = [builtins.min(timings, key=timings.get)] + self.autotune_time_taken_ns = ( + self.precompile_time_taken_ns + benchmark_time_taken_ns + ) + if self.save_cache_hook: + self.save_cache_hook(self.launchers[0].config, self.autotune_time_taken_ns) + + def get_fx_graph_call(self, auto_fallback=False): + kernel_name = self.inductor_meta.get("kernel_name", "triton_") + traced_graph_hash = self.inductor_meta.get("traced_graph_hash") + dump_dir = self.inductor_meta.get("traced_graph_dir", "") + dump_path = os.path.join(dump_dir, traced_graph_hash) + if dump_dir == "" or not os.path.exists(dump_path): + return None, None, None, None + sys.path.append(dump_path) + fx_module = importlib.import_module(traced_graph_hash) + sys.path.remove(dump_path) + + model = fx_module.model + num_inputs = fx_module.num_inputs + num_outputs = fx_module.num_outputs + non_contiguous_indices = fx_module.non_contiguous_indices + mismatch_indices_shapes = 
fx_module.mismatch_indices_shapes + + def fx_graph_call(*fx_args): + fx_inputs = [fx_args[idx].contiguous() if idx in non_contiguous_indices['inputs'] else \ + fx_args[idx] for idx in range(num_inputs)] + if len(mismatch_indices_shapes): + for ind, shape in mismatch_indices_shapes.items(): + if ind >= num_inputs: + break + fx_inputs[ind] = fx_inputs[ind].reshape(shape) + model_outputs = model.forward(*fx_inputs) + for idx, (out1, out2) in enumerate(zip(model_outputs, fx_args[num_inputs:(num_inputs + num_outputs)])): + out1 = out1.reshape(out2.shape) + if idx in non_contiguous_indices['outputs']: + out2.copy_(out1) + else: + out2.data = out1.data + + def fallback_call(*args): + fx_args = [args[idx] for idx in fx_module.call_args_mapping] + return fx_graph_call(*fx_args) + + if auto_fallback: + return fallback_call, kernel_name, None, None + return fx_graph_call, kernel_name, dump_path, fx_module + + def data_dump(self, *args, dump_path=None): + data_dump_path = os.path.join(dump_path, 'data.pth') + torch.save(args, data_dump_path) + + def get_fn_name(self): + if self.fn_name is not None: + return self.fn_name + try: + self.fn_name = self.fn.fn.__name__ + except AttributeError: + self.fn_name = "unknown" + return self.fn_name + + def fallback_to_fx(self, *args, launcher, stream, **kwargs): + """ + Try to fallback kernel to fx graph call according to kernel id. + """ + def should_fallback(): + fallback_id = npu_config.force_fallback_kernel_id + if fallback_id != "all" and not isinstance(fallback_id, list): + raise RuntimeError("torch_npu._inductor.config.aot_inductor.force_fallback_kernel_id " + "should be set to 'all' or List, e.g, [1, 2, 10]." + pta_error(ErrCode.VALUE)) + + if isinstance(fallback_id, list): + kernel_name = self.get_fn_name() + try: + kernel_id = int(kernel_name.split("_")[-1]) + except ValueError: + kernel_id = -1 + if kernel_id not in fallback_id: + return False + return True + + if not should_fallback(): + return None + + fx_graph_call, _, _, fx_module = self.get_fx_graph_call() + if not fx_graph_call: + return None + + call_outputs_indices = fx_module.call_args_mapping[fx_module.num_inputs:] + fx_args = [] + for idx in fx_module.call_args_mapping: + arg = args[idx] + if isinstance(arg, torch.Tensor): + fx_arg = clone_preserve_strides(arg).float() if arg.dtype == torch.bfloat16 else clone_preserve_strides( + arg) + fx_args.append(fx_arg) + + fx_graph_call(*fx_args) + for actual, expected in zip([args[i] for i in call_outputs_indices], fx_args[fx_module.num_inputs:]): + if actual.dtype != expected.dtype: + expected = expected.to(actual.dtype) + actual.copy_(expected) + for arg in fx_args: + del arg + return True + + + def check_accuracy(self, *args, launcher, stream, **kwargs): + fx_graph_call, kernel_name, dump_path, fx_module = self.get_fx_graph_call() + if not fx_graph_call: + return None + call_outputs_indices = fx_module.call_args_mapping[fx_module.num_inputs:] + self.data_dump(*args, dump_path=dump_path) + + fx_args = [] + for idx in fx_module.call_args_mapping: + arg = args[idx] + if isinstance(arg, torch.Tensor): + fx_arg = clone_preserve_strides(arg).float() if arg.dtype == torch.bfloat16 else clone_preserve_strides( + arg) + fx_args.append(fx_arg) + + fx_graph_call(*fx_args) + + ret = launcher( + *args, + **kwargs, + grid=grid, + stream=stream, + ) + + try: + import msprobe + has_msprobe = True + except ImportError: + has_msprobe = False + warning_once(log, "msprobe import failed, please check. " + "It may be due to missing dependencies or other factors. 
" + "Check accuracy by tolerance method.") + for actual, expected in zip([args[i] for i in call_outputs_indices], fx_args[fx_module.num_inputs:]): + if actual.dtype != expected.dtype: + expected = expected.to(actual.dtype) + checked_by_msprobe = False + if has_msprobe: + checked_by_msprobe = self.api_accuracy_checker(expected, actual, kernel_name, dump_path) + if not has_msprobe or not checked_by_msprobe: + acc_comp_tol = npu_config.acc_comp_tol.get(actual.dtype, npu_config.acc_comp_tol['default']) + rtol = acc_comp_tol['rtol'] + atol = acc_comp_tol['atol'] + + matches = torch.isclose( + actual, expected, rtol=rtol, atol=atol, equal_nan=False + ) + if not matches.all(): + abs_diff = torch.abs(actual - expected) + rel_diff = abs_diff / torch.abs(expected) + rel_diff.masked_fill_(matches, 0) + log.warning(f"CHECK ACCURACY FAILED! Greatest Relative Difference: {rel_diff.max().item()}, " + f"Kernel Name: {kernel_name}, Dump Path: {dump_path}") + actual.copy_(expected) + del matches + for arg in fx_args: + del arg + return True + + def debug_kernel_in_run(self, *args, launcher, stream, **kwargs): + ''' + Save tensors for kernel args and outputs before and after kernel execute. + These tensors can be load and compared with tensors dumped by aot-inductor cpp runtime. + ''' + dump_path = npu_config.aot_inductor.dump_path_py + if not os.path.exists(dump_path): + os.makedirs(dump_path) + + idx = next(kernel_idx) + fn_name = self.get_fn_name() + dump_args = [arg for arg in args if isinstance(arg, torch.Tensor)] + torch.npu.synchronize() + torch.save(dump_args, f"{dump_path}/{idx}_{fn_name}_before.pt") + + result = super().run(*args, stream=stream, **kwargs) + + torch.npu.synchronize() + torch.save(dump_args, f"{dump_path}/{idx}_{fn_name}_after.pt") + return result + + + def run( + self, *args, stream, benchmark_run=False, **kwargs + ): # type:ignore[override] + if self.triton_interpret: + args, grid = self._interpret_args_grid(args, self.configs[0]) + return self.fn[grid]( + *args, + **kwargs, + **self.configs[0].kwargs, + ) + + if hasattr(self.launchers[0], "fallback"): + return self.launchers[0]( + *args, + **kwargs, + ) + + if len(self.launchers) != 1: + if len(self.launchers) == 0: + start_time = time.time_ns() + self.precompile() + self.precompile_time_taken_ns = time.time_ns() - start_time + if len(self.launchers) > 1: + self.autotune_to_one_config(*args, **kwargs) + + if not getattr( + self.launchers[0].config, "found_by_coordesc", False + ) and self.inductor_meta.get("coordinate_descent_tuning", False): + self.launchers = [ + self.coordinate_descent_tuning( + self.launchers[0], *args, **kwargs + ) + ] + + (launcher, ) = self.launchers + if launcher.store_cubin and (not benchmark_run or not self.cuda_kernel_saved): + self.save_gpu_kernel(stream, launcher) + + if self.dump_launch_params: + _dump_launch_params(args, kwargs, launcher, self.fn.__name__) + + if npu_config.check_accuracy: + if self.check_accuracy(*args, launcher=launcher, stream=stream, **kwargs): + return + + elif npu_config.dump_fx_graph: + fx_graph_call, kernel_name, dump_path, _ = self.get_fx_graph_call() + if not fx_graph_call: + log.warning(f"data dump for kernel {kernel_name} failed!") + else: + self.data_dump(*args, dump_path=dump_path) + + elif npu_config.force_fallback_kernel_id: + fallback_result = self.fallback_to_fx(*args, launcher=launcher, stream=stream, **kwargs) + if fallback_result is not None: + log.debug(f"fallback kernel {self.get_fn_name()} to fx graph call.") + return + else: + log.warning(f"kernel 
{self.get_fn_name()} could not fallback to fx.") + elif npu_config.aot_inductor.debug_kernel_in_run: + return self.debug_kernel_in_run(*args, launcher=launcher, stream=stream, **kwargs) + + # it is faster than entering and exiting a context manager, even if the context + # manager is a nullcontext. + if autograd_profiler._is_profiler_enabled: + with torch._C._profiler._RecordFunctionFast( + self.inductor_meta.get("kernel_name", "triton kernel"), + args, + { + "kernel_file": (self.filename or ""), + "kernel_hash": self.kernel_hash, + "kernel_backend": "triton", + "stream": stream, + }, + ): + return launcher( + *args, + **kwargs, + stream=stream, + ) + else: + return launcher( + *args, + **kwargs, + stream=stream, + ) + + +class NPUDebugAutotuner(NPUCachingAutotuner): + def __init__(self, *args, regex_filter="", **kwargs): + self.regex_filter = regex_filter + super().__init__(*args, **kwargs) + self.cached = None + + def run(self, *args, input_grid, stream): + possible_names = _find_names(self) + kernel_name = f"{max(possible_names, key=len)}" + if not re.match(self.regex_filter, kernel_name): + return + super().run(*args, grid=input_grid, stream=stream) + (launcher,) = self.launchers + + if self.cached is None: + ms = self.bench(launcher, *args, input_grid=input_grid) + num_in_out_ptrs = len( + [ + arg_name + for arg_name in self.fn.arg_names + if arg_name.startswith("in_out_ptr") + ] + ) + num_gb = get_num_bytes(*args, num_in_out_args=num_in_out_ptrs) / 1e9 + gb_per_s = num_gb / (ms / 1e3) + self.cached = (ms, num_gb, gb_per_s, kernel_name) + else: + ms, num_gb, gb_per_s, kernel_name = self.cached + collected_calls.append((ms, num_gb, gb_per_s, kernel_name)) + print( + create_bandwidth_info_str(ms, num_gb, gb_per_s, suffix=f" \t {kernel_name}") + ) + + +def cached_autotune( + size_hints: Optional[List[int]], + configs: List[Config], + triton_meta, + heuristic_type, + filename=None, + inductor_meta=None, + custom_kernel=False, +): + """ + A copy of triton.autotune that calls our subclass. Our subclass + has additional debugging, error handling, and on-disk caching. 
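+    On NPU, the decorator below returns NPUDebugAutotuner when profile_bandwidth
+    is requested and NPUCachingAutotuner otherwise.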
+ """ + configs = unique_configs(configs) + if not (len(configs) == 1 or filename): + raise RuntimeError("assert len(configs) == 1 or filename") + + inductor_meta = {} if inductor_meta is None else inductor_meta + + disabled = inductor_meta.get("force_disable_caches", False) + + # on disk caching logic and/or remote caching + autotune_cache = None + if ( + not disabled + and filename is not None + and (len(configs) > 1 or inductor_meta.get("coordinate_descent_tuning")) + and not os.environ.get("TRITON_INTERPRET", "0") == "1" + ): + configs_hash = hash_configs(configs) + + autotune_cache = AutotuneCache.create(inductor_meta, filename, configs_hash) + if autotune_cache: + best_config = autotune_cache.read_best(inductor_meta, configs) + if best_config: + configs = [best_config] + else: + if disabled: + log.debug("autotune caching is disabled by config.force_disable_caches") + + mutated_arg_names = inductor_meta.pop("mutated_arg_names", ()) + optimize_mem = inductor_meta.pop("optimize_mem", True) + + if "restore_value" in triton_meta: + mutated_arg_names += triton_meta.pop("restore_value") + + reset_to_zero_arg_names: List[str] = [] + if "reset_to_zero" in triton_meta: + reset_to_zero_arg_names.extend(triton_meta.pop("reset_to_zero")) + + def decorator(fn): + + if inductor_meta.get("profile_bandwidth"): + return NPUDebugAutotuner( + fn, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + regex_filter=inductor_meta["profile_bandwidth_regex"], + with_profiler=inductor_meta[ + "profile_bandwidth_with_do_bench_using_profiling" + ], + configs=configs, + save_cache_hook=autotune_cache and autotune_cache.save, + mutated_arg_names=mutated_arg_names, + reset_to_zero_arg_names=reset_to_zero_arg_names, + optimize_mem=optimize_mem, + heuristic_type=heuristic_type, + size_hints=size_hints, + custom_kernel=custom_kernel, + filename=filename, + with_bandwidth_info=True, + ) + return NPUCachingAutotuner( + fn, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + configs=configs, + save_cache_hook=autotune_cache and autotune_cache.save, + mutated_arg_names=mutated_arg_names, + reset_to_zero_arg_names=reset_to_zero_arg_names, + optimize_mem=optimize_mem, + heuristic_type=heuristic_type, + size_hints=size_hints, + custom_kernel=custom_kernel, + filename=filename, + ) + + return decorator + + +# split:sizeof split, xblock:axis1 length, rblock:axis2 length +def triton_config_npu_index( + size_hints, + inductor_meta, + triton_meta=None, + reduction=False, + persistent_reduction=False, + +) -> List[Config]: + num_warps = 1 + num_stages = 1 + configs = [] + split_axis = inductor_meta["split_axis"] + tiling_axis = inductor_meta["tiling_axis"] + low_dims = inductor_meta["low_dims"] + split_axis_dtype = inductor_meta["split_axis_dtype"] + axis_names = inductor_meta["axis_names"] + dual_reduction = inductor_meta["dual_reduction"] + + tile_generator = TileGenerator(size_hints, axis_names, tiling_axis, split_axis, low_dims, + persistent_reduction=persistent_reduction, configs=configs, + dtype=split_axis_dtype, dual_reduction=dual_reduction) + + tile_generator.descend_split_tiling() + + if not configs: + cfg = {} + for x in split_axis: + cfg[f"{axis_names[x].upper()}BLOCK"] = size_hints[x] + if not cfg: + cfg["dummy"] = 1 + tmp = Config(cfg, num_warps=num_warps, num_stages=num_stages) + configs.append(tmp) + + for cfg in configs: + split_blocks = [None for x in split_axis] + for i, axis in enumerate(split_axis): + name = axis_names[axis] + block_name = f"{name.upper()}BLOCK" + split_blocks[i] = 
cfg.kwargs[block_name] + cfg.kwargs["split_axis"] = tuple(split_axis) + cfg.kwargs["split_blocks"] = tuple(split_blocks) + + return configs + + +def pointwise_npu_index( + size_hints, + triton_meta, + tile_hint=None, + filename=None, + min_elem_per_thread=0, + inductor_meta=None, +): + inductor_meta = {} if inductor_meta is None else inductor_meta + triton_config_with_settings = functools.partial( + triton_config_npu_index + ) + return cached_autotune( + size_hints, + triton_config_with_settings(size_hints, inductor_meta=inductor_meta), + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.POINTWISE, + filename=filename, + ) + + +def reduction_npu_index( + size_hints, + reduction_hint=False, + triton_meta=None, + filename=None, + inductor_meta=None, +): + """args to @triton.heuristics()""" + inductor_meta = {} if inductor_meta is None else inductor_meta + inductor_meta["reduction_hint"] = reduction_hint + if triton_meta is None: + raise RuntimeError("assert triton_meta is not None") + + contiguous_config = triton_config_npu_index(size_hints, inductor_meta=inductor_meta, reduction=True) + return cached_autotune( + size_hints, + [ + *contiguous_config, + ], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + filename=filename, + heuristic_type=HeuristicType.REDUCTION, + ) + + +def persistent_reduction_npu_index( + size_hints, + reduction_hint=False, + triton_meta=None, + filename=None, + inductor_meta=None, +): + inductor_meta = {} if inductor_meta is None else inductor_meta + inductor_meta["reduction_hint"] = reduction_hint + configs = triton_config_npu_index(size_hints, inductor_meta=inductor_meta, reduction=True, + persistent_reduction=True) + + return cached_autotune( + size_hints, + configs, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + filename=filename, + heuristic_type=HeuristicType.PERSISTENT_REDUCTION, + ) + + +def foreach(triton_meta, num_warps, filename=None, inductor_meta=None): + """ + Compile a triton foreach kernel + """ + return cached_autotune( + None, + [triton.Config({}, num_stages=1, num_warps=num_warps)], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.TEMPLATE, + filename=filename, + ) + + +@dynamo_timed +def benchmark_all_configs(self, *args, input_grid, **kwargs): + print(f"candidate launcher count = {len(self.launchers)}") + + tilling_kernel_list = [] + + def kernel_call(launcher): + def call_kernel(): + if launcher.config.pre_hook is not None: + launcher.config.pre_hook( + {**dict(zip(self.arg_names, args)), **launcher.config.kwargs} + ) + cloned_args, cloned_kwargs = self.clone_args(*args, **kwargs) + launcher( + *cloned_args, + **cloned_kwargs, + grid=input_grid, + stream=stream, + ) + + return call_kernel + + for launcher in self.launchers: + if not self.custom_kernel and launcher.n_spills > config.triton.spill_threshold: + return float("inf") + + stream = self.gpu_device.get_raw_stream( # type: ignore[call-arg] + self.gpu_device.current_device() + ) + tilling_kernel_list.append(kernel_call(launcher)) + + def do_batch_benchmark(tilling_kernel_list): + + def delete_file(base_path): + import shutil + if os.path.exists(base_path): + shutil.rmtree(base_path) + + import torch_npu + + stream = torch.npu.current_stream() + experimental_config = torch_npu.profiler._ExperimentalConfig( + aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization, + profiler_level=torch_npu.profiler.ProfilerLevel.Level1, + l2_cache=False, + data_simplification=False + ) + + import uuid + 
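+        # A fresh UUID is hashed into a unique profiling directory so this batch
+        # benchmark's kernel_details.csv never collides with other runs; the
+        # directory is removed again once the Duration(us) column has been read.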
random_uuid = uuid.uuid4().hex + md5_hash = hashlib.md5(random_uuid.encode()).hexdigest() + + from torch_npu._inductor.config import profile_path + + torch_path = profile_path + md5_hash + rep = 1 + with torch_npu.profiler.profile( + activities=[ + torch_npu.profiler.ProfilerActivity.NPU + ], + schedule=torch_npu.profiler.schedule(wait=0, warmup=1, active=rep, repeat=1, skip_first=1), + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(torch_path), + record_shapes=False, + profile_memory=False, + with_stack=False, + with_flops=False, + with_modules=False, + experimental_config=experimental_config) as prof: + stream.synchronize() + for _ in range(rep + 3): + for fn in tilling_kernel_list: + fn() + prof.step() + stream.synchronize() + + import pandas as pd + for root, _, files in os.walk(torch_path): + for file in files: + if file != 'kernel_details.csv': + continue + target_file = os.path.join(root, file) + df = pd.read_csv(target_file) + triton_rows = df[df['Name'].str.startswith('triton', na=False)] + ret = triton_rows['Duration(us)'].astype(float).tolist() + delete_file(torch_path) + return ret + + delete_file(torch_path) + return [] + + try: + timinglist = do_batch_benchmark(tilling_kernel_list) + if not len(timinglist) == len(self.launchers): + raise RuntimeError("not len(timinglist) == len(self.launchers)") + timings = {launcher: timing for launcher, timing in zip(self.launchers, timinglist)} + except Exception as e: + print("some cases in batch benchmark has error! Logging Exception as:") + print(e) + print("switched to single bench...") + timings = { + launcher: self.bench(launcher, *args, **kwargs) + for launcher in self.launchers + } + + for k, v in timings.items(): + self.coordesc_tuner.cache_benchmark_result(k.config, v) + + if log.isEnabledFor(logging.DEBUG): + for k, v in timings.items(): + log.debug( + "%s: %f, nreg %d, nspill %d, #shared-mem %s", + k.config, + v, + k.n_regs, + k.n_spills, + k.shared, + ) + return timings diff --git a/torch_npu/_inductor/runtime.py b/torch_npu/_inductor/runtime.py new file mode 100644 index 0000000000..9d7716a200 --- /dev/null +++ b/torch_npu/_inductor/runtime.py @@ -0,0 +1,70 @@ +import functools +from typing import List, Dict +from typing import Optional +from torch._inductor.remote_cache import JsonDataTy +from torch._inductor.runtime.hints import DeviceProperties +from torch.utils._triton import has_triton, has_triton_package + +from .config import num_vector_core + +if has_triton_package(): + from triton import Config + + +# overload this to avoid autotune after best_config already generated +def _load_cached_autotuning( + best_config: Dict[str, JsonDataTy], + configs_hash: str, + configs: List[Config], + inductor_meta: Dict, +) -> Optional[Config]: + if best_config is None: + return None + if best_config.pop("configs_hash", None) != configs_hash: + return None + # Remove time taken for comparison + best_config.pop("time_taken_ms", None) + + # if inductor_meta.get("coordinate_descent_tuning") : + num_warps = best_config.pop("num_warps") + num_stages = best_config.pop("num_stages") + triton_config = Config(best_config, num_warps=num_warps, num_stages=num_stages) + triton_config.found_by_coordesc = True + return triton_config + + +class NPUDeviceProperties(DeviceProperties): + + @classmethod + @functools.lru_cache(None) + def create(cls, device) -> DeviceProperties: + import torch + from torch._dynamo.device_interface import get_interface_for_device + + device_type = device.type + + if torch.version.hip and device_type == "cuda": 
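+            # ROCm builds report the device type as "cuda"; record it as "hip" here,
+            # mirroring the upstream DeviceProperties.create logic.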
+ device_type = "hip" + + device_interface = get_interface_for_device(device) + props = device_interface.get_device_properties(device) + + try: + multi_processor_count = num_vector_core + except AttributeError: + if device_type == "xpu": + multi_processor_count = props.gpu_subslice_count + else: + raise + return cls( + type=device_type, + index=device.index, + multi_processor_count=multi_processor_count, + cc=device_interface.get_compute_capability(device), + major=getattr(props, "major", None), + regs_per_multiprocessor=getattr(props, "regs_per_multiprocessor", None), + max_threads_per_multi_processor=getattr( + props, "max_threads_per_multi_processor", None + ), + warp_size=getattr(props, "warp_size", 32 if device_type != "cpu" else None), + ) diff --git a/torch_npu/_inductor/utils.py b/torch_npu/_inductor/utils.py new file mode 100644 index 0000000000..a3ac4fd66b --- /dev/null +++ b/torch_npu/_inductor/utils.py @@ -0,0 +1,76 @@ +import functools +import torch +import torch_npu + + +# Not good implementation, but no other way +def get_current_raw_stream(device): + return torch.npu.current_stream(device).npu_stream + + +def patch_is_same_tensor(): + from torch._subclasses.fake_tensor import FakeTensor + + def is_same_tensor(data: torch.Tensor, value: torch.Tensor): + if isinstance(data, FakeTensor) or isinstance(value, FakeTensor): + return False + return ( + not data.is_mkldnn + and data.size() == value.size() + and data.stride() == value.stride() + and data.dtype == value.dtype + and data.device == value.device + and data.untyped_storage().data_ptr() == value.untyped_storage().data_ptr() + and data.storage_offset() == value.storage_offset() + ) + + from torch._inductor import utils, graph + utils.is_same_tensor = is_same_tensor + # We need to do extra-patch because of code like `from xxx import is_same_tensor` + graph.is_same_tensor = is_same_tensor + + +def patch_is_gpu(): + from torch._inductor.utils import GPU_TYPES + GPU_TYPES.append('npu') + + +def patch_has_triton(): + from torch.utils._triton import has_triton_package + + @functools.lru_cache(None) + def has_triton() -> bool: + if not has_triton_package(): + return False + + from torch._dynamo.device_interface import get_interface_for_device + + def cuda_extra_check(device_interface): + return True + + def cpu_extra_check(device_interface): + import triton.backends + + return "cpu" in triton.backends.backends + + def _return_true(device_interface): + return True + + triton_supported_devices = { + "cuda": cuda_extra_check, + "xpu": _return_true, + "cpu": cpu_extra_check, + "npu": _return_true + } + + def is_device_compatible_with_triton(): + for device, extra_check in triton_supported_devices.items(): + device_interface = get_interface_for_device(device) + if device_interface.is_available() and extra_check(device_interface): + return True + return False + + return is_device_compatible_with_triton() + + torch.utils._triton.has_triton = has_triton + torch._inductor.scheduler.has_triton = has_triton \ No newline at end of file diff --git a/torch_npu/utils/_dynamo_device.py b/torch_npu/utils/_dynamo_device.py index 43bc29d897..f1e53f4c2e 100644 --- a/torch_npu/utils/_dynamo_device.py +++ b/torch_npu/utils/_dynamo_device.py @@ -65,11 +65,26 @@ class NpuInterface(DeviceInterface): @staticmethod def get_compute_capability(device=None): - r"""Query the minor and major data of device. Cann does not - have a corresponding concept and is not supported. By default, it returns None + r"""Different from cuda, only return the chip model here. 
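+        The value feeds DeviceProperties.cc (see NPUDeviceProperties.create),
+        so a stable chip-name string is sufficient here.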
""" - return None + return torch.npu.get_device_name(device) + + @staticmethod + def exchange_device(device: int) -> int: + curr_device = current_device() + set_device(device) + return curr_device + + @staticmethod + def maybe_exchange_device(device: int) -> int: + return device + + @staticmethod + def is_bf16_supported(including_emulation: bool = False): + return True def _dynamo_register_interface_for_device(): register_interface_for_device("npu", NpuInterface) + for i in range(32): + register_interface_for_device(f"npu:{i}", NpuInterface) -- Gitee From 26c4fa914521ed5bf87d2021eec26bad01fc8698 Mon Sep 17 00:00:00 2001 From: louyujing <7927276+louyujing@user.noreply.gitee.com> Date: Mon, 7 Jul 2025 08:15:38 +0000 Subject: [PATCH 211/328] =?UTF-8?q?!22634=20=E3=80=90bugfix=E3=80=91Adapt?= =?UTF-8?q?=20torch=20distributed=20=5Fnew=5Fgroup=5Fwith=5Ftag=20Merge=20?= =?UTF-8?q?pull=20request=20!22634=20from=20louyujing/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/contrib/transfer_to_npu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torch_npu/contrib/transfer_to_npu.py b/torch_npu/contrib/transfer_to_npu.py index af90cf8c79..8bb712eacc 100644 --- a/torch_npu/contrib/transfer_to_npu.py +++ b/torch_npu/contrib/transfer_to_npu.py @@ -352,7 +352,8 @@ def _init(): if hasattr(torch.distributed, 'init_device_mesh'): _del_nccl_device_backend_map() torch.distributed.device_mesh.init_device_mesh = _wrapper_cuda(torch.distributed.device_mesh.init_device_mesh) - torch.distributed.new_group = _wrapper_hccl(torch.distributed.new_group) + torch.distributed.distributed_c10d._new_group_with_tag = _wrapper_hccl( + torch.distributed.distributed_c10d._new_group_with_tag) # CUDAGraph torch.cuda.CUDAGraph = torch.npu.NPUGraph -- Gitee From 82c248ac931382607822f3affd6a9c94edce1fb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Mon, 7 Jul 2025 11:26:41 +0000 Subject: [PATCH 212/328] =?UTF-8?q?!22708=20[feat]=20=E6=94=AF=E6=8C=81ipc?= =?UTF-8?q?=E5=86=85=E5=AD=98=E9=80=9A=E4=BF=A1-=E5=88=9B=E5=BB=BAreduce?= =?UTF-8?q?=5Ftensor=20Merge=20pull=20request=20!22708=20from=20=E5=A7=9C?= =?UTF-8?q?=E6=80=A1=E6=96=87/v2.7.1=5Fipc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 4 +- test/allowlist_for_publicAPI.json | 4 +- test/npu/test_unsupport_api.py | 10 - third_party/acl/inc/acl/acl.h | 1 + third_party/acl/inc/acl/acl_rt.h | 2 + torch_npu/__init__.py | 2 + torch_npu/csrc/InitNpuBindings.cpp | 2 + .../csrc/core/npu/NPUCachingAllocator.cpp | 268 +++++++++++++++- torch_npu/csrc/core/npu/NPUCachingAllocator.h | 17 + torch_npu/csrc/core/npu/NPUIPCPidManager.cpp | 36 +++ torch_npu/csrc/core/npu/NPUIPCPidManager.h | 11 + .../csrc/core/npu/interface/AclInterface.cpp | 92 ++++++ .../csrc/core/npu/interface/AclInterface.h | 15 + torch_npu/csrc/ipc/CMakeLists.txt | 6 + torch_npu/csrc/ipc/NPUIPCTypes.cpp | 252 +++++++++++++++ torch_npu/csrc/ipc/NPUIPCTypes.h | 150 +++++++++ torch_npu/csrc/ipc/StorageSharing.cpp | 301 ++++++++++++++++++ torch_npu/csrc/ipc/StorageSharing.h | 15 + torch_npu/csrc/npu/Module.cpp | 32 ++ torch_npu/csrc/npu/NPUPluggableAllocator.cpp | 18 ++ torch_npu/csrc/npu/NPUPluggableAllocator.h | 2 + torch_npu/multiprocessing/reductions.py | 178 +++++++++++ torch_npu/utils/storage.py | 33 +- torch_npu/utils/unsupport_api.py | 2 - 24 files changed, 1426 insertions(+), 27 deletions(-) create mode 100644 
torch_npu/csrc/core/npu/NPUIPCPidManager.cpp create mode 100644 torch_npu/csrc/core/npu/NPUIPCPidManager.h create mode 100644 torch_npu/csrc/ipc/CMakeLists.txt create mode 100644 torch_npu/csrc/ipc/NPUIPCTypes.cpp create mode 100644 torch_npu/csrc/ipc/NPUIPCTypes.h create mode 100644 torch_npu/csrc/ipc/StorageSharing.cpp create mode 100644 torch_npu/csrc/ipc/StorageSharing.h create mode 100644 torch_npu/multiprocessing/reductions.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 34058b029f..82ed2681be 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -235,6 +235,7 @@ if (NOT DEFINED BUILD_LIBTORCH) set(FLOP_SRCS) set(NPU_SRCS) set(PROF_SRCS) + set(IPC_SRCS) set(UTILS_SRCS) set(SAN_SRCS) endif() @@ -254,6 +255,7 @@ if (NOT DEFINED BUILD_LIBTORCH) add_subdirectory(${TORCHNPU_ROOT}/distributed) add_subdirectory(${TORCHNPU_ROOT}/npu) add_subdirectory(${TORCHNPU_ROOT}/profiler) + add_subdirectory(${TORCHNPU_ROOT}/ipc) add_subdirectory(${TORCHNPU_ROOT}/utils) add_subdirectory(${TORCHNPU_ROOT}/sanitizer) endif() @@ -278,7 +280,7 @@ if (DEFINED BUILD_LIBTORCH) set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${FLOP_SRCS} ${CUS_DTYPE_SRCS} ${FRAMEWORK_SRCS} ${LOGGING_SRCS} ${NPU_CPP_LIBS_SRCS}) else() # Compile code with pybind11 - set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${DIST_SRCS} ${FLOP_SRCS} ${CUS_DTYPE_SRCS} ${LOGGING_SRCS} ${FRAMEWORK_SRCS} ${NPU_SRCS} ${PROF_SRCS} ${UTILS_SRCS} ${SAN_SRCS}) + set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${DIST_SRCS} ${FLOP_SRCS} ${CUS_DTYPE_SRCS} ${LOGGING_SRCS} ${FRAMEWORK_SRCS} ${NPU_SRCS} ${PROF_SRCS} ${IPC_SRCS} ${UTILS_SRCS} ${SAN_SRCS}) endif() add_library(${PLUGIN_NAME} SHARED ${CPP_SRCS}) diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index e24bb675fe..e26ec137f5 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -584,7 +584,9 @@ "ForkingPickler", "Union", "check_serializing_named_tensor", - "register_after_fork" + "register_after_fork", + "reduce_tensor", + "reduce_storage" ], "torch.multiprocessing.spawn": [ "Optional" diff --git a/test/npu/test_unsupport_api.py b/test/npu/test_unsupport_api.py index 8883f3eb06..54af07e0b2 100644 --- a/test/npu/test_unsupport_api.py +++ b/test/npu/test_unsupport_api.py @@ -67,16 +67,6 @@ class TestPtaUnsupportApi(TestCase): coalesce_tensor = sparse_tensor.coalesce().npu() coalesce_tensor.ccol_indices() - def test_Tensor_is_shared_runtimeerror(self): - with self.assertRaisesRegex(RuntimeError, r"(.*) is not supported in npu."): - input_tensor = torch.tensor([1, 2, 3]).npu() - input_tensor.is_shared() - - def test_Tensor_share_memory__runtimeerror(self): - with self.assertRaisesRegex(RuntimeError, r"(.*) is not supported in npu."): - input_tensor = torch.tensor([1, 2, 3]).npu() - input_tensor.share_memory_() - def test_Module_share_memory_runtimeerror(self): with self.assertRaisesRegex(RuntimeError, r"(.*) is not supported in npu."): model = SimpleModel().npu() diff --git a/third_party/acl/inc/acl/acl.h b/third_party/acl/inc/acl/acl.h index 95abdb6368..a31b673d07 100755 --- a/third_party/acl/inc/acl/acl.h +++ b/third_party/acl/inc/acl/acl.h @@ -25,6 +25,7 @@ extern "C" { #define ACL_PATCH_VERSION 0 #define ACL_PKG_VERSION_MAX_SIZE 128 #define ACL_PKG_VERSION_PARTS_MAX_SIZE 64 +#define ACL_IPC_HANDLE_SIZE 65 /** * @ingroup AscendCL diff --git a/third_party/acl/inc/acl/acl_rt.h b/third_party/acl/inc/acl/acl_rt.h index 98b520ba4a..2fcbaa2792 100755 --- a/third_party/acl/inc/acl/acl_rt.h +++ 
b/third_party/acl/inc/acl/acl_rt.h @@ -187,6 +187,8 @@ typedef void (*aclrtCallback)(void *userData); typedef void (*aclrtExceptionInfoCallback)(aclrtExceptionInfo *exceptionInfo); +typedef int aclrtNotify; + /** * @ingroup AscendCL * @brief Set a callback function to handle exception information diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index d9bcbc268f..dccc979150 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -83,6 +83,7 @@ from torch_npu.distributed.rpc.backend_registry import _rpc_backend_registry from torch_npu.utils import _cann_package_check, _add_intercept_methods from torch_npu.utils import _register_ops_under_dtensor_rules from torch_npu.utils.exposed_api import public_npu_functions +from torch_npu.multiprocessing.reductions import _add_reductions_methods from torch_npu.npu.utils import _erase_stream as erase_stream from torch_npu.utils.hif8_tensor import HiFloat8Tensor from torch_npu.utils._error_code import ErrCode, pta_error, _except_handler @@ -177,6 +178,7 @@ def _apply_class_patches(): add_perf_dump_patch() _apply_distributed_methods_patch() _apply_mstx_patch() + _add_reductions_methods() def _apply_distributed_methods_patch(): diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp index 4c1bf40361..672b5289f5 100644 --- a/torch_npu/csrc/InitNpuBindings.cpp +++ b/torch_npu/csrc/InitNpuBindings.cpp @@ -15,6 +15,7 @@ #include "torch_npu/csrc/profiler/init.h" #include "torch_npu/csrc/flopcount/Init.h" #include "torch_npu/csrc/logging/Init.h" +#include "torch_npu/csrc/ipc/StorageSharing.h" #include "torch_npu/csrc/npu/Module.h" #include "torch_npu/csrc/custom_dtype/Init.h" #include "torch_npu/csrc/npu/Stress_detect.h" @@ -169,6 +170,7 @@ PyObject* initModule() AddPyMethodDefs(methods, torch_npu::autocast::autocast_mode_functions()); AddPyMethodDefs(methods, torch_npu::flopcount::flops_count_functions()); AddPyMethodDefs(methods, torch_npu::logging::logging_functions()); + AddPyMethodDefs(methods, torch_npu::reductions::reductions_functions()); AddPyMethodDefs(methods, c10_npu::custom_dtype_functions()); static struct PyModuleDef torchnpu_module = { PyModuleDef_HEAD_INIT, diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 82314c2ebd..aee1a2fa92 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -24,6 +24,7 @@ #include "torch_npu/csrc/core/npu/GetCANNInfo.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/core/npu/NPUEvent.h" +#include "torch_npu/csrc/core/npu/NPUIPCPidManager.h" #include "torch_npu/csrc/profiler/npu_profiler.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/sanitizer/NPUTrace.h" @@ -100,6 +101,12 @@ const std::string kMinCannVersion = "8.1.RC1"; // minimum cann version wh const std::string kMinDriverVersion = "25.0.RC1"; // minimum driver version which supports 1g mem 25.0.RC1 const std::string kCannModule = "CANN"; // cann module name +static char SHAREABLE_HANDLE_VERSION = 1; +enum ShareableHandleType : char { + SHAREABLE_NPU_MALLOC = 'c', + SHAREABLE_NPU_EXPANDABLE_SEGMENT = 'e' +}; + using StatTypes = std::array(StatType::NUM_TYPES)>; void update_stat(Stat &stat, int64_t amount) @@ -355,7 +362,10 @@ bevhavior for allocator tensors that need to be used cross-process. 
*/ struct ExpandableSegment { - ExpandableSegment(int device, aclrtStream stream, size_t size) + ExpandableSegment( + int device, + std::optional stream, + size_t size) : device_(device), stream_(stream), max_handles_(0), @@ -376,7 +386,7 @@ struct ExpandableSegment { auto default_stream = c10_npu::getDefaultNPUStream().stream(false); if (kSmallBuffer == segment_size_) { max_handles_ = numSegments(kSmallPoolVirAddrSize); - } else if (default_stream != stream) { + } else if (default_stream != *stream) { max_handles_ = numSegments(kLargePoolVirAddrSize); } } @@ -416,17 +426,17 @@ struct ExpandableSegment { for (auto j : c10::irange(begin, i)) { auto h = handles_.at(j).value(); handles_.at(j) = c10::nullopt; - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h.handle)); } trimHandles(); return rangeFromHandles(begin, begin); } NPU_CHECK_ERROR(status, "aclrtMallocPhysical"); - handles_.at(i) = handle; + handles_.at(i) = Handle{handle, std::nullopt}; } for (auto i : c10::irange(begin, end)) { NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem((char *)ptr_ + i * segment_size_, segment_size_, 0, - handles_.at(i).value(), 0, getHcclComm())); + handles_.at(i).value().handle, 0, getHcclComm())); } ASCEND_LOGD("NPUCachingAllocator map: segment_size=%zu", segment_size_); return rangeFromHandles(begin, end); @@ -446,6 +456,59 @@ struct ExpandableSegment { return rangeFromHandles(begin, end); } + // Setup IPC sharing for range. + // Returns the (larger) range that was actually shared. + // Serializes data to std::ostream that can be passed to the + // other process, and then restored as an exapandable segment + // via ExpandableSegment::fromShared(istream); + SegmentRange share(SegmentRange range, std::ostream& buf) + { + auto begin = segmentLeft(range.ptr); + auto end = segmentRight(range.ptr + range.size); + ShareHeader header{segment_size_, end - begin}; + buf.write((const char*)&header, sizeof(ShareHeader)); + for (auto i : c10::irange(begin, end)) { + // NOLINTNEXTLINE(bugprone-unchecked-optional-access) + auto& handle = handles_.at(i).value(); + if (!handle.shareableHandle) { + uint64_t shareableHandle = 0; + NPU_CHECK_ERROR(c10_npu::acl::AclrtMemExportToShareableHandle( + handle.handle, ACL_MEM_HANDLE_TYPE_NONE, 0, &shareableHandle)); + int32_t* pids = nullptr; + int pid_num = torch_npu::ipc::getPids(&pids); + NPU_CHECK_ERROR(c10_npu::acl::AclrtMemSetPidToShareableHandle(shareableHandle, pids, pid_num)); + handle.shareableHandle = shareableHandle; + } + uint64_t shandle = *handle.shareableHandle; + buf.write((const char*)&shandle, sizeof(uint64_t)); + } + return rangeFromHandles(begin, end); + } + + static std::unique_ptr fromShared( + c10::DeviceIndex device, + std::istream& buf) + { + ShareHeader header{}; + buf.read((char*)&header, sizeof(ShareHeader)); + auto segment = std::make_unique( + device, + std::nullopt, + header.segment_size); + for (auto i : c10::irange(header.num_handles)) { + (void)i; + uint64_t shareableHandle = 0; + buf.read((char*)&shareableHandle, sizeof(uint64_t)); + int32_t deviceId = static_cast(device); + aclrtDrvMemHandle handle; + NPU_CHECK_ERROR(c10_npu::acl::AclrtMemImportFromShareableHandle( + shareableHandle, deviceId, &handle)); + segment->handles_.emplace_back(Handle{handle, std::nullopt}); + } + segment->mapAndSetAccess(0, header.num_handles); + return segment; + } + char *ptr() const { return (char *)ptr_; @@ -464,7 +527,7 @@ struct ExpandableSegment { segment_size_ * max_handles_, 0, 1)); for (auto i : 
c10::irange(handles_.size())) { HCCL_CHECK_ERROR(at_npu::hccl::HcclCommActivateCommMemoryFace(hcclComm_->getHcclComm(), - (char *)ptr_ + i * segment_size_, segment_size_, 0, handles_.at(i).value(), 0)); + (char *)ptr_ + i * segment_size_, segment_size_, 0, handles_.at(i).value().handle, 0)); } } @@ -476,6 +539,15 @@ struct ExpandableSegment { } private: + void mapAndSetAccess(size_t begin, size_t end) + { + for (auto i : c10::irange(begin, end)) { + NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem((char *)ptr_ + i * segment_size_, segment_size_, 0, + handles_.at(i).value().handle, 0, getHcclComm())); + } + ASCEND_LOGD("NPUCachingAllocator mapAndSetAccess: segment_size=%zu", segment_size_); + } + void unmapHandles(size_t begin, size_t end) { // note: unlike aclrtFree, MemUnmap and MemRelease do @@ -485,18 +557,23 @@ private: // cannot call c10::npu::stream_synchronize because // it might grab the GIL which can lead to a deadlock // Locking order must be GIL -> Allocator Lock - NPU_CHECK_ERROR(aclrtSynchronizeStream(stream_)); + if (stream_) { + NPU_CHECK_ERROR(aclrtSynchronizeStream(*stream_)); + } else { + c10_npu::NPUGuard device_guard(device_); + c10_npu::npuSynchronizeDevice(true); + } #ifndef BUILD_LIBTORCH const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { - trigger->traceNpuStreamSynchronization(reinterpret_cast(stream_)); + trigger->traceNpuStreamSynchronization(reinterpret_cast(*stream_)); } #endif for (auto i : c10::irange(begin, end)) { - aclrtDrvMemHandle h = handles_.at(i).value(); + Handle h = handles_.at(i).value(); handles_.at(i) = c10::nullopt; NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *)ptr_ + segment_size_ * i, getHcclComm())); - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h.handle)); } ASCEND_LOGD("NPUCachingAllocator unmap: segment_size=%zu", segment_size_); trimHandles(); @@ -553,11 +630,19 @@ private: } int device_; - aclrtStream stream_; + std::optional stream_; void *ptr_{}; size_t max_handles_; size_t segment_size_; - std::vector> handles_; + struct Handle { + aclrtDrvMemHandle handle; + std::optional shareableHandle; + }; + struct ShareHeader { + size_t segment_size; + size_t num_handles; + }; + std::vector> handles_; std::shared_ptr hcclComm_; }; @@ -1014,6 +1099,13 @@ private: std::unique_lock& lock_; }; +struct handle_str { + char data[ACL_IPC_HANDLE_SIZE]; +}; + +// handle for ptr +ska::flat_hash_map ipc_handle_map; + class DeviceCachingAllocator { private: // lock around all operations @@ -1549,6 +1641,40 @@ public: return basePtr; } + ShareableHandle shareIpcHandle(Block* block) + { + std::lock_guard lock(mutex); + std::ostringstream ss; + ss.put(SHAREABLE_HANDLE_VERSION); + ptrdiff_t offset = 0; + if (!block->expandable_segment_) { + ss.put(SHAREABLE_NPU_MALLOC); + size_t base_size; + void* base_ptr = getBaseAllocation(block, &base_size); + offset = (char*)block->ptr - (char*)base_ptr; + + handle_str handle; + auto it = ipc_handle_map.find(base_ptr); + if (it == ipc_handle_map.end()) { + NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemGetExportKey( + base_ptr, base_size, handle.data, ACL_IPC_HANDLE_SIZE)); + int32_t* pids = nullptr; + int pid_num = torch_npu::ipc::getPids(&pids); + NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemSetImportPid(handle.data, pids, pid_num)); + ipc_handle_map[base_ptr] = handle; + } else { + handle = it->second; + } + ss.write((char*)&handle, ACL_IPC_HANDLE_SIZE); + } else { + ss.put(SHAREABLE_NPU_EXPANDABLE_SEGMENT); + 
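+            // Expandable segments are not exported through a single IPC key; instead
+            // ExpandableSegment::share() serializes one shareable handle per physical
+            // block and the consumer rebuilds the mapping via ExpandableSegment::fromShared().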
auto full_range = block->expandable_segment_->share( + SegmentRange(block->ptr, block->size), ss); + offset = (char*)block->ptr - (char*)full_range.ptr; + } + return ShareableHandle{offset, ss.str()}; + } + void recordStream(Block *block, c10_npu::NPUStream stream) { std::lock_guard lock(mutex); @@ -2703,6 +2829,12 @@ private: record_trace(TraceEntry::SEGMENT_FREE, int64_t(block->ptr), block->size, block->stream, block->device, context ? context : block->context_when_segment_allocated); + auto it = ipc_handle_map.find(block->ptr); + if (it != ipc_handle_map.end()) { + NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemClose(it->second.data)); + ipc_handle_map.erase(it); + } + aclrtFree((void *)block->ptr); total_allocated_memory -= block->size; @@ -3178,6 +3310,15 @@ public: return device_allocator[block->device]->getBaseAllocation(block, outSize); } + ShareableHandle shareIpcHandle(void* ptr) override + { + Block* block = get_allocated_block(ptr); + if (!block) { + AT_ERROR("invalid device pointer: ", ptr); + } + return device_allocator[block->device]->shareIpcHandle(block); + } + void recordStream(const c10::DataPtr &ptr, c10_npu::NPUStream stream) override { // Empty tensor's storage().data() might be a null ptr. As there is no @@ -3435,6 +3576,109 @@ public: this->free(ptr); } + std::mutex IpcMutex; + struct MemHandleCacheEntry { + MemHandleCacheEntry( + c10::DeviceIndex device, + std::string& handle, + const DeviceCachingAllocator& allocator) + : device_(device) + { + int type = SHAREABLE_NPU_MALLOC; + std::istringstream ss(handle); + if (handle.size() != ACL_IPC_HANDLE_SIZE) { + auto version = ss.get(); + TORCH_CHECK( + version <= SHAREABLE_HANDLE_VERSION, + "received sharable handle from a future version of torch that this version does not know how to handle", + PTA_ERROR(ErrCode::NOT_SUPPORT)); + type = ss.get(); + } + // otherwise this is coming from an old pytorch where it has to be a raw + // SHAREABLE_NPU_MALLOC + if (type == SHAREABLE_NPU_MALLOC) { + handle_str handle_r; + ss.read(handle_r.data, ACL_IPC_HANDLE_SIZE); + NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemImportByKey(&npu_ipc_ptr_, handle_r.data)); + handle_s.assign(handle_r.data, ACL_IPC_HANDLE_SIZE); + } else if (type == SHAREABLE_NPU_EXPANDABLE_SEGMENT) { + expandable_segment_ = + ExpandableSegment::fromShared(device, ss) + .release(); + } else { + TORCH_INTERNAL_ASSERT( + false, "Unexpected or illformed shareable handle type"); + } + } + // this struct expects that clear is explicitly called to + // free resources, because we only want this code running when + // the shared pointer to this entry is destructed, not during + // deinitialization when npu may already have been shutdown. + // This replicates the previous behavior of this map when it + // stored raw npu_ipc_ptr_ handles. 
+ void clear() + { + if (npu_ipc_ptr_) { + c10_npu::NPUGuard device_guard(device_); + NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemClose(handle_s.c_str())); + npu_ipc_ptr_ = nullptr; + } + if (expandable_segment_) { + delete expandable_segment_; + expandable_segment_ = nullptr; + } + } + void* ptr() + { + if (npu_ipc_ptr_) { + return npu_ipc_ptr_; + } else { + return expandable_segment_->ptr(); + } + } + c10::DeviceIndex device_; + ExpandableSegment* expandable_segment_{nullptr}; + void* npu_ipc_ptr_{nullptr}; // nullptr if expandable_segment_ is not null + std::weak_ptr wp_; + std::string handle_s; + }; + ska::flat_hash_map ipcMemHandle_to_devptr; + + std::shared_ptr getIpcDevPtr(std::string handle) override + { + std::lock_guard lock(IpcMutex); + + auto iter = ipcMemHandle_to_devptr.find(handle); + if (iter != ipcMemHandle_to_devptr.end()) { + auto devptr = iter->second.wp_.lock(); + TORCH_INTERNAL_ASSERT(devptr, "entry in cache has missing shared_ptr"); + return devptr; + } + int curr_device = 0; + NPU_CHECK_ERROR(c10_npu::GetDevice(&curr_device)); + auto inserted = ipcMemHandle_to_devptr.insert( + iter, + {handle, + MemHandleCacheEntry( + static_cast(curr_device), handle, *device_allocator[curr_device])}); + auto sp = std::shared_ptr( + inserted->second.ptr(), [handle, this](void* ptr) { + std::unique_lock deleter_lock(IpcMutex); + + auto it = ipcMemHandle_to_devptr.find(handle); + TORCH_INTERNAL_ASSERT(it != ipcMemHandle_to_devptr.end()); + auto entry = std::move(it->second); + ipcMemHandle_to_devptr.erase(it); + + // ExpandableSegment synchronizes on destruction in unmapHandles, so + // we need to release the lock first to minimize the performance hit. + deleter_lock.unlock(); + entry.clear(); + }); + inserted->second.wp_ = sp; + return sp; + } + void FreeDeviceCachedMemory(int device) override { device_allocator[device]->emptyCache(device, true); diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index a4e14d2232..c7082c8904 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -188,6 +188,11 @@ using OutOfMemoryObserver = std::function; +struct ShareableHandle { + ptrdiff_t offset; + std::string handle; +}; + class NPUAllocator : public c10::Allocator { public: virtual c10::DataPtr allocate_with_aligned(size_t size, size_t aligned) const = 0; @@ -227,6 +232,8 @@ public: " does not yet support checkPoolLiveAllocations. 
" "If you need it, please file an issue describing your use case.", PTA_ERROR(ErrCode::NOT_SUPPORT)); } + virtual ShareableHandle shareIpcHandle(void* ptr) = 0; + virtual std::shared_ptr getIpcDevPtr(std::string handle) = 0; virtual bool isHistoryEnabled() { TORCH_CHECK( @@ -376,6 +383,16 @@ inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) return get()->releasePool(device, mempool_id); } +inline std::shared_ptr getIpcDevPtr(std::string handle) +{ + return get()->getIpcDevPtr(handle); +} + +inline ShareableHandle shareIpcHandle(void* ptr) +{ + return get()->shareIpcHandle(ptr); +} + inline void FreeDeviceCachedMemory(int device) { return get()->FreeDeviceCachedMemory(device); diff --git a/torch_npu/csrc/core/npu/NPUIPCPidManager.cpp b/torch_npu/csrc/core/npu/NPUIPCPidManager.cpp new file mode 100644 index 0000000000..94bbd2739a --- /dev/null +++ b/torch_npu/csrc/core/npu/NPUIPCPidManager.cpp @@ -0,0 +1,36 @@ +#include "torch_npu/csrc/core/npu/NPUIPCPidManager.h" +namespace torch_npu { +namespace ipc { + +int32_t* pids = nullptr; +int pid_num = 0; +int capacity = 0; + +void addPid(int pid) +{ + const int requiredCapacity = pid_num + 1; + + if (requiredCapacity > capacity) { + int newCapacity = capacity + 10; + + int32_t* newArray = new int32_t[newCapacity]; + for (int i = 0; i < pid_num; ++i) { + newArray[i] = pids[i]; + } + + delete[] pids; + pids = newArray; + capacity = newCapacity; + } + + pids[pid_num++] = static_cast(pid); +} + +int getPids(int32_t** ret_pids) +{ + *ret_pids = pids; + return pid_num; +} + +} // namespace ipc +} // namespace torch_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUIPCPidManager.h b/torch_npu/csrc/core/npu/NPUIPCPidManager.h new file mode 100644 index 0000000000..bc5a72cd89 --- /dev/null +++ b/torch_npu/csrc/core/npu/NPUIPCPidManager.h @@ -0,0 +1,11 @@ +#pragma once +#include + +namespace torch_npu { +namespace ipc { + +void addPid(int pid); +int getPids(int32_t** pids); + +} // namespace ipc +} // namespace torch_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 39c4b53443..c46740b72d 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -82,6 +82,13 @@ LOAD_FUNCTION(aclmdlRICaptureTaskUpdateBegin) LOAD_FUNCTION(aclmdlRICaptureTaskUpdateEnd) LOAD_FUNCTION(aclrtHostRegister) LOAD_FUNCTION(aclrtHostUnregister) +LOAD_FUNCTION(aclrtIpcMemGetExportKey) +LOAD_FUNCTION(aclrtIpcMemSetImportPid) +LOAD_FUNCTION(aclrtIpcMemImportByKey) +LOAD_FUNCTION(aclrtIpcMemClose) +LOAD_FUNCTION(aclrtMemExportToShareableHandle) +LOAD_FUNCTION(aclrtMemSetPidToShareableHandle) +LOAD_FUNCTION(aclrtMemImportFromShareableHandle) aclprofStepInfoPtr init_stepinfo() { typedef aclprofStepInfoPtr(*npdInitFunc)(); @@ -929,5 +936,90 @@ aclError AclrtHostUnregister(void *ptr) return func(ptr); } +aclError AclrtIpcMemGetExportKey(void *devPtr, size_t size, char *name, size_t len) +{ + typedef aclError (*AclrtIpcMemGetExportKey)(void *, size_t, char *, size_t); + static AclrtIpcMemGetExportKey func = nullptr; + if (func == nullptr) { + func = (AclrtIpcMemGetExportKey) GET_FUNC(aclrtIpcMemGetExportKey); + } + + TORCH_CHECK(func, "Failed to find function aclrtIpcMemGetExportKey", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(devPtr, size, name, len); +} + +aclError AclrtIpcMemSetImportPid(const char *name, int32_t pid[], int num) +{ + typedef aclError 
(*AclrtIpcMemSetImportPid)(const char *, int32_t[], int); + static AclrtIpcMemSetImportPid func = nullptr; + if (func == nullptr) { + func = (AclrtIpcMemSetImportPid) GET_FUNC(aclrtIpcMemSetImportPid); + } + + TORCH_CHECK(func, "Failed to find function aclrtIpcMemSetImportPid", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(name, pid, num); +} + +aclError AclrtIpcMemImportByKey(void **devPtr, const char *name) +{ + typedef aclError (*AclrtIpcMemImportByKey)(void **, const char *); + static AclrtIpcMemImportByKey func = nullptr; + if (func == nullptr) { + func = (AclrtIpcMemImportByKey) GET_FUNC(aclrtIpcMemImportByKey); + } + + TORCH_CHECK(func, "Failed to find function aclrtIpcMemImportByKey", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(devPtr, name); +} + +aclError AclrtIpcMemClose(const char *name) +{ + typedef aclError (*AclrtIpcMemClose)(const char *); + static AclrtIpcMemClose func = nullptr; + if (func == nullptr) { + func = (AclrtIpcMemClose) GET_FUNC(aclrtIpcMemClose); + } + + TORCH_CHECK(func, "Failed to find function aclrtIpcMemClose", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(name); +} + +aclError AclrtMemExportToShareableHandle(aclrtDrvMemHandle handle, aclrtMemHandleType handleType, + uint64_t flags, uint64_t *shareableHandle) +{ + typedef aclError (*AclrtMemExportToShareableHandle)(aclrtDrvMemHandle, aclrtMemHandleType, uint64_t, uint64_t *); + static AclrtMemExportToShareableHandle func = nullptr; + if (func == nullptr) { + func = (AclrtMemExportToShareableHandle) GET_FUNC(aclrtMemExportToShareableHandle); + } + + TORCH_CHECK(func, "Failed to find function aclrtMemExportToShareableHandle", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(handle, handleType, flags, shareableHandle); +} + +aclError AclrtMemSetPidToShareableHandle(uint64_t shareableHandle, int32_t *pid, size_t pidNum) +{ + typedef aclError (*AclrtMemSetPidToShareableHandle)(uint64_t, int32_t *, size_t); + static AclrtMemSetPidToShareableHandle func = nullptr; + if (func == nullptr) { + func = (AclrtMemSetPidToShareableHandle) GET_FUNC(aclrtMemSetPidToShareableHandle); + } + + TORCH_CHECK(func, "Failed to find function aclrtMemSetPidToShareableHandle", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(shareableHandle, pid, pidNum); +} + +aclError AclrtMemImportFromShareableHandle(uint64_t shareableHandle, int32_t deviceId, aclrtDrvMemHandle *handle) +{ + typedef aclError (*AclrtMemImportFromShareableHandle)(uint64_t, int32_t, aclrtDrvMemHandle *); + static AclrtMemImportFromShareableHandle func = nullptr; + if (func == nullptr) { + func = (AclrtMemImportFromShareableHandle) GET_FUNC(aclrtMemImportFromShareableHandle); + } + + TORCH_CHECK(func, "Failed to find function aclrtMemImportFromShareableHandle", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(shareableHandle, deviceId, handle); +} + } // namespace acl } // namespace c10 diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index fe567a77ae..efea001767 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -228,5 +228,20 @@ aclError AclrtHostRegister(void *ptr, uint64_t size, aclrtHostRegisterType type, */ aclError AclrtHostUnregister(void *ptr); +aclError AclrtIpcMemGetExportKey(void *devPtr, size_t size, char *name, size_t len); + +aclError AclrtIpcMemSetImportPid(const char *name, int32_t pid[], int num); + +aclError AclrtIpcMemImportByKey(void **devPtr, const char *name); + +aclError AclrtIpcMemClose(const char *name); + +aclError 
AclrtMemExportToShareableHandle(aclrtDrvMemHandle handle, aclrtMemHandleType handleType, + uint64_t flags, uint64_t *shareableHandle); + +aclError AclrtMemSetPidToShareableHandle(uint64_t shareableHandle, int32_t *pid, size_t pidNum); + +aclError AclrtMemImportFromShareableHandle(uint64_t shareableHandle, int32_t deviceId, aclrtDrvMemHandle *handle); + } // namespace acl } // namespace c10_npu diff --git a/torch_npu/csrc/ipc/CMakeLists.txt b/torch_npu/csrc/ipc/CMakeLists.txt new file mode 100644 index 0000000000..2c70da051f --- /dev/null +++ b/torch_npu/csrc/ipc/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB _IPC_SRCS *.cpp) + +LIST(APPEND IPC_SRCS ${_IPC_SRCS}) + +# Pass to parent +set(IPC_SRCS ${IPC_SRCS} PARENT_SCOPE) \ No newline at end of file diff --git a/torch_npu/csrc/ipc/NPUIPCTypes.cpp b/torch_npu/csrc/ipc/NPUIPCTypes.cpp new file mode 100644 index 0000000000..b18b6e2f2e --- /dev/null +++ b/torch_npu/csrc/ipc/NPUIPCTypes.cpp @@ -0,0 +1,252 @@ +#include +#include +#include +#include +#include +#include +#include "torch_npu/csrc/core/npu/NPUGuard.h" +#include "torch_npu/csrc/ipc/NPUIPCTypes.h" + +#include "third_party/acl/inc/acl/acl_base.h" +#include "third_party/acl/inc/acl/acl_rt.h" + +namespace torch_npu { +namespace ipc { + +namespace { + +void warnProducerTerminatedBeforeSharedTensorsReleased() +{ + static bool warned = false; + if (!warned) { + LOG(WARNING) + << "Producer process has been terminated before all shared NPU tensors released. See Note [Sharing NPU tensors]"; + warned = true; + } +} + +struct NpuIPCGlobalEntities { + // This class is used as a singleton (see npu_ipc_global_entities) + // This variable is used to track its lifetime to avoid accessing it + // after it was destroyed which would lead to segmentation faults + // Note that a trvial type is used which doesn't suffer from construction + // and destruction order issues + static bool alive; + + std::mutex ref_counters_mutex_; + std::atomic sync_events_used_{0}; + std::map> + ref_counters_files_; + std::shared_ptr next_available_ref_counters_file_; + NpuIPCSentDataLimbo NpuIPCSentDataLimbo_; + + NpuIPCGlobalEntities() + { + alive = true; + } + + ~NpuIPCGlobalEntities() + { + NpuIPCSentDataLimbo_.collect(); + safe_clean_current_file(); + if (next_available_ref_counters_file_) { + warnProducerTerminatedBeforeSharedTensorsReleased(); + } + alive = false; + } + + void safe_clean_current_file() + { + std::lock_guard lock(ref_counters_mutex_); + if (next_available_ref_counters_file_ && + next_available_ref_counters_file_->offsets_in_use() == 0) { + ref_counters_files_.erase(next_available_ref_counters_file_->handle()); + next_available_ref_counters_file_.reset(); + } + } +}; + +bool NpuIPCGlobalEntities::alive = false; +NpuIPCGlobalEntities npu_ipc_global_entities; + +NpuIPCSentDataLimbo::~NpuIPCSentDataLimbo() +{ + collect(); + if (size() > 0) { + warnProducerTerminatedBeforeSharedTensorsReleased(); + } +} + +bool NpuIPCSentDataLimbo::collect() +{ + bool freed_memory = false; + std::vector> reset_blocks; + { + // Begin critical section to modify shared blocks + std::lock_guard lock(limbo_mutex_); + std::vector> kept_blocks; + for (auto& sd : shared_blocks_) { + if (sd->counter_value() > 0) { + kept_blocks.push_back(std::move(sd)); + } else { + freed_memory = true; + reset_blocks.push_back(std::move(sd)); + } + } + shared_blocks_ = std::move(kept_blocks); + } + // Need to reset blocks out of the critical section here, otherwise it + // deadlocks. 
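+    // (NpuIPCCollect, registered at the end of this file as the "npu_ipc_collect"
+    // free-memory callback, re-enters this limbo and takes limbo_mutex_.)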
+ for (auto& sd : reset_blocks) { + sd.reset(); + } + return freed_memory; +} + +void NpuIPCSentDataLimbo::add(std::unique_ptr shared_block) +{ + std::lock_guard lock(limbo_mutex_); + static bool warned = false; + if (shared_blocks_.size() > NPU_IPC_WARN_AFTER_X_BLOCKS_IN_LIMBO && + !warned) { + LOG(WARNING) + << "Producer process tried to deallocate over " + << NPU_IPC_WARN_AFTER_X_BLOCKS_IN_LIMBO + << " memory blocks referred by consumer processes. Deallocation might be significantly slowed down. " + << "We assume it will never going to be the case."; + warned = true; + } + shared_blocks_.push_back(std::move(shared_block)); +} + +uint64_t NpuIPCSentDataLimbo::size() +{ + std::lock_guard lock(limbo_mutex_); + return shared_blocks_.size(); +} + +void NpuIPCSentDataDelete(void* ptr) +{ + std::unique_ptr sent_data( + static_cast(ptr)); + if (!NpuIPCGlobalEntities::alive) { + return; + } + if (sent_data->counter_value() > 0) { + npu_ipc_global_entities.NpuIPCSentDataLimbo_.add(std::move(sent_data)); + } + npu_ipc_global_entities.NpuIPCSentDataLimbo_.collect(); +} + +void ReturnRefCounter(const std::string& handle, uint64_t offset /* unused */) +{ + if (!NpuIPCGlobalEntities::alive) { + return; + } + std::lock_guard lock( + npu_ipc_global_entities.ref_counters_mutex_); + auto& map = npu_ipc_global_entities.ref_counters_files_; + auto it = map.find(handle); + if (it != map.end()) { + it->second->return_offset(offset); + if (it->second->offsets_in_use() == 0 && !it->second->have_offsets()) { + map.erase(handle); + } + } +} + +} // namespace + +NpuIPCSentData::NpuIPCSentData( + std::string handle, + uint64_t offset, + uint64_t* counter_ptr, + at::Device device) + : handle_(std::move(handle)), + offset_(offset), + counter_ptr_(counter_ptr), + device_(device) +{ + if (npu_ipc_global_entities.sync_events_used_.load() < + NPU_IPC_MAXIMUM_EVENTS_TO_USE) { + } else { + auto stream = c10_npu::getCurrentNPUStream(device.index()); + c10_npu::stream_synchronize(stream); + event_ = nullptr; + event_sync_required_ = false; + } +} + +NpuIPCSentData::~NpuIPCSentData() +{ + ReturnRefCounter(handle_, offset_); + try { + if (event_sync_required_) { + } + } catch (...) 
{ /* No throw */ + } +} + +uint64_t NpuIPCSentData::counter_value() +{ + return *counter_ptr_; +} + +at::DataPtr GetNewRefCountedSentData(void* data, at::Device device) +{ + { + std::lock_guard lock( + npu_ipc_global_entities.ref_counters_mutex_); + if (!npu_ipc_global_entities.next_available_ref_counters_file_) { + std::string ref_counter_handle = at::NewProcessWideShmHandle(); + + int flags = + at::ALLOCATOR_MAPPED_SHAREDMEM | at::ALLOCATOR_MAPPED_EXCLUSIVE; + at::DataPtr sptr = at::RefcountedMapAllocator::makeDataPtr( + ref_counter_handle.c_str(), + flags, + sizeof(int64_t) * NPU_IPC_REF_COUNTER_FILE_SIZE, + nullptr); + auto rc = std::make_shared( + ref_counter_handle, NPU_IPC_REF_COUNTER_FILE_SIZE, std::move(sptr)); + npu_ipc_global_entities.ref_counters_files_[ref_counter_handle] = rc; + npu_ipc_global_entities.next_available_ref_counters_file_ = rc; + } + } + npu_ipc_global_entities.next_available_ref_counters_file_->set_counter(1); + auto sent_data = new NpuIPCSentData( + npu_ipc_global_entities.next_available_ref_counters_file_->handle(), + npu_ipc_global_entities.next_available_ref_counters_file_->get_offset(), + npu_ipc_global_entities.next_available_ref_counters_file_->counter_ptr(), + device); + + npu_ipc_global_entities.next_available_ref_counters_file_->rotate_offset(); + if (!npu_ipc_global_entities.next_available_ref_counters_file_ + ->have_offsets()) { + npu_ipc_global_entities.next_available_ref_counters_file_.reset(); + } + return at::DataPtr(data, sent_data, NpuIPCSentDataDelete, device); +} + +bool NpuIPCCollect() +{ + if (!NpuIPCGlobalEntities::alive) { + return true; + } + bool freed_memory = npu_ipc_global_entities.NpuIPCSentDataLimbo_.collect(); + if (npu_ipc_global_entities.NpuIPCSentDataLimbo_.size() == 0) { + npu_ipc_global_entities.safe_clean_current_file(); + } + return freed_memory; +} + +} // namespace ipc +} // namespace torch_npu + +namespace c10_npu { +namespace NPUCachingAllocator { + +REGISTER_FREE_MEMORY_CALLBACK("npu_ipc_collect", NpuIPCCollectCallback); + +} // namespace NPUCachingAllocator +} // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/ipc/NPUIPCTypes.h b/torch_npu/csrc/ipc/NPUIPCTypes.h new file mode 100644 index 0000000000..5156af2da4 --- /dev/null +++ b/torch_npu/csrc/ipc/NPUIPCTypes.h @@ -0,0 +1,150 @@ +#pragma once +#include + +#include "torch_npu/csrc/core/npu/NPUMacros.h" +#include "torch_npu/csrc/core/npu/NPUFunctions.h" +#include "torch_npu/csrc/core/npu/NPUStream.h" +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" + +namespace torch_npu { +namespace ipc { + +TORCH_NPU_API bool NpuIPCCollect(); + +struct NpuIPCReceivedData final { + NpuIPCReceivedData() = default; + explicit NpuIPCReceivedData(std::shared_ptr shared_ptr) + : shared_ptr_(std::move(shared_ptr)) {} + std::shared_ptr shared_ptr_; +}; + +struct NpuIPCSentData final { + std::string handle_; + uint64_t offset_; + uint64_t* counter_ptr_; // Reference counter shared memory block + at::DataPtr original_ptr_; // Original mem allocation + char* event_; // Sync event + bool event_sync_required_; + at::Device device_; + + NpuIPCSentData( + std::string handle, + uint64_t offset, + uint64_t* counter_ptr, + at::Device device); + ~NpuIPCSentData(); + + uint64_t counter_value(); + std::string handle() + { + return handle_; + } + uint64_t offset() + { + return offset_; + } + void set_original_ptr(at::DataPtr data_ptr) + { + original_ptr_ = std::move(data_ptr); + } +}; + +TORCH_NPU_API at::DataPtr GetNewRefCountedSentData( + void* data, + at::Device 
device); + +namespace { + +inline constexpr int64_t NPU_IPC_REF_COUNTER_FILE_SIZE = 10000; +inline constexpr int64_t NPU_IPC_WARN_AFTER_X_BLOCKS_IN_LIMBO = 1000; +inline constexpr int64_t NPU_IPC_MAXIMUM_EVENTS_TO_USE = 0; + +// All to be deleted data blocks with non zero reference counter goes there +struct NpuIPCSentDataLimbo final { + ~NpuIPCSentDataLimbo(); + bool collect(); + void add(std::unique_ptr shared_block); + uint64_t size(); + +private: + std::vector> shared_blocks_; + std::mutex limbo_mutex_; +}; + +struct NpuIPCRefCountersFile final { + NpuIPCRefCountersFile( + std::string handle, + uint64_t size, + at::DataPtr data_ptr) + : size_(size), + handle_(std::move(handle)), + refcounted_shared_mem_(std::move(data_ptr)) {} + + uint64_t* counter_ptr() + { + return static_cast(refcounted_shared_mem_.get()) + next_offset_; + } + + void set_counter(uint64_t value) + { + *counter_ptr() = value; + } + + bool have_offsets() + { + return next_offset_ < size_; + } + + bool offsets_in_use() + { + return used_slots_; + } + + uint64_t get_offset() + { + return next_offset_; + } + + void rotate_offset() + { + next_offset_++; + used_slots_++; + } + + void return_offset(uint64_t offset /* unused */) + { + used_slots_--; + } + + std::string handle() + { + return handle_; + } + +private: + uint64_t next_offset_{0}; + uint64_t size_; + uint64_t used_slots_{0}; + std::string handle_; + at::DataPtr refcounted_shared_mem_; +}; + +} // namespace +} // namespace ipc +} // namespace torch_npu + +namespace c10_npu { +namespace NPUCachingAllocator { +namespace { + +class NpuIPCCollectCallback : public FreeMemoryCallback { +public: + bool Execute() override + { + return torch_npu::ipc::NpuIPCCollect(); + } +}; + +} // namespace +} // namespace NPUCachingAllocator +} // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/ipc/StorageSharing.cpp b/torch_npu/csrc/ipc/StorageSharing.cpp new file mode 100644 index 0000000000..cd7b9e372a --- /dev/null +++ b/torch_npu/csrc/ipc/StorageSharing.cpp @@ -0,0 +1,301 @@ +#ifndef BUILD_LIBTORCH + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "torch_npu/csrc/core/NPUStorageImpl.h" +#include "torch_npu/csrc/core/NPUBridge.h" +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" +#include "torch_npu/csrc/core/npu/NPUGuard.h" + +#include "torch_npu/csrc/ipc/NPUIPCTypes.h" +#include "torch_npu/csrc/ipc/StorageSharing.h" + +#include "third_party/acl/inc/acl/acl_base.h" +#include "third_party/acl/inc/acl/acl_rt.h" + +namespace torch_npu { +namespace reductions { + +static PyObject* THNPStorage_shareNpu(PyObject* self, PyObject* args) +{ + HANDLE_TH_ERRORS + const auto& storage = THPStorage_Unpack(args); + TORCH_CHECK( + storage.device_type() == at::DeviceType::PrivateUse1, + "_share_npu_: only available on NPU.", PTA_ERROR(ErrCode::PARAM)); + c10::StorageImpl* storage_impl = storage.unsafeGetStorageImpl(); + + if (storage_impl->received_cuda()) { + AT_ERROR( + "Supported to send NPU tensor received from another process; other is not currently supported. 
Consider cloning before sending."); + } + + at::DeviceGuard device_guard(storage.device()); + THPObjectPtr tuple(PyTuple_New(8)); + THPObjectPtr device(THPUtils_packInt32(storage.device().index())); + THPObjectPtr _handle(Py_None); + Py_INCREF(Py_None); + THPObjectPtr size_bytes(THPUtils_packUInt64(storage.nbytes())); + THPObjectPtr _offset_bytes(THPUtils_packInt32(0)); + THPObjectPtr _ref_counter(Py_None); + Py_INCREF(Py_None); + THPObjectPtr _ref_counter_offset(THPUtils_packInt32(0)); + THPObjectPtr _event_handle(Py_None); + Py_INCREF(Py_None); + THPObjectPtr _event_sync_required(Py_None); + Py_INCREF(Py_None); + if (storage.data()) { + auto shandle = c10_npu::NPUCachingAllocator::shareIpcHandle(storage.mutable_data()); + _handle = PyBytes_FromStringAndSize( + shandle.handle.c_str(), (Py_ssize_t)shandle.handle.size()); + _offset_bytes = PyLong_FromSsize_t((Py_ssize_t)shandle.offset); + + at::DataPtr sent_data_ptr = torch_npu::ipc::GetNewRefCountedSentData( + storage.mutable_data(), storage.device()); + auto old_data_ptr = storage.set_data_ptr(std::move(sent_data_ptr)); + auto sent_data = + static_cast(storage.data_ptr().get_context()); + sent_data->set_original_ptr(std::move(old_data_ptr)); + _ref_counter = PyBytes_FromString((sent_data->handle()).c_str()); + _ref_counter_offset = THPUtils_packUInt64(sent_data->offset()); + + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + aclrtNotify ipc_event_handle; + + if (sent_data->event_sync_required_) { + // TO BE DONE + } + + _event_handle = PyBytes_FromStringAndSize( + (char*)&ipc_event_handle, sizeof(aclrtNotify)); + _event_sync_required = PyBool_FromLong(sent_data->event_sync_required_); + } + + if (!tuple || !device || !_handle || !size_bytes || !_offset_bytes || + !_event_handle) { + return nullptr; + } + PyTuple_SET_ITEM(tuple.get(), 0, device.release()); + PyTuple_SET_ITEM(tuple.get(), 1, _handle.release()); + // Size(in bytes) of the real storage, note this is not the size of basePtr + // memory block. + PyTuple_SET_ITEM(tuple.get(), 2, size_bytes.release()); + // Offset(in bytes) of the real storage in the basePtr memory block. + // NB: this offset MUST be in bytes instead of numel, since we use + // (storage_handle, offset) + // as key in shared_cache(multiprocessing/reduction.py). + // Offset in numel cannot uniquely represent a storage. + PyTuple_SET_ITEM(tuple.get(), 3, _offset_bytes.release()); + PyTuple_SET_ITEM(tuple.get(), 4, _ref_counter.release()); + PyTuple_SET_ITEM(tuple.get(), 5, _ref_counter_offset.release()); + PyTuple_SET_ITEM(tuple.get(), 6, _event_handle.release()); + PyTuple_SET_ITEM(tuple.get(), 7, _event_sync_required.release()); + return tuple.release(); + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPStorage_releaseIPCCounter(PyObject* _unused, PyObject* args) +{ + HANDLE_TH_ERRORS + TORCH_CHECK(PyTuple_GET_SIZE(args) == 2, "tuple of 2 items expected", PTA_ERROR(ErrCode::PARAM)); + + PyObject* _ref_counter = PyTuple_GET_ITEM(args, 0); + PyObject* _ref_counter_offset = PyTuple_GET_ITEM(args, 1); + if (!(PyBytes_Check(_ref_counter) && THPUtils_checkLong(_ref_counter_offset))) { + THPUtils_invalidArguments( + args, + nullptr, + "_release_ipc_counter in NPU mode", + 1, + "(bytes _ref_counter, int _ref_counter_offset)"); + return nullptr; + } + std::string ref_counter_handle = PyBytes_AS_STRING(_ref_counter); + ptrdiff_t ref_counter_offset = + (ptrdiff_t)THPUtils_unpackLong(_ref_counter_offset); + // We don't want to break existing code, so resource deletion is best + // effort basis. 
Exception expected if producer process terminated + // before consumer released data. + int flags = at::ALLOCATOR_MAPPED_SHAREDMEM | at::ALLOCATOR_MAPPED_NOCREATE; + try { + auto sptr = at::RefcountedMapAllocator::makeDataPtr( + ref_counter_handle.c_str(), + flags, + sizeof(int64_t) * torch_npu::ipc::NPU_IPC_REF_COUNTER_FILE_SIZE, + nullptr); + *(static_cast(sptr.get()) + ref_counter_offset) -= 1; + } catch (c10::Error& err) { + // Already warned inside of producer process + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +static std::string THNPStorage_bytesAsHandleString(PyObject* handle) +{ + HANDLE_TH_ERRORS + char* buffer = nullptr; + Py_ssize_t handle_size = 0; + if (PyBytes_AsStringAndSize(handle, &buffer, &handle_size) == -1) { + TORCH_CHECK(handle_size == ACL_IPC_HANDLE_SIZE, "incorrect handle", PTA_ERROR(ErrCode::PARAM)); + } + return std::string(buffer, handle_size); + END_HANDLE_TH_ERRORS_RET("") +} + +static PyObject* THNPStorage_newSharedNpu(PyObject* _unused, PyObject* args) +{ + HANDLE_TH_ERRORS + TORCH_CHECK(PyTuple_GET_SIZE(args) == 8, "tuple of 8 items expected", PTA_ERROR(ErrCode::PARAM)); + PyObject* _device = PyTuple_GET_ITEM(args, 0); + PyObject* _handle = PyTuple_GET_ITEM(args, 1); + PyObject* _size_bytes = PyTuple_GET_ITEM(args, 2); + PyObject* _offset_bytes = PyTuple_GET_ITEM(args, 3); + PyObject* _ref_counter = PyTuple_GET_ITEM(args, 4); + PyObject* _ref_counter_offset = PyTuple_GET_ITEM(args, 5); + PyObject* _event_handle = PyTuple_GET_ITEM(args, 6); + PyObject* _event_sync_required = PyTuple_GET_ITEM(args, 7); + if (!(THPUtils_checkLong(_device) && THPUtils_checkLong(_size_bytes) && + PyBytes_Check(_handle) && PyBytes_Check(_ref_counter) && + PyBytes_Check(_event_handle) && THPUtils_checkLong(_offset_bytes) && + THPUtils_checkLong(_ref_counter_offset) && + PyBool_Check(_event_sync_required))) { + THPUtils_invalidArguments( + args, + nullptr, + "_new_shared in NPU mode", + 1, + "(int device, bytes handle, int storage_size_bytes, int storage_offset_bytes, bytes _ref_counter, int _ref_counter_offset, bytes event_handle, bool event_sync_required)"); + return nullptr; + } + + size_t storage_size = + (size_t)THPUtils_unpackLong(_size_bytes) / sizeof(uint8_t); + ptrdiff_t storage_offset_bytes = + (ptrdiff_t)THPUtils_unpackLong(_offset_bytes); + + const auto device = c10::checked_convert( + THPUtils_unpackLong(_device), "c10::DeviceIndex"); + c10_npu::NPUGuard device_guard(device); + + if (PyObject_IsTrue(_event_sync_required)) { + // TO BE DONE + } + + std::string s_handle = THNPStorage_bytesAsHandleString(_handle); + if (s_handle.empty()) { + return nullptr; + } + std::shared_ptr basePtr = + c10_npu::NPUCachingAllocator::getIpcDevPtr(s_handle); + + // Offset the basePtr to reconstruct the real storage + // devPtr = basePtr + storage_offset + void* devPtr = basePtr.get(); + devPtr = (char*)devPtr + storage_offset_bytes; + + std::string ref_counter_handle = PyBytes_AS_STRING(_ref_counter); + ptrdiff_t ref_counter_offset = + (ptrdiff_t)THPUtils_unpackLong(_ref_counter_offset); + + struct IpcDeleterContext { + std::string ref_counter_handle; + ptrdiff_t ref_counter_offset; + int64_t device; + torch_npu::ipc::NpuIPCReceivedData received_data; + }; + + auto ctx = std::make_unique(); + ctx->ref_counter_handle = std::move(ref_counter_handle); + ctx->ref_counter_offset = ref_counter_offset; + ctx->device = device; + ctx->received_data.shared_ptr_ = std::move(basePtr); + + auto cur_device = c10_npu::current_device(); + c10::DataPtr data_ptr( + devPtr, + ctx.release(), + 
+[](void* ctx_) { + std::unique_ptr ctx( + static_cast(ctx_)); + + ctx->received_data.shared_ptr_.reset(); + + try { + c10_npu::stream_synchronize( + c10_npu::getCurrentNPUStream(ctx->device)); + } catch (c10::Error& err) { + // Already warned inside of producer process + } + + int flags = + at::ALLOCATOR_MAPPED_SHAREDMEM | at::ALLOCATOR_MAPPED_NOCREATE; + try { + auto sptr = at::RefcountedMapAllocator::makeDataPtr( + ctx->ref_counter_handle.c_str(), + flags, + sizeof(int64_t) * torch_npu::ipc::NPU_IPC_REF_COUNTER_FILE_SIZE, + nullptr); + *(static_cast(sptr.get()) + ctx->ref_counter_offset) -= 1; + } catch (c10::Error& err) { + // Already warned inside of producer process + } + }, + at::Device(at::DeviceType::PrivateUse1, cur_device)); + + c10::intrusive_ptr base = c10::make_intrusive( + c10::StorageImpl::use_byte_size_t(), + storage_size, + std::move(data_ptr), + nullptr, + false); + + base->set_resizable(false); + base->set_received_cuda(true); + + return THPStorage_NewWithStorage( + THPStorageClass, + std::move(base), + c10::impl::PyInterpreterStatus::TAGGED_BY_US); + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPStorage_isShared(PyObject* self, PyObject* arg) +{ + const auto& storage = THPStorage_Unpack(self); + if (storage.device_type() == at::kPrivateUse1) { + Py_RETURN_TRUE; + } + if (at::MapAllocator::fromDataPtr(storage.data_ptr()) || + THManagedMapAllocator::fromDataPtr(storage.data_ptr())) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } +} + +static struct PyMethodDef TorchReductionsMethods[] = { + {"_share_npu_", THNPStorage_shareNpu, METH_O, nullptr}, + {"_release_ipc_counter_npu", THNPStorage_releaseIPCCounter, METH_VARARGS, nullptr}, + {"_new_shared_npu", THNPStorage_newSharedNpu, METH_VARARGS, nullptr}, + {"_is_shared", THNPStorage_isShared, METH_O, nullptr}, + {nullptr, nullptr, 0, nullptr}, +}; + +PyMethodDef* reductions_functions() +{ + return TorchReductionsMethods; +} + +} // namespace reductions +} // namespace torch_npu + +#endif \ No newline at end of file diff --git a/torch_npu/csrc/ipc/StorageSharing.h b/torch_npu/csrc/ipc/StorageSharing.h new file mode 100644 index 0000000000..a38e0c0ad6 --- /dev/null +++ b/torch_npu/csrc/ipc/StorageSharing.h @@ -0,0 +1,15 @@ +#ifndef BUILD_LIBTORCH +#pragma once + +#include +#include "torch_npu/csrc/core/npu/NPUMacros.h" + +namespace torch_npu { +namespace reductions { + +TORCH_NPU_API PyMethodDef* reductions_functions(); + +} // namespace reductions +} // namespace torch_npu + +#endif \ No newline at end of file diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 614ef73506..09e158364b 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -27,6 +27,8 @@ #include "torch_npu/csrc/core/npu/NPUStream.h" #include "torch_npu/csrc/core/npu/NPUQueue.h" #include "torch_npu/csrc/core/npu/NPUAffinityController.h" +#include "torch_npu/csrc/core/npu/NPUPeerToPeerAccess.h" +#include "torch_npu/csrc/core/npu/NPUIPCPidManager.h" #include "torch_npu/csrc/core/npu/NPUGuard.h" #include "torch_npu/csrc/core/npu/NpuVariables.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" @@ -1661,6 +1663,34 @@ static PyObject* THNPModule_is_gte_cann_version(PyObject* self, PyObject *args) END_HANDLE_TH_ERRORS } +static PyObject* THNPModule_add_ipc_pid(PyObject* self, PyObject *args) +{ + HANDLE_TH_ERRORS + int pid; + if (!PyArg_ParseTuple(args, "i", &pid)) { + throw torch::TypeError("Pybind failed to parse parameters." 
+ PTA_ERROR(ErrCode::TYPE));
+    }
+    torch_npu::ipc::addPid(pid);
+
+    Py_RETURN_NONE;
+    END_HANDLE_TH_ERRORS
+}
+
+static PyObject* THNPModule_add_p2p_access(PyObject* self, PyObject *args)
+{
+    HANDLE_TH_ERRORS
+    int src_dev;
+    int dst_dev;
+    if (!PyArg_ParseTuple(args, "ii", &src_dev, &dst_dev)) {
+        throw torch::TypeError("Pybind failed to parse parameters." + PTA_ERROR(ErrCode::TYPE));
+    }
+    bool warning_flag = false;
+    at_npu::native::NpuP2pCtrl::get_instance().get_p2p_access(src_dev, dst_dev, warning_flag);
+
+    Py_RETURN_NONE;
+    END_HANDLE_TH_ERRORS
+}
+
 static struct PyMethodDef THNPModule_methods[] = {
     {"_npu_init", (PyCFunction)THNPModule_initExtension, METH_NOARGS, nullptr},
     {"_npu_set_run_yet_variable_to_false", (PyCFunction)THNPModule_set_run_yet_variable_to_false_wrap, METH_NOARGS, nullptr},
@@ -1722,6 +1752,8 @@ static struct PyMethodDef THNPModule_methods[] = {
     {"_npu_clear_fft_plan_cache", (PyCFunction)THNPModule_npu_clear_fft_plan_cache, METH_NOARGS, nullptr},
     {"_get_cann_version", (PyCFunction)THNPModule_get_cann_version, METH_O, nullptr},
     {"_is_gte_cann_version", (PyCFunction)THNPModule_is_gte_cann_version, METH_VARARGS, nullptr},
+    {"_add_ipc_pid", (PyCFunction)THNPModule_add_ipc_pid, METH_VARARGS, nullptr},
+    {"_add_p2p_access", (PyCFunction)THNPModule_add_p2p_access, METH_VARARGS, nullptr},
     {nullptr}};
 
 TORCH_NPU_API PyMethodDef* THNPModule_get_methods()
diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp
index ef07cf8bef..660c69a89d 100644
--- a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp
+++ b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp
@@ -300,6 +300,24 @@ void NPUPluggableAllocator::copy_data(void* dest, const void* src, std::size_t c
 {
     default_copy_data(dest, src, count);
 }
+
+std::shared_ptr<void> NPUPluggableAllocator::getIpcDevPtr(std::string handle)
+{
+    TORCH_NPU_WARN(
+        "NPUPluggableAllocator does not yet support getIpcDevPtr. "
+        "If you need it, please file an issue describing your use case.");
+    auto sp = std::shared_ptr<void>();
+    return sp;
+}
+
+c10_npu::NPUCachingAllocator::ShareableHandle NPUPluggableAllocator::shareIpcHandle(void* ptr)
+{
+    TORCH_NPU_WARN(
+        "NPUPluggableAllocator does not yet support shareIpcHandle. 
" + "If you need it, please file an issue describing your use case."); + return c10_npu::NPUCachingAllocator::ShareableHandle{0, nullptr}; +} + void NPUPluggableAllocator::recordHistory( bool enabled, c10_npu::NPUCachingAllocator::CreateContextFn context_recorder, diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.h b/torch_npu/csrc/npu/NPUPluggableAllocator.h index 04f1d909be..a3691d48ee 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.h +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.h @@ -83,6 +83,8 @@ struct NPUPluggableAllocator void FreeDeviceCachedMemory(int device) override; std::string name() override; void copy_data(void* dest, const void* src, std::size_t count) const final; + std::shared_ptr getIpcDevPtr(std::string handle) override; + c10_npu::NPUCachingAllocator::ShareableHandle shareIpcHandle(void*) override; void recordHistory( bool enabled, c10_npu::NPUCachingAllocator::CreateContextFn context_recorder, diff --git a/torch_npu/multiprocessing/reductions.py b/torch_npu/multiprocessing/reductions.py new file mode 100644 index 0000000000..cc40949f79 --- /dev/null +++ b/torch_npu/multiprocessing/reductions.py @@ -0,0 +1,178 @@ +__all__ = ["rebuild_npu_tensor"] + +import multiprocessing +import torch +from torch.multiprocessing.reductions import ( + shared_cache, + rebuild_storage_filename, + rebuild_storage_empty, + rebuild_storage_fd, + StorageWeakRef, + fd_id, + rebuild_tensor, + storage_from_cache, +) + +import torch_npu + + +def rebuild_npu_tensor( + tensor_cls, + tensor_size, + tensor_stride, + tensor_offset, + storage_cls, + dtype, + storage_device, + storage_handle, + storage_size_bytes, + storage_offset_bytes, + requires_grad, + ref_counter_handle, + ref_counter_offset, + event_handle, + event_sync_required, +): + # If storage_handle is None, storage points to nullptr. + if storage_handle is None or storage_size_bytes == 0: + storage = storage_cls(0, dtype=dtype, device=storage_device, _internal=True) + else: + storage = storage_from_cache( + storage_cls, (storage_handle, storage_offset_bytes) + ) + if storage is None: + torch_npu.npu._lazy_init() + storage = storage_cls._new_shared_npu( + storage_device, + storage_handle, + storage_size_bytes, + storage_offset_bytes, + ref_counter_handle, + ref_counter_offset, + event_handle, + event_sync_required, + ) + shared_cache[(storage_handle, storage_offset_bytes)] = StorageWeakRef( + storage + ) + else: + # We already ref counting this Storage, but producer needs new ref-counters to be released. + storage_cls._release_ipc_counter_npu( + ref_counter_handle, ref_counter_offset, device=storage_device + ) + + _storage = ( + storage + if isinstance(storage, torch.UntypedStorage) + else storage._untyped_storage + ) + + t = torch._utils._rebuild_tensor( + torch.storage.TypedStorage(wrap_storage=_storage, dtype=dtype, _internal=True), + tensor_offset, + tensor_size, + tensor_stride, + ) + + if tensor_cls == torch.nn.parameter.Parameter: + # It is crucial for integer tensors to receive + # the requires_grad=False as an argument in the constructor + t = torch.nn.parameter.Parameter(t, requires_grad=requires_grad) + else: + t.requires_grad = requires_grad + + return t + + +def _npu_reduce_tensor(tensor): + storage = tensor._typed_storage() + + if tensor.requires_grad and not tensor.is_leaf: + raise RuntimeError( + "Cowardly refusing to serialize non-leaf tensor which requires_grad, " + "since autograd does not support crossing process boundaries. 
" + "If you just want to transfer the data, call detach() on the tensor " + "before serializing (e.g., putting it on the queue)." + ) + + torch._namedtensor_internals.check_serializing_named_tensor(tensor) + torch.utils.hooks.warn_if_has_hooks(tensor) + + if storage._untyped_storage.device.type == "npu": + ( + device, + handle, + storage_size_bytes, + storage_offset_bytes, + ref_counter_handle, + ref_counter_offset, + event_handle, + event_sync_required, + ) = storage._share_npu_() + tensor_offset = tensor.storage_offset() + shared_cache[handle] = StorageWeakRef(storage) + return ( + rebuild_npu_tensor, + ( + type(tensor), + tensor.size(), + tensor.stride(), + tensor_offset, # tensor offset in its storage + type(storage), + tensor.dtype, + device, + handle, # identifier which NPU allocation is the storage in. + storage_size_bytes, # size(in bytes) of the storage + storage_offset_bytes, # offset(in bytes) of the storage in the NPU allocation + tensor.requires_grad, + ref_counter_handle, + ref_counter_offset, + event_handle, + event_sync_required, + ), + ) + + # _backward_hooks purposely omitted here, see Note [Don't serialize hooks] + metadata = ( + tensor.storage_offset(), + tensor.size(), + tensor.stride(), + tensor.requires_grad, + ) + return (rebuild_tensor, (type(tensor), storage, metadata)) + + +def _npu_reduce_storage(storage): + from torch.multiprocessing import get_sharing_strategy + + if storage.is_npu: + raise RuntimeError( + "Cannot pickle NPU storage; try pickling a NPU tensor instead" + ) + elif get_sharing_strategy() == "file_system": + metadata = storage._share_filename_cpu_() + cache_key = metadata[1] + rebuild = rebuild_storage_filename + if isinstance(storage, torch.TypedStorage): + metadata += (storage.dtype,) + storage._shared_incref() + elif storage.size() == 0: + # This is special cased because Empty tensors + # (with size 0) cannot be mmapped. 
+ return (rebuild_storage_empty, (type(storage),)) + else: + fd, size = storage._share_fd_cpu_() + df = multiprocessing.reduction.DupFd(fd) + cache_key = fd_id(fd) + metadata = (df, size) + rebuild = rebuild_storage_fd # type: ignore[assignment] + + shared_cache[cache_key] = StorageWeakRef(storage) + return (rebuild, (type(storage),) + metadata) + + +def _add_reductions_methods(): + torch.multiprocessing.reductions.reduce_tensor = _npu_reduce_tensor + torch.multiprocessing.reductions.reduce_storage = _npu_reduce_storage + + torch.multiprocessing.reductions.init_reductions() \ No newline at end of file diff --git a/torch_npu/utils/storage.py b/torch_npu/utils/storage.py index 349823492f..f4475025de 100644 --- a/torch_npu/utils/storage.py +++ b/torch_npu/utils/storage.py @@ -1,7 +1,7 @@ __all__ = [] import copy -from typing import Any, Dict +from typing import Any, Dict, Union from collections import OrderedDict import torch @@ -98,6 +98,37 @@ def _deepcopy(self, memo): return self._new_wrapped_storage(copy.deepcopy(self._untyped_storage, memo)) +def _share_npu_(self, *args, **kwargs): + return torch_npu._C._share_npu_(self, *args, **kwargs) + + +def _typed_storage_share_npu_(self, *args, **kwargs): + return self._untyped_storage._share_npu_(*args, **kwargs) + + +def _new_shared_npu(*args, **kwargs): + return torch_npu._C._new_shared_npu(*args, **kwargs) + + +def _typed_storage_new_shared_npu(*args, **kwargs): + return torch.UntypedStorage._new_shared_npu(*args, **kwargs) + + +def _release_ipc_counter_npu(*args, **kwargs): + return torch_npu._C._release_ipc_counter_npu(*args, **kwargs) + + +def _typed_storage_release_ipc_counter_npu(*args, device: Union[str, torch.device] = "npu", **kwargs): + return torch.UntypedStorage._release_ipc_counter_npu(*args, **kwargs) + + def _add_storage_methods(): torch.storage.UntypedStorage.cpu = _cpu torch.storage.TypedStorage._deepcopy = _deepcopy + + setattr(torch.UntypedStorage, "_share_npu_", _share_npu_) + setattr(torch.UntypedStorage, "_new_shared_npu", _new_shared_npu) + setattr(torch.UntypedStorage, "_release_ipc_counter_npu", _release_ipc_counter_npu) + setattr(torch.TypedStorage, "_share_npu_", _typed_storage_share_npu_) + setattr(torch.TypedStorage, "_new_shared_npu", _typed_storage_new_shared_npu) + setattr(torch.TypedStorage, "_release_ipc_counter_npu", _typed_storage_release_ipc_counter_npu) \ No newline at end of file diff --git a/torch_npu/utils/unsupport_api.py b/torch_npu/utils/unsupport_api.py index 61ba27b3a2..5626e940b6 100644 --- a/torch_npu/utils/unsupport_api.py +++ b/torch_npu/utils/unsupport_api.py @@ -6,8 +6,6 @@ value: parent_module(object) """ unsupported_Tensor_api = { - "is_shared": torch.Tensor, - "share_memory_": torch.Tensor } unsupported_nn_api = { -- Gitee From efbad7c05b0974c8a595e2fba13338fd15d011fd Mon Sep 17 00:00:00 2001 From: chuboning Date: Tue, 8 Jul 2025 02:08:35 +0000 Subject: [PATCH 213/328] !22737 Revert some files Merge pull request !22737 from chuboning/v2.7.1 --- test/npu/test_tensors.py | 22 - third_party/acl/inc/acl/acl_base.h | 8 - torch_npu/__init__.py | 7 +- torch_npu/csrc/core/npu/NpuVariables.cpp | 6 +- torch_npu/csrc/core/npu/NpuVariables.h | 3 +- .../csrc/core/npu/interface/AclInterface.cpp | 3 +- .../csrc/custom_dtype/CastKernelTeOpApi.cpp | 43 -- torch_npu/csrc/custom_dtype/Init.cpp | 42 +- torch_npu/csrc/custom_dtype/Init.h | 8 - torch_npu/csrc/custom_dtype/extension.h | 12 - .../csrc/framework/utils/CalcuOpUtil.cpp | 14 +- torch_npu/utils/hif8_tensor.py | 584 ------------------ 12 files changed, 
9 insertions(+), 743 deletions(-) delete mode 100644 torch_npu/csrc/custom_dtype/CastKernelTeOpApi.cpp delete mode 100644 torch_npu/csrc/custom_dtype/extension.h delete mode 100644 torch_npu/utils/hif8_tensor.py diff --git a/test/npu/test_tensors.py b/test/npu/test_tensors.py index e5fc17ae6d..3108eb9b64 100644 --- a/test/npu/test_tensors.py +++ b/test/npu/test_tensors.py @@ -1,5 +1,4 @@ from copy import deepcopy -import unittest import numpy as np import torch import torch_npu @@ -23,16 +22,6 @@ types = [ ] -def skipIfUnsupport910_95(): - def skip_dec(func): - def wrapper(self): - if "Ascend910_95" not in torch_npu.npu.get_device_name(): - return unittest.SkipTest("Device 910_95 condition not satisfied") - return func(self) - return wrapper - return skip_dec - - def get_npu_type(type_name): if isinstance(type_name, type): type_name = '{}.{}'.format(type_name.__module__, type_name.__name__) @@ -394,16 +383,5 @@ class TestViewOps(TestCase): self.assertEqual(tensor.view(3, -1).size(), target) -class TestTensorDtype(TestCase): - @skipIfUnsupport910_95() - def test_fp8(self): - tensor1 = torch.randn([2, 2], dtype=torch.float32).npu() - tensor2 = torch.randn([2, 2], dtype=torch.float32).npu() - tensor_f8e5m2 = tensor1.to(torch.float8_e5m2) - tensor_f8e4m3fn = tensor2.to(torch.float8_e4m3fn) - self.assertEqual(tensor_f8e5m2.dtype, torch.float8_e5m2) - self.assertEqual(tensor_f8e4m3fn.dtype, torch.float8_e4m3fn) - - if __name__ == "__main__": run_tests() diff --git a/third_party/acl/inc/acl/acl_base.h b/third_party/acl/inc/acl/acl_base.h index 4178016df5..a30f21375c 100755 --- a/third_party/acl/inc/acl/acl_base.h +++ b/third_party/acl/inc/acl/acl_base.h @@ -165,14 +165,6 @@ typedef enum { ACL_INT4 = 29, ACL_UINT1 = 30, ACL_COMPLEX32 = 33, - ACL_HIFLOAT8 = 34, - ACL_FLOAT8_E5M2 = 35, - ACL_FLOAT8_E4M3FN = 36, - ACL_FLOAT8_E8M0 = 37, - ACL_FLOAT6_E3M2 = 38, - ACL_FLOAT6_E2M3 = 39, - ACL_FLOAT4_E2M1 = 40, - ACL_FLOAT4_E1M2 = 41, } aclDataType; typedef enum { diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index dccc979150..2d2a84d3ae 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -1,4 +1,4 @@ -__all__ = ["erase_stream", "matmul_checksum", "HiFloat8Tensor"] +__all__ = ["erase_stream", "matmul_checksum"] import os import sys @@ -85,7 +85,6 @@ from torch_npu.utils import _register_ops_under_dtensor_rules from torch_npu.utils.exposed_api import public_npu_functions from torch_npu.multiprocessing.reductions import _add_reductions_methods from torch_npu.npu.utils import _erase_stream as erase_stream -from torch_npu.utils.hif8_tensor import HiFloat8Tensor from torch_npu.utils._error_code import ErrCode, pta_error, _except_handler from torch_npu.asd.asd import _asd_patch from torch_npu.asd.checksum import _matmul_checksum as matmul_checksum @@ -115,10 +114,6 @@ for name in dir(torch.ops.npu): __all__.append(name) setattr(torch, name, _wrap_torch_error_func(getattr(torch.ops.npu, name))) -for name in dir(torch_npu._C._cd.DType): - if name.startswith('__') or name in ['_dir', 'name']: - continue - setattr(torch_npu, name, getattr(torch_npu._C._cd.DType, name)) all_monkey_patches = [ ["nn.functional", npu_functional], diff --git a/torch_npu/csrc/core/npu/NpuVariables.cpp b/torch_npu/csrc/core/npu/NpuVariables.cpp index bfe2ee7245..24a2a8da62 100644 --- a/torch_npu/csrc/core/npu/NpuVariables.cpp +++ b/torch_npu/csrc/core/npu/NpuVariables.cpp @@ -47,14 +47,10 @@ void SetSocVersion(const char* const socVersion) } SocVersion curSocVersion = SocVersion::UnsupportedSocVersion; - 
std::string inputVersion = socVersion; - std::string ascend95Version = "Ascend910_95"; auto const& iter = socVersionMap.find(socVersion); if (iter != socVersionMap.end()) { curSocVersion = iter->second; - } else if ((inputVersion.compare(0, ascend95Version.size(), ascend95Version) == 0)) { - curSocVersion = SocVersion::Ascend910_95; } else { std::string unsupported_soc(socVersion); std::replace(std::begin(unsupported_soc), std::end(unsupported_soc), '_', ' '); @@ -106,7 +102,7 @@ bool IsBF16Supported() bool IsAclnnOnly() { - return GetSocVersion() >= SocVersion::Ascend910_95; + return false; } } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NpuVariables.h b/torch_npu/csrc/core/npu/NpuVariables.h index 2fe0de9aff..6a3a8cdfd7 100644 --- a/torch_npu/csrc/core/npu/NpuVariables.h +++ b/torch_npu/csrc/core/npu/NpuVariables.h @@ -30,8 +30,7 @@ enum class SocVersion { Ascend910_9381, Ascend910_9382, Ascend910_9372, - Ascend910_9362, - Ascend910_95 = 260 + Ascend910_9362 }; void SetSocVersion(const char* const socVersion); diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index c46740b72d..519948ffbd 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -852,8 +852,7 @@ bool IsCaptureSupported() static bool have_load_func = false; static bool default_support_capture = ((GetSocVersion() >= SocVersion::Ascend910B1) && (GetSocVersion() < SocVersion::Ascend310B1)) || - ((GetSocVersion() >= SocVersion::Ascend910_9391) && - (GetSocVersion() < SocVersion::Ascend910_95)); + ((GetSocVersion() >= SocVersion::Ascend910_9391)); if (default_support_capture && !have_load_func) { have_load_func = true; typedef aclError (*AclmdlRICaptureGetInfo)(aclrtStream, aclmdlRICaptureStatus *, aclmdlRI *); diff --git a/torch_npu/csrc/custom_dtype/CastKernelTeOpApi.cpp b/torch_npu/csrc/custom_dtype/CastKernelTeOpApi.cpp deleted file mode 100644 index 2293ba94dd..0000000000 --- a/torch_npu/csrc/custom_dtype/CastKernelTeOpApi.cpp +++ /dev/null @@ -1,43 +0,0 @@ -#include "torch_npu/csrc/custom_dtype/extension.h" -#include "op_plugin/AclOpsInterface.h" -#include "op_plugin/OpApiInterface.h" -#include "op_plugin/utils/op_api_common.h" - - -namespace c10_npu { - -at::Tensor cast_to_fp8(const at::Tensor &input, int otype) -{ - auto output = at::empty_like(input, c10_npu::GetATenDType(otype)); - - if (input.numel() == 0) { - return output; - } - - aclDataType out_acltype = c10_npu::GetAclDataType(otype); - TensorWrapper out_wrapper = {output, out_acltype}; - EXEC_NPU_CMD(aclnnCast, input, out_acltype, out_wrapper); - - return output; -} - -void cast_to_fp8_noalloc(const at::Tensor &input, at::Tensor output, int otype) -{ - aclDataType out_acltype = c10_npu::GetAclDataType(otype); - TensorWrapper out_wrapper = {output, out_acltype}; - EXEC_NPU_CMD(aclnnCast, input, out_acltype, out_wrapper); - return; -} - -at::Tensor cast_from_fp8(const at::Tensor &input, int itype, int otype) -{ - aclDataType input_acltype = c10_npu::GetAclDataType(itype); - aclDataType out_acltype = c10_npu::GetAclDataType(otype); - auto output = at::empty_like(input, c10_npu::GetATenDType(otype)); - TensorWrapper input_wrapper = {input, input_acltype}; - TensorWrapper out_wrapper = {output, out_acltype}; - EXEC_NPU_CMD(aclnnCast, input_wrapper, out_acltype, out_wrapper); - - return output; -} -} diff --git a/torch_npu/csrc/custom_dtype/Init.cpp b/torch_npu/csrc/custom_dtype/Init.cpp index 90644aa1e3..a88344ce5e 100644 
--- a/torch_npu/csrc/custom_dtype/Init.cpp +++ b/torch_npu/csrc/custom_dtype/Init.cpp @@ -3,7 +3,6 @@ #include #include #endif -#include "torch_npu/csrc/custom_dtype/extension.h" namespace c10_npu { @@ -27,14 +26,6 @@ struct DTypeConstants { static const int int4_value; static const int uint1_value; static const int complex32_value; - static const int hifloat8_value; - static const int float8_e5m2_value; - static const int float8_e4m3fn_value; - static const int float8_e8m0_value; - static const int float6_e3m2_value; - static const int float6_e2m3_value; - static const int float4_e2m1_value; - static const int float4_e1m2_value; }; const int DTypeConstants::float32_value = static_cast(DType::FLOAT); @@ -56,14 +47,6 @@ const int DTypeConstants::bfloat16_value = static_cast(DType::BF16); const int DTypeConstants::int4_value = static_cast(DType::INT4); const int DTypeConstants::uint1_value = static_cast(DType::UINT1); const int DTypeConstants::complex32_value = static_cast(DType::COMPLEX32); -const int DTypeConstants::hifloat8_value = static_cast(DType::HIFLOAT8); -const int DTypeConstants::float8_e5m2_value = static_cast(DType::FLOAT8_E5M2); -const int DTypeConstants::float8_e4m3fn_value = static_cast(DType::FLOAT8_E4M3FN); -const int DTypeConstants::float8_e8m0_value = static_cast(DType::FLOAT8_E8M0); -const int DTypeConstants::float6_e3m2_value = static_cast(DType::FLOAT6_E3M2); -const int DTypeConstants::float6_e2m3_value = static_cast(DType::FLOAT6_E2M3); -const int DTypeConstants::float4_e2m1_value = static_cast(DType::FLOAT4_E2M1); -const int DTypeConstants::float4_e1m2_value = static_cast(DType::FLOAT4_E1M2); #ifndef BUILD_LIBTORCH PyObject* cd_initExtension(PyObject*, PyObject *) @@ -94,20 +77,7 @@ PyObject* cd_initExtension(PyObject*, PyObject *) .def_readonly_static("bfloat16", &DTypeConstants::bfloat16_value) .def_readonly_static("int4", &DTypeConstants::int4_value) .def_readonly_static("uint1", &DTypeConstants::uint1_value) - .def_readonly_static("complex32", &DTypeConstants::complex32_value) - .def_readonly_static("hifloat8", &DTypeConstants::hifloat8_value) - .def_readonly_static("float8_e5m2", &DTypeConstants::float8_e5m2_value) - .def_readonly_static("float8_e4m3fn", &DTypeConstants::float8_e4m3fn_value) - .def_readonly_static("float8_e8m0", &DTypeConstants::float8_e8m0_value) - .def_readonly_static("float6_e3m2", &DTypeConstants::float6_e3m2_value) - .def_readonly_static("float6_e2m3", &DTypeConstants::float6_e2m3_value) - .def_readonly_static("float4_e2m1", &DTypeConstants::float4_e2m1_value) - .def_readonly_static("float4_e1m2", &DTypeConstants::float4_e1m2_value); - - m.def("cast_to_fp8", &cast_to_fp8, "Cast to FP8", py::call_guard()); - m.def("cast_to_fp8_noalloc", &cast_to_fp8_noalloc, "Cast to FP8", - py::call_guard()); - m.def("cast_from_fp8", &cast_from_fp8, "Cast from FP8", py::call_guard()); + .def_readonly_static("complex32", &DTypeConstants::complex32_value); Py_RETURN_NONE; } @@ -140,15 +110,7 @@ const std::string CustomDataTypeToString(int64_t dType) {DType::BF16, "torch_npu.bfloat16"}, {DType::INT4, "torch_npu.int4"}, {DType::UINT1, "torch_npu.uint1"}, - {DType::COMPLEX32, "torch_npu.complex32"}, - {DType::HIFLOAT8, "torch_npu.hifloat8"}, - {DType::FLOAT8_E5M2, "torch_npu.float8_e5m2"}, - {DType::FLOAT8_E4M3FN, "torch_npu.float8_e4m3fn"}, - {DType::FLOAT8_E8M0, "torch_npu.float8_e8m0"}, - {DType::FLOAT6_E3M2, "torch_npu.float6_e3m2"}, - {DType::FLOAT6_E2M3, "torch_npu.float6_e2m3"}, - {DType::FLOAT4_E2M1, "torch_npu.float4_e2m1"}, - {DType::FLOAT4_E1M2, 
"torch_npu.float4_e1m2"}}; + {DType::COMPLEX32, "torch_npu.complex32"}}; const auto iter = TYPE_TO_STRING_MAP.find(static_cast(dType)); return iter != TYPE_TO_STRING_MAP.end() ? iter->second : "Unknown dtype"; diff --git a/torch_npu/csrc/custom_dtype/Init.h b/torch_npu/csrc/custom_dtype/Init.h index 23235a0027..867e07ae3f 100644 --- a/torch_npu/csrc/custom_dtype/Init.h +++ b/torch_npu/csrc/custom_dtype/Init.h @@ -39,14 +39,6 @@ enum class DType { ENUM_OFFSET(INT4, ACL_INT4) ENUM_OFFSET(UINT1, ACL_UINT1) ENUM_OFFSET(COMPLEX32, ACL_COMPLEX32) - ENUM_OFFSET(HIFLOAT8, ACL_HIFLOAT8) - ENUM_OFFSET(FLOAT8_E5M2, ACL_FLOAT8_E5M2) - ENUM_OFFSET(FLOAT8_E4M3FN, ACL_FLOAT8_E4M3FN) - ENUM_OFFSET(FLOAT8_E8M0, ACL_FLOAT8_E8M0) - ENUM_OFFSET(FLOAT6_E3M2, ACL_FLOAT6_E3M2) - ENUM_OFFSET(FLOAT6_E2M3, ACL_FLOAT6_E2M3) - ENUM_OFFSET(FLOAT4_E2M1, ACL_FLOAT4_E2M1) - ENUM_OFFSET(FLOAT4_E1M2, ACL_FLOAT4_E1M2) }; inline bool IsCustomDType(int64_t t) diff --git a/torch_npu/csrc/custom_dtype/extension.h b/torch_npu/csrc/custom_dtype/extension.h deleted file mode 100644 index 91ef1df8a5..0000000000 --- a/torch_npu/csrc/custom_dtype/extension.h +++ /dev/null @@ -1,12 +0,0 @@ -#pragma once - -#include -#include "torch_npu/csrc/custom_dtype/Init.h" - -namespace c10_npu { -at::Tensor cast_to_fp8(const at::Tensor &input, int otype); - -void cast_to_fp8_noalloc(const at::Tensor &input, at::Tensor output, int otype); - -at::Tensor cast_from_fp8(const at::Tensor &input, int itype, int otype); -} diff --git a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp index 13724a65f1..5754256574 100644 --- a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp +++ b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp @@ -52,8 +52,8 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(ENUM_PAIR_FUNC) _(at::ScalarType::Bits4x2, ACL_DT_UNDEFINED) \ _(at::ScalarType::Bits8, ACL_DT_UNDEFINED) \ _(at::ScalarType::Bits16, ACL_DT_UNDEFINED) \ - _(at::ScalarType::Float8_e5m2, ACL_FLOAT8_E5M2) \ - _(at::ScalarType::Float8_e4m3fn, ACL_FLOAT8_E4M3FN) \ + _(at::ScalarType::Float8_e5m2, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Float8_e4m3fn, ACL_DT_UNDEFINED) \ _(at::ScalarType::Float8_e5m2fnuz, ACL_DT_UNDEFINED) \ _(at::ScalarType::Float8_e4m3fnuz, ACL_DT_UNDEFINED) \ _(at::ScalarType::UInt16, ACL_UINT16) \ @@ -114,15 +114,7 @@ static std::unordered_map {ACL_BF16, at::ScalarType::BFloat16}, {ACL_INT4, at::ScalarType::Undefined}, {ACL_UINT1, at::ScalarType::Undefined}, - {ACL_COMPLEX32, at::ScalarType::ComplexHalf}, - {ACL_HIFLOAT8, at::ScalarType::Byte}, - {ACL_FLOAT8_E5M2, at::ScalarType::Float8_e5m2}, - {ACL_FLOAT8_E4M3FN, at::ScalarType::Float8_e4m3fn}, - {ACL_FLOAT8_E8M0, at::ScalarType::Byte}, - {ACL_FLOAT6_E3M2, at::ScalarType::Byte}, - {ACL_FLOAT6_E2M3, at::ScalarType::Byte}, - {ACL_FLOAT4_E2M1, at::ScalarType::Byte}, - {ACL_FLOAT4_E1M2, at::ScalarType::Byte}}; + {ACL_COMPLEX32, at::ScalarType::ComplexHalf}}; aclError AclrtMemcpyAsyncParamCheck( void *dst, size_t destMax, const void *src, size_t count, aclrtMemcpyKind kind, aclrtStream stream) diff --git a/torch_npu/utils/hif8_tensor.py b/torch_npu/utils/hif8_tensor.py deleted file mode 100644 index 691d290a86..0000000000 --- a/torch_npu/utils/hif8_tensor.py +++ /dev/null @@ -1,584 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. -# -# See LICENSE for license information. 
- -"""Tensor class with HIF8 data""" -from __future__ import annotations - -__all__ = ["HiFloat8Tensor"] - -from typing import Any, Dict, Optional, Tuple, Union - -import torch -from torch.utils._pytree import tree_map -import torch_npu -from torch_npu.utils._error_code import ErrCode, pta_error - - -# init transformer engine -torch_npu._C._cd_init() - -tex = torch_npu._C._cd -aten = torch.ops.aten - -NPU_CUSTOM_DType = { - torch.uint8: tex.DType.uint8, - torch.int32: tex.DType.int32, - torch.float32: tex.DType.float32, - torch.half: tex.DType.float16, - torch.bfloat16: tex.DType.bfloat16, -} - - -class _FromHiFloat8Func(torch.autograd.Function): - """Cast from HIF8 to other dtype""" - - @staticmethod - def forward( - _ctx: torch.autograd.function.FunctionCtx, # unused - tensor: HiFloat8Tensor, - dtype: Optional[torch.dtype] = None, - ) -> torch.Tensor: - if dtype is None: - dtype = tensor.dtype - data = tensor._data.contiguous().view(1, -1).detach() - out = tex.cast_from_fp8( - data, - tex.DType.hifloat8, - NPU_CUSTOM_DType[dtype], - ) - out = out.view(tensor.size()) - return out - - @staticmethod - def backward( - _ctx: torch.autograd.function.FunctionCtx, # unused - grad: torch.Tensor, - ) -> Tuple[Optional[torch.Tensor], ...]: - # Assume that we want gradients in full precision - return grad, None - - -class _ToHiFloat8Func(torch.autograd.Function): - """Cast to HIF8 from other dtype""" - - @staticmethod - def forward( - _ctx: torch.autograd.function.FunctionCtx, # unused - tensor: torch.Tensor, - ) -> HiFloat8Tensor: - - # Check input tensor TODO - tensor = tensor.contiguous().npu().detach() - if tensor.dtype not in (torch.float32, torch.bfloat16, torch.float16): - tensor = tensor.float() - - # Cast data to HIF8 - data = tex.cast_to_fp8( - tensor.view(1, -1), - tex.DType.hifloat8, - ) - data = data.view(tensor.size()) - - # Construct HIF8 tensor - return HiFloat8Tensor( - data=data, - dtype=tensor.dtype, - ) - - @staticmethod - def backward( - _ctx: torch.autograd.function.FunctionCtx, # unused - grad: torch.Tensor, - ) -> Tuple[Optional[torch.Tensor], ...]: - # Assume that we want gradients in full precision - return grad, None - - -class _IdentityFunc(torch.autograd.Function): - """Identity function - - If constructor keyword-arguments are provided, then construct a - new HiFloat8Tensor using the provided tensor's attributes. - - """ - - @staticmethod - def forward( - ctx, - tensor: HiFloat8Tensor, - init_kwargs: Optional[Dict[str, Any]] = None, - ) -> torch.Tensor: - - # Return input tensor if constructor kwargs are not provided - ctx.input_dtype = tensor.dtype - if init_kwargs is None: - return tensor - - # Construct new tensor if constructor kwargs are provided - default_kwargs = dict( - data=tensor._data, - dtype=tensor.dtype, - ) - for key, val in default_kwargs.items(): - if key not in init_kwargs: - init_kwargs[key] = val - return HiFloat8Tensor(**init_kwargs) - - @staticmethod - def backward(ctx, grad): - return grad.to(ctx.input_dtype), None - - -class _ViewFunc(torch.autograd.Function): - """View function - - View the HiFloat8Tensor using the provided shape. 
- - """ - - @staticmethod - def forward( - ctx, - tensor: torch.Tensor, - shape: Tuple[int] = None, - ) -> torch.Tensor: - - # Return input tensor if shape is not provided - ctx.shape = tensor.shape - if shape is None: - return tensor - - # Construct new tensor if shape is provided - if isinstance(tensor, HiFloat8Tensor): - return HiFloat8Tensor.make_like( - tensor, - data=tensor._data.view(*shape), - ) - return tensor.view(*shape) - - @staticmethod - def backward( - ctx, - grad: torch.Tensor, - ) -> Tuple[Union[torch.Tensor, None], ...]: - - if isinstance(grad, HiFloat8Tensor): - dgrad = HiFloat8Tensor.make_like( - grad, - data=grad._data.view(ctx.shape), - ) - return dgrad, None - return grad.view(ctx.shape), None - - -class _ReshapeFunc(torch.autograd.Function): - """Reshape function - - Reshape the HiFloat8Tensor using the provided shape. - - """ - - @staticmethod - def forward( - ctx, - tensor: torch.Tensor, - shape: Tuple[int] = None, - ) -> torch.Tensor: - - # Return input tensor if shape is not provided - ctx.shape = tensor.shape - if shape is None: - return tensor - - # Construct new tensor if shape is provided - if isinstance(tensor, HiFloat8Tensor): - return HiFloat8Tensor.make_like( - tensor, - data=tensor._data.reshape(*shape), - ) - return tensor.reshape(*shape) - - @staticmethod - def backward( - ctx, - grad: torch.Tensor, - ) -> Tuple[Union[torch.Tensor, None], ...]: - - if isinstance(grad, HiFloat8Tensor): - dgrad = HiFloat8Tensor.make_like( - grad, - data=grad._data.reshape(ctx.shape), - ) - return dgrad, None - return grad.reshape(ctx.shape), None - - -class _TransposeFunc(torch.autograd.Function): - """Transpose function - - Transpose the HiFloat8Tensor. - - """ - - @staticmethod - def forward(ctx, tensor, dim0, dim1): - ctx.save_for_backward(dim0, dim1) - if isinstance(tensor, HiFloat8Tensor): - return HiFloat8Tensor.make_like( - tensor, - data=tensor._data.transpose(dim0, dim1), - ) - return tensor.transpose(dim0, dim1) - - @staticmethod - def backward(ctx, grad): - dim0, dim1 = ctx.saved_tensors - if isinstance(grad, HiFloat8Tensor): - dgrad = HiFloat8Tensor.make_like( - grad, - data=grad._data.transpose(dim0, dim1), - ) - return dgrad, None - return grad.transpose(dim0, dim1), None, None - - -class HiFloat8Tensor(torch.Tensor): - """Experimental tensor class with HIF8 data - - The tensor presents as having a standard, higher-precision dtype, - but the data itself is (scaled) HIF8. For most tensor operations, - the data will be cast to the nominal dtype before performing the - operation. - - Parameters - ---------- - data: torch.Tensor - Raw HIF8 data in a uint8 tensor - dtype: torch.dtype, default = torch.float32 - Nominal tensor datatype. 
- - """ - - def __new__( - cls, - *, - data: torch.Tensor, - dtype: torch.dtype = torch.float32, - ): - # Check that data buffer is valid - if data.element_size() != 1: - raise ValueError( - f"HiFloat8Tensor requires data buffer with 8-bit dtype (got dtype={data.dtype})" - + pta_error(ErrCode.VALUE) - ) - if data.requires_grad: - raise ValueError( - "HiFloat8Tensor requires non-differentiable data buffer" - + pta_error(ErrCode.VALUE) - ) - if not data.is_npu: - data = data.npu() - - # Initialize tensor object - self = torch.Tensor._make_wrapper_subclass( - cls, - data.size(), - strides=data.stride(), - storage_offset=data.storage_offset(), - dtype=dtype, - layout=data.layout, - requires_grad=data.requires_grad, - device=data.device, - ) - self._data: torch.Tensor = data - - return self - - @classmethod - def make_like( - cls, - tensor: HiFloat8Tensor, - *, - data: torch.Tensor, - **kwargs, - ) -> HiFloat8Tensor: - """Use attributes of a HiFloat8Tensor to create another HiFloat8Tensor - - See constructor for list of keyword arguments. - - """ - default_kwargs = dict( - dtype=tensor.dtype, - ) - for key, val in default_kwargs.items(): - if key not in kwargs: - kwargs[key] = val - return HiFloat8Tensor(data=data, **kwargs) - - def __repr__(self): - return ( - "HiFloat8Tensor(" - f"data={self.from_hifloat8(dtype=self.dtype)}" - ")" - ) - - def from_hifloat8(self, dtype: Optional[torch.dtype] = None) -> torch.Tensor: - """ - Construct PyTorch tensor from HiFloat8Tensor - - By default the resulting tensor's dtype is the - HiFloat8Tensor's nominal dtype. - """ - return _FromHiFloat8Func.apply(self, dtype) - - @classmethod - def to_hifloat8( - cls, - tensor: torch.Tensor - ): - """Construct HiFloat8Tensor from PyTorch tensor""" - return _ToHiFloat8Func.apply( - tensor - ) - - def float(self) -> torch.Tensor: - return self.from_hifloat8(dtype=torch.float32) - - def bfloat16(self) -> torch.Tensor: - return self.from_hifloat8(dtype=torch.bfloat16) - - def half(self) -> torch.Tensor: - return self.from_hifloat8(dtype=torch.float16) - - def cpu(self) -> torch.Tensor: - return self.from_hifloat8().cpu() - - def clone(self) -> HiFloat8Tensor: - return _IdentityFunc.apply(self, {"data": self._data.detach().clone()}) - - def view(self, *shape: Tuple[int]) -> HiFloat8Tensor: - return _ViewFunc.apply(self, shape) - - def reshape(self, *shape: Tuple[int]) -> HiFloat8Tensor: - return _ReshapeFunc.apply(self, shape) - - def contiguous( - self, - *, - memory_format: torch.memory_format = torch.contiguous_format, - ) -> HiFloat8Tensor: - """Returns tensor with data in provided memory format - - Returns `self` if data is already in correct memory format. - - """ - if self._data.is_contiguous(memory_format=memory_format): - return self - return _IdentityFunc.apply( - self, - {"data": self._data.detach().contiguous(memory_format=memory_format)}, - ) - - def to_dtype(self, dtype: torch.dtype) -> HiFloat8Tensor: - """Create `HiFloat8Tensor` with given nominal dtype - - The new tensor has the same underlying HIF8 data. 
- - """ - return HiFloat8Tensor.make_like( - self, - data=self._data, - dtype=dtype, - ) - - @classmethod - def __torch_dispatch__(cls, func, types, args, kwargs=None): - - # In-place copy op - if func == aten.copy_.default: - - # Check tensors - dst = args[0] - src = args[1] - if not isinstance(dst, torch.Tensor): - raise RuntimeError( - "Attempted to copy into something that isn't a PyTorch tensor" - + pta_error(ErrCode.TYPE) - ) - if not isinstance(src, torch.Tensor): - raise RuntimeError( - "Attempted to copy from something that isn't a PyTorch tensor" - + pta_error(ErrCode.TYPE) - ) - - # Special handling based on which tensors are HIF8 - dst_is_hif8 = isinstance(dst, HiFloat8Tensor) - src_is_hif8 = isinstance(src, HiFloat8Tensor) - if dst_is_hif8 and src_is_hif8: - # Directly copy HIF8 data if possible - dst._data.copy_(src._data) - - elif not dst_is_hif8 and src_is_hif8: - # Cast source tensor to higher precision - dst.copy_(src.from_hifloat8()) - - elif dst_is_hif8 and not src_is_hif8: - # Make sure input is in expected format - src = src.expand(dst.size()) - src = src.to( - device=dst.device, - memory_format=torch.contiguous_format, - ) - - # Cast to HIF8 - if not dst._data.is_contiguous(): - raise RuntimeError( - "Transformer Engine cast kernels require contiguous data" - + pta_error(ErrCode.INTERNAL) - ) - tex.cast_to_fp8_noalloc( - src.view(1, -1), - dst._data.view(1, -1), - tex.DType.hifloat8, - ) - else: - # Invalid case - raise RuntimeError( - "Using HiFloat8Tensor copy logic, but no HiFloat8Tensor found" - + pta_error(ErrCode.INTERNAL) - ) - - # Nothing to return for in-place ops - return None - - # Slice op - if func == aten.slice.Tensor: - tensor = args[0] - data = tensor._data - data_slice = data.__torch_dispatch__( - func, - types, - [data] + list(args[1:]), - kwargs, - ) - return HiFloat8Tensor.make_like(tensor, data=data_slice) - - # Detach op - if func == aten.detach.default: - # Simply return a new HiFloat8Tensor with the same attrs - return HiFloat8Tensor.make_like( - args[0], - data=args[0]._data, - ) - - # View op - if func == aten.view.default: - tensor = args[0] - data = tensor._data - data_view = data.__torch_dispatch__( - func, - types, - [data] + list(args[1:]), - kwargs, - ) - return HiFloat8Tensor.make_like( - tensor, - data=data_view, - ) - - def maybe_unwrap(t): - if isinstance(t, HiFloat8Tensor): - return t.from_hifloat8() - return t - - def maybe_update_inplace(arg, new_arg, schema_arg): - """Update values of HIF8 tensors - - Keep the same HIF8 scaling factors. 
- - """ - check_args = isinstance(arg, HiFloat8Tensor) and isinstance(new_arg, torch.Tensor) - check_schema = ( - hasattr(schema_arg, "alias_info") - and hasattr(schema_arg.alias_info, "is_write") - and schema_arg.alias_info.is_write - ) - - if check_args and check_schema: - arg.copy_(new_arg) - - # In-place op - if func._schema.is_mutable: - # Cast to higher precision, perform op, and cast values - # back to original HIF8 buffers - new_args = tree_map(maybe_unwrap, args) - new_kwargs = tree_map(maybe_unwrap, kwargs) - schema_args = func._schema.arguments - args_len = len(args) - out = super().__torch_dispatch__(func, types, new_args, new_kwargs) - for arg, new_arg, schema_arg in zip(args, new_args, schema_args): - maybe_update_inplace(arg, new_arg, schema_arg) - for kwarg, new_kwarg, schema_arg in zip(kwargs, new_kwargs, schema_args[args_len:]): - if not (kwarg == new_kwarg == schema_arg.name): - raise ValueError('name of the kw argument should match' + pta_error(ErrCode.VALUE)) - maybe_update_inplace(kwargs[kwarg], new_kwargs[new_kwarg], schema_arg) - return None - - # Default op - # Note: cast to higher precision and perform op - args = tree_map(maybe_unwrap, args) - if kwargs is not None: - kwargs = tree_map(maybe_unwrap, kwargs) - out = super().__torch_dispatch__(func, types, args, kwargs) - return out - - @classmethod - def _make_in_reduce_ex( - cls, - data: torch.Tensor, - dtype: torch.dtype, - ) -> HiFloat8Tensor: - """Build HiFloat8Tensor, for use in __reduce__ - - __reduce_ex__ assumes object constructor has positional - arguments. - - """ - return HiFloat8Tensor( - data=data, - dtype=dtype, - ) - - def __reduce_ex__(self, protocol: int) -> tuple: - """Custom pickling to remove references to HIF8 metadata objects""" - return ( - HiFloat8Tensor._make_in_reduce_ex, - (self._data, self.dtype), - ) - - def _get_data(self) -> HiFloat8Tensor: - """Get tensor data property""" - return super().data - - def _set_data(self, tensor: torch.Tensor) -> None: - """Set tensor data property - - Cast tensor to HIF8 and store in HIF8 buffer. 
- - """ - with torch.no_grad(): - self.copy_(tensor) - - # Cast to HIF8 when setting HiFloat8Tensor.data - data = property(_get_data, _set_data) - - @classmethod - def __torch_function__(cls, func, types, args=(), kwargs=None): - if kwargs is None: - kwargs = {} - return torch._C._disabled_torch_function_impl(func, types, args, kwargs) - - def transpose(self, dim0, dim1): - return _TransposeFunc.apply(self, dim0, dim1) -- Gitee From 911a737426a2b9ab463852801223f21a8f732575 Mon Sep 17 00:00:00 2001 From: cx Date: Tue, 8 Jul 2025 02:55:26 +0000 Subject: [PATCH 214/328] !22756 change log level to warn Merge pull request !22756 from cx/v2.7.1 --- test/allocator/test_pluggable_allocator_extensions.py | 8 -------- torch_npu/csrc/npu/NPUPluggableAllocator.cpp | 4 ++-- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/test/allocator/test_pluggable_allocator_extensions.py b/test/allocator/test_pluggable_allocator_extensions.py index 54e270513d..a05fe8538a 100644 --- a/test/allocator/test_pluggable_allocator_extensions.py +++ b/test/allocator/test_pluggable_allocator_extensions.py @@ -76,10 +76,6 @@ class TestPluggableAllocator(TestCase): myallocator = ctypes.CDLL(os_path) get_device_stats_fn = ctypes.cast(getattr(myallocator, "my_get_device_stats"), ctypes.c_void_p).value - msg = "get_device_stats_fn_ is not define, please set by set_get_device_stats_fn" - with self.assertRaisesRegex(RuntimeError, msg): - torch.npu.memory_stats_as_nested_dict() - TestPluggableAllocator.new_alloc.allocator().set_get_device_stats_fn(get_device_stats_fn) self.assertEqual(torch.npu.memory_stats_as_nested_dict()["num_alloc_retries"], 0) @@ -88,10 +84,6 @@ class TestPluggableAllocator(TestCase): myallocator = ctypes.CDLL(os_path) reset_peak_status_fn = ctypes.cast(getattr(myallocator, "my_reset_peak_status"), ctypes.c_void_p).value - msg = "reset_peak_status_fn_ is not define, please set by set_reset_peak_status_fn" - with self.assertRaisesRegex(RuntimeError, msg): - torch.npu.reset_peak_memory_stats() - TestPluggableAllocator.new_alloc.allocator().set_reset_peak_status_fn(reset_peak_status_fn) torch.npu.reset_peak_memory_stats() self.assertEqual(torch.npu.max_memory_allocated(), 0) diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp index 660c69a89d..14ea0ce7e7 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp @@ -227,7 +227,7 @@ c10_npu::NPUCachingAllocator::DeviceStats NPUPluggableAllocator::getDeviceStats( if (get_device_stats_fn_) { return get_device_stats_fn_(device); } else { - TORCH_CHECK(false, "get_device_stats_fn_ is not define, please set by set_get_device_stats_fn"); + TORCH_NPU_WARN("get_device_stats_fn_ is not define, please set by set_get_device_stats_fn"); } } @@ -242,7 +242,7 @@ void NPUPluggableAllocator::resetPeakStats(int device) if (reset_peak_status_fn_) { reset_peak_status_fn_(device); } else { - TORCH_CHECK(false, "reset_peak_status_fn_ is not define, please set by set_reset_peak_status_fn"); + TORCH_NPU_WARN("reset_peak_status_fn_ is not define, please set by set_reset_peak_status_fn"); } } -- Gitee From a901070f774254cdb2d546829b55bb70b7dae90a Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 8 Jul 2025 03:09:15 +0000 Subject: [PATCH 215/328] !22761 Update op_plugin commit id Merge pull request !22761 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 
72321907ac..869fddbbb9 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 72321907accba073c2ebbfb9338fb19db61f41eb +Subproject commit 869fddbbb9faa16f3233bff16075f05748cacac2 -- Gitee From e79ffb4273dbe63b6649c1b4d8b6e3e29c342028 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=85=E9=A3=9E=E8=A6=81?= <1332490378@qq.com> Date: Tue, 8 Jul 2025 06:45:02 +0000 Subject: [PATCH 216/328] =?UTF-8?q?!22725=20=E3=80=90PROF=E3=80=91fix=20dy?= =?UTF-8?q?namic=20prof=20step=20id=20err=20Merge=20pull=20request=20!2272?= =?UTF-8?q?5=20from=20=E6=A2=85=E9=A3=9E=E8=A6=81/2.7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/profiler/profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/profiler/profiler.py b/torch_npu/profiler/profiler.py index 65fbf5b038..d45ad41693 100644 --- a/torch_npu/profiler/profiler.py +++ b/torch_npu/profiler/profiler.py @@ -283,7 +283,7 @@ class profile(_KinetoProfile): self.current_action = self.schedule(self.step_num) self.action_controller.transit_action(prev_action, self.current_action) if self.record_steps: - self.step_rec_fn = prof.record_function("ProfilerStep#" + str(self.step_num)) + self.step_rec_fn = prof.record_function("ProfilerStep#" + str(self.step_num + self._step_num_offset)) self.step_rec_fn.__enter__() -- Gitee From 458b4f98b3d5e2fe9e566bbc1d046cc570d6026f Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 8 Jul 2025 10:39:33 +0000 Subject: [PATCH 217/328] !22779 Update op_plugin commit id Merge pull request !22779 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 869fddbbb9..fc295cc491 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 869fddbbb9faa16f3233bff16075f05748cacac2 +Subproject commit fc295cc49199bdd24fe075f75801e778517418c7 -- Gitee From faba85d525222eaac6bc1c08a24a6f8f9f0cd93d Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Wed, 9 Jul 2025 06:39:22 +0000 Subject: [PATCH 218/328] !22797 Skip test_aclrtSetDevice on 910A because it is slow Merge pull request !22797 from yuhaiyan/v2.7.1-dev1 --- test/npu/_fault_mode_cases/error_set_device.py | 6 +++--- test/npu/test_fault_mode.py | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/test/npu/_fault_mode_cases/error_set_device.py b/test/npu/_fault_mode_cases/error_set_device.py index ddda33f346..e694fb6b99 100644 --- a/test/npu/_fault_mode_cases/error_set_device.py +++ b/test/npu/_fault_mode_cases/error_set_device.py @@ -11,7 +11,7 @@ def _worker(i: int) -> None: def set_device(): torch_npu.npu.set_device(0) multiprocessing.set_start_method("spawn", force=True) - jobs = [multiprocessing.Process(target=_worker, args=(i,)) for i in range(70)] + jobs = [multiprocessing.Process(target=_worker, args=(i,)) for i in range(100)] for p in jobs: p.start() @@ -19,5 +19,5 @@ def set_device(): for p in jobs: p.join() - -set_device() +if __name__ == "__main__": + set_device() diff --git a/test/npu/test_fault_mode.py b/test/npu/test_fault_mode.py index 88bc8cca19..713e9c67d9 100644 --- a/test/npu/test_fault_mode.py +++ b/test/npu/test_fault_mode.py @@ -6,6 +6,9 @@ from torch.testing._internal.common_utils import TestCase, run_tests from torch.utils.checkpoint import checkpoint import torch.distributed as dist import torch.nn as nn + +from torch_npu.testing.common_utils import SupportedDevices + 
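+# Note: ASCEND_LAUNCH_BLOCKING is assigned before torch_npu is imported below.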
os.environ["ASCEND_LAUNCH_BLOCKING"] = '0' import torch_npu @@ -156,6 +159,7 @@ class TestMode(TestCase): with self.assertRaisesRegex(RuntimeError, "Invalid device argument"): torch.npu.reset_max_memory_allocated(device="npu:8") + @SupportedDevices(['Ascend910B']) def test_aclrtSetDevice(self): path = os.path.join(os.path.dirname(__file__), '_fault_mode_cases/error_set_device.py') process = subprocess.Popen(["python", f"{path}"], shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) -- Gitee From 0caf589260b90f39c291d196456589aab9a2d03a Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 9 Jul 2025 08:54:17 +0000 Subject: [PATCH 219/328] !22820 Update op_plugin commit id Merge pull request !22820 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index fc295cc491..c97bd1f839 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit fc295cc49199bdd24fe075f75801e778517418c7 +Subproject commit c97bd1f83996ed7814efeee2584f91aadbbea721 -- Gitee From baa3b3878181a2802d9901ccf69ed475ac4dfcee Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 9 Jul 2025 10:54:16 +0000 Subject: [PATCH 220/328] !22831 Update op_plugin commit id Merge pull request !22831 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index c97bd1f839..b0c392ac88 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit c97bd1f83996ed7814efeee2584f91aadbbea721 +Subproject commit b0c392ac88e174c5e122aca4c694db7252785f7a -- Gitee From 4288b7b808c7304bb2e8fd4523a4a797de4a956b Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 9 Jul 2025 10:54:16 +0000 Subject: [PATCH 221/328] !22831 Update op_plugin commit id Merge pull request !22831 from pta-robot/v2.7.1 -- Gitee From d79177ed5bcd6c498bbec755baa34c76aff6090b Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 9 Jul 2025 14:09:16 +0000 Subject: [PATCH 222/328] !22846 Update op_plugin commit id Merge pull request !22846 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index b0c392ac88..522f5ae3e5 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit b0c392ac88e174c5e122aca4c694db7252785f7a +Subproject commit 522f5ae3e52668dabd5d7a7b6d5bb6f33b3f464f -- Gitee From 713b4592f942a8eac372d72251f09c876648a6b9 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 9 Jul 2025 14:09:17 +0000 Subject: [PATCH 223/328] !22846 Update op_plugin commit id Merge pull request !22846 from pta-robot/v2.7.1 -- Gitee From 1b2f749691cd69c14fbe8753243854018bb47a5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=98=89=E5=B7=8D?= <843972097@qq.com> Date: Thu, 10 Jul 2025 01:57:09 +0000 Subject: [PATCH 224/328] =?UTF-8?q?!22789=20[Feature]=20Add=20long=20log?= =?UTF-8?q?=20for=20Logger=20Merge=20pull=20request=20!22789=20from=20?= =?UTF-8?q?=E5=88=98=E5=98=89=E5=B7=8D/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/logging/Logger.cpp | 70 +++++++++++++++++++++++++++---- torch_npu/csrc/logging/Logger.h | 7 +++- 2 files changed, 69 insertions(+), 8 deletions(-) diff --git a/torch_npu/csrc/logging/Logger.cpp b/torch_npu/csrc/logging/Logger.cpp index eaab8bc004..a527b4b4f2 100644 --- 
a/torch_npu/csrc/logging/Logger.cpp +++ b/torch_npu/csrc/logging/Logger.cpp @@ -8,6 +8,8 @@ #include "torch_npu/csrc/core/npu/register/OptionsManager.h" namespace npu_logging { +static const int BASE_PRINT_LIMIT = 1024; +static const int LONG_PRINT_LIMIT = 4096; static std::unordered_map LoggingLevelNames = { {LoggingLevel::DEBUG, "DEBUG"}, @@ -37,9 +39,8 @@ std::string Logger::getQName() return qname_; } -void Logger::log(LoggingLevel level, const char* format, va_list args) +void Logger::log(LoggingLevel level, const int log_buffer_size, const char* format, va_list args) { - const int log_buffer_size = 1024; char buffer[log_buffer_size] = {0}; int ret = vsnprintf(buffer, log_buffer_size, format, args); @@ -75,7 +76,7 @@ void Logger::debug(const char* format, ...) } va_list args; va_start(args, format); - log(LoggingLevel::DEBUG, format, args); + log(LoggingLevel::DEBUG, BASE_PRINT_LIMIT, format, args); va_end(args); } @@ -86,7 +87,7 @@ void Logger::info(const char* format, ...) } va_list args; va_start(args, format); - log(LoggingLevel::INFO, format, args); + log(LoggingLevel::INFO, BASE_PRINT_LIMIT, format, args); va_end(args); } @@ -97,7 +98,7 @@ void Logger::warn(const char* format, ...) } va_list args; va_start(args, format); - log(LoggingLevel::WARNING, format, args); + log(LoggingLevel::WARNING, BASE_PRINT_LIMIT, format, args); va_end(args); } @@ -108,7 +109,7 @@ void Logger::error(const char* format, ...) } va_list args; va_start(args, format); - log(LoggingLevel::ERROR, format, args); + log(LoggingLevel::ERROR, BASE_PRINT_LIMIT, format, args); va_end(args); } @@ -119,7 +120,62 @@ void Logger::critical(const char* format, ...) } va_list args; va_start(args, format); - log(LoggingLevel::CRITICAL, format, args); + log(LoggingLevel::CRITICAL, BASE_PRINT_LIMIT, format, args); + va_end(args); +} + +void Logger::long_debug(const char* format, ...) +{ + if (allow_level_ > LoggingLevel::DEBUG) { + return; + } + va_list args; + va_start(args, format); + log(LoggingLevel::DEBUG, LONG_PRINT_LIMIT, format, args); + va_end(args); +} + +void Logger::long_info(const char* format, ...) +{ + if (allow_level_ > LoggingLevel::INFO) { + return; + } + va_list args; + va_start(args, format); + log(LoggingLevel::INFO, LONG_PRINT_LIMIT, format, args); + va_end(args); +} + +void Logger::long_warn(const char* format, ...) +{ + if (allow_level_ > LoggingLevel::WARNING) { + return; + } + va_list args; + va_start(args, format); + log(LoggingLevel::WARNING, LONG_PRINT_LIMIT, format, args); + va_end(args); +} + +void Logger::long_error(const char* format, ...) +{ + if (allow_level_ > LoggingLevel::ERROR) { + return; + } + va_list args; + va_start(args, format); + log(LoggingLevel::ERROR, LONG_PRINT_LIMIT, format, args); + va_end(args); +} + +void Logger::long_critical(const char* format, ...) 
+{ + if (allow_level_ > LoggingLevel::CRITICAL) { + return; + } + va_list args; + va_start(args, format); + log(LoggingLevel::CRITICAL, LONG_PRINT_LIMIT, format, args); va_end(args); } diff --git a/torch_npu/csrc/logging/Logger.h b/torch_npu/csrc/logging/Logger.h index 1734a7c7be..7e76af5013 100644 --- a/torch_npu/csrc/logging/Logger.h +++ b/torch_npu/csrc/logging/Logger.h @@ -29,9 +29,14 @@ public: void warn(const char* format, ...); void error(const char* format, ...); void critical(const char* format, ...); + void long_debug(const char* format, ...); + void long_info(const char* format, ...); + void long_warn(const char* format, ...); + void long_error(const char* format, ...); + void long_critical(const char* format, ...); private: - void log(LoggingLevel level, const char* format, va_list args); + void log(LoggingLevel level, const int log_buffer_size, const char* format, va_list args); LoggingLevel allow_level_ = LoggingLevel::WARNING; std::string name_; -- Gitee From 78bc39c646ffe30bc890bd58352227fdc48ea7aa Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 10 Jul 2025 02:54:18 +0000 Subject: [PATCH 225/328] !22856 Update op_plugin commit id Merge pull request !22856 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 522f5ae3e5..5f8a47c041 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 522f5ae3e52668dabd5d7a7b6d5bb6f33b3f464f +Subproject commit 5f8a47c041527b54b40cb1498e64ec0e44da7f8c -- Gitee From 202db2201f667b69af66a34f57c62b4df12a1c4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Thu, 10 Jul 2025 03:22:45 +0000 Subject: [PATCH 226/328] =?UTF-8?q?!22810=20Fix=20some=20bugs=20in=20IPC?= =?UTF-8?q?=20Merge=20pull=20request=20!22810=20from=20=E5=A7=9C=E6=80=A1?= =?UTF-8?q?=E6=96=87/v2.7.1=5Fipc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/torch_npu_schema.json | 3 +++ third_party/acl/inc/acl/acl_base.h | 1 + third_party/acl/inc/acl/acl_rt.h | 2 -- .../csrc/core/npu/NPUCachingAllocator.cpp | 6 ++--- torch_npu/csrc/core/npu/NPUIPCPidManager.cpp | 10 ++++----- torch_npu/csrc/core/npu/NPUIPCPidManager.h | 3 ++- .../csrc/core/npu/interface/AclInterface.cpp | 22 +++++++++---------- .../csrc/core/npu/interface/AclInterface.h | 8 +++---- torch_npu/csrc/ipc/NPUIPCTypes.cpp | 2 ++ torch_npu/csrc/ipc/StorageSharing.cpp | 2 +- 10 files changed, 32 insertions(+), 27 deletions(-) diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index 07fad8c4ce..df93e5a7a7 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2744,6 +2744,9 @@ "torch_npu.distributed.all_gather_into_tensor_uneven": { "signature": "(output, input, output_split_sizes=None, group=None, async_op=False)" }, + "torch_npu.multiprocessing.reductions.rebuild_npu_tensor": { + "signature": "(tensor_cls, tensor_size, tensor_stride, tensor_offset, storage_cls, dtype, storage_device, storage_handle, storage_size_bytes, storage_offset_bytes, requires_grad, ref_counter_handle, ref_counter_offset, event_handle, event_sync_required)" + }, "func: unsafe_empty_with_format": { "signature": "(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None, int acl_format=2, bool keep_format=False) -> Tensor" }, diff --git a/third_party/acl/inc/acl/acl_base.h b/third_party/acl/inc/acl/acl_base.h index a30f21375c..7d592db6ed 100755 --- a/third_party/acl/inc/acl/acl_base.h +++ b/third_party/acl/inc/acl/acl_base.h @@ -48,6 +48,7 @@ extern "C" { typedef void *aclrtStream; typedef void *aclrtEvent; typedef void *aclrtContext; +typedef void *aclrtNotify; typedef int aclError; typedef uint16_t aclFloat16; typedef struct aclDataBuffer aclDataBuffer; diff --git a/third_party/acl/inc/acl/acl_rt.h b/third_party/acl/inc/acl/acl_rt.h index 2fcbaa2792..98b520ba4a 100755 --- a/third_party/acl/inc/acl/acl_rt.h +++ b/third_party/acl/inc/acl/acl_rt.h @@ -187,8 +187,6 @@ typedef void (*aclrtCallback)(void *userData); typedef void (*aclrtExceptionInfoCallback)(aclrtExceptionInfo *exceptionInfo); -typedef int aclrtNotify; - /** * @ingroup AscendCL * @brief Set a callback function to handle exception information diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index aee1a2fa92..e3c3a327be 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -1657,9 +1657,9 @@ public: auto it = ipc_handle_map.find(base_ptr); if (it == ipc_handle_map.end()) { NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemGetExportKey( - base_ptr, base_size, handle.data, ACL_IPC_HANDLE_SIZE)); + base_ptr, base_size, handle.data, ACL_IPC_HANDLE_SIZE, 0)); int32_t* pids = nullptr; - int pid_num = torch_npu::ipc::getPids(&pids); + size_t pid_num = torch_npu::ipc::getPids(&pids); NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemSetImportPid(handle.data, pids, pid_num)); ipc_handle_map[base_ptr] = handle; } else { @@ -3599,7 +3599,7 @@ public: if (type == SHAREABLE_NPU_MALLOC) { handle_str handle_r; ss.read(handle_r.data, ACL_IPC_HANDLE_SIZE); - NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemImportByKey(&npu_ipc_ptr_, handle_r.data)); + NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemImportByKey(&npu_ipc_ptr_, handle_r.data, 0)); handle_s.assign(handle_r.data, ACL_IPC_HANDLE_SIZE); } else if (type == SHAREABLE_NPU_EXPANDABLE_SEGMENT) { expandable_segment_ = diff --git a/torch_npu/csrc/core/npu/NPUIPCPidManager.cpp b/torch_npu/csrc/core/npu/NPUIPCPidManager.cpp index 94bbd2739a..393b4706c6 100644 --- a/torch_npu/csrc/core/npu/NPUIPCPidManager.cpp +++ b/torch_npu/csrc/core/npu/NPUIPCPidManager.cpp @@ -3,15 +3,15 @@ namespace torch_npu { namespace ipc { int32_t* pids = nullptr; -int pid_num = 0; -int capacity = 0; +size_t pid_num = 0; +size_t capacity = 0; void addPid(int pid) { - const int requiredCapacity = pid_num + 1; + const size_t requiredCapacity = pid_num + 1; if (requiredCapacity > capacity) { - int newCapacity = capacity + 10; + size_t newCapacity = capacity + 10; int32_t* newArray = new int32_t[newCapacity]; for (int i = 0; i < pid_num; ++i) { @@ -26,7 +26,7 @@ void addPid(int pid) pids[pid_num++] = static_cast(pid); } -int getPids(int32_t** ret_pids) +size_t getPids(int32_t** ret_pids) { *ret_pids = pids; return pid_num; diff --git a/torch_npu/csrc/core/npu/NPUIPCPidManager.h b/torch_npu/csrc/core/npu/NPUIPCPidManager.h index bc5a72cd89..f27cd240d1 100644 --- a/torch_npu/csrc/core/npu/NPUIPCPidManager.h +++ b/torch_npu/csrc/core/npu/NPUIPCPidManager.h @@ -1,11 +1,12 @@ #pragma once #include +#include namespace torch_npu { namespace ipc { void addPid(int pid); -int getPids(int32_t** pids); +size_t getPids(int32_t** pids); } // namespace ipc } // namespace torch_npu \ No newline at 
end of file diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 519948ffbd..583d37be6f 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -935,43 +935,43 @@ aclError AclrtHostUnregister(void *ptr) return func(ptr); } -aclError AclrtIpcMemGetExportKey(void *devPtr, size_t size, char *name, size_t len) +aclError AclrtIpcMemGetExportKey(void *devPtr, size_t size, char *key, size_t len, uint64_t flag) { - typedef aclError (*AclrtIpcMemGetExportKey)(void *, size_t, char *, size_t); + typedef aclError (*AclrtIpcMemGetExportKey)(void *, size_t, char *, size_t, uint64_t); static AclrtIpcMemGetExportKey func = nullptr; if (func == nullptr) { func = (AclrtIpcMemGetExportKey) GET_FUNC(aclrtIpcMemGetExportKey); } TORCH_CHECK(func, "Failed to find function aclrtIpcMemGetExportKey", PTA_ERROR(ErrCode::NOT_FOUND)); - return func(devPtr, size, name, len); + return func(devPtr, size, key, len, flag); } -aclError AclrtIpcMemSetImportPid(const char *name, int32_t pid[], int num) +aclError AclrtIpcMemSetImportPid(const char *key, int32_t *pid, size_t num) { - typedef aclError (*AclrtIpcMemSetImportPid)(const char *, int32_t[], int); + typedef aclError (*AclrtIpcMemSetImportPid)(const char *, int32_t *, size_t); static AclrtIpcMemSetImportPid func = nullptr; if (func == nullptr) { func = (AclrtIpcMemSetImportPid) GET_FUNC(aclrtIpcMemSetImportPid); } TORCH_CHECK(func, "Failed to find function aclrtIpcMemSetImportPid", PTA_ERROR(ErrCode::NOT_FOUND)); - return func(name, pid, num); + return func(key, pid, num); } -aclError AclrtIpcMemImportByKey(void **devPtr, const char *name) +aclError AclrtIpcMemImportByKey(void **devPtr, const char *key, uint64_t flag) { - typedef aclError (*AclrtIpcMemImportByKey)(void **, const char *); + typedef aclError (*AclrtIpcMemImportByKey)(void **, const char *, uint64_t); static AclrtIpcMemImportByKey func = nullptr; if (func == nullptr) { func = (AclrtIpcMemImportByKey) GET_FUNC(aclrtIpcMemImportByKey); } TORCH_CHECK(func, "Failed to find function aclrtIpcMemImportByKey", PTA_ERROR(ErrCode::NOT_FOUND)); - return func(devPtr, name); + return func(devPtr, key, flag); } -aclError AclrtIpcMemClose(const char *name) +aclError AclrtIpcMemClose(const char *key) { typedef aclError (*AclrtIpcMemClose)(const char *); static AclrtIpcMemClose func = nullptr; @@ -980,7 +980,7 @@ aclError AclrtIpcMemClose(const char *name) } TORCH_CHECK(func, "Failed to find function aclrtIpcMemClose", PTA_ERROR(ErrCode::NOT_FOUND)); - return func(name); + return func(key); } aclError AclrtMemExportToShareableHandle(aclrtDrvMemHandle handle, aclrtMemHandleType handleType, diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index efea001767..d6c9a78aa4 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -228,13 +228,13 @@ aclError AclrtHostRegister(void *ptr, uint64_t size, aclrtHostRegisterType type, */ aclError AclrtHostUnregister(void *ptr); -aclError AclrtIpcMemGetExportKey(void *devPtr, size_t size, char *name, size_t len); +aclError AclrtIpcMemGetExportKey(void *devPtr, size_t size, char *key, size_t len, uint64_t flag); -aclError AclrtIpcMemSetImportPid(const char *name, int32_t pid[], int num); +aclError AclrtIpcMemSetImportPid(const char *key, int32_t *pid, size_t num); -aclError AclrtIpcMemImportByKey(void **devPtr, const char *name); 
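+// The added flag argument follows the updated aclrt IPC interface typedef above; the call site in this patch passes 0.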
+aclError AclrtIpcMemImportByKey(void **devPtr, const char *key, uint64_t flag); -aclError AclrtIpcMemClose(const char *name); +aclError AclrtIpcMemClose(const char *key); aclError AclrtMemExportToShareableHandle(aclrtDrvMemHandle handle, aclrtMemHandleType handleType, uint64_t flags, uint64_t *shareableHandle); diff --git a/torch_npu/csrc/ipc/NPUIPCTypes.cpp b/torch_npu/csrc/ipc/NPUIPCTypes.cpp index b18b6e2f2e..1ff8458c87 100644 --- a/torch_npu/csrc/ipc/NPUIPCTypes.cpp +++ b/torch_npu/csrc/ipc/NPUIPCTypes.cpp @@ -169,6 +169,7 @@ NpuIPCSentData::NpuIPCSentData( { if (npu_ipc_global_entities.sync_events_used_.load() < NPU_IPC_MAXIMUM_EVENTS_TO_USE) { + // NPU does not suppurt event_sync in IPC now. } else { auto stream = c10_npu::getCurrentNPUStream(device.index()); c10_npu::stream_synchronize(stream); @@ -182,6 +183,7 @@ NpuIPCSentData::~NpuIPCSentData() ReturnRefCounter(handle_, offset_); try { if (event_sync_required_) { + // NPU does not suppurt event_sync in IPC now. } } catch (...) { /* No throw */ } diff --git a/torch_npu/csrc/ipc/StorageSharing.cpp b/torch_npu/csrc/ipc/StorageSharing.cpp index cd7b9e372a..1169cbd1c5 100644 --- a/torch_npu/csrc/ipc/StorageSharing.cpp +++ b/torch_npu/csrc/ipc/StorageSharing.cpp @@ -71,7 +71,7 @@ static PyObject* THNPStorage_shareNpu(PyObject* self, PyObject* args) aclrtNotify ipc_event_handle; if (sent_data->event_sync_required_) { - // TO BE DONE + // NPU does not suppurt event_sync in IPC now. } _event_handle = PyBytes_FromStringAndSize( -- Gitee From 73f54e709e5be933fb09816ce7b51d0f1670ad68 Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Thu, 10 Jul 2025 03:34:26 +0000 Subject: [PATCH 227/328] !22838 Add weights_only=False Merge pull request !22838 from yuhaiyan/v2.7.1 --- test/nn/test_convolution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/nn/test_convolution.py b/test/nn/test_convolution.py index e15c38c3df..da8574ef9b 100644 --- a/test/nn/test_convolution.py +++ b/test/nn/test_convolution.py @@ -98,7 +98,7 @@ class TestConvolutionNN(NNTestCase): path = download_file(get_url('legacy_conv2d')) with warnings.catch_warnings(): warnings.simplefilter('ignore', SourceChangeWarning) - m = torch.load(path, encoding='utf-8') + m = torch.load(path, encoding='utf-8', weights_only=False) input1 = torch.randn((1, 1, 1, 1), dtype=torch.float) self.assertEqual(m(input1).size(), (1, 1, 1, 1)) -- Gitee From 4c3dad0895174290a0713988863a2ce29da78b59 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 10 Jul 2025 05:09:18 +0000 Subject: [PATCH 228/328] !22867 Update op_plugin commit id Merge pull request !22867 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5f8a47c041..1ce52646a8 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5f8a47c041527b54b40cb1498e64ec0e44da7f8c +Subproject commit 1ce52646a8638a30217d6da73c6b3908cf3cd2b0 -- Gitee From f4adf9c93ecaf358bd80b4176846589c8f80b8bd Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 10 Jul 2025 09:09:23 +0000 Subject: [PATCH 229/328] !22891 Update op_plugin commit id Merge pull request !22891 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 1ce52646a8..24023a4a03 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 1ce52646a8638a30217d6da73c6b3908cf3cd2b0 +Subproject 
commit 24023a4a032545cc75cc7bef80df42a608cd593f -- Gitee From c40547cf8adff42f5d8df25f2eb058f9d02a41c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Thu, 10 Jul 2025 11:47:29 +0000 Subject: [PATCH 230/328] =?UTF-8?q?!22828=20fix=20log=20bug=20for=20error?= =?UTF-8?q?=5Fmsg=20in=20MakeSureQueueEmpty=20func=20Merge=20pull=20reques?= =?UTF-8?q?t=20!22828=20from=20=E9=97=AB=E9=B9=8F=E5=85=A8/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUQueue.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 2fa4c4766a..d7ac32a79c 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -330,7 +330,7 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) #endif if (!error_msg.empty()) { - ASCEND_LOGE(error_msg); + ASCEND_LOGE("%s", error_msg.c_str()); } if (check_error && !runtime_error.empty()) { throw std::runtime_error(runtime_error); -- Gitee From 6d50f215ecd8cdc235ffa092ea7272cf03fe4fcf Mon Sep 17 00:00:00 2001 From: yuliangbin <1416490440@qq.com> Date: Thu, 10 Jul 2025 13:11:14 +0000 Subject: [PATCH 231/328] =?UTF-8?q?!22879=20[torch=5F2.7.1]=E5=8A=A8?= =?UTF-8?q?=E6=80=81Profiling=E4=B8=ADmsmonitor=E7=8E=AF=E5=A2=83=E5=8F=98?= =?UTF-8?q?=E9=87=8F=E5=90=8D=E6=9B=B4=E6=94=B9=20Merge=20pull=20request?= =?UTF-8?q?=20!22879=20from=20yuliangbin/monitor=5F2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/profiler/_non_intrusive_profile.py | 14 +++++++++++--- .../analysis/prof_common_func/_constant.py | 17 ++++++++++------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/torch_npu/profiler/_non_intrusive_profile.py b/torch_npu/profiler/_non_intrusive_profile.py index a60303adec..c4ce45223b 100644 --- a/torch_npu/profiler/_non_intrusive_profile.py +++ b/torch_npu/profiler/_non_intrusive_profile.py @@ -8,7 +8,7 @@ from ..utils._path_manager import PathManager from ._dynamic_profiler._dynamic_profiler_utils import DynamicProfilerUtils from .dynamic_profile import init as dp_init from .dynamic_profile import step as dp_step -from .analysis.prof_common_func._constant import print_error_msg +from .analysis.prof_common_func._constant import print_error_msg, print_warn_msg __all__ = [ @@ -59,11 +59,19 @@ class _NonIntrusiveProfile: @staticmethod def init(): prof_config_path = os.getenv("PROF_CONFIG_PATH", "") - dyno_enable_flag = os.getenv("KINETO_USE_DAEMON", 0) + kine_to_value = os.getenv("KINETO_USE_DAEMON") + msmonitor_value = os.getenv("MSMONITOR_USE_DAEMON") + + if kine_to_value is not None: + print_warn_msg( + "Environment variable 'KINETO_USE_DAEMON' will be deprecated. " + "Please use 'MSMONITOR_USE_DAEMON' instead." 
+ ) + dyno_enable_flag = msmonitor_value or kine_to_value or 0 try: dyno_enable_flag = int(dyno_enable_flag) except ValueError: - print_error_msg("Environment variable KINETO_USE_DAEMON value not valid, will be set to 0 !") + print_error_msg("Environment variable 'MSMONITOR_USE_DAEMON' value not valid, will be set to 0 !") dyno_enable_flag = 0 if not prof_config_path and dyno_enable_flag != 1: return diff --git a/torch_npu/profiler/analysis/prof_common_func/_constant.py b/torch_npu/profiler/analysis/prof_common_func/_constant.py index 56809c9b7f..1a62c54d6f 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_constant.py +++ b/torch_npu/profiler/analysis/prof_common_func/_constant.py @@ -1,5 +1,5 @@ import os -from datetime import datetime +import time from typing import Union from torch_npu.utils._error_code import ErrCode, prof_error @@ -217,20 +217,23 @@ class Constant(object): def print_info_msg(message: str): - time_str = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") - print(f"{time_str} [INFO] [{os.getpid()}] profiler.py: {message}") + current_time = time.localtime() + time_str = time.strftime("[%Y-%m-%d %H:%M:%S]", current_time) + print(f"{time_str} [INFO] [{os.getpid()}] profiler.py: {message}", flush=True) def print_warn_msg(message: str): if not _should_print_warning(): return - time_str = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") - print(f"{time_str} [WARNING] [{os.getpid()}] profiler.py: {message}") + current_time = time.localtime() + time_str = time.strftime("[%Y-%m-%d %H:%M:%S]", current_time) + print(f"{time_str} [WARNING] [{os.getpid()}] profiler.py: {message}", flush=True) def print_error_msg(message: str): - time_str = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") - print(f"{time_str} [ERROR] [{os.getpid()}] profiler.py: {message}") + current_time = time.localtime() + time_str = time.strftime("[%Y-%m-%d %H:%M:%S]", current_time) + print(f"{time_str} [ERROR] [{os.getpid()}] profiler.py: {message}", flush=True) def convert_ns2us_float(ns) -> float: -- Gitee From cb4127a07cc785393e4e3dccf1e76756defc9f71 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 10 Jul 2025 13:54:18 +0000 Subject: [PATCH 232/328] !22908 Update op_plugin commit id Merge pull request !22908 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 24023a4a03..4605932600 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 24023a4a032545cc75cc7bef80df42a608cd593f +Subproject commit 4605932600354c1ab56464418c294f6648a73d90 -- Gitee From 6791ee88e22bdd6d9a6270b06a24de2473c0cdd4 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Fri, 11 Jul 2025 10:25:51 +0000 Subject: [PATCH 233/328] !22698 Update torchair commit id Merge pull request !22698 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 952cfa98cc..08761e2972 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 952cfa98cc8edd67813d39c567fe8d76b6d44a7c +Subproject commit 08761e2972d0c8021f27ede1c9032bfa5f46edf4 -- Gitee From 002a352e55ba95ae2c526809e1b81ddbb1ea6aee Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 11 Jul 2025 11:09:22 +0000 Subject: [PATCH 234/328] !22941 Update op_plugin commit id Merge pull request !22941 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 4605932600..ac64f4df7c 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 4605932600354c1ab56464418c294f6648a73d90 +Subproject commit ac64f4df7c6cf776d3fb3de61e0e6d88a2d22d46 -- Gitee From fcc473c755be324fd2fa73ff8f22681f66923bb2 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 11 Jul 2025 14:09:22 +0000 Subject: [PATCH 235/328] !22947 Update op_plugin commit id Merge pull request !22947 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index ac64f4df7c..dd0799bb3b 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit ac64f4df7c6cf776d3fb3de61e0e6d88a2d22d46 +Subproject commit dd0799bb3b0e9546d8ebb0e4d10a20a144394651 -- Gitee From 17d413580d6be880ba31ceb6f453151af4b7d4f7 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 11 Jul 2025 14:09:22 +0000 Subject: [PATCH 236/328] !22947 Update op_plugin commit id Merge pull request !22947 from pta-robot/v2.7.1 -- Gitee From c630cecb3c4c27b905999bb5fdc5af38440cf218 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Fri, 11 Jul 2025 22:23:04 +0000 Subject: [PATCH 237/328] !22957 Update torchair commit id Merge pull request !22957 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 08761e2972..edf95b3a70 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 08761e2972d0c8021f27ede1c9032bfa5f46edf4 +Subproject commit edf95b3a70ccd0fcb90a935cfa9836879df9453d -- Gitee From 455e85bc92f2bec6f7f9eaa32196ae53f9072eb1 Mon Sep 17 00:00:00 2001 From: shaoyf Date: Sat, 12 Jul 2025 03:39:55 +0000 Subject: [PATCH 238/328] !22372 Enable the IsOpInputBaseFormat function to support faketensor Merge pull request !22372 from shaoyf/217_sdpa --- torch_npu/csrc/framework/FormatHelper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/framework/FormatHelper.cpp b/torch_npu/csrc/framework/FormatHelper.cpp index 9bd270b8fd..abc4936d30 100644 --- a/torch_npu/csrc/framework/FormatHelper.cpp +++ b/torch_npu/csrc/framework/FormatHelper.cpp @@ -135,7 +135,7 @@ FormatShape FormatHelper::GetStorageSizes(const torch_npu::NPUStorageDesc &desc) bool FormatHelper::IsOpInputBaseFormat(const at::Tensor &tensor) { - if (!torch_npu::utils::is_npu(tensor)) { + if (!torch_npu::utils::is_npu(tensor) || (typeid(*tensor.storage().unsafeGetStorageImpl()) != typeid(torch_npu::NPUStorageImpl))) { return true; } const auto format = torch_npu::NPUBridge::GetNpuStorageImplDesc(tensor).npu_format_; -- Gitee From 5353066da4fc291f6db144d71289652231186f09 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 12 Jul 2025 04:54:22 +0000 Subject: [PATCH 239/328] !22963 Update op_plugin commit id Merge pull request !22963 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index dd0799bb3b..161f835137 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit dd0799bb3b0e9546d8ebb0e4d10a20a144394651 +Subproject commit 161f835137eaa0ca36e62202c141dfbde80babfe -- Gitee From 52f826776b7c709e41cb01aa1ccd7e1be36e1e3c Mon Sep 17 00:00:00 2001 From: 
torchair_robot Date: Sat, 12 Jul 2025 22:16:41 +0000 Subject: [PATCH 240/328] !22976 Update torchair commit id Merge pull request !22976 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index edf95b3a70..ec5747ba54 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit edf95b3a70ccd0fcb90a935cfa9836879df9453d +Subproject commit ec5747ba5477a4508131ca4401088e7383908266 -- Gitee From 2ac57edc2cd66390140ebcc4f7eaed7302fa2b58 Mon Sep 17 00:00:00 2001 From: wgb Date: Mon, 14 Jul 2025 01:13:19 +0000 Subject: [PATCH 241/328] !22817 from blob add deleter Merge pull request !22817 from wgb/v2.7.1 --- test/cpp_extensions/extension.cpp | 12 ++++ .../test/test_cpp_extensions_aot.py | 1 + torch_npu/csrc/aten/common/from_blob.cpp | 55 ++++++++++++++++++- torch_npu/csrc/aten/common/from_blob.h | 31 +++++++++++ 4 files changed, 98 insertions(+), 1 deletion(-) diff --git a/test/cpp_extensions/extension.cpp b/test/cpp_extensions/extension.cpp index 636982882d..8d3a62f1ac 100644 --- a/test/cpp_extensions/extension.cpp +++ b/test/cpp_extensions/extension.cpp @@ -48,6 +48,17 @@ bool check_from_blob() return dtype_same && num_same && pos1_same && pos2_same && pos3_same && sub_same; } +bool check_from_blob_delete() +{ + int isgone = 0; + { + auto data = torch::tensor({1.0, 2.0, 3.0}, torch::kFloat).to(at::Device("npu:0")); + auto res = at_npu::native::from_blob(data.data_ptr(), data.sizes(), [&](void*) { isgone++; }); + } + bool is_deleted = (isgone == 1); + return is_deleted; +} + bool check_from_blob_strides() { auto data = torch::tensor({1, 2, 3, 4, 5, 6, 7, 8, 9}, torch::kInt32).to(at::Device("npu:0")); @@ -131,6 +142,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) m.def("check_storage_sizes", &check_storage_sizes, "check_storage_sizes"); m.def("check_from_blob", &check_from_blob, "check_from_blob"); m.def("check_from_blob_strides", &check_from_blob_strides, "check_from_blob_strides"); + m.def("check_from_blob_delete", &check_from_blob_delete, "check_from_blob_delete"); m.def("blocking_ops", &blocking_ops, "blocking_ops"); m.def("register_op_hook", ®ister_op_hook, "register_op_hook"); m.def("get_op_hook_call_count", &get_op_hook_call_count, "get_op_hook_call_count"); diff --git a/test/cpp_extensions/test/test_cpp_extensions_aot.py b/test/cpp_extensions/test/test_cpp_extensions_aot.py index 83650c5b3b..84175ed0ac 100644 --- a/test/cpp_extensions/test/test_cpp_extensions_aot.py +++ b/test/cpp_extensions/test/test_cpp_extensions_aot.py @@ -53,6 +53,7 @@ class TestCppExtensionAOT(TestCase): def test_from_blob(self): self.assertTrue(npu_extension.check_from_blob()) self.assertTrue(npu_extension.check_from_blob_strides()) + self.assertTrue(npu_extension.check_from_blob_delete()) def test_dispatch_allreduce(self): flags = os.O_WRONLY | os.O_RDONLY | os.O_CREAT diff --git a/torch_npu/csrc/aten/common/from_blob.cpp b/torch_npu/csrc/aten/common/from_blob.cpp index 08f2e63fd2..1363d69459 100644 --- a/torch_npu/csrc/aten/common/from_blob.cpp +++ b/torch_npu/csrc/aten/common/from_blob.cpp @@ -36,7 +36,12 @@ at::Tensor TensorMaker::make_tensor() std::size_t size_bytes = computeStorageSize(); - c10::DataPtr data_ptr{data_, *device_}; + c10::DataPtr data_ptr{}; + if (deleter_) { + data_ptr = c10::InefficientStdFunctionContext::makeDataPtr(data_, std::move(deleter_), *device_); + } else { + data_ptr = c10::DataPtr(data_, *device_); + } 
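+    // When a deleter is provided, the DataPtr owns it and invokes it once the storage holding this memory is released.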
c10::intrusive_ptr storage_impl = torch_npu::make_npu_storage_impl( c10::StorageImpl::use_byte_size_t(), @@ -86,6 +91,54 @@ std::size_t TensorMaker::computeStorageSize() const noexcept return storage_size; } +at::Tensor from_blob( + void* data, + at::IntArrayRef sizes, + std::function deleter, + const at::TensorOptions& options, + const c10::optional target_device) +{ + return for_blob(data, sizes) + .deleter(std::move(deleter)) + .options(options) + .target_device(target_device) + .make_tensor(); +} + +at::Tensor from_blob( + void* data, + at::IntArrayRef sizes, + at::IntArrayRef strides, + int64_t storage_offset, + const std::function& deleter, + const at::TensorOptions& options, + const c10::optional target_device) +{ + return for_blob(data, sizes) + .strides(strides) + .storage_offset(storage_offset) + .deleter(deleter) + .options(options) + .target_device(target_device) + .make_tensor(); +} + +at::Tensor from_blob( + void* data, + at::IntArrayRef sizes, + at::IntArrayRef strides, + const std::function& deleter, + const at::TensorOptions& options, + const c10::optional target_device) +{ + return for_blob(data, sizes) + .strides(strides) + .deleter(deleter) + .options(options) + .target_device(target_device) + .make_tensor(); +} + at::Tensor from_blob( void* data, at::IntArrayRef sizes, diff --git a/torch_npu/csrc/aten/common/from_blob.h b/torch_npu/csrc/aten/common/from_blob.h index f0d6bbd127..0669d2fdca 100644 --- a/torch_npu/csrc/aten/common/from_blob.h +++ b/torch_npu/csrc/aten/common/from_blob.h @@ -41,6 +41,12 @@ public: return *this; } + TensorMaker& deleter(std::function value) noexcept + { + deleter_ = std::move(value); + + return *this; + } at::Tensor make_tensor(); private: @@ -58,6 +64,7 @@ private: c10::optional device_{}; at::TensorOptions opts_{}; c10::Allocator* allocator_{}; + std::function deleter_{}; }; inline TensorMaker for_blob(void* data, at::IntArrayRef sizes) noexcept @@ -65,6 +72,30 @@ inline TensorMaker for_blob(void* data, at::IntArrayRef sizes) noexcept return TensorMaker{data, sizes}; } +TORCH_NPU_API at::Tensor from_blob( + void* data, + at::IntArrayRef sizes, + std::function deleter, + const at::TensorOptions& options = {}, + const c10::optional target_device = c10::nullopt); + +TORCH_NPU_API at::Tensor from_blob( + void* data, + at::IntArrayRef sizes, + at::IntArrayRef strides, + int64_t storage_offset, + const std::function& deleter, + const at::TensorOptions& options = {}, + const c10::optional target_device = c10::nullopt); + +TORCH_NPU_API at::Tensor from_blob( + void* data, + at::IntArrayRef sizes, + at::IntArrayRef strides, + const std::function& deleter, + const at::TensorOptions& options = {}, + const c10::optional target_device = c10::nullopt); + TORCH_NPU_API at::Tensor from_blob( void* data, at::IntArrayRef sizes, -- Gitee From 71e998846953ff9630940f128b51c6a664464758 Mon Sep 17 00:00:00 2001 From: zhangqiongwen Date: Mon, 14 Jul 2025 02:50:56 +0000 Subject: [PATCH 242/328] !22802 add fsdp patch for foreach_copy and finalize_backward Merge pull request !22802 from zhangqiongwen/v2.7.1_foreach_copy_patch --- .../fsdp2/test_fully_shard_autograd.py | 324 ++++++++++++++++++ .../fsdp2/test_fully_shard_frozen.py | 259 ++++++++++++++ torch_npu/__init__.py | 2 + torch_npu/distributed/fsdp/_add_fsdp_patch.py | 88 +++++ .../distributed/fsdp/_fsdp_collectives.py | 17 +- 5 files changed, 689 insertions(+), 1 deletion(-) create mode 100644 test/distributed/fsdp2/test_fully_shard_autograd.py create mode 100644 
test/distributed/fsdp2/test_fully_shard_frozen.py create mode 100644 torch_npu/distributed/fsdp/_add_fsdp_patch.py diff --git a/test/distributed/fsdp2/test_fully_shard_autograd.py b/test/distributed/fsdp2/test_fully_shard_autograd.py new file mode 100644 index 0000000000..01ab916026 --- /dev/null +++ b/test/distributed/fsdp2/test_fully_shard_autograd.py @@ -0,0 +1,324 @@ +import collections +import copy +import functools +import itertools +import unittest +from typing import Any, List, Optional, Type, Union + +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.distributed.fsdp import fully_shard +from torch.nn.parallel.scatter_gather import _is_namedtuple +from torch.testing._internal.common_fsdp import ( + check_sharded_parity, + DoubleLinear, + FSDPTestMultiThread, + MLP, +) +from torch.testing._internal.common_utils import run_tests +from torch.testing._internal.distributed._tensor.common_dtensor import ( + ModelArgs, + Transformer, +) + +import torch_npu +from torch_npu.testing.common_utils import SupportedDevices +from torch_npu.testing._internal.common_fsdp import FSDPNPUTest + +torch.use_deterministic_algorithms(True) + + +class TestFullyShardAutograd(FSDPNPUTest): + @property + def world_size(self) -> int: + return min(4, torch.npu.device_count()) + + def _reduce_1d_partial_grads( + self, module: nn.Module, group: Optional[dist.ProcessGroup] = None + ) -> None: + group = group or dist.distributed_c10d._get_default_group() + for param in module.parameters(): + if param.grad is not None: + param.grad.div_(group.size()) + + def test_unused_forward_output(self): + """ + Tests that gradients propagate when running a backward where some + forward output is not used to compute the loss. + """ + self.run_subtests( + {"reshard_after_forward": [True, False, 2]}, + self._test_unused_forward_output, + ) + + def _test_unused_forward_output(self, reshard_after_forward: Union[bool, int]): + torch.manual_seed(42) + local_batch_size = 2 + global_batch_size, dim = (self.world_size * local_batch_size, 24) + model = DoubleLinear(dim=dim, use_second_linear=True) + ref_model = copy.deepcopy(model).npu() + fully_shard(model.lin1, reshard_after_forward=reshard_after_forward) + fully_shard(model, reshard_after_forward=reshard_after_forward) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + + torch.manual_seed(1) # same on all ranks + for iter_idx in range(10): + # Use all forward outputs in the loss/backward for the first half + # of the iterations and only the 1st forward output for the rest + global_inp = torch.rand((global_batch_size, dim), device="npu") + local_inp = global_inp[self.rank * local_batch_size:(self.rank + 1) * local_batch_size].detach() + out1, out2 = model(local_inp) + loss = (out1 * out2).sum() if iter_idx < 3 else out1.sum() + loss.backward() + optim.step() + ref_out1, ref_out2 = ref_model(global_inp) + ref_loss = (ref_out1 * ref_out2).sum() if iter_idx < 3 else ref_out1.sum() + ref_loss.backward() + self._reduce_1d_partial_grads(ref_model) + ref_optim.step() + dist.all_reduce(loss) # partial -> replicated + self.assertEqual(loss, ref_loss) + optim.zero_grad(set_to_none=(iter_idx % 2)) + ref_optim.zero_grad(set_to_none=(iter_idx % 2)) + check_sharded_parity(self, ref_model, model) + + def test_unused_forward_module(self): + """ + Tests that gradients propagate when running a backward where some + forward module is not used to compute the loss. 
+ """ + self.run_subtests( + {"reshard_after_forward": [True, False, 2]}, + self._test_unused_forward_module, + ) + + def _test_unused_forward_module(self, reshard_after_forward: Union[bool, int]): + torch.manual_seed(42) + local_batch_size, dim = (2, 24) + global_batch_size = self.world_size * local_batch_size + model = DoubleLinear(dim=dim, use_second_linear=False) + ref_model = copy.deepcopy(model).npu() + fully_shard(model.lin1, reshard_after_forward=reshard_after_forward) + fully_shard(model.lin2, reshard_after_forward=reshard_after_forward) + fully_shard(model, reshard_after_forward=reshard_after_forward) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + + torch.manual_seed(1) # same on all ranks + for iter_idx in range(10): + global_inp = torch.rand((global_batch_size, dim), device="npu") + local_inp = global_inp[self.rank * local_batch_size:(self.rank + 1) * local_batch_size].detach() + losses: List[torch.Tensor] = [] + for _model, inp in ((ref_model, global_inp), (model, local_inp)): + losses.append(_model(inp).sum()) + losses[-1].backward() + self._reduce_1d_partial_grads(ref_model) + dist.all_reduce(losses[1]) # partial -> replicated + self.assertEqual(losses[0], losses[1]) + check_sharded_parity(self, ref_model, model) + for _optim in (optim, ref_optim): + _optim.step() + _optim.zero_grad(set_to_none=(iter_idx % 2)) + + def test_nontensor_activations(self): + """ + Tests that gradients propagate when running forward with nontensor + data structures wrapping the activations. This is mainly to test the + hook registration. + """ + self.run_subtests( + {"container_type": [list, collections.namedtuple, tuple, dict]}, + self._test_nontensor_activations, + ) + + def _test_nontensor_activations(self, container_type: Type): + class Module(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.lin1 = nn.Linear(dim, dim) + self.lin2 = nn.Linear(dim, dim) + self.relu = nn.ReLU() + + def forward(self, inp: Any): + # Assume that the "0th" element of `inp` is a tensor, run some + # forward computation on it, and pack it back into the same + # data structure type as `inp` + if isinstance(inp, list): + return [self._forward(inp[0])] + elif _is_namedtuple(inp): + return type(inp)(*([self._forward(inp[0])] + list(inp[1:]))) + elif isinstance(inp, tuple): + return (self._forward(inp[0]),) + elif isinstance(inp, dict): + return {"x": self._forward(inp["x"])} + else: + raise NotImplementedError( + f"Unsupported input type {type(inp)}: {inp}" + ) + + def _forward(self, x: torch.Tensor) -> torch.Tensor: + return self.relu(self.lin2(self.relu(self.lin1(x)))) + + class ToContainerType(nn.Module): + def __init__(self, container_type: Type): + super().__init__() + self.container_type = container_type + + def forward(self, x: torch.Tensor): + if self.container_type is list: + return [x] + elif self.container_type is collections.namedtuple: + nt = collections.namedtuple("NT", "x y") + return nt(x, torch.ones_like(x)) + elif self.container_type is tuple: + return (x,) + elif self.container_type is dict: + return {"x": x} + else: + raise NotImplementedError( + f"Unsupported container type: {self.container_type}" + ) + + class FromContainerType(nn.Module): + def __init__(self, container_type: Type): + super().__init__() + self.container_type = container_type + + def forward(self, x: torch.Tensor): + if self.container_type in (list, collections.namedtuple, tuple): + return x[0] + elif self.container_type is dict: + return 
x["x"] + else: + raise NotImplementedError( + f"Unsupported container type: {self.container_type}" + ) + + torch.manual_seed(42) + local_batch_size, dim = (2, 24) + global_batch_size = self.world_size * local_batch_size + model = nn.Sequential( + ToContainerType(container_type), + Module(dim), + Module(dim), + Module(dim), + FromContainerType(container_type), + ) + ref_model = copy.deepcopy(model).npu() + for module in model: + fully_shard(module) + fully_shard(model) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + + torch.manual_seed(1) # same on all ranks + for iter_idx in range(10): + global_inp = torch.rand((global_batch_size, dim), device="npu") + local_inp = global_inp[self.rank * local_batch_size:(self.rank + 1) * local_batch_size].detach() + losses: List[torch.Tensor] = [] + for _model, inp in ((ref_model, global_inp), (model, local_inp)): + losses.append(_model(inp).sum()) + losses[-1].backward() + self._reduce_1d_partial_grads(ref_model) + dist.all_reduce(losses[1]) # partial -> replicated + self.assertEqual(losses[0], losses[1]) + check_sharded_parity(self, ref_model, model) + for _optim in (optim, ref_optim): + _optim.step() + _optim.zero_grad(set_to_none=(iter_idx % 2)) + + +class TestFullyShardPostAccGradHookMultiThread(FSDPTestMultiThread): + @property + def world_size(self) -> int: + return 2 + + def perThreadSetUp(self): + super().perThreadSetUp() + torch.npu.set_device(0) + + @SupportedDevices(['Ascend910B']) + def test_post_acc_grad_hook_runs(self): + param_name_to_hook_count = collections.defaultdict(int) + + def hook(param_name: str, param: torch.Tensor) -> None: + nonlocal param_name_to_hook_count + param_name_to_hook_count[param_name] += 1 + + model = MLP(8) + for module in (model.in_proj, model.out_proj, model): + fully_shard(module) + for param_name, param in model.named_parameters(): + param_hook = functools.partial(hook, param_name) + param.register_post_accumulate_grad_hook(param_hook) + + inp = torch.randn((2, 8), device="npu") + model(inp).sum().backward() + param_names = {param_name for param_name, _ in model.named_parameters()} + self.assertEqual(param_names, set(param_name_to_hook_count.keys())) + for _, count in param_name_to_hook_count.items(): + self.assertEqual(count, 1) + + +class TestFullyShardPostAccGradHookMultiProcess(FSDPNPUTest): + @property + def world_size(self) -> int: + return min(torch.npu.device_count(), 2) + + @SupportedDevices(['Ascend910B']) + def test_post_acc_grad_hook_optim_parity(self): + """ + Tests parity of running the optimizer via the post-accumulate-grad + hook vs. normally. 
+ """ + torch.manual_seed(42) + model_args = ModelArgs(dropout_p=0.0) + model = Transformer(model_args) + + ref_model = copy.deepcopy(model).npu() + for module in itertools.chain(ref_model.layers, [ref_model]): + fully_shard(module) + optim_kwargs = {"lr": 1e-2, "foreach": False} + ref_optim = torch.optim.AdamW(ref_model.parameters(), **optim_kwargs) + lr_scheduler_kwargs = {"step_size": 5} + ref_lr_scheduler = torch.optim.lr_scheduler.StepLR( + ref_optim, **lr_scheduler_kwargs + ) + + for module in itertools.chain(model.layers, [model]): + fully_shard(module) + param_to_optim = {} + param_to_lr_scheduler = {} + for param in model.parameters(): + param_to_optim[param] = torch.optim.AdamW([param], **optim_kwargs) + param_to_lr_scheduler[param] = torch.optim.lr_scheduler.StepLR( + param_to_optim[param], **lr_scheduler_kwargs + ) + + def optim_hook(param: nn.Parameter) -> None: + param_to_optim[param].step() + param_to_optim[param].zero_grad() + param_to_lr_scheduler[param].step() + + for param in model.parameters(): + param.register_post_accumulate_grad_hook(optim_hook) + + torch.manual_seed(42 + self.rank) + inp = torch.randint(0, model_args.vocab_size, (2, 16), device="npu") + for _ in range(10): + ref_loss = ref_model(inp).sum() + ref_loss.backward() + ref_optim.step() + ref_optim.zero_grad() + ref_lr_scheduler.step() + loss = model(inp).sum() + loss.backward() + self.assertTrue(torch.equal(ref_loss, loss)) + for ref_param, param in zip(ref_model.parameters(), model.parameters()): + self.assertTrue(torch.equal(ref_param, param)) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp2/test_fully_shard_frozen.py b/test/distributed/fsdp2/test_fully_shard_frozen.py new file mode 100644 index 0000000000..1e9aa6e064 --- /dev/null +++ b/test/distributed/fsdp2/test_fully_shard_frozen.py @@ -0,0 +1,259 @@ +import copy +import functools +import itertools +from typing import List, Union + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +from torch.distributed._composable import checkpoint, replicate +from torch.distributed.fsdp import fully_shard +from torch.distributed.fsdp._fully_shard._fsdp_param_group import ( + RegisterPostBackwardFunction, +) +from torch.testing._internal.common_fsdp import ( + check_sharded_parity, + MLP, + patch_reduce_scatter, + patch_register_post_backward_hook_backward, + reduce_scatter_with_assert, +) +from torch.testing._internal.common_utils import run_tests + +import torch_npu +from torch_npu.testing._internal.common_fsdp import FSDPNPUTest + + +class TestFullyShardFrozen(FSDPNPUTest): + @property + def world_size(self) -> int: + return min(4, torch.npu.device_count()) + + def test_train_mixed_requires_grad_per_group(self): + """ + Tests training parity with DDP when mixing frozen and non-frozen + parameters in the same FSDP communication group. This checks that + the reduce-scatters reduce the expected numel and that they are called + via the custom autograd function backward (i.e. that they are not + delayed until the end of backward). 
+ """ + self.run_subtests( + { + "reshard_after_forward": [False, True, 2], + "use_activation_checkpointing": [False, True], + "freeze_after_init": [False, True], + }, + self._test_train_mixed_requires_grad_per_group, + ) + + def _test_train_mixed_requires_grad_per_group( + self, + reshard_after_forward: Union[bool, int], + use_activation_checkpointing: bool, + freeze_after_init: bool, + ): + torch.manual_seed(42) + num_mlps, lin_dim = (3, 32) + model = nn.Sequential( + *[MLP(lin_dim, torch.device("cpu")) for _ in range(num_mlps)] + ) + # Train biases only (e.g. like BitFit) + if not freeze_after_init: + for param_name, param in model.named_parameters(): + if "bias" not in param_name: + param.requires_grad_(False) + ref_model = replicate( + copy.deepcopy(model).npu(), + device_ids=[self.rank], + find_unused_parameters=freeze_after_init, + ) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + for mlp in model: + if use_activation_checkpointing: + checkpoint(mlp) + fully_shard(mlp, reshard_after_forward=reshard_after_forward) + fully_shard(model, reshard_after_forward=reshard_after_forward) + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + orig_reduce_scatter = dist.reduce_scatter_tensor + if freeze_after_init: + for param_name, param in itertools.chain( + model.named_parameters(), ref_model.named_parameters() + ): + if "bias" not in param_name: + param.requires_grad_(False) + for mlp in model: + if not isinstance(mlp, MLP): + raise AssertionError("The reduce-scatter numel check assumes the model consists of " + f"only the same MLP class but got {type(mlp)}") + expected_numel = sum( + p._local_tensor.numel() + for n, p in model[0].named_parameters() + if "bias" in n + ) + + def assert_fn(output: torch.Tensor): + self.assertEqual(output.numel(), expected_numel) + + reduce_scatter = functools.partial( + reduce_scatter_with_assert, self, orig_reduce_scatter, assert_fn + ) + orig_backward = RegisterPostBackwardFunction.backward + backward_count = 0 + + def backward_with_count(*args, **kwargs): + nonlocal backward_count + backward_count += 1 + return orig_backward(*args, **kwargs) + + torch.manual_seed(42 + self.rank + 1) + device = torch.device("npu") + with patch_reduce_scatter( + reduce_scatter + ), patch_register_post_backward_hook_backward(backward_with_count): + for iter_idx in range(10): + inp = torch.randn((8, lin_dim), device=device) + losses: List[torch.Tensor] = [] + for _model, _optim in ((ref_model, ref_optim), (model, optim)): + _optim.zero_grad(set_to_none=(iter_idx % 2 == 0)) + losses.append(_model(inp).sum()) + losses[-1].backward() + _optim.step() + check_sharded_parity(self, ref_model, model) + self.assertEqual(losses[0], losses[1]) + # Check that the post-backward hooks ran through the autograd + # backward, not the final callback (except possibly that of the + # first MLP, which does not have an input that requires grad) + self.assertTrue(backward_count >= num_mlps - 1) + + def test_train_mixed_requires_grad_across_groups(self): + """ + Tests training parity with DDP when mixing frozen and non-frozen + parameters across different FSDP communication groups, including + possibly unfreezing parameters. 
+ """ + self.run_subtests( + { + "reshard_after_forward": [False, True, 2], + "unfreeze_params": [False, True], + }, + self._test_train_mixed_requires_grad_across_groups, + ) + + def _test_train_mixed_requires_grad_across_groups( + self, + reshard_after_forward: Union[bool, int], + unfreeze_params: bool, + ): + torch.manual_seed(42) + num_linears, lin_dim = (6, 32) + modules: List[nn.Module] = [] + for _ in range(num_linears): + modules += [nn.Linear(lin_dim, lin_dim), nn.ReLU()] + model = nn.Sequential(*modules) + ref_model = replicate( + copy.deepcopy(model).npu(), + device_ids=[self.rank], + find_unused_parameters=True, + ) + for module in model.modules(): + if isinstance(module, nn.Linear): + fully_shard(module, reshard_after_forward=reshard_after_forward) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + orig_backward = RegisterPostBackwardFunction.backward + backward_count = 0 + + def _set_requires_grad(seq: nn.Module, requires_grad: bool): + for i in range(num_linears): + # Interleave frozen -> non-frozen -> ... linears + if i % 2 == 0: + for param in seq[i % 2].parameters(): + param.requires_grad_(requires_grad) + + def backward_with_count(*args, **kwargs): + nonlocal backward_count + backward_count += 1 + return orig_backward(*args, **kwargs) + + _set_requires_grad(model, False) + _set_requires_grad(ref_model, False) + num_iters, no_grad_iter_idx = (3, 1) + torch.manual_seed(42 + self.rank) + inp = torch.randn((8, lin_dim), device="npu") + with patch_register_post_backward_hook_backward(backward_with_count): + for iter_idx in range(num_iters): + losses: List[torch.Tensor] = [] + for _model, _optim in ((ref_model, ref_optim), (model, optim)): + # Unfreeze the parameters on the last step to emulate some + # kinds of fine-tuning + if unfreeze_params and iter_idx == num_iters - 1: + _set_requires_grad(model, True) + if iter_idx == no_grad_iter_idx: + with torch.no_grad(): + losses.append(_model(inp).sum()) + else: + losses.append(_model(inp).sum()) + losses[-1].backward() + _optim.step() + _optim.zero_grad(set_to_none=(iter_idx % 2 == 0)) + self.assertEqual(losses[0], losses[1]) + # Check that the post-backward hooks ran through the autograd + # backward, not the final callback (except possibly that of the + # first linear, which does not have an input that requires grad) + self.assertTrue(backward_count >= num_linears - 1) + + def test_multi_forward_mixed_requires_grad(self): + """ + Tests training parity with DDP when having trainable and frozen modules + that participate multiple times in forward. 
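+        Both linears are reused across iterations of the forward loop, and the
+        trainable one is additionally called under torch.no_grad().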
+ """ + self.run_subtests( + {"reshard_after_forward": [True, False, 2]}, + self._test_multi_forward_mixed_requires_grad, + ) + + def _test_multi_forward_mixed_requires_grad( + self, + reshard_after_forward: Union[bool, int], + ): + class MultiForwardModule(nn.Module): + def __init__(self, device: torch.device): + super().__init__() + self.layer_0 = nn.Linear(5, 5, device=device) + self.layer_no_grad = nn.Linear(5, 5, device=device) + self.layer_with_grad = nn.Linear(5, 5, device=device) + self.layer_no_grad.requires_grad_(False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.layer_0(x) + for _ in range(3): + x = self.layer_no_grad(F.relu(self.layer_with_grad(x))) + # Make sure that calling the same layer multiple times + # works regardless whether gradient is enabled + with torch.no_grad(): + x += F.relu(self.layer_with_grad(x)) + return x + + torch.manual_seed(42) + model = MultiForwardModule(torch.device("cpu")) + ref_model = replicate(copy.deepcopy(model).npu(), device_ids=[self.rank]) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + for module in model.modules(): + if isinstance(module, nn.Linear): + fully_shard(module, reshard_after_forward=reshard_after_forward) + fully_shard(model, reshard_after_forward=reshard_after_forward) + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + for iter_idx in range(10): + inp = torch.randn((8, 5), device="npu") + losses: List[torch.Tensor] = [] + for _model, _optim in ((ref_model, ref_optim), (model, optim)): + _optim.zero_grad(set_to_none=(iter_idx % 2 == 0)) + losses.append(_model(inp).sum()) + losses[-1].backward() + _optim.step() + self.assertEqual(losses[0], losses[1]) + + +if __name__ == "__main__": + run_tests() diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index 2d2a84d3ae..d811a332a6 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -79,6 +79,7 @@ import torch_npu.utils.custom_ops import torch_npu.distributed.rpc import torch_npu.op_plugin from torch_npu.profiler._add_mstx_patch import _apply_mstx_patch +from torch_npu.distributed.fsdp._add_fsdp_patch import _apply_fsdp_patch from torch_npu.distributed.rpc.backend_registry import _rpc_backend_registry from torch_npu.utils import _cann_package_check, _add_intercept_methods from torch_npu.utils import _register_ops_under_dtensor_rules @@ -174,6 +175,7 @@ def _apply_class_patches(): _apply_distributed_methods_patch() _apply_mstx_patch() _add_reductions_methods() + _apply_fsdp_patch() def _apply_distributed_methods_patch(): diff --git a/torch_npu/distributed/fsdp/_add_fsdp_patch.py b/torch_npu/distributed/fsdp/_add_fsdp_patch.py new file mode 100644 index 0000000000..6bc07049be --- /dev/null +++ b/torch_npu/distributed/fsdp/_add_fsdp_patch.py @@ -0,0 +1,88 @@ +from typing import cast + +import torch +from torch import distributed as dist +from torch.distributed.fsdp._fully_shard._fsdp_common import compiled_autograd_enabled, TrainingState +from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam, ShardedState +from torch.distributed.fsdp._fully_shard._fsdp_param_group import FSDPParamGroup, AllGatherState + +import torch_npu + + +def _patched_finalize_backward(self): + self._wait_for_post_backward() + for fsdp_param in self.fsdp_params: + if fsdp_param.grad_offload_event is not None: + fsdp_param.grad_offload_event.synchronize() + fsdp_param.grad_offload_event = None + if self._all_gather_result is not None: + # If there was a mistargeted unshard without a corresponding wait, + # then we wait here and clear 
the unshard + event = self._all_gather_result.all_gather_event + if event is not None: + torch.npu.current_stream().wait_event(event) + work = self._all_gather_result.all_gather_work + if isinstance(work, dist.distributed_c10d.Work): + work.wait() + self._all_gather_result = None + self._post_forward_indices.clear() + + +def _get_param_all_gather_inputs( + fsdp_params: list[FSDPParam], +) -> list[list[torch.Tensor]]: + if compiled_autograd_enabled(): + return [fsdp_param.all_gather_inputs for fsdp_param in fsdp_params] + + # Intentionally try to run a fast-path that bypasses abstractions for the + # common FSDP case of bf16/fp32 mixed precision in order to use foreach + # copy for lower CPU overhead and more efficient copying in eager + def use_foreach_copy(fsdp_param: FSDPParam) -> bool: + return ( + fsdp_param.param_dtype is not None + and not fsdp_param.offload_to_cpu + and not hasattr(fsdp_param._sharded_local_tensor, "fsdp_pre_all_gather") + ) + + param_all_gather_inputs: list[list[torch.Tensor]] = [[] for _ in fsdp_params] + foreach_copy_indices: list[int] = [] + foreach_copy_inputs: list[torch.Tensor] = [] + foreach_copy_input_numels: list[int] = [] + + # 1st pass: for foreach-copy parameters, get inputs and metadata for the + # foreach copy, and for the others, actually get their all-gather inputs + for i, fsdp_param in enumerate(fsdp_params): + if use_foreach_copy(fsdp_param): + foreach_copy_indices.append(i) + all_gather_input = ( + fsdp_param._sharded_param_data + if fsdp_param.sharded_state == ShardedState.SHARDED + else cast(torch.Tensor, fsdp_param._sharded_post_forward_param_data) + ) + foreach_copy_inputs.append(all_gather_input) + foreach_copy_input_numels.append(all_gather_input.numel()) + else: + param_all_gather_inputs[i] = fsdp_param.all_gather_inputs + + # 2nd pass: use foreach copy to compute the remaining all-gather inputs + if foreach_copy_inputs: + fsdp_param_0 = fsdp_params[foreach_copy_indices[0]] + param_dtype, device = fsdp_param_0.param_dtype, fsdp_param_0.device + flat_foreach_copy_input = torch.empty( + (sum(foreach_copy_input_numels),), device=device, dtype=param_dtype + ) + splits = torch.split(flat_foreach_copy_input, foreach_copy_input_numels) + # patch in npu: set non_blocking=True + if splits[0].device == foreach_copy_inputs[0].device: + torch._foreach_copy_(splits, foreach_copy_inputs, non_blocking=True) + else: + torch._foreach_copy_(splits, foreach_copy_inputs) + for i, split in zip(foreach_copy_indices, splits): + param_all_gather_inputs[i] = [split] + + return param_all_gather_inputs + + +def _apply_fsdp_patch(): + FSDPParamGroup.finalize_backward = _patched_finalize_backward + torch.distributed.fsdp._fully_shard._fsdp_collectives._get_param_all_gather_inputs = _get_param_all_gather_inputs diff --git a/torch_npu/distributed/fsdp/_fsdp_collectives.py b/torch_npu/distributed/fsdp/_fsdp_collectives.py index a1c203ffe0..9ff962b611 100644 --- a/torch_npu/distributed/fsdp/_fsdp_collectives.py +++ b/torch_npu/distributed/fsdp/_fsdp_collectives.py @@ -36,5 +36,20 @@ def all_gather_copy_in_npu( ) foreach_copy_dsts = torch.split(all_gather_input, inp_split_sizes) with torch.no_grad(): - torch._foreach_copy_(foreach_copy_dsts, all_gather_inputs) + if foreach_copy_dsts[0].device == all_gather_inputs[0].device: + torch._foreach_copy_(foreach_copy_dsts, all_gather_inputs, non_blocking=True) + else: + torch._foreach_copy_(foreach_copy_dsts, all_gather_inputs) return all_gather_input, all_gather_output + + +@torch.library.impl(lib, "split_with_sizes_copy", 
"PrivateUse1") +def split_with_sizes_copy( + all_gather_output: torch.Tensor, + all_gather_input_split_sizes: List[int], + dim: int, + out: List[torch.Tensor], +) -> None: + torch.split_with_sizes_copy( + all_gather_output, all_gather_input_split_sizes, dim=dim, out=out + ) -- Gitee From 76401a57aa2042d4d6e5a68098b698b93742f4c4 Mon Sep 17 00:00:00 2001 From: zhangqiongwen Date: Mon, 14 Jul 2025 02:52:02 +0000 Subject: [PATCH 243/328] !22768 add npu_rmsnorm sharding strategy Merge pull request !22768 from zhangqiongwen/v2.7.1_add_npu_rmsnorm_sharding_strategy --- test/distributed/tensor/test_math_ops.py | 66 +++++++++ torch_npu/distributed/tensor/__init__.py | 1 + torch_npu/distributed/tensor/_math_ops.py | 160 ++++++++++++++++++++++ 3 files changed, 227 insertions(+) create mode 100644 test/distributed/tensor/test_math_ops.py create mode 100644 torch_npu/distributed/tensor/_math_ops.py diff --git a/test/distributed/tensor/test_math_ops.py b/test/distributed/tensor/test_math_ops.py new file mode 100644 index 0000000000..97f1f7ff01 --- /dev/null +++ b/test/distributed/tensor/test_math_ops.py @@ -0,0 +1,66 @@ +import torch +from torch.distributed._tensor import distribute_tensor, Replicate, Shard +from torch.testing._internal.distributed._tensor.common_dtensor import DTensorTestBase + +import torch_npu +from torch_npu.testing.testcase import run_tests +from torch_npu.testing.common_distributed import with_comms, skipIfUnsupportMultiNPU + + +class TestMathOps(DTensorTestBase): + @skipIfUnsupportMultiNPU(4) + @with_comms + def test_npu_rms_norm_forward(self): + device_mesh = self.build_device_mesh() + + x = torch.randn((1, 128, 64), dtype=torch.float32).npu() + gamma = torch.randn(64, dtype=torch.float32).npu() + + y, rstd = torch_npu.npu_rms_norm(x, gamma) + + dist_x = distribute_tensor(x, device_mesh, [Shard(1)]) + dist_gamma = distribute_tensor(gamma, device_mesh, [Replicate()]) + + dist_y, dist_rstd = torch_npu.npu_rms_norm(dist_x, dist_gamma) + + self.assertEqual(dist_y.full_tensor(), y) + self.assertEqual(dist_gamma.full_tensor(), gamma) + + @skipIfUnsupportMultiNPU(4) + @with_comms + def test_npu_rms_norm_backward(self): + device_mesh = self.build_device_mesh() + + x = torch.randn((1, 128, 64), dtype=torch.float32).npu() + gamma = torch.randn(64, dtype=torch.float32).npu() + grad_y = torch.randn((1, 128, 64), dtype=torch.float32).npu() + + x = x.npu() + gamma = gamma.npu() + grad_y = grad_y.npu() + x.requires_grad = True + gamma.requires_grad = True + + y, rstd = torch_npu.npu_rms_norm(x, gamma, epsilon=1e-06) + y.backward(grad_y) + dx = x.grad + dw = gamma.grad + + dist_x = distribute_tensor(x, device_mesh, [Shard(2)]) + dist_gamma = distribute_tensor(gamma, device_mesh, [Replicate()]) + + dist_y, dist_rsts = torch_npu.npu_rms_norm(dist_x, dist_gamma, epsilon=1e-06) + dist_grad_y = distribute_tensor(grad_y, device_mesh, dist_y.placements) + dist_y.backward(dist_grad_y) + dist_dx = dist_x.grad + dist_dw = dist_gamma.grad + + self.assertEqual(dist_y.full_tensor(), y) + self.assertEqual(dist_gamma.full_tensor(), gamma) + + self.assertEqual(dist_dx.full_tensor(), dx) + self.assertEqual(dist_dw.full_tensor(), dw) + + +if __name__ == "__main__": + run_tests() diff --git a/torch_npu/distributed/tensor/__init__.py b/torch_npu/distributed/tensor/__init__.py index 3b1aecbd8e..dea9541363 100644 --- a/torch_npu/distributed/tensor/__init__.py +++ b/torch_npu/distributed/tensor/__init__.py @@ -1,2 +1,3 @@ import torch_npu.distributed.tensor._matrix_ops import torch_npu.distributed.tensor._attention 
+import torch_npu.distributed.tensor._math_ops diff --git a/torch_npu/distributed/tensor/_math_ops.py b/torch_npu/distributed/tensor/_math_ops.py new file mode 100644 index 0000000000..1aed8ef5aa --- /dev/null +++ b/torch_npu/distributed/tensor/_math_ops.py @@ -0,0 +1,160 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates + +from typing import Optional + +import torch +from torch.distributed.tensor._dtensor_spec import DTensorSpec +from torch.distributed.tensor._op_schema import ( + OpSchema, + OpStrategy, + PlacementStrategy, +) +from torch.distributed.tensor._ops.utils import ( + generate_redistribute_costs, + register_op_strategy, +) +from torch.distributed.tensor._ops._math_ops import ( + _replicate_dims_start_at, + _infer_reduce_dims_map, + map_placements_after_reduction) +from torch.distributed.tensor._utils import normalize_to_torch_size + +npu = torch.ops.npu + + +@register_op_strategy(npu.npu_rms_norm.default) +def npu_rms_norm_strategy(op_schema: OpSchema) -> OpStrategy: + mesh = op_schema.get_mesh_from_args(validate=False) + expected_args_len = 2 + ( + input_strategy, + gamma_strategy, + ) = op_schema.args_schema[:expected_args_len] + + normalized_shape = gamma_strategy.shape + normalized_size = normalize_to_torch_size(normalized_shape) + + input_ndim = input_strategy.ndim + axis = input_ndim - len(normalized_size) + + output_strategy = OpStrategy([]) + for idx, input_placement_strategy in enumerate(input_strategy.strategies): + op_args_target_specs = [] + redistribute_costs = [] + input_src_spec = input_placement_strategy.output_spec + + input_target_spec = DTensorSpec( + mesh=mesh, + placements=_replicate_dims_start_at(input_src_spec.placements, axis), + tensor_meta=input_src_spec.tensor_meta, + ) + op_args_target_specs.append(input_target_spec) + redistribute_costs.append( + generate_redistribute_costs(input_strategy, input_target_spec) + ) + + if gamma_strategy is not None: + gamma_src_spec = gamma_strategy.strategies[idx].output_spec + + gamma_target_spec = DTensorSpec( + mesh=mesh, + placements=_replicate_dims_start_at(gamma_src_spec.placements), + tensor_meta=gamma_src_spec.tensor_meta, + ) + op_args_target_specs.append(gamma_target_spec) + redistribute_costs.append( + generate_redistribute_costs(gamma_strategy, gamma_target_spec) + ) + + # the output spec is the same as input spec + output_target_spec = input_target_spec + output_strategy.strategies.append( + PlacementStrategy( + output_specs=output_target_spec, + input_specs=op_args_target_specs, + redistribute_cost=redistribute_costs, + ) + ) + + return output_strategy + + +@register_op_strategy(npu.npu_rms_norm_backward.default) +def npu_rms_norm_backward_strategy(op_schema: OpSchema) -> OpStrategy: + mesh = op_schema.get_mesh_from_args(validate=False) + ( + grad_out_strategy, + input_strategy, + gamma_strategy, + rstd_strategy, + ) = op_schema.args_schema + + normalized_shape = gamma_strategy.shape + normalized_size = normalize_to_torch_size(normalized_shape) + input_ndim = input_strategy.ndim + axis = input_ndim - len(normalized_size) + outer_dims = list(range(axis)) + + out_tuple_strategy = OpStrategy([]) + for idx, input_placement_strategy in enumerate(input_strategy.strategies): + output_specs_list: list[Optional[DTensorSpec]] = [] + input_specs_list: list[DTensorSpec] = [] + redistribute_costs = [] + + input_src_spec = input_placement_strategy.output_spec + grad_out_target_spec = DTensorSpec( + mesh=mesh, + placements=_replicate_dims_start_at(input_src_spec.placements, axis), + 
tensor_meta=input_src_spec.tensor_meta, + ) + input_specs_list.append(grad_out_target_spec) + redistribute_costs.append( + generate_redistribute_costs(grad_out_strategy, grad_out_target_spec) + ) + output_specs_list.append(grad_out_target_spec) + + input_target_spec = DTensorSpec( + mesh=mesh, + placements=_replicate_dims_start_at(input_src_spec.placements, axis), + tensor_meta=input_src_spec.tensor_meta, + ) + input_specs_list.append(input_target_spec) + redistribute_costs.append( + generate_redistribute_costs(input_strategy, input_target_spec) + ) + + if gamma_strategy is not None: + gamma_src_spec = gamma_strategy.strategies[idx].output_spec + input_specs_list.append(gamma_src_spec) + redistribute_costs.append([0.0 for _ in gamma_strategy.strategies]) + # we may need to change to a pointwise rule over grad_out and + # input, then apply a reduction. + inp_placements = _replicate_dims_start_at(input_src_spec.placements, axis) + reduce_dims_map = _infer_reduce_dims_map( + outer_dims, input_src_spec.ndim, False + ) + out_placements = map_placements_after_reduction( + inp_placements, outer_dims, reduce_dims_map, "sum" + ) + gamma_out_spec = DTensorSpec( + mesh=mesh, + placements=out_placements, + tensor_meta=gamma_src_spec.tensor_meta, + ) + output_specs_list.append(gamma_out_spec) + else: + output_specs_list.append(None) + + rstd_src_spec = rstd_strategy.strategies[idx].output_spec + input_specs_list.append(rstd_src_spec) + redistribute_costs.append([0.0 for _ in rstd_strategy.strategies]) + + out_tuple_strategy.strategies.append( + PlacementStrategy( + output_specs=tuple(output_specs_list), + input_specs=input_specs_list, + redistribute_cost=redistribute_costs, + ) + ) + + return out_tuple_strategy -- Gitee From b4484bd5757deeaac8fade23a952e529ee39feca Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 14 Jul 2025 03:09:26 +0000 Subject: [PATCH 244/328] !22984 Update op_plugin commit id Merge pull request !22984 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 161f835137..28f73786b0 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 161f835137eaa0ca36e62202c141dfbde80babfe +Subproject commit 28f73786b0c773498bbc36eb5bb2a14388bf6831 -- Gitee From e6d572b070df1fc72cca4c8ffd6146d45475c98c Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 14 Jul 2025 04:54:25 +0000 Subject: [PATCH 245/328] !22993 Update op_plugin commit id Merge pull request !22993 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 28f73786b0..99499aaec6 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 28f73786b0c773498bbc36eb5bb2a14388bf6831 +Subproject commit 99499aaec6aff8ed49c2fd3b964a7f4d0cf4d244 -- Gitee From bbef45273aa2e7ee1ee12d39262d53b4d72d083b Mon Sep 17 00:00:00 2001 From: louyujing <7927276+louyujing@user.noreply.gitee.com> Date: Mon, 14 Jul 2025 07:46:59 +0000 Subject: [PATCH 246/328] !22884 Fix the bug to adapt the torch Generator Merge pull request !22884 from louyujing/v2.7.1_20250710_160035 --- test/contrib/test_transfer_to_npu.py | 21 +++++++++++++++++++++ torch_npu/contrib/transfer_to_npu.py | 11 ++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/test/contrib/test_transfer_to_npu.py b/test/contrib/test_transfer_to_npu.py index af1187d47c..8c83a6e0d7 100644 --- 
a/test/contrib/test_transfer_to_npu.py +++ b/test/contrib/test_transfer_to_npu.py @@ -12,6 +12,27 @@ from torch_npu.contrib import transfer_to_npu class TestTransferToNpu(TestCase): + def test_generator(self): + g0 = torch.Generator() + self.assertTrue(isinstance(g0, torch.Generator)) + self.assertEqual(g0.device.type, 'cpu') + + g1 = torch.Generator('cuda') + self.assertTrue(isinstance(g1, torch.Generator)) + self.assertEqual(g1.device.type, 'npu') + + g2 = torch.Generator(torch.device('cuda')) + self.assertTrue(isinstance(g2, torch.Generator)) + self.assertEqual(g2.device.type, 'npu') + + g3 = torch.Generator(device='cuda') + self.assertTrue(isinstance(g3, torch.Generator)) + self.assertEqual(g3.device.type, 'npu') + + g4 = torch.Generator(device=torch.device('cuda')) + self.assertTrue(isinstance(g4, torch.Generator)) + self.assertEqual(g4.device.type, 'npu') + def test_wrap_isinstance(self): # check builtins isinstance grammar self.assertTrue(isinstance(1, int)) diff --git a/torch_npu/contrib/transfer_to_npu.py b/torch_npu/contrib/transfer_to_npu.py index 8bb712eacc..996c777538 100644 --- a/torch_npu/contrib/transfer_to_npu.py +++ b/torch_npu/contrib/transfer_to_npu.py @@ -28,7 +28,7 @@ torch_fn_white_list = ['logspace', 'randint', 'hann_window', 'rand', 'full_like' 'eye', '_sparse_csr_tensor_unsafe', 'empty', '_sparse_coo_tensor_unsafe', 'blackman_window', 'zeros_like', 'range', 'sparse_csr_tensor', 'randn_like', 'from_file', '_cudnn_init_dropout_state', '_empty_affine_quantized', 'linspace', 'hamming_window', - 'empty_quantized', '_pin_memory', 'autocast', 'load', "Generator", 'set_default_device'] + 'empty_quantized', '_pin_memory', 'autocast', 'load', 'set_default_device'] torch_tensor_fn_white_list = ['new_empty', 'new_empty_strided', 'new_full', 'new_ones', 'new_tensor', 'new_zeros', 'to', 'pin_memory'] torch_module_fn_white_list = ['to', 'to_empty'] @@ -45,6 +45,14 @@ cur_path = os.path.dirname(os.path.realpath(__file__)) config_path = os.path.join(cur_path, 'apis_config.json') +class _GeneratorProxy(torch.Generator): + + def __new__(cls, device='cpu'): + device = _replace_cuda_to_npu_in_list([device], None)[0] + instance = super().__new__(cls, device) + return instance + + def _get_function_from_string(attribute_string): try: module_path, _, attr_name = attribute_string.rpartition('.') @@ -332,6 +340,7 @@ def _init(): # torch.* _device_wrapper(torch, torch_fn_white_list) torch.UntypedStorage.__new__ = _wrapper_cuda(torch.UntypedStorage.__new__) + torch.Generator = _GeneratorProxy # torch.Tensor.* _device_wrapper(torch.Tensor, torch_tensor_fn_white_list) -- Gitee From c58d0d5e6dec542ce1e0205fe45c1683a5bdb314 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 14 Jul 2025 08:54:26 +0000 Subject: [PATCH 247/328] !22995 Update op_plugin commit id Merge pull request !22995 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 99499aaec6..6d68946c55 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 99499aaec6aff8ed49c2fd3b964a7f4d0cf4d244 +Subproject commit 6d68946c555ee8fbdc14a0566f9fe223ea33ac29 -- Gitee From 97a784e59fd4616ac0aa18bd2ef39aa33d083f53 Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Mon, 14 Jul 2025 09:01:00 +0000 Subject: [PATCH 248/328] !22933 p2presume Merge pull request !22933 from SCh-zx/p2p27 --- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 13 ++++++++++++- 1 file changed, 12 
insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 1d14cf06ef..24720d902d 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -2749,7 +2749,7 @@ void ProcessGroupHCCL::resumeHcclComm(int device_id) { at::Device device = at::Device(c10::DeviceType::PrivateUse1, device_id); std::vector devices = {device}; - const auto key = getKeyFromDevices(devices); + auto key = getKeyFromDevices(devices); { std::lock_guard lock(mutex_); @@ -2761,6 +2761,17 @@ void ProcessGroupHCCL::resumeHcclComm(int device_id) HCCL_CHECK_ERROR(at_npu::hccl::HcclCommResumeFace(comm)); } } + if (hcclCommInitRootInfoConfigExist() && c10_npu::option::OptionsManager::GetP2PBufferSize() != 0) { + key = getKeySendRecv(rank_, getP2pPeer()); + if (devHCCLCommMap_.find(key) != devHCCLCommMap_.end()) { + // Reuse the cached communicator if there is one. + auto& hcclComms = devHCCLCommMap_[key]; + for (const auto& hcclComm : hcclComms) { + auto comm = hcclComm->getHcclComm(); + HCCL_CHECK_ERROR(at_npu::hccl::HcclCommResumeFace(comm)); + } + } + } } ASCEND_LOGI("resumeHcclComm success, group id is %s.", options_->group_id.c_str()); } -- Gitee From 950242f93cae941b7a5a7099d6912a9058aca954 Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Mon, 14 Jul 2025 09:28:12 +0000 Subject: [PATCH 249/328] !22742 compact error output Merge pull request !22742 from SCh-zx/err27 --- torch_npu/csrc/core/npu/NPUException.cpp | 9 +++-- torch_npu/csrc/core/npu/NPUException.h | 4 +-- .../csrc/core/npu/register/OptionsManager.cpp | 2 +- .../csrc/core/npu/register/OptionsManager.h | 2 +- torch_npu/csrc/distributed/HCCLUtils.hpp | 2 +- .../csrc/distributed/ProcessGroupHCCL.cpp | 8 ++++- torch_npu/csrc/framework/utils/CalcuOpUtil.h | 33 ++++++++++--------- 7 files changed, 35 insertions(+), 25 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index fe4d2ec4c4..9620895c7c 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -47,11 +47,14 @@ void warn_(const ::c10::Warning& warning) std::string formatErrorCode(SubModule submodule, ErrCode errorCode) { + if (c10_npu::option::OptionsManager::IsCompactErrorOutput()) { + return " "; + } std::ostringstream oss; int deviceIndex = -1; c10_npu::GetDevice(&deviceIndex); auto rank_id = c10_npu::option::OptionsManager::GetRankId(); - if (!(c10_npu::option::OptionsManager::ShouldPrintLessError())) { + if (!(c10_npu::option::OptionsManager::IsCompactErrorOutput())) { oss << "\n[ERROR] " << getCurrentTimestamp() << " (PID:" << getpid() << ", Device:" << deviceIndex << ", RankID:" << rank_id << ") "; } oss << "ERR" << std::setw(2) << std::setfill('0') << static_cast(submodule); @@ -149,10 +152,10 @@ const std::string c10_npu_check_error_message(std::string& errmsg) const char *c10_npu_get_error_message() { auto errmsg = c10_npu::acl::AclGetErrMsg(); - if (c10_npu::option::OptionsManager::ShouldPrintLessError()) { + if (c10_npu::option::OptionsManager::IsCompactErrorOutput()) { std::string log(errmsg); std::string errmsg_ = c10_npu::c10_npu_check_error_message(log); - thread_local std::string processedErrMsg = errmsg_; + thread_local std::string processedErrMsg = "CANN error: " + errmsg_; c10_npu::setRepoErrMsg(processedErrMsg.c_str()); return processedErrMsg.c_str(); } else { diff --git a/torch_npu/csrc/core/npu/NPUException.h 
b/torch_npu/csrc/core/npu/NPUException.h index 203b6529b7..1d34ae2050 100644 --- a/torch_npu/csrc/core/npu/NPUException.h +++ b/torch_npu/csrc/core/npu/NPUException.h @@ -151,7 +151,7 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) " that driver and firmware packages do not match."); \ return true; \ }(); \ - } else if (c10_npu::option::OptionsManager::ShouldPrintLessError()) { \ + } else if (c10_npu::option::OptionsManager::IsCompactErrorOutput()) { \ std::ostringstream oss; \ oss << " NPU function error: " \ << (device_error_msg.empty() ? getErrorFunction(#err_code, ##__VA_ARGS__) : device_error_msg) \ @@ -207,7 +207,7 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) static c10_npu::acl::AclErrorCode err_map; \ if ((Error) != ACL_ERROR_NONE) { \ CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(Error); \ - if (c10_npu::option::OptionsManager::ShouldPrintLessError()) \ + if (c10_npu::option::OptionsManager::IsCompactErrorOutput()) \ { \ std::ostringstream oss; \ oss << " OPS function error: " << getErrorFunction(#err_code, ##__VA_ARGS__) \ diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index c41a42ff9f..2e0bbeadf3 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -622,7 +622,7 @@ bool OptionsManager::IsOomSnapshotEnable() return (envFlag != 0); } -bool OptionsManager::ShouldPrintLessError() +bool OptionsManager::IsCompactErrorOutput() { static bool should_print = []() -> bool { int32_t disabled_error = OptionsManager::GetBoolTypeOption("TORCH_NPU_COMPACT_ERROR_OUTPUT"); diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index 1a678c6ec4..feb33f6ca7 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -132,7 +132,7 @@ public: static std::string GetOomSnapshotDumpPath(); static bool IsOomSnapshotEnable(); static bool ShouldPrintWarning(); - static bool ShouldPrintLessError(); + static bool IsCompactErrorOutput(); private: static int GetBoolTypeOption(const char* env_str, int defaultVal = 0); diff --git a/torch_npu/csrc/distributed/HCCLUtils.hpp b/torch_npu/csrc/distributed/HCCLUtils.hpp index e9ad7bbd6a..1033d8de97 100644 --- a/torch_npu/csrc/distributed/HCCLUtils.hpp +++ b/torch_npu/csrc/distributed/HCCLUtils.hpp @@ -17,7 +17,7 @@ auto Error = err_code; \ if ((Error) != HCCL_SUCCESS) { \ CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(Error); \ - if (c10_npu::option::OptionsManager::ShouldPrintLessError()) { \ + if (c10_npu::option::OptionsManager::IsCompactErrorOutput()) { \ std::ostringstream oss; \ oss << " HCCL function error: " << getErrorFunction(#err_code, ##__VA_ARGS__) \ << ", error code is " << Error << " " \ diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 24720d902d..3a03ea4111 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -2421,7 +2421,13 @@ int64_t ProcessGroupHCCL::getStreamId(bool p2p, int peer) std::vector devices = {at::Device(c10::DeviceType::PrivateUse1, device)}; auto key = getKeyFromDevices(devices); if (p2p && hcclCommInitRootInfoConfigExist() && c10_npu::option::OptionsManager::GetP2PBufferSize() != 0) { - TORCH_CHECK(peer >= 0, "In p2p scenarios, the passed 'dst rank id' is error.", 
DIST_ERROR(ErrCode::PARAM)); + TORCH_CHECK( + peer >= 0, + "In p2p scenarios, the passed 'dst rank id' : ", + peer, + " is error, ", + "expected value >= 0.", + DIST_ERROR(ErrCode::PARAM)); key = getKeySendRecv(rank_, peer); } if ((hcclStreams_.count(key) == 0) || hcclStreams_[key].empty()) { diff --git a/torch_npu/csrc/framework/utils/CalcuOpUtil.h b/torch_npu/csrc/framework/utils/CalcuOpUtil.h index 0693b9c024..5ee41e7d64 100644 --- a/torch_npu/csrc/framework/utils/CalcuOpUtil.h +++ b/torch_npu/csrc/framework/utils/CalcuOpUtil.h @@ -36,22 +36,23 @@ using std::vector; #define ASCEND_ALWAYS_INLINE inline #endif -#define ACL_REQUIRE_OK_OP(expr, opstr) \ - do { \ - if (ASCEND_UNLIKELY((expr) != 0)) { \ - std::cout << (opstr) << std::endl; \ - TORCH_CHECK((expr) == 0, \ - __func__, \ - ":", \ - __FILE__, \ - ":", \ - __LINE__, \ - " NPU error,NPU error code is:", \ - expr, \ - "\n", \ - c10_npu::acl::AclGetErrMsg(), \ - OPS_ERROR(ErrCode::INTERNAL)); \ - } \ +#define ACL_REQUIRE_OK_OP(expr, opstr) \ + do { \ + if (ASCEND_UNLIKELY((expr) != 0)) { \ + std::cout << (opstr) << std::endl; \ + if (c10_npu::option::OptionsManager::IsCompactErrorOutput()) { \ + std::ostringstream oss; \ + oss << " NPU error,NPU error code is:" << (expr) << "\n" \ + << OPS_ERROR(ErrCode::INTERNAL); \ + std::string err_msg=oss.str(); \ + ASCEND_LOGE("%s", err_msg.c_str()); \ + TORCH_CHECK((expr) == 0, c10_npu::c10_npu_get_error_message()); \ + } else { \ + TORCH_CHECK((expr) == 0, __func__, ":", __FILE__, ":", __LINE__, \ + " NPU error,NPU error code is:", expr, "\n", \ + c10_npu::acl::AclGetErrMsg(), OPS_ERROR(ErrCode::INTERNAL)); \ + } \ + } \ } while (0) using StorageAndOffsetMemSizePair = std::pair; -- Gitee From 801996010df600c470e472dd0fba2d61f11b0191 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 14 Jul 2025 11:09:25 +0000 Subject: [PATCH 250/328] !23003 Update op_plugin commit id Merge pull request !23003 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 6d68946c55..5baa1ab0c8 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 6d68946c555ee8fbdc14a0566f9fe223ea33ac29 +Subproject commit 5baa1ab0c8d1d7f1b1b8170cad5e5d52de06aef0 -- Gitee From ee81a9cd49d94af85855c909199728d5f8bad916 Mon Sep 17 00:00:00 2001 From: hhz886 Date: Mon, 14 Jul 2025 13:12:05 +0000 Subject: [PATCH 251/328] =?UTF-8?q?!22895=20=E3=80=90Profiler=E3=80=91log?= =?UTF-8?q?=20resource=20fix=20Merge=20pull=20request=20!22895=20from=20hh?= =?UTF-8?q?z886/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../analysis/prof_common_func/_log.py | 19 ++++++------------- .../prof_view/_communication_parser.py | 4 ++-- .../analysis/prof_view/_integrate_parser.py | 4 ++-- .../analysis/prof_view/_kernel_view_parser.py | 4 ++-- .../analysis/prof_view/_memory_view_parser.py | 4 ++-- .../prof_view/_operator_view_parser.py | 4 ++-- .../analysis/prof_view/_stack_view_parser.py | 4 ++-- .../prof_view/_trace_step_time_parser.py | 4 ++-- .../analysis/prof_view/_trace_view_parser.py | 4 ++-- .../prof_view/cann_parse/_cann_analyze.py | 4 ++-- .../prof_view/cann_parse/_cann_export.py | 4 ++-- .../prepare_parse/_fwk_pre_parser.py | 4 ++-- .../prepare_parse/_relation_parser.py | 4 ++-- 13 files changed, 30 insertions(+), 37 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_common_func/_log.py b/torch_npu/profiler/analysis/prof_common_func/_log.py 
index eba5db1af7..0fecde48c4 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_log.py +++ b/torch_npu/profiler/analysis/prof_common_func/_log.py @@ -57,14 +57,15 @@ class ProfilerLogger: if cls._instance is not None: if cls._pid == os.getpid(): return - cls.destroy() # Create logs directory log_dir = os.path.join(output_dir, cls.DEFAULT_LOG_DIR) PathManager.make_dir_safety(log_dir) # Create logger - logger = logging.getLogger(cls.DEFAULT_LOGGER_NAME) + logger = logging.getLogger( + f"{cls.DEFAULT_LOGGER_NAME}_{custom_name}" if custom_name else cls.DEFAULT_LOGGER_NAME + ) logger.setLevel(cls.DEFAULT_LOG_LEVEL) logger.propagate = False @@ -112,19 +113,11 @@ class ProfilerLogger: def destroy(cls) -> None: """ Close and cleanup the logger. - To avoid the deadlock problem caused by directly calling close on handler in multi-process scenarios, close the - file descriptor manually. + To avoid the deadlock problem caused by directly calling close on handler in multi-process scenarios, + when child process updates instance, the parent process instance obtained by fork does not call this method. """ if cls._instance: for handler in cls._instance.handlers[:]: cls._instance.removeHandler(handler) - if cls._pid == os.getpid(): - handler.close() - else: - try: - if hasattr(handler.stream, 'fileno'): - fileno = handler.stream.fileno() - os.close(fileno) - except (OSError, AttributeError, ValueError): - logging.warning("Close profiler logger handler stream failed.") + handler.close() cls._instance = None diff --git a/torch_npu/profiler/analysis/prof_view/_communication_parser.py b/torch_npu/profiler/analysis/prof_view/_communication_parser.py index fff6d265d6..e07f68b785 100644 --- a/torch_npu/profiler/analysis/prof_view/_communication_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_communication_parser.py @@ -46,8 +46,6 @@ class CommunicationParser(BaseParser): self._root_node = TorchOpNode() self._kernel_dict = {} self.step_list = [] - ProfilerLogger.init(self._profiler_path, "CommunicationParser") - self.logger = ProfilerLogger.get_instance() @staticmethod def combine_size_distribution(op_dict: dict, total_dict: dict): @@ -63,6 +61,8 @@ class CommunicationParser(BaseParser): return round(dividend / divisor, 4) def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "CommunicationParser") + self.logger = ProfilerLogger.get_instance() try: self._init_step_list(deps_data) self.generate_view() diff --git a/torch_npu/profiler/analysis/prof_view/_integrate_parser.py b/torch_npu/profiler/analysis/prof_view/_integrate_parser.py index b6c545420c..28472a2411 100644 --- a/torch_npu/profiler/analysis/prof_view/_integrate_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_integrate_parser.py @@ -26,10 +26,10 @@ class IntegrateParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) - ProfilerLogger.init(self._profiler_path, "IntegrateParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "IntegrateParser") + self.logger = ProfilerLogger.get_instance() try: ProfilerConfig().load_info(self._profiler_path) self.generate_view() diff --git a/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py b/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py index 30ffd8be8b..ded9a612c6 100644 --- a/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py @@ -17,8 +17,6 @@ class 
KernelViewParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) self.step_range = [] - ProfilerLogger.init(self._profiler_path, "KernelViewParser") - self.logger = ProfilerLogger.get_instance() @classmethod def _project_map_for_headers(cls, input_headers: list): @@ -35,6 +33,8 @@ class KernelViewParser(BaseParser): return output_headers def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "KernelViewParser") + self.logger = ProfilerLogger.get_instance() try: ProfilerConfig().load_info(self._profiler_path) self._init_step_range(deps_data) diff --git a/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py b/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py index a82c3dc3c8..47255efd09 100644 --- a/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py @@ -34,8 +34,6 @@ class MemoryViewParser(BaseParser): self.ge_record_list = [] self.memory_data = [] self.component_list = [] - ProfilerLogger.init(self._profiler_path, "MemoryViewParser") - self.logger = ProfilerLogger.get_instance() @staticmethod def _get_data_from_file(file_set: set, file_type_bean: any, bean_list: bool = False) -> list: @@ -73,6 +71,8 @@ class MemoryViewParser(BaseParser): return [cur_record_list, pta_ge_record_list] def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "MemoryViewParser") + self.logger = ProfilerLogger.get_instance() try: self.memory_data = deps_data.get(Constant.MEMORY_PREPARE, {}).get("memory_data", {}).get(Constant.Text, []) self.pta_record_list = deps_data.get(Constant.MEMORY_PREPARE, {}).get("pta_record_list", []) diff --git a/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py b/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py index f87e8dc8b8..7c10e9d4bf 100644 --- a/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py @@ -22,10 +22,10 @@ class OperatorViewParser(BaseParser): self._torch_op_node = [] self._root_node = None self._kernel_dict = {} - ProfilerLogger.init(self._profiler_path, "OperatorViewParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "OperatorViewParser") + self.logger = ProfilerLogger.get_instance() try: self._torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) self._kernel_dict = deps_data.get(Constant.RELATION_PARSER, {}) diff --git a/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py b/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py index 2f793a8af8..b4a85271d9 100644 --- a/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py @@ -23,10 +23,10 @@ class StackViewParser(BaseParser): self._root_node = None self._kernel_dict = {} self._metric = param_dict.get("metric") - ProfilerLogger.init(self._profiler_path, "StackViewParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "StackViewParser") + self.logger = ProfilerLogger.get_instance() try: self._torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) self.generate_view() diff --git a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py index 744e2cd8a6..46093bec4e 100644 --- 
a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py @@ -51,8 +51,6 @@ class TraceStepTimeParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) self.step_range = [] - ProfilerLogger.init(self._profiler_path, "TraceStepTimeParser") - self.logger = ProfilerLogger.get_instance() @classmethod def is_float_num(cls, num): @@ -165,6 +163,8 @@ class TraceStepTimeParser(BaseParser): FileManager.create_csv_file(output_path, print_time, file_name, self.title) def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "TraceStepTimeParser") + self.logger = ProfilerLogger.get_instance() try: self._init_step_range(deps_data) self.generate_view() diff --git a/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py b/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py index f90100e869..c5e572e1bc 100644 --- a/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py @@ -27,8 +27,6 @@ class TraceViewParser(BaseParser): self._trace_data = [] self._torch_op_node = [] self._root_node = None - ProfilerLogger.init(self._profiler_path, "TraceViewParser") - self.logger = ProfilerLogger.get_instance() @staticmethod def _prune_trace_by_level(json_data: list) -> list: @@ -47,6 +45,8 @@ class TraceViewParser(BaseParser): return result def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "TraceViewParser") + self.logger = ProfilerLogger.get_instance() try: ProfilerConfig().load_info(self._profiler_path) torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) diff --git a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py index 8ef2072be6..da8037f982 100644 --- a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py +++ b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py @@ -34,10 +34,10 @@ class CANNAnalyzeParser(BaseParser): super().__init__(name, param_dict) self._cann_path = ProfilerPathManager.get_cann_path(self._profiler_path) self.msprof_path = shutil.which("msprof") - ProfilerLogger.init(self._profiler_path, "CANNAnalyzeParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "CANNAnalyzeParser") + self.logger = ProfilerLogger.get_instance() try: ProfilerConfig().load_info(self._profiler_path) if not os.path.isdir(self._cann_path): diff --git a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py index 6a703d0b95..7228525fae 100644 --- a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py +++ b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py @@ -41,10 +41,10 @@ class CANNExportParser(BaseParser): super().__init__(name, param_dict) self._cann_path = ProfilerPathManager.get_cann_path(self._profiler_path) self.msprof_path = shutil.which("msprof") - ProfilerLogger.init(self._profiler_path, "CANNExportParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "CANNExportParser") + self.logger = ProfilerLogger.get_instance() try: ProfilerConfig().load_info(self._profiler_path) if not os.path.isdir(self._cann_path): diff --git a/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py 
b/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py index 6cc6f23516..939e06cf74 100644 --- a/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py @@ -28,10 +28,10 @@ class TracePreParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) - ProfilerLogger.init(self._profiler_path, "TracePreParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "TracePreParser") + self.logger = ProfilerLogger.get_instance() try: fwk_trace_data = FwkFileParser(self._profiler_path).get_fwk_trace_data() trace_file_path = os.path.join(self._output_path, Constant.TRACE_VIEW_TEMP) if os.path.isdir( diff --git a/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py b/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py index e6eb02ddb8..5e8a941de2 100644 --- a/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py @@ -23,10 +23,10 @@ __all__ = [] class RelationParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) - ProfilerLogger.init(self._profiler_path, "RelationParser") - self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): + ProfilerLogger.init(self._profiler_path, "RelationParser") + self.logger = ProfilerLogger.get_instance() try: kernel_dict = FwkCANNRelationParser(self._profiler_path).get_kernel_dict() except Exception as e: -- Gitee From 1a323d7a6fa71d8b032579b1555b942b2286c076 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Mon, 14 Jul 2025 13:15:36 +0000 Subject: [PATCH 252/328] =?UTF-8?q?!23017=20fix=20bind=20conf=20Merge=20pu?= =?UTF-8?q?ll=20request=20!23017=20from=20=E5=A7=9C=E6=80=A1=E6=96=87/v2.7?= =?UTF-8?q?.1=5Ffix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUAffinityController.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index 5567c3e6e2..6c2d35fd95 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace c10_npu { @@ -16,6 +17,7 @@ static thread_local ThreadType local_thread = ThreadType::MAIN_THREAD; static pthread_t main_thread; static bool start_main_thread_bind = false; +static std::mutex core_map_mutex; using ThreadCoreMap = std::unordered_map; @@ -264,6 +266,7 @@ CoreIdRange getCoreRange(c10::DeviceIndex device_id, ThreadType type) if (cpu_affinity_mode == 0 || cpu_affinity_mode == 1) { core_range = device_ranges[device_id]; } else { + std::lock_guard lock(core_map_mutex); if (device_thread_core_maps.find(device_id) == device_thread_core_maps.end()) { device_thread_core_maps.emplace(device_id, getCpuAffinityMap(device_id, device_ranges)); } -- Gitee From ec127d96420376ff854a7054979336257c72233b Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 14 Jul 2025 14:24:26 +0000 Subject: [PATCH 253/328] !23029 Update op_plugin commit id Merge pull request !23029 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin 
index 5baa1ab0c8..4860c71d90 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5baa1ab0c8d1d7f1b1b8170cad5e5d52de06aef0 +Subproject commit 4860c71d9006204e40d133d7bed43f3e70175414 -- Gitee From 6fb2ff1c915cf2b5078b9691f44fc4110452c332 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 14 Jul 2025 16:09:26 +0000 Subject: [PATCH 254/328] !23043 Update op_plugin commit id Merge pull request !23043 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 4860c71d90..601c55ad20 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 4860c71d9006204e40d133d7bed43f3e70175414 +Subproject commit 601c55ad20d0b1a9db7ffc2c3ffc7b1b09fa9f8c -- Gitee From 9c520b0e61c6cfd6893ead3d8c0099475637fa09 Mon Sep 17 00:00:00 2001 From: zhangqiongwen Date: Tue, 15 Jul 2025 01:28:24 +0000 Subject: [PATCH 255/328] !22803 add fsdp test case Merge pull request !22803 from zhangqiongwen/v2.7.1_fsdp_test_case --- .../fsdp2/test_fully_shard_comm.py | 1127 +++++++++++++++++ 1 file changed, 1127 insertions(+) create mode 100644 test/distributed/fsdp2/test_fully_shard_comm.py diff --git a/test/distributed/fsdp2/test_fully_shard_comm.py b/test/distributed/fsdp2/test_fully_shard_comm.py new file mode 100644 index 0000000000..1e9f039c3e --- /dev/null +++ b/test/distributed/fsdp2/test_fully_shard_comm.py @@ -0,0 +1,1127 @@ +import copy +import functools +import itertools +import unittest +from typing import Callable, List, Optional, Tuple, Union + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +from torch.distributed._composable import checkpoint, replicate +from torch.distributed.device_mesh import DeviceMesh, init_device_mesh +from torch.distributed.fsdp import ( + FSDPModule, + fully_shard, + MixedPrecisionPolicy, + OffloadPolicy, +) +from torch.distributed.fsdp._fully_shard._fsdp_collectives import ( + _div_if_needed, + _get_gradient_divide_factors, + foreach_all_gather, + foreach_all_gather_copy_out, + foreach_reduce, +) +from torch.distributed.fsdp._fully_shard._fsdp_common import FSDPMeshInfo, TrainingState +from torch.distributed.fsdp._fully_shard._fsdp_init import ( + _get_post_forward_mesh_info, + _init_default_fully_shard_mesh, +) +from torch.distributed.fsdp._fully_shard._fsdp_param import ShardedState +from torch.distributed.fsdp._fully_shard._fsdp_param_group import FSDPParamGroup +from torch.distributed.tensor import DTensor +from torch.distributed.tensor.debug import CommDebugMode +from torch.distributed.tensor.experimental import implicit_replication +from torch.testing._internal.common_fsdp import ( + check_sharded_parity, + DoubleLinear, + FSDPTest, + FSDPTestMultiThread, + MLP, + patch_post_backward, + patch_reshard, + patch_unshard, +) +from torch.testing._internal.common_utils import run_tests +from torch.testing._internal.distributed._tensor.common_dtensor import ( + ModelArgs, + Transformer, + TransformerBlock, +) + +import torch_npu +from torch_npu.testing.common_utils import SupportedDevices +from torch_npu.testing._internal.common_fsdp import FSDPNPUTest + + +c10d_ops = torch.ops.c10d + +# For recording FSDP events like unshard or post-backward +EventType = Tuple[str, str, TrainingState] + + +class TestFullyShardCollectiveOps(FSDPTestMultiThread): + @property + def world_size(self) -> int: + return 128 + + def perThreadSetUp(self): + super().perThreadSetUp() 
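+        # Ranks are simulated by threads here, so they all share NPU device 0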
+ torch.npu.set_device(0) + + @property + def device(self) -> torch.device: + return torch.device("npu:0") + + def _get_param_sizes(self) -> List[torch.Size]: + # For world size 128, the fp32 all-gather and reduce-scatter testing + # requires ~0.22 GB + return [ + torch.Size([17, 257]), + torch.Size([17]), + torch.Size([64, 312]), + torch.Size([64]), + torch.Size([64, 64]), + torch.Size([512, 64]), + torch.Size([256]), + torch.Size([64, 297]), + ] + + def _init_params(self, param_sizes: List[torch.Size]) -> List[nn.Parameter]: + torch.manual_seed(42) + orig_params = [nn.Parameter(torch.randn(size, device=self.device)) for size in param_sizes] + # Since seed is per process, not per thread, we broadcast to ensure the + # same original parameters across ranks + for orig_param in orig_params: + dist.broadcast(orig_param, src=0) + return orig_params + + def _init_fsdp_param_group( + self, params: List[nn.Parameter], reshard_after_forward: Union[bool, int] + ): + module = nn.ParameterList([param.detach().clone() for param in params]) + mesh_info = FSDPMeshInfo(_init_default_fully_shard_mesh(), shard_mesh_dim=0) + post_forward_mesh_info = _get_post_forward_mesh_info( + reshard_after_forward, mesh_info + ) + fsdp_param_group = FSDPParamGroup( + list(module.parameters()), + (module,), + mesh_info, + post_forward_mesh_info, + self.device, + None, # shard_placement_fn + MixedPrecisionPolicy(), + OffloadPolicy(), + ) + fsdp_param_group.lazy_init() + return fsdp_param_group + + @SupportedDevices(['Ascend910B']) + def test_all_gather_fp32(self): + param_sizes = self._get_param_sizes() + default_stream = torch.npu.current_stream() + stream1, stream2 = torch.npu.Stream(), torch.npu.Stream() + for async_op, streams, reshard_after_forward in itertools.product( + (False, True), + ((default_stream, default_stream), (stream1, stream2)), + (True, 8), + ): + all_gather_copy_in_stream, all_gather_stream = streams + # Save test time by only testing reshard after forward as an int + # for non-async and non-default streams (like in pre-backward) + if type(reshard_after_forward) is int and ( + async_op or all_gather_stream is default_stream + ): + continue + self._test_all_gather( + param_sizes, + reshard_after_forward=reshard_after_forward, + async_op=async_op, + all_gather_copy_in_stream=all_gather_copy_in_stream, + all_gather_stream=all_gather_stream, + ) + + def _test_all_gather( + self, + param_sizes: List[torch.Size], + reshard_after_forward: Union[bool, int], + async_op: bool, + all_gather_copy_in_stream: torch.npu.Stream, + all_gather_stream: torch.npu.Stream, + ): + def all_gather(fsdp_param_group: FSDPParamGroup, group: dist.ProcessGroup): + all_gather_result = foreach_all_gather( + fsdp_param_group.fsdp_params, + group, + async_op=async_op, + all_gather_copy_in_stream=all_gather_copy_in_stream, + all_gather_stream=all_gather_stream, + device=self.device, + ) + foreach_all_gather_copy_out(all_gather_result, fsdp_params, group) + # Transition to unsharded state to register unsharded parameters + for fsdp_param in fsdp_param_group.fsdp_params: + fsdp_param.init_unsharded_param() + fsdp_param_group._to_unsharded() + + def check_all_gathered_params( + orig_params: List[nn.Parameter], module: nn.Module + ): + for orig_param, param in zip(orig_params, module.parameters()): + self.assertIsInstance(param, torch.Tensor) + self.assertIsInstance(param, nn.Parameter) + self.assertEqual(param, orig_param.to(param.dtype)) + + # Set up the reference parameters and construct the FSDP group + orig_params = 
self._init_params(param_sizes) + fsdp_param_group = self._init_fsdp_param_group( + orig_params, reshard_after_forward + ) + fsdp_params = fsdp_param_group.fsdp_params + module = fsdp_param_group.modules[0] + + # Sanity check that the parameter sharding is as expected + for orig_param, param in zip(orig_params, module.parameters()): + self.assertTrue(isinstance(param, DTensor)) + self.assertEqual(param.full_tensor(), orig_param) + + # Run the foreach all-gather (including copy-in and copy-out) + all_gather(fsdp_param_group, fsdp_param_group.mesh_info.shard_process_group) + + # Check all-gather correctness + check_all_gathered_params(orig_params, module) + + # For reshard after after forward as an int, further test emulating the + # pre-backward all-gather + if type(reshard_after_forward) is not int: + return + fsdp_param_group._to_sharded_post_forward() + all_gather( + fsdp_param_group, + fsdp_param_group.post_forward_mesh_info.shard_process_group, + ) + check_all_gathered_params(orig_params, module) + + @SupportedDevices(['Ascend910B']) + def test_reduce_scatter_fp32(self): + param_sizes = self._get_param_sizes() + default_stream = torch.npu.current_stream() + stream = torch.npu.Stream() + for reduce_scatter_stream in (default_stream, stream): + self._test_reduce_scatter( + param_sizes, + reduce_scatter_stream=reduce_scatter_stream, + reduce_scatter_dtype=torch.float32, + ) + + @SupportedDevices(['Ascend910B']) + def test_reduce_scatter_fp16(self): + param_sizes = self._get_param_sizes() + default_stream = torch.npu.current_stream() + stream = torch.npu.Stream() + for reduce_scatter_stream in (default_stream, stream): + self._test_reduce_scatter( + param_sizes, + reduce_scatter_stream=reduce_scatter_stream, + reduce_scatter_dtype=torch.float16, + ) + + def _test_reduce_scatter( + self, + param_sizes: List[torch.Size], + reduce_scatter_stream: torch.npu.Stream, + reduce_scatter_dtype: torch.dtype, + ): + # Set up the reference parameters and construct the FSDP group + orig_params = self._init_params(param_sizes) + fsdp_param_group = self._init_fsdp_param_group(orig_params, True) + fsdp_params = fsdp_param_group.fsdp_params + fsdp_param_group.comm_ctx.lazy_init(self.device) + + # Run one unshard to initialize metadata + fsdp_param_group.unshard() + fsdp_param_group.wait_for_unshard() + fsdp_param_group.reshard() + + # Run the foreach reduce-scatter (including copy-in and view-out) + torch.manual_seed(42) + unsharded_grads = [torch.ones_like(param) * self.rank for param in orig_params] + group = fsdp_param_group.mesh_info.shard_process_group + self.assertEqual(group.size(), self.world_size) + all_reduce_stream = torch.npu.Stream() + ( + reduce_scatter_input, + reduce_scatter_event, + post_reduce_event, + _, + _, + _, + ) = foreach_reduce( + fsdp_params, + unsharded_grads, + group, + reduce_scatter_stream, + orig_dtype=orig_params[0].dtype, + reduce_dtype=reduce_scatter_dtype, + device=self.device, + reduce_scatter_reduce_op=None, + all_reduce_group=None, + all_reduce_stream=all_reduce_stream, + all_reduce_grads=True, + partial_reduce_output=None, + ) + torch.npu.current_stream().wait_event(post_reduce_event) + + # Check reduce-scatter correctness + predivide_factor, postdivide_factor = _get_gradient_divide_factors( + group, None, reduce_scatter_dtype + ) + reduced_grads = [grad.detach().clone() for grad in unsharded_grads] + for grad in reduced_grads: + _div_if_needed(grad, predivide_factor) + dist.all_reduce( + grad, + group=group, + op=dist.ReduceOp.AVG if predivide_factor is None else 
dist.ReduceOp.SUM, + ) + _div_if_needed(grad, postdivide_factor) + for fsdp_param, reduced_grad in zip(fsdp_params, reduced_grads): + sharded_grad = fsdp_param.sharded_param.grad + self.assertIsInstance(sharded_grad, DTensor) + self.assertEqual(sharded_grad.full_tensor(), reduced_grad) + + +class TestFullyShardCommunication(FSDPNPUTest): + @property + def world_size(self) -> int: + return min(4, torch.npu.device_count()) + + @SupportedDevices(['Ascend910B']) + def test_fully_shard_communication_count(self): + """ + Tests that FSDP issues the expected number of all-gathers and + reduce-scatters during forward and backward. + """ + self.run_subtests( + {"reshard_after_forward": [True, False, 2]}, + self._test_communication_count, + ) + + def _test_communication_count( + self, + reshard_after_forward: Union[bool, int], + ): + torch.manual_seed(42) + model_args = ModelArgs() + model = Transformer(model_args) + fully_shard_fn = functools.partial( + fully_shard, reshard_after_forward=reshard_after_forward + ) + num_blocks = 0 + for module in model.modules(): + if isinstance(module, TransformerBlock): + fully_shard_fn(module) + num_blocks += 1 + fully_shard_fn(model) + # We construct `num_blocks` plus 1 FSDP states/communication groups + + torch.manual_seed(42 + self.rank) + inp = torch.randint(0, model_args.vocab_size, (2, 16), device="npu") + with CommDebugMode() as fwd_comm_mode: + loss = model(inp) + fwd_comm_counts = fwd_comm_mode.get_comm_counts() + self.assertEqual(len(fwd_comm_counts), 1) + self.assertEqual(fwd_comm_counts[c10d_ops._allgather_base_], num_blocks + 1) + with CommDebugMode() as bwd_comm_mode: + loss.sum().backward() + bwd_comm_counts = bwd_comm_mode.get_comm_counts() + if reshard_after_forward is False: + self.assertEqual(len(bwd_comm_counts), 1) + else: + # The root always does not reshard after forward + self.assertEqual(len(bwd_comm_counts), 2) + self.assertEqual(bwd_comm_counts[c10d_ops._allgather_base_], num_blocks) + self.assertEqual( + bwd_comm_counts[c10d_ops._reduce_scatter_base_], num_blocks + 1 + ) + + @SupportedDevices(['Ascend910B']) + def test_manual_reshard_with_reshard_after_forward_false(self): + """ + Tests that we can manually call ``reshard`` on FSDP modules that were + initialized with ``reshard_after_forward=False`` and still run unshard. 
+ """ + torch.manual_seed(42) + model_args = ModelArgs() + model = Transformer(model_args) + for module in model.modules(): + if isinstance(module, TransformerBlock): + fully_shard(module, reshard_after_forward=False) + model = fully_shard(model, reshard_after_forward=False) + num_fsdp_modules = sum( + isinstance(module, FSDPModule) for module in model.modules() + ) + + torch.manual_seed(42 + self.rank) + inp = torch.randint(0, model_args.vocab_size, (2, 16), device="npu") + with CommDebugMode() as fwd_comm_mode: + loss = model(inp) + fwd_comm_counts = fwd_comm_mode.get_comm_counts() + self.assertEqual(len(fwd_comm_counts), 1) + self.assertEqual(fwd_comm_counts[c10d_ops._allgather_base_], num_fsdp_modules) + + for module in model.modules(): + if isinstance(module, FSDPModule): + module.reshard() + + with CommDebugMode() as bwd_comm_mode: + loss.sum().backward() + bwd_comm_counts = bwd_comm_mode.get_comm_counts() + self.assertEqual(len(bwd_comm_counts), 2) + self.assertEqual(bwd_comm_counts[c10d_ops._allgather_base_], num_fsdp_modules) + self.assertEqual( + bwd_comm_counts[c10d_ops._reduce_scatter_base_], num_fsdp_modules + ) + + +class TestFullyShardPrefetch(FSDPNPUTest): + @property + def world_size(self) -> int: + return min(4, torch.npu.device_count()) + + @SupportedDevices(['Ascend910B']) + def test_fully_shard_backward_prefetch(self): + # Activation checkpointing should not affect the expected FSDP events + self.run_subtests( + { + "reshard_after_forward": [True, False, 2], + "checkpoint_impl": [None, "utils", "composable"], + }, + self._test_backward_prefetch_forward_backward, + ) + self.run_subtests( + { + "reshard_after_forward": [True, False, 2], + "checkpoint_impl": [None, "utils", "composable"], + }, + self._test_backward_prefetch_multi_forward, + ) + self._test_backward_prefetch_unused_in_backward(True) + + def _test_backward_prefetch_forward_backward( + self, reshard_after_forward: Union[bool, int], checkpoint_impl: Optional[str] + ): + n_layers = 3 + model, optim, inp = self._init_transformer( + n_layers, reshard_after_forward, checkpoint_impl + ) + events: List[EventType] = [] + unshard_with_record = self._get_unshard_with_record( + FSDPParamGroup.unshard, events + ) + post_backward_with_record = self._get_post_backward_with_record( + FSDPParamGroup.post_backward, events + ) + # Check the order for normal 1 forward, 1 backward, 1 optimizer step + with patch_unshard(unshard_with_record), patch_post_backward( + post_backward_with_record + ): + for iter_idx in range(3): + loss = model(inp) + expected_events = [ + ("unshard", "", TrainingState.FORWARD), # root + ("unshard", "layers.0", TrainingState.FORWARD), + ("unshard", "layers.1", TrainingState.FORWARD), + ("unshard", "layers.2", TrainingState.FORWARD), + ] + self.assertEqual(events, expected_events) + events.clear() + loss.sum().backward() + expected_events = [ + # Root does not reshard after forward so there is no + # unshard event for it in backward + ("unshard", "layers.2", TrainingState.PRE_BACKWARD), + # Explicit backward prefetching moves the unshards early + # by one module (note how swapping each unshard down one + # event would give the natural event order) + ("unshard", "layers.1", TrainingState.PRE_BACKWARD), + ("post_backward", "layers.2", TrainingState.POST_BACKWARD), + ("unshard", "layers.0", TrainingState.PRE_BACKWARD), + ("post_backward", "layers.1", TrainingState.POST_BACKWARD), + ("post_backward", "layers.0", TrainingState.POST_BACKWARD), + ("post_backward", "", TrainingState.POST_BACKWARD), + ] + if 
reshard_after_forward is False: + # No reshard after forward means no backward unshards + expected_events = [e for e in expected_events if e[0] != "unshard"] + self.assertEqual(events, expected_events) + events.clear() + optim.step() + optim.zero_grad(set_to_none=(iter_idx % 2 == 0)) + + def _test_backward_prefetch_multi_forward( + self, reshard_after_forward: Union[bool, int], checkpoint_impl: Optional[str] + ): + n_layers = 3 + model, optim, inp = self._init_transformer( + n_layers, reshard_after_forward, checkpoint_impl + ) + events: List[EventType] = [] + unshard_with_record = self._get_unshard_with_record( + FSDPParamGroup.unshard, events + ) + post_backward_with_record = self._get_post_backward_with_record( + FSDPParamGroup.post_backward, events + ) + # Check the order for multiple forwards before 1 backward + with patch_unshard(unshard_with_record), patch_post_backward( + post_backward_with_record + ): + loss1 = model(inp) + loss2 = model(inp) + expected_events = [ + ("unshard", "", TrainingState.FORWARD), # root + ("unshard", "layers.0", TrainingState.FORWARD), + ("unshard", "layers.1", TrainingState.FORWARD), + ("unshard", "layers.2", TrainingState.FORWARD), + # Root does not reshard after forward so there is not another + # unshard event for it + ("unshard", "layers.0", TrainingState.FORWARD), + ("unshard", "layers.1", TrainingState.FORWARD), + ("unshard", "layers.2", TrainingState.FORWARD), + ] + if reshard_after_forward is False: + # No reshard after forward means no second set of unshards + expected_events = expected_events[:-3] + self.assertEqual(events, expected_events) + events.clear() + (loss1 + loss2).sum().backward() + expected_events = [ + # Same as the single forward/backward case except the root's + # post-backward does not run until the end of backward in the + # final callback (since the input not requiring gradient means + # that we do not have a tensor on which to hook for + # post-backward) + ("unshard", "layers.2", TrainingState.PRE_BACKWARD), + ("unshard", "layers.1", TrainingState.PRE_BACKWARD), + ("post_backward", "layers.2", TrainingState.POST_BACKWARD), + ("unshard", "layers.0", TrainingState.PRE_BACKWARD), + ("post_backward", "layers.1", TrainingState.POST_BACKWARD), + ("post_backward", "layers.0", TrainingState.POST_BACKWARD), + ] + if reshard_after_forward is False: + # No reshard after forward means no backward unshards + expected_events = [e for e in expected_events if e[0] != "unshard"] + # However, the post-backward reshards, so the second set of + # unshards will run as real ops + expected_events += [ + # Repeat the same pattern except with the root's post-backward + # at the end since the final callback runs + ("unshard", "layers.2", TrainingState.PRE_BACKWARD), + ("unshard", "layers.1", TrainingState.PRE_BACKWARD), + ("post_backward", "layers.2", TrainingState.POST_BACKWARD), + ("unshard", "layers.0", TrainingState.PRE_BACKWARD), + ("post_backward", "layers.1", TrainingState.POST_BACKWARD), + ("post_backward", "layers.0", TrainingState.POST_BACKWARD), + ("post_backward", "", TrainingState.POST_BACKWARD), + ] + self.assertEqual(events, expected_events) + events.clear() + + def _test_backward_prefetch_unused_in_backward( + self, reshard_after_forward: Union[bool, int] + ): + """ + Test a model with a linear module then a split into two linear modules, + where we run backward through one path first before the other, meaning + that (1) only one linear of the two split is used per backward and (2) + the initial shared linear is used in both backwards. 
+ """ + dim = 8 + model = nn.Sequential(nn.Linear(dim, dim), DoubleLinear(dim)) + fully_shard(model[0], reshard_after_forward=reshard_after_forward) + fully_shard(model[1].lin1, reshard_after_forward=reshard_after_forward) + fully_shard(model[1].lin2, reshard_after_forward=reshard_after_forward) + fully_shard(model, reshard_after_forward=reshard_after_forward) + inp = torch.randn((4, dim), device="npu") + events: List[EventType] = [] + unshard_with_record = self._get_unshard_with_record( + FSDPParamGroup.unshard, events + ) + post_backward_with_record = self._get_post_backward_with_record( + FSDPParamGroup.post_backward, events + ) + with patch_unshard(unshard_with_record), patch_post_backward( + post_backward_with_record + ): + loss1, loss2 = model(inp) + expected_events = [ + # Root has no parameters, so it does not have an unshard + ("unshard", "0", TrainingState.FORWARD), + ("unshard", "1.lin1", TrainingState.FORWARD), + ("unshard", "1.lin2", TrainingState.FORWARD), + ] + self.assertEqual(events, expected_events) + events.clear() + + model.set_is_last_backward(False) + loss2.sum().backward(retain_graph=True) + expected_events = [ + ("unshard", "1.lin2", TrainingState.PRE_BACKWARD), + # NOTE: This `1.lin1` unshard is a mistargeted prefetch. + ("unshard", "1.lin1", TrainingState.PRE_BACKWARD), + ("post_backward", "1.lin2", TrainingState.POST_BACKWARD), + ("unshard", "0", TrainingState.PRE_BACKWARD), + ("post_backward", "0", TrainingState.POST_BACKWARD), + ] + self.assertEqual(events, expected_events) + events.clear() + + model.set_is_last_backward(True) + loss1.sum().backward() + expected_events = [ + # NOTE: `1.lin1` is already unsharded from the mistargeted + # prefetch in the first backward. + # Prefetch `0` + ("unshard", "0", TrainingState.PRE_BACKWARD), + ("post_backward", "1.lin1", TrainingState.POST_BACKWARD), + ("post_backward", "0", TrainingState.POST_BACKWARD), + ] + self.assertEqual(events, expected_events) + events.clear() + + @SupportedDevices(['Ascend910B']) + def test_set_modules_to_forward_prefetch(self): + n_layers = 4 + reshard_after_forward = True + checkpoint_impl = "utils" + model, _, inp = self._init_transformer( + n_layers, reshard_after_forward, checkpoint_impl + ) + + def set_forward_prefetch(model: Transformer, num_to_prefetch: int) -> None: + # Use model-specific knowledge to configure forward prefetching: + # each transformer block (layer) prefetches for the next few + for i, layer in enumerate(model.layers): + if i >= len(model.layers) - num_to_prefetch: + break + layers_to_prefetch = [model.layers[i + j] for j in range(1, num_to_prefetch + 1)] + layer.set_modules_to_forward_prefetch(layers_to_prefetch) + + events: List[EventType] = [] + unshard_with_record = self._get_unshard_with_record( + FSDPParamGroup.unshard, events + ) + reshard_with_record = self._get_reshard_with_record( + FSDPParamGroup.reshard, events + ) + post_backward_with_record = self._get_post_backward_with_record( + FSDPParamGroup.post_backward, events + ) + expected_backward_events = [ + # Default backward prefetching + ("unshard", "layers.3", TrainingState.PRE_BACKWARD), + ("unshard", "layers.2", TrainingState.PRE_BACKWARD), + ("reshard", "layers.3", TrainingState.POST_BACKWARD), + ("post_backward", "layers.3", TrainingState.POST_BACKWARD), + ("unshard", "layers.1", TrainingState.PRE_BACKWARD), + ("reshard", "layers.2", TrainingState.POST_BACKWARD), + ("post_backward", "layers.2", TrainingState.POST_BACKWARD), + ("unshard", "layers.0", TrainingState.PRE_BACKWARD), + ("reshard", "layers.1", 
TrainingState.POST_BACKWARD), + ("post_backward", "layers.1", TrainingState.POST_BACKWARD), + ("reshard", "layers.0", TrainingState.POST_BACKWARD), + ("post_backward", "layers.0", TrainingState.POST_BACKWARD), + ("reshard", "", TrainingState.POST_BACKWARD), + ("post_backward", "", TrainingState.POST_BACKWARD), + ] + with patch_unshard(unshard_with_record), patch_reshard( + reshard_with_record + ), patch_post_backward(post_backward_with_record): + set_forward_prefetch(model, num_to_prefetch=1) + loss = model(inp) + expected_forward_events = [ + ("unshard", "", TrainingState.FORWARD), + # `layers.i` prefetches `layers.i+1` + ("unshard", "layers.0", TrainingState.FORWARD), + ("unshard", "layers.1", TrainingState.FORWARD), + ("reshard", "layers.0", TrainingState.FORWARD), + ("unshard", "layers.2", TrainingState.FORWARD), + ("reshard", "layers.1", TrainingState.FORWARD), + ("unshard", "layers.3", TrainingState.FORWARD), + ("reshard", "layers.2", TrainingState.FORWARD), + ("reshard", "layers.3", TrainingState.FORWARD), + ] + self.assertEqual(events, expected_forward_events) + events.clear() + loss.sum().backward() + self.assertEqual(events, expected_backward_events) + events.clear() + + set_forward_prefetch(model, num_to_prefetch=2) + loss = model(inp) + expected_forward_events = [ + ("unshard", "", TrainingState.FORWARD), + # `layers.i` prefetches `layers.i+1` and `layers.i+2` + ("unshard", "layers.0", TrainingState.FORWARD), + ("unshard", "layers.1", TrainingState.FORWARD), + ("unshard", "layers.2", TrainingState.FORWARD), + ("reshard", "layers.0", TrainingState.FORWARD), + ("unshard", "layers.3", TrainingState.FORWARD), + ("reshard", "layers.1", TrainingState.FORWARD), + ("reshard", "layers.2", TrainingState.FORWARD), + ("reshard", "layers.3", TrainingState.FORWARD), + ] + self.assertEqual(events, expected_forward_events) + events.clear() + loss.sum().backward() + self.assertEqual(events, expected_backward_events) + events.clear() + + @SupportedDevices(['Ascend910B']) + def test_set_modules_to_backward_prefetch(self): + n_layers = 4 + reshard_after_forward = True + checkpoint_impl = "utils" + model, _, inp = self._init_transformer( + n_layers, reshard_after_forward, checkpoint_impl + ) + + def set_backward_prefetch(model: Transformer, num_to_prefetch: int) -> None: + # Use model-specific knowledge to configure backward prefetching: + # each transformer block (layer) prefetches for the previous few + for i, layer in enumerate(model.layers): + if i < num_to_prefetch: + continue + layers_to_prefetch = [model.layers[i - j] for j in range(1, num_to_prefetch + 1)] + layer.set_modules_to_backward_prefetch(layers_to_prefetch) + + events: List[EventType] = [] + unshard_with_record = self._get_unshard_with_record( + FSDPParamGroup.unshard, events + ) + reshard_with_record = self._get_reshard_with_record( + FSDPParamGroup.reshard, events + ) + post_backward_with_record = self._get_post_backward_with_record( + FSDPParamGroup.post_backward, events + ) + expected_forward_events = [ + # Default forward prefetching + ("unshard", "", TrainingState.FORWARD), # root + ("unshard", "layers.0", TrainingState.FORWARD), + ("reshard", "layers.0", TrainingState.FORWARD), + ("unshard", "layers.1", TrainingState.FORWARD), + ("reshard", "layers.1", TrainingState.FORWARD), + ("unshard", "layers.2", TrainingState.FORWARD), + ("reshard", "layers.2", TrainingState.FORWARD), + ("unshard", "layers.3", TrainingState.FORWARD), + ("reshard", "layers.3", TrainingState.FORWARD), + ] + with patch_unshard(unshard_with_record), 
patch_reshard( + reshard_with_record + ), patch_post_backward(post_backward_with_record): + set_backward_prefetch(model, num_to_prefetch=1) + loss = model(inp) + self.assertEqual(events, expected_forward_events) + events.clear() + loss.sum().backward() + expected_backward_events = [ + # Root prefetches `layers.3` per default + ("unshard", "layers.3", TrainingState.PRE_BACKWARD), + # `layers.i` prefetches for `layers.i-1` (same as default) + ("unshard", "layers.2", TrainingState.PRE_BACKWARD), + ("reshard", "layers.3", TrainingState.POST_BACKWARD), + ("post_backward", "layers.3", TrainingState.POST_BACKWARD), + ("unshard", "layers.1", TrainingState.PRE_BACKWARD), + ("reshard", "layers.2", TrainingState.POST_BACKWARD), + ("post_backward", "layers.2", TrainingState.POST_BACKWARD), + ("unshard", "layers.0", TrainingState.PRE_BACKWARD), + ("reshard", "layers.1", TrainingState.POST_BACKWARD), + ("post_backward", "layers.1", TrainingState.POST_BACKWARD), + ("reshard", "layers.0", TrainingState.POST_BACKWARD), + ("post_backward", "layers.0", TrainingState.POST_BACKWARD), + ("reshard", "", TrainingState.POST_BACKWARD), + ("post_backward", "", TrainingState.POST_BACKWARD), + ] + self.assertEqual(events, expected_backward_events) + events.clear() + + set_backward_prefetch(model, num_to_prefetch=2) + loss = model(inp) + self.assertEqual(events, expected_forward_events) + events.clear() + loss.sum().backward() + expected_backward_events = [ + # Root prefetches `layers.3` per default + ("unshard", "layers.3", TrainingState.PRE_BACKWARD), + # `layers.i` prefetches for `layers.i-1` and `layers.i-2` + ("unshard", "layers.2", TrainingState.PRE_BACKWARD), + ("unshard", "layers.1", TrainingState.PRE_BACKWARD), + ("reshard", "layers.3", TrainingState.POST_BACKWARD), + ("post_backward", "layers.3", TrainingState.POST_BACKWARD), + ("unshard", "layers.0", TrainingState.PRE_BACKWARD), + ("reshard", "layers.2", TrainingState.POST_BACKWARD), + ("post_backward", "layers.2", TrainingState.POST_BACKWARD), + ("reshard", "layers.1", TrainingState.POST_BACKWARD), + ("post_backward", "layers.1", TrainingState.POST_BACKWARD), + ("reshard", "layers.0", TrainingState.POST_BACKWARD), + ("post_backward", "layers.0", TrainingState.POST_BACKWARD), + ("reshard", "", TrainingState.POST_BACKWARD), + ("post_backward", "", TrainingState.POST_BACKWARD), + ] + self.assertEqual(events, expected_backward_events) + events.clear() + + @SupportedDevices(['Ascend910B']) + def test_fully_shard_multi_module_backward_prefetch(self): + n_layers = 5 + model_args = ModelArgs(n_layers=n_layers, checkpoint_activations=True) + model = Transformer(model_args) + for i in range(n_layers): + if i == 0: + fully_shard(model.layers[i]) + elif i % 2 == 1: + fully_shard([model.layers[i], model.layers[i + 1]]) + fully_shard([model.tok_embeddings, model.pos_embeddings]) + fully_shard([model.norm, model.output], reshard_after_forward=False) + fully_shard(model) + optim = torch.optim.AdamW(model.parameters(), lr=1e-2) + + events: List[EventType] = [] + unshard_with_record = self._get_unshard_with_record( + FSDPParamGroup.unshard, events + ) + post_backward_with_record = self._get_post_backward_with_record( + FSDPParamGroup.post_backward, events + ) + inp = torch.randint( + 0, model_args.vocab_size, (2, model_args.max_seq_len), device="npu" + ) + with patch_unshard(unshard_with_record), patch_post_backward( + post_backward_with_record + ): + for _ in range(3): + loss = model(inp) + expected_events = [ + ( + "unshard", + "tok_embeddings, pos_embeddings", + 
TrainingState.FORWARD, + ), + ("unshard", "layers.0", TrainingState.FORWARD), + ("unshard", "layers.1, layers.2", TrainingState.FORWARD), + ("unshard", "layers.3, layers.4", TrainingState.FORWARD), + ("unshard", "norm, output", TrainingState.FORWARD), + ] + self.assertEqual(events, expected_events) + events.clear() + loss.sum().backward() + expected_events = [ + # (norm, output) does not reshard after forward, so there is + # no unshard to begin backward + ("unshard", "layers.3, layers.4", TrainingState.PRE_BACKWARD), + ("post_backward", "norm, output", TrainingState.POST_BACKWARD), + ("unshard", "layers.1, layers.2", TrainingState.PRE_BACKWARD), + ( + "post_backward", + "layers.3, layers.4", + TrainingState.POST_BACKWARD, + ), + ("unshard", "layers.0", TrainingState.PRE_BACKWARD), + ( + "post_backward", + "layers.1, layers.2", + TrainingState.POST_BACKWARD, + ), + ( + "unshard", + "tok_embeddings, pos_embeddings", + TrainingState.PRE_BACKWARD, + ), + ("post_backward", "layers.0", TrainingState.POST_BACKWARD), + ( + "post_backward", + "tok_embeddings, pos_embeddings", + TrainingState.POST_BACKWARD, + ), + ] + events.clear() + optim.step() + optim.zero_grad() + + @SupportedDevices(['Ascend910B']) + def test_fully_shard_multi_module_unused_module(self): + class ModuleWithUnusedLinear(nn.Module): + def __init__(self) -> None: + super().__init__() + self.unused_lin = nn.Linear(1, 1) + self.lin = nn.Linear(16, 16) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return nn.functional.relu(self.lin(x)) + + model = nn.Sequential( + ModuleWithUnusedLinear(), ModuleWithUnusedLinear(), nn.Linear(16, 16) + ) + fully_shard([model[0].unused_lin, model[0].lin], reshard_after_forward=True) + fully_shard([model[1].unused_lin, model[1].lin], reshard_after_forward=True) + fully_shard(model) + optim = torch.optim.AdamW(model.parameters(), lr=1e-2) + + events: List[EventType] = [] + unshard_with_record = self._get_unshard_with_record( + FSDPParamGroup.unshard, events + ) + post_backward_with_record = self._get_post_backward_with_record( + FSDPParamGroup.post_backward, events + ) + inp = torch.randn((2, 16), device="npu") + with patch_unshard(unshard_with_record), patch_post_backward( + post_backward_with_record + ): + for _ in range(3): + loss = model(inp) + expected_events = [ + ("unshard", "", TrainingState.FORWARD), + ("unshard", "0.unused_lin, 0.lin", TrainingState.FORWARD), + ("unshard", "1.unused_lin, 1.lin", TrainingState.FORWARD), + ] + self.assertEqual(events, expected_events) + events.clear() + loss.sum().backward() + expected_events = [ + # Since both `model[0]` and `model[1]` have unused modules + # that never ran forward, they do not reshard after forward + # despite setting it to `True`. Check that there are no + # unshards in backward. 
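+ # Expect only post_backward events: the two child groups in reverse forward
+ # order, then the root.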
+ ( + "post_backward", + "1.unused_lin, 1.lin", + TrainingState.POST_BACKWARD, + ), + ( + "post_backward", + "0.unused_lin, 0.lin", + TrainingState.POST_BACKWARD, + ), + ("post_backward", "", TrainingState.POST_BACKWARD), + ] + events.clear() + optim.step() + optim.zero_grad() + + def test_backward_misprefetch(self): + torch.manual_seed(42) + model = MLP(dim=16, device="npu") + ref_model = copy.deepcopy(model) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + fully_shard(model.in_proj) + fully_shard(model.out_proj) + fully_shard(model) + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + + # Backward should run through `out_proj` -> `in_proj`, so if `in_proj` + # prefetches for `out_proj`, then this is a misprefetch, as `out_proj` + # should not be needed anymore for backward. + model.in_proj.set_modules_to_backward_prefetch([model.out_proj]) + + torch.manual_seed(self.rank + 1) + inp = torch.randn((2, 16), device="npu") + for _ in range(3): + ref_optim.zero_grad() + ref_loss = ref_model(inp).sum() + ref_loss.backward() + for param in ref_model.parameters(): + dist.all_reduce(param.grad, op=dist.ReduceOp.AVG) + ref_optim.step() + optim.zero_grad() + loss = model(inp).sum() + loss.backward() + optim.step() + self.assertEqual(ref_loss, loss) + + def _init_transformer( + self, + n_layers: int, + reshard_after_forward: Union[bool, int], + checkpoint_impl: Optional[str], + ): + model_args = ModelArgs( + n_layers=n_layers, checkpoint_activations=(checkpoint_impl == "utils") + ) + model = Transformer(model_args) + for module in model.modules(): + if isinstance(module, TransformerBlock): + if checkpoint_impl == "composable": + checkpoint(module) + fully_shard(module, reshard_after_forward=reshard_after_forward) + fully_shard(model, reshard_after_forward=reshard_after_forward) + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + inp = torch.randint( + 0, model_args.vocab_size, (2, model_args.max_seq_len), device="npu" + ) + return model, optim, inp + + def _get_unshard_with_record( + self, orig_unshard: Callable, events: List[EventType] + ) -> Callable: + def unshard_with_record(self, *args, **kwargs): + nonlocal events + if ( + self._all_gather_result is None + and self._sharded_state != ShardedState.UNSHARDED + ): # skip no-ops + events.append(("unshard", self._module_fqn, self._training_state)) + return orig_unshard(self, *args, **kwargs) + + return unshard_with_record + + def _get_reshard_with_record( + self, orig_reshard: Callable, events: List[EventType] + ) -> Callable: + def reshard_with_record(self, *args, **kwargs): + nonlocal events + if ( + self._training_state == TrainingState.FORWARD + and not self._reshard_after_forward + ): # skip no-ops + return None + events.append(("reshard", self._module_fqn, self._training_state)) + return orig_reshard(self, *args, **kwargs) + + return reshard_with_record + + def _get_post_backward_with_record( + self, orig_post_backward: Callable, events: List[EventType] + ) -> Callable: + def post_backward_with_record(self, *args, **kwargs): + nonlocal events + ret = orig_post_backward(self, *args, **kwargs) + # Use training state after running post-backward to check that the + # state is transitioned to `POST_BACKWARD` as expected + events.append(("post_backward", self._module_fqn, self._training_state)) + return ret + + return post_backward_with_record + + +class TestFullyShardUnshardMultiProcess(FSDPNPUTest): + @property + def world_size(self) -> int: + return min(torch.npu.device_count(), 2) + + def test_unshard_async(self): + 
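+ # Overlap explicit unshard(async_op=True) calls with the async all-reduces issued by
+ # the ReduceModule instances defined below, then check loss parity against a
+ # replicate() baseline to confirm the overlapped unshards still give correct results.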
class ReduceModule(nn.Module): + def __init__(self, dim: int, mesh: DeviceMesh): + super().__init__() + self.mesh = mesh + self.weight = nn.Parameter(torch.randn(dim, dim)) + + def forward(self, x: torch.Tensor): + y = F.relu(x @ self.weight) + # NOTE: This all-reduce is not differentiable and is included + # to exercise the overlap. + work = dist.all_reduce(y, group=self.mesh.get_group(), async_op=True) + return y, work + + class MLPs(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.mlp1 = MLP(dim) + self.mlp2 = MLP(dim) + self.mlp3 = MLP(dim) + + def forward(self, ys: List[torch.Tensor], works: List[dist.Work]): + (y1, y2, y3), (work1, work2, work3) = ys, works + work1.wait() + z1 = self.mlp1(y1) + work2.wait() + z2 = self.mlp2(y2) + work3.wait() + z3 = self.mlp3(y3) + return z1 + z2 + z3 + + class ReduceModel(nn.Module): + def __init__(self, dim: int, mesh: DeviceMesh): + super().__init__() + self.reduce_module1 = ReduceModule(dim, mesh) + self.reduce_module2 = ReduceModule(dim, mesh) + self.reduce_module3 = ReduceModule(dim, mesh) + self.mlps = MLPs(dim) + + def forward(self, x: torch.Tensor): + y1, work1 = self.reduce_module1(x) + if isinstance(self.mlps.mlp1, FSDPModule): + self.mlps.mlp1.unshard(async_op=True) + y2, work2 = self.reduce_module2(x) + if isinstance(self.mlps.mlp2, FSDPModule): + self.mlps.mlp2.unshard(async_op=True) + y3, work3 = self.reduce_module3(x) + if isinstance(self.mlps.mlp3, FSDPModule): + self.mlps.mlp3.unshard(async_op=True) + return self.mlps([y1, y2, y3], [work1, work2, work3]) + + mesh = init_device_mesh("npu", (self.world_size,)) + batch_size, dim = 2, 8 + torch.manual_seed(42) + ref_model = replicate(ReduceModel(dim, mesh).npu()) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + torch.manual_seed(42) + model = ReduceModel(dim, mesh) + fully_shard(model.mlps.mlp1, reshard_after_forward=False) + fully_shard(model.mlps.mlp2, reshard_after_forward=False) + fully_shard(model.mlps.mlp3, reshard_after_forward=False) + fully_shard(model.mlps) + replicate(model.npu()) + optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=True) + torch.manual_seed(42 + self.rank + 1) + inp = torch.randn((batch_size, dim), device="npu") + for _ in range(10): + losses: List[torch.Tensor] = [] + for _model, _optim in ((ref_model, ref_optim), (model, optim)): + losses.append(_model(inp).sum()) + losses[-1].backward() + with implicit_replication(): + _optim.step() + _optim.zero_grad() + self.assertEqual(losses[0], losses[1]) + + +class TestFullyShardUnshardMultiThread(FSDPTestMultiThread): + @property + def world_size(self) -> int: + return 2 + + def perThreadSetUp(self): + super().perThreadSetUp() + torch.npu.set_device(0) + + @SupportedDevices(['Ascend910B']) + def test_unshard_no_param_group(self): + # Check that we can call `unshard()` on a module with no parameter + # group / no managed parameters without erroring + model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 4)) + for lin in model: + fully_shard(lin) + fully_shard(model) + handle = model.unshard(async_op=True) + handle.wait() + + @SupportedDevices(['Ascend910B']) + def test_unshard_without_lazy_init(self): + torch.manual_seed(42) + model = MLP(4) + for param in model.parameters(): + dist.broadcast(param, src=0) + ref_model = copy.deepcopy(model) + fully_shard(model) + model.unshard() # no lazy init yet + for ref_param, param in zip(ref_model.parameters(), model.parameters()): + self.assertEqual(ref_param, param) + + +if __name__ == "__main__": + run_tests() -- Gitee From 
889ce33d9fad9eeb484521153691e62494c8e0c2 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 15 Jul 2025 03:09:27 +0000 Subject: [PATCH 256/328] !23046 Update op_plugin commit id Merge pull request !23046 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 601c55ad20..f8fab40561 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 601c55ad20d0b1a9db7ffc2c3ffc7b1b09fa9f8c +Subproject commit f8fab40561b64047e20d2a98c7eac6f100cc71b6 -- Gitee From 4d75e8082f422f8ab8657e380ed2d7f03df84fae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Tue, 15 Jul 2025 06:20:33 +0000 Subject: [PATCH 257/328] =?UTF-8?q?!22927=20Add=20get=5Fipc=5Fpid=20and=20?= =?UTF-8?q?check=20base=20format=20Merge=20pull=20request=20!22927=20from?= =?UTF-8?q?=20=E5=A7=9C=E6=80=A1=E6=96=87/v2.7.1=5Fipc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/interface/AclInterface.cpp | 13 +++++++++++++ torch_npu/csrc/core/npu/interface/AclInterface.h | 2 ++ torch_npu/csrc/ipc/StorageSharing.cpp | 8 ++++++++ torch_npu/csrc/npu/Module.cpp | 10 ++++++++++ 4 files changed, 33 insertions(+) diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 583d37be6f..30a4280edc 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -89,6 +89,7 @@ LOAD_FUNCTION(aclrtIpcMemClose) LOAD_FUNCTION(aclrtMemExportToShareableHandle) LOAD_FUNCTION(aclrtMemSetPidToShareableHandle) LOAD_FUNCTION(aclrtMemImportFromShareableHandle) +LOAD_FUNCTION(aclrtDeviceGetBareTgid) aclprofStepInfoPtr init_stepinfo() { typedef aclprofStepInfoPtr(*npdInitFunc)(); @@ -1020,5 +1021,17 @@ aclError AclrtMemImportFromShareableHandle(uint64_t shareableHandle, int32_t dev return func(shareableHandle, deviceId, handle); } +aclError AclrtDeviceGetBareTgid(int32_t *pid) +{ + typedef aclError (*AclrtDeviceGetBareTgid)(int32_t *); + static AclrtDeviceGetBareTgid func = nullptr; + if (func == nullptr) { + func = (AclrtDeviceGetBareTgid) GET_FUNC(aclrtDeviceGetBareTgid); + } + + TORCH_CHECK(func, "Failed to find function aclrtDeviceGetBareTgid", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(pid); +} + } // namespace acl } // namespace c10 diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index d6c9a78aa4..373aca671f 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -243,5 +243,7 @@ aclError AclrtMemSetPidToShareableHandle(uint64_t shareableHandle, int32_t *pid, aclError AclrtMemImportFromShareableHandle(uint64_t shareableHandle, int32_t deviceId, aclrtDrvMemHandle *handle); +aclError AclrtDeviceGetBareTgid(int32_t *pid); + } // namespace acl } // namespace c10_npu diff --git a/torch_npu/csrc/ipc/StorageSharing.cpp b/torch_npu/csrc/ipc/StorageSharing.cpp index 1169cbd1c5..18fdd4c5e0 100644 --- a/torch_npu/csrc/ipc/StorageSharing.cpp +++ b/torch_npu/csrc/ipc/StorageSharing.cpp @@ -14,6 +14,8 @@ #include "torch_npu/csrc/core/NPUBridge.h" #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUGuard.h" +#include "torch_npu/csrc/core/NPUStorageImpl.h" +#include "torch_npu/csrc/framework/FormatHelper.h" #include 
"torch_npu/csrc/ipc/NPUIPCTypes.h" #include "torch_npu/csrc/ipc/StorageSharing.h" @@ -33,6 +35,12 @@ static PyObject* THNPStorage_shareNpu(PyObject* self, PyObject* args) "_share_npu_: only available on NPU.", PTA_ERROR(ErrCode::PARAM)); c10::StorageImpl* storage_impl = storage.unsafeGetStorageImpl(); + auto npu_storage_impl = static_cast(storage.unsafeGetStorageImpl()); + auto format = npu_storage_impl->npu_desc_.npu_format_; + TORCH_CHECK(at_npu::native::FormatHelper::IsBaseFormatType(format), + "Try to share a storage without base format", + PTA_ERROR(ErrCode::TYPE)); + if (storage_impl->received_cuda()) { AT_ERROR( "Supported to send NPU tensor received from another process; other is not currently supported. Consider cloning before sending."); diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 09e158364b..72be776671 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -1676,6 +1676,15 @@ static PyObject* THNPModule_add_ipc_pid(PyObject* self, PyObject *args) END_HANDLE_TH_ERRORS } +static PyObject* THNPModule_get_ipc_pid(PyObject* self, PyObject *noargs) +{ + HANDLE_TH_ERRORS + int32_t pid; + NPU_CHECK_ERROR(c10_npu::acl::AclrtDeviceGetBareTgid(&pid)); + return THPUtils_packInt32(pid); + END_HANDLE_TH_ERRORS +} + static PyObject* THNPModule_add_p2p_access(PyObject* self, PyObject *args) { HANDLE_TH_ERRORS @@ -1753,6 +1762,7 @@ static struct PyMethodDef THNPModule_methods[] = { {"_get_cann_version", (PyCFunction)THNPModule_get_cann_version, METH_O, nullptr}, {"_is_gte_cann_version", (PyCFunction)THNPModule_is_gte_cann_version, METH_VARARGS, nullptr}, {"_add_ipc_pid", (PyCFunction)THNPModule_add_ipc_pid, METH_VARARGS, nullptr}, + {"_get_ipc_pid", (PyCFunction)THNPModule_get_ipc_pid, METH_NOARGS, nullptr}, {"_add_p2p_access", (PyCFunction)THNPModule_add_p2p_access, METH_VARARGS, nullptr}, {nullptr}}; -- Gitee From ebcb6605b38690ec463c02c2622a58b5cd247ba8 Mon Sep 17 00:00:00 2001 From: sikingbo <929691988@qq.com> Date: Tue, 15 Jul 2025 09:31:41 +0000 Subject: [PATCH 258/328] !22765 _npu_format Merge pull request !22765 from sikingbo/v2.7.1 --- test/npu/test_npu_format.py | 49 +++++++++++++++++++++++++++++++++++++ torch_npu/__init__.py | 3 +++ torch_npu/npu/_format.py | 38 ++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+) create mode 100644 test/npu/test_npu_format.py create mode 100644 torch_npu/npu/_format.py diff --git a/test/npu/test_npu_format.py b/test/npu/test_npu_format.py new file mode 100644 index 0000000000..2bc1c067ff --- /dev/null +++ b/test/npu/test_npu_format.py @@ -0,0 +1,49 @@ +import torch +import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests + + +class TestNPUFormat(TestCase): + + def test_enum_values(self): + """test the enumeration value""" + self.assertEqual(torch_npu.Format.NCHW.value, 0) + self.assertEqual(torch_npu.Format.NHWC.value, 1) + + def test_npu_format_cast(self): + """test npu_format_cast""" + tensor = torch.ones(2, 2).npu() + + out1 = torch_npu.npu_format_cast(tensor, 0) + fmt1 = torch_npu.get_npu_format(out1) + self.assertEqual(fmt1, torch_npu.Format.NCHW) + + out2 = torch_npu.npu_format_cast(tensor, torch_npu.Format.NHWC) + fmt2 = torch_npu.get_npu_format(out2) + self.assertEqual(fmt2, torch_npu.Format.NHWC) + + def test_npu_format_cast_(self): + """test npu_format_cast_""" + x1 = torch.ones(2, 2).npu() + x2 = torch.ones(2, 2).npu() + + torch_npu.npu_format_cast_(x1, 0) + fmt1 = torch_npu.get_npu_format(x1) + self.assertEqual(fmt1, torch_npu.Format.NCHW) + + 
torch_npu.npu_format_cast_(x2, torch_npu.Format.NHWC) + fmt2 = torch_npu.get_npu_format(x2) + self.assertEqual(fmt2, torch_npu.Format.NHWC) + + def test_get_npu_format(self): + """test get_npu_format""" + x1 = torch.ones(2, 2).npu() + torch_npu.npu_format_cast_(x1, 0) + + fmt1 = torch_npu.get_npu_format(x1) + self.assertEqual(fmt1, torch_npu.Format.NCHW) + self.assertEqual(fmt1, 0) + + +if __name__ == "__main__": + run_tests() diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index d811a332a6..ffbef110b0 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -75,6 +75,7 @@ from torch_npu.utils import _apply_module_patch, _add_tensor_methods, _add_colle _apply_npu_show_warning from torch_npu.utils._dynamo_device import _dynamo_register_interface_for_device from torch_npu.npu._stream_check import apply_sanitizer_patch +from torch_npu.npu._format import _apply_npu_format_patch import torch_npu.utils.custom_ops import torch_npu.distributed.rpc import torch_npu.op_plugin @@ -175,6 +176,7 @@ def _apply_class_patches(): _apply_distributed_methods_patch() _apply_mstx_patch() _add_reductions_methods() + _apply_npu_format_patch() _apply_fsdp_patch() @@ -192,6 +194,7 @@ def _apply_distributed_methods_patch(): torch.distributed.launcher.api._get_addr_and_port = torch_npu.distributed.distributed_c10d._trigger__get_addr_and_port_decorator(torch.distributed.launcher.api._get_addr_and_port) +torch.serialization.add_safe_globals([torch_npu.npu._format.Format]) torch.utils.rename_privateuse1_backend("npu") # rename device name to 'npu' and register funcs torch._register_device_module('npu', torch_npu.npu) diff --git a/torch_npu/npu/_format.py b/torch_npu/npu/_format.py new file mode 100644 index 0000000000..beb65e076f --- /dev/null +++ b/torch_npu/npu/_format.py @@ -0,0 +1,38 @@ +from enum import IntEnum + +import torch +import torch_npu + + +class Format(IntEnum): + """NPU storage format enumeration class""" + UNDEFINED = -1 + NCHW = 0 + NHWC = 1 + ND = 2 + NC1HWC0 = 3 + FRACTAL_Z = 4 + NC1HWC0_C04 = 12 + HWCN = 16 + NDHWC = 27 + FRACTAL_NZ = 29 + NCDHW = 30 + NDC1HWC0 = 32 + FRACTAL_Z_3D = 33 + NC = 35 + NCL = 47 + + def __str__(self): + return self.name + + +def _apply_npu_format_patch(): + orig_get_format = torch_npu.get_npu_format + + def patched_get_format(tensor): + """get the Format type of tensor""" + format_int = orig_get_format(tensor) + return Format(format_int) + + torch_npu.get_npu_format = patched_get_format + torch_npu.Format = Format -- Gitee From f4c62979a74e6cd63bba556c0a39e8956f3d8960 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Tue, 15 Jul 2025 11:19:45 +0000 Subject: [PATCH 259/328] =?UTF-8?q?!23069=20reset=20ACL=5FOP=5FINIT=5FMODE?= =?UTF-8?q?=3D0=20Merge=20pull=20request=20!23069=20from=20=E5=A7=9C?= =?UTF-8?q?=E6=80=A1=E6=96=87/v2.7.1=5Faop?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/register/OptionsManager.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index 2e0bbeadf3..8810a7fb08 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -482,11 +482,11 @@ uint32_t OptionsManager::GetAclOpInitMode() const static uint32_t acl_op_init_mode = []() -> uint32_t { char* buf_val = std::getenv("ACL_OP_INIT_MODE"); // Default 0 - int64_t acl_op_init_mode = (buf_val 
!= nullptr) ? strtol(buf_val, nullptr, 10) : 1; + int64_t acl_op_init_mode = (buf_val != nullptr) ? strtol(buf_val, nullptr, 10) : 0; std::unordered_map aclOpInitMode = getAclOpInitMode(); if (aclOpInitMode.find(acl_op_init_mode) == aclOpInitMode.end()) { - acl_op_init_mode = 1; - TORCH_NPU_WARN_ONCE("Get env ACL_OP_INIT_MODE not in [0, 1, 2], so reset it to the default value 1."); + acl_op_init_mode = 0; + TORCH_NPU_WARN_ONCE("Get env ACL_OP_INIT_MODE not in [0, 1, 2], so reset it to the default value 0."); } return static_cast(acl_op_init_mode); }(); -- Gitee From 44fb8cc253f011dd6841bc6f580a29e7282a2d9c Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Tue, 15 Jul 2025 13:13:11 +0000 Subject: [PATCH 260/328] !22901 support NSLB-DP Merge pull request !22901 from SCh-zx/nslb27 --- third_party/hccl/inc/hccl/hccl.h | 2 + third_party/hccl/inc/hccl/hccl_types.h | 4 +- .../csrc/distributed/ProcessGroupHCCL.cpp | 67 +++++++++++++++++++ .../csrc/distributed/ProcessGroupHCCL.hpp | 8 ++- 4 files changed, 79 insertions(+), 2 deletions(-) diff --git a/third_party/hccl/inc/hccl/hccl.h b/third_party/hccl/inc/hccl/hccl.h index 023914a348..216ef7a838 100644 --- a/third_party/hccl/inc/hccl/hccl.h +++ b/third_party/hccl/inc/hccl/hccl.h @@ -212,6 +212,8 @@ inline void HcclCommConfigInit(HcclCommConfig *config) config->hcclRdmaTrafficClass = HCCL_COMM_TRAFFIC_CLASS_CONFIG_NOT_SET; config->hcclRdmaServiceLevel = HCCL_COMM_SERVICE_LEVEL_CONFIG_NOT_SET; config->hcclOpExpansionMode = HCCL_COMM_DEFAULT_OP_EXPANSION_MODE; + config->hcclWorldRankID = 0; + config->hcclJobID = 0; } /** diff --git a/third_party/hccl/inc/hccl/hccl_types.h b/third_party/hccl/inc/hccl/hccl_types.h index 40631676c1..9a02c61c04 100644 --- a/third_party/hccl/inc/hccl/hccl_types.h +++ b/third_party/hccl/inc/hccl/hccl_types.h @@ -15,7 +15,7 @@ extern "C" { const uint32_t HCCL_COMM_CONFIG_INFO_BYTES = 24; const uint32_t HCCL_COMM_CONFIG_MAGIC_WORD = 0xf0f0f0f0; -const uint32_t HCCL_COMM_CONFIG_VERSION = 5; +const uint32_t HCCL_COMM_CONFIG_VERSION = 6; const uint32_t HCCL_COMM_DEFAULT_BUFFSIZE = 200; // 200MB buffer size const uint32_t HCCL_COMM_DEFAULT_DETERMINISTIC = 0; // Disable deterministic calculations const uint32_t COMM_NAME_MAX_LENGTH = 128; @@ -132,6 +132,8 @@ typedef struct HcclCommConfigDef { uint32_t hcclOpExpansionMode; uint32_t hcclRdmaTrafficClass; uint32_t hcclRdmaServiceLevel; + uint32_t hcclWorldRankID; + uint64_t hcclJobID; } HcclCommConfig; typedef enum { diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 3a03ea4111..d298e74eeb 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -19,8 +19,12 @@ #include #include #include +#include +#include #include +#include + #include "op_plugin/OpInterface.h" #include "third_party/acl/inc/acl/acl.h" #include "third_party/acl/inc/acl/acl_base.h" @@ -63,6 +67,7 @@ constexpr const char* P2P_DEVICE_KEY = "_p2p"; using hcclUs = std::chrono::steady_clock::time_point; constexpr int32_t MAX_GROUP_NAME_LEN = 128; +constexpr int32_t NSLB_JOBID_OFFSET = 32; // HCCL ReduceOp mapping std::map hcclOp = { @@ -950,6 +955,24 @@ ProcessGroupHCCL::ProcessGroupHCCL( c10d::PrefixStore *prefixStore = dynamic_cast(store_.get()); globalStore_ = prefixStore ? 
prefixStore->getUnderlyingNonPrefixStore() : store_; + c10::intrusive_ptr getTcpStore = store_; + while (getTcpStore) { + c10d::PrefixStore *asPrefixStore = dynamic_cast(getTcpStore.get()); + c10d::TCPStore *tcpStore = dynamic_cast(getTcpStore.get()); + if (tcpStore) { + if (!(tcpStore->getHost().empty())) { + tcpMasterAddr = tcpStore->getHost(); + tcpMasterPort = tcpStore->getPort(); + break; + } + } + if (asPrefixStore) { + getTcpStore = asPrefixStore->getUnderlyingStore(); + } else { + break; + } + } + try { if (blockingWait != nullptr) { auto val = std::stoi(blockingWait); @@ -2155,6 +2178,30 @@ std::vector>& ProcessGroupHCCL::getHCCLComm( return createHCCLComm(devicesKey, devices, commType, commConfig, p2pRank); } +void ProcessGroupHCCL::setNSLBCommConfig(HcclCommConfig** commConfig) +{ + const char* envPtr = std::getenv("RANK"); + if (envPtr == nullptr) { + ASCEND_LOGI("Failed to get env info for NSLB-DP."); + return; + } + uint32_t worldRankID = std::stoi(std::string(envPtr)); + options_->hccl_config["hccl_world_rank_id"] = worldRankID; + uint32_t masterPort = tcpMasterPort; + struct sockaddr_in sa; + std::string master_addr = tcpMasterAddr; + inet_pton(AF_INET, std::string(master_addr).c_str(), &(sa.sin_addr)); + uint32_t masterIp = ntohl(sa.sin_addr.s_addr); + uint64_t jobID = masterPort; + jobID = (jobID << NSLB_JOBID_OFFSET); + jobID += masterIp; + options_->hccl_config["hccl_job_id"] = jobID; + if ((*commConfig) != nullptr) { + (*commConfig)->hcclWorldRankID = worldRankID; + (*commConfig)->hcclJobID = jobID; + } +} + void ProcessGroupHCCL::createHCCLComm( const std::string& devicesKey, const std::vector& devices, @@ -2179,6 +2226,10 @@ void ProcessGroupHCCL::createHCCLComm( HcclCommConfig config; + if (options_->global_ranks_in_group.empty()) { + setNSLBCommConfig(&commConfig); + } + npuGuard.set_index(devices[i].index()); switch (commType) { case HcclCommType::DEFAULT: @@ -3118,6 +3169,22 @@ HcclCommConfig ProcessGroupHCCL::createHcclCommConfigWithOptions() } } + if (options_->hccl_config.find("hccl_world_rank_id") != options_->hccl_config.end()) { + if (std::holds_alternative(options_->hccl_config["hccl_world_rank_id"])) { + config.hcclOpExpansionMode = std::get(options_->hccl_config["hccl_world_rank_id"]); + } else { + TORCH_CHECK(false, "Value type of hccl_world_rank_id should be int.", DIST_ERROR(ErrCode::TYPE)); + } + } + + if (options_->hccl_config.find("hccl_job_id") != options_->hccl_config.end()) { + if (std::holds_alternative(options_->hccl_config["hccl_job_id"])) { + config.hcclOpExpansionMode = std::get(options_->hccl_config["hccl_job_id"]); + } else { + TORCH_CHECK(false, "Value type of hccl_job_id should be int.", DIST_ERROR(ErrCode::TYPE)); + } + } + return config; } diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index 9c2f365b3e..7d2c3e94ed 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -384,7 +384,7 @@ public: return c10::make_intrusive(_is_high_priority_stream); } - std::unordered_map> hccl_config; + std::unordered_map> hccl_config; std::chrono::milliseconds opTimeout; // Schedule HCCL operations on high priority CUDA streams @@ -571,6 +571,8 @@ public: void resumeHcclComm(int device_id); + void setNSLBCommConfig(HcclCommConfig** commConfig); + bool setCommWorkingDevNic( const HcclComm& comm, int nranks, @@ -960,6 +962,10 @@ protected: std::string pg_desc_; + std::string tcpMasterAddr; + + uint32_t tcpMasterPort; + private: // 
Helper that encapsulates work shared across all collective communication // primitives. -- Gitee From 35ae646dd7b0045678760668cf24766952a4b207 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Tue, 15 Jul 2025 13:50:09 +0000 Subject: [PATCH 261/328] !23011 getdevice without setdevice Merge pull request !23011 from huangyunlong/2.7dd --- torch_npu/csrc/core/npu/NPUFunctions.cpp | 8 +------- torch_npu/csrc/core/npu/NPUStream.cpp | 2 ++ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index 085bb0be9d..3c4920ec1b 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -46,7 +46,6 @@ aclError GetDevice(int32_t *device) { if (targetDeviceIndex >= 0) { *device = targetDeviceIndex; - NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(targetDeviceIndex)); return ACL_ERROR_NONE; } @@ -60,13 +59,8 @@ aclError GetDevice(int32_t *device) } if (err == ACL_ERROR_NONE) { local_device = *device; - } else if (err == ACL_ERROR_RT_CONTEXT_NULL && aclrtSetDevice(0) == ACL_ERROR_NONE) { + } else if (err == ACL_ERROR_RT_CONTEXT_NULL) { *device = 0; - local_device = 0; - std::lock_guard lock(mtx); - if (used_devices.find(local_device) == used_devices.end()) { - NPU_CHECK_ERROR_WITHOUT_UCE(aclrtGetCurrentContext(&used_devices[local_device])); - } return ACL_ERROR_NONE; } return err; diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index 4411760ab4..cc8a53c54d 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -229,6 +229,8 @@ static void initNPUStreamsOnce() { // Inits default and secondary streams (once, globally) c10::DeviceIndex device_index = current_device(); + // makesure on real devcie + SetTargetDevice(); if (!initialize_flag[device_index]) { std::lock_guard lock(mtx[device_index]); if (!initialize_flag[device_index]) { -- Gitee From 9cdaa2dbe76348ea0543baa9261cf7720d12550f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Tue, 15 Jul 2025 14:04:49 +0000 Subject: [PATCH 262/328] =?UTF-8?q?!23062=20fix=20HcclCommResumeFace=20wit?= =?UTF-8?q?h=20P2P=20hccl=20comm=20Merge=20pull=20request=20!23062=20from?= =?UTF-8?q?=20=E7=8E=8B=E8=B6=85/v2.7.0=5Fresume?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../csrc/distributed/ProcessGroupHCCL.cpp | 31 ++++++++++++++----- .../csrc/distributed/ProcessGroupHCCL.hpp | 2 ++ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index d298e74eeb..2b3a01b2c4 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -1204,6 +1204,7 @@ void ProcessGroupHCCL::abortAndClearHcclComm(c10::optional abortRea abortCommsFromMap(devHCCLCommMap_, rank_, abortReason); devHCCLCommMap_.clear(); devHCCLCommNameMap_.clear(); + p2pSendRecvKeys_.clear(); hcclCommCounter_ = 0; return; } @@ -1246,6 +1247,7 @@ ProcessGroupHCCL::~ProcessGroupHCCL() } } devHCCLCommMap_.clear(); + p2pSendRecvKeys_.clear(); } ASCEND_LOGI("process group destroyed, group id is %s.", options_->group_id.c_str()); logger->info("process group destroyed, group id is %s.", options_->group_id.c_str()); @@ -2360,6 +2362,9 @@ bool ProcessGroupHCCL::createHCCLCommEx( return false; } hcclComms[i] = subComm; + if (commType == HcclCommType::P2P) { + hcclComms[i]->p2pPeer = 
getP2pPeer(); + } // Creates the HCCL streams streamVal.push_back(getNPUStreamByCurrentType(devices[i].index())); } @@ -2462,6 +2467,14 @@ std::vector>& ProcessGroupHCCL::createHCCLComm( // Move the HCCL resource to cache devHCCLCommMap_.emplace(devicesKey, std::move(hcclComms)); + if (commType == HcclCommType::P2P) { + auto iter = p2pSendRecvKeys_.find(rank_); + if (iter == p2pSendRecvKeys_.end()) { + p2pSendRecvKeys_.emplace(rank_, std::vector{devicesKey}); + } else { + iter->second.push_back(devicesKey); + } + } return devHCCLCommMap_[devicesKey]; } @@ -2818,14 +2831,16 @@ void ProcessGroupHCCL::resumeHcclComm(int device_id) HCCL_CHECK_ERROR(at_npu::hccl::HcclCommResumeFace(comm)); } } - if (hcclCommInitRootInfoConfigExist() && c10_npu::option::OptionsManager::GetP2PBufferSize() != 0) { - key = getKeySendRecv(rank_, getP2pPeer()); - if (devHCCLCommMap_.find(key) != devHCCLCommMap_.end()) { - // Reuse the cached communicator if there is one. - auto& hcclComms = devHCCLCommMap_[key]; - for (const auto& hcclComm : hcclComms) { - auto comm = hcclComm->getHcclComm(); - HCCL_CHECK_ERROR(at_npu::hccl::HcclCommResumeFace(comm)); + if (p2pSendRecvKeys_.find(rank_) != p2pSendRecvKeys_.end()) { + auto p2pKeys = p2pSendRecvKeys_[rank_]; + for (const auto& p2pKey : p2pKeys) { + if (devHCCLCommMap_.find(p2pKey) != devHCCLCommMap_.end()) { + // Reuse the cached communicator if there is one. + auto& hcclComms = devHCCLCommMap_[p2pKey]; + for (const auto& hcclComm : hcclComms) { + auto comm = hcclComm->getHcclComm(); + HCCL_CHECK_ERROR(at_npu::hccl::HcclCommResumeFace(comm)); + } } } } diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index 7d2c3e94ed..057afe5ccb 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -748,6 +748,8 @@ protected: // // Note that the order of the device for the tensor list matters. 
std::unordered_map>> devHCCLCommMap_; + + std::unordered_map> p2pSendRecvKeys_; std::unordered_map devHCCLCommNameMap_; -- Gitee From d8efa4daa773fffa70feef8850e8b8ba7e027c4c Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Wed, 16 Jul 2025 01:29:26 +0000 Subject: [PATCH 263/328] !23036 Update torchair commit id Merge pull request !23036 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index ec5747ba54..e4bf05da76 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit ec5747ba5477a4508131ca4401088e7383908266 +Subproject commit e4bf05da768a6d9a98e67f86c269e41a2369d02b -- Gitee From 05629405f3432d9372f5cb5ae3285d30b2f870b3 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 16 Jul 2025 05:09:28 +0000 Subject: [PATCH 264/328] !23076 Update op_plugin commit id Merge pull request !23076 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index f8fab40561..8407b7cbb0 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit f8fab40561b64047e20d2a98c7eac6f100cc71b6 +Subproject commit 8407b7cbb0c7046f80d006987170db775d637cc5 -- Gitee From 2aa8cfd6bb160b0f683ff51857838a53434295f1 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 16 Jul 2025 09:09:28 +0000 Subject: [PATCH 265/328] !23090 Update op_plugin commit id Merge pull request !23090 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 8407b7cbb0..5a0e15319f 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 8407b7cbb0c7046f80d006987170db775d637cc5 +Subproject commit 5a0e15319f574e57f24e3b6ec36b3904d19dc6f1 -- Gitee From 250319f31f9895f993f4f4c9579e63a714858b3d Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 16 Jul 2025 11:39:26 +0000 Subject: [PATCH 266/328] !23105 Update op_plugin commit id Merge pull request !23105 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5a0e15319f..8c6e07f890 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5a0e15319f574e57f24e3b6ec36b3904d19dc6f1 +Subproject commit 8c6e07f890f86f7963de230516afc46b77dc95d9 -- Gitee From ce1cf92df3e667b361affa7d0e87a735722b308f Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Thu, 17 Jul 2025 01:38:01 +0000 Subject: [PATCH 267/328] !23114 Update torchair commit id Merge pull request !23114 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index e4bf05da76..67ea6dfe9d 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit e4bf05da768a6d9a98e67f86c269e41a2369d02b +Subproject commit 67ea6dfe9d879d03701a3d668a428b94afcbe521 -- Gitee From b3d11c6b519d30fb6ad05f1210f5ccafffae4dc5 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 17 Jul 2025 03:09:31 +0000 Subject: [PATCH 268/328] !23116 Update op_plugin commit id Merge pull request !23116 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/third_party/op-plugin b/third_party/op-plugin
index 8c6e07f890..312f5ee3a0 160000
--- a/third_party/op-plugin
+++ b/third_party/op-plugin
@@ -1 +1 @@
-Subproject commit 8c6e07f890f86f7963de230516afc46b77dc95d9
+Subproject commit 312f5ee3a041e563ca2b8ada6139384be6b3e4b5
--
Gitee

From 9b60b3b7751fe83bf720429a60a98af3dc300741 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?=
Date: Thu, 17 Jul 2025 07:09:08 +0000
Subject: [PATCH 269/328] =?UTF-8?q?!23098=20add=20cve=20info=20Merge=20pul?=
 =?UTF-8?q?l=20request=20!23098=20from=20=E7=8E=8B=E8=B6=85/v2.7.0=5Fcve1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.zh.md    |   2 +-
 SECURITYNOTE.md |  85 ++++++++++++++++++++++++++++++++++++++++++++++++
 figures/cve.png | Bin 0 -> 114982 bytes
 3 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 figures/cve.png

diff --git a/README.zh.md b/README.zh.md
index 44cd229629..bddb97a048 100644
--- a/README.zh.md
+++ b/README.zh.md
@@ -259,7 +259,7 @@ AscendPyTorch版本分支的维护阶段如下:

 ## 安全声明

-[Ascend Extension for PyTorch插件 安全声明](https://gitee.com/ascend/pytorch/blob/master/SECURITYNOTE.md)
+[Ascend Extension for PyTorch插件 安全声明](./SECURITYNOTE.md)

 ## 参考文档

diff --git a/SECURITYNOTE.md b/SECURITYNOTE.md
index 6856b92996..0805d0a4a4 100644
--- a/SECURITYNOTE.md
+++ b/SECURITYNOTE.md
@@ -225,3 +225,88 @@ PyTorch提供分布式训练能力,支持在单机和多机场景下进行训
 | 版本 | 所有版本 | 所有版本 |
 | 特殊场景 | 无 | 无 |
 | 备注 | 该通信过程由开源软件PyTorch控制,配置为PyTorch原生设置,可参考[PyTorch文档](https://pytorch.org/docs/stable/distributed.html#launch-utility)。源端口由操作系统自动分配,分配范围由操作系统的配置决定,例如ubuntu:采用/proc/sys/net/ipv4/ipv4_local_port_range文件指定,可通过cat /proc/sys/net/ipv4/ipv4_local_port_range或sysctl net.ipv4.ip_local_port_range查看 | 该通信过程由CANN中HCCL组件控制,torch_npu不进行控制,端口范围可参考[《环境变量参考》](https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/apiref/envvar/envref_07_0001.html)的“执行相关 > 集合通信与分布式训练 > 集合通信相关配置>HCCL_IF_BASE_PORT” |
+
+## Vulnerability Management Mechanism
+
+The Ascend Extension for PyTorch community takes the security of community releases very seriously and has a dedicated vulnerability manager who handles vulnerability-related matters. To build a more secure end-to-end AI toolchain, you are also welcome to take part.
+
+### Vulnerability Handling Process
+
+For every security vulnerability, the Ascend Extension for PyTorch community assigns personnel to track and handle it. The end-to-end handling process is shown in the figure below.
+
+![Vulnerability handling process](./figures/cve.png)
+
+The reporting, assessment, and disclosure steps are explained in more detail below.
+
+### Vulnerability Reporting
+
+You can contact the Ascend Extension for PyTorch community team by submitting an issue; a dedicated security contact will get in touch with you as soon as possible.
+Note: to keep the process secure, do not include security- or privacy-sensitive details in the issue.
+
+#### Response to Reports
+
+1. The Ascend Extension for PyTorch community confirms, analyzes, and escalates a reported security vulnerability within 3 working days and starts the security handling process.
+2. After the vulnerability is confirmed, the Ascend Extension for PyTorch security team dispatches the issue and follows up on it.
+3. We keep the report updated as the vulnerability moves through triage, confirmation, fixing, and release.
+
+### Vulnerability Assessment
+
+The industry commonly uses the CVSS standard to rate the severity of vulnerabilities. When Ascend Extension for PyTorch assesses a vulnerability with CVSS v3.1, an attack scenario must be defined and the assessment is based on the actual impact in that scenario. Severity assessment evaluates how easily the vulnerability can be exploited and the impact of a successful exploit on confidentiality, integrity, and availability, and produces a score.
+
+#### Assessment Criteria
+
+Ascend Extension for PyTorch rates the severity of a vulnerability using the following vectors:
+
+- Attack Vector (AV): how "remote" the attack is and how the vulnerability is exploited.
+- Attack Complexity (AC): how difficult the attack is to carry out and which factors are required for it to succeed.
+- User Interaction (UI): whether the attack requires user participation.
+- Privileges Required (PR): the level of authentication an attacker needs for a successful attack.
+- Scope (S): whether the attacker can affect components at other privilege levels.
+- Confidentiality (C): the impact of disclosing information to unauthorized parties.
+- Integrity (I): the impact of information being tampered with.
+- Availability (A): the impact on users who need to access data or services.
+
+#### Assessment Principles
+
+- Assess the severity of the vulnerability, not the risk.
+- The assessment must be based on an attack scenario in which a successful attack affects the confidentiality, integrity, or availability of the system.
+- If a vulnerability has multiple attack scenarios, use the scenario with the greatest impact, i.e., the highest CVSS score.
+- For a vulnerability in an embedded or called library, determine the attack scenario from how the library is used in the product before assessing it.
+- If a security defect cannot be triggered or does not affect CIA (confidentiality, integrity, availability), the CVSS score is 0.
+
+#### Assessment Steps
+
+To rate the severity of a vulnerability, follow these steps:
+
+1. Define the possible attack scenarios and score against them.
+2. Identify the vulnerable component (Vulnerable Component) and the impacted component (Impact Component).
+
+3. Choose the values for the base metrics.
+
+   - For the exploitability metrics (Attack Vector, Attack Complexity, Privileges Required, User Interaction, Scope), choose the values based on the vulnerable component.
+
+   - The impact metrics (Confidentiality, Integrity, Availability) reflect either the impact on the vulnerable component or the impact on the impacted component, whichever is more severe.
+
+#### Severity Classification
+
+| **Severity Rating** | **CVSS Score** | **Time to Fix** |
+| ------------------- | -------------- | --------------- |
+| Critical | 9.0~10.0 | 7 days |
+| High | 7.0~8.9 | 14 days |
+| Medium | 4.0~6.9 | 30 days |
+| Low | 0.1~3.9 | 30 days |
+
+### Vulnerability Disclosure
+
+After a security vulnerability is fixed, the Ascend Extension for PyTorch community publishes a Security Advisory (SA) and a Security Notice (SN). The advisory covers the technical details, type, and reporter of the vulnerability, its CVE ID, and the affected and fixed versions.
+To protect Ascend Extension for PyTorch users, the community does not publicly disclose, discuss, or confirm security issues in Ascend Extension for PyTorch products until they have been investigated, fixed, and announced in a security advisory.
+
+### Appendix
+
+#### Security Advisories (SA)
+
+No security vulnerabilities in the currently maintained versions.
+
+#### Security Notices (SN)
+
+Vulnerability notes for third-party open-source components involved: none.
diff --git a/figures/cve.png b/figures/cve.png
new file mode 100644
index 0000000000000000000000000000000000000000..095d0f7ba20c416165a57aa8870c0325bbcb8af0
GIT binary patch
literal 114982
zGj)DnI7@J(?&&8R*CoE zS#9a$R0EJs45>Zu#E|Qlz%BPlvABKV8`hIQy9Wo_@L!oK$E|@Bi6kaGDk?~mMvk17 zmHqI0wV6LsYOZ+nyAPS$&fd8`&Pc2FEEI$Z*m4@I(r<|U?_h_tu@L0Ybv!9U(VWzG zG-9kJplLa;b7(qEN`M4_$YJ6=}(9T9VD1LzoVFtNYJ8c;dKRu<6a#<(Y#ARh=0Xr-( zPI3{CfL&7}OMOh)F^v|XU7eqS`c>bjsGL+X~eR*_{!4h`9gk>^$!+ zXSU_H+uxq;aw+bEX^{u*{)Jv*ko>_Vd$ds#7T~ixPE7BYO0ZtB9!$2XsrH!74ZtV! z6gbrG)Yh%B+EZ;P3I-X%;JGH^YRv)`vO(WL-^xNB(C-YIs{!!~^acR?twOUL9s(|2 zu%cx@Q2Ua9sAu?DYb+2WAjWT@itc+~Ppa0ke5Z@c-cjPU8$2QY6n{aK>=EA4J~D%F z7;-pvcYUqkpGc)VV)B_jyNJYNewjyCsZ5zMS5n_f_2D`uN^Aj7{h&sTk5E0Nd0yco zHrMXRj~FnYF&EZJ@qsdpjV;I=u1~5K{n<2Eexgk-pb8$qx`Ncye#;h}C<93>u+%Hc z$$^_7uvGx{KcS#!RbHM6P_=Ji?ou4x?5|KS@8KuBn(xDdgIK>ZT5hvbk~*#rIPi`R zx$W{iR3aO%I{Nqbi~7%3i_Uz#6ITDiji()cNSg`JoSb9j%$&FSL4SrFFkVh_gL3@d z%M~;F`*w}3yMJqm@xt2LqhU4G85sB4rx$(;x#MNwF8*F46f@^e^lYx{hLb#wo((P$yoAjIi%E z@<4A?_a`Q1L%A|(6yErJx$7@&KGJtjWj|1y<0j>83+&}5@zQQ293OwQSU`UnXnR!< zbjar+Ff(4jW?=(8d411dDAIa!>f5yY51)Cji#r`dJYX0hr#Zv7BX9th3@o1zAm9!M zis8V}QV*Uj>gDIaZvNAk=SP_&%YCQ)_yq2mP5(p2hf8UD^i?=<`0W(WJ5Q-Oua8cN zEYPwnyD?H&K*_-r4+mn=T&u$GuMCjBN@Tvivh&>zPd_JQgG<@Z*%yb`Mr8E_X5$>A zPKT4PeWqb&3>|#w@8`}}x=L~%+*I*q$DvCm_y)TjoCaRx_}ug3y*0HzbB;;b%O08v zy#n>zcxQV$Y{sfRop@Yc;wCf?5 zaA#H(rqEb;-ZWGkIJ;>%ev@%D=oywF&ZD|Suh6V%aa|rQdzVbXpC^K3o1o`E!cLn= zgCYfAmXcfc(|61EFzsfWfSn8;Uo+-y!KyyF>YI3l0iB`crQadgkjh~?Y~3NBXfa4q z4F`vBM+pT%_f33`=p?)*dhh3o(YafYPhOz5y}LXAMLG^tx`2@=t0`T&@dNEA1G+cw z>ySejnAw$o74?^Hl>N9Vstu>LyyoA1g8@&)zrQNUpZCSyP$*j9Uf}T2O~F)js*+d{&B4);vQ zs^(iMZfCapD>(RsQ6FMo+pV!?GCvp1!O>@7UQq0$Xaox6Pzt%w=}jvX=)EoO^DMmQ zhkES=L!V-X{;_?F&x?b2Xa0Uk!s`^IS$;LfxX{TXGuOEBuVP4s$g8;%tFO~KR-Bo$-Q~Z2sL_Zt4VN$lPDp?_>AF1>6v;FB z*1-qsb)Sex;VndGt~SZpL+U$M9CMSFpfnv$=<4$MU9~>WDARaM3)iZ8fpzb5`yIs~ zJN*J`ZkdT5#ThVwjNJ>5-69gknENkq`NVHkGEjef0ta%6cItLUx-zb4AaKE0#?5vtY_b-uOi~mKO26+ymQ~mBu~?{Co;_ zfpfTG?4*d7HO@?m3LDyXvpBwk|0`!)HdaJ1kaCGyPPI%>xOwh4)-LjpTC6^=p4S)cge9Nx4c2t zn3aPkWDXzRTYiTE2O+g8Voa#}gbMq-j7hno8&7jF)B0@HNc-w+oxm$n0C9VyMGZL| zIlOb&RNaKEPInQ&rvSnROk2`Tc|HufQ>UjEo1PB>M#0o=Tgk9d%brwg5>~@U!QcBr zF~JCZ18_>{LP}o+;1AI#@UDAGT1+ z{UesH1X*8a(#SzSzC7vNuZ*QYwzJy5o07 z!mPgc_=oaEE-Q2Is`;pSaiV(f+stGhP(I|yXcS(Z!bUD}y&HOQ>7InR%AZhBqZxp)12BR^BFu?1$?xEC*ISLv=2 z>NmX*B^`Zz6#_@^yRiDWU(xsmI=-DQ$hxWWoog#hC*@0!bR;Xm8m|@ou)WA)wwHQx zflO&}!(^;%VZ%~eC49e)0?D8DA=;72WuRuE*$)|(JfFX23iEuNK0a^wSl>EB$cT^j z!569lZQ?vyet3YA%(ssGX!3aZ3B#c2P`hngAvl@>(N(ekgNI~Rjc@%Nv%V*Bz4z6W zK2Npo#4nGbJP-hmPBN$lxyv3f}llGL5^y=-k}SfLklg zg@*5P87DEHJCl?fLhKfBQ1>dxpp)}jwLa9~mWS-~O_}o4Od8d1&MjIEXjw16X!A?_ zmeFB-mF!8^FAV4B-&qbUb6z9E^I`HOJPTepup8kt!eAPjf2RD;VmvulE3W?Uj8)0( zp5MhTxATga-!0W7RiI{}?0AfP#c<*a!>XW0E_2B0IeYl>^ zf}q7E0((rz8&#(K=-Ifq1i>cn9;`U8BKY9Ou&pWf2d7-D;RTjIlb*kFAN`%U`^dks zUqVNUepmhaTV)lec$3DC-aGiDQ?4-{oEGN2BOc?*y$?Wxl{g;QcAjc*Deh^ZHZ#B} zruaME@h3g|A<-LA5!r$iT5`k3XP|gi{mN%>Q`fIeA3o(7gmeU%&SqhwVrD!3tsLOW zT_{O?q{Em~49Ya(+}?dDRaZz~F#Pvs@U7iuVqiz#FPH4#qQGup?@qkvvSzi64!$+H zS%yl+JFWopuC!_)2qf%diA@?iF+{4N=bbsE+nS9q;ybS-r_D6ar0V1(pBUno>AoQi zi-VbKrWPMtZwxD69bx}xiv_zlCvT0Z~kJf4V|TNhuY&?bt*arrR|FXd!CCah)U&s@ctwt)KQL66J7gMJ(Rk?XMTbxiL<*CO`A==9BdaM}`lkLJ>n0S7*(8qxWR!4nmdZ~n#84kL1!Y=f1m^&gpozW3{JwH&xJl_E+h>^Jhsh=1pPAjHiO=}|=B z#3Ug_Q^*5{DzEJzj7gosrk(5PE@|+Hgf=2piXxgK4tHrNlqYqESEuOJ%|$5o+oB|F z@lM7sevNBgxSp#j(u3J|n}ejiZ2LLRie%ck{sUj-Y9!w+*!USzSy&9&1x38bj46gp zgt+Nke2!fQ%jlLVB3G`D+H)FhlJFkrXs#r*ZepG(2~Jl3;Q3J~P8@m7w9dUJnykO3 zg)lyJ3c;+`v7NK!Of4KSArziY^8u#bNOdX4BP2r|vmqoo@-cIy%MuW-&pGbN4Nb|A z7sqE8^Fo|;=qEi@19PMqRVP?hFELQbV?g>JO23QP@lPB#crKk8r3_JzJqOA>_)xJo zCWB_5h>2fxQze`G*0i#>NggdB30fNJ^W7cYowtA2Gvdl%@2BxU5(z2~Tn>%lo`2Dt 
z_%H@bdn~Rt*WtmY)?5)p>jg{v*<&oC%f`YY9u{>7q|OG@1=Cxd6Ky9>2StIR-UAL& z>y_Fe5Y1H|(4vXk^D;{>8m9X%9oIlG4wl`IgPbd8yZR=;jT54VL*n;Kuqg|9INK!r zg9IAVY8x!#$IJ%oWp}5_rln=^Y zuyMF+j(QM`daxfnVB!JGbgCwfqg9}n{^`WX^2cVb)&tJ$iasy$2Rn1$@IGID^U=ZQ zULuz1EZ&N3IJ~6;EwAtbkA;~sX}Dky>Ik~)4J&)L38vdqhS^e>5WCw{@p6_5U-YG0 zRd>tYdIyF|jS&}cy-fWqY`u{m^e~LN#WMq4QC&*#-E%savjIb|!tM9a$%_VB9?m`_ zDPZVd9XE57=stKl4=tNPS?A8&pIz_7L{4!sFVx_F72A}xu+CC5|9J9_6bTnX@XFV> zcOMA@Kf8v}@;xMYCU(i-^7F0~Nk0>4AQ`YZ-CZ@BZwOR_z3pWiMnv);yiDz7_SnGw z3Ox(x67Y@U6ct^aVxWeIPk#D7FrB!KL?l^)GBLh zNFB==n^C8L-3yxnqFwV+r)s?A>U&HeL{B~{-Em>w`(YoG1L7U7eWYM!JW{2g25;s& z5D}CB+dxt=S^Q2M%&hc+5hE)|ky_IJ&kqP(8s@xkd8#Ph*=erlSjaa%wOz+Q>zJS$T+c71 zxBFc*e+K3-WvG3j#^#Ux`cx%VKJ*y&r;E;oWqRfTw{gyzfoEY3t9Vt>UT`nc}>T=Be)?|^M zOQMt1_RG1jeiX*YmqH0@drZjTauAmLncmv%J=L$3JUnNlLfxm*0@3;Q{tNDoI@3Ms zGLUXN)5X#*79gA4p;|w?er!350&&&Am|ZyTuVjAez+u#ZObO!A|4M<1ePTZXB(sE@ zIDTiQd-)v~aic!7zq<`MRsj@X#BBtNQ6&8WPo$^nV+ja61+L1)fw8gK9d`%?E-E!n zJDtQY;FvnTz5@-mPoF-GISW`!z zW|$oxvXg69@KPMb`f|0Pn6nwCUApa>%e0^O&%AHAk0Uh~4iYZsl1LqZzVo74T~DJB zuAfcakCuRdmvfmm;5BjqNJl6fr~4?!#Ka^bA|6Ts*u%h^1l;#Zsh0Z)bxgPWe>fp4 zC*I4;%hW3UJv}{xgQyVTnFc@(ePGH}ANs5F2AEVSDd8Z1Qc*PF1$7Stqod-fH;Mq( zd?jKkrIQhnk3Vd0h|vdB_%i4 z*8#wW8x0H-z(x*|bt?dyirTT`>iD>}xfv-K>=WoTuI3-+sH?00U0VZ4tyBiBK>=29 zUSIxaA3?~3LH@!@n8Wu#-5Mev|3>c8d7x?sAgWE7uCu z)-NH(HL$N1ar)5b!30pV+!+itlwg;eBp6Bw(j31wX;0iR$qaq_w z5krApB}nN?5mKg8liSrLstl}8dB*jY$~G3Kl`f!w4H`0V!N83S_GwT+@agCefTly5 z2RYk**RKuN*e`&|I>4p``fLDlX>Tv$&|G&4`TLBDZ(x)n_`M#_KvYxIk z(32UpxJuW|KGD4^r@;tHRrqUC`TO^8pvffT1I%uK;{@i?_YMyC>?poIKI2>gm-~9f z>OKHN9%<@a%6Z%i+%;hX)4{ zq!fYP3Sb?;600)(Qwbyafye;R?LhbV+_wW{!htFo2u&RlFfj=OM;C+xQ*&~rP#}Z& z4B*=b{;2dB>$)0AK>xG0-UjkI(0T<aV+!=No9sMVzWY4Vjyt2NV%cuvdUM*oJe4Za6fiK30mS|(cL?;(EOK&K zb&3^IBQePUYe7d=wqMmf7DUJZqRJiMXMml#mn!`ypsTO{6s9Y3RSPB;+myC{zN(@xOPcQ`r4sZ>rJZ)Yc_>>%Q*Z?F4oSTIBcy4HW zcm}{o1LCKu>KJI{KSx5+0nLLdw{s7`Nb&m*3&0WFXTbP$_LvE`&;xW9aApC!2*h~r z?cJ6~wGzem0Z|$_T_wfE(z3DuYp`1GqRf{7La_m+2aiD$T!5igpsLF^TVeJ&eOkCGjC_n|9x$$%O4pg@HBmW`d90U z4(4?K{nG{^=I;Nz4}K4_|KC0V9OD1}iLoE-!GAwoI|0b^{`;pY(%z@r@c(?^|Fd88 z&FCF4+fhK|rNlJ}|KIx3|Lcn$#+Hqq%eX+X5Q0XBTl-%|rzD9Tzk`cy^K-#ERB>TG zR4c-zQ`28uqQ-ucgcyy3LqSQ|ly$N6@k%ln{-CSbla=Y{>=rj|c=*?T2LBd+EP#gb zv!OpF2|SU|20~2Lo>KpwUSpz%J)wP{Uc0^{oO*f-yiP*s&By%T*QA8cp9tSi@2_{W zl|99;JiSJW6#oxPAH4qmKl=Zp1A-#r0slKlTTSiw?@zQG6#_!S?Sq5+@|3XO1m7*| z0U5=vQ6J1>fWoj?{kmKOWTkMpkb+w>cv~+Gs#%_F$nrQv5AgOjY-Mf`pt=FUH4ESd z^&DV40`LB*=8%SWVoMY_0~a*#!^K8N6LOf#fd`eS2N+Q%Gzn|q1`1@c(8(Q4av@;9 z-cik=S|{|*Wo2Sgl9OBEV*x==KYvDiaZVz;2_T%}4^C$_k&%`Lzm< zj!OWs6|gH+Foqj>izx>1^H|fMZy28RpUozHUq=X`i+!(msZsij@|;?uToWW{K&&$# zrfkG9!?v#P?(P8M9mFIEvapQWHU{qE7K{-*_rMfmNX5SzeOeI?m#Sb&m|*5^!3AT7 zj!+{PJXUy-G4W&_OqL+>V@pIo@zV+jBk5V{v2To?_RedpcEl27Mdb0YBL~RG($dy4 z#R+?W|HqK978?X&O!9cB2kP2~_~-sbKY*(WbHzw4>K+C8zAg5)8>-F7lTH!-P`Nx3 z)$dArPYoXeIYYnUd^$5n+5&(14voSvB(_b$rD-I#ySDDd-I4Eoe)5<8v#NsoA{r^4 zgOD_|LG=+C!i0~8xgZ9vQc}#(NQt1aw(P>$J(m*54FcH4=Z2$j1&_}oB;a)afb<(I zfa_ZoM7qb&MUV=5x)c0-b$9AJp3eW!>eU~&?iH4B{-JXXe z4Hfx5Zr@W7qRWI&ItdtlHOWI8KeLEdhUw1J-yxCRq#))Az-@~%YE?{qqSpYhmq9>t z5o*DtJg?aX)CSON1vyY49-veOO=3Gv`mJW6OgxYSMM#)}G^T!+Z_oH|QgqXp9g#sx zxOXIW{1pY}>{%~@e=AL#s+&usDcH%eGe8-n!|gP{C%P9F!g9*$SiU_8^1c@xl@8+Y z$vYSS=Smna6KRSJT4jg^O21*3J!k(eUjZl>y)n#Jq48~sXlHG=@&g*aa_Sz zzKHMiSe4ih{r$q3Lb8){&R<9R-iE7sriHKqt0FRpg{f2aiZul=gU*O!wnyn zy}|t&KX+^4)2+Gm=j1T!M^s?T)vnZHz>{$|Ax!`%-39j>_T1+CYpUSKrB{!f@DW5h z&GFWgRx1+=*G{iglKtKf|2|YEW)aM_zN^Jiztdf~&aQL=IKMg1`JJjRSN8SKfRL$B z49=!R!S8XZR?$FWT6 zvfW{?aKx@kew*)C>wEB(zU}WKV^ItZ=F!Z+y>cAw z*q-{>;Q|RpF6HY;9{M9_w-C5;G*)7x=B;uSB_6_<^3CJS9 
zkjpWL&%;dV=c3ng4>%lb>x3>`+%V|=+p3WgyJ9;09vM%YTM;Vb;CJTpFhUPITqG|J zLx|}ZeN?LsPZ47(f>!rTXI&B-kG=Fx^YJvls|aHk2>$#%+}Wum+^_fP3Do4~6rW=Wu2=Ey)^ab>zUnYr#6#d#X_s1KP)u>*` z=NKurph0vM`$jxqhhr-k`ogc(<9ducf?zl(D5{4`3h(r?#z$X3D8j$|M2$BjoU^#u zp3iLv4i>qds>>sH)htaT%Zw~vE*7W5+gcX&T&HNauXQijo|O&3-n<^^5IqubZd0Qv zS@xGgF;`JwcjC4lxol3vv*NC#%Je$mDT_%goC<(l5BqI-x83j5q1a&w&1W5ab%b^M z8oQeI zA(6njhc<#fc6)K{Y5O`T{`0oRSkAnkunDZeCfzeN(1=flBAEch2Dl?LOzGE3od#CF z<2)jpf9yT9yYAXVrAiW@r~YBd#Pug|1P|Y0+V`L>(VFbo%H%qaUG~?U%IaFv`KwvS zt>KXy*C^IkZCO-y;}?513TVv3w`6AFxIgWU1B3*j`vWu{OHwT-Q9K8_esA0sJ|t)j z?l?k)aA^NBOXFYOyMEjfr$|NiE=9fAUsh z@!|IIsRby=`((!8j7B*TR-AyNY~0NtGe8vmKM?nH#%WiDkc*eowX06X_)D`sZFo zW%q~iq6-f_uc}?$0?}a@`}0p?bZ_Z4{_YBw@MtM!x8Lo=xiv4Yw^uY(YZO!^cGa>y z%sse>8q=4NTsx#BKjVM)D}DCRA6;!J=s2l{4wx`bFIYB4j8p_Uz5Z2ZSNXh@ZDOW| zhroYQqxlqS#E?&SO3g)2c@C3$VQRgFv3A&`&u3SzxVB!rFs7SB>|wjaCuPypWq=oj z9c#vZ-devy;)D(H6_MezECWtfE=(X``@B%- zieWJGH6++fBSA%gulhxvp9s$-&xfQgX?COuVIJO?t%leF)!pq9#}8BxyjIooB3@15uY1 zj=KmyBNRaRNFWLRF`dNZt<KoHEsD3ugqsb3!hz$z-muhGcr zlL|tUGp(>=!||Bd=H2oxj@kU%@DJzSjO`rF-IPHQuexa?RsGB~Rd zKwIy1`GiK~x2a2Xc7GC9k^V$4*DQzqM?O%A2eL{x_`YSE!*&C!h!g^V>$g8SwX1jDTA$%682AnEioa_dO? z2Wa4KSSa#757`B)gRcDygwcW;EiUO;$b8_t(WwCH<&DQuz$bzy!%Wm|`VgZI1tC^i zwADAyy*2Lf_QJWGDsmL?zv2Vh+e2GlN5-j|wRQ`WB}9;f5KL2kl9BGTrPevFkC(t1 z9S7c94tf{(rnRhpr6It-Eba+c;W8UmgnFNUrNk)XfdnL!KksTWm%N@IDXKyzhO}=n zaV)cpIZZBleQ(YdL>Hay0%oZXO#Ce4CAc^=h!!>nj5HgfTzz?t5qQju#?e$;Jl?~tA4uZNCM&59PR6@Ts zP}Q_>5-RCH&5w4R+SxsAtCe)?%>#dHS9!ek>+!fTw2?&pj8uC#D*Ck>4FRECG<8@z36cD)6G_d;6^O0xdeNvp6~HyRQ>#yQlXVDC?C^)@z99*@oak` z!y`j(_UKGgwxJUQGMQ-$xNh0o{%bZ-L;XCR^t>E(p{{myLLiyNjDZawx=|@5jYaY6 zZ(Atf1yuXA#g`3?=fzP*MXONinu$M&8|tt@R%|OTXN&a{K9KHRVT?aj>u0XBTntj& z1FSh;Y^t$u#~hFnw)eA+*)!915Ix|*NC zlR565dpYYA;#oA$4`oCm%$6hiSc~06Cz-88DL%W)!SCOW_;Klk;cA~O=lQ|Pvq>W1 zZ{-YWtC`fhtS(_p4zwQ;%H+rQMnq%ipC<-#n4Vg8qa~?ow@#-M-uZq#fSC1}U0%K; z!;qaD+d=+u#l|5K*|=1>D3Kt1zkIEM?lAaE)CW#Xw@iwLbx_>#Ate&~uwtu8drVdZ z`YvOUz`($V#EWNru>IM2tDA=;n@*xuzfJtD?dQ7+IQovdY);RU8_w7jhe`Avj(1I0 zS#{oDSI4m~-`;NQiLKuFCUQ_>V+OT2KmFk$J@9Q}RD9~$>hXg>=^I zFJc!ery{mh7;^j4KMqv{xIHXqmBoDdg5G1A2kJ~uzuLQzLZR1$Xoo-e-kdbj8V#WB zwR)^y754a-IDN3g=_x%i5b);xZOduZKd&mTX1CZDrT^s^tc_y68P8K=kfHZiojzYA zbyA!@opZNc$#_Up(Us1|#;BzeHRp2@*HYnzc_3_Df0{!`+-4o|^%^8Ounwf;2 zpB~3YQuB@(d<9mUoS#oDNC{>ex~Ff_Z;RL2XP3c!m_!!c+7|NOO^m@mjtT`kDn*lb zEdQKyS$$GyO+^6KdsP;eOe{txS9L1JqTtr1On}^zwCvssyW28GK5- zVaj`5b#fL(+L&Z%b-5laNz_c`wjT4#5&o`3?|@HV}G?w?Vw#? z+6c$5$M{O#2uE43LjU4`Mf&^GpvNlij3Kn z_IT&QyC2&@*e1uB{h8@zS9P>~?JiXdyPdTnxx(1YTS4Fj@FuzXa0Or%#9QN6X+pP7 z*oJ-YMqIP;tAzFccys@l1l*FdlWgD^z4UY^HW{wQHd+>0;Mc`ZYT7nOu#O#pA*B<8 zsn>?f1BWH7?wLAzUCf1+NY@!tpfe$zXgv@pYoLEE+ZN@SeYE)3s&KvQ(fCKZ-p0$i zGmj|YQl*lKkB{0e+&VhB^ua1JaAu9|X?GYN{$I043@?PvKW(b!oko*CbVXfWesP44 zEEZ7co612H>!axkg)0&JcWtb!5DQpJH8>Uvids zoDWcI4?G9SLSMpB%m&8>qJMd(TGFE`NFM!iU)NqE&*Khqm@`{3V#8R+BC@A>P-eqpi2(mbn!K<={0#~wB%>H9|!x=weo zG`iUAE^}X6)Xf7&2R?A`jB9Mp@5hj>^}M{KoE{p24Mc$8RxhfgSyR;%s~cS;P@C?T zl4UDc!*33iMlNO+Dv<=JR2Me_b=d-YHT9_PaE-v2xYMYlF^2n#Rw^C$C%jZrR7vq| z-&|()p!%VRC|2M-qun&n)(OqDgalwdrax-hkfZZ}ZH@ebKVZ`7$S+zRMSzR^rZogs zzwF2+k;I9`?n=GR1J8QQeE<2-{UsBmV_ZV_uMb@wr4mD1s6c{h#_Qg9zy_2Jwu`~Q z>P&OfUVBr0F=f^Mu43DUh8ZDWU;Hed*1{y?{ ze5`*Y%xhz|>Zf~$N=LlMj4QbOgDY+ekdLrRUn=2;n!`NEvp zO1;MGEp!k)%xiSV416dbrQC~PF6!TT31Z)W=i^G>|6yzWxpn{gVZ-pP1`s4EdKg2d zK>n+`(K=~(i@7}2{;cD^9I5s>Kfd?9p`d_mNh_s1Yy@5AbgivT*1=f_hwslEpmaOd2^*qulfvzf z;~;`1ZRXwWOncsU6qVcKicrE??S*In>{9pTB6cA-7 zVNg)z!$A0}Hjua+!$iDi~*7+sl*q@g}LS9bGR(xyrmIdz=PIn`D)zc$g!#vaeL{66f zg;Qe4vy!o>@oi)sIr58#%ihFyoD^9-cK?K@bx|g=Sz_~>^z_gLjleXrq=##l6bI)? 
zY6}gyj41hA8SmPxGP~wuA1umsFZPS*R_xSx;Nh(C$yK9}eBJP(`>5QA&Iv(Oro zyj&rKR`%pqsMTXzqjMnKLWAcC0a<~ z)o#=*-vO2_i=pH!z(b^zB>>>oK517k8}kuA|4=JsSaN(osa@uBd_74>I+zrTKt5GS z*AEX;p`Xe+S@r8zYMDB)P)D9|<Igd-z?BGMC%AJr zhWc+U9-n5ltxaVAb_$>deL6qKK)-5p-&|rMSNGn^mvw(ef;V{wCL87fzXcJ4;XoK# zvB;yvDYM!2MH8JyQ&eA=M9^H$eHTKJf0!YOEv#v}B#)ZAriR~M{%2|iC@DbwCxFlb z>;_+fC%AjYk~j>VjXs^XyM%@|E<0W7Hq{gMv3pRDg-TWKw#t8*&W@+?BHh3E6$R6| zs^b5S;zeTrR!Angt?%1wS!K%PHSzjw6EH?Wso3e5MZ)ES?Fl9^Ti|0eI%BU}Y}(m#(fSPN zNLp|R0c+er{wTi}Eh)}oHA7S$)na&vF+BNpuVd;(P~Md4OSiKD0KdD~9^&NS*cna( zR<=mXG#<+Eu6I>UA$GFgFw16Mp^QVMPvXbgzK^>|2tTsjF11?sZ*x06*49`;`0p44 zo#?RV`6)U%v=Yd2`AWfiG09kSC{&2_Xu}WRGYz1NfCY<9?mduM*%*nm;COevz%-m2BYL@cCA+#b7&^>ynZNIl3Us8nnHQAy{1*w>DS{v_j+*>9W z2SG?L>n=ED+sQzk`6E8bdUZ3&l39f=>{6f~%nO3{!GF(GtG!%?!LnYjIE%@Q%1p-# z96C02uAq7#DTOAQ0)0^C8(*a;YkN_>=nXJVakrEz-pi7-<;Tg>pgm{OcfVpcxzY|# zbe}BMlmpxhD)qXa(g&*56Rq8ha|+d7wM}d|mnh+24tE&azKGwQ$+M74(#r$Z|15|Mc##ojV_LtNxc}<7_V6d&#*+<_qST_z zZaPq;3(kSq7*KEwl&))4T&vUVvQC87r`7%%Q*Lc;#k6a9g0u13EmE#W)#l}z)H&{{ z!ttc3^FdP`e6Z1vrL#)ykwD}7QJW5@6E)MDs$dy)Y#7|fAr!RGi@F{6$5(C1@wqo= z;PTs#);@h-#FmF7(iUy1-!dD!yyzHL%AB^n;%E=O_ zhreeuq-WF>BxKf@uGqQ2eyqLnZXV7Xwiq4gztFlFVbA> z+40hi7agAvZ5(`94lDsOSk%l8L?lOfs`LoRj!ownpbOT@54h)c)hbZN3@2p)K_Rz2 zO=gh4huRVv4WinAhb#RuRZCT?#l1Q2h4Ge{xFNf`WxP?wTM+iLkc+3lQ2OLX#~}z_ zv-^0g(&LAJZAsLSZvmdlu z@j7qc?HUcc2|Q+0QpJap`&`0eO}4Y_`LdvqrN=S`^||gJP17ec_}PfChrUtsOpJ`k z@Q954z{-PrE@ZsS1_&jGEia7)iJx)66LD!*h#(ZPk)+!w3?_XqTQM4C6mrO>G~CdR zF^r=U*UmDkPY*Azb;YG(Ai&j}CMole*pNZ?t5Bu7RILU>!T_*3sox{r&AMmdq|ZK8 za~fkg3<6a7{LX490OGv?CO1CXvnH!LxsE$6=$jq8k9K92MUL^E$YhoadX{I7#WC$t7&`~t z_}0h^IQ#fEQ(;IDQf6#X-j62n`xEE2iD-ldL&|_TN3#Y)Ro#Yoc{1Kqr*kw=IbKn* z>N~gLjeSP*n8~9vJJZe~GOb>0(|lwD)BRHO+GM_yxUiI?Oa7&%v;;9XDSJdaoy?(< z1`j@zg)dpp+Dtdu`xng7ium~QA41l$=!C!IE3JggxvzB1+|uz4aCmma4#|sh7es1k zEiZn0jQfwzT&N~{-}2$M5I}!|gw8s}M_5)z-XH2pq%|L`6*cuO*jmYmK#L986r%`S z(sb4j=IrI(gFty<&Ee2DzH)=VS_z(UZDZ%%o`smicjgWG0ZzIN{ne1BH3%yYi zZS~F%F=yU8prQ9U1940M1JWMw+%jSPG(F%Qb@TSua%Q=*5-}bcunyc=<$&i%u)Xlt z^Cs9u4d9U{MbfKzZ)|=>unx?=bJs=6>rumzT4tEd z@gwZiOXTgy6QfR$E1GP%+~IzErkNPg`fdAND3(sZ`qR39s5A9hHG##Y*IAp6{&CJrbxWPN}7+XeSgp<4bkV=ivt>;v8 zXz=@PnsKSZH*^aR)*G0#iRgYF)!o`x6PQ|^q(Ui9#-NRLA7zK!=xEo<59^b{tW5L2 zGF0ywUSj&@q~!}_N&M@byZZACXZu`~oGjBXg@)jbo%fHmAEMA(lj;|4eyvo_J}paz zpwKXI@3pK9C%^1?Da?-AD9V*ZMoaIuycL+Mp=1WN>&oq` zt?@3QSs!bAzH16PC|cFtZ?qg<;%zbPQ@Uw&=RbC@Fyv)pxEHks0C~r z7?*+T2(@X1Mzsb%zo4Cl%WZC*A!h5#>Z(wvh;;J`S)gBM$|76b0v@ ziL608hf+|otp!D?FrS3$ZJ29RI_Re+%hV`w6dF3PvHD$(MEv9G?v8m|As8~CWQ|52 ziPEF-^0 zv%Ve!?{Ifab`~ue<V8}1=-wgW}hti9h}vLS-HD-K}&?N zgAF#Bw1*I{M4um}NOpJRvuWuJ^fVOZ;C`288i+9zq9A(+;;B@a4ef9n!_t?1dM~;; z$<({bqkizlywKR%{2QTGMHci=& zkYp1aaIxAQTWQ{*b29sO`OPhA9xVwy#2( z)8f903yN6>1?3|mA*IzAm!SSr3zI&o{2dh>q>|^XCO@qf`TcfvR_G@|*HM!(&utC& zdORvho>9rl27kM~jBXHJ--a3o`b%Tu5&CuY@v=FGKQLVkd^eUbOqYT-3iJJ)`P6B! z$XOrSxYUg}$FZS3#zDtx3qESr+-MymYaJ}}RY`zY3aa44WyAN!@+YOVSk9~)lHLeu z4)!l0C*&706zLUUtDbU${zStTU`6)#wlsjnMZ`nsj7v&NAvz{7GriqT!>bvW!^PoS z&bN}5j&BPN4~>b0AW+3vE8jxf#`us_mX8~9uI_4{VE#!_VF?WlbzytIBDB>^!7M^c z-+WDBktamJOc|E4W*D=aml5}Gu>=baK-3b;enzZJ+nM$y#TJYjP1qqdFok0H> z_alEf6OzAAMo+1r;O5@e3=l!$+9uSa%$MVo`prFIU4r!B`t0M>!C622Z^|bQ<*zO! 
z_IT%IC`!}!qpoQC*`aQT&2zGz;ol#wsGvBv0JHYwim4{Y+>-J;w9r8X0bxjX$uDCu zugf$noj3v6#Oa~{f&&cUGb)?SZo3~=W9v69PHz-5{I{geDPK1qZ?Figvd4ILzcEGV z&N~*}K;Gp}l(InN@7sCLFP<|LL)BuW3nogpseHe}JHeyepwn+3QpGWFqm}i`RwYM) zf>hL!6v_bC3`yAM{5aH3SxVf=&P)6}@f7{Imv~oq)kf>-;{IHHPnP62GSH@b_v|*G z5CKXN5)g=t{z0?%+B-(r@5ygyCdVbktVanAvgc!d?ep7Ljm#+$nwE1D9ee+>l!r(6 zbak{DDnJXIT^#C8OI1cR5hq~-AvaeZo)%nWM#9JnX-yCFqY=h%D%*k$FFw8P8PF0Q z6}8SfA!d+Fi|eh88yOatnu@iScqjC1)}2Y9-k&a#i)h=De+Aya&*Hs5&B)h~;Adv##fi$XL} zh!E!^W*IJFFG8Y&YRAwXIm@x$xPT6Vc#)*9;$pM+Q1Za>_hDL6Y_v{msINx>Qs;7$ zAOCNiu8y+9b0Ow<`b4=f4H*%?k7rbAzcLhLLR4sh;f8C>RjALINeaD4E{?uX0)#C? z0LsMR?X-dY@EzxrL`mP$b7qW_RlUOn){Owt`*lc9rPN=QI z(%+FecaMBCI4}*zZac(Z-ZrO?5NQx$h9t=@&hr^AL7V0EYxlGV3ARj&`~Iwx!&Q;R zhky8I?}I=Tue}pQ@5-?8OHr@inxuN(f5)zc7yCpIP@hHfRtz<4Yi>rt3q)Z$2MT;u zy8tsEPM^6I@D=gsfyR?n`Sd2h!`^nIhQ_938*>9fF~&$)5d8_=e4|UCT$na zaC1C=V5*W|ZI5~OkJ4tku@~_^ypAq4oFaYP5!-ZERmP>WxH?CzvGmZePTCHYEGJtq$rWcza3P@CWRz| z;%e;-I)1M*MbzS4_@1lX%)s;<$odfs4KndNvLl!Sk}iah$?M&SPVq7Eafq*wYFzQe zKi!sU7C2Q9wrVy84DA0ykD3GcEiKpf1_Di9J%o*bo6T6R<9TSK0giP9N4};+BrAh= zUbM-e<{0U?z2oU@@%p%LOt}A3OPhs4QaGcp$3C_wUXV;+r!`()W!8Wj(?7Z3b@?59JeHHikIb#>pAzSyy{ zr>v&tzr@6K15zViQLh~lp`h!-@N|Qrh$;-v_Jtl-P1w*@aDgFKP2aYra zdVF}E6u)4;Y@gpQ3A-do+{+my?v5wx>xDQ_QZjM@=NaY+?xzZq`@yK@I$u$4r83Vv zrMS2^WL}Z*UN3t|Cj?^Zw?Sg$0@Y1`fSgMQo!7I<=lf`j(~f^7)9v5!d@Q|u)UW(E zD+fPSJ5o91W^;I3Q=p4rjimIi?Hob#A65eb`SWh3nT54q3_=)gHFX0&iRD0+Oj$AX zJ^lU$B_ltrJYr0GRef|>s+wM)Ht05`5 zyZYE7UyS@zwOPr^0gvnYUW7ed{liUH9VO+;)L_b2w#0kxX7ruE zCM|rIYJK9EFaF2U`BI0ub?*1or%RZ_*#C>Pr}z)0zX6e zcy~K{KRzV$wO!uLlig;Q)TcSS7&ZRH?MeGN!gqUFmdJ#wQc{k5`@8_b3n#PZbz4~! zVH6y0z8ds(DQ6CF1Ga~9S&=iDFMOzx7{Ma8&DXvd%~oeEZE#7s&&J5#X2 zyGFY%up1%E(WgO0!ZfWpY)89mP+dwoM?f2j%iUKjW@s-;;kZZgvH|B_2{QhqfLm%#jr zFEw~mduFB^ zggkfb{#{E+ktu~2N&{WD*sG4)Qc{g?z}_Xn?jjs?qt(=#tbNL<5Z zIu4W6$=m{!>f49>{3s+)NWVN zKa@=W>XjlFyi;`!#y&EoPxhsDi>3tw=E$+ga>GTi)|3>>uypai1kZ*iM650Y?8#Ih z6i5EbFZAzd@?dtFH++ihLVm6Ff_;l4dMi|Q=QI^q@97&CB?R7<>l>)cz-rH-iicxp zGXd^Dt~AesE%cDvg(F$|ec(fXEN|^CZ+Y$xzPTwXtUpZousJR2Vj5^I}t!wC)|s-Wv7A$ zgtL`jG;hjNt;F;3w}idxZy!d(jYiT_1BJ$s%coP3~>wK4RYEqMR14-@ib3m z<9~h&BivLgM9RXOC}+v;8!fF$0Y(@iT3Ae^@39&*v_qw|Y}#Wy^}*eC?nX5Qk4dI_ z$-!hw?Svx|gbwDATX7=V=EHtFc@z`s)f zNz)bc{sUHAozpl|fF#crNb=<5f8%Yc6 zE)pOUQ!T&A{k?s%+c8Rw!FW+N+vvG1$BMXV{BkiYCP)9dxc*S|LoMz~jemt4Oa5Hw zZZL$zfD|jUwY8w$7Hp|06shbOxsVRmNm7MzCPZHmtJhNe+xMB0nSn22{UrHx1CzZj zvS<;5OoX0K9#`~m*Ehi73x>n#`^&wIZIfEUn)X~)CZA`F<*o;2g9TOz7bnf|^NbTK zZekxVtwu5ax1h;}gq>K$pUZz7j`YY=n!DRwdj6a^`OY0gjL;a9^u8QCtjmLpZCt8h z5Trmu%5iz$fiAo6hmHI5WpW!0S#yrTI+`txV+2K&tIyw#y0WftmB`RGyAA2oanbW! 
zyuuE)TbUYl$%v)5j_zm92r4pWPI&+HE@jE$xwTfIpmet`8N}`qQ(ZDmeS1N(vH(-m zbmodfKork?Vd_q_+idqWMiGYa8B!jl`jCL|@~+()M}y1;#uZI*uioT$*PHiqQlK?h zrROW#eRH>Oe$IX#{mok4SK;9pIT9wJ+9uC*;o4;6Fr=UYs?gS@Z_DeEE)vmco?~P> zkkjIAi{bSQWpSgxyZU!SMW=+AL?}|ab92YbSx-YJoL1Ga@p^&rA5bY3= zf-kLUmbY6)!C>)&V>5YNEFmA&HDSNGytnb^34!g+j*~kRmp?R;;X$QaiJ+juOyRq$ z`)j_SAY5^GxkV>dl3G3nEL#jcRb!~>Krqe6H-9R4Mw~RI;U&rx@>KP=)!2!T7VDda zcFR(A?(f)0gU)r2U`q?OIBx>+MV;glYTLJ2&s(|;Z$b6iJj%YwY43PVpQ4Me?rAtvo|~Tis6cmI zA10tiA68$##cZI(%88?+swmfF(=)YtkasRImx{-&txT>*)mVx{f`>@<=U7)7ZMQ(Z z4L0JBatu^)G6c4e^pwHQ{&!QGHHL+z8#AjK4IqobpB;fv3PXvY!JRVpAQMA+F^d-%LNiNjVdSOqg2ny&T`6l8I4Z<7V*h&Q@!FArtnpj+@&vge^(E zsf1OIvBEd5#RaXYomiwRyca5sMo%!E|36YF5h?OhEuSH>=$hh_=oi5$44-HPQ>A4 zXVS*?a;@MYm@%=6mZAlQj%~fL6$2z5_n!D2J36MynSWy6vIbA& zz+vhK^(s7-ZeeIATk#V*hyu(XJ|Hc$xw6~_WAXPCp1RCO6!ZA z6jdk{Vkp`z8guWA=Ahhd>Qv40m+L-Eaqh#c);O7vwtn-Z2`;#7$UC@PnUX0|zM}7d z5s2ja=$w(QW!}&h5_|Msr%3nny)C!Z;r%u=`4>srVXb$q@kX$nc3`u=_7qc{>1Trc z4qv4r!sa_pQH*!(=cUz~yx&`G&ZmdY~$qOHk{5ljaJ#K@5^Ajje+h)~qxYY<}Ma`?m zFx^Tw1E*83@YS8u$W)21eRCtsxOydi=Fp(oNOER|*eWdgg8t*8u<3lHzxOen(g%S6 zQ-qLTbq{62c#R_`DK~!ijfy>3rcUqqKkPEoYZtEWsQ(Q@)E2&zqW@M-&N!X>8z z&t=DjPq3dzd+Ef3?4bYz+U`B|xc2qrteG$O#O-yE$d#p%cKjdgB%eY-OssEhX4*Q8 zy#xD&-8Ut04U4AcMHD4r?_%u6vL*A_QLTwFq9>rzhaS(8A?2;Saf?P*LXUQ5domzcwX(y9J zu=8m!;iD)u9(60!b z-OuH7sPREFC3+S8WZPK>$$mA3T$ZAdzPX&>B*A*8Iqo`(84y~K`lE&eyS0jUi3P^ppyH=)$IL0NkbUi>P+%H3bO3eDvwjhwWO>bO}bkly#Foz479863zF1 zycOjtYTH|^7@JxfJn7ol4y%l2sInvM zz}G6#p#Afg>)oHSkE=>Un;V>uH|Iwkw$j48ygAf~#=b(xgDnE4oMe5@Il2I_1l0O@ zQ@>_))+g{u|HkU}#6ZR^_DJ&$DEBK!a zNbvz5yAX{^5eMshYmIL@hrYyAd|X|FDJVQ&2Yc+h3=TozdpbEW+(J6a80xsts}|@2 z>{Ev*Av{2)v6kVudwxP%Sg^{mvh?uG&brOgCF8U^L1%5{;RyO^ebCj{i%}Qu|Hn1( zy17f@BN42~bu6fft#K5bJnTTWmZtmAY5a6*^ziR&{POb7<*{-l(~yX~G~S~hsaW`g zW=d<1$|JxDk8J`AbGQd_!oG$QxJ0Jo{b>LV6 zwp>%|1C2FaZTcQ0c{n{p;8Vu6&h8OF%;<_)XlxjlFOR4Hc}TiKQH}kt2~H}J#;E{4 z5uhM%^}c+#J$>DXV%5^r1R4SQ_A!IYW)cvP?~LUb>*)NfacQFUfWj=n5O>1y>bx@b zZ~%J-;ENnqTJHhwltUJJXG}pug9nggdAvPUR#b%OFgavlf|mOMvQiyjE(1V2(*WDW zoh6qe7HDTMkerOm)TI*d0usr+wvY$y} zdL0`Z-35R~xq5oiQ58(|*YQ2T7g5UsbeqSg%dw8cAPUg|>EW$TQs!=W2!P5MWU#ro~}_4UlY+vmaw>qP+~>yGj1GK5$`xN77o zts&?4IP4f9tD|shEgU2r&^5z8kk&}eR%I&!kbhZ3AK%Qx2I@znh7Ob$Gx zEQ#=-Dv;=C7sQs#eSi()hWy(b#Vi2;@h#Z9kukFKe;x3*u5uGS{wr}Gp8_c8@Zb4n zCABOc3-UH!EC48TT(SCd8N^v7ysm~XMdJW4_8c7@N68?1RV+0`RI>5FQ}N+wH6XUK zHJd|5Mm9=@izq&z{0MZW2^*iUhAcDiq6f6?e*AI}Nw1I)yl`}pu<>hdQebHt_&Ghj z<-(RtEh}B1T8yWn2GA^H5AzIXVl)8u0bx{;@bu*T{N7!-Qi+|ty<2q1hIe~y?Sx~a zU~yH046|O9)=mqGoGm34z>}hZ^jn7kk;16>b#o4gOuRWqnt`;nD5T$@^N*Bm?b5NL zq9Tjk#J*eX5S@W$#Y|USrCGzg=BF~LcH4Dw&+`BH|3H_d7M*IwrBJ11{%Z+CY{LEMc>)YE!POM^V) zvTvCEh+G|sfLp0T96R9t#`Id#`hoR8*{fR{Tt zo&vy*0S+08{4ZqQTZ0ts?d>cdzrDAr)yXV6-~1)YAZrF_VL2efsqB)naR57N5YgFh zcVNmE!jmzg8wwbgKFOK^XB~$PU@KIVjRV+f0#Ou^`NJh1oFRWZFI&MBfEcSSo~u|X zi$Q&8Yak&BaiQHG3PuqcV{Entm`YFu68#2y)%acjt8ei=nq1UKHVy{$6pS)~=p`8l z4=*k0)5ZX(D|%ToAX>^HF8zzFin<>_L}f{_WsJzNoNOE(GA8r_q^fG_m<6!a5aK%# zzovK!;AHhA%*)FYmksUg6iefptv0Nn?gxeg^Y0h{S4B4N>dMUu$JW+1@iWTXw@I>Q zz#uC3eK<$dn6^I%4j~bd-{-0njH0QjDMtmszy=G?M+HymQ7``CDfc7p!=R1v6aZJs zjWDmWQb9Hpz-UIo-|j82-DC=-PB-Lu2gm^=eFg+R$+DY8DMo6z=74gPLmU?mFA4Dz1A_}8B`d44 zY-nO4ei|1ct5GF&KdDnIB_1^ki;`?yd_1ld4*m*u4FJ#LuwgQ2Qjv`VL~(Rh1&pGu(n8fA>n<( z9J5+f*P%i`7Qeb$lwauVGRs98y)Na&ZK;Zvu=KsXejy=EifS_XO75Rz!iB!ATR0lu zpHBhvR7*utwyGC2)W%U=`o;|s%yRV-%xJsJjR|^NqGI^7RF}uOXY^m%3B!oKcC0|? 
zTZdDhRK&s{eERml#czFm-8k?W692sGGnfC3Jb|@&rXlJH{rT_lMy@$NJtRYKi`Q7N zOlsY*X-i(qt7!%3i6y8kXVr459#e8==KFg}$$WZ1Xzy_}Kd}H$Lqh}bUM{q`u@Na5 z7zX?L`WE1Ib#>)phvO3xvMlU)GF)9;wx+P9CWeOOcb048^**Zt^s`A@2w_xDclYLC zJ_jeKVSpqXMnY!hk^NfufcfCahs=RaDk4 zHo4f?*qE4x7vOi{I_r4vUC&z zRLiW9cAy_QIn{Ue3v-jx)6?_wserd4BZr5E^666mEd@qn3uYh=J4#!J_C4%*e>FAD2g|DSWH2)_ z4p}e*7cq|Rk3+ywATQ5aB^6%?a0LNTW12!S;6lAQm>Du*3k?gqe|!`~kI%}gGKbeu zQ;P@RqX3~_fK$n0$4(LhfDcCr=@}UTwoK;GF)j|y#17f6IgeYD?c}f^_35IL z%)k{BkTsuYXk3ofFjPs!OsmO1Ivo6O8rE6l;gg5|zkL;W#Sj0t7ZmybPUpDX|MyFV zAOC+RbN_q42FCyAkJ*V9HYok?_31BDv#16iBYsk@4BFLyK7!D);;?_thXG6}qSwIV zyTz`L;Q`AzG)TxE%@%TFahHLP{qy&VdS4AgM4b1=+XvRAsc=7U=K0k-bqmk!_v9 zUG^S_DYvollu{oa7GbsdHk^vSK3|opNKVzV8a+}$a$999oC5~c>h!yNg#?_ufGIaR zI``H~>v2rU%I2d-&fK-Bx|Imzyvkz91ucZ^flbBukMGczrLY&gT~Pt{WgBb5<96o5 zm1mO-F58=~LY(IscZ>h^LTOdm#|`_Wh46qCUoUMB_3dCsX~(g#SHaB(T@efW9B*^p z)5pF0BxoIdRb4Ip;R4MjW5B~s#n_~)U6$zYnLkvkG;SBs`p|pc*BIV8Eh292`LS#p zY!(&<9!-2g!dY&eLx*B8UEm7z{?hnH+Xi8Sa5bfCdYgR3zg1z#$Yn5gh9m>?*=x_* zBOyuf<#5Fz|(dlj`kONdL7qNR<7W+dX|A)s>93!s!2L?#ln6 zZoB_oxpPvNL9E zV;Ialo_fCDzu@`dIlo@7*Y>%6Y>exGx$>zukf)kfQl+qD~!bLSru{H-lvqNBR1@!+V^T?;`}{e z4{!e`=6I>$rJDnAk6tKIs_XXG%HWd}-yZpB3ka2v|Ga!@XQgLe((5`hqIg^l4Z0L^ zgIWJxd~2?+l=9PZixfNX2>+yJi4`<=&Ofe+7VID1yIB$P`h%#=HH}Y~d*)T7H|I4g z8M+*?P6!JWr2$qKVRj;~$n_o$9x#_TiCcUc4&>j1k>7N8G@^5pJi?U+H?yz4X9Lc< z*6SyV0BMmE2D;t{a?uu0$iQO*zm{6*8v-{0_~*+Ei+>1O1e{gt&g6?>SydT4@^!*} zH*VGkfbB5?BnL52r@W0QLHSH9tK(c{Sk%`&=Z-G^)3{k-=%$JPOA|Nw!79sg$#*xV zxy(5Ph0FH-^r(C>G7_pi=GYJg4<|8vpDKVxohPzlq(p?`{v;~--XLLu<-s~Y4D_;1 zWwm&8w^(?lY4q&}z}Yh=bM31#4pSBH{=STwC1`I;Vv6RFKZZJ1VR-TE;ZhgHCc6Ru zS-q>KCv9^sFAU(7W)AmJ{woiWa(_#iy~qcE#Qq5HD+iCT(myqNGg;6QuIi-Nyol=C z2rH#Fl8Em|YZA2z-Q(9(T!9M9wy(az;Z__`jYoY4TDyF8#(#kiy;n01J6PSKZ6l$o zP>8kMY4bkQWP76%e8m=hdVDa)-L{w)g2m(PpTx(_2d24iKk=(GE;Q1#S$(bJLSU*6 z@=J$vXuzpbv*Q48bSxd{q|$)7T_YbyH$e1H7QAR9Gk516Fe~fVx`1nyWK{IdQ zZbXaDdS1nxFXaj4`>ucApBz(iou@68DnAX=R9jkAR<87kP}KKqI&Ic#BVQ($3k~XH zZuZq+%Azhjg7qzEhg+GX-jr9t=|ilqf$)WLIjQc1{f)A_#>o^0QS;-IWuqUtN9AvC|J!z7`=JmBNT`AB){)$?%IPYv8@M!8D^0@VtJ0N~`L9%F zTsnC?BIB8|@+-_rpRuA+rLncm%gFdNRj=7!p}A?=Oy*e$SW5?OU{3rz5DUoRy=BI^BvX#)}X*1N4w+VvZ3P&>H|$@ZiKob@S@+S z@+G)lF@^-Fwospwm*uU^sVnw0I*7%|yF*t;#Hvu{?hm*(B=MRak&VE^$J+-ML z$oacIq<3w3b$z|~R#c-&8^}78!CNAQ&Ctl$hKJ&c9gN1S3luC z$S`t<+fC+^!_JEfkp~U5zbPvkSn+MGMHmYO91?8c1%@|_ z%Ih&Ufz*>2l#qfqLWtkp)M)l-w9!5~OPT6NQgFVFPdd)?U0jwEL|utbKmMXaUy+Yo zC}E4IeF}*zM*6SC9{bU8n)yim1jjd~{{|ljq&gT@xiRGA6?-;UP52I9?>ur-n3r5uy!0eoHu&<%;LQ>2x)U9{(hFB$+pAVgp3TWWT;T@JPwxl`Y$c z0RZgkdomL>Mb$<3bk5Y4)86?L?bCBXOjdtS`%b=Y;5$g0bMSj)`pHyduK7d!5}|{TIAkqh^4jUdcDkM><4k2=pUy@R!*H=y93Ka9 zdydPcOvdv`KZr%Wx^zD+bH+WvYU=u`2tQSmov4tp4TI*G;cKj{gRib$dxBbw>fvG2 zI){_Lphi`0;BU7){*f?W!rpVV9bhDIL?IETb$H)0_-cAF zs*q1R>ZM1AK~cdnu{mSavu~ctz@Ik-(+!B5CpOw|(PqAa^k2~mZxkp9TCBCn<^(_Z zrsD6nv6O*!lU2)~aHoF>foLZOoStkM#!4r2UQ}-KTo&}u@YVNtD8UmDG}`VI4S+QY z_c`4$zY+DGG<%)KkyMhOX|)Ulc+a;i^CM>3oD7WAY^}WY5GIC*hT_@z)JikjN=(*; zf9!Xox0Df<(wvJ}+qH440|5`O$*g#U@28IaEQ9c$hWjn;(f5GL`i!vUp7CWnUQ9Z< zazOOGb6#>O7%D#Vj+@+eY72uqOn0KSAiLvHnN13sq?)U2)OQacEL|Xz3Vs^dU2X@^ z_rZaP7G`}s)R%qW;>zKg$& z+9yj~J44L4TW22_DF67I{H35jhHUh?{5Bf7nCj!rEr$rerfstN?fZE?{7v7$Wj#Tc zvjVs}ad#BvHsV~C@N{^Hzms3YZgP)XT+KuG-04Nv7Gw`KnD;NZ{#`{=f129zbC+bn%en@Pnt!v0AB`!hlea)6Je))b^0Tc*Q<@G=%> zS$8|$%*o-M(^7EwT1sgB7SAK!?ATL|z;UCMUzSVlp@>0nKCoB2eE>ULL=(hPdl8ux z7dclz9~;dL0I9r0H-t0Buk3bOuEqBAP8iY7nA#*${~91aztHk4Dc>Y8A?2kTra5fH zmSlq-5~&oiQaD&ws)_I~)?#4c9S5Im5An^@S11lLqYtr?LfLFWY}4vCyw)6(8nGp0 z7g_D@L|W6V{IP>)%$mb=h51YLr67~!u{Cg3GS|m)*-!4v;pWJZ597u*)qw3H70nhpdIDc 
zeZFmHg~Vp>X9>Fe;?mhAEyW_l?$IAFoB`m6Ffw#ZG8dOaTw`@ow!UTgTmg45wOe#^ z92;~Lga<`gxnt!uoxhH?D?QOShh6_1%4DxhnvZ8*6kQYhxOalisiCc zU1G5U02_JZnxNAw=s@gAW^Wbvd~EMvG0%fUE$FtlfrX*%1(u`zEacj#YAyFF#(t!f zo}iH8^CL`DZyaNsCL`8vp($eSq^pu9ac3Z>;?ji{j>h809ZZ|4)EdIu%Z%=eRJnpI zSVUa(Gikaohw9nPpC0xNXYd)PX_YQTS6+zD2(Dk%)4S6@!F&+f@-7dW+%#!|(Lk32DV0FNHLhur~wnA&sCmJz)V3 z(u|SX%uHh6g+i9;4Rd`WSLbl)Y#J@sIE7oRPg`5%btE zD_K4GA8W~yl-f(+$o{*x3)zg$I#ZrWDJQRX;%3ndW1lQw=3<-kes7072AVG7o6X== z@Lo;7?-$3NvD7E_`D=c?6yFtJ&1Q_fU7mcRK^6IJg|5t|l-`EUKJNG0u%v<{DcE`Wf N)mGO7s_s7z`4?AqtnvT= literal 0 HcmV?d00001 -- Gitee From c606e83c3fe61f76096a2f8ba39f2acfcd1ddf6f Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 17 Jul 2025 09:09:29 +0000 Subject: [PATCH 270/328] !23129 Update op_plugin commit id Merge pull request !23129 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 312f5ee3a0..1ab17b92b7 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 312f5ee3a041e563ca2b8ada6139384be6b3e4b5 +Subproject commit 1ab17b92b7e1a9f033e77d0fcacced2dcad0126f -- Gitee From 7aade2e9bd9824053f6a8f85d4c04fff2547e75d Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 17 Jul 2025 09:09:30 +0000 Subject: [PATCH 271/328] !23129 Update op_plugin commit id Merge pull request !23129 from pta-robot/v2.7.1 -- Gitee From b3d5d36d43690fdb9f3a32f4ac9843312d1bbf01 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 17 Jul 2025 14:09:30 +0000 Subject: [PATCH 272/328] !23146 Update op_plugin commit id Merge pull request !23146 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 1ab17b92b7..21d114c04c 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 1ab17b92b7e1a9f033e77d0fcacced2dcad0126f +Subproject commit 21d114c04c5c9bfa3adf249702c01ff34c5b8be7 -- Gitee From 5d6a48a4c96a83c7ab64b7de1c89e3455ab50bce Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 17 Jul 2025 15:54:27 +0000 Subject: [PATCH 273/328] !23158 Update op_plugin commit id Merge pull request !23158 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 21d114c04c..4a36c8576c 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 21d114c04c5c9bfa3adf249702c01ff34c5b8be7 +Subproject commit 4a36c8576c94337fa841087568276efc5b6778c1 -- Gitee From b1f5b5f2ca1c7e91ae1fd41eeb6d2029d82dba3f Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 17 Jul 2025 15:54:27 +0000 Subject: [PATCH 274/328] !23158 Update op_plugin commit id Merge pull request !23158 from pta-robot/v2.7.1 -- Gitee From ec8c9856df25e3bbfe628e30ef12394185da05e7 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Fri, 18 Jul 2025 02:16:01 +0000 Subject: [PATCH 275/328] !23152 Update torchair commit id Merge pull request !23152 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 67ea6dfe9d..75d97976fa 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 67ea6dfe9d879d03701a3d668a428b94afcbe521 +Subproject commit 75d97976fa6b861d06595e1c4e477e8abfee2b30 -- Gitee From 
dbaa795b4d56fbad07e94f58a399adc012f97e0c Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 18 Jul 2025 03:09:31 +0000 Subject: [PATCH 276/328] !23166 Update op_plugin commit id Merge pull request !23166 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 4a36c8576c..afe25f364d 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 4a36c8576c94337fa841087568276efc5b6778c1 +Subproject commit afe25f364de523080bec74ba1d19afd2dc929c2b -- Gitee From a2132841a09181304812d0e69ac96d1a142bef67 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 18 Jul 2025 04:54:28 +0000 Subject: [PATCH 277/328] !23170 Update op_plugin commit id Merge pull request !23170 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index afe25f364d..92fa1cf486 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit afe25f364de523080bec74ba1d19afd2dc929c2b +Subproject commit 92fa1cf486d0e75665acd26b662c6df83be57c29 -- Gitee From 15a346f50535906102a73adcb5e553b9ff57f6b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Fri, 18 Jul 2025 06:13:13 +0000 Subject: [PATCH 278/328] =?UTF-8?q?!23135=20use=20=5Fget=5Fuce=5Faddr=20Me?= =?UTF-8?q?rge=20pull=20request=20!23135=20from=20=E7=8E=8B=E8=B6=85/v2.7.?= =?UTF-8?q?0=5Fuceaddr1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/torch_npu_schema.json | 3 --- torch_npu/npu/__init__.py | 3 +-- torch_npu/npu/utils.py | 4 ++-- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index df93e5a7a7..454e5f6ef8 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -1028,9 +1028,6 @@ "torch_npu.npu.check_uce_in_memory": { "signature": "(device_id)" }, - "torch_npu.npu.get_uce_addr": { - "signature": "()" - }, "torch_npu.npu.clear_npu_overflow_flag": { "signature": "()" }, diff --git a/torch_npu/npu/__init__.py b/torch_npu/npu/__init__.py index 20a582e360..c76f9e2c14 100644 --- a/torch_npu/npu/__init__.py +++ b/torch_npu/npu/__init__.py @@ -98,7 +98,6 @@ __all__ = [ "stop_device", "restart_device", "check_uce_in_memory", - "get_uce_addr", "config", "matmul", "conv", @@ -134,7 +133,7 @@ from .utils import (synchronize, set_device, current_device, _get_device_index, device, device_of, StreamContext, stream, set_stream, current_stream, default_stream, set_sync_debug_mode, get_sync_debug_mode, init_dump, current_blas_handle, is_bf16_supported, finalize_dump, set_dump, get_npu_overflow_flag, clear_npu_overflow_flag, - check_uce_in_memory, stress_detect, get_uce_addr) + check_uce_in_memory, stress_detect, _get_uce_addr) from ._recovery import restart_device, stop_device from .streams import Stream, Event, SyncLaunchStream, ExternalEvent from .mstx import mstx diff --git a/torch_npu/npu/utils.py b/torch_npu/npu/utils.py index 8fbba766c8..a16a5dbb94 100644 --- a/torch_npu/npu/utils.py +++ b/torch_npu/npu/utils.py @@ -17,7 +17,7 @@ __all__ = ["synchronize", "set_device", "current_device", "device", "device_of", "stream", "set_stream", "current_stream", "default_stream", "set_sync_debug_mode", "get_sync_debug_mode", "init_dump", "set_dump", "finalize_dump", "is_support_inf_nan", "is_bf16_supported", "get_npu_overflow_flag", "npu_check_overflow", 
"clear_npu_overflow_flag", "current_blas_handle", - "check_uce_in_memory", "stress_detect", "get_cann_version", "get_uce_addr"] + "check_uce_in_memory", "stress_detect", "get_cann_version"] def get_cann_version(module="CANN"): @@ -386,7 +386,7 @@ def check_uce_in_memory(device_id): return torch_npu._C._npu_check_uce_in_memory(device_id) -def get_uce_addr(): +def _get_uce_addr(): torch_npu.npu._lazy_init() return torch_npu._C._npu_get_uce_addr() -- Gitee From d7399a0894679187bd5699368645f4275ab990fb Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 18 Jul 2025 09:09:30 +0000 Subject: [PATCH 279/328] !23185 Update op_plugin commit id Merge pull request !23185 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 92fa1cf486..1ff91b7c49 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 92fa1cf486d0e75665acd26b662c6df83be57c29 +Subproject commit 1ff91b7c494d1440db33915c3aa11f4f23ec6b75 -- Gitee From b7c90d05228ef585a213c7fd58775b04f97eb220 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Fri, 18 Jul 2025 09:57:06 +0000 Subject: [PATCH 280/328] !23183 add maybesetdevice Merge pull request !23183 from huangyunlong/2.7mays --- torch_npu/csrc/core/npu/NPUFunctions.cpp | 11 +++++++++++ torch_npu/csrc/core/npu/NPUFunctions.h | 2 ++ torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp | 2 +- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index 3c4920ec1b..77067fa03b 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -113,6 +113,17 @@ aclError SetDevice(c10::DeviceIndex device) return err; } +aclError MaybeSetDevice(c10::DeviceIndex device) +{ + if (isDeviceCtxActive(device)) { + ASCEND_LOGI("MaybeSetDevice: NPU device %d has not been initialized! We will set targetDeviceIndex.", device); + NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(device)); + } else { + targetDeviceIndex = device; + } + return ACL_ERROR_NONE; +} + aclError ResetUsedDevices() { std::lock_guard lock(mtx); diff --git a/torch_npu/csrc/core/npu/NPUFunctions.h b/torch_npu/csrc/core/npu/NPUFunctions.h index 9489984597..4f978d4185 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.h +++ b/torch_npu/csrc/core/npu/NPUFunctions.h @@ -48,6 +48,8 @@ aclError GetDeviceWithoutSet(int32_t *device); */ C10_NPU_API aclError SetDevice(c10::DeviceIndex device); +C10_NPU_API aclError MaybeSetDevice(c10::DeviceIndex device); + /** * @ingroup torch_npu * @brief reset all device id by ACL interface: aclrtResetDevice. 
diff --git a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp index cd00ca610a..14f27d673c 100644 --- a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp +++ b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp @@ -54,7 +54,7 @@ void NPUGuardImpl::setDevice(c10::Device d) const void NPUGuardImpl::uncheckedSetDevice(c10::Device d) const noexcept { c10_npu::StartMainThreadBind(d.index()); - NPU_CHECK_WARN(c10_npu::SetDevice(d.index())); + NPU_CHECK_WARN(c10_npu::MaybeSetDevice(d.index())); } c10::Stream NPUGuardImpl::getStream(c10::Device d) const noexcept -- Gitee From d83c9df89518dc9a5626eceb1d6570c9753980fa Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 19 Jul 2025 11:09:30 +0000 Subject: [PATCH 281/328] !23218 Update op_plugin commit id Merge pull request !23218 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 1ff91b7c49..e25baf6c20 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 1ff91b7c494d1440db33915c3aa11f4f23ec6b75 +Subproject commit e25baf6c20d6be4cdd3cfa4bffb3f4fd64fc3581 -- Gitee From 22f171e052954ac14dc80e7aa47029783f40e147 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 21 Jul 2025 03:39:34 +0000 Subject: [PATCH 282/328] !23227 Update op_plugin commit id Merge pull request !23227 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index e25baf6c20..60cac7a42b 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit e25baf6c20d6be4cdd3cfa4bffb3f4fd64fc3581 +Subproject commit 60cac7a42b3721a2fc72adb6415ca052fdb433f5 -- Gitee From 35da52482caaa17b6def8badc97655d99579893c Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Mon, 21 Jul 2025 06:14:39 +0000 Subject: [PATCH 283/328] !23210 Round up the allocation to the nearest power of two to improve reuse. Merge pull request !23210 from huangyunlong/2.7host --- torch_npu/csrc/core/npu/CachingHostAllocator.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/core/npu/CachingHostAllocator.cpp b/torch_npu/csrc/core/npu/CachingHostAllocator.cpp index f03bfdb05b..ca093bb837 100644 --- a/torch_npu/csrc/core/npu/CachingHostAllocator.cpp +++ b/torch_npu/csrc/core/npu/CachingHostAllocator.cpp @@ -1,4 +1,5 @@ #include +#include #include "torch_npu/csrc/core/npu/npu_log.h" #include #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" @@ -133,14 +134,16 @@ struct HostAllocator { c10_npu::SetCurrentDevice(); } + // Round up the allocation to the nearest power of two to improve reuse. 
+ size_t roundSize = c10::llvm::PowerOf2Ceil(size); // allocate a new block if no cached allocation is found - err = aclrtMallocHost(ptr, size); + err = aclrtMallocHost(ptr, roundSize); if (err != ACL_ERROR_NONE) { CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); return err; } - blocks.insert({*ptr, Block(size, *ptr, true)}); + blocks.insert({*ptr, Block(roundSize, *ptr, true)}); return ACL_ERROR_NONE; } -- Gitee From fd1e1ab78ad632e0c680e4b2c24b6f27d3748dec Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Mon, 21 Jul 2025 07:26:34 +0000 Subject: [PATCH 284/328] !23232 fixlog Merge pull request !23232 from SCh-zx/fix27 --- torch_npu/csrc/core/npu/NPUException.cpp | 6 ++---- torch_npu/csrc/core/npu/NPUQueue.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index 9620895c7c..74ec3c798d 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -48,15 +48,13 @@ void warn_(const ::c10::Warning& warning) std::string formatErrorCode(SubModule submodule, ErrCode errorCode) { if (c10_npu::option::OptionsManager::IsCompactErrorOutput()) { - return " "; + return ""; } std::ostringstream oss; int deviceIndex = -1; c10_npu::GetDevice(&deviceIndex); auto rank_id = c10_npu::option::OptionsManager::GetRankId(); - if (!(c10_npu::option::OptionsManager::IsCompactErrorOutput())) { oss << "\n[ERROR] " << getCurrentTimestamp() << " (PID:" << getpid() << ", Device:" << deviceIndex << ", RankID:" << rank_id << ") "; - } oss << "ERR" << std::setw(2) << std::setfill('0') << static_cast(submodule); oss << std::setw(3) << std::setfill('0') << static_cast(errorCode); oss << " " << submoduleMap[submodule] << " " << errCodeMap[errorCode]; @@ -177,7 +175,7 @@ bool checkUceErrAndRepair(bool check_error, std::string& err_msg) int device = 0; auto err = c10_npu::GetDevice(&device); if (err != ACL_ERROR_NONE) { - err_msg = "ERROR happend in GetDevice."; + err_msg = "ERROR happened in GetDevice."; if (check_error) { TORCH_CHECK(false, err_msg, PTA_ERROR(ErrCode::ACL)); } else { diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index d7ac32a79c..4757113dea 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -290,11 +290,11 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) error_msg = c10_npu::c10_npu_get_error_message(); } runtime_error = throwError + ", " + error_msg + PTA_ERROR(ErrCode::ACL); - error_msg = throwError + " happend."; + error_msg = throwError + " happened."; } if (current_status == RepoStatus::CAN_EXIT) { - error_msg = "Inner error happend with CAN_EXIT status, detail: " + repo_error; + error_msg = "Inner error happened with CAN_EXIT status, detail: " + repo_error; } if (current_status == RepoStatus::ERROR_EXIT) { @@ -319,7 +319,7 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) "resulting in performance degradation. " "Please unset ASCEND_LAUNCH_BLOCKING in time after debugging." 
+ PTA_ERROR(ErrCode::ACL) + ".\n" + acl_error; - error_msg = "Inner error happend, detail: " + repo_error; + error_msg = "Inner error happened, detail: " + repo_error; } #ifndef BUILD_LIBTORCH @@ -470,7 +470,7 @@ void Repository::Enqueue(void *cur_paras) ThrowDeviceError(current_status, cur_paras); if (current_status == RepoStatus::CAN_EXIT) { - ASCEND_LOGE("Inner error happend with CAN_EXIT status, detail: %s", repo_error.c_str()); + ASCEND_LOGE("Inner error happened with CAN_EXIT status, detail: %s", repo_error.c_str()); } if (current_status == RepoStatus::ERROR_EXIT) { -- Gitee From 4aaae581529d0a9066e54a722a471abd1474349c Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 21 Jul 2025 08:54:33 +0000 Subject: [PATCH 285/328] !23242 Update op_plugin commit id Merge pull request !23242 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 60cac7a42b..f5aeebc51c 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 60cac7a42b3721a2fc72adb6415ca052fdb433f5 +Subproject commit f5aeebc51cf84d37be9f02a492c33988877742da -- Gitee From 9a19dc1423045489a7b7d33757380b91bdca5230 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Mon, 21 Jul 2025 12:17:24 +0000 Subject: [PATCH 286/328] =?UTF-8?q?!23223=20add=20npugraph=5Ftree=20patch,?= =?UTF-8?q?=20support=20reduce-overhead=20mode=20Merge=20pull=20request=20?= =?UTF-8?q?!23223=20from=20=E9=97=AB=E9=B9=8F=E5=85=A8/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/npu/test_graph_tree.py | 32 +- torch_npu/__init__.py | 3 +- torch_npu/csrc/npu/Module.cpp | 192 ++++---- torch_npu/npu/_graph_tree.py | 797 ++++++++++++++++++++++++--------- torch_npu/utils/__init__.py | 1 + torch_npu/utils/_graph_tree.py | 207 +++++++++ 6 files changed, 934 insertions(+), 298 deletions(-) create mode 100644 torch_npu/utils/_graph_tree.py diff --git a/test/npu/test_graph_tree.py b/test/npu/test_graph_tree.py index 94c29660f9..f4552033d1 100644 --- a/test/npu/test_graph_tree.py +++ b/test/npu/test_graph_tree.py @@ -123,7 +123,7 @@ class TestNpuGraphFunctions(TestCase): model, inputs, (), device_index=0, is_backward=False, is_inference=False ) mock_manager.add_function.assert_called_with( - model, inputs, (), None, CompilationMode.FORWARD, (), + model, inputs, (), None, CompilationMode.FORWARD, (), (), () ) # Test backward mode @@ -132,7 +132,7 @@ class TestNpuGraphFunctions(TestCase): model, inputs, (), device_index=0, is_backward=True, is_inference=False ) mock_manager.add_function.assert_called_with( - model, inputs, (), None, CompilationMode.BACKWARD, (), + model, inputs, (), None, CompilationMode.BACKWARD, (), (), () ) # Test invalid mode combination @@ -204,6 +204,7 @@ class TestNPUWarmupNode(TestCase): stack_traces=None, stream=stream, already_warm=False, + graph_id=1, ) outputs = node.run([]) self.assertEqual(len(node.outputs_weakrefs), 1) @@ -756,6 +757,32 @@ class TestCheckMemoryPool(TestCase): "npu:0", (0, 0), {1001, 1002} ) + @patch('torch_npu._C._npu_checkPoolLiveAllocations') + @patch('torch_npu.npu._graph_tree.get_npugraph_segments') + @patch('torch_npu.npu._graph_tree.format_tb') + @patch('gc.collect') + def test_check_memory_pool_slow_path_all_match( + self, mock_gc, mock_format_tb, mock_segments, mock_check + ): + mock_check.return_value = False + mock_segments.return_value = [ + { + "segment_pool_id": (0, 0), + "address": 1000, 
+ "blocks": [ + {"state": "active_allocated", "size": 100, "frames": []}, + {"state": "inactivate", "size": 200}, + ] + } + ] + mock_storage = MagicMock(spec=StorageWeakRefWrapper) + mock_storage.data_ptr.return_value = 1000 + mock_storage.return_value = True + check_memory_pool("npu:0", (0, 0), [mock_storage]) + mock_gc.assert_called_once_with() + mock_segments.assert_called_once_with((0, 0)) + mock_format_tb.assert_not_called() + @patch('torch_npu._C._npu_checkPoolLiveAllocations') @patch('torch_npu.npu._graph_tree.get_npugraph_segments') @patch('torch_npu.npu._graph_tree.format_tb') @@ -939,6 +966,7 @@ class TestNPUGraphTreeManager: "stack_trace", manager.stream, False, + GraphID(-1), ) assert manager.current_node == mock_node_instance assert manager.path_state == ExecutionState.WARMUP diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index ffbef110b0..f571d55240 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -72,7 +72,7 @@ from torch_npu.contrib.module import npu_modules from torch_npu.utils import _apply_module_patch, _add_tensor_methods, _add_collect_env_methods, \ _add_storage_methods, _add_serialization_methods, add_dynamo_methods, add_perf_dump_patch, \ add_optim_method, _inductor_register_device_op_overrides, \ - _apply_npu_show_warning + _apply_npu_show_warning, _apply_npugraph_tree_methods from torch_npu.utils._dynamo_device import _dynamo_register_interface_for_device from torch_npu.npu._stream_check import apply_sanitizer_patch from torch_npu.npu._format import _apply_npu_format_patch @@ -178,6 +178,7 @@ def _apply_class_patches(): _add_reductions_methods() _apply_npu_format_patch() _apply_fsdp_patch() + _apply_npugraph_tree_methods() def _apply_distributed_methods_patch(): diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 72be776671..d335acc4e3 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -411,91 +411,115 @@ void RegisterNpuPluggableAllocator(PyObject* module) addStorageDeleterFns(storages_to_add_deleters_to, delta); }); - m.def( - "_free_And_Remove_DeleterFn", - [](size_t storage_impl_ptr) { - // NOLINTNEXTLINE(performance-no-int-to-ptr) - c10::StorageImpl* storage_impl = (c10::StorageImpl*)storage_impl_ptr; - auto alloc = c10_npu::NPUCachingAllocator::get(); - auto data_ptr = storage_impl->data_ptr().get(); - bool succeeded = storage_impl->mutable_data_ptr().compare_exchange_deleter( - alloc->raw_deleter(), c10::detail::deleteNothing); - TORCH_CHECK(succeeded, "Expected standard deleter", PTA_ERROR(ErrCode::PARAM)); - c10_npu::NPUCachingAllocator::raw_delete(data_ptr); - }); - m.def( - "_has_Standard_Deleter", - [](size_t storage_impl_ptr) { + m.def( + "_free_And_Remove_DeleterFn", + [](size_t storage_impl_ptr) { + // NOLINTNEXTLINE(performance-no-int-to-ptr) + c10::StorageImpl* storage_impl = (c10::StorageImpl*)storage_impl_ptr; + auto alloc = c10_npu::NPUCachingAllocator::get(); + auto data_ptr = storage_impl->data_ptr().get(); + bool succeeded = storage_impl->mutable_data_ptr().compare_exchange_deleter( + alloc->raw_deleter(), c10::detail::deleteNothing); + TORCH_CHECK(succeeded, "Expected standard deleter", PTA_ERROR(ErrCode::PARAM)); + c10_npu::NPUCachingAllocator::raw_delete(data_ptr); + }); + m.def( + "_has_Standard_Deleter", + [](size_t storage_impl_ptr) { + // NOLINTNEXTLINE(performance-no-int-to-ptr) + c10::StorageImpl* storage_impl = (c10::StorageImpl*)storage_impl_ptr; + auto alloc = c10_npu::NPUCachingAllocator::get(); + return 
(storage_impl->data_ptr().get_deleter() == alloc->raw_deleter()); + }); + m.def( + "_add_cached_tensor", + [](const at::Tensor& t) { + at::caching::add_cached_tensor(t); + }); + m.def( + "_remove_cached_tensor", + [](const at::Tensor& t) { + at::caching::remove_cached_tensor(t); + }); + m.def( + "_construct_NPU_Tensor_From_Storage_And_Metadata", + [](py::dict& metadata, c10::Storage s) { + auto dtype_arg = metadata["dtype"].ptr(); + auto meta = c10::scalarTypeToTypeMeta(torch::toScalarType(dtype_arg)); + + constexpr c10::DispatchKeySet npu_dks(c10::DispatchKey::PrivateUse1); + at::Tensor tensor = at::detail::make_tensor_base( + std::move(s), npu_dks, meta); + + tensor.unsafeGetTensorImpl()->set_sizes_and_strides( + metadata["size"].cast>(), + metadata["stride"].cast>()); + tensor.unsafeGetTensorImpl()->set_storage_offset( + metadata["storage_offset"].cast()); + return tensor; + }); + m.def( + "_npu_checkPoolLiveAllocations", + [](c10::DeviceIndex device, c10_npu::MempoolId_t mempool_id, + const py::set& expected_live_allocations) { + std::unordered_set allocations; + allocations.reserve(expected_live_allocations.size()); + for (auto& elem : expected_live_allocations) { // NOLINTNEXTLINE(performance-no-int-to-ptr) - c10::StorageImpl* storage_impl = (c10::StorageImpl*)storage_impl_ptr; - auto alloc = c10_npu::NPUCachingAllocator::get(); - return (storage_impl->data_ptr().get_deleter() == alloc->raw_deleter()); - }); - m.def( - "_add_cached_tensor", - [](const at::Tensor& t) { - at::caching::add_cached_tensor(t); - }); - m.def( - "_remove_cached_tensor", - [](const at::Tensor& t) { - at::caching::remove_cached_tensor(t); - }); - m.def( - "_construct_NPU_Tensor_From_Storage_And_Metadata", - [](py::dict& metadata, c10::Storage s) { - auto dtype_arg = metadata["dtype"].ptr(); - auto meta = c10::scalarTypeToTypeMeta(torch::toScalarType(dtype_arg)); - - constexpr c10::DispatchKeySet npu_dks(c10::DispatchKey::PrivateUse1); - at::Tensor tensor = at::detail::make_tensor_base( - std::move(s), npu_dks, meta); - - tensor.unsafeGetTensorImpl()->set_sizes_and_strides( - metadata["size"].cast>(), - metadata["stride"].cast>()); - tensor.unsafeGetTensorImpl()->set_storage_offset( - metadata["storage_offset"].cast()); - return tensor; - }); - m.def( - "_npu_checkPoolLiveAllocations", - [](c10::DeviceIndex device, c10_npu::MempoolId_t mempool_id, - const py::set& expected_live_allocations) { - std::unordered_set allocations; - allocations.reserve(expected_live_allocations.size()); - for (auto& elem : expected_live_allocations) { - // NOLINTNEXTLINE(performance-no-int-to-ptr) - allocations.insert(reinterpret_cast(py::cast(elem))); + allocations.insert(reinterpret_cast(py::cast(elem))); + } + return c10_npu::NPUCachingAllocator::checkPoolLiveAllocations(device, mempool_id, allocations); + }); + m.def( + "_set_cached_tensors_enabled", + [](bool enabled) { + at::caching::set_cached_tensors_enabled(enabled); + }); + m.def( + "_construct_storage_from_data_pointer", + [](int64_t data_ptr, c10::Device device, size_t size_bytes) { + c10::intrusive_ptr storage_impl = torch_npu::make_npu_storage_impl( + c10::StorageImpl::use_byte_size_t(), + size_bytes, + at::DataPtr(reinterpret_cast(data_ptr), device), + nullptr, + false); + return c10::Storage(storage_impl); + }); + m.def( + "_weak_ref_tensor", + [](const at::Tensor& t) { + void* data_ptr = t.data_ptr(); + std::vector sizes = t.sizes().vec(); + std::vector strides = t.strides().vec(); + auto options = t.options(); + auto new_tensor = at_npu::native::from_blob(data_ptr, 
sizes, strides, options); + return new_tensor; + }); + m.def( + "_set_storage_access_error_msg", + [](const at::Tensor& t, std::string s) { + t.unsafeGetTensorImpl()->release_storage_and_set_meta_custom_data_ptr_error_msg_(s); + }); + m.def( + "_set_storage_data_ptr_access_error_msg", + [](size_t storage_impl_ptr, std::string s) { + // NOLINTNEXTLINE(performance-no-int-to-ptr) + c10::StorageImpl* storage_impl = (c10::StorageImpl*)storage_impl_ptr; + storage_impl->release_data_and_set_meta_custom_data_ptr_error_msg_(s); + }); + m.def( + "_tensors_data_ptrs_at_indices_equal", + [](py::list& tensors, py::list& data_ptrs, py::list& indices) { + for (auto index : indices) { + auto t = tensors[index].cast(); + auto data_ptr = data_ptrs[index].cast(); + if (reinterpret_cast(t.data_ptr()) != data_ptr) { + return false; } - return c10_npu::NPUCachingAllocator::checkPoolLiveAllocations(device, mempool_id, allocations); - }); - m.def( - "_set_cached_tensors_enabled", - [](bool enabled) { - at::caching::set_cached_tensors_enabled(enabled); - }); - m.def( - "_construct_storage_from_data_pointer", - [](int64_t data_ptr, c10::Device device, size_t size_bytes) { - c10::intrusive_ptr storage_impl = torch_npu::make_npu_storage_impl( - c10::StorageImpl::use_byte_size_t(), - size_bytes, - at::DataPtr(reinterpret_cast(data_ptr), device), - nullptr, - false); - return c10::Storage(storage_impl); - }); - m.def( - "_weak_ref_tensor", - [](const at::Tensor& t) { - void* data_ptr = t.data_ptr(); - std::vector sizes = t.sizes().vec(); - std::vector strides = t.strides().vec(); - auto options = t.options(); - auto new_tensor = at_npu::native::from_blob(data_ptr, sizes, strides, options); - return new_tensor; - }); + } + return true; + }); } static PyObject* THNPModule_initExtension(PyObject* self, PyObject* noargs) @@ -1043,6 +1067,7 @@ PyObject* THNPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) py::str requested_size_s = "requested_size"; py::str stream_s = "stream"; py::str segment_type_s = "segment_type"; + py::str segment_pool_id = "segment_pool_id"; py::str large_s = "large"; py::str small_s = "small"; py::str size_s = "size"; @@ -1081,6 +1106,7 @@ PyObject* THNPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) // we want the python objects to pickle easily so use an int to // represent the stream rather than a torch.cuda.stream object segmentDict[stream_s] = int64_t(segmentInfo.stream); + segmentDict[segment_pool_id] = segmentInfo.owner_private_pool_id; segmentDict[segment_type_s] = (segmentInfo.is_large ? 
large_s : small_s); segmentDict[is_expandable_s] = segmentInfo.is_expandable; add_frame_key(segmentDict, segmentInfo.context_when_allocated); diff --git a/torch_npu/npu/_graph_tree.py b/torch_npu/npu/_graph_tree.py index 7ecabd355e..1149dd53d2 100644 --- a/torch_npu/npu/_graph_tree.py +++ b/torch_npu/npu/_graph_tree.py @@ -52,20 +52,25 @@ from typing import ( Any, Callable, cast, + ContextManager, Dict, + Generator, Iterator, List, Optional, Sequence, Set, Tuple, + Type, + TYPE_CHECKING, + TypeVar, Union, ) import torch.fx from torch import Tensor from torch._dynamo.mutation_guard import GenerationTracker -from torch._dynamo.utils import preserve_rng_state +from torch._dynamo.utils import counters, dynamo_timed, preserve_rng_state from torch._inductor import config from torch._inductor.compile_fx import ( align_inputs_from_check_idxs, @@ -76,9 +81,20 @@ from torch._inductor.compile_fx import ( remove_unaligned_input_idxs, static_input, ) +from torch._inductor.cudagraph_utils import ( + check_for_mutation, + CheckInvariantStatus, + FunctionID, + log_cudagraph_skip_and_bump_counter, + log_data_ptr_mismatch, + maybe_warning_due_to_dynamic_shape, + ModelType, + OutputType, + PlaceholderInfo, + WrappedFunction, +) from torch.multiprocessing.reductions import StorageWeakRef from torch.storage import UntypedStorage -from torch.types import _bool from torch.utils import _pytree as pytree from torch.utils.weak import TensorWeakRef @@ -88,9 +104,16 @@ from torch_npu._C import ( _set_cached_tensors_enabled as _set_cached_tensors_enabled) +if TYPE_CHECKING: + from torch._inductor.utils import InputType + from torch.types import _bool + + StorageWeakRefPointer = int StorageDataPtr = int NBytes = int +S = TypeVar("S", bound="StorageWeakRefWrapper") +log = torch._logging.getArtifactLogger(__name__, "cudagraphs") @dataclasses.dataclass(frozen=True) @@ -99,27 +122,7 @@ class GraphID: id: int -@dataclasses.dataclass(frozen=True) -class FunctionID: - "Unique counter of a function wrapped in npugraphify_impl" - id: int - - -@dataclasses.dataclass(frozen=True) -class WrappedFunction: - """ - Represents a function that you want to record for NPU graph replay, - with a little more metadata so we can identify if we have an applicable - NPU graph in our NPU graph tree for it. - """ - - model: Callable[..., Any] - static_input_idxs: Sequence[int] - id: FunctionID - constants: Tuple[torch.Tensor, ...] - - -def clear_cublass_cache(): +def clear_cublass_cache() -> None: """ Cublas keeps a persistent workspace allocation for running matmuls. 
This poses a problem for doing warmup within a NPUGraph private pool because we do not want persistent allocations from @@ -136,7 +139,7 @@ def clear_cublass_cache(): @contextlib.contextmanager -def clear_cublas_manager(): +def clear_cublas_manager() -> Generator[None, None, None]: "Context manager around clearing cublas caches that will clear on enter and exit" clear_cublass_cache() try: @@ -146,7 +149,7 @@ def clear_cublas_manager(): @contextlib.contextmanager -def disable_conv_cache_emptying(): +def disable_conv_cache_emptying() -> Generator[None, None, None]: try: yield finally: @@ -154,7 +157,7 @@ def disable_conv_cache_emptying(): @contextlib.contextmanager -def enable_history_recording(): +def enable_history_recording() -> Generator[None, None, None]: "Turns on history recording in the NPU Caching Allocator" enabled = torch_npu._C._npu_isHistoryEnabled() try: @@ -166,7 +169,7 @@ def enable_history_recording(): torch.npu.memory._record_memory_history(None) -def get_history_recording(): +def get_history_recording() -> ContextManager[None]: # remove, prevents cleanup if not config.triton.cudagraph_trees_history_recording: return contextlib.nullcontext() @@ -194,7 +197,7 @@ class TreeManagerContainer: - All the storages are dead, we deallocate the tree manager """ - def __init__(self, device_index): + def __init__(self, device_index: int) -> None: # This class keeps a strong reference to tree_manager, # but upon all other strong references to the tree_manager will reset it to None. # We need a strong reference so that we can still access its attributes upon cleanup. @@ -213,7 +216,7 @@ class TreeManagerContainer: self.lock = threading.Lock() - def _finalize_tensor(self): + def _finalize_tensor(self) -> None: with self.lock: self.live_storages_count -= 1 if self.live_storages_count == 0: @@ -224,18 +227,18 @@ class TreeManagerContainer: if self.live_npugraphify_fns == 0: self.tree_manager = None - def finalize_npugraphify_fn(self): + def finalize_npugraphify_fn(self) -> None: with self.lock: self.live_npugraphify_fns -= 1 if self.live_npugraphify_fns == 0: self._finalize_tree_manager() - def _finalize_tree_manager(self): + def _finalize_tree_manager(self) -> None: if not self.lock.locked(): raise RuntimeError("check self.lock.locked() fail") self.tree_manager = None - def add_strong_reference(self, fn: Callable[..., Any]): + def add_strong_reference(self, fn: Callable[..., Any]) -> None: with self.lock: self.live_npugraphify_fns += 1 @@ -266,14 +269,14 @@ torch._C._stash_obj_in_tls("tree_manager_containers", local.tree_manager_contain torch._C._stash_obj_in_tls("tree_manager_locks", local.tree_manager_locks) -def mark_step_begin(): +def mark_step_begin() -> None: "Indicates that a new iteration of inference or training is about to begin." 
# iterate down to distinguish from GenerationTracking counter MarkStepBox.mark_step_counter -= 1 -def reset_npugraph_trees(): +def reset_npugraph_trees() -> None: "Clear all npugraph trees" # see shutdown below for why this is necessary container_dict = get_obj(local, "tree_manager_containers") @@ -292,7 +295,7 @@ def reset_npugraph_trees(): MarkStepBox.mark_step_counter = 0 -def get_obj(thread_local, attr_name): +def get_obj(thread_local: Any, attr_name: str) -> Any: if hasattr(thread_local, attr_name): return getattr(thread_local, attr_name) else: @@ -301,7 +304,7 @@ def get_obj(thread_local, attr_name): return torch._C._get_obj_in_tls(attr_name) -def get_container(device_index: int): +def get_container(device_index: int) -> TreeManagerContainer: container_dict = get_obj(local, "tree_manager_containers") lock = get_obj(local, "tree_manager_locks")[device_index] @@ -313,28 +316,46 @@ def get_container(device_index: int): def get_manager( - device_index: int, create_if_none_exists=True + device_index: int, create_if_none_exists: bool = True ) -> Optional[NPUGraphTreeManager]: if create_if_none_exists: return get_container(device_index).get_tree_manager() return get_container(device_index).tree_manager -def npugraphify_impl(model, inputs, static_input_idxs, *args, **kwargs): +def npugraphify_impl( + model: ModelType, + inputs: List[InputType], + static_input_idxs: Sequence[int], + *args: Any, + **kwargs: Any, +) -> ModelType: fn_cache: Dict[Tuple[int, ...], Callable[..., Any]] = {} # Detect int inputs: we need to index on these int_key = [i for i, v in enumerate(inputs) if isinstance(v, int)] get_ints: Any = operator.itemgetter(*int_key) if int_key else lambda _: None + has_warn = False + del inputs - def deferred_npugraphify(inputs): + def deferred_npugraphify(inputs: List[InputType]) -> OutputType: + nonlocal has_warn + int_key = get_ints(inputs) fn = fn_cache.get(int_key) if fn is not None: return fn(inputs) + if int_key is None: + log.info("recording npugraph tree for graph without symints") + else: + log.info("recording npugraph tree for symint key %s", int_key) + + if not has_warn: + has_warn = maybe_warning_due_to_dynamic_shape(fn_cache, int_key) + # first get indices we need to check to align, then update our static inputs, # and finally copy check_input_idxs = get_input_idxs_to_check(inputs, static_input_idxs) @@ -351,16 +372,18 @@ def npugraphify_impl(model, inputs, static_input_idxs, *args, **kwargs): def npugraphify( - model, - inputs, - static_input_idxs=(), + model: ModelType, + inputs: List[InputType], + static_input_idxs: Sequence[int] = (), *, device_index: int, is_backward: bool, is_inference: bool, stack_traces: Optional[StackTraces] = None, constants: Tuple[torch.Tensor, ...] = (), -): + placeholders: Tuple[PlaceholderInfo, ...] = (), + mutated_input_idxs: Tuple[int, ...] = (), +) -> Tuple[ModelType, OutputType]: manager = get_container(device_index).get_tree_manager() if is_backward and is_inference: raise RuntimeError("check is_backward and is_inference fail") @@ -377,6 +400,8 @@ def npugraphify( stack_traces, mode, constants, + placeholders, + mutated_input_idxs, ) @@ -392,8 +417,8 @@ class StorageWeakRefWrapper: def __init__( self, inp: Union[Tensor, UntypedStorage], - extra_ref_check: Optional[Callable[[], None]] = None, - ): + extra_ref_check: Optional[Callable[[], bool]] = None, + ) -> None: """ extra_ref_check is an additional check we need to run to check if the weak ref has expired. 
in checking storage use count we assume extra_ref_check @@ -410,7 +435,12 @@ class StorageWeakRefWrapper: self.extra_ref_check = extra_ref_check @classmethod - def from_weakref_and_data_ptr(cls, cdata, data_ptr, extra_ref_check=None): + def from_weakref_and_data_ptr( + cls: Type[S], + cdata: Any, + data_ptr: int, + extra_ref_check: Optional[Callable[[], bool]] = None, + ) -> StorageWeakRefWrapper: instance = cls.__new__(cls) instance._data_ptr = data_ptr instance.ref = StorageWeakRef.from_weakref(cdata) @@ -423,7 +453,7 @@ class StorageWeakRefWrapper: return self.ref.cdata - def swap_weakref(self, cdata): + def swap_weakref(self, cdata: Any) -> None: self.ref.__del__() self.ref.cdata = cdata @@ -431,10 +461,10 @@ class StorageWeakRefWrapper: "NB: returns the data ptr even if the storage has expired" return self._data_ptr - def remove_extra_reference(self): + def remove_extra_reference(self) -> None: self.extra_ref_check = None - def expired(self): + def expired(self) -> bool: if self.extra_ref_check is not None and not self.extra_ref_check(): return False @@ -442,7 +472,7 @@ class StorageWeakRefWrapper: stor_count = torch_npu._C._storage_Use_Count(self.ref.cdata) return (stor_count - (self.extra_ref_check is not None)) == 0 - def __repr__(self): + def __repr__(self) -> str: if self.ref is None or self.ref.expired(): return f"StorageWeakRefWrapper to {self.data_ptr()}; dead" else: @@ -466,7 +496,9 @@ def maybe_deref( @contextlib.contextmanager -def _use_npu_memory_pool_manager(device, mem_pool, stream): +def _use_npu_memory_pool_manager( + device: int, mem_pool: Tuple[int, int], stream: torch.npu.Stream +) -> Generator[None, None, None]: """ Context manager to use npu graph pool for new allocations. If you use this manager all npugraph tensors in use should be reflected in the allocator or they will be overwritten. 
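The `expired()` logic typed just above pairs an optional `extra_ref_check` callback with the storage use count. A rough stand-alone sketch of that idea, using plain `weakref` and hypothetical names (`WeakHandle`, `Storage`) instead of the real `torch_npu._C._storage_Use_Count` binding:

import weakref
from typing import Callable, Optional

class WeakHandle:
    """Hypothetical stand-in for StorageWeakRefWrapper."""
    def __init__(self, obj: object,
                 extra_ref_check: Optional[Callable[[], bool]] = None) -> None:
        self._ref = weakref.ref(obj)              # non-owning reference
        self._extra_ref_check = extra_ref_check

    def expired(self) -> bool:
        # A callback that returns False vetoes the "expired" verdict,
        # mirroring the early return in the wrapper above.
        if self._extra_ref_check is not None and not self._extra_ref_check():
            return False
        return self._ref() is None                # dead once collected

class Storage:                                    # stand-in for an NPU storage
    pass

s = Storage()
h = WeakHandle(s)
print(h.expired())   # False while `s` is alive
del s
print(h.expired())   # True once the last strong reference is gone

In the wrapper above, the use count is additionally reduced by one when `extra_ref_check` is set, since the cached tensor itself is expected to hold one extra reference; the sketch leaves that refinement out.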
@@ -527,16 +559,17 @@ class NPUWarmupNode: def __init__( self, wrapped_function: WrappedFunction, - parent, + parent: Optional[Union[NPUGraphNode, NPUWarmupNode]], npu_graphs_pool: Tuple[int, int], existing_npu_graph: Optional[torch.npu.NPUGraph], device_index: int, stack_traces: Optional[StackTraces], stream: torch.npu.Stream, already_warm: bool, - ): + graph_id: GraphID, + ) -> None: self.wrapped_function = wrapped_function - self.parent = parent + self.parent: Optional[Union[NPUGraphNode, NPUWarmupNode]] = parent self.npu_graphs_pool = npu_graphs_pool self.outputs_weakrefs: List[Optional[StorageWeakRefWrapper]] = [] self.tensor_weakrefs: List[Optional[TensorWeakRef]] = [] @@ -546,8 +579,9 @@ class NPUWarmupNode: self.stack_traces = stack_traces self.stream = stream self.already_warm = already_warm + self.id = graph_id - def run(self, new_inputs): + def run(self, new_inputs: Any) -> OutputType: if self.has_run: raise RuntimeError("Wrapped function should never be run twice") @@ -559,17 +593,17 @@ class NPUWarmupNode: if t() } - def get_non_npugraph_inps(): - non_npugraph_inps = set() + def get_non_npugraph_inps() -> List[weakref.ReferenceType[UntypedStorage]]: + non_npugraph_inps = [] for t in itertools.chain(new_inputs, self.wrapped_function.constants): if ( isinstance(t, torch.Tensor) and t.untyped_storage().data_ptr() not in existing_path_data_ptrs ): - non_npugraph_inps.add(t.untyped_storage().data_ptr()) + non_npugraph_inps.append(weakref.ref(t.untyped_storage())) return non_npugraph_inps - non_npugraph_inps = get_non_npugraph_inps() + non_npugraph_inps_storages = get_non_npugraph_inps() if config.triton.slow_path_cudagraph_asserts and not self.already_warm: refs = list(self.path_live_weakrefs()) @@ -582,16 +616,28 @@ class NPUWarmupNode: ), get_history_recording(): out = self.wrapped_function.model(new_inputs) + # We need to know which outputs are allocated within the cudagraph pool + # so that we can deallocate them at the beginning of the next cudagraph step, + # and set their access to error. + # We use a weakref to the inputs storage, in case a block which was previously + # allocated to the general caching allocator pool gets reallocated to a private pool. 
+ + non_npugraph_inps_storage_ptrs = set() + for storage in non_npugraph_inps_storages: + s = storage() + if s is not None: + non_npugraph_inps_storage_ptrs.add(s._cdata) + if not len(new_inputs) == 0: raise RuntimeError("check len(new_inputs) == 0 fail") # sdpa returns cpu tensors when not recording npu graph - def add_ref(out_tensor): + def add_ref(out_tensor: Any) -> bool: return ( out_tensor is not None and isinstance(out_tensor, torch.Tensor) and out_tensor.is_npu - and out_tensor.untyped_storage().data_ptr() not in non_npugraph_inps + and out_tensor.untyped_storage()._cdata not in non_npugraph_inps_storage_ptrs and out_tensor.untyped_storage().data_ptr() != 0 ) @@ -603,20 +649,17 @@ class NPUWarmupNode: ) if config.triton.slow_path_cudagraph_asserts and not self.already_warm: - out_refs = self.path_live_weakrefs() - new_storages = [ - t - for t in out_refs - if t.data_ptr() not in non_npugraph_inps - ] - check_memory_pool(self.device_index, self.npu_graphs_pool, new_storages) + out_refs = list(self.path_live_weakrefs()) + check_memory_pool(self.device_index, self.npu_graphs_pool, out_refs) return out @property - def _path_from_root(self): + def _path_from_root( + self, + ) -> Generator[Union[NPUGraphNode, NPUWarmupNode], None, None]: nodes = [] - node = self + node: Union[NPUGraphNode, NPUWarmupNode] = self while node: nodes.append(node) node = node.parent @@ -630,9 +673,15 @@ class NPUWarmupNode: if is_live(output): yield output - def all_outputs_are_dead(self): + def all_outputs_are_dead(self) -> bool: return not list(self.path_live_weakrefs()) + def _is_npu_graph_recorded_tensor(self, t: torch.Tensor) -> bool: + for storage_weak_ref in self.path_live_weakrefs(): + if t.untyped_storage().data_ptr() == storage_weak_ref.data_ptr(): + return True + return False + # Aliases for List that say what the indices denote InputList = List # input indexes @@ -658,7 +707,7 @@ class AliasesPriorGraphOutput(OutputAliasInfo): index: PathOutputIndex - def __init__(self, index: PathOutputIndex): + def __init__(self, index: PathOutputIndex) -> None: if not isinstance(index, tuple): raise RuntimeError("check isinstance(index, tuple) fail") self.index = index @@ -671,7 +720,7 @@ class AliasesNewOutput(OutputAliasInfo): index: int - def __init__(self, index): + def __init__(self, index: int) -> None: if not isinstance(index, int): raise RuntimeError("check isinstance(index, int) fail") self.index = index @@ -705,7 +754,7 @@ class NPUGraphNode: device_index: int, stack_traces: Optional[StackTraces], stream: torch.npu.Stream, - ): + ) -> None: if not isinstance(inputs, (list, tuple)): raise RuntimeError("check isinstance(inputs, (list, tuple))") self.wrapped_function = wrapped_function @@ -714,6 +763,13 @@ class NPUGraphNode: self.stack_traces = stack_traces self.stream = stream + # Enable re-record a cudagraph when static tensor address changed. + # if not we should error when it changed. + self.rerecord_if_static_inputs_change = ( + torch._dynamo.config.inline_inbuilt_nn_modules + or torch._inductor.config.triton.cudagraph_support_input_mutation + ) + # if this is a root parent will be None. 
use weakref to prevent reference cycle self._parent = weakref.ref(parent) if parent is not None else None # reference to the shared memory pool for the entire npu graphs tree @@ -741,7 +797,7 @@ class NPUGraphNode: node.outputs_weakrefs for node in self._path_from_root ] - self.path_stacktraces: LevelList[StackTraces] = [ + self.path_stacktraces: LevelList[Optional[StackTraces]] = [ node.stack_traces for node in self._path_from_root ] @@ -754,16 +810,52 @@ class NPUGraphNode: if isinstance(t, torch.Tensor) and self._is_npu_graph_recorded_tensor(t) ] + # (depth, offset) of live tensors which are alias of previous graph outputs + self.live_npugraph_managed_path_refs: InputList[Optional[PathOutputIndex]] = [ + ( + self._is_alias_of_live_recorded_tensor(t) + if isinstance(t, torch.Tensor) + else None + ) + for t in inputs + ] + + # when replay, preserve the liveness of an input if it AliasesPriorGraphOutput + # and also aliases an output of the current CUDAGraphNode + self.preserved_aliased_inputs: InputList[bool] = [False] * len(inputs) + self.static_input_idxs: List[int] = list( set(wrapped_function.static_input_idxs) | set(self.npugraph_managed_idxs) ) + self.non_static_input_idx: LevelList[int] = [ + i + for i in range(len(inputs)) + if i not in self.static_input_idxs + ] + + counters["inductor"]["npugraph_recorded_non_static_inputs"] += len( + self.non_static_input_idx + ) + + self.non_managed_static_input_idxs: LevelList[int] = [ + i + for i in wrapped_function.static_input_idxs + if i not in self.npugraph_managed_idxs + ] + + def maybe_get_static_data_ptr( + idx: int, + inputs: List[InputType], + static_input_idxs: List[int], + ) -> Optional[int]: + inp = inputs[idx] + if isinstance(inp, torch.Tensor) and idx in static_input_idxs: + return inp.data_ptr() + return None + self.static_input_data_ptrs: InputList[Optional[int]] = [ - ( - inputs[i].data_ptr() - if isinstance(inputs[i], torch.Tensor) and i in self.static_input_idxs - else None - ) + maybe_get_static_data_ptr(i, inputs, self.static_input_idxs) for i in range(len(inputs)) ] @@ -822,7 +914,7 @@ class NPUGraphNode: # we reconstruct tensors at the correct data pointers of our inputs which are # non owning and do not prevent deallocation. On subsequent executions, input values # will be copied over to these tensors. 
- self.reconstructed_inputs: InputList[Union[Tensor, int]] = [ + self.reconstructed_inputs: List[InputType] = [ self._reconstruct_from_tensor_metadata(self._tensor_metadata(x)) if isinstance(x, torch.Tensor) else x @@ -868,9 +960,8 @@ class NPUGraphNode: self.static_output_tensors: OutputList[Optional[Tensor]] = [] # Cleared after recording - self.recording_outputs: Optional[ - OutputList[Union[torch.Tensor, int]] - ] = self._record(wrapped_function.model, recording_inputs) + self.recording_outputs: Optional[OutputType] = self._record( + wrapped_function.model, recording_inputs) self.outputs_metadata: OutputList[Union[Dict[str, Any], int, None]] = [] # As with inputs, we do not want to keep the outputs permanently alive because that would prevent @@ -890,6 +981,42 @@ class NPUGraphNode: self.graph.replay() + def _copy_inputs_and_remove_from_src( + self, dsts: List[InputType], srcs: List[InputType] + ) -> None: + dst_tensors = [] + src_tensors = [] + for idx in self.non_static_input_idx: + if not isinstance(srcs[idx], torch.Tensor): + continue + expanded_dims = self.expanded_dims[idx] + dst_tensors.append(index_expanded_dims(dsts[idx], expanded_dims)) # type: ignore[arg-type] + src_tensors.append(index_expanded_dims(srcs[idx], expanded_dims)) # type: ignore[arg-type] + srcs[idx] = None # type: ignore[call-overload] + # Fails on empty lists + if dst_tensors: + torch._foreach_copy_(dst_tensors, src_tensors) + + def check_static_inputs_are_stable(self, new_inputs: List[InputType]) -> None: + # avoid checking managed tensor static points since we already checked those in check_invariants + if ( + not self.rerecord_if_static_inputs_change + and not torch_npu._C._tensors_data_ptrs_at_indices_equal( + new_inputs, # type: ignore[arg-type] + self.static_input_data_ptrs, + self.non_managed_static_input_idxs, + ) + ): + # this should error + error_msg = log_data_ptr_mismatch( + self.wrapped_function.placeholders, + new_inputs, + self.static_input_data_ptrs, + self.non_managed_static_input_idxs, + CheckInvariantStatus.StaticInputIdxMismatch, + ) + torch._check(False, lambda: error_msg) + def _copy_input(self, idx, dst, src): expanded_dims = self.expanded_dims[idx] dst = index_expanded_dims(dst, expanded_dims) @@ -908,7 +1035,7 @@ class NPUGraphNode: dst_record[dtype].append(dst) src_record[dtype].append(src) - def run_first_inputs(self, new_inputs): + def run_first_inputs(self, new_inputs: List[InputType]) -> OutputType: if config.triton.fast_path_cudagraph_asserts: self.debug_check_invariants_before_invocation() @@ -918,45 +1045,32 @@ class NPUGraphNode: raise RuntimeError("check len(new_inputs) == 0 fail") outputs = self.recording_outputs self.recording_outputs = None + if outputs is None: + raise RuntimeError("check outputs is not None fail") return outputs - def run(self, new_inputs): - if config.triton.fast_path_cudagraph_asserts: - self.debug_check_invariants_before_invocation() + def run(self, new_inputs: List[InputType]) -> OutputType: + self.check_static_inputs_are_stable(new_inputs) - if not len(self.static_input_data_ptrs) == len(new_inputs): - raise RuntimeError("check len(self.static_input_data_ptrs) == len(new_inputs) fail") - # NB: this ranges over non-static inputs too - dst_record = {} - src_record = {} - for idx, data_ptr in enumerate(self.static_input_data_ptrs): - if idx in self.npugraph_managed_idxs: - continue - if not isinstance(new_inputs[idx], torch.Tensor): - pass - elif data_ptr is not None: - # static input, e.g., parameter - pass - else: - # non-static input, need to copy 
it into NPU graph - dst = self.reconstructed_inputs[idx] - src = new_inputs[idx] - self._record_input(idx, dst, src, dst_record, src_record) - - for dtype in dst_record.keys(): - if dtype not in src_record.keys(): - raise RuntimeError("Record for foreach_copy failed in NPUGraphNode.run.") - torch._foreach_copy_(dst_record[dtype], src_record[dtype]) + self._copy_inputs_and_remove_from_src(self.reconstructed_inputs, new_inputs) - new_inputs.clear() self.run_graph() outputs = self.reconstruct_outputs() - self.debug_check_invariants_after_invocation() + new_inputs.clear() + + if config.triton.fast_path_cudagraph_asserts: + self.debug_check_invariants_after_invocation() + + if config.triton.force_cudagraph_sync: + torch.npu.synchronize() + + # Reset this to run the check in the future + self.static_inputs_stable = False return outputs - def reconstruct_outputs(self): + def reconstruct_outputs(self) -> OutputType: "Reconstruct output tensors according to their saved metadata and alias information" # Cached tensors will not yet be set on the first execution @@ -965,7 +1079,7 @@ class NPUGraphNode: if not self.cached_tensor_outputs: self._initialize_cached_tensors() - outputs: List[Optional[Union[int, torch.Tensor]]] = [] + outputs: OutputType = [] for i, (storage_info, metadata) in enumerate( zip(self.output_storage_alias, self.outputs_metadata) @@ -978,6 +1092,14 @@ class NPUGraphNode: cached_t = self.cached_tensor_outputs[i] if cached_t is not None: + # this output represents a fresh allocated tensor. + # We return the same TensorImpl from run to run to avoid overhead. + # autograd.Function will reset the Autograd meta of output tensors + # as part of aot_autograd, but _backward_hooks are stored on tensors separately, + # so we need to manually reset hooks. 
+ if cached_t._backward_hooks is not None: + cached_t._backward_hooks = None + # No need to update weakrefs, already correctly initialized outputs.append(cached_t) continue @@ -1047,27 +1169,28 @@ class NPUGraphNode: return output_storages - def run_graph(self): + def run_graph(self) -> None: if self.graph is None: raise RuntimeError("check self.graph is not None fail") self.graph.replay() - def all_outputs_are_dead(self): + def all_outputs_are_dead(self) -> bool: "All outputs of the path from this node to its root are dead" for depth, output_index in self.live_indices_after_graph: if is_live(self.path_weakrefs[depth][output_index]): return False return True - def _record(self, model, inputs): + def _record(self, model: ModelType, inputs: List[InputType]) -> OutputType: "Record the model" - def static_input_iter(): + def static_input_iter() -> Generator[torch.Tensor, None, None]: for i in self.wrapped_function.static_input_idxs: + _inp = inputs[i] if isinstance( - inputs[i], torch.Tensor - ) and not self._is_npu_graph_recorded_tensor(inputs[i]): - yield inputs[i] + _inp, torch.Tensor + ) and not self._is_npu_graph_recorded_tensor(_inp): + yield _inp # see: output_is_alias_of_persistent_static_inputs above static_input_persistent_storage_ptrs: Dict[int, StorageWeakRefWrapper] = {} @@ -1117,9 +1240,9 @@ class NPUGraphNode: def _add_first_outputs( self, - outputs, + outputs: OutputType, static_input_persistent_storage_ptrs: Dict[int, StorageWeakRefWrapper], - ): + ) -> None: "Add the outputs from the first invocation of the node and set up metadata" # getting liveness before we have added the outputs to path, so the length @@ -1166,6 +1289,12 @@ class NPUGraphNode: path_ref = self._is_alias_of_live_recorded_tensor(out_) if path_ref is not None: self._mark_prior_graph_output_as_aliased(path_ref) + + for idx, inp_path_ref in enumerate( + self.live_npugraph_managed_path_refs + ): + if path_ref == inp_path_ref: + self.preserved_aliased_inputs[idx] = True self.output_storage_alias.append(AliasesPriorGraphOutput(path_ref)) continue @@ -1212,7 +1341,7 @@ class NPUGraphNode: self.device, self.npu_graphs_pool, list(self.path_live_weakrefs()) ) - def _mark_prior_graph_output_as_aliased(self, index: PathOutputIndex): + def _mark_prior_graph_output_as_aliased(self, index: PathOutputIndex) -> None: "Remove a graph output from the unaliased, cached tensors in an ancestor node" depth, output_index = index node = list(self._path_from_root)[depth] @@ -1222,7 +1351,7 @@ class NPUGraphNode: raise RuntimeError("check x is not None fail") x.remove_extra_reference() - def _initialize_cached_tensors(self): + def _initialize_cached_tensors(self) -> None: # we should not be clearing output_weakrefs, and they should be set in the first # record run if not len(self.outputs_weakrefs) == len(self.outputs_metadata): @@ -1270,16 +1399,16 @@ class NPUGraphNode: self.outputs_weakrefs[i] = StorageWeakRefWrapper(out, extra_ref_check=check) self.cached_tensor_outputs.append(out) - def get_output_refcount(self, index): + def get_output_refcount(self, index: int) -> int: return sys.getrefcount(self.cached_tensor_outputs[index]) @property - def parent(self): + def parent(self) -> Optional[NPUGraphNode]: "unwraps the weakref to _parent" return self._parent() if self._parent is not None else None @property - def _path_to_root(self): + def _path_to_root(self) -> Generator[NPUGraphNode, None, None]: "Returns all nodes in the path starting at self and ending at root" node = self while node: @@ -1287,12 +1416,12 @@ class NPUGraphNode: 
node = node.parent @property - def _path_from_root(self): + def _path_from_root(self) -> Generator[NPUGraphNode, None, None]: "Returns all nodes in the path starting at the root and ending at self" nodes = reversed(list(self._path_to_root)) yield from nodes - def _is_npu_graph_recorded_tensor(self, t: torch.Tensor): + def _is_npu_graph_recorded_tensor(self, t: torch.Tensor) -> bool: "Is this tensor an output of a node in this path" for output_refs in self.path_weakrefs: for storage_weak_ref in output_refs: @@ -1323,7 +1452,7 @@ class NPUGraphNode: def _check_liveness( indices: List[PathOutputIndex], output_refs: List[List[Optional[StorageWeakRefWrapper]]], - ): + ) -> bool: "Check that all of the indices specified are dead references" for depth, output_index in indices: w = output_refs[depth][output_index] @@ -1333,7 +1462,7 @@ class NPUGraphNode: return False return True - def add_child(self, function_id: FunctionID, node: NPUGraphNode): + def add_child(self, function_id: FunctionID, node: NPUGraphNode) -> None: "Adds node as a a child of self" self.children[function_id].append(node) @@ -1366,7 +1495,7 @@ class NPUGraphNode: def debug_assert_invariants( self, expected_liveness: List[List[bool]], newly_dead: List[PathOutputIndex] - ): + ) -> None: if not config.triton.fast_path_cudagraph_asserts: return @@ -1407,12 +1536,12 @@ class NPUGraphNode: if is_live(self.path_weakrefs[depth][output_index]): raise RuntimeError("check not is_live(self.path_weakrefs[depth][output_index]) fail") - def debug_check_invariants_before_invocation(self): + def debug_check_invariants_before_invocation(self) -> None: self.debug_assert_invariants( self.recorded_liveness_before_graph, self.expected_dead_indices_before_graph ) - def debug_check_invariants_after_invocation(self): + def debug_check_invariants_after_invocation(self) -> None: self.debug_assert_invariants( self.recorded_liveness_before_graph, self.expected_dead_indices_after_graph ) @@ -1442,7 +1571,7 @@ class NPUGraphNode: if out is not None and is_live(out): yield out - def remove_node_cached_tensors(self): + def remove_node_cached_tensors(self) -> None: for t in self.cached_tensor_outputs: if t is not None: torch_npu._C._remove_cached_tensor(t) @@ -1455,17 +1584,19 @@ class NPUGraphNode: raise RuntimeError("check n is not None fail") n.remove_extra_reference() - def remove_path_cached_tensors(self): + def remove_path_cached_tensors(self) -> None: for node in self._path_from_root: node.remove_node_cached_tensors() - def clear_path_state(self): + def clear_path_state(self) -> None: "Clear the path state in this current executing node" # this doesnt actually do anything right now, leaving it as placeholder pass @staticmethod - def _tensor_metadata(x, ignore_storage_offset=True): + def _tensor_metadata( + x: torch.Tensor, ignore_storage_offset: bool = True + ) -> Dict[str, Any]: if not isinstance(x, torch.Tensor): raise RuntimeError("check isinstance(x, torch.Tensor) fail") # We ignore the storage offset for inputs, but not for outputs @@ -1481,19 +1612,19 @@ class NPUGraphNode: } def _reconstruct_from_tensor_metadata( - self, metadata: Dict[str, Any], storage=None + self, metadata: Dict[str, Any], storage: Optional[UntypedStorage] = None ) -> Tensor: s = self.create_storage(metadata) if storage is None else storage return torch_npu._C._construct_NPU_Tensor_From_Storage_And_Metadata(metadata, s) - def create_storage(self, metadata): + def create_storage(self, metadata: Dict[str, Any]) -> torch.types.Storage: return 
torch_npu._C._construct_storage_from_data_pointer( metadata["data_ptr"], metadata["device"], metadata["nbytes"] ) def _allocate_and_copy_recording_inputs( - self, inputs - ) -> List[Union[torch.Tensor, int]]: + self, inputs: List[InputType] + ) -> List[InputType]: """ Allocate inputs for non static, non npugraph managraphed managed tensors in the memory pool and copy over the tensor values. @@ -1501,7 +1632,7 @@ class NPUGraphNode: torch.npu.synchronize() self.stream.wait_stream(torch.npu.current_stream()) - recording_inputs: List[Union[Tensor, int]] = [] + recording_inputs: List[InputType] = [] with warnings.catch_warnings(record=True), torch.npu.device( self.device @@ -1518,36 +1649,75 @@ class NPUGraphNode: elif i not in self.static_input_idxs: # static_input does an allocation! recording_inputs.append(static_input(inp)) - # copy over and clear non recording input - self._copy_input(i, recording_inputs[-1], inp) - inputs[i] = None - del inp else: recording_inputs.append(inp) + self._copy_inputs_and_remove_from_src(recording_inputs, inputs) + return recording_inputs - def check_invariants(self, inputs: List[Tensor]) -> bool: + def check_invariants( + self, inputs: List[InputType] + ) -> Tuple[CheckInvariantStatus, Callable[..., str]]: """ Checks if this node can be run. The same pattern of tensor liveness and tensors managed in the npugraph private pool must remain stable. """ + _logger = functools.partial( + log_data_ptr_mismatch, + self.wrapped_function.placeholders, + inputs, + self.static_input_data_ptrs, + ) + # previously managed data pointers remain stable - for idx in self.npugraph_managed_idxs: - if inputs[idx].data_ptr() != self.static_input_data_ptrs[idx]: - return False + # this is on the hot path so moved to C++. equivalent to: + if not torch_npu._C._tensors_data_ptrs_at_indices_equal( + inputs, # type: ignore[arg-type] + self.static_input_data_ptrs, + self.npugraph_managed_idxs, + ): + status = CheckInvariantStatus.CudagraphManagedIdxMismatch + _logger = functools.partial( + _logger, + self.npugraph_managed_idxs, + status, + ) + return status, _logger if not self._check_liveness( self.expected_dead_indices_before_graph, self.path_weakrefs ): - return False + status = CheckInvariantStatus.ExpectedDeadIndicesBeforeGraphMismatch + return status, lambda: f"{status}" + + # static input data pointers should remain stable + # if we are inlining builtin nn modules we re-record in this case + # if we are not inlining builtin nn modules, we check this in check_static_inputs_are_stable + # and error if they are not stable + if ( + self.rerecord_if_static_inputs_change + and not torch_npu._C._tensors_data_ptrs_at_indices_equal( + inputs, # type: ignore[arg-type] + self.static_input_data_ptrs, + self.static_input_idxs, + ) + ): + status = CheckInvariantStatus.StaticInputIdxMismatch + _logger = functools.partial( + _logger, + self.static_input_idxs, + status, + ) + return status, _logger # the npugraph managed tensors which died upon recording must also die upon # this invocation. it is too late to check after we've replayed the graph, # because we would have already written over their memory. for idx in self.npugraph_managed_idxs: - inputs[idx] = None # type: ignore[call-overload] + if not self.preserved_aliased_inputs[idx]: + inputs[idx] = None # type: ignore[call-overload] torch._check( self._check_liveness( @@ -1556,7 +1726,7 @@ class NPUGraphNode: lambda: "graph recording observed an input tensor deallocate during graph " " recording that did not occur during replay. 
Please file an issue.", ) - return True + return CheckInvariantStatus.SUCCESS, lambda: f"{CheckInvariantStatus.SUCCESS}" def num_descendants(self) -> int: "Total number of descendents of this node" @@ -1568,12 +1738,12 @@ class NPUGraphNode: return num_desc -def get_npugraph_segments(pool_id): +def get_npugraph_segments(pool_id: Tuple[int, int]) -> Any: segments = torch.npu.memory_snapshot() return [segment for segment in segments if segment["segment_pool_id"] == pool_id] -def get_block_addrs(pool_id, live_only=True): +def get_block_addrs(pool_id: Tuple[int, int], live_only: bool = True) -> List[int]: blocks = [] for segment in get_npugraph_segments(pool_id): @@ -1587,7 +1757,7 @@ def get_block_addrs(pool_id, live_only=True): return blocks -def format_tb(frames): +def format_tb(frames: List[Any]) -> str: formatted_traceback = [] for entry in frames: @@ -1598,10 +1768,13 @@ def format_tb(frames): return "".join(traceback.format_list(formatted_traceback)) -def check_memory_pool(device, pool_id, live_storages_ptrs: List[StorageWeakRefWrapper]): +def check_memory_pool( + device: int, + pool_id: Tuple[int, int], + live_storages_ptrs: List[StorageWeakRefWrapper], +) -> None: if not all(isinstance(elem, StorageWeakRefWrapper) for elem in live_storages_ptrs): raise RuntimeError("check all(isinstance(elem, StorageWeakRefWrapper) for elem in live_storages_ptrs) fail") - # noqa: C419 unique_storages = {stor.data_ptr() for stor in live_storages_ptrs if stor()} # check if there is a divergence first, then do the expensive snapshot call after @@ -1633,7 +1806,7 @@ def check_memory_pool(device, pool_id, live_storages_ptrs: List[StorageWeakRefWr lambda: f"These storage data ptrs are not allocated in pool {pool_id} but should be {unique_storages}", ) - if allocated_not_in_live_storages != 0: + if len(allocated_not_in_live_storages) != 0: formatted = [] for dp, block in allocated_not_in_live_storages.items(): trace = format_tb(block.get("frames", [])) @@ -1689,7 +1862,7 @@ class NPUGraphTreeManager: replay. """ - def __init__(self, device_index: int): + def __init__(self, device_index: int) -> None: # roots are functions which have no dependencies on an other node. I.e., # when they are first invoked, none of their inputs are outputs are outputs # of another node, nor are there any live outputs of another node whose @@ -1699,7 +1872,7 @@ class NPUGraphTreeManager: # mapping from function id to wrapped function self.ids_to_funcs: Dict[FunctionID, WrappedFunction] = {} - self.ids_to_stack_traces: Dict[FunctionID, StackTraces] = {} + self.ids_to_stack_traces: Dict[FunctionID, Optional[StackTraces]] = {} self.warmed_up_functions: Set[FunctionID] = set() # if we fail to increment generation, and are stuck warming up, @@ -1707,6 +1880,9 @@ class NPUGraphTreeManager: self.warned_functions: Set[FunctionID] = set() torch_npu._C._set_cached_tensors_enabled(True) + # warn only once if a function mutates inputs + self.warned_mutation: Set[FunctionID] = set() + # NB: npu caching allocator will remember the stream a segment is allocated to # and only allocate that segment to the same stream. we need to use a single stream # for all allocations to the memory pool, otherwise the allocations to separate streams @@ -1733,6 +1909,19 @@ class NPUGraphTreeManager: self.graph_counter = itertools.count(0) self.func_counter = itertools.count(0) + # mapping from graph_id to (function id to mutation type hint) since we are + # specializing on a particular combination of Parent Node -> Function ID. 
+ self.non_npugraph_managed_mutation_hint: Dict[ + Optional[GraphID], Dict[FunctionID, bool] + ] = defaultdict(dict) + self.warmup_node_counter = itertools.count(start=-1, step=-1) + + # mapping from graph_id to (function id to re-record count). We fall back to + # eager function if a function is re-recorded frequently on a node. + self.num_rerecord: Dict[Optional[GraphID], Dict[FunctionID, int]] = defaultdict( + lambda: defaultdict(lambda: 0) + ) + # whether we the current node is in a state of warmup, recording, execution. If # there is no current node the state will be ExecutionState.None. self.path_state = ExecutionState.NONE @@ -1742,7 +1931,7 @@ class NPUGraphTreeManager: # when there is no output from a previous recording or execution whose memory # we need to respect in the npu caching allocation. If you incremented generation, # this will also be none, as ignore those allocations. - self.current_node: Optional[NPUGraphNode] = None + self.current_node: Optional[Union[NPUGraphNode, NPUWarmupNode]] = None # current generation of npugraph invocations. when torch.compile is run # we increment the current generation. are willing to ignore live outputs @@ -1772,25 +1961,76 @@ class NPUGraphTreeManager: # then mod2(mod1(x)).sum().backward() self.running_forwards_with_pending_backwards = False + self.mode: Optional[CompilationMode] = None + self.disable_invalidate_aliases = False - def run(self, new_inputs: List[Tensor], function_id: FunctionID): + def run(self, new_inputs: List[InputType], function_id: FunctionID) -> OutputType: if self.graph is None: raise RuntimeError("Running NPUGraph after shutdown") + self.mode = self.id_to_mode[function_id] out = self._run(new_inputs, function_id) # The forwards are only pending following invocation, not before - mode = self.id_to_mode[function_id] - if mode == CompilationMode.FORWARD: + if self.mode == CompilationMode.FORWARD: self.running_forwards_with_pending_backwards = True - elif mode == CompilationMode.BACKWARD: + elif self.mode == CompilationMode.BACKWARD: self.running_forwards_with_pending_backwards = False return out - def set_to_running_backward(self): + def set_to_running_backward(self) -> None: self.running_forwards_with_pending_backwards = False + self.mode = CompilationMode.BACKWARD + + def _get_npu_graph_recorded_tensor_checker(self) -> Callable[[Tensor], bool]: + return ( + self.current_node._is_npu_graph_recorded_tensor + if isinstance(self.current_node, (NPUGraphNode, NPUWarmupNode)) + else lambda _: False + ) + + def new_warmup_node_id(self) -> GraphID: + return GraphID(next(self.warmup_node_counter)) + + def _update_non_npugraph_managed_mutation( + self, function_id: FunctionID, inputs: List[InputType] + ) -> None: + node_id = self._get_node_id() + maybe_mutation_str = check_for_mutation( + self.ids_to_funcs[function_id], + inputs, + self._get_npu_graph_recorded_tensor_checker(), + ) + if maybe_mutation_str: + self.non_npugraph_managed_mutation_hint[node_id][function_id] = True + # warn once per function_id + if function_id in self.warned_mutation: + return + self.warned_mutation.add(function_id) + log_cudagraph_skip_and_bump_counter(maybe_mutation_str) + else: + self.non_npugraph_managed_mutation_hint[node_id][function_id] = False - def _run(self, new_inputs: List[Tensor], function_id: FunctionID): + def _get_node_id(self) -> Optional[GraphID]: + if self.current_node is None: + return None + elif isinstance(self.current_node, (NPUGraphNode, NPUWarmupNode)): + return self.current_node.id + else: + raise RuntimeError(f"Unknown 
node type {type(self.current_node)}") + + def exceed_rerecord_limit( + self, node_id: Optional[GraphID], function_id: FunctionID + ) -> bool: + if torch._dynamo.config.inline_inbuilt_nn_modules: + return False + + return ( + self.num_rerecord[node_id][function_id] + > torch._inductor.config.triton.cudagraph_unexpected_rerecord_limit + ) + + def _run(self, new_inputs: List[InputType], function_id: FunctionID) -> OutputType: # we will try to end the current execution lazily, since # we dont want to do unnecessary checking of the existing outputs # on the hot path, but both recording and warmup only happen once @@ -1801,6 +2041,17 @@ class NPUGraphTreeManager: if self.in_warmup: self.try_end_curr_warmup(function_id) + node_id = self._get_node_id() + if function_id not in self.non_npugraph_managed_mutation_hint[node_id]: + self._update_non_npugraph_managed_mutation(function_id, new_inputs) + + # Early exit if the function mutates inputs which are neither parameters/buffers nor + # npugraph recorded tensors. This check should happen after `try_end_curr_recording` + # and `try_end_curr_warmup` which may change self.current_node. + if self.non_npugraph_managed_mutation_hint[node_id][function_id] or \ + self.exceed_rerecord_limit(node_id, function_id): + return self.ids_to_funcs[function_id].model(new_inputs) + # warming up a function and subsequentally recording may use different memory addresses # because both depend on the state of the caching allocator. if we warm up graph A, # then warm up graph B and make more allocations, the subsequent recording of A will not @@ -1811,26 +2062,44 @@ class NPUGraphTreeManager: function_id in self.warmed_up_functions or config.triton.skip_cudagraph_warmup ) - ) or self.in_warmup: + or self.in_warmup + or config.triton.force_cudagraphs_warmup + ): # If we are in the middle of executing npu graphs, then we need to checkpoint memory state. # Both Recording and Warmup will be reflected in the allocator and dont need changes if self.path_state == ExecutionState.EXECUTION: self.apply_checkpoint_execution_state_in_allocator() - return self.run_eager(new_inputs, function_id) + with dynamo_timed( + "NPUGraphTreeManager.run_eager", + log_pt2_compile_event=True, + ): + out = self.run_eager(new_inputs, function_id) + return out + if isinstance(self.current_node, NPUWarmupNode): + raise RuntimeError("self.current_node is NPUWarmupNode object") child_nodes = ( self.roots if self.current_node is None else self.current_node.children ) if not self.in_recording: + unexpected_rerecord, unexpected_rerecord_reason = False, lambda: "" for child in child_nodes[function_id]: # here we are checking memory consistency between recording and execution, # as well as things like stability of tensor locations, etc # and other - if child.check_invariants(new_inputs): + status, status_logger = child.check_invariants(new_inputs) + if status == CheckInvariantStatus.SUCCESS: return self.execute_node(child, new_inputs) + if ( + status == CheckInvariantStatus.StaticInputIdxMismatch + or status == CheckInvariantStatus.CudagraphManagedIdxMismatch + ): + unexpected_rerecord = True + unexpected_rerecord_reason = status_logger + # now that we know the new function can't be run as a child of the # current node, if it is a root, try to end the current execution. 
# as noted above, we want to do this lazily to avoid having to @@ -1842,6 +2111,29 @@ class NPUGraphTreeManager: if self.current_node is None: return self.run(new_inputs, function_id) + if len(self.ids_to_funcs[function_id].mutated_input_idxs) > 0: + self._update_non_npugraph_managed_mutation(function_id, new_inputs) + if self.non_npugraph_managed_mutation_hint[self._get_node_id()][ + function_id + ]: + return self.ids_to_funcs[function_id].model(new_inputs) + + # nb: run before checkpointing because checkpointing is slow, and we will + # be using the eager caching allocator pool which does not require live + # accounting of tensors in cudagraph allocator + if unexpected_rerecord: + curr_node_id = self._get_node_id() + self.num_rerecord[curr_node_id][function_id] += 1 + if self.exceed_rerecord_limit(curr_node_id, function_id): + _id = curr_node_id.id if curr_node_id else None + log_cudagraph_skip_and_bump_counter( + f"skipping npuagraph due to function {function_id.id} exceeding max " + f"re-recording limit " + f"(={torch._inductor.config.triton.cudagraph_unexpected_rerecord_limit}) " + f"on npugraph node {_id} due to {unexpected_rerecord_reason()}." + ) + return self.ids_to_funcs[function_id].model(new_inputs) + # at this point, we necessarily will do a new recording self.debug_fail_counter += 1 @@ -1850,9 +2142,15 @@ class NPUGraphTreeManager: self.apply_checkpoint_execution_state_in_allocator() # now, we are in a recording state ! - return self.record_function(new_inputs, function_id) + with dynamo_timed( + "NPUGraphTreeManager.record_function", + log_pt2_compile_event=True, + ): + out = self.record_function(new_inputs, function_id) - def shutdown(self): + return out + + def shutdown(self) -> None: """ Remove all cached tensors in all nodes. Because cached tensors can hold gradients which in turn might reference a backward which invokes a NPU Graph Node, we have to manually clear them on shutdown @@ -1873,8 +2171,17 @@ class NPUGraphTreeManager: self.roots = None # type: ignore[assignment] self.current_node = None - def record_function(self, new_inputs, function_id) -> List[Optional[Tensor]]: + def record_function( + self, new_inputs: List[InputType], function_id: FunctionID + ) -> OutputType: + if isinstance(self.current_node, NPUWarmupNode): + raise RuntimeError("self.current_node is NPUWarmupNode object") graph_id = self.new_graph_id() + log.debug( + "Recording function %d of graph recording id %d", + function_id.id, + graph_id.id, + ) torch.npu.synchronize() node = NPUGraphNode( self.ids_to_funcs[function_id], @@ -1896,16 +2203,27 @@ class NPUGraphTreeManager: torch.npu.synchronize() return node.run_first_inputs(new_inputs) - def execute_node(self, node: NPUGraphNode, new_inputs) -> List[Optional[Tensor]]: + def execute_node( + self, node: NPUGraphNode, new_inputs: List[InputType] + ) -> OutputType: self.current_node = node self.path_state = ExecutionState.EXECUTION self.update_generation() return node.run(new_inputs) - def run_eager(self, new_inputs, function_id: FunctionID): + def run_eager( + self, new_inputs: List[InputType], function_id: FunctionID + ) -> OutputType: # this is only stored on current node, because when we start a new path, # we will deallocate it already_warm = function_id in self.warmed_up_functions + if not already_warm: + log.debug("Running warmup of function %d", function_id.id) + else: + log.debug( + "Running eager of function %d because ancestor needed to warm up", + function_id.id, + ) self.warmed_up_functions.add(function_id) node = NPUWarmupNode( 
self.ids_to_funcs[function_id], @@ -1916,6 +2234,7 @@ class NPUGraphTreeManager: self.ids_to_stack_traces[function_id], self.stream, already_warm, + self.new_warmup_node_id(), ) self.current_node = node self.path_state = ExecutionState.WARMUP @@ -1930,20 +2249,24 @@ class NPUGraphTreeManager: def add_function( self, - model, - inputs, - static_input_idxs, - stack_traces, - mode, - constants, - ) -> Tuple[Callable[..., Any], List[Optional[Tensor]]]: + model: ModelType, + inputs: List[InputType], + static_input_idxs: Sequence[int], + stack_traces: Optional[StackTraces], + mode: CompilationMode, + constants: Tuple[torch.Tensor, ...], + placeholders: Tuple[PlaceholderInfo, ...], + mutated_input_idxs: Tuple[int, ...], + ) -> Tuple[ModelType, OutputType]: id_for_func = self.new_func_id() self.ids_to_stack_traces[id_for_func] = stack_traces self.ids_to_funcs[id_for_func] = WrappedFunction( model, - static_input_idxs, + list(static_input_idxs), id_for_func, tuple(t for t in constants if isinstance(t, torch.Tensor) and t.is_npu), + placeholders, + mutated_input_idxs, ) self.id_to_mode[id_for_func] = mode fn = functools.partial(self.run, function_id=id_for_func) @@ -1953,11 +2276,11 @@ class NPUGraphTreeManager: return fn, fn(inputs) @property - def in_recording(self): + def in_recording(self) -> bool: return self.path_state == ExecutionState.RECORDING @property - def in_warmup(self): + def in_warmup(self) -> bool: return self.path_state == ExecutionState.WARMUP def get_roots(self) -> Iterator[NPUGraphNode]: @@ -1965,16 +2288,18 @@ class NPUGraphTreeManager: yield from nodes @property - def current_node(self): + def current_node(self) -> Optional[Union[NPUGraphNode, NPUWarmupNode]]: return self._current_node @current_node.setter - def current_node(self, value): + def current_node( + self, value: Optional[Union[NPUGraphNode, NPUWarmupNode]] + ) -> None: self._current_node = value if value is None: self.path_state = ExecutionState.NONE - def update_generation(self): + def update_generation(self) -> None: self.current_gen = self.get_curr_generation() @staticmethod @@ -1985,7 +2310,7 @@ class NPUGraphTreeManager: return GenerationTracker.generation @staticmethod - def user_invoked_mark_step(): + def user_invoked_mark_step() -> bool: return MarkStepBox.mark_step_counter != 0 def can_start_new_generation(self) -> bool: @@ -1997,7 +2322,7 @@ class NPUGraphTreeManager: return not self.running_forwards_with_pending_backwards - def in_new_torch_compile_invocation(self): + def in_new_torch_compile_invocation(self) -> bool: return self.current_gen != self.get_curr_generation() def try_end_curr_recording(self, function_id: FunctionID) -> None: @@ -2042,19 +2367,21 @@ class NPUGraphTreeManager: if self.current_node.all_outputs_are_dead(): self.clear_current_path_state_and_set_to_none() - def try_end_curr_warmup(self, function_id: FunctionID): + def try_end_curr_warmup(self, function_id: FunctionID) -> None: if self.can_start_new_generation(): self.dealloc_current_path_weakrefs() self.current_node = None return + if self.current_node is None: + raise RuntimeError("check self.current_node is not None fail") if self.current_node.all_outputs_are_dead(): self.current_node = None return self.check_warn_on_unable_to_start_executing(function_id) - def check_warn_on_unable_to_start_executing(self, function_id: FunctionID): + def check_warn_on_unable_to_start_executing(self, function_id: FunctionID) -> None: "Warn if we in a potential loop where we are unable to hit fast path" if ( function_id in self.warned_functions @@ 
-2062,6 +2389,8 @@ class NPUGraphTreeManager: ): return + if self.current_node is None: + raise RuntimeError("check self.current_node is not None fail") existing_nodes = [ node for node in self.current_node._path_from_root @@ -2087,9 +2416,25 @@ class NPUGraphTreeManager: "before each model invocation" ) - def dealloc_current_path_weakrefs(self): + @staticmethod + def format_dealloc_msg(stack_trace: Optional[str]) -> str: + stack_trace = ( + stack_trace.strip() if stack_trace else "[Could not find stack trace]" + ) + return ( + "Error: accessing tensor output of NPUGraphs that has been overwritten by a subsequent run. " + f"Stack trace: {stack_trace}. " + "To prevent overwriting, clone the tensor outside of torch.compile() " + "or call torch.compiler.npugraph_mark_step_begin() before each model invocation." + ) + + def dealloc_current_path_weakrefs(self) -> None: + if self.current_node is None: + raise RuntimeError("check self.current_node is not None fail") # we could also allow the these weak refs to continue to be allocated, # but that adds some complications. + + stor_stack_trace: Dict[int, Optional[str]] = {} for node in self.current_node._path_from_root: if not len(node.tensor_weakrefs) == len(node.stack_traces): raise RuntimeError("check len(node.tensor_weakrefs) == len(node.stack_traces) fail") @@ -2098,34 +2443,60 @@ class NPUGraphTreeManager: if ten is None: continue - stack_trace = ( - stack_trace.strip() - if stack_trace - else "[Could not find stack trace]" - ) - msg = ( - "Error: accessing tensor output of NPUGraphs that has been overwritten by a subsequent run. " - f"Stack trace: {stack_trace}. " - "To prevent overwriting, clone the tensor outside of torch.compile() " - "or call torch.compiler.npugraph_mark_step_begin() before each model invocation." + torch_npu._C._set_storage_access_error_msg( + ten, self.format_dealloc_msg(stack_trace) ) + # we would to enable the following assertion, but an internal model failed with a command + # that does not repro. 
len(node.outputs_weakrefs) == len(node.stack_traces) + # so, pessimistically assume that they might differ by doing the debug info + # loop separately from the dealloc loop + if self.disable_invalidate_aliases: + continue + + for storage_ref, stack_trace in zip( + node.outputs_weakrefs, node.stack_traces + ): + if not storage_ref: + continue + + stor_stack_trace[storage_ref.data_ptr()] = stack_trace + deleted = set() for storage_ref in self.current_node.path_live_weakrefs(): - if storage_ref() and storage_ref.data_ptr() not in deleted: + _storage_deref = storage_ref() + if _storage_deref and storage_ref.data_ptr() not in deleted: deleted.add(storage_ref.data_ptr()) - torch_npu._C._free_And_Remove_DeleterFn(storage_ref()) - def clear_current_path_state_and_set_to_none(self): + msg = self.format_dealloc_msg( + stor_stack_trace.get(storage_ref.data_ptr()) + ) + torch_npu._C._free_And_Remove_DeleterFn(_storage_deref) + + if self.disable_invalidate_aliases: + continue + + torch_npu._C._set_storage_data_ptr_access_error_msg(_storage_deref, msg) + + + def clear_current_path_state_and_set_to_none(self) -> None: + if not isinstance(self.current_node, NPUGraphNode): + raise RuntimeError("check self.current_node is NPUGraphNode object fail") self.current_node.clear_path_state() self.current_node = None - def apply_checkpoint_execution_state_in_allocator(self): + def apply_checkpoint_execution_state_in_allocator(self) -> None: """ Checkpoint the current execution state in the caching allocator so that additional npugraph recordings can be made respecting existent live storages. """ + if not isinstance(self.current_node, NPUGraphNode): + raise RuntimeError("check self.current_node is NPUGraphNode object fail") self.debug_checkpointing_counter += 1 + log.debug( + "Checkpointing cuda caching allocator state. 
Number of checkpoints %d", + self.debug_checkpointing_counter, + ) state = self.current_node.checkpointed_caching_state device = self.current_node.device @@ -2139,7 +2510,8 @@ class NPUGraphTreeManager: self.current_node.remove_path_cached_tensors() live_storages_wrappers = list(self.current_node.path_live_weakrefs()) - live_storages_weak_refs = [t() for t in live_storages_wrappers] + # path_live_weakrefs guarantees that t() will not be None + live_storages_weak_refs: list[int] = [t() for t in live_storages_wrappers] ptrs_to_deallocate = self.current_node.data_ptrs_dead_since_invocation() torch_npu._C._npu_setCheckpointPoolState( device, state, stale_storages, live_storages_weak_refs @@ -2168,4 +2540,5 @@ class NPUGraphTreeManager: if self.current_node is None: return [] # explicitly ignoring previous recorded outputs from past path + # path_live_weakrefs() guarantees that t() will not be None return [t() for t in self.current_node.path_live_weakrefs()] diff --git a/torch_npu/utils/__init__.py b/torch_npu/utils/__init__.py index 0cb93e9951..41484e0570 100644 --- a/torch_npu/utils/__init__.py +++ b/torch_npu/utils/__init__.py @@ -19,6 +19,7 @@ from ._step import add_perf_dump_patch from .flops_count import _FlopsCounter as FlopsCounter from .affinity import _set_thread_affinity as set_thread_affinity from .affinity import _reset_thread_affinity as reset_thread_affinity +from ._graph_tree import _apply_npugraph_tree_methods # init flopcount diff --git a/torch_npu/utils/_graph_tree.py b/torch_npu/utils/_graph_tree.py new file mode 100644 index 0000000000..6de299a02d --- /dev/null +++ b/torch_npu/utils/_graph_tree.py @@ -0,0 +1,207 @@ +import functools +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Sequence, + Tuple, +) + +import torch +from torch.utils._ordered_set import OrderedSet +from torch._dynamo import utils as dynamo_utils +from torch._inductor import config +from torch._inductor.compile_fx import ( + get_input_idxs_to_check, + index_expanded_dims_and_copy_, + static_input, +) +from torch._inductor.cudagraph_utils import ( + _get_use_stack_trace, + format_default_skip_message, + PlaceholderInfo, +) +from torch._inductor.output_code import get_expanded_dims +from torch._inductor.utils import ( + align_inputs_from_check_idxs, + copy_misaligned_inputs, + remove_unaligned_input_idxs, + InputType, +) + + +def npugraph_mark_step_begin(): + from torch_npu.npu._graph_tree import mark_step_begin + mark_step_begin() + + +def check_multiple_devices_or_any_cpu_nodes( + device_node_mapping: Dict[torch.device, torch.fx.Node] +) -> Optional[str]: + cpu_node = device_node_mapping.get(torch.device("cpu")) + if cpu_node: + msg = f"cpu device ({cpu_node.name})" + stack_trace = _get_use_stack_trace(cpu_node) + if stack_trace: + return format_default_skip_message(f"{msg}. Found from : \n {stack_trace}") + return format_default_skip_message(msg) + + if ( + len(device_node_mapping) == 1 + and next(iter(device_node_mapping.keys())).type == "npu" + ): + return None + + keys_repr = (repr(key) for key in device_node_mapping.keys()) + return format_default_skip_message(f"multiple devices: {', '.join(keys_repr)}") + + +def npugraphify( + model: Callable[..., Any], + static_input_idxs: Sequence[int] = (), + *, + device_index: int, + stack_traces: List[Optional[str]], + is_backward: bool, + is_inference: bool, + constants: Tuple[torch.Tensor, ...] = (), + placeholders: Sequence[PlaceholderInfo] = (), + mutated_input_idxs: Tuple[int, ...] 
= (), +) -> Callable[..., Any]: + from torch_npu.npu._graph_tree import npugraphify_impl as new_npugraphify_impl + npugraphify_fn: Callable[..., Any] + if config.triton.cudagraph_trees: + npugraphify_fn = functools.partial( + new_npugraphify_impl, + device_index=device_index, + stack_traces=stack_traces, + is_backward=is_backward, + is_inference=is_inference, + constants=constants, + placeholders=placeholders, + mutated_input_idxs=mutated_input_idxs, + ) + else: + npugraphify_fn = npugraphify_impl + + compiled_fn = None + + def run(new_inputs: Sequence[InputType]) -> Any: + nonlocal compiled_fn + if compiled_fn is None: + with dynamo_utils.dynamo_timed( + "npugraphify", + log_pt2_compile_event=True, + ), dynamo_utils.preserve_rng_state(): + compiled_fn = npugraphify_fn(model, new_inputs, static_input_idxs) + return compiled_fn(new_inputs) + + return run + + +def npugraphify_impl( + model: Callable[..., Any], + inputs: List[torch.Tensor], + static_input_idxs: Sequence[int] = (), +) -> Callable[[List[InputType]], Any]: + """ + Assumes inputs[static_input_idxs[i]] are always the same memory address + """ + check_input_idxs = get_input_idxs_to_check(inputs, static_input_idxs) # type: ignore[arg-type] + static_input_idxs: OrderedSet[int] = OrderedSet( + remove_unaligned_input_idxs(inputs, static_input_idxs) # type: ignore[arg-type] + ) + copy_misaligned_inputs(inputs, check_input_idxs) # type: ignore[arg-type] + + if not isinstance(inputs, list): + raise RuntimeError("check isinstance(inputs, list) fail") + + inps_expanded_dims = [ + get_expanded_dims(x) if idx not in static_input_idxs else [] + for idx, x in enumerate(inputs) + ] + + # allocate static tensor inputs + static_inputs = [ + x + if not isinstance(x, torch.Tensor) + else static_input(x) + if idx not in static_input_idxs + else x.detach() + for idx, x in enumerate(inputs) + ] + + # copy over input values for fresh allocations + for idx, (x, expanded_dims) in enumerate(zip(inputs, inps_expanded_dims)): + if isinstance(x, torch.Tensor) and idx not in static_input_idxs: + index_expanded_dims_and_copy_(static_inputs[idx], x, expanded_dims) + + # warmup + torch.npu.synchronize() + stream = torch.npu.Stream() + stream.wait_stream(torch.npu.current_stream()) + # copy static_inputs because it will be cleared in model + with torch.npu.stream(stream): + model(list(static_inputs)) + stream.synchronize() + torch.npu.current_stream().wait_stream(stream) + torch.npu.synchronize() + + # record + graph = torch.npu.NPUGraph() + with torch.npu.graph(graph, stream=stream, capture_error_mode="thread_local"): + static_outputs = model(list(static_inputs)) + if not isinstance(static_outputs, (list, tuple)): + static_outputs = (static_outputs,) + + if config.size_asserts: + + def run(new_inputs: List[InputType]) -> Callable[[List[InputType]], Any]: + if not len(static_inputs) == len(new_inputs): + raise RuntimeError("check len(static_inputs) == len(new_inputs) fail") + for idx, (dst, src, expanded_dims) in enumerate( + zip(static_inputs, new_inputs, inps_expanded_dims) + ): + if not isinstance(dst, torch.Tensor): + continue + if not isinstance(src, torch.Tensor): + raise RuntimeError("check isinstance(src, torch.Tensor) fail") + if idx in static_input_idxs: + if not dst.data_ptr() == src.data_ptr(): + raise RuntimeError("check dst.data_ptr() == src.data_ptr() fail") + else: + # could make one single op of multiple slices + # and avoid dispatch. 
+ # Could also pre-index the `dst` tensors + index_expanded_dims_and_copy_(dst, src, expanded_dims) + new_inputs.clear() + graph.replay() + return static_outputs + + else: + copy_indices = [ + idx + for idx in range(len(static_inputs)) + if idx not in static_input_idxs + ] + + def run(new_inputs: List[InputType]) -> Callable[[List[InputType]], Any]: + for idx in copy_indices: + expanded_dims = inps_expanded_dims[idx] + src = new_inputs[idx] + if not isinstance(src, torch.Tensor): + raise RuntimeError("check isinstance(src, torch.Tensor) fail") + index_expanded_dims_and_copy_(static_inputs[idx], src, expanded_dims) + new_inputs.clear() + graph.replay() + return static_outputs + + return align_inputs_from_check_idxs(run, check_input_idxs) + + +def _apply_npugraph_tree_methods(): + torch._inductor.compile_fx.cudagraphify = npugraphify + torch._inductor.cudagraph_utils.check_multiple_devices_or_any_cpu_nodes = check_multiple_devices_or_any_cpu_nodes + torch.compiler.npugraph_mark_step_begin = npugraph_mark_step_begin -- Gitee From a71243984e749e4f849d5bebbf3f140d7abf812e Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 21 Jul 2025 14:09:35 +0000 Subject: [PATCH 287/328] !23263 Update op_plugin commit id Merge pull request !23263 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index f5aeebc51c..64189ddc66 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit f5aeebc51cf84d37be9f02a492c33988877742da +Subproject commit 64189ddc66c3ec1e8f6787094ebd82a99f00b6ef -- Gitee From c21c93cbe554d03241b5b7c3b21c8bd9cf49a7fc Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Tue, 22 Jul 2025 01:24:28 +0000 Subject: [PATCH 288/328] !23199 Update torchair commit id Merge pull request !23199 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 75d97976fa..9818eff91d 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 75d97976fa6b861d06595e1c4e477e8abfee2b30 +Subproject commit 9818eff91d926398e6bc2a733d044efe21629477 -- Gitee From c0b2af86de607d8c80b77700da7d4635d603435d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E5=85=89=E6=B5=A9?= Date: Tue, 22 Jul 2025 03:32:06 +0000 Subject: [PATCH 289/328] =?UTF-8?q?!23256=20add=20new=20feature=20set/get?= =?UTF-8?q?=5Fdevice=5Flimit=20Merge=20pull=20request=20!23256=20from=20?= =?UTF-8?q?=E9=83=AD=E5=85=89=E6=B5=A9/v2.7.1-set1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/npu/test_torch_npu.py | 6 +++ third_party/acl/inc/acl/acl_rt.h | 36 ++++++++++++++ third_party/acl/libs/acl.cpp | 3 ++ torch_npu/csrc/core/npu/NPUFunctions.cpp | 39 +++++++++++++++ torch_npu/csrc/core/npu/NPUFunctions.h | 6 +++ .../csrc/core/npu/interface/AclInterface.cpp | 39 +++++++++++++++ .../csrc/core/npu/interface/AclInterface.h | 12 +++++ torch_npu/csrc/npu/Module.cpp | 48 +++++++++++++++++++ torch_npu/npu/__init__.py | 4 +- torch_npu/npu/npu_config.py | 43 ++++++++++++++++- 10 files changed, 234 insertions(+), 2 deletions(-) diff --git a/test/npu/test_torch_npu.py b/test/npu/test_torch_npu.py index 0e2c96e1bd..29709ef991 100644 --- a/test/npu/test_torch_npu.py +++ b/test/npu/test_torch_npu.py @@ -78,6 +78,12 @@ class TorchNPUDeviceTestCase(TestCase): torch_npu.npu.synchronize() 
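        # total memory reported by mem_get_info should stay constant across the test; only free memory varies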
after_free_memory, after_total_memory = torch_npu.npu.mem_get_info(0) self.assertEqual(before_total_memory, after_total_memory) + + @unittest.skip("CANN doesn't support now.") + def test_set_device_res_limit(self): + ans_dict = {'cube_core_num': 12, 'vector_core_num': 24} + torch.npu.set_device_limit(torch.npu.current_device(), 12, 24) + self.assertEqual(ans_dict, torch.npu.get_device_limit(torch.npu.current_device())) class TorchNPUMemoryApiTestCase(TestCase): def test_npu_memory_stats(self): diff --git a/third_party/acl/inc/acl/acl_rt.h b/third_party/acl/inc/acl/acl_rt.h index 98b520ba4a..ecc36f3812 100755 --- a/third_party/acl/inc/acl/acl_rt.h +++ b/third_party/acl/inc/acl/acl_rt.h @@ -181,6 +181,11 @@ typedef enum aclrtLastErrLevel { ACL_RT_THREAD_LEVEL = 0, } aclrtLastErrLevel; +typedef enum { + ACL_RT_DEV_RES_CUBE_CORE = 0, + ACL_RT_DEV_RES_VECTOR_CORE, +} aclrtDevResModelType; + typedef void* aclrtDrvMemHandle; typedef void (*aclrtCallback)(void *userData); @@ -1541,6 +1546,37 @@ ACL_FUNC_VISIBILITY aclError aclrtPeekAtLastError(aclrtLastErrLevel level); */ ACL_FUNC_VISIBILITY aclError aclrtGetLastError(aclrtLastErrLevel level); +/** + * @ingroup AscendCL + * @brief Get the value of the current device's limited resources + * @param [in] deviceId the device id + * @param [in] type resources type + * @param [out] value resources limit value + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclrtGetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t* value); + +/** + * @ingroup AscendCL + * @brief Set the value of the current device's limited resources + * @param [in] deviceId the device id + * @param [in] type resource type + * @param [in] value resource limit value + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclrtSetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t value); + +/** + * @ingroup AscendCL + * @brief Reset the value of the current device's limited resources + * @param [in] deviceId the device id + * @retval ACL_SUCCESS The function is successfully executed. 
+ * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclrtResetDeviceResLimit(int32_t deviceId); + #ifdef __cplusplus } #endif diff --git a/third_party/acl/libs/acl.cpp b/third_party/acl/libs/acl.cpp index 4f24e6bf04..9bb32581dd 100644 --- a/third_party/acl/libs/acl.cpp +++ b/third_party/acl/libs/acl.cpp @@ -18,6 +18,9 @@ aclError aclmdlSetDump(const char *configPath){return 0;} aclError aclmdlInitDump(){return 0;} aclError aclmdlFinalizeDump(){return 0;} aclError aclrtDeviceTaskAbort(int32_t deviceId, uint32_t timeout){return 0;} +aclError aclrtGetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t* value){return 0;} +aclError aclrtSetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t value){return 0;} +aclError aclrtResetDeviceResLimit(int32_t deviceId){return 0;} // Stream aclError aclrtCreateStream(aclrtStream *stream) { return 0; } diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index 77067fa03b..0ceb84847b 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -5,6 +5,7 @@ #include "torch_npu/csrc/core/npu/NPUStream.h" #include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" +#include "third_party/acl/inc/acl/acl_rt.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/sanitizer/NPUTrace.h" #endif @@ -298,4 +299,42 @@ void stream_synchronize(aclrtStream stream) NPU_CHECK_ERROR(aclrtSynchronizeStream(stream)); } +aclError SetDeviceResLimit(int32_t device, int32_t type, uint32_t value) +{ + std::lock_guard lock(mtx); + if (used_devices.find(device) == used_devices.end()) { + TORCH_CHECK(false, "NPU device ", device, " has not been initialized! Can not get device resource limit"); + } + TORCH_CHECK(device >= 0, "device id must be positive!", PTA_ERROR(ErrCode::VALUE)); + c10_npu::acl::aclrtDevResModelType restype = static_cast(type); + aclError err = c10_npu::acl::AclrtSetDeviceResLimit(device, restype, value); + NPU_CHECK_ERROR(err); + return err; +} + +uint32_t GetDeviceResLimit(int32_t device, int32_t type) +{ + std::lock_guard lock(mtx); + if (used_devices.find(device) == used_devices.end()) { + TORCH_CHECK(false, "NPU device ", device, " has not been initialized! Can not get device resource limit"); + } + TORCH_CHECK(device >= 0, "device id must be positive!", PTA_ERROR(ErrCode::VALUE)); + c10_npu::acl::aclrtDevResModelType restype = static_cast(type); + uint32_t value; + NPU_CHECK_ERROR(c10_npu::acl::AclrtGetDeviceResLimit(device, restype, &value)); + return value; +} + +aclError ResetDeviceResLimit(int32_t device) +{ + std::lock_guard lock(mtx); + if (used_devices.find(device) == used_devices.end()) { + TORCH_CHECK(false, "NPU device ", device, " has not been initialized! 
Can not reset device resource limit"); + } + TORCH_CHECK(device >= 0, "device id must be positive!", PTA_ERROR(ErrCode::VALUE)); + aclError err = c10_npu::acl::AclrtResetDeviceResLimit(device); + NPU_CHECK_ERROR(err); + return err; +} + } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUFunctions.h b/torch_npu/csrc/core/npu/NPUFunctions.h index 4f978d4185..e162f8fe8f 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.h +++ b/torch_npu/csrc/core/npu/NPUFunctions.h @@ -81,6 +81,12 @@ void SetTargetDevice(); int GetLocalDevice(); +aclError SetDeviceResLimit(int32_t device, int32_t type, uint32_t value); + +C10_NPU_API uint32_t GetDeviceResLimit(int32_t deviceId, int32_t type); + +aclError ResetDeviceResLimit(int32_t deviceId); + enum class SyncDebugMode { L_DISABLED = 0, L_WARN, L_ERROR }; // it's used to store npu synchronization state diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 30a4280edc..841976f3ba 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -90,6 +90,9 @@ LOAD_FUNCTION(aclrtMemExportToShareableHandle) LOAD_FUNCTION(aclrtMemSetPidToShareableHandle) LOAD_FUNCTION(aclrtMemImportFromShareableHandle) LOAD_FUNCTION(aclrtDeviceGetBareTgid) +LOAD_FUNCTION(aclrtGetDeviceResLimit) +LOAD_FUNCTION(aclrtSetDeviceResLimit) +LOAD_FUNCTION(aclrtResetDeviceResLimit) aclprofStepInfoPtr init_stepinfo() { typedef aclprofStepInfoPtr(*npdInitFunc)(); @@ -1033,5 +1036,41 @@ aclError AclrtDeviceGetBareTgid(int32_t *pid) return func(pid); } +aclError AclrtGetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t* value) +{ + typedef aclError (*AclrtGetDeviceResLimit)(int32_t, aclrtDevResModelType, uint32_t*); + static AclrtGetDeviceResLimit func = nullptr; + if (func == nullptr) { + func = (AclrtGetDeviceResLimit) GET_FUNC(aclrtGetDeviceResLimit); + } + + TORCH_CHECK(func, "Failed to find function aclrtGetDeviceResLimit", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(deviceId, type, value); +} + +aclError AclrtSetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t value) +{ + typedef aclError (*AclrtSetDeviceResLimit)(int32_t, aclrtDevResModelType, uint32_t); + static AclrtSetDeviceResLimit func = nullptr; + if (func == nullptr) { + func = (AclrtSetDeviceResLimit) GET_FUNC(aclrtSetDeviceResLimit); + } + + TORCH_CHECK(func, "Failed to find function aclrtSetDeviceResLimit", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(deviceId, type, value); +} + +aclError AclrtResetDeviceResLimit(int32_t deviceId) +{ + typedef aclError (*AclrtResetDeviceResLimit)(int32_t); + static AclrtResetDeviceResLimit func = nullptr; + if (func == nullptr) { + func = (AclrtResetDeviceResLimit) GET_FUNC(aclrtResetDeviceResLimit); + } + + TORCH_CHECK(func, "Failed to find function aclrtResetDeviceResLimit", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(deviceId); +} + } // namespace acl } // namespace c10 diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index 373aca671f..6b16535aa9 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -32,6 +32,12 @@ enum aclrtStreamStatus { }; using aclrtStreamStatus = enum aclrtStreamStatus; +enum aclrtDevResModelType { + ACL_RT_DEV_RES_CUBE_CORE = 0, + ACL_RT_DEV_RES_VECTOR_CORE = 1, +}; +using aclrtDevResModelType = enum aclrtDevResModelType; + /** aclprofStepInfo is provide by 
acl, it used to be store dispatch op info. */ @@ -245,5 +251,11 @@ aclError AclrtMemImportFromShareableHandle(uint64_t shareableHandle, int32_t dev aclError AclrtDeviceGetBareTgid(int32_t *pid); +aclError AclrtGetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t* value); + +aclError AclrtSetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uint32_t value); + +aclError AclrtResetDeviceResLimit(int32_t deviceId); + } // namespace acl } // namespace c10_npu diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index d335acc4e3..bfb0cdb0c1 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -1726,6 +1726,51 @@ static PyObject* THNPModule_add_p2p_access(PyObject* self, PyObject *args) END_HANDLE_TH_ERRORS } +static PyObject* THNPModule_set_device_res_limit(PyObject* self, PyObject *args) +{ + HANDLE_TH_ERRORS + PyObject* device = nullptr; + PyObject* type = nullptr; + PyObject* value = nullptr; + + if (!PyArg_ParseTuple(args, "OOO", &device, &type, &value)) { + throw torch::TypeError("Pybind failed to parse parameters." + + PTA_ERROR(ErrCode::TYPE)); + } + int32_t device_ = THPUtils_unpackLong(device); + int32_t type_ = THPUtils_unpackLong(type); + uint32_t value_ = static_cast(THPUtils_unpackUInt32(value)); + c10_npu::SetDeviceResLimit(device_, type_, value_); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPModule_get_device_res_limit(PyObject* self, PyObject *args) +{ + HANDLE_TH_ERRORS + PyObject* device = nullptr; + PyObject* type = nullptr; + + if (!PyArg_ParseTuple(args, "OO", &device, &type)) { + throw torch::TypeError("Pybind failed to parse parameters." + + PTA_ERROR(ErrCode::TYPE)); + } + int32_t device_ = THPUtils_unpackLong(device); + int32_t type_ = THPUtils_unpackLong(type); + uint32_t value = c10_npu::GetDeviceResLimit(device_, type_); + return PyLong_FromUnsignedLong(value); + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPModule_reset_device_res_limit(PyObject* self, PyObject *args) +{ + HANDLE_TH_ERRORS + int32_t device = THPUtils_unpackLong(args); + c10_npu::ResetDeviceResLimit(device); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + static struct PyMethodDef THNPModule_methods[] = { {"_npu_init", (PyCFunction)THNPModule_initExtension, METH_NOARGS, nullptr}, {"_npu_set_run_yet_variable_to_false", (PyCFunction)THNPModule_set_run_yet_variable_to_false_wrap, METH_NOARGS, nullptr}, @@ -1790,6 +1835,9 @@ static struct PyMethodDef THNPModule_methods[] = { {"_add_ipc_pid", (PyCFunction)THNPModule_add_ipc_pid, METH_VARARGS, nullptr}, {"_get_ipc_pid", (PyCFunction)THNPModule_get_ipc_pid, METH_NOARGS, nullptr}, {"_add_p2p_access", (PyCFunction)THNPModule_add_p2p_access, METH_VARARGS, nullptr}, + {"_npu_get_device_res_limit", (PyCFunction)THNPModule_get_device_res_limit, METH_VARARGS, nullptr}, + {"_npu_set_device_res_limit", (PyCFunction)THNPModule_set_device_res_limit, METH_VARARGS, nullptr}, + {"_npu_reset_device_res_limit", (PyCFunction)THNPModule_reset_device_res_limit, METH_O, nullptr}, {nullptr}}; TORCH_NPU_API PyMethodDef* THNPModule_get_methods() diff --git a/torch_npu/npu/__init__.py b/torch_npu/npu/__init__.py index c76f9e2c14..7210d6e431 100644 --- a/torch_npu/npu/__init__.py +++ b/torch_npu/npu/__init__.py @@ -114,7 +114,9 @@ __all__ = [ "graph_task_group_begin", "graph_task_group_end", "graph_task_update_begin", - "graph_task_update_end" + "graph_task_update_end", + "set_device_limit", + "get_device_limit" ] from typing import Tuple, Union, List, cast, Optional diff --git 
a/torch_npu/npu/npu_config.py b/torch_npu/npu/npu_config.py index 2233f7841c..5ca745339f 100644 --- a/torch_npu/npu/npu_config.py +++ b/torch_npu/npu/npu_config.py @@ -6,12 +6,14 @@ import torch_npu import torch_npu._C from torch_npu.utils._path_manager import PathManager from torch_npu.utils._error_code import ErrCode, pta_error, prof_error +from .utils import _get_device_index # this file is used to enhance the npu frontend API by set_option or other. __all__ = ["set_option", "set_aoe", "set_compile_mode", "set_mm_bmm_format_nd", "get_mm_bmm_format_nd", - "is_jit_compile_false", "finalize_dump", "init_dump", "set_dump"] + "is_jit_compile_false", "finalize_dump", "init_dump", "set_dump", + "set_device_limit", "get_device_limit"] _option_map = {"ACL_PRECISION_MODE": ["allow_fp32_to_fp16", "must_keep_origin_dtype"], "ACL_OP_SELECT_IMPL_MODE": ["high_performance", "high_precision"], @@ -170,3 +172,42 @@ class _allowHF32Conv: hf32_value = torch_npu._C._npu_getOption("ALLOW_CONV_HF32") return (hf32_value is None) or (hf32_value.decode() == "") or (hf32_value.decode() == "enable") return None + + +class _call_once_class: + def __init__(self, func): + self.func = func + self.called = False + self.result = None + + def __call__(self, *args, **kwargs): + if self.called: + raise RuntimeError(f"Function '{self.func.__name__}' has already been called, \ + You can only set this interface once.") + + self.called = True + self.result = self.func(*args, **kwargs) + return self.result + + +@_call_once_class +def set_device_limit(device, cube_num=-1, vector_num=-1): + from torch_npu.npu import device_count + device_id = _get_device_index(device, optional=True) + if device_id < 0 or device_id >= device_count(): + raise AssertionError("Invalid device id" + pta_error(ErrCode.VALUE)) + torch_npu.npu._lazy_init() + if cube_num != -1: + torch_npu._C._npu_set_device_res_limit(device_id, 0, cube_num) + if vector_num != -1: + torch_npu._C._npu_set_device_res_limit(device_id, 1, vector_num) + + +def get_device_limit(device): + from torch_npu.npu import device_count + device_id = _get_device_index(device, optional=True) + if device_id < 0 or device_id >= device_count(): + raise AssertionError("Invalid device id" + pta_error(ErrCode.VALUE)) + torch_npu.npu._lazy_init() + return {"cube_core_num": torch_npu._C._npu_get_device_res_limit(device_id, 0), \ + "vector_core_num": torch_npu._C._npu_get_device_res_limit(device_id, 1)} \ No newline at end of file -- Gitee From 40b29cbfef3137542dbb4471cb63a5e10cbc2655 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 22 Jul 2025 11:09:38 +0000 Subject: [PATCH 290/328] !23284 Update op_plugin commit id Merge pull request !23284 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 64189ddc66..c94178b515 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 64189ddc66c3ec1e8f6787094ebd82a99f00b6ef +Subproject commit c94178b515bd4c1cc88f6598a72c7d019fa10b7a -- Gitee From 64cac7434989e7da429f23de06da6e4b88eb4aa9 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 22 Jul 2025 14:24:34 +0000 Subject: [PATCH 291/328] !23302 Update op_plugin commit id Merge pull request !23302 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index c94178b515..c96fd62800 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ 
-Subproject commit c94178b515bd4c1cc88f6598a72c7d019fa10b7a +Subproject commit c96fd62800c46eee97466b5eb80faf80a1ad6e8f -- Gitee From 8489d49e7d0ed8a084292653a7d082cdf7c7ac45 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Wed, 23 Jul 2025 02:08:33 +0000 Subject: [PATCH 292/328] !23310 Update torchair commit id Merge pull request !23310 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 9818eff91d..61ab3e37a0 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 9818eff91d926398e6bc2a733d044efe21629477 +Subproject commit 61ab3e37a06ee45fb620715f8d3191c9d874f46d -- Gitee From c66e2c4e9b58d9b8214ccd7f3db4c791413cbd6b Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Wed, 23 Jul 2025 02:08:34 +0000 Subject: [PATCH 293/328] !23310 Update torchair commit id Merge pull request !23310 from torchair_robot/v2.7.1 -- Gitee From 172e02d62c556b8512685d10d797597994848c50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E5=85=89=E6=B5=A9?= Date: Wed, 23 Jul 2025 03:17:08 +0000 Subject: [PATCH 294/328] =?UTF-8?q?!23306=20modify=20readme=20Merge=20pull?= =?UTF-8?q?=20request=20!23306=20from=20=E9=83=AD=E5=85=89=E6=B5=A9/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 21 ++++++++++++--------- README.zh.md | 21 ++++++++++++--------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 39ee21800e..fb8522c6a4 100644 --- a/README.md +++ b/README.md @@ -150,6 +150,9 @@ Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for m | CANN Version | Supported PyTorch Version | Supported Extension Version | Github Branch | |-----------------------|---------------------------|-----------------------------|-------------------| +| CANN 8.2.RC1 | 2.6.0 | 2.6.0 | v2.6.0-7.1.0 | +| | 2.5.1 | 2.5.1.post1 | v2.5.1-7.1.0 | +| | 2.1.0 | 2.1.0.post13 | v2.1.0-7.1.0 | | CANN 8.2.RC1.alpha002 | 2.7.1 | 2.7.1rc1 | v2.7.1 | | CANN 8.1.RC1 | 2.5.1 | 2.5.1 | v2.5.1-7.0.0 | | | 2.4.0 | 2.4.0.post4 | v2.4.0-7.0.0 | @@ -243,11 +246,11 @@ The version branches of AscendPyTorch have the following maintenance phases: | **PyTorch** | **Maintenance Policies** | **Status** | **Launch Date** | **Subsequent Status** | **EOL Date** | |-------------|--------------------------|-------------|-----------------|-------------------------------------------------------------------|--------------| -| 2.7.1 | Regular Release | Development | 2025/06/06 | Expected to enter maintenance status from December 6, 2025 | | -| 2.6.0 | Long Term Support | Development | 2025/03/31 | Expected to enter maintenance status from March 31, 2026 | | -| 2.5.1 | Regular Release | Development | 2024/11/08 | Expected to enter maintenance status from April 8, 2025 | | -| 2.4.0 | Regular Release | Development | 2024/10/15 | Expected to enter maintenance status from June 15, 2025 | | -| 2.3.1 | Regular Release | Development | 2024/06/06 | Expected to enter maintenance status from June 7, 2025 | | +| 2.7.1 | Long Term Support | Development | 2025/06/06 | Expected to enter maintenance status from October 30, 2026 | | +| 2.6.0 | Regular Release | Development | 2025/07/25 | Expected to enter maintenance status from January 25, 2026 | - | +| 2.5.1 | Regular Release | Development | 2024/11/08 | Expected to enter maintenance status from August 8, 2025 | | +| 
2.4.0 | Regular Release | Maintained | 2024/10/15 | Expected to enter maintenance free status from June 15, 2026 | | +| 2.3.1 | Regular Release | Maintained | 2024/06/06 | Expected to enter maintenance free status from June 7, 2026 | | | 2.2.0 | Regular Release | Maintained | 2024/04/01 | Expected to enter maintenance free status from September 10, 2025 | | | 2.1.0 | Long Term Support | Development | 2023/10/15 | Expected to enter maintenance status from September 15, 2025 | | | 2.0.1 | Regular Release | EOL | 2023/7/19 | | 2024/3/14 | @@ -261,10 +264,10 @@ For more detailed information on installation guides, model migration, training/ | Document Name | Document Link | |------------------------------------------|-------------------------------------------------------------------------------------------------------------------------| -| Installation Guide | [link](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/configandinstg/instg/insg_0001.html) | -| Network Model Migration and Training | [link](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/ptmoddevg/trainingmigrguide/PT_LMTMOG_0003.html) | -| Operator Adaptation | [link](https://www.hiascend.com/document/detail/zh/canncommercial/80RC3/operatordev/tbeaicpudevg/atlasopdev_10_0086.html) | -| API List (PyTorch and Custom Interfaces) | [link](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/apiref/apilist/ptaoplist_000002.html) | +| Installation Guide | [link](https://www.hiascend.com/document/detail/zh/Pytorch/710/configandinstg/instg/insg_0001.html) | +| Network Model Migration and Training | [link](https://www.hiascend.com/document/detail/zh/Pytorch/710/ptmoddevg/trainingmigrguide/PT_LMTMOG_0003.html) | +| Operator Adaptation | [link](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/operatordev/tbeaicpudevg/atlasopdev_10_0086.html) | +| API List (PyTorch and Custom Interfaces) | [link](https://www.hiascend.com/document/detail/zh/Pytorch/710/apiref/apilist/ptaoplist_000002.html) | ## License diff --git a/README.zh.md b/README.zh.md index bddb97a048..d1fa12a060 100644 --- a/README.zh.md +++ b/README.zh.md @@ -159,6 +159,9 @@ print(z) | CANN版本 | 支持的PyTorch版本 | 支持的Extension版本 | Gitee分支 | |-----------------------|--------------|------------------|-------------------| +| CANN 8.2.RC1 | 2.6.0 | 2.6.0 | v2.6.0-7.1.0 | +| | 2.5.1 | 2.5.1.post1 | v2.5.1-7.1.0 | +| | 2.1.0 | 2.1.0.post13 | v2.1.0-7.1.0 | | CANN 8.2.RC1.alpha002 | 2.7.1 | 2.7.1rc1 | v2.7.1 | | CANN 8.1.RC1 | 2.5.1 | 2.5.1 | v2.5.1-7.0.0 | | | 2.4.0 | 2.4.0.post4 | v2.4.0-7.0.0 | @@ -245,11 +248,11 @@ AscendPyTorch版本分支的维护阶段如下: | **PyTorch版本** | **维护策略** | **当前状态** | **发布时间** | **后续状态** | **EOL日期** | |---------------|----------|----------|------------|----------------------|-----------| -| 2.7.1 | 常规分支 | 开发 | 2025/06/06 | 预计2025/12/06起进入维护状态 | - | -| 2.6.0 | 长期支持 | 开发 | 2025/03/31 | 预计2026/03/31起进入维护状态 | - | -| 2.5.1 | 常规分支 | 开发 | 2024/11/08 | 预计2025/04/08起进入维护状态 | - | -| 2.4.0 | 常规分支 | 开发 | 2024/10/15 | 预计2025/06/15起进入维护状态 | - | -| 2.3.1 | 常规分支 | 开发 | 2024/06/06 | 预计2025/06/07起进入维护状态 | | +| 2.7.1 | 长期分支 | 开发 | 2025/06/06 | 预计2026/10/30起进入维护状态 | - | +| 2.6.0 | 常规分支 | 开发 | 2025/07/25 | 预计2026/01/25起进入维护状态 | - | +| 2.5.1 | 常规分支 | 开发 | 2024/11/08 | 预计2025/08/08起进入维护状态 | - | +| 2.4.0 | 常规分支 | 维护 | 2024/10/15 | 预计2026/06/15起进入无维护状态 | - | +| 2.3.1 | 常规分支 | 维护 | 2024/06/06 | 预计2026/06/07起进入无维护状态 | | | 2.2.0 | 常规分支 | 维护 | 2024/04/01 | 预计2025/09/10起进入无维护状态 | | | 2.1.0 | 长期支持 | 开发 | 2023/10/15 | 预计2025/09/15起进入维护状态 | | | 2.0.1 | 常规分支 | EOL | 
2023/7/19 | | 2024/3/14 | @@ -267,10 +270,10 @@ AscendPyTorch版本分支的维护阶段如下: | 文档名称 | 文档链接 | | -------------------------- | ------------------------------------------------------------ | -| 安装指南 | [参考链接](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/configandinstg/instg/insg_0001.html) | -| 网络模型迁移和训练 | [参考链接](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/ptmoddevg/trainingmigrguide/PT_LMTMOG_0003.html) | -| 算子适配 | [参考链接](https://www.hiascend.com/document/detail/zh/canncommercial/80RC3/developmentguide/opdevg/Ascendcopdevg/atlas_ascendc_10_0048.html) | -| API清单(PyTorch原生接口与自定义接口) | [参考链接](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/apiref/apilist/ptaoplist_000002.html) | +| 安装指南 | [参考链接](https://www.hiascend.com/document/detail/zh/Pytorch/710/configandinstg/instg/insg_0001.html) | +| 网络模型迁移和训练 | [参考链接](https://www.hiascend.com/document/detail/zh/Pytorch/710/ptmoddevg/trainingmigrguide/PT_LMTMOG_0003.html) | +| 算子适配 | [参考链接](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/developmentguide/opdevg/Ascendcopdevg/atlas_ascendc_10_0048.html) | +| API清单(PyTorch原生接口与自定义接口) | [参考链接](https://www.hiascend.com/document/detail/zh/Pytorch/710/apiref/apilist/ptaoplist_000002.html) | ## 许可证 -- Gitee From aaefb80603d71e346ed5dc8d918293e65a240627 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 23 Jul 2025 05:24:35 +0000 Subject: [PATCH 295/328] !23321 Update op_plugin commit id Merge pull request !23321 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index c96fd62800..5e568f6307 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit c96fd62800c46eee97466b5eb80faf80a1ad6e8f +Subproject commit 5e568f6307ae6afc8911005a082189b2ad5cde55 -- Gitee From cfb9c08f913f0cfe0cc99d4c19ea4443dc69bdbb Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 23 Jul 2025 09:39:38 +0000 Subject: [PATCH 296/328] !23330 Update op_plugin commit id Merge pull request !23330 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5e568f6307..f43abda9c3 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5e568f6307ae6afc8911005a082189b2ad5cde55 +Subproject commit f43abda9c3271fcffb7420bb72b9a582d2d0f45e -- Gitee From bd89086cde25834aab7646899006d19a1340623e Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 23 Jul 2025 09:39:38 +0000 Subject: [PATCH 297/328] !23330 Update op_plugin commit id Merge pull request !23330 from pta-robot/v2.7.1 -- Gitee From c4a1a6bec607e6c6b56d3f2e1cad2d4f4728fa34 Mon Sep 17 00:00:00 2001 From: yuliangbin <1416490440@qq.com> Date: Wed, 23 Jul 2025 11:36:03 +0000 Subject: [PATCH 298/328] =?UTF-8?q?!23314=20[feature=5Ftorch=5F2.7.1]=20pr?= =?UTF-8?q?ofiler=E5=A2=9E=E5=8A=A0msprof=E6=9D=83=E9=99=90=E6=A0=A1?= =?UTF-8?q?=E9=AA=8C=20Merge=20pull=20request=20!23314=20from=20yuliangbin?= =?UTF-8?q?/cann=5F2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../prof_common_func/test_file_manager.py | 14 ++++ .../prof_common_func/test_path_manager.py | 68 ++++++++++++++++++- .../prof_common_func/_file_manager.py | 8 +++ .../prof_common_func/_path_manager.py | 33 ++++++++- .../prof_parse/_fwk_cann_relation_parser.py | 8 ++- .../prof_view/cann_parse/_cann_export.py | 48 +++++++++++-- 6 files changed, 170 
insertions(+), 9 deletions(-) diff --git a/test/profiler/analysis/prof_common_func/test_file_manager.py b/test/profiler/analysis/prof_common_func/test_file_manager.py index 84d6529fac..f030ee93d5 100644 --- a/test/profiler/analysis/prof_common_func/test_file_manager.py +++ b/test/profiler/analysis/prof_common_func/test_file_manager.py @@ -2,6 +2,7 @@ import os import shutil import stat import json +from unittest.mock import patch from torch_npu.profiler.analysis.prof_bean._ge_memory_record_bean import GeMemoryRecordBean from torch_npu.profiler.analysis.prof_common_func._file_manager import FileManager @@ -82,6 +83,19 @@ class TestFileManager(TestCase): expect = {**data1, **data2} self.assertEqual(read_data, expect) + @patch('os.stat') + @patch('os.geteuid') + def test_check_file_owner(self, mock_geteuid, mock_stat): + test_file = "file_owner.json" + test_path = os.path.join(self.tmp_dir, test_file) + mock_geteuid.return_value = 1000 + mock_stat.return_value.st_uid = 0 + self.assertTrue(FileManager.check_file_owner(test_path)) + mock_stat.return_value.st_uid = 1000 + self.assertTrue(FileManager.check_file_owner(test_path)) + mock_stat.return_value.st_uid = 9999 + self.assertFalse(FileManager.check_file_owner(test_path)) + if __name__ == "__main__": run_tests() diff --git a/test/profiler/analysis/prof_common_func/test_path_manager.py b/test/profiler/analysis/prof_common_func/test_path_manager.py index f3974cb123..650686364d 100644 --- a/test/profiler/analysis/prof_common_func/test_path_manager.py +++ b/test/profiler/analysis/prof_common_func/test_path_manager.py @@ -1,7 +1,7 @@ import os import shutil import stat -import json +from unittest.mock import patch, MagicMock from torch_npu.profiler.analysis.prof_common_func._constant import Constant from torch_npu.profiler.analysis.prof_common_func._file_manager import FileManager @@ -178,6 +178,72 @@ class TestPathManager(TestCase): ProfilerPathManager.get_realpath(link_path) self.assertEqual(os.path.realpath(self.tmp_dir), ProfilerPathManager.get_realpath(self.tmp_dir)) + @classmethod + def create_dir_structure(cls, base_path, structure): + for name, children in structure.items(): + dir_path = os.path.join(base_path, name) + os.makedirs(dir_path, exist_ok=True) + cls.create_dir_structure(dir_path, children) + + def test_get_all_subdir(self): + dir_structure = { + 'dir1': { + 'subdir1': {}, + 'subdir2': { + 'subsubdir1': {} + } + }, + 'dir2': {}, + 'dir3': { + 'subdir3': { + 'subsubdir2': { + 'deepdir': {} + } + } + } + } + self.create_dir_structure(self.tmp_dir, dir_structure) + result = ProfilerPathManager.get_all_subdir(self.tmp_dir) + expected = [ + os.path.join(self.tmp_dir, 'dir1'), + os.path.join(self.tmp_dir, 'dir1', 'subdir1'), + os.path.join(self.tmp_dir, 'dir1', 'subdir2'), + os.path.join(self.tmp_dir, 'dir1', 'subdir2', 'subsubdir1'), + os.path.join(self.tmp_dir, 'dir2'), + os.path.join(self.tmp_dir, 'dir3'), + os.path.join(self.tmp_dir, 'dir3', 'subdir3'), + os.path.join(self.tmp_dir, 'dir3', 'subdir3', 'subsubdir2'), + os.path.join(self.tmp_dir, 'dir3', 'subdir3', 'subsubdir2', 'deepdir'), + ] + self.assertCountEqual(result, expected) + + result_depth_2 = ProfilerPathManager.get_all_subdir(self.tmp_dir, max_depth=2) + expected_depth_2 = [ + os.path.join(self.tmp_dir, 'dir1'), + os.path.join(self.tmp_dir, 'dir1', 'subdir1'), + os.path.join(self.tmp_dir, 'dir1', 'subdir2'), + os.path.join(self.tmp_dir, 'dir1', 'subdir2', 'subsubdir1'), + os.path.join(self.tmp_dir, 'dir2'), + os.path.join(self.tmp_dir, 'dir3'), + 
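+            # remaining entries come from dir3's nested tree; the default max_depth of 4 is deep enough to include 'deepdir'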
os.path.join(self.tmp_dir, 'dir3', 'subdir3'), + os.path.join(self.tmp_dir, 'dir3', 'subdir3', 'subsubdir2'), + ] + self.assertCountEqual(result_depth_2, expected_depth_2) + + @patch('os.stat') + def test_path_is_other_writable(self, mock_stat): + mock_stat_result = MagicMock() + mock_stat_result.st_mode = 0o777 + mock_stat.return_value = mock_stat_result + + self.assertTrue(ProfilerPathManager.path_is_other_writable(self.tmp_dir)) + mock_stat_result.st_mode = 0o755 + self.assertFalse(ProfilerPathManager.path_is_other_writable(self.tmp_dir)) + mock_stat_result.st_mode = 0o775 + self.assertTrue(ProfilerPathManager.path_is_other_writable(self.tmp_dir)) + mock_stat_result.st_mode = 0o700 + self.assertFalse(ProfilerPathManager.path_is_other_writable(self.tmp_dir)) + if __name__ == "__main__": run_tests() diff --git a/torch_npu/profiler/analysis/prof_common_func/_file_manager.py b/torch_npu/profiler/analysis/prof_common_func/_file_manager.py index 25af63385e..d7440f03c6 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_file_manager.py +++ b/torch_npu/profiler/analysis/prof_common_func/_file_manager.py @@ -186,3 +186,11 @@ class FileManager: db_size = os.path.getsize(db_path) if db_size < 0 or db_size > Constant.MAX_FILE_SIZE: raise RuntimeError(f"Invalid db file size, please check the db file: {db_path}") + + @classmethod + def check_file_owner(cls, path): + stat_info = os.stat(path) + if stat_info.st_uid == 0: + return True + current_uid = os.geteuid() + return current_uid == stat_info.st_uid diff --git a/torch_npu/profiler/analysis/prof_common_func/_path_manager.py b/torch_npu/profiler/analysis/prof_common_func/_path_manager.py index cb12d2fb41..a56a31b7a1 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_path_manager.py +++ b/torch_npu/profiler/analysis/prof_common_func/_path_manager.py @@ -2,8 +2,7 @@ import os import re from torch_npu.utils._error_code import ErrCode, prof_error -from ....utils._path_manager import PathManager -from ._constant import Constant +from ._constant import Constant, print_error_msg __all__ = [] @@ -51,7 +50,7 @@ class ProfilerPathManager: return info_path else: return "" - + @classmethod def get_host_path(cls, cann_path: str) -> str: host_path = os.path.join(cann_path, 'host') @@ -176,3 +175,31 @@ class ProfilerPathManager: msg = f"Invalid input path is a soft chain: {path}" + prof_error(ErrCode.UNAVAIL) raise RuntimeError(msg) return os.path.realpath(path) + + @classmethod + def get_all_subdir(cls, path, max_depth=4, cur_depth=0): + paths = [] + if cur_depth > max_depth: + return paths + with os.scandir(path) as entries: + for entry in entries: + if entry.is_dir(): + full_path = entry.path + paths.append(full_path) + # Recursively obtain subdirectories and paths + paths.extend(cls.get_all_subdir(full_path, max_depth, cur_depth + 1)) + return paths + + @classmethod + def path_is_other_writable(cls, path): + stat_info = os.stat(path) + return bool(stat_info.st_mode & 0o022) + + @classmethod + def check_path_permission(cls, path): + file_stat = os.stat(path) + current_uid = os.getuid() + file_uid = file_stat.st_uid + if file_uid not in (0, current_uid): + raise PermissionError(f"The '{path}' path and current owner have inconsistent permissions." 
+ f"please execute 'chown root {path}'" + prof_error(ErrCode.PERMISSION)) diff --git a/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py b/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py index ba29da446e..dc7142738a 100644 --- a/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py @@ -1,3 +1,5 @@ +import os + from ._fwk_file_parser import FwkFileParser from .._profiler_config import ProfilerConfig from ..prof_bean._torch_op_node import TorchOpNode @@ -49,7 +51,11 @@ class FwkCANNRelationParser: def get_kernel_dict(self) -> dict: acl_to_npu_dict = CANNFileParser(self._profiler_path).get_acl_to_npu_data() if not acl_to_npu_dict and ProfilerConfig().get_level() != Constant.LEVEL_NONE: - print_error_msg("Failed to get acl to npu flow events.") + error_msg = ( + f"Failed to get acl to npu flow events. " + f"For details, please check the logs at: {os.path.join(self._profiler_path, 'logs')}" + ) + print_error_msg(error_msg) return acl_to_npu_dict dequeue_data_list = FwkFileParser(self._profiler_path).get_dequeue_data() return self.combine_kernel_dict(acl_to_npu_dict, dequeue_data_list) diff --git a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py index 7228525fae..1b21e3e409 100644 --- a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py +++ b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py @@ -23,6 +23,7 @@ from datetime import datetime from torch_npu.utils._error_code import ErrCode, prof_error from ...prof_common_func._constant import Constant, print_warn_msg, print_error_msg, print_info_msg from ...prof_common_func._path_manager import ProfilerPathManager +from ...prof_common_func._file_manager import FileManager from .._base_parser import BaseParser from ..._profiler_config import ProfilerConfig from ...prof_common_func._log import ProfilerLogger @@ -36,6 +37,7 @@ class CANNExportParser(BaseParser): error_msg = f"Export CANN Profiling data failed, please verify that the ascend-toolkit is installed and " \ f"set-env.sh is sourced. or you can execute the command to confirm the CANN Profiling " \ f"export result: msprof --export=on" + _MSPROF_PY_PATH = "tools/profiler/profiler_tool/analysis/msprof/msprof.py" def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) @@ -49,10 +51,7 @@ class CANNExportParser(BaseParser): ProfilerConfig().load_info(self._profiler_path) if not os.path.isdir(self._cann_path): return Constant.SUCCESS, None - if not self.msprof_path: - err_msg = "Export CANN Profiling data failed! msprof command not found!" 
+ prof_error(ErrCode.NOT_FOUND) - print_error_msg(err_msg) - raise RuntimeError(err_msg) + self._check_msprof_environment() self._check_prof_data_size() start_time = datetime.utcnow() @@ -77,6 +76,47 @@ class CANNExportParser(BaseParser): print_info_msg(f"CANN profiling data parsed in a total time of {end_time - start_time}") return Constant.SUCCESS, None + def _check_msprof_environment(self): + self._check_msprof_profile_path_is_valid() + self._check_msprof_cmd_path_exist() + self._check_msprof_cmd_path_permission() + self._check_msprof_py_path_permission() + + def _check_msprof_profile_path_is_valid(self): + self._check_profiler_path_parent_dir_invalid(ProfilerPathManager.get_all_subdir(self._cann_path)) + + def _check_profiler_path_parent_dir_invalid(self, paths: list): + for path in paths: + if not FileManager.check_file_owner(path): + raise RuntimeError(f"Path '{self._cann_path}' owner is neither root nor the current user. " + f"Please execute 'chown -R $(id -un) '{self._cann_path}' '.") + if ProfilerPathManager.path_is_other_writable(path): + raise RuntimeError(f"Path '{self._cann_path}' permission allow others users to write. " + f"Please execute 'chmod -R 755 '{self._cann_path}' '.") + return False + + def _check_msprof_cmd_path_exist(self): + if not self.msprof_path: + raise RuntimeError("Export CANN Profiling data failed! 'msprof' command not found!" + + prof_error(ErrCode.NOT_FOUND)) + + def _check_msprof_cmd_path_permission(self): + ProfilerPathManager.check_path_permission(self.msprof_path) + + def _check_msprof_py_path_permission(self): + msprof_script_path = self._get_msprof_script_path(self._MSPROF_PY_PATH) + if not msprof_script_path: + raise FileNotFoundError( + "Failed to find msprof.py path. Please check the CANN environment." + ) + ProfilerPathManager.check_path_permission(msprof_script_path) + + def _get_msprof_script_path(self, script_path: str) -> str: + msprof_path = os.path.realpath(self.msprof_path.strip()) + pre_path = msprof_path.split("tools")[0] + full_script_path = os.path.join(pre_path, script_path) + return full_script_path if os.path.exists(full_script_path) else "" + def _check_prof_data_size(self): if not self._cann_path: return -- Gitee From eb979601bda4ae936f4900332e7103b05291a030 Mon Sep 17 00:00:00 2001 From: panzhaohu Date: Wed, 23 Jul 2025 11:42:30 +0000 Subject: [PATCH 299/328] !23334 added the parameter mstx Merge pull request !23334 from panzhaohu/feature_v2.7.1 --- test/profiler/test_experimental_config.py | 75 ++++++++++++++++++- .../_dynamic_profiler_config_context.py | 5 ++ .../_dynamic_profiler_monitor_shm.py | 1 + torch_npu/profiler/experimental_config.py | 28 ++++--- 4 files changed, 98 insertions(+), 11 deletions(-) diff --git a/test/profiler/test_experimental_config.py b/test/profiler/test_experimental_config.py index 0397472e8e..a0e08fa34d 100644 --- a/test/profiler/test_experimental_config.py +++ b/test/profiler/test_experimental_config.py @@ -45,18 +45,30 @@ class TestExperimentalConfig(TestCase): experimental_config = _ExperimentalConfig() self.assertTrue(isinstance(experimental_config(), Cpp_ExperimentalConfig)) - def test_mstx_domain_switches_will_reset_when_msproftx_not_enabled(self): + def test_mstx_domain_switches_will_reset_when_msproftx_and_mstx_not_enabled(self): experimental_config = _ExperimentalConfig(msprof_tx=False, + mstx=False, mstx_domain_include=['x'], mstx_domain_exclude=['y']) self.assertEqual([], experimental_config._mstx_domain_include) self.assertEqual([], experimental_config._mstx_domain_exclude) + def 
test_mstx_domain_switches_will_reset_when_input_invaild_msproftx_and_mstx(self): + experimental_config = _ExperimentalConfig(msprof_tx=1, + mstx=2) + self.assertEqual(False, experimental_config._msprof_tx) + self.assertEqual(False, experimental_config._mstx) + def test_mstx_domain_switches_will_save_empty_list_when_not_set_domain_switches(self): experimental_config = _ExperimentalConfig(msprof_tx=True) self.assertEqual([], experimental_config._mstx_domain_include) self.assertEqual([], experimental_config._mstx_domain_exclude) + experimental_config = _ExperimentalConfig(mstx=True) + self.assertEqual([], experimental_config._mstx_domain_include) + self.assertEqual([], experimental_config._mstx_domain_exclude) + + def test_mstx_domain_switches_will_reset_when_input_invalid_domain_switches(self): experimental_config = _ExperimentalConfig(msprof_tx=True, mstx_domain_include=1, @@ -70,6 +82,32 @@ class TestExperimentalConfig(TestCase): self.assertEqual([], experimental_config._mstx_domain_include) self.assertEqual([], experimental_config._mstx_domain_exclude) + experimental_config = _ExperimentalConfig(mstx=True, + mstx_domain_include=1, + mstx_domain_exclude=1) + self.assertEqual([], experimental_config._mstx_domain_include) + self.assertEqual([], experimental_config._mstx_domain_exclude) + + experimental_config = _ExperimentalConfig(mstx=True, + mstx_domain_include=[1], + mstx_domain_exclude=[1]) + self.assertEqual([], experimental_config._mstx_domain_include) + self.assertEqual([], experimental_config._mstx_domain_exclude) + + experimental_config = _ExperimentalConfig(msprof_tx=True, + mstx=True, + mstx_domain_include=1, + mstx_domain_exclude=1) + self.assertEqual([], experimental_config._mstx_domain_include) + self.assertEqual([], experimental_config._mstx_domain_exclude) + + experimental_config = _ExperimentalConfig(msprof_tx=True, + mstx=True, + mstx_domain_include=[1], + mstx_domain_exclude=[1]) + self.assertEqual([], experimental_config._mstx_domain_include) + self.assertEqual([], experimental_config._mstx_domain_exclude) + def test_mstx_domain_switches_will_reset_exclude_domain_when_both_set_domain_switches(self): experimental_config = _ExperimentalConfig(msprof_tx=True, mstx_domain_include=['x'], @@ -77,6 +115,19 @@ class TestExperimentalConfig(TestCase): self.assertEqual(['x'], experimental_config._mstx_domain_include) self.assertEqual([], experimental_config._mstx_domain_exclude) + experimental_config = _ExperimentalConfig(mstx=True, + mstx_domain_include=['x'], + mstx_domain_exclude=['y']) + self.assertEqual(['x'], experimental_config._mstx_domain_include) + self.assertEqual([], experimental_config._mstx_domain_exclude) + + experimental_config = _ExperimentalConfig(msprof_tx=True, + mstx=True, + mstx_domain_include=['x'], + mstx_domain_exclude=['y']) + self.assertEqual(['x'], experimental_config._mstx_domain_include) + self.assertEqual([], experimental_config._mstx_domain_exclude) + def test_mstx_domain_switches_will_save_when_input_valid_domain_switches(self): experimental_config = _ExperimentalConfig(msprof_tx=True, mstx_domain_include=['x']) @@ -88,6 +139,28 @@ class TestExperimentalConfig(TestCase): self.assertEqual([], experimental_config._mstx_domain_include) self.assertEqual(['y'], experimental_config._mstx_domain_exclude) + experimental_config = _ExperimentalConfig(mstx=True, + mstx_domain_include=['x']) + self.assertEqual(['x'], experimental_config._mstx_domain_include) + self.assertEqual([], experimental_config._mstx_domain_exclude) + + experimental_config = 
_ExperimentalConfig(mstx=True, + mstx_domain_exclude=['y']) + self.assertEqual([], experimental_config._mstx_domain_include) + self.assertEqual(['y'], experimental_config._mstx_domain_exclude) + + experimental_config = _ExperimentalConfig(msprof_tx=True, + mstx=True, + mstx_domain_include=['x']) + self.assertEqual(['x'], experimental_config._mstx_domain_include) + self.assertEqual([], experimental_config._mstx_domain_exclude) + + experimental_config = _ExperimentalConfig(msprof_tx=True, + mstx=True, + mstx_domain_exclude=['y']) + self.assertEqual([], experimental_config._mstx_domain_include) + self.assertEqual(['y'], experimental_config._mstx_domain_exclude) + def test_host_sys_switches_will_save_empty_list_when_not_set_host_sys(self): experimental_config = _ExperimentalConfig() self.assertEqual([], experimental_config._host_sys) diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py index 5da94ae763..0c5ab8c826 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py @@ -206,6 +206,8 @@ class ConfigContext: export_type = json_data.get('PROFILE_EXPORT_TYPE', 'text').lower() msprof_tx = json_data.get('PROFILE_MSPROF_TX', 'false') msprof_tx = self.BOOL_MAP.get(msprof_tx.lower(), False) + mstx = json_data.get('PROFILE_MSTX', 'false') + mstx = self.BOOL_MAP.get(mstx.lower(), False) host_sys = DynamicProfilerUtils.parse_str_params_to_list(json_data.get('PROFILE_HOST_SYS', None)) mstx_domain_include = DynamicProfilerUtils.parse_str_params_to_list(json_data.get('PROFILE_MSTX_DOMAIN_INCLUDE', None)) mstx_domain_exclude = DynamicProfilerUtils.parse_str_params_to_list(json_data.get('PROFILE_MSTX_DOMAIN_EXCLUDE', None)) @@ -224,6 +226,7 @@ class ConfigContext: record_op_args=record_op_args, export_type=export_type, msprof_tx=msprof_tx, + mstx=mstx, host_sys=host_sys, mstx_domain_include=mstx_domain_include, mstx_domain_exclude=mstx_domain_exclude, @@ -247,6 +250,7 @@ class ConfigContext: record_op_args = exp_config.get('record_op_args', False) export_type = exp_config.get('export_type', 'text') msprof_tx = exp_config.get('msprof_tx', False) + mstx = exp_config.get('mstx', False) mstx_domain_include = exp_config.get('mstx_domain_include', None) mstx_domain_exclude = exp_config.get('mstx_domain_exclude', None) host_sys = exp_config.get('host_sys', None) @@ -263,6 +267,7 @@ class ConfigContext: record_op_args=record_op_args, export_type=export_type, msprof_tx=msprof_tx, + mstx=mstx, mstx_domain_include=mstx_domain_include, mstx_domain_exclude=mstx_domain_exclude, host_sys=host_sys, diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py index e4ebdb84a4..a9a9509d70 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py @@ -39,6 +39,7 @@ class DynamicProfilerShareMemory: "record_op_args": False, "export_type": ["text"], "msprof_tx": False, + "mstx": False, "host_sys": [], "mstx_domain_include": [], "mstx_domain_exclude": [], diff --git a/torch_npu/profiler/experimental_config.py b/torch_npu/profiler/experimental_config.py index 083253539b..a379b3f907 100644 --- a/torch_npu/profiler/experimental_config.py +++ b/torch_npu/profiler/experimental_config.py @@ -1,3 +1,4 @@ +import warnings from typing 
import Union import torch_npu._C @@ -69,6 +70,7 @@ class _ExperimentalConfig: aic_metrics: int = Constant.AicMetricsNone, l2_cache: bool = False, msprof_tx: bool = False, + mstx: bool = False, data_simplification: bool = True, record_op_args: bool = False, op_attr: bool = False, @@ -86,6 +88,7 @@ class _ExperimentalConfig: self._aic_metrics = Constant.AicPipeUtilization self._l2_cache = l2_cache self._msprof_tx = msprof_tx + self._mstx = mstx self._data_simplification = data_simplification self.record_op_args = record_op_args self._export_type = self._conver_export_type_to_list(export_type) @@ -105,7 +108,7 @@ class _ExperimentalConfig: metrics=self._aic_metrics, l2_cache=self._l2_cache, record_op_args=self.record_op_args, - msprof_tx=self._msprof_tx, + msprof_tx=self._msprof_tx or self._mstx, op_attr=self._op_attr, host_sys=self._host_sys, mstx_domain_include=self._mstx_domain_include, @@ -147,16 +150,21 @@ class _ExperimentalConfig: print_warn_msg("Please use level1 or level2 if you want to collect aic metrics, reset aic metrics to None!") self._aic_metrics = Constant.AicMetricsNone if not isinstance(self._l2_cache, bool): - print_warn_msg("Invalid parameter l2_cache, which must be of boolean type, reset it to False.") + print_warn_msg("Invalid parameter l2_cache, which must be of bool type, reset it to False.") self._l2_cache = False if not isinstance(self._msprof_tx, bool): - print_warn_msg("Invalid parameter msprof_tx, which must be of boolean type, reset it to False.") + print_warn_msg("Invalid parameter msprof_tx, which must be of bool type, reset it to False.") self._msprof_tx = False + if self._msprof_tx: + warnings.warn("The parameter msprof_tx will be deprecated. Please use the new parameter mstx instead.") + if not isinstance(self._mstx, bool): + print_warn_msg("Invalid parameter mstx, which must be of bool type, reset it to False.") + self._mstx = False if self._data_simplification is not None and not isinstance(self._data_simplification, bool): - print_warn_msg("Invalid parameter data_simplification, which must be of boolean type, reset it to default.") + print_warn_msg("Invalid parameter data_simplification, which must be of bool type, reset it to default.") self._data_simplification = True if not isinstance(self.record_op_args, bool): - print_warn_msg("Invalid parameter record_op_args, which must be of boolean type, reset it to False.") + print_warn_msg("Invalid parameter record_op_args, which must be of bool type, reset it to False.") self.record_op_args = False if self._profiler_level not in \ (ProfilerLevel.Level0, ProfilerLevel.Level1, ProfilerLevel.Level2, ProfilerLevel.Level_none): @@ -172,7 +180,7 @@ class _ExperimentalConfig: else: self._aic_metrics = AiCMetrics.PipeUtilization if not isinstance(self._op_attr, bool): - print_warn_msg("Invalid parameter op_attr, which must be of boolean type, reset it to False.") + print_warn_msg("Invalid parameter op_attr, which must be of bool type, reset it to False.") self._op_attr = False if not all(export_type in [ExportType.Text, ExportType.Db] for export_type in self._export_type): print_warn_msg("Invalid parameter export_type, reset it to text.") @@ -190,16 +198,16 @@ class _ExperimentalConfig: elif self._gc_detect_threshold == 0.0: print_info_msg("Parameter gc_detect_threshold is set to 0, it will collect all gc events.") if not isinstance(self._sys_io, bool): - print_warn_msg("Invalid parameter sys_io, which must be of boolean type, reset it to False.") + print_warn_msg("Invalid parameter sys_io, which must be of bool 
type, reset it to False.") self._sys_io = False if not isinstance(self._sys_interconnection, bool): - print_warn_msg("Invalid parameter sys_interconnection, which must be of boolean type, reset it to False.") + print_warn_msg("Invalid parameter sys_interconnection, which must be of bool type, reset it to False.") self._sys_interconnection = False def _check_mstx_domain_params(self): - if not self._msprof_tx: + if not self._msprof_tx and not self._mstx: if self._mstx_domain_include or self._mstx_domain_exclude: - print_warn_msg("mstx_domain_include and mstx_domain_exclude are only valid when msprof_tx is True.") + print_warn_msg("mstx_domain_include and mstx_domain_exclude are valid when msprof_tx or mstx is True.") self._mstx_domain_include = [] self._mstx_domain_exclude = [] return -- Gitee From cb780d3accaa56130a0540afe4a2268b80993806 Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Wed, 23 Jul 2025 13:12:32 +0000 Subject: [PATCH 300/328] !23271 set default streampool to 32 Merge pull request !23271 from SCh-zx/stream27 --- test/distributed/test_hccl_stream_id.py | 8 ++++---- torch_npu/csrc/core/npu/register/OptionsManager.cpp | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/distributed/test_hccl_stream_id.py b/test/distributed/test_hccl_stream_id.py index 9be7044479..ae34f80656 100644 --- a/test/distributed/test_hccl_stream_id.py +++ b/test/distributed/test_hccl_stream_id.py @@ -38,14 +38,14 @@ class HcclStreamIdTest(TestCase): dist_group.recv(recv_tensor, src) p2p_stream_id = _world.default_pg._get_backend(torch.device('npu'))._get_stream_id(True, src) - stream_num = os.environ.get("STREAMS_PER_DEVICE", 8) + stream_num = os.environ.get("STREAMS_PER_DEVICE", 32) try: stream_num = int(stream_num) except Exception: - stream_num = 8 + stream_num = 32 - if stream_num != 32: - stream_num = 8 + if stream_num != 8: + stream_num = 32 assert0 = ((collective_stream_id & stream_num) == stream_num) assert1 = (collective_stream_id == p2p_stream_id) collective_stream = torch.npu.Stream(stream_id=collective_stream_id, device_type=20) diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index 8810a7fb08..ab0384eaa5 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -497,11 +497,11 @@ uint32_t OptionsManager::GetStreamsPerDevice() { const static uint32_t streams_per_device = []() -> uint32_t { char* buf_val = std::getenv("STREAMS_PER_DEVICE"); - // Default 8 - int64_t streams_per_device = (buf_val != nullptr) ? strtol(buf_val, nullptr, 10) : 8; + // Default 32 + int64_t streams_per_device = (buf_val != nullptr) ? 
strtol(buf_val, nullptr, 10) : 32; if (streams_per_device != 8 && streams_per_device != 32) { - streams_per_device = 8; - TORCH_NPU_WARN_ONCE("STREAMS_PER_DEVICE only support 8 or 32, but get other value, so reset it to the default value 8"); + streams_per_device = 32; + TORCH_NPU_WARN_ONCE("STREAMS_PER_DEVICE only support 8 or 32, but get other value, so reset it to the default value 32"); } return static_cast(streams_per_device); }(); -- Gitee From f6e227ccc7651c742139ccfb1d02d6bb7d5269c9 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 23 Jul 2025 14:24:38 +0000 Subject: [PATCH 301/328] !23340 Update op_plugin commit id Merge pull request !23340 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index f43abda9c3..c63373afcc 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit f43abda9c3271fcffb7420bb72b9a582d2d0f45e +Subproject commit c63373afcc4cdf452a21f15c2deffc1fec113982 -- Gitee From 404d93c975deb7b887900865830817c7dcebc471 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Wed, 23 Jul 2025 22:36:25 +0000 Subject: [PATCH 302/328] !23345 Update torchair commit id Merge pull request !23345 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 61ab3e37a0..80723743e5 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 61ab3e37a06ee45fb620715f8d3191c9d874f46d +Subproject commit 80723743e5b785cac34888522365cc170f7ae03b -- Gitee From 79f71c4f44a08ed30b1daf7f262254351afa577a Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 24 Jul 2025 11:24:38 +0000 Subject: [PATCH 303/328] !23356 Update op_plugin commit id Merge pull request !23356 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index c63373afcc..19eeddb729 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit c63373afcc4cdf452a21f15c2deffc1fec113982 +Subproject commit 19eeddb729edc89b2f47077cd47df9a5aaf045ab -- Gitee From dde9a5a2933244d8cf28a73673bdfdb190f88148 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Thu, 24 Jul 2025 13:08:40 +0000 Subject: [PATCH 304/328] =?UTF-8?q?!23352=20sync=20correct=20device=20in?= =?UTF-8?q?=20empty=5Fcache=20Merge=20pull=20request=20!23352=20from=20?= =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../csrc/core/npu/NPUCachingAllocator.cpp | 24 ++++++++++++++++--- torch_npu/csrc/core/npu/NPUFunctions.cpp | 10 ++++++++ torch_npu/csrc/core/npu/NPUFunctions.h | 2 ++ 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index e3c3a327be..20ccf2caee 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -3295,10 +3295,28 @@ public: void emptyCache(bool check_error) override { - int count = static_cast(device_allocator.size()); - for (int i = 0; i < count; i++) { - device_allocator[i]->emptyCache(i, check_error); + ASCEND_LOGD("Begin empty cache with check_error = %d", check_error); + int32_t current_device = 0; + if 
(check_error) { + NPU_CHECK_ERROR(c10_npu::GetDevice(¤t_device)); + } else { + NPU_CHECK_WARN(c10_npu::GetDevice(¤t_device)); + } + auto used_devices_list = c10_npu::GetUsedDevices(); + for (int8_t device_idx : used_devices_list) { + if (check_error) { + NPU_CHECK_ERROR(c10_npu::SetDevice(device_idx)); + } else { + NPU_CHECK_WARN(c10_npu::SetDevice(device_idx)); + } + device_allocator[device_idx]->emptyCache(device_idx, check_error); + } + if (check_error) { + NPU_CHECK_ERROR(c10_npu::MaybeSetDevice(current_device)); + } else { + NPU_CHECK_WARN(c10_npu::MaybeSetDevice(current_device)); } + ASCEND_LOGD("End empty cache with check_error = %d", check_error); } void *getBaseAllocation(void *ptr, size_t *outSize) override diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index 0ceb84847b..8449ecfcfa 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -125,6 +125,16 @@ aclError MaybeSetDevice(c10::DeviceIndex device) return ACL_ERROR_NONE; } +std::vector GetUsedDevices() +{ + std::lock_guard lock(mtx); + std::vector used_devices_list; + for (const auto it : used_devices) { + used_devices_list.emplace_back(it.first); + } + return used_devices_list; +} + aclError ResetUsedDevices() { std::lock_guard lock(mtx); diff --git a/torch_npu/csrc/core/npu/NPUFunctions.h b/torch_npu/csrc/core/npu/NPUFunctions.h index e162f8fe8f..279778e4a3 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.h +++ b/torch_npu/csrc/core/npu/NPUFunctions.h @@ -57,6 +57,8 @@ C10_NPU_API aclError MaybeSetDevice(c10::DeviceIndex device); * @retval ACL_ERROR_NONE The function is successfully executed. * @retval OtherValues Failure */ +std::vector GetUsedDevices(); + aclError ResetUsedDevices(); aclError DestroyUsedStreams(); -- Gitee From b0f120b8c30f6ef300ea3e234083484ef5034fd0 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Thu, 24 Jul 2025 22:34:27 +0000 Subject: [PATCH 305/328] !23364 Update torchair commit id Merge pull request !23364 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 80723743e5..d75f6fe3f0 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 80723743e5b785cac34888522365cc170f7ae03b +Subproject commit d75f6fe3f026a2b65b91beaa5c32d067107fbac8 -- Gitee From 7dd8d1f0105ccfebd3b0bf247da54a842af0f8cb Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Thu, 24 Jul 2025 22:34:28 +0000 Subject: [PATCH 306/328] !23364 Update torchair commit id Merge pull request !23364 from torchair_robot/v2.7.1 -- Gitee From a6af177565f7545202ffe2c330b1a3d517f3ef49 Mon Sep 17 00:00:00 2001 From: yuliangbin <1416490440@qq.com> Date: Fri, 25 Jul 2025 07:35:06 +0000 Subject: [PATCH 307/328] =?UTF-8?q?!23094=20[torch=5Fv2.7.1]=E5=86=85?= =?UTF-8?q?=E5=AD=98profiling=E4=B8=ADstream=5Fptr=E6=94=B9=E4=B8=BAstream?= =?UTF-8?q?=5Fid=20Merge=20pull=20request=20!23094=20from=20yuliangbin/m?= =?UTF-8?q?=5Fbug=5F2.7.1=5F1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 6 +++--- torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp | 10 +++++----- torch_npu/csrc/core/npu/interface/AclInterface.cpp | 12 ++++++++++++ torch_npu/csrc/core/npu/interface/AclInterface.h | 2 ++ torch_npu/csrc/profiler/npu_profiler.cpp | 7 ++++++- 
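The memory-profiler change in !23094 whose diff follows stops casting the allocation stream to an integer pointer value and instead reports a stream id resolved through aclrtStreamGetId, falling back to -1 when the query fails. On the Python side this only changes what appears in the exported memory records; a minimal collection sketch, assuming the usual torch_npu.profiler entry points and an available NPU device (the output directory name is illustrative):

import torch
import torch_npu

# Memory records captured here carry a stream id after !23094,
# rather than a reinterpret_cast of the raw stream pointer.
with torch_npu.profiler.profile(
        activities=[torch_npu.profiler.ProfilerActivity.CPU,
                    torch_npu.profiler.ProfilerActivity.NPU],
        profile_memory=True,
        on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./prof_out")):
    x = torch.randn(1024, 1024, device="npu")
    y = x @ x
    torch.npu.synchronize()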
torch_npu/csrc/profiler/npu_profiler.h | 5 ++++- 6 files changed, 32 insertions(+), 10 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 20ccf2caee..5ad16c9860 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -1554,7 +1554,7 @@ public: stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, stats.active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream) }); + block->stream }); #endif return block; @@ -1619,7 +1619,7 @@ public: stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, stats.active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream) }); + block->stream }); #endif } @@ -2434,7 +2434,7 @@ private: stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, stats.active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream) }); + block->stream }); #endif } diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index 7d5173dec8..660089b0fb 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -113,7 +113,7 @@ public: stats.allocated_bytes.current, stats.reserved_bytes.current, stats.allocated_bytes.current, - reinterpret_cast(stream)} + stream } ); #endif block->data_ptr = nullptr; @@ -154,7 +154,7 @@ public: stats.allocated_bytes.current, stats.reserved_bytes.current, stats.allocated_bytes.current, - reinterpret_cast(stream)} + stream } ); this->last_block = block; this->last_stream = stream; @@ -180,7 +180,7 @@ public: stats.allocated_bytes.current, stats.reserved_bytes.current, stats.allocated_bytes.current, - reinterpret_cast(stream)} + stream } ); this->last_block = block; this->last_stream = stream; @@ -204,7 +204,7 @@ public: stats.allocated_bytes.current, stats.reserved_bytes.current, stats.allocated_bytes.current, - reinterpret_cast(this->last_stream)} + this->last_stream } ); } #endif @@ -254,7 +254,7 @@ public: stats.allocated_bytes.current, stats.reserved_bytes.current, stats.allocated_bytes.current, - reinterpret_cast(block_pair.first)} + block_pair.first } ); #endif } diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 841976f3ba..6b8053b9c3 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -93,6 +93,7 @@ LOAD_FUNCTION(aclrtDeviceGetBareTgid) LOAD_FUNCTION(aclrtGetDeviceResLimit) LOAD_FUNCTION(aclrtSetDeviceResLimit) LOAD_FUNCTION(aclrtResetDeviceResLimit) +LOAD_FUNCTION(aclrtStreamGetId) aclprofStepInfoPtr init_stepinfo() { typedef aclprofStepInfoPtr(*npdInitFunc)(); @@ -1072,5 +1073,16 @@ aclError AclrtResetDeviceResLimit(int32_t deviceId) return func(deviceId); } +aclError AclrtStreamGetId(aclrtStream stream, int32_t* stream_id) +{ + typedef aclError(*AclrtStreamGetIdFunc)(aclrtStream, int32_t*); + static AclrtStreamGetIdFunc func = nullptr; + if (func == nullptr) { + func = (AclrtStreamGetIdFunc)GET_FUNC(aclrtStreamGetId); + } + TORCH_CHECK(func, "Failed to find function ", "aclrtStreamGetId", PROF_ERROR(ErrCode::NOT_FOUND)); + return func(stream, 
stream_id); +} + } // namespace acl } // namespace c10 diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index 6b16535aa9..367963c070 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -257,5 +257,7 @@ aclError AclrtSetDeviceResLimit(int32_t deviceId, aclrtDevResModelType type, uin aclError AclrtResetDeviceResLimit(int32_t deviceId); +aclError AclrtStreamGetId(aclrtStream stream, int32_t* stream_id); + } // namespace acl } // namespace c10_npu diff --git a/torch_npu/csrc/profiler/npu_profiler.cpp b/torch_npu/csrc/profiler/npu_profiler.cpp index 295eda9aea..1aae58c434 100644 --- a/torch_npu/csrc/profiler/npu_profiler.cpp +++ b/torch_npu/csrc/profiler/npu_profiler.cpp @@ -6,6 +6,7 @@ #include "torch_npu/csrc/core/npu/npu_log.h" #include "torch_npu/csrc/core/npu/NPUException.h" +#include "torch_npu/csrc/core/npu/interface/AclInterface.h" #include "torch_npu/csrc/profiler/npu_profiler.h" #include "torch_npu/csrc/toolkit/profiler/common/utils.h" @@ -380,6 +381,10 @@ void reportMemoryDataToNpuProfiler(const MemoryUsage& data) if (!ProfilerMgr::GetInstance()->ReportMemEnable().load()) { return; } + int32_t stream_id; + if (c10_npu::acl::AclrtStreamGetId(data.stream, &stream_id) != ACL_ERROR_NONE) { + stream_id = -1; + } ProfilerMgr::GetInstance()->UploadWithLock(std::make_unique( data.ptr, static_cast(Utils::GetClockTime()), @@ -387,7 +392,7 @@ void reportMemoryDataToNpuProfiler(const MemoryUsage& data) data.total_allocated, data.total_reserved, data.total_active, - data.stream_ptr, + stream_id, data.device_type, data.device_index, data.component_type, diff --git a/torch_npu/csrc/profiler/npu_profiler.h b/torch_npu/csrc/profiler/npu_profiler.h index 2127825bc1..854191dfb7 100644 --- a/torch_npu/csrc/profiler/npu_profiler.h +++ b/torch_npu/csrc/profiler/npu_profiler.h @@ -7,6 +7,9 @@ #include +#include "third_party/acl/inc/acl/acl_base.h" +#include "third_party/acl/inc/acl/acl_rt.h" + #include "torch_npu/csrc/toolkit/profiler/inc/data_reporter.h" #include "torch_npu/csrc/profiler/profiler_mgr.h" #include "torch_npu/csrc/profiler/mstx_mgr.h" @@ -55,7 +58,7 @@ struct MemoryUsage { int64_t total_allocated{0}; int64_t total_reserved{0}; int64_t total_active{0}; - int64_t stream_ptr{0}; + aclrtStream stream{nullptr}; }; struct ExperimentalConfig { -- Gitee From daf91d78efc20223ada3d19caf948284b8609307 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Fri, 25 Jul 2025 09:47:29 +0000 Subject: [PATCH 308/328] =?UTF-8?q?!23367=20support=20npugraph=20backend?= =?UTF-8?q?=20for=20torch.compile=20Merge=20pull=20request=20!23367=20from?= =?UTF-8?q?=20=E9=97=AB=E9=B9=8F=E5=85=A8/v2.7.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/utils/_graph_tree.py | 149 +++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/torch_npu/utils/_graph_tree.py b/torch_npu/utils/_graph_tree.py index 6de299a02d..b031f62e9b 100644 --- a/torch_npu/utils/_graph_tree.py +++ b/torch_npu/utils/_graph_tree.py @@ -1,4 +1,5 @@ import functools +from collections import defaultdict from typing import ( Any, Callable, @@ -12,6 +13,15 @@ from typing import ( import torch from torch.utils._ordered_set import OrderedSet from torch._dynamo import utils as dynamo_utils +from torch._dynamo.backends.common import aot_autograd +from torch._dynamo.backends.cudagraphs import ( + 
check_for_mutation_ignore_cuda_graph_managed_tensor, + find_input_mutations, + get_device_node_mapping, + get_stack_traces, +) +from torch._dynamo.backends.debugging import boxed_nop +from torch._dynamo.backends.registry import register_backend from torch._inductor import config from torch._inductor.compile_fx import ( get_input_idxs_to_check, @@ -21,15 +31,25 @@ from torch._inductor.compile_fx import ( from torch._inductor.cudagraph_utils import ( _get_use_stack_trace, format_default_skip_message, + get_mutation_stack_trace, + get_placeholder_info, + log_cudagraph_skip_and_bump_counter, + BoxedDeviceIndex, PlaceholderInfo, ) from torch._inductor.output_code import get_expanded_dims from torch._inductor.utils import ( align_inputs_from_check_idxs, copy_misaligned_inputs, + count_tangents, + get_first_incompatible_cudagraph_node, + num_fw_fixed_arguments, + output_node, remove_unaligned_input_idxs, + BoxedBool, InputType, ) +from torch.multiprocessing.reductions import StorageWeakRef def npugraph_mark_step_begin(): @@ -201,7 +221,136 @@ def npugraphify_impl( return align_inputs_from_check_idxs(run, check_input_idxs) +def check_for_skip(aot_model: torch.fx.GraphModule, num_fixed) -> Optional[str]: + if not torch._dynamo.config.cudagraph_backend_support_input_mutation: + mut_skip = check_for_mutation_ignore_cuda_graph_managed_tensor( + aot_model, num_fixed + ) + if mut_skip: + return mut_skip + + skip = check_multiple_devices_or_any_cpu_nodes( + get_device_node_mapping(aot_model) + ) + if skip: + return skip + + node = get_first_incompatible_cudagraph_node(aot_model) + if node: + return format_default_skip_message(f"incompatible op ({node.name})") + + return None + + +def get_device_index(gm) -> int: + device = next(iter(get_device_node_mapping(gm))) + if not (device.type == "npu"): + raise RuntimeError("check device.type == npu fail", ) + return device.index + + +def npugraphs(dynamo_model, dynamo_inputs): + from torch_npu.npu._graph_tree import npugraphify_impl as new_npugraphify_impl + + do_npugraphs = BoxedBool(True) + boxed_device_index = BoxedDeviceIndex(None) + + def forward_npugraphs(aot_model, aot_inputs, is_inference=False): + interp = boxed_nop(aot_model, aot_inputs) + fixed = num_fw_fixed_arguments(len(dynamo_inputs), len(aot_inputs)) + skip_msg = check_for_skip(aot_model, fixed) + if skip_msg: + BoxedBool.disable(do_npugraphs) + log_cudagraph_skip_and_bump_counter( + f"skipping npugraphs due to {skip_msg}" + ) + return interp + + boxed_device_index.set(get_device_index(aot_model)) + out = new_npugraphify_impl( + interp, + aot_inputs, + range(fixed), + device_index=boxed_device_index.value, + is_backward=False, + is_inference=False, + stack_traces=get_stack_traces(aot_model), + placeholders=get_placeholder_info(aot_model.graph), + mutated_input_idxs=find_input_mutations(aot_model.graph), + ) + out._boxed_call = True + return out + + def backward_npugraphs(aot_model, aot_inputs): + interp = boxed_nop(aot_model, aot_inputs) + if not do_npugraphs: + return aot_model + + fixed = count_tangents(aot_model) + + skip_msg = check_for_skip(aot_model, fixed) + if skip_msg: + log_cudagraph_skip_and_bump_counter( + "skipping npugraphs due to %s", skip_msg + ) + + # See [Backward Generation Handling] + from torch_npu.npu._graph_tree import get_manager + manager = get_manager( + boxed_device_index.value, create_if_none_exists=False + ) + + if manager is None: + raise RuntimeError("check manager is None fail") + + def fn(inputs): + manager.set_to_running_backward() + return aot_model(inputs) + + 
fn._boxed_call = True + return fn + + out = new_npugraphify_impl( + interp, + aot_inputs, + range(fixed), + device_index=get_device_index(aot_model), + is_backward=True, + is_inference=False, + stack_traces=get_stack_traces(aot_model), + placeholders=get_placeholder_info(aot_model.graph), + mutated_input_idxs=find_input_mutations(aot_model.graph), + ) + out._boxed_call = True + return out + + aot_npugraphs = aot_autograd( + fw_compiler=forward_npugraphs, + bw_compiler=backward_npugraphs, + inference_compiler=functools.partial(forward_npugraphs, is_inference=True), + keep_inference_input_mutations=torch._dynamo.config.cudagraph_backend_keep_input_mutation, + ) + return aot_npugraphs(dynamo_model, dynamo_inputs) + + +class NpugraphsBackend: + compiler_name = "npugraphs" + + @staticmethod + def reset(): + from torch_npu.npu._graph_tree import reset_npugraph_trees + + reset_npugraph_trees() + + @staticmethod + def __call__(model, inputs): + return npugraphs(model, inputs) + + def _apply_npugraph_tree_methods(): + # aot_npugraphs only applies graphs to the graph. It is also helpful + # for debugging and can serve as a perf baseline. + register_backend(name="npugraphs", compiler_fn=NpugraphsBackend()) torch._inductor.compile_fx.cudagraphify = npugraphify torch._inductor.cudagraph_utils.check_multiple_devices_or_any_cpu_nodes = check_multiple_devices_or_any_cpu_nodes torch.compiler.npugraph_mark_step_begin = npugraph_mark_step_begin -- Gitee From 82f5dd5a12e7f14a9fc868dbe1629032c8563d2e Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 25 Jul 2025 11:09:43 +0000 Subject: [PATCH 309/328] !23382 Update op_plugin commit id Merge pull request !23382 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 19eeddb729..f04c3f3f81 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 19eeddb729edc89b2f47077cd47df9a5aaf045ab +Subproject commit f04c3f3f81b0fcdefe2971e2695c73008bd3d8f2 -- Gitee From 1affc01735901bcabf5bb1cfbdcee981b9fa510a Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 25 Jul 2025 11:09:43 +0000 Subject: [PATCH 310/328] !23382 Update op_plugin commit id Merge pull request !23382 from pta-robot/v2.7.1 -- Gitee From c4862e05c47b87898582c4fc68fa1e1307a5430d Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Fri, 25 Jul 2025 22:33:40 +0000 Subject: [PATCH 311/328] !23391 Update torchair commit id Merge pull request !23391 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index d75f6fe3f0..580d288b45 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit d75f6fe3f026a2b65b91beaa5c32d067107fbac8 +Subproject commit 580d288b456dcc71482abbebc8dd2b8c1f6406ed -- Gitee From 0a3676f58b71132480b5739583367a7ded2c4db2 Mon Sep 17 00:00:00 2001 From: sincatter Date: Sat, 26 Jul 2025 02:40:01 +0000 Subject: [PATCH 312/328] !23388 add moe_distribute_dispatch/combine(sdma) interface Merge pull request !23388 from sincatter/v2.7.1_ml --- test/allowlist_for_publicAPI.json | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index e26ec137f5..7f20b62289 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -2882,7 +2882,11 @@ 
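The graph-tree change in !23367 above registers an "npugraphs" TorchDynamo backend (NpugraphsBackend), wrapping forward and backward compilation with NPU graph capture through aot_autograd, and also wires up torch.compiler.npugraph_mark_step_begin. A minimal usage sketch, assuming that importing torch_npu applies these registrations and that an NPU device is present; the toy model is purely illustrative:

import torch
import torch_npu  # assumed to register the "npugraphs" backend on import

model = torch.nn.Sequential(
    torch.nn.Linear(256, 256),
    torch.nn.ReLU(),
).npu()

compiled = torch.compile(model, backend="npugraphs")

x = torch.randn(32, 256, device="npu")
out = compiled(x)   # first call compiles and captures the NPU graph
out = compiled(x)   # later calls replay the captured graph

torch.compiler.npugraph_mark_step_begin()  # mark the start of the next step

Graphs with input mutation, CPU nodes, or incompatible ops are skipped with a logged message and fall back to running without graph capture, mirroring the CUDA graph-trees backend this code is adapted from.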
"npu_transpose_batchmatmul", "npu_gather_sparse_index", "npu_moe_distribute_combine_add_rms_norm", - "npu_moe_eplb_update_expert" + "npu_moe_eplb_update_expert", + "npu_moe_distribute_dispatch_setup", + "npu_moe_distribute_dispatch_teardown", + "npu_moe_distribute_combine_setup", + "npu_moe_distribute_combine_teardown" ], "torch_npu.contrib": [ "npu_fused_attention_with_layernorm", -- Gitee From c261c9f34382d25eae6fbfa459b5daedecaaf270 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 26 Jul 2025 09:09:42 +0000 Subject: [PATCH 313/328] !23397 Update op_plugin commit id Merge pull request !23397 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index f04c3f3f81..61e230d55b 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit f04c3f3f81b0fcdefe2971e2695c73008bd3d8f2 +Subproject commit 61e230d55bf22e5cc6bc4c9205c31dbc3b17600e -- Gitee From d6d0fcf5d4d9702cc4a024216df4f1f9addeec90 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 26 Jul 2025 11:09:42 +0000 Subject: [PATCH 314/328] !23410 Update op_plugin commit id Merge pull request !23410 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 61e230d55b..718098b3ed 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 61e230d55bf22e5cc6bc4c9205c31dbc3b17600e +Subproject commit 718098b3edcd2fe11f041946b1a413ec168fc066 -- Gitee From 0021d2ed8990cbd7447a395c11c3436dfe84b89c Mon Sep 17 00:00:00 2001 From: kuhn Date: Mon, 28 Jul 2025 01:17:02 +0000 Subject: [PATCH 315/328] !23337 get_device_properties add properties and gracefully handle missing ones Merge pull request !23337 from kuhn/v2.7.1_dev --- test/npu/test_torch_npu.py | 23 +++++++++++++--- torch_npu/csrc/npu/Module.cpp | 51 ++++++++++++++++++++++++++++++----- 2 files changed, 64 insertions(+), 10 deletions(-) diff --git a/test/npu/test_torch_npu.py b/test/npu/test_torch_npu.py index 29709ef991..9bbf248786 100644 --- a/test/npu/test_torch_npu.py +++ b/test/npu/test_torch_npu.py @@ -55,10 +55,25 @@ class TorchNPUDeviceTestCase(TestCase): self.assertIsInstance(res, str) def test_npu_get_device_properties(self): - name = torch_npu.npu.get_device_properties(0).name - self.assertIsInstance(name, str) - total_memory = torch_npu.npu.get_device_properties(0).total_memory - self.assertIsInstance(total_memory, int) + props = torch_npu.npu.get_device_properties(0) + self.assertIsInstance(props.name, str) + self.assertIsInstance(props.total_memory, int) + self.assertIsInstance(props.cube_core_num, int) + self.assertIsInstance(props.vector_core_num, int) + self.assertIsInstance(props.L2_cache_size, int) + + def test_npu_get_unsupported_device_properties(self): + props = torch_npu.npu.get_device_properties(0) + unsupported_fields = [ + 'major', 'minor', 'is_multi_gpu_board', 'is_integrated', 'multi_processor_count', + 'max_threads_per_multi_processor', 'warp_size', 'regs_per_multiprocessor', 'gcnArchName', 'uuid' + ] + for field in unsupported_fields: + try: + value = getattr(props, field) + self.assertIsNone(value, msg=f"Field '{field}' should return None for NPU, but got {value}.") + except Exception as e: + self.fail(f"Accessing unsupported field '{field}' raised unexpected exception: {type(e).__name__}") def test_npu_get_device_capability(self): res = torch_npu.npu.get_device_capability() diff --git 
a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index bfb0cdb0c1..ce450b778c 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -55,6 +55,19 @@ struct NPUDeviceProp { std::string name; size_t totalGlobalMem = 0; + int64_t cube_core_num = 0; + int64_t vector_core_num = 0; + int64_t L2_cache_size = 0; + std::optional major; + std::optional minor; + std::optional is_multi_gpu_board; + std::optional is_integrated; + std::optional multi_processor_count; + std::optional max_threads_per_multi_processor; + std::optional warp_size; + std::optional regs_per_multiprocessor; + std::optional gcnArchName; + std::optional uuid; }; struct NPUDeviceMem { @@ -68,13 +81,25 @@ void RegisterNPUDeviceProperties(PyObject* module) py::class_(m, "_NPUDeviceProperties") .def_readonly("name", &NPUDeviceProp::name) .def_readonly("total_memory", &NPUDeviceProp::totalGlobalMem) - .def("__repr__", [](const NPUDeviceProp& prop) { + .def_readonly("cube_core_num", &NPUDeviceProp::cube_core_num) + .def_readonly("vector_core_num", &NPUDeviceProp::vector_core_num) + .def_readonly("L2_cache_size", &NPUDeviceProp::L2_cache_size) + .def_readonly("major", &NPUDeviceProp::major) + .def_readonly("minor", &NPUDeviceProp::minor) + .def_readonly("is_multi_gpu_board", &NPUDeviceProp::is_multi_gpu_board) + .def_readonly("is_integrated", &NPUDeviceProp::is_integrated) + .def_readonly("multi_processor_count", &NPUDeviceProp::multi_processor_count) + .def_readonly("max_threads_per_multi_processor", &NPUDeviceProp::max_threads_per_multi_processor) + .def_readonly("warp_size", &NPUDeviceProp::warp_size) + .def_readonly("regs_per_multiprocessor", &NPUDeviceProp::regs_per_multiprocessor) + .def_readonly("gcnArchName", &NPUDeviceProp::gcnArchName) + .def_readonly("uuid", &NPUDeviceProp::uuid) + .def("__repr__", [](const NPUDeviceProp &prop) { std::ostringstream stream; - stream << "_NPUDeviceProperties(name='" << prop.name - << "', total_memory=" - << prop.totalGlobalMem / - (CHANGE_UNIT_SIZE * CHANGE_UNIT_SIZE) - << "MB)"; + stream << "_NPUDeviceProperties(name='" << prop.name << "', total_memory=" + << prop.totalGlobalMem / (CHANGE_UNIT_SIZE * CHANGE_UNIT_SIZE) << "MB, cube_core_num=" + << prop.cube_core_num << ", vector_core_num=" << prop.vector_core_num << ", L2_cache_size=" + << prop.L2_cache_size / (CHANGE_UNIT_SIZE * CHANGE_UNIT_SIZE) << "MB)"; return stream.str(); }); m.def( @@ -102,6 +127,10 @@ NPUDeviceProp* GetDeviceProperties(int64_t deviceid) const char* device_name; size_t device_free; size_t device_total; + int64_t cube_core_num; + int64_t vector_core_num; + int64_t L2_cache_size; + device_name = c10_npu::acl::AclrtGetSocName(); if (device_name == nullptr) { prop.name = " "; @@ -111,6 +140,16 @@ NPUDeviceProp* GetDeviceProperties(int64_t deviceid) } NPU_CHECK_ERROR_WITHOUT_UCE(aclrtGetMemInfo(ACL_HBM_MEM, &device_free, &device_total)); prop.totalGlobalMem = device_total; + + NPU_CHECK_ERROR_WITHOUT_UCE(aclGetDeviceCapability(deviceid, ACL_DEVICE_INFO_AI_CORE_NUM, &cube_core_num)); + prop.cube_core_num = cube_core_num; + + NPU_CHECK_ERROR_WITHOUT_UCE(aclGetDeviceCapability(deviceid, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &vector_core_num)); + prop.vector_core_num = vector_core_num; + + NPU_CHECK_ERROR_WITHOUT_UCE(aclGetDeviceCapability(deviceid, ACL_DEVICE_INFO_L2_SIZE, &L2_cache_size)); + prop.L2_cache_size = L2_cache_size; + return ∝ } -- Gitee From 4cb04af522c01b1707a4e6b78af6ee3f21a56880 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Mon, 28 Jul 2025 03:51:42 +0000 Subject: [PATCH 
316/328] !23250 aovid set unused device or init unused stream when shutdown Merge pull request !23250 from huangyunlong/2.7shut --- torch_npu/csrc/core/npu/NPUFunctions.cpp | 16 +++++++++------- torch_npu/csrc/core/npu/NPUStream.cpp | 6 +++++- torch_npu/csrc/core/npu/NPUStream.h | 2 ++ 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index 8449ecfcfa..b89d1df45c 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -154,14 +154,16 @@ aclError DestroyUsedStreams() NPU_CHECK_ERROR_WITHOUT_UCE(GetDevice(&cur_device)); std::lock_guard lock(mtx); for (const auto it : used_devices) { - NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(it.first)); - NPUStream stream = getCurrentNPUStream(it.first); - aclError acl_ret = acl::AclrtDestroyStreamForce(stream.stream(false)); - if (acl_ret != ACL_ERROR_NONE) { - return acl_ret; + if (c10_npu::StreamInitFlag(it.first)) { + NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(it.first)); + NPUStream stream = getCurrentNPUStream(it.first); + aclError acl_ret = acl::AclrtDestroyStreamForce(stream.stream(false)); + if (acl_ret != ACL_ERROR_NONE) { + return acl_ret; + } } } - NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(cur_device)); + NPU_CHECK_ERROR_WITHOUT_UCE(MaybeSetDevice(cur_device)); return ACL_ERROR_NONE; } @@ -184,7 +186,7 @@ aclError SynchronizeUsedDevices() } #endif } - NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(cur_device)); + NPU_CHECK_ERROR_WITHOUT_UCE(MaybeSetDevice(cur_device)); return ACL_ERROR_NONE; } diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index cc8a53c54d..b2ddf38ff9 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -427,7 +427,6 @@ aclrtStream getCurrentNPUStreamNoWait(c10::DeviceIndex device_index) NPUStatus emptyAllNPUStream(bool check_error) { - initNPUStreamsOnce(); NPUStatus ret; for (auto i = decltype(num_npus){0}; i < num_npus; ++i) { auto& default_streamsi = default_streams[i]; @@ -665,4 +664,9 @@ NPUStream getNPUStreamFromSyncLaunchPool(c10::DeviceIndex device_index) return NPUStream_fromInternals(&sync_launch_streams[device_index][idx]); } +bool StreamInitFlag(c10::DeviceIndex device_index) +{ + ASCEND_LOGI("Device %d, Npu StreamInitFlag Check is %d", device_index, initialize_flag[device_index]); + return initialize_flag[device_index]; +} } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUStream.h b/torch_npu/csrc/core/npu/NPUStream.h index aa637a8b24..bea6e28650 100644 --- a/torch_npu/csrc/core/npu/NPUStream.h +++ b/torch_npu/csrc/core/npu/NPUStream.h @@ -146,6 +146,8 @@ std::ostream& operator<<(std::ostream& stream, const NPUStream& s); void recovery_all_npu_streams(c10::DeviceIndex device_index); NPUStream getNPUStreamFromSyncLaunchPool(c10::DeviceIndex device_index = -1); + +bool StreamInitFlag(c10::DeviceIndex device_index); } // namespace c10_npu namespace std { -- Gitee From 7f50ec75a7b5886e5e05766a568b152f33aa6ea4 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 28 Jul 2025 11:09:45 +0000 Subject: [PATCH 317/328] !23432 Update op_plugin commit id Merge pull request !23432 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 718098b3ed..0adf450191 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 718098b3edcd2fe11f041946b1a413ec168fc066 
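The device-properties extension in !23337 above adds cube_core_num, vector_core_num and L2_cache_size to _NPUDeviceProperties and exposes the CUDA-only fields as optionals that read back as None instead of raising. A short sketch of how the new structure looks from Python, assuming device 0 is an NPU; the field names follow the bindings added in that patch:

import torch_npu

props = torch_npu.npu.get_device_properties(0)
print(props.name, props.total_memory)
print(props.cube_core_num, props.vector_core_num, props.L2_cache_size)

# Fields that only make sense on CUDA are still present but return None on NPU,
# so generic device-introspection code does not need to special-case the backend.
for field in ("major", "minor", "multi_processor_count", "warp_size", "uuid"):
    print(field, getattr(props, field))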
+Subproject commit 0adf45019153b1c51f33f1704838822f40b22829 -- Gitee From d1fdd6c6cf1328cde8ca9d985f015782c6618383 Mon Sep 17 00:00:00 2001 From: shenweiling Date: Tue, 29 Jul 2025 07:38:56 +0000 Subject: [PATCH 318/328] !23460 npu_fused_infer_attention_v2 rename npu_fused_infer_attention_score_v2 Merge pull request !23460 from shenweiling/v2.7.1 --- test/allowlist_for_publicAPI.json | 2 +- test/npu/test_aclgraph_update.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index 7f20b62289..4f1174a3ba 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -2831,7 +2831,7 @@ "npu_rms_norm", "npu_add_rms_norm_cast", "npu_fused_infer_attention_score", - "npu_fused_infer_attention_v2", + "npu_fused_infer_attention_score_v2", "npu_mla_prolog", "npu_mla_prolog_v2", "npu_convert_weight_to_int4pack", diff --git a/test/npu/test_aclgraph_update.py b/test/npu/test_aclgraph_update.py index 18dbb79c5c..7db212734f 100644 --- a/test/npu/test_aclgraph_update.py +++ b/test/npu/test_aclgraph_update.py @@ -124,7 +124,7 @@ class TestAclgraphUpdate(TestCase): self.assertEqual(softmax_lse.cpu(), res_src[1].cpu()) @SupportedDevices(['Ascend910B']) - def test_npu_fused_infer_attention_v2(self): + def test_npu_fused_infer_attention_score_v2(self): torch.npu.set_device(0) length = [29] length_new = [100] @@ -132,7 +132,7 @@ class TestAclgraphUpdate(TestCase): query = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") key = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") value = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") - res_src = torch_npu.npu_fused_infer_attention_v2( + res_src = torch_npu.npu_fused_infer_attention_score_v2( query, key, value, num_query_heads=32, input_layout="BNSD", softmax_scale=scale, pre_tokens=65535, next_tokens=65535, return_softmax_lse=False, actual_seq_qlen=length_new) g = torch.npu.NPUGraph() @@ -142,7 +142,7 @@ class TestAclgraphUpdate(TestCase): output = None softmax_lse = None - workspace = torch_npu._npu_fused_infer_attention_v2_get_max_workspace( + workspace = torch_npu._npu_fused_infer_attention_score_v2_get_max_workspace( query, key, value, num_query_heads=32, input_layout="BNSD", softmax_scale=scale, pre_tokens=65535, next_tokens=65535, return_softmax_lse=False, actual_seq_qlen=length) @@ -153,14 +153,14 @@ class TestAclgraphUpdate(TestCase): event.wait(stream) event.reset(stream) torch.npu.graph_task_group_begin(stream) - torch_npu.npu_fused_infer_attention_v2.out( + torch_npu.npu_fused_infer_attention_score_v2.out( query, key, value, num_query_heads=32, input_layout="BNSD", softmax_scale=scale, pre_tokens=65535, workspace=workspace, next_tokens=65535, return_softmax_lse=False, actual_seq_qlen=length, out=[output, softmax_lse]) handle = torch.npu.graph_task_group_end(stream) with torch.npu.stream(update_stream): torch.npu.graph_task_update_begin(update_stream, handle) - torch_npu.npu_fused_infer_attention_v2.out( + torch_npu.npu_fused_infer_attention_score_v2.out( query, key, value, num_query_heads=32, input_layout="BNSD", softmax_scale=scale, pre_tokens=65535, workspace=workspace, next_tokens=65535, return_softmax_lse=False, actual_seq_qlen=length_new, out=[output, softmax_lse]) torch.npu.graph_task_update_end(update_stream) -- Gitee From 94a7e3b0fd0aae5eeae8c8ad2914fcf4729951da Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 29 Jul 2025 09:09:44 +0000 Subject: [PATCH 319/328] !23463 Update op_plugin 
commit id Merge pull request !23463 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 0adf450191..a41de2bf06 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 0adf45019153b1c51f33f1704838822f40b22829 +Subproject commit a41de2bf0614525d03a3d899290f79a427ea507d -- Gitee From 3fbd4d21cdf4518539f374fb43cdd53b21c3255d Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Tue, 29 Jul 2025 13:36:29 +0000 Subject: [PATCH 320/328] !23471 Update torchair commit id Merge pull request !23471 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 580d288b45..df181ae876 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 580d288b456dcc71482abbebc8dd2b8c1f6406ed +Subproject commit df181ae8764c04d966c044ff79f43e8544e223e2 -- Gitee From 8f49c0a26d2fd76c076153f9e15aa22b03fe1601 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Wed, 30 Jul 2025 01:32:57 +0000 Subject: [PATCH 321/328] =?UTF-8?q?!23428=20Support=20p2p=20detect=20Merge?= =?UTF-8?q?=20pull=20request=20!23428=20from=20=E7=8E=8B=E8=B6=85/v2.7.0?= =?UTF-8?q?=5Fstresshccl?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/torch_npu_schema.json | 4 +- third_party/acl/inc/aml/aml_fwk_detect.h | 8 +++ .../csrc/core/npu/interface/MlInterface.cpp | 21 ++++++++ .../csrc/core/npu/interface/MlInterface.h | 10 ++++ torch_npu/csrc/npu/Module.cpp | 24 +++++++-- torch_npu/csrc/npu/Stress_detect.cpp | 52 ++++++++++++++----- torch_npu/csrc/npu/Stress_detect.h | 5 +- torch_npu/npu/utils.py | 33 +++++++++++- 8 files changed, 134 insertions(+), 23 deletions(-) diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index 454e5f6ef8..32a249c6c6 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -1197,7 +1197,7 @@ "signature": "(device_id)" }, "torch_npu.npu.stress_detect": { - "signature": "()" + "signature": "(mode=0)" }, "torch_npu.npu.seed": { "signature": "()" @@ -1647,7 +1647,7 @@ "signature": "(device=None)" }, "torch_npu.npu.utils.stress_detect": { - "signature": "()" + "signature": "(mode=0)" }, "torch_npu.optim.NpuFusedAdadelta": { "signature": "(params, lr=1.0, rho=0.9, eps=1e-06, weight_decay=0)" diff --git a/third_party/acl/inc/aml/aml_fwk_detect.h b/third_party/acl/inc/aml/aml_fwk_detect.h index 2eb355b338..26dbde4549 100644 --- a/third_party/acl/inc/aml/aml_fwk_detect.h +++ b/third_party/acl/inc/aml/aml_fwk_detect.h @@ -32,8 +32,16 @@ typedef struct AmlAicoreDetectAttr { uint8_t reserve[64]; } AmlAicoreDetectAttr; +struct AmlP2PDetectAttr { + void *workspace; + uint64_t workspaceSize; + uint8_t reserve[64]; +}; + AmlStatus AmlAicoreDetectOnline(int32_t deviceId, const AmlAicoreDetectAttr *attr); +AmlStatus AmlP2PDetectOnline(int32_t devId, void *comm, const AmlP2PDetectAttr *attr); + #ifdef __cplusplus } #endif diff --git a/torch_npu/csrc/core/npu/interface/MlInterface.cpp b/torch_npu/csrc/core/npu/interface/MlInterface.cpp index b992b4a188..4008c8eb27 100644 --- a/torch_npu/csrc/core/npu/interface/MlInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/MlInterface.cpp @@ -14,6 +14,7 @@ namespace amlapi { REGISTER_LIBRARY(libascend_ml) LOAD_FUNCTION(AmlAicoreDetectOnline) 
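The rename in !23460 above replaces npu_fused_infer_attention_v2 with npu_fused_infer_attention_score_v2 in the public-API allowlist and the ACL-graph update test. A call sketch based on that test, assuming an Ascend 910B device; the shapes and keyword arguments follow the test, while the softmax scale value here is an illustrative 1/sqrt(head_dim):

import math
import torch
import torch_npu

scale = 1 / math.sqrt(128.0)  # illustrative scale, not taken from the test
query = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu")
key = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu")
value = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu")

# Same arguments as before the rename; only the operator name changed.
out = torch_npu.npu_fused_infer_attention_score_v2(
    query, key, value,
    num_query_heads=32, input_layout="BNSD", softmax_scale=scale,
    pre_tokens=65535, next_tokens=65535,
    return_softmax_lse=False, actual_seq_qlen=[29])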
+LOAD_FUNCTION(AmlP2PDetectOnline) bool IsExistAmlAicoreDetectOnline() { @@ -24,6 +25,15 @@ bool IsExistAmlAicoreDetectOnline() return isExist; } +bool IsExistAmlP2PDetectOnline() +{ + const static bool isExist = []() -> bool { + static auto func = GET_FUNC(AmlP2PDetectOnline); + return func != nullptr; + }(); + return isExist; +} + AmlStatus AmlAicoreDetectOnlineFace(int32_t deviceId, const AmlAicoreDetectAttr *attr) { typedef AmlStatus (*amlAicoreDetectOnline)(int32_t, const AmlAicoreDetectAttr *); @@ -35,5 +45,16 @@ AmlStatus AmlAicoreDetectOnlineFace(int32_t deviceId, const AmlAicoreDetectAttr return func(deviceId, attr); } +AmlStatus AmlP2PDetectOnlineFace(int32_t deviceId, void *comm, const AmlP2PDetectAttr *attr) +{ + typedef AmlStatus (*amlP2PDetectOnline)(int32_t, void *, const AmlP2PDetectAttr *); + static amlP2PDetectOnline func = nullptr; + if (func == nullptr) { + func = (amlP2PDetectOnline) GET_FUNC(AmlP2PDetectOnline); + } + TORCH_CHECK(func, "Failed to find function ", "AmlP2PDetectOnline", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(deviceId, comm, attr); +} + } // namespace amlapi } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/interface/MlInterface.h b/torch_npu/csrc/core/npu/interface/MlInterface.h index 6af498629a..33389fcce3 100644 --- a/torch_npu/csrc/core/npu/interface/MlInterface.h +++ b/torch_npu/csrc/core/npu/interface/MlInterface.h @@ -8,10 +8,20 @@ namespace amlapi { */ bool IsExistAmlAicoreDetectOnline(); +/** + * This API is used to check whether AmlP2PDetectOnline exist. +*/ +bool IsExistAmlP2PDetectOnline(); + /** * This API is used to call AmlAicoreDetectOnline. */ AmlStatus AmlAicoreDetectOnlineFace(int32_t deviceId, const AmlAicoreDetectAttr *attr); +/** + * This API is used to call AmlP2PDetectOnline. 
+*/ +AmlStatus AmlP2PDetectOnlineFace(int32_t deviceId, void *comm, const AmlP2PDetectAttr *attr); + } // namespace amlapi } // namespace c10_npu diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index ce450b778c..72c8e3d302 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -742,15 +742,29 @@ PyObject* THNPModule_maybeExchangeDevice_wrap(PyObject* self, PyObject* arg) END_HANDLE_TH_ERRORS } -PyObject* THNPModule_stressDetect_wrap(PyObject* self, PyObject* noargs) +PyObject* THNPModule_stressDetect_wrap(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS + PyObject* value1 = nullptr; + PyObject* value2 = nullptr; + + if (!PyArg_ParseTuple(args, "OO", &value1, &value2)) { + ASCEND_LOGE("Stress detect failed, argument is invalid."); + return PyLong_FromLong(1); + } + int mode = THPUtils_unpackLong(value1); + int64_t comm = THPUtils_unpackLong(value2); + torch_npu::utils::npu_lazy_init(); - int device_id; - NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device_id)); + int deviceId; + aclError err = c10_npu::GetDevice(&deviceId); + if (err != ACL_ERROR_NONE) { + ASCEND_LOGE("Stress detect failed, error happened in GetDevice, err is %d.", err); + return PyLong_FromLong(1); + } - int ret = StressDetector::perform_stress_detect(device_id); + int ret = StressDetector::perform_stress_detect(deviceId, mode, comm); return PyLong_FromLong(ret); END_HANDLE_TH_ERRORS } @@ -1822,7 +1836,7 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_restart_device", (PyCFunction)THNPModule_restart_device_wrap, METH_O, nullptr}, {"_npu_check_uce_in_memory", (PyCFunction)THNPModule_check_uce_in_memory_wrap, METH_O, nullptr}, {"_npu_get_uce_addr", (PyCFunction)THNPModule_get_uce_addr_wrap, METH_NOARGS, nullptr}, - {"_npu_stress_detect", (PyCFunction)THNPModule_stressDetect_wrap, METH_NOARGS, nullptr}, + {"_npu_stress_detect", (PyCFunction)THNPModule_stressDetect_wrap, METH_VARARGS, nullptr}, {"_npu_getLocalDevice", (PyCFunction)THNPModule_getLocalDevice_wrap, METH_NOARGS, nullptr}, {"_npu_getDeviceCount", (PyCFunction)THNPModule_getDeviceCount_wrap, METH_NOARGS, nullptr}, {"_npu_canDeviceAccessPeer", (PyCFunction)THNPModule_npuCanDeviceAccessPeer_wrap, METH_VARARGS, nullptr}, diff --git a/torch_npu/csrc/npu/Stress_detect.cpp b/torch_npu/csrc/npu/Stress_detect.cpp index 3fcade819b..6b747287f0 100644 --- a/torch_npu/csrc/npu/Stress_detect.cpp +++ b/torch_npu/csrc/npu/Stress_detect.cpp @@ -16,6 +16,8 @@ std::mutex StressDetector::mtx; int StressDetector::device_id; void* StressDetector::workspaceAddr = nullptr; size_t StressDetector::workspaceSize = 0; +int StressDetector::stressMode = 0; +void* StressDetector::localHcclComm = nullptr; constexpr int kDetectSucceeded = 0; constexpr int kDetectFailed = 1; @@ -40,16 +42,35 @@ void StressDetector::worker_thread() // Execute the task int ret = -1; - if (c10_npu::amlapi::IsExistAmlAicoreDetectOnline()) { - AmlAicoreDetectAttr attr; - attr.mode = AML_DETECT_RUN_MODE_ONLINE; - attr.workspace = workspaceAddr; - attr.workspaceSize = workspaceSize; - ret = c10_npu::amlapi::AmlAicoreDetectOnlineFace(device_id, &attr); - ASCEND_LOGI("Stress detect with AmlAicoreDetectOnline, result is %d.", ret); - } else { - ret = c10_npu::acl::AclStressDetect(device_id, workspaceAddr, workspaceSize); - ASCEND_LOGI("Stress detect with StressDetect, result is %d.", ret); + try { + if (stressMode == 0) { + if (c10_npu::amlapi::IsExistAmlAicoreDetectOnline()) { + AmlAicoreDetectAttr attr; + attr.mode = AML_DETECT_RUN_MODE_ONLINE; + 
attr.workspace = workspaceAddr; + attr.workspaceSize = workspaceSize; + ret = c10_npu::amlapi::AmlAicoreDetectOnlineFace(device_id, &attr); + ASCEND_LOGI("Stress detect with AmlAicoreDetectOnline, result is %d.", ret); + } else { + ret = c10_npu::acl::AclStressDetect(device_id, workspaceAddr, workspaceSize); + ASCEND_LOGI("Stress detect with StressDetect, result is %d.", ret); + } + } else { + if (c10_npu::amlapi::IsExistAmlP2PDetectOnline()) { + AmlP2PDetectAttr attr; + attr.workspace = workspaceAddr; + attr.workspaceSize = workspaceSize; + ret = c10_npu::amlapi::AmlP2PDetectOnlineFace(device_id, localHcclComm, &attr); + ASCEND_LOGI("Stress detect with AmlP2PDetectOnline, result is %d.", ret); + } else { + ASCEND_LOGW("Stress detect with AmlP2PDetectOnline failed, CANN version lower than 8.2.RC1 and currently does not support AmlP2PDetectOnline."); + TORCH_NPU_WARN("Stress detect with AmlP2PDetectOnline failed, CANN version lower than 8.2.RC1 and currently does not support AmlP2PDetectOnline."); + } + } + } catch (std::exception &e) { + ret = -1; + ASCEND_LOGW("Stress detect failed. type is %d, error:%s", stressMode, e.what()); + TORCH_NPU_WARN("Stress detect failed. type is ", stressMode, ", error: ", e.what()); } // Task complete, free memory @@ -76,21 +97,23 @@ int StressDetector::transfer_result(int detectResult) case ACLNN_STRESS_LOW_BIT_FAIL: case ACLNN_STRESS_HIGH_BIT_FAIL: ret = kDetectFailedWithHardwareFailure; - ASCEND_LOGE("Stress detect failed due to hardware malfunction, error code is %d.", detectResult); + ASCEND_LOGW("Stress detect failed due to hardware malfunction, error code is %d.", detectResult); + TORCH_NPU_WARN("Stress detect failed due to hardware malfunction, error code is ", detectResult); break; case ACLNN_CLEAR_DEVICE_STATE_FAIL: TORCH_CHECK(false, "Stress detect error. Error code is 574007. Error message is Voltage recovery failed.", PTA_ERROR(ErrCode::ACL)); break; default: ret = kDetectFailed; - ASCEND_LOGE("Stress detect test case execution failed, error code is %d.", detectResult); + ASCEND_LOGW("Stress detect test case execution failed, error code is %d.", detectResult); + TORCH_NPU_WARN("Stress detect test case execution failed, error code is ", detectResult); break; } return ret; } // Synchronous stress detection task execution -int StressDetector::perform_stress_detect(int deviceid) +int StressDetector::perform_stress_detect(int deviceid, int mode, int64_t comm) { // If it's the first call, start the persistent thread if (!thread_initialized.load()) { @@ -115,6 +138,7 @@ int StressDetector::perform_stress_detect(int deviceid) ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); if (ret != ACL_ERROR_NONE) { ASCEND_LOGW("call AclrtMallocAlign32 failed, ERROR : %d. 
Skip StressDetect.", ret); + TORCH_NPU_WARN("call AclrtMallocAlign32 failed, skip StressDetect, error is ", ret); task_in_progress.store(false); // Task ends return kDetectFailed; } @@ -132,6 +156,8 @@ int StressDetector::perform_stress_detect(int deviceid) StressDetector::device_id = deviceid; StressDetector::workspaceAddr = workspaceAddr; StressDetector::workspaceSize = workspaceSize; + StressDetector::stressMode = mode; + StressDetector::localHcclComm = reinterpret_cast(static_cast(comm)); // Mark new task submitted new_task_submitted.store(true); diff --git a/torch_npu/csrc/npu/Stress_detect.h b/torch_npu/csrc/npu/Stress_detect.h index 4319122be7..47f020a870 100644 --- a/torch_npu/csrc/npu/Stress_detect.h +++ b/torch_npu/csrc/npu/Stress_detect.h @@ -9,10 +9,11 @@ #include #include #include "torch_npu/csrc/core/npu/NPUMacros.h" +#include "torch_npu/csrc/distributed/HCCLUtils.hpp" class StressDetector { public: - TORCH_NPU_API static int perform_stress_detect(int deviceid); + TORCH_NPU_API static int perform_stress_detect(int deviceid, int mode, int64_t comm); TORCH_NPU_API static void stop_worker_thread(); private: @@ -44,6 +45,8 @@ private: static int device_id; static void* workspaceAddr; static size_t workspaceSize; + static int stressMode; + static void* localHcclComm; // Flag to indicate if the thread has been initialized static std::atomic thread_initialized; diff --git a/torch_npu/npu/utils.py b/torch_npu/npu/utils.py index a16a5dbb94..c73486d91a 100644 --- a/torch_npu/npu/utils.py +++ b/torch_npu/npu/utils.py @@ -371,9 +371,38 @@ def clear_npu_overflow_flag(): torch_npu.npu_clear_float_status(float_status) -def stress_detect(): +hccl_detect_group = None + + +def stress_detect(mode=0): + if mode not in [0, 1]: + warnings.warn("Detecct_type should be 0 or 1. 
For details, 0 as `Online aicore detect`, 1 as `Online p2p detect`.") + return 1 torch_npu.npu._lazy_init() - return torch_npu._C._npu_stress_detect() + comm = 0 + if mode == 1: + if not torch.distributed.is_initialized(): + warnings.warn("The torch.distributed should to be initialized for p2p detection.") + return 1 + global hccl_detect_group + if hccl_detect_group is None: + rank = int(os.getenv('RANK', -1)) + local_world_size = int(os.getenv('LOCAL_WORLD_SIZE', -1)) + if rank == -1 or local_world_size == -1: + warnings.warn("Environment variable 'RANK' or 'LOCAL_WORLD_SIZE' is not set.") + return 1 + worker_index = rank // local_world_size + local_rank = rank % local_world_size + local_ranks = [] + for i in range(local_world_size): + local_ranks.append(local_world_size * worker_index + i) + try: + hccl_detect_group = torch.distributed.new_group(ranks=local_ranks) + comm = hccl_detect_group._get_backend(torch.device('npu')).get_hccl_comm(local_rank) + except Exception as err: + warnings.warn("Create local hccl group for p2p detection failed.") + return 1 + return torch_npu._C._npu_stress_detect(mode, comm) def current_blas_handle(): -- Gitee From c80b6b273af2b1758046461558afd670a11020ff Mon Sep 17 00:00:00 2001 From: JianxinZhang Date: Wed, 30 Jul 2025 08:26:50 +0000 Subject: [PATCH 322/328] !23442 reselect static kernel Merge pull request !23442 from JianxinZhang/v2.7.1 --- torch_npu/csrc/core/npu/interface/OpInterface.cpp | 14 ++++++++++++++ torch_npu/csrc/core/npu/interface/OpInterface.h | 9 +++++++++ torch_npu/csrc/npu/Module.cpp | 11 +++++++++++ torch_npu/npu/__init__.py | 5 +++++ 4 files changed, 39 insertions(+) diff --git a/torch_npu/csrc/core/npu/interface/OpInterface.cpp b/torch_npu/csrc/core/npu/interface/OpInterface.cpp index e950ee9f93..19a2e59b7b 100644 --- a/torch_npu/csrc/core/npu/interface/OpInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/OpInterface.cpp @@ -1,5 +1,6 @@ #include "OpInterface.h" #include "torch_npu/csrc/core/npu/register/FunctionLoader.h" +#include "torch_npu/csrc/core/npu/NPUException.h" namespace c10_npu { @@ -14,6 +15,7 @@ namespace opapi { REGISTER_LIBRARY(libopapi) LOAD_FUNCTION(aclnnSilentCheck) LOAD_FUNCTION(aclnnSilentCheckV2) +LOAD_FUNCTION(aclnnReselectStaticKernel) bool IsExistAclnnSilentCheck() { @@ -24,5 +26,17 @@ bool IsExistAclnnSilentCheck() return isExist; } +aclnnStatus ReselectStaticKernel() +{ + typedef aclnnStatus (*AclnnApiFunc)(); + static AclnnApiFunc aclnnReselectStaticKernelFunc = nullptr; + if (aclnnReselectStaticKernelFunc == nullptr) { + aclnnReselectStaticKernelFunc = (AclnnApiFunc)GET_FUNC(aclnnReselectStaticKernel); + } + TORCH_CHECK(aclnnReselectStaticKernelFunc, "Failed to find function ", "aclnnReselectStaticKernel", PTA_ERROR(ErrCode::NOT_FOUND)); + auto ret = aclnnReselectStaticKernelFunc(); + return ret; +} + } // namespace opapi } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/interface/OpInterface.h b/torch_npu/csrc/core/npu/interface/OpInterface.h index 663f9a6144..e4cd5db180 100644 --- a/torch_npu/csrc/core/npu/interface/OpInterface.h +++ b/torch_npu/csrc/core/npu/interface/OpInterface.h @@ -1,11 +1,20 @@ #pragma once +#include + namespace c10_npu { namespace opapi { +typedef int32_t aclnnStatus; + /** * This API is used to check whether aclnnSilentCheck exist. */ bool IsExistAclnnSilentCheck(); +/** + This Api is used to reselect static kernel, it need to be called once at process. 
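The stress-detect extension in !23428 above gives torch_npu.npu.stress_detect a mode argument: 0 keeps the existing online AI-core check, while 1 runs the new P2P (HCCL) check, which needs an initialized torch.distributed process group plus the RANK and LOCAL_WORLD_SIZE environment variables so a node-local HCCL group can be built. A hedged sketch of calling both modes from a worker launched with torchrun or an equivalent launcher that sets those variables:

import torch
import torch.distributed as dist
import torch_npu

dist.init_process_group(backend="hccl")

# mode=0: online AI-core stress detection, the pre-existing behaviour.
ret = torch_npu.npu.stress_detect(mode=0)

# mode=1: P2P detection; a node-local HCCL group is created internally and its
# communicator is handed to AmlP2PDetectOnline (requires CANN 8.2.RC1 or newer).
ret = torch_npu.npu.stress_detect(mode=1)
print("stress detect returned", ret)  # 0 means the check passed

Most error paths added in this patch emit warnings and return a non-zero value instead of raising, so the call can be left in long-running training scripts.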
+ */ +aclnnStatus ReselectStaticKernel(); + } // namespace opapi } // namespace c10_npu diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 72c8e3d302..006a70c9ac 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -1646,6 +1646,16 @@ PyObject* THNPModule_npu_get_silent_check_version(PyObject* self, PyObject* noar END_HANDLE_TH_ERRORS } +PyObject* THNPModule_aclnn_reselect_static_kernel(PyObject* self, PyObject* noargs) +{ + HANDLE_TH_ERRORS + NPUStatus ret = c10_npu::emptyAllNPUStream(); + TORCH_CHECK(ret == NPU_STATUS_SUCCESS, "Failed to empty NPU task queue, ret:", ret, PTA_ERROR(ErrCode::INTERNAL)); + c10_npu::opapi::ReselectStaticKernel(); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS @@ -1877,6 +1887,7 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_set_call_state", (PyCFunction)THNPModule_npu_set_call_state, METH_O, nullptr}, {"_npu_set_module_train_state", (PyCFunction)THNPModule_npu_set_module_train_state, METH_O, nullptr}, {"_get_silent_check_version", (PyCFunction)THNPModule_npu_get_silent_check_version, METH_NOARGS, nullptr}, + {"_aclnn_reselect_static_kernel", (PyCFunction)THNPModule_aclnn_reselect_static_kernel, METH_NOARGS, nullptr}, {"_npu_set_thread_affinity", (PyCFunction)THNPModule_npu_set_thread_affinity, METH_VARARGS, nullptr}, {"_npu_reset_thread_affinity", (PyCFunction)THNPModule_npu_reset_thread_affinity, METH_NOARGS, nullptr}, {"_npu_set_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_set_fft_plan_cache_max_size, METH_VARARGS, nullptr}, diff --git a/torch_npu/npu/__init__.py b/torch_npu/npu/__init__.py index 7210d6e431..5142af3a02 100644 --- a/torch_npu/npu/__init__.py +++ b/torch_npu/npu/__init__.py @@ -474,6 +474,11 @@ def utilization(device=None): return torch_npu._C._npu_getDeviceUtilizationRate(device_id) +def _aclnn_reselect_static_kernel(): + torch_npu.npu._lazy_init() + torch_npu._C._aclnn_reselect_static_kernel() + + from .random import * # noqa: F403 from .memory import * # noqa: F403 -- Gitee From bcb80138a4508dc85c02b166c6e84f956056a9fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Wed, 30 Jul 2025 08:56:44 +0000 Subject: [PATCH 323/328] =?UTF-8?q?!23402=20Add=20ShareableHandle=5Fto=5Fh?= =?UTF-8?q?andle=20map=20Merge=20pull=20request=20!23402=20from=20?= =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87/v2.7.1=5Fipc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/InitNpuBindings.cpp | 1 + .../csrc/core/npu/NPUCachingAllocator.cpp | 33 ++++++++++++++++++- torch_npu/csrc/core/npu/NPUCachingAllocator.h | 6 ++++ torch_npu/csrc/npu/NPUPluggableAllocator.cpp | 6 ++++ torch_npu/csrc/npu/NPUPluggableAllocator.h | 1 + 5 files changed, 46 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp index 672b5289f5..b30b086e5c 100644 --- a/torch_npu/csrc/InitNpuBindings.cpp +++ b/torch_npu/csrc/InitNpuBindings.cpp @@ -80,6 +80,7 @@ PyObject* THPModule_npu_shutdown(PyObject* self, PyObject* arg) } catch (...) 
{ ASCEND_LOGE("NPUSwappedMemoryAllocator::emptyCache failed"); } + c10_npu::NPUCachingAllocator::clearIpcHandles(); ASCEND_LOGI("NPU shutdown NpuSysCtrl Finalize."); c10_npu::NpuSysCtrl::SysStatus status = c10_npu::NpuSysCtrl::GetInstance().Finalize(); diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 5ad16c9860..4503d884a3 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -361,6 +361,10 @@ However, it is possible to temporarily disable (expandable_segments:False) the bevhavior for allocator tensors that need to be used cross-process. */ +std::mutex ipcHandleMutex; +ska::flat_hash_map ipcShareableHandle_to_handle; +ska::flat_hash_set ipcHandles; + struct ExpandableSegment { ExpandableSegment( int device, @@ -489,6 +493,7 @@ struct ExpandableSegment { c10::DeviceIndex device, std::istream& buf) { + std::lock_guard lock(ipcHandleMutex); ShareHeader header{}; buf.read((char*)&header, sizeof(ShareHeader)); auto segment = std::make_unique( @@ -499,11 +504,21 @@ struct ExpandableSegment { (void)i; uint64_t shareableHandle = 0; buf.read((char*)&shareableHandle, sizeof(uint64_t)); + + auto iter = ipcShareableHandle_to_handle.find(shareableHandle); + if (iter != ipcShareableHandle_to_handle.end()) { + aclrtDrvMemHandle handle = iter->second; + segment->handles_.emplace_back(Handle{handle, shareableHandle}); + continue; + } + int32_t deviceId = static_cast(device); aclrtDrvMemHandle handle; NPU_CHECK_ERROR(c10_npu::acl::AclrtMemImportFromShareableHandle( shareableHandle, deviceId, &handle)); - segment->handles_.emplace_back(Handle{handle, std::nullopt}); + segment->handles_.emplace_back(Handle{handle, shareableHandle}); + ipcShareableHandle_to_handle.insert(iter, {shareableHandle, handle}); + ipcHandles.insert(handle); } segment->mapAndSetAccess(0, header.num_handles); return segment; @@ -573,6 +588,13 @@ private: Handle h = handles_.at(i).value(); handles_.at(i) = c10::nullopt; NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *)ptr_ + segment_size_ * i, getHcclComm())); + if (C10_UNLIKELY(h.shareableHandle)) { + std::lock_guard lock(ipcHandleMutex); + auto iter = ipcHandles.find(h.handle); + if (iter != ipcHandles.end()) { + continue; + } + } NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h.handle)); } ASCEND_LOGD("NPUCachingAllocator unmap: segment_size=%zu", segment_size_); @@ -3319,6 +3341,15 @@ public: ASCEND_LOGD("End empty cache with check_error = %d", check_error); } + void clearIpcHandles() override + { + std::lock_guard lock(ipcHandleMutex); + for (auto &handle : ipcHandles) { + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(handle)); + } + ipcHandles.clear(); + } + void *getBaseAllocation(void *ptr, size_t *outSize) override { Block *block = get_allocated_block(ptr); diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index c7082c8904..13c68aa0e3 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -203,6 +203,7 @@ public: virtual bool initialized() = 0; virtual void setMemoryFraction(double fraction, int device) = 0; virtual void emptyCache(bool check_error) = 0; + virtual void clearIpcHandles() = 0; virtual void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock) = 0; virtual void* getBaseAllocation(void* ptr, size_t* size) = 0; virtual void recordStream(const c10::DataPtr& ptr, c10_npu::NPUStream stream) = 0; @@ -310,6 
+311,11 @@ C10_NPU_API inline void emptyCache(bool check_error = true) return get()->emptyCache(check_error); } +inline void clearIpcHandles() +{ + return get()->clearIpcHandles(); +} + inline void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock) { return get()->cacheInfo(dev_id, cachedAndFree, largestBlock); diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp index 14ea0ce7e7..7610374a3b 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp @@ -189,6 +189,12 @@ void NPUPluggableAllocator::emptyCache(bool check_error) } } +void NPUPluggableAllocator::clearIpcHandles() +{ + TORCH_NPU_WARN("NPUPluggableAllocator does not yet support clearIpcHandles. " + "If you need it, please file an issue describing your use case."); +} + void NPUPluggableAllocator::cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock) { TORCH_NPU_WARN("NPUPluggableAllocator does not yet support cacheInfo. " diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.h b/torch_npu/csrc/npu/NPUPluggableAllocator.h index a3691d48ee..266db02a60 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.h +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.h @@ -60,6 +60,7 @@ struct NPUPluggableAllocator bool initialized() override; void setMemoryFraction(double fraction, int device) override; void emptyCache(bool check_error) override; + void clearIpcHandles() override; void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock) override; void* getBaseAllocation(void* ptr, size_t* size) override; void recordStream(const c10::DataPtr&, streamType stream) override; -- Gitee From 2cbd143b9e74a976972ad50c3d8d17f1cb054267 Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Wed, 30 Jul 2025 08:59:54 +0000 Subject: [PATCH 324/328] !23484 Update commit id (op-plugin & torchair) Merge pull request !23484 from dilililiwhy/update_submodule_npu_fused_infer_attention_score_v2_271 --- third_party/op-plugin | 2 +- third_party/torchair/torchair | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index a41de2bf06..80414c2b6f 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit a41de2bf0614525d03a3d899290f79a427ea507d +Subproject commit 80414c2b6f000a52e8ca2d41ea36e4339028a1ef diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index df181ae876..e4795c9949 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit df181ae8764c04d966c044ff79f43e8544e223e2 +Subproject commit e4795c99490e62f0a74e3bbb1506b7fa89e869c9 -- Gitee From 6c390ebcbc9f145b2776215cba50152e11c75277 Mon Sep 17 00:00:00 2001 From: ysybh <244036962@qq.com> Date: Thu, 31 Jul 2025 02:24:52 +0000 Subject: [PATCH 325/328] !23449 Support launch host func Merge pull request !23449 from ysybh/v2.7.1_0729 --- test/npu/test_aclgraph_launch_host_func.py | 60 +++++++++++++ torch_npu/csrc/core/npu/NPUGraph.cpp | 16 ++++ torch_npu/csrc/core/npu/NPUGraph.h | 8 ++ .../csrc/core/npu/interface/AclInterface.cpp | 39 +++++++++ .../csrc/core/npu/interface/AclInterface.h | 6 ++ torch_npu/csrc/npu/Graph.cpp | 85 +++++++++++++++++-- torch_npu/csrc/npu/Graph.h | 33 +++++++ torch_npu/npu/__init__.py | 12 +++ 8 files changed, 253 insertions(+), 6 deletions(-) create mode 100644 test/npu/test_aclgraph_launch_host_func.py create mode 100644 torch_npu/csrc/npu/Graph.h diff --git 
a/test/npu/test_aclgraph_launch_host_func.py b/test/npu/test_aclgraph_launch_host_func.py new file mode 100644 index 0000000000..7748a3308a --- /dev/null +++ b/test/npu/test_aclgraph_launch_host_func.py @@ -0,0 +1,60 @@ +import unittest +from itertools import chain + +import torch +from torch import nn +import torch_npu +from torch_npu.testing.common_utils import SupportedDevices +from torch_npu.testing.testcase import TestCase, run_tests + +callback_stream = torch.npu.Stream() + + +def callback_add(params): + global callback_stream + with torch.npu.stream(callback_stream): + x, y, result = params + result.copy_(x + y) + + +class MyModel(nn.Module): + def __init__(self): + super(MyModel, self).__init__() + self.result = torch.rand([5, 5]).npu() + + def forward(self, graph, x, y): + call_params = [torch.matmul(x, y), torch.matmul(x, y), self.result] + for _ in range(10000): + torch_npu.npu._launch_host_func(torch.npu.current_stream(), callback_add, call_params) + return self.result + + +class TestAclgraphLaunchHostFunc(TestCase): + + @SupportedDevices(['Ascend910B']) + def test_launch_host_func(self): + torch_npu.npu.set_compile_mode(jit_compile=False) + torch_npu.npu.set_device(0) + + self.capture_stream = torch_npu.npu.Stream() + self.graph = torch_npu.npu.NPUGraph() + + torch_npu.npu._subscribe_report(self.capture_stream) + a = torch.randn([5, 5]).npu() + b = torch.randn([5, 5]).npu() + model = MyModel() + with torch_npu.npu.stream(self.capture_stream): + with torch_npu.npu.graph(self.graph, stream=self.capture_stream): + self.res = model.forward(self.graph, a, b) + + torch.npu.synchronize() + for _ in range(5): + self.graph.replay() + torch.npu.synchronize() + real = torch.matmul(a, b) + torch.matmul(a, b) + self.assertEqual(self.res.cpu(), real.cpu()) + torch_npu.npu._unsubscribe_report(self.capture_stream) + + +if __name__ == '__main__': + run_tests() \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUGraph.cpp b/torch_npu/csrc/core/npu/NPUGraph.cpp index a00448bd1c..fc76880158 100644 --- a/torch_npu/csrc/core/npu/NPUGraph.cpp +++ b/torch_npu/csrc/core/npu/NPUGraph.cpp @@ -47,6 +47,22 @@ void graph_task_update_end(c10_npu::NPUStream stream) NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureTaskUpdateEnd(stream)); } +void launch_callback(c10_npu::NPUStream stream, NPUCallbackFunc func, void *fnData) +{ + aclrtCallbackBlockType type = aclrtCallbackBlockType::ACL_CALLBACK_BLOCK; + NPU_CHECK_ERROR(c10_npu::acl::AclrtLaunchCallback(func, fnData, type, stream)); +} + +void subscribe_report(uint64_t threadId, c10_npu::NPUStream stream) +{ + NPU_CHECK_ERROR(c10_npu::acl::AclrtSubscribeReport(threadId, stream)); +} + +void unsubscribe_report(uint64_t threadId, c10_npu::NPUStream stream) +{ + NPU_CHECK_ERROR(c10_npu::acl::AclrtUnSubscribeReport(threadId, stream)); +} + /** * Note [CUDA Graph Wrapper Class] * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/torch_npu/csrc/core/npu/NPUGraph.h b/torch_npu/csrc/core/npu/NPUGraph.h index 442ae335cc..2228f0916b 100644 --- a/torch_npu/csrc/core/npu/NPUGraph.h +++ b/torch_npu/csrc/core/npu/NPUGraph.h @@ -4,6 +4,8 @@ #include #include +#include "third_party/acl/inc/acl/acl_base.h" +#include "third_party/acl/inc/acl/acl_rt.h" #include "torch_npu/csrc/core/npu/NPUGraphsUtils.h" #include "torch_npu/csrc/core/npu/NPUMacros.h" #include "torch_npu/csrc/core/npu/NPUStream.h" @@ -18,11 +20,17 @@ struct TORCH_NPU_API NPUTaskGroupHandle { aclrtTaskGrp task_group; }; +typedef TORCH_NPU_API void (*NPUCallbackFunc)(void *fnData); + TORCH_NPU_API void 
graph_task_group_begin(c10_npu::NPUStream stream); TORCH_NPU_API NPUTaskGroupHandle graph_task_group_end(c10_npu::NPUStream stream); TORCH_NPU_API void graph_task_update_begin(c10_npu::NPUStream stream, NPUTaskGroupHandle handle); TORCH_NPU_API void graph_task_update_end(c10_npu::NPUStream stream); +TORCH_NPU_API void launch_callback(c10_npu::NPUStream stream, NPUCallbackFunc func, void *fnData); +TORCH_NPU_API void subscribe_report(uint64_t threadId, c10_npu::NPUStream stream); +TORCH_NPU_API void unsubscribe_report(uint64_t threadId, c10_npu::NPUStream stream); + struct TORCH_NPU_API NPUGraph { NPUGraph(); ~NPUGraph(); diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 6b8053b9c3..acf97e0300 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -94,6 +94,9 @@ LOAD_FUNCTION(aclrtGetDeviceResLimit) LOAD_FUNCTION(aclrtSetDeviceResLimit) LOAD_FUNCTION(aclrtResetDeviceResLimit) LOAD_FUNCTION(aclrtStreamGetId) +LOAD_FUNCTION(aclrtLaunchCallback) +LOAD_FUNCTION(aclrtSubscribeReport) +LOAD_FUNCTION(aclrtUnSubscribeReport) aclprofStepInfoPtr init_stepinfo() { typedef aclprofStepInfoPtr(*npdInitFunc)(); @@ -1084,5 +1087,41 @@ aclError AclrtStreamGetId(aclrtStream stream, int32_t* stream_id) return func(stream, stream_id); } +aclError AclrtLaunchCallback(aclrtCallback fn, void *userData, aclrtCallbackBlockType blockType, aclrtStream stream) +{ + typedef aclError (*AclrtLaunchCallback)(aclrtCallback, void *, aclrtCallbackBlockType, aclrtStream); + static AclrtLaunchCallback func = nullptr; + if (func == nullptr) { + func = (AclrtLaunchCallback) GET_FUNC(aclrtLaunchCallback); + } + + TORCH_CHECK(func, "Failed to find function aclrtLaunchCallback", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(fn, userData, blockType, stream); +} + +aclError AclrtSubscribeReport(uint64_t threadId, aclrtStream stream) +{ + typedef aclError (*AclrtSubscribeReport)(uint64_t, aclrtStream); + static AclrtSubscribeReport func = nullptr; + if (func == nullptr) { + func = (AclrtSubscribeReport) GET_FUNC(aclrtSubscribeReport); + } + + TORCH_CHECK(func, "Failed to find function aclrtSubscribeReport", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(threadId, stream); +} + +aclError AclrtUnSubscribeReport(uint64_t theadId, aclrtStream stream) +{ + typedef aclError (*AclrtUnSubscribeReport)(uint64_t, aclrtStream); + static AclrtUnSubscribeReport func = nullptr; + if (func == nullptr) { + func = (AclrtUnSubscribeReport) GET_FUNC(aclrtUnSubscribeReport); + } + + TORCH_CHECK(func, "Failed to find function aclrtUnSubscribeReport", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(theadId, stream); +} + } // namespace acl } // namespace c10 diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index 367963c070..3aedfbf6d0 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -259,5 +259,11 @@ aclError AclrtResetDeviceResLimit(int32_t deviceId); aclError AclrtStreamGetId(aclrtStream stream, int32_t* stream_id); +aclError AclrtLaunchCallback(aclrtCallback fn, void *userData, aclrtCallbackBlockType blockType, aclrtStream stream); + +aclError AclrtSubscribeReport(uint64_t theadId, aclrtStream stream); + +aclError AclrtUnSubscribeReport(uint64_t theadId, aclrtStream stream); + } // namespace acl } // namespace c10_npu diff --git a/torch_npu/csrc/npu/Graph.cpp 
b/torch_npu/csrc/npu/Graph.cpp index c8d30cfa44..09193b652e 100644 --- a/torch_npu/csrc/npu/Graph.cpp +++ b/torch_npu/csrc/npu/Graph.cpp @@ -1,16 +1,53 @@ -#include - -#include - -#include -#include +#include +#include #include "torch_npu/csrc/core/npu/NPUGraph.h" #include "torch_npu/csrc/core/npu/NPUGraphsUtils.h" #include "torch_npu/csrc/npu/Stream.h" +#include "torch_npu/csrc/npu/Graph.h" template using shared_ptr_class_ = py::class_>; +static std::map> callbacks = {}; +constexpr int processReportTimeout = 100; +static ThreadArgs* threadArgs = nullptr; +static uint64_t threadId = -1; + +void *process_callback(void *arg) +{ + ThreadArgs* args = static_cast(arg); + auto ret = aclrtSetCurrentContext(args->context); + while (!args->exitFlag) { + (void)aclrtProcessReport(processReportTimeout); + } + delete args; + args = nullptr; + return nullptr; +} + +void LaunchCallFunc(void *userData) +{ + PyGILState_STATE state = PyGILState_Ensure(); + if (userData == nullptr) { + return; + } + auto data = (PyFuncStruct *)(userData); + PyObject *argslist = Py_BuildValue("(O)", data->pyFuncArgs); + if (argslist == nullptr) { + return; + } + PyObject *result = PyObject_CallObject(data->pyFunc, argslist); + if (result == nullptr) { + return; + } + if (argslist != nullptr) { + Py_XDECREF(argslist); + } + if (result != nullptr) { + Py_XDECREF(result); + } + PyGILState_Release(state); +} void TORCH_NPU_API THNPGraph_init(PyObject* module) { // Pybind11 patch notes say "py::module_" is more up-to-date syntax, @@ -36,6 +73,42 @@ void TORCH_NPU_API THNPGraph_init(PyObject* module) { .def("_graph_task_update_end", [](py::object py_stream) { auto stream = (*py_stream).ptr(); c10_npu::graph_task_update_end(THNPUtils_PyObject_to_NPUStream(stream)); + }) + .def("_launch_host_func", [](py::object py_stream, py::object py_func, py::object py_data) { + auto func = (*py_func).ptr(); + auto userDataList = (*py_data).ptr(); + auto stream = THNPUtils_PyObject_to_NPUStream((*py_stream).ptr()); + PyFuncStruct *data = new(std::nothrow) PyFuncStruct(func, userDataList); + c10_npu::launch_callback(stream, LaunchCallFunc, data); + callbacks[stream].emplace_back(data); + }) + .def("_subscribe_report", [](py::object py_stream) { + auto stream = (*py_stream).ptr(); + aclrtContext context = aclrtContext(); + NPU_CHECK_ERROR(aclrtGetCurrentContext(&context)); + if ((threadArgs == nullptr) || (threadId == -1)) { + threadArgs = new ThreadArgs(context, false); + pthread_create(&threadId, nullptr, process_callback, threadArgs); + } + c10_npu::subscribe_report(threadId, THNPUtils_PyObject_to_NPUStream(stream)); + }) + .def("_unsubscribe_report", [](py::object py_stream) { + auto stream = THNPUtils_PyObject_to_NPUStream((*py_stream).ptr()); + c10_npu::unsubscribe_report(threadId, stream); + auto it = callbacks.find(stream); + if (it != callbacks.end()) { + std::vector& funcs = it->second; + for (PyFuncStruct* func : funcs) { + delete func; + func = nullptr; + } + funcs.clear(); + callbacks.erase(it); + } + if (callbacks.empty()) { + threadArgs->exitFlag = true; + threadId = -1; + } }); shared_ptr_class_(torch_N_m, "_NPUGraph") diff --git a/torch_npu/csrc/npu/Graph.h b/torch_npu/csrc/npu/Graph.h new file mode 100644 index 0000000000..8402a92094 --- /dev/null +++ b/torch_npu/csrc/npu/Graph.h @@ -0,0 +1,33 @@ +#include +#include +#include +#include + +#include "third_party/acl/inc/acl/acl_base.h" +#include "third_party/acl/inc/acl/acl_rt.h" + +struct PyFuncStruct { + PyFuncStruct(PyObject *pyFunc, PyObject *pyFuncArgs) + : pyFunc(pyFunc), 
pyFuncArgs(pyFuncArgs) + { + Py_XINCREF(pyFunc); + Py_XINCREF(pyFuncArgs); + } + + ~PyFuncStruct() + { + Py_XDECREF(pyFunc); + Py_XDECREF(pyFuncArgs); + } + + PyObject* pyFunc = nullptr; + PyObject* pyFuncArgs = nullptr; +}; + +struct ThreadArgs { + ThreadArgs(aclrtContext context, bool exitFlag) + : context(context), exitFlag(exitFlag) {} + + aclrtContext context; + bool exitFlag; +}; \ No newline at end of file diff --git a/torch_npu/npu/__init__.py b/torch_npu/npu/__init__.py index 5142af3a02..75bf03d13a 100644 --- a/torch_npu/npu/__init__.py +++ b/torch_npu/npu/__init__.py @@ -418,6 +418,18 @@ def device_count() -> int: return r +def _launch_host_func(op_stream, fn, user_data): + torch_npu._C._launch_host_func(op_stream, fn, user_data) + + +def _subscribe_report(op_stream): + torch_npu._C._subscribe_report(op_stream) + + +def _unsubscribe_report(op_stream): + torch_npu._C._unsubscribe_report(op_stream) + + def can_device_access_peer(device_id, peer_device_id): r"""Checks if peer access between two devices is possible. """ -- Gitee From 1ffeb4d1d846ede933676b7636c6e56584dc0514 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 31 Jul 2025 03:39:51 +0000 Subject: [PATCH 326/328] !23508 Update op_plugin commit id Merge pull request !23508 from pta-robot/v2.7.1 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 80414c2b6f..c551965d55 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 80414c2b6f000a52e8ca2d41ea36e4339028a1ef +Subproject commit c551965d55e703309a9a52cb8f4e7e2202b295e0 -- Gitee From 46b19e6ad0f0a2b689d9facdce44ecf4420aa2cb Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Thu, 31 Jul 2025 06:10:37 +0000 Subject: [PATCH 327/328] !23510 Update torchair commit id Merge pull request !23510 from torchair_robot/v2.7.1 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index e4795c9949..3d7ede5d52 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit e4795c99490e62f0a74e3bbb1506b7fa89e869c9 +Subproject commit 3d7ede5d52b56a5ae58c31a933f353917542cff0 -- Gitee From e9da01efa0b58b7de494e199eb0f0d29e6baea5a Mon Sep 17 00:00:00 2001 From: wl1259 Date: Thu, 31 Jul 2025 15:41:58 +0800 Subject: [PATCH 328/328] =?UTF-8?q?1=E3=80=81add=20inductor=20test=202?= =?UTF-8?q?=E3=80=81=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/_inductor/__init__.py | 3 + test/_inductor/test_abs.py | 26 ++ test/_inductor/test_add.py | 29 ++ test/_inductor/test_add_sum.py | 38 +++ test/_inductor/test_alias.py | 27 ++ test/_inductor/test_argmax.py | 24 ++ test/_inductor/test_argmax_unalign.py | 24 ++ test/_inductor/test_arrange.py | 31 +++ test/_inductor/test_attncp.py | 32 +++ test/_inductor/test_batch_norm.py | 46 ++++ test/_inductor/test_broadcast.py | 39 +++ test/_inductor/test_cat.py | 27 ++ test/_inductor/test_ceil.py | 28 ++ test/_inductor/test_check_accuracy.py | 70 +++++ test/_inductor/test_clamp.py | 88 ++++++ test/_inductor/test_clone.py | 27 ++ test/_inductor/test_codecache.py | 27 ++ test/_inductor/test_cos.py | 28 ++ test/_inductor/test_debug_msg.py | 129 +++++++++ test/_inductor/test_device_put.py | 34 +++ test/_inductor/test_div.py | 28 ++ test/_inductor/test_embedding.py | 41 +++ 
test/_inductor/test_embedding_fallback.py | 29 ++ test/_inductor/test_empty.py | 45 ++++ test/_inductor/test_eq.py | 35 +++ test/_inductor/test_exceptions.py | 71 +++++ test/_inductor/test_exp.py | 28 ++ test/_inductor/test_expm1.py | 27 ++ test/_inductor/test_floor.py | 28 ++ test/_inductor/test_force_fallback.py | 57 ++++ test/_inductor/test_ge.py | 28 ++ test/_inductor/test_geometric.py | 36 +++ test/_inductor/test_gt.py | 30 +++ test/_inductor/test_high_order_sum.py | 25 ++ test/_inductor/test_issue54.py | 60 +++++ test/_inductor/test_issue57.py | 35 +++ test/_inductor/test_issue59.py | 38 +++ test/_inductor/test_issue62.py | 47 ++++ test/_inductor/test_issue70.py | 22 ++ test/_inductor/test_lazy_register.py | 35 +++ test/_inductor/test_opensora_graph1.py | 266 +++++++++++++++++++ test/_inductor/test_permute.py | 39 +++ test/_inductor/test_reduction_brocast_add.py | 30 +++ test/_inductor/test_relu.py | 28 ++ test/_inductor/test_renorm.py | 27 ++ test/_inductor/test_repeat.py | 29 ++ test/_inductor/test_reshape.py | 35 +++ test/_inductor/test_rsqrt.py | 28 ++ test/_inductor/test_slice.py | 50 ++++ test/_inductor/test_split_loop.py | 28 ++ test/_inductor/test_sqrt.py | 27 ++ test/_inductor/test_sub.py | 28 ++ test/_inductor/test_sum.py | 55 ++++ test/_inductor/test_sum_add.py | 37 +++ test/_inductor/test_triton.py | 36 +++ test/_inductor/test_var.py | 26 ++ test/_inductor/test_var_mean.py | 31 +++ test/_inductor/test_var_mean_add_mul.py | 48 ++++ test/_inductor/test_where.py | 29 ++ test/_inductor/testutils.py | 36 +++ torch_npu/_inductor/__init__.py | 11 +- torch_npu/_inductor/codecache.py | 52 +++- torch_npu/_inductor/lowering.py | 6 +- torch_npu/_inductor/lowering_fx.py | 4 + torch_npu/_inductor/lowering_op_list.py | 3 +- torch_npu/_inductor/npu_triton_heuristics.py | 98 +++++-- torch_npu/_inductor/utils.py | 4 +- torch_npu/utils/_dynamo.py | 61 ++++- 68 files changed, 2640 insertions(+), 34 deletions(-) create mode 100644 test/_inductor/__init__.py create mode 100644 test/_inductor/test_abs.py create mode 100644 test/_inductor/test_add.py create mode 100644 test/_inductor/test_add_sum.py create mode 100644 test/_inductor/test_alias.py create mode 100644 test/_inductor/test_argmax.py create mode 100644 test/_inductor/test_argmax_unalign.py create mode 100644 test/_inductor/test_arrange.py create mode 100644 test/_inductor/test_attncp.py create mode 100644 test/_inductor/test_batch_norm.py create mode 100644 test/_inductor/test_broadcast.py create mode 100644 test/_inductor/test_cat.py create mode 100644 test/_inductor/test_ceil.py create mode 100644 test/_inductor/test_check_accuracy.py create mode 100644 test/_inductor/test_clamp.py create mode 100644 test/_inductor/test_clone.py create mode 100644 test/_inductor/test_codecache.py create mode 100644 test/_inductor/test_cos.py create mode 100644 test/_inductor/test_debug_msg.py create mode 100644 test/_inductor/test_device_put.py create mode 100644 test/_inductor/test_div.py create mode 100644 test/_inductor/test_embedding.py create mode 100644 test/_inductor/test_embedding_fallback.py create mode 100644 test/_inductor/test_empty.py create mode 100644 test/_inductor/test_eq.py create mode 100644 test/_inductor/test_exceptions.py create mode 100644 test/_inductor/test_exp.py create mode 100644 test/_inductor/test_expm1.py create mode 100644 test/_inductor/test_floor.py create mode 100644 test/_inductor/test_force_fallback.py create mode 100644 test/_inductor/test_ge.py create mode 100644 test/_inductor/test_geometric.py create mode 
100644 test/_inductor/test_gt.py create mode 100644 test/_inductor/test_high_order_sum.py create mode 100644 test/_inductor/test_issue54.py create mode 100644 test/_inductor/test_issue57.py create mode 100644 test/_inductor/test_issue59.py create mode 100644 test/_inductor/test_issue62.py create mode 100644 test/_inductor/test_issue70.py create mode 100644 test/_inductor/test_lazy_register.py create mode 100644 test/_inductor/test_opensora_graph1.py create mode 100644 test/_inductor/test_permute.py create mode 100644 test/_inductor/test_reduction_brocast_add.py create mode 100644 test/_inductor/test_relu.py create mode 100644 test/_inductor/test_renorm.py create mode 100644 test/_inductor/test_repeat.py create mode 100644 test/_inductor/test_reshape.py create mode 100644 test/_inductor/test_rsqrt.py create mode 100644 test/_inductor/test_slice.py create mode 100644 test/_inductor/test_split_loop.py create mode 100644 test/_inductor/test_sqrt.py create mode 100644 test/_inductor/test_sub.py create mode 100644 test/_inductor/test_sum.py create mode 100644 test/_inductor/test_sum_add.py create mode 100644 test/_inductor/test_triton.py create mode 100644 test/_inductor/test_var.py create mode 100644 test/_inductor/test_var_mean.py create mode 100644 test/_inductor/test_var_mean_add_mul.py create mode 100644 test/_inductor/test_where.py create mode 100644 test/_inductor/testutils.py diff --git a/test/_inductor/__init__.py b/test/_inductor/__init__.py new file mode 100644 index 0000000000..ac0a39a91f --- /dev/null +++ b/test/_inductor/__init__.py @@ -0,0 +1,3 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. \ No newline at end of file diff --git a/test/_inductor/test_abs.py b/test/_inductor/test_abs.py new file mode 100644 index 0000000000..62482afd7a --- /dev/null +++ b/test/_inductor/test_abs.py @@ -0,0 +1,26 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestAbs(TestUtils): + def op_calc(self, first_element): + result = torch.abs(first_element) + return result + + @parametrize('shape', [(1024, 32), (256, 8)]) + @parametrize('dtype', ['float16', 'float32', 'bfloat16']) + def test_pointwise_cases(self, shape, dtype): + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element) + self.assertEqual(std_result, inductor_result, atol=1e-3, rtol=1e-3) + +instantiate_parametrized_tests(TestAbs) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_add.py b/test/_inductor/test_add.py new file mode 100644 index 0000000000..f34078e105 --- /dev/null +++ b/test/_inductor/test_add.py @@ -0,0 +1,29 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestAdd(TestUtils): + def op_calc(self, first_element, second_element): + result = first_element + second_element + return result + + @parametrize('shape', TestUtils._pointwise_demo_shapes) + @parametrize('dtype', ['float32', 'int64']) + def test_pointwise_cases(self, shape, dtype): + first_element = self._generate_tensor(shape, dtype) + second_element = self._generate_tensor(shape, dtype) + + std_sum = self.op_calc(first_element, 
second_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_sum = compiled_op_calc(first_element, second_element) + + self.assertEqual(std_sum, inductor_sum) + + +instantiate_parametrized_tests(TestAdd) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_add_sum.py b/test/_inductor/test_add_sum.py new file mode 100644 index 0000000000..bafa69ddb4 --- /dev/null +++ b/test/_inductor/test_add_sum.py @@ -0,0 +1,38 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestSumAdd(TestUtils): + def foo(self, a, b, dim): + y = a + b + y = y.sum(dim) + return y + + # case:change shapes + @parametrize('shape', [(9, 9, 31, 64)]) + @parametrize('dim', [3]) + @parametrize('dtype', ['float32']) + def test_reduction_cases_shapes(self, shape, dim, dtype): + a, b = [torch.randn(shape, requires_grad=False, dtype=torch.float32, device="npu") for _ in range(2)] + r1 = self.foo(a, b, dim) + func = torch.compile(self.foo, backend="inductor", dynamic=False) + r = func(a, b, dim) + self.assertEqual(r, r1, atol=1e-3, rtol=1e-3) + + @parametrize('shape', [(9, 10, 31, 63)]) + @parametrize('dim', [0, 1]) + @parametrize('dtype', ['float32']) + def test_reduction_cases_shapes1(self, shape, dim, dtype): + a, b = [torch.randn(shape, requires_grad=False, dtype=torch.float32, device="npu") for _ in range(2)] + r1 = self.foo(a, b, dim) + func = torch.compile(self.foo, backend="inductor", dynamic=False) + r = func(a, b, dim) + self.assertEqual(r, r1, atol=1e-3, rtol=1e-3) + + +instantiate_parametrized_tests(TestSumAdd) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_alias.py b/test/_inductor/test_alias.py new file mode 100644 index 0000000000..5728ca8c24 --- /dev/null +++ b/test/_inductor/test_alias.py @@ -0,0 +1,27 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestAlias(TestUtils): + def op_calc(self, input_element, dim): + x = torch.ops.aten.alias(input_element) + y = x + 1.0 + return y + + # case:change shapes + @parametrize('shape', [(32, 64)]) + @parametrize('dim', [0]) + @parametrize('dtype', ['float32']) + def test_reduction_cases_shapes(self, shape, dim, dtype): + input_element = self._generate_tensor(shape, dtype) + std_ret = self.op_calc(input_element, dim) + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_ret = compiled_op_calc(input_element, dim) + self.assertEqual(std_ret, inductor_ret, atol=1e-1, rtol=1e-1, equal_nan=True) + +instantiate_parametrized_tests(TestAlias) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_argmax.py b/test/_inductor/test_argmax.py new file mode 100644 index 0000000000..fa7130668c --- /dev/null +++ b/test/_inductor/test_argmax.py @@ -0,0 +1,24 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestArgmax(TestUtils): + def argmax(self, a, dim): + return torch.argmax(a, dim) + + def test_argmax(self): + shape = (512, 64) + dim = -1 + a = torch.randn(shape, requires_grad=False, dtype=torch.float32, device='npu') + + argmax_triton = torch.compile(self.argmax, backend="inductor", dynamic=False) + r = self.argmax(a, dim) + r1 = argmax_triton(a, dim) + 
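+        # the eager argmax result (r) and the inductor-compiled result (r1) are expected to match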
self.assertEqual(r, r1, atol=1e-3, rtol=1e-3) + +instantiate_parametrized_tests(TestArgmax) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_argmax_unalign.py b/test/_inductor/test_argmax_unalign.py new file mode 100644 index 0000000000..34baef1ba1 --- /dev/null +++ b/test/_inductor/test_argmax_unalign.py @@ -0,0 +1,24 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestMaxWithIndex(TestUtils): + def op_calc(self, input_element, dim): + return torch.argmax(input_element, dim) + + @parametrize('shape', [(512, 64)]) # (513, 64), (514,33) + @parametrize('dim', [-1]) + @parametrize('dtype', ['float32']) + def test_reduction_cases(self, shape, dim, dtype): + input_element = torch.randn(size=shape, dtype=eval('torch.' + dtype), device=torch.device("npu")) * 2000 + std_argmax = self.op_calc(input_element, dim) + compiled_op_calc = torch.compile(self.op_calc, backend="inductor", dynamic=False) + inductor_argmax = compiled_op_calc(input_element, dim) + self.assertEqual(std_argmax, inductor_argmax, atol=1e-2, rtol=1e-2) + +instantiate_parametrized_tests(TestMaxWithIndex) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_arrange.py b/test/_inductor/test_arrange.py new file mode 100644 index 0000000000..f80b2fb92f --- /dev/null +++ b/test/_inductor/test_arrange.py @@ -0,0 +1,31 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestArrange(TestUtils): + def op_calc(self, start, end, step): + a = torch.arange(start, end, step, device=torch.device('npu')) + y = a + a + return y + + @parametrize('shape', [(2, )]) + @parametrize('dtype', TestUtils._test_dtypes) + def test_pointwise_cases(self, shape, dtype): + s = self._generate_tensor(shape, dtype) + start = min(s) + end = max(s) + step = (end - start) / 32 + + std_arrange = self.op_calc(start, end, step) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor", dynamic=False) + inductor_arrange = compiled_op_calc(start, end, step) + + self.assertEqual(std_arrange, inductor_arrange) + +instantiate_parametrized_tests(TestArrange) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_attncp.py b/test/_inductor/test_attncp.py new file mode 100644 index 0000000000..966ecc855f --- /dev/null +++ b/test/_inductor/test_attncp.py @@ -0,0 +1,32 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestAttnCp(TestUtils): + shape = (8, 8, 256, 128) + dim = -1 + + def foo(self, a, b, c): + y = a + b + y = y.sum(self.dim) + y = y.unsqueeze(self.dim) + y = y.broadcast_to(self.shape) + b + y = c + y.permute(0, 1, 3, 2) + return y + + + def test_pointwise_cases(self): + a, b = [torch.randn(self.shape, dtype=torch.float32, device="npu") for _ in range(2)] + d = torch.randn(self.shape, dtype=torch.float32, device="npu") + c = d.permute(0, 1, 3, 2).contiguous() + func = torch.compile(self.foo, backend="inductor") + r = func(a, b, c) + r1 = self.foo(a, b, c) + self.assertEqual(r, r1, atol=1e-3, rtol=1e-3) + +instantiate_parametrized_tests(TestAttnCp) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_batch_norm.py b/test/_inductor/test_batch_norm.py new file mode 
100644 index 0000000000..a92166dc40 --- /dev/null +++ b/test/_inductor/test_batch_norm.py @@ -0,0 +1,46 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestNativeBatchNorm(TestUtils): + def op_calc(self, input_element): + # 创建权重和偏置张量 + weight = torch.ones(32).npu() + bias = torch.zeros(32).npu() + + # 创建运行均值和方差张量 + running_mean = torch.zeros(32).npu() + running_var = torch.ones(32).npu() + momentum = 0.1 + eps = 1e-05 + # 执行批量归一化 + output, running_mean_out, running_var_out = torch.native_batch_norm( + input=input_element, + weight=weight, + bias=bias, + running_mean=running_mean, + running_var=running_var, + training=True, + momentum=momentum, + eps=eps + ) + return output, running_mean_out, running_var_out + + @parametrize('shape', [(16, 32, 64)]) + @parametrize('dtype', ['float32']) + def test_reduction_cases_shapes(self, shape, dtype): + input_element = self._generate_tensor(shape, dtype) + + std_ret, _, _ = self.op_calc(input_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_ret, _, _ = compiled_op_calc(input_element) + self.assertEqual(std_ret, inductor_ret, atol=1e-1, rtol=1e-1, equal_nan=True) + + +instantiate_parametrized_tests(TestNativeBatchNorm) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_broadcast.py b/test/_inductor/test_broadcast.py new file mode 100644 index 0000000000..93e78f0351 --- /dev/null +++ b/test/_inductor/test_broadcast.py @@ -0,0 +1,39 @@ +import copy +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestBroadcast(TestUtils): + broadcast_size = 128 + + def op_calc(self, a, b, dim, new_shape): + a = a.unsqueeze(dim) + a = a.broadcast_to(new_shape) + b = b.unsqueeze(dim) + b = b.broadcast_to(new_shape) + y = a + b + return y + + + @parametrize('shape', [(8, 8, 256)]) + @parametrize('dtype', ['float32', 'int32', 'float16', 'bfloat16']) + def test_view_cases(self, shape, dtype): + a = self._generate_tensor(shape, dtype) + b = self._generate_tensor(shape, dtype) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor", dynamic=False) + for dim in [3, 2, 1, 0]: + new_shape = list(copy.deepcopy(shape)) + new_shape.insert(dim, self.broadcast_size) + std_broadcast = self.op_calc(a, b, dim, new_shape) + inductor_broadcast = compiled_op_calc(a, b, dim, new_shape) + + self.assertEqual(std_broadcast.float(), inductor_broadcast.float(), atol=1e-3, rtol=1e-3) + + +instantiate_parametrized_tests(TestBroadcast) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_cat.py b/test/_inductor/test_cat.py new file mode 100644 index 0000000000..26d89caaa8 --- /dev/null +++ b/test/_inductor/test_cat.py @@ -0,0 +1,27 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestCat(TestUtils): + + def op_calc(self, input_element, dim): + return torch.cat([input_element, input_element], dim) + + # case:change shapes + @parametrize('shape', [(8, 16, 32, 64)]) + @parametrize('dim', [-1]) + @parametrize('dtype', ['bfloat16']) + def test_reduction_cases_shapes(self, shape, dim, dtype): + input_element = self._generate_tensor(shape, dtype) + std_cat = self.op_calc(input_element, dim) + compiled_op_calc 
= torch.compile(self.op_calc, backend="inductor") + inductor_cat = compiled_op_calc(input_element, dim) + self.assertEqual(std_cat, inductor_cat, atol=1e-1, rtol=1e-1, equal_nan=True) + + +instantiate_parametrized_tests(TestCat) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_ceil.py b/test/_inductor/test_ceil.py new file mode 100644 index 0000000000..0058a1ceda --- /dev/null +++ b/test/_inductor/test_ceil.py @@ -0,0 +1,28 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestRelu(TestUtils): + def op_calc(self, first_element): + result = torch.ceil(first_element) + return result + + @parametrize('shape', TestUtils._pointwise_demo_shapes) + @parametrize('dtype', ['float32', 'float16', 'bfloat16']) + def test_pointwise_cases(self, shape, dtype): + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element) + + self.assertEqual(std_result, inductor_result) + + +instantiate_parametrized_tests(TestRelu) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_check_accuracy.py b/test/_inductor/test_check_accuracy.py new file mode 100644 index 0000000000..e30fe2228c --- /dev/null +++ b/test/_inductor/test_check_accuracy.py @@ -0,0 +1,70 @@ +import os +from unittest.mock import patch + +import torch +import torch.nn.functional as F +from torch.testing._internal.common_utils import run_tests +from testutils import TestUtils +import torch_npu + +os.environ["INDUCTOR_ASCEND_CHECK_ACCURACY"] = "1" + + +class TestCheckAccuracy(TestUtils): + def test_check_accuracy_1(self): + count_data_dump = 0 + count_check_accuracy = 0 + + def run(x, y): + return F.relu(x) - y + + from torch_npu._inductor.npu_triton_heuristics import NPUCachingAutotuner + src_data_dump = NPUCachingAutotuner.data_dump + + def wrap_data_dump(self, *args, **kwargs): + status = src_data_dump(self, *args, **kwargs) + if status: + nonlocal count_data_dump + count_data_dump += 1 + return status + + src_check_accuracy = NPUCachingAutotuner.check_accuracy + + def wrap_check_accuracy(self, *args, **kwargs): + status = src_check_accuracy(self, *args, **kwargs) + if status: + nonlocal count_check_accuracy + count_check_accuracy += 1 + return status + + x = torch.randn(10).npu() + y = torch.randn(10).npu() + g = run(x, y) + + run = torch.compile(run) + # compile warmup + _ = run(x, y) + + with patch.object(NPUCachingAutotuner, "data_dump", wrap_data_dump), \ + patch.object(NPUCachingAutotuner, "check_accuracy", wrap_check_accuracy): + self.assertTrue(torch_npu._inductor.config.dump_fx_graph) + self.assertTrue(torch_npu._inductor.config.check_accuracy) + + # Try run custom path and make sure no data_dump and check_accuracy is invoked. 
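+            # With both flags disabled, the wrapped data_dump/check_accuracy hooks should not fire,
+            # so the counters stay at 0; once the flags are restored below, each hook fires once per run.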
+ torch_npu._inductor.config.dump_fx_graph = False + torch_npu._inductor.config.check_accuracy = False + z = run(x, y) + self.assertEqual(count_data_dump, 0) + self.assertEqual(count_check_accuracy, 0) + self.assertEqual(z, g) + + torch_npu._inductor.config.dump_fx_graph = True + torch_npu._inductor.config.check_accuracy = True + z = run(x, y) + self.assertEqual(count_data_dump, 1) + self.assertEqual(count_check_accuracy, 1) + self.assertEqual(z, g) + + +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/_inductor/test_clamp.py b/test/_inductor/test_clamp.py new file mode 100644 index 0000000000..0daf41e02d --- /dev/null +++ b/test/_inductor/test_clamp.py @@ -0,0 +1,88 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestClamp(TestUtils): + + def op_calc(self, arg, min_value=None, max_value=None): + return arg.clamp(min_value, max_value) + + @parametrize('shape', TestUtils._pointwise_demo_shapes) + @parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32', 'int64']) + def test_pointwise_cases_minmax_is_tensor(self, shape, dtype): + min_0 = self._generate_tensor(shape, dtype) + max_0 = self._generate_tensor(shape, dtype) + + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element, min_value=min_0, max_value=max_0) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element, min_value=min_0, max_value=max_0) + + self.assertEqual(std_result, inductor_result) + + @parametrize('shape', [(1,)]) + @parametrize('dtype', ['float32']) + def test_pointwise_cases_single_scalar(self, shape, dtype): + min_numel = 0 + max_numel = 100 + + first_element = 200 * torch.rand(size=shape, dtype=eval('torch.' 
+ dtype), device=torch.device("npu")) + + std_result = self.op_calc(first_element, min_value=min_numel, max_value=max_numel) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element, min_value=min_numel, max_value=max_numel) + self.assertEqual(std_result, inductor_result) + + @parametrize('shape', [(1024, 32)]) + @parametrize('dtype', ['int32']) + def test_pointwise_cases_minmax_is_number(self, shape, dtype): + min_numel = 0 + max_numel = 100 + + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element, min_value=min_numel, max_value=max_numel) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element, min_value=min_numel, max_value=max_numel) + + self.assertEqual(std_result, inductor_result) + + @parametrize('shape', TestUtils._pointwise_demo_shapes) + @parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32', 'int64']) + def test_pointwise_cases_max_only(self, shape, dtype): + max_numel = 100 + + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element, min_value=None, max_value=max_numel) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element, min_value=None, max_value=max_numel) + + self.assertEqual(std_result, inductor_result) + + @parametrize('shape', TestUtils._pointwise_demo_shapes) + @parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32', 'int64']) + def test_pointwise_cases_min_only(self, shape, dtype): + min_numel = 0 + + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element, min_value=min_numel, max_value=None) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element, min_value=min_numel, max_value=None) + + self.assertEqual(std_result, inductor_result) + + +instantiate_parametrized_tests(TestClamp) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_clone.py b/test/_inductor/test_clone.py new file mode 100644 index 0000000000..81fddfbad3 --- /dev/null +++ b/test/_inductor/test_clone.py @@ -0,0 +1,27 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestClone(TestUtils): + def op_calc(self, input_element, dim): + return torch.clone(input_element) + + @parametrize('shape', [(8, 64, 128)]) + @parametrize('dim', [0]) + @parametrize('dtype', ['float32']) + def test_reduction_cases_shapes(self, shape, dim, dtype): + input_element = self._generate_tensor(shape, dtype) + std_ret = self.op_calc(input_element, dim) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_ret = compiled_op_calc(input_element, dim) + + self.assertEqual(std_ret, inductor_ret, equal_nan=True) + + +instantiate_parametrized_tests(TestClone) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_codecache.py b/test/_inductor/test_codecache.py new file mode 100644 index 0000000000..feb7645f19 --- /dev/null +++ b/test/_inductor/test_codecache.py @@ -0,0 +1,27 @@ +import pytest +import torch +from torch.testing._internal.common_utils import run_tests +from torch._inductor.codecache import CacheBase +from testutils import TestUtils +import torch_npu +import torch_npu._inductor + + +class TestCodeCache(TestUtils): + def 
test_codecache(self): + device_properties = torch_npu.npu.get_device_properties( + torch_npu.npu.current_device() + ) + + system1 = CacheBase.get_system() + self.assertEqual(system1["device"]["name"], device_properties.name) + self.assertEqual(system1["version"]["cann"], torch.version.cann) + + from torch_npu.contrib import transfer_to_npu + system2 = CacheBase.get_system() + self.assertEqual(system2["device"]["name"], device_properties.name) + self.assertEqual(system2["version"]["cann"], torch.version.cann) + + +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/_inductor/test_cos.py b/test/_inductor/test_cos.py new file mode 100644 index 0000000000..b396e95a8c --- /dev/null +++ b/test/_inductor/test_cos.py @@ -0,0 +1,28 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestLog(TestUtils): + def op_calc(self, first_element): + result = torch.cos(first_element) + return result + + @parametrize('shape', TestUtils._pointwise_demo_shapes) + @parametrize('dtype', ['float32', 'int64']) + def test_pointwise_cases(self, shape, dtype): + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element) + + self.assertEqual(std_result, inductor_result) + + +instantiate_parametrized_tests(TestLog) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_debug_msg.py b/test/_inductor/test_debug_msg.py new file mode 100644 index 0000000000..f0512a9e23 --- /dev/null +++ b/test/_inductor/test_debug_msg.py @@ -0,0 +1,129 @@ + +import os +import re +import logging +import tempfile +from pathlib import Path +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from torch._inductor import config +from testutils import TestUtils +import torch_npu + +os.environ["INDUCTOR_ASCEND_DUMP_FX_GRAPH"] = "1" +os.environ["TORCH_COMPILE_DEBUG"] = "1" + + +class TestDebugMsg(TestUtils): + @parametrize('shape_x', [(32, 512, 64)]) + @parametrize('shape_y', [(32, 1, 64)]) + @parametrize('dtype', ['float32']) + def test_case1(self, shape_x, shape_y, dtype): + x = self._generate_tensor(shape_x, dtype) + y = self._generate_tensor(shape_y, dtype) + + + def run_case1(x, y): + z = x + y + return z + + run = torch.compile(run_case1, backend='inductor') + with config.patch( + { + "trace.debug_dir": tempfile.mkdtemp(), + "force_disable_caches": True, + } + ): + with self.assertLogs( + logging.getLogger("torch._inductor.debug"), level=logging.WARNING + ) as cm: + run(x, y) + + self.assertEqual(len(cm.output), 1) + m = re.match(r"WARNING.* debug trace: (.*)", cm.output[0]) + self.assertTrue(m) + filename = Path(m.group(1)) + self.assertTrue(filename.is_dir()) + content = open(filename / "output_code.py").read().rstrip() + + self.assertIn( + "# SchedulerNodes: [SchedulerNode(name='op0')]", + content + ) + + self.assertIn( + """ +# def forward(self, arg0_1, arg1_1): +# expand = torch.ops.aten.expand.default(arg1_1, [32, 512, 64]); arg1_1 = None +# add = torch.ops.aten.add.Tensor(arg0_1, expand); arg0_1 = expand = None +# return (add,)""", + content + ) + + self.assertIn( + """ +# inputs: [FakeTensor(..., device='npu:0', size=(32, 512, 64), strides=(32768, 64, 1)), FakeTensor(..., device='npu:0', size=(32, 1, 64), strides=(64, 64, 1))] 
+# outputs: [FakeTensor(..., device='npu:0', size=(32, 512, 64), strides=(32768, 64, 1))]""", + content + ) + + + @parametrize('shape_x', [(32, 512, 64)]) + @parametrize('shape_y', [(32, 1, 64)]) + @parametrize('dtype', ['float32']) + def test_case2(self, shape_x, shape_y, dtype): + x = self._generate_tensor(shape_x, dtype) + y = self._generate_tensor(shape_y, dtype) + + + def run_case2(x, y): + z = x + y + z = z.repeat([256, 1, 1]) + return z + + run = torch.compile(run_case2, backend='inductor') + with config.patch( + { + "trace.debug_dir": tempfile.mkdtemp(), + "force_disable_caches": True, + } + ): + with self.assertLogs( + logging.getLogger("torch._inductor.debug"), level=logging.WARNING + ) as cm: + run(x, y) + + self.assertEqual(len(cm.output), 1) + m = re.match(r"WARNING.* debug trace: (.*)", cm.output[0]) + self.assertTrue(m) + filename = Path(m.group(1)) + self.assertTrue(filename.is_dir()) + content = open(filename / "output_code.py").read().rstrip() + + self.assertIn( + "# SchedulerNodes: [SchedulerNode(name='op0')]", + content + ) + + self.assertIn( + """ +# def forward(self, arg0_1, arg1_1): +# expand = torch.ops.aten.expand.default(arg1_1, [32, 512, 64]); arg1_1 = None +# add = torch.ops.aten.add.Tensor(arg0_1, expand); arg0_1 = expand = None +# repeat = torch.ops.aten.repeat.default(add, [256, 1, 1]); add = None +# return (repeat,)""", + content + ) + + self.assertIn( + """ +# inputs: [FakeTensor(..., device='npu:0', size=(32, 512, 64), strides=(32768, 64, 1)), FakeTensor(..., device='npu:0', size=(32, 1, 64), strides=(64, 64, 1))] +# outputs: [FakeTensor(..., device='npu:0', size=(8192, 512, 64), strides=(32768, 64, 1))]""", + content + ) + + +instantiate_parametrized_tests(TestDebugMsg) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_device_put.py b/test/_inductor/test_device_put.py new file mode 100644 index 0000000000..8cdd2bc1c6 --- /dev/null +++ b/test/_inductor/test_device_put.py @@ -0,0 +1,34 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestDevicePut(TestUtils): + def op_calc(self, input_element1, input_element2): + return torch.add(input_element1, input_element2) + + @parametrize('shape', [(8, 16, 8)]) + @parametrize('dtype', ['int32']) + def test_cases_shapes(self, shape, dtype): + low = 0 + high = 2 + dtype = eval('torch.' 
+ dtype) + npu_device = torch.device('npu:0') + input_element1_tmp = torch.randint(low, high, shape, dtype=dtype).cpu() + input_element2_tmp = torch.randint(low, high, shape, dtype=dtype).cpu() + input_element1 = torch.ops.prims.device_put(input_element1_tmp, npu_device) + input_element2 = torch.ops.prims.device_put(input_element2_tmp, npu_device) + + std_ret = self.op_calc(input_element1, input_element2) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_ret = compiled_op_calc(input_element1, input_element2) + + self.assertEqual(std_ret, inductor_ret) + + +instantiate_parametrized_tests(TestDevicePut) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_div.py b/test/_inductor/test_div.py new file mode 100644 index 0000000000..e832c85ded --- /dev/null +++ b/test/_inductor/test_div.py @@ -0,0 +1,28 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestDiv(TestUtils): + def op_calc(self, first_element, second_element): + result = torch.div(first_element, second_element) + return result + + @parametrize('shape', TestUtils._pointwise_demo_shapes) + @parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32', 'int64']) + def test_pointwise_cases(self, shape, dtype): + first_element = self._generate_tensor(shape, dtype) + second_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element, second_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element, second_element) + self.assertEqual(std_result, inductor_result) + + +instantiate_parametrized_tests(TestDiv) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_embedding.py b/test/_inductor/test_embedding.py new file mode 100644 index 0000000000..a7e82fc86a --- /dev/null +++ b/test/_inductor/test_embedding.py @@ -0,0 +1,41 @@ +import torch +import torch.nn as nn +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestEmbeddingDense(TestUtils): + def op_calc(self, arg, embedding): + output = embedding(arg) + return output + + # UT skip, reason: precision fail + # Added to pytorch-disable-tests.json + def test_pointwise_cases(self): + + arg0 = torch.tensor([[14, 1, 2, 10, 0, 10, 0], + [9, 13, 13, 4, 7, 15, 14], + [8, 0, 3, 15, 4, 2, 6], + [15, 12, 13, 9, 0, 8, 1], + [8, 15, 4, 15, 12, 9, 3], + [6, 11, 12, 8, 0, 13, 8], + [4, 10, 1, 12, 0, 0, 4], + [6, 6, 15, 6, 0, 10, 15], + [2, 5, 14, 0, 5, 7, 9], + [13, 4, 14, 11, 11, 9, 2], + [1, 1, 5, 1, 1, 6, 14], + [3, 9, 8, 4, 13, 8, 3], + [4, 10, 8, 13, 6, 8, 3]], device='npu:0') + embedding = nn.Embedding(16, 128).npu() + std_sub = self.op_calc(arg0, embedding) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_sum = compiled_op_calc(arg0, embedding) + self.assertEqual(std_sub, inductor_sum) + + +instantiate_parametrized_tests(TestEmbeddingDense) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_embedding_fallback.py b/test/_inductor/test_embedding_fallback.py new file mode 100644 index 0000000000..c2cf3680ff --- /dev/null +++ b/test/_inductor/test_embedding_fallback.py @@ -0,0 +1,29 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils 
import TestUtils +import torch_npu + + +class TestEmbeddingDenseBackward(TestUtils): + def op_calc(self, slice_4, sum_23): + result = torch.ops.aten.embedding_dense_backward.default(sum_23, slice_4, 512, -1, False) + return result + + @parametrize('shape', [(1, 512, 128)]) + @parametrize('dtype', ['float32']) + def test_pointwise_cases(self, shape, dtype): + first_element = torch.randint(low=0, high=128, size=(1, 512), dtype=torch.int64).npu() + second_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element, second_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element, second_element) + + self.assertEqual(std_result, inductor_result, atol=1e-1, rtol=1e-1) + + +instantiate_parametrized_tests(TestEmbeddingDenseBackward) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_empty.py b/test/_inductor/test_empty.py new file mode 100644 index 0000000000..49c006bf10 --- /dev/null +++ b/test/_inductor/test_empty.py @@ -0,0 +1,45 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestEmpty(TestUtils): + def op_calc(self): + x = torch.empty(8, 64, 128, dtype=torch.float32).npu() + x.uniform_(-100, 100) + return x + + def op_calc_empty_permuted(self): + input_shape = (8, 64, 128) + physical_layout = (0, 1, 2) + x = torch.empty_permuted(input_shape, physical_layout).npu() + x.uniform_(-100, 100) + return x + + # case: change shapes + @parametrize('shape', [(8, 64, 128)]) + @parametrize('dim', [0]) + @parametrize('dtype', ['float32']) + def test_cases_empty(self, shape, dim, dtype): + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_ret = compiled_op_calc() + + self.assertTrue(inductor_ret.numel() > 0) + + @parametrize('shape', [(8, 64, 128)]) + @parametrize('dim', [0]) + @parametrize('dtype', ['float32']) + def test_cases_empty_permuted(self, shape, dim, dtype): + compiled_op_calc = torch.compile(self.op_calc_empty_permuted, backend="inductor") + inductor_ret = compiled_op_calc() + + self.assertTrue(inductor_ret.numel() > 0) + + +instantiate_parametrized_tests(TestEmpty) + +if __name__ == "__main__": + run_tests() + diff --git a/test/_inductor/test_eq.py b/test/_inductor/test_eq.py new file mode 100644 index 0000000000..cc04d49957 --- /dev/null +++ b/test/_inductor/test_eq.py @@ -0,0 +1,35 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestEq(TestUtils): + def op_calc(self, first_element, second_element): + return torch.eq(first_element, second_element) + + @parametrize('shape', TestUtils._pointwise_demo_shapes) + @parametrize('dtype', ['float32', 'int32', 'float16', 'bfloat16']) + def test_pointwise_cases(self, shape, dtype): + + first_element = self._generate_tensor(shape, dtype) + second_element = first_element.clone() + + # randomly change some elements in second tensor + flat_second_view = second_element.flatten() + num_elements_to_change = first_element.numel() // 3 + random_indices = torch.randint(0, first_element.numel(), (num_elements_to_change,)) + flat_second_view[random_indices] = 1 - flat_second_view[random_indices] + + std_result = self.op_calc(first_element, second_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + 
inductor_result = compiled_op_calc(first_element, second_element) + + self.assertEqual(std_result, inductor_result) + + +instantiate_parametrized_tests(TestEq) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_exceptions.py b/test/_inductor/test_exceptions.py new file mode 100644 index 0000000000..39badf983e --- /dev/null +++ b/test/_inductor/test_exceptions.py @@ -0,0 +1,71 @@ +import functools +from functools import partial + +from testutils import TestUtils +import torch +from torch._inductor.codecache import _load_triton_kernel_from_source +from torch.testing._internal.common_utils import run_tests +import torch_npu + + +src_code_1 = ''' +import triton +import triton.language as tl +from triton.compiler.compiler import AttrsDescriptor + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + +from torch._inductor.runtime import triton_helpers +from torch_npu._inductor import npu_triton_heuristics +from torch_npu._inductor import npu_triton_helpers +from torch_npu._inductor.runtime import NPUDeviceProperties +from torch_npu._inductor.npu_triton_helpers import libdevice, math as tl_math +import torch +import torch_npu + +@npu_triton_heuristics.pointwise_npu_index( + size_hints=[16384, 32], tile_hint=TileHint.DEFAULT, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*fp16', 'in_ptr1': '*fp16', 'out_ptr0': '*fp16', 'y0_numel': 'i32', 'x1_numel': 'i32'}, + 'device': NPUDeviceProperties(type='npu', index=0, multi_processor_count=40, cc='Ascend910B3', + major=None, regs_per_multiprocessor=None, max_threads_per_multi_processor=None, warp_size=32), + 'constants': {}, 'mix_mode': 'aiv'}, + inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_unk_fused_add_0', 'mutated_arg_names': [], + 'backend_hash': 'bc71dba4086164e7ac2b0779fa861dbf7467f0265d4a57b8f48cf6dda02b150f', 'split_axis': [0], + 'tiling_axis': [0, 1], 'axis_names': ['y0', 'x1'], 'low_dims': {1}, 'numof_reduction_axis': 0, + 'split_axis_dtype': torch.float16, 'dual_reduction': False, 'traced_graph_hash': 'TRACED_GRAPH_HASH', + 'traced_graph_dir': 'TRACED_GRAPH_DIR'}, + min_elem_per_thread=0 +) +@triton.jit +def triton_unk_fused_add_0(in_ptr0, in_ptr1, out_ptr0, y0_numel, x1_numel, Y0BLOCK: tl.constexpr, Y0BLOCK_SUB: tl.constexpr, X1BLOCK_SUB: tl.constexpr): + y0_offset = tl.program_id(0) * Y0BLOCK + base_y0= tl.arange(0, Y0BLOCK_SUB) + loops_y0 = (Y0BLOCK + Y0BLOCK_SUB - 1) // Y0BLOCK_SUB + base_x1= tl.arange(0, X1BLOCK_SUB) + loops_x1 = (x1_numel + X1BLOCK_SUB - 1) // X1BLOCK_SUB + for loop_y0 in range(loops_y0): + y0 = y0_offset + (loop_y0 * Y0BLOCK_SUB) + base_y0[:,None] + y0_mask = y0 < min(Y0BLOCK+y0_offset, y0_numel) + for loop_x1 in range(loops_x1): + x1 = (loop_x1 * X1BLOCK_SUB) + base_x1[None,:] + x1_mask = x1 < x1_numel + tmp0 = tl.load(in_ptr0 + (x1 + 128*y0), x1_mask & y0_mask) + # Not define tmp1 and make error manually for triton: 'tmp1 is not defined' + tmp2 = tmp0 + tmp1 + tl.store(out_ptr0 + (x1 + 32*y0), tmp2, x1_mask & y0_mask) +''' + + +class TestExceptions(TestUtils): + def test_triton_kernel_failed(self): + with self.assertRaisesRegex(Exception, "tmp1 is not defined"): + load_kernel = functools.partial(_load_triton_kernel_from_source, "triton_unk_fused_add_0", src_code_1) + kernel = load_kernel() + kernel.precompile() + + +if __name__ == "__main__": + run_tests() diff --git 
a/test/_inductor/test_exp.py b/test/_inductor/test_exp.py new file mode 100644 index 0000000000..e2b08352a1 --- /dev/null +++ b/test/_inductor/test_exp.py @@ -0,0 +1,28 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestExp(TestUtils): + def op_calc(self, first_element): + result = torch.exp(first_element) + return result + + @parametrize('shape', TestUtils._pointwise_demo_shapes) + @parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int64']) + def test_pointwise_cases(self, shape, dtype): + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element) + self.assertEqual(std_result, inductor_result, atol=1e-1, rtol=1e-1, equal_nan=True) + + +instantiate_parametrized_tests(TestExp) + +if __name__ == "__main__": + run_tests() + diff --git a/test/_inductor/test_expm1.py b/test/_inductor/test_expm1.py new file mode 100644 index 0000000000..149dc846a4 --- /dev/null +++ b/test/_inductor/test_expm1.py @@ -0,0 +1,27 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestExpm1(TestUtils): + def op_calc(self, first_element): + result = torch.expm1(first_element) + return result + + @parametrize('shape', TestUtils._pointwise_demo_shapes) + @parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int64']) + def test_pointwise_cases(self, shape, dtype): + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element) + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element) + + self.assertEqual(std_result, inductor_result, atol=1e-3, rtol=1e-3, equal_nan=True) + + +instantiate_parametrized_tests(TestExpm1) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_floor.py b/test/_inductor/test_floor.py new file mode 100644 index 0000000000..6ed849901d --- /dev/null +++ b/test/_inductor/test_floor.py @@ -0,0 +1,28 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestFloor(TestUtils): + def op_calc(self, first_element): + result = torch.floor(first_element) + return result + + @parametrize('shape', TestUtils._pointwise_demo_shapes) + @parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32', 'int64']) + def test_pointwise_cases(self, shape, dtype): + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element) + + self.assertEqual(std_result, inductor_result) + + +instantiate_parametrized_tests(TestFloor) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_force_fallback.py b/test/_inductor/test_force_fallback.py new file mode 100644 index 0000000000..5f3c5f5243 --- /dev/null +++ b/test/_inductor/test_force_fallback.py @@ -0,0 +1,57 @@ +import os +from unittest.mock import patch + +import torch +import torch.nn.functional as F +from torch.testing._internal.common_utils import run_tests +from testutils import TestUtils +import torch_npu + 
+os.environ["INDUCTOR_ASCEND_DUMP_FX_GRAPH"] = "1" + + +class TestForceFallback(TestUtils): + def test_case1(self): + op_list = [] + + def opoverload_call(self, /, *args, **kwargs): + op_list.append(str(self)) + return self._op(*args, **kwargs) + + def run(x, y): + return F.relu(x) + y + + x = torch.randn(10).npu() + y = torch.randn(10).npu() + g = run(x, y) + + run = torch.compile(run) + # compile warmup + _ = run(x, y) + + with patch.object(torch._ops.OpOverload, "__call__", opoverload_call): + op_list.clear() + z = run(x, y) + self.assertTrue(len(op_list) == 0) + self.assertEqual(z, g) + + op_list.clear() + torch_npu._inductor.config.force_fallback_kernel_id = [0] + z = run(x, y) + self.assertTrue("aten.relu.default" in op_list) + self.assertTrue("aten.add.Tensor" in op_list) + self.assertEqual(z, g) + + op_list.clear() + torch_npu._inductor.config.force_fallback_kernel_id = 'all' + z = run(x, y) + self.assertTrue("aten.relu.default" in op_list) + self.assertTrue("aten.add.Tensor" in op_list) + self.assertEqual(z, g) + + # reset + torch_npu._inductor.config.force_fallback_kernel_id = [] + + +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/_inductor/test_ge.py b/test/_inductor/test_ge.py new file mode 100644 index 0000000000..0692581721 --- /dev/null +++ b/test/_inductor/test_ge.py @@ -0,0 +1,28 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestGe(TestUtils): + def op_calc(self, first_element, second_element): + return torch.ge(first_element, second_element) + + @parametrize('shape', TestUtils._pointwise_demo_shapes) + @parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32']) + def test_pointwise_cases(self, shape, dtype): + first_element = self._generate_tensor(shape, dtype) + second_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element, second_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element, second_element) + + self.assertEqual(std_result, inductor_result) + + +instantiate_parametrized_tests(TestGe) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_geometric.py b/test/_inductor/test_geometric.py new file mode 100644 index 0000000000..fafe22a044 --- /dev/null +++ b/test/_inductor/test_geometric.py @@ -0,0 +1,36 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestGeometric(TestUtils): + def op_calc(self): + # 创建一个形状为 (3, 3)的张量, 每个位置的概率为 0.5 + prob = torch.full((16, 16), 0.5).npu() + + #使用 aten.geometric生成几何分布的随机数 + geometric_tensor = torch.ops.aten.geometric(prob, p=0.5) + + return geometric_tensor + + # UT skip, reason: this has problem in torch 260 + # Added to pytorch-disable-tests.json + @parametrize('shape', [(16, 16, 16)]) + @parametrize('dim', [0]) + @parametrize('dtype', ['int32']) + def test_reduction_cases_shapes(self, shape, dim, dtype): + std_ret = self.op_calc() + std_ret_mean = torch.mean(std_ret) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_ret = compiled_op_calc() + + inductor_ret_mean = torch.mean(inductor_ret) + self.assertTrue(inductor_ret_mean is not None) + + +instantiate_parametrized_tests(TestGeometric) + +if __name__ == "__main__": + run_tests() diff --git 
a/test/_inductor/test_gt.py b/test/_inductor/test_gt.py new file mode 100644 index 0000000000..d99fc0ec46 --- /dev/null +++ b/test/_inductor/test_gt.py @@ -0,0 +1,30 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestGt(TestUtils): + def op_calc(self, first_element, second_element): + result = torch.gt(first_element, second_element) + return result + + @parametrize('shape', TestUtils._pointwise_demo_shapes) + @parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32']) + def test_pointwise_cases(self, shape, dtype): + first_element = self._generate_tensor(shape, dtype) + second_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element, second_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element, second_element) + + self.assertEqual(std_result, inductor_result) + + +instantiate_parametrized_tests(TestGt) + +if __name__ == "__main__": + run_tests() + diff --git a/test/_inductor/test_high_order_sum.py b/test/_inductor/test_high_order_sum.py new file mode 100644 index 0000000000..a0253c261f --- /dev/null +++ b/test/_inductor/test_high_order_sum.py @@ -0,0 +1,25 @@ +import torch +from torch.testing._internal.common_utils import run_tests +from testutils import TestUtils +import torch_npu + + +class TestSum(TestUtils): + def op_sum(self, npu_dropout_backward_9): + view_337: "f32[32768, 256]" = torch.ops.aten.view.default(npu_dropout_backward_9, [32768, 256]) + sum_63: "f32[1, 256]" = torch.ops.aten.sum.dim_IntList(view_337, [0], True) + view_338: "f32[256]" = torch.ops.aten.view.default(sum_63, [256]) + return view_338 + + + def test_high_order_sum(self): + npu_dropout_backward_9 = torch.randn((32768, 256), device='npu', dtype=torch.float32) + ref = self.op_sum(npu_dropout_backward_9) + func = torch.compile(self.op_sum, backend="inductor", dynamic=False) + calc = func(npu_dropout_backward_9) + + self.assertEqual(ref, calc, atol=1e-3, rtol=1e-3) + + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_issue54.py b/test/_inductor/test_issue54.py new file mode 100644 index 0000000000..405ee2aaff --- /dev/null +++ b/test/_inductor/test_issue54.py @@ -0,0 +1,60 @@ +import torch +from torch.testing._internal.common_utils import run_tests +from testutils import TestUtils +import torch_npu + + +class Test_issue54(TestUtils): + def func_layernorm(self, args): + add_3, primals_6, primals_7, view, primals_9, permute_1, primals_10, primals_11 = args + permute: "f32[256, 256]" = torch.ops.aten.permute.default(primals_6, [1, 0]) + addmm: "f32[32768, 256]" = torch.ops.aten.addmm.default(primals_7, view, permute) + view_1: "f32[64, 512, 256]" = torch.ops.aten.view.default(addmm, [64, 512, 256]) + addmm_1: "f32[32768, 256]" = torch.ops.aten.addmm.default(primals_9, view, permute_1) + view_3: "f32[64, 512, 256]" = torch.ops.aten.view.default(addmm_1, [64, 512, 256]) + view_4: "f32[64, 512, 4, 64]" = torch.ops.aten.view.default(view_3, [64, 512, 4, 64]) + permute_2: "f32[64, 4, 512, 64]" = torch.ops.aten.permute.default(view_4, [0, 2, 1, 3]) + permute_3: "f32[256, 256]" = torch.ops.aten.permute.default(primals_10, [1, 0]) + addmm_2: "f32[32768, 256]" = torch.ops.aten.addmm.default(primals_11, view, permute_3) + view_6: "f32[64, 512, 256]" = torch.ops.aten.view.default(addmm_2, [64, 512, 256]) + + view_8: "f32[64, 512, 4, 64]" = 
torch.ops.aten.view.default(view_1, [64, 512, 4, 64]) + permute_5: "f32[64, 4, 512, 64]" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]) + + permute_6: "f32[64, 4, 64, 512]" = torch.ops.aten.permute.default(permute_2, [0, 1, 3, 2]) + expand_1: "f32[64, 4, 512, 64]" = torch.ops.aten.expand.default(permute_5, [64, 4, 512, 64]) + clone: "f32[64, 4, 512, 64]" = torch.ops.aten.clone.default(expand_1, memory_format=torch.contiguous_format) + view_9: "f32[256, 512, 64]" = torch.ops.aten.view.default(clone, [256, 512, 64]) + expand_2: "f32[64, 4, 64, 512]" = torch.ops.aten.expand.default(permute_6, [64, 4, 64, 512]) + clone_1: "f32[64, 4, 64, 512]" = torch.ops.aten.clone.default(expand_2, memory_format=torch.contiguous_format) + view_10: "f32[256, 64, 512]" = torch.ops.aten.view.default(clone_1, [256, 64, 512]) + bmm: "f32[256, 512, 512]" = torch.ops.aten.bmm.default(view_9, view_10) + view_7: "f32[64, 512, 4, 64]" = torch.ops.aten.view.default(view_6, [64, 512, 4, 64]) + permute_4: "f32[64, 4, 512, 64]" = torch.ops.aten.permute.default(view_7, [0, 2, 1, 3]) + expand_4: "f32[64, 4, 512, 64]" = torch.ops.aten.expand.default(permute_4, [64, 4, 512, 64]) + clone_2: "f32[64, 4, 512, 64]" = torch.ops.aten.clone.default(expand_4, memory_format=torch.contiguous_format) + view_13: "f32[256, 512, 64]" = torch.ops.aten.view.default(clone_2, [256, 512, 64]) + + return bmm, view_13 + + def test_issue54(self): + device = 'npu' + add_3 = torch.randn((64, 512, 256), device=device, dtype=torch.float32) + primals_6 = torch.randn((256, 256), device=device, dtype=torch.float32) + primals_7 = torch.randn((256), device=device, dtype=torch.float32) + view = torch.randn((32768, 256), device=device, dtype=torch.float32) + primals_9 = torch.randn((256), device=device, dtype=torch.float32) + permute_1 = torch.randn((256, 256), device=device, dtype=torch.float32) + primals_10 = torch.randn((256, 256), device=device, dtype=torch.float32) + primals_11 = torch.randn((256), device=device, dtype=torch.float32) + args = (add_3, primals_6, primals_7, view, primals_9, permute_1, primals_10, primals_11) + ref = self.func_layernorm(args) + func = torch.compile(self.func_layernorm, backend="inductor", dynamic=False, + options={"unroll_reductions_threshold": 1, "aggressive_fusion": True}) + calc = func(args) + self.assertEqual(ref[0], calc[0], atol=1e-2, rtol=1e-2) + self.assertEqual(ref[1], calc[1], atol=1e-2, rtol=1e-2) + + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_issue57.py b/test/_inductor/test_issue57.py new file mode 100644 index 0000000000..15686f9409 --- /dev/null +++ b/test/_inductor/test_issue57.py @@ -0,0 +1,35 @@ +import torch +from torch.testing._internal.common_utils import run_tests +from testutils import TestUtils +import torch_npu + + +class Test_issue57(TestUtils): + def op_sum(self, view_12, embedding_1, slice_11): + permute_7 = torch.ops.aten.permute.default(embedding_1, [2, 0, 1]) + embedding_1 = None + unsqueeze_4 = torch.ops.aten.unsqueeze.default(permute_7, 0) + permute_7 = None + + add_5 = torch.ops.aten.add.Tensor(unsqueeze_4, slice_11) + slice_8 = slice_11 = None + add_6 = torch.ops.aten.add.Tensor(view_12, add_5) + view_12 = None + return add_6 + + def test_issue57(self): + device = 'npu' + embedding_1 = torch.randn((512, 512, 64), device=device, dtype=torch.float32) + primals_221 = torch.randn((1, 1, 1, 512), device=device, dtype=torch.float32) + view_12 = torch.randn((1, 64, 512, 512), device=device, dtype=torch.float32) + slice_11 = torch.randn((1, 1, 1, 512), 
device=device, dtype=torch.float32) + + ref = self.op_sum(view_12, embedding_1, primals_221) + func = torch.compile(self.op_sum, backend="inductor", dynamic=False) + calc = func(view_12, embedding_1, primals_221) + + self.assertEqual(ref, calc, atol=1e-3, rtol=1e-3) + + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_issue59.py b/test/_inductor/test_issue59.py new file mode 100644 index 0000000000..9c55c4ed04 --- /dev/null +++ b/test/_inductor/test_issue59.py @@ -0,0 +1,38 @@ +import torch +from torch.testing._internal.common_utils import run_tests +from testutils import TestUtils +import torch_npu + + +class Test_issue59(TestUtils): + def layernorm_backward(self, x, y, z): + sum_0 = torch.sum(x) + mean = sum_0 / torch.numel(sum_0) + sub = x - mean + sqr = sub * sub + sum_1 = torch.sum(sqr) + mean_1 = sum_1 / torch.numel(sum_1) + 1e-05 + rsqrt = torch.rsqrt(mean_1) + mul = sub * rsqrt + mul_1 = mul * y + add = mul_1 + z + mean_2 = rsqrt / torch.numel(rsqrt) + return mul, add, mean_2 + + def test_issue59(self): + device = 'npu' + x = torch.randn((1, 1024), device=device, dtype=torch.float32) + y = torch.randn((1, 1024), device=device, dtype=torch.float32) + z = torch.randn((1, 1024), device=device, dtype=torch.float32) + + mul, add, mean_2 = self.layernorm_backward(x, y, z) + func = torch.compile(self.layernorm_backward, backend="inductor", dynamic=False) + mul_t, add_t, mean_2_t = func(x, y, z) + + self.assertEqual(mul, mul_t, atol=1e-3, rtol=1e-3) + self.assertEqual(add, add_t, atol=1e-3, rtol=1e-3) + self.assertEqual(mean_2, mean_2_t, atol=1e-3, rtol=1e-3) + + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_issue62.py b/test/_inductor/test_issue62.py new file mode 100644 index 0000000000..53b93ed4e2 --- /dev/null +++ b/test/_inductor/test_issue62.py @@ -0,0 +1,47 @@ +import torch +from torch.testing._internal.common_utils import run_tests +from testutils import TestUtils +import torch_npu + + +class Test_issue62(TestUtils): + def op_func(self, addmm_5, add): + split = torch.ops.aten.split.Tensor(addmm_5, 1536, 1) + getitem = split[0] + getitem_1 = split[1] + getitem_2 = split[2] + getitem_3 = split[3] + getitem_4 = split[4] + getitem_5 = split[5] + + clone_1 = torch.ops.aten.clone.default(add, memory_format=torch.contiguous_format) + convert_element_type_25 = torch.ops.prims.convert_element_type.default(clone_1, torch.float32) + var_mean = torch.ops.aten.var_mean.correction(convert_element_type_25, [2], correction=0, keepdim=True) + getitem_6 = var_mean[0] + getitem_7 = var_mean[1] + add_3 = torch.ops.aten.add.Tensor(getitem_6, 1e-06) + rsqrt = torch.ops.aten.rsqrt.default(add_3) + sub = torch.ops.aten.sub.Tensor(clone_1, getitem_7) + mul_7 = torch.ops.aten.mul.Tensor(sub, rsqrt) + convert_element_type_26 = torch.ops.prims.convert_element_type.default(mul_7, torch.float16) + slice_11 = torch.ops.aten.slice.Tensor(getitem_1, 0, 0, 9223372036854775807) + unsqueeze_2 = torch.ops.aten.unsqueeze.default(slice_11, 1) + add_4 = torch.ops.aten.add.Tensor(unsqueeze_2, 1) + mul_8 = torch.ops.aten.mul.Tensor(convert_element_type_26, add_4) + slice_12 = torch.ops.aten.slice.Tensor(getitem, 0, 0, 9223372036854775807) + unsqueeze_3 = torch.ops.aten.unsqueeze.default(slice_12, 1) + add_5 = torch.ops.aten.add.Tensor(mul_8, unsqueeze_3) + return add_5 + + def test_issue62(self): + addmm_5 = torch.randn((2, 9216), device='npu:0', dtype=torch.float16) + add = torch.randn((2, 4096, 1536), device='npu:0', dtype=torch.float16) + + std_ret = 
self.op_func(addmm_5, add) + compiled_func = torch.compile(self.op_func, backend="inductor") + inductor_ret = compiled_func(addmm_5, add) + self.assertEqual(std_ret, inductor_ret, atol=1e-2, rtol=1e-2) + + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_issue70.py b/test/_inductor/test_issue70.py new file mode 100644 index 0000000000..126b901f5e --- /dev/null +++ b/test/_inductor/test_issue70.py @@ -0,0 +1,22 @@ +import torch +from torch.testing._internal.common_utils import run_tests +from testutils import TestUtils +import torch_npu + + +class Test_issue70(TestUtils): + def op_forward(self, x): + return x.mean(-1) + + def test_issue70(self): + compiled_net = torch.compile(self.op_forward, backend="inductor") + + arg = torch.randn((1, 1, 7168)).npu() + + output = self.op_forward(arg) + output1 = compiled_net(arg) + self.assertEqual(output, output1, atol=1e-3, rtol=1e-3) + + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_lazy_register.py b/test/_inductor/test_lazy_register.py new file mode 100644 index 0000000000..c4bc64c722 --- /dev/null +++ b/test/_inductor/test_lazy_register.py @@ -0,0 +1,35 @@ +import torch +from torch.testing._internal.common_utils import run_tests +from testutils import TestUtils +import torch_npu + + +class TestLazyRegister(TestUtils): + def test_compile_but_not_invoked(self): + + def run(x, y): + return x + y + + run = torch.compile(run) + self.assertFalse(torch_npu.utils._dynamo.is_inductor_npu_initialized()) + + def test_disale_register_inductor_npu(self): + torch_npu.utils._dynamo.disable_register_inductor_npu() + + def run(x, y): + return x - y + + run = torch.compile(run) + x = torch.randn(10, 20).npu() + y = torch.randn(10, 20).npu() + + with self.assertRaisesRegex(Exception, "Device npu not supported"): + _ = run(x, y) + + self.assertFalse(torch_npu.utils._dynamo.is_inductor_npu_initialized()) + + torch_npu.utils._dynamo.enable_register_inductor_npu() + + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_opensora_graph1.py b/test/_inductor/test_opensora_graph1.py new file mode 100644 index 0000000000..4a641dce4c --- /dev/null +++ b/test/_inductor/test_opensora_graph1.py @@ -0,0 +1,266 @@ +import os +import random +import numpy as np + +import torch +from torch import device +from torch.testing._internal.common_utils import run_tests +from testutils import TestUtils +import torch_npu + +device_npu = 'npu' + + +class TestModel(TestUtils): + def test_opensora_cases_model_9_inference(self): + def forward(primals_1: "f32[1, 9600, 2304]"): + permute: "f32[9600, 1, 2304]" = torch.ops.aten.permute.default(primals_1, [1, 0, 2]) + return permute + primals_2 = torch.randn((1, 9600, 2304), device=device_npu, dtype=torch.float32) + ref = forward(primals_2) + forward_calc = torch.compile(forward, backend="inductor", dynamic=False) + calc = forward_calc(primals_2) + self.assertEqual(ref, calc, atol=1e-4, rtol=1e-4, equal_nan=True) + primals_3 = torch.randn((1, 512, 2304), device=device_npu, dtype=torch.float32) + forward_calc = torch.compile(forward, backend="inductor", dynamic=False) + calc = forward_calc(primals_3) + ref = forward(primals_3) + self.assertEqual(ref, calc, atol=1e-4, rtol=1e-4, equal_nan=True) + primals_4 = torch.randn((9600, 1, 2304), device=device_npu, dtype=torch.float32) + forward_calc = torch.compile(forward, backend="inductor", dynamic=False) + calc = forward_calc(primals_4) + ref = forward(primals_4) + self.assertEqual(ref, calc, atol=1e-4, rtol=1e-4, equal_nan=True) + + 
def test_opensora_cases_model_11_inference(self): + def forward(arg0_1: "f32[1, 1, 9600]", arg1_1: "f32[1, 1, 512]"): + unsqueeze: "f32[1, 1, 1, 9600]" = torch.ops.aten.unsqueeze.default(arg0_1, 1) + arg0_1 = None + unsqueeze_1: "f32[1, 1, 1, 512]" = torch.ops.aten.unsqueeze.default(arg1_1, 1) + arg1_1 = None + constant_pad_nd: "f32[1, 1, 1, 9600]" = torch.ops.aten.constant_pad_nd.default(unsqueeze, [0, 0, 0, 0], -9980.0) + unsqueeze = None + view: "f32[1, 9600, 1]" = torch.ops.aten.view.default(constant_pad_nd, [1, 9600, 1]) + permute: "f32[1, 1, 9600]" = torch.ops.aten.permute.default(view, [2, 0, 1]) + view = None + view_1: "f32[1, 1, 1, 9600]" = torch.ops.aten.view.default(permute, [1, 1, 1, 9600]) + permute = None + view_2: "f32[1, 9600, 1, 1]" = torch.ops.aten.view.default(constant_pad_nd, [1, 9600, 1, 1]) + constant_pad_nd = None + permute_1: "f32[1, 1, 9600, 1]" = torch.ops.aten.permute.default(view_2, [2, 0, 1, 3]) + view_2 = None + view_3: "f32[1, 1, 1, 9600]" = torch.ops.aten.view.default(permute_1, [1, 1, 1, 9600]) + permute_1 = None + repeat: "f32[1, 1, 1, 512]" = torch.ops.aten.repeat.default(unsqueeze_1, [1, 1, 1, 1]) + unsqueeze_1 = None + npu_dtype_cast: "b8[1, 1, 1, 9600]" = torch.ops.npu.npu_dtype_cast.default(view_1, torch.bool) + view_1 = None + repeat_1: "b8[1, 1, 9600, 9600]" = torch.ops.aten.repeat.default(npu_dtype_cast, [1, 1, 9600, 1]) + npu_dtype_cast = None + npu_dtype_cast_1: "b8[1, 1, 1, 9600]" = torch.ops.npu.npu_dtype_cast.default(view_3, torch.bool) + view_3 = None + repeat_2: "b8[1, 1, 9600, 9600]" = torch.ops.aten.repeat.default(npu_dtype_cast_1, [1, 1, 9600, 1]) + npu_dtype_cast_1 = None + npu_dtype_cast_2: "b8[1, 1, 1, 512]" = torch.ops.npu.npu_dtype_cast.default(repeat, torch.bool) + repeat = None + repeat_3: "b8[1, 1, 9600, 512]" = torch.ops.aten.repeat.default(npu_dtype_cast_2, [1, 1, 9600, 1]) + npu_dtype_cast_2 = None + return (repeat_1, repeat_3, repeat_2) + arg0_1 = torch.rand((1, 1, 9600), device=device_npu, dtype=torch.float32) + arg1_1 = torch.rand((1, 1, 512), device=device_npu, dtype=torch.float32) + ref = forward(arg0_1, arg1_1) + forward_calc = torch.compile(forward, backend="inductor", dynamic=False) + calc = forward_calc(arg0_1, arg1_1) + + for r, c in zip(ref, calc): + self.assertEqual(r, c, atol=1e-4, rtol=1e-4, equal_nan=True) + + + def test_opensora_cases_model_14_backward(self): + def forward(args): + primals_5, getitem_3, rsqrt, add_2, view, permute_1, tangents_1 = args + sub: "f32[1, 9600, 2304]" = torch.ops.aten.sub.Tensor(primals_5, getitem_3) + mul: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(sub, rsqrt) + view_2: "f32[9600, 32]" = torch.ops.aten.view.default(tangents_1, [9600, 32]) + mm: "f32[9600, 2304]" = torch.ops.aten.mm.default(view_2, permute_1) + permute_2: "f32[32, 9600]" = torch.ops.aten.permute.default(view_2, [1, 0]) + mm_1: "f32[32, 2304]" = torch.ops.aten.mm.default(permute_2, view) + permute_3: "f32[2304, 32]" = torch.ops.aten.permute.default(mm_1, [1, 0]) + sum_1: "f32[1, 32]" = torch.ops.aten.sum.dim_IntList(view_2, [0], True) + view_3: "f32[32]" = torch.ops.aten.view.default(sum_1, [32]) + permute_4: "f32[32, 2304]" = torch.ops.aten.permute.default(permute_3, [1, 0]) + view_4: "f32[1, 9600, 2304]" = torch.ops.aten.view.default(mm, [1, 9600, 2304]) + sum_2: "f32[1, 1, 2304]" = torch.ops.aten.sum.dim_IntList(view_4, [1], True) + mul_2: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(view_4, mul) + mul_3: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(view_4, add_2) + sum_3: "f32[1, 1, 2304]" = 
torch.ops.aten.sum.dim_IntList(mul_2, [1], True) + mul_5: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(mul_3, 2304) + sum_4: "f32[1, 9600, 1]" = torch.ops.aten.sum.dim_IntList(mul_3, [2], True) + mul_6: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(mul_3, mul) + sum_5: "f32[1, 9600, 1]" = torch.ops.aten.sum.dim_IntList(mul_6, [2], True) + mul_7: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(mul, sum_5) + sub_2: "f32[1, 9600, 2304]" = torch.ops.aten.sub.Tensor(mul_5, sum_4) + sub_3: "f32[1, 9600, 2304]" = torch.ops.aten.sub.Tensor(sub_2, mul_7) + div: "f32[1, 9600, 1]" = torch.ops.aten.div.Tensor(rsqrt, 2304) + mul_8: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(div, sub_3) + cat: "f32[1, 2, 2304]" = torch.ops.aten.cat.default([sum_2, sum_3], 1) + sum_6: "f32[1, 1, 2304]" = torch.ops.aten.sum.dim_IntList(cat, [1], True) + squeeze_1: "f32[1, 2304]" = torch.ops.aten.squeeze.dim(sum_6, 1) + full_default: "f32[1, 2304]" = torch.ops.aten.full.default([1, 2304], 0, dtype=torch.float32, device='npu') + slice_scatter: "f32[1, 2304]" = torch.ops.aten.slice_scatter.default(full_default, squeeze_1, 0, 0, + 9223372036854775807) + squeeze_2: "f32[2, 2304]" = torch.ops.aten.squeeze.dim(cat, 0) + return [squeeze_2, permute_4, view_3, slice_scatter, mul_8] + primals_5 = torch.randn((1, 9600, 2304), device=device_npu, dtype=torch.float32) + getitem_3 = torch.randn((1, 9600, 1), device=device_npu, dtype=torch.float32) + rsqrt = torch.randn((1, 9600, 1), device=device_npu, dtype=torch.float32) + add_2 = torch.randn((1, 1, 2304), device=device_npu, dtype=torch.float32) + view = torch.randn((9600, 2304), device=device_npu, dtype=torch.float32) + permute_1 = torch.randn((32, 2304), device=device_npu, dtype=torch.float32) + tangents_1 = torch.randn((1, 9600, 32), device=device_npu, dtype=torch.float32) + args = (primals_5, getitem_3, rsqrt, add_2, view, permute_1, tangents_1) + ref = forward(args) + forward_calc = torch.compile(forward, backend="inductor", dynamic=False) + calc = forward_calc(args) + + for r, c in zip(ref, calc): + self.assertEqual(r, c, atol=1e-3, rtol=1e-3, equal_nan=True) + + + def test_opensora_cases_model_14_forward(self): + def forward(primals_1: "f32[2, 2304]", primals_2: "f32[32, 2304]", primals_3: "f32[32]", + primals_4: "f32[1, 2304]", primals_5: "f32[1, 9600, 2304]"): + unsqueeze: "f32[1, 2, 2304]" = torch.ops.aten.unsqueeze.default(primals_1, 0) + primals_1 = None + slice_1: "f32[1, 2304]" = torch.ops.aten.slice.Tensor(primals_4, 0, 0, 9223372036854775807) + primals_4 = None + unsqueeze_1: "f32[1, 1, 2304]" = torch.ops.aten.unsqueeze.default(slice_1, 1) + slice_1 = None + add: "f32[1, 2, 2304]" = torch.ops.aten.add.Tensor(unsqueeze, unsqueeze_1) + unsqueeze = unsqueeze_1 = None + split = torch.ops.aten.split.Tensor(add, 1, 1) + add = None + getitem: "f32[1, 1, 2304]" = split[0] + getitem_1: "f32[1, 1, 2304]" = split[1] + split = None + var_mean = torch.ops.aten.var_mean.correction(primals_5, [2], correction=0, keepdim=True) + getitem_2: "f32[1, 9600, 1]" = var_mean[0] + getitem_3: "f32[1, 9600, 1]" = var_mean[1] + var_mean = None + add_1: "f32[1, 9600, 1]" = torch.ops.aten.add.Tensor(getitem_2, 1e-06) + getitem_2 = None + rsqrt: "f32[1, 9600, 1]" = torch.ops.aten.rsqrt.default(add_1) + add_1 = None + sub: "f32[1, 9600, 2304]" = torch.ops.aten.sub.Tensor(primals_5, getitem_3) + mul: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(sub, rsqrt) + sub = None + add_2: "f32[1, 1, 2304]" = torch.ops.aten.add.Tensor(getitem_1, 1) + getitem_1 = None + mul_1: "f32[1, 9600, 2304]" 
= torch.ops.aten.mul.Tensor(mul, add_2) + mul = None + add_3: "f32[1, 9600, 2304]" = torch.ops.aten.add.Tensor(mul_1, getitem) + mul_1 = getitem = None + view: "f32[9600, 2304]" = torch.ops.aten.view.default(add_3, [9600, 2304]) + add_3 = None + permute: "f32[2304, 32]" = torch.ops.aten.permute.default(primals_2, [1, 0]) + primals_2 = None + addmm: "f32[9600, 32]" = torch.ops.aten.addmm.default(primals_3, view, permute) + primals_3 = None + view_1: "f32[1, 9600, 32]" = torch.ops.aten.view.default(addmm, [1, 9600, 32]) + addmm = None + # No stacktrace found for following nodes + squeeze: "f32[1, 9600, 32]" = torch.ops.aten.squeeze.dim(view_1, 1) + view_1 = None + permute_1: "f32[32, 2304]" = torch.ops.aten.permute.default(permute, [1, 0]) + permute = None + return [squeeze, primals_5, getitem_3, rsqrt, add_2, view, permute_1] + primals_1 = torch.ones((2, 2304), device=device_npu, dtype=torch.float32) + primals_2 = torch.ones((32, 2304), device=device_npu, dtype=torch.float32) + primals_3 = torch.ones((32,), device=device_npu, dtype=torch.float32) + primals_4 = torch.ones((1, 2304), device=device_npu, dtype=torch.float32) + primals_5 = torch.ones((1, 9600, 2304), device=device_npu, dtype=torch.float32) + ref = forward(primals_1, primals_2, primals_3, primals_4, primals_5) + forward_calc = torch.compile(forward, backend="inductor", dynamic=False) + calc = forward_calc(primals_1, primals_2, primals_3, primals_4, primals_5) + for r, c in zip(ref, calc): + self.assertEqual(r, c, atol=1e-4, rtol=1e-4, equal_nan=True) + + + def test_opensora_cases_model_15_forward(self): + def forward(primals_1: "f32[1, 8, 30, 40, 1, 2, 2, 8]", primals_2: "i64[]", primals_3: "i64[]", + primals_4: "i64[]"): + permute: "f32[1, 8, 8, 1, 30, 2, 40, 2]" = torch.ops.aten.permute.default(primals_1, [0, 7, 1, 4, 2, 5, 3, 6]) + mul: "i64[]" = torch.ops.aten.mul.Tensor(primals_2, 1) + mul_1: "i64[]" = torch.ops.aten.mul.Tensor(primals_3, 2) + mul_2: "i64[]" = torch.ops.aten.mul.Tensor(primals_4, 2) + return [permute, mul, mul_1, mul_2] + + primals_1 = torch.randn((1, 8, 30, 40, 1, 2, 2, 8), device=device_npu, dtype=torch.float32) + primals_2 = torch.tensor((1), device=device_npu, dtype=torch.int64) + primals_3 = torch.tensor((1), device=device_npu, dtype=torch.int64) + primals_4 = torch.tensor((1), device=device_npu, dtype=torch.int64) + ref = forward(primals_1, primals_2, primals_3, + primals_4) + forward_calc = torch.compile(forward, backend="inductor", dynamic=False) + calc = forward_calc(primals_1, primals_2, primals_3, + primals_4) + for r, c in zip(ref, calc): + self.assertEqual(r, c, atol=1e-4, rtol=1e-4, equal_nan=True) + + def test_opensora_cases_model_16_forward(self): + def forward(primals_1: "f32[2, 2304]", primals_2: "f32[32, 2304]", primals_3: "f32[32]", primals_4: "f32[1, 2304]", primals_5: "f32[1, 9600, 2304]"): + unsqueeze: "f32[1, 2, 2304]" = torch.ops.aten.unsqueeze.default(primals_1, 0) + slice_1: "f32[1, 2304]" = torch.ops.aten.slice.Tensor(primals_4, 0, 0, 9223372036854775807) + unsqueeze_1: "f32[1, 1, 2304]" = torch.ops.aten.unsqueeze.default(slice_1, 1) + add: "f32[1, 2, 2304]" = torch.ops.aten.add.Tensor(unsqueeze, unsqueeze_1) + split = torch.ops.aten.split.Tensor(add, 1, 1) + getitem: "f32[1, 1, 2304]" = split[0] + getitem_1: "f32[1, 1, 2304]" = split[1] + var_mean = torch.ops.aten.var_mean.correction(primals_5, [2], correction=0, keepdim=True) + getitem_2: "f32[1, 9600, 1]" = var_mean[0] + getitem_3: "f32[1, 9600, 1]" = var_mean[1] + add_1: "f32[1, 9600, 1]" = torch.ops.aten.add.Tensor(getitem_2, 
1e-06) + rsqrt: "f32[1, 9600, 1]" = torch.ops.aten.rsqrt.default(add_1) + sub: "f32[1, 9600, 2304]" = torch.ops.aten.sub.Tensor(primals_5, getitem_3) + mul: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(sub, rsqrt) + add_2: "f32[1, 1, 2304]" = torch.ops.aten.add.Tensor(getitem_1, 1) + mul_1: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(mul, add_2) + add_3: "f32[1, 9600, 2304]" = torch.ops.aten.add.Tensor(mul_1, getitem) + view: "f32[9600, 2304]" = torch.ops.aten.view.default(add_3, [9600, 2304]) + permute: "f32[2304, 32]" = torch.ops.aten.permute.default(primals_2, [1, 0]) + addmm: "f32[9600, 32]" = torch.ops.aten.addmm.default(primals_3, view, permute) + view_1: "f32[1, 9600, 32]" = torch.ops.aten.view.default(addmm, [1, 9600, 32]) + squeeze: "f32[1, 9600, 32]" = torch.ops.aten.squeeze.dim(view_1, 1) + view_2: "f32[1, 8, 30, 40, 1, 2, 2, 8]" = torch.ops.aten.view.default(squeeze, [1, 8, 30, 40, 1, 2, 2, 8]) + permute_1: "f32[1, 8, 8, 1, 30, 2, 40, 2]" = torch.ops.aten.permute.default(view_2, [0, 7, 1, 4, 2, 5, 3, 6]) + clone: "f32[1, 8, 8, 1, 30, 2, 40, 2]" = torch.ops.aten.clone.default(permute_1) + clone_1: "f32[1, 8, 8, 1, 30, 2, 40, 2]" = torch.ops.aten.clone.default(clone, memory_format=torch.contiguous_format) + view_3: "f32[1, 8, 8, 60, 80]" = torch.ops.aten.view.default(clone_1, [1, 8, 8, 60, 80]) + permute_3: "f32[32, 2304]" = torch.ops.aten.permute.default(permute, [1, 0]) + return [view_3, primals_5, getitem_3, rsqrt, add_2, view, permute_3] + + def seed_all(seed=1234, mode=False): + random.seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.use_deterministic_algorithms(mode) + torch_npu.npu.manual_seed_all(seed) + torch_npu.npu.manual_seed(seed) + + seed_all(True) + primals_1 = torch.randn((2, 2304), device=device_npu, dtype=torch.float32) + primals_2 = torch.randn((32, 2304), device=device_npu, dtype=torch.float32) + primals_3 = torch.randn((32,), device=device_npu, dtype=torch.float32) + primals_4 = torch.randn((1, 2304), device=device_npu, dtype=torch.float32) + primals_5 = torch.randn((1, 9600, 2304), device=device_npu, dtype=torch.float32) + + ref = forward(primals_1, primals_2, primals_3, primals_4, primals_5) + forward_calc = torch.compile(forward, backend="inductor", dynamic=False) + calc = forward_calc(primals_1, primals_2, primals_3, primals_4, primals_5) + for r, c in zip(ref, calc): + self.assertEqual(r, c, atol=1e-3, rtol=1e-3, equal_nan=True) + + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_permute.py b/test/_inductor/test_permute.py new file mode 100644 index 0000000000..d59e2f9ee4 --- /dev/null +++ b/test/_inductor/test_permute.py @@ -0,0 +1,39 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestPermute(TestUtils): + _permute_dims = [ + (0, 1, 2, 3), (0, 1, 3, 2), (0, 2, 1, 3), (0, 2, 3, 1), + (0, 3, 1, 2), (0, 3, 2, 1), (1, 0, 2, 3), (1, 0, 3, 2), + (1, 2, 0, 3), (1, 2, 3, 0), (1, 3, 0, 2), (1, 3, 2, 0), + (2, 0, 1, 3), (2, 0, 3, 1), (2, 1, 0, 3), (2, 1, 3, 0), + (2, 3, 0, 1), (2, 3, 1, 0), (3, 0, 1, 2), (3, 0, 2, 1), + (3, 1, 0, 2), (3, 1, 2, 0), (3, 2, 0, 1), (3, 2, 1, 0), + ] + + def op_calc(self, a, b, dim): + a = a.permute(dim) + b = b.permute(dim) + y = a + b + return y + + @parametrize('shape', [(8, 8, 512, 128)]) + @parametrize('dtype', ['float32', 'int32', 'float16', 'bfloat16', 'int64']) + def test_view_cases(self, shape, 
dtype): + a = self._generate_tensor(shape, dtype) + b = self._generate_tensor(shape, dtype) + + for dim in self._permute_dims: + std_permute = self.op_calc(a, b, dim) + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_permute = compiled_op_calc(a, b, dim) + + self.assertEqual(std_permute, inductor_permute, atol=1e-3, rtol=1e-3) + +instantiate_parametrized_tests(TestPermute) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_reduction_brocast_add.py b/test/_inductor/test_reduction_brocast_add.py new file mode 100644 index 0000000000..fb29fa1516 --- /dev/null +++ b/test/_inductor/test_reduction_brocast_add.py @@ -0,0 +1,30 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestSumAdd(TestUtils): + def foo(self, a, b, dim, shape): + y = a + b + y = y.sum(dim) + y = y.unsqueeze(dim) + y = y.broadcast_to(shape) + b + return y + + # case:change shapes + @parametrize('shape', [(9, 9, 31, 63)]) + @parametrize('dim', [0, 1, 2]) + @parametrize('dtype', ['float32']) + def test_reduction_cases_shapes1(self, shape, dim, dtype): + a, b = [torch.randn(shape, requires_grad=False, dtype=torch.float32, device="npu") for _ in range(2)] + r1 = self.foo(a, b, dim, shape) + func = torch.compile(self.foo, backend="inductor", dynamic=False) + r = func(a, b, dim, shape) + self.assertEqual(r, r1, atol=1e-3, rtol=1e-3) + + +instantiate_parametrized_tests(TestSumAdd) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_relu.py b/test/_inductor/test_relu.py new file mode 100644 index 0000000000..1bb0562c62 --- /dev/null +++ b/test/_inductor/test_relu.py @@ -0,0 +1,28 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestRelu(TestUtils): + def op_calc(self, first_element): + result = torch.relu(first_element) + return result + + @parametrize('shape', TestUtils._pointwise_demo_shapes) + @parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32', 'int64']) + def test_pointwise_cases(self, shape, dtype): + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element) + + self.assertEqual(std_result, inductor_result) + + +instantiate_parametrized_tests(TestRelu) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_renorm.py b/test/_inductor/test_renorm.py new file mode 100644 index 0000000000..0d49727221 --- /dev/null +++ b/test/_inductor/test_renorm.py @@ -0,0 +1,27 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestRenorm(TestUtils): + def op_calc(self, input_element, dim): + return torch.renorm(input_element, p=2, dim=dim, maxnorm=5) + + # case:change shapes + @parametrize('shape', [(32, 64)]) + @parametrize('dim', [-1]) + @parametrize('dtype', ['float32']) + def test_reduction_cases_shapes(self, shape, dim, dtype): + input_element = self._generate_tensor(shape, dtype) + std_ret = self.op_calc(input_element, dim) + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_ret = compiled_op_calc(input_element, dim) + + 
self.assertEqual(std_ret, inductor_ret, equal_nan=True) + + +instantiate_parametrized_tests(TestRenorm) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_repeat.py b/test/_inductor/test_repeat.py new file mode 100644 index 0000000000..1d7d2773ab --- /dev/null +++ b/test/_inductor/test_repeat.py @@ -0,0 +1,29 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestRepeat(TestUtils): + def op_calc(self, input_element, dim): + return input_element.repeat(dim) + + # case:change shapes + @parametrize('shape', [(16, 128, 64)]) + @parametrize('dim', [(1, 1, 2), (1, 2, 1), (2, 1, 1)]) + @parametrize('dtype', ['float32']) + def test_reduction_cases_shapes(self, shape, dim, dtype): + input_element = self._generate_tensor(shape, dtype) + + std_ret = self.op_calc(input_element, dim) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor", dynamic=False) + inductor_ret = compiled_op_calc(input_element, dim) + + self.assertEqual(std_ret, inductor_ret, atol=1e-1, rtol=1e-1) + + +instantiate_parametrized_tests(TestRepeat) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_reshape.py b/test/_inductor/test_reshape.py new file mode 100644 index 0000000000..80ad8ea222 --- /dev/null +++ b/test/_inductor/test_reshape.py @@ -0,0 +1,35 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + + + +class TestReshape(TestUtils): + B, N, S, D = (1, 12, 256, 8) + + def op_calc(self, a, b): + a = a.reshape(self.S, self.B, self.N * self.D) + b = b.reshape(self.S, self.B, self.N * self.D) + y = a + b + return y + + @parametrize('shape', [(1, 12, 256, 8)]) + @parametrize('dtype', ['float32', 'int32', 'float16', 'bfloat16', 'int64']) + def test_view_cases(self, shape, dtype): + a = self._generate_tensor(shape, dtype) + b = self._generate_tensor(shape, dtype) + + std_reshape = self.op_calc(a, b) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_reshape = compiled_op_calc(a, b) + + self.assertEqual(std_reshape, inductor_reshape, atol=1e-3, rtol=1e-3) + + +instantiate_parametrized_tests(TestReshape) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_rsqrt.py b/test/_inductor/test_rsqrt.py new file mode 100644 index 0000000000..00f647cfbb --- /dev/null +++ b/test/_inductor/test_rsqrt.py @@ -0,0 +1,28 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestRsqrt(TestUtils): + def op_calc(self, first_element): + result = torch.rsqrt(first_element) + return result + + @parametrize('shape', TestUtils._pointwise_demo_shapes) + @parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32', 'int64']) + def test_pointwise_cases(self, shape, dtype): + first_element = self._generate_tensor(shape, dtype, 1) + + std_result = self.op_calc(first_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element) + + self.assertEqual(std_result, inductor_result, atol=1e-1, rtol=1e-1) + + +instantiate_parametrized_tests(TestRsqrt) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_slice.py b/test/_inductor/test_slice.py new file mode 100644 index 
0000000000..7da1efb2d7 --- /dev/null +++ b/test/_inductor/test_slice.py @@ -0,0 +1,50 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestSlice(TestUtils): + def op_calc(self, a, b, dim, step): + if dim == 0: + target = a.shape[0] + end = target // step + a = a[:end:, ::, ::, ::] + b = b[:end:, ::, ::, ::] + elif dim == 1: + target = a.shape[1] + end = target // step + a = a[::, :end:, ::, ::] + b = b[::, :end:, ::, ::] + elif dim == 2: + target = a.shape[2] + end = target // step + a = a[::, ::, :end:, ::] + b = b[::, ::, :end:, ::] + elif dim == 3: + target = a.shape[3] + end = target // step + a = a[::, ::, ::, :end:] + b = b[::, ::, ::, :end:] + y = a + b + return y + + @parametrize('shape', [(8, 8, 256, 128)]) + @parametrize('dtype', ['float32', 'int32', 'float16', 'bfloat16', 'int64']) + def test_view_cases(self, shape, dtype): + a = self._generate_tensor(shape, dtype) + b = self._generate_tensor(shape, dtype) + + for dim in [3, 2, 1, 0]: + std_slice = self.op_calc(a, b, dim, min(shape) // 2) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_slice = compiled_op_calc(a, b, dim, min(shape) // 2) + + self.assertEqual(std_slice, inductor_slice, atol=1e-3, rtol=1e-3) + + +instantiate_parametrized_tests(TestSlice) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_split_loop.py b/test/_inductor/test_split_loop.py new file mode 100644 index 0000000000..840de0a95d --- /dev/null +++ b/test/_inductor/test_split_loop.py @@ -0,0 +1,28 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestSplitLoop(TestUtils): + def op_calc(self, a, b): + return torch.nn.functional.gelu(a + b) + + @parametrize('shape', [(8, 86, 1152), (61, 89, 157), (7, 89, 971)]) + @parametrize('dtype', ['float32']) + def test_split_loop(self, shape, dtype): + + a = self._generate_tensor(shape, dtype) + b = self._generate_tensor((shape[0], 1, shape[2]), dtype) + + std_ = self.op_calc(a, b) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor", dynamic=False) + inductor_ = compiled_op_calc(a, b) + self.assertEqual(std_, inductor_, atol=1e-3, rtol=1e-3) + + +instantiate_parametrized_tests(TestSplitLoop) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_sqrt.py b/test/_inductor/test_sqrt.py new file mode 100644 index 0000000000..92141c47c3 --- /dev/null +++ b/test/_inductor/test_sqrt.py @@ -0,0 +1,27 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestSqrt(TestUtils): + def op_calc(self, first_element): + result = torch.sqrt(first_element) + return result + + @parametrize('shape', TestUtils._pointwise_demo_shapes) + @parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32', 'int64']) + def test_pointwise_cases(self, shape, dtype): + first_element = self._generate_tensor(shape, dtype, 1) + + std_result = self.op_calc(first_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element) + self.assertEqual(std_result, inductor_result, atol=1e-1, rtol=1e-1, equal_nan=True) + + +instantiate_parametrized_tests(TestSqrt) + +if __name__ == "__main__": + run_tests() 
diff --git a/test/_inductor/test_sub.py b/test/_inductor/test_sub.py
new file mode 100644
index 0000000000..1af89991ca
--- /dev/null
+++ b/test/_inductor/test_sub.py
@@ -0,0 +1,28 @@
+import torch
+from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests
+from testutils import TestUtils
+import torch_npu
+
+
+class TestSub(TestUtils):
+    def op_calc(self, first_element, second_element):
+        result = first_element - second_element
+        return result
+
+    @parametrize('shape', TestUtils._pointwise_demo_shapes)
+    @parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32', 'int64'])
+    def test_pointwise_cases(self, shape, dtype):
+        first_element = self._generate_tensor(shape, dtype)
+        second_element = self._generate_tensor(shape, dtype)
+
+        std_sub = self.op_calc(first_element, second_element)
+
+        compiled_op_calc = torch.compile(self.op_calc, backend="inductor")
+        inductor_sum = compiled_op_calc(first_element, second_element)
+        self.assertEqual(std_sub, inductor_sum)
+
+
+instantiate_parametrized_tests(TestSub)
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_sum.py b/test/_inductor/test_sum.py
new file mode 100644
index 0000000000..e8cffbde6a
--- /dev/null
+++ b/test/_inductor/test_sum.py
@@ -0,0 +1,55 @@
+import torch
+from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests
+from testutils import TestUtils
+import torch_npu
+
+
+class TestSum(TestUtils):
+    def op_calc(self, input_element, dim):
+        return torch.sum(input_element, dim)
+    # Case with aligned reduction and non-reduction axes, float32, XBLOCK_SUB >= 8: shape=(8, 32)
+    # Non-persistent reduction case, reduction axis > 1024: shape=(8, 8, 8, 2048), dim=-1
+    _reduction_extest_shape4d_all = [(8, 32), (8, 8, 8, 2048)]
+    _reduction_extest_dim4d_low = [-1]
+    _reduction_extest_dim4d_all = [0, 1, 2]
+
+    @parametrize('shape', _reduction_extest_shape4d_all)
+    @parametrize('dim', _reduction_extest_dim4d_low)
+    @parametrize('dtype', ['float32'])
+    def test_reduction_cases_shapes(self, shape, dim, dtype):
+        input_element = self._generate_tensor(shape, dtype)
+        std_sum = self.op_calc(input_element, dim)
+        compiled_op_calc = torch.compile(self.op_calc, backend="inductor", dynamic=False)
+        inductor_sum_tmp = compiled_op_calc(input_element, dim)
+        if dtype == 'int32' or dtype == 'int64':
+            # inductor returns float32; cast to int64 for the assert
+            inductor_sum = inductor_sum_tmp.long()
+        elif dtype == 'float16':
+            # inductor returns float32; cast to float16 for the assert
+            inductor_sum = inductor_sum_tmp.half()
+        elif dtype == 'bfloat16':
+            # inductor returns float32; cast the reference to float32 for the assert
+            std_sum = std_sum.float()
+            inductor_sum = inductor_sum_tmp
+        else:
+            inductor_sum = inductor_sum_tmp
+
+        self.assertEqual(std_sum, inductor_sum, atol=1e-1, rtol=1e-1)
+
+    @parametrize('shape', [(32, 16, 64, 128)])
+    @parametrize('dim', _reduction_extest_dim4d_all)
+    @parametrize('dtype', ['float32'])
+    def test_reduction_cases_dims(self, shape, dim, dtype):
+
+        input_element = self._generate_tensor(shape, dtype)
+        std_sum = self.op_calc(input_element, dim)
+        compiled_op_calc = torch.compile(self.op_calc, backend="inductor", dynamic=False)
+        inductor_sum = compiled_op_calc(input_element, dim)
+
+        self.assertEqual(std_sum, inductor_sum, atol=1e-1, rtol=1e-1)
+
+
+instantiate_parametrized_tests(TestSum)
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/_inductor/test_sum_add.py b/test/_inductor/test_sum_add.py
new file mode 100644
index 0000000000..806336f076
--- /dev/null
+++
b/test/_inductor/test_sum_add.py @@ -0,0 +1,37 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestSumAdd(TestUtils): + def op_calc(self, input_element, dim, input_element2): + tmp = torch.sum(input_element, dim) + return tmp + input_element2 + + @parametrize('shape', [(32, 64, 128, 2048)]) + @parametrize('dim', [0, 1, 2, 3]) + @parametrize('dtype', ['float32']) + def test_reduction_cases_shapes(self, shape, dim, dtype): + input_element = self._generate_tensor(shape, dtype) + if dim == -1 or dim == 3: + input_element2 = torch.full(size=(32, 64, 128), fill_value=1000.0, dtype=torch.float32, device=torch.device("npu")) + elif dim == 2: + input_element2 = torch.full(size=(32, 64, 2048), fill_value=1000.0, dtype=torch.float32, device=torch.device("npu")) + elif dim == 1: + input_element2 = torch.full(size=(32, 128, 2048), fill_value=1000.0, dtype=torch.float32, device=torch.device("npu")) + else: + input_element2 = torch.full(size=(64, 128, 2048), fill_value=1000.0, dtype=torch.float32, device=torch.device("npu")) + + std_sum = self.op_calc(input_element, dim, input_element2) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_sum = compiled_op_calc(input_element, dim, input_element2) + + self.assertEqual(std_sum, inductor_sum, atol=1e-1, rtol=1e-1) + + +instantiate_parametrized_tests(TestSumAdd) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_triton.py b/test/_inductor/test_triton.py new file mode 100644 index 0000000000..c9e254ba2a --- /dev/null +++ b/test/_inductor/test_triton.py @@ -0,0 +1,36 @@ +# Owner(s): ["module: tests"] + +import unittest +import torch +from torch.testing._internal.common_utils import run_tests, TestCase, load_tests +from torch.utils._triton import has_triton_package, has_triton, has_triton_tma, has_triton_tma_device +import torch_npu +import torch_npu.testing + +# load_tests from torch.testing._internal.common_utils is used to automatically filter tests for +# sharding on sandcastle. 
This line silences flake warnings +load_tests = load_tests + + +class TestHasTriton(TestCase): + +    def test_has_triton(self): +        if not has_triton_package(): +            # no triton library found, skip test_has_triton +            return + +        self.assertFalse(has_triton()) +        self.assertFalse(has_triton_tma()) +        self.assertFalse(has_triton_tma_device()) + +        from torch_npu.contrib import transfer_to_npu + +        self.assertFalse(has_triton()) +        self.assertFalse(has_triton_tma()) +        self.assertFalse(has_triton_tma_device()) + + + + +if __name__ == "__main__": +    run_tests() \ No newline at end of file diff --git a/test/_inductor/test_var.py b/test/_inductor/test_var.py new file mode 100644 index 0000000000..f79fa3f70a --- /dev/null +++ b/test/_inductor/test_var.py @@ -0,0 +1,26 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestVar(TestUtils): +    def op_calc(self, input_element, dim): +        return torch.var(input_element, dim) + +    # case: change shapes +    @parametrize('shape', [(8, 64, 128)]) +    @parametrize('dim', [0, 1, 2]) +    @parametrize('dtype', ['float16']) +    def test_reduction_cases_shapes(self, shape, dim, dtype): +        input_element = self._generate_tensor(shape, dtype) +        std_ret = self.op_calc(input_element, dim) +        compiled_op_calc = torch.compile(self.op_calc, backend="inductor") +        inductor_ret = compiled_op_calc(input_element, dim) +        self.assertEqual(std_ret, inductor_ret, atol=1e-1, rtol=1e-1, equal_nan=True) + + +instantiate_parametrized_tests(TestVar) + +if __name__ == "__main__": +    run_tests() diff --git a/test/_inductor/test_var_mean.py b/test/_inductor/test_var_mean.py new file mode 100644 index 0000000000..bfe333accb --- /dev/null +++ b/test/_inductor/test_var_mean.py @@ -0,0 +1,31 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestVarMean(TestUtils): +    def op_calc(self, input_element, dim): +        return torch.var_mean(input_element, dim) + +    # case: the shape must not be too large +    @parametrize('shape', [(8, 64, 128)]) +    @parametrize('dim', [0, 1, 2, (0, 2), (0, 1)]) +    @parametrize('dtype', ['float32']) +    def test_reduction_cases_shapes(self, shape, dim, dtype): + +        input_element = self._generate_tensor(shape, dtype) + +        std_var, std_mean = self.op_calc(input_element, dim) + +        compiled_op_calc = torch.compile(self.op_calc, backend="inductor", dynamic=False) +        inductor_var, inductor_mean = compiled_op_calc(input_element, dim) + +        self.assertEqual(std_var, inductor_var, atol=1e-1, rtol=1e-1, equal_nan=True) +        self.assertEqual(std_mean, inductor_mean, atol=1e-1, rtol=1e-1, equal_nan=True) + + +instantiate_parametrized_tests(TestVarMean) + +if __name__ == "__main__": +    run_tests() diff --git a/test/_inductor/test_var_mean_add_mul.py b/test/_inductor/test_var_mean_add_mul.py new file mode 100644 index 0000000000..946b40f10a --- /dev/null +++ b/test/_inductor/test_var_mean_add_mul.py @@ -0,0 +1,48 @@ +import torch +from torch.testing._internal.common_utils import run_tests +from testutils import TestUtils +import torch_npu + + +class TestReduction(TestUtils): +    def forward(self, add: "f32[1, 2, 2304]", primals_2: "f32[32, 2304]", primals_5: "f32[1, 9600, 2304]"): +        split = torch.ops.aten.split.Tensor(add, 1, 1) +        getitem: "f32[1, 1, 2304]" = split[0] +        getitem_1: "f32[1, 1, 2304]" = split[1] + +        var_mean = torch.ops.aten.var_mean.correction(primals_5, [2], 
correction=0, keepdim=True) + getitem_2: "f32[1, 9600, 1]" = var_mean[0] + getitem_3: "f32[1, 9600, 1]" = var_mean[1] + add_1: "f32[1, 9600, 1]" = torch.ops.aten.add.Tensor(getitem_2, 1e-06) + rsqrt: "f32[1, 9600, 1]" = torch.ops.aten.rsqrt.default(add_1) + sub: "f32[1, 9600, 2304]" = torch.ops.aten.sub.Tensor(primals_5, getitem_3) + mul: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(sub, rsqrt) + + add_2: "f32[1, 1, 2304]" = torch.ops.aten.add.Tensor(getitem_1, 1) + mul_1: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(mul, add_2) + add_3: "f32[1, 9600, 2304]" = torch.ops.aten.add.Tensor(mul_1, getitem) + + view: "f32[9600, 2304]" = torch.ops.aten.view.default(add_3, [9600, 2304]) + return [None, primals_5, getitem_3, rsqrt, add_2, view, primals_2] + + def test_reduction_cases_shapes(self): + device = 'npu' + primals_2: "f32[32, 2304]" = torch.randn((32, 2304), device=device, dtype=torch.float32) + primals_5: "f32[1, 9600, 2304]" = torch.randn((1, 9600, 2304), device=device, dtype=torch.float32) + add: "f32[1, 2, 2304]" = torch.randn((1, 2, 2304), device=device, dtype=torch.float32) + + _, primals_5_ref, getitem_3_ref, rsqrt_ref, add_2_ref, view_ref, primals_2_ref = self.forward(add, primals_2, primals_5) + + self.forward = torch.compile(self.forward, backend="inductor", dynamic=False) + _, primals_5, getitem_3, rsqrt, add_2, view, primals_2 = self.forward(add, primals_2, primals_5) + + self.assertEqual(primals_5_ref, primals_5, atol=1e-3, rtol=1e-3, equal_nan=True) + self.assertEqual(getitem_3_ref, getitem_3, atol=1e-3, rtol=1e-3, equal_nan=True) + self.assertEqual(rsqrt_ref, rsqrt, atol=1e-3, rtol=1e-3, equal_nan=True) + self.assertEqual(add_2_ref, add_2, atol=1e-3, rtol=1e-3, equal_nan=True) + self.assertEqual(view_ref, view, atol=1e-3, rtol=1e-3, equal_nan=True) + self.assertEqual(primals_2_ref, primals_2, atol=1e-3, rtol=1e-3, equal_nan=True) + + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/test_where.py b/test/_inductor/test_where.py new file mode 100644 index 0000000000..9c5c04c84d --- /dev/null +++ b/test/_inductor/test_where.py @@ -0,0 +1,29 @@ +import torch +from torch.testing._internal.common_utils import run_tests, parametrize, instantiate_parametrized_tests +from testutils import TestUtils +import torch_npu + + +class TestWhere(TestUtils): + def op_calc(self, condition, first_element, second_element): + return torch.where(condition, first_element, second_element) + + @parametrize('shape', TestUtils._pointwise_demo_shapes) + @parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32']) + def test_pointwise_cases(self, shape, dtype): + first_element = self._generate_tensor(shape, dtype) + second_element = self._generate_tensor(shape, dtype) + condition = self._generate_tensor(shape, 'bool') + + std_result = self.op_calc(condition, first_element, second_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(condition, first_element, second_element) + + self.assertEqual(std_result, inductor_result) + + +instantiate_parametrized_tests(TestWhere) + +if __name__ == "__main__": + run_tests() diff --git a/test/_inductor/testutils.py b/test/_inductor/testutils.py new file mode 100644 index 0000000000..47863aaf34 --- /dev/null +++ b/test/_inductor/testutils.py @@ -0,0 +1,36 @@ +from collections.abc import Sequence +import os +import time +import numpy as np +import torch +from torch.testing._internal.common_utils import TestCase +import torch_npu + + +class TestUtils(TestCase): + 
_pointwise_test_shape2d = [(4096, 256), (1024, 32), (8, 2048), (8, 4096)]  # (8, 4), (8, 8), not supported +    _pointwise_test_shape3d = [(8, 8, 4), (8, 8, 8), (8, 8, 2048), (8, 8, 4096)] +    _pointwise_test_shape4d = [(128, 128, 4096, 4), (128, 128, 4096, 8), +                               (32, 32, 1024, 1024)]  # 128*128*4096*2048 is too big (512 GB) +    _pointwise_test_shapes = _pointwise_test_shape2d + _pointwise_test_shape3d + _pointwise_test_shape4d + +    _pointwise_demo_shapes = [(1024, 32), (8, 16, 256, 32)] +    _reduction_extest_shape4d = [(8, 8, 8, 16384), (8, 8, 16384, 8), (8, 16384, 8, 8), (16384, 8, 8, 8)] +    _reduction_extest_dim4d = [-1, -2, 1, 0] +    _reduction_extest_SDbinding = list(zip(_reduction_extest_shape4d, _reduction_extest_dim4d)) + +    _test_dtypes = ['float32', 'int32', 'float16', 'bfloat16', 'int64'] + +    @staticmethod +    def _generate_tensor(shape, dtype, floatPOSIFLAG=0): +        if dtype == 'float32' or dtype == 'float16' or dtype == 'bfloat16': +            if floatPOSIFLAG: +                return 1000 * torch.rand(size=shape, dtype=eval('torch.' + dtype), device=torch.device("npu")) +            else: +                return torch.randn(size=shape, dtype=eval('torch.' + dtype), device=torch.device("npu")) * 2000 +        elif dtype == 'int32' or dtype == 'int64': +            return torch.randint(low=0, high=2000, size=shape, dtype=eval('torch.' + dtype), device=torch.device("npu")) +        elif dtype == 'bool': +            return torch.randint(low=0, high=2, size=shape, device=torch.device("npu")).bool() +        else: +            raise ValueError('Invalid value for parameter "dtype": {}'.format(dtype)) diff --git a/torch_npu/_inductor/__init__.py b/torch_npu/_inductor/__init__.py index 04ef05321d..373f08cc26 100644 --- a/torch_npu/_inductor/__init__.py +++ b/torch_npu/_inductor/__init__.py @@ -21,6 +21,7 @@ from .npu_choices import should_use_persistent_reduction from .npu_device import NewNPUDeviceOpOverrides from .runtime import _load_cached_autotuning from .utils import get_current_raw_stream, patch_is_gpu, patch_has_triton +from .codecache import patch_aot_code_compiler_compile, patch_cache_base_get_system set_compile_threads() @@ -54,14 +55,16 @@ def patch_torch_for_aoti(): from .utils import patch_is_same_tensor from .fx_passes.joint_graph import patch_constant_fold_uniform_value from .ir import patch_fallback_kernel_codegen -    from .codecache import patch_aot_code_compiler_compile + patch_codegen_with_cpp_wrapper() patch_get_cpp_torch_device_options() patch_device_to_aten() patch_is_same_tensor() patch_constant_fold_uniform_value() patch_fallback_kernel_codegen() -    patch_aot_code_compiler_compile() + +    patch_aot_code_compiler_compile() + + if os.environ.get("DISABLE_AOTI_PATCH", "0") != "1": @@ -99,5 +102,9 @@ InductorChoices.should_use_persistent_reduction = should_use_persistent_reductio autotune_cache._load_cached_autotuning = _load_cached_autotuning register_fa_pass() +patch_cache_base_get_system() patch_is_gpu() patch_has_triton() + + + diff --git a/torch_npu/_inductor/codecache.py b/torch_npu/_inductor/codecache.py index 9cd9dac4f0..1efec225f2 100644 --- a/torch_npu/_inductor/codecache.py +++ b/torch_npu/_inductor/codecache.py @@ -1,5 +1,7 @@ import os import contextlib +import hashlib +import json from typing import ( Any, Callable, @@ -18,9 +20,9 @@ from typing import ( import torch from torch._inductor import config -from torch._inductor.codecache import get_lock_dir, LOCK_TIMEOUT +from torch._inductor.codecache import CacheBase, get_lock_dir, LOCK_TIMEOUT from torch._inductor.graph import GraphLowering - +import torch_npu from torch_npu.utils._error_code import ErrCode, pta_error empty_json = 
"{}" @@ -35,6 +37,52 @@ def lock_context(key): yield + +def patch_cache_base_get_system(): + # patch function CacheBase.get_system with get_system_npu, add logic to support CANN + @staticmethod + def get_system(): + try: + from triton.compiler.compiler import triton_key + + # Use triton_key instead of triton.__version__ as the version + # is not updated with each code change + triton_version = triton_key() + except ModuleNotFoundError: + triton_version = None + + try: + system: Dict[str, Any] = { + "device": {"name": None}, + "version": { + "triton": triton_version, + }, + } + device_properties = torch_npu.npu.get_device_properties( + torch_npu.npu.current_device() + ) + if torch.version.cann is not None: + system["device"]["name"] = device_properties.name + system["version"]["cann"] = torch.version.cann + elif torch.version.cuda is not None: + system["device"]["name"] = device_properties.name + system["version"]["cuda"] = torch.version.cuda + else: + system["device"]["name"] = device_properties.gcnArchName + system["version"]["hip"] = torch.version.hip + except (AssertionError, RuntimeError): + # If deivce is not installed, none of the above config is relevant. + system = {} + + system["hash"] = hashlib.sha256( + json.dumps(system, sort_keys=True).encode("utf-8") + ).hexdigest() + + return system + + CacheBase.get_system = get_system + + def patch_aot_code_compiler_compile(): # In v2.6.0, aoti has bug when init oss_proxy_executor with default op_json, # which could not be skipped, so here we try to create a new npu op_json, diff --git a/torch_npu/_inductor/lowering.py b/torch_npu/_inductor/lowering.py index 29ac8924a1..2b47e091af 100644 --- a/torch_npu/_inductor/lowering.py +++ b/torch_npu/_inductor/lowering.py @@ -33,7 +33,7 @@ from torch._inductor.lowering import ( add_layout_constraint ) import torch_npu -from torch_npu import npu_dtype_cast +from torch_npu import npu_dtype_cast, _npu_dtype_cast from .lowering_op_list import GENERATE_LIST, GENERATE_LIST2, FALLBACK_LIST, LOWERING_OVERLOAD_OP @@ -198,6 +198,10 @@ def _register_npu_inductor_fallbacks(): def _convert_npu_type(x: TensorBox, dtype: torch.dtype): return to_dtype(x, dtype, copy=True) + @register_lowering(_npu_dtype_cast, type_promotion_kind=None) + def _convert__npu_type(x: TensorBox, dtype: torch.dtype): + return to_dtype(x, dtype, copy=True) + def var_mean_sum_(x, axis, correction, keepdim, return_mean): if correction is None: correction = 1 diff --git a/torch_npu/_inductor/lowering_fx.py b/torch_npu/_inductor/lowering_fx.py index 5084c29534..e9d3e3a0e9 100644 --- a/torch_npu/_inductor/lowering_fx.py +++ b/torch_npu/_inductor/lowering_fx.py @@ -2223,6 +2223,10 @@ def _register_npu_inductor_fallbacks(): def _convert_npu_type(x: TensorBox, dtype: torch.dtype): return to_dtype(x, dtype, copy=True) + @register_lowering(npu._npu_dtype_cast, type_promotion_kind=None) + def _convert__npu_type(x: TensorBox, dtype: torch.dtype): + return to_dtype(x, dtype, copy=True) + def var_mean_sum_(x, axis, correction, keepdim, return_mean): if correction is None: correction = 1 diff --git a/torch_npu/_inductor/lowering_op_list.py b/torch_npu/_inductor/lowering_op_list.py index 0e8bb3a9a5..e750953242 100644 --- a/torch_npu/_inductor/lowering_op_list.py +++ b/torch_npu/_inductor/lowering_op_list.py @@ -1,5 +1,5 @@ import torch -from torch_npu import npu_dtype_cast +from torch_npu import npu_dtype_cast, _npu_dtype_cast aten = torch.ops.aten tr_c10d = torch.ops.tr_c10d @@ -56,6 +56,7 @@ GENERATE_LIST = [ aten.clamp_max, aten.mean, npu_dtype_cast, + 
_npu_dtype_cast, aten.select_scatter, aten.slice_scatter, prims.broadcast_in_dim, diff --git a/torch_npu/_inductor/npu_triton_heuristics.py b/torch_npu/_inductor/npu_triton_heuristics.py index 64f25854c8..561521151a 100644 --- a/torch_npu/_inductor/npu_triton_heuristics.py +++ b/torch_npu/_inductor/npu_triton_heuristics.py @@ -1,6 +1,7 @@ # This file is based on triton_heuristics with heuristics designed for NPU import copy import functools +from functools import lru_cache import hashlib import importlib import json @@ -551,6 +552,15 @@ class NPUCachingAutotuner(CachingAutotuner): if self.save_cache_hook: self.save_cache_hook(self.launchers[0].config, self.autotune_time_taken_ns) + @lru_cache(None) + def get_fx_graph_dump_path(self): + traced_graph_hash = self.inductor_meta.get("traced_graph_hash") + dump_dir = self.inductor_meta.get("traced_graph_dir", "") + dump_path = os.path.join(dump_dir, traced_graph_hash) + if dump_dir == "" or not os.path.exists(dump_path): + return None + return dump_path + def get_fx_graph_call(self, auto_fallback=False): kernel_name = self.inductor_meta.get("kernel_name", "triton_") traced_graph_hash = self.inductor_meta.get("traced_graph_hash") @@ -593,8 +603,13 @@ class NPUCachingAutotuner(CachingAutotuner): return fx_graph_call, kernel_name, dump_path, fx_module def data_dump(self, *args, dump_path=None): + dump_path = self.get_fx_graph_dump_path() if dump_path is None else dump_path + if dump_path is None: + log.warning(f"data dump for kernel {self.get_fn_name()} failed, no valid dump_path is supplied.") + return False data_dump_path = os.path.join(dump_path, 'data.pth') torch.save(args, data_dump_path) + return True def get_fn_name(self): if self.fn_name is not None: @@ -651,12 +666,11 @@ class NPUCachingAutotuner(CachingAutotuner): return True - def check_accuracy(self, *args, launcher, stream, **kwargs): + def check_accuracy(self, *args, launcher, grid, stream, **kwargs): fx_graph_call, kernel_name, dump_path, fx_module = self.get_fx_graph_call() if not fx_graph_call: return None call_outputs_indices = fx_module.call_args_mapping[fx_module.num_inputs:] - self.data_dump(*args, dump_path=dump_path) fx_args = [] for idx in fx_module.call_args_mapping: @@ -668,10 +682,9 @@ class NPUCachingAutotuner(CachingAutotuner): fx_graph_call(*fx_args) - ret = launcher( + launcher( *args, **kwargs, - grid=grid, stream=stream, ) @@ -730,16 +743,42 @@ class NPUCachingAutotuner(CachingAutotuner): torch.save(dump_args, f"{dump_path}/{idx}_{fn_name}_after.pt") return result + def maybe_run_debug(self, *args, grid_, stream, launcher, **kwargs): + kernel_name = self.get_fn_name() + log.info(f"Try to run debug mode for kernel {kernel_name}.") + if npu_config.dump_fx_graph: + _ = self.data_dump(*args) + + if npu_config.check_accuracy: + if self.check_accuracy(*args, launcher=launcher, grid=grid_, stream=stream, **kwargs): + return "check_accuracy" + elif npu_config.force_fallback_kernel_id: + fallback_result = self.fallback_to_fx(*args, launcher=launcher, grid_=grid_, stream=stream, **kwargs) + if fallback_result is not None: + log.debug(f"fallback kernel {self.get_fn_name()} to fx graph call.") + return "force_fallback_kernel_id" + else: + log.warning(f"kernel {self.get_fn_name()} could not fallback to fx.") + elif npu_config.aot_inductor.debug_kernel_in_run: + _ = self.debug_kernel_in_run(*args, launcher=launcher, grid_=grid_, stream=stream, **kwargs) + return "debug_kernel_in_run" + + log.info(f"No debug mode is activated for kernel {kernel_name}.") + return None def run( 
self, *args, stream, benchmark_run=False, **kwargs ): # type:ignore[override] if self.triton_interpret: args, grid = self._interpret_args_grid(args, self.configs[0]) + copied_kwargs = copy.copy(self.configs[0].kwargs) + copied_kwargs.pop('split_axis', None) + copied_kwargs.pop('split_blocks', None) + return self.fn[grid]( *args, **kwargs, - **self.configs[0].kwargs, + **copied_kwargs, ) if hasattr(self.launchers[0], "fallback"): @@ -772,26 +811,11 @@ class NPUCachingAutotuner(CachingAutotuner): if self.dump_launch_params: _dump_launch_params(args, kwargs, launcher, self.fn.__name__) - if npu_config.check_accuracy: - if self.check_accuracy(*args, launcher=launcher, stream=stream, **kwargs): - return - - elif npu_config.dump_fx_graph: - fx_graph_call, kernel_name, dump_path, _ = self.get_fx_graph_call() - if not fx_graph_call: - log.warning(f"data dump for kernel {kernel_name} failed!") - else: - self.data_dump(*args, dump_path=dump_path) - - elif npu_config.force_fallback_kernel_id: - fallback_result = self.fallback_to_fx(*args, launcher=launcher, stream=stream, **kwargs) - if fallback_result is not None: - log.debug(f"fallback kernel {self.get_fn_name()} to fx graph call.") - return - else: - log.warning(f"kernel {self.get_fn_name()} could not fallback to fx.") - elif npu_config.aot_inductor.debug_kernel_in_run: - return self.debug_kernel_in_run(*args, launcher=launcher, stream=stream, **kwargs) + _, grid = self._interpret_args_grid(args, launcher.config) + debug_mode = self.maybe_run_debug(*args, grid_=grid, stream=stream, launcher=launcher, **kwargs) + if debug_mode: + log.info(f"Kernel {self.get_fn_name()} goes into {debug_mode} and return.") + return # it is faster than entering and exiting a context manager, even if the context # manager is a nullcontext. 
@@ -818,6 +842,30 @@ class NPUCachingAutotuner(CachingAutotuner): stream=stream, ) + def _interpret_args_grid( + self, args: tuple[Any, ...], cfg: Config + ) -> tuple[tuple[Any, ...], tuple[int, int, int]]: + + numels = [ + arg + for arg in self.fn.arg_names + if "_numel" in arg + ] + grid = GridExprNpu.from_meta_and_set_numel(self.inductor_meta, cfg, numels).eval_slow( + dict( + zip( + [ + *self.fn.arg_names, + *self.inductor_meta.get("extra_launcher_args", ()), + ], + args, + ) + ) + ) + if self.inductor_meta.get("extra_launcher_args"): + args = args[: -len(self.inductor_meta["extra_launcher_args"])] + return args, grid + class NPUDebugAutotuner(NPUCachingAutotuner): def __init__(self, *args, regex_filter="", **kwargs): diff --git a/torch_npu/_inductor/utils.py b/torch_npu/_inductor/utils.py index a3ac4fd66b..095f1f69cf 100644 --- a/torch_npu/_inductor/utils.py +++ b/torch_npu/_inductor/utils.py @@ -73,4 +73,6 @@ def patch_has_triton(): return is_device_compatible_with_triton() torch.utils._triton.has_triton = has_triton - torch._inductor.scheduler.has_triton = has_triton \ No newline at end of file + torch._inductor.scheduler.has_triton = has_triton + + diff --git a/torch_npu/utils/_dynamo.py b/torch_npu/utils/_dynamo.py index 5915b8ed9c..2021197454 100644 --- a/torch_npu/utils/_dynamo.py +++ b/torch_npu/utils/_dynamo.py @@ -1,4 +1,5 @@ import inspect +import sys from typing import Dict, List import torch @@ -19,7 +20,7 @@ from torch_npu.dynamo import _get_global_npu_backend class NPUTorchCtxManagerClassVariable(TorchCtxManagerClassVariable): def call_function(self, tx, args, kwargs): - return NPUAutocastModeVariable.create(self.value, args, kwargs) + return NPUAutocastModeVariable.create(self.value, args, kwargs) class NPUAutocastModeVariable(AutocastModeVariable): @@ -106,6 +107,62 @@ def TensorVariable_call_method(self, tx, name, args, kwargs): return TensorVariable.call_method_raw(self, tx, name, args, kwargs) +class _InductorNpuRegistry: + _disabled_register = False + _has_inited = False + + @classmethod + def register_inductor_npu(cls): + if cls.has_initialized() or cls._disabled_register: + return + from torch_npu import _inductor + cls._has_inited = True + + @classmethod + def disable_register(cls): + cls._disabled_register = True + + @classmethod + def enable_register(cls): + cls._disabled_register = False + + @classmethod + def has_initialized(cls): + if cls._has_inited: + return True + # Maybe initialized by call `import torch_npu._inductor` manually. + if 'torch_npu._inductor' in sys.modules: + cls._has_inited = True + return cls._has_inited + + +def is_inductor_npu_initialized(): + return _InductorNpuRegistry.has_initialized() + + +def disable_register_inductor_npu(): + _InductorNpuRegistry.disable_register() + + +def enable_register_inductor_npu(): + _InductorNpuRegistry.enable_register() + + +def register_inductor_npu(): + _InductorNpuRegistry.register_inductor_npu() + + +def patch_inductor_wrapper(): + from torch import _TorchCompileInductorWrapper + src_call = _TorchCompileInductorWrapper.__call__ + + def new_call(self, model_, inputs_): + register_inductor_npu() + return src_call(self, model_, inputs_) + + _TorchCompileInductorWrapper.__call__ = new_call + + def patch_dynamo_optimize(): src_optimize = optimize @@ -137,4 +194,4 @@ def add_dynamo_methods(): TensorVariable.call_method_raw = TensorVariable.call_method TensorVariable.call_method = TensorVariable_call_method patch_dynamo_optimize() - + patch_inductor_wrapper() -- Gitee
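The _dynamo.py hunk above wires torch.compile's inductor wrapper to register the NPU inductor support lazily: the patched _TorchCompileInductorWrapper.__call__ calls register_inductor_npu(), which imports torch_npu._inductor only when the inductor backend is actually used. A minimal usage sketch of that intended flow, assuming an available NPU device and this patched torch_npu build (the function and tensor shapes below are illustrative only, not part of the patch):

import torch
import torch_npu  # noqa: F401  -- does not import torch_npu._inductor eagerly


def fn(x, y):
    # small pointwise + reduction workload, similar to the tests above
    return torch.sum(x * y, dim=-1)


x = torch.randn(8, 32, device="npu")
y = torch.randn(8, 32, device="npu")

# On the first call, dynamo hands the captured FX graph to the inductor backend;
# the patched wrapper runs register_inductor_npu() first, importing
# torch_npu._inductor on demand before compilation proceeds.
compiled_fn = torch.compile(fn, backend="inductor")
out = compiled_fn(x, y)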