From 411d9ea17bafae53ac35ce215606b26cb0f5edf9 Mon Sep 17 00:00:00 2001 From: czy1255959842 Date: Thu, 27 Feb 2025 15:21:22 +0800 Subject: [PATCH 1/3] 1 --- third_party/mstx/ms_tools_ext.h | 74 +++++++++++-- .../csrc/core/npu/NPUCachingAllocator.cpp | 26 ++++- .../csrc/core/npu/NPUWorkspaceAllocator.cpp | 7 ++ .../framework/interface/MstxInterface.cpp | 100 ++++++++++++++++-- .../csrc/framework/interface/MstxInterface.h | 19 +++- torch_npu/csrc/profiler/mstx_mgr.cpp | 83 ++++++++++++++- torch_npu/csrc/profiler/mstx_mgr.h | 18 ++-- torch_npu/csrc/profiler/npu_profiler.h | 2 +- 8 files changed, 293 insertions(+), 36 deletions(-) diff --git a/third_party/mstx/ms_tools_ext.h b/third_party/mstx/ms_tools_ext.h index ebeba40d51..6c4ff629c7 100644 --- a/third_party/mstx/ms_tools_ext.h +++ b/third_party/mstx/ms_tools_ext.h @@ -13,7 +13,61 @@ typedef uint64_t mstxRangeId; struct mstxDomainRegistration_st; typedef struct mstxDomainRegistration_st mstxDomainRegistration_t; -typedef mstxDomainRegistration_t* mstxDomainhandle_t; +typedef mstxDomainRegistration_t* mstxDomainHandle_t; + +struct mstxMemHeap_st; +typedef struct mstxMemHeap_st mstxMemHeap_t; +typedef mstxMemHeap_t* mstxMemHeapHandle_t; + +struct mstxMemRegion_st; +typedef struct mstxMemRegion_st mstxMemRegion_t; +typedef mstxMemRegion_t* mstxMemRegionHandle_t; + +typedef struct mstxMemVirtualRangeDesc_t { + uint32_t deviceId; + const void* ptr; + uint64_t size; +} mstxMemVirtualRangeDesc_t; + +typedef enum mstxMemHeapUsageType { + MSTX_MEM_HEAP_USAGE_TYPE_SUB_ALLOCATOR = 0, +} mstxMemHeapUsageType; + +typedef enum mstxMemType { + MSTX_MEM_TYPE_VIRTUAL_ADDRESS = 0, +} mstxMemType; + +typedef struct mstxMemHeapDesc_t { + mstxMemHeapUsageType usage; + mstxMemType type; + const void* typeSpecificDesc; +} mstxMemHeapDesc_t; + +typedef struct mstxMemRegionsRegisterBatch_t { + mstxMemHeapHandle_t heap; + mstxMemType regionType; + size_t regionCount; + const void* regionDescArray; + mstxMemRegionHandle_t* 
regionHandleArrayOut; +} mstxMemRegionsRegisterBatch_t; + +typedef enum mstxMemRegionRefType { + MSTX_MEM_REGION_REF_TYPE_POINTER = 0, + MSTX_MEM_REGION_REF_TYPE_HANDLE +} mstxMemRegionRefType; + +typedef struct mstxMemRegionRef_t { + mstxMemRegionRefType refType; + union { + const void* pointer; + mstxMemRegionHandle_t handle; + }; +} mstxMemRegionRef_t; + +typedef struct mstxMemRegionsUnregisterBatch_t { + size_t refCount; + const mstxMemRegionRef_t* refArray; +} mstxMemRegionsUnregisterBatch_t; ACL_FUNC_VISIBILITY void mstxMarkA(const char* message, aclrtStream stream); @@ -21,16 +75,24 @@ ACL_FUNC_VISIBILITY mstxRangeId mstxRangeStartA(const char* message, aclrtStream ACL_FUNC_VISIBILITY void mstxRangeEnd(mstxRangeId id); -ACL_FUNC_VISIBILITY mstxDomainhandle_t mstxDomainCreateA(const char* name); +ACL_FUNC_VISIBILITY mstxDomainHandle_t mstxDomainCreateA(const char* name); -ACL_FUNC_VISIBILITY void mstxDomainDestroy(mstxDomainhandle_t handle); +ACL_FUNC_VISIBILITY void mstxDomainDestroy(mstxDomainHandle_t handle); -ACL_FUNC_VISIBILITY void mstxDomainMarkA(mstxDomainhandle_t handle, const char* message, aclrtStream stream); +ACL_FUNC_VISIBILITY void mstxDomainMarkA(mstxDomainHandle_t handle, const char* message, aclrtStream stream); -ACL_FUNC_VISIBILITY mstxRangeId mstxDomainRangeStartA(mstxDomainhandle_t handle, const char* message, +ACL_FUNC_VISIBILITY mstxRangeId mstxDomainRangeStartA(mstxDomainHandle_t handle, const char* message, aclrtStream stream); -ACL_FUNC_VISIBILITY void mstxDomainRangeEnd(mstxDomainhandle_t handle, mstxRangeId id); +ACL_FUNC_VISIBILITY void mstxDomainRangeEnd(mstxDomainHandle_t handle, mstxRangeId id); + +ACL_FUNC_VISIBILITY mstxMemHeapHandle_t mstxMemHeapRegister(mstxDomainHandle_t domain, const mstxMemHeapDesc_t* desc); + +ACL_FUNC_VISIBILITY void mstxMemHeapUnregister(mstxDomainHandle_t domain, mstxMemHeapHandle_t heap); + +ACL_FUNC_VISIBILITY void mstxMemRegionsRegister(mstxDomainHandle_t domain, const 
mstxMemRegionsRegisterBatch_t* desc); + +ACL_FUNC_VISIBILITY void mstxMemRegionsUnregister(mstxDomainHandle_t domain, const mstxMemRegionsUnregisterBatch_t* desc); #ifdef __cplusplus } diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 7eab2cb01a..a0f0bc8832 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -1261,6 +1261,9 @@ class DeviceCachingAllocator { stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); #ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{block->device, block->ptr, block->size}; + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, &desc); torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), block->device, @@ -1320,6 +1323,8 @@ class DeviceCachingAllocator { stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); #ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, orig_block_ptr); torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), block->device, @@ -1669,7 +1674,11 @@ class DeviceCachingAllocator { for_each_selected_stat_type(stat_types, [&](size_t stat_type) { update_stat(stats.reserved_bytes[stat_type], mapped_range.size); }); - +#ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{to_map->device, mapped_range.ptr, mapped_range.size}; + 
torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &desc); +#endif record_trace( TraceEntry::SEGMENT_MAP, int64_t(mapped_range.ptr), @@ -2048,6 +2057,11 @@ class DeviceCachingAllocator { // p.block came from new, not cudaMalloc. It should not be nullptr here. TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr); +#ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{p.block->device, p.block->ptr, p.block->size}; + torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &desc); +#endif record_trace( TraceEntry::SEGMENT_ALLOC, int64_t(p.block->ptr), @@ -2165,7 +2179,10 @@ class DeviceCachingAllocator { if (block->size >= CachingAllocatorConfig::max_split_size()) update_stat(stats.oversize_segments, -1); - +#ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memHeapUnregister(msleaksDomain, block->ptr); +#endif ASCEND_LOGD("pta_memory acl_free: free_size = %zu", block->size); pool->blocks.erase(block); @@ -2223,7 +2240,10 @@ class DeviceCachingAllocator { for_each_selected_stat_type(stat_types, [&](size_t stat_type) { update_stat(stats.reserved_bytes[stat_type], -unmapped.size); }); - +#ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memHeapUnregister(msleaksDomain, block->ptr); +#endif record_trace( TraceEntry::SEGMENT_UNMAP, int64_t(unmapped.ptr), diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index 900576dcc2..9cc45eea89 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp 
+++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -54,6 +54,8 @@ public: NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeDeviceWithTimeout()); NPU_CHECK_ERROR(aclrtFree(block->data_ptr)); #ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, block->data_ptr); record_mem_size_decrement(block->size); const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { @@ -90,6 +92,9 @@ public: ASCEND_LOGD("NPUWorkspaceAllocator malloc by AclrtMallocAlign32: size=%zu", block->size); #ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{device, block->data_ptr, block->size}; + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, &desc); record_mem_size_increment(block->size); torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), @@ -132,6 +137,8 @@ public: ASCEND_LOGI("NPUWorkspaceAllocator free by aclrtFree: size=%zu", block_pair.second->size); NPU_CHECK_ERROR(aclrtFree(block_pair.second->data_ptr)); #ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, block_pair.second->data_ptr); record_mem_size_decrement(block_pair.second->size); const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { diff --git a/torch_npu/csrc/framework/interface/MstxInterface.cpp b/torch_npu/csrc/framework/interface/MstxInterface.cpp index 40ef6dcced..4024a63e27 100644 --- 
a/torch_npu/csrc/framework/interface/MstxInterface.cpp +++ b/torch_npu/csrc/framework/interface/MstxInterface.cpp @@ -24,6 +24,10 @@ LOAD_FUNCTION(mstxDomainDestroy) LOAD_FUNCTION(mstxDomainMarkA) LOAD_FUNCTION(mstxDomainRangeStartA) LOAD_FUNCTION(mstxDomainRangeEnd) +LOAD_FUNCTION(mstxMemHeapRegister) +LOAD_FUNCTION(mstxMemHeapUnregister) +LOAD_FUNCTION(mstxMemRegionsRegister) +LOAD_FUNCTION(mstxMemRegionsUnregister) // save python range id with cann mstx range id. // when mstx.range_end(id) is called, we can check if this id is invalid @@ -128,9 +132,9 @@ void MstxRangeEnd(int ptRangeId) g_rangeIdMap.erase(iter); } -mstxDomainhandle_t MstxDomainCreateA(const char* name) +mstxDomainHandle_t MstxDomainCreateA(const char* name) { - using MstxDomainCreateAFunc = mstxDomainhandle_t (*)(const char*); + using MstxDomainCreateAFunc = mstxDomainHandle_t (*)(const char*); static MstxDomainCreateAFunc func = nullptr; static bool noFuncFlag = false; if (noFuncFlag) { @@ -147,9 +151,9 @@ mstxDomainhandle_t MstxDomainCreateA(const char* name) return func(name); } -void MstxDomainDestroy(mstxDomainhandle_t handle) +void MstxDomainDestroy(mstxDomainHandle_t handle) { - using MstxDomainDestroyFunc = void (*)(mstxDomainhandle_t); + using MstxDomainDestroyFunc = void (*)(mstxDomainHandle_t); static MstxDomainDestroyFunc func = nullptr; static bool noFuncFlag = false; if (noFuncFlag) { @@ -166,9 +170,9 @@ void MstxDomainDestroy(mstxDomainhandle_t handle) func(handle); } -void MstxDomainMarkA(mstxDomainhandle_t handle, const char* message, aclrtStream stream) +void MstxDomainMarkA(mstxDomainHandle_t handle, const char* message, aclrtStream stream) { - using MstxDomainMarkAFunc = void (*)(mstxDomainhandle_t, const char*, aclrtStream); + using MstxDomainMarkAFunc = void (*)(mstxDomainHandle_t, const char*, aclrtStream); static MstxDomainMarkAFunc func = nullptr; static bool noFuncFlag = false; if (noFuncFlag) { @@ -185,9 +189,9 @@ void MstxDomainMarkA(mstxDomainhandle_t handle, const 
char* message, aclrtStream func(handle, message, stream); } -int MstxDomainRangeStartA(mstxDomainhandle_t handle, const char* message, aclrtStream stream, int ptRangeId) +int MstxDomainRangeStartA(mstxDomainHandle_t handle, const char* message, aclrtStream stream, int ptRangeId) { - using MstxDomainRangeStartAFunc = mstxRangeId (*)(mstxDomainhandle_t, const char*, aclrtStream); + using MstxDomainRangeStartAFunc = mstxRangeId (*)(mstxDomainHandle_t, const char*, aclrtStream); static MstxDomainRangeStartAFunc func = nullptr; static bool noFuncFlag = false; if (noFuncFlag) { @@ -207,9 +211,9 @@ int MstxDomainRangeStartA(mstxDomainhandle_t handle, const char* message, aclrtS return 0; } -void MstxDomainRangeEnd(mstxDomainhandle_t handle, int ptRangeId) +void MstxDomainRangeEnd(mstxDomainHandle_t handle, int ptRangeId) { - using MstxDomainRangeEndFunc = void (*)(mstxDomainhandle_t, mstxRangeId); + using MstxDomainRangeEndFunc = void (*)(mstxDomainHandle_t, mstxRangeId); static MstxDomainRangeEndFunc func = nullptr; static bool noFuncFlag = false; if (noFuncFlag) { @@ -233,5 +237,81 @@ void MstxDomainRangeEnd(mstxDomainhandle_t handle, int ptRangeId) g_rangeIdMap.erase(iter); } +mstxMemHeapHandle_t MstxMemHeapRegister(mstxDomainHandle_t domain, mstxMemHeapDesc_t const* desc) +{ + using MstxMemHeapRegisterFunc = mstxMemHeapHandle_t (*)(mstxDomainHandle_t, mstxMemHeapDesc_t const*); + static MstxMemHeapRegisterFunc func = nullptr; + static bool noFuncFlag = false; + if (noFuncFlag) { + return nullptr; + } + if (func == nullptr) { + func = (MstxMemHeapRegisterFunc)GET_FUNC(mstxMemHeapRegister); + if (func == nullptr) { + ASCEND_LOGW("Failed to get func mstxMemHeapRegister"); + noFuncFlag = true; + return nullptr; + } + } + return func(domain, desc); +} + +void MstxMemHeapUnregister(mstxDomainHandle_t domain, mstxMemHeapHandle_t heap) +{ + using MstxMemHeapUnregisterFunc = void (*)(mstxDomainHandle_t, mstxMemHeapHandle_t); + static MstxMemHeapUnregisterFunc func = nullptr; + 
static bool noFuncFlag = false; + if (noFuncFlag) { + return; + } + if (func == nullptr) { + func = (MstxMemHeapUnregisterFunc)GET_FUNC(mstxMemHeapUnregister); + if (func == nullptr) { + ASCEND_LOGW("Failed to get func mstxMemHeapUnregister"); + noFuncFlag = true; + return; + } + } + func(domain, heap); +} + +void MstxMemRegionsRegister(mstxDomainHandle_t domain, mstxMemRegionsRegisterBatch_t const* desc) +{ + using MstxMemRegionsRegisterFunc = void (*)(mstxDomainHandle_t, mstxMemRegionsRegisterBatch_t const*); + static MstxMemRegionsRegisterFunc func = nullptr; + static bool noFuncFlag = false; + if (noFuncFlag) { + return; + } + if (func == nullptr) { + func = (MstxMemRegionsRegisterFunc)GET_FUNC(mstxMemRegionsRegister); + if (func == nullptr) { + ASCEND_LOGW("Failed to get func mstxMemRegionsRegister"); + noFuncFlag = true; + return; + } + } + func(domain, desc); +} + +void MstxMemRegionsUnregister(mstxDomainHandle_t domain, mstxMemRegionsUnregisterBatch_t const* desc) +{ + using MstxMemRegionsUnregisterFunc = void (*)(mstxDomainHandle_t, mstxMemRegionsUnregisterBatch_t const*); + static MstxMemRegionsUnregisterFunc func = nullptr; + static bool noFuncFlag = false; + if (noFuncFlag) { + return; + } + if (func == nullptr) { + func = (MstxMemRegionsUnregisterFunc)GET_FUNC(mstxMemRegionsUnregister); + if (func == nullptr) { + ASCEND_LOGW("Failed to get func mstxMemRegionsUnregister"); + noFuncFlag = true; + return; + } + } + func(domain, desc); +} + } } \ No newline at end of file diff --git a/torch_npu/csrc/framework/interface/MstxInterface.h b/torch_npu/csrc/framework/interface/MstxInterface.h index 806e8e749b..ba0781f587 100644 --- a/torch_npu/csrc/framework/interface/MstxInterface.h +++ b/torch_npu/csrc/framework/interface/MstxInterface.h @@ -16,15 +16,24 @@ int MstxRangeStartA(const char* message, aclrtStream stream, int ptRangeId); void MstxRangeEnd(int ptRangeId); -mstxDomainhandle_t MstxDomainCreateA(const char* name); +mstxDomainHandle_t 
MstxDomainCreateA(const char* name); -void MstxDomainDestroy(mstxDomainhandle_t handle); +void MstxDomainDestroy(mstxDomainHandle_t handle); -void MstxDomainMarkA(mstxDomainhandle_t handle, const char* message, aclrtStream stream); +void MstxDomainMarkA(mstxDomainHandle_t handle, const char* message, aclrtStream stream); -int MstxDomainRangeStartA(mstxDomainhandle_t handle, const char* message, aclrtStream stream, int ptRangeId); +int MstxDomainRangeStartA(mstxDomainHandle_t handle, const char* message, aclrtStream stream, int ptRangeId); + +void MstxDomainRangeEnd(mstxDomainHandle_t handle, int ptRangeId); + +mstxMemHeapHandle_t MstxMemHeapRegister(mstxDomainHandle_t domain, const mstxMemHeapDesc_t* desc); + +void MstxMemHeapUnregister(mstxDomainHandle_t domain, mstxMemHeapHandle_t heap); + +void MstxMemRegionsRegister(mstxDomainHandle_t domain, const mstxMemRegionsRegisterBatch_t* desc); + +void MstxMemRegionsUnregister(mstxDomainHandle_t domain, const mstxMemRegionsUnregisterBatch_t* desc); -void MstxDomainRangeEnd(mstxDomainhandle_t handle, int ptRangeId); } } diff --git a/torch_npu/csrc/profiler/mstx_mgr.cpp b/torch_npu/csrc/profiler/mstx_mgr.cpp index 7ee7793e8c..0c7ff91d3e 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.cpp +++ b/torch_npu/csrc/profiler/mstx_mgr.cpp @@ -84,17 +84,20 @@ int MstxMgr::getRangeId() return ptRangeId_++; } -mstxDomainhandle_t MstxMgr::createDomain(const char* name) +mstxDomainHandle_t MstxMgr::createDomain(const char* name) { + if (!isMsleaksEnable() && !isMstxEnable()) { + return nullptr; + } return at_npu::native::MstxDomainCreateA(name); } -void MstxMgr::destroyDomain(mstxDomainhandle_t domain) +void MstxMgr::destroyDomain(mstxDomainHandle_t domain) { at_npu::native::MstxDomainDestroy(domain); } -void MstxMgr::domainMark(mstxDomainhandle_t domain, const char* message, const aclrtStream stream) +void MstxMgr::domainMark(mstxDomainHandle_t domain, const char* message, const aclrtStream stream) { if (!isMstxEnable()) { return; @@ 
-111,7 +114,7 @@ void MstxMgr::domainMark(mstxDomainhandle_t domain, const char* message, const a at_npu::native::OpCommand::RunOpApi("mstx_domain_mark_op", mark_call); } -int MstxMgr::domainRangeStart(mstxDomainhandle_t domain, const char* message, const aclrtStream stream) +int MstxMgr::domainRangeStart(mstxDomainHandle_t domain, const char* message, const aclrtStream stream) { if (!isMstxEnable()) { return 0; @@ -133,7 +136,7 @@ int MstxMgr::domainRangeStart(mstxDomainhandle_t domain, const char* message, co return id; } -void MstxMgr::domainRangeEnd(mstxDomainhandle_t domain, int ptRangeId) +void MstxMgr::domainRangeEnd(mstxDomainHandle_t domain, int ptRangeId) { if (!isMstxEnable() || ptRangeId == 0) { return; @@ -158,6 +161,76 @@ void MstxMgr::domainRangeEnd(mstxDomainhandle_t domain, int ptRangeId) at_npu::native::OpCommand::RunOpApi("mstx_domain_range_end_op", range_end_call); } +mstxMemHeapHandle_t MstxMgr::memHeapRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeDesc_t* desc) +{ + if (!isMsleaksEnable() || desc == nullptr) { + return nullptr; + } + mstxMemHeapDesc_t heapDesc{MSTX_MEM_HEAP_USAGE_TYPE_SUB_ALLOCATOR, MSTX_MEM_TYPE_VIRTUAL_ADDRESS, nullptr}; // set usage/type explicitly; they were previously left indeterminate + heapDesc.typeSpecificDesc = desc; // implicit conversion to const void*, no cast needed + return at_npu::native::MstxMemHeapRegister(domain, &heapDesc); +} + +void MstxMgr::memHeapUnregister(mstxDomainHandle_t domain, void* ptr) +{ + if (!isMsleaksEnable() || ptr == nullptr) { + return; + } + at_npu::native::MstxMemHeapUnregister(domain, reinterpret_cast<mstxMemHeapHandle_t>(ptr)); // NOTE(review): passes the raw heap address as the handle; confirm the tool keys heaps by address +} + +void MstxMgr::memRegionsRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeDesc_t* desc) +{ + if (!isMsleaksEnable() || desc == nullptr) { + return; + } + mstxMemRegionsRegisterBatch_t batch{}; // zero-init: heap, regionType and regionHandleArrayOut were previously indeterminate + batch.regionCount = 1; + batch.regionDescArray = desc; // implicit conversion to const void*, no cast needed + at_npu::native::MstxMemRegionsRegister(domain, &batch); +} + +void MstxMgr::memRegionsUnregister(mstxDomainHandle_t domain, void* ptr) +{ + if (!isMsleaksEnable() || ptr == nullptr) { + return; + } + mstxMemRegionsUnregisterBatch_t unregisterBatch; + unregisterBatch.refCount = 1; + 
mstxMemRegionRef_t regionRef[1] = {}; + regionRef[0].refType = MSTX_MEM_REGION_REF_TYPE_POINTER; + regionRef[0].pointer = ptr; + unregisterBatch.refArray = regionRef; + at_npu::native::MstxMemRegionsUnregister(domain, &unregisterBatch); +} + + +bool MstxMgr::isMsleaksEnable() +{ + static bool isEnable = isMsleaksEnableImpl(); + return isEnable; +} + +bool MstxMgr::isMsleaksEnableImpl() +{ + bool ret = false; + const char* envVal = std::getenv("LD_PRELOAD"); + if (envVal == nullptr) { + return ret; + } + static const std::string soName = "libascend_hal_hook.so"; + std::stringstream ss(envVal); + std::string path; + while (std::getline(ss, path, ':')) { + path = torch_npu::toolkit::profiler::Utils::RealPath(path); + if ((path.size() > soName.size()) && (path.substr(path.size() - soName.size()) == soName)) { + ret = true; + break; + } + } + return ret; +} + bool MstxMgr::isProfTxEnable() { return ProfilerMgr::GetInstance()->GetNpuTrace().load() && ProfilerMgr::GetInstance()->GetMsprofTx().load(); diff --git a/torch_npu/csrc/profiler/mstx_mgr.h b/torch_npu/csrc/profiler/mstx_mgr.h index c6fc6a5fe1..bea6f59bea 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.h +++ b/torch_npu/csrc/profiler/mstx_mgr.h @@ -12,6 +12,7 @@ namespace torch_npu { namespace profiler { const std::string DOMAIN_COMMUNICATION = "communication"; +const std::string DOMAIN_MSLEAKS = "msleaks"; class MstxMgr : public torch_npu::toolkit::profiler::Singleton { friend class torch_npu::toolkit::profiler::Singleton; @@ -22,11 +23,15 @@ public: bool isMstxEnable(); int getRangeId(); - mstxDomainhandle_t createDomain(const char* name); - void destroyDomain(mstxDomainhandle_t domain); - void domainMark(mstxDomainhandle_t domain, const char* message, const aclrtStream stream); - int domainRangeStart(mstxDomainhandle_t domain, const char* message, const aclrtStream stream); - void domainRangeEnd(mstxDomainhandle_t domain, int ptRangeId); + mstxDomainHandle_t createDomain(const char* name); + void 
destroyDomain(mstxDomainHandle_t domain); + void domainMark(mstxDomainHandle_t domain, const char* message, const aclrtStream stream); + int domainRangeStart(mstxDomainHandle_t domain, const char* message, const aclrtStream stream); + void domainRangeEnd(mstxDomainHandle_t domain, int ptRangeId); + mstxMemHeapHandle_t memHeapRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeDesc_t* desc); + void memHeapUnregister(mstxDomainHandle_t domain, void* ptr); + void memRegionsRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeDesc_t* desc); + void memRegionsUnregister(mstxDomainHandle_t domain, void* ptr); private: MstxMgr(); @@ -35,6 +40,8 @@ private: explicit MstxMgr(MstxMgr &&obj) = delete; MstxMgr& operator=(MstxMgr &&obj) = delete; + bool isMsleaksEnable(); + bool isMsleaksEnableImpl(); bool isProfTxEnable(); bool isMsptiTxEnable(); bool isMsptiTxEnableImpl(); @@ -43,6 +50,5 @@ private: std::unordered_set ptRangeIdsWithStream_; std::mutex mtx_; }; - } } // namespace torch_npu \ No newline at end of file diff --git a/torch_npu/csrc/profiler/npu_profiler.h b/torch_npu/csrc/profiler/npu_profiler.h index b58fa182a7..074a41440e 100644 --- a/torch_npu/csrc/profiler/npu_profiler.h +++ b/torch_npu/csrc/profiler/npu_profiler.h @@ -129,7 +129,7 @@ inline bool mstxEnable() struct MstxRange { int rangeId{0}; - mstxDomainhandle_t domainHandle{nullptr}; + mstxDomainHandle_t domainHandle{nullptr}; MstxRange(const std::string &message, aclrtStream stream, const std::string &domainName = "default") { if (!mstxEnable()) { -- Gitee From 467008c591731f1eb5e168a966ec59546bbfaf05 Mon Sep 17 00:00:00 2001 From: czy1255959842 Date: Mon, 3 Mar 2025 15:17:47 +0800 Subject: [PATCH 2/3] 1 --- torch_npu/csrc/profiler/mstx_mgr.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/profiler/mstx_mgr.cpp b/torch_npu/csrc/profiler/mstx_mgr.cpp index 0c7ff91d3e..671908257b 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.cpp +++ 
b/torch_npu/csrc/profiler/mstx_mgr.cpp @@ -218,7 +218,7 @@ bool MstxMgr::isMsleaksEnableImpl() if (envVal == nullptr) { return ret; } - static const std::string soName = "libascend_hal_hook.so"; + static const std::string soName = "libascend_kernel_hook.so"; std::stringstream ss(envVal); std::string path; while (std::getline(ss, path, ':')) { -- Gitee From 23b02cd3d6a2a2faa83f0847d42ef5b8f87eaad5 Mon Sep 17 00:00:00 2001 From: czy1255959842 Date: Tue, 18 Mar 2025 11:21:51 +0800 Subject: [PATCH 3/3] 1 --- .../csrc/core/npu/NPUCachingAllocator.cpp | 38 +++++++++---------- .../csrc/core/npu/NPUWorkspaceAllocator.cpp | 6 +-- torch_npu/csrc/profiler/mstx_mgr.cpp | 13 +++++-- torch_npu/csrc/profiler/mstx_mgr.h | 4 +- torch_npu/csrc/profiler/npu_profiler.h | 2 +- 5 files changed, 34 insertions(+), 29 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index a0f0bc8832..4160c62458 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -1261,21 +1261,19 @@ class DeviceCachingAllocator { stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); #ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - mstxMemVirtualRangeDesc_t desc{block->device, block->ptr, block->size}; - torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, &desc); - torch_npu::profiler::reportMemoryDataToNpuProfiler({ - static_cast(c10::DeviceType::PrivateUse1), - block->device, - static_cast(torch_npu::profiler::MemoryDataType::MEMORY_MALLOC), - allocator_type, - reinterpret_cast(block->ptr), - block->size, - stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, - stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, - stats.active_bytes[static_cast(StatType::AGGREGATE)].current, - 
reinterpret_cast(block->stream)} - ); + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{block->device, block->ptr, block->size}; + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, &desc); + torch_npu::profiler::reportMemoryDataToNpuProfiler({static_cast(c10::DeviceType::PrivateUse1), + block->device, + static_cast(torch_npu::profiler::MemoryDataType::MEMORY_MALLOC), + allocator_type, + reinterpret_cast(block->ptr), + block->size, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + stats.active_bytes[static_cast(StatType::AGGREGATE)].current, + reinterpret_cast(block->stream)}); #endif return block; @@ -1323,7 +1321,7 @@ class DeviceCachingAllocator { stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); #ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, orig_block_ptr); torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), @@ -1675,7 +1673,7 @@ class DeviceCachingAllocator { update_stat(stats.reserved_bytes[stat_type], mapped_range.size); }); #ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); mstxMemVirtualRangeDesc_t 
desc{to_map->device, mapped_range.ptr, mapped_range.size}; torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &desc); #endif @@ -2058,7 +2056,7 @@ class DeviceCachingAllocator { // p.block came from new, not cudaMalloc. It should not be nullptr here. TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr); #ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); mstxMemVirtualRangeDesc_t desc{p.block->device, p.block->ptr, p.block->size}; torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &desc); #endif @@ -2180,7 +2178,7 @@ class DeviceCachingAllocator { if (block->size >= CachingAllocatorConfig::max_split_size()) update_stat(stats.oversize_segments, -1); #ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); torch_npu::profiler::MstxMgr::GetInstance()->memHeapUnregister(msleaksDomain, block->ptr); #endif ASCEND_LOGD("pta_memory acl_free: free_size = %zu", block->size); @@ -2241,7 +2239,7 @@ class DeviceCachingAllocator { update_stat(stats.reserved_bytes[stat_type], -unmapped.size); }); #ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); torch_npu::profiler::MstxMgr::GetInstance()->memHeapUnregister(msleaksDomain, block->ptr); #endif record_trace( 
diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index 9cc45eea89..f4dcc8de6c 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -54,7 +54,7 @@ public: NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeDeviceWithTimeout()); NPU_CHECK_ERROR(aclrtFree(block->data_ptr)); #ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, block->data_ptr); record_mem_size_decrement(block->size); const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); @@ -92,7 +92,7 @@ public: ASCEND_LOGD("NPUWorkspaceAllocator malloc by AclrtMallocAlign32: size=%zu", block->size); #ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); mstxMemVirtualRangeDesc_t desc{device, block->data_ptr, block->size}; torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, &desc); record_mem_size_increment(block->size); @@ -137,7 +137,7 @@ public: ASCEND_LOGI("NPUWorkspaceAllocator free by aclrtFree: size=%zu", block_pair.second->size); NPU_CHECK_ERROR(aclrtFree(block_pair.second->data_ptr)); #ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxDomainHandle_t msleaksDomain = 
torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, block_pair.second->data_ptr); record_mem_size_decrement(block_pair.second->size); const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); diff --git a/torch_npu/csrc/profiler/mstx_mgr.cpp b/torch_npu/csrc/profiler/mstx_mgr.cpp index 671908257b..d276dd566b 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.cpp +++ b/torch_npu/csrc/profiler/mstx_mgr.cpp @@ -83,10 +83,17 @@ int MstxMgr::getRangeId() { return ptRangeId_++; } - -mstxDomainHandle_t MstxMgr::createDomain(const char* name) +mstxDomainHandle_t MstxMgr::createProfDomain(const char *name) +{ + if (!isMstxEnable()) + { + return nullptr; + } + return at_npu::native::MstxDomainCreateA(name); +} +mstxDomainHandle_t MstxMgr::createLeaksDomain(const char *name) { - if (!isMsleaksEnable() && !isMstxEnable()) { + if (!isMsleaksEnable()) { return nullptr; } return at_npu::native::MstxDomainCreateA(name); diff --git a/torch_npu/csrc/profiler/mstx_mgr.h b/torch_npu/csrc/profiler/mstx_mgr.h index bea6f59bea..d70f460c85 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.h +++ b/torch_npu/csrc/profiler/mstx_mgr.h @@ -22,8 +22,8 @@ public: void rangeEnd(int ptRangeId); bool isMstxEnable(); int getRangeId(); - - mstxDomainHandle_t createDomain(const char* name); + mstxDomainHandle_t createProfDomain(const char *name); + mstxDomainHandle_t createLeaksDomain(const char* name); void destroyDomain(mstxDomainHandle_t domain); void domainMark(mstxDomainHandle_t domain, const char* message, const aclrtStream stream); int domainRangeStart(mstxDomainHandle_t domain, const char* message, const aclrtStream stream); diff --git a/torch_npu/csrc/profiler/npu_profiler.h b/torch_npu/csrc/profiler/npu_profiler.h index 074a41440e..bab2bce613 100644 --- a/torch_npu/csrc/profiler/npu_profiler.h +++ 
b/torch_npu/csrc/profiler/npu_profiler.h @@ -137,7 +137,7 @@ struct MstxRange { } rangeId = MstxMgr::GetInstance()->getRangeId(); if (at_npu::native::IsSupportMstxDomainFunc()) { - domainHandle = MstxMgr::GetInstance()->createDomain(domainName.c_str()); + domainHandle = MstxMgr::GetInstance()->createProfDomain(domainName.c_str()); at_npu::native::MstxDomainRangeStartA(domainHandle, message.c_str(), stream, rangeId); } else { at_npu::native::MstxRangeStartA(message.c_str(), stream, rangeId); -- Gitee