diff --git a/third_party/mstx/ms_tools_ext.h b/third_party/mstx/ms_tools_ext.h index ebeba40d5175e8fa13c536cd2b61436b36d9d458..6c4ff629c7421f6a4c0fed7e7db70ac37b891b45 100644 --- a/third_party/mstx/ms_tools_ext.h +++ b/third_party/mstx/ms_tools_ext.h @@ -13,7 +13,61 @@ typedef uint64_t mstxRangeId; struct mstxDomainRegistration_st; typedef struct mstxDomainRegistration_st mstxDomainRegistration_t; -typedef mstxDomainRegistration_t* mstxDomainhandle_t; +typedef mstxDomainRegistration_t* mstxDomainHandle_t; + +struct mstxMemHeap_st; +typedef struct mstxMemHeap_st mstxMemHeap_t; +typedef mstxMemHeap_t* mstxMemHeapHandle_t; + +struct mstxMemRegion_st; +typedef struct mstxMemRegion_st mstxMemRegion_t; +typedef mstxMemRegion_t* mstxMemRegionHandle_t; + +typedef struct mstxMemVirtualRangeDesc_t { + uint32_t deviceId; + const void* ptr; + uint64_t size; +} mstxMemVirtualRangeDesc_t; + +typedef enum mstxMemHeapUsageType { + MSTX_MEM_HEAP_USAGE_TYPE_SUB_ALLOCATOR = 0, +} mstxMemHeapUsageType; + +typedef enum mstxMemType { + MSTX_MEM_TYPE_VIRTUAL_ADDRESS = 0, +} mstxMemType; + +typedef struct mstxMemHeapDesc_t { + mstxMemHeapUsageType usage; + mstxMemType type; + const void* typeSpecificDesc; +} mstxMemHeapDesc_t; + +typedef struct mstxMemRegionsRegisterBatch_t { + mstxMemHeapHandle_t heap; + mstxMemType regionType; + size_t regionCount; + const void* regionDescArray; + mstxMemRegionHandle_t* regionHandleArrayOut; +} mstxMemRegionsRegisterBatch_t; + +typedef enum mstxMemRegionRefType { + MSTX_MEM_REGION_REF_TYPE_POINTER = 0, + MSTX_MEM_REGION_REF_TYPE_HANDLE +} mstxMemRegionRefType; + +typedef struct mstxMemRegionRef_t { + mstxMemRegionRefType refType; + union { + const void* pointer; + mstxMemRegionHandle_t handle; + }; +} mstxMemRegionRef_t; + +typedef struct mstxMemRegionsUnregisterBatch_t { + size_t refCount; + const mstxMemRegionRef_t* refArray; +} mstxMemRegionsUnregisterBatch_t; ACL_FUNC_VISIBILITY void mstxMarkA(const char* message, aclrtStream stream); @@ -21,16 +75,24 @@ ACL_FUNC_VISIBILITY mstxRangeId mstxRangeStartA(const char* message, aclrtStream ACL_FUNC_VISIBILITY void mstxRangeEnd(mstxRangeId id); -ACL_FUNC_VISIBILITY mstxDomainhandle_t mstxDomainCreateA(const char* name); +ACL_FUNC_VISIBILITY mstxDomainHandle_t mstxDomainCreateA(const char* name); -ACL_FUNC_VISIBILITY void mstxDomainDestroy(mstxDomainhandle_t handle); +ACL_FUNC_VISIBILITY void mstxDomainDestroy(mstxDomainHandle_t handle); -ACL_FUNC_VISIBILITY void mstxDomainMarkA(mstxDomainhandle_t handle, const char* message, aclrtStream stream); +ACL_FUNC_VISIBILITY void mstxDomainMarkA(mstxDomainHandle_t handle, const char* message, aclrtStream stream); -ACL_FUNC_VISIBILITY mstxRangeId mstxDomainRangeStartA(mstxDomainhandle_t handle, const char* message, +ACL_FUNC_VISIBILITY mstxRangeId mstxDomainRangeStartA(mstxDomainHandle_t handle, const char* message, aclrtStream stream); -ACL_FUNC_VISIBILITY void mstxDomainRangeEnd(mstxDomainhandle_t handle, mstxRangeId id); +ACL_FUNC_VISIBILITY void mstxDomainRangeEnd(mstxDomainHandle_t handle, mstxRangeId id); + +ACL_FUNC_VISIBILITY mstxMemHeapHandle_t mstxMemHeapRegister(mstxDomainHandle_t domain, const mstxMemHeapDesc_t* desc); + +ACL_FUNC_VISIBILITY void mstxMemHeapUnregister(mstxDomainHandle_t domain, mstxMemHeapHandle_t heap); + +ACL_FUNC_VISIBILITY void mstxMemRegionsRegister(mstxDomainHandle_t domain, const mstxMemRegionsRegisterBatch_t* desc); + +ACL_FUNC_VISIBILITY void mstxMemRegionsUnregister(mstxDomainHandle_t domain, const mstxMemRegionsUnregisterBatch_t* desc); #ifdef __cplusplus } diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 7eab2cb01a6f10019e98f13cc1c76e76a64cff93..4160c62458e7ac74cd0ef2b4714625e239ba3ed2 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -1261,18 +1261,19 @@ class DeviceCachingAllocator { stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); #ifndef BUILD_LIBTORCH - torch_npu::profiler::reportMemoryDataToNpuProfiler({ - static_cast(c10::DeviceType::PrivateUse1), - block->device, - static_cast(torch_npu::profiler::MemoryDataType::MEMORY_MALLOC), - allocator_type, - reinterpret_cast(block->ptr), - block->size, - stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, - stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, - stats.active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream)} - ); + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{block->device, block->ptr, block->size}; + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, &desc); + torch_npu::profiler::reportMemoryDataToNpuProfiler({static_cast(c10::DeviceType::PrivateUse1), + block->device, + static_cast(torch_npu::profiler::MemoryDataType::MEMORY_MALLOC), + allocator_type, + reinterpret_cast(block->ptr), + block->size, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + stats.active_bytes[static_cast(StatType::AGGREGATE)].current, + reinterpret_cast(block->stream)}); #endif return block; @@ -1320,6 +1321,8 @@ class DeviceCachingAllocator { stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); #ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, orig_block_ptr); torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), block->device, @@ -1669,7 +1672,11 @@ class DeviceCachingAllocator { for_each_selected_stat_type(stat_types, [&](size_t stat_type) { update_stat(stats.reserved_bytes[stat_type], mapped_range.size); }); - +#ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{to_map->device, mapped_range.ptr, mapped_range.size}; + torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &desc); +#endif record_trace( TraceEntry::SEGMENT_MAP, int64_t(mapped_range.ptr), @@ -2048,6 +2055,11 @@ class DeviceCachingAllocator { // p.block came from new, not cudaMalloc. It should not be nullptr here. TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr); +#ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{p.block->device, p.block->ptr, p.block->size}; + torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &desc); +#endif record_trace( TraceEntry::SEGMENT_ALLOC, int64_t(p.block->ptr), @@ -2165,7 +2177,10 @@ class DeviceCachingAllocator { if (block->size >= CachingAllocatorConfig::max_split_size()) update_stat(stats.oversize_segments, -1); - +#ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memHeapUnregister(msleaksDomain, block->ptr); +#endif ASCEND_LOGD("pta_memory acl_free: free_size = %zu", block->size); pool->blocks.erase(block); @@ -2223,7 +2238,10 @@ class DeviceCachingAllocator { for_each_selected_stat_type(stat_types, [&](size_t stat_type) { update_stat(stats.reserved_bytes[stat_type], -unmapped.size); }); - +#ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memHeapUnregister(msleaksDomain, block->ptr); +#endif record_trace( TraceEntry::SEGMENT_UNMAP, int64_t(unmapped.ptr), diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index 900576dcc2fe739954d302a7684189f3dbbc06af..f4dcc8de6c1a914537bf96a27aa407c54357f9bd 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -54,6 +54,8 @@ public: NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeDeviceWithTimeout()); NPU_CHECK_ERROR(aclrtFree(block->data_ptr)); #ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, block->data_ptr); record_mem_size_decrement(block->size); const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { @@ -90,6 +92,9 @@ public: ASCEND_LOGD("NPUWorkspaceAllocator malloc by AclrtMallocAlign32: size=%zu", block->size); #ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{device, block->data_ptr, block->size}; + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, &desc); record_mem_size_increment(block->size); torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), @@ -132,6 +137,8 @@ public: ASCEND_LOGI("NPUWorkspaceAllocator free by aclrtFree: size=%zu", block_pair.second->size); NPU_CHECK_ERROR(aclrtFree(block_pair.second->data_ptr)); #ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, block_pair.second->data_ptr); record_mem_size_decrement(block_pair.second->size); const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { diff --git a/torch_npu/csrc/framework/interface/MstxInterface.cpp b/torch_npu/csrc/framework/interface/MstxInterface.cpp index 40ef6dcced3e929f1849cbee54c153e9dab53f5a..4024a63e270aaa410f27c6afbee3b2a2d75be840 100644 --- a/torch_npu/csrc/framework/interface/MstxInterface.cpp +++ b/torch_npu/csrc/framework/interface/MstxInterface.cpp @@ -24,6 +24,10 @@ LOAD_FUNCTION(mstxDomainDestroy) LOAD_FUNCTION(mstxDomainMarkA) LOAD_FUNCTION(mstxDomainRangeStartA) LOAD_FUNCTION(mstxDomainRangeEnd) +LOAD_FUNCTION(mstxMemHeapRegister) +LOAD_FUNCTION(mstxMemHeapUnregister) +LOAD_FUNCTION(mstxMemRegionsRegister) +LOAD_FUNCTION(mstxMemRegionsUnregister) // save python range id with cann mstx range id. // when mstx.range_end(id) is called, we can check if this id is invalid @@ -128,9 +132,9 @@ void MstxRangeEnd(int ptRangeId) g_rangeIdMap.erase(iter); } -mstxDomainhandle_t MstxDomainCreateA(const char* name) +mstxDomainHandle_t MstxDomainCreateA(const char* name) { - using MstxDomainCreateAFunc = mstxDomainhandle_t (*)(const char*); + using MstxDomainCreateAFunc = mstxDomainHandle_t (*)(const char*); static MstxDomainCreateAFunc func = nullptr; static bool noFuncFlag = false; if (noFuncFlag) { @@ -147,9 +151,9 @@ mstxDomainhandle_t MstxDomainCreateA(const char* name) return func(name); } -void MstxDomainDestroy(mstxDomainhandle_t handle) +void MstxDomainDestroy(mstxDomainHandle_t handle) { - using MstxDomainDestroyFunc = void (*)(mstxDomainhandle_t); + using MstxDomainDestroyFunc = void (*)(mstxDomainHandle_t); static MstxDomainDestroyFunc func = nullptr; static bool noFuncFlag = false; if (noFuncFlag) { @@ -166,9 +170,9 @@ void MstxDomainDestroy(mstxDomainhandle_t handle) func(handle); } -void MstxDomainMarkA(mstxDomainhandle_t handle, const char* message, aclrtStream stream) +void MstxDomainMarkA(mstxDomainHandle_t handle, const char* message, aclrtStream stream) { - using MstxDomainMarkAFunc = void (*)(mstxDomainhandle_t, const char*, aclrtStream); + using MstxDomainMarkAFunc = void (*)(mstxDomainHandle_t, const char*, aclrtStream); static MstxDomainMarkAFunc func = nullptr; static bool noFuncFlag = false; if (noFuncFlag) { @@ -185,9 +189,9 @@ void MstxDomainMarkA(mstxDomainhandle_t handle, const char* message, aclrtStream func(handle, message, stream); } -int MstxDomainRangeStartA(mstxDomainhandle_t handle, const char* message, aclrtStream stream, int ptRangeId) +int MstxDomainRangeStartA(mstxDomainHandle_t handle, const char* message, aclrtStream stream, int ptRangeId) { - using MstxDomainRangeStartAFunc = mstxRangeId (*)(mstxDomainhandle_t, const char*, aclrtStream); + using MstxDomainRangeStartAFunc = mstxRangeId (*)(mstxDomainHandle_t, const char*, aclrtStream); static MstxDomainRangeStartAFunc func = nullptr; static bool noFuncFlag = false; if (noFuncFlag) { @@ -207,9 +211,9 @@ int MstxDomainRangeStartA(mstxDomainhandle_t handle, const char* message, aclrtS return 0; } -void MstxDomainRangeEnd(mstxDomainhandle_t handle, int ptRangeId) +void MstxDomainRangeEnd(mstxDomainHandle_t handle, int ptRangeId) { - using MstxDomainRangeEndFunc = void (*)(mstxDomainhandle_t, mstxRangeId); + using MstxDomainRangeEndFunc = void (*)(mstxDomainHandle_t, mstxRangeId); static MstxDomainRangeEndFunc func = nullptr; static bool noFuncFlag = false; if (noFuncFlag) { @@ -233,5 +237,81 @@ void MstxDomainRangeEnd(mstxDomainhandle_t handle, int ptRangeId) g_rangeIdMap.erase(iter); } +mstxMemHeapHandle_t MstxMemHeapRegister(mstxDomainHandle_t domain, mstxMemHeapDesc_t const* desc) +{ + using MstxMemHeapRegisterFunc = mstxMemHeapHandle_t (*)(mstxDomainHandle_t, mstxMemHeapDesc_t const*); + static MstxMemHeapRegisterFunc func = nullptr; + static bool noFuncFlag = false; + if (noFuncFlag) { + return nullptr; + } + if (func == nullptr) { + func = (MstxMemHeapRegisterFunc)GET_FUNC(mstxMemHeapRegister); + if (func == nullptr) { + ASCEND_LOGW("Failed to get func mstxMemHeapRegister"); + noFuncFlag = true; + return nullptr; + } + } + return func(domain, desc); +} + +void MstxMemHeapUnregister(mstxDomainHandle_t domain, mstxMemHeapHandle_t heap) +{ + using MstxMemHeapUnregisterFunc = void (*)(mstxDomainHandle_t, mstxMemHeapHandle_t); + static MstxMemHeapUnregisterFunc func = nullptr; + static bool noFuncFlag = false; + if (noFuncFlag) { + return; + } + if (func == nullptr) { + func = (MstxMemHeapUnregisterFunc)GET_FUNC(mstxMemHeapUnregister); + if (func == nullptr) { + ASCEND_LOGW("Failed to get func mstxMemHeapUnregister"); + noFuncFlag = true; + return; + } + } + func(domain, heap); +} + +void MstxMemRegionsRegister(mstxDomainHandle_t domain, mstxMemRegionsRegisterBatch_t const* desc) +{ + using MstxMemRegionsRegisterFunc = void (*)(mstxDomainHandle_t, mstxMemRegionsRegisterBatch_t const*); + static MstxMemRegionsRegisterFunc func = nullptr; + static bool noFuncFlag = false; + if (noFuncFlag) { + return; + } + if (func == nullptr) { + func = (MstxMemRegionsRegisterFunc)GET_FUNC(mstxMemRegionsRegister); + if (func == nullptr) { + ASCEND_LOGW("Failed to get func mstxMemRegionsRegister"); + noFuncFlag = true; + return; + } + } + func(domain, desc); +} + +void MstxMemRegionsUnregister(mstxDomainHandle_t domain, mstxMemRegionsUnregisterBatch_t const* desc) +{ + using MstxMemRegionsUnregisterFunc = void (*)(mstxDomainHandle_t, mstxMemRegionsUnregisterBatch_t const*); + static MstxMemRegionsUnregisterFunc func = nullptr; + static bool noFuncFlag = false; + if (noFuncFlag) { + return; + } + if (func == nullptr) { + func = (MstxMemRegionsUnregisterFunc)GET_FUNC(mstxMemRegionsUnregister); + if (func == nullptr) { + ASCEND_LOGW("Failed to get func mstxMemRegionsUnregister"); + noFuncFlag = true; + return; + } + } + func(domain, desc); +} + } } \ No newline at end of file diff --git a/torch_npu/csrc/framework/interface/MstxInterface.h b/torch_npu/csrc/framework/interface/MstxInterface.h index 806e8e749b16752435acf89de64274948dd534c9..ba0781f587d9749403246e9f46c84cc0b4593fdc 100644 --- a/torch_npu/csrc/framework/interface/MstxInterface.h +++ b/torch_npu/csrc/framework/interface/MstxInterface.h @@ -16,15 +16,24 @@ int MstxRangeStartA(const char* message, aclrtStream stream, int ptRangeId); void MstxRangeEnd(int ptRangeId); -mstxDomainhandle_t MstxDomainCreateA(const char* name); +mstxDomainHandle_t MstxDomainCreateA(const char* name); -void MstxDomainDestroy(mstxDomainhandle_t handle); +void MstxDomainDestroy(mstxDomainHandle_t handle); -void MstxDomainMarkA(mstxDomainhandle_t handle, const char* message, aclrtStream stream); +void MstxDomainMarkA(mstxDomainHandle_t handle, const char* message, aclrtStream stream); -int MstxDomainRangeStartA(mstxDomainhandle_t handle, const char* message, aclrtStream stream, int ptRangeId); +int MstxDomainRangeStartA(mstxDomainHandle_t handle, const char* message, aclrtStream stream, int ptRangeId); + +void MstxDomainRangeEnd(mstxDomainHandle_t handle, int ptRangeId); + +mstxMemHeapHandle_t MstxMemHeapRegister(mstxDomainHandle_t domain, const mstxMemHeapDesc_t* desc); + +void MstxMemHeapUnregister(mstxDomainHandle_t domain, mstxMemHeapHandle_t heap); + +void MstxMemRegionsRegister(mstxDomainHandle_t domain, const mstxMemRegionsRegisterBatch_t* desc); + +void MstxMemRegionsUnregister(mstxDomainHandle_t domain, const mstxMemRegionsUnregisterBatch_t* desc); -void MstxDomainRangeEnd(mstxDomainhandle_t handle, int ptRangeId); } } diff --git a/torch_npu/csrc/profiler/mstx_mgr.cpp b/torch_npu/csrc/profiler/mstx_mgr.cpp index 7ee7793e8cf944ba61b17a977178c9e420655e4a..d276dd566b7198a10524a765da4288d1f16cbb5f 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.cpp +++ b/torch_npu/csrc/profiler/mstx_mgr.cpp @@ -83,18 +83,28 @@ int MstxMgr::getRangeId() { return ptRangeId_++; } - -mstxDomainhandle_t MstxMgr::createDomain(const char* name) +mstxDomainHandle_t MstxMgr::createProfDomain(const char *name) { + if (!isMstxEnable()) + { + return nullptr; + } + return at_npu::native::MstxDomainCreateA(name); +} +mstxDomainHandle_t MstxMgr::createLeaksDomain(const char *name) +{ + if (!isMsleaksEnable()) { + return nullptr; + } return at_npu::native::MstxDomainCreateA(name); } -void MstxMgr::destroyDomain(mstxDomainhandle_t domain) +void MstxMgr::destroyDomain(mstxDomainHandle_t domain) { at_npu::native::MstxDomainDestroy(domain); } -void MstxMgr::domainMark(mstxDomainhandle_t domain, const char* message, const aclrtStream stream) +void MstxMgr::domainMark(mstxDomainHandle_t domain, const char* message, const aclrtStream stream) { if (!isMstxEnable()) { return; @@ -111,7 +121,7 @@ void MstxMgr::domainMark(mstxDomainhandle_t domain, const char* message, const a at_npu::native::OpCommand::RunOpApi("mstx_domain_mark_op", mark_call); } -int MstxMgr::domainRangeStart(mstxDomainhandle_t domain, const char* message, const aclrtStream stream) +int MstxMgr::domainRangeStart(mstxDomainHandle_t domain, const char* message, const aclrtStream stream) { if (!isMstxEnable()) { return 0; @@ -133,7 +143,7 @@ int MstxMgr::domainRangeStart(mstxDomainhandle_t domain, const char* message, co return id; } -void MstxMgr::domainRangeEnd(mstxDomainhandle_t domain, int ptRangeId) +void MstxMgr::domainRangeEnd(mstxDomainHandle_t domain, int ptRangeId) { if (!isMstxEnable() || ptRangeId == 0) { return; @@ -158,6 +168,76 @@ void MstxMgr::domainRangeEnd(mstxDomainhandle_t domain, int ptRangeId) at_npu::native::OpCommand::RunOpApi("mstx_domain_range_end_op", range_end_call); } +mstxMemHeapHandle_t MstxMgr::memHeapRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeDesc_t* desc) +{ + if (!isMsleaksEnable() || desc==nullptr) { + return nullptr; + } + mstxMemHeapDesc_t heapDesc; + heapDesc.typeSpecificDesc = reinterpret_cast(desc); + return at_npu::native::MstxMemHeapRegister(domain, &heapDesc); +} + +void MstxMgr::memHeapUnregister(mstxDomainHandle_t domain, void* ptr) +{ + if (!isMsleaksEnable() || ptr == nullptr) { + return; + } + at_npu::native::MstxMemHeapUnregister(domain, reinterpret_cast(ptr)); +} + +void MstxMgr::memRegionsRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeDesc_t* desc) +{ + if (!isMsleaksEnable() || desc == nullptr) { + return; + } + mstxMemRegionsRegisterBatch_t batch; + batch.regionCount = 1; + batch.regionDescArray = reinterpret_cast(desc); + at_npu::native::MstxMemRegionsRegister(domain, &batch); +} + +void MstxMgr::memRegionsUnregister(mstxDomainHandle_t domain, void* ptr) +{ + if (!isMsleaksEnable() || ptr == nullptr) { + return; + } + mstxMemRegionsUnregisterBatch_t unregisterBatch; + unregisterBatch.refCount = 1; + mstxMemRegionRef_t regionRef[1] = {}; + regionRef[0].refType = MSTX_MEM_REGION_REF_TYPE_POINTER; + regionRef[0].pointer = ptr; + unregisterBatch.refArray = regionRef; + at_npu::native::MstxMemRegionsUnregister(domain, &unregisterBatch); +} + + +bool MstxMgr::isMsleaksEnable() +{ + static bool isEnable = isMsleaksEnableImpl(); + return isEnable; +} + +bool MstxMgr::isMsleaksEnableImpl() +{ + bool ret = false; + const char* envVal = std::getenv("LD_PRELOAD"); + if (envVal == nullptr) { + return ret; + } + static const std::string soName = "libascend_kernel_hook.so"; + std::stringstream ss(envVal); + std::string path; + while (std::getline(ss, path, ':')) { + path = torch_npu::toolkit::profiler::Utils::RealPath(path); + if ((path.size() > soName.size()) && (path.substr(path.size() - soName.size()) == soName)) { + ret = true; + break; + } + } + return ret; +} + bool MstxMgr::isProfTxEnable() { return ProfilerMgr::GetInstance()->GetNpuTrace().load() && ProfilerMgr::GetInstance()->GetMsprofTx().load(); diff --git a/torch_npu/csrc/profiler/mstx_mgr.h b/torch_npu/csrc/profiler/mstx_mgr.h index c6fc6a5fe1856fbb0b94bcbb46e4323866036c21..d70f460c85bb50aefc6fdcafed0123d0357e06a8 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.h +++ b/torch_npu/csrc/profiler/mstx_mgr.h @@ -12,6 +12,7 @@ namespace torch_npu { namespace profiler { const std::string DOMAIN_COMMUNICATION = "communication"; +const std::string DOMAIN_MSLEAKS = "msleaks"; class MstxMgr : public torch_npu::toolkit::profiler::Singleton { friend class torch_npu::toolkit::profiler::Singleton; @@ -21,12 +22,16 @@ public: void rangeEnd(int ptRangeId); bool isMstxEnable(); int getRangeId(); - - mstxDomainhandle_t createDomain(const char* name); - void destroyDomain(mstxDomainhandle_t domain); - void domainMark(mstxDomainhandle_t domain, const char* message, const aclrtStream stream); - int domainRangeStart(mstxDomainhandle_t domain, const char* message, const aclrtStream stream); - void domainRangeEnd(mstxDomainhandle_t domain, int ptRangeId); + mstxDomainHandle_t createProfDomain(const char *name); + mstxDomainHandle_t createLeaksDomain(const char* name); + void destroyDomain(mstxDomainHandle_t domain); + void domainMark(mstxDomainHandle_t domain, const char* message, const aclrtStream stream); + int domainRangeStart(mstxDomainHandle_t domain, const char* message, const aclrtStream stream); + void domainRangeEnd(mstxDomainHandle_t domain, int ptRangeId); + mstxMemHeapHandle_t memHeapRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeDesc_t* desc); + void memHeapUnregister(mstxDomainHandle_t domain, void* ptr); + void memRegionsRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeDesc_t* desc); + void memRegionsUnregister(mstxDomainHandle_t domain, void* ptr); private: MstxMgr(); @@ -35,6 +40,8 @@ private: explicit MstxMgr(MstxMgr &&obj) = delete; MstxMgr& operator=(MstxMgr &&obj) = delete; + bool isMsleaksEnable(); + bool isMsleaksEnableImpl(); bool isProfTxEnable(); bool isMsptiTxEnable(); bool isMsptiTxEnableImpl(); @@ -43,6 +50,5 @@ private: std::unordered_set ptRangeIdsWithStream_; std::mutex mtx_; }; - } } // namespace torch_npu \ No newline at end of file diff --git a/torch_npu/csrc/profiler/npu_profiler.h b/torch_npu/csrc/profiler/npu_profiler.h index b58fa182a73659f0761343f5a646baa5e339b3c8..bab2bce61307987e573fc9dd44b8ac7f19086a1a 100644 --- a/torch_npu/csrc/profiler/npu_profiler.h +++ b/torch_npu/csrc/profiler/npu_profiler.h @@ -129,7 +129,7 @@ inline bool mstxEnable() struct MstxRange { int rangeId{0}; - mstxDomainhandle_t domainHandle{nullptr}; + mstxDomainHandle_t domainHandle{nullptr}; MstxRange(const std::string &message, aclrtStream stream, const std::string &domainName = "default") { if (!mstxEnable()) { @@ -137,7 +137,7 @@ struct MstxRange { } rangeId = MstxMgr::GetInstance()->getRangeId(); if (at_npu::native::IsSupportMstxDomainFunc()) { - domainHandle = MstxMgr::GetInstance()->createDomain(domainName.c_str()); + domainHandle = MstxMgr::GetInstance()->createProfDomain(domainName.c_str()); at_npu::native::MstxDomainRangeStartA(domainHandle, message.c_str(), stream, rangeId); } else { at_npu::native::MstxRangeStartA(message.c_str(), stream, rangeId);