diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 1b53000d490be181f3ba2e4ff696dc48f5a7a736..f40bfcfc6e6337e0d3010d05c1214442bf5ccb8a 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -21,6 +21,7 @@ #include "third_party/acl/inc/acl/acl_op_compiler.h" #include "third_party/acl/inc/acl/acl_rt.h" #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" +#include "torch_npu/csrc/framework/LazyInitAclops.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/toolkit/profiler/common/utils.h" #ifdef SUCCESS @@ -30,74 +31,29 @@ #undef FAILED #endif -#if defined(_MSC_VER) -#include -#define GetCurrentDirPath _getcwd -#define Mkdir(path, mode) _mkdir(path) -#elif defined(__unix__) +#include +#include +#include #include -#include -#include -#define GetCurrentDirPath getcwd -#define Mkdir(path, mode) mkdir(path, mode) -#else -#endif namespace { const uint32_t kMaxOpExecuteTimeOut = 547U; const size_t kMaxPathLen = 4096U; -void MakeCompileCacheDirAndSetOption() +void SignalHandler(int sig) { - char* compile_cache_mode_val = std::getenv("ACL_OP_COMPILER_CACHE_MODE"); - std::string compile_cache_mode = (compile_cache_mode_val == nullptr) ? std::string("enable") - : std::string(compile_cache_mode_val); - if (compile_cache_mode != "enable" && compile_cache_mode != "disable" && compile_cache_mode != "force") { - compile_cache_mode = std::string("enable"); - } - auto compile_mode = c10_npu::option::GetOption("ACL_OP_COMPILER_CACHE_MODE"); - if (!compile_mode.has_value() || compile_mode.value() == "") { - c10_npu::option::register_options::OptionRegister::GetInstance()->Set("ACL_OP_COMPILER_CACHE_MODE", compile_cache_mode); - } + std::cerr << "Caught signal " << sig << std::endl; - char* compile_cache_dir_val = std::getenv("ACL_OP_COMPILER_CACHE_DIR"); - if (compile_cache_dir_val != nullptr) { - std::string compile_cache_dir = std::string(compile_cache_dir_val); - // mode : 750 - auto ret = Mkdir(compile_cache_dir.c_str(), S_IRWXU | S_IRGRP | S_IXGRP); - if (ret == -1) { - if (errno != EEXIST) { - TORCH_NPU_WARN("make compile cache directory error: ", strerror(errno)); - return; - } - } - auto compile_dir = c10_npu::option::GetOption("ACL_OP_COMPILER_CACHE_DIR"); - if (!compile_dir.has_value() || compile_dir.value() == "") { - c10_npu::option::register_options::OptionRegister::GetInstance()->Set("ACL_OP_COMPILER_CACHE_DIR", compile_cache_dir); - } - } -} + void *array[10]; + size_t size = backtrace(array, 10); + char **symbols = backtrace_symbols(array, size); -void GetAndSetDefaultJitCompileByAcl() -{ - auto jit_compile = c10_npu::option::GetOption("jitCompile"); - if (jit_compile.has_value() && jit_compile.value() != "") { - return; + for (size_t i = 0; i < size; i++) { + std::cerr << symbols[i] << std::endl; } + free(symbols); - auto opt_size = at_npu::native::AclGetCompileoptSize(ACL_OP_JIT_COMPILE); - if (!opt_size.has_value()) { - ASCEND_LOGW("Get ACL JitCompile default value size failed, use PTA default value: True"); - return; - } - TORCH_CHECK(opt_size.value() != 0, "AclGetCompileoptSize opt_size.value() = 0 !", PTA_ERROR(ErrCode::ACL)); - char value_name[opt_size.value()]; - auto ret = at_npu::native::AclGetCompileopt(ACL_OP_JIT_COMPILE, value_name, opt_size.value()); - // Get func success but get value failed, throw error - TORCH_CHECK(ret == ACL_SUCCESS, "Get ACL JitCompile default value failed.", PTA_ERROR(ErrCode::ACL)); - std::string value_str(value_name); - c10_npu::option::SetOption("jitCompile", value_str); - ASCEND_LOGI("Get ACL JitCompile default value %s and set", value_str.c_str()); + std::_Exit(1); } void SetDefaultAllowInternalFromatDisable() @@ -111,25 +67,6 @@ void SetDefaultAllowInternalFromatDisable() ASCEND_LOGI("Set ALLOW_INTERNAL_FORMAT default value disable."); } -void SetHF32DefaultValue() { - // The default value of the flag used to control whether HF32 is allowed on conv is True. - // The default value of the flag used to control whether HF32 is allowed on matmul is True, - // but this flag defaults to False in PyTorch 1.12 and later. - - // When the flag of matmul is False, and the flag of conv is True, - // the value of option "ACL_ALLOW_HF32" should be set to "10"; - std::string allow_hf32 = "10"; - auto ret = at_npu::native::AclSetCompileopt(aclCompileOpt::ACL_ALLOW_HF32, allow_hf32.c_str()); - if (ret == ACL_SUCCESS) { - ASCEND_LOGI("Set ACL option ACL_ALLOW_HF32 default value to %s.", allow_hf32.c_str()); - } else if (ret == ACL_ERROR_INTERNAL_ERROR) { - // Used to solve version compatibility issues, when ASCEND have not been updated. - ASCEND_LOGW("Failed to set default value of ACL option ACL_ALLOW_HF32, which is unsupported by current version."); - } else { - TORCH_CHECK(0, "Failed to set compile option ACL_ALLOW_HF32, result = ", ret, ", set value ", allow_hf32, PTA_ERROR(ErrCode::ACL)); - } -} - #ifndef BUILD_LIBTORCH std::string GetTorchNpuFile() { PyObject* file_attr = nullptr; @@ -194,6 +131,7 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) if (init_flag_) { return INIT_SUCC; } + signal(SIGSEGV, SignalHandler); std::string json_path = GetAclConfigJsonPath(); const char *json_path_ptr = json_path == "" ? nullptr : json_path.c_str(); ASCEND_LOGD("get acl json path:%s.", json_path_ptr); @@ -228,7 +166,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) ASCEND_LOGW("Npu device %d has been set before global init.", device_id_); } - if (c10_npu::option::OptionsManager::CheckAclDumpDateEnable()) { const char *aclConfigPath = "acl.json"; NPU_CHECK_ERROR(aclmdlSetDump(aclConfigPath)); @@ -245,22 +182,12 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_SATURATION); } - // set ACL_PRECISION_MODE by SocVersion("allow_fp32_to_fp16" or "must_keep_origin_dtype"). - auto precision_mode = c10_npu::GetSocVersion() >= c10_npu::SocVersion::Ascend910B1 ? - "must_keep_origin_dtype" : "allow_fp32_to_fp16"; - NPU_CHECK_ERROR(at_npu::native::AclSetCompileopt(aclCompileOpt::ACL_PRECISION_MODE, precision_mode)); - - // set default compile cache mode and dir for users to improve op compile time - MakeCompileCacheDirAndSetOption(); - // set default jit_Compile value from Get acl defalut value - GetAndSetDefaultJitCompileByAcl(); + at_npu::aclops::SetJitCompileModeforAclnn(); // set default allow_internal_format value if (c10_npu::GetSocVersion() >= c10_npu::SocVersion::Ascend910_9391) { SetDefaultAllowInternalFromatDisable(); } - SetHF32DefaultValue(); - NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0)); NPU_CHECK_SUPPORTED_OR_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(kMaxOpExecuteTimeOut)); @@ -271,7 +198,7 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) const auto& in = iter.second; call_(in); } - + lazy_fn_.clear(); SetThreadAffinity(device_id_); diff --git a/torch_npu/csrc/framework/LazyInitAclops.cpp b/torch_npu/csrc/framework/LazyInitAclops.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c925beaa2585ca5ad45bcab96304a8aa33f65e9d --- /dev/null +++ b/torch_npu/csrc/framework/LazyInitAclops.cpp @@ -0,0 +1,164 @@ +#include "torch_npu/csrc/framework/LazyInitAclops.h" + +#include + +#include "torch_npu/csrc/core/npu/NPUException.h" +#include "torch_npu/csrc/core/npu/NpuVariables.h" +#include "torch_npu/csrc/core/npu/register/OptionRegister.h" +#include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" +#include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" +#include + + +#if defined(_MSC_VER) +#include +#define GetCurrentDirPath _getcwd +#define Mkdir(path, mode) _mkdir(path) +#elif defined(__unix__) +#include +#include +#include +#define GetCurrentDirPath getcwd +#define Mkdir(path, mode) mkdir(path, mode) +#else +#endif + +namespace at_npu { + namespace aclops { + + bool encounteredAclops = false; + + void SetHF32DefaultValue() + { + // The default value of the flag used to control whether HF32 is allowed on + // conv is True. The default value of the flag used to control whether HF32 + // is allowed on matmul is True, but this flag defaults to False in + // PyTorch 1.12 and later. + + // When the flag of matmul is False, and the flag of conv is True, + // the value of option "ACL_ALLOW_HF32" should be set to "10"; + std::string allow_hf32 = "10"; + auto ret = at_npu::native::AclSetCompileopt(aclCompileOpt::ACL_ALLOW_HF32, allow_hf32.c_str()); + if (ret == ACL_SUCCESS) { + ASCEND_LOGI("Set ACL option ACL_ALLOW_HF32 default value to %s.", allow_hf32.c_str()); + } else if (ret == ACL_ERROR_INTERNAL_ERROR) { + // Used to solve version compatibility issues, when ASCEND have not been + // updated. + ASCEND_LOGW( + "Failed to set default value of ACL option ACL_ALLOW_HF32, which is " + "unsupported by current version."); + } else { + TORCH_CHECK(0, "Failed to set compile option ACL_ALLOW_HF32, result = ", ret, ", set value ", allow_hf32, + PTA_ERROR(ErrCode::ACL)); + } + } + + // set default compile cache mode and dir to improve op compile time + void MakeCompileCacheDirAndSetOption() + { + char *compile_cache_mode_val = std::getenv("ACL_OP_COMPILER_CACHE_MODE"); + std::string compile_cache_mode = + (compile_cache_mode_val == nullptr) ? std::string("enable") : std::string(compile_cache_mode_val); + if (compile_cache_mode != "enable" && compile_cache_mode != "disable" && compile_cache_mode != "force") { + compile_cache_mode = std::string("enable"); + } + auto compile_mode = c10_npu::option::GetOption("ACL_OP_COMPILER_CACHE_MODE"); + if (!compile_mode.has_value() || compile_mode.value() == "") { + c10_npu::option::register_options::OptionRegister::GetInstance()->Set("ACL_OP_COMPILER_CACHE_MODE", + compile_cache_mode); + } + + char *compile_cache_dir_val = std::getenv("ACL_OP_COMPILER_CACHE_DIR"); + if (compile_cache_dir_val != nullptr) { + std::string compile_cache_dir = std::string(compile_cache_dir_val); + // mode : 750 + auto ret = Mkdir(compile_cache_dir.c_str(), S_IRWXU | S_IRGRP | S_IXGRP); + if (ret == -1) { + if (errno != EEXIST) { + TORCH_NPU_WARN("make compile cache directory error: ", strerror(errno)); + return; + } + } + auto compile_dir = c10_npu::option::GetOption("ACL_OP_COMPILER_CACHE_DIR"); + if (!compile_dir.has_value() || compile_dir.value() == "") { + c10_npu::option::register_options::OptionRegister::GetInstance()->Set("ACL_OP_COMPILER_CACHE_DIR", + compile_cache_dir); + } + } + } + + void SetJitCompileModeforAclnn() + { + auto jit_compile = c10_npu::option::GetOption("jitCompile"); + if (jit_compile.has_value() && jit_compile.value() != "") { + return; + } + c10_npu::option::SetOption("jitCompileInit", "disable"); + } + + // set default jit_Compile value from Get acl defalut value + void GetAndSetDefaultJitCompileByAcl() + { + auto jit_compile = c10_npu::option::GetOption("jitCompile"); + if (jit_compile.has_value() && jit_compile.value() != "") { + return; + } + + auto opt_size = at_npu::native::AclGetCompileoptSize(ACL_OP_JIT_COMPILE); + if (!opt_size.has_value()) { + ASCEND_LOGW( + "Get ACL JitCompile default value size failed, use PTA " + "default value: " + "True"); + return; + } + TORCH_CHECK(opt_size.value() != 0, "AclGetCompileoptSize opt_size.value() = 0 !", PTA_ERROR(ErrCode::ACL)); + char value_name[opt_size.value()]; + auto ret = at_npu::native::AclGetCompileopt(ACL_OP_JIT_COMPILE, value_name, opt_size.value()); + // Get func success but get value failed, throw error + TORCH_CHECK(ret == ACL_SUCCESS, "Get ACL JitCompile default value failed.", PTA_ERROR(ErrCode::ACL)); + std::string value_str(value_name); + c10_npu::option::SetOption("jitCompile", value_str); + ASCEND_LOGI("Get ACL JitCompile default value %s and set", value_str.c_str()); + } + + void SetPrecisionMode() + { + // set ACL_PRECISION_MODE by SocVersion("allow_fp32_to_fp16" or + // "must_keep_origin_dtype"). + auto precision_mode = c10_npu::GetSocVersion() >= c10_npu::SocVersion::Ascend910B1 ? "must_keep_origin_dtype" + : "allow_fp32_to_fp16"; + NPU_CHECK_ERROR(at_npu::native::AclSetCompileopt(aclCompileOpt::ACL_PRECISION_MODE, precision_mode)); + } + + void LazyInitAclopsCore() + { + SetPrecisionMode(); + SetHF32DefaultValue(); + MakeCompileCacheDirAndSetOption(); + GetAndSetDefaultJitCompileByAcl(); + } + + void LazyInitAclops() + { +#ifndef BUILD_LIBTORCH + PyThreadState *gilState = nullptr; + if (PyGILState_Check()) { + gilState = PyEval_SaveThread(); + } +#endif + RECORD_FUNCTION("LazyInitAclops", std::vector({})); + if (!encounteredAclops && c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { + LazyInitAclopsCore(); + encounteredAclops = true; + ASCEND_LOGI("Lazy Init Aclops Success.") + } +#ifndef BUILD_LIBTORCH + if (gilState) { + PyEval_RestoreThread(gilState); + } +#endif + } + + } // namespace aclops +} // namespace at_npu diff --git a/torch_npu/csrc/framework/LazyInitAclops.h b/torch_npu/csrc/framework/LazyInitAclops.h new file mode 100644 index 0000000000000000000000000000000000000000..cfd3db87210216207c6d47927a4373b603b34f9c --- /dev/null +++ b/torch_npu/csrc/framework/LazyInitAclops.h @@ -0,0 +1,13 @@ +#ifndef AT_NPU_ACOPLS_LAZYINITACLOPS_H_ +#define AT_NPU_ACOPLS_LAZYINITACLOPS_H_ + +namespace at_npu { +namespace aclops { + +void LazyInitAclops(); +void SetJitCompileModeforAclnn(); + +} // namespace aclops +} // namespace at_npu + +#endif // AT_NPU_ACOPLS_LAZYINITACLOPS_H_ \ No newline at end of file diff --git a/torch_npu/csrc/framework/OpCommand.cpp b/torch_npu/csrc/framework/OpCommand.cpp index 57e6fd4385bbaa41560cdf89a5d46f8651d8dd08..b6a82747bd15afc959271d573d8ec1f814c197dc 100644 --- a/torch_npu/csrc/framework/OpCommand.cpp +++ b/torch_npu/csrc/framework/OpCommand.cpp @@ -9,6 +9,7 @@ #include "torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h" #include "torch_npu/csrc/framework/utils/NpuUtils.h" #include "torch_npu/csrc/framework/utils/NpuStorageOffsetGuard.h" +#include "torch_npu/csrc/framework/LazyInitAclops.h" #include "torch_npu/csrc/aten/CustomFunctions.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" #ifndef BUILD_LIBTORCH @@ -134,6 +135,7 @@ OpCommand& OpCommand::Output( void OpCommand::Run() { aclCmd->SetEnginePriority(); const string &op_name = aclCmd->GetName(); + at_npu::aclops::LazyInitAclops(); #ifndef BUILD_LIBTORCH const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); #endif diff --git a/torch_npu/csrc/framework/OpParamMaker.cpp b/torch_npu/csrc/framework/OpParamMaker.cpp index 8a23f194cca72f7fd813626ea86e9ac3ce99e118..9a9bcde487277c4e35353eb1072e2d98265a6410 100644 --- a/torch_npu/csrc/framework/OpParamMaker.cpp +++ b/torch_npu/csrc/framework/OpParamMaker.cpp @@ -96,11 +96,12 @@ void OpCommandImpl::SetEnginePriority() } } -inline void SetDeterministicOption(bool deterministicAlgorithmsStatus) +inline void SetDeterministicOption(bool deterministicAlgorithmsStatus, bool isOpapi) { if (deterministicaclnn_oldstatus != deterministicAlgorithmsStatus) { - NPU_CHECK_ERROR( - AclSetCompileopt(aclCompileOpt::ACL_OP_DETERMINISTIC, deterministicAlgorithmsStatus ? "1" : "0")); + if (!isOpapi) { + NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_DETERMINISTIC, deterministicAlgorithmsStatus ? "1" : "0")); + } NPU_CHECK_ERROR( AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, deterministicAlgorithmsStatus ? 1 : 0)); NPU_CHECK_ERROR( @@ -112,14 +113,19 @@ inline void SetDeterministicOption(bool deterministicAlgorithmsStatus) } void SetDeterministic() +{ + SetDeterministic(true); +} + +void SetDeterministic(bool isOpapi) { auto deterministicAlgorithmsStatus = at::globalContext().deterministicAlgorithms(); - SetDeterministicOption(deterministicAlgorithmsStatus); + SetDeterministicOption(deterministicAlgorithmsStatus, isOpapi); } void SetDeterministicOps(bool deterministicAlgorithmsStatus) { - SetDeterministicOption(deterministicAlgorithmsStatus); + SetDeterministicOption(deterministicAlgorithmsStatus, true); } void OpCommandImpl::Run( @@ -177,7 +183,7 @@ aclError OpCommandImpl::InnerRun( auto inputSize = params.inBuffer.size(); auto outputSize = params.outBuffer.size(); // open the deterministicAlgorithms config - SetDeterministic(); + SetDeterministic(false); bool reset_flag = false; if (ForceJitCompileList::GetInstance().Inlist(name) && env::CheckJitDisable()) { NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, "enable")); @@ -271,20 +277,12 @@ aclError OpCommandImpl::InnerRunOpApi(const string &op_name, PROC_FUNC func) } // open the deterministicAlgorithms config SetDeterministic(); - bool reset_flag = false; - if (ForceJitCompileList::GetInstance().Inlist(op_name) && env::CheckJitDisable()) { - NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, "enable")); - reset_flag = true; - } int index = 0; do { ret = func(); OPS_CHECK_ERROR(ret, op_name.c_str()); index++; } while (NpuUtils::IsOomError(ret, index) && (index < NPU_MAX_OP_EXEC_TRY_NUM)); - if (reset_flag) { - NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, "disable")); - } return ret; } @@ -323,7 +321,7 @@ int ExecFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) ASCEND_LOGD("Op %s Run.", cur_paras->opType); aclError ret; // open the deterministicAlgorithms config - SetDeterministic(); + SetDeterministic(false); if (cur_paras->customHandler) { ASCEND_LOGD("Exec Op %s with custom handle", cur_paras->opType); try { diff --git a/torch_npu/csrc/framework/OpParamMaker.h b/torch_npu/csrc/framework/OpParamMaker.h index 56ec9cf9f687102b0cf7047813d5330c91f6976d..da29441a513d4e5e183d5a402a6945b37ff3d911 100644 --- a/torch_npu/csrc/framework/OpParamMaker.h +++ b/torch_npu/csrc/framework/OpParamMaker.h @@ -395,6 +395,7 @@ private: }; // class OpCommandImpls void SetDeterministic(); +void SetDeterministic(bool isOpapi); void SetDeterministicOps(bool deterministicAlgorithmsStatus); static bool deterministicaclnn_oldstatus = false; diff --git a/torch_npu/csrc/framework/interface/EnvVariables.cpp b/torch_npu/csrc/framework/interface/EnvVariables.cpp index 6c2bebe314f9ed6c53e02fa9b4290383b509ee19..7041ca3df426b9b503a0343a15bd3b6b569da8c3 100644 --- a/torch_npu/csrc/framework/interface/EnvVariables.cpp +++ b/torch_npu/csrc/framework/interface/EnvVariables.cpp @@ -52,6 +52,10 @@ REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { SET_OPTION_WITH_CACHE(isJitDisable, ("disable" == val) ? true : false); }) +REGISTER_OPTION_HOOK(jitCompileInit, [](const std::string &val) { + SET_OPTION_WITH_CACHE(isJitDisable, ("disable" == val) ? true : false); +}) + bool CheckJitDisable() { return GET_OPTION_WITH_CACHE(isJitDisable); }