diff --git a/torch_npu/profiler/analysis/npu_profiler.py b/torch_npu/profiler/analysis/npu_profiler.py
index 5689b6ec2a5c7c57b6458583804ab85fe7333570..b72d28992f5d287c7f5d811c1eadbf2de0d232a6 100644
--- a/torch_npu/profiler/analysis/npu_profiler.py
+++ b/torch_npu/profiler/analysis/npu_profiler.py
@@ -10,9 +10,20 @@ from ...utils.path_manager import PathManager
 
 
 class NpuProfiler:
-
     @classmethod
     def analyse(cls, input_path: str, analysis_type: str = Constant.TENSORBOARD_TRACE_HANDLER, output_path: str = None,
+                 **kwargs):
+        """Run ``_analyse`` in a freshly spawned child process and block until that process exits.
+        Parsing forks its worker processes for speed, but forking from a multi-threaded process may deadlock, so
+        a clean spawned process acts as their common parent. The child's exit code is not inspected; returns None."""
+        mp = multiprocessing.get_context("spawn")  # "spawn" starts a fresh interpreter instead of forking this one
+        p = mp.Process(target=NpuProfiler._analyse, args=(input_path, analysis_type, output_path),
+                                    kwargs=kwargs)
+        p.start()
+        p.join()  # wait for parsing to finish before returning to the caller
+
+    @classmethod
+    def _analyse(cls, input_path: str, analysis_type: str = Constant.TENSORBOARD_TRACE_HANDLER, output_path: str = None,
                 **kwargs):
         input_path = ProfilerPathManager.get_realpath(input_path)
         cls._check_input_path(input_path)