From a2583035195695a64855aed62eda0bec6278258c Mon Sep 17 00:00:00 2001 From: lijian Date: Tue, 23 Sep 2025 17:17:26 +0800 Subject: [PATCH] [perf] whisperx vad perf improve bind core --- ACL_PyTorch/built-in/audio/whisperx/README.md | 53 ++++++++++++++----- .../built-in/audio/whisperx/check_numa.sh | 24 +++++++++ .../audio/whisperx/patches/vad_model.patch | 32 ++++++++--- 3 files changed, 88 insertions(+), 21 deletions(-) create mode 100644 ACL_PyTorch/built-in/audio/whisperx/check_numa.sh diff --git a/ACL_PyTorch/built-in/audio/whisperx/README.md b/ACL_PyTorch/built-in/audio/whisperx/README.md index 4300502578..4c152b7cee 100644 --- a/ACL_PyTorch/built-in/audio/whisperx/README.md +++ b/ACL_PyTorch/built-in/audio/whisperx/README.md @@ -71,6 +71,7 @@ cd .. ```text 📁 whisper/ +├── check_numa.sh ├── audio.mp3 ├── infer.py ├── modeling_whisper.py @@ -90,19 +91,45 @@ cd .. | |── 📁 speech_fsmn_vad_zh-cn-16k-common-pytorch ``` -## 开始推理 -```SHELL -# 1. 激活环境变量 -source /usr/local/Ascend/ascend-toolkit/set_env.sh # 具体路径根据你自己的情况修改 -# 2. 指定使用NPU ID,默认为0 -export ASCEND_RT_VISIBLE_DEVICES=0 -# 3. 给funasr和torchaudio打补丁 -cd patches -python3 patch_apply.py -cd .. -# 4. 开始推理 -python3 infer.py --whisper_model_path ./weight/Whisper-large-v3/large-v3.pt -``` +## 模型推理 +1. 激活环境变量 + ```SHELL + source /usr/local/Ascend/ascend-toolkit/set_env.sh # 具体路径根据你自己的情况修改 + # 提升性能相关环境变量 + export TASK_QUEUE_ENABLE=2 + export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True' + export HOST_CACHE_CAPACITY=20 + export ASCEND_ENHANCE_ENABLE=1 + ``` + +2. 指定使用NPU ID,默认为0 + ```SHELL + export ASCEND_RT_VISIBLE_DEVICES=0 + ``` +3. 给funasr和torchaudio打补丁 + ```SHELL + cd patches + python3 patch_apply.py + cd .. + ``` +4. 使能绑核,进一步提升性能 + ```SHELL + export CPU_AFFINITY_CONF=1 + apt-get update + apt-get install numactl + # 在容器外执行脚本查看NPU id对应的NUMA node和cpu + bash check_numa.sh + ``` + 回显如下: + ```SHELL + ... + >>>>设备 0 对应 NUMA 节点: 6, NUMA node6 CPU(s): 192-223 + ... + ``` +5. 
开始推理, 根据实际查询到的核数配置,比如 + ```SHELL + taskset -c 192-223 python3 infer.py --whisper_model_path ./weight/Whisper-large-v3/large-v3.pt + ``` infer.py推理参数: * --whisper_model_path:whisper模型权重路径,默认为"./weight/Whisper-large-v3/large-v3.pt" * --vad_model_path:vad模型权重路径,默认为"./weight/speech_fsmn_vad_zh-cn-16k-common-pytorch" diff --git a/ACL_PyTorch/built-in/audio/whisperx/check_numa.sh b/ACL_PyTorch/built-in/audio/whisperx/check_numa.sh new file mode 100644 index 0000000000..094624afc9 --- /dev/null +++ b/ACL_PyTorch/built-in/audio/whisperx/check_numa.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# 查询NPU设备信息并获取第四列的PCI地址 +pci_addresses=$(npu-smi info | grep 0000 | awk '{print $4}') + +if [ -z "$pci_addresses" ]; then + echo "未找到匹配的NPU设备信息" + exit 1 +fi + +echo "找到以下NPU设备PCI地址:" +echo "$pci_addresses" + +# 为每个PCI地址查询NUMA节点信息 +i=0 +for addr in $pci_addresses; do + echo "查询PCI地址 $addr 的NUMA信息" + + # 使用lspci获取节点信息 + node_info=$(lspci -vvv -s $addr | grep node | awk -F ':' '{print $2}' | awk '{print $1}') + numa_info=$(lscpu | grep "node${node_info}") + echo ">>>>设备 $i 对应 NUMA 节点: $node_info, $numa_info" + i=$((i+1)) +done \ No newline at end of file diff --git a/ACL_PyTorch/built-in/audio/whisperx/patches/vad_model.patch b/ACL_PyTorch/built-in/audio/whisperx/patches/vad_model.patch index 0e41152c38..15a60dd99b 100644 --- a/ACL_PyTorch/built-in/audio/whisperx/patches/vad_model.patch +++ b/ACL_PyTorch/built-in/audio/whisperx/patches/vad_model.patch @@ -1,16 +1,32 @@ ---- model.py 2025-09-09 14:29:16 -+++ model_modified.py 2025-09-09 14:43:59 -@@ -336,7 +336,7 @@ +--- model.py 2025-09-22 19:07:02 ++++ model_modified.py 2025-09-23 09:23:30 +@@ -336,19 +336,26 @@ (cache["stats"].data_buf_all, cache["stats"].waveform[0]) ) - waveform_numpy = cache["stats"].waveform.numpy() -+ waveform_numpy = cache["stats"].waveform.cpu().numpy() +- +- offsets = np.arange(0, waveform_numpy.shape[1] - frame_sample_length + 1, frame_shift_length) +- frames = waveform_numpy[0, offsets[:, np.newaxis] + 
np.arange(frame_sample_length)]
++    waveform = cache["stats"].waveform
 
-            offsets = np.arange(0, waveform_numpy.shape[1] - frame_sample_length + 1, frame_shift_length)
-            frames = waveform_numpy[0, offsets[:, np.newaxis] + np.arange(frame_sample_length)]
-@@ -348,7 +348,8 @@
+-            decibel_numpy = 10 * np.log10(np.sum(np.square(frames), axis=1) + 0.000001)
+-            decibel_numpy = decibel_numpy.tolist()
++            offsets = torch.arange(
++                0, waveform.shape[1] - frame_sample_length + 1, frame_shift_length,
++                device=waveform.device
++            )
 
+-            cache["stats"].decibel.extend(decibel_numpy)
++            indices = offsets.unsqueeze(1) + torch.arange(frame_sample_length, device=waveform.device)
++            frames = waveform[0].index_select(0, indices.view(-1)).view(offsets.size(0), frame_sample_length)
+
++            decibel = 10 * torch.log10(torch.sum(frames ** 2, dim=1) + 1e-6)
++
++            if isinstance(cache["stats"].decibel, torch.Tensor):
++                cache["stats"].decibel = torch.cat([cache["stats"].decibel, decibel])
++            else:
++                cache["stats"].decibel.extend(decibel.cpu().tolist())
 
     def ComputeScores(self, feats: torch.Tensor, cache: dict = {}) -> None:
-        scores = self.encoder(feats, cache=cache["encoder"]).to("cpu")  # return B * T * D
@@ -19,7 +35,7 @@
         assert (
             scores.shape[1] == feats.shape[1]
         ), "The shape between feats and scores does not match"
-@@ -688,7 +689,7 @@
+@@ -688,7 +695,7 @@
         meta_data["load_data"] = f"{time2 - time1:0.3f}"
 
         assert len(audio_sample_list) == 1, "batch_size must be set 1"
-- 
Gitee