diff --git a/ACL_PyTorch/built-in/ocr/MinerU/README.md b/ACL_PyTorch/built-in/ocr/MinerU/README.md
index af345076b8c1e53ee78a9d3e24d245f2947f81cf..00f78f10c41d99fc5ada514d2fbe0c1174acc213 100644
--- a/ACL_PyTorch/built-in/ocr/MinerU/README.md
+++ b/ACL_PyTorch/built-in/ocr/MinerU/README.md
@@ -44,6 +44,8 @@ MinerU是由上海人工智能实验室OpenDataLab团队开发的开源文档解
 1. Obtain the `Pytorch` source code
 
    ```
+   git clone https://gitee.com/ascend/ModelZoo-PyTorch.git
+   cd ModelZoo-PyTorch/ACL_PyTorch/built-in/ocr/MinerU
   git clone https://github.com/opendatalab/MinerU.git
   cd MinerU
   git reset --hard de41fa58590263e43b783fe224b6d07cae290a33
@@ -71,13 +73,15 @@
 3. Patch the third-party libraries

    Go to the installation path of the third-party libraries (by default `source_path = /usr/local/lib/python3.11/site-packages`) and apply `ultralytics.patch` and `doclayout_yolo.patch` from the (user-defined) working directory `workdir`:
   ```
-  source_path=/usr/local/lib/python3.11/site-packages
+  workdir=$(pwd)
+  source_path=$(pip show ultralytics | grep Location | awk '{print $2}')
   cd ${source_path}/ultralytics
-  patch -p2 < ${workdir}/ultralytics.patch
+  patch -p1 < ${workdir}/ultralytics.patch
   cd ${source_path}/doclayout_yolo
-  patch -p2 < ${workdir}/doclayout_yolo.patch
-  cd ${workdir}
-  patch -p0 < mfr_encoder_mhsa.patch
+  patch -p1 < ${workdir}/doclayout_yolo.patch
+  cd ${workdir}/MinerU
+  git apply ../mineru.patch
+  cd ..
   ```
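+
+   After the patches are applied, a quick sanity check can confirm they took effect. The probe below is a minimal sketch, not part of the official workflow: it assumes `doclayout_yolo` keeps ultralytics' `BasePredictor` class name, and it relies on the fact that both patches keep the original `preprocess` method under the new name `_preprocess`.
+   ```
+   # check_patches.py -- hypothetical helper, not shipped with this repo
+   from ultralytics.engine.predictor import BasePredictor
+   from doclayout_yolo.engine.predictor import BasePredictor as DLBasePredictor
+
+   # The patches rename the original preprocess to _preprocess, so its
+   # presence is a reliable sign that the patched sources are in use.
+   assert hasattr(BasePredictor, "_preprocess"), "ultralytics.patch not applied"
+   assert hasattr(DLBasePredictor, "_preprocess"), "doclayout_yolo.patch not applied"
+   print("patches applied")
+   ```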
 ## Obtain the weights
@@ -155,33 +159,77 @@
 python3 infer.py --data_path=OmniDocBench_dataset --model_source=local
 
 1. Organize the inference results
 
-   Place the markdown files from the parsing result folders into a single directory; in this example, all markdown files are stored in the results_md folder under the OmniDocBench_dataset directory
+   Place the markdown files from the parsing result folders into a single directory; in this example, all markdown files are stored in the `end2end` folder under the OmniDocBench_dataset directory
   ```
-  cp OmniDocBench_dataset/output/*/auto/*.md OmniDocBench_dataset/results_md/
+  cp OmniDocBench_dataset/output/*/auto/*.md OmniDocBench_dataset/end2end/
   ```
 2. Obtain the evaluation source code and set up the environment
+
+  - Install the OmniDocBench base environment
   ```
   git clone https://github.com/opendatalab/OmniDocBench.git
   cd OmniDocBench
-  git reset --hard dc96d812d219960773399c02ae8f89e4706120d4
+  git reset --hard 523fd1d529c3e9d0088c662e983aa70fb9585c9a
   conda create -n omnidocbench python=3.10
   conda activate omnidocbench
   pip install -r requirements.txt
   ```
+  - The formula accuracy metric CDM requires additional environment setup
+
+    step 1. Install Node.js
+    ```
+    wget https://nodejs.org/dist/v16.13.1/node-v16.13.1-linux-arm64.tar.xz
+    tar -xf node-v16.13.1-linux-arm64.tar.xz
+    mkdir -p /usr/local/nodejs
+    mv node-v16.13.1-linux-arm64/* /usr/local/nodejs/
+    ln -s /usr/local/nodejs/bin/node /usr/local/bin
+    ln -s /usr/local/nodejs/bin/npm /usr/local/bin
+    node -v
+    ```
+
+    step 2. Install ImageMagick
+    ```
+    git clone https://github.com/ImageMagick/ImageMagick.git ImageMagick-7.1.2
+    cd ImageMagick-7.1.2
+    apt-get update && apt-get install -y libpng-dev zlib1g-dev
+    apt-get install -y ghostscript
+    ./configure
+    make
+    sudo make install
+    sudo ldconfig /usr/local/lib
+    convert --version
+    ```
+
+    step 3. Install pdflatex
+    ```
+    sudo apt-get install texlive-full
+    ```
+
+    step 4. Install the Python requirements
+    ```
+    pip install -r metrics/cdm/requirements.txt
+    ```
+
 3. Modify the evaluation configuration

    Modify the config file of the `OmniDocBench` evaluation code. Specifically, we use the end-to-end evaluation configuration: in configs/end2end.yaml, set the ground_truth data_path to the path of the downloaded OmniDocBench.json, and set the prediction data_path to the folder containing the organized inference results, as follows:
   ```
   # ----- The part below needs to be modified -----
+  display_formula:
+    metric:
+      - Edit_dist
+      - CDM  ### once the CDM environment is installed, CDM can be enabled here and computed directly
+      - CDM_plain
+  ...
   dataset:
     dataset_name: end2end_dataset
     ground_truth:
       data_path: ../OmniDocBench_dataset/OmniDocBench.json
     prediction:
-      data_path: ../OmniDocBench_dataset/results_md
+      data_path: ../OmniDocBench_dataset/end2end
   ```
 4. Accuracy measurement
@@ -190,10 +237,18 @@
   ```
   python pdf_validation.py --config ./configs/end2end.yaml
   ```
+   The evaluation results are stored in the result directory, and the Overall metric is computed as:
+   $$\text{Overall} = \frac{(1-\textit{Text Edit Distance}) \times 100 + \textit{Table TEDS} + \textit{Formula CDM}}{3}$$
+
+   Run overall_metric.py to obtain the accuracy result:
+   ```
+   cd ..
+   python overall_metric.py
+   ```
-   The accuracy on the `OmniDocBench` dataset is:
-   |Model|Chip|overall_EN|overall_CH|
+   The accuracy and performance on the `OmniDocBench` dataset are:
+   |Model|Chip|overall|Performance (s)|
   |------|------|------|------|
-  |MinerU|300I DUO|0.1588|0.2527|
-  |MinerU|800I A2 64G|0.1580|0.2510|
+  |MinerU|300I DUO|81.68|3.37|
+  |MinerU|800I A2 64G|81.51|1.85|
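+
+   As a worked example of the Overall formula (the sub-metric values below are hypothetical and for illustration only; the real values are read from result/end2end_quick_match_metric_result.json by overall_metric.py):
+   ```
+   # Hypothetical sub-metric values, in the units OmniDocBench reports.
+   text_edit_dist = 0.05  # page-average text edit distance, lower is better
+   table_teds = 85.0      # Table TEDS, in percent
+   formula_cdm = 82.0     # Formula CDM, in percent
+
+   overall = ((1 - text_edit_dist) * 100 + table_teds + formula_cdm) / 3
+   print(round(overall, 2))  # 87.33
+   ```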
diff --git a/ACL_PyTorch/built-in/ocr/MinerU/doclayout_yolo.patch b/ACL_PyTorch/built-in/ocr/MinerU/doclayout_yolo.patch
index b5fd6669aa2dec34a5a4038305bb63deabe8c673..291a2914abbd98d4b04ead6c11c4022e4840e514 100644
--- a/ACL_PyTorch/built-in/ocr/MinerU/doclayout_yolo.patch
+++ b/ACL_PyTorch/built-in/ocr/MinerU/doclayout_yolo.patch
@@ -1,7 +1,123 @@
-diff -ruN doclayout_yolo-0.0.4/doclayout_yolo/engine/predictor.py doclayout_yolo-0.0.4_fix/doclayout_yolo/engine/predictor.py
+diff -ruN doclayout_yolo-0.0.4/doclayout_yolo/data/loaders.py doclayout_yolo/data/loaders.py
+--- doclayout_yolo-0.0.4/doclayout_yolo/data/loaders.py	2025-02-11 15:49:31.000000000 +0800
++++ doclayout_yolo/data/loaders.py	2025-10-19 01:27:41.984000000 +0800
+@@ -14,6 +14,7 @@
+ import requests
+ import torch
+ from PIL import Image
++from torchvision.transforms import functional as TF
+ 
+ from doclayout_yolo.data.utils import IMG_FORMATS, VID_FORMATS
+ from doclayout_yolo.utils import LOGGER, is_colab, is_kaggle, ops
+@@ -411,7 +412,7 @@
+         self.bs = len(self.im0)
+ 
+     @staticmethod
+-    def _single_check(im):
++    def __single_check(im):  ## origin _single_check
+         """Validate and format an image to numpy array."""
+         assert isinstance(im, (Image.Image, np.ndarray)), f"Expected PIL/np.ndarray image type, but got {type(im)}"
+         if isinstance(im, Image.Image):
+@@ -419,6 +420,18 @@
+             im = im.convert("RGB")
+             im = np.asarray(im)[:, :, ::-1]
+             im = np.ascontiguousarray(im)  # contiguous
++
++        return im
++
++    @staticmethod
++    def _single_check(im):
++        """Validate and format an image to numpy array."""
++        assert isinstance(im, (Image.Image, np.ndarray)), f"Expected PIL/np.ndarray image type, but got {type(im)}"
++        if isinstance(im, Image.Image):
++            if im.mode != "RGB":
++                im = im.convert("RGB")
++            im = np.asarray(im)
++
+         return im
+ 
+     def __len__(self):
+diff -ruN doclayout_yolo-0.0.4/doclayout_yolo/engine/model.py doclayout_yolo/engine/model.py
+--- doclayout_yolo-0.0.4/doclayout_yolo/engine/model.py	2025-02-11 15:49:31.000000000 +0800
++++ doclayout_yolo/engine/model.py	2025-10-19 01:27:41.988000000 +0800
+@@ -143,6 +143,8 @@
+         else:
+             self._load(model, task=task)
+ 
++        self.model.half()
++
+     def __call__(
+         self,
+         source: Union[str, Path, int, list, tuple, np.ndarray, torch.Tensor] = None,
+diff -ruN doclayout_yolo-0.0.4/doclayout_yolo/engine/predictor.py doclayout_yolo/engine/predictor.py
 --- doclayout_yolo-0.0.4/doclayout_yolo/engine/predictor.py	2025-02-11 15:49:31.000000000 +0800
-+++ doclayout_yolo-0.0.4_fix/doclayout_yolo/engine/predictor.py	2025-09-09 16:05:20.011737230 +0800
-@@ -152,7 +152,8 @@
++++ doclayout_yolo/engine/predictor.py	2025-10-19 01:27:41.988000000 +0800
+@@ -47,6 +47,8 @@
+ from doclayout_yolo.utils.files import increment_path
+ from doclayout_yolo.utils.torch_utils import select_device, smart_inference_mode
+ 
++import torch.nn.functional as F
++
+ STREAM_WARNING = """
+ WARNING ⚠️ inference results will accumulate in RAM unless `stream=True` is passed, causing potential out-of-memory
+ errors for large sources or long-running streams and videos. See https://docs.doclayout_yolo.com/modes/predict/ for help.
+@@ -112,7 +114,7 @@
+         self._lock = threading.Lock()  # for automatic thread-safe inference
+         callbacks.add_integration_callbacks(self)
+ 
+-    def preprocess(self, im):
++    def _preprocess(self, im):  ### origin preprocess
+         """
+         Prepares input image before inference.
+ 
+@@ -132,6 +134,46 @@
+             im /= 255  # 0 - 255 to 0.0 - 1.0
+         return im
+ 
++
++    def preprocess(self, images):  ### adapted preprocess
++        """
++        Prepares input image before inference.
++
++        Args:
++            images (torch.Tensor | List[np.ndarray]): BCHW for tensor, [(HWC) x B] for list.
++        """
++        new_shape = (self.imgsz, self.imgsz) if isinstance(self.imgsz, int) else self.imgsz
++        tensors = []
++        for im in images:
++            im = torch.from_numpy(im).to(self.device).permute((2, 0, 1)) / 255.0
++
++            c, h, w = im.shape
++
++            r = min(new_shape[0] / h, new_shape[1] / w)
++
++            new_unpad = (int(round(w * r)), int(round(h * r)))
++
++            if (w, h) != new_unpad:
++                im = F.interpolate(im.unsqueeze(0), size=(new_unpad[1], new_unpad[0]),
++                                   mode="bilinear", align_corners=False).squeeze(0)
++
++            dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]
++            dw /= 2
++            dh /= 2
++            left, right = int(dw), int(dw + 0.5)
++            top, bottom = int(dh), int(dh + 0.5)
++            im = F.pad(im, (left, right, top, bottom), value=114/255.0)
++
++            _, H, W = im.shape
++            assert (H, W) == (new_shape[0], new_shape[1]), f"Padded image size {(H, W)} does not match expected image size {(new_shape[0], new_shape[1])}"
++
++            im = im.half() if self.model.fp16 else im.float()  # uint8 to fp16/32
++
++            tensors.append(im)
++
++        return torch.stack(tensors, dim=0)
++
++
+     def inference(self, im, *args, **kwargs):
+         """Runs inference on a given image using the specified model and arguments."""
+         visualize = (
+@@ -152,7 +194,8 @@
         (list): A list of transformed images.
""" same_shapes = len({x.shape for x in im}) == 1 @@ -11,7 +127,7 @@ diff -ruN doclayout_yolo-0.0.4/doclayout_yolo/engine/predictor.py doclayout_yolo return [letterbox(image=x) for x in im] def postprocess(self, preds, img, orig_imgs): -@@ -225,7 +226,8 @@ +@@ -225,7 +268,8 @@ # Warmup model if not self.done_warmup: @@ -21,10 +137,9 @@ diff -ruN doclayout_yolo-0.0.4/doclayout_yolo/engine/predictor.py doclayout_yolo self.done_warmup = True self.seen, self.windows, self.batch = 0, [], None - -diff -ruN doclayout_yolo-0.0.4/doclayout_yolo/nn/modules/block.py doclayout_yolo-0.0.4_fix/doclayout_yolo/nn/modules/block.py +diff -ruN doclayout_yolo-0.0.4/doclayout_yolo/nn/modules/block.py doclayout_yolo/nn/modules/block.py --- doclayout_yolo-0.0.4/doclayout_yolo/nn/modules/block.py 2025-02-11 15:49:31.000000000 +0800 -+++ doclayout_yolo-0.0.4_fix/doclayout_yolo/nn/modules/block.py 2025-09-09 16:05:20.019737230 +0800 ++++ doclayout_yolo/nn/modules/block.py 2025-10-19 01:27:41.996000000 +0800 @@ -230,7 +230,9 @@ def forward(self, x): """Forward pass through C2f layer.""" @@ -36,10 +151,9 @@ diff -ruN doclayout_yolo-0.0.4/doclayout_yolo/nn/modules/block.py doclayout_yolo return self.cv2(torch.cat(y, 1)) def forward_split(self, x): - -diff -ruN doclayout_yolo-0.0.4/doclayout_yolo/utils/tal.py doclayout_yolo-0.0.4_fix/doclayout_yolo/utils/tal.py +diff -ruN doclayout_yolo-0.0.4/doclayout_yolo/utils/tal.py doclayout_yolo/utils/tal.py --- doclayout_yolo-0.0.4/doclayout_yolo/utils/tal.py 2025-02-11 15:49:31.000000000 +0800 -+++ doclayout_yolo-0.0.4_fix/doclayout_yolo/utils/tal.py 2025-09-09 16:05:20.023737230 +0800 ++++ doclayout_yolo/utils/tal.py 2025-10-19 01:27:42.000000000 +0800 @@ -328,7 +328,8 @@ sy = torch.arange(end=h, device=device, dtype=dtype) + grid_cell_offset # shift y sy, sx = torch.meshgrid(sy, sx, indexing="ij") if TORCH_1_10 else torch.meshgrid(sy, sx) @@ -48,3 +162,4 @@ diff -ruN doclayout_yolo-0.0.4/doclayout_yolo/utils/tal.py doclayout_yolo-0.0.4_ + # stride_tensor.append(torch.full((h * w, 1), stride, dtype=dtype, device=device)) + stride_tensor.append(torch.ones((h * w, 1), dtype=dtype, device=device)*stride) return torch.cat(anchor_points), torch.cat(stride_tensor) + diff --git a/ACL_PyTorch/built-in/ocr/MinerU/mfr_encoder_mhsa.patch b/ACL_PyTorch/built-in/ocr/MinerU/mfr_encoder_mhsa.patch deleted file mode 100644 index 1fe80a05cbfbdbee80ee84508469f256c48f777d..0000000000000000000000000000000000000000 --- a/ACL_PyTorch/built-in/ocr/MinerU/mfr_encoder_mhsa.patch +++ /dev/null @@ -1,23 +0,0 @@ ---- MinerU/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py 2025-09-02 17:58:15.032000000 +0800 -+++ copy_mfr.py 2025-09-10 13:58:36.616000000 +0800 -@@ -465,11 +465,15 @@ - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - batch_size, dim, num_channels = hidden_states.shape -- mixed_query_layer = self.query(hidden_states) - -- key_layer = self.transpose_for_scores(self.key(hidden_states)) -- value_layer = self.transpose_for_scores(self.value(hidden_states)) -- query_layer = self.transpose_for_scores(mixed_query_layer) -+ # """融合qk为大矩阵,由于加入相对位置编码,PFA接口用不了,暂时只修改矩阵乘法""" -+ batch_size, dim, num_channels = hidden_states.shape -+ qkv = self.qkv(hidden_states) -+ q, k, v = qkv.chunk(3, dim=-1) -+ -+ query_layer = q.view(*q.shape[:2], self.num_attention_heads, -1).permute(0, 2, 1, 3) -+ key_layer = k.view(*k.shape[:2], self.num_attention_heads, -1).permute(0, 2, 1, 3) -+ value_layer = v.view(*v.shape[:2], self.num_attention_heads, -1).permute(0, 2, 1, 
diff --git a/ACL_PyTorch/built-in/ocr/MinerU/mfr_encoder_mhsa.patch b/ACL_PyTorch/built-in/ocr/MinerU/mfr_encoder_mhsa.patch
deleted file mode 100644
index 1fe80a05cbfbdbee80ee84508469f256c48f777d..0000000000000000000000000000000000000000
--- a/ACL_PyTorch/built-in/ocr/MinerU/mfr_encoder_mhsa.patch
+++ /dev/null
@@ -1,23 +0,0 @@
---- MinerU/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py	2025-09-02 17:58:15.032000000 +0800
-+++ copy_mfr.py	2025-09-10 13:58:36.616000000 +0800
-@@ -465,11 +465,15 @@
-         output_attentions: Optional[bool] = False,
-     ) -> Tuple[torch.Tensor]:
-         batch_size, dim, num_channels = hidden_states.shape
--        mixed_query_layer = self.query(hidden_states)
- 
--        key_layer = self.transpose_for_scores(self.key(hidden_states))
--        value_layer = self.transpose_for_scores(self.value(hidden_states))
--        query_layer = self.transpose_for_scores(mixed_query_layer)
-+        # """Fuse q/k/v into one large matmul; since relative position encoding is added, the PFA interface cannot be used, so for now only the matrix multiplications are modified."""
-+        batch_size, dim, num_channels = hidden_states.shape
-+        qkv = self.qkv(hidden_states)
-+        q, k, v = qkv.chunk(3, dim=-1)
-+
-+        query_layer = q.view(*q.shape[:2], self.num_attention_heads, -1).permute(0, 2, 1, 3)
-+        key_layer = k.view(*k.shape[:2], self.num_attention_heads, -1).permute(0, 2, 1, 3)
-+        value_layer = v.view(*v.shape[:2], self.num_attention_heads, -1).permute(0, 2, 1, 3)
- 
-         # Take the dot product between "query" and "key" to get the raw attention scores.
-         attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
- 
diff --git a/ACL_PyTorch/built-in/ocr/MinerU/mineru.patch b/ACL_PyTorch/built-in/ocr/MinerU/mineru.patch
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ACL_PyTorch/built-in/ocr/MinerU/overall_metric.py b/ACL_PyTorch/built-in/ocr/MinerU/overall_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d89b41693aefb448d1bcd8c3cc3b9ec8f881c3b
--- /dev/null
+++ b/ACL_PyTorch/built-in/ocr/MinerU/overall_metric.py
@@ -0,0 +1,47 @@
+import os
+import json
+import argparse
+
+import pandas as pd
+import numpy as np
+
+parser = argparse.ArgumentParser(description='result path')
+parser.add_argument('--result', type=str, default='OmniDocBench/result')
+args = parser.parse_args()
+
+
+ocr_types_dict = {
+    'end2end': 'end2end'
+}
+
+result_folder = args.result
+
+match_name = 'quick_match'
+
+# overall result: not distinguishing between Chinese and English, page-level average
+
+dict_list = []
+
+for ocr_type in ocr_types_dict.values():
+    result_path = os.path.join(result_folder, f'{ocr_type}_{match_name}_metric_result.json')
+
+    with open(result_path, 'r') as f:
+        result = json.load(f)
+
+    save_dict = {}
+
+    for category_type, metric in [("text_block", "Edit_dist"), ("display_formula", "CDM"), ("table", "TEDS"), ("table", "TEDS_structure_only"), ("reading_order", "Edit_dist")]:
+        if metric == 'CDM' or metric == "TEDS" or metric == "TEDS_structure_only":
+            if result[category_type]["page"].get(metric):
+                save_dict[category_type + '_' + metric] = result[category_type]["page"][metric]["ALL"] * 100  # page-level average
+            else:
+                save_dict[category_type + '_' + metric] = 0
+        else:
+            save_dict[category_type + '_' + metric] = result[category_type]["all"][metric].get("ALL_page_avg", np.nan)
+
+    dict_list.append(save_dict)
+
+df = pd.DataFrame(dict_list, index=ocr_types_dict.keys()).round(3)
+df['overall'] = ((1 - df['text_block_Edit_dist']) * 100 + df['display_formula_CDM'] + df['table_TEDS']) / 3
+
+print(df)
diff --git a/ACL_PyTorch/built-in/ocr/MinerU/ultralytics.patch b/ACL_PyTorch/built-in/ocr/MinerU/ultralytics.patch
index 5511fa6a9e750819a60e1d89c46380d4548cd49d..4baf8c7b1500f9941ddf1413d12f7bfd354bee22 100644
--- a/ACL_PyTorch/built-in/ocr/MinerU/ultralytics.patch
+++ b/ACL_PyTorch/built-in/ocr/MinerU/ultralytics.patch
@@ -1,7 +1,120 @@
-diff -ruN ultralytics-8.3.193/ultralytics/engine/predictor.py ultralytics_/ultralytics/engine/predictor.py
---- ultralytics-8.3.193/ultralytics/engine/predictor.py	2025-09-04 19:51:11.000000000 +0800
-+++ ultralytics_/ultralytics/engine/predictor.py	2025-09-09 14:56:14.535737230 +0800
-@@ -196,9 +196,10 @@
+diff -ruN ultralytics/data/loaders.py ultralytics/data/loaders.py
+--- ultralytics/data/loaders.py	2025-09-04 19:51:11.000000000 +0800
++++ ultralytics/data/loaders.py	2025-10-19 01:27:48.412000000 +0800
+@@ -534,7 +534,7 @@
+         self.bs = len(self.im0)
+ 
+     @staticmethod
+-    def _single_check(im: Image.Image | np.ndarray, flag: str = "RGB") -> np.ndarray:
++    def __single_check(im: Image.Image | np.ndarray, flag: str = "RGB") -> np.ndarray:
+         """Validate and format an image to numpy array, ensuring RGB order and contiguous memory."""
+         assert isinstance(im, (Image.Image, np.ndarray)), f"Expected PIL/np.ndarray image type, but got {type(im)}"
+         if isinstance(im, Image.Image):
+@@ -546,6 +546,19 @@
+             im = im[..., None]
+ return im + ++ @staticmethod ++ def _single_check(im: Image.Image | np.ndarray, flag: str = "RGB") -> np.ndarray: ++ """Validate and format an image to numpy array, ensuring RGB order and contiguous memory.""" ++ assert isinstance(im, (Image.Image, np.ndarray)), f"Expected PIL/np.ndarray image type, but got {type(im)}" ++ if isinstance(im, Image.Image): ++ if im.mode != "RGB": ++ im = im.convert("RGB") ++ im = np.asarray(im) ++ elif im.ndim == 2: # grayscale in numpy form ++ im = im[..., None] ++ return im ++ ++ + def __len__(self) -> int: + """Return the length of the 'im0' attribute, representing the number of loaded images.""" + return len(self.im0) +diff -ruN ultralytics/engine/model.py ultralytics/engine/model.py +--- ultralytics/engine/model.py 2025-09-04 19:51:11.000000000 +0800 ++++ ultralytics/engine/model.py 2025-10-19 01:27:48.412000000 +0800 +@@ -152,6 +152,8 @@ + else: + self._load(model, task=task) + ++ self.model.half() ++ + # Delete super().training for accessing self.model.training + del self.training + +diff -ruN ultralytics/engine/predictor.py ultralytics/engine/predictor.py +--- ultralytics/engine/predictor.py 2025-09-04 19:51:11.000000000 +0800 ++++ ultralytics/engine/predictor.py 2025-10-19 01:27:48.412000000 +0800 +@@ -43,6 +43,7 @@ + import cv2 + import numpy as np + import torch ++import torch.nn.functional as F + + from ultralytics.cfg import get_cfg, get_save_dir + from ultralytics.data import load_inference_source +@@ -149,7 +150,7 @@ + self._lock = threading.Lock() # for automatic thread-safe inference + callbacks.add_integration_callbacks(self) + +- def preprocess(self, im: torch.Tensor | list[np.ndarray]) -> torch.Tensor: ++ def _preprocess(self, im: torch.Tensor | list[np.ndarray]) -> torch.Tensor: + """ + Prepare input image before inference. + +@@ -174,6 +175,51 @@ + im /= 255 # 0 - 255 to 0.0 - 1.0 + return im + ++ def preprocess(self, images: torch.Tensor | list[np.ndarray]) -> torch.Tensor: ++ """ ++ Prepare input image before inference. ++ ++ Args: ++ images (torch.Tensor | List[np.ndarray]): Images of shape (N, 3, H, W) for tensor, [(H, W, 3) x N] for list. ++ ++ Returns: ++ (torch.Tensor): Preprocessed image tensor of shape (N, 3, H, W). 
++ """ ++ ++ new_shape = (new_shape, new_shape) if isinstance(self.imgsz, int) else self.imgsz ++ tensors = [] ++ for im in images: ++ im = torch.from_numpy(im).to(self.device).permute((2, 0, 1)) / 255.0 ++ ++ c, h, w = im.shape ++ ++ r = min(new_shape[0] / h, new_shape[1] / w) ++ ++ new_unpad = (int(round(w * r)), int(round(h * r))) ++ ++ if (w, h) != new_unpad: ++ im = F.interpolate(im.unsqueeze(0), size=(new_unpad[1], new_unpad[0]), ++ mode="bilinear", align_corners=False).squeeze(0) ++ ++ dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] ++ dw /= 2 ++ dh /= 2 ++ left, right = int(dw), int(dw + 0.5) ++ top, bottom = int(dh), int(dh + 0.5) ++ im = F.pad(im, (left, right, top, bottom), value=114/255.0) ++ ++ _, H, W = im.shape ++ assert (H, W) == (new_shape[0], new_shape[1]), f"Expected image size do not match: padding image size:{(H, W)} != expected image size: {(new_shape[0], new_shape[1])}" ++ ++ im = im.half() if self.model.fp16 else im.float() # uint8 to fp16/32 ++ ++ tensors.append(im) ++ ++ return torch.stack(tensors, dim=0) ++ ++ ++ ++ + def inference(self, im: torch.Tensor, *args, **kwargs): + """Run inference on a given image using the specified model and arguments.""" + visualize = ( +@@ -196,9 +242,10 @@ same_shapes = len({x.shape for x in im}) == 1 letterbox = LetterBox( self.imgsz, @@ -15,7 +128,7 @@ diff -ruN ultralytics-8.3.193/ultralytics/engine/predictor.py ultralytics_/ultra stride=self.model.stride, ) return [letterbox(image=x) for x in im] -@@ -311,8 +312,11 @@ +@@ -311,8 +358,11 @@ # Warmup model if not self.done_warmup: @@ -28,7 +141,7 @@ diff -ruN ultralytics-8.3.193/ultralytics/engine/predictor.py ultralytics_/ultra ) self.done_warmup = True -@@ -400,7 +404,8 @@ +@@ -400,7 +450,8 @@ dnn=self.args.dnn, data=self.args.data, fp16=self.args.half, @@ -38,9 +151,9 @@ diff -ruN ultralytics-8.3.193/ultralytics/engine/predictor.py ultralytics_/ultra verbose=verbose, ) -diff -ruN ultralytics-8.3.193/ultralytics/nn/modules/block.py ultralytics_/ultralytics/nn/modules/block.py ---- ultralytics-8.3.193/ultralytics/nn/modules/block.py 2025-09-04 19:51:11.000000000 +0800 -+++ ultralytics_/ultralytics/nn/modules/block.py 2025-09-09 14:56:14.543737230 +0800 +diff -ruN ultralytics/nn/modules/block.py ultralytics/nn/modules/block.py +--- ultralytics/nn/modules/block.py 2025-09-04 19:51:11.000000000 +0800 ++++ ultralytics/nn/modules/block.py 2025-10-19 01:27:48.424000000 +0800 @@ -237,7 +237,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: """Apply sequential pooling operations to input and return concatenated feature maps.""" @@ -63,10 +176,9 @@ diff -ruN ultralytics-8.3.193/ultralytics/nn/modules/block.py ultralytics_/ultra return self.cv2(torch.cat(y, 1)) def forward_split(self, x: torch.Tensor) -> torch.Tensor: - -diff -ruN ultralytics-8.3.193/ultralytics/utils/tal.py ultralytics_/ultralytics/utils/tal.py ---- ultralytics-8.3.193/ultralytics/utils/tal.py 2025-09-04 19:51:11.000000000 +0800 -+++ ultralytics_/ultralytics/utils/tal.py 2025-09-09 14:56:14.551737230 +0800 +diff -ruN ultralytics/utils/tal.py ultralytics/utils/tal.py +--- ultralytics/utils/tal.py 2025-09-04 19:51:11.000000000 +0800 ++++ ultralytics/utils/tal.py 2025-10-19 01:27:48.428000000 +0800 @@ -375,7 +375,8 @@ sy = torch.arange(end=h, device=device, dtype=dtype) + grid_cell_offset # shift y sy, sx = torch.meshgrid(sy, sx, indexing="ij") if TORCH_1_10 else torch.meshgrid(sy, sx) @@ -75,3 +187,4 @@ diff -ruN ultralytics-8.3.193/ultralytics/utils/tal.py ultralytics_/ultralytics/ + # 
+        stride_tensor.append(torch.ones((h * w, 1), dtype=dtype, device=device)*stride)
     return torch.cat(anchor_points), torch.cat(stride_tensor)
+
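Both tal.py hunks swap `torch.full((h * w, 1), stride, ...)` for `torch.ones((h * w, 1), ...) * stride`. The two are numerically equivalent; only the operator that builds the constant stride tensor changes (presumably for NPU operator support, an assumption, since the diff does not state the motivation). A quick equivalence check:

```python
import torch

h, w, stride = 80, 80, 8.0
full_version = torch.full((h * w, 1), stride, dtype=torch.float32)
ones_version = torch.ones((h * w, 1), dtype=torch.float32) * stride

# Same shape, dtype, and values: the rewrite is behavior-preserving.
assert torch.equal(full_version, ones_version)
print("equivalent:", full_version.shape)
```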