diff --git a/cv/detection/yolov5/pytorch/README.md b/cv/detection/yolov5/pytorch/README.md
index 475fb9973d038502a820104710e926c3d84f3630..15c74620d5ca6f66cb6b5fba0839c01b3c195ee7 100644
--- a/cv/detection/yolov5/pytorch/README.md
+++ b/cv/detection/yolov5/pytorch/README.md
@@ -6,9 +6,8 @@ YOLOv5 🚀 is a family of object detection architectures and models pretrained
 
 ```shell
 # install libGL
-yum install mesa-libGL
-
-pip3 install -r requirements.txt
+yum install -y mesa-libGL
+bash init.sh
 ```
 
 ## Step 2: Preparing datasets
@@ -38,10 +37,12 @@ coco2017
 
 Modify the configuration file(data/coco.yaml)
 
-    $ vim data/coco.yaml
-    $ # path: the root of coco data
-    $ # train: the relative path of train images
-    $ # val: the relative path of valid images
+```bash
+$ vim data/coco.yaml
+# path: the root of coco data
+# train: the relative path of train images
+# val: the relative path of valid images
+```
 
 ## Training the detector
 
@@ -49,45 +50,65 @@ Train the yolov5 model as follows, the train log is saved in ./runs/train/exp
 
 ### On single GPU
 
-    $ cd yolov5 
-    $ python3 train.py --data ./data/coco.yaml --batch-size 32 --cfg ./models/yolov5s.yaml --weights ''
+```bash
+$ python3 train.py --data ./data/coco.yaml --batch-size 32 --cfg ./models/yolov5s.yaml --weights ''
+```
 
 ### On single GPU (AMP)
 
-    $ python3 train.py --data ./data/coco.yaml --batch-size 32 --cfg ./models/yolov5s.yaml --weights '' --amp
-
+```bash
+$ python3 train.py --data ./data/coco.yaml --batch-size 32 --cfg ./models/yolov5s.yaml --weights '' --amp
+```
 
 ### Multiple GPUs on one machine
 
-    $ # eight cards 
-    $ python3 -m torch.distributed.launch --nproc_per_node 8 train.py --data ./data/coco.yaml --batch-size 256 --cfg ./models/yolov5s.yaml --weights '' --device 0,1,2,3,4,5,6,7 
+```bash
+# eight cards
+# YOLOv5s
+$ python3 -m torch.distributed.launch --nproc_per_node 8 \
+    train.py \
+    --data ./data/coco.yaml \
+    --batch-size 64 \
+    --cfg ./models/yolov5s.yaml --weights '' \
+    --device 0,1,2,3,4,5,6,7
+
+# YOLOv5m
+$ bash run.sh
+```
 
 ### Multiple GPUs on one machine (AMP)
 
-    $ # eight cards 
-    $ python3 -m torch.distributed.launch --nproc_per_node 8 train.py --data ./data/coco.yaml --batch-size 256 --cfg ./models/yolov5s.yaml --weights '' --device 0,1,2,3,4,5,6,7 --amp
-
+```bash
+# eight cards 
+$ python3 -m torch.distributed.launch --nproc_per_node 8 \
+    train.py \
+    --data ./data/coco.yaml \
+    --batch-size 256 \
+    --cfg ./models/yolov5s.yaml --weights '' \
+    --device 0,1,2,3,4,5,6,7 --amp
+```
 
 ## Test the detector
 
 Test the yolov5 model as follows, the result is saved in ./runs/detect:
 
-    $ cd yolov5
-    $ python3 detect.py --source ./data/images/bus.jpg --weights yolov5s.pt --img 640
-    $ python3 detect.py --source ./data/images/zidane.jpg --weights yolov5s.pt --img 640
-
+    $ cd yolov5
+    $ python3 detect.py --source ./data/images/bus.jpg --weights yolov5s.pt --img 640
+    $ python3 detect.py --source ./data/images/zidane.jpg --weights yolov5s.pt --img 640
 
 ## Results on BI-V100
 
+
 | GPUs | FP16 | Batch size | FPS | E2E | mAP@.5 |
-|------|------|------------|-----|-----|--------|
-| 1x1  | True  | 64         | 81  | N/A | N/A    |
-| 1x8  | True  | 64         | 598 | 24h | 0.632  |
+| ------ | ------ | ------------ | ----- | ----- | -------- |
+| 1x1  | True | 64         | 81  | N/A | N/A    |
+| 1x8  | True | 64         | 598 | 24h | 0.632  |
 
-| Convergence criteria | Configuration (x denotes number of GPUs) | Performance | Accuracy | Power（W） | Scalability | Memory utilization（G） | Stability |
-|----------------------|------------------------------------------|-------------|----------|------------|-------------|-------------------------|-----------|
-| mAP:0.5              | SDK V2.2,bs:128,8x,AMP                   | 1228        | 0.56     | 140\*8     | 0.92        | 27.3\*8                 | 1         |
 
+| Convergence criteria | Configuration (x denotes number of GPUs) | Performance | Accuracy | Power（W） | Scalability | Memory utilization（G） | Stability |
+| ---------------------- | ------------------------------------------ | ------------- | ---------- | ------------ | ------------- | ------------------------- | ----------- |
+| mAP:0.5              | SDK V2.2, bs:128, 8x, AMP                | 1228        | 0.56     | 140\*8     | 0.92        | 27.3\*8                 | 1         |
 
 ## Reference
+
 https://github.com/ultralytics/yolov5
diff --git a/cv/detection/yolov5/pytorch/init.sh b/cv/detection/yolov5/pytorch/init.sh
new file mode 100644
index 0000000000000000000000000000000000000000..44510f81a271bef34200e15ac322c9e7735edcef
--- /dev/null
+++ b/cv/detection/yolov5/pytorch/init.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+if [[ "$(uname)" == "Linux" ]]; then  # system packages are only installed on Linux
+    if command -v apt &> /dev/null; then  # apt is tried first, then yum
+        apt install -y numactl
+    elif command -v yum &> /dev/null; then
+        yum install -y numactl
+    else
+        echo "Unsupported package manager"
+        exit 1
+    fi
+else
+    echo "Unsupported operating system"
+    exit 1
+fi
+# Python dependencies: requirements.txt first, then extra version pins on Python 3.10 only.
+pip3 install -r requirements.txt
+PY_VERSION=$(python3 -V 2>&1|awk '{print $2}'|awk -F '.' '{print $2}')  # minor version, e.g. "10" for 3.10.x
+if [ "$PY_VERSION" == "10" ]; then
+   pip3 install matplotlib==3.8.2
+   pip3 install numpy==1.22.4
+   pip3 install Pillow==9.5 
+else
+   echo "only for python3.10"  # the pins above are skipped on other Python versions
+fi
+# `wandb disabled` switches Weights & Biases logging off; pycocotools is installed last.
+wandb disabled
+pip3 install pycocotools
diff --git a/cv/detection/yolov5/pytorch/models/common.py b/cv/detection/yolov5/pytorch/models/common.py
index 2308ace43b1a4a928abab9705d17058153d8ca7c..7603e9e4719202e9143bcfd80984bf92128085d9 100644
--- a/cv/detection/yolov5/pytorch/models/common.py
+++ b/cv/detection/yolov5/pytorch/models/common.py
@@ -49,24 +49,6 @@ class Conv(nn.Module):
         return self.act(self.conv(x))
 
 
-class SPPF(nn.Module):
-    # Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher
-    def __init__(self, c1, c2, k=5):  # equivalent to SPP(k=(5, 9, 13))
-        super().__init__()
-        c_ = c1 // 2  # hidden channels
-        self.cv1 = Conv(c1, c_, 1, 1)
-        self.cv2 = Conv(c_ * 4, c2, 1, 1)
-        self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
-
-    def forward(self, x):
-        x = self.cv1(x)
-        with warnings.catch_warnings():
-            warnings.simplefilter('ignore')  # suppress torch 1.9.0 max_pool2d() warning
-            y1 = self.m(x)
-            y2 = self.m(y1)
-            return self.cv2(torch.cat([x, y1, y2, self.m(y2)], 1))
-
-
 class TransformerLayer(nn.Module):
     # Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)
     def __init__(self, c, num_heads):
diff --git a/cv/detection/yolov5/pytorch/run.sh b/cv/detection/yolov5/pytorch/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ed5cda7d4b22ed2e4e09c1e0cd43208694c31cc6
--- /dev/null
+++ b/cv/detection/yolov5/pytorch/run.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+start_time=$(date +%s)  # wall-clock start, in seconds since the epoch
+unset no_proxy use_proxy https_proxy http_proxy  # remove proxy variables from this script's environment
+EXIT_STATUS=0
+check_status() {  # latch EXIT_STATUS to 1 when the command run just before this call failed
+	if ((${PIPESTATUS[0]} != 0)); then
+		EXIT_STATUS=1
+	fi
+}
+# Launch train.py with 16 processes on the yolov5m config; extra CLI arguments are forwarded via "$@".
+python3 -m torch.distributed.launch --nproc_per_node=16 \
+	train.py --batch-size 32 \
+	--data ./data/coco.yaml --weights "" \
+	--cfg models/yolov5m.yaml --workers 16 \
+	--epochs 3 --linear-lr "$@"
+check_status
+
+wait  # nothing above is started in the background, so this returns immediately
+
+end_time=$(date +%s)
+e2e_time=$(($end_time - $start_time))  # elapsed seconds for the whole run
+echo "end to end time: $e2e_time" >>total_time.log  # appended, so timings from earlier runs are kept
+exit ${EXIT_STATUS}
diff --git a/cv/detection/yolov5/pytorch/run_inference.sh b/cv/detection/yolov5/pytorch/run_inference.sh
index 23a092df8f0d2705b8a5eff76d586d683f466463..3f4226fbc9ee681d9f1d0ea0fc1207d5a16beabb 100644
--- a/cv/detection/yolov5/pytorch/run_inference.sh
+++ b/cv/detection/yolov5/pytorch/run_inference.sh
@@ -13,4 +13,14 @@
 #    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 #    License for the specific language governing permissions and limitations
 #    under the License.
-python3 test.py --task val --data data/coco.yaml --weights weights/yolov5s.pt 2>&1 | tee inferencelog.log;
+
+EXIT_STATUS=0
+check_status() {  # latch EXIT_STATUS to 1 when the preceding pipeline's first command failed
+    if ((${PIPESTATUS[0]} != 0)); then  # index 0 is python3 (not tee) in the pipeline below
+        EXIT_STATUS=1
+    fi
+}
+# Run test.py with --task val on data/coco128.yaml; output is shown and also written to inferencelog.log.
+python3 test.py --task val --data data/coco128.yaml --weights weights/yolov5s.pt 2>&1 | tee inferencelog.log
+check_status
+exit ${EXIT_STATUS}
diff --git a/cv/detection/yolov5/pytorch/start_scripts/train_yolov5s_coco128_amp_torch.sh b/cv/detection/yolov5/pytorch/start_scripts/train_yolov5s_coco128_amp_torch.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b84888011df7f1ee71b7b90a6014655560baea16
--- /dev/null
+++ b/cv/detection/yolov5/pytorch/start_scripts/train_yolov5s_coco128_amp_torch.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+export PYTORCH_DISABLE_VEC_KERNEL=1
+export PT_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT=1
+# Run from the parent directory inside a subshell: the caller's working directory is left
+# untouched and the training exit status (not that of `cd -`) becomes this script's status.
+(cd .. && bash run_training.sh --data ./data/coco128.yaml --amp "$@")
diff --git a/cv/detection/yolov5/pytorch/start_scripts/train_yolov5s_coco128_dist_torch.sh b/cv/detection/yolov5/pytorch/start_scripts/train_yolov5s_coco128_dist_torch.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0a37d6c7a3d5650cb5cd1f8d575d2cf60f029cd5
--- /dev/null
+++ b/cv/detection/yolov5/pytorch/start_scripts/train_yolov5s_coco128_dist_torch.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+# Run from the parent directory inside a subshell: the caller's working directory is left
+# untouched and the training exit status (not that of `cd -`) becomes this script's status.
+(cd .. && bash run_dist_training.sh --data ./data/coco128.yaml "$@")
diff --git a/cv/detection/yolov5/pytorch/start_scripts/train_yolov5s_coco128_torch.sh b/cv/detection/yolov5/pytorch/start_scripts/train_yolov5s_coco128_torch.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c206974245492b3568c5eb85f0ded2bf0f809236
--- /dev/null
+++ b/cv/detection/yolov5/pytorch/start_scripts/train_yolov5s_coco128_torch.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+# Run from the parent directory inside a subshell: the caller's working directory is left
+# untouched and the training exit status (not that of `cd -`) becomes this script's status.
+(cd .. && bash run_training.sh --data ./data/coco128.yaml "$@")
diff --git a/cv/detection/yolov5/pytorch/start_scripts/train_yolov5s_coco_amp_torch.sh b/cv/detection/yolov5/pytorch/start_scripts/train_yolov5s_coco_amp_torch.sh
new file mode 100644
index 0000000000000000000000000000000000000000..fe09a71af5a7f123c4fff99f2aa8892c50dff3f9
--- /dev/null
+++ b/cv/detection/yolov5/pytorch/start_scripts/train_yolov5s_coco_amp_torch.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+export PYTORCH_DISABLE_VEC_KERNEL=1
+export PT_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT=1
+# Run from the parent directory inside a subshell: the caller's working directory is left
+# untouched and the training exit status (not that of `cd -`) becomes this script's status.
+(cd .. && bash run_training.sh --data ./data/coco.yaml  --amp "$@")
diff --git a/cv/detection/yolov5/pytorch/start_scripts/train_yolov5s_coco_dist_torch.sh b/cv/detection/yolov5/pytorch/start_scripts/train_yolov5s_coco_dist_torch.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3434fee60cc38489585c7b96c785980f149701fb
--- /dev/null
+++ b/cv/detection/yolov5/pytorch/start_scripts/train_yolov5s_coco_dist_torch.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+# Subshell keeps the caller's cwd, skips the launch if `cd` fails, and returns the training status.
+(cd .. && bash run_dist_training.sh --data ./data/coco.yaml "$@")
diff --git a/cv/detection/yolov5/pytorch/start_scripts/train_yolov5s_coco_torch.sh b/cv/detection/yolov5/pytorch/start_scripts/train_yolov5s_coco_torch.sh
new file mode 100644
index 0000000000000000000000000000000000000000..94da449ae65fb8dbceccd0a0234685def4ba53ed
--- /dev/null
+++ b/cv/detection/yolov5/pytorch/start_scripts/train_yolov5s_coco_torch.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+# Run from the parent directory inside a subshell: the caller's working directory is left
+# untouched and the training exit status (not that of `cd -`) becomes this script's status.
+(cd .. && bash run_training.sh --data ./data/coco.yaml "$@")
\ No newline at end of file
diff --git a/cv/detection/yolov5/pytorch/train.py b/cv/detection/yolov5/pytorch/train.py
index 2bc68e9815ce4679668fd47eec8eb55ce98e629a..f8f0f95a22846e7f2f76f649f68e88357cf30bfb 100644
--- a/cv/detection/yolov5/pytorch/train.py
+++ b/cv/detection/yolov5/pytorch/train.py
@@ -335,7 +335,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
         logger.info('Using SyncBatchNorm()')
 
     # Trainloader
-    dataloader, dataset = create_dataloader(train_path, imgsz, batch_size // WORLD_SIZE, gs, single_cls,
+    dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, single_cls,
                                             hyp=hyp, augment=False, cache=opt.cache_images, rect=opt.rect, rank=RANK,
                                             workers=workers,
                                             image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: '))
@@ -345,7 +345,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
 
     # Process 0
     if RANK in [-1, 0]:
-        testloader = create_dataloader(test_path, imgsz_test, batch_size // WORLD_SIZE * 2, gs, single_cls,
+        testloader = create_dataloader(test_path, imgsz_test, batch_size, gs, single_cls,
                                        hyp=hyp, cache=opt.cache_images and not notest, rect=True, rank=-1,
                                        workers=workers,
                                        pad=0.5, prefix=colorstr('val: '))[0]
@@ -397,10 +397,6 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
                 f'Using {dataloader.num_workers} dataloader workers\n'
                 f'Logging results to {save_dir}\n'
                 f'Starting training for {epochs} epochs...')
-
-    run_steps = 0
-    time_step = []
-    time_step.append(time.time())
     for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
         model.train()
 
@@ -426,11 +422,13 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
         if RANK != -1:
             dataloader.sampler.set_epoch(epoch)
         pbar = enumerate(dataloader)
-        logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'labels', 'img_size'))
+        logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'img_size', "total_fps"))
         if RANK in [-1, 0]:
             pbar = tqdm(pbar, total=nb)  # progress bar
         optimizer.zero_grad()
         for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
+            step_start_time = time.time()
+
             ni = i + nb * epoch  # number integrated batches (since train start)
             imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0
 
@@ -470,6 +468,10 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
                 if opt.quad:
                     loss *= 4.
 
+            if not math.isfinite(loss[0]):
+                print("Loss is {}, stopping training".format(loss[0]))
+                sys.exit(1)
+
             # Backward
             if opt.amp:
                 scaler.scale(loss).backward()
@@ -491,18 +493,18 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
                     ema.update(model)
                 last_opt_step = ni
 
+            step_end_time = time.time()
+            fps = len(imgs) / (step_end_time - step_start_time)
+            if torch.distributed.is_initialized():
+                fps = fps * torch.distributed.get_world_size()
+
             # Print
             if RANK in [-1, 0]:
-                time_step.append(time.time())
                 mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                 mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
                 s = ('%10s' * 2 + '%10.4g' * 6) % (
-                    f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1])
-                time_iter = time_step[i+1] - time_step[i]
-                fps = opt.batch_size / time_iter
-                performance = " timer: %.6f sec. || fps: %.3f" % (time_iter, fps)
-
-                pbar.set_description(s + performance)
+                    f'{epoch}/{epochs - 1}', mem, *mloss, imgs.shape[-1], fps)
+                pbar.set_description(s)
 
                 if nb > 1000:
                     log_freq = 100
@@ -677,7 +679,7 @@ def parse_opt(known=False):
     parser.add_argument('--bbox_interval', type=int, default=-1, help='Set bounding-box image logging interval for W&B')
     parser.add_argument('--save_period', type=int, default=-1, help='Log model after every "save_period" epoch')
     parser.add_argument('--artifact_alias', type=str, default="latest", help='version of dataset artifact to be used')
-    parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
+    parser.add_argument('--local_rank', '--local-rank', type=int, default=-1, help='DDP parameter, do not modify')
     parser.add_argument('--amp', action='store_true', default=False, help='use amp to train and test')
     opt = parser.parse_known_args()[0] if known else parser.parse_args()
     return opt
@@ -709,6 +711,12 @@ def main(opt):
 
     print("Global setting:", LOCAL_RANK, RANK, WORLD_SIZE)
 
+    try:
+        from dltest import show_training_arguments
+        show_training_arguments(opt)
+    except:
+        pass
+
     # DDP mode
     device = select_device(opt.device, batch_size=opt.batch_size)
     if LOCAL_RANK != -1:
@@ -724,7 +732,7 @@ def main(opt):
             dist_backend = os.environ[DIST_BACKEND_ENV]
 
         dist.init_process_group(backend=dist_backend, rank=RANK, world_size=WORLD_SIZE)
-        assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count'
+        # assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count'
         assert not opt.image_weights, '--image-weights argument is not compatible with DDP training'
 
     # Train
diff --git a/cv/detection/yolov5/pytorch/utils/loss.py b/cv/detection/yolov5/pytorch/utils/loss.py
index 932f8df15956046f322933bd3d8bbbb2627a6f70..370f7323e54e8cd01e8fd7cef9708fc192bce3c2 100644
--- a/cv/detection/yolov5/pytorch/utils/loss.py
+++ b/cv/detection/yolov5/pytorch/utils/loss.py
@@ -183,8 +183,9 @@ class ComputeLoss:
                             ], device=targets.device).float() * g  # offsets
 
         for i in range(self.nl):
+            # anchors = self.anchors[i]
             anchors, shape = self.anchors[i], p[i].shape
-            gain[2:6] = torch.tensor(shape)[[3, 2, 3, 2]]  # xyxy gain
+            gain[2:6] = torch.tensor(p[i].shape)[[3, 2, 3, 2]]  # xyxy gain
 
             # Match targets to anchors
             t = targets * gain
@@ -216,7 +217,8 @@ class ComputeLoss:
 
             # Append
             a = t[:, 6].long()  # anchor indices
-            indices.append((b, a, gj.clamp_(0, shape[2] - 1), gi.clamp_(0, shape[3] - 1))) # image, anchor, grid
+            # indices.append((b, a, gj.clamp_(0, gain[3] - 1), gi.clamp_(0, gain[2] - 1)))  # image, anchor, grid indices
+            indices.append((b,a,gj.clamp_(0,shape[2] - 1),gi.clamp_(0,shape[3] - 1)))  # image, anchor, grid indices
             tbox.append(torch.cat((gxy - gij, gwh), 1))  # box
             anch.append(anchors[a])  # anchors
             tcls.append(c)  # class
diff --git a/cv/detection/yolov5/pytorch/utils/wandb_logging/wandb_utils.py b/cv/detection/yolov5/pytorch/utils/wandb_logging/wandb_utils.py
index f031a819b977c8a606a99a7b8f31fd86d4a2c110..45aa088fa5e05538ecbab2362457cb38d369f4cf 100644
--- a/cv/detection/yolov5/pytorch/utils/wandb_logging/wandb_utils.py
+++ b/cv/detection/yolov5/pytorch/utils/wandb_logging/wandb_utils.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+
 """Utilities and tools for tracking runs with Weights & Biases."""
 import logging
 import os
@@ -18,7 +21,7 @@ try:
     from wandb import init, finish
 except ImportError:
     wandb = None
-
+wandb = None
 RANK = int(os.getenv('RANK', -1))
 WANDB_ARTIFACT_PREFIX = 'wandb-artifact://'