diff --git a/cv/classification/repvgg/pytorch/LICENSE b/cv/classification/repvgg/pytorch/LICENSE new file mode 100755 index 0000000000000000000000000000000000000000..9b7d31a349b645c2323b96fb4ed7912eabce3884 --- /dev/null +++ b/cv/classification/repvgg/pytorch/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 DingXiaoH + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/cv/classification/repvgg/pytorch/README.md b/cv/classification/repvgg/pytorch/README.md new file mode 100755 index 0000000000000000000000000000000000000000..be9cb1ea0edfc12c90723fb39c055796b3969645 --- /dev/null +++ b/cv/classification/repvgg/pytorch/README.md @@ -0,0 +1,61 @@ + +# RepVGG +## Model description + A simple but powerful architecture of convolutional neural network, which has a VGG-like inference-time body composed of nothing but a stack of 3x3 convolution and ReLU, while the training-time model has a multi-branch topology. Such decoupling of the training-time and inference-time architecture is realized by a structural re-parameterization technique so that the model is named RepVGG. + +## Step 1: Installing + +```bash +pip3 install timm yacs +``` + +## Step 2: Download data + +Download the [ImageNet Dataset](https://www.image-net.org/download.php) + +```bash +# IMAGENET PATH as follow: +ls -al /home/datasets/imagenet_jpeg/ +total 52688 +drwxr-xr-x 1002 root root 24576 Mar 1 15:33 train +-rw-r--r-- 1 root root 43829433 May 16 07:55 train_list.txt +drwxr-xr-x 1002 root root 24576 Mar 1 15:41 val +-rw-r--r-- 1 root root 2144499 May 16 07:56 val_list.txt +----------------------- +# train_list.txt has the following format +train/n01440764/n01440764_10026.JPEG 0 +... + +# val_list.txt has the following format +val/ILSVRC2012_val_00000001.JPEG 65 +----------------------- +``` + +## Step 3: Run RepVGG +``` +python -m torch.distributed.launch --nproc_per_node 8 --master_port 12349 main.py --arch [model name] --data-path [/path/to/imagenet] --batch-size 32 --tag train_from_scratch --output ./ --opts TRAIN.EPOCHS 300 TRAIN.BASE_LR 0.1 TRAIN.WEIGHT_DECAY 1e-4 TRAIN.WARMUP_EPOCHS 5 MODEL.LABEL_SMOOTHING 0.1 AUG.PRESET weak AUG.MIXUP 0.0 DATA.DATASET imagenet DATA.IMG_SIZE 224 +``` +The original RepVGG models were trained in 120 epochs with cosine learning rate decay from 0.1 to 0. 
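+For reference, that schedule matches standard cosine annealing in PyTorch. The sketch below is illustrative only (the training script builds its own scheduler in `train/lr_scheduler.py`; `model` here stands for any RepVGG instance):
+```
+import torch
+
+optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)
+# cosine decay from the base LR of 0.1 down to 0 over 120 epochs
+# (the actual recipe excludes biases and some BN weights from weight decay, as noted below)
+scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=120, eta_min=0)
+for epoch in range(120):
+    # ... run one pass over the training set ...
+    scheduler.step()
+```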
We used 8 GPUs, global batch size of 256, weight decay of 1e-4 (no weight decay on fc.bias, bn.bias, rbr_dense.bn.weight and rbr_1x1.bn.weight) (weight decay on rbr_identity.weight makes little difference, and it is better to use it in most of the cases), and the same simple data preprocssing as the PyTorch official example: +``` + trans = transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) +``` + +The valid model names include (--arch [model name]) +``` +RepVGGplus-L2pse, RepVGG-A0, RepVGG-A1, RepVGG-A2, RepVGG-B0, RepVGG-B1, RepVGG-B1g2, RepVGG-B1g4, RepVGG-B2, RepVGG-B2g2, RepVGG-B2g4, RepVGG-B3, RepVGG-B3g2, RepVGG-B3g4 +``` + +| model | GPU | FP32 | +|----------| ----------- | ------------------------------------ | +| RepVGG-A0| 8 cards | Acc@1=0.7241 | + + + + + + + diff --git a/cv/classification/repvgg/pytorch/data/__init__.py b/cv/classification/repvgg/pytorch/data/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..70c633ce61849c1600e3effbd1bf46f29f29cb80 --- /dev/null +++ b/cv/classification/repvgg/pytorch/data/__init__.py @@ -0,0 +1 @@ +from .build import build_loader \ No newline at end of file diff --git a/cv/classification/repvgg/pytorch/data/build.py b/cv/classification/repvgg/pytorch/data/build.py new file mode 100755 index 0000000000000000000000000000000000000000..faf093957ce8f3de6b4198832aad019d62454185 --- /dev/null +++ b/cv/classification/repvgg/pytorch/data/build.py @@ -0,0 +1,188 @@ +# -------------------------------------------------------- +# RepVGG: Making VGG-style ConvNets Great Again (https://openaccess.thecvf.com/content/CVPR2021/papers/Ding_RepVGG_Making_VGG-Style_ConvNets_Great_Again_CVPR_2021_paper.pdf) +# Github source: https://github.com/DingXiaoH/RepVGG +# Licensed under The MIT License [see LICENSE for details] +# The training script is based on the code of Swin Transformer (https://github.com/microsoft/Swin-Transformer) +# -------------------------------------------------------- +import torch +import numpy as np +import torch.distributed as dist +from torchvision import datasets, transforms +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from timm.data import Mixup +from timm.data import create_transform +try: + from timm.data.transforms import str_to_pil_interp as _pil_interp +except: + from timm.data.transforms import _pil_interp +from .cached_image_folder import CachedImageFolder +from .samplers import SubsetRandomSampler +import os + + +def build_loader(config): + config.defrost() + dataset_train, config.MODEL.NUM_CLASSES = build_dataset(is_train=True, config=config) + config.freeze() + print(f"local rank {config.LOCAL_RANK} / global rank {dist.get_rank()} successfully build train dataset") + dataset_val, _ = build_dataset(is_train=False, config=config) + print(f"local rank {config.LOCAL_RANK} / global rank {dist.get_rank()} successfully build val dataset") + + num_tasks = dist.get_world_size() + global_rank = dist.get_rank() + if config.DATA.ZIP_MODE and config.DATA.CACHE_MODE == 'part': + indices = np.arange(dist.get_rank(), len(dataset_train), dist.get_world_size()) + sampler_train = SubsetRandomSampler(indices) + else: + sampler_train = torch.utils.data.DistributedSampler( + dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True + ) + + if dataset_val is None: + sampler_val = None + else: + indices = np.arange(dist.get_rank(), 
len(dataset_val), dist.get_world_size()) #TODO + sampler_val = SubsetRandomSampler(indices) + + data_loader_train = torch.utils.data.DataLoader( + dataset_train, sampler=sampler_train, + batch_size=config.DATA.BATCH_SIZE, + num_workers=config.DATA.NUM_WORKERS, + pin_memory=config.DATA.PIN_MEMORY, + drop_last=True, + ) + + if dataset_val is None: + data_loader_val = None + else: + data_loader_val = torch.utils.data.DataLoader( + dataset_val, sampler=sampler_val, + batch_size=config.DATA.TEST_BATCH_SIZE, + shuffle=False, + num_workers=config.DATA.NUM_WORKERS, + pin_memory=config.DATA.PIN_MEMORY, + drop_last=False + ) + + # setup mixup / cutmix + mixup_fn = None + mixup_active = config.AUG.MIXUP > 0 or config.AUG.CUTMIX > 0. or config.AUG.CUTMIX_MINMAX is not None + if mixup_active: + mixup_fn = Mixup( + mixup_alpha=config.AUG.MIXUP, cutmix_alpha=config.AUG.CUTMIX, cutmix_minmax=config.AUG.CUTMIX_MINMAX, + prob=config.AUG.MIXUP_PROB, switch_prob=config.AUG.MIXUP_SWITCH_PROB, mode=config.AUG.MIXUP_MODE, + label_smoothing=config.MODEL.LABEL_SMOOTHING, num_classes=config.MODEL.NUM_CLASSES) + + return dataset_train, dataset_val, data_loader_train, data_loader_val, mixup_fn + + +def build_dataset(is_train, config): + if config.DATA.DATASET == 'imagenet': + transform = build_transform(is_train, config) + prefix = 'train' if is_train else 'val' + if config.DATA.ZIP_MODE: + ann_file = prefix + "_map.txt" + prefix = prefix + ".zip@/" + dataset = CachedImageFolder(config.DATA.DATA_PATH, ann_file, prefix, transform, + cache_mode=config.DATA.CACHE_MODE if is_train else 'part') + else: + import torchvision + print('use raw ImageNet data') + #dataset = torchvision.datasets.ImageNet(root=config.DATA.DATA_PATH, split='train' if is_train else 'val', transform=transform) + root = os.path.join(config.DATA.DATA_PATH, prefix) + dataset = datasets.ImageFolder(root, transform=transform) + + nb_classes = 1000 + + elif config.DATA.DATASET == 'cf100': + mean = [0.5070751592371323, 0.48654887331495095, 0.4409178433670343] + std = [0.2673342858792401, 0.2564384629170883, 0.27615047132568404] + if is_train: + transform = transforms.Compose([ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean, std) + ]) + dataset = datasets.CIFAR100(root=config.DATA.DATA_PATH, train=True, download=True, transform=transform) + else: + transform = transforms.Compose( + [transforms.ToTensor(), + transforms.Normalize(mean, std)]) + dataset = datasets.CIFAR100(root=config.DATA.DATA_PATH, train=False, download=True, transform=transform) + nb_classes = 100 + + else: + raise NotImplementedError("We only support ImageNet and CIFAR-100 now.") + + return dataset, nb_classes + + +def build_transform(is_train, config): + resize_im = config.DATA.IMG_SIZE > 32 + if is_train: + # this should always dispatch to transforms_imagenet_train + + if config.AUG.PRESET is None: + transform = create_transform( + input_size=config.DATA.IMG_SIZE, + is_training=True, + color_jitter=config.AUG.COLOR_JITTER if config.AUG.COLOR_JITTER > 0 else None, + auto_augment=config.AUG.AUTO_AUGMENT if config.AUG.AUTO_AUGMENT != 'none' else None, + re_prob=config.AUG.REPROB, + re_mode=config.AUG.REMODE, + re_count=config.AUG.RECOUNT, + interpolation=config.DATA.INTERPOLATION, + ) + print('=============================== original AUG! 
', config.AUG.AUTO_AUGMENT) + if not resize_im: + # replace RandomResizedCropAndInterpolation with + # RandomCrop + transform.transforms[0] = transforms.RandomCrop(config.DATA.IMG_SIZE, padding=4) + + elif config.AUG.PRESET.strip() == 'raug15': + from train.randaug import RandAugPolicy + transform = transforms.Compose([ + transforms.RandomResizedCrop(config.DATA.IMG_SIZE), + transforms.RandomHorizontalFlip(), + RandAugPolicy(magnitude=15), + transforms.ToTensor(), + transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), + ]) + print('---------------------- RAND AUG 15 distortion!') + + elif config.AUG.PRESET.strip() == 'weak': + transform = transforms.Compose([ + transforms.RandomResizedCrop(config.DATA.IMG_SIZE), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), + ]) + elif config.AUG.PRESET.strip() == 'none': + transform = transforms.Compose([ + transforms.Resize(config.DATA.IMG_SIZE, interpolation=_pil_interp(config.DATA.INTERPOLATION)), + transforms.CenterCrop(config.DATA.IMG_SIZE), + transforms.ToTensor(), + transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), + ]) + else: + raise ValueError('???' + config.AUG.PRESET) + print(transform) + return transform + + t = [] + if resize_im: + if config.TEST.CROP: + size = int((256 / 224) * config.DATA.TEST_SIZE) + t.append(transforms.Resize(size, interpolation=_pil_interp(config.DATA.INTERPOLATION)), + # to maintain same ratio w.r.t. 224 images + ) + t.append(transforms.CenterCrop(config.DATA.TEST_SIZE)) + else: + # default for testing + t.append(transforms.Resize(config.DATA.TEST_SIZE, interpolation=_pil_interp(config.DATA.INTERPOLATION))) + t.append(transforms.CenterCrop(config.DATA.TEST_SIZE)) + t.append(transforms.ToTensor()) + t.append(transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)) + trans = transforms.Compose(t) + return trans diff --git a/cv/classification/repvgg/pytorch/data/cached_image_folder.py b/cv/classification/repvgg/pytorch/data/cached_image_folder.py new file mode 100755 index 0000000000000000000000000000000000000000..2f3d013a67f5aac9ca5fac635044ecef8fdbca5a --- /dev/null +++ b/cv/classification/repvgg/pytorch/data/cached_image_folder.py @@ -0,0 +1,244 @@ +import io +import os +import time +import torch.distributed as dist +import torch.utils.data as data +from PIL import Image + +from .zipreader import is_zip_path, ZipReader + + +def has_file_allowed_extension(filename, extensions): + """Checks if a file is an allowed extension. 
+ Args: + filename (string): path to a file + Returns: + bool: True if the filename ends with a known image extension + """ + filename_lower = filename.lower() + return any(filename_lower.endswith(ext) for ext in extensions) + + +def find_classes(dir): + classes = [d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d))] + classes.sort() + class_to_idx = {classes[i]: i for i in range(len(classes))} + return classes, class_to_idx + + +def make_dataset(dir, class_to_idx, extensions): + images = [] + dir = os.path.expanduser(dir) + for target in sorted(os.listdir(dir)): + d = os.path.join(dir, target) + if not os.path.isdir(d): + continue + + for root, _, fnames in sorted(os.walk(d)): + for fname in sorted(fnames): + if has_file_allowed_extension(fname, extensions): + path = os.path.join(root, fname) + item = (path, class_to_idx[target]) + images.append(item) + + return images + + +def make_dataset_with_ann(ann_file, img_prefix, extensions): + images = [] + with open(ann_file, "r") as f: + contents = f.readlines() + for line_str in contents: + path_contents = [c for c in line_str.split('\t')] + im_file_name = path_contents[0] + class_index = int(path_contents[1]) + + assert str.lower(os.path.splitext(im_file_name)[-1]) in extensions + item = (os.path.join(img_prefix, im_file_name), class_index) + + images.append(item) + + return images + + +class DatasetFolder(data.Dataset): + """A generic data loader where the samples are arranged in this way: :: + root/class_x/xxx.ext + root/class_x/xxy.ext + root/class_x/xxz.ext + root/class_y/123.ext + root/class_y/nsdf3.ext + root/class_y/asd932_.ext + Args: + root (string): Root directory path. + loader (callable): A function to load a sample given its path. + extensions (list[string]): A list of allowed extensions. + transform (callable, optional): A function/transform that takes in + a sample and returns a transformed version. + E.g, ``transforms.RandomCrop`` for images. + target_transform (callable, optional): A function/transform that takes + in the target and transforms it. 
+ Attributes: + samples (list): List of (sample path, class_index) tuples + """ + + def __init__(self, root, loader, extensions, ann_file='', img_prefix='', transform=None, target_transform=None, + cache_mode="no"): + # image folder mode + if ann_file == '': + _, class_to_idx = find_classes(root) + samples = make_dataset(root, class_to_idx, extensions) + # zip mode + else: + samples = make_dataset_with_ann(os.path.join(root, ann_file), + os.path.join(root, img_prefix), + extensions) + + if len(samples) == 0: + raise (RuntimeError("Found 0 files in subfolders of: " + root + "\n" + + "Supported extensions are: " + ",".join(extensions))) + + self.root = root + self.loader = loader + self.extensions = extensions + + self.samples = samples + self.labels = [y_1k for _, y_1k in samples] + self.classes = list(set(self.labels)) + + self.transform = transform + self.target_transform = target_transform + + self.cache_mode = cache_mode + if self.cache_mode != "no": + self.init_cache() + + def init_cache(self): + assert self.cache_mode in ["part", "full"] + n_sample = len(self.samples) + global_rank = dist.get_rank() + world_size = dist.get_world_size() + + samples_bytes = [None for _ in range(n_sample)] + start_time = time.time() + for index in range(n_sample): + if index % (n_sample // 10) == 0: + t = time.time() - start_time + print(f'global_rank {dist.get_rank()} cached {index}/{n_sample} takes {t:.2f}s per block') + start_time = time.time() + path, target = self.samples[index] + if self.cache_mode == "full": + samples_bytes[index] = (ZipReader.read(path), target) + elif self.cache_mode == "part" and index % world_size == global_rank: + samples_bytes[index] = (ZipReader.read(path), target) + else: + samples_bytes[index] = (path, target) + self.samples = samples_bytes + + def __getitem__(self, index): + """ + Args: + index (int): Index + Returns: + tuple: (sample, target) where target is class_index of the target class. 
+ """ + path, target = self.samples[index] + sample = self.loader(path) + if self.transform is not None: + sample = self.transform(sample) + if self.target_transform is not None: + target = self.target_transform(target) + + return sample, target + + def __len__(self): + return len(self.samples) + + def __repr__(self): + fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' + fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) + fmt_str += ' Root Location: {}\n'.format(self.root) + tmp = ' Transforms (if any): ' + fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) + tmp = ' Target Transforms (if any): ' + fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) + return fmt_str + + +IMG_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif'] + + +def pil_loader(path): + # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) + if isinstance(path, bytes): + img = Image.open(io.BytesIO(path)) + elif is_zip_path(path): + data = ZipReader.read(path) + img = Image.open(io.BytesIO(data)) + else: + with open(path, 'rb') as f: + img = Image.open(f) + return img.convert('RGB') + + +def accimage_loader(path): + import accimage + try: + return accimage.Image(path) + except IOError: + # Potentially a decoding problem, fall back to PIL.Image + return pil_loader(path) + + +def default_img_loader(path): + from torchvision import get_image_backend + if get_image_backend() == 'accimage': + return accimage_loader(path) + else: + return pil_loader(path) + + +class CachedImageFolder(DatasetFolder): + """A generic data loader where the images are arranged in this way: :: + root/dog/xxx.png + root/dog/xxy.png + root/dog/xxz.png + root/cat/123.png + root/cat/nsdf3.png + root/cat/asd932_.png + Args: + root (string): Root directory path. + transform (callable, optional): A function/transform that takes in an PIL image + and returns a transformed version. E.g, ``transforms.RandomCrop`` + target_transform (callable, optional): A function/transform that takes in the + target and transforms it. + loader (callable, optional): A function to load an image given its path. + Attributes: + imgs (list): List of (image path, class_index) tuples + """ + + def __init__(self, root, ann_file='', img_prefix='', transform=None, target_transform=None, + loader=default_img_loader, cache_mode="no"): + super(CachedImageFolder, self).__init__(root, loader, IMG_EXTENSIONS, + ann_file=ann_file, img_prefix=img_prefix, + transform=transform, target_transform=target_transform, + cache_mode=cache_mode) + self.imgs = self.samples + + def __getitem__(self, index): + """ + Args: + index (int): Index + Returns: + tuple: (image, target) where target is class_index of the target class. 
+ """ + path, target = self.samples[index] + image = self.loader(path) + if self.transform is not None: + img = self.transform(image) + else: + img = image + if self.target_transform is not None: + target = self.target_transform(target) + + return img, target diff --git a/cv/classification/repvgg/pytorch/data/lmdb_dataset.py b/cv/classification/repvgg/pytorch/data/lmdb_dataset.py new file mode 100755 index 0000000000000000000000000000000000000000..640d1824d34be5f1f522918500156fdf498a22df --- /dev/null +++ b/cv/classification/repvgg/pytorch/data/lmdb_dataset.py @@ -0,0 +1,164 @@ +import os +import os.path as osp +from PIL import Image +import six +import lmdb +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) +import pyarrow as pa +import numpy as np +import torch.utils.data as data +from torch.utils.data import DataLoader +from torchvision.datasets import ImageFolder + +train_lmdb_path = '/apdcephfs/share_1290939/0_public_datasets/imageNet_2012/train.lmdb' +val_lmdb_path = '/apdcephfs/share_1290939/0_public_datasets/imageNet_2012/val.lmdb' + +# from data.lmdb_dataset import ImageFolderLMDB, train_lmdb_path, val_lmdb_path +# lmdb_path = train_lmdb_path if is_train else val_lmdb_path +# dataset = ImageFolderLMDB(db_path=lmdb_path, transform=transform) + +def loads_pyarrow(buf): + """ + Args: + buf: the output of `dumps`. + """ + return pa.deserialize(buf) + + +class ImageFolderLMDB(data.Dataset): + def __init__(self, db_path, transform=None, target_transform=None): + self.db_path = db_path + self.env = lmdb.open(db_path, subdir=osp.isdir(db_path), + readonly=True, lock=False, + readahead=False, meminit=False) + with self.env.begin(write=False) as txn: + self.length = loads_pyarrow(txn.get(b'__len__')) + self.keys = loads_pyarrow(txn.get(b'__keys__')) + + self.transform = transform + self.target_transform = target_transform + + def __getstate__(self): + state = self.__dict__ + state["env"] = None + return state + + def __setstate__(self, state): + self.__dict__ = state + self.env = lmdb.open(self.db_path, subdir=osp.isdir(self.db_path), + readonly=True, lock=False, + readahead=False, meminit=False) + with self.env.begin(write=False) as txn: + self.length = loads_pyarrow(txn.get(b'__len__')) + self.keys = loads_pyarrow(txn.get(b'__keys__')) + + def __getitem__(self, index): + env = self.env + with env.begin(write=False) as txn: + byteflow = txn.get(self.keys[index]) + + unpacked = loads_pyarrow(byteflow) + + # load img + imgbuf = unpacked[0] + buf = six.BytesIO() + buf.write(imgbuf) + buf.seek(0) + img = Image.open(buf).convert('RGB') + if self.transform is not None: + img = self.transform(img) + + # load label + target = unpacked[1] + if self.target_transform is not None: + target = self.transform(target) + + return img, target +# if self.transform is not None: +# img = self.transform(img) +# +# # im2arr = np.array(img) +# +# if self.target_transform is not None: +# target = self.target_transform(target) +# +# return img, target + # return im2arr, target + + def __len__(self): + return self.length + + def __repr__(self): + return self.__class__.__name__ + ' (' + self.db_path + ')' + + +def raw_reader(path): + with open(path, 'rb') as f: + bin_data = f.read() + return bin_data + + +def dumps_pyarrow(obj): + """ + Serialize an object. 
+ Returns: + Implementation-dependent bytes-like object + """ + return pa.serialize(obj).to_buffer() + + +def folder2lmdb(dpath, name="train", write_frequency=5000): + directory = osp.expanduser(osp.join(dpath, name)) + print("Loading dataset from %s" % directory) + dataset = ImageFolder(directory, loader=raw_reader) + data_loader = DataLoader(dataset, num_workers=4, collate_fn=lambda x: x) + + lmdb_path = osp.join(dpath, "%s.lmdb" % name) + isdir = os.path.isdir(lmdb_path) + + print("Generate LMDB to %s" % lmdb_path) + db = lmdb.open(lmdb_path, subdir=isdir, + map_size=1099511627776 * 2, readonly=False, + meminit=False, map_async=True) + + txn = db.begin(write=True) + for idx, data in enumerate(data_loader): + image, label = data[0] + + txn.put(u'{}'.format(idx).encode('ascii'), dumps_pyarrow((image, label))) + if idx % write_frequency == 0: + print("[%d/%d]" % (idx, len(data_loader))) + txn.commit() + txn = db.begin(write=True) + + # finish iterating through dataset + txn.commit() + keys = [u'{}'.format(k).encode('ascii') for k in range(idx + 1)] + with db.begin(write=True) as txn: + txn.put(b'__keys__', dumps_pyarrow(keys)) + txn.put(b'__len__', dumps_pyarrow(len(keys))) + + print("Flushing database ...") + db.sync() + db.close() + + + + +if __name__ == "__main__": + # lmdb_path = '/apdcephfs/share_1016399/0_public_datasets/imageNet_2012/train.lmdb' + # from lmdb_dataset import ImageFolderLMDB + # dataset = ImageFolderLMDB(db_path=lmdb_path) + # for x, y in dataset: + # print(type(x), type(y)) + # exit() + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--dir', type=str, required=True, help="The dataset directory to process") + args = parser.parse_args() + # generate lmdb + path = args.dir + folder2lmdb(path, name="train") + folder2lmdb(path, name="val") diff --git a/cv/classification/repvgg/pytorch/data/samplers.py b/cv/classification/repvgg/pytorch/data/samplers.py new file mode 100755 index 0000000000000000000000000000000000000000..fed54b4e27b2df268670ace4cda64687209a5380 --- /dev/null +++ b/cv/classification/repvgg/pytorch/data/samplers.py @@ -0,0 +1,21 @@ +import torch + +class SubsetRandomSampler(torch.utils.data.Sampler): + r"""Samples elements randomly from a given list of indices, without replacement. 
+ + Arguments: + indices (sequence): a sequence of indices + """ + + def __init__(self, indices): + self.epoch = 0 + self.indices = indices + + def __iter__(self): + return (self.indices[i] for i in torch.randperm(len(self.indices))) + + def __len__(self): + return len(self.indices) + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/cv/classification/repvgg/pytorch/data/zipreader.py b/cv/classification/repvgg/pytorch/data/zipreader.py new file mode 100755 index 0000000000000000000000000000000000000000..9d773c3c4e91eae4435faf9d9b297a2d3c21a3d2 --- /dev/null +++ b/cv/classification/repvgg/pytorch/data/zipreader.py @@ -0,0 +1,96 @@ +import os +import zipfile +import io +import numpy as np +from PIL import Image +from PIL import ImageFile + +ImageFile.LOAD_TRUNCATED_IMAGES = True + + +def is_zip_path(img_or_path): + """judge if this is a zip path""" + return '.zip@' in img_or_path + + +class ZipReader(object): + """A class to read zipped files""" + zip_bank = dict() + + def __init__(self): + super(ZipReader, self).__init__() + + @staticmethod + def get_zipfile(path): + zip_bank = ZipReader.zip_bank + if path not in zip_bank: + zfile = zipfile.ZipFile(path, 'r') + zip_bank[path] = zfile + return zip_bank[path] + + @staticmethod + def split_zip_style_path(path): + pos_at = path.index('@') + assert pos_at != -1, "character '@' is not found from the given path '%s'" % path + + zip_path = path[0: pos_at] + folder_path = path[pos_at + 1:] + folder_path = str.strip(folder_path, '/') + return zip_path, folder_path + + @staticmethod + def list_folder(path): + zip_path, folder_path = ZipReader.split_zip_style_path(path) + + zfile = ZipReader.get_zipfile(zip_path) + folder_list = [] + for file_foler_name in zfile.namelist(): + file_foler_name = str.strip(file_foler_name, '/') + if file_foler_name.startswith(folder_path) and \ + len(os.path.splitext(file_foler_name)[-1]) == 0 and \ + file_foler_name != folder_path: + if len(folder_path) == 0: + folder_list.append(file_foler_name) + else: + folder_list.append(file_foler_name[len(folder_path) + 1:]) + + return folder_list + + @staticmethod + def list_files(path, extension=None): + if extension is None: + extension = ['.*'] + zip_path, folder_path = ZipReader.split_zip_style_path(path) + + zfile = ZipReader.get_zipfile(zip_path) + file_lists = [] + for file_foler_name in zfile.namelist(): + file_foler_name = str.strip(file_foler_name, '/') + if file_foler_name.startswith(folder_path) and \ + str.lower(os.path.splitext(file_foler_name)[-1]) in extension: + if len(folder_path) == 0: + file_lists.append(file_foler_name) + else: + file_lists.append(file_foler_name[len(folder_path) + 1:]) + + return file_lists + + @staticmethod + def read(path): + zip_path, path_img = ZipReader.split_zip_style_path(path) + zfile = ZipReader.get_zipfile(zip_path) + data = zfile.read(path_img) + return data + + @staticmethod + def imread(path): + zip_path, path_img = ZipReader.split_zip_style_path(path) + zfile = ZipReader.get_zipfile(zip_path) + data = zfile.read(path_img) + try: + im = Image.open(io.BytesIO(data)) + except: + print("ERROR IMG LOADED: ", path_img) + random_img = np.random.rand(224, 224, 3) * 255 + im = Image.fromarray(np.uint8(random_img)) + return im diff --git a/cv/classification/repvgg/pytorch/example_pspnet.py b/cv/classification/repvgg/pytorch/example_pspnet.py new file mode 100755 index 0000000000000000000000000000000000000000..19ff9abdce2ac9b80d637c5033fb568351564796 --- /dev/null +++ b/cv/classification/repvgg/pytorch/example_pspnet.py @@ 
-0,0 +1,161 @@ +import torch +from torch import nn +import torch.nn.functional as F +from repvgg import get_RepVGG_func_by_name + +# The PSPNet parts are from +# https://github.com/hszhao/semseg + +class PPM(nn.Module): + def __init__(self, in_dim, reduction_dim, bins, BatchNorm): + super(PPM, self).__init__() + self.features = [] + for bin in bins: + self.features.append(nn.Sequential( + nn.AdaptiveAvgPool2d(bin), + nn.Conv2d(in_dim, reduction_dim, kernel_size=1, bias=False), + BatchNorm(reduction_dim), + nn.ReLU(inplace=True) + )) + self.features = nn.ModuleList(self.features) + + def forward(self, x): + x_size = x.size() + out = [x] + for f in self.features: + out.append(F.interpolate(f(x), x_size[2:], mode='bilinear', align_corners=True)) + return torch.cat(out, 1) + + +class PSPNet(nn.Module): + def __init__(self, + backbone_name, backbone_file, deploy, + bins=(1, 2, 3, 6), dropout=0.1, classes=2, + zoom_factor=8, use_ppm=True, criterion=nn.CrossEntropyLoss(ignore_index=255), BatchNorm=nn.BatchNorm2d, + pretrained=True): + super(PSPNet, self).__init__() + assert 2048 % len(bins) == 0 + assert classes > 1 + assert zoom_factor in [1, 2, 4, 8] + self.zoom_factor = zoom_factor + self.use_ppm = use_ppm + self.criterion = criterion + + repvgg_fn = get_RepVGG_func_by_name(backbone_name) + backbone = repvgg_fn(deploy) + if pretrained: + checkpoint = torch.load(backbone_file) + if 'state_dict' in checkpoint: + checkpoint = checkpoint['state_dict'] + ckpt = {k.replace('module.', ''): v for k, v in checkpoint.items()} # strip the names + backbone.load_state_dict(ckpt) + + self.layer0, self.layer1, self.layer2, self.layer3, self.layer4 = backbone.stage0, backbone.stage1, backbone.stage2, backbone.stage3, backbone.stage4 + + # The last two stages should have stride=1 for semantic segmentation + # Note that the stride of 1x1 should be the same as the 3x3 + # Use dilation following the implementation of PSPNet + secondlast_channel = 0 + for n, m in self.layer3.named_modules(): + if ('rbr_dense' in n or 'rbr_reparam' in n) and isinstance(m, nn.Conv2d): + m.dilation, m.padding, m.stride = (2, 2), (2, 2), (1, 1) + print('change dilation, padding, stride of ', n) + secondlast_channel = m.out_channels + elif 'rbr_1x1' in n and isinstance(m, nn.Conv2d): + m.stride = (1, 1) + print('change stride of ', n) + last_channel = 0 + for n, m in self.layer4.named_modules(): + if ('rbr_dense' in n or 'rbr_reparam' in n) and isinstance(m, nn.Conv2d): + m.dilation, m.padding, m.stride = (4, 4), (4, 4), (1, 1) + print('change dilation, padding, stride of ', n) + last_channel = m.out_channels + elif 'rbr_1x1' in n and isinstance(m, nn.Conv2d): + m.stride = (1, 1) + print('change stride of ', n) + + fea_dim = last_channel + aux_in = secondlast_channel + + if use_ppm: + self.ppm = PPM(fea_dim, int(fea_dim/len(bins)), bins, BatchNorm) + fea_dim *= 2 + + self.cls = nn.Sequential( + nn.Conv2d(fea_dim, 512, kernel_size=3, padding=1, bias=False), + BatchNorm(512), + nn.ReLU(inplace=True), + nn.Dropout2d(p=dropout), + nn.Conv2d(512, classes, kernel_size=1) + ) + if self.training: + self.aux = nn.Sequential( + nn.Conv2d(aux_in, 256, kernel_size=3, padding=1, bias=False), + BatchNorm(256), + nn.ReLU(inplace=True), + nn.Dropout2d(p=dropout), + nn.Conv2d(256, classes, kernel_size=1) + ) + + def forward(self, x, y=None): + x_size = x.size() + assert (x_size[2]-1) % 8 == 0 and (x_size[3]-1) % 8 == 0 + h = int((x_size[2] - 1) / 8 * self.zoom_factor + 1) + w = int((x_size[3] - 1) / 8 * self.zoom_factor + 1) + + x = self.layer0(x) + x 
= self.layer1(x) + x = self.layer2(x) + x_tmp = self.layer3(x) + x = self.layer4(x_tmp) + + if self.use_ppm: + x = self.ppm(x) + x = self.cls(x) + if self.zoom_factor != 1: + x = F.interpolate(x, size=(h, w), mode='bilinear', align_corners=True) + + if self.training: + aux = self.aux(x_tmp) + if self.zoom_factor != 1: + aux = F.interpolate(aux, size=(h, w), mode='bilinear', align_corners=True) + main_loss = self.criterion(x, y) + aux_loss = self.criterion(aux, y) + return x.max(1)[1], main_loss, aux_loss + else: + return x + + +if __name__ == '__main__': + # 1. Build the PSPNet with RepVGG backbone. Download the ImageNet-pretrained weight file and load it. + model = PSPNet(backbone_name='RepVGG-A0', backbone_file='RepVGG-A0-train.pth', deploy=False, classes=19, pretrained=True) + + # 2. Train it + # seg_train(model) + + # 3. Convert and check the equivalence + input = torch.rand(4, 3, 713, 713) + model.eval() + print(model) + y_train = model(input) + for module in model.modules(): + if hasattr(module, 'switch_to_deploy'): + module.switch_to_deploy() + y_deploy = model(input) + print('output is ', y_deploy.size()) + print('=================== The diff is') + print(((y_deploy - y_train) ** 2).sum()) + + # 4. Save the converted model + torch.save(model.state_dict(), 'PSPNet-RepVGG-A0-deploy.pth') + del model # Or do whatever you want with it + + # 5. For inference, load the saved model. There is no need to load the ImageNet-pretrained weights again. + deploy_model = PSPNet(backbone_name='RepVGG-A0', backbone_file=None, deploy=True, classes=19, pretrained=False) + deploy_model.eval() + deploy_model.load_state_dict(torch.load('PSPNet-RepVGG-A0-deploy.pth')) + + # 6. Check again or do whatever you want + y_deploy = deploy_model(input) + print('=================== The diff is') + print(((y_deploy - y_train) ** 2).sum()) \ No newline at end of file diff --git a/cv/classification/repvgg/pytorch/jizhi_submit_train_repvgg.py b/cv/classification/repvgg/pytorch/jizhi_submit_train_repvgg.py new file mode 100755 index 0000000000000000000000000000000000000000..8debce4d26aada27fcad1d47148c04510499636f --- /dev/null +++ b/cv/classification/repvgg/pytorch/jizhi_submit_train_repvgg.py @@ -0,0 +1,34 @@ +import argparse +import datetime +import os +import json + +parser = argparse.ArgumentParser('JIZHI submit', add_help=False) +parser.add_argument('arch', default=None, type=str) +parser.add_argument('tag', default=None, type=str) +parser.add_argument('--config', default='/apdcephfs_cq2/share_1290939/xiaohanding/cnt/default_V100x8_elastic_config.json', type=str, + help='config file') + + +args = parser.parse_args() +run_dir = f'{args.arch}_{args.tag}' + +cmd = f'python3 -m torch.distributed.launch --nproc_per_node 8 --master_port 12349 main.py ' \ + f'--arch {args.arch} --batch-size 32 --tag {args.tag} --output-dir /apdcephfs_cq2/share_1290939/xiaohanding/swin_exps/{args.arch}_{args.tag} --opts TRAIN.EPOCHS 120 TRAIN.BASE_LR 0.1 TRAIN.WEIGHT_DECAY 4e-5 TRAIN.WARMUP_EPOCHS 5 MODEL.LABEL_SMOOTHING 0.1 AUG.PRESET raug15 DATA.DATASET imagenet' + +os.system('cd /apdcephfs_cq2/share_1290939/xiaohanding/RepVGG/') +os.system(f'mkdir runs/{run_dir}') +with open(f'runs/{run_dir}/start.sh', 'w') as f: + f.write(cmd) +with open(args.config, 'r') as f: + json_content = json.load(f) +json_content['model_local_file_path'] = f'/apdcephfs_cq2/share_1290939/xiaohanding/RepVGG/runs/{run_dir}' +config_file_path = f'/apdcephfs_cq2/share_1290939/xiaohanding/RepVGG/runs/{run_dir}/config.json' +with open(config_file_path, 'w') as f: + 
json.dump(json_content, f) + +os.system(f'cp *.py runs/{run_dir}/') +os.system(f'cp -r data runs/{run_dir}/') +os.system(f'cp -r train runs/{run_dir}/') +os.system(f'cd runs/{run_dir}') +os.system(f'jizhi_client start -scfg {config_file_path}') \ No newline at end of file diff --git a/cv/classification/repvgg/pytorch/main.py b/cv/classification/repvgg/pytorch/main.py new file mode 100755 index 0000000000000000000000000000000000000000..c721c14bd81484896b0a4f4f72558af96af29790 --- /dev/null +++ b/cv/classification/repvgg/pytorch/main.py @@ -0,0 +1,414 @@ +# -------------------------------------------------------- +# RepVGG: Making VGG-style ConvNets Great Again (https://openaccess.thecvf.com/content/CVPR2021/papers/Ding_RepVGG_Making_VGG-Style_ConvNets_Great_Again_CVPR_2021_paper.pdf) +# Github source: https://github.com/DingXiaoH/RepVGG +# Licensed under The MIT License [see LICENSE for details] +# The training script is based on the code of Swin Transformer (https://github.com/microsoft/Swin-Transformer) +# -------------------------------------------------------- +import time +import argparse +import datetime +import numpy as np +import torch +import torch.backends.cudnn as cudnn +import torch.distributed as dist +from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy +from timm.utils import accuracy, AverageMeter +from train.config import get_config +from data import build_loader +from train.lr_scheduler import build_scheduler +from train.logger import create_logger +from utils import load_checkpoint, save_checkpoint, get_grad_norm, auto_resume_helper, reduce_tensor, save_latest, update_model_ema, unwrap_model +import copy +from train.optimizer import build_optimizer +from repvggplus import create_RepVGGplus_by_name + +try: + # noinspection PyUnresolvedReferences + from apex import amp +except ImportError: + amp = None + +def parse_option(): + parser = argparse.ArgumentParser('RepOpt-VGG training script built on the codebase of Swin Transformer', add_help=False) + parser.add_argument( + "--opts", + help="Modify config options by adding 'KEY VALUE' pairs. 
", + default=None, + nargs='+', + ) + + # easy config modification + parser.add_argument('--arch', default=None, type=str, help='arch name') + parser.add_argument('--batch-size', default=128, type=int, help="batch size for single GPU") + parser.add_argument('--data-path', default='/your/path/to/dataset', type=str, help='path to dataset') + parser.add_argument('--scales-path', default=None, type=str, help='path to the trained Hyper-Search model') + parser.add_argument('--zip', action='store_true', help='use zipped dataset instead of folder dataset') + parser.add_argument('--cache-mode', type=str, default='part', choices=['no', 'full', 'part'], + help='no: no cache, ' + 'full: cache all data, ' + 'part: sharding the dataset into nonoverlapping pieces and only cache one piece') + parser.add_argument('--resume', help='resume from checkpoint') + parser.add_argument('--accumulation-steps', type=int, help="gradient accumulation steps") + parser.add_argument('--use-checkpoint', action='store_true', + help="whether to use gradient checkpointing to save memory") + parser.add_argument('--amp-opt-level', type=str, default='O0', choices=['O0', 'O1', 'O2'], #TODO Note: use amp if you have it + help='mixed precision opt level, if O0, no amp is used') + parser.add_argument('--output', default='/your/path/to/save/dir', type=str, metavar='PATH', + help='root of output folder, the full path is // (default: output)') + parser.add_argument('--tag', help='tag of experiment') + parser.add_argument('--eval', action='store_true', help='Perform evaluation only') + parser.add_argument('--throughput', action='store_true', help='Test throughput only') + + # distributed training + parser.add_argument("--local_rank", type=int, default=0, help='local rank for DistributedDataParallel') + + args, unparsed = parser.parse_known_args() + + config = get_config(args) + + return args, config + + + + + +def main(config): + dataset_train, dataset_val, data_loader_train, data_loader_val, mixup_fn = build_loader(config) + + logger.info(f"Creating model:{config.MODEL.ARCH}") + + model = create_RepVGGplus_by_name(config.MODEL.ARCH, deploy=False, use_checkpoint=args.use_checkpoint) + optimizer = build_optimizer(config, model) + + logger.info(str(model)) + model.cuda() + + if torch.cuda.device_count() > 1: + if config.AMP_OPT_LEVEL != "O0": + model, optimizer = amp.initialize(model, optimizer, opt_level=config.AMP_OPT_LEVEL) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.LOCAL_RANK], + broadcast_buffers=False) + model_without_ddp = model.module + else: + if config.AMP_OPT_LEVEL != "O0": + model, optimizer = amp.initialize(model, optimizer, opt_level=config.AMP_OPT_LEVEL) + model_without_ddp = model + + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + logger.info(f"number of params: {n_parameters}") + if hasattr(model_without_ddp, 'flops'): + flops = model_without_ddp.flops() + logger.info(f"number of GFLOPs: {flops / 1e9}") + + if config.THROUGHPUT_MODE: + throughput(data_loader_val, model, logger) + return + + if config.EVAL_MODE: + load_weights(model, config.MODEL.RESUME) + acc1, acc5, loss = validate(config, data_loader_val, model) + logger.info(f"Only eval. 
top-1 acc, top-5 acc, loss: {acc1:.3f}, {acc5:.3f}, {loss:.5f}") + return + + lr_scheduler = build_scheduler(config, optimizer, len(data_loader_train)) + + if config.AUG.MIXUP > 0.: + # smoothing is handled with mixup label transform + criterion = SoftTargetCrossEntropy() + elif config.MODEL.LABEL_SMOOTHING > 0.: + criterion = LabelSmoothingCrossEntropy(smoothing=config.MODEL.LABEL_SMOOTHING) + else: + criterion = torch.nn.CrossEntropyLoss() + + max_accuracy = 0.0 + max_ema_accuracy = 0.0 + + if config.TRAIN.EMA_ALPHA > 0 and (not config.EVAL_MODE) and (not config.THROUGHPUT_MODE): + model_ema = copy.deepcopy(model) + else: + model_ema = None + + if config.TRAIN.AUTO_RESUME: + resume_file = auto_resume_helper(config.OUTPUT) + if resume_file: + if config.MODEL.RESUME: + logger.warning(f"auto-resume changing resume file from {config.MODEL.RESUME} to {resume_file}") + config.defrost() + config.MODEL.RESUME = resume_file + config.freeze() + logger.info(f'auto resuming from {resume_file}') + else: + logger.info(f'no checkpoint found in {config.OUTPUT}, ignoring auto resume') + + if (not config.THROUGHPUT_MODE) and config.MODEL.RESUME: + max_accuracy = load_checkpoint(config, model_without_ddp, optimizer, lr_scheduler, logger, model_ema=model_ema) + + + logger.info("Start training") + start_time = time.time() + for epoch in range(config.TRAIN.START_EPOCH, config.TRAIN.EPOCHS): + data_loader_train.sampler.set_epoch(epoch) + + train_one_epoch(config, model, criterion, data_loader_train, optimizer, epoch, mixup_fn, lr_scheduler, model_ema=model_ema) + if dist.get_rank() == 0: + save_latest(config, epoch, model_without_ddp, max_accuracy, optimizer, lr_scheduler, logger, model_ema=model_ema) + if epoch % config.SAVE_FREQ == 0: + save_checkpoint(config, epoch, model_without_ddp, max_accuracy, optimizer, lr_scheduler, logger, model_ema=model_ema) + + if epoch % config.SAVE_FREQ == 0 or epoch >= (config.TRAIN.EPOCHS - 10): + + if data_loader_val is not None: + acc1, acc5, loss = validate(config, data_loader_val, model) + logger.info(f"Accuracy of the network at epoch {epoch}: {acc1:.3f}%") + max_accuracy = max(max_accuracy, acc1) + logger.info(f'Max accuracy: {max_accuracy:.2f}%') + if max_accuracy == acc1 and dist.get_rank() == 0: + save_checkpoint(config, epoch, model_without_ddp, max_accuracy, optimizer, lr_scheduler, logger, + is_best=True, model_ema=model_ema) + + if model_ema is not None: + if data_loader_val is not None: + acc1, acc5, loss = validate(config, data_loader_val, model_ema) + logger.info(f"EMAAccuracy of the network at epoch {epoch} test images: {acc1:.3f}%") + max_ema_accuracy = max(max_ema_accuracy, acc1) + logger.info(f'EMAMax accuracy: {max_ema_accuracy:.2f}%') + if max_ema_accuracy == acc1 and dist.get_rank() == 0: + best_ema_path = os.path.join(config.OUTPUT, 'best_ema.pth') + logger.info(f"{best_ema_path} best EMA saving......") + torch.save(unwrap_model(model_ema).state_dict(), best_ema_path) + else: + latest_ema_path = os.path.join(config.OUTPUT, 'latest_ema.pth') + logger.info(f"{latest_ema_path} latest EMA saving......") + torch.save(unwrap_model(model_ema).state_dict(), latest_ema_path) + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + logger.info('Training time {}'.format(total_time_str)) + + +def train_one_epoch(config, model, criterion, data_loader, optimizer, epoch, mixup_fn, lr_scheduler, model_ema=None): + model.train() + optimizer.zero_grad() + + num_steps = len(data_loader) + batch_time = AverageMeter() 
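+    # running meters (timm AverageMeter: latest value plus running average) for per-iteration
+    # time, loss and gradient norm, reported every PRINT_FREQ iterations below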
+ loss_meter = AverageMeter() + norm_meter = AverageMeter() + + start = time.time() + end = time.time() + for idx, (samples, targets) in enumerate(data_loader): + samples = samples.cuda(non_blocking=True) + targets = targets.cuda(non_blocking=True) + + if mixup_fn is not None: + samples, targets = mixup_fn(samples, targets) + + outputs = model(samples) + + if type(outputs) is dict: + loss = 0.0 + for name, pred in outputs.items(): + if 'aux' in name: + loss += 0.1 * criterion(pred, targets) + else: + loss += criterion(pred, targets) + else: + loss = criterion(outputs, targets) + + if config.TRAIN.ACCUMULATION_STEPS > 1: + + loss = loss / config.TRAIN.ACCUMULATION_STEPS + if config.AMP_OPT_LEVEL != "O0": + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + if config.TRAIN.CLIP_GRAD: + grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.TRAIN.CLIP_GRAD) + else: + grad_norm = get_grad_norm(amp.master_params(optimizer)) + else: + loss.backward() + if config.TRAIN.CLIP_GRAD: + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.TRAIN.CLIP_GRAD) + else: + grad_norm = get_grad_norm(model.parameters()) + if (idx + 1) % config.TRAIN.ACCUMULATION_STEPS == 0: + optimizer.step() + optimizer.zero_grad() + lr_scheduler.step_update(epoch * num_steps + idx) + + else: + + optimizer.zero_grad() + if config.AMP_OPT_LEVEL != "O0": + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + if config.TRAIN.CLIP_GRAD: + grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.TRAIN.CLIP_GRAD) + else: + grad_norm = get_grad_norm(amp.master_params(optimizer)) + else: + loss.backward() + if config.TRAIN.CLIP_GRAD: + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.TRAIN.CLIP_GRAD) + else: + grad_norm = get_grad_norm(model.parameters()) + optimizer.step() + lr_scheduler.step_update(epoch * num_steps + idx) + + torch.cuda.synchronize() + + loss_meter.update(loss.item(), targets.size(0)) + norm_meter.update(grad_norm) + batch_time.update(time.time() - end) + + if model_ema is not None: + update_model_ema(config, dist.get_world_size(), model=model, model_ema=model_ema, cur_epoch=epoch, cur_iter=idx) + + end = time.time() + + if idx % config.PRINT_FREQ == 0: + lr = optimizer.param_groups[0]['lr'] + memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0) + etas = batch_time.avg * (num_steps - idx) + logger.info( + f'Train: [{epoch}/{config.TRAIN.EPOCHS}][{idx}/{num_steps}]\t' + f'eta {datetime.timedelta(seconds=int(etas))} lr {lr:.6f}\t' + f'time {batch_time.val:.4f} ({batch_time.avg:.4f})\t' + f'loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t' + f'grad_norm {norm_meter.val:.4f} ({norm_meter.avg:.4f})\t' + f'mem {memory_used:.0f}MB') + epoch_time = time.time() - start + logger.info(f"EPOCH {epoch} training takes {datetime.timedelta(seconds=int(epoch_time))}") + + +@torch.no_grad() +def validate(config, data_loader, model): + criterion = torch.nn.CrossEntropyLoss() + model.eval() + + batch_time = AverageMeter() + loss_meter = AverageMeter() + acc1_meter = AverageMeter() + acc5_meter = AverageMeter() + + end = time.time() + for idx, (images, target) in enumerate(data_loader): + images = images.cuda(non_blocking=True) + target = target.cuda(non_blocking=True) + + # compute output + output = model(images) + + # =============================== deepsup part + if type(output) is dict: + output = output['main'] + + # measure accuracy and record loss + loss = 
criterion(output, target) + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + + acc1 = reduce_tensor(acc1) + acc5 = reduce_tensor(acc5) + loss = reduce_tensor(loss) + + loss_meter.update(loss.item(), target.size(0)) + acc1_meter.update(acc1.item(), target.size(0)) + acc5_meter.update(acc5.item(), target.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if idx % config.PRINT_FREQ == 0: + memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0) + logger.info( + f'Test: [{idx}/{len(data_loader)}]\t' + f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + f'Loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t' + f'Acc@1 {acc1_meter.val:.3f} ({acc1_meter.avg:.3f})\t' + f'Acc@5 {acc5_meter.val:.3f} ({acc5_meter.avg:.3f})\t' + f'Mem {memory_used:.0f}MB') + logger.info(f' * Acc@1 {acc1_meter.avg:.3f} Acc@5 {acc5_meter.avg:.3f}') + return acc1_meter.avg, acc5_meter.avg, loss_meter.avg + + +@torch.no_grad() +def throughput(data_loader, model, logger): + model.eval() + + for idx, (images, _) in enumerate(data_loader): + images = images.cuda(non_blocking=True) + + batch_size = images.shape[0] + for i in range(50): + model(images) + torch.cuda.synchronize() + logger.info(f"throughput averaged with 30 times") + tic1 = time.time() + for i in range(30): + model(images) + torch.cuda.synchronize() + tic2 = time.time() + throughput = 30 * batch_size / (tic2 - tic1) + logger.info(f"batch_size {batch_size} throughput {throughput}") + return + + +import os + +if __name__ == '__main__': + args, config = parse_option() + + if config.AMP_OPT_LEVEL != "O0": + assert amp is not None, "amp not installed!" + + if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: + rank = int(os.environ["RANK"]) + world_size = int(os.environ['WORLD_SIZE']) + else: + rank = -1 + world_size = -1 + torch.cuda.set_device(config.LOCAL_RANK) + torch.distributed.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank) + torch.distributed.barrier() + seed = config.SEED + dist.get_rank() + + torch.manual_seed(seed) + np.random.seed(seed) + cudnn.benchmark = True + + if not config.EVAL_MODE: + # linear scale the learning rate according to total batch size, may not be optimal + linear_scaled_lr = config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 256.0 + linear_scaled_warmup_lr = config.TRAIN.WARMUP_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 256.0 + linear_scaled_min_lr = config.TRAIN.MIN_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 256.0 + # gradient accumulation also need to scale the learning rate + if config.TRAIN.ACCUMULATION_STEPS > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUMULATION_STEPS + linear_scaled_warmup_lr = linear_scaled_warmup_lr * config.TRAIN.ACCUMULATION_STEPS + linear_scaled_min_lr = linear_scaled_min_lr * config.TRAIN.ACCUMULATION_STEPS + config.defrost() + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_LR = linear_scaled_warmup_lr + config.TRAIN.MIN_LR = linear_scaled_min_lr + config.freeze() + + print('==========================================') + print('real base lr: ', config.TRAIN.BASE_LR) + print('==========================================') + + os.makedirs(config.OUTPUT, exist_ok=True) + + logger = create_logger(output_dir=config.OUTPUT, dist_rank=0 if torch.cuda.device_count() == 1 else dist.get_rank(), name=f"{config.MODEL.ARCH}") + + if torch.cuda.device_count() == 1 or dist.get_rank() == 0: + path = os.path.join(config.OUTPUT, "config.json") + 
with open(path, "w") as f: + f.write(config.dump()) + logger.info(f"Full config saved to {path}") + + # print config + logger.info(config.dump()) + + main(config) diff --git a/cv/classification/repvgg/pytorch/quantization/quant_qat_train.py b/cv/classification/repvgg/pytorch/quantization/quant_qat_train.py new file mode 100755 index 0000000000000000000000000000000000000000..80e1dcd06bcf20c839b98808d3da54d9cb9616cd --- /dev/null +++ b/cv/classification/repvgg/pytorch/quantization/quant_qat_train.py @@ -0,0 +1,426 @@ +import argparse +import random +import shutil +import time +import warnings +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.optim +import torch.multiprocessing as mp +import torch.utils.data +import torch.utils.data.distributed +from utils import * +import torchvision.transforms as transforms +import PIL + +best_acc1 = 0 + +IMAGENET_TRAINSET_SIZE = 1281167 + +parser = argparse.ArgumentParser(description='PyTorch Whole Model Quant') +parser.add_argument('data', metavar='DIR', + help='path to dataset') +parser.add_argument('-a', '--arch', metavar='ARCH', default='RepVGG-A0') +parser.add_argument('-j', '--workers', default=8, type=int, metavar='N', + help='number of data loading workers (default: 4)') +parser.add_argument('--epochs', default=8, type=int, metavar='N', + help='number of epochs for each run') +parser.add_argument('--start-epoch', default=0, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch-size', default=256, type=int, + metavar='N', + help='mini-batch size (default: 256), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('--val-batch-size', default=100, type=int, metavar='V', + help='validation batch size') +parser.add_argument('--lr', '--learning-rate', default=1e-4, type=float, + metavar='LR', help='learning rate for finetuning', dest='lr') +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') +parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + dest='weight_decay') +parser.add_argument('-p', '--print-freq', default=10, type=int, + metavar='N', help='print frequency (default: 10)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', + help='evaluate model on validation set') +parser.add_argument('--world-size', default=-1, type=int, + help='number of nodes for distributed training') +parser.add_argument('--rank', default=-1, type=int, + help='node rank for distributed training') +parser.add_argument('--dist-url', default='tcp://127.0.0.1:23333', type=str, + help='url used to set up distributed training') +parser.add_argument('--dist-backend', default='nccl', type=str, + help='distributed backend') +parser.add_argument('--seed', default=None, type=int, + help='seed for initializing training. ') +parser.add_argument('--gpu', default=None, type=int, + help='GPU id to use.') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. 
This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') +parser.add_argument('--base-weights', default=None, type=str, + help='weights of the base model.') +parser.add_argument('--tag', default='testtest', type=str, + help='the tag for identifying the log and model files. Just a string.') +parser.add_argument('--fpfinetune', dest='fpfinetune', action='store_true', + help='full precision finetune') +parser.add_argument('--fixobserver', dest='fixobserver', action='store_true', + help='fix observer?') +parser.add_argument('--fixbn', dest='fixbn', action='store_true', + help='fix bn?') +parser.add_argument('--quantlayers', default='all', type=str, choices=['all', 'exclud_first_and_linear', 'exclud_first_and_last'], + help='the tag for identifying the log and model files. Just a string.') + + + +def sgd_optimizer(model, lr, momentum, weight_decay): + params = [] + for key, value in model.named_parameters(): + if not value.requires_grad: + continue + apply_weight_decay = weight_decay + apply_lr = lr + if value.ndimension() < 2: #TODO note this + apply_weight_decay = 0 + print('set weight decay=0 for {}'.format(key)) + if 'bias' in key: + apply_lr = 2 * lr # Just a Caffe-style common practice. Made no difference. + params += [{'params': [value], 'lr': apply_lr, 'weight_decay': apply_weight_decay}] + optimizer = torch.optim.SGD(params, lr, momentum=momentum) + return optimizer + +def main(): + args = parser.parse_args() + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + cudnn.deterministic = True + warnings.warn('You have chosen to seed training. ' + 'This will turn on the CUDNN deterministic setting, ' + 'which can slow down your training considerably! ' + 'You may see unexpected behavior when restarting ' + 'from checkpoints.') + + if args.gpu is not None: + warnings.warn('You have chosen a specific GPU. 
This will completely ' + 'disable data parallelism.') + + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + ngpus_per_node = torch.cuda.device_count() + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + else: + # Simply call main_worker function + main_worker(args.gpu, ngpus_per_node, args) + + + + +def get_default_train_trans(args): + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + if (not hasattr(args, 'resolution')) or args.resolution == 224: + trans = transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize]) + else: + raise ValueError('Not yet implemented.') + return trans + + +def get_default_val_trans(args): + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + if (not hasattr(args, 'resolution')) or args.resolution == 224: + trans = transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize]) + else: + trans = transforms.Compose([ + transforms.Resize(args.resolution, interpolation=PIL.Image.BILINEAR), + transforms.CenterCrop(args.resolution), + transforms.ToTensor(), + normalize, + ]) + return trans + +def main_worker(gpu, ngpus_per_node, args): + global best_acc1 + args.gpu = gpu + log_file = 'quant_{}_exp.txt'.format(args.tag) + + if args.gpu is not None: + print("Use GPU: {} for training".format(args.gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + gpu + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + + # 1. Build and load base model + from repvgg import get_RepVGG_func_by_name + repvgg_build_func = get_RepVGG_func_by_name(args.arch) + base_model = repvgg_build_func(deploy=True) + from tools.insert_bn import directly_insert_bn_without_init + directly_insert_bn_without_init(base_model) + if args.base_weights is not None: + load_checkpoint(base_model, args.base_weights) + + # 2. 
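+    # Build the model to finetune: wrap the re-parameterized base model for quantization-aware
+    # training (QAT) unless --fpfinetune requests a plain full-precision finetune.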
+ if not args.fpfinetune: + from quantization.repvgg_quantized import RepVGGWholeQuant + qat_model = RepVGGWholeQuant(repvgg_model=base_model, quantlayers=args.quantlayers) + qat_model.prepare_quant() + else: + qat_model = base_model + log_msg('===================== not QAT, just full-precision finetune ===========', log_file) + + #=================================================== + # From now on, the code will be very similar to ordinary training + # =================================================== + + is_main = not args.multiprocessing_distributed or (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0) + + if is_main: + for n, p in qat_model.named_parameters(): + print(n, p.size()) + for n, p in qat_model.named_buffers(): + print(n, p.size()) + log_msg('epochs {}, lr {}, weight_decay {}'.format(args.epochs, args.lr, args.weight_decay), log_file) + # You will see it now has quantization-related parameters (zero-points and scales) + + if not torch.cuda.is_available(): + print('using CPU, this will be slow') + elif args.distributed: + if args.gpu is not None: + torch.cuda.set_device(args.gpu) + qat_model.cuda(args.gpu) + args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + qat_model = torch.nn.parallel.DistributedDataParallel(qat_model, device_ids=[args.gpu]) + else: + qat_model.cuda() + qat_model = torch.nn.parallel.DistributedDataParallel(qat_model) + elif args.gpu is not None: + torch.cuda.set_device(args.gpu) + qat_model = qat_model.cuda(args.gpu) + else: + # DataParallel will divide and allocate batch_size to all available GPUs + qat_model = torch.nn.DataParallel(qat_model).cuda() + + + criterion = nn.CrossEntropyLoss().cuda(args.gpu) + optimizer = sgd_optimizer(qat_model, args.lr, args.momentum, args.weight_decay) + + warmup_epochs = 1 + lr_scheduler = WarmupCosineAnnealingLR(optimizer=optimizer, T_cosine_max=args.epochs * IMAGENET_TRAINSET_SIZE // args.batch_size // ngpus_per_node, + eta_min=0, warmup=warmup_epochs * IMAGENET_TRAINSET_SIZE // args.batch_size // ngpus_per_node) + + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + if args.gpu is None: + checkpoint = torch.load(args.resume) + else: + # Map model to be loaded to specified single gpu. 
+ loc = 'cuda:{}'.format(args.gpu) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + if args.gpu is not None: + # best_acc1 may be from a checkpoint from a different GPU + best_acc1 = best_acc1.to(args.gpu) + qat_model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + lr_scheduler.load_state_dict(checkpoint['scheduler']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + cudnn.benchmark = True + + # todo + train_sampler, train_loader = get_default_ImageNet_train_sampler_loader(args) + val_loader = get_default_ImageNet_val_loader(args) + + if args.evaluate: + validate(val_loader, qat_model, criterion, args) + return + + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + train_sampler.set_epoch(epoch) + + # train for one epoch + train(train_loader, qat_model, criterion, optimizer, epoch, args, lr_scheduler, is_main=is_main) + + if args.fixobserver and epoch > (3 * args.epochs // 8): + # Freeze quantizer parameters + qat_model.apply(torch.quantization.disable_observer) #TODO testing. May not be useful + log_msg('fix observer after epoch {}'.format(epoch), log_file) + + if args.fixbn and epoch > (2 * args.epochs // 8): #TODO testing. May not be useful + # Freeze batch norm mean and variance estimates + qat_model.apply(torch.nn.intrinsic.qat.freeze_bn_stats) + log_msg('fix bn after epoch {}'.format(epoch), log_file) + + # evaluate on validation set + if is_main: + acc1 = validate(val_loader, qat_model, criterion, args) + msg = '{}, base{}, quant, epoch {}, QAT acc {}'.format(args.arch, args.base_weights, epoch, acc1) + log_msg(msg, log_file) + + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': qat_model.state_dict(), + 'best_acc1': best_acc1, + 'optimizer' : optimizer.state_dict(), + 'scheduler': lr_scheduler.state_dict(), + }, is_best, + filename = '{}_{}.pth.tar'.format(args.arch, args.tag), + best_filename='{}_{}_best.pth.tar'.format(args.arch, args.tag)) + + +def train(train_loader, model, criterion, optimizer, epoch, args, lr_scheduler, is_main): + batch_time = AverageMeter('Time', ':6.3f') + data_time = AverageMeter('Data', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5, ], + prefix="Epoch: [{}]".format(epoch)) + + # switch to train mode + model.train() + + end = time.time() + for i, (images, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + + if args.gpu is not None: + images = images.cuda(args.gpu, non_blocking=True) + if torch.cuda.is_available(): + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if lr_scheduler is 
not None: + lr_scheduler.step() + + if is_main and i % args.print_freq == 0: + progress.display(i) + if is_main and i % 1000 == 0 and lr_scheduler is not None: + print('cur lr: ', lr_scheduler.get_lr()[0]) + + + + +def validate(val_loader, model, criterion, args): + batch_time = AverageMeter('Time', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(val_loader), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(val_loader): + images = images.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i) + + # TODO: this should also be done with the ProgressMeter + print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' + .format(top1=top1, top5=top5)) + + return top1.avg + + +def save_checkpoint(state, is_best, filename, best_filename): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, best_filename) + + + + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/cv/classification/repvgg/pytorch/quantization/repvgg_quantized.py b/cv/classification/repvgg/pytorch/quantization/repvgg_quantized.py new file mode 100755 index 0000000000000000000000000000000000000000..9a06a89c745c7824d1e60adc17125da9636c0be7 --- /dev/null +++ b/cv/classification/repvgg/pytorch/quantization/repvgg_quantized.py @@ -0,0 +1,63 @@ +import torch +import torch.nn as nn +from torch.quantization import QuantStub, DeQuantStub + +class RepVGGWholeQuant(nn.Module): + + def __init__(self, repvgg_model, quantlayers): + super(RepVGGWholeQuant, self).__init__() + assert quantlayers in ['all', 'exclud_first_and_linear', 'exclud_first_and_last'] + self.quantlayers = quantlayers + self.quant = QuantStub() + self.stage0, self.stage1, self.stage2, self.stage3, self.stage4 = repvgg_model.stage0, repvgg_model.stage1, repvgg_model.stage2, repvgg_model.stage3, repvgg_model.stage4 + self.gap, self.linear = repvgg_model.gap, repvgg_model.linear + self.dequant = DeQuantStub() + + + def forward(self, x): + if self.quantlayers == 'all': + x = self.quant(x) + out = self.stage0(x) + else: + out = self.stage0(x) + out = self.quant(out) + out = self.stage1(out) + out = self.stage2(out) + out = self.stage3(out) + if self.quantlayers == 'all': + out = self.stage4(out) + out = self.gap(out).view(out.size(0), -1) + out = self.linear(out) + out = self.dequant(out) + elif self.quantlayers == 'exclud_first_and_linear': + out = self.stage4(out) + out = self.dequant(out) + out = self.gap(out).view(out.size(0), -1) + out = self.linear(out) + else: + out = self.dequant(out) + out = self.stage4(out) + out = self.gap(out).view(out.size(0), -1) + out = self.linear(out) + return out + + # From https://pytorch.org/tutorials/advanced/static_quantization_tutorial.html + def fuse_model(self): + for m in self.modules(): + if type(m) == nn.Sequential and hasattr(m, 'conv'): + # Note that we moved ReLU from "block.nonlinearity" 
into "rbr_reparam" (nn.Sequential). + # This makes it more convenient to fuse operators using off-the-shelf APIs. + torch.quantization.fuse_modules(m, ['conv', 'bn', 'relu'], inplace=True) + + def _get_qconfig(self): + return torch.quantization.get_default_qat_qconfig('fbgemm') + + def prepare_quant(self): + # From https://pytorch.org/tutorials/advanced/static_quantization_tutorial.html + self.fuse_model() + qconfig = self._get_qconfig() + self.qconfig = qconfig + torch.quantization.prepare_qat(self, inplace=True) + + def freeze_quant_bn(self): + self.apply(torch.nn.intrinsic.qat.freeze_bn_stats) \ No newline at end of file diff --git a/cv/classification/repvgg/pytorch/repvgg.py b/cv/classification/repvgg/pytorch/repvgg.py new file mode 100755 index 0000000000000000000000000000000000000000..92bd07f462173962326848cb6de01e19ee279c4e --- /dev/null +++ b/cv/classification/repvgg/pytorch/repvgg.py @@ -0,0 +1,303 @@ +# -------------------------------------------------------- +# RepVGG: Making VGG-style ConvNets Great Again (https://openaccess.thecvf.com/content/CVPR2021/papers/Ding_RepVGG_Making_VGG-Style_ConvNets_Great_Again_CVPR_2021_paper.pdf) +# Github source: https://github.com/DingXiaoH/RepVGG +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +import torch.nn as nn +import numpy as np +import torch +import copy +from se_block import SEBlock +import torch.utils.checkpoint as checkpoint + +def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1): + result = nn.Sequential() + result.add_module('conv', nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, bias=False)) + result.add_module('bn', nn.BatchNorm2d(num_features=out_channels)) + return result + +class RepVGGBlock(nn.Module): + + def __init__(self, in_channels, out_channels, kernel_size, + stride=1, padding=0, dilation=1, groups=1, padding_mode='zeros', deploy=False, use_se=False): + super(RepVGGBlock, self).__init__() + self.deploy = deploy + self.groups = groups + self.in_channels = in_channels + + assert kernel_size == 3 + assert padding == 1 + + padding_11 = padding - kernel_size // 2 + + self.nonlinearity = nn.ReLU() + + if use_se: + # Note that RepVGG-D2se uses SE before nonlinearity. But RepVGGplus models uses SE after nonlinearity. + self.se = SEBlock(out_channels, internal_neurons=out_channels // 16) + else: + self.se = nn.Identity() + + if deploy: + self.rbr_reparam = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, + padding=padding, dilation=dilation, groups=groups, bias=True, padding_mode=padding_mode) + + else: + self.rbr_identity = nn.BatchNorm2d(num_features=in_channels) if out_channels == in_channels and stride == 1 else None + self.rbr_dense = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=groups) + self.rbr_1x1 = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=stride, padding=padding_11, groups=groups) + print('RepVGG Block, identity = ', self.rbr_identity) + + + def forward(self, inputs): + if hasattr(self, 'rbr_reparam'): + return self.nonlinearity(self.se(self.rbr_reparam(inputs))) + + if self.rbr_identity is None: + id_out = 0 + else: + id_out = self.rbr_identity(inputs) + + return self.nonlinearity(self.se(self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out)) + + + # Optional. 
This may improve the accuracy and facilitates quantization in some cases. + # 1. Cancel the original weight decay on rbr_dense.conv.weight and rbr_1x1.conv.weight. + # 2. Use like this. + # loss = criterion(....) + # for every RepVGGBlock blk: + # loss += weight_decay_coefficient * 0.5 * blk.get_cust_L2() + # optimizer.zero_grad() + # loss.backward() + def get_custom_L2(self): + K3 = self.rbr_dense.conv.weight + K1 = self.rbr_1x1.conv.weight + t3 = (self.rbr_dense.bn.weight / ((self.rbr_dense.bn.running_var + self.rbr_dense.bn.eps).sqrt())).reshape(-1, 1, 1, 1).detach() + t1 = (self.rbr_1x1.bn.weight / ((self.rbr_1x1.bn.running_var + self.rbr_1x1.bn.eps).sqrt())).reshape(-1, 1, 1, 1).detach() + + l2_loss_circle = (K3 ** 2).sum() - (K3[:, :, 1:2, 1:2] ** 2).sum() # The L2 loss of the "circle" of weights in 3x3 kernel. Use regular L2 on them. + eq_kernel = K3[:, :, 1:2, 1:2] * t3 + K1 * t1 # The equivalent resultant central point of 3x3 kernel. + l2_loss_eq_kernel = (eq_kernel ** 2 / (t3 ** 2 + t1 ** 2)).sum() # Normalize for an L2 coefficient comparable to regular L2. + return l2_loss_eq_kernel + l2_loss_circle + + + +# This func derives the equivalent kernel and bias in a DIFFERENTIABLE way. +# You can get the equivalent kernel and bias at any time and do whatever you want, + # for example, apply some penalties or constraints during training, just like you do to the other models. +# May be useful for quantization or pruning. + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) + kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) + return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return torch.nn.functional.pad(kernel1x1, [1,1,1,1]) + + def _fuse_bn_tensor(self, branch): + if branch is None: + return 0, 0 + if isinstance(branch, nn.Sequential): + kernel = branch.conv.weight + running_mean = branch.bn.running_mean + running_var = branch.bn.running_var + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn.eps + else: + assert isinstance(branch, nn.BatchNorm2d) + if not hasattr(self, 'id_tensor'): + input_dim = self.in_channels // self.groups + kernel_value = np.zeros((self.in_channels, input_dim, 3, 3), dtype=np.float32) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device) + kernel = self.id_tensor + running_mean = branch.running_mean + running_var = branch.running_var + gamma = branch.weight + beta = branch.bias + eps = branch.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + def switch_to_deploy(self): + if hasattr(self, 'rbr_reparam'): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.rbr_reparam = nn.Conv2d(in_channels=self.rbr_dense.conv.in_channels, out_channels=self.rbr_dense.conv.out_channels, + kernel_size=self.rbr_dense.conv.kernel_size, stride=self.rbr_dense.conv.stride, + padding=self.rbr_dense.conv.padding, dilation=self.rbr_dense.conv.dilation, groups=self.rbr_dense.conv.groups, bias=True) + self.rbr_reparam.weight.data = kernel + self.rbr_reparam.bias.data = bias + self.__delattr__('rbr_dense') + self.__delattr__('rbr_1x1') + if hasattr(self, 'rbr_identity'): + self.__delattr__('rbr_identity') + if 
hasattr(self, 'id_tensor'): + self.__delattr__('id_tensor') + self.deploy = True + + + +class RepVGG(nn.Module): + + def __init__(self, num_blocks, num_classes=1000, width_multiplier=None, override_groups_map=None, deploy=False, use_se=False, use_checkpoint=False): + super(RepVGG, self).__init__() + assert len(width_multiplier) == 4 + self.deploy = deploy + self.override_groups_map = override_groups_map or dict() + assert 0 not in self.override_groups_map + self.use_se = use_se + self.use_checkpoint = use_checkpoint + + self.in_planes = min(64, int(64 * width_multiplier[0])) + self.stage0 = RepVGGBlock(in_channels=3, out_channels=self.in_planes, kernel_size=3, stride=2, padding=1, deploy=self.deploy, use_se=self.use_se) + self.cur_layer_idx = 1 + self.stage1 = self._make_stage(int(64 * width_multiplier[0]), num_blocks[0], stride=2) + self.stage2 = self._make_stage(int(128 * width_multiplier[1]), num_blocks[1], stride=2) + self.stage3 = self._make_stage(int(256 * width_multiplier[2]), num_blocks[2], stride=2) + self.stage4 = self._make_stage(int(512 * width_multiplier[3]), num_blocks[3], stride=2) + self.gap = nn.AdaptiveAvgPool2d(output_size=1) + self.linear = nn.Linear(int(512 * width_multiplier[3]), num_classes) + + def _make_stage(self, planes, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + blocks = [] + for stride in strides: + cur_groups = self.override_groups_map.get(self.cur_layer_idx, 1) + blocks.append(RepVGGBlock(in_channels=self.in_planes, out_channels=planes, kernel_size=3, + stride=stride, padding=1, groups=cur_groups, deploy=self.deploy, use_se=self.use_se)) + self.in_planes = planes + self.cur_layer_idx += 1 + return nn.ModuleList(blocks) + + def forward(self, x): + out = self.stage0(x) + for stage in (self.stage1, self.stage2, self.stage3, self.stage4): + for block in stage: + if self.use_checkpoint: + out = checkpoint.checkpoint(block, out) + else: + out = block(out) + out = self.gap(out) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +optional_groupwise_layers = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26] +g2_map = {l: 2 for l in optional_groupwise_layers} +g4_map = {l: 4 for l in optional_groupwise_layers} + +def create_RepVGG_A0(deploy=False, use_checkpoint=False): + return RepVGG(num_blocks=[2, 4, 14, 1], num_classes=1000, + width_multiplier=[0.75, 0.75, 0.75, 2.5], override_groups_map=None, deploy=deploy, use_checkpoint=use_checkpoint) + +def create_RepVGG_A1(deploy=False, use_checkpoint=False): + return RepVGG(num_blocks=[2, 4, 14, 1], num_classes=1000, + width_multiplier=[1, 1, 1, 2.5], override_groups_map=None, deploy=deploy, use_checkpoint=use_checkpoint) + +def create_RepVGG_A2(deploy=False, use_checkpoint=False): + return RepVGG(num_blocks=[2, 4, 14, 1], num_classes=1000, + width_multiplier=[1.5, 1.5, 1.5, 2.75], override_groups_map=None, deploy=deploy, use_checkpoint=use_checkpoint) + +def create_RepVGG_B0(deploy=False, use_checkpoint=False): + return RepVGG(num_blocks=[4, 6, 16, 1], num_classes=1000, + width_multiplier=[1, 1, 1, 2.5], override_groups_map=None, deploy=deploy, use_checkpoint=use_checkpoint) + +def create_RepVGG_B1(deploy=False, use_checkpoint=False): + return RepVGG(num_blocks=[4, 6, 16, 1], num_classes=1000, + width_multiplier=[2, 2, 2, 4], override_groups_map=None, deploy=deploy, use_checkpoint=use_checkpoint) + +def create_RepVGG_B1g2(deploy=False, use_checkpoint=False): + return RepVGG(num_blocks=[4, 6, 16, 1], num_classes=1000, + width_multiplier=[2, 2, 2, 4], override_groups_map=g2_map, 
deploy=deploy, use_checkpoint=use_checkpoint) + +def create_RepVGG_B1g4(deploy=False, use_checkpoint=False): + return RepVGG(num_blocks=[4, 6, 16, 1], num_classes=1000, + width_multiplier=[2, 2, 2, 4], override_groups_map=g4_map, deploy=deploy, use_checkpoint=use_checkpoint) + + +def create_RepVGG_B2(deploy=False, use_checkpoint=False): + return RepVGG(num_blocks=[4, 6, 16, 1], num_classes=1000, + width_multiplier=[2.5, 2.5, 2.5, 5], override_groups_map=None, deploy=deploy, use_checkpoint=use_checkpoint) + +def create_RepVGG_B2g2(deploy=False, use_checkpoint=False): + return RepVGG(num_blocks=[4, 6, 16, 1], num_classes=1000, + width_multiplier=[2.5, 2.5, 2.5, 5], override_groups_map=g2_map, deploy=deploy, use_checkpoint=use_checkpoint) + +def create_RepVGG_B2g4(deploy=False, use_checkpoint=False): + return RepVGG(num_blocks=[4, 6, 16, 1], num_classes=1000, + width_multiplier=[2.5, 2.5, 2.5, 5], override_groups_map=g4_map, deploy=deploy, use_checkpoint=use_checkpoint) + + +def create_RepVGG_B3(deploy=False, use_checkpoint=False): + return RepVGG(num_blocks=[4, 6, 16, 1], num_classes=1000, + width_multiplier=[3, 3, 3, 5], override_groups_map=None, deploy=deploy, use_checkpoint=use_checkpoint) + +def create_RepVGG_B3g2(deploy=False, use_checkpoint=False): + return RepVGG(num_blocks=[4, 6, 16, 1], num_classes=1000, + width_multiplier=[3, 3, 3, 5], override_groups_map=g2_map, deploy=deploy, use_checkpoint=use_checkpoint) + +def create_RepVGG_B3g4(deploy=False, use_checkpoint=False): + return RepVGG(num_blocks=[4, 6, 16, 1], num_classes=1000, + width_multiplier=[3, 3, 3, 5], override_groups_map=g4_map, deploy=deploy, use_checkpoint=use_checkpoint) + +def create_RepVGG_D2se(deploy=False, use_checkpoint=False): + return RepVGG(num_blocks=[8, 14, 24, 1], num_classes=1000, + width_multiplier=[2.5, 2.5, 2.5, 5], override_groups_map=None, deploy=deploy, use_se=True, use_checkpoint=use_checkpoint) + + +func_dict = { +'RepVGG-A0': create_RepVGG_A0, +'RepVGG-A1': create_RepVGG_A1, +'RepVGG-A2': create_RepVGG_A2, +'RepVGG-B0': create_RepVGG_B0, +'RepVGG-B1': create_RepVGG_B1, +'RepVGG-B1g2': create_RepVGG_B1g2, +'RepVGG-B1g4': create_RepVGG_B1g4, +'RepVGG-B2': create_RepVGG_B2, +'RepVGG-B2g2': create_RepVGG_B2g2, +'RepVGG-B2g4': create_RepVGG_B2g4, +'RepVGG-B3': create_RepVGG_B3, +'RepVGG-B3g2': create_RepVGG_B3g2, +'RepVGG-B3g4': create_RepVGG_B3g4, +'RepVGG-D2se': create_RepVGG_D2se, # Updated at April 25, 2021. This is not reported in the CVPR paper. 
+} +def get_RepVGG_func_by_name(name): + return func_dict[name] + + + +# Use this for converting a RepVGG model or a bigger model with RepVGG as its component +# Use like this +# model = create_RepVGG_A0(deploy=False) +# train model or load weights +# repvgg_model_convert(model, save_path='repvgg_deploy.pth') +# If you want to preserve the original model, call with do_copy=True + +# ====================== for using RepVGG as the backbone of a bigger model, e.g., PSPNet, the pseudo code will be like +# train_backbone = create_RepVGG_B2(deploy=False) +# train_backbone.load_state_dict(torch.load('RepVGG-B2-train.pth')) +# train_pspnet = build_pspnet(backbone=train_backbone) +# segmentation_train(train_pspnet) +# deploy_pspnet = repvgg_model_convert(train_pspnet) +# segmentation_test(deploy_pspnet) +# ===================== example_pspnet.py shows an example + +def repvgg_model_convert(model:torch.nn.Module, save_path=None, do_copy=True): + if do_copy: + model = copy.deepcopy(model) + for module in model.modules(): + if hasattr(module, 'switch_to_deploy'): + module.switch_to_deploy() + if save_path is not None: + torch.save(model.state_dict(), save_path) + return model diff --git a/cv/classification/repvgg/pytorch/repvggplus.py b/cv/classification/repvgg/pytorch/repvggplus.py new file mode 100755 index 0000000000000000000000000000000000000000..9f365871befd228d6e7ed5d4c0928a597785af4a --- /dev/null +++ b/cv/classification/repvgg/pytorch/repvggplus.py @@ -0,0 +1,293 @@ +# -------------------------------------------------------- +# RepVGG: Making VGG-style ConvNets Great Again (https://openaccess.thecvf.com/content/CVPR2021/papers/Ding_RepVGG_Making_VGG-Style_ConvNets_Great_Again_CVPR_2021_paper.pdf) +# Github source: https://github.com/DingXiaoH/RepVGG +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- + +import torch.nn as nn +import torch.utils.checkpoint as checkpoint +from se_block import SEBlock +import torch +import numpy as np + +def conv_bn_relu(in_channels, out_channels, kernel_size, stride, padding, groups=1): + result = nn.Sequential() + result.add_module('conv', nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, bias=False)) + result.add_module('bn', nn.BatchNorm2d(num_features=out_channels)) + result.add_module('relu', nn.ReLU()) + return result + +def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1): + result = nn.Sequential() + result.add_module('conv', nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, bias=False)) + result.add_module('bn', nn.BatchNorm2d(num_features=out_channels)) + return result + +class RepVGGplusBlock(nn.Module): + + def __init__(self, in_channels, out_channels, kernel_size, + stride=1, padding=0, dilation=1, groups=1, padding_mode='zeros', + deploy=False, + use_post_se=False): + super(RepVGGplusBlock, self).__init__() + self.deploy = deploy + self.groups = groups + self.in_channels = in_channels + + assert kernel_size == 3 + assert padding == 1 + + self.nonlinearity = nn.ReLU() + + if use_post_se: + self.post_se = SEBlock(out_channels, internal_neurons=out_channels // 4) + else: + self.post_se = nn.Identity() + + if deploy: + self.rbr_reparam = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, + padding=padding, dilation=dilation, groups=groups, 
bias=True, padding_mode=padding_mode) + else: + if out_channels == in_channels and stride == 1: + self.rbr_identity = nn.BatchNorm2d(num_features=out_channels) + else: + self.rbr_identity = None + self.rbr_dense = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=groups) + padding_11 = padding - kernel_size // 2 + self.rbr_1x1 = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=stride, padding=padding_11, groups=groups) + + def forward(self, x): + if self.deploy: + return self.post_se(self.nonlinearity(self.rbr_reparam(x))) + + if self.rbr_identity is None: + id_out = 0 + else: + id_out = self.rbr_identity(x) + out = self.rbr_dense(x) + self.rbr_1x1(x) + id_out + out = self.post_se(self.nonlinearity(out)) + return out + + + # This func derives the equivalent kernel and bias in a DIFFERENTIABLE way. + # You can get the equivalent kernel and bias at any time and do whatever you want, + # for example, apply some penalties or constraints during training, just like you do to the other models. + # May be useful for quantization or pruning. + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) + kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) + return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch): + if branch is None: + return 0, 0 + if isinstance(branch, nn.Sequential): + # For the 1x1 or 3x3 branch + kernel, running_mean, running_var, gamma, beta, eps = branch.conv.weight, branch.bn.running_mean, branch.bn.running_var, branch.bn.weight, branch.bn.bias, branch.bn.eps + else: + # For the identity branch + assert isinstance(branch, nn.BatchNorm2d) + if not hasattr(self, 'id_tensor'): + # Construct and store the identity kernel in case it is used multiple times + input_dim = self.in_channels // self.groups + kernel_value = np.zeros((self.in_channels, input_dim, 3, 3), dtype=np.float32) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device) + kernel, running_mean, running_var, gamma, beta, eps = self.id_tensor, branch.running_mean, branch.running_var, branch.weight, branch.bias, branch.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + def switch_to_deploy(self): + if hasattr(self, 'rbr_reparam'): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.rbr_reparam = nn.Conv2d(in_channels=self.rbr_dense.conv.in_channels, + out_channels=self.rbr_dense.conv.out_channels, + kernel_size=self.rbr_dense.conv.kernel_size, stride=self.rbr_dense.conv.stride, + padding=self.rbr_dense.conv.padding, dilation=self.rbr_dense.conv.dilation, + groups=self.rbr_dense.conv.groups, bias=True) + self.rbr_reparam.weight.data = kernel + self.rbr_reparam.bias.data = bias + self.__delattr__('rbr_dense') + self.__delattr__('rbr_1x1') + if hasattr(self, 'rbr_identity'): + self.__delattr__('rbr_identity') + if hasattr(self, 'id_tensor'): + self.__delattr__('id_tensor') + self.deploy = True + + + +class RepVGGplusStage(nn.Module): + + def __init__(self, in_planes, planes, num_blocks, 
stride, use_checkpoint, use_post_se=False, deploy=False): + super().__init__() + strides = [stride] + [1] * (num_blocks - 1) + blocks = [] + self.in_planes = in_planes + for stride in strides: + cur_groups = 1 + blocks.append(RepVGGplusBlock(in_channels=self.in_planes, out_channels=planes, kernel_size=3, + stride=stride, padding=1, groups=cur_groups, deploy=deploy, use_post_se=use_post_se)) + self.in_planes = planes + self.blocks = nn.ModuleList(blocks) + self.use_checkpoint = use_checkpoint + + def forward(self, x): + for block in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(block, x) + else: + x = block(x) + return x + + +class RepVGGplus(nn.Module): + """RepVGGplus + An official improved version of RepVGG (RepVGG: Making VGG-style ConvNets Great Again) `_. + + Args: + num_blocks (tuple[int]): Depths of each stage. + num_classes (tuple[int]): Num of classes. + width_multiplier (tuple[float]): The width of the four stages + will be (64 * width_multiplier[0], 128 * width_multiplier[1], 256 * width_multiplier[2], 512 * width_multiplier[3]). + deploy (bool, optional): If True, the model will have the inference-time structure. + Default: False. + use_post_se (bool, optional): If True, the model will have Squeeze-and-Excitation blocks following the conv-ReLU units. + Default: False. + use_checkpoint (bool, optional): If True, the model will use torch.utils.checkpoint to save the GPU memory during training with acceptable slowdown. + Do not use it if you have sufficient GPU memory. + Default: False. + """ + def __init__(self, + num_blocks, + num_classes, + width_multiplier, + deploy=False, + use_post_se=False, + use_checkpoint=False): + super().__init__() + + self.deploy = deploy + self.num_classes = num_classes + + in_channels = min(64, int(64 * width_multiplier[0])) + stage_channels = [int(64 * width_multiplier[0]), int(128 * width_multiplier[1]), int(256 * width_multiplier[2]), int(512 * width_multiplier[3])] + self.stage0 = RepVGGplusBlock(in_channels=3, out_channels=in_channels, kernel_size=3, stride=2, padding=1, deploy=self.deploy, use_post_se=use_post_se) + self.stage1 = RepVGGplusStage(in_channels, stage_channels[0], num_blocks[0], stride=2, use_checkpoint=use_checkpoint, use_post_se=use_post_se, deploy=deploy) + self.stage2 = RepVGGplusStage(stage_channels[0], stage_channels[1], num_blocks[1], stride=2, use_checkpoint=use_checkpoint, use_post_se=use_post_se, deploy=deploy) + # split stage3 so that we can insert an auxiliary classifier + self.stage3_first = RepVGGplusStage(stage_channels[1], stage_channels[2], num_blocks[2] // 2, stride=2, use_checkpoint=use_checkpoint, use_post_se=use_post_se, deploy=deploy) + self.stage3_second = RepVGGplusStage(stage_channels[2], stage_channels[2], num_blocks[2] - num_blocks[2] // 2, stride=1, use_checkpoint=use_checkpoint, use_post_se=use_post_se, deploy=deploy) + self.stage4 = RepVGGplusStage(stage_channels[2], stage_channels[3], num_blocks[3], stride=2, use_checkpoint=use_checkpoint, use_post_se=use_post_se, deploy=deploy) + self.gap = nn.AdaptiveAvgPool2d(output_size=1) + self.flatten = nn.Flatten() + self.linear = nn.Linear(int(512 * width_multiplier[3]), num_classes) + # aux classifiers + if not self.deploy: + self.stage1_aux = self._build_aux_for_stage(self.stage1) + self.stage2_aux = self._build_aux_for_stage(self.stage2) + self.stage3_first_aux = self._build_aux_for_stage(self.stage3_first) + + def _build_aux_for_stage(self, stage): + stage_out_channels = list(stage.blocks.children())[-1].rbr_dense.conv.out_channels + 
downsample = conv_bn_relu(in_channels=stage_out_channels, out_channels=stage_out_channels, kernel_size=3, stride=2, padding=1) + fc = nn.Linear(stage_out_channels, self.num_classes, bias=True) + return nn.Sequential(downsample, nn.AdaptiveAvgPool2d(1), nn.Flatten(), fc) + + def forward(self, x): + out = self.stage0(x) + out = self.stage1(out) + stage1_aux = self.stage1_aux(out) + out = self.stage2(out) + stage2_aux = self.stage2_aux(out) + out = self.stage3_first(out) + stage3_first_aux = self.stage3_first_aux(out) + out = self.stage3_second(out) + out = self.stage4(out) + y = self.gap(out) + y = self.flatten(y) + y = self.linear(y) + return { + 'main': y, + 'stage1_aux': stage1_aux, + 'stage2_aux': stage2_aux, + 'stage3_first_aux': stage3_first_aux, + } + + def switch_repvggplus_to_deploy(self): + for m in self.modules(): + if hasattr(m, 'switch_to_deploy'): + m.switch_to_deploy() + if hasattr(self, 'stage1_aux'): + self.__delattr__('stage1_aux') + if hasattr(self, 'stage2_aux'): + self.__delattr__('stage2_aux') + if hasattr(self, 'stage3_first_aux'): + self.__delattr__('stage3_first_aux') + self.deploy = True + + +# torch.utils.checkpoint can reduce the memory consumption during training with a minor slowdown. Don't use it if you have sufficient GPU memory. +# Not sure whether it slows down inference +# pse for "post SE", which means using SE block after ReLU +def create_RepVGGplus_L2pse(deploy=False, use_checkpoint=False): + return RepVGGplus(num_blocks=[8, 14, 24, 1], num_classes=1000, + width_multiplier=[2.5, 2.5, 2.5, 5], deploy=deploy, use_post_se=True, + use_checkpoint=use_checkpoint) + +# Will release more +repvggplus_func_dict = { + 'RepVGGplus-L2pse': create_RepVGGplus_L2pse, +} + +def create_RepVGGplus_by_name(name, deploy=False, use_checkpoint=False): + if 'plus' in name: + return repvggplus_func_dict[name](deploy=deploy, use_checkpoint=use_checkpoint) + else: + print('=================== Building the vanila RepVGG ===================') + from repvgg import get_RepVGG_func_by_name + return get_RepVGG_func_by_name(name)(deploy=deploy, use_checkpoint=use_checkpoint) + + + + + + +# Use this for converting a RepVGG model or a bigger model with RepVGG as its component +# Use like this +# model = create_RepVGG_A0(deploy=False) +# train model or load weights +# repvgg_model_convert(model, save_path='repvgg_deploy.pth') +# If you want to preserve the original model, call with do_copy=True + +# ====================== for using RepVGG as the backbone of a bigger model, e.g., PSPNet, the pseudo code will be like +# train_backbone = create_RepVGG_B2(deploy=False) +# train_backbone.load_state_dict(torch.load('RepVGG-B2-train.pth')) +# train_pspnet = build_pspnet(backbone=train_backbone) +# segmentation_train(train_pspnet) +# deploy_pspnet = repvgg_model_convert(train_pspnet) +# segmentation_test(deploy_pspnet) +# ===================== example_pspnet.py shows an example + +def repvgg_model_convert(model:torch.nn.Module, save_path=None, do_copy=True): + import copy + if do_copy: + model = copy.deepcopy(model) + for module in model.modules(): + if hasattr(module, 'switch_to_deploy'): + module.switch_to_deploy() + if save_path is not None: + torch.save(model.state_dict(), save_path) + return model diff --git a/cv/classification/repvgg/pytorch/repvggplus_custom_L2.py b/cv/classification/repvgg/pytorch/repvggplus_custom_L2.py new file mode 100755 index 0000000000000000000000000000000000000000..dd8a15bec7c309bb83fdb702977c2382d2a6f01f --- /dev/null +++ 
b/cv/classification/repvgg/pytorch/repvggplus_custom_L2.py @@ -0,0 +1,268 @@ +# -------------------------------------------------------- +# RepVGG: Making VGG-style ConvNets Great Again (https://openaccess.thecvf.com/content/CVPR2021/papers/Ding_RepVGG_Making_VGG-Style_ConvNets_Great_Again_CVPR_2021_paper.pdf) +# Github source: https://github.com/DingXiaoH/RepVGG +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +import torch.nn as nn +import torch.utils.checkpoint as checkpoint +from se_block import SEBlock +import torch +import numpy as np + + +def conv_bn_relu(in_channels, out_channels, kernel_size, stride, padding, groups=1): + result = nn.Sequential() + result.add_module('conv', nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, bias=False)) + result.add_module('bn', nn.BatchNorm2d(num_features=out_channels)) + result.add_module('relu', nn.ReLU()) + return result + +def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1): + result = nn.Sequential() + result.add_module('conv', nn.Conv2d(in_channels=in_channels, out_channels=out_channels, + kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, bias=False)) + result.add_module('bn', nn.BatchNorm2d(num_features=out_channels)) + return result + +class RepVGGplusBlock(nn.Module): + + def __init__(self, in_channels, out_channels, kernel_size, + stride=1, padding=0, dilation=1, groups=1, padding_mode='zeros', + deploy=False, + use_post_se=False): + super(RepVGGplusBlock, self).__init__() + self.deploy = deploy + self.groups = groups + self.in_channels = in_channels + + assert kernel_size == 3 + assert padding == 1 + + self.nonlinearity = nn.ReLU() + + if use_post_se: + self.post_se = SEBlock(out_channels, internal_neurons=out_channels // 4) + else: + self.post_se = nn.Identity() + + if deploy: + self.rbr_reparam = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, + padding=padding, dilation=dilation, groups=groups, bias=True, padding_mode=padding_mode) + else: + if out_channels == in_channels and stride == 1: + self.rbr_identity = nn.BatchNorm2d(num_features=out_channels) + else: + self.rbr_identity = None + self.rbr_dense = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=groups) + padding_11 = padding - kernel_size // 2 + self.rbr_1x1 = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=stride, padding=padding_11, groups=groups) + + def forward(self, x, L2): + + if self.deploy: + return self.post_se(self.nonlinearity(self.rbr_reparam(x))), None + + if self.rbr_identity is None: + id_out = 0 + else: + id_out = self.rbr_identity(x) + out = self.rbr_dense(x) + self.rbr_1x1(x) + id_out + out = self.post_se(self.nonlinearity(out)) + + # Custom L2 + t3 = (self.rbr_dense.bn.weight / ((self.rbr_dense.bn.running_var + self.rbr_dense.bn.eps).sqrt())).reshape(-1, 1, 1, 1).detach() + t1 = (self.rbr_1x1.bn.weight / ((self.rbr_1x1.bn.running_var + self.rbr_1x1.bn.eps).sqrt())).reshape(-1, 1, 1, 1).detach() + K3 = self.rbr_dense.conv.weight + K1 = self.rbr_1x1.conv.weight + + l2_loss_circle = (K3 ** 2).sum() - (K3[:, :, 1:2, 1:2] ** 2).sum() + eq_kernel = K3[:,:,1:2,1:2] * t3 + K1 * t1 + l2_loss_eq_kernel = (eq_kernel ** 2 / (t3 ** 2 + t1 ** 2)).sum() + + return out, L2 + l2_loss_circle + l2_loss_eq_kernel + 
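+    # The per-block term returned above accumulates into the model-level 'L2' output of
+    # RepVGGplus.forward() below. A training loop would typically add it to the task loss in
+    # place of regular weight decay on these kernels, roughly (coefficient name illustrative):
+    #   out = model(images)
+    #   loss = criterion(out['main'], target) + weight_decay_coefficient * 0.5 * out['L2']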
+ + # This func derives the equivalent kernel and bias in a DIFFERENTIABLE way. + # You can get the equivalent kernel and bias at any time and do whatever you want, + # for example, apply some penalties or constraints during training, just like you do to the other models. + # May be useful for quantization or pruning. + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) + kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) + return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch): + if branch is None: + return 0, 0 + if isinstance(branch, nn.Sequential): + # For the 1x1 or 3x3 branch + kernel, running_mean, running_var, gamma, beta, eps = branch.conv.weight, branch.bn.running_mean, branch.bn.running_var, branch.bn.weight, branch.bn.bias, branch.bn.eps + else: + # For the identity branch + assert isinstance(branch, nn.BatchNorm2d) + if not hasattr(self, 'id_tensor'): + # Construct and store the identity kernel in case it is used multiple times + input_dim = self.in_channels // self.groups + kernel_value = np.zeros((self.in_channels, input_dim, 3, 3), dtype=np.float32) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device) + kernel, running_mean, running_var, gamma, beta, eps = self.id_tensor, branch.running_mean, branch.running_var, branch.weight, branch.bias, branch.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + def switch_to_deploy(self): + if hasattr(self, 'rbr_reparam'): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.rbr_reparam = nn.Conv2d(in_channels=self.rbr_dense.conv.in_channels, + out_channels=self.rbr_dense.conv.out_channels, + kernel_size=self.rbr_dense.conv.kernel_size, stride=self.rbr_dense.conv.stride, + padding=self.rbr_dense.conv.padding, dilation=self.rbr_dense.conv.dilation, + groups=self.rbr_dense.conv.groups, bias=True) + self.rbr_reparam.weight.data = kernel + self.rbr_reparam.bias.data = bias + self.__delattr__('rbr_dense') + self.__delattr__('rbr_1x1') + if hasattr(self, 'rbr_identity'): + self.__delattr__('rbr_identity') + if hasattr(self, 'id_tensor'): + self.__delattr__('id_tensor') + self.deploy = True + + + +class RepVGGplusStage(nn.Module): + + def __init__(self, in_planes, planes, num_blocks, stride, use_checkpoint, use_post_se=False, deploy=False): + super().__init__() + strides = [stride] + [1] * (num_blocks - 1) + blocks = [] + self.in_planes = in_planes + for stride in strides: + cur_groups = 1 + blocks.append(RepVGGplusBlock(in_channels=self.in_planes, out_channels=planes, kernel_size=3, + stride=stride, padding=1, groups=cur_groups, deploy=deploy, use_post_se=use_post_se)) + self.in_planes = planes + self.blocks = nn.ModuleList(blocks) + self.use_checkpoint = use_checkpoint + + def forward(self, x, L2): + for block in self.blocks: + if self.use_checkpoint: + x, L2 = checkpoint.checkpoint(block, x, L2) + else: + x, L2 = block(x, L2) + return x, L2 + + +class RepVGGplus(nn.Module): + + def __init__(self, num_blocks, num_classes, + width_multiplier, override_groups_map=None, + deploy=False, + use_post_se=False, + 
use_checkpoint=False): + super().__init__() + + self.deploy = deploy + self.override_groups_map = override_groups_map or dict() + self.use_post_se = use_post_se + self.use_checkpoint = use_checkpoint + self.num_classes = num_classes + self.nonlinear = 'relu' + + self.in_planes = min(64, int(64 * width_multiplier[0])) + self.stage0 = RepVGGplusBlock(in_channels=3, out_channels=self.in_planes, kernel_size=3, stride=2, padding=1, deploy=self.deploy, use_post_se=use_post_se) + self.cur_layer_idx = 1 + self.stage1 = RepVGGplusStage(self.in_planes, int(64 * width_multiplier[0]), num_blocks[0], stride=2, use_checkpoint=use_checkpoint, use_post_se=use_post_se, deploy=deploy) + self.stage2 = RepVGGplusStage(int(64 * width_multiplier[0]), int(128 * width_multiplier[1]), num_blocks[1], stride=2, use_checkpoint=use_checkpoint, use_post_se=use_post_se, deploy=deploy) + # split stage3 so that we can insert an auxiliary classifier + self.stage3_first = RepVGGplusStage(int(128 * width_multiplier[1]), int(256 * width_multiplier[2]), num_blocks[2] // 2, stride=2, use_checkpoint=use_checkpoint, use_post_se=use_post_se, deploy=deploy) + self.stage3_second = RepVGGplusStage(int(256 * width_multiplier[2]), int(256 * width_multiplier[2]), num_blocks[2] // 2, stride=1, use_checkpoint=use_checkpoint, use_post_se=use_post_se, deploy=deploy) + self.stage4 = RepVGGplusStage(int(256 * width_multiplier[2]), int(512 * width_multiplier[3]), num_blocks[3], stride=2, use_checkpoint=use_checkpoint, use_post_se=use_post_se, deploy=deploy) + self.gap = nn.AdaptiveAvgPool2d(output_size=1) + self.linear = nn.Linear(int(512 * width_multiplier[3]), num_classes) + # aux classifiers + if not self.deploy: + self.stage1_aux = self._build_aux_for_stage(self.stage1) + self.stage2_aux = self._build_aux_for_stage(self.stage2) + self.stage3_first_aux = self._build_aux_for_stage(self.stage3_first) + + def _build_aux_for_stage(self, stage): + stage_out_channels = list(stage.blocks.children())[-1].rbr_dense.conv.out_channels + downsample = conv_bn_relu(in_channels=stage_out_channels, out_channels=stage_out_channels, kernel_size=3, stride=2, padding=1) + fc = nn.Linear(stage_out_channels, self.num_classes, bias=True) + return nn.Sequential(downsample, nn.AdaptiveAvgPool2d(1), nn.Flatten(), fc) + + def forward(self, x): + if self.deploy: + out, _ = self.stage0(x, L2=None) + out, _ = self.stage1(out, L2=None) + out, _ = self.stage2(out, L2=None) + out, _ = self.stage3_first(out, L2=None) + out, _ = self.stage3_second(out, L2=None) + out, _ = self.stage4(out, L2=None) + y = self.gap(out) + y = y.view(y.size(0), -1) + y = self.linear(y) + return y + + else: + out, L2 = self.stage0(x, L2=0.0) + out, L2 = self.stage1(out, L2=L2) + stage1_aux = self.stage1_aux(out) + out, L2 = self.stage2(out, L2=L2) + stage2_aux = self.stage2_aux(out) + out, L2 = self.stage3_first(out, L2=L2) + stage3_first_aux = self.stage3_first_aux(out) + out, L2 = self.stage3_second(out, L2=L2) + out, L2 = self.stage4(out, L2=L2) + y = self.gap(out) + y = y.view(y.size(0), -1) + y = self.linear(y) + return { + 'main': y, + 'stage1_aux': stage1_aux, + 'stage2_aux': stage2_aux, + 'stage3_first_aux': stage3_first_aux, + 'L2': L2 + } + + def switch_repvggplus_to_deploy(self): + for m in self.modules(): + if hasattr(m, 'switch_to_deploy'): + m.switch_to_deploy() + if hasattr(m, 'use_checkpoint'): + m.use_checkpoint = False # Disable checkpoint. I am not sure whether using checkpoint slows down inference. 
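+        # The auxiliary classifier heads are removed below because the deploy-time forward()
+        # above returns only the main output and no longer uses them.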
+ if hasattr(self, 'stage1_aux'): + self.__delattr__('stage1_aux') + if hasattr(self, 'stage2_aux'): + self.__delattr__('stage2_aux') + if hasattr(self, 'stage3_first_aux'): + self.__delattr__('stage3_first_aux') + self.deploy = True + + +# torch.utils.checkpoint can reduce the memory consumption during training with a minor slowdown. Don't use it if you have sufficient GPU memory. +# Not sure whether it slows down inference +# pse for "post SE", which means using SE block after ReLU +def create_RepVGGplus_L2pse(deploy=False, use_checkpoint=False): + return RepVGGplus(num_blocks=[8, 14, 24, 1], num_classes=1000, + width_multiplier=[2.5, 2.5, 2.5, 5], override_groups_map=None, deploy=deploy, use_post_se=True, + use_checkpoint=use_checkpoint) + +repvggplus_func_dict = { +'RepVGGplus-L2pse': create_RepVGGplus_L2pse, +} +def get_RepVGGplus_func_by_name(name): + return repvggplus_func_dict[name] \ No newline at end of file diff --git a/cv/classification/repvgg/pytorch/se_block.py b/cv/classification/repvgg/pytorch/se_block.py new file mode 100755 index 0000000000000000000000000000000000000000..e23911e0cae8826711cff19ea16028030ce0e73a --- /dev/null +++ b/cv/classification/repvgg/pytorch/se_block.py @@ -0,0 +1,22 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +# https://openaccess.thecvf.com/content_cvpr_2018/html/Hu_Squeeze-and-Excitation_Networks_CVPR_2018_paper.html + +class SEBlock(nn.Module): + + def __init__(self, input_channels, internal_neurons): + super(SEBlock, self).__init__() + self.down = nn.Conv2d(in_channels=input_channels, out_channels=internal_neurons, kernel_size=1, stride=1, bias=True) + self.up = nn.Conv2d(in_channels=internal_neurons, out_channels=input_channels, kernel_size=1, stride=1, bias=True) + self.input_channels = input_channels + + def forward(self, inputs): + x = F.avg_pool2d(inputs, kernel_size=inputs.size(3)) + x = self.down(x) + x = F.relu(x) + x = self.up(x) + x = torch.sigmoid(x) + x = x.view(-1, self.input_channels, 1, 1) + return inputs * x \ No newline at end of file diff --git a/cv/classification/repvgg/pytorch/tools/convert.py b/cv/classification/repvgg/pytorch/tools/convert.py new file mode 100755 index 0000000000000000000000000000000000000000..b239ad0fd806ecabfa42124e753d16dfd45c0fc9 --- /dev/null +++ b/cv/classification/repvgg/pytorch/tools/convert.py @@ -0,0 +1,46 @@ +# -------------------------------------------------------- +# RepVGG: Making VGG-style ConvNets Great Again (https://openaccess.thecvf.com/content/CVPR2021/papers/Ding_RepVGG_Making_VGG-Style_ConvNets_Great_Again_CVPR_2021_paper.pdf) +# Github source: https://github.com/DingXiaoH/RepVGG +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +import argparse +import os +import torch +import torch.nn.parallel +import torch.optim +import torch.utils.data +import torch.utils.data.distributed +from repvggplus import create_RepVGGplus_by_name, repvgg_model_convert + +parser = argparse.ArgumentParser(description='RepVGG(plus) Conversion') +parser.add_argument('load', metavar='LOAD', help='path to the weights file') +parser.add_argument('save', metavar='SAVE', help='path to the weights file') +parser.add_argument('-a', '--arch', metavar='ARCH', default='RepVGG-A0') + +def convert(): + args = parser.parse_args() + + train_model = create_RepVGGplus_by_name(args.arch, deploy=False) + + if os.path.isfile(args.load): + print("=> loading checkpoint '{}'".format(args.load)) + checkpoint = torch.load(args.load) + 
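+        # Checkpoints may store the weights directly or wrap them under 'state_dict' or 'model';
+        # keys saved from (Distributed)DataParallel carry a 'module.' prefix, stripped below.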
if 'state_dict' in checkpoint: + checkpoint = checkpoint['state_dict'] + elif 'model' in checkpoint: + checkpoint = checkpoint['model'] + ckpt = {k.replace('module.', ''): v for k, v in checkpoint.items()} # strip the names + print(ckpt.keys()) + train_model.load_state_dict(ckpt) + else: + print("=> no checkpoint found at '{}'".format(args.load)) + + if 'plus' in args.arch: + train_model.switch_repvggplus_to_deploy() + torch.save(train_model.state_dict(), args.save) + else: + repvgg_model_convert(train_model, save_path=args.save) + + +if __name__ == '__main__': + convert() \ No newline at end of file diff --git a/cv/classification/repvgg/pytorch/tools/insert_bn.py b/cv/classification/repvgg/pytorch/tools/insert_bn.py new file mode 100755 index 0000000000000000000000000000000000000000..5a66f6b4b2e576d86aca48cb1a88076e7dd48c47 --- /dev/null +++ b/cv/classification/repvgg/pytorch/tools/insert_bn.py @@ -0,0 +1,217 @@ +# -------------------------------------------------------- +# RepVGG: Making VGG-style ConvNets Great Again (https://openaccess.thecvf.com/content/CVPR2021/papers/Ding_RepVGG_Making_VGG-Style_ConvNets_Great_Again_CVPR_2021_paper.pdf) +# Github source: https://github.com/DingXiaoH/RepVGG +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +import argparse +import os +import time +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.optim +import torch.utils.data +import torch.utils.data.distributed +from utils import accuracy, ProgressMeter, AverageMeter +from repvgg import get_RepVGG_func_by_name, RepVGGBlock +from utils import load_checkpoint, get_ImageNet_train_dataset, get_default_train_trans + +# Insert BN into an inference-time RepVGG (e.g., for quantization-aware training). +# Get the mean and std on every conv3x3 (before the bias-adding) on the train set. Then use such data to initialize BN layers and insert them after conv3x3. +# May, 07, 2021 + +parser = argparse.ArgumentParser(description='Get the mean and std on every conv3x3 (before the bias-adding) on the train set. 
Then use such data to initialize BN layers and insert them after conv3x3.') +parser.add_argument('data', metavar='DIR', help='path to dataset') +parser.add_argument('weights', metavar='WEIGHTS', help='path to the weights file') +parser.add_argument('save', metavar='SAVE', help='path to save the model with BN') +parser.add_argument('-a', '--arch', metavar='ARCH', default='RepVGG-A0') +parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', + help='number of data loading workers (default: 4)') +parser.add_argument('-b', '--batch-size', default=100, type=int, + metavar='N', + help='mini-batch size (default: 100) for test') +parser.add_argument('-n', '--num-batches', default=500, type=int, + metavar='N', + help='number of batches (default: 500) to record the mean and std on the train set') +parser.add_argument('-r', '--resolution', default=224, type=int, + metavar='R', + help='resolution (default: 224) for test') + + +def update_running_mean_var(x, running_mean, running_var, momentum=0.9, is_first_batch=False): + mean = x.mean(dim=(0, 2, 3), keepdim=True) + var = ((x - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True) + if is_first_batch: + running_mean = mean + running_var = var + else: + running_mean = momentum * running_mean + (1.0 - momentum) * mean + running_var = momentum * running_var + (1.0 - momentum) * var + return running_mean, running_var + +# Record the mean and std like a BN layer but do no normalization +class BNStatistics(nn.Module): + def __init__(self, num_features): + super(BNStatistics, self).__init__() + shape = (1, num_features, 1, 1) + self.register_buffer('running_mean', torch.zeros(shape)) + self.register_buffer('running_var', torch.zeros(shape)) + self.is_first_batch = True + + def forward(self, x): + if self.running_mean.device != x.device: + self.running_mean = self.running_mean.to(x.device) + self.running_var = self.running_var.to(x.device) + self.running_mean, self.running_var = update_running_mean_var(x, self.running_mean, self.running_var, momentum=0.9, is_first_batch=self.is_first_batch) + self.is_first_batch = False + return x + +# This is designed to insert BNStat layer between Conv2d(without bias) and its bias +class BiasAdd(nn.Module): + def __init__(self, num_features): + super(BiasAdd, self).__init__() + self.bias = torch.nn.Parameter(torch.Tensor(num_features)) + def forward(self, x): + return x + self.bias.view(1, -1, 1, 1) + +def switch_repvggblock_to_bnstat(model): + for n, block in model.named_modules(): + if isinstance(block, RepVGGBlock): + print('switch to BN Statistics: ', n) + assert hasattr(block, 'rbr_reparam') + stat = nn.Sequential() + stat.add_module('conv', nn.Conv2d(block.rbr_reparam.in_channels, block.rbr_reparam.out_channels, + block.rbr_reparam.kernel_size, + block.rbr_reparam.stride, block.rbr_reparam.padding, + block.rbr_reparam.dilation, + block.rbr_reparam.groups, bias=False)) # Note bias=False + stat.add_module('bnstat', BNStatistics(block.rbr_reparam.out_channels)) + stat.add_module('biasadd', BiasAdd(block.rbr_reparam.out_channels)) # Bias is here + stat.conv.weight.data = block.rbr_reparam.weight.data + stat.biasadd.bias.data = block.rbr_reparam.bias.data + block.__delattr__('rbr_reparam') + block.rbr_reparam = stat + +def switch_bnstat_to_convbn(model): + for n, block in model.named_modules(): + if isinstance(block, RepVGGBlock): + assert hasattr(block, 'rbr_reparam') + assert hasattr(block.rbr_reparam, 'bnstat') + print('switch to ConvBN: ', n) + conv = nn.Conv2d(block.rbr_reparam.conv.in_channels, 
block.rbr_reparam.conv.out_channels, + block.rbr_reparam.conv.kernel_size, + block.rbr_reparam.conv.stride, block.rbr_reparam.conv.padding, + block.rbr_reparam.conv.dilation, + block.rbr_reparam.conv.groups, bias=False) + bn = nn.BatchNorm2d(block.rbr_reparam.conv.out_channels) + bn.running_mean = block.rbr_reparam.bnstat.running_mean.squeeze() # Initialize the mean and var of BN with the statistics + bn.running_var = block.rbr_reparam.bnstat.running_var.squeeze() + std = (bn.running_var + bn.eps).sqrt() + conv.weight.data = block.rbr_reparam.conv.weight.data + bn.weight.data = std + bn.bias.data = block.rbr_reparam.biasadd.bias.data + bn.running_mean # Initialize gamma = std and beta = bias + mean + + convbn = nn.Sequential() + convbn.add_module('conv', conv) + convbn.add_module('bn', bn) + block.__delattr__('rbr_reparam') + block.rbr_reparam = convbn + + +# Insert a BN after conv3x3 (rbr_reparam). With no reasonable initialization of BN, the model may break down. +# So you have to load the weights obtained through the BN statistics (please see the function "insert_bn" in this file). +def directly_insert_bn_without_init(model): + for n, block in model.named_modules(): + if isinstance(block, RepVGGBlock): + print('directly insert a BN with no initialization: ', n) + assert hasattr(block, 'rbr_reparam') + convbn = nn.Sequential() + convbn.add_module('conv', nn.Conv2d(block.rbr_reparam.in_channels, block.rbr_reparam.out_channels, + block.rbr_reparam.kernel_size, + block.rbr_reparam.stride, block.rbr_reparam.padding, + block.rbr_reparam.dilation, + block.rbr_reparam.groups, bias=False)) # Note bias=False + convbn.add_module('bn', nn.BatchNorm2d(block.rbr_reparam.out_channels)) + # ==================== + convbn.add_module('relu', nn.ReLU()) + # TODO we moved ReLU from "block.nonlinearity" into "rbr_reparam" (nn.Sequential). This makes it more convenient to fuse operators (see RepVGGWholeQuant.fuse_model) using off-the-shelf APIs. 
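+ # Added note: with ReLU now the last op inside rbr_reparam, the block-level activation is replaced with nn.Identity() below so ReLU is not applied twice.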
+ block.nonlinearity = nn.Identity() + #========================== + block.__delattr__('rbr_reparam') + block.rbr_reparam = convbn + + +def insert_bn(): + args = parser.parse_args() + + repvgg_build_func = get_RepVGG_func_by_name(args.arch) + + model = repvgg_build_func(deploy=True).cuda() + + load_checkpoint(model, args.weights) + + switch_repvggblock_to_bnstat(model) + + cudnn.benchmark = True + + trans = get_default_train_trans(args) + print('data aug: ', trans) + + train_dataset = get_ImageNet_train_dataset(args, trans) + + train_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True) + + batch_time = AverageMeter('Time', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + + progress = ProgressMeter( + min(len(train_loader), args.num_batches), + [batch_time, losses, top1, top5], + prefix='BN stat: ') + + criterion = nn.CrossEntropyLoss().cuda() + + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(train_loader): + if i >= args.num_batches: + break + images = images.cuda(non_blocking=True) + target = target.cuda(non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % 10 == 0: + progress.display(i) + + + print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' + .format(top1=top1, top5=top5)) + + switch_bnstat_to_convbn(model) + + torch.save(model.state_dict(), args.save) + + + + +if __name__ == '__main__': + insert_bn() \ No newline at end of file diff --git a/cv/classification/repvgg/pytorch/tools/verify.py b/cv/classification/repvgg/pytorch/tools/verify.py new file mode 100755 index 0000000000000000000000000000000000000000..d9f77fda2d17c1fe943b57dac80470f3b2c98192 --- /dev/null +++ b/cv/classification/repvgg/pytorch/tools/verify.py @@ -0,0 +1,30 @@ +# -------------------------------------------------------- +# RepVGG: Making VGG-style ConvNets Great Again (https://openaccess.thecvf.com/content/CVPR2021/papers/Ding_RepVGG_Making_VGG-Style_ConvNets_Great_Again_CVPR_2021_paper.pdf) +# Github source: https://github.com/DingXiaoH/RepVGG +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +import torch +import torch.nn as nn +from repvgg import create_RepVGG_B1 + +if __name__ == '__main__': + x = torch.randn(1, 3, 224, 224) + model = create_RepVGG_B1(deploy=False) + model.eval() + + for module in model.modules(): + if isinstance(module, torch.nn.BatchNorm2d): + nn.init.uniform_(module.running_mean, 0, 0.1) + nn.init.uniform_(module.running_var, 0, 0.1) + nn.init.uniform_(module.weight, 0, 0.1) + nn.init.uniform_(module.bias, 0, 0.1) + + train_y = model(x) + for module in model.modules(): + if hasattr(module, 'switch_to_deploy'): + module.switch_to_deploy() + + print(model) + deploy_y = model(x) + print('========================== The diff is') + print(((train_y - deploy_y) ** 2).sum()) diff --git a/cv/classification/repvgg/pytorch/train/config.py b/cv/classification/repvgg/pytorch/train/config.py new file mode 100755 index 
0000000000000000000000000000000000000000..e5fd7772801d486a8b60831cf4e2e051f30f062b --- /dev/null +++ b/cv/classification/repvgg/pytorch/train/config.py @@ -0,0 +1,217 @@ +# -------------------------------------------------------- +# RepVGG: Making VGG-style ConvNets Great Again (https://openaccess.thecvf.com/content/CVPR2021/papers/Ding_RepVGG_Making_VGG-Style_ConvNets_Great_Again_CVPR_2021_paper.pdf) +# Github source: https://github.com/DingXiaoH/RepVGG +# Licensed under The MIT License [see LICENSE for details] +# The training script is based on the code of Swin Transformer (https://github.com/microsoft/Swin-Transformer) +# -------------------------------------------------------- + +import os +import yaml +from yacs.config import CfgNode as CN + +_C = CN() + +# Base config files +_C.BASE = [''] + +# ----------------------------------------------------------------------------- +# Data settings +# ----------------------------------------------------------------------------- +_C.DATA = CN() +# Batch size for a single GPU, could be overwritten by command line argument +_C.DATA.BATCH_SIZE = 128 +# Path to dataset, could be overwritten by command line argument +_C.DATA.DATA_PATH = '/your/path/to/dataset' + +# Dataset name +_C.DATA.DATASET = 'imagenet' +# Input image size +_C.DATA.IMG_SIZE = 224 +_C.DATA.TEST_SIZE = None +_C.DATA.TEST_BATCH_SIZE = None +# Interpolation to resize image (random, bilinear, bicubic) +_C.DATA.INTERPOLATION = 'bilinear' +# Use zipped dataset instead of folder dataset +# could be overwritten by command line argument +_C.DATA.ZIP_MODE = False +# Cache Data in Memory, could be overwritten by command line argument +_C.DATA.CACHE_MODE = 'part' +# Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU. +_C.DATA.PIN_MEMORY = True +# Number of data loading threads +_C.DATA.NUM_WORKERS = 8 + +# ----------------------------------------------------------------------------- +# Model settings +# ----------------------------------------------------------------------------- +_C.MODEL = CN() +# Model type +_C.MODEL.ARCH = 'RepVGG-L2pse' +# Checkpoint to resume, could be overwritten by command line argument +_C.MODEL.RESUME = '' +# Number of classes, overwritten in data preparation +_C.MODEL.NUM_CLASSES = 1000 +# Label Smoothing +_C.MODEL.LABEL_SMOOTHING = 0.1 + +# ----------------------------------------------------------------------------- +# Training settings +# ----------------------------------------------------------------------------- +_C.TRAIN = CN() +_C.TRAIN.START_EPOCH = 0 +_C.TRAIN.EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 20 +_C.TRAIN.WEIGHT_DECAY = 0.05 +_C.TRAIN.BASE_LR = 5e-4 +_C.TRAIN.WARMUP_LR = 0.0 +_C.TRAIN.MIN_LR = 0.0 +# Clip gradient norm +_C.TRAIN.CLIP_GRAD = 0.0 +# Auto resume from latest checkpoint +_C.TRAIN.AUTO_RESUME = True +# Gradient accumulation steps +# could be overwritten by command line argument +_C.TRAIN.ACCUMULATION_STEPS = 0 +# Whether to use gradient checkpointing to save memory +# could be overwritten by command line argument +_C.TRAIN.USE_CHECKPOINT = False + +# LR scheduler +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'cosine' +# Epoch interval to decay LR, used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 +# LR decay rate, used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 + +# Optimizer +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'sgd' +# Optimizer Epsilon +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +# Optimizer Betas +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) +# SGD momentum 
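+# Added note: consumed by build_optimizer in train/optimizer.py, which constructs optim.SGD with nesterov=True.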
+_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# For EMA model +_C.TRAIN.EMA_ALPHA = 0.0 +_C.TRAIN.EMA_UPDATE_PERIOD = 8 + +# For RepOptimizer only +_C.TRAIN.SCALES_PATH = None + +# ----------------------------------------------------------------------------- +# Augmentation settings +# ----------------------------------------------------------------------------- +_C.AUG = CN() +# Mixup alpha, mixup enabled if > 0 +_C.AUG.MIXUP = 0.2 +# Cutmix alpha, cutmix enabled if > 0 +_C.AUG.CUTMIX = 0.0 +# Cutmix min/max ratio, overrides alpha and enables cutmix if set +_C.AUG.CUTMIX_MINMAX = None +# Probability of performing mixup or cutmix when either/both is enabled +_C.AUG.MIXUP_PROB = 1.0 +# Probability of switching to cutmix when both mixup and cutmix enabled +_C.AUG.MIXUP_SWITCH_PROB = 0.5 +# How to apply mixup/cutmix params. Per "batch", "pair", or "elem" +_C.AUG.MIXUP_MODE = 'batch' + +_C.AUG.PRESET = None # If use AUG.PRESET (e.g., 'raug15'), use the pre-defined preprocessing, ignoring the following settings. +# Color jitter factor +_C.AUG.COLOR_JITTER = 0.4 +# Use AutoAugment policy. "v0" or "original" +_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' +# Random erase prob +_C.AUG.REPROB = 0.25 +# Random erase mode +_C.AUG.REMODE = 'pixel' +# Random erase count +_C.AUG.RECOUNT = 1 + + +# ----------------------------------------------------------------------------- +# Testing settings +# ----------------------------------------------------------------------------- +_C.TEST = CN() +# Whether to use center crop when testing +_C.TEST.CROP = False + +# ----------------------------------------------------------------------------- +# Misc +# ----------------------------------------------------------------------------- +# Mixed precision opt level, if O0, no amp is used ('O0', 'O1', 'O2') +# overwritten by command line argument +_C.AMP_OPT_LEVEL = '' +# Path to output folder, overwritten by command line argument +_C.OUTPUT = '' +# Tag of experiment, overwritten by command line argument +_C.TAG = 'default' +# Frequency to save checkpoint +_C.SAVE_FREQ = 20 +# Frequency to logging info +_C.PRINT_FREQ = 10 +# Fixed random seed +_C.SEED = 0 +# Perform evaluation only, overwritten by command line argument +_C.EVAL_MODE = False +# Test throughput only, overwritten by command line argument +_C.THROUGHPUT_MODE = False +# local rank for DistributedDataParallel, given by command line argument +_C.LOCAL_RANK = 0 + + +def update_config(config, args): + config.defrost() + if args.opts: + config.merge_from_list(args.opts) + # merge from specific arguments + if args.scales_path: + config.TRAIN.SCALES_PATH = args.scales_path + if args.arch: + config.MODEL.ARCH = args.arch + if args.batch_size: + config.DATA.BATCH_SIZE = args.batch_size + if args.data_path: + config.DATA.DATA_PATH = args.data_path + if args.zip: + config.DATA.ZIP_MODE = True + if args.cache_mode: + config.DATA.CACHE_MODE = args.cache_mode + if args.resume: + config.MODEL.RESUME = args.resume + if args.accumulation_steps: + config.TRAIN.ACCUMULATION_STEPS = args.accumulation_steps + if args.use_checkpoint: + config.TRAIN.USE_CHECKPOINT = True + if args.amp_opt_level: + config.AMP_OPT_LEVEL = args.amp_opt_level + if args.output: + config.OUTPUT = args.output + if args.tag: + config.TAG = args.tag + if args.eval: + config.EVAL_MODE = True + if args.throughput: + config.THROUGHPUT_MODE = True + + if config.DATA.TEST_SIZE is None: + config.DATA.TEST_SIZE = config.DATA.IMG_SIZE + if config.DATA.TEST_BATCH_SIZE is None: + config.DATA.TEST_BATCH_SIZE = 
config.DATA.BATCH_SIZE + # set local rank for distributed training + config.LOCAL_RANK = args.local_rank + # output folder + config.OUTPUT = os.path.join(config.OUTPUT, config.MODEL.ARCH, config.TAG) + config.freeze() + + +def get_config(args): + """Get a yacs CfgNode object with default values.""" + # Return a clone so that the defaults will not be altered + # This is for the "local variable" use pattern + config = _C.clone() + update_config(config, args) + + return config diff --git a/cv/classification/repvgg/pytorch/train/cutout.py b/cv/classification/repvgg/pytorch/train/cutout.py new file mode 100755 index 0000000000000000000000000000000000000000..8592ffc08441857a7b0fd8882d23179cc064fae1 --- /dev/null +++ b/cv/classification/repvgg/pytorch/train/cutout.py @@ -0,0 +1,55 @@ +import numpy as np + +class Cutout: + + def __init__(self, size=16) -> None: + self.size = size + + def _create_cutout_mask(self, img_height, img_width, num_channels, size): + """Creates a zero mask used for cutout of shape `img_height` x `img_width`. + Args: + img_height: Height of image cutout mask will be applied to. + img_width: Width of image cutout mask will be applied to. + num_channels: Number of channels in the image. + size: Size of the zeros mask. + Returns: + A mask of shape `img_height` x `img_width` with all ones except for a + square of zeros of shape `size` x `size`. This mask is meant to be + elementwise multiplied with the original image. Additionally returns + the `upper_coord` and `lower_coord` which specify where the cutout mask + will be applied. + """ + # assert img_height == img_width + + # Sample center where cutout mask will be applied + height_loc = np.random.randint(low=0, high=img_height) + width_loc = np.random.randint(low=0, high=img_width) + + size = int(size) + # Determine upper right and lower left corners of patch + upper_coord = (max(0, height_loc - size // 2), max(0, width_loc - size // 2)) + lower_coord = ( + min(img_height, height_loc + size // 2), + min(img_width, width_loc + size // 2), + ) + mask_height = lower_coord[0] - upper_coord[0] + mask_width = lower_coord[1] - upper_coord[1] + assert mask_height > 0 + assert mask_width > 0 + + mask = np.ones((img_height, img_width, num_channels)) + zeros = np.zeros((mask_height, mask_width, num_channels)) + mask[upper_coord[0]: lower_coord[0], upper_coord[1]: lower_coord[1], :] = zeros + return mask, upper_coord, lower_coord + + def __call__(self, pil_img): + pil_img = pil_img.copy() + img_height, img_width, num_channels = (*pil_img.size, 3) + _, upper_coord, lower_coord = self._create_cutout_mask( + img_height, img_width, num_channels, self.size + ) + pixels = pil_img.load() # create the pixel map + for i in range(upper_coord[0], lower_coord[0]): # for every col: + for j in range(upper_coord[1], lower_coord[1]): # For every row + pixels[i, j] = (125, 122, 113, 0) # set the colour accordingly + return pil_img \ No newline at end of file diff --git a/cv/classification/repvgg/pytorch/train/logger.py b/cv/classification/repvgg/pytorch/train/logger.py new file mode 100755 index 0000000000000000000000000000000000000000..a0ae05e487856c285cad5a7cf87cb5e63c30d8f6 --- /dev/null +++ b/cv/classification/repvgg/pytorch/train/logger.py @@ -0,0 +1,41 @@ +# -------------------------------------------------------- +# RepVGG: Making VGG-style ConvNets Great Again (https://openaccess.thecvf.com/content/CVPR2021/papers/Ding_RepVGG_Making_VGG-Style_ConvNets_Great_Again_CVPR_2021_paper.pdf) +# Github source: https://github.com/DingXiaoH/RepVGG +# 
Licensed under The MIT License [see LICENSE for details] +# The training script is based on the code of Swin Transformer (https://github.com/microsoft/Swin-Transformer) +# -------------------------------------------------------- + +import os +import sys +import logging +import functools +from termcolor import colored + + +@functools.lru_cache() +def create_logger(output_dir, dist_rank=0, name=''): + # create logger + logger = logging.getLogger(name) + logger.setLevel(logging.DEBUG) + logger.propagate = False + + # create formatter + fmt = '[%(asctime)s %(name)s] (%(filename)s %(lineno)d): %(levelname)s %(message)s' + color_fmt = colored('[%(asctime)s %(name)s]', 'green') + \ + colored('(%(filename)s %(lineno)d)', 'yellow') + ': %(levelname)s %(message)s' + + # create console handlers for master process + if dist_rank == 0: + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(logging.DEBUG) + console_handler.setFormatter( + logging.Formatter(fmt=color_fmt, datefmt='%Y-%m-%d %H:%M:%S')) + logger.addHandler(console_handler) + + # create file handlers + file_handler = logging.FileHandler(os.path.join(output_dir, f'log_rank{dist_rank}.txt'), mode='a') + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(logging.Formatter(fmt=fmt, datefmt='%Y-%m-%d %H:%M:%S')) + logger.addHandler(file_handler) + + return logger diff --git a/cv/classification/repvgg/pytorch/train/lr_scheduler.py b/cv/classification/repvgg/pytorch/train/lr_scheduler.py new file mode 100755 index 0000000000000000000000000000000000000000..029b184c16a03eb2e64017b9518b1e460cd20e28 --- /dev/null +++ b/cv/classification/repvgg/pytorch/train/lr_scheduler.py @@ -0,0 +1,101 @@ +# -------------------------------------------------------- +# RepVGG: Making VGG-style ConvNets Great Again (https://openaccess.thecvf.com/content/CVPR2021/papers/Ding_RepVGG_Making_VGG-Style_ConvNets_Great_Again_CVPR_2021_paper.pdf) +# Github source: https://github.com/DingXiaoH/RepVGG +# Licensed under The MIT License [see LICENSE for details] +# The training script is based on the code of Swin Transformer (https://github.com/microsoft/Swin-Transformer) +# -------------------------------------------------------- + +import torch +from timm.scheduler.cosine_lr import CosineLRScheduler +from timm.scheduler.step_lr import StepLRScheduler +from timm.scheduler.scheduler import Scheduler + + +def build_scheduler(config, optimizer, n_iter_per_epoch): + num_steps = int(config.TRAIN.EPOCHS * n_iter_per_epoch) + warmup_steps = int(config.TRAIN.WARMUP_EPOCHS * n_iter_per_epoch) + decay_steps = int(config.TRAIN.LR_SCHEDULER.DECAY_EPOCHS * n_iter_per_epoch) + + lr_scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == 'cosine': + lr_scheduler = CosineLRScheduler( + optimizer, + t_initial=num_steps, + lr_min=config.TRAIN.MIN_LR, + warmup_lr_init=config.TRAIN.WARMUP_LR, + warmup_t=warmup_steps, + cycle_limit=1, + t_in_epochs=False, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == 'linear': + lr_scheduler = LinearLRScheduler( + optimizer, + t_initial=num_steps, + lr_min_rate=0.01, + warmup_lr_init=config.TRAIN.WARMUP_LR, + warmup_t=warmup_steps, + t_in_epochs=False, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == 'step': + lr_scheduler = StepLRScheduler( + optimizer, + decay_t=decay_steps, + decay_rate=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + warmup_lr_init=config.TRAIN.WARMUP_LR, + warmup_t=warmup_steps, + t_in_epochs=False, + ) + + return lr_scheduler + + +class LinearLRScheduler(Scheduler): + def __init__(self, + optimizer: 
torch.optim.Optimizer, + t_initial: int, + lr_min_rate: float, + warmup_t=0, + warmup_lr_init=0., + t_in_epochs=True, + noise_range_t=None, + noise_pct=0.67, + noise_std=1.0, + noise_seed=42, + initialize=True, + ) -> None: + super().__init__( + optimizer, param_group_field="lr", + noise_range_t=noise_range_t, noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed, + initialize=initialize) + + self.t_initial = t_initial + self.lr_min_rate = lr_min_rate + self.warmup_t = warmup_t + self.warmup_lr_init = warmup_lr_init + self.t_in_epochs = t_in_epochs + if self.warmup_t: + self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values] + super().update_groups(self.warmup_lr_init) + else: + self.warmup_steps = [1 for _ in self.base_values] + + def _get_lr(self, t): + if t < self.warmup_t: + lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps] + else: + t = t - self.warmup_t + total_t = self.t_initial - self.warmup_t + lrs = [v - ((v - v * self.lr_min_rate) * (t / total_t)) for v in self.base_values] + return lrs + + def get_epoch_values(self, epoch: int): + if self.t_in_epochs: + return self._get_lr(epoch) + else: + return None + + def get_update_values(self, num_updates: int): + if not self.t_in_epochs: + return self._get_lr(num_updates) + else: + return None diff --git a/cv/classification/repvgg/pytorch/train/optimizer.py b/cv/classification/repvgg/pytorch/train/optimizer.py new file mode 100755 index 0000000000000000000000000000000000000000..68abf6d0eb1decfbb61457e86ac0cc955fc30f39 --- /dev/null +++ b/cv/classification/repvgg/pytorch/train/optimizer.py @@ -0,0 +1,71 @@ +# -------------------------------------------------------- +# RepVGG: Making VGG-style ConvNets Great Again (https://openaccess.thecvf.com/content/CVPR2021/papers/Ding_RepVGG_Making_VGG-Style_ConvNets_Great_Again_CVPR_2021_paper.pdf) +# Github source: https://github.com/DingXiaoH/RepVGG +# Licensed under The MIT License [see LICENSE for details] +# The training script is based on the code of Swin Transformer (https://github.com/microsoft/Swin-Transformer) +# -------------------------------------------------------- + +from torch import optim as optim + + +def build_optimizer(config, model): + """ + Build optimizer, set weight decay of normalization to 0 by default. 
+ """ + skip = {} + skip_keywords = {} + if hasattr(model, 'no_weight_decay'): + skip = model.no_weight_decay() + if hasattr(model, 'no_weight_decay_keywords'): + skip_keywords = model.no_weight_decay_keywords() + echo = (config.LOCAL_RANK==0) + parameters = set_weight_decay(model, skip, skip_keywords, echo=echo) + opt_lower = config.TRAIN.OPTIMIZER.NAME.lower() + optimizer = None + if opt_lower == 'sgd': + optimizer = optim.SGD(parameters, momentum=config.TRAIN.OPTIMIZER.MOMENTUM, nesterov=True, + lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY) + if echo: + print('================================== SGD nest, momentum = {}, wd = {}'.format(config.TRAIN.OPTIMIZER.MOMENTUM, config.TRAIN.WEIGHT_DECAY)) + elif opt_lower == 'adam': + print('adam') + optimizer = optim.Adam(parameters, eps=config.TRAIN.OPTIMIZER.EPS, betas=config.TRAIN.OPTIMIZER.BETAS, + lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY) + elif opt_lower == 'adamw': + optimizer = optim.AdamW(parameters, eps=config.TRAIN.OPTIMIZER.EPS, betas=config.TRAIN.OPTIMIZER.BETAS, + lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY) + + return optimizer + + +def set_weight_decay(model, skip_list=(), skip_keywords=(), echo=False): + has_decay = [] + no_decay = [] + + for name, param in model.named_parameters(): + if not param.requires_grad: + continue # frozen weights + if 'identity.weight' in name: + has_decay.append(param) + if echo: + print(f"{name} USE weight decay") + elif len(param.shape) == 1 or name.endswith(".bias") or (name in skip_list) or \ + check_keywords_in_name(name, skip_keywords): + no_decay.append(param) + if echo: + print(f"{name} has no weight decay") + else: + has_decay.append(param) + if echo: + print(f"{name} USE weight decay") + + return [{'params': has_decay}, + {'params': no_decay, 'weight_decay': 0.}] + + +def check_keywords_in_name(name, keywords=()): + isin = False + for keyword in keywords: + if keyword in name: + isin = True + return isin diff --git a/cv/classification/repvgg/pytorch/train/randaug.py b/cv/classification/repvgg/pytorch/train/randaug.py new file mode 100755 index 0000000000000000000000000000000000000000..6934fb8059bd1f04129d41964dbae0f8d39f0beb --- /dev/null +++ b/cv/classification/repvgg/pytorch/train/randaug.py @@ -0,0 +1,407 @@ +import math +import random + +import numpy as np +import PIL +from PIL import Image, ImageEnhance, ImageOps + +from train.cutout import Cutout + + +_PIL_VER = tuple([int(x) for x in PIL.__version__.split('.')[:2]]) + +_FILL = (128, 128, 128) + +# This signifies the max integer that the controller RNN could predict for the +# augmentation scheme. +_MAX_LEVEL = 10. 
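+# Added note: the _level_to_arg helpers below scale their argument by level / _MAX_LEVEL, e.g. Rotate at magnitude 9 maps to (9 / 10.) * 30 = 27 degrees before the random sign flip.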
+ +_HPARAMS_DEFAULT = dict( + translate_const=250, + img_mean=_FILL, +) + +_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC) + + +def _interpolation(kwargs): + interpolation = kwargs.pop('resample', Image.BILINEAR) + if isinstance(interpolation, (list, tuple)): + return random.choice(interpolation) + else: + return interpolation + + +def _check_args_tf(kwargs): + if 'fillcolor' in kwargs and _PIL_VER < (5, 0): + kwargs.pop('fillcolor') + kwargs['resample'] = _interpolation(kwargs) + + +def cutout(img, factor, **kwargs): + _check_args_tf(kwargs) + return Cutout(size=factor)(img) + + +def shear_x(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), **kwargs) + + +def shear_y(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), **kwargs) + + +def translate_x_rel(img, pct, **kwargs): + pixels = pct * img.size[0] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), **kwargs) + + +def translate_y_rel(img, pct, **kwargs): + pixels = pct * img.size[1] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), **kwargs) + + +def translate_x_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), **kwargs) + + +def translate_y_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), **kwargs) + + +def rotate(img, degrees, **kwargs): + _check_args_tf(kwargs) + if _PIL_VER >= (5, 2): + return img.rotate(degrees, **kwargs) + elif _PIL_VER >= (5, 0): + w, h = img.size + post_trans = (0, 0) + rotn_center = (w / 2.0, h / 2.0) + angle = -math.radians(degrees) + matrix = [ + round(math.cos(angle), 15), + round(math.sin(angle), 15), + 0.0, + round(-math.sin(angle), 15), + round(math.cos(angle), 15), + 0.0, + ] + + def transform(x, y, matrix): + (a, b, c, d, e, f) = matrix + return a * x + b * y + c, d * x + e * y + f + + matrix[2], matrix[5] = transform( + -rotn_center[0] - post_trans[0], + - rotn_center[1] - post_trans[1], matrix + ) + matrix[2] += rotn_center[0] + matrix[5] += rotn_center[1] + return img.transform(img.size, Image.AFFINE, matrix, **kwargs) + else: + return img.rotate(degrees, resample=kwargs['resample']) + + +def auto_contrast(img, **__): + return ImageOps.autocontrast(img) + + +def invert(img, **__): + return ImageOps.invert(img) + + +def identity(img, **__): + return img + + +def equalize(img, **__): + return ImageOps.equalize(img) + + +def solarize(img, thresh, **__): + return ImageOps.solarize(img, thresh) + + +def solarize_add(img, add, thresh=128, **__): + lut = [] + for i in range(256): + if i < thresh: + lut.append(min(255, i + add)) + else: + lut.append(i) + if img.mode in ("L", "RGB"): + if img.mode == "RGB" and len(lut) == 256: + lut = lut + lut + lut + return img.point(lut) + else: + return img + + +def posterize(img, bits_to_keep, **__): + if bits_to_keep >= 8: + return img + return ImageOps.posterize(img, bits_to_keep) + + +def contrast(img, factor, **__): + return ImageEnhance.Contrast(img).enhance(factor) + + +def color(img, factor, **__): + return ImageEnhance.Color(img).enhance(factor) + + +def brightness(img, factor, **__): + return ImageEnhance.Brightness(img).enhance(factor) + + +def sharpness(img, factor, **__): + return ImageEnhance.Sharpness(img).enhance(factor) + + +def _randomly_negate(v): + """With 50% prob, 
negate the value""" + return -v if random.random() > 0.5 else v + + +def _cutout_level_to_arg(level, _hparams): + # range [0, 40] + level = max(2, (level / _MAX_LEVEL) * 40.) + return level, + + +def _rotate_level_to_arg(level, _hparams): + # range [-30, 30] + level = (level / _MAX_LEVEL) * 30. + level = _randomly_negate(level) + return level, + + +def _enhance_level_to_arg(level, _hparams): + # range [0.1, 1.9] + return (level / _MAX_LEVEL) * 1.8 + 0.1, + + +def _shear_level_to_arg(level, _hparams): + # range [-0.3, 0.3] + level = (level / _MAX_LEVEL) * 0.3 + level = _randomly_negate(level) + return level, + + +def _translate_abs_level_to_arg(level, hparams): + translate_const = hparams['translate_const'] + level = (level / _MAX_LEVEL) * float(translate_const) + level = _randomly_negate(level) + return level, + + +def _translate_rel_level_to_arg(level, _hparams): + # range [-0.45, 0.45] + level = (level / _MAX_LEVEL) * 0.45 + level = _randomly_negate(level) + return level, + + +def _posterize_original_level_to_arg(level, _hparams): + # As per original AutoAugment paper description + # range [4, 8], 'keep 4 up to 8 MSB of image' + return int((level / _MAX_LEVEL) * 4) + 4, + + +def _posterize_research_level_to_arg(level, _hparams): + # As per Tensorflow models research and UDA impl + # range [4, 0], 'keep 4 down to 0 MSB of original image' + return 4 - int((level / _MAX_LEVEL) * 4), + + +def _posterize_tpu_level_to_arg(level, _hparams): + # As per Tensorflow TPU EfficientNet impl + # range [0, 4], 'keep 0 up to 4 MSB of original image' + return int((level / _MAX_LEVEL) * 4), + + +def _solarize_level_to_arg(level, _hparams): + # range [0, 256] + return int((level / _MAX_LEVEL) * 256), + + +def _solarize_add_level_to_arg(level, _hparams): + # range [0, 110] + return int((level / _MAX_LEVEL) * 110), + + +LEVEL_TO_ARG = { + 'AutoContrast': None, + 'Equalize': None, + 'Invert': None, + 'Identity': None, + 'Rotate': _rotate_level_to_arg, + 'PosterizeOriginal': _posterize_original_level_to_arg, + 'PosterizeResearch': _posterize_research_level_to_arg, + 'PosterizeTpu': _posterize_tpu_level_to_arg, + 'Solarize': _solarize_level_to_arg, + 'SolarizeAdd': _solarize_add_level_to_arg, + 'Color': _enhance_level_to_arg, + 'Contrast': _enhance_level_to_arg, + 'Brightness': _enhance_level_to_arg, + 'Sharpness': _enhance_level_to_arg, + 'ShearX': _shear_level_to_arg, + 'ShearY': _shear_level_to_arg, + 'TranslateX': _translate_abs_level_to_arg, + 'TranslateY': _translate_abs_level_to_arg, + 'TranslateXRel': _translate_rel_level_to_arg, + 'TranslateYRel': _translate_rel_level_to_arg, + 'Cutout': _cutout_level_to_arg, +} + + +NAME_TO_OP = { + 'AutoContrast': auto_contrast, + 'Equalize': equalize, + 'Invert': invert, + 'Identity': identity, + 'Rotate': rotate, + 'PosterizeOriginal': posterize, + 'PosterizeResearch': posterize, + 'PosterizeTpu': posterize, + 'Solarize': solarize, + 'SolarizeAdd': solarize_add, + 'Color': color, + 'Contrast': contrast, + 'Brightness': brightness, + 'Sharpness': sharpness, + 'ShearX': shear_x, + 'ShearY': shear_y, + 'TranslateX': translate_x_abs, + 'TranslateY': translate_y_abs, + 'TranslateXRel': translate_x_rel, + 'TranslateYRel': translate_y_rel, + 'Cutout': cutout, +} + + +class AutoAugmentTransform(object): + """ + AutoAugment from Google. 
+ Implementation adapted from: + https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/autoaugment.py + """ + + def __init__(self, name, prob=0.5, magnitude=10, hparams=None): + """ + Args: + name (str): any type of transforms list in _RAND_TRANSFORMS. + prob (float): probability of perform current augmentation. + magnitude (int): intensity / magnitude of each augmentation. + hparams (dict): hyper-parameters required by each augmentation. + """ + hparams = hparams or _HPARAMS_DEFAULT + self.aug_fn = NAME_TO_OP[name] + self.level_fn = LEVEL_TO_ARG[name] + self.prob = prob + self.magnitude = magnitude + self.hparams = hparams.copy() + self.kwargs = dict( + fillcolor=hparams['img_mean'] if 'img_mean' in hparams else _FILL, + resample=hparams['interpolation'] if 'interpolation' in hparams + else _RANDOM_INTERPOLATION, + ) + + # If magnitude_std is > 0, we introduce some randomness + # in the usually fixed policy and sample magnitude from a normal distribution + # with mean `magnitude` and std-dev of `magnitude_std`. + # NOTE This is my own hack, being tested, not in papers or reference impls. + self.magnitude_std = self.hparams.get('magnitude_std', 0) + + def __call__(self, img: PIL.Image) -> PIL.Image: + if random.random() > self.prob: + return img + magnitude = self.magnitude + if self.magnitude_std and self.magnitude_std > 0: + magnitude = random.gauss(magnitude, self.magnitude_std) + # NOTE: magnitude fixed and no boundary + # magnitude = min(_MAX_LEVEL, max(0, magnitude)) # clip to valid range + level_args = self.level_fn( + magnitude, self.hparams) if self.level_fn is not None else tuple() + return self.aug_fn(img, *level_args, **self.kwargs) + # return np.array(self.aug_fn(Image.fromarray(img), *level_args, **self.kwargs)) + + # def apply_coords(self, coords: np.ndarray) -> np.ndarray: + # return coords + + +_RAND_TRANSFORMS = [ + 'AutoContrast', + 'Equalize', + 'Invert', + 'Rotate', + 'PosterizeTpu', + 'Solarize', + 'SolarizeAdd', + 'Color', + 'Contrast', + 'Brightness', + 'Sharpness', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', + 'Cutout' # FIXME I implement this as random erasing separately +] + +_RAND_TRANSFORMS_CMC = [ + 'AutoContrast', + 'Identity', + 'Rotate', + 'Sharpness', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', + # 'Cutout' # FIXME I implement this as random erasing separately +] + + +# These experimental weights are based loosely on the relative improvements mentioned in paper. +# They may not result in increased performance, but could likely be tuned to so. 
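+# Added note: RandAugPolicy defined below samples ops uniformly via np.random.choice(_RAND_TRANSFORMS), so these weights are not applied there.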
+_RAND_CHOICE_WEIGHTS_0 = { + 'Rotate': 0.3, + 'ShearX': 0.2, + 'ShearY': 0.2, + 'TranslateXRel': 0.1, + 'TranslateYRel': 0.1, + 'Color': .025, + 'Sharpness': 0.025, + 'AutoContrast': 0.025, + 'Solarize': .005, + 'SolarizeAdd': .005, + 'Contrast': .005, + 'Brightness': .005, + 'Equalize': .005, + 'PosterizeTpu': 0, + 'Invert': 0, +} + + +class RandAugPolicy(object): + def __init__(self, layers=2, magnitude=10): + self.layers = layers + self.magnitude = magnitude + + def __call__(self, img): + for _ in range(self.layers): + trans = np.random.choice(_RAND_TRANSFORMS) + # NOTE: prob apply, fixed magnitude + # trans_op = AutoAugmentTransform(trans, prob=np.random.uniform(0.2, 0.8), magnitude=self.magnitude) + # NOTE: always apply, random magnitude + trans_op = AutoAugmentTransform(trans, prob=1.0, magnitude=np.random.choice(self.magnitude)) + img = trans_op(img) + assert img is not None, trans + return img diff --git a/cv/classification/repvgg/pytorch/utils.py b/cv/classification/repvgg/pytorch/utils.py new file mode 100755 index 0000000000000000000000000000000000000000..78ceb52305e8e4619d183f3f6219046a4c2f028a --- /dev/null +++ b/cv/classification/repvgg/pytorch/utils.py @@ -0,0 +1,249 @@ +# -------------------------------------------------------- +# RepVGG: Making VGG-style ConvNets Great Again (https://openaccess.thecvf.com/content/CVPR2021/papers/Ding_RepVGG_Making_VGG-Style_ConvNets_Great_Again_CVPR_2021_paper.pdf) +# Github source: https://github.com/DingXiaoH/RepVGG +# Licensed under The MIT License [see LICENSE for details] +# The training script is based on the code of Swin Transformer (https://github.com/microsoft/Swin-Transformer) +# -------------------------------------------------------- + +import torch +import math +import os + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self, name, fmt=':f'): + self.name = name + self.fmt = fmt + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print('\t'.join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = '{:' + str(num_digits) + 'd}' + return '[' + fmt + '/' + fmt.format(num_batches) + ']' + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + +def load_checkpoint(model, ckpt_path): + checkpoint = torch.load(ckpt_path) + if 'model' in checkpoint: + checkpoint = checkpoint['model'] + if 'state_dict' in checkpoint: + checkpoint = checkpoint['state_dict'] + ckpt = {} + for k, v in 
checkpoint.items(): + if k.startswith('module.'): + ckpt[k[7:]] = v + else: + ckpt[k] = v + model.load_state_dict(ckpt) + + +class WarmupCosineAnnealingLR(torch.optim.lr_scheduler._LRScheduler): + + def __init__(self, optimizer, T_cosine_max, eta_min=0, last_epoch=-1, warmup=0): + self.eta_min = eta_min + self.T_cosine_max = T_cosine_max + self.warmup = warmup + super(WarmupCosineAnnealingLR, self).__init__(optimizer, last_epoch) + + def get_lr(self): + if self.last_epoch < self.warmup: + return [self.last_epoch / self.warmup * base_lr for base_lr in self.base_lrs] + else: + return [self.eta_min + (base_lr - self.eta_min) * + (1 + math.cos(math.pi * (self.last_epoch - self.warmup) / (self.T_cosine_max - self.warmup))) / 2 + for base_lr in self.base_lrs] + + +def log_msg(message, log_file): + print(message) + with open(log_file, 'a') as f: + print(message, file=f) + + + + + +try: + # noinspection PyUnresolvedReferences + from apex import amp +except ImportError: + amp = None + +def unwrap_model(model): + """Remove the DistributedDataParallel wrapper if present.""" + wrapped = isinstance(model, torch.nn.parallel.distributed.DistributedDataParallel) + return model.module if wrapped else model + + +def load_checkpoint(config, model, optimizer, lr_scheduler, logger, model_ema=None): + logger.info(f"==============> Resuming form {config.MODEL.RESUME}....................") + if config.MODEL.RESUME.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + config.MODEL.RESUME, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(config.MODEL.RESUME, map_location='cpu') + msg = model.load_state_dict(checkpoint['model'], strict=False) + logger.info(msg) + max_accuracy = 0.0 + if not config.EVAL_MODE and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: + optimizer.load_state_dict(checkpoint['optimizer']) + lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + config.defrost() + config.TRAIN.START_EPOCH = checkpoint['epoch'] + 1 + config.freeze() + if 'amp' in checkpoint and config.AMP_OPT_LEVEL != "O0" and checkpoint['config'].AMP_OPT_LEVEL != "O0": + amp.load_state_dict(checkpoint['amp']) + logger.info(f"=> loaded successfully '{config.MODEL.RESUME}' (epoch {checkpoint['epoch']})") + if 'max_accuracy' in checkpoint: + max_accuracy = checkpoint['max_accuracy'] + if model_ema is not None: + unwrap_model(model_ema).load_state_dict(checkpoint['ema']) + print('=================================================== EMAloaded') + + del checkpoint + torch.cuda.empty_cache() + return max_accuracy + + +def load_weights(model, path): + checkpoint = torch.load(path, map_location='cpu') + if 'model' in checkpoint: + checkpoint = checkpoint['model'] + if 'state_dict' in checkpoint: + checkpoint = checkpoint['state_dict'] + unwrap_model(model).load_state_dict(checkpoint, strict=False) + print('=================== loaded from', path) + +def save_latest(config, epoch, model, max_accuracy, optimizer, lr_scheduler, logger, model_ema=None): + save_state = {'model': model.state_dict(), + 'optimizer': optimizer.state_dict(), + 'lr_scheduler': lr_scheduler.state_dict(), + 'max_accuracy': max_accuracy, + 'epoch': epoch, + 'config': config} + if config.AMP_OPT_LEVEL != "O0": + save_state['amp'] = amp.state_dict() + if model_ema is not None: + save_state['ema'] = unwrap_model(model_ema).state_dict() + + save_path = os.path.join(config.OUTPUT, 'latest.pth') + logger.info(f"{save_path} saving......") + torch.save(save_state, save_path) + 
logger.info(f"{save_path} saved !!!") + +def save_checkpoint(config, epoch, model, max_accuracy, optimizer, lr_scheduler, logger, is_best=False, model_ema=None): + save_state = {'model': model.state_dict(), + 'optimizer': optimizer.state_dict(), + 'lr_scheduler': lr_scheduler.state_dict(), + 'max_accuracy': max_accuracy, + 'epoch': epoch, + 'config': config} + if config.AMP_OPT_LEVEL != "O0": + save_state['amp'] = amp.state_dict() + if model_ema is not None: + save_state['ema'] = unwrap_model(model_ema).state_dict() + + if is_best: + best_path = os.path.join(config.OUTPUT, 'best_ckpt.pth') + torch.save(save_state, best_path) + + save_path = os.path.join(config.OUTPUT, f'ckpt_epoch_{epoch}.pth') + logger.info(f"{save_path} saving......") + torch.save(save_state, save_path) + logger.info(f"{save_path} saved !!!") + + +def get_grad_norm(parameters, norm_type=2): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + total_norm = total_norm ** (1. / norm_type) + return total_norm + + +import torch.distributed as dist + +def auto_resume_helper(output_dir): + checkpoints = os.listdir(output_dir) + checkpoints = [ckpt for ckpt in checkpoints if ckpt.endswith('pth') and 'ema' not in ckpt] + print(f"All checkpoints founded in {output_dir}: {checkpoints}") + if len(checkpoints) > 0: + latest_checkpoint = max([os.path.join(output_dir, d) for d in checkpoints], key=os.path.getmtime) + print(f"The latest checkpoint founded: {latest_checkpoint}") + resume_file = latest_checkpoint + else: + resume_file = None + return resume_file + + +def reduce_tensor(tensor): + rt = tensor.clone() + dist.all_reduce(rt, op=dist.ReduceOp.SUM) + rt /= dist.get_world_size() + return rt + +def update_model_ema(cfg, num_gpus, model, model_ema, cur_epoch, cur_iter): + """Update exponential moving average (ema) of model weights.""" + update_period = cfg.TRAIN.EMA_UPDATE_PERIOD + if update_period is None or update_period == 0 or cur_iter % update_period != 0: + return + # Adjust alpha to be fairly independent of other parameters + total_batch_size = num_gpus * cfg.DATA.BATCH_SIZE + adjust = total_batch_size / cfg.TRAIN.EPOCHS * update_period + # print('ema adjust', adjust) + alpha = min(1.0, cfg.TRAIN.EMA_ALPHA * adjust) + # During warmup simply copy over weights instead of using ema + alpha = 1.0 if cur_epoch < cfg.TRAIN.WARMUP_EPOCHS else alpha + # Take ema of all parameters (not just named parameters) + params = unwrap_model(model).state_dict() + for name, param in unwrap_model(model_ema).state_dict().items(): + param.copy_(param * (1.0 - alpha) + params[name] * alpha) \ No newline at end of file