diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/__init__.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e419fff0d973284773a0042b507d03a2fb46dd4
--- /dev/null
+++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/__init__.py
@@ -0,0 +1,3 @@
+from . import models
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/_internally_replaced_utils.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/_internally_replaced_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..18afc3ed93a8272600d73cc240047a0a49f23991
--- /dev/null
+++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/_internally_replaced_utils.py
@@ -0,0 +1,58 @@
+import importlib.machinery
+import os
+
+from torch.hub import _get_torch_home
+
+
+_HOME = os.path.join(_get_torch_home(), "datasets", "vision")
+_USE_SHARDED_DATASETS = False
+
+
+def _download_file_from_remote_location(fpath: str, url: str) -> None:
+    pass
+
+
+def _is_remote_location_available() -> bool:
+    return False
+
+
+try:
+    from torch.hub import load_state_dict_from_url  # noqa: 401
+except ImportError:
+    from torch.utils.model_zoo import load_url as load_state_dict_from_url  # noqa: 401
+
+
+def _get_extension_path(lib_name):
+
+    lib_dir = os.path.dirname(__file__)
+    if os.name == "nt":
+        # Register the main torchvision library location on the default DLL path
+        import ctypes
+        import sys
+
+        kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True)
+        with_load_library_flags = hasattr(kernel32, "AddDllDirectory")
+        prev_error_mode = kernel32.SetErrorMode(0x0001)
+
+        if with_load_library_flags:
+            kernel32.AddDllDirectory.restype = ctypes.c_void_p
+
+        if sys.version_info >= (3, 8):
+            os.add_dll_directory(lib_dir)
+        elif with_load_library_flags:
+            res = kernel32.AddDllDirectory(lib_dir)
+            if res is None:
+                err = ctypes.WinError(ctypes.get_last_error())
+                err.strerror += f' Error adding "{lib_dir}" to the DLL directories.'
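+                # AddDllDirectory returned NULL, meaning the DLL search path could not
+                # be extended; surface a WinError that names the directory that failed.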
+ raise err + + kernel32.SetErrorMode(prev_error_mode) + + loader_details = (importlib.machinery.ExtensionFileLoader, importlib.machinery.EXTENSION_SUFFIXES) + + extfinder = importlib.machinery.FileFinder(lib_dir, loader_details) + ext_specs = extfinder.find_spec(lib_name) + if ext_specs is None: + raise ImportError + + return ext_specs.origin diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/__init__.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b6d04500cd4585b18a44150a8fc37a21264f2f41 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/__init__.py @@ -0,0 +1,14 @@ +from .alexnet import * +from .convnext import * +from .resnet import * +from .vgg import * +from .squeezenet import * +from .inception import * +from .densenet import * +from .googlenet import * +from .mobilenet import * +from .mnasnet import * +from .shufflenetv2 import * +from .efficientnet import * +from .regnet import * +from .vision_transformer import * diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/_utils.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f4e1cd845088f491bbb29ee9b4b2548063dc730a --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/_utils.py @@ -0,0 +1,83 @@ +from collections import OrderedDict +from typing import Dict, Optional + +from torch import nn + + +class IntermediateLayerGetter(nn.ModuleDict): + """ + Module wrapper that returns intermediate layers from a model + + It has a strong assumption that the modules have been registered + into the model in the same order as they are used. + This means that one should **not** reuse the same nn.Module + twice in the forward if you want this to work. + + Additionally, it is only able to query submodules that are directly + assigned to the model. So if `model` is passed, `model.feature1` can + be returned, but not `model.feature1.layer2`. + + Args: + model (nn.Module): model on which we will extract the features + return_layers (Dict[name, new_name]): a dict containing the names + of the modules for which the activations will be returned as + the key of the dict, and the value of the dict is the name + of the returned activation (which the user can specify). 
+ + Examples:: + + >>> m = torchvision.models.resnet18(pretrained=True) + >>> # extract layer1 and layer3, giving as names `feat1` and feat2` + >>> new_m = torchvision.models._utils.IntermediateLayerGetter(m, + >>> {'layer1': 'feat1', 'layer3': 'feat2'}) + >>> out = new_m(torch.rand(1, 3, 224, 224)) + >>> print([(k, v.shape) for k, v in out.items()]) + >>> [('feat1', torch.Size([1, 64, 56, 56])), + >>> ('feat2', torch.Size([1, 256, 14, 14]))] + """ + + _version = 2 + __annotations__ = { + "return_layers": Dict[str, str], + } + + def __init__(self, model: nn.Module, return_layers: Dict[str, str]) -> None: + if not set(return_layers).issubset([name for name, _ in model.named_children()]): + raise ValueError("return_layers are not present in model") + orig_return_layers = return_layers + return_layers = {str(k): str(v) for k, v in return_layers.items()} + layers = OrderedDict() + for name, module in model.named_children(): + layers[name] = module + if name in return_layers: + del return_layers[name] + if not return_layers: + break + + super().__init__(layers) + self.return_layers = orig_return_layers + + def forward(self, x): + out = OrderedDict() + for name, module in self.items(): + x = module(x) + if name in self.return_layers: + out_name = self.return_layers[name] + out[out_name] = x + return out + + +def _make_divisible(v: float, divisor: int, min_value: Optional[int] = None) -> int: + """ + This function is taken from the original tf repo. + It ensures that all layers have a channel number that is divisible by 8 + It can be seen here: + https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py + """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
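+    # Illustrative worked examples (assuming divisor=8, as the models in this
+    # package use): _make_divisible(44.8, 8) gives int(48.8) // 8 * 8 = 48, which
+    # needs no correction; _make_divisible(10, 8) first gives 8, and since
+    # 8 < 0.9 * 10 the branch below bumps it up to 16.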
+ if new_v < 0.9 * v: + new_v += divisor + return new_v diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/alexnet.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/alexnet.py new file mode 100644 index 0000000000000000000000000000000000000000..bb812febdc43fa6d428cdd5541f27917649ece62 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/alexnet.py @@ -0,0 +1,69 @@ +from typing import Any + +import torch +import torch.nn as nn + +from .._internally_replaced_utils import load_state_dict_from_url +from ..utils import _log_api_usage_once + + +__all__ = ["AlexNet", "alexnet"] + + +model_urls = { + "alexnet": "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth", +} + + +class AlexNet(nn.Module): + def __init__(self, num_classes: int = 1000, dropout: float = 0.5) -> None: + super().__init__() + _log_api_usage_once(self) + self.features = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(64, 192, kernel_size=5, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(192, 384, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(384, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + ) + self.avgpool = nn.AdaptiveAvgPool2d((6, 6)) + self.classifier = nn.Sequential( + nn.Dropout(p=dropout), + nn.Linear(256 * 6 * 6, 4096), + nn.ReLU(inplace=True), + nn.Dropout(p=dropout), + nn.Linear(4096, 4096), + nn.ReLU(inplace=True), + nn.Linear(4096, num_classes), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.features(x) + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.classifier(x) + return x + + +def alexnet(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> AlexNet: + r"""AlexNet model architecture from the + `"One weird trick..." `_ paper. + The required minimum input size of the model is 63x63. 
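+
+    A minimal usage sketch (illustrative; assumes ``torch`` is imported and uses the
+    default 1000-class head)::
+
+        >>> model = alexnet(pretrained=False)
+        >>> out = model(torch.rand(1, 3, 224, 224))
+        >>> out.shape
+        torch.Size([1, 1000])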
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + model = AlexNet(**kwargs) + if pretrained: + state_dict = load_state_dict_from_url(model_urls["alexnet"], progress=progress) + model.load_state_dict(state_dict) + return model diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/convnext.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/convnext.py new file mode 100644 index 0000000000000000000000000000000000000000..9067b6876fdfef39c5acb1ab776ee1433e05a539 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/convnext.py @@ -0,0 +1,271 @@ +from functools import partial +from typing import Any, Callable, Dict, List, Optional, Sequence + +import torch +from torch import nn, Tensor +from torch.nn import functional as F + +from .._internally_replaced_utils import load_state_dict_from_url +from ..ops.misc import ConvNormActivation +from ..ops.stochastic_depth import StochasticDepth +from ..utils import _log_api_usage_once + + +__all__ = [ + "ConvNeXt", + "convnext_tiny", + "convnext_small", + "convnext_base", + "convnext_large", +] + + +_MODELS_URLS: Dict[str, Optional[str]] = { + "convnext_tiny": "https://download.pytorch.org/models/convnext_tiny-983f1562.pth", + "convnext_small": "https://download.pytorch.org/models/convnext_small-0c510722.pth", + "convnext_base": "https://download.pytorch.org/models/convnext_base-6075fbad.pth", + "convnext_large": "https://download.pytorch.org/models/convnext_large-ea097f82.pth", +} + + +class LayerNorm2d(nn.LayerNorm): + def forward(self, x: Tensor) -> Tensor: + x = x.permute(0, 2, 3, 1) + x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + x = x.permute(0, 3, 1, 2) + return x + + +class Permute(nn.Module): + def __init__(self, dims: List[int]): + super().__init__() + self.dims = dims + + def forward(self, x): + return torch.permute(x, self.dims) + + +class CNBlock(nn.Module): + def __init__( + self, + dim, + layer_scale: float, + stochastic_depth_prob: float, + norm_layer: Optional[Callable[..., nn.Module]] = None, + ) -> None: + super().__init__() + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) + + self.block = nn.Sequential( + nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim, bias=True), + Permute([0, 2, 3, 1]), + norm_layer(dim), + nn.Linear(in_features=dim, out_features=4 * dim, bias=True), + nn.GELU(), + nn.Linear(in_features=4 * dim, out_features=dim, bias=True), + Permute([0, 3, 1, 2]), + ) + self.layer_scale = nn.Parameter(torch.ones(dim, 1, 1) * layer_scale) + self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row") + + def forward(self, input: Tensor) -> Tensor: + result = self.layer_scale * self.block(input) + result = self.stochastic_depth(result) + result += input + return result + + +class CNBlockConfig: + # Stores information listed at Section 3 of the ConvNeXt paper + def __init__( + self, + input_channels: int, + out_channels: Optional[int], + num_layers: int, + ) -> None: + self.input_channels = input_channels + self.out_channels = out_channels + self.num_layers = num_layers + + def __repr__(self) -> str: + s = self.__class__.__name__ + "(" + s += "input_channels={input_channels}" + s += ", out_channels={out_channels}" + s += ", num_layers={num_layers}" + s += ")" + return s.format(**self.__dict__) + + +class ConvNeXt(nn.Module): + def __init__( + self, + block_setting: 
List[CNBlockConfig], + stochastic_depth_prob: float = 0.0, + layer_scale: float = 1e-6, + num_classes: int = 1000, + block: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + **kwargs: Any, + ) -> None: + super().__init__() + _log_api_usage_once(self) + + if not block_setting: + raise ValueError("The block_setting should not be empty") + elif not (isinstance(block_setting, Sequence) and all([isinstance(s, CNBlockConfig) for s in block_setting])): + raise TypeError("The block_setting should be List[CNBlockConfig]") + + if block is None: + block = CNBlock + + if norm_layer is None: + norm_layer = partial(LayerNorm2d, eps=1e-6) + + layers: List[nn.Module] = [] + + # Stem + firstconv_output_channels = block_setting[0].input_channels + layers.append( + ConvNormActivation( + 3, + firstconv_output_channels, + kernel_size=4, + stride=4, + padding=0, + norm_layer=norm_layer, + activation_layer=None, + bias=True, + ) + ) + + total_stage_blocks = sum(cnf.num_layers for cnf in block_setting) + stage_block_id = 0 + for cnf in block_setting: + # Bottlenecks + stage: List[nn.Module] = [] + for _ in range(cnf.num_layers): + # adjust stochastic depth probability based on the depth of the stage block + sd_prob = stochastic_depth_prob * stage_block_id / (total_stage_blocks - 1.0) + stage.append(block(cnf.input_channels, layer_scale, sd_prob)) + stage_block_id += 1 + layers.append(nn.Sequential(*stage)) + if cnf.out_channels is not None: + # Downsampling + layers.append( + nn.Sequential( + norm_layer(cnf.input_channels), + nn.Conv2d(cnf.input_channels, cnf.out_channels, kernel_size=2, stride=2), + ) + ) + + self.features = nn.Sequential(*layers) + self.avgpool = nn.AdaptiveAvgPool2d(1) + + lastblock = block_setting[-1] + lastconv_output_channels = ( + lastblock.out_channels if lastblock.out_channels is not None else lastblock.input_channels + ) + self.classifier = nn.Sequential( + norm_layer(lastconv_output_channels), nn.Flatten(1), nn.Linear(lastconv_output_channels, num_classes) + ) + + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.Linear)): + nn.init.trunc_normal_(m.weight, std=0.02) + if m.bias is not None: + nn.init.zeros_(m.bias) + + def _forward_impl(self, x: Tensor) -> Tensor: + x = self.features(x) + x = self.avgpool(x) + x = self.classifier(x) + return x + + def forward(self, x: Tensor) -> Tensor: + return self._forward_impl(x) + + +def _convnext( + arch: str, + block_setting: List[CNBlockConfig], + stochastic_depth_prob: float, + pretrained: bool, + progress: bool, + **kwargs: Any, +) -> ConvNeXt: + model = ConvNeXt(block_setting, stochastic_depth_prob=stochastic_depth_prob, **kwargs) + if pretrained: + if arch not in _MODELS_URLS: + raise ValueError(f"No checkpoint is available for model type {arch}") + state_dict = load_state_dict_from_url(_MODELS_URLS[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + +def convnext_tiny(*, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ConvNeXt: + r"""ConvNeXt Tiny model architecture from the + `"A ConvNet for the 2020s" `_ paper. 
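+
+    A minimal usage sketch (illustrative; weights are randomly initialized when
+    ``pretrained=False`` and ``torch`` is assumed to be imported)::
+
+        >>> model = convnext_tiny(pretrained=False)
+        >>> model(torch.rand(1, 3, 224, 224)).shape
+        torch.Size([1, 1000])
+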
+ Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + block_setting = [ + CNBlockConfig(96, 192, 3), + CNBlockConfig(192, 384, 3), + CNBlockConfig(384, 768, 9), + CNBlockConfig(768, None, 3), + ] + stochastic_depth_prob = kwargs.pop("stochastic_depth_prob", 0.1) + return _convnext("convnext_tiny", block_setting, stochastic_depth_prob, pretrained, progress, **kwargs) + + +def convnext_small(*, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ConvNeXt: + r"""ConvNeXt Small model architecture from the + `"A ConvNet for the 2020s" `_ paper. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + block_setting = [ + CNBlockConfig(96, 192, 3), + CNBlockConfig(192, 384, 3), + CNBlockConfig(384, 768, 27), + CNBlockConfig(768, None, 3), + ] + stochastic_depth_prob = kwargs.pop("stochastic_depth_prob", 0.4) + return _convnext("convnext_small", block_setting, stochastic_depth_prob, pretrained, progress, **kwargs) + + +def convnext_base(*, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ConvNeXt: + r"""ConvNeXt Base model architecture from the + `"A ConvNet for the 2020s" `_ paper. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + block_setting = [ + CNBlockConfig(128, 256, 3), + CNBlockConfig(256, 512, 3), + CNBlockConfig(512, 1024, 27), + CNBlockConfig(1024, None, 3), + ] + stochastic_depth_prob = kwargs.pop("stochastic_depth_prob", 0.5) + return _convnext("convnext_base", block_setting, stochastic_depth_prob, pretrained, progress, **kwargs) + + +def convnext_large(*, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ConvNeXt: + r"""ConvNeXt Large model architecture from the + `"A ConvNet for the 2020s" `_ paper. 
+ Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + block_setting = [ + CNBlockConfig(192, 384, 3), + CNBlockConfig(384, 768, 3), + CNBlockConfig(768, 1536, 27), + CNBlockConfig(1536, None, 3), + ] + stochastic_depth_prob = kwargs.pop("stochastic_depth_prob", 0.5) + return _convnext("convnext_large", block_setting, stochastic_depth_prob, pretrained, progress, **kwargs) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/densenet.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/densenet.py new file mode 100644 index 0000000000000000000000000000000000000000..14e318360afc03f11d14ebd5c2e1be9811923b7a --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/densenet.py @@ -0,0 +1,310 @@ +import re +from collections import OrderedDict +from typing import Any, List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from torch import Tensor + +from .._internally_replaced_utils import load_state_dict_from_url +from ..utils import _log_api_usage_once + + +__all__ = ["DenseNet", "densenet121", "densenet169", "densenet201", "densenet161"] + +model_urls = { + "densenet121": "https://download.pytorch.org/models/densenet121-a639ec97.pth", + "densenet169": "https://download.pytorch.org/models/densenet169-b2777c0a.pth", + "densenet201": "https://download.pytorch.org/models/densenet201-c1103571.pth", + "densenet161": "https://download.pytorch.org/models/densenet161-8d451a50.pth", +} + + +class _DenseLayer(nn.Module): + def __init__( + self, num_input_features: int, growth_rate: int, bn_size: int, drop_rate: float, memory_efficient: bool = False + ) -> None: + super().__init__() + self.norm1: nn.BatchNorm2d + self.add_module("norm1", nn.BatchNorm2d(num_input_features)) + self.relu1: nn.ReLU + self.add_module("relu1", nn.ReLU(inplace=True)) + self.conv1: nn.Conv2d + self.add_module( + "conv1", nn.Conv2d(num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False) + ) + self.norm2: nn.BatchNorm2d + self.add_module("norm2", nn.BatchNorm2d(bn_size * growth_rate)) + self.relu2: nn.ReLU + self.add_module("relu2", nn.ReLU(inplace=True)) + self.conv2: nn.Conv2d + self.add_module( + "conv2", nn.Conv2d(bn_size * growth_rate, growth_rate, kernel_size=3, stride=1, padding=1, bias=False) + ) + self.drop_rate = float(drop_rate) + self.memory_efficient = memory_efficient + + def bn_function(self, inputs: List[Tensor]) -> Tensor: + concated_features = torch.cat(inputs, 1) + bottleneck_output = self.conv1(self.relu1(self.norm1(concated_features))) # noqa: T484 + return bottleneck_output + + # todo: rewrite when torchscript supports any + def any_requires_grad(self, input: List[Tensor]) -> bool: + for tensor in input: + if tensor.requires_grad: + return True + return False + + @torch.jit.unused # noqa: T484 + def call_checkpoint_bottleneck(self, input: List[Tensor]) -> Tensor: + def closure(*inputs): + return self.bn_function(inputs) + + return cp.checkpoint(closure, *input) + + @torch.jit._overload_method # noqa: F811 + def forward(self, input: List[Tensor]) -> Tensor: # noqa: F811 + pass + + @torch.jit._overload_method # noqa: F811 + def forward(self, input: Tensor) -> Tensor: # noqa: F811 + pass + + # torchscript does not yet support *args, so we overload method + # allowing it to take either a List[Tensor] or single Tensor + def forward(self, input: Tensor) -> 
Tensor: # noqa: F811 + if isinstance(input, Tensor): + prev_features = [input] + else: + prev_features = input + + if self.memory_efficient and self.any_requires_grad(prev_features): + if torch.jit.is_scripting(): + raise Exception("Memory Efficient not supported in JIT") + + bottleneck_output = self.call_checkpoint_bottleneck(prev_features) + else: + bottleneck_output = self.bn_function(prev_features) + + new_features = self.conv2(self.relu2(self.norm2(bottleneck_output))) + if self.drop_rate > 0: + new_features = F.dropout(new_features, p=self.drop_rate, training=self.training) + return new_features + + +class _DenseBlock(nn.ModuleDict): + _version = 2 + + def __init__( + self, + num_layers: int, + num_input_features: int, + bn_size: int, + growth_rate: int, + drop_rate: float, + memory_efficient: bool = False, + ) -> None: + super().__init__() + for i in range(num_layers): + layer = _DenseLayer( + num_input_features + i * growth_rate, + growth_rate=growth_rate, + bn_size=bn_size, + drop_rate=drop_rate, + memory_efficient=memory_efficient, + ) + self.add_module("denselayer%d" % (i + 1), layer) + + def forward(self, init_features: Tensor) -> Tensor: + features = [init_features] + for name, layer in self.items(): + new_features = layer(features) + features.append(new_features) + return torch.cat(features, 1) + + +class _Transition(nn.Sequential): + def __init__(self, num_input_features: int, num_output_features: int) -> None: + super().__init__() + self.add_module("norm", nn.BatchNorm2d(num_input_features)) + self.add_module("relu", nn.ReLU(inplace=True)) + self.add_module("conv", nn.Conv2d(num_input_features, num_output_features, kernel_size=1, stride=1, bias=False)) + self.add_module("pool", nn.AvgPool2d(kernel_size=2, stride=2)) + + +class DenseNet(nn.Module): + r"""Densenet-BC model class, based on + `"Densely Connected Convolutional Networks" `_. + + Args: + growth_rate (int) - how many filters to add each layer (`k` in paper) + block_config (list of 4 ints) - how many layers in each pooling block + num_init_features (int) - the number of filters to learn in the first convolution layer + bn_size (int) - multiplicative factor for number of bottle neck layers + (i.e. bn_size * k features in the bottleneck layer) + drop_rate (float) - dropout rate after each dense layer + num_classes (int) - number of classification classes + memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient, + but slower. Default: *False*. See `"paper" `_. 
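+
+    Example (an illustrative sketch of the DenseNet-121 configuration used by
+    ``densenet121`` below; assumes ``torch`` is imported)::
+
+        >>> model = DenseNet(growth_rate=32, block_config=(6, 12, 24, 16), num_init_features=64)
+        >>> model(torch.rand(1, 3, 224, 224)).shape
+        torch.Size([1, 1000])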
+ """ + + def __init__( + self, + growth_rate: int = 32, + block_config: Tuple[int, int, int, int] = (6, 12, 24, 16), + num_init_features: int = 64, + bn_size: int = 4, + drop_rate: float = 0, + num_classes: int = 1000, + memory_efficient: bool = False, + ) -> None: + + super().__init__() + _log_api_usage_once(self) + + # First convolution + self.features = nn.Sequential( + OrderedDict( + [ + ("conv0", nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)), + ("norm0", nn.BatchNorm2d(num_init_features)), + ("relu0", nn.ReLU(inplace=True)), + ("pool0", nn.MaxPool2d(kernel_size=3, stride=2, padding=1)), + ] + ) + ) + + # Each denseblock + num_features = num_init_features + for i, num_layers in enumerate(block_config): + block = _DenseBlock( + num_layers=num_layers, + num_input_features=num_features, + bn_size=bn_size, + growth_rate=growth_rate, + drop_rate=drop_rate, + memory_efficient=memory_efficient, + ) + self.features.add_module("denseblock%d" % (i + 1), block) + num_features = num_features + num_layers * growth_rate + if i != len(block_config) - 1: + trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2) + self.features.add_module("transition%d" % (i + 1), trans) + num_features = num_features // 2 + + # Final batch norm + self.features.add_module("norm5", nn.BatchNorm2d(num_features)) + + # Linear layer + self.classifier = nn.Linear(num_features, num_classes) + + # Official init from torch repo. + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.constant_(m.bias, 0) + + def forward(self, x: Tensor) -> Tensor: + features = self.features(x) + out = F.relu(features, inplace=True) + out = F.adaptive_avg_pool2d(out, (1, 1)) + out = torch.flatten(out, 1) + out = self.classifier(out) + return out + + +def _load_state_dict(model: nn.Module, model_url: str, progress: bool) -> None: + # '.'s are no longer allowed in module names, but previous _DenseLayer + # has keys 'norm.1', 'relu.1', 'conv.1', 'norm.2', 'relu.2', 'conv.2'. + # They are also in the checkpoints in model_urls. This pattern is used + # to find such keys. + pattern = re.compile( + r"^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$" + ) + + state_dict = load_state_dict_from_url(model_url, progress=progress) + for key in list(state_dict.keys()): + res = pattern.match(key) + if res: + new_key = res.group(1) + res.group(2) + state_dict[new_key] = state_dict[key] + del state_dict[key] + model.load_state_dict(state_dict) + + +def _densenet( + arch: str, + growth_rate: int, + block_config: Tuple[int, int, int, int], + num_init_features: int, + pretrained: bool, + progress: bool, + **kwargs: Any, +) -> DenseNet: + model = DenseNet(growth_rate, block_config, num_init_features, **kwargs) + if pretrained: + _load_state_dict(model, model_urls[arch], progress) + return model + + +def densenet121(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> DenseNet: + r"""Densenet-121 model from + `"Densely Connected Convolutional Networks" `_. + The required minimum input size of the model is 29x29. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + memory_efficient (bool) - If True, uses checkpointing. 
Much more memory efficient, + but slower. Default: *False*. See `"paper" `_. + """ + return _densenet("densenet121", 32, (6, 12, 24, 16), 64, pretrained, progress, **kwargs) + + +def densenet161(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> DenseNet: + r"""Densenet-161 model from + `"Densely Connected Convolutional Networks" `_. + The required minimum input size of the model is 29x29. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient, + but slower. Default: *False*. See `"paper" `_. + """ + return _densenet("densenet161", 48, (6, 12, 36, 24), 96, pretrained, progress, **kwargs) + + +def densenet169(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> DenseNet: + r"""Densenet-169 model from + `"Densely Connected Convolutional Networks" `_. + The required minimum input size of the model is 29x29. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient, + but slower. Default: *False*. See `"paper" `_. + """ + return _densenet("densenet169", 32, (6, 12, 32, 32), 64, pretrained, progress, **kwargs) + + +def densenet201(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> DenseNet: + r"""Densenet-201 model from + `"Densely Connected Convolutional Networks" `_. + The required minimum input size of the model is 29x29. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient, + but slower. Default: *False*. See `"paper" `_. 
+ """ + return _densenet("densenet201", 32, (6, 12, 48, 32), 64, pretrained, progress, **kwargs) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/efficientnet.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/efficientnet.py new file mode 100644 index 0000000000000000000000000000000000000000..e2f1086329dc4c9676e68e88e08184929bac276f --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/efficientnet.py @@ -0,0 +1,418 @@ +import copy +import math +from functools import partial +from typing import Any, Callable, Optional, List, Sequence + +import torch +from torch import nn, Tensor +from ..ops import StochasticDepth + +from .._internally_replaced_utils import load_state_dict_from_url +from ..ops.misc import ConvNormActivation, SqueezeExcitation +from ..utils import _log_api_usage_once +from ._utils import _make_divisible + + +__all__ = [ + "EfficientNet", + "efficientnet_b0", + "efficientnet_b1", + "efficientnet_b2", + "efficientnet_b3", + "efficientnet_b4", + "efficientnet_b5", + "efficientnet_b6", + "efficientnet_b7", +] + + +model_urls = { + # Weights ported from https://github.com/rwightman/pytorch-image-models/ + "efficientnet_b0": "https://download.pytorch.org/models/efficientnet_b0_rwightman-3dd342df.pth", + "efficientnet_b1": "https://download.pytorch.org/models/efficientnet_b1_rwightman-533bc792.pth", + "efficientnet_b2": "https://download.pytorch.org/models/efficientnet_b2_rwightman-bcdf34b7.pth", + "efficientnet_b3": "https://download.pytorch.org/models/efficientnet_b3_rwightman-cf984f9c.pth", + "efficientnet_b4": "https://download.pytorch.org/models/efficientnet_b4_rwightman-7eb33cd5.pth", + # Weights ported from https://github.com/lukemelas/EfficientNet-PyTorch/ + "efficientnet_b5": "https://download.pytorch.org/models/efficientnet_b5_lukemelas-b6417697.pth", + "efficientnet_b6": "https://download.pytorch.org/models/efficientnet_b6_lukemelas-c76e70fd.pth", + "efficientnet_b7": "https://download.pytorch.org/models/efficientnet_b7_lukemelas-dcc49843.pth", +} + + +class MBConvConfig: + # Stores information listed at Table 1 of the EfficientNet paper + def __init__( + self, + expand_ratio: float, + kernel: int, + stride: int, + input_channels: int, + out_channels: int, + num_layers: int, + width_mult: float, + depth_mult: float, + ) -> None: + self.expand_ratio = expand_ratio + self.kernel = kernel + self.stride = stride + self.input_channels = self.adjust_channels(input_channels, width_mult) + self.out_channels = self.adjust_channels(out_channels, width_mult) + self.num_layers = self.adjust_depth(num_layers, depth_mult) + + def __repr__(self) -> str: + s = ( + f"{self.__class__.__name__}(" + f"expand_ratio={self.expand_ratio}" + f", kernel={self.kernel}" + f", stride={self.stride}" + f", input_channels={self.input_channels}" + f", out_channels={self.out_channels}" + f", num_layers={self.num_layers}" + f")" + ) + return s + + @staticmethod + def adjust_channels(channels: int, width_mult: float, min_value: Optional[int] = None) -> int: + return _make_divisible(channels * width_mult, 8, min_value) + + @staticmethod + def adjust_depth(num_layers: int, depth_mult: float): + return int(math.ceil(num_layers * depth_mult)) + + +class MBConv(nn.Module): + def __init__( + self, + cnf: MBConvConfig, + stochastic_depth_prob: float, + norm_layer: Callable[..., nn.Module], + se_layer: Callable[..., nn.Module] = SqueezeExcitation, + ) -> None: + super().__init__() + + if not (1 <= cnf.stride <= 2): + raise ValueError("illegal 
stride value") + + self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels + + layers: List[nn.Module] = [] + activation_layer = nn.SiLU + + # expand + expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio) + if expanded_channels != cnf.input_channels: + layers.append( + ConvNormActivation( + cnf.input_channels, + expanded_channels, + kernel_size=1, + norm_layer=norm_layer, + activation_layer=activation_layer, + ) + ) + + # depthwise + layers.append( + ConvNormActivation( + expanded_channels, + expanded_channels, + kernel_size=cnf.kernel, + stride=cnf.stride, + groups=expanded_channels, + norm_layer=norm_layer, + activation_layer=activation_layer, + ) + ) + + # squeeze and excitation + squeeze_channels = max(1, cnf.input_channels // 4) + layers.append(se_layer(expanded_channels, squeeze_channels, activation=partial(nn.SiLU, inplace=True))) + + # project + layers.append( + ConvNormActivation( + expanded_channels, cnf.out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=None + ) + ) + + self.block = nn.Sequential(*layers) + self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row") + self.out_channels = cnf.out_channels + + def forward(self, input: Tensor) -> Tensor: + result = self.block(input) + if self.use_res_connect: + result = self.stochastic_depth(result) + result += input + return result + + +class EfficientNet(nn.Module): + def __init__( + self, + inverted_residual_setting: List[MBConvConfig], + dropout: float, + stochastic_depth_prob: float = 0.2, + num_classes: int = 1000, + block: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + **kwargs: Any, + ) -> None: + """ + EfficientNet main class + + Args: + inverted_residual_setting (List[MBConvConfig]): Network structure + dropout (float): The droupout probability + stochastic_depth_prob (float): The stochastic depth probability + num_classes (int): Number of classes + block (Optional[Callable[..., nn.Module]]): Module specifying inverted residual building block for mobilenet + norm_layer (Optional[Callable[..., nn.Module]]): Module specifying the normalization layer to use + """ + super().__init__() + _log_api_usage_once(self) + + if not inverted_residual_setting: + raise ValueError("The inverted_residual_setting should not be empty") + elif not ( + isinstance(inverted_residual_setting, Sequence) + and all([isinstance(s, MBConvConfig) for s in inverted_residual_setting]) + ): + raise TypeError("The inverted_residual_setting should be List[MBConvConfig]") + + if block is None: + block = MBConv + + if norm_layer is None: + norm_layer = nn.BatchNorm2d + + layers: List[nn.Module] = [] + + # building first layer + firstconv_output_channels = inverted_residual_setting[0].input_channels + layers.append( + ConvNormActivation( + 3, firstconv_output_channels, kernel_size=3, stride=2, norm_layer=norm_layer, activation_layer=nn.SiLU + ) + ) + + # building inverted residual blocks + total_stage_blocks = sum(cnf.num_layers for cnf in inverted_residual_setting) + stage_block_id = 0 + for cnf in inverted_residual_setting: + stage: List[nn.Module] = [] + for _ in range(cnf.num_layers): + # copy to avoid modifications. 
shallow copy is enough + block_cnf = copy.copy(cnf) + + # overwrite info if not the first conv in the stage + if stage: + block_cnf.input_channels = block_cnf.out_channels + block_cnf.stride = 1 + + # adjust stochastic depth probability based on the depth of the stage block + sd_prob = stochastic_depth_prob * float(stage_block_id) / total_stage_blocks + + stage.append(block(block_cnf, sd_prob, norm_layer)) + stage_block_id += 1 + + layers.append(nn.Sequential(*stage)) + + # building last several layers + lastconv_input_channels = inverted_residual_setting[-1].out_channels + lastconv_output_channels = 4 * lastconv_input_channels + layers.append( + ConvNormActivation( + lastconv_input_channels, + lastconv_output_channels, + kernel_size=1, + norm_layer=norm_layer, + activation_layer=nn.SiLU, + ) + ) + + self.features = nn.Sequential(*layers) + self.avgpool = nn.AdaptiveAvgPool2d(1) + self.classifier = nn.Sequential( + nn.Dropout(p=dropout, inplace=True), + nn.Linear(lastconv_output_channels, num_classes), + ) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out") + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + init_range = 1.0 / math.sqrt(m.out_features) + nn.init.uniform_(m.weight, -init_range, init_range) + nn.init.zeros_(m.bias) + + def _forward_impl(self, x: Tensor) -> Tensor: + x = self.features(x) + + x = self.avgpool(x) + x = torch.flatten(x, 1) + + x = self.classifier(x) + + return x + + def forward(self, x: Tensor) -> Tensor: + return self._forward_impl(x) + + +def _efficientnet( + arch: str, + width_mult: float, + depth_mult: float, + dropout: float, + pretrained: bool, + progress: bool, + **kwargs: Any, +) -> EfficientNet: + bneck_conf = partial(MBConvConfig, width_mult=width_mult, depth_mult=depth_mult) + inverted_residual_setting = [ + bneck_conf(1, 3, 1, 32, 16, 1), + bneck_conf(6, 3, 2, 16, 24, 2), + bneck_conf(6, 5, 2, 24, 40, 2), + bneck_conf(6, 3, 2, 40, 80, 3), + bneck_conf(6, 5, 1, 80, 112, 3), + bneck_conf(6, 5, 2, 112, 192, 4), + bneck_conf(6, 3, 1, 192, 320, 1), + ] + model = EfficientNet(inverted_residual_setting, dropout, **kwargs) + if pretrained: + if model_urls.get(arch, None) is None: + raise ValueError(f"No checkpoint is available for model type {arch}") + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + +def efficientnet_b0(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNet: + """ + Constructs a EfficientNet B0 architecture from + `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _efficientnet("efficientnet_b0", 1.0, 1.0, 0.2, pretrained, progress, **kwargs) + + +def efficientnet_b1(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNet: + """ + Constructs a EfficientNet B1 architecture from + `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _efficientnet("efficientnet_b1", 1.0, 1.1, 0.2, pretrained, progress, **kwargs) + + +def efficientnet_b2(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNet: + """ + Constructs a EfficientNet B2 architecture from + `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _efficientnet("efficientnet_b2", 1.1, 1.2, 0.3, pretrained, progress, **kwargs) + + +def efficientnet_b3(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNet: + """ + Constructs a EfficientNet B3 architecture from + `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _efficientnet("efficientnet_b3", 1.2, 1.4, 0.3, pretrained, progress, **kwargs) + + +def efficientnet_b4(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNet: + """ + Constructs a EfficientNet B4 architecture from + `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _efficientnet("efficientnet_b4", 1.4, 1.8, 0.4, pretrained, progress, **kwargs) + + +def efficientnet_b5(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNet: + """ + Constructs a EfficientNet B5 architecture from + `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _efficientnet( + "efficientnet_b5", + 1.6, + 2.2, + 0.4, + pretrained, + progress, + norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.01), + **kwargs, + ) + + +def efficientnet_b6(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNet: + """ + Constructs a EfficientNet B6 architecture from + `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _efficientnet( + "efficientnet_b6", + 1.8, + 2.6, + 0.5, + pretrained, + progress, + norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.01), + **kwargs, + ) + + +def efficientnet_b7(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNet: + """ + Constructs a EfficientNet B7 architecture from + `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. 
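+
+    A minimal usage sketch (illustrative; B7 is normally evaluated at a much higher
+    resolution than the 224x224 used here, and the default head has 1000 classes)::
+
+        >>> model = efficientnet_b7(pretrained=False)
+        >>> model(torch.rand(1, 3, 224, 224)).shape
+        torch.Size([1, 1000])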
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _efficientnet( + "efficientnet_b7", + 2.0, + 3.1, + 0.5, + pretrained, + progress, + norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.01), + **kwargs, + ) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/feature_extraction.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/feature_extraction.py new file mode 100644 index 0000000000000000000000000000000000000000..0a2b597da2368959eb74596547e87901cc7f8ea3 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/feature_extraction.py @@ -0,0 +1,547 @@ +import inspect +import math +import re +import warnings +from collections import OrderedDict +from copy import deepcopy +from itertools import chain +from typing import Dict, Callable, List, Union, Optional, Tuple, Any + +import torch +import torchvision +from torch import fx +from torch import nn +from torch.fx.graph_module import _copy_attr + + +__all__ = ["create_feature_extractor", "get_graph_node_names"] + + +class LeafModuleAwareTracer(fx.Tracer): + """ + An fx.Tracer that allows the user to specify a set of leaf modules, ie. + modules that are not to be traced through. The resulting graph ends up + having single nodes referencing calls to the leaf modules' forward methods. + """ + + def __init__(self, *args, **kwargs): + self.leaf_modules = {} + if "leaf_modules" in kwargs: + leaf_modules = kwargs.pop("leaf_modules") + self.leaf_modules = leaf_modules + super().__init__(*args, **kwargs) + + def is_leaf_module(self, m: nn.Module, module_qualname: str) -> bool: + if isinstance(m, tuple(self.leaf_modules)): + return True + return super().is_leaf_module(m, module_qualname) + + +class NodePathTracer(LeafModuleAwareTracer): + """ + NodePathTracer is an FX tracer that, for each operation, also records the + name of the Node from which the operation originated. A node name here is + a `.` separated path walking the hierarchy from top level module down to + leaf operation or leaf module. The name of the top level module is not + included as part of the node name. For example, if we trace a module whose + forward method applies a ReLU module, the name for that node will simply + be 'relu'. + + Some notes on the specifics: + - Nodes are recorded to `self.node_to_qualname` which is a dictionary + mapping a given Node object to its node name. + - Nodes are recorded in the order which they are executed during + tracing. + - When a duplicate node name is encountered, a suffix of the form + _{int} is added. The counter starts from 1. 
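+
+    For example (an illustrative sketch), tracing a module whose ``forward`` applies
+    the same ``nn.ReLU`` submodule twice records the names ``relu`` and ``relu_1``,
+    after the ``x`` placeholder::
+
+        >>> class TwoRelu(nn.Module):
+        ...     def __init__(self):
+        ...         super().__init__()
+        ...         self.relu = nn.ReLU()
+        ...     def forward(self, x):
+        ...         return self.relu(self.relu(x))
+        >>> tracer = NodePathTracer()
+        >>> graph = tracer.trace(TwoRelu())
+        >>> list(tracer.node_to_qualname.values())
+        ['x', 'relu', 'relu_1']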
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Track the qualified name of the Node being traced + self.current_module_qualname = "" + # A map from FX Node to the qualified name\# + # NOTE: This is loosely like the "qualified name" mentioned in the + # torch.fx docs https://pytorch.org/docs/stable/fx.html but adapted + # for the purposes of the torchvision feature extractor + self.node_to_qualname = OrderedDict() + + def call_module(self, m: torch.nn.Module, forward: Callable, args, kwargs): + """ + Override of `fx.Tracer.call_module` + This override: + 1) Stores away the qualified name of the caller for restoration later + 2) Adds the qualified name of the caller to + `current_module_qualname` for retrieval by `create_proxy` + 3) Once a leaf module is reached, calls `create_proxy` + 4) Restores the caller's qualified name into current_module_qualname + """ + old_qualname = self.current_module_qualname + try: + module_qualname = self.path_of_module(m) + self.current_module_qualname = module_qualname + if not self.is_leaf_module(m, module_qualname): + out = forward(*args, **kwargs) + return out + return self.create_proxy("call_module", module_qualname, args, kwargs) + finally: + self.current_module_qualname = old_qualname + + def create_proxy( + self, kind: str, target: fx.node.Target, args, kwargs, name=None, type_expr=None, *_ + ) -> fx.proxy.Proxy: + """ + Override of `Tracer.create_proxy`. This override intercepts the recording + of every operation and stores away the current traced module's qualified + name in `node_to_qualname` + """ + proxy = super().create_proxy(kind, target, args, kwargs, name, type_expr) + self.node_to_qualname[proxy.node] = self._get_node_qualname(self.current_module_qualname, proxy.node) + return proxy + + def _get_node_qualname(self, module_qualname: str, node: fx.node.Node) -> str: + node_qualname = module_qualname + + if node.op != "call_module": + # In this case module_qualname from torch.fx doesn't go all the + # way to the leaf function/op so we need to append it + if len(node_qualname) > 0: + # Only append '.' if we are deeper than the top level module + node_qualname += "." + node_qualname += str(node) + + # Now we need to add an _{index} postfix on any repeated node names + # For modules we do this from scratch + # But for anything else, torch.fx already has a globally scoped + # _{index} postfix. But we want it locally (relative to direct parent) + # scoped. So first we need to undo the torch.fx postfix + if re.match(r".+_[0-9]+$", node_qualname) is not None: + node_qualname = node_qualname.rsplit("_", 1)[0] + + # ... 
and now we add on our own postfix + for existing_qualname in reversed(self.node_to_qualname.values()): + # Check to see if existing_qualname is of the form + # {node_qualname} or {node_qualname}_{int} + if re.match(rf"{node_qualname}(_[0-9]+)?$", existing_qualname) is not None: + postfix = existing_qualname.replace(node_qualname, "") + if len(postfix): + # existing_qualname is of the form {node_qualname}_{int} + next_index = int(postfix[1:]) + 1 + else: + # existing_qualname is of the form {node_qualname} + next_index = 1 + node_qualname += f"_{next_index}" + break + + return node_qualname + + +def _is_subseq(x, y): + """Check if y is a subseqence of x + https://stackoverflow.com/a/24017747/4391249 + """ + iter_x = iter(x) + return all(any(x_item == y_item for x_item in iter_x) for y_item in y) + + +def _warn_graph_differences(train_tracer: NodePathTracer, eval_tracer: NodePathTracer): + """ + Utility function for warning the user if there are differences between + the train graph nodes and the eval graph nodes. + """ + train_nodes = list(train_tracer.node_to_qualname.values()) + eval_nodes = list(eval_tracer.node_to_qualname.values()) + + if len(train_nodes) == len(eval_nodes) and all(t == e for t, e in zip(train_nodes, eval_nodes)): + return + + suggestion_msg = ( + "When choosing nodes for feature extraction, you may need to specify " + "output nodes for train and eval mode separately." + ) + + if _is_subseq(train_nodes, eval_nodes): + msg = ( + "NOTE: The nodes obtained by tracing the model in eval mode " + "are a subsequence of those obtained in train mode. " + ) + elif _is_subseq(eval_nodes, train_nodes): + msg = ( + "NOTE: The nodes obtained by tracing the model in train mode " + "are a subsequence of those obtained in eval mode. " + ) + else: + msg = "The nodes obtained by tracing the model in train mode are different to those obtained in eval mode. " + warnings.warn(msg + suggestion_msg) + + +def _get_leaf_modules_for_ops() -> List[type]: + members = inspect.getmembers(torchvision.ops) + result = [] + for _, obj in members: + if inspect.isclass(obj) and issubclass(obj, torch.nn.Module): + result.append(obj) + return result + + +def get_graph_node_names( + model: nn.Module, + tracer_kwargs: Optional[Dict[str, Any]] = None, + suppress_diff_warning: bool = False, +) -> Tuple[List[str], List[str]]: + """ + Dev utility to return node names in order of execution. See note on node + names under :func:`create_feature_extractor`. Useful for seeing which node + names are available for feature extraction. There are two reasons that + node names can't easily be read directly from the code for a model: + + 1. Not all submodules are traced through. Modules from ``torch.nn`` all + fall within this category. + 2. Nodes representing the repeated application of the same operation + or leaf module get a ``_{counter}`` postfix. + + The model is traced twice: once in train mode, and once in eval mode. Both + sets of node names are returned. + + For more details on the node naming conventions used here, please see the + :ref:`relevant subheading ` in the + `documentation `_. + + Args: + model (nn.Module): model for which we'd like to print node names + tracer_kwargs (dict, optional): a dictionary of keywork arguments for + ``NodePathTracer`` (they are eventually passed onto + `torch.fx.Tracer `_). + By default it will be set to wrap and make leaf nodes all torchvision ops. 
+ suppress_diff_warning (bool, optional): whether to suppress a warning + when there are discrepancies between the train and eval version of + the graph. Defaults to False. + + Returns: + tuple(list, list): a list of node names from tracing the model in + train mode, and another from tracing the model in eval mode. + + Examples:: + + >>> model = torchvision.models.resnet18() + >>> train_nodes, eval_nodes = get_graph_node_names(model) + """ + if tracer_kwargs is None: + tracer_kwargs = { + "autowrap_modules": ( + math, + torchvision.ops, + ), + "leaf_modules": _get_leaf_modules_for_ops(), + } + is_training = model.training + train_tracer = NodePathTracer(**tracer_kwargs) + train_tracer.trace(model.train()) + eval_tracer = NodePathTracer(**tracer_kwargs) + eval_tracer.trace(model.eval()) + train_nodes = list(train_tracer.node_to_qualname.values()) + eval_nodes = list(eval_tracer.node_to_qualname.values()) + if not suppress_diff_warning: + _warn_graph_differences(train_tracer, eval_tracer) + # Restore training state + model.train(is_training) + return train_nodes, eval_nodes + + +class DualGraphModule(fx.GraphModule): + """ + A derivative of `fx.GraphModule`. Differs in the following ways: + - Requires a train and eval version of the underlying graph + - Copies submodules according to the nodes of both train and eval graphs. + - Calling train(mode) switches between train graph and eval graph. + """ + + def __init__( + self, root: torch.nn.Module, train_graph: fx.Graph, eval_graph: fx.Graph, class_name: str = "GraphModule" + ): + """ + Args: + root (nn.Module): module from which the copied module hierarchy is + built + train_graph (fx.Graph): the graph that should be used in train mode + eval_graph (fx.Graph): the graph that should be used in eval mode + """ + super(fx.GraphModule, self).__init__() + + self.__class__.__name__ = class_name + + self.train_graph = train_graph + self.eval_graph = eval_graph + + # Copy all get_attr and call_module ops (indicated by BOTH train and + # eval graphs) + for node in chain(iter(train_graph.nodes), iter(eval_graph.nodes)): + if node.op in ["get_attr", "call_module"]: + assert isinstance(node.target, str) + _copy_attr(root, self, node.target) + + # train mode by default + self.train() + self.graph = train_graph + + # (borrowed from fx.GraphModule): + # Store the Tracer class responsible for creating a Graph separately as part of the + # GraphModule state, except when the Tracer is defined in a local namespace. + # Locally defined Tracers are not pickleable. This is needed because torch.package will + # serialize a GraphModule without retaining the Graph, and needs to use the correct Tracer + # to re-create the Graph during deserialization. + assert ( + self.eval_graph._tracer_cls == self.train_graph._tracer_cls + ), "Train mode and eval mode should use the same tracer class" + self._tracer_cls = None + if self.graph._tracer_cls and "" not in self.graph._tracer_cls.__qualname__: + self._tracer_cls = self.graph._tracer_cls + + def train(self, mode=True): + """ + Swap out the graph depending on the selected training mode. + NOTE this should be safe when calling model.eval() because that just + calls this with mode == False. + """ + # NOTE: Only set self.graph if the current graph is not the desired + # one. This saves us from recompiling the graph where not necessary. 
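+        # For example (illustrative): calling extractor.eval() on an extractor that is
+        # currently in train mode swaps self.graph to eval_graph, a later
+        # extractor.train() swaps it back, and repeated calls with the same mode
+        # leave the graph (and hence the recompiled code) untouched.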
+ if mode and not self.training: + self.graph = self.train_graph + elif not mode and self.training: + self.graph = self.eval_graph + return super().train(mode=mode) + + +def create_feature_extractor( + model: nn.Module, + return_nodes: Optional[Union[List[str], Dict[str, str]]] = None, + train_return_nodes: Optional[Union[List[str], Dict[str, str]]] = None, + eval_return_nodes: Optional[Union[List[str], Dict[str, str]]] = None, + tracer_kwargs: Optional[Dict[str, Any]] = None, + suppress_diff_warning: bool = False, +) -> fx.GraphModule: + """ + Creates a new graph module that returns intermediate nodes from a given + model as dictionary with user specified keys as strings, and the requested + outputs as values. This is achieved by re-writing the computation graph of + the model via FX to return the desired nodes as outputs. All unused nodes + are removed, together with their corresponding parameters. + + Desired output nodes must be specified as a ``.`` separated + path walking the module hierarchy from top level module down to leaf + operation or leaf module. For more details on the node naming conventions + used here, please see the :ref:`relevant subheading ` + in the `documentation `_. + + Not all models will be FX traceable, although with some massaging they can + be made to cooperate. Here's a (not exhaustive) list of tips: + + - If you don't need to trace through a particular, problematic + sub-module, turn it into a "leaf module" by passing a list of + ``leaf_modules`` as one of the ``tracer_kwargs`` (see example below). + It will not be traced through, but rather, the resulting graph will + hold a reference to that module's forward method. + - Likewise, you may turn functions into leaf functions by passing a + list of ``autowrap_functions`` as one of the ``tracer_kwargs`` (see + example below). + - Some inbuilt Python functions can be problematic. For instance, + ``int`` will raise an error during tracing. You may wrap them in your + own function and then pass that in ``autowrap_functions`` as one of + the ``tracer_kwargs``. + + For further information on FX see the + `torch.fx documentation `_. + + Args: + model (nn.Module): model on which we will extract the features + return_nodes (list or dict, optional): either a ``List`` or a ``Dict`` + containing the names (or partial names - see note above) + of the nodes for which the activations will be returned. If it is + a ``Dict``, the keys are the node names, and the values + are the user-specified keys for the graph module's returned + dictionary. If it is a ``List``, it is treated as a ``Dict`` mapping + node specification strings directly to output names. In the case + that ``train_return_nodes`` and ``eval_return_nodes`` are specified, + this should not be specified. + train_return_nodes (list or dict, optional): similar to + ``return_nodes``. This can be used if the return nodes + for train mode are different than those from eval mode. + If this is specified, ``eval_return_nodes`` must also be specified, + and ``return_nodes`` should not be specified. + eval_return_nodes (list or dict, optional): similar to + ``return_nodes``. This can be used if the return nodes + for train mode are different than those from eval mode. + If this is specified, ``train_return_nodes`` must also be specified, + and `return_nodes` should not be specified. + tracer_kwargs (dict, optional): a dictionary of keywork arguments for + ``NodePathTracer`` (which passes them onto it's parent class + `torch.fx.Tracer `_). 
+ By default it will be set to wrap and make leaf nodes all torchvision ops. + suppress_diff_warning (bool, optional): whether to suppress a warning + when there are discrepancies between the train and eval version of + the graph. Defaults to False. + + Examples:: + + >>> # Feature extraction with resnet + >>> model = torchvision.models.resnet18() + >>> # extract layer1 and layer3, giving as names `feat1` and feat2` + >>> model = create_feature_extractor( + >>> model, {'layer1': 'feat1', 'layer3': 'feat2'}) + >>> out = model(torch.rand(1, 3, 224, 224)) + >>> print([(k, v.shape) for k, v in out.items()]) + >>> [('feat1', torch.Size([1, 64, 56, 56])), + >>> ('feat2', torch.Size([1, 256, 14, 14]))] + + >>> # Specifying leaf modules and leaf functions + >>> def leaf_function(x): + >>> # This would raise a TypeError if traced through + >>> return int(x) + >>> + >>> class LeafModule(torch.nn.Module): + >>> def forward(self, x): + >>> # This would raise a TypeError if traced through + >>> int(x.shape[0]) + >>> return torch.nn.functional.relu(x + 4) + >>> + >>> class MyModule(torch.nn.Module): + >>> def __init__(self): + >>> super().__init__() + >>> self.conv = torch.nn.Conv2d(3, 1, 3) + >>> self.leaf_module = LeafModule() + >>> + >>> def forward(self, x): + >>> leaf_function(x.shape[0]) + >>> x = self.conv(x) + >>> return self.leaf_module(x) + >>> + >>> model = create_feature_extractor( + >>> MyModule(), return_nodes=['leaf_module'], + >>> tracer_kwargs={'leaf_modules': [LeafModule], + >>> 'autowrap_functions': [leaf_function]}) + + """ + if tracer_kwargs is None: + tracer_kwargs = { + "autowrap_modules": ( + math, + torchvision.ops, + ), + "leaf_modules": _get_leaf_modules_for_ops(), + } + is_training = model.training + + assert any( + arg is not None for arg in [return_nodes, train_return_nodes, eval_return_nodes] + ), "Either `return_nodes` or `train_return_nodes` and `eval_return_nodes` together, should be specified" + + assert not ( + (train_return_nodes is None) ^ (eval_return_nodes is None) + ), "If any of `train_return_nodes` and `eval_return_nodes` are specified, then both should be specified" + + assert (return_nodes is None) ^ ( + train_return_nodes is None + ), "If `train_return_nodes` and `eval_return_nodes` are specified, then both should be specified" + + # Put *_return_nodes into Dict[str, str] format + def to_strdict(n) -> Dict[str, str]: + if isinstance(n, list): + return {str(i): str(i) for i in n} + return {str(k): str(v) for k, v in n.items()} + + if train_return_nodes is None: + return_nodes = to_strdict(return_nodes) + train_return_nodes = deepcopy(return_nodes) + eval_return_nodes = deepcopy(return_nodes) + else: + train_return_nodes = to_strdict(train_return_nodes) + eval_return_nodes = to_strdict(eval_return_nodes) + + # Repeat the tracing and graph rewriting for train and eval mode + tracers = {} + graphs = {} + mode_return_nodes: Dict[str, Dict[str, str]] = {"train": train_return_nodes, "eval": eval_return_nodes} + for mode in ["train", "eval"]: + if mode == "train": + model.train() + elif mode == "eval": + model.eval() + + # Instantiate our NodePathTracer and use that to trace the model + tracer = NodePathTracer(**tracer_kwargs) + graph = tracer.trace(model) + + name = model.__class__.__name__ if isinstance(model, nn.Module) else model.__name__ + graph_module = fx.GraphModule(tracer.root, graph, name) + + available_nodes = list(tracer.node_to_qualname.values()) + # FIXME We don't know if we should expect this to happen + assert len(set(available_nodes)) == len( + 
available_nodes + ), "There are duplicate nodes! Please raise an issue https://github.com/pytorch/vision/issues" + # Check that all outputs in return_nodes are present in the model + for query in mode_return_nodes[mode].keys(): + # To check if a query is available we need to check that at least + # one of the available names starts with it up to a . + if not any([re.match(rf"^{query}(\.|$)", n) is not None for n in available_nodes]): + raise ValueError( + f"node: '{query}' is not present in model. Hint: use " + "`get_graph_node_names` to make sure the " + "`return_nodes` you specified are present. It may even " + "be that you need to specify `train_return_nodes` and " + "`eval_return_nodes` separately." + ) + + # Remove existing output nodes (train mode) + orig_output_nodes = [] + for n in reversed(graph_module.graph.nodes): + if n.op == "output": + orig_output_nodes.append(n) + assert len(orig_output_nodes) + for n in orig_output_nodes: + graph_module.graph.erase_node(n) + + # Find nodes corresponding to return_nodes and make them into output_nodes + nodes = [n for n in graph_module.graph.nodes] + output_nodes = OrderedDict() + for n in reversed(nodes): + module_qualname = tracer.node_to_qualname.get(n) + if module_qualname is None: + # NOTE - Know cases where this happens: + # - Node representing creation of a tensor constant - probably + # not interesting as a return node + # - When packing outputs into a named tuple like in InceptionV3 + continue + for query in mode_return_nodes[mode]: + depth = query.count(".") + if ".".join(module_qualname.split(".")[: depth + 1]) == query: + output_nodes[mode_return_nodes[mode][query]] = n + mode_return_nodes[mode].pop(query) + break + output_nodes = OrderedDict(reversed(list(output_nodes.items()))) + + # And add them in the end of the graph + with graph_module.graph.inserting_after(nodes[-1]): + graph_module.graph.output(output_nodes) + + # Remove unused modules / parameters + graph_module.graph.eliminate_dead_code() + graph_module.recompile() + + # Keep track of the tracer and graph so we can choose the main one + tracers[mode] = tracer + graphs[mode] = graph + + # Warn user if there are any discrepancies between the graphs of the + # train and eval modes + if not suppress_diff_warning: + _warn_graph_differences(tracers["train"], tracers["eval"]) + + # Build the final graph module + graph_module = DualGraphModule(model, graphs["train"], graphs["eval"], class_name=name) + + # Restore original training mode + model.train(is_training) + graph_module.train(is_training) + + return graph_module diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/googlenet.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/googlenet.py new file mode 100644 index 0000000000000000000000000000000000000000..9e4c3498aab83e97ec9b1b96c8d5be85bfc4b7a7 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/googlenet.py @@ -0,0 +1,310 @@ +import warnings +from collections import namedtuple +from typing import Optional, Tuple, List, Callable, Any + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from .._internally_replaced_utils import load_state_dict_from_url +from ..utils import _log_api_usage_once + +__all__ = ["GoogLeNet", "googlenet", "GoogLeNetOutputs", "_GoogLeNetOutputs"] + +model_urls = { + # GoogLeNet ported from TensorFlow + "googlenet": "https://download.pytorch.org/models/googlenet-1378be20.pth", +} + +GoogLeNetOutputs = namedtuple("GoogLeNetOutputs", 
["logits", "aux_logits2", "aux_logits1"]) +GoogLeNetOutputs.__annotations__ = {"logits": Tensor, "aux_logits2": Optional[Tensor], "aux_logits1": Optional[Tensor]} + +# Script annotations failed with _GoogleNetOutputs = namedtuple ... +# _GoogLeNetOutputs set here for backwards compat +_GoogLeNetOutputs = GoogLeNetOutputs + + +class GoogLeNet(nn.Module): + __constants__ = ["aux_logits", "transform_input"] + + def __init__( + self, + num_classes: int = 1000, + aux_logits: bool = True, + transform_input: bool = False, + init_weights: Optional[bool] = None, + blocks: Optional[List[Callable[..., nn.Module]]] = None, + dropout: float = 0.2, + dropout_aux: float = 0.7, + ) -> None: + super().__init__() + _log_api_usage_once(self) + if blocks is None: + blocks = [BasicConv2d, Inception, InceptionAux] + if init_weights is None: + warnings.warn( + "The default weight initialization of GoogleNet will be changed in future releases of " + "torchvision. If you wish to keep the old behavior (which leads to long initialization times" + " due to scipy/scipy#11299), please set init_weights=True.", + FutureWarning, + ) + init_weights = True + assert len(blocks) == 3 + conv_block = blocks[0] + inception_block = blocks[1] + inception_aux_block = blocks[2] + + self.aux_logits = aux_logits + self.transform_input = transform_input + + self.conv1 = conv_block(3, 64, kernel_size=7, stride=2, padding=3) + self.maxpool1 = nn.MaxPool2d(3, stride=2, ceil_mode=True) + self.conv2 = conv_block(64, 64, kernel_size=1) + self.conv3 = conv_block(64, 192, kernel_size=3, padding=1) + self.maxpool2 = nn.MaxPool2d(3, stride=2, ceil_mode=True) + + self.inception3a = inception_block(192, 64, 96, 128, 16, 32, 32) + self.inception3b = inception_block(256, 128, 128, 192, 32, 96, 64) + self.maxpool3 = nn.MaxPool2d(3, stride=2, ceil_mode=True) + + self.inception4a = inception_block(480, 192, 96, 208, 16, 48, 64) + self.inception4b = inception_block(512, 160, 112, 224, 24, 64, 64) + self.inception4c = inception_block(512, 128, 128, 256, 24, 64, 64) + self.inception4d = inception_block(512, 112, 144, 288, 32, 64, 64) + self.inception4e = inception_block(528, 256, 160, 320, 32, 128, 128) + self.maxpool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.inception5a = inception_block(832, 256, 160, 320, 32, 128, 128) + self.inception5b = inception_block(832, 384, 192, 384, 48, 128, 128) + + if aux_logits: + self.aux1 = inception_aux_block(512, num_classes, dropout=dropout_aux) + self.aux2 = inception_aux_block(528, num_classes, dropout=dropout_aux) + else: + self.aux1 = None # type: ignore[assignment] + self.aux2 = None # type: ignore[assignment] + + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.dropout = nn.Dropout(p=dropout) + self.fc = nn.Linear(1024, num_classes) + + if init_weights: + for m in self.modules(): + if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear): + torch.nn.init.trunc_normal_(m.weight, mean=0.0, std=0.01, a=-2, b=2) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def _transform_input(self, x: Tensor) -> Tensor: + if self.transform_input: + x_ch0 = torch.unsqueeze(x[:, 0], 1) * (0.229 / 0.5) + (0.485 - 0.5) / 0.5 + x_ch1 = torch.unsqueeze(x[:, 1], 1) * (0.224 / 0.5) + (0.456 - 0.5) / 0.5 + x_ch2 = torch.unsqueeze(x[:, 2], 1) * (0.225 / 0.5) + (0.406 - 0.5) / 0.5 + x = torch.cat((x_ch0, x_ch1, x_ch2), 1) + return x + + def _forward(self, x: Tensor) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]: + # N x 3 x 224 x 224 + x = self.conv1(x) + # N x 64 
x 112 x 112 + x = self.maxpool1(x) + # N x 64 x 56 x 56 + x = self.conv2(x) + # N x 64 x 56 x 56 + x = self.conv3(x) + # N x 192 x 56 x 56 + x = self.maxpool2(x) + + # N x 192 x 28 x 28 + x = self.inception3a(x) + # N x 256 x 28 x 28 + x = self.inception3b(x) + # N x 480 x 28 x 28 + x = self.maxpool3(x) + # N x 480 x 14 x 14 + x = self.inception4a(x) + # N x 512 x 14 x 14 + aux1: Optional[Tensor] = None + if self.aux1 is not None: + if self.training: + aux1 = self.aux1(x) + + x = self.inception4b(x) + # N x 512 x 14 x 14 + x = self.inception4c(x) + # N x 512 x 14 x 14 + x = self.inception4d(x) + # N x 528 x 14 x 14 + aux2: Optional[Tensor] = None + if self.aux2 is not None: + if self.training: + aux2 = self.aux2(x) + + x = self.inception4e(x) + # N x 832 x 14 x 14 + x = self.maxpool4(x) + # N x 832 x 7 x 7 + x = self.inception5a(x) + # N x 832 x 7 x 7 + x = self.inception5b(x) + # N x 1024 x 7 x 7 + + x = self.avgpool(x) + # N x 1024 x 1 x 1 + x = torch.flatten(x, 1) + # N x 1024 + x = self.dropout(x) + x = self.fc(x) + # N x 1000 (num_classes) + return x, aux2, aux1 + + @torch.jit.unused + def eager_outputs(self, x: Tensor, aux2: Tensor, aux1: Optional[Tensor]) -> GoogLeNetOutputs: + if self.training and self.aux_logits: + return _GoogLeNetOutputs(x, aux2, aux1) + else: + return x # type: ignore[return-value] + + def forward(self, x: Tensor) -> GoogLeNetOutputs: + x = self._transform_input(x) + x, aux1, aux2 = self._forward(x) + aux_defined = self.training and self.aux_logits + if torch.jit.is_scripting(): + if not aux_defined: + warnings.warn("Scripted GoogleNet always returns GoogleNetOutputs Tuple") + return GoogLeNetOutputs(x, aux2, aux1) + else: + return self.eager_outputs(x, aux2, aux1) + + +class Inception(nn.Module): + def __init__( + self, + in_channels: int, + ch1x1: int, + ch3x3red: int, + ch3x3: int, + ch5x5red: int, + ch5x5: int, + pool_proj: int, + conv_block: Optional[Callable[..., nn.Module]] = None, + ) -> None: + super().__init__() + if conv_block is None: + conv_block = BasicConv2d + self.branch1 = conv_block(in_channels, ch1x1, kernel_size=1) + + self.branch2 = nn.Sequential( + conv_block(in_channels, ch3x3red, kernel_size=1), conv_block(ch3x3red, ch3x3, kernel_size=3, padding=1) + ) + + self.branch3 = nn.Sequential( + conv_block(in_channels, ch5x5red, kernel_size=1), + # Here, kernel_size=3 instead of kernel_size=5 is a known bug. + # Please see https://github.com/pytorch/vision/issues/906 for details. 
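+            # The 3x3 kernel is kept for compatibility with the released
+            # pretrained weights: switching to 5x5 would change the weight
+            # shapes and break loading of the existing checkpoint.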
+ conv_block(ch5x5red, ch5x5, kernel_size=3, padding=1), + ) + + self.branch4 = nn.Sequential( + nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=True), + conv_block(in_channels, pool_proj, kernel_size=1), + ) + + def _forward(self, x: Tensor) -> List[Tensor]: + branch1 = self.branch1(x) + branch2 = self.branch2(x) + branch3 = self.branch3(x) + branch4 = self.branch4(x) + + outputs = [branch1, branch2, branch3, branch4] + return outputs + + def forward(self, x: Tensor) -> Tensor: + outputs = self._forward(x) + return torch.cat(outputs, 1) + + +class InceptionAux(nn.Module): + def __init__( + self, + in_channels: int, + num_classes: int, + conv_block: Optional[Callable[..., nn.Module]] = None, + dropout: float = 0.7, + ) -> None: + super().__init__() + if conv_block is None: + conv_block = BasicConv2d + self.conv = conv_block(in_channels, 128, kernel_size=1) + + self.fc1 = nn.Linear(2048, 1024) + self.fc2 = nn.Linear(1024, num_classes) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x: Tensor) -> Tensor: + # aux1: N x 512 x 14 x 14, aux2: N x 528 x 14 x 14 + x = F.adaptive_avg_pool2d(x, (4, 4)) + # aux1: N x 512 x 4 x 4, aux2: N x 528 x 4 x 4 + x = self.conv(x) + # N x 128 x 4 x 4 + x = torch.flatten(x, 1) + # N x 2048 + x = F.relu(self.fc1(x), inplace=True) + # N x 1024 + x = self.dropout(x) + # N x 1024 + x = self.fc2(x) + # N x 1000 (num_classes) + + return x + + +class BasicConv2d(nn.Module): + def __init__(self, in_channels: int, out_channels: int, **kwargs: Any) -> None: + super().__init__() + self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs) + self.bn = nn.BatchNorm2d(out_channels, eps=0.001) + + def forward(self, x: Tensor) -> Tensor: + x = self.conv(x) + x = self.bn(x) + return F.relu(x, inplace=True) + + +def googlenet(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> GoogLeNet: + r"""GoogLeNet (Inception v1) model architecture from + `"Going Deeper with Convolutions" `_. + The required minimum input size of the model is 15x15. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + aux_logits (bool): If True, adds two auxiliary branches that can improve training. + Default: *False* when pretrained is True otherwise *True* + transform_input (bool): If True, preprocesses the input according to the method with which it + was trained on ImageNet. Default: True if ``pretrained=True``, else False. 
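+
+    A minimal usage sketch with randomly initialized weights (in ``eval()``
+    mode the model returns plain logits rather than ``GoogLeNetOutputs``)::
+
+        >>> model = googlenet(aux_logits=True).eval()
+        >>> logits = model(torch.rand(1, 3, 224, 224))
+        >>> logits.shape
+        torch.Size([1, 1000])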
+ """ + if pretrained: + if "transform_input" not in kwargs: + kwargs["transform_input"] = True + if "aux_logits" not in kwargs: + kwargs["aux_logits"] = False + if kwargs["aux_logits"]: + warnings.warn( + "auxiliary heads in the pretrained googlenet model are NOT pretrained, so make sure to train them" + ) + original_aux_logits = kwargs["aux_logits"] + kwargs["aux_logits"] = True + kwargs["init_weights"] = False + model = GoogLeNet(**kwargs) + state_dict = load_state_dict_from_url(model_urls["googlenet"], progress=progress) + model.load_state_dict(state_dict) + if not original_aux_logits: + model.aux_logits = False + model.aux1 = None # type: ignore[assignment] + model.aux2 = None # type: ignore[assignment] + return model + + return GoogLeNet(**kwargs) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/inception.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/inception.py new file mode 100644 index 0000000000000000000000000000000000000000..c489925cb458d9700afbb377bc2c449d6e6d0066 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/inception.py @@ -0,0 +1,444 @@ +import warnings +from collections import namedtuple +from typing import Callable, Any, Optional, Tuple, List + +import torch +import torch.nn.functional as F +from torch import nn, Tensor + +from .._internally_replaced_utils import load_state_dict_from_url +from ..utils import _log_api_usage_once + + +__all__ = ["Inception3", "inception_v3", "InceptionOutputs", "_InceptionOutputs"] + + +model_urls = { + # Inception v3 ported from TensorFlow + "inception_v3_google": "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth", +} + +InceptionOutputs = namedtuple("InceptionOutputs", ["logits", "aux_logits"]) +InceptionOutputs.__annotations__ = {"logits": Tensor, "aux_logits": Optional[Tensor]} + +# Script annotations failed with _GoogleNetOutputs = namedtuple ... +# _InceptionOutputs set here for backwards compat +_InceptionOutputs = InceptionOutputs + + +class Inception3(nn.Module): + def __init__( + self, + num_classes: int = 1000, + aux_logits: bool = True, + transform_input: bool = False, + inception_blocks: Optional[List[Callable[..., nn.Module]]] = None, + init_weights: Optional[bool] = None, + dropout: float = 0.5, + ) -> None: + super().__init__() + _log_api_usage_once(self) + if inception_blocks is None: + inception_blocks = [BasicConv2d, InceptionA, InceptionB, InceptionC, InceptionD, InceptionE, InceptionAux] + if init_weights is None: + warnings.warn( + "The default weight initialization of inception_v3 will be changed in future releases of " + "torchvision. 
If you wish to keep the old behavior (which leads to long initialization times" + " due to scipy/scipy#11299), please set init_weights=True.", + FutureWarning, + ) + init_weights = True + assert len(inception_blocks) == 7 + conv_block = inception_blocks[0] + inception_a = inception_blocks[1] + inception_b = inception_blocks[2] + inception_c = inception_blocks[3] + inception_d = inception_blocks[4] + inception_e = inception_blocks[5] + inception_aux = inception_blocks[6] + + self.aux_logits = aux_logits + self.transform_input = transform_input + self.Conv2d_1a_3x3 = conv_block(3, 32, kernel_size=3, stride=2) + self.Conv2d_2a_3x3 = conv_block(32, 32, kernel_size=3) + self.Conv2d_2b_3x3 = conv_block(32, 64, kernel_size=3, padding=1) + self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2) + self.Conv2d_3b_1x1 = conv_block(64, 80, kernel_size=1) + self.Conv2d_4a_3x3 = conv_block(80, 192, kernel_size=3) + self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=2) + self.Mixed_5b = inception_a(192, pool_features=32) + self.Mixed_5c = inception_a(256, pool_features=64) + self.Mixed_5d = inception_a(288, pool_features=64) + self.Mixed_6a = inception_b(288) + self.Mixed_6b = inception_c(768, channels_7x7=128) + self.Mixed_6c = inception_c(768, channels_7x7=160) + self.Mixed_6d = inception_c(768, channels_7x7=160) + self.Mixed_6e = inception_c(768, channels_7x7=192) + self.AuxLogits: Optional[nn.Module] = None + if aux_logits: + self.AuxLogits = inception_aux(768, num_classes) + self.Mixed_7a = inception_d(768) + self.Mixed_7b = inception_e(1280) + self.Mixed_7c = inception_e(2048) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.dropout = nn.Dropout(p=dropout) + self.fc = nn.Linear(2048, num_classes) + if init_weights: + for m in self.modules(): + if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear): + stddev = float(m.stddev) if hasattr(m, "stddev") else 0.1 # type: ignore + torch.nn.init.trunc_normal_(m.weight, mean=0.0, std=stddev, a=-2, b=2) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def _transform_input(self, x: Tensor) -> Tensor: + if self.transform_input: + x_ch0 = torch.unsqueeze(x[:, 0], 1) * (0.229 / 0.5) + (0.485 - 0.5) / 0.5 + x_ch1 = torch.unsqueeze(x[:, 1], 1) * (0.224 / 0.5) + (0.456 - 0.5) / 0.5 + x_ch2 = torch.unsqueeze(x[:, 2], 1) * (0.225 / 0.5) + (0.406 - 0.5) / 0.5 + x = torch.cat((x_ch0, x_ch1, x_ch2), 1) + return x + + def _forward(self, x: Tensor) -> Tuple[Tensor, Optional[Tensor]]: + # N x 3 x 299 x 299 + x = self.Conv2d_1a_3x3(x) + # N x 32 x 149 x 149 + x = self.Conv2d_2a_3x3(x) + # N x 32 x 147 x 147 + x = self.Conv2d_2b_3x3(x) + # N x 64 x 147 x 147 + x = self.maxpool1(x) + # N x 64 x 73 x 73 + x = self.Conv2d_3b_1x1(x) + # N x 80 x 73 x 73 + x = self.Conv2d_4a_3x3(x) + # N x 192 x 71 x 71 + x = self.maxpool2(x) + # N x 192 x 35 x 35 + x = self.Mixed_5b(x) + # N x 256 x 35 x 35 + x = self.Mixed_5c(x) + # N x 288 x 35 x 35 + x = self.Mixed_5d(x) + # N x 288 x 35 x 35 + x = self.Mixed_6a(x) + # N x 768 x 17 x 17 + x = self.Mixed_6b(x) + # N x 768 x 17 x 17 + x = self.Mixed_6c(x) + # N x 768 x 17 x 17 + x = self.Mixed_6d(x) + # N x 768 x 17 x 17 + x = self.Mixed_6e(x) + # N x 768 x 17 x 17 + aux: Optional[Tensor] = None + if self.AuxLogits is not None: + if self.training: + aux = self.AuxLogits(x) + # N x 768 x 17 x 17 + x = self.Mixed_7a(x) + # N x 1280 x 8 x 8 + x = self.Mixed_7b(x) + # N x 2048 x 8 x 8 + x = self.Mixed_7c(x) + # N x 2048 x 8 x 8 + # Adaptive average pooling + x = self.avgpool(x) + # N x 
2048 x 1 x 1 + x = self.dropout(x) + # N x 2048 x 1 x 1 + x = torch.flatten(x, 1) + # N x 2048 + x = self.fc(x) + # N x 1000 (num_classes) + return x, aux + + @torch.jit.unused + def eager_outputs(self, x: Tensor, aux: Optional[Tensor]) -> InceptionOutputs: + if self.training and self.aux_logits: + return InceptionOutputs(x, aux) + else: + return x # type: ignore[return-value] + + def forward(self, x: Tensor) -> InceptionOutputs: + x = self._transform_input(x) + x, aux = self._forward(x) + aux_defined = self.training and self.aux_logits + if torch.jit.is_scripting(): + if not aux_defined: + warnings.warn("Scripted Inception3 always returns Inception3 Tuple") + return InceptionOutputs(x, aux) + else: + return self.eager_outputs(x, aux) + + +class InceptionA(nn.Module): + def __init__( + self, in_channels: int, pool_features: int, conv_block: Optional[Callable[..., nn.Module]] = None + ) -> None: + super().__init__() + if conv_block is None: + conv_block = BasicConv2d + self.branch1x1 = conv_block(in_channels, 64, kernel_size=1) + + self.branch5x5_1 = conv_block(in_channels, 48, kernel_size=1) + self.branch5x5_2 = conv_block(48, 64, kernel_size=5, padding=2) + + self.branch3x3dbl_1 = conv_block(in_channels, 64, kernel_size=1) + self.branch3x3dbl_2 = conv_block(64, 96, kernel_size=3, padding=1) + self.branch3x3dbl_3 = conv_block(96, 96, kernel_size=3, padding=1) + + self.branch_pool = conv_block(in_channels, pool_features, kernel_size=1) + + def _forward(self, x: Tensor) -> List[Tensor]: + branch1x1 = self.branch1x1(x) + + branch5x5 = self.branch5x5_1(x) + branch5x5 = self.branch5x5_2(branch5x5) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool] + return outputs + + def forward(self, x: Tensor) -> Tensor: + outputs = self._forward(x) + return torch.cat(outputs, 1) + + +class InceptionB(nn.Module): + def __init__(self, in_channels: int, conv_block: Optional[Callable[..., nn.Module]] = None) -> None: + super().__init__() + if conv_block is None: + conv_block = BasicConv2d + self.branch3x3 = conv_block(in_channels, 384, kernel_size=3, stride=2) + + self.branch3x3dbl_1 = conv_block(in_channels, 64, kernel_size=1) + self.branch3x3dbl_2 = conv_block(64, 96, kernel_size=3, padding=1) + self.branch3x3dbl_3 = conv_block(96, 96, kernel_size=3, stride=2) + + def _forward(self, x: Tensor) -> List[Tensor]: + branch3x3 = self.branch3x3(x) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = F.max_pool2d(x, kernel_size=3, stride=2) + + outputs = [branch3x3, branch3x3dbl, branch_pool] + return outputs + + def forward(self, x: Tensor) -> Tensor: + outputs = self._forward(x) + return torch.cat(outputs, 1) + + +class InceptionC(nn.Module): + def __init__( + self, in_channels: int, channels_7x7: int, conv_block: Optional[Callable[..., nn.Module]] = None + ) -> None: + super().__init__() + if conv_block is None: + conv_block = BasicConv2d + self.branch1x1 = conv_block(in_channels, 192, kernel_size=1) + + c7 = channels_7x7 + self.branch7x7_1 = conv_block(in_channels, c7, kernel_size=1) + self.branch7x7_2 = conv_block(c7, c7, kernel_size=(1, 7), padding=(0, 3)) + self.branch7x7_3 = conv_block(c7, 192, kernel_size=(7, 1), padding=(3, 0)) + + 
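+        # Each 7x7 convolution is factorized into a 1x7 followed by a 7x1
+        # convolution (asymmetric factorization from the Inception v3 paper),
+        # preserving the receptive field at a lower parameter cost.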
self.branch7x7dbl_1 = conv_block(in_channels, c7, kernel_size=1) + self.branch7x7dbl_2 = conv_block(c7, c7, kernel_size=(7, 1), padding=(3, 0)) + self.branch7x7dbl_3 = conv_block(c7, c7, kernel_size=(1, 7), padding=(0, 3)) + self.branch7x7dbl_4 = conv_block(c7, c7, kernel_size=(7, 1), padding=(3, 0)) + self.branch7x7dbl_5 = conv_block(c7, 192, kernel_size=(1, 7), padding=(0, 3)) + + self.branch_pool = conv_block(in_channels, 192, kernel_size=1) + + def _forward(self, x: Tensor) -> List[Tensor]: + branch1x1 = self.branch1x1(x) + + branch7x7 = self.branch7x7_1(x) + branch7x7 = self.branch7x7_2(branch7x7) + branch7x7 = self.branch7x7_3(branch7x7) + + branch7x7dbl = self.branch7x7dbl_1(x) + branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) + + branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool] + return outputs + + def forward(self, x: Tensor) -> Tensor: + outputs = self._forward(x) + return torch.cat(outputs, 1) + + +class InceptionD(nn.Module): + def __init__(self, in_channels: int, conv_block: Optional[Callable[..., nn.Module]] = None) -> None: + super().__init__() + if conv_block is None: + conv_block = BasicConv2d + self.branch3x3_1 = conv_block(in_channels, 192, kernel_size=1) + self.branch3x3_2 = conv_block(192, 320, kernel_size=3, stride=2) + + self.branch7x7x3_1 = conv_block(in_channels, 192, kernel_size=1) + self.branch7x7x3_2 = conv_block(192, 192, kernel_size=(1, 7), padding=(0, 3)) + self.branch7x7x3_3 = conv_block(192, 192, kernel_size=(7, 1), padding=(3, 0)) + self.branch7x7x3_4 = conv_block(192, 192, kernel_size=3, stride=2) + + def _forward(self, x: Tensor) -> List[Tensor]: + branch3x3 = self.branch3x3_1(x) + branch3x3 = self.branch3x3_2(branch3x3) + + branch7x7x3 = self.branch7x7x3_1(x) + branch7x7x3 = self.branch7x7x3_2(branch7x7x3) + branch7x7x3 = self.branch7x7x3_3(branch7x7x3) + branch7x7x3 = self.branch7x7x3_4(branch7x7x3) + + branch_pool = F.max_pool2d(x, kernel_size=3, stride=2) + outputs = [branch3x3, branch7x7x3, branch_pool] + return outputs + + def forward(self, x: Tensor) -> Tensor: + outputs = self._forward(x) + return torch.cat(outputs, 1) + + +class InceptionE(nn.Module): + def __init__(self, in_channels: int, conv_block: Optional[Callable[..., nn.Module]] = None) -> None: + super().__init__() + if conv_block is None: + conv_block = BasicConv2d + self.branch1x1 = conv_block(in_channels, 320, kernel_size=1) + + self.branch3x3_1 = conv_block(in_channels, 384, kernel_size=1) + self.branch3x3_2a = conv_block(384, 384, kernel_size=(1, 3), padding=(0, 1)) + self.branch3x3_2b = conv_block(384, 384, kernel_size=(3, 1), padding=(1, 0)) + + self.branch3x3dbl_1 = conv_block(in_channels, 448, kernel_size=1) + self.branch3x3dbl_2 = conv_block(448, 384, kernel_size=3, padding=1) + self.branch3x3dbl_3a = conv_block(384, 384, kernel_size=(1, 3), padding=(0, 1)) + self.branch3x3dbl_3b = conv_block(384, 384, kernel_size=(3, 1), padding=(1, 0)) + + self.branch_pool = conv_block(in_channels, 192, kernel_size=1) + + def _forward(self, x: Tensor) -> List[Tensor]: + branch1x1 = self.branch1x1(x) + + branch3x3 = self.branch3x3_1(x) + branch3x3 = [ + self.branch3x3_2a(branch3x3), + self.branch3x3_2b(branch3x3), + ] + branch3x3 = torch.cat(branch3x3, 1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = 
self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = [ + self.branch3x3dbl_3a(branch3x3dbl), + self.branch3x3dbl_3b(branch3x3dbl), + ] + branch3x3dbl = torch.cat(branch3x3dbl, 1) + + branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool] + return outputs + + def forward(self, x: Tensor) -> Tensor: + outputs = self._forward(x) + return torch.cat(outputs, 1) + + +class InceptionAux(nn.Module): + def __init__( + self, in_channels: int, num_classes: int, conv_block: Optional[Callable[..., nn.Module]] = None + ) -> None: + super().__init__() + if conv_block is None: + conv_block = BasicConv2d + self.conv0 = conv_block(in_channels, 128, kernel_size=1) + self.conv1 = conv_block(128, 768, kernel_size=5) + self.conv1.stddev = 0.01 # type: ignore[assignment] + self.fc = nn.Linear(768, num_classes) + self.fc.stddev = 0.001 # type: ignore[assignment] + + def forward(self, x: Tensor) -> Tensor: + # N x 768 x 17 x 17 + x = F.avg_pool2d(x, kernel_size=5, stride=3) + # N x 768 x 5 x 5 + x = self.conv0(x) + # N x 128 x 5 x 5 + x = self.conv1(x) + # N x 768 x 1 x 1 + # Adaptive average pooling + x = F.adaptive_avg_pool2d(x, (1, 1)) + # N x 768 x 1 x 1 + x = torch.flatten(x, 1) + # N x 768 + x = self.fc(x) + # N x 1000 + return x + + +class BasicConv2d(nn.Module): + def __init__(self, in_channels: int, out_channels: int, **kwargs: Any) -> None: + super().__init__() + self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs) + self.bn = nn.BatchNorm2d(out_channels, eps=0.001) + + def forward(self, x: Tensor) -> Tensor: + x = self.conv(x) + x = self.bn(x) + return F.relu(x, inplace=True) + + +def inception_v3(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> Inception3: + r"""Inception v3 model architecture from + `"Rethinking the Inception Architecture for Computer Vision" `_. + The required minimum input size of the model is 75x75. + + .. note:: + **Important**: In contrast to the other models the inception_v3 expects tensors with a size of + N x 3 x 299 x 299, so ensure your images are sized accordingly. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + aux_logits (bool): If True, add an auxiliary branch that can improve training. + Default: *True* + transform_input (bool): If True, preprocesses the input according to the method with which it + was trained on ImageNet. Default: True if ``pretrained=True``, else False. 
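+
+    A minimal usage sketch with randomly initialized weights (in ``eval()``
+    mode the model returns plain logits rather than ``InceptionOutputs``)::
+
+        >>> model = inception_v3(aux_logits=True).eval()
+        >>> logits = model(torch.rand(1, 3, 299, 299))
+        >>> logits.shape
+        torch.Size([1, 1000])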
+ """ + if pretrained: + if "transform_input" not in kwargs: + kwargs["transform_input"] = True + if "aux_logits" in kwargs: + original_aux_logits = kwargs["aux_logits"] + kwargs["aux_logits"] = True + else: + original_aux_logits = True + kwargs["init_weights"] = False # we are loading weights from a pretrained model + model = Inception3(**kwargs) + state_dict = load_state_dict_from_url(model_urls["inception_v3_google"], progress=progress) + model.load_state_dict(state_dict) + if not original_aux_logits: + model.aux_logits = False + model.AuxLogits = None + return model + + return Inception3(**kwargs) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mnasnet.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mnasnet.py new file mode 100644 index 0000000000000000000000000000000000000000..c3d4013f30c8ec7725665a09a7db9cdb41ea9988 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mnasnet.py @@ -0,0 +1,263 @@ +import warnings +from typing import Any, Dict, List + +import torch +import torch.nn as nn +from torch import Tensor + +from .._internally_replaced_utils import load_state_dict_from_url +from ..utils import _log_api_usage_once + +__all__ = ["MNASNet", "mnasnet0_5", "mnasnet0_75", "mnasnet1_0", "mnasnet1_3"] + +_MODEL_URLS = { + "mnasnet0_5": "https://download.pytorch.org/models/mnasnet0.5_top1_67.823-3ffadce67e.pth", + "mnasnet0_75": None, + "mnasnet1_0": "https://download.pytorch.org/models/mnasnet1.0_top1_73.512-f206786ef8.pth", + "mnasnet1_3": None, +} + +# Paper suggests 0.9997 momentum, for TensorFlow. Equivalent PyTorch momentum is +# 1.0 - tensorflow. +_BN_MOMENTUM = 1 - 0.9997 + + +class _InvertedResidual(nn.Module): + def __init__( + self, in_ch: int, out_ch: int, kernel_size: int, stride: int, expansion_factor: int, bn_momentum: float = 0.1 + ) -> None: + super().__init__() + assert stride in [1, 2] + assert kernel_size in [3, 5] + mid_ch = in_ch * expansion_factor + self.apply_residual = in_ch == out_ch and stride == 1 + self.layers = nn.Sequential( + # Pointwise + nn.Conv2d(in_ch, mid_ch, 1, bias=False), + nn.BatchNorm2d(mid_ch, momentum=bn_momentum), + nn.ReLU(inplace=True), + # Depthwise + nn.Conv2d(mid_ch, mid_ch, kernel_size, padding=kernel_size // 2, stride=stride, groups=mid_ch, bias=False), + nn.BatchNorm2d(mid_ch, momentum=bn_momentum), + nn.ReLU(inplace=True), + # Linear pointwise. Note that there's no activation. + nn.Conv2d(mid_ch, out_ch, 1, bias=False), + nn.BatchNorm2d(out_ch, momentum=bn_momentum), + ) + + def forward(self, input: Tensor) -> Tensor: + if self.apply_residual: + return self.layers(input) + input + else: + return self.layers(input) + + +def _stack( + in_ch: int, out_ch: int, kernel_size: int, stride: int, exp_factor: int, repeats: int, bn_momentum: float +) -> nn.Sequential: + """Creates a stack of inverted residuals.""" + assert repeats >= 1 + # First one has no skip, because feature map size changes. + first = _InvertedResidual(in_ch, out_ch, kernel_size, stride, exp_factor, bn_momentum=bn_momentum) + remaining = [] + for _ in range(1, repeats): + remaining.append(_InvertedResidual(out_ch, out_ch, kernel_size, 1, exp_factor, bn_momentum=bn_momentum)) + return nn.Sequential(first, *remaining) + + +def _round_to_multiple_of(val: float, divisor: int, round_up_bias: float = 0.9) -> int: + """Asymmetric rounding to make `val` divisible by `divisor`. With default + bias, will round up, unless the number is no more than 10% greater than the + smaller divisible value, i.e. 
(83, 8) -> 80, but (84, 8) -> 88.""" + assert 0.0 < round_up_bias < 1.0 + new_val = max(divisor, int(val + divisor / 2) // divisor * divisor) + return new_val if new_val >= round_up_bias * val else new_val + divisor + + +def _get_depths(alpha: float) -> List[int]: + """Scales tensor depths as in reference MobileNet code, prefers rouding up + rather than down.""" + depths = [32, 16, 24, 40, 80, 96, 192, 320] + return [_round_to_multiple_of(depth * alpha, 8) for depth in depths] + + +class MNASNet(torch.nn.Module): + """MNASNet, as described in https://arxiv.org/pdf/1807.11626.pdf. This + implements the B1 variant of the model. + >>> model = MNASNet(1.0, num_classes=1000) + >>> x = torch.rand(1, 3, 224, 224) + >>> y = model(x) + >>> y.dim() + 2 + >>> y.nelement() + 1000 + """ + + # Version 2 adds depth scaling in the initial stages of the network. + _version = 2 + + def __init__(self, alpha: float, num_classes: int = 1000, dropout: float = 0.2) -> None: + super().__init__() + _log_api_usage_once(self) + assert alpha > 0.0 + self.alpha = alpha + self.num_classes = num_classes + depths = _get_depths(alpha) + layers = [ + # First layer: regular conv. + nn.Conv2d(3, depths[0], 3, padding=1, stride=2, bias=False), + nn.BatchNorm2d(depths[0], momentum=_BN_MOMENTUM), + nn.ReLU(inplace=True), + # Depthwise separable, no skip. + nn.Conv2d(depths[0], depths[0], 3, padding=1, stride=1, groups=depths[0], bias=False), + nn.BatchNorm2d(depths[0], momentum=_BN_MOMENTUM), + nn.ReLU(inplace=True), + nn.Conv2d(depths[0], depths[1], 1, padding=0, stride=1, bias=False), + nn.BatchNorm2d(depths[1], momentum=_BN_MOMENTUM), + # MNASNet blocks: stacks of inverted residuals. + _stack(depths[1], depths[2], 3, 2, 3, 3, _BN_MOMENTUM), + _stack(depths[2], depths[3], 5, 2, 3, 3, _BN_MOMENTUM), + _stack(depths[3], depths[4], 5, 2, 6, 3, _BN_MOMENTUM), + _stack(depths[4], depths[5], 3, 1, 6, 2, _BN_MOMENTUM), + _stack(depths[5], depths[6], 5, 2, 6, 4, _BN_MOMENTUM), + _stack(depths[6], depths[7], 3, 1, 6, 1, _BN_MOMENTUM), + # Final mapping to classifier input. + nn.Conv2d(depths[7], 1280, 1, padding=0, stride=1, bias=False), + nn.BatchNorm2d(1280, momentum=_BN_MOMENTUM), + nn.ReLU(inplace=True), + ] + self.layers = nn.Sequential(*layers) + self.classifier = nn.Sequential(nn.Dropout(p=dropout, inplace=True), nn.Linear(1280, num_classes)) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + nn.init.kaiming_uniform_(m.weight, mode="fan_out", nonlinearity="sigmoid") + nn.init.zeros_(m.bias) + + def forward(self, x: Tensor) -> Tensor: + x = self.layers(x) + # Equivalent to global avgpool and removing H and W dimensions. + x = x.mean([2, 3]) + return self.classifier(x) + + def _load_from_state_dict( + self, + state_dict: Dict, + prefix: str, + local_metadata: Dict, + strict: bool, + missing_keys: List[str], + unexpected_keys: List[str], + error_msgs: List[str], + ) -> None: + version = local_metadata.get("version", None) + assert version in [1, 2] + + if version == 1 and not self.alpha == 1.0: + # In the initial version of the model (v1), stem was fixed-size. + # All other layer configurations were the same. This will patch + # the model so that it's identical to v1. Model with alpha 1.0 is + # unaffected. 
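+            # Rebuild layers[0:9] (the fixed-width v1 stem plus the first
+            # inverted-residual stack) so parameter shapes match a v1
+            # checkpoint before the parent class loads the state_dict below.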
+ depths = _get_depths(self.alpha) + v1_stem = [ + nn.Conv2d(3, 32, 3, padding=1, stride=2, bias=False), + nn.BatchNorm2d(32, momentum=_BN_MOMENTUM), + nn.ReLU(inplace=True), + nn.Conv2d(32, 32, 3, padding=1, stride=1, groups=32, bias=False), + nn.BatchNorm2d(32, momentum=_BN_MOMENTUM), + nn.ReLU(inplace=True), + nn.Conv2d(32, 16, 1, padding=0, stride=1, bias=False), + nn.BatchNorm2d(16, momentum=_BN_MOMENTUM), + _stack(16, depths[2], 3, 2, 3, 3, _BN_MOMENTUM), + ] + for idx, layer in enumerate(v1_stem): + self.layers[idx] = layer + + # The model is now identical to v1, and must be saved as such. + self._version = 1 + warnings.warn( + "A new version of MNASNet model has been implemented. " + "Your checkpoint was saved using the previous version. " + "This checkpoint will load and work as before, but " + "you may want to upgrade by training a newer model or " + "transfer learning from an updated ImageNet checkpoint.", + UserWarning, + ) + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + +def _load_pretrained(model_name: str, model: nn.Module, progress: bool) -> None: + if model_name not in _MODEL_URLS or _MODEL_URLS[model_name] is None: + raise ValueError(f"No checkpoint is available for model type {model_name}") + checkpoint_url = _MODEL_URLS[model_name] + model.load_state_dict(load_state_dict_from_url(checkpoint_url, progress=progress)) + + +def mnasnet0_5(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> MNASNet: + r"""MNASNet with depth multiplier of 0.5 from + `"MnasNet: Platform-Aware Neural Architecture Search for Mobile" + `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + model = MNASNet(0.5, **kwargs) + if pretrained: + _load_pretrained("mnasnet0_5", model, progress) + return model + + +def mnasnet0_75(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> MNASNet: + r"""MNASNet with depth multiplier of 0.75 from + `"MnasNet: Platform-Aware Neural Architecture Search for Mobile" + `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + model = MNASNet(0.75, **kwargs) + if pretrained: + _load_pretrained("mnasnet0_75", model, progress) + return model + + +def mnasnet1_0(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> MNASNet: + r"""MNASNet with depth multiplier of 1.0 from + `"MnasNet: Platform-Aware Neural Architecture Search for Mobile" + `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + model = MNASNet(1.0, **kwargs) + if pretrained: + _load_pretrained("mnasnet1_0", model, progress) + return model + + +def mnasnet1_3(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> MNASNet: + r"""MNASNet with depth multiplier of 1.3 from + `"MnasNet: Platform-Aware Neural Architecture Search for Mobile" + `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + model = MNASNet(1.3, **kwargs) + if pretrained: + _load_pretrained("mnasnet1_3", model, progress) + return model diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mobilenet.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mobilenet.py new file mode 100644 index 0000000000000000000000000000000000000000..4108305d3f5371ee10f02a7c5fb4858e94b35d5d --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mobilenet.py @@ -0,0 +1,4 @@ +from .mobilenetv2 import MobileNetV2, mobilenet_v2, __all__ as mv2_all +from .mobilenetv3 import MobileNetV3, mobilenet_v3_large, mobilenet_v3_small, __all__ as mv3_all + +__all__ = mv2_all + mv3_all diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mobilenetv2.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mobilenetv2.py new file mode 100644 index 0000000000000000000000000000000000000000..e24c5962d7e20a5e1f2bde83fa330c509b881415 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mobilenetv2.py @@ -0,0 +1,211 @@ +import warnings +from typing import Callable, Any, Optional, List + +import torch +from torch import Tensor +from torch import nn + +from .._internally_replaced_utils import load_state_dict_from_url +from ..ops.misc import ConvNormActivation +from ..utils import _log_api_usage_once +from ._utils import _make_divisible + + +__all__ = ["MobileNetV2", "mobilenet_v2"] + + +model_urls = { + "mobilenet_v2": "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth", +} + + +# necessary for backwards compatibility +class _DeprecatedConvBNAct(ConvNormActivation): + def __init__(self, *args, **kwargs): + warnings.warn( + "The ConvBNReLU/ConvBNActivation classes are deprecated since 0.12 and will be removed in 0.14. 
" + "Use torchvision.ops.misc.ConvNormActivation instead.", + FutureWarning, + ) + if kwargs.get("norm_layer", None) is None: + kwargs["norm_layer"] = nn.BatchNorm2d + if kwargs.get("activation_layer", None) is None: + kwargs["activation_layer"] = nn.ReLU6 + super().__init__(*args, **kwargs) + + +ConvBNReLU = _DeprecatedConvBNAct +ConvBNActivation = _DeprecatedConvBNAct + + +class InvertedResidual(nn.Module): + def __init__( + self, inp: int, oup: int, stride: int, expand_ratio: int, norm_layer: Optional[Callable[..., nn.Module]] = None + ) -> None: + super().__init__() + self.stride = stride + assert stride in [1, 2] + + if norm_layer is None: + norm_layer = nn.BatchNorm2d + + hidden_dim = int(round(inp * expand_ratio)) + self.use_res_connect = self.stride == 1 and inp == oup + + layers: List[nn.Module] = [] + if expand_ratio != 1: + # pw + layers.append( + ConvNormActivation(inp, hidden_dim, kernel_size=1, norm_layer=norm_layer, activation_layer=nn.ReLU6) + ) + layers.extend( + [ + # dw + ConvNormActivation( + hidden_dim, + hidden_dim, + stride=stride, + groups=hidden_dim, + norm_layer=norm_layer, + activation_layer=nn.ReLU6, + ), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + norm_layer(oup), + ] + ) + self.conv = nn.Sequential(*layers) + self.out_channels = oup + self._is_cn = stride > 1 + + def forward(self, x: Tensor) -> Tensor: + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) + + +class MobileNetV2(nn.Module): + def __init__( + self, + num_classes: int = 1000, + width_mult: float = 1.0, + inverted_residual_setting: Optional[List[List[int]]] = None, + round_nearest: int = 8, + block: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + dropout: float = 0.2, + ) -> None: + """ + MobileNet V2 main class + + Args: + num_classes (int): Number of classes + width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount + inverted_residual_setting: Network structure + round_nearest (int): Round the number of channels in each layer to be a multiple of this number + Set to 1 to turn off rounding + block: Module specifying inverted residual building block for mobilenet + norm_layer: Module specifying the normalization layer to use + dropout (float): The droupout probability + + """ + super().__init__() + _log_api_usage_once(self) + + if block is None: + block = InvertedResidual + + if norm_layer is None: + norm_layer = nn.BatchNorm2d + + input_channel = 32 + last_channel = 1280 + + if inverted_residual_setting is None: + inverted_residual_setting = [ + # t, c, n, s + [1, 16, 1, 1], + [6, 24, 2, 2], + [6, 32, 3, 2], + [6, 64, 4, 2], + [6, 96, 3, 1], + [6, 160, 3, 2], + [6, 320, 1, 1], + ] + + # only check the first element, assuming user knows t,c,n,s are required + if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4: + raise ValueError( + f"inverted_residual_setting should be non-empty or a 4-element list, got {inverted_residual_setting}" + ) + + # building first layer + input_channel = _make_divisible(input_channel * width_mult, round_nearest) + self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) + features: List[nn.Module] = [ + ConvNormActivation(3, input_channel, stride=2, norm_layer=norm_layer, activation_layer=nn.ReLU6) + ] + # building inverted residual blocks + for t, c, n, s in inverted_residual_setting: + output_channel = _make_divisible(c * width_mult, round_nearest) + for i in 
range(n): + stride = s if i == 0 else 1 + features.append(block(input_channel, output_channel, stride, expand_ratio=t, norm_layer=norm_layer)) + input_channel = output_channel + # building last several layers + features.append( + ConvNormActivation( + input_channel, self.last_channel, kernel_size=1, norm_layer=norm_layer, activation_layer=nn.ReLU6 + ) + ) + # make it nn.Sequential + self.features = nn.Sequential(*features) + + # building classifier + self.classifier = nn.Sequential( + nn.Dropout(p=dropout), + nn.Linear(self.last_channel, num_classes), + ) + + # weight initialization + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out") + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + nn.init.zeros_(m.bias) + + def _forward_impl(self, x: Tensor) -> Tensor: + # This exists since TorchScript doesn't support inheritance, so the superclass method + # (this one) needs to have a name other than `forward` that can be accessed in a subclass + x = self.features(x) + # Cannot use "squeeze" as batch-size can be 1 + x = nn.functional.adaptive_avg_pool2d(x, (1, 1)) + x = torch.flatten(x, 1) + x = self.classifier(x) + return x + + def forward(self, x: Tensor) -> Tensor: + return self._forward_impl(x) + + +def mobilenet_v2(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> MobileNetV2: + """ + Constructs a MobileNetV2 architecture from + `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + model = MobileNetV2(**kwargs) + if pretrained: + state_dict = load_state_dict_from_url(model_urls["mobilenet_v2"], progress=progress) + model.load_state_dict(state_dict) + return model diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mobilenetv3.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mobilenetv3.py new file mode 100644 index 0000000000000000000000000000000000000000..711888b7c8b236e3f370ec5833b8a875dce7883f --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mobilenetv3.py @@ -0,0 +1,328 @@ +import warnings +from functools import partial +from typing import Any, Callable, List, Optional, Sequence + +import torch +from torch import nn, Tensor + +from .._internally_replaced_utils import load_state_dict_from_url +from ..ops.misc import ConvNormActivation, SqueezeExcitation as SElayer +from ..utils import _log_api_usage_once +from ._utils import _make_divisible + + +__all__ = ["MobileNetV3", "mobilenet_v3_large", "mobilenet_v3_small"] + + +model_urls = { + "mobilenet_v3_large": "https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth", + "mobilenet_v3_small": "https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth", +} + + +class SqueezeExcitation(SElayer): + """DEPRECATED""" + + def __init__(self, input_channels: int, squeeze_factor: int = 4): + squeeze_channels = _make_divisible(input_channels // squeeze_factor, 8) + super().__init__(input_channels, squeeze_channels, scale_activation=nn.Hardsigmoid) + self.relu = self.activation + delattr(self, "activation") + warnings.warn( + "This SqueezeExcitation class is deprecated since 0.12 and will be removed in 0.14. 
" + "Use torchvision.ops.SqueezeExcitation instead.", + FutureWarning, + ) + + +class InvertedResidualConfig: + # Stores information listed at Tables 1 and 2 of the MobileNetV3 paper + def __init__( + self, + input_channels: int, + kernel: int, + expanded_channels: int, + out_channels: int, + use_se: bool, + activation: str, + stride: int, + dilation: int, + width_mult: float, + ): + self.input_channels = self.adjust_channels(input_channels, width_mult) + self.kernel = kernel + self.expanded_channels = self.adjust_channels(expanded_channels, width_mult) + self.out_channels = self.adjust_channels(out_channels, width_mult) + self.use_se = use_se + self.use_hs = activation == "HS" + self.stride = stride + self.dilation = dilation + + @staticmethod + def adjust_channels(channels: int, width_mult: float): + return _make_divisible(channels * width_mult, 8) + + +class InvertedResidual(nn.Module): + # Implemented as described at section 5 of MobileNetV3 paper + def __init__( + self, + cnf: InvertedResidualConfig, + norm_layer: Callable[..., nn.Module], + se_layer: Callable[..., nn.Module] = partial(SElayer, scale_activation=nn.Hardsigmoid), + ): + super().__init__() + if not (1 <= cnf.stride <= 2): + raise ValueError("illegal stride value") + + self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels + + layers: List[nn.Module] = [] + activation_layer = nn.Hardswish if cnf.use_hs else nn.ReLU + + # expand + if cnf.expanded_channels != cnf.input_channels: + layers.append( + ConvNormActivation( + cnf.input_channels, + cnf.expanded_channels, + kernel_size=1, + norm_layer=norm_layer, + activation_layer=activation_layer, + ) + ) + + # depthwise + stride = 1 if cnf.dilation > 1 else cnf.stride + layers.append( + ConvNormActivation( + cnf.expanded_channels, + cnf.expanded_channels, + kernel_size=cnf.kernel, + stride=stride, + dilation=cnf.dilation, + groups=cnf.expanded_channels, + norm_layer=norm_layer, + activation_layer=activation_layer, + ) + ) + if cnf.use_se: + squeeze_channels = _make_divisible(cnf.expanded_channels // 4, 8) + layers.append(se_layer(cnf.expanded_channels, squeeze_channels)) + + # project + layers.append( + ConvNormActivation( + cnf.expanded_channels, cnf.out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=None + ) + ) + + self.block = nn.Sequential(*layers) + self.out_channels = cnf.out_channels + self._is_cn = cnf.stride > 1 + + def forward(self, input: Tensor) -> Tensor: + result = self.block(input) + if self.use_res_connect: + result += input + return result + + +class MobileNetV3(nn.Module): + def __init__( + self, + inverted_residual_setting: List[InvertedResidualConfig], + last_channel: int, + num_classes: int = 1000, + block: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + dropout: float = 0.2, + **kwargs: Any, + ) -> None: + """ + MobileNet V3 main class + + Args: + inverted_residual_setting (List[InvertedResidualConfig]): Network structure + last_channel (int): The number of channels on the penultimate layer + num_classes (int): Number of classes + block (Optional[Callable[..., nn.Module]]): Module specifying inverted residual building block for mobilenet + norm_layer (Optional[Callable[..., nn.Module]]): Module specifying the normalization layer to use + dropout (float): The droupout probability + """ + super().__init__() + _log_api_usage_once(self) + + if not inverted_residual_setting: + raise ValueError("The inverted_residual_setting should not be empty") + elif not ( + 
isinstance(inverted_residual_setting, Sequence) + and all([isinstance(s, InvertedResidualConfig) for s in inverted_residual_setting]) + ): + raise TypeError("The inverted_residual_setting should be List[InvertedResidualConfig]") + + if block is None: + block = InvertedResidual + + if norm_layer is None: + norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.01) + + layers: List[nn.Module] = [] + + # building first layer + firstconv_output_channels = inverted_residual_setting[0].input_channels + layers.append( + ConvNormActivation( + 3, + firstconv_output_channels, + kernel_size=3, + stride=2, + norm_layer=norm_layer, + activation_layer=nn.Hardswish, + ) + ) + + # building inverted residual blocks + for cnf in inverted_residual_setting: + layers.append(block(cnf, norm_layer)) + + # building last several layers + lastconv_input_channels = inverted_residual_setting[-1].out_channels + lastconv_output_channels = 6 * lastconv_input_channels + layers.append( + ConvNormActivation( + lastconv_input_channels, + lastconv_output_channels, + kernel_size=1, + norm_layer=norm_layer, + activation_layer=nn.Hardswish, + ) + ) + + self.features = nn.Sequential(*layers) + self.avgpool = nn.AdaptiveAvgPool2d(1) + self.classifier = nn.Sequential( + nn.Linear(lastconv_output_channels, last_channel), + nn.Hardswish(inplace=True), + nn.Dropout(p=dropout, inplace=True), + nn.Linear(last_channel, num_classes), + ) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out") + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + nn.init.zeros_(m.bias) + + def _forward_impl(self, x: Tensor) -> Tensor: + x = self.features(x) + + x = self.avgpool(x) + x = torch.flatten(x, 1) + + x = self.classifier(x) + + return x + + def forward(self, x: Tensor) -> Tensor: + return self._forward_impl(x) + + +def _mobilenet_v3_conf( + arch: str, width_mult: float = 1.0, reduced_tail: bool = False, dilated: bool = False, **kwargs: Any +): + reduce_divider = 2 if reduced_tail else 1 + dilation = 2 if dilated else 1 + + bneck_conf = partial(InvertedResidualConfig, width_mult=width_mult) + adjust_channels = partial(InvertedResidualConfig.adjust_channels, width_mult=width_mult) + + if arch == "mobilenet_v3_large": + inverted_residual_setting = [ + bneck_conf(16, 3, 16, 16, False, "RE", 1, 1), + bneck_conf(16, 3, 64, 24, False, "RE", 2, 1), # C1 + bneck_conf(24, 3, 72, 24, False, "RE", 1, 1), + bneck_conf(24, 5, 72, 40, True, "RE", 2, 1), # C2 + bneck_conf(40, 5, 120, 40, True, "RE", 1, 1), + bneck_conf(40, 5, 120, 40, True, "RE", 1, 1), + bneck_conf(40, 3, 240, 80, False, "HS", 2, 1), # C3 + bneck_conf(80, 3, 200, 80, False, "HS", 1, 1), + bneck_conf(80, 3, 184, 80, False, "HS", 1, 1), + bneck_conf(80, 3, 184, 80, False, "HS", 1, 1), + bneck_conf(80, 3, 480, 112, True, "HS", 1, 1), + bneck_conf(112, 3, 672, 112, True, "HS", 1, 1), + bneck_conf(112, 5, 672, 160 // reduce_divider, True, "HS", 2, dilation), # C4 + bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1, dilation), + bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1, dilation), + ] + last_channel = adjust_channels(1280 // reduce_divider) # C5 + elif arch == "mobilenet_v3_small": + inverted_residual_setting = [ + bneck_conf(16, 3, 16, 16, True, "RE", 2, 1), # C1 + 
bneck_conf(16, 3, 72, 24, False, "RE", 2, 1), # C2 + bneck_conf(24, 3, 88, 24, False, "RE", 1, 1), + bneck_conf(24, 5, 96, 40, True, "HS", 2, 1), # C3 + bneck_conf(40, 5, 240, 40, True, "HS", 1, 1), + bneck_conf(40, 5, 240, 40, True, "HS", 1, 1), + bneck_conf(40, 5, 120, 48, True, "HS", 1, 1), + bneck_conf(48, 5, 144, 48, True, "HS", 1, 1), + bneck_conf(48, 5, 288, 96 // reduce_divider, True, "HS", 2, dilation), # C4 + bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider, 96 // reduce_divider, True, "HS", 1, dilation), + bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider, 96 // reduce_divider, True, "HS", 1, dilation), + ] + last_channel = adjust_channels(1024 // reduce_divider) # C5 + else: + raise ValueError(f"Unsupported model type {arch}") + + return inverted_residual_setting, last_channel + + +def _mobilenet_v3( + arch: str, + inverted_residual_setting: List[InvertedResidualConfig], + last_channel: int, + pretrained: bool, + progress: bool, + **kwargs: Any, +): + model = MobileNetV3(inverted_residual_setting, last_channel, **kwargs) + if pretrained: + if model_urls.get(arch, None) is None: + raise ValueError(f"No checkpoint is available for model type {arch}") + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + +def mobilenet_v3_large(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> MobileNetV3: + """ + Constructs a large MobileNetV3 architecture from + `"Searching for MobileNetV3" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + arch = "mobilenet_v3_large" + inverted_residual_setting, last_channel = _mobilenet_v3_conf(arch, **kwargs) + return _mobilenet_v3(arch, inverted_residual_setting, last_channel, pretrained, progress, **kwargs) + + +def mobilenet_v3_small(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> MobileNetV3: + """ + Constructs a small MobileNetV3 architecture from + `"Searching for MobileNetV3" `_. 
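# Illustrative sketch (not from the patch itself) of how the helpers above fit
# together: _mobilenet_v3_conf produces the per-block settings that the
# MobileNetV3 constructor consumes. Assumes the imports already at the top of
# this module (torch, nn); shapes are the usual ImageNet ones.
settings, last_channel = _mobilenet_v3_conf("mobilenet_v3_large", width_mult=1.0)
model = MobileNetV3(settings, last_channel, num_classes=1000)
logits = model(torch.randn(1, 3, 224, 224))  # -> tensor of shape (1, 1000)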
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + arch = "mobilenet_v3_small" + inverted_residual_setting, last_channel = _mobilenet_v3_conf(arch, **kwargs) + return _mobilenet_v3(arch, inverted_residual_setting, last_channel, pretrained, progress, **kwargs) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/regnet.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/regnet.py new file mode 100644 index 0000000000000000000000000000000000000000..3f393c8e82d9bb13a5b7b571313fca4343b7e237 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/regnet.py @@ -0,0 +1,608 @@ +# Modified from +# https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/anynet.py +# https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/regnet.py + + +import math +from collections import OrderedDict +from functools import partial +from typing import Any, Callable, List, Optional, Tuple + +import torch +from torch import nn, Tensor + +from .._internally_replaced_utils import load_state_dict_from_url +from ..ops.misc import ConvNormActivation, SqueezeExcitation +from ..utils import _log_api_usage_once +from ._utils import _make_divisible + + +__all__ = [ + "RegNet", + "regnet_y_400mf", + "regnet_y_800mf", + "regnet_y_1_6gf", + "regnet_y_3_2gf", + "regnet_y_8gf", + "regnet_y_16gf", + "regnet_y_32gf", + "regnet_y_128gf", + "regnet_x_400mf", + "regnet_x_800mf", + "regnet_x_1_6gf", + "regnet_x_3_2gf", + "regnet_x_8gf", + "regnet_x_16gf", + "regnet_x_32gf", +] + + +model_urls = { + "regnet_y_400mf": "https://download.pytorch.org/models/regnet_y_400mf-c65dace8.pth", + "regnet_y_800mf": "https://download.pytorch.org/models/regnet_y_800mf-1b27b58c.pth", + "regnet_y_1_6gf": "https://download.pytorch.org/models/regnet_y_1_6gf-b11a554e.pth", + "regnet_y_3_2gf": "https://download.pytorch.org/models/regnet_y_3_2gf-b5a9779c.pth", + "regnet_y_8gf": "https://download.pytorch.org/models/regnet_y_8gf-d0d0e4a8.pth", + "regnet_y_16gf": "https://download.pytorch.org/models/regnet_y_16gf-9e6ed7dd.pth", + "regnet_y_32gf": "https://download.pytorch.org/models/regnet_y_32gf-4dee3f7a.pth", + "regnet_x_400mf": "https://download.pytorch.org/models/regnet_x_400mf-adf1edd5.pth", + "regnet_x_800mf": "https://download.pytorch.org/models/regnet_x_800mf-ad17e45c.pth", + "regnet_x_1_6gf": "https://download.pytorch.org/models/regnet_x_1_6gf-e3633e7f.pth", + "regnet_x_3_2gf": "https://download.pytorch.org/models/regnet_x_3_2gf-f342aeae.pth", + "regnet_x_8gf": "https://download.pytorch.org/models/regnet_x_8gf-03ceed89.pth", + "regnet_x_16gf": "https://download.pytorch.org/models/regnet_x_16gf-2007eb11.pth", + "regnet_x_32gf": "https://download.pytorch.org/models/regnet_x_32gf-9d47f8d0.pth", +} + + +class SimpleStemIN(ConvNormActivation): + """Simple stem for ImageNet: 3x3, BN, ReLU.""" + + def __init__( + self, + width_in: int, + width_out: int, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], + ) -> None: + super().__init__( + width_in, width_out, kernel_size=3, stride=2, norm_layer=norm_layer, activation_layer=activation_layer + ) + + +class BottleneckTransform(nn.Sequential): + """Bottleneck transformation: 1x1, 3x3 [+SE], 1x1.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], + 
group_width: int, + bottleneck_multiplier: float, + se_ratio: Optional[float], + ) -> None: + layers: OrderedDict[str, nn.Module] = OrderedDict() + w_b = int(round(width_out * bottleneck_multiplier)) + g = w_b // group_width + + layers["a"] = ConvNormActivation( + width_in, w_b, kernel_size=1, stride=1, norm_layer=norm_layer, activation_layer=activation_layer + ) + layers["b"] = ConvNormActivation( + w_b, w_b, kernel_size=3, stride=stride, groups=g, norm_layer=norm_layer, activation_layer=activation_layer + ) + + if se_ratio: + # The SE reduction ratio is defined with respect to the + # beginning of the block + width_se_out = int(round(se_ratio * width_in)) + layers["se"] = SqueezeExcitation( + input_channels=w_b, + squeeze_channels=width_se_out, + activation=activation_layer, + ) + + layers["c"] = ConvNormActivation( + w_b, width_out, kernel_size=1, stride=1, norm_layer=norm_layer, activation_layer=None + ) + super().__init__(layers) + + +class ResBottleneckBlock(nn.Module): + """Residual bottleneck block: x + F(x), F = bottleneck transform.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], + group_width: int = 1, + bottleneck_multiplier: float = 1.0, + se_ratio: Optional[float] = None, + ) -> None: + super().__init__() + + # Use skip connection with projection if shape changes + self.proj = None + should_proj = (width_in != width_out) or (stride != 1) + if should_proj: + self.proj = ConvNormActivation( + width_in, width_out, kernel_size=1, stride=stride, norm_layer=norm_layer, activation_layer=None + ) + self.f = BottleneckTransform( + width_in, + width_out, + stride, + norm_layer, + activation_layer, + group_width, + bottleneck_multiplier, + se_ratio, + ) + self.activation = activation_layer(inplace=True) + + def forward(self, x: Tensor) -> Tensor: + if self.proj is not None: + x = self.proj(x) + self.f(x) + else: + x = x + self.f(x) + return self.activation(x) + + +class AnyStage(nn.Sequential): + """AnyNet stage (sequence of blocks w/ the same output shape).""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + depth: int, + block_constructor: Callable[..., nn.Module], + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], + group_width: int, + bottleneck_multiplier: float, + se_ratio: Optional[float] = None, + stage_index: int = 0, + ) -> None: + super().__init__() + + for i in range(depth): + block = block_constructor( + width_in if i == 0 else width_out, + width_out, + stride if i == 0 else 1, + norm_layer, + activation_layer, + group_width, + bottleneck_multiplier, + se_ratio, + ) + + self.add_module(f"block{stage_index}-{i}", block) + + +class BlockParams: + def __init__( + self, + depths: List[int], + widths: List[int], + group_widths: List[int], + bottleneck_multipliers: List[float], + strides: List[int], + se_ratio: Optional[float] = None, + ) -> None: + self.depths = depths + self.widths = widths + self.group_widths = group_widths + self.bottleneck_multipliers = bottleneck_multipliers + self.strides = strides + self.se_ratio = se_ratio + + @classmethod + def from_init_params( + cls, + depth: int, + w_0: int, + w_a: float, + w_m: float, + group_width: int, + bottleneck_multiplier: float = 1.0, + se_ratio: Optional[float] = None, + **kwargs: Any, + ) -> "BlockParams": + """ + Programatically compute all the per-block settings, + given the RegNet parameters. 
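# Illustrative sketch of a single ResBottleneckBlock defined earlier in this
# file. The values are arbitrary and only meant to show the expected shapes:
# a stride-2 block changes both the channel count and the resolution, so the
# projection shortcut is used.
block = ResBottleneckBlock(
    width_in=64,
    width_out=128,
    stride=2,
    norm_layer=nn.BatchNorm2d,
    activation_layer=nn.ReLU,
    group_width=16,            # 128 bottleneck channels / 16 per group -> 8 groups
    bottleneck_multiplier=1.0,
    se_ratio=0.25,             # SE squeeze width = round(0.25 * width_in) = 16
)
y = block(torch.randn(1, 64, 56, 56))  # -> shape (1, 128, 28, 28)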
+ + The first step is to compute the quantized linear block parameters, + in log space. Key parameters are: + - `w_a` is the width progression slope + - `w_0` is the initial width + - `w_m` is the width stepping in the log space + + In other terms + `log(block_width) = log(w_0) + w_m * block_capacity`, + with `bock_capacity` ramping up following the w_0 and w_a params. + This block width is finally quantized to multiples of 8. + + The second step is to compute the parameters per stage, + taking into account the skip connection and the final 1x1 convolutions. + We use the fact that the output width is constant within a stage. + """ + + QUANT = 8 + STRIDE = 2 + + if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: + raise ValueError("Invalid RegNet settings") + # Compute the block widths. Each stage has one unique block width + widths_cont = torch.arange(depth) * w_a + w_0 + block_capacity = torch.round(torch.log(widths_cont / w_0) / math.log(w_m)) + block_widths = (torch.round(torch.divide(w_0 * torch.pow(w_m, block_capacity), QUANT)) * QUANT).int().tolist() + num_stages = len(set(block_widths)) + + # Convert to per stage parameters + split_helper = zip( + block_widths + [0], + [0] + block_widths, + block_widths + [0], + [0] + block_widths, + ) + splits = [w != wp or r != rp for w, wp, r, rp in split_helper] + + stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] + stage_depths = torch.diff(torch.tensor([d for d, t in enumerate(splits) if t])).int().tolist() + + strides = [STRIDE] * num_stages + bottleneck_multipliers = [bottleneck_multiplier] * num_stages + group_widths = [group_width] * num_stages + + # Adjust the compatibility of stage widths and group widths + stage_widths, group_widths = cls._adjust_widths_groups_compatibilty( + stage_widths, bottleneck_multipliers, group_widths + ) + + return cls( + depths=stage_depths, + widths=stage_widths, + group_widths=group_widths, + bottleneck_multipliers=bottleneck_multipliers, + strides=strides, + se_ratio=se_ratio, + ) + + def _get_expanded_params(self): + return zip(self.widths, self.strides, self.depths, self.group_widths, self.bottleneck_multipliers) + + @staticmethod + def _adjust_widths_groups_compatibilty( + stage_widths: List[int], bottleneck_ratios: List[float], group_widths: List[int] + ) -> Tuple[List[int], List[int]]: + """ + Adjusts the compatibility of widths and groups, + depending on the bottleneck ratio. 
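# Worked sketch of the width quantization that from_init_params (above)
# applies. The parameter values here are only illustrative; the formulas are
# the ones used in the method body.
import math
import torch
depth, w_0, w_a, w_m, QUANT = 16, 48, 27.89, 2.09, 8
widths_cont = torch.arange(depth) * w_a + w_0                      # linear ramp of widths
block_capacity = torch.round(torch.log(widths_cont / w_0) / math.log(w_m))
block_widths = (torch.round(w_0 * torch.pow(w_m, block_capacity) / QUANT) * QUANT).int().tolist()
# Each distinct value in block_widths becomes one stage width; the number of
# repeats of that value becomes the stage depth.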
+ """ + # Compute all widths for the current settings + widths = [int(w * b) for w, b in zip(stage_widths, bottleneck_ratios)] + group_widths_min = [min(g, w_bot) for g, w_bot in zip(group_widths, widths)] + + # Compute the adjusted widths so that stage and group widths fit + ws_bot = [_make_divisible(w_bot, g) for w_bot, g in zip(widths, group_widths_min)] + stage_widths = [int(w_bot / b) for w_bot, b in zip(ws_bot, bottleneck_ratios)] + return stage_widths, group_widths_min + + +class RegNet(nn.Module): + def __init__( + self, + block_params: BlockParams, + num_classes: int = 1000, + stem_width: int = 32, + stem_type: Optional[Callable[..., nn.Module]] = None, + block_type: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + activation: Optional[Callable[..., nn.Module]] = None, + ) -> None: + super().__init__() + _log_api_usage_once(self) + + if stem_type is None: + stem_type = SimpleStemIN + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if block_type is None: + block_type = ResBottleneckBlock + if activation is None: + activation = nn.ReLU + + # Ad hoc stem + self.stem = stem_type( + 3, # width_in + stem_width, + norm_layer, + activation, + ) + + current_width = stem_width + + blocks = [] + for i, ( + width_out, + stride, + depth, + group_width, + bottleneck_multiplier, + ) in enumerate(block_params._get_expanded_params()): + blocks.append( + ( + f"block{i+1}", + AnyStage( + current_width, + width_out, + stride, + depth, + block_type, + norm_layer, + activation, + group_width, + bottleneck_multiplier, + block_params.se_ratio, + stage_index=i + 1, + ), + ) + ) + + current_width = width_out + + self.trunk_output = nn.Sequential(OrderedDict(blocks)) + + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(in_features=current_width, out_features=num_classes) + + # Performs ResNet-style weight initialization + for m in self.modules(): + if isinstance(m, nn.Conv2d): + # Note that there is no bias due to BN + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + nn.init.normal_(m.weight, mean=0.0, std=math.sqrt(2.0 / fan_out)) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, mean=0.0, std=0.01) + nn.init.zeros_(m.bias) + + def forward(self, x: Tensor) -> Tensor: + x = self.stem(x) + x = self.trunk_output(x) + + x = self.avgpool(x) + x = x.flatten(start_dim=1) + x = self.fc(x) + + return x + + +def _regnet(arch: str, block_params: BlockParams, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: + norm_layer = kwargs.pop("norm_layer", partial(nn.BatchNorm2d, eps=1e-05, momentum=0.1)) + model = RegNet(block_params, norm_layer=norm_layer, **kwargs) + if pretrained: + if arch not in model_urls: + raise ValueError(f"No checkpoint is available for model type {arch}") + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + +def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_400MF architecture from + `"Designing Network Design Spaces" `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8, se_ratio=0.25, **kwargs) + return _regnet("regnet_y_400mf", params, pretrained, progress, **kwargs) + + +def regnet_y_800mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_800MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params(depth=14, w_0=56, w_a=38.84, w_m=2.4, group_width=16, se_ratio=0.25, **kwargs) + return _regnet("regnet_y_800mf", params, pretrained, progress, **kwargs) + + +def regnet_y_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_1.6GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params( + depth=27, w_0=48, w_a=20.71, w_m=2.65, group_width=24, se_ratio=0.25, **kwargs + ) + return _regnet("regnet_y_1_6gf", params, pretrained, progress, **kwargs) + + +def regnet_y_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_3.2GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params( + depth=21, w_0=80, w_a=42.63, w_m=2.66, group_width=24, se_ratio=0.25, **kwargs + ) + return _regnet("regnet_y_3_2gf", params, pretrained, progress, **kwargs) + + +def regnet_y_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_8GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params( + depth=17, w_0=192, w_a=76.82, w_m=2.19, group_width=56, se_ratio=0.25, **kwargs + ) + return _regnet("regnet_y_8gf", params, pretrained, progress, **kwargs) + + +def regnet_y_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_16GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params( + depth=18, w_0=200, w_a=106.23, w_m=2.48, group_width=112, se_ratio=0.25, **kwargs + ) + return _regnet("regnet_y_16gf", params, pretrained, progress, **kwargs) + + +def regnet_y_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_32GF architecture from + `"Designing Network Design Spaces" `_. 
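# Sketch of overriding the normalization layer: _regnet pops an optional
# "norm_layer" kwarg before building the RegNet, so the BatchNorm settings can
# be changed without touching the constructors. Values are illustrative;
# partial is already imported at the top of this module.
from functools import partial
model = regnet_y_800mf(pretrained=False, norm_layer=partial(nn.BatchNorm2d, eps=1e-3, momentum=0.03))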
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params( + depth=20, w_0=232, w_a=115.89, w_m=2.53, group_width=232, se_ratio=0.25, **kwargs + ) + return _regnet("regnet_y_32gf", params, pretrained, progress, **kwargs) + + +def regnet_y_128gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_128GF architecture from + `"Designing Network Design Spaces" `_. + NOTE: Pretrained weights are not available for this model. + """ + params = BlockParams.from_init_params( + depth=27, w_0=456, w_a=160.83, w_m=2.52, group_width=264, se_ratio=0.25, **kwargs + ) + return _regnet("regnet_y_128gf", params, pretrained, progress, **kwargs) + + +def regnet_x_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_400MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params(depth=22, w_0=24, w_a=24.48, w_m=2.54, group_width=16, **kwargs) + return _regnet("regnet_x_400mf", params, pretrained, progress, **kwargs) + + +def regnet_x_800mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_800MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params(depth=16, w_0=56, w_a=35.73, w_m=2.28, group_width=16, **kwargs) + return _regnet("regnet_x_800mf", params, pretrained, progress, **kwargs) + + +def regnet_x_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_1.6GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params(depth=18, w_0=80, w_a=34.01, w_m=2.25, group_width=24, **kwargs) + return _regnet("regnet_x_1_6gf", params, pretrained, progress, **kwargs) + + +def regnet_x_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_3.2GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params(depth=25, w_0=88, w_a=26.31, w_m=2.25, group_width=48, **kwargs) + return _regnet("regnet_x_3_2gf", params, pretrained, progress, **kwargs) + + +def regnet_x_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_8GF architecture from + `"Designing Network Design Spaces" `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params(depth=23, w_0=80, w_a=49.56, w_m=2.88, group_width=120, **kwargs) + return _regnet("regnet_x_8gf", params, pretrained, progress, **kwargs) + + +def regnet_x_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_16GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params(depth=22, w_0=216, w_a=55.59, w_m=2.1, group_width=128, **kwargs) + return _regnet("regnet_x_16gf", params, pretrained, progress, **kwargs) + + +def regnet_x_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_32GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params(depth=23, w_0=320, w_a=69.86, w_m=2.0, group_width=168, **kwargs) + return _regnet("regnet_x_32gf", params, pretrained, progress, **kwargs) + + +# TODO(kazhang): Add RegNetZ_500MF and RegNetZ_4GF diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/resnet.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..b0bb8d13adec56fef2bd76752889d46379a0d539 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/resnet.py @@ -0,0 +1,413 @@ +from typing import Type, Any, Callable, Union, List, Optional + +import torch +import torch.nn as nn +from torch import Tensor + +from .._internally_replaced_utils import load_state_dict_from_url +from ..utils import _log_api_usage_once + + +__all__ = [ + "ResNet", + "resnet18", + "resnet34", + "resnet50", + "resnet101", + "resnet152", + "resnext50_32x4d", + "resnext101_32x8d", + "wide_resnet50_2", + "wide_resnet101_2", +] + + +model_urls = { + "resnet18": "https://download.pytorch.org/models/resnet18-f37072fd.pth", + "resnet34": "https://download.pytorch.org/models/resnet34-b627a593.pth", + "resnet50": "https://download.pytorch.org/models/resnet50-0676ba61.pth", + "resnet101": "https://download.pytorch.org/models/resnet101-63fe2227.pth", + "resnet152": "https://download.pytorch.org/models/resnet152-394f9c45.pth", + "resnext50_32x4d": "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth", + "resnext101_32x8d": "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth", + "wide_resnet50_2": "https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth", + "wide_resnet101_2": "https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth", +} + + +def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d: + """3x3 convolution with padding""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation, + ) + + +def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d: + """1x1 convolution""" + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) + + +class 
BasicBlock(nn.Module): + expansion: int = 1 + + def __init__( + self, + inplanes: int, + planes: int, + stride: int = 1, + downsample: Optional[nn.Module] = None, + groups: int = 1, + base_width: int = 64, + dilation: int = 1, + norm_layer: Optional[Callable[..., nn.Module]] = None, + ) -> None: + super().__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError("BasicBlock only supports groups=1 and base_width=64") + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x: Tensor) -> Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) + # while original implementation places the stride at the first 1x1 convolution(self.conv1) + # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. + # This variant is also known as ResNet V1.5 and improves accuracy according to + # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. + + expansion: int = 4 + + def __init__( + self, + inplanes: int, + planes: int, + stride: int = 1, + downsample: Optional[nn.Module] = None, + groups: int = 1, + base_width: int = 64, + dilation: int = 1, + norm_layer: Optional[Callable[..., nn.Module]] = None, + ) -> None: + super().__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.0)) * groups + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x: Tensor) -> Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + def __init__( + self, + block: Type[Union[BasicBlock, Bottleneck]], + layers: List[int], + num_classes: int = 1000, + zero_init_residual: bool = False, + groups: int = 1, + width_per_group: int = 64, + replace_stride_with_dilation: Optional[List[bool]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + ) -> None: + super().__init__() + _log_api_usage_once(self) + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should 
replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError( + "replace_stride_with_dilation should be None " + f"or a 3-element tuple, got {replace_stride_with_dilation}" + ) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2]) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(512 * block.expansion, num_classes) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. + # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + if zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + nn.init.constant_(m.bn3.weight, 0) # type: ignore[arg-type] + elif isinstance(m, BasicBlock): + nn.init.constant_(m.bn2.weight, 0) # type: ignore[arg-type] + + def _make_layer( + self, + block: Type[Union[BasicBlock, Bottleneck]], + planes: int, + blocks: int, + stride: int = 1, + dilate: bool = False, + ) -> nn.Sequential: + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append( + block( + self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer + ) + ) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block( + self.inplanes, + planes, + groups=self.groups, + base_width=self.base_width, + dilation=self.dilation, + norm_layer=norm_layer, + ) + ) + + return nn.Sequential(*layers) + + def _forward_impl(self, x: Tensor) -> Tensor: + # See note [TorchScript super()] + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.fc(x) + + return x + + def forward(self, x: Tensor) -> Tensor: + return self._forward_impl(x) + + +def _resnet( + arch: str, + block: Type[Union[BasicBlock, Bottleneck]], + layers: List[int], + pretrained: bool, + progress: bool, + **kwargs: Any, +) -> ResNet: + model = ResNet(block, layers, **kwargs) + if pretrained: + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + +def resnet18(pretrained: bool = False, progress: bool 
= True, **kwargs: Any) -> ResNet: + r"""ResNet-18 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet("resnet18", BasicBlock, [2, 2, 2, 2], pretrained, progress, **kwargs) + + +def resnet34(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNet-34 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet("resnet34", BasicBlock, [3, 4, 6, 3], pretrained, progress, **kwargs) + + +def resnet50(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNet-50 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet("resnet50", Bottleneck, [3, 4, 6, 3], pretrained, progress, **kwargs) + + +def resnet101(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNet-101 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet("resnet101", Bottleneck, [3, 4, 23, 3], pretrained, progress, **kwargs) + + +def resnet152(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNet-152 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet("resnet152", Bottleneck, [3, 8, 36, 3], pretrained, progress, **kwargs) + + +def resnext50_32x4d(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNeXt-50 32x4d model from + `"Aggregated Residual Transformation for Deep Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs["groups"] = 32 + kwargs["width_per_group"] = 4 + return _resnet("resnext50_32x4d", Bottleneck, [3, 4, 6, 3], pretrained, progress, **kwargs) + + +def resnext101_32x8d(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNeXt-101 32x8d model from + `"Aggregated Residual Transformation for Deep Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs["groups"] = 32 + kwargs["width_per_group"] = 8 + return _resnet("resnext101_32x8d", Bottleneck, [3, 4, 23, 3], pretrained, progress, **kwargs) + + +def wide_resnet50_2(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""Wide ResNet-50-2 model from + `"Wide Residual Networks" `_. + + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. 
last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs["width_per_group"] = 64 * 2 + return _resnet("wide_resnet50_2", Bottleneck, [3, 4, 6, 3], pretrained, progress, **kwargs) + + +def wide_resnet101_2(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""Wide ResNet-101-2 model from + `"Wide Residual Networks" `_. + + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs["width_per_group"] = 64 * 2 + return _resnet("wide_resnet101_2", Bottleneck, [3, 4, 23, 3], pretrained, progress, **kwargs) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/shufflenetv2.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/shufflenetv2.py new file mode 100644 index 0000000000000000000000000000000000000000..f3758c54aaf46cdd57c67fd217e68131ade903fb --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/shufflenetv2.py @@ -0,0 +1,222 @@ +from typing import Callable, Any, List + +import torch +import torch.nn as nn +from torch import Tensor + +from .._internally_replaced_utils import load_state_dict_from_url +from ..utils import _log_api_usage_once + + +__all__ = ["ShuffleNetV2", "shufflenet_v2_x0_5", "shufflenet_v2_x1_0", "shufflenet_v2_x1_5", "shufflenet_v2_x2_0"] + +model_urls = { + "shufflenetv2_x0.5": "https://download.pytorch.org/models/shufflenetv2_x0.5-f707e7126e.pth", + "shufflenetv2_x1.0": "https://download.pytorch.org/models/shufflenetv2_x1-5666bf0f80.pth", + "shufflenetv2_x1.5": None, + "shufflenetv2_x2.0": None, +} + + +def channel_shuffle(x: Tensor, groups: int) -> Tensor: + batchsize, num_channels, height, width = x.size() + channels_per_group = num_channels // groups + + # reshape + x = x.view(batchsize, groups, channels_per_group, height, width) + + x = torch.transpose(x, 1, 2).contiguous() + + # flatten + x = x.view(batchsize, -1, height, width) + + return x + + +class InvertedResidual(nn.Module): + def __init__(self, inp: int, oup: int, stride: int) -> None: + super().__init__() + + if not (1 <= stride <= 3): + raise ValueError("illegal stride value") + self.stride = stride + + branch_features = oup // 2 + assert (self.stride != 1) or (inp == branch_features << 1) + + if self.stride > 1: + self.branch1 = nn.Sequential( + self.depthwise_conv(inp, inp, kernel_size=3, stride=self.stride, padding=1), + nn.BatchNorm2d(inp), + nn.Conv2d(inp, branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + ) + else: + self.branch1 = nn.Sequential() + + self.branch2 = nn.Sequential( + nn.Conv2d( + inp if (self.stride > 1) else branch_features, + branch_features, + kernel_size=1, + stride=1, + padding=0, + bias=False, + ), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + self.depthwise_conv(branch_features, branch_features, kernel_size=3, stride=self.stride, padding=1), + nn.BatchNorm2d(branch_features), + 
nn.Conv2d(branch_features, branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + ) + + @staticmethod + def depthwise_conv( + i: int, o: int, kernel_size: int, stride: int = 1, padding: int = 0, bias: bool = False + ) -> nn.Conv2d: + return nn.Conv2d(i, o, kernel_size, stride, padding, bias=bias, groups=i) + + def forward(self, x: Tensor) -> Tensor: + if self.stride == 1: + x1, x2 = x.chunk(2, dim=1) + out = torch.cat((x1, self.branch2(x2)), dim=1) + else: + out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) + + out = channel_shuffle(out, 2) + + return out + + +class ShuffleNetV2(nn.Module): + def __init__( + self, + stages_repeats: List[int], + stages_out_channels: List[int], + num_classes: int = 1000, + inverted_residual: Callable[..., nn.Module] = InvertedResidual, + ) -> None: + super().__init__() + _log_api_usage_once(self) + + if len(stages_repeats) != 3: + raise ValueError("expected stages_repeats as list of 3 positive ints") + if len(stages_out_channels) != 5: + raise ValueError("expected stages_out_channels as list of 5 positive ints") + self._stage_out_channels = stages_out_channels + + input_channels = 3 + output_channels = self._stage_out_channels[0] + self.conv1 = nn.Sequential( + nn.Conv2d(input_channels, output_channels, 3, 2, 1, bias=False), + nn.BatchNorm2d(output_channels), + nn.ReLU(inplace=True), + ) + input_channels = output_channels + + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + # Static annotations for mypy + self.stage2: nn.Sequential + self.stage3: nn.Sequential + self.stage4: nn.Sequential + stage_names = [f"stage{i}" for i in [2, 3, 4]] + for name, repeats, output_channels in zip(stage_names, stages_repeats, self._stage_out_channels[1:]): + seq = [inverted_residual(input_channels, output_channels, 2)] + for i in range(repeats - 1): + seq.append(inverted_residual(output_channels, output_channels, 1)) + setattr(self, name, nn.Sequential(*seq)) + input_channels = output_channels + + output_channels = self._stage_out_channels[-1] + self.conv5 = nn.Sequential( + nn.Conv2d(input_channels, output_channels, 1, 1, 0, bias=False), + nn.BatchNorm2d(output_channels), + nn.ReLU(inplace=True), + ) + + self.fc = nn.Linear(output_channels, num_classes) + + def _forward_impl(self, x: Tensor) -> Tensor: + # See note [TorchScript super()] + x = self.conv1(x) + x = self.maxpool(x) + x = self.stage2(x) + x = self.stage3(x) + x = self.stage4(x) + x = self.conv5(x) + x = x.mean([2, 3]) # globalpool + x = self.fc(x) + return x + + def forward(self, x: Tensor) -> Tensor: + return self._forward_impl(x) + + +def _shufflenetv2(arch: str, pretrained: bool, progress: bool, *args: Any, **kwargs: Any) -> ShuffleNetV2: + model = ShuffleNetV2(*args, **kwargs) + + if pretrained: + model_url = model_urls[arch] + if model_url is None: + raise ValueError(f"No checkpoint is available for model type {arch}") + else: + state_dict = load_state_dict_from_url(model_url, progress=progress) + model.load_state_dict(state_dict) + + return model + + +def shufflenet_v2_x0_5(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ShuffleNetV2: + """ + Constructs a ShuffleNetV2 with 0.5x output channels, as described in + `"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" + `_. 
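# Illustrative sketch of channel_shuffle defined above: with groups=2 the two
# halves of the channel dimension are interleaved, which is what lets the two
# ShuffleNetV2 branches exchange information between blocks.
x = torch.arange(8.0).view(1, 8, 1, 1)   # channels ordered 0..7
y = channel_shuffle(x, groups=2)
# y now holds the channels in the order 0, 4, 1, 5, 2, 6, 3, 7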
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _shufflenetv2("shufflenetv2_x0.5", pretrained, progress, [4, 8, 4], [24, 48, 96, 192, 1024], **kwargs) + + +def shufflenet_v2_x1_0(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ShuffleNetV2: + """ + Constructs a ShuffleNetV2 with 1.0x output channels, as described in + `"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" + `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _shufflenetv2("shufflenetv2_x1.0", pretrained, progress, [4, 8, 4], [24, 116, 232, 464, 1024], **kwargs) + + +def shufflenet_v2_x1_5(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ShuffleNetV2: + """ + Constructs a ShuffleNetV2 with 1.5x output channels, as described in + `"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" + `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _shufflenetv2("shufflenetv2_x1.5", pretrained, progress, [4, 8, 4], [24, 176, 352, 704, 1024], **kwargs) + + +def shufflenet_v2_x2_0(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ShuffleNetV2: + """ + Constructs a ShuffleNetV2 with 2.0x output channels, as described in + `"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" + `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _shufflenetv2("shufflenetv2_x2.0", pretrained, progress, [4, 8, 4], [24, 244, 488, 976, 2048], **kwargs) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/squeezenet.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/squeezenet.py new file mode 100644 index 0000000000000000000000000000000000000000..2c1a30f225de90454228a961846238079f300cf1 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/squeezenet.py @@ -0,0 +1,133 @@ +from typing import Any + +import torch +import torch.nn as nn +import torch.nn.init as init + +from .._internally_replaced_utils import load_state_dict_from_url +from ..utils import _log_api_usage_once + +__all__ = ["SqueezeNet", "squeezenet1_0", "squeezenet1_1"] + +model_urls = { + "squeezenet1_0": "https://download.pytorch.org/models/squeezenet1_0-b66bff10.pth", + "squeezenet1_1": "https://download.pytorch.org/models/squeezenet1_1-b8a52dc0.pth", +} + + +class Fire(nn.Module): + def __init__(self, inplanes: int, squeeze_planes: int, expand1x1_planes: int, expand3x3_planes: int) -> None: + super().__init__() + self.inplanes = inplanes + self.squeeze = nn.Conv2d(inplanes, squeeze_planes, kernel_size=1) + self.squeeze_activation = nn.ReLU(inplace=True) + self.expand1x1 = nn.Conv2d(squeeze_planes, expand1x1_planes, kernel_size=1) + self.expand1x1_activation = nn.ReLU(inplace=True) + self.expand3x3 = nn.Conv2d(squeeze_planes, expand3x3_planes, kernel_size=3, padding=1) + self.expand3x3_activation = nn.ReLU(inplace=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.squeeze_activation(self.squeeze(x)) + return torch.cat( + [self.expand1x1_activation(self.expand1x1(x)), 
self.expand3x3_activation(self.expand3x3(x))], 1 + ) + + +class SqueezeNet(nn.Module): + def __init__(self, version: str = "1_0", num_classes: int = 1000, dropout: float = 0.5) -> None: + super().__init__() + _log_api_usage_once(self) + self.num_classes = num_classes + if version == "1_0": + self.features = nn.Sequential( + nn.Conv2d(3, 96, kernel_size=7, stride=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), + Fire(96, 16, 64, 64), + Fire(128, 16, 64, 64), + Fire(128, 32, 128, 128), + nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), + Fire(256, 32, 128, 128), + Fire(256, 48, 192, 192), + Fire(384, 48, 192, 192), + Fire(384, 64, 256, 256), + nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), + Fire(512, 64, 256, 256), + ) + elif version == "1_1": + self.features = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=3, stride=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), + Fire(64, 16, 64, 64), + Fire(128, 16, 64, 64), + nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), + Fire(128, 32, 128, 128), + Fire(256, 32, 128, 128), + nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), + Fire(256, 48, 192, 192), + Fire(384, 48, 192, 192), + Fire(384, 64, 256, 256), + Fire(512, 64, 256, 256), + ) + else: + # FIXME: Is this needed? SqueezeNet should only be called from the + # FIXME: squeezenet1_x() functions + # FIXME: This checking is not done for the other models + raise ValueError(f"Unsupported SqueezeNet version {version}: 1_0 or 1_1 expected") + + # Final convolution is initialized differently from the rest + final_conv = nn.Conv2d(512, self.num_classes, kernel_size=1) + self.classifier = nn.Sequential( + nn.Dropout(p=dropout), final_conv, nn.ReLU(inplace=True), nn.AdaptiveAvgPool2d((1, 1)) + ) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + if m is final_conv: + init.normal_(m.weight, mean=0.0, std=0.01) + else: + init.kaiming_uniform_(m.weight) + if m.bias is not None: + init.constant_(m.bias, 0) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.features(x) + x = self.classifier(x) + return torch.flatten(x, 1) + + +def _squeezenet(version: str, pretrained: bool, progress: bool, **kwargs: Any) -> SqueezeNet: + model = SqueezeNet(version, **kwargs) + if pretrained: + arch = "squeezenet" + version + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + +def squeezenet1_0(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> SqueezeNet: + r"""SqueezeNet model architecture from the `"SqueezeNet: AlexNet-level + accuracy with 50x fewer parameters and <0.5MB model size" + `_ paper. + The required minimum input size of the model is 21x21. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _squeezenet("1_0", pretrained, progress, **kwargs) + + +def squeezenet1_1(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> SqueezeNet: + r"""SqueezeNet 1.1 model from the `official SqueezeNet repo + `_. + SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters + than SqueezeNet 1.0, without sacrificing accuracy. + The required minimum input size of the model is 17x17. 
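# Illustrative sketch of a single Fire module defined above: the 1x1 squeeze
# reduces to 16 channels, then the 1x1 and 3x3 expand paths are concatenated
# to 64 + 64 = 128 output channels; the spatial size is preserved.
fire = Fire(inplanes=96, squeeze_planes=16, expand1x1_planes=64, expand3x3_planes=64)
y = fire(torch.randn(1, 96, 54, 54))  # -> shape (1, 128, 54, 54)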
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _squeezenet("1_1", pretrained, progress, **kwargs) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/vgg.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..07639017a317e60cbe75c6b1f47eb28eefd6d332 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/vgg.py @@ -0,0 +1,201 @@ +from typing import Union, List, Dict, Any, cast + +import torch +import torch.nn as nn + +from .._internally_replaced_utils import load_state_dict_from_url +from ..utils import _log_api_usage_once + + +__all__ = [ + "VGG", + "vgg11", + "vgg11_bn", + "vgg13", + "vgg13_bn", + "vgg16", + "vgg16_bn", + "vgg19_bn", + "vgg19", +] + + +model_urls = { + "vgg11": "https://download.pytorch.org/models/vgg11-8a719046.pth", + "vgg13": "https://download.pytorch.org/models/vgg13-19584684.pth", + "vgg16": "https://download.pytorch.org/models/vgg16-397923af.pth", + "vgg19": "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth", + "vgg11_bn": "https://download.pytorch.org/models/vgg11_bn-6002323d.pth", + "vgg13_bn": "https://download.pytorch.org/models/vgg13_bn-abd245e5.pth", + "vgg16_bn": "https://download.pytorch.org/models/vgg16_bn-6c64b313.pth", + "vgg19_bn": "https://download.pytorch.org/models/vgg19_bn-c79401a0.pth", +} + + +class VGG(nn.Module): + def __init__( + self, features: nn.Module, num_classes: int = 1000, init_weights: bool = True, dropout: float = 0.5 + ) -> None: + super().__init__() + _log_api_usage_once(self) + self.features = features + self.avgpool = nn.AdaptiveAvgPool2d((7, 7)) + self.classifier = nn.Sequential( + nn.Linear(512 * 7 * 7, 4096), + nn.ReLU(True), + nn.Dropout(p=dropout), + nn.Linear(4096, 4096), + nn.ReLU(True), + nn.Dropout(p=dropout), + nn.Linear(4096, num_classes), + ) + if init_weights: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + nn.init.constant_(m.bias, 0) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.features(x) + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.classifier(x) + return x + + +def make_layers(cfg: List[Union[str, int]], batch_norm: bool = False) -> nn.Sequential: + layers: List[nn.Module] = [] + in_channels = 3 + for v in cfg: + if v == "M": + layers += [nn.MaxPool2d(kernel_size=2, stride=2)] + else: + v = cast(int, v) + conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) + if batch_norm: + layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] + else: + layers += [conv2d, nn.ReLU(inplace=True)] + in_channels = v + return nn.Sequential(*layers) + + +cfgs: Dict[str, List[Union[str, int]]] = { + "A": [64, "M", 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"], + "B": [64, 64, "M", 128, 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"], + "D": [64, 64, "M", 128, 128, "M", 256, 256, 256, "M", 512, 512, 512, "M", 512, 512, 512, "M"], + "E": [64, 64, "M", 128, 128, "M", 256, 256, 256, 256, "M", 512, 512, 512, 512, "M", 512, 512, 512, 512, "M"], +} + + +def _vgg(arch: str, cfg: str, batch_norm: bool, 
pretrained: bool, progress: bool, **kwargs: Any) -> VGG: + if pretrained: + kwargs["init_weights"] = False + model = VGG(make_layers(cfgs[cfg], batch_norm=batch_norm), **kwargs) + if pretrained: + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + +def vgg11(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VGG: + r"""VGG 11-layer model (configuration "A") from + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. + The required minimum input size of the model is 32x32. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vgg("vgg11", "A", False, pretrained, progress, **kwargs) + + +def vgg11_bn(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VGG: + r"""VGG 11-layer model (configuration "A") with batch normalization + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. + The required minimum input size of the model is 32x32. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vgg("vgg11_bn", "A", True, pretrained, progress, **kwargs) + + +def vgg13(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VGG: + r"""VGG 13-layer model (configuration "B") + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. + The required minimum input size of the model is 32x32. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vgg("vgg13", "B", False, pretrained, progress, **kwargs) + + +def vgg13_bn(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VGG: + r"""VGG 13-layer model (configuration "B") with batch normalization + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. + The required minimum input size of the model is 32x32. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vgg("vgg13_bn", "B", True, pretrained, progress, **kwargs) + + +def vgg16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VGG: + r"""VGG 16-layer model (configuration "D") + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. + The required minimum input size of the model is 32x32. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vgg("vgg16", "D", False, pretrained, progress, **kwargs) + + +def vgg16_bn(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VGG: + r"""VGG 16-layer model (configuration "D") with batch normalization + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. + The required minimum input size of the model is 32x32. 
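# Illustrative sketch of how the configuration strings above map to layers:
# integers are 3x3 convolution widths and "M" inserts a 2x2 max pool, so cfg
# "A" is the VGG-11 layout (with batch_norm=True this matches what vgg11_bn
# builds). Values below are only meant to show the expected shapes.
features = make_layers(cfgs["A"], batch_norm=True)
model = VGG(features, num_classes=1000)
logits = model(torch.randn(1, 3, 224, 224))  # -> shape (1, 1000)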
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vgg("vgg16_bn", "D", True, pretrained, progress, **kwargs) + + +def vgg19(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VGG: + r"""VGG 19-layer model (configuration "E") + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. + The required minimum input size of the model is 32x32. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vgg("vgg19", "E", False, pretrained, progress, **kwargs) + + +def vgg19_bn(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VGG: + r"""VGG 19-layer model (configuration 'E') with batch normalization + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. + The required minimum input size of the model is 32x32. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vgg("vgg19_bn", "E", True, pretrained, progress, **kwargs) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/vision_transformer.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/vision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..b36658e34d845abbe5475ed6b0d2ab34ac7e2047 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/vision_transformer.py @@ -0,0 +1,468 @@ +import math +from collections import OrderedDict +from functools import partial +from typing import Any, Callable, List, NamedTuple, Optional + +import torch +import torch.nn as nn + +from .._internally_replaced_utils import load_state_dict_from_url +from ..ops.misc import ConvNormActivation +from ..utils import _log_api_usage_once + +__all__ = [ + "VisionTransformer", + "vit_b_16", + "vit_b_32", + "vit_l_16", + "vit_l_32", +] + +model_urls = { + "vit_b_16": "https://download.pytorch.org/models/vit_b_16-c867db91.pth", + "vit_b_32": "https://download.pytorch.org/models/vit_b_32-d86f8d99.pth", + "vit_l_16": "https://download.pytorch.org/models/vit_l_16-852ce7e3.pth", + "vit_l_32": "https://download.pytorch.org/models/vit_l_32-c7638314.pth", +} + + +class ConvStemConfig(NamedTuple): + out_channels: int + kernel_size: int + stride: int + norm_layer: Callable[..., nn.Module] = nn.BatchNorm2d + activation_layer: Callable[..., nn.Module] = nn.ReLU + + +class MLPBlock(nn.Sequential): + """Transformer MLP block.""" + + def __init__(self, in_dim: int, mlp_dim: int, dropout: float): + super().__init__() + self.linear_1 = nn.Linear(in_dim, mlp_dim) + self.act = nn.GELU() + self.dropout_1 = nn.Dropout(dropout) + self.linear_2 = nn.Linear(mlp_dim, in_dim) + self.dropout_2 = nn.Dropout(dropout) + + nn.init.xavier_uniform_(self.linear_1.weight) + nn.init.xavier_uniform_(self.linear_2.weight) + nn.init.normal_(self.linear_1.bias, std=1e-6) + nn.init.normal_(self.linear_2.bias, std=1e-6) + + +class EncoderBlock(nn.Module): + """Transformer encoder block.""" + + def __init__( + self, + num_heads: int, + hidden_dim: int, + mlp_dim: int, + dropout: float, + attention_dropout: float, + norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6), + ): + super().__init__() + self.num_heads = num_heads + + # Attention block + self.ln_1 = 
norm_layer(hidden_dim) + self.self_attention = nn.MultiheadAttention(hidden_dim, num_heads, dropout=attention_dropout, batch_first=True) + self.dropout = nn.Dropout(dropout) + + # MLP block + self.ln_2 = norm_layer(hidden_dim) + self.mlp = MLPBlock(hidden_dim, mlp_dim, dropout) + + def forward(self, input: torch.Tensor): + torch._assert(input.dim() == 3, f"Expected (seq_length, batch_size, hidden_dim) got {input.shape}") + x = self.ln_1(input) + x, _ = self.self_attention(query=x, key=x, value=x, need_weights=False) + x = self.dropout(x) + x = x + input + + y = self.ln_2(x) + y = self.mlp(y) + return x + y + + +class Encoder(nn.Module): + """Transformer Model Encoder for sequence to sequence translation.""" + + def __init__( + self, + seq_length: int, + num_layers: int, + num_heads: int, + hidden_dim: int, + mlp_dim: int, + dropout: float, + attention_dropout: float, + norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6), + ): + super().__init__() + # Note that batch_size is on the first dim because + # we have batch_first=True in nn.MultiAttention() by default + self.pos_embedding = nn.Parameter(torch.empty(1, seq_length, hidden_dim).normal_(std=0.02)) # from BERT + self.dropout = nn.Dropout(dropout) + layers: OrderedDict[str, nn.Module] = OrderedDict() + for i in range(num_layers): + layers[f"encoder_layer_{i}"] = EncoderBlock( + num_heads, + hidden_dim, + mlp_dim, + dropout, + attention_dropout, + norm_layer, + ) + self.layers = nn.Sequential(layers) + self.ln = norm_layer(hidden_dim) + + def forward(self, input: torch.Tensor): + torch._assert(input.dim() == 3, f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}") + input = input + self.pos_embedding + return self.ln(self.layers(self.dropout(input))) + + +class VisionTransformer(nn.Module): + """Vision Transformer as per https://arxiv.org/abs/2010.11929.""" + + def __init__( + self, + image_size: int, + patch_size: int, + num_layers: int, + num_heads: int, + hidden_dim: int, + mlp_dim: int, + dropout: float = 0.0, + attention_dropout: float = 0.0, + num_classes: int = 1000, + representation_size: Optional[int] = None, + norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6), + conv_stem_configs: Optional[List[ConvStemConfig]] = None, + ): + super().__init__() + _log_api_usage_once(self) + torch._assert(image_size % patch_size == 0, "Input shape indivisible by patch size!") + self.image_size = image_size + self.patch_size = patch_size + self.hidden_dim = hidden_dim + self.mlp_dim = mlp_dim + self.attention_dropout = attention_dropout + self.dropout = dropout + self.num_classes = num_classes + self.representation_size = representation_size + self.norm_layer = norm_layer + + if conv_stem_configs is not None: + # As per https://arxiv.org/abs/2106.14881 + seq_proj = nn.Sequential() + prev_channels = 3 + for i, conv_stem_layer_config in enumerate(conv_stem_configs): + seq_proj.add_module( + f"conv_bn_relu_{i}", + ConvNormActivation( + in_channels=prev_channels, + out_channels=conv_stem_layer_config.out_channels, + kernel_size=conv_stem_layer_config.kernel_size, + stride=conv_stem_layer_config.stride, + norm_layer=conv_stem_layer_config.norm_layer, + activation_layer=conv_stem_layer_config.activation_layer, + ), + ) + prev_channels = conv_stem_layer_config.out_channels + seq_proj.add_module( + "conv_last", nn.Conv2d(in_channels=prev_channels, out_channels=hidden_dim, kernel_size=1) + ) + self.conv_proj: nn.Module = seq_proj + else: + self.conv_proj = nn.Conv2d( + in_channels=3, 
out_channels=hidden_dim, kernel_size=patch_size, stride=patch_size + ) + + seq_length = (image_size // patch_size) ** 2 + + # Add a class token + self.class_token = nn.Parameter(torch.zeros(1, 1, hidden_dim)) + seq_length += 1 + + self.encoder = Encoder( + seq_length, + num_layers, + num_heads, + hidden_dim, + mlp_dim, + dropout, + attention_dropout, + norm_layer, + ) + self.seq_length = seq_length + + heads_layers: OrderedDict[str, nn.Module] = OrderedDict() + if representation_size is None: + heads_layers["head"] = nn.Linear(hidden_dim, num_classes) + else: + heads_layers["pre_logits"] = nn.Linear(hidden_dim, representation_size) + heads_layers["act"] = nn.Tanh() + heads_layers["head"] = nn.Linear(representation_size, num_classes) + + self.heads = nn.Sequential(heads_layers) + + if isinstance(self.conv_proj, nn.Conv2d): + # Init the patchify stem + fan_in = self.conv_proj.in_channels * self.conv_proj.kernel_size[0] * self.conv_proj.kernel_size[1] + nn.init.trunc_normal_(self.conv_proj.weight, std=math.sqrt(1 / fan_in)) + if self.conv_proj.bias is not None: + nn.init.zeros_(self.conv_proj.bias) + elif self.conv_proj.conv_last is not None and isinstance(self.conv_proj.conv_last, nn.Conv2d): + # Init the last 1x1 conv of the conv stem + nn.init.normal_( + self.conv_proj.conv_last.weight, mean=0.0, std=math.sqrt(2.0 / self.conv_proj.conv_last.out_channels) + ) + if self.conv_proj.conv_last.bias is not None: + nn.init.zeros_(self.conv_proj.conv_last.bias) + + if hasattr(self.heads, "pre_logits") and isinstance(self.heads.pre_logits, nn.Linear): + fan_in = self.heads.pre_logits.in_features + nn.init.trunc_normal_(self.heads.pre_logits.weight, std=math.sqrt(1 / fan_in)) + nn.init.zeros_(self.heads.pre_logits.bias) + + if isinstance(self.heads.head, nn.Linear): + nn.init.zeros_(self.heads.head.weight) + nn.init.zeros_(self.heads.head.bias) + + def _process_input(self, x: torch.Tensor) -> torch.Tensor: + n, c, h, w = x.shape + p = self.patch_size + torch._assert(h == self.image_size, "Wrong image height!") + torch._assert(w == self.image_size, "Wrong image width!") + n_h = h // p + n_w = w // p + + # (n, c, h, w) -> (n, hidden_dim, n_h, n_w) + x = self.conv_proj(x) + # (n, hidden_dim, n_h, n_w) -> (n, hidden_dim, (n_h * n_w)) + x = x.reshape(n, self.hidden_dim, n_h * n_w) + + # (n, hidden_dim, (n_h * n_w)) -> (n, (n_h * n_w), hidden_dim) + # The self attention layer expects inputs in the format (N, S, E) + # where S is the source sequence length, N is the batch size, E is the + # embedding dimension + x = x.permute(0, 2, 1) + + return x + + def forward(self, x: torch.Tensor): + # Reshape and permute the input tensor + x = self._process_input(x) + n = x.shape[0] + + # Expand the class token to the full batch + batch_class_token = self.class_token.expand(n, -1, -1) + x = torch.cat([batch_class_token, x], dim=1) + + x = self.encoder(x) + + # Classifier "token" as used by standard language architectures + x = x[:, 0] + + x = self.heads(x) + + return x + + +def _vision_transformer( + arch: str, + patch_size: int, + num_layers: int, + num_heads: int, + hidden_dim: int, + mlp_dim: int, + pretrained: bool, + progress: bool, + **kwargs: Any, +) -> VisionTransformer: + image_size = kwargs.pop("image_size", 224) + + model = VisionTransformer( + image_size=image_size, + patch_size=patch_size, + num_layers=num_layers, + num_heads=num_heads, + hidden_dim=hidden_dim, + mlp_dim=mlp_dim, + **kwargs, + ) + + if pretrained: + if arch not in model_urls: + raise ValueError(f"No checkpoint is available for model type 
'{arch}'!") + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + + return model + + +def vit_b_16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: + """ + Constructs a vit_b_16 architecture from + `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vision_transformer( + arch="vit_b_16", + patch_size=16, + num_layers=12, + num_heads=12, + hidden_dim=768, + mlp_dim=3072, + pretrained=pretrained, + progress=progress, + **kwargs, + ) + + +def vit_b_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: + """ + Constructs a vit_b_32 architecture from + `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vision_transformer( + arch="vit_b_32", + patch_size=32, + num_layers=12, + num_heads=12, + hidden_dim=768, + mlp_dim=3072, + pretrained=pretrained, + progress=progress, + **kwargs, + ) + + +def vit_l_16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: + """ + Constructs a vit_l_16 architecture from + `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vision_transformer( + arch="vit_l_16", + patch_size=16, + num_layers=24, + num_heads=16, + hidden_dim=1024, + mlp_dim=4096, + pretrained=pretrained, + progress=progress, + **kwargs, + ) + + +def vit_l_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: + """ + Constructs a vit_l_32 architecture from + `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vision_transformer( + arch="vit_l_32", + patch_size=32, + num_layers=24, + num_heads=16, + hidden_dim=1024, + mlp_dim=4096, + pretrained=pretrained, + progress=progress, + **kwargs, + ) + + +def interpolate_embeddings( + image_size: int, + patch_size: int, + model_state: "OrderedDict[str, torch.Tensor]", + interpolation_mode: str = "bicubic", + reset_heads: bool = False, +) -> "OrderedDict[str, torch.Tensor]": + """This function helps interpolating positional embeddings during checkpoint loading, + especially when you want to apply a pre-trained model on images with different resolution. + + Args: + image_size (int): Image size of the new model. + patch_size (int): Patch size of the new model. + model_state (OrderedDict[str, torch.Tensor]): State dict of the pre-trained model. + interpolation_mode (str): The algorithm used for upsampling. Default: bicubic. + reset_heads (bool): If true, not copying the state of heads. Default: False. + + Returns: + OrderedDict[str, torch.Tensor]: A state dict which can be loaded into the new model. 
+ """ + # Shape of pos_embedding is (1, seq_length, hidden_dim) + pos_embedding = model_state["encoder.pos_embedding"] + n, seq_length, hidden_dim = pos_embedding.shape + if n != 1: + raise ValueError(f"Unexpected position embedding shape: {pos_embedding.shape}") + + new_seq_length = (image_size // patch_size) ** 2 + 1 + + # Need to interpolate the weights for the position embedding. + # We do this by reshaping the positions embeddings to a 2d grid, performing + # an interpolation in the (h, w) space and then reshaping back to a 1d grid. + if new_seq_length != seq_length: + # The class token embedding shouldn't be interpolated so we split it up. + seq_length -= 1 + new_seq_length -= 1 + pos_embedding_token = pos_embedding[:, :1, :] + pos_embedding_img = pos_embedding[:, 1:, :] + + # (1, seq_length, hidden_dim) -> (1, hidden_dim, seq_length) + pos_embedding_img = pos_embedding_img.permute(0, 2, 1) + seq_length_1d = int(math.sqrt(seq_length)) + torch._assert(seq_length_1d * seq_length_1d == seq_length, "seq_length is not a perfect square!") + + # (1, hidden_dim, seq_length) -> (1, hidden_dim, seq_l_1d, seq_l_1d) + pos_embedding_img = pos_embedding_img.reshape(1, hidden_dim, seq_length_1d, seq_length_1d) + new_seq_length_1d = image_size // patch_size + + # Perform interpolation. + # (1, hidden_dim, seq_l_1d, seq_l_1d) -> (1, hidden_dim, new_seq_l_1d, new_seq_l_1d) + new_pos_embedding_img = nn.functional.interpolate( + pos_embedding_img, + size=new_seq_length_1d, + mode=interpolation_mode, + align_corners=True, + ) + + # (1, hidden_dim, new_seq_l_1d, new_seq_l_1d) -> (1, hidden_dim, new_seq_length) + new_pos_embedding_img = new_pos_embedding_img.reshape(1, hidden_dim, new_seq_length) + + # (1, hidden_dim, new_seq_length) -> (1, new_seq_length, hidden_dim) + new_pos_embedding_img = new_pos_embedding_img.permute(0, 2, 1) + new_pos_embedding = torch.cat([pos_embedding_token, new_pos_embedding_img], dim=1) + + model_state["encoder.pos_embedding"] = new_pos_embedding + + if reset_heads: + model_state_copy: "OrderedDict[str, torch.Tensor]" = OrderedDict() + for k, v in model_state.items(): + if not k.startswith("heads"): + model_state_copy[k] = v + model_state = model_state_copy + + return model_state diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/ops/__init__.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a85d1e739fd1a14000efa90e03e11d8563ceb2d0 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/ops/__init__.py @@ -0,0 +1,4 @@ +from .misc import FrozenBatchNorm2d, ConvNormActivation, SqueezeExcitation +from .stochastic_depth import stochastic_depth, StochasticDepth + +__all__ = [k for k in globals().keys() if not k.startswith("_")] diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/ops/misc.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/ops/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..268962e204c87193f9bedfef3b886d2bf96d91a0 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/ops/misc.py @@ -0,0 +1,163 @@ +from typing import Callable, List, Optional + +import torch +from torch import Tensor + +from ..utils import _log_api_usage_once + + +interpolate = torch.nn.functional.interpolate + + +# This is not in nn +class FrozenBatchNorm2d(torch.nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed + + Args: + num_features (int): Number of 
features ``C`` from an expected input of size ``(N, C, H, W)`` + eps (float): a value added to the denominator for numerical stability. Default: 1e-5 + """ + + def __init__( + self, + num_features: int, + eps: float = 1e-5, + ): + super().__init__() + _log_api_usage_once(self) + self.eps = eps + self.register_buffer("weight", torch.ones(num_features)) + self.register_buffer("bias", torch.zeros(num_features)) + self.register_buffer("running_mean", torch.zeros(num_features)) + self.register_buffer("running_var", torch.ones(num_features)) + + def _load_from_state_dict( + self, + state_dict: dict, + prefix: str, + local_metadata: dict, + strict: bool, + missing_keys: List[str], + unexpected_keys: List[str], + error_msgs: List[str], + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x: Tensor) -> Tensor: + # move reshapes to the beginning + # to make it fuser-friendly + w = self.weight.reshape(1, -1, 1, 1) + b = self.bias.reshape(1, -1, 1, 1) + rv = self.running_var.reshape(1, -1, 1, 1) + rm = self.running_mean.reshape(1, -1, 1, 1) + scale = w * (rv + self.eps).rsqrt() + bias = b - rm * scale + return x * scale + bias + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({self.weight.shape[0]}, eps={self.eps})" + + +class ConvNormActivation(torch.nn.Sequential): + """ + Configurable block used for Convolution-Normalzation-Activation blocks. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the Convolution-Normalzation-Activation block + kernel_size: (int, optional): Size of the convolving kernel. Default: 3 + stride (int, optional): Stride of the convolution. Default: 1 + padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in wich case it will calculated as ``padding = (kernel_size - 1) // 2 * dilation`` + groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 + norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolutiuon layer. If ``None`` this layer wont be used. Default: ``torch.nn.BatchNorm2d`` + activation_layer (Callable[..., torch.nn.Module], optinal): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``torch.nn.ReLU`` + dilation (int): Spacing between kernel elements. Default: 1 + inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True`` + bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``. 
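`FrozenBatchNorm2d.forward` above folds the four registered buffers into a single per-channel `scale` and `bias` before touching the input, which keeps the op fuser-friendly while reproducing eval-mode batch norm exactly. A quick numerical check, assuming the vendored `_torchvision.ops` package:

```python
import torch
from _torchvision.ops import FrozenBatchNorm2d

torch.manual_seed(0)
fbn = FrozenBatchNorm2d(4)
# Give the frozen statistics some non-trivial values.
fbn.running_mean.uniform_(-1.0, 1.0)
fbn.running_var.uniform_(0.5, 2.0)

x = torch.randn(2, 4, 8, 8)
out = fbn(x)

# Reference: eval-mode batch norm with the same (frozen) statistics.
ref = torch.nn.functional.batch_norm(
    x, fbn.running_mean, fbn.running_var, fbn.weight, fbn.bias,
    training=False, eps=fbn.eps,
)
print(torch.allclose(out, ref, atol=1e-6))  # True
```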
+ + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int = 3, + stride: int = 1, + padding: Optional[int] = None, + groups: int = 1, + norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm2d, + activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU, + dilation: int = 1, + inplace: Optional[bool] = True, + bias: Optional[bool] = None, + ) -> None: + if padding is None: + padding = (kernel_size - 1) // 2 * dilation + if bias is None: + bias = norm_layer is None + layers = [ + torch.nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=dilation, + groups=groups, + bias=bias, + ) + ] + if norm_layer is not None: + layers.append(norm_layer(out_channels)) + if activation_layer is not None: + params = {} if inplace is None else {"inplace": inplace} + layers.append(activation_layer(**params)) + super().__init__(*layers) + _log_api_usage_once(self) + self.out_channels = out_channels + + +class SqueezeExcitation(torch.nn.Module): + """ + This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1). + Parameters ``activation``, and ``scale_activation`` correspond to ``delta`` and ``sigma`` in in eq. 3. + + Args: + input_channels (int): Number of channels in the input image + squeeze_channels (int): Number of squeeze channels + activation (Callable[..., torch.nn.Module], optional): ``delta`` activation. Default: ``torch.nn.ReLU`` + scale_activation (Callable[..., torch.nn.Module]): ``sigma`` activation. Default: ``torch.nn.Sigmoid`` + """ + + def __init__( + self, + input_channels: int, + squeeze_channels: int, + activation: Callable[..., torch.nn.Module] = torch.nn.ReLU, + scale_activation: Callable[..., torch.nn.Module] = torch.nn.Sigmoid, + ) -> None: + super().__init__() + _log_api_usage_once(self) + self.avgpool = torch.nn.AdaptiveAvgPool2d(1) + self.fc1 = torch.nn.Conv2d(input_channels, squeeze_channels, 1) + self.fc2 = torch.nn.Conv2d(squeeze_channels, input_channels, 1) + self.activation = activation() + self.scale_activation = scale_activation() + + def _scale(self, input: Tensor) -> Tensor: + scale = self.avgpool(input) + scale = self.fc1(scale) + scale = self.activation(scale) + scale = self.fc2(scale) + return self.scale_activation(scale) + + def forward(self, input: Tensor) -> Tensor: + scale = self._scale(input) + return scale * input diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/ops/stochastic_depth.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/ops/stochastic_depth.py new file mode 100644 index 0000000000000000000000000000000000000000..ff8167b2315e941f7e31a0626eeec270d350a710 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/ops/stochastic_depth.py @@ -0,0 +1,66 @@ +import torch +import torch.fx +from torch import nn, Tensor + +from ..utils import _log_api_usage_once + + +def stochastic_depth(input: Tensor, p: float, mode: str, training: bool = True) -> Tensor: + """ + Implements the Stochastic Depth from `"Deep Networks with Stochastic Depth" + `_ used for randomly dropping residual + branches of residual architectures. + + Args: + input (Tensor[N, ...]): The input tensor or arbitrary dimensions with the first one + being its batch i.e. a batch with ``N`` rows. + p (float): probability of the input to be zeroed. + mode (str): ``"batch"`` or ``"row"``. + ``"batch"`` randomly zeroes the entire input, ``"row"`` zeroes + randomly selected rows from the batch. 
+ training: apply stochastic depth if is ``True``. Default: ``True`` + + Returns: + Tensor[N, ...]: The randomly zeroed tensor. + """ + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): + _log_api_usage_once(stochastic_depth) + if p < 0.0 or p > 1.0: + raise ValueError(f"drop probability has to be between 0 and 1, but got {p}") + if mode not in ["batch", "row"]: + raise ValueError(f"mode has to be either 'batch' or 'row', but got {mode}") + if not training or p == 0.0: + return input + + survival_rate = 1.0 - p + if mode == "row": + size = [input.shape[0]] + [1] * (input.ndim - 1) + else: + size = [1] * input.ndim + noise = torch.empty(size, dtype=input.dtype, device=input.device) + noise = noise.bernoulli_(survival_rate) + if survival_rate > 0.0: + noise.div_(survival_rate) + return input * noise + + +torch.fx.wrap("stochastic_depth") + + +class StochasticDepth(nn.Module): + """ + See :func:`stochastic_depth`. + """ + + def __init__(self, p: float, mode: str) -> None: + super().__init__() + _log_api_usage_once(self) + self.p = p + self.mode = mode + + def forward(self, input: Tensor) -> Tensor: + return stochastic_depth(input, self.p, self.mode, self.training) + + def __repr__(self) -> str: + s = f"{self.__class__.__name__}(p={self.p}, mode={self.mode})" + return s diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/utils.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d03802d9a47ab2674e9f36add7be77197fc6ca2f --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/utils.py @@ -0,0 +1,548 @@ +import math +import pathlib +import warnings +from types import FunctionType +from typing import Any, BinaryIO, List, Optional, Tuple, Union + +import numpy as np +import torch +from PIL import Image, ImageColor, ImageDraw, ImageFont + +__all__ = [ + "make_grid", + "save_image", + "draw_bounding_boxes", + "draw_segmentation_masks", + "draw_keypoints", + "flow_to_image", +] + + +@torch.no_grad() +def make_grid( + tensor: Union[torch.Tensor, List[torch.Tensor]], + nrow: int = 8, + padding: int = 2, + normalize: bool = False, + value_range: Optional[Tuple[int, int]] = None, + scale_each: bool = False, + pad_value: float = 0.0, + **kwargs, +) -> torch.Tensor: + """ + Make a grid of images. + + Args: + tensor (Tensor or list): 4D mini-batch Tensor of shape (B x C x H x W) + or a list of images all of the same size. + nrow (int, optional): Number of images displayed in each row of the grid. + The final grid size is ``(B / nrow, nrow)``. Default: ``8``. + padding (int, optional): amount of padding. Default: ``2``. + normalize (bool, optional): If True, shift the image to the range (0, 1), + by the min and max values specified by ``value_range``. Default: ``False``. + value_range (tuple, optional): tuple (min, max) where min and max are numbers, + then these numbers are used to normalize the image. By default, min and max + are computed from the tensor. + range (tuple. optional): + .. warning:: + This parameter was deprecated in ``0.12`` and will be removed in ``0.14``. Please use ``value_range`` + instead. + scale_each (bool, optional): If ``True``, scale each image in the batch of + images separately rather than the (min, max) over all images. Default: ``False``. + pad_value (float, optional): Value for the padded pixels. Default: ``0``. + + Returns: + grid (Tensor): the tensor containing grid of images. 
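`stochastic_depth` zeroes whole residual branches during training and rescales the survivors by `1 / (1 - p)`, so the output matches the input in expectation; in `"row"` mode the decision is made independently per sample, in `"batch"` mode once for the whole batch. A small sketch of both properties using the module wrapper:

```python
import torch
from _torchvision.ops import StochasticDepth

torch.manual_seed(0)
sd = StochasticDepth(p=0.2, mode="row")

x = torch.ones(10000, 8)
out = sd(x)  # a freshly built nn.Module is in training mode by default

# Roughly 20% of the rows are zeroed, the rest are scaled by 1 / 0.8 = 1.25.
dropped = (out.sum(dim=1) == 0).float().mean().item()
print(f"dropped rows ~ {dropped:.2f}")   # about 0.20
print(out.mean().item())                 # about 1.0, expectation preserved

sd.eval()
print(torch.equal(sd(x), x))             # True: identity at eval time
```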
+ """ + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): + _log_api_usage_once(make_grid) + if not (torch.is_tensor(tensor) or (isinstance(tensor, list) and all(torch.is_tensor(t) for t in tensor))): + raise TypeError(f"tensor or list of tensors expected, got {type(tensor)}") + + if "range" in kwargs.keys(): + warnings.warn( + "The parameter 'range' is deprecated since 0.12 and will be removed in 0.14. " + "Please use 'value_range' instead." + ) + value_range = kwargs["range"] + + # if list of tensors, convert to a 4D mini-batch Tensor + if isinstance(tensor, list): + tensor = torch.stack(tensor, dim=0) + + if tensor.dim() == 2: # single image H x W + tensor = tensor.unsqueeze(0) + if tensor.dim() == 3: # single image + if tensor.size(0) == 1: # if single-channel, convert to 3-channel + tensor = torch.cat((tensor, tensor, tensor), 0) + tensor = tensor.unsqueeze(0) + + if tensor.dim() == 4 and tensor.size(1) == 1: # single-channel images + tensor = torch.cat((tensor, tensor, tensor), 1) + + if normalize is True: + tensor = tensor.clone() # avoid modifying tensor in-place + if value_range is not None: + assert isinstance( + value_range, tuple + ), "value_range has to be a tuple (min, max) if specified. min and max are numbers" + + def norm_ip(img, low, high): + img.clamp_(min=low, max=high) + img.sub_(low).div_(max(high - low, 1e-5)) + + def norm_range(t, value_range): + if value_range is not None: + norm_ip(t, value_range[0], value_range[1]) + else: + norm_ip(t, float(t.min()), float(t.max())) + + if scale_each is True: + for t in tensor: # loop over mini-batch dimension + norm_range(t, value_range) + else: + norm_range(tensor, value_range) + + assert isinstance(tensor, torch.Tensor) + if tensor.size(0) == 1: + return tensor.squeeze(0) + + # make the mini-batch of images into a grid + nmaps = tensor.size(0) + xmaps = min(nrow, nmaps) + ymaps = int(math.ceil(float(nmaps) / xmaps)) + height, width = int(tensor.size(2) + padding), int(tensor.size(3) + padding) + num_channels = tensor.size(1) + grid = tensor.new_full((num_channels, height * ymaps + padding, width * xmaps + padding), pad_value) + k = 0 + for y in range(ymaps): + for x in range(xmaps): + if k >= nmaps: + break + # Tensor.copy_() is a valid method but seems to be missing from the stubs + # https://pytorch.org/docs/stable/tensors.html#torch.Tensor.copy_ + grid.narrow(1, y * height + padding, height - padding).narrow( # type: ignore[attr-defined] + 2, x * width + padding, width - padding + ).copy_(tensor[k]) + k = k + 1 + return grid + + +@torch.no_grad() +def save_image( + tensor: Union[torch.Tensor, List[torch.Tensor]], + fp: Union[str, pathlib.Path, BinaryIO], + format: Optional[str] = None, + **kwargs, +) -> None: + """ + Save a given Tensor into an image file. + + Args: + tensor (Tensor or list): Image to be saved. If given a mini-batch tensor, + saves the tensor as a grid of images by calling ``make_grid``. + fp (string or file object): A filename or a file object + format(Optional): If omitted, the format to use is determined from the filename extension. + If a file object was used instead of a filename, this parameter should always be used. + **kwargs: Other arguments are documented in ``make_grid``. 
+ """ + + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): + _log_api_usage_once(save_image) + grid = make_grid(tensor, **kwargs) + # Add 0.5 after unnormalizing to [0, 255] to round to nearest integer + ndarr = grid.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy() + im = Image.fromarray(ndarr) + im.save(fp, format=format) + + +@torch.no_grad() +def draw_bounding_boxes( + image: torch.Tensor, + boxes: torch.Tensor, + labels: Optional[List[str]] = None, + colors: Optional[Union[List[Union[str, Tuple[int, int, int]]], str, Tuple[int, int, int]]] = None, + fill: Optional[bool] = False, + width: int = 1, + font: Optional[str] = None, + font_size: int = 10, +) -> torch.Tensor: + + """ + Draws bounding boxes on given image. + The values of the input image should be uint8 between 0 and 255. + If fill is True, Resulting Tensor should be saved as PNG image. + + Args: + image (Tensor): Tensor of shape (C x H x W) and dtype uint8. + boxes (Tensor): Tensor of size (N, 4) containing bounding boxes in (xmin, ymin, xmax, ymax) format. Note that + the boxes are absolute coordinates with respect to the image. In other words: `0 <= xmin < xmax < W` and + `0 <= ymin < ymax < H`. + labels (List[str]): List containing the labels of bounding boxes. + colors (color or list of colors, optional): List containing the colors + of the boxes or single color for all boxes. The color can be represented as + PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``. + By default, random colors are generated for boxes. + fill (bool): If `True` fills the bounding box with specified color. + width (int): Width of bounding box. + font (str): A filename containing a TrueType font. If the file is not found in this filename, the loader may + also search in other directories, such as the `fonts/` directory on Windows or `/Library/Fonts/`, + `/System/Library/Fonts/` and `~/Library/Fonts/` on macOS. + font_size (int): The requested font size in points. + + Returns: + img (Tensor[C, H, W]): Image Tensor of dtype uint8 with bounding boxes plotted. + """ + + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): + _log_api_usage_once(draw_bounding_boxes) + if not isinstance(image, torch.Tensor): + raise TypeError(f"Tensor expected, got {type(image)}") + elif image.dtype != torch.uint8: + raise ValueError(f"Tensor uint8 expected, got {image.dtype}") + elif image.dim() != 3: + raise ValueError("Pass individual images, not batches") + elif image.size(0) not in {1, 3}: + raise ValueError("Only grayscale and RGB images are supported") + + num_boxes = boxes.shape[0] + + if labels is None: + labels: Union[List[str], List[None]] = [None] * num_boxes # type: ignore[no-redef] + elif len(labels) != num_boxes: + raise ValueError( + f"Number of boxes ({num_boxes}) and labels ({len(labels)}) mismatch. Please specify labels for each box." + ) + + if colors is None: + colors = _generate_color_palette(num_boxes) + elif isinstance(colors, list): + if len(colors) < num_boxes: + raise ValueError(f"Number of colors ({len(colors)}) is less than number of boxes ({num_boxes}). 
") + else: # colors specifies a single color for all boxes + colors = [colors] * num_boxes + + colors = [(ImageColor.getrgb(color) if isinstance(color, str) else color) for color in colors] + + # Handle Grayscale images + if image.size(0) == 1: + image = torch.tile(image, (3, 1, 1)) + + ndarr = image.permute(1, 2, 0).cpu().numpy() + img_to_draw = Image.fromarray(ndarr) + img_boxes = boxes.to(torch.int64).tolist() + + if fill: + draw = ImageDraw.Draw(img_to_draw, "RGBA") + else: + draw = ImageDraw.Draw(img_to_draw) + + txt_font = ImageFont.load_default() if font is None else ImageFont.truetype(font=font, size=font_size) + + for bbox, color, label in zip(img_boxes, colors, labels): # type: ignore[arg-type] + if fill: + fill_color = color + (100,) + draw.rectangle(bbox, width=width, outline=color, fill=fill_color) + else: + draw.rectangle(bbox, width=width, outline=color) + + if label is not None: + margin = width + 1 + draw.text((bbox[0] + margin, bbox[1] + margin), label, fill=color, font=txt_font) + + return torch.from_numpy(np.array(img_to_draw)).permute(2, 0, 1).to(dtype=torch.uint8) + + +@torch.no_grad() +def draw_segmentation_masks( + image: torch.Tensor, + masks: torch.Tensor, + alpha: float = 0.8, + colors: Optional[Union[List[Union[str, Tuple[int, int, int]]], str, Tuple[int, int, int]]] = None, +) -> torch.Tensor: + + """ + Draws segmentation masks on given RGB image. + The values of the input image should be uint8 between 0 and 255. + + Args: + image (Tensor): Tensor of shape (3, H, W) and dtype uint8. + masks (Tensor): Tensor of shape (num_masks, H, W) or (H, W) and dtype bool. + alpha (float): Float number between 0 and 1 denoting the transparency of the masks. + 0 means full transparency, 1 means no transparency. + colors (color or list of colors, optional): List containing the colors + of the masks or single color for all masks. The color can be represented as + PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``. + By default, random colors are generated for each mask. + + Returns: + img (Tensor[C, H, W]): Image Tensor, with segmentation masks drawn on top. + """ + + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): + _log_api_usage_once(draw_segmentation_masks) + if not isinstance(image, torch.Tensor): + raise TypeError(f"The image must be a tensor, got {type(image)}") + elif image.dtype != torch.uint8: + raise ValueError(f"The image dtype must be uint8, got {image.dtype}") + elif image.dim() != 3: + raise ValueError("Pass individual images, not batches") + elif image.size()[0] != 3: + raise ValueError("Pass an RGB image. Other Image formats are not supported") + if masks.ndim == 2: + masks = masks[None, :, :] + if masks.ndim != 3: + raise ValueError("masks must be of shape (H, W) or (batch_size, H, W)") + if masks.dtype != torch.bool: + raise ValueError(f"The masks must be of dtype bool. 
Got {masks.dtype}") + if masks.shape[-2:] != image.shape[-2:]: + raise ValueError("The image and the masks must have the same height and width") + + num_masks = masks.size()[0] + if colors is not None and num_masks > len(colors): + raise ValueError(f"There are more masks ({num_masks}) than colors ({len(colors)})") + + if colors is None: + colors = _generate_color_palette(num_masks) + + if not isinstance(colors, list): + colors = [colors] + if not isinstance(colors[0], (tuple, str)): + raise ValueError("colors must be a tuple or a string, or a list thereof") + if isinstance(colors[0], tuple) and len(colors[0]) != 3: + raise ValueError("It seems that you passed a tuple of colors instead of a list of colors") + + out_dtype = torch.uint8 + + colors_ = [] + for color in colors: + if isinstance(color, str): + color = ImageColor.getrgb(color) + colors_.append(torch.tensor(color, dtype=out_dtype)) + + img_to_draw = image.detach().clone() + # TODO: There might be a way to vectorize this + for mask, color in zip(masks, colors_): + img_to_draw[:, mask] = color[:, None] + + out = image * (1 - alpha) + img_to_draw * alpha + return out.to(out_dtype) + + +@torch.no_grad() +def draw_keypoints( + image: torch.Tensor, + keypoints: torch.Tensor, + connectivity: Optional[List[Tuple[int, int]]] = None, + colors: Optional[Union[str, Tuple[int, int, int]]] = None, + radius: int = 2, + width: int = 3, +) -> torch.Tensor: + + """ + Draws Keypoints on given RGB image. + The values of the input image should be uint8 between 0 and 255. + + Args: + image (Tensor): Tensor of shape (3, H, W) and dtype uint8. + keypoints (Tensor): Tensor of shape (num_instances, K, 2) the K keypoints location for each of the N instances, + in the format [x, y]. + connectivity (List[Tuple[int, int]]]): A List of tuple where, + each tuple contains pair of keypoints to be connected. + colors (str, Tuple): The color can be represented as + PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``. + radius (int): Integer denoting radius of keypoint. + width (int): Integer denoting width of line connecting keypoints. + + Returns: + img (Tensor[C, H, W]): Image Tensor of dtype uint8 with keypoints drawn. + """ + + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): + _log_api_usage_once(draw_keypoints) + if not isinstance(image, torch.Tensor): + raise TypeError(f"The image must be a tensor, got {type(image)}") + elif image.dtype != torch.uint8: + raise ValueError(f"The image dtype must be uint8, got {image.dtype}") + elif image.dim() != 3: + raise ValueError("Pass individual images, not batches") + elif image.size()[0] != 3: + raise ValueError("Pass an RGB image. 
Other Image formats are not supported") + + if keypoints.ndim != 3: + raise ValueError("keypoints must be of shape (num_instances, K, 2)") + + ndarr = image.permute(1, 2, 0).cpu().numpy() + img_to_draw = Image.fromarray(ndarr) + draw = ImageDraw.Draw(img_to_draw) + img_kpts = keypoints.to(torch.int64).tolist() + + for kpt_id, kpt_inst in enumerate(img_kpts): + for inst_id, kpt in enumerate(kpt_inst): + x1 = kpt[0] - radius + x2 = kpt[0] + radius + y1 = kpt[1] - radius + y2 = kpt[1] + radius + draw.ellipse([x1, y1, x2, y2], fill=colors, outline=None, width=0) + + if connectivity: + for connection in connectivity: + start_pt_x = kpt_inst[connection[0]][0] + start_pt_y = kpt_inst[connection[0]][1] + + end_pt_x = kpt_inst[connection[1]][0] + end_pt_y = kpt_inst[connection[1]][1] + + draw.line( + ((start_pt_x, start_pt_y), (end_pt_x, end_pt_y)), + width=width, + ) + + return torch.from_numpy(np.array(img_to_draw)).permute(2, 0, 1).to(dtype=torch.uint8) + + +# Flow visualization code adapted from https://github.com/tomrunia/OpticalFlow_Visualization +@torch.no_grad() +def flow_to_image(flow: torch.Tensor) -> torch.Tensor: + + """ + Converts a flow to an RGB image. + + Args: + flow (Tensor): Flow of shape (N, 2, H, W) or (2, H, W) and dtype torch.float. + + Returns: + img (Tensor): Image Tensor of dtype uint8 where each color corresponds + to a given flow direction. Shape is (N, 3, H, W) or (3, H, W) depending on the input. + """ + + if flow.dtype != torch.float: + raise ValueError(f"Flow should be of dtype torch.float, got {flow.dtype}.") + + orig_shape = flow.shape + if flow.ndim == 3: + flow = flow[None] # Add batch dim + + if flow.ndim != 4 or flow.shape[1] != 2: + raise ValueError(f"Input flow should have shape (2, H, W) or (N, 2, H, W), got {orig_shape}.") + + max_norm = torch.sum(flow ** 2, dim=1).sqrt().max() + epsilon = torch.finfo((flow).dtype).eps + normalized_flow = flow / (max_norm + epsilon) + img = _normalized_flow_to_image(normalized_flow) + + if len(orig_shape) == 3: + img = img[0] # Remove batch dim + return img + + +@torch.no_grad() +def _normalized_flow_to_image(normalized_flow: torch.Tensor) -> torch.Tensor: + + """ + Converts a batch of normalized flow to an RGB image. + + Args: + normalized_flow (torch.Tensor): Normalized flow tensor of shape (N, 2, H, W) + Returns: + img (Tensor(N, 3, H, W)): Flow visualization image of dtype uint8. + """ + + N, _, H, W = normalized_flow.shape + flow_image = torch.zeros((N, 3, H, W), dtype=torch.uint8) + colorwheel = _make_colorwheel() # shape [55x3] + num_cols = colorwheel.shape[0] + norm = torch.sum(normalized_flow ** 2, dim=1).sqrt() + a = torch.atan2(-normalized_flow[:, 1, :, :], -normalized_flow[:, 0, :, :]) / torch.pi + fk = (a + 1) / 2 * (num_cols - 1) + k0 = torch.floor(fk).to(torch.long) + k1 = k0 + 1 + k1[k1 == num_cols] = 0 + f = fk - k0 + + for c in range(colorwheel.shape[1]): + tmp = colorwheel[:, c] + col0 = tmp[k0] / 255.0 + col1 = tmp[k1] / 255.0 + col = (1 - f) * col0 + f * col1 + col = 1 - norm * (1 - col) + flow_image[:, c, :, :] = torch.floor(255 * col) + return flow_image + + +def _make_colorwheel() -> torch.Tensor: + """ + Generates a color wheel for optical flow visualization as presented in: + Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007) + URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf. + + Returns: + colorwheel (Tensor[55, 3]): Colorwheel Tensor. 
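`flow_to_image` normalizes the flow by its largest magnitude and maps direction to hue through the Middlebury-style color wheel built below, so color encodes direction and saturation encodes relative speed. A minimal sketch on a synthetic horizontal flow field:

```python
import torch
from _torchvision.utils import flow_to_image

# Synthetic flow: horizontal displacement grows left to right, vertical is zero.
h, w = 64, 64
dx = torch.linspace(-1.0, 1.0, w).expand(h, w)
flow = torch.stack([dx, torch.zeros(h, w)])  # shape (2, H, W), dtype float32

img = flow_to_image(flow)
print(img.shape, img.dtype)                  # torch.Size([3, 64, 64]) torch.uint8
```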
+ """ + + RY = 15 + YG = 6 + GC = 4 + CB = 11 + BM = 13 + MR = 6 + + ncols = RY + YG + GC + CB + BM + MR + colorwheel = torch.zeros((ncols, 3)) + col = 0 + + # RY + colorwheel[0:RY, 0] = 255 + colorwheel[0:RY, 1] = torch.floor(255 * torch.arange(0, RY) / RY) + col = col + RY + # YG + colorwheel[col : col + YG, 0] = 255 - torch.floor(255 * torch.arange(0, YG) / YG) + colorwheel[col : col + YG, 1] = 255 + col = col + YG + # GC + colorwheel[col : col + GC, 1] = 255 + colorwheel[col : col + GC, 2] = torch.floor(255 * torch.arange(0, GC) / GC) + col = col + GC + # CB + colorwheel[col : col + CB, 1] = 255 - torch.floor(255 * torch.arange(CB) / CB) + colorwheel[col : col + CB, 2] = 255 + col = col + CB + # BM + colorwheel[col : col + BM, 2] = 255 + colorwheel[col : col + BM, 0] = torch.floor(255 * torch.arange(0, BM) / BM) + col = col + BM + # MR + colorwheel[col : col + MR, 2] = 255 - torch.floor(255 * torch.arange(MR) / MR) + colorwheel[col : col + MR, 0] = 255 + return colorwheel + + +def _generate_color_palette(num_objects: int): + palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1]) + return [tuple((i * palette) % 255) for i in range(num_objects)] + + +def _log_api_usage_once(obj: Any) -> None: + + """ + Logs API usage(module and name) within an organization. + In a large ecosystem, it's often useful to track the PyTorch and + TorchVision APIs usage. This API provides the similar functionality to the + logging module in the Python stdlib. It can be used for debugging purpose + to log which methods are used and by default it is inactive, unless the user + manually subscribes a logger via the `SetAPIUsageLogger method `_. + Please note it is triggered only once for the same API call within a process. + It does not collect any data from open-source users since it is no-op by default. + For more information, please refer to + * PyTorch note: https://pytorch.org/docs/stable/notes/large_scale_deployments.html#api-usage-logging; + * Logging policy: https://github.com/pytorch/vision/issues/5052; + + Args: + obj (class instance or method): an object to extract info from. + """ + if not obj.__module__.startswith("torchvision"): + return + name = obj.__class__.__name__ + if isinstance(obj, FunctionType): + name = obj.__name__ + torch._C._log_api_usage_once(f"{obj.__module__}.{name}") diff --git a/cv/classification/efficientnet_b4/pytorch/common_utils/smooth_value.py b/cv/classification/efficientnet_b4/pytorch/common_utils/smooth_value.py index 7c5fa1179da79e495eae3d591c322865a76b6dfc..ce54196fe3088672f04b1b1edc432fc8023c061e 100755 --- a/cv/classification/efficientnet_b4/pytorch/common_utils/smooth_value.py +++ b/cv/classification/efficientnet_b4/pytorch/common_utils/smooth_value.py @@ -1,6 +1,6 @@ -# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. - +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
from collections import defaultdict, deque import datetime @@ -37,7 +37,7 @@ class SmoothedValue(object): """ if not is_dist_avail_and_initialized(): return - t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') + t = torch.tensor([self.count, self.total], dtype=torch.float32, device='cuda') dist.barrier() dist.all_reduce(t) t = t.tolist() @@ -72,4 +72,4 @@ class SmoothedValue(object): avg=self.avg, global_avg=self.global_avg, max=self.max, - value=self.value) \ No newline at end of file + value=self.value) diff --git a/cv/classification/efficientnet_b4/pytorch/train.py b/cv/classification/efficientnet_b4/pytorch/train.py index 449cf2544553612fbac89bd9e30a6880cca601fb..9ba7b468b380ea1735cd6be9823b85c34e58a3cb 100755 --- a/cv/classification/efficientnet_b4/pytorch/train.py +++ b/cv/classification/efficientnet_b4/pytorch/train.py @@ -1,4 +1,6 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. import datetime import os @@ -6,6 +8,7 @@ import sys sys.path.append(os.path.join(os.path.dirname(__file__), '..')) import time +import math import torch import torch.utils.data @@ -20,7 +23,7 @@ except: from torch import nn import torch.distributed as dist -import torchvision as torchvision +import _torchvision as torchvision import utils @@ -74,6 +77,10 @@ def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, pri acc1, acc5 = utils.accuracy(output, target, topk=(1, 5)) batch_size = image.shape[0] + loss_value = loss.item() + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) metric_logger.update(loss=loss.item(), lr=optimizer.param_groups[0]["lr"]) metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) @@ -194,6 +201,7 @@ def main(args): print("Start training") start_time = time.time() + best_acc = 0 for epoch in range(args.start_epoch, args.epochs): epoch_start_time = time.time() if args.distributed and not args.dali: @@ -204,23 +212,21 @@ def main(args): if acc_avg > args.acc_thresh: print("The accuracy has been exceeded {},and the training is terminated at epoch {}".format(args.acc_thresh, epoch)) return - - if args.output_dir: - checkpoint = { - 'model': model_without_ddp.state_dict(), - 'optimizer': optimizer.state_dict(), - 'lr_scheduler': lr_scheduler.state_dict(), - 'epoch': epoch, - 'args': args} - utils.save_on_master( - checkpoint, - os.path.join(args.output_dir, 'model_{}.pth'.format(epoch))) - utils.save_on_master( - checkpoint, - os.path.join(args.output_dir, 'checkpoint.pth')) - epoch_total_time = time.time() - epoch_start_time - epoch_total_time_str = str(datetime.timedelta(seconds=int(epoch_total_time))) - print('epoch time {}'.format(epoch_total_time_str)) + if acc_avg > best_acc: + if args.output_dir: + checkpoint = { + 'model': model_without_ddp.state_dict(), + 'optimizer': optimizer.state_dict(), + 'lr_scheduler': lr_scheduler.state_dict(), + 'epoch': epoch, + 'args': args} + utils.save_on_master( + checkpoint, + os.path.join(args.output_dir, 'model_best.pth')) + epoch_total_time = time.time() - epoch_start_time + epoch_total_time_str = str(datetime.timedelta(seconds=int(epoch_total_time))) + print('epoch time {}'.format(epoch_total_time_str)) + best_acc = acc_avg if args.dali: data_loader.reset() @@ -301,7 +307,7 @@ def get_args_parser(add_help=True): ) # distributed training parameters - 
parser.add_argument('--local_rank', default=-1, type=int, + parser.add_argument('--local_rank', '--local-rank', default=-1, type=int, help='Local rank') parser.add_argument('--world-size', default=1, type=int, help='number of distributed processes') @@ -328,4 +334,9 @@ def check_args(args): if __name__ == "__main__": args = get_args_parser().parse_args() args = check_args(args) + try: + from dltest import show_training_arguments + show_training_arguments(args) + except: + pass main(args)
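Beyond the import switch to the vendored `_torchvision`, the `train.py` hunks change two behaviours: training now aborts as soon as the loss turns non-finite, and a checkpoint is written only when the evaluated accuracy beats the best seen so far (a single `model_best.pth` instead of one file per epoch). A condensed sketch of that pattern; `evaluate`, `save_fn`, and the loop body are placeholders for the script's real objects, not its actual API:

```python
import math
import sys


def train_with_best_checkpoint(model, optimizer, lr_scheduler, data_loader,
                               evaluate, save_fn, epochs):
    """Illustrative skeleton of the modified loop in train.py (names are placeholders)."""
    best_acc = 0.0
    for epoch in range(epochs):
        for image, target, loss in data_loader:   # stand-in for the real inner loop
            loss_value = loss.item()
            if not math.isfinite(loss_value):     # same guard the diff adds
                print(f"Loss is {loss_value}, stopping training")
                sys.exit(1)

        acc_avg = evaluate(model)
        if acc_avg > best_acc:                    # only keep the best checkpoint
            save_fn({"model": model.state_dict(),
                     "optimizer": optimizer.state_dict(),
                     "lr_scheduler": lr_scheduler.state_dict(),
                     "epoch": epoch},
                    "model_best.pth")
            best_acc = acc_avg
```

Keeping only the best checkpoint trades per-epoch resumability for disk space, which matches the script's new single-file `model_best.pth` behaviour.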