diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/__init__.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e419fff0d973284773a0042b507d03a2fb46dd4
--- /dev/null
+++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/__init__.py
@@ -0,0 +1,3 @@
+from . import models
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/_internally_replaced_utils.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/_internally_replaced_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..18afc3ed93a8272600d73cc240047a0a49f23991
--- /dev/null
+++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/_internally_replaced_utils.py
@@ -0,0 +1,58 @@
+import importlib.machinery
+import os
+
+from torch.hub import _get_torch_home
+
+
+_HOME = os.path.join(_get_torch_home(), "datasets", "vision")
+_USE_SHARDED_DATASETS = False
+
+
+def _download_file_from_remote_location(fpath: str, url: str) -> None:
+    pass
+
+
+def _is_remote_location_available() -> bool:
+    return False
+
+
+try:
+    from torch.hub import load_state_dict_from_url  # noqa: 401
+except ImportError:
+    from torch.utils.model_zoo import load_url as load_state_dict_from_url  # noqa: 401
+
+
+def _get_extension_path(lib_name):
+
+    lib_dir = os.path.dirname(__file__)
+    if os.name == "nt":
+        # Register the main torchvision library location on the default DLL path
+        import ctypes
+        import sys
+
+        kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True)
+        with_load_library_flags = hasattr(kernel32, "AddDllDirectory")
+        prev_error_mode = kernel32.SetErrorMode(0x0001)
+
+        if with_load_library_flags:
+            kernel32.AddDllDirectory.restype = ctypes.c_void_p
+
+        if sys.version_info >= (3, 8):
+            os.add_dll_directory(lib_dir)
+        elif with_load_library_flags:
+            res = kernel32.AddDllDirectory(lib_dir)
+            if res is None:
+                err = ctypes.WinError(ctypes.get_last_error())
+                err.strerror += f' Error adding "{lib_dir}" to the DLL directories.'
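+                # AddDllDirectory returned NULL, meaning the DLL search path could not
+                # be extended; surface a WinError that names the directory that failed.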
+ raise err + + kernel32.SetErrorMode(prev_error_mode) + + loader_details = (importlib.machinery.ExtensionFileLoader, importlib.machinery.EXTENSION_SUFFIXES) + + extfinder = importlib.machinery.FileFinder(lib_dir, loader_details) + ext_specs = extfinder.find_spec(lib_name) + if ext_specs is None: + raise ImportError + + return ext_specs.origin diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/__init__.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b6d04500cd4585b18a44150a8fc37a21264f2f41 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/__init__.py @@ -0,0 +1,14 @@ +from .alexnet import * +from .convnext import * +from .resnet import * +from .vgg import * +from .squeezenet import * +from .inception import * +from .densenet import * +from .googlenet import * +from .mobilenet import * +from .mnasnet import * +from .shufflenetv2 import * +from .efficientnet import * +from .regnet import * +from .vision_transformer import * diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/_utils.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f4e1cd845088f491bbb29ee9b4b2548063dc730a --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/_utils.py @@ -0,0 +1,83 @@ +from collections import OrderedDict +from typing import Dict, Optional + +from torch import nn + + +class IntermediateLayerGetter(nn.ModuleDict): + """ + Module wrapper that returns intermediate layers from a model + + It has a strong assumption that the modules have been registered + into the model in the same order as they are used. + This means that one should **not** reuse the same nn.Module + twice in the forward if you want this to work. + + Additionally, it is only able to query submodules that are directly + assigned to the model. So if `model` is passed, `model.feature1` can + be returned, but not `model.feature1.layer2`. + + Args: + model (nn.Module): model on which we will extract the features + return_layers (Dict[name, new_name]): a dict containing the names + of the modules for which the activations will be returned as + the key of the dict, and the value of the dict is the name + of the returned activation (which the user can specify). 
+ + Examples:: + + >>> m = torchvision.models.resnet18(pretrained=True) + >>> # extract layer1 and layer3, giving as names `feat1` and feat2` + >>> new_m = torchvision.models._utils.IntermediateLayerGetter(m, + >>> {'layer1': 'feat1', 'layer3': 'feat2'}) + >>> out = new_m(torch.rand(1, 3, 224, 224)) + >>> print([(k, v.shape) for k, v in out.items()]) + >>> [('feat1', torch.Size([1, 64, 56, 56])), + >>> ('feat2', torch.Size([1, 256, 14, 14]))] + """ + + _version = 2 + __annotations__ = { + "return_layers": Dict[str, str], + } + + def __init__(self, model: nn.Module, return_layers: Dict[str, str]) -> None: + if not set(return_layers).issubset([name for name, _ in model.named_children()]): + raise ValueError("return_layers are not present in model") + orig_return_layers = return_layers + return_layers = {str(k): str(v) for k, v in return_layers.items()} + layers = OrderedDict() + for name, module in model.named_children(): + layers[name] = module + if name in return_layers: + del return_layers[name] + if not return_layers: + break + + super().__init__(layers) + self.return_layers = orig_return_layers + + def forward(self, x): + out = OrderedDict() + for name, module in self.items(): + x = module(x) + if name in self.return_layers: + out_name = self.return_layers[name] + out[out_name] = x + return out + + +def _make_divisible(v: float, divisor: int, min_value: Optional[int] = None) -> int: + """ + This function is taken from the original tf repo. + It ensures that all layers have a channel number that is divisible by 8 + It can be seen here: + https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py + """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
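+    # Illustrative worked examples (assuming divisor=8, as the models in this
+    # package use): _make_divisible(44.8, 8) gives int(48.8) // 8 * 8 = 48, which
+    # needs no correction; _make_divisible(10, 8) first gives 8, and since
+    # 8 < 0.9 * 10 the branch below bumps it up to 16.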
+ if new_v < 0.9 * v: + new_v += divisor + return new_v diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/alexnet.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/alexnet.py new file mode 100644 index 0000000000000000000000000000000000000000..bb812febdc43fa6d428cdd5541f27917649ece62 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/alexnet.py @@ -0,0 +1,69 @@ +from typing import Any + +import torch +import torch.nn as nn + +from .._internally_replaced_utils import load_state_dict_from_url +from ..utils import _log_api_usage_once + + +__all__ = ["AlexNet", "alexnet"] + + +model_urls = { + "alexnet": "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth", +} + + +class AlexNet(nn.Module): + def __init__(self, num_classes: int = 1000, dropout: float = 0.5) -> None: + super().__init__() + _log_api_usage_once(self) + self.features = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(64, 192, kernel_size=5, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(192, 384, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(384, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + ) + self.avgpool = nn.AdaptiveAvgPool2d((6, 6)) + self.classifier = nn.Sequential( + nn.Dropout(p=dropout), + nn.Linear(256 * 6 * 6, 4096), + nn.ReLU(inplace=True), + nn.Dropout(p=dropout), + nn.Linear(4096, 4096), + nn.ReLU(inplace=True), + nn.Linear(4096, num_classes), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.features(x) + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.classifier(x) + return x + + +def alexnet(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> AlexNet: + r"""AlexNet model architecture from the + `"One weird trick..." `_ paper. + The required minimum input size of the model is 63x63. 
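+
+    A minimal usage sketch (illustrative; assumes ``torch`` is imported and uses the
+    default 1000-class head)::
+
+        >>> model = alexnet(pretrained=False)
+        >>> out = model(torch.rand(1, 3, 224, 224))
+        >>> out.shape
+        torch.Size([1, 1000])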
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + model = AlexNet(**kwargs) + if pretrained: + state_dict = load_state_dict_from_url(model_urls["alexnet"], progress=progress) + model.load_state_dict(state_dict) + return model diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/convnext.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/convnext.py new file mode 100644 index 0000000000000000000000000000000000000000..9067b6876fdfef39c5acb1ab776ee1433e05a539 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/convnext.py @@ -0,0 +1,271 @@ +from functools import partial +from typing import Any, Callable, Dict, List, Optional, Sequence + +import torch +from torch import nn, Tensor +from torch.nn import functional as F + +from .._internally_replaced_utils import load_state_dict_from_url +from ..ops.misc import ConvNormActivation +from ..ops.stochastic_depth import StochasticDepth +from ..utils import _log_api_usage_once + + +__all__ = [ + "ConvNeXt", + "convnext_tiny", + "convnext_small", + "convnext_base", + "convnext_large", +] + + +_MODELS_URLS: Dict[str, Optional[str]] = { + "convnext_tiny": "https://download.pytorch.org/models/convnext_tiny-983f1562.pth", + "convnext_small": "https://download.pytorch.org/models/convnext_small-0c510722.pth", + "convnext_base": "https://download.pytorch.org/models/convnext_base-6075fbad.pth", + "convnext_large": "https://download.pytorch.org/models/convnext_large-ea097f82.pth", +} + + +class LayerNorm2d(nn.LayerNorm): + def forward(self, x: Tensor) -> Tensor: + x = x.permute(0, 2, 3, 1) + x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + x = x.permute(0, 3, 1, 2) + return x + + +class Permute(nn.Module): + def __init__(self, dims: List[int]): + super().__init__() + self.dims = dims + + def forward(self, x): + return torch.permute(x, self.dims) + + +class CNBlock(nn.Module): + def __init__( + self, + dim, + layer_scale: float, + stochastic_depth_prob: float, + norm_layer: Optional[Callable[..., nn.Module]] = None, + ) -> None: + super().__init__() + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) + + self.block = nn.Sequential( + nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim, bias=True), + Permute([0, 2, 3, 1]), + norm_layer(dim), + nn.Linear(in_features=dim, out_features=4 * dim, bias=True), + nn.GELU(), + nn.Linear(in_features=4 * dim, out_features=dim, bias=True), + Permute([0, 3, 1, 2]), + ) + self.layer_scale = nn.Parameter(torch.ones(dim, 1, 1) * layer_scale) + self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row") + + def forward(self, input: Tensor) -> Tensor: + result = self.layer_scale * self.block(input) + result = self.stochastic_depth(result) + result += input + return result + + +class CNBlockConfig: + # Stores information listed at Section 3 of the ConvNeXt paper + def __init__( + self, + input_channels: int, + out_channels: Optional[int], + num_layers: int, + ) -> None: + self.input_channels = input_channels + self.out_channels = out_channels + self.num_layers = num_layers + + def __repr__(self) -> str: + s = self.__class__.__name__ + "(" + s += "input_channels={input_channels}" + s += ", out_channels={out_channels}" + s += ", num_layers={num_layers}" + s += ")" + return s.format(**self.__dict__) + + +class ConvNeXt(nn.Module): + def __init__( + self, + block_setting: 
List[CNBlockConfig], + stochastic_depth_prob: float = 0.0, + layer_scale: float = 1e-6, + num_classes: int = 1000, + block: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + **kwargs: Any, + ) -> None: + super().__init__() + _log_api_usage_once(self) + + if not block_setting: + raise ValueError("The block_setting should not be empty") + elif not (isinstance(block_setting, Sequence) and all([isinstance(s, CNBlockConfig) for s in block_setting])): + raise TypeError("The block_setting should be List[CNBlockConfig]") + + if block is None: + block = CNBlock + + if norm_layer is None: + norm_layer = partial(LayerNorm2d, eps=1e-6) + + layers: List[nn.Module] = [] + + # Stem + firstconv_output_channels = block_setting[0].input_channels + layers.append( + ConvNormActivation( + 3, + firstconv_output_channels, + kernel_size=4, + stride=4, + padding=0, + norm_layer=norm_layer, + activation_layer=None, + bias=True, + ) + ) + + total_stage_blocks = sum(cnf.num_layers for cnf in block_setting) + stage_block_id = 0 + for cnf in block_setting: + # Bottlenecks + stage: List[nn.Module] = [] + for _ in range(cnf.num_layers): + # adjust stochastic depth probability based on the depth of the stage block + sd_prob = stochastic_depth_prob * stage_block_id / (total_stage_blocks - 1.0) + stage.append(block(cnf.input_channels, layer_scale, sd_prob)) + stage_block_id += 1 + layers.append(nn.Sequential(*stage)) + if cnf.out_channels is not None: + # Downsampling + layers.append( + nn.Sequential( + norm_layer(cnf.input_channels), + nn.Conv2d(cnf.input_channels, cnf.out_channels, kernel_size=2, stride=2), + ) + ) + + self.features = nn.Sequential(*layers) + self.avgpool = nn.AdaptiveAvgPool2d(1) + + lastblock = block_setting[-1] + lastconv_output_channels = ( + lastblock.out_channels if lastblock.out_channels is not None else lastblock.input_channels + ) + self.classifier = nn.Sequential( + norm_layer(lastconv_output_channels), nn.Flatten(1), nn.Linear(lastconv_output_channels, num_classes) + ) + + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.Linear)): + nn.init.trunc_normal_(m.weight, std=0.02) + if m.bias is not None: + nn.init.zeros_(m.bias) + + def _forward_impl(self, x: Tensor) -> Tensor: + x = self.features(x) + x = self.avgpool(x) + x = self.classifier(x) + return x + + def forward(self, x: Tensor) -> Tensor: + return self._forward_impl(x) + + +def _convnext( + arch: str, + block_setting: List[CNBlockConfig], + stochastic_depth_prob: float, + pretrained: bool, + progress: bool, + **kwargs: Any, +) -> ConvNeXt: + model = ConvNeXt(block_setting, stochastic_depth_prob=stochastic_depth_prob, **kwargs) + if pretrained: + if arch not in _MODELS_URLS: + raise ValueError(f"No checkpoint is available for model type {arch}") + state_dict = load_state_dict_from_url(_MODELS_URLS[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + +def convnext_tiny(*, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ConvNeXt: + r"""ConvNeXt Tiny model architecture from the + `"A ConvNet for the 2020s" `_ paper. 
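+
+    A minimal usage sketch (illustrative; weights are randomly initialized when
+    ``pretrained=False`` and ``torch`` is assumed to be imported)::
+
+        >>> model = convnext_tiny(pretrained=False)
+        >>> model(torch.rand(1, 3, 224, 224)).shape
+        torch.Size([1, 1000])
+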
+ Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + block_setting = [ + CNBlockConfig(96, 192, 3), + CNBlockConfig(192, 384, 3), + CNBlockConfig(384, 768, 9), + CNBlockConfig(768, None, 3), + ] + stochastic_depth_prob = kwargs.pop("stochastic_depth_prob", 0.1) + return _convnext("convnext_tiny", block_setting, stochastic_depth_prob, pretrained, progress, **kwargs) + + +def convnext_small(*, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ConvNeXt: + r"""ConvNeXt Small model architecture from the + `"A ConvNet for the 2020s" `_ paper. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + block_setting = [ + CNBlockConfig(96, 192, 3), + CNBlockConfig(192, 384, 3), + CNBlockConfig(384, 768, 27), + CNBlockConfig(768, None, 3), + ] + stochastic_depth_prob = kwargs.pop("stochastic_depth_prob", 0.4) + return _convnext("convnext_small", block_setting, stochastic_depth_prob, pretrained, progress, **kwargs) + + +def convnext_base(*, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ConvNeXt: + r"""ConvNeXt Base model architecture from the + `"A ConvNet for the 2020s" `_ paper. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + block_setting = [ + CNBlockConfig(128, 256, 3), + CNBlockConfig(256, 512, 3), + CNBlockConfig(512, 1024, 27), + CNBlockConfig(1024, None, 3), + ] + stochastic_depth_prob = kwargs.pop("stochastic_depth_prob", 0.5) + return _convnext("convnext_base", block_setting, stochastic_depth_prob, pretrained, progress, **kwargs) + + +def convnext_large(*, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ConvNeXt: + r"""ConvNeXt Large model architecture from the + `"A ConvNet for the 2020s" `_ paper. 
+ Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + block_setting = [ + CNBlockConfig(192, 384, 3), + CNBlockConfig(384, 768, 3), + CNBlockConfig(768, 1536, 27), + CNBlockConfig(1536, None, 3), + ] + stochastic_depth_prob = kwargs.pop("stochastic_depth_prob", 0.5) + return _convnext("convnext_large", block_setting, stochastic_depth_prob, pretrained, progress, **kwargs) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/densenet.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/densenet.py new file mode 100644 index 0000000000000000000000000000000000000000..14e318360afc03f11d14ebd5c2e1be9811923b7a --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/densenet.py @@ -0,0 +1,310 @@ +import re +from collections import OrderedDict +from typing import Any, List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from torch import Tensor + +from .._internally_replaced_utils import load_state_dict_from_url +from ..utils import _log_api_usage_once + + +__all__ = ["DenseNet", "densenet121", "densenet169", "densenet201", "densenet161"] + +model_urls = { + "densenet121": "https://download.pytorch.org/models/densenet121-a639ec97.pth", + "densenet169": "https://download.pytorch.org/models/densenet169-b2777c0a.pth", + "densenet201": "https://download.pytorch.org/models/densenet201-c1103571.pth", + "densenet161": "https://download.pytorch.org/models/densenet161-8d451a50.pth", +} + + +class _DenseLayer(nn.Module): + def __init__( + self, num_input_features: int, growth_rate: int, bn_size: int, drop_rate: float, memory_efficient: bool = False + ) -> None: + super().__init__() + self.norm1: nn.BatchNorm2d + self.add_module("norm1", nn.BatchNorm2d(num_input_features)) + self.relu1: nn.ReLU + self.add_module("relu1", nn.ReLU(inplace=True)) + self.conv1: nn.Conv2d + self.add_module( + "conv1", nn.Conv2d(num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False) + ) + self.norm2: nn.BatchNorm2d + self.add_module("norm2", nn.BatchNorm2d(bn_size * growth_rate)) + self.relu2: nn.ReLU + self.add_module("relu2", nn.ReLU(inplace=True)) + self.conv2: nn.Conv2d + self.add_module( + "conv2", nn.Conv2d(bn_size * growth_rate, growth_rate, kernel_size=3, stride=1, padding=1, bias=False) + ) + self.drop_rate = float(drop_rate) + self.memory_efficient = memory_efficient + + def bn_function(self, inputs: List[Tensor]) -> Tensor: + concated_features = torch.cat(inputs, 1) + bottleneck_output = self.conv1(self.relu1(self.norm1(concated_features))) # noqa: T484 + return bottleneck_output + + # todo: rewrite when torchscript supports any + def any_requires_grad(self, input: List[Tensor]) -> bool: + for tensor in input: + if tensor.requires_grad: + return True + return False + + @torch.jit.unused # noqa: T484 + def call_checkpoint_bottleneck(self, input: List[Tensor]) -> Tensor: + def closure(*inputs): + return self.bn_function(inputs) + + return cp.checkpoint(closure, *input) + + @torch.jit._overload_method # noqa: F811 + def forward(self, input: List[Tensor]) -> Tensor: # noqa: F811 + pass + + @torch.jit._overload_method # noqa: F811 + def forward(self, input: Tensor) -> Tensor: # noqa: F811 + pass + + # torchscript does not yet support *args, so we overload method + # allowing it to take either a List[Tensor] or single Tensor + def forward(self, input: Tensor) -> 
Tensor: # noqa: F811 + if isinstance(input, Tensor): + prev_features = [input] + else: + prev_features = input + + if self.memory_efficient and self.any_requires_grad(prev_features): + if torch.jit.is_scripting(): + raise Exception("Memory Efficient not supported in JIT") + + bottleneck_output = self.call_checkpoint_bottleneck(prev_features) + else: + bottleneck_output = self.bn_function(prev_features) + + new_features = self.conv2(self.relu2(self.norm2(bottleneck_output))) + if self.drop_rate > 0: + new_features = F.dropout(new_features, p=self.drop_rate, training=self.training) + return new_features + + +class _DenseBlock(nn.ModuleDict): + _version = 2 + + def __init__( + self, + num_layers: int, + num_input_features: int, + bn_size: int, + growth_rate: int, + drop_rate: float, + memory_efficient: bool = False, + ) -> None: + super().__init__() + for i in range(num_layers): + layer = _DenseLayer( + num_input_features + i * growth_rate, + growth_rate=growth_rate, + bn_size=bn_size, + drop_rate=drop_rate, + memory_efficient=memory_efficient, + ) + self.add_module("denselayer%d" % (i + 1), layer) + + def forward(self, init_features: Tensor) -> Tensor: + features = [init_features] + for name, layer in self.items(): + new_features = layer(features) + features.append(new_features) + return torch.cat(features, 1) + + +class _Transition(nn.Sequential): + def __init__(self, num_input_features: int, num_output_features: int) -> None: + super().__init__() + self.add_module("norm", nn.BatchNorm2d(num_input_features)) + self.add_module("relu", nn.ReLU(inplace=True)) + self.add_module("conv", nn.Conv2d(num_input_features, num_output_features, kernel_size=1, stride=1, bias=False)) + self.add_module("pool", nn.AvgPool2d(kernel_size=2, stride=2)) + + +class DenseNet(nn.Module): + r"""Densenet-BC model class, based on + `"Densely Connected Convolutional Networks" `_. + + Args: + growth_rate (int) - how many filters to add each layer (`k` in paper) + block_config (list of 4 ints) - how many layers in each pooling block + num_init_features (int) - the number of filters to learn in the first convolution layer + bn_size (int) - multiplicative factor for number of bottle neck layers + (i.e. bn_size * k features in the bottleneck layer) + drop_rate (float) - dropout rate after each dense layer + num_classes (int) - number of classification classes + memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient, + but slower. Default: *False*. See `"paper" `_. 
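+
+    Example (an illustrative sketch of the DenseNet-121 configuration used by
+    ``densenet121`` below; assumes ``torch`` is imported)::
+
+        >>> model = DenseNet(growth_rate=32, block_config=(6, 12, 24, 16), num_init_features=64)
+        >>> model(torch.rand(1, 3, 224, 224)).shape
+        torch.Size([1, 1000])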
+ """ + + def __init__( + self, + growth_rate: int = 32, + block_config: Tuple[int, int, int, int] = (6, 12, 24, 16), + num_init_features: int = 64, + bn_size: int = 4, + drop_rate: float = 0, + num_classes: int = 1000, + memory_efficient: bool = False, + ) -> None: + + super().__init__() + _log_api_usage_once(self) + + # First convolution + self.features = nn.Sequential( + OrderedDict( + [ + ("conv0", nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)), + ("norm0", nn.BatchNorm2d(num_init_features)), + ("relu0", nn.ReLU(inplace=True)), + ("pool0", nn.MaxPool2d(kernel_size=3, stride=2, padding=1)), + ] + ) + ) + + # Each denseblock + num_features = num_init_features + for i, num_layers in enumerate(block_config): + block = _DenseBlock( + num_layers=num_layers, + num_input_features=num_features, + bn_size=bn_size, + growth_rate=growth_rate, + drop_rate=drop_rate, + memory_efficient=memory_efficient, + ) + self.features.add_module("denseblock%d" % (i + 1), block) + num_features = num_features + num_layers * growth_rate + if i != len(block_config) - 1: + trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2) + self.features.add_module("transition%d" % (i + 1), trans) + num_features = num_features // 2 + + # Final batch norm + self.features.add_module("norm5", nn.BatchNorm2d(num_features)) + + # Linear layer + self.classifier = nn.Linear(num_features, num_classes) + + # Official init from torch repo. + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.constant_(m.bias, 0) + + def forward(self, x: Tensor) -> Tensor: + features = self.features(x) + out = F.relu(features, inplace=True) + out = F.adaptive_avg_pool2d(out, (1, 1)) + out = torch.flatten(out, 1) + out = self.classifier(out) + return out + + +def _load_state_dict(model: nn.Module, model_url: str, progress: bool) -> None: + # '.'s are no longer allowed in module names, but previous _DenseLayer + # has keys 'norm.1', 'relu.1', 'conv.1', 'norm.2', 'relu.2', 'conv.2'. + # They are also in the checkpoints in model_urls. This pattern is used + # to find such keys. + pattern = re.compile( + r"^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$" + ) + + state_dict = load_state_dict_from_url(model_url, progress=progress) + for key in list(state_dict.keys()): + res = pattern.match(key) + if res: + new_key = res.group(1) + res.group(2) + state_dict[new_key] = state_dict[key] + del state_dict[key] + model.load_state_dict(state_dict) + + +def _densenet( + arch: str, + growth_rate: int, + block_config: Tuple[int, int, int, int], + num_init_features: int, + pretrained: bool, + progress: bool, + **kwargs: Any, +) -> DenseNet: + model = DenseNet(growth_rate, block_config, num_init_features, **kwargs) + if pretrained: + _load_state_dict(model, model_urls[arch], progress) + return model + + +def densenet121(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> DenseNet: + r"""Densenet-121 model from + `"Densely Connected Convolutional Networks" `_. + The required minimum input size of the model is 29x29. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + memory_efficient (bool) - If True, uses checkpointing. 
Much more memory efficient, + but slower. Default: *False*. See `"paper" `_. + """ + return _densenet("densenet121", 32, (6, 12, 24, 16), 64, pretrained, progress, **kwargs) + + +def densenet161(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> DenseNet: + r"""Densenet-161 model from + `"Densely Connected Convolutional Networks" `_. + The required minimum input size of the model is 29x29. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient, + but slower. Default: *False*. See `"paper" `_. + """ + return _densenet("densenet161", 48, (6, 12, 36, 24), 96, pretrained, progress, **kwargs) + + +def densenet169(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> DenseNet: + r"""Densenet-169 model from + `"Densely Connected Convolutional Networks" `_. + The required minimum input size of the model is 29x29. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient, + but slower. Default: *False*. See `"paper" `_. + """ + return _densenet("densenet169", 32, (6, 12, 32, 32), 64, pretrained, progress, **kwargs) + + +def densenet201(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> DenseNet: + r"""Densenet-201 model from + `"Densely Connected Convolutional Networks" `_. + The required minimum input size of the model is 29x29. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient, + but slower. Default: *False*. See `"paper" `_. 
+ """ + return _densenet("densenet201", 32, (6, 12, 48, 32), 64, pretrained, progress, **kwargs) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/efficientnet.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/efficientnet.py new file mode 100644 index 0000000000000000000000000000000000000000..e2f1086329dc4c9676e68e88e08184929bac276f --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/efficientnet.py @@ -0,0 +1,418 @@ +import copy +import math +from functools import partial +from typing import Any, Callable, Optional, List, Sequence + +import torch +from torch import nn, Tensor +from ..ops import StochasticDepth + +from .._internally_replaced_utils import load_state_dict_from_url +from ..ops.misc import ConvNormActivation, SqueezeExcitation +from ..utils import _log_api_usage_once +from ._utils import _make_divisible + + +__all__ = [ + "EfficientNet", + "efficientnet_b0", + "efficientnet_b1", + "efficientnet_b2", + "efficientnet_b3", + "efficientnet_b4", + "efficientnet_b5", + "efficientnet_b6", + "efficientnet_b7", +] + + +model_urls = { + # Weights ported from https://github.com/rwightman/pytorch-image-models/ + "efficientnet_b0": "https://download.pytorch.org/models/efficientnet_b0_rwightman-3dd342df.pth", + "efficientnet_b1": "https://download.pytorch.org/models/efficientnet_b1_rwightman-533bc792.pth", + "efficientnet_b2": "https://download.pytorch.org/models/efficientnet_b2_rwightman-bcdf34b7.pth", + "efficientnet_b3": "https://download.pytorch.org/models/efficientnet_b3_rwightman-cf984f9c.pth", + "efficientnet_b4": "https://download.pytorch.org/models/efficientnet_b4_rwightman-7eb33cd5.pth", + # Weights ported from https://github.com/lukemelas/EfficientNet-PyTorch/ + "efficientnet_b5": "https://download.pytorch.org/models/efficientnet_b5_lukemelas-b6417697.pth", + "efficientnet_b6": "https://download.pytorch.org/models/efficientnet_b6_lukemelas-c76e70fd.pth", + "efficientnet_b7": "https://download.pytorch.org/models/efficientnet_b7_lukemelas-dcc49843.pth", +} + + +class MBConvConfig: + # Stores information listed at Table 1 of the EfficientNet paper + def __init__( + self, + expand_ratio: float, + kernel: int, + stride: int, + input_channels: int, + out_channels: int, + num_layers: int, + width_mult: float, + depth_mult: float, + ) -> None: + self.expand_ratio = expand_ratio + self.kernel = kernel + self.stride = stride + self.input_channels = self.adjust_channels(input_channels, width_mult) + self.out_channels = self.adjust_channels(out_channels, width_mult) + self.num_layers = self.adjust_depth(num_layers, depth_mult) + + def __repr__(self) -> str: + s = ( + f"{self.__class__.__name__}(" + f"expand_ratio={self.expand_ratio}" + f", kernel={self.kernel}" + f", stride={self.stride}" + f", input_channels={self.input_channels}" + f", out_channels={self.out_channels}" + f", num_layers={self.num_layers}" + f")" + ) + return s + + @staticmethod + def adjust_channels(channels: int, width_mult: float, min_value: Optional[int] = None) -> int: + return _make_divisible(channels * width_mult, 8, min_value) + + @staticmethod + def adjust_depth(num_layers: int, depth_mult: float): + return int(math.ceil(num_layers * depth_mult)) + + +class MBConv(nn.Module): + def __init__( + self, + cnf: MBConvConfig, + stochastic_depth_prob: float, + norm_layer: Callable[..., nn.Module], + se_layer: Callable[..., nn.Module] = SqueezeExcitation, + ) -> None: + super().__init__() + + if not (1 <= cnf.stride <= 2): + raise ValueError("illegal 
stride value") + + self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels + + layers: List[nn.Module] = [] + activation_layer = nn.SiLU + + # expand + expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio) + if expanded_channels != cnf.input_channels: + layers.append( + ConvNormActivation( + cnf.input_channels, + expanded_channels, + kernel_size=1, + norm_layer=norm_layer, + activation_layer=activation_layer, + ) + ) + + # depthwise + layers.append( + ConvNormActivation( + expanded_channels, + expanded_channels, + kernel_size=cnf.kernel, + stride=cnf.stride, + groups=expanded_channels, + norm_layer=norm_layer, + activation_layer=activation_layer, + ) + ) + + # squeeze and excitation + squeeze_channels = max(1, cnf.input_channels // 4) + layers.append(se_layer(expanded_channels, squeeze_channels, activation=partial(nn.SiLU, inplace=True))) + + # project + layers.append( + ConvNormActivation( + expanded_channels, cnf.out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=None + ) + ) + + self.block = nn.Sequential(*layers) + self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row") + self.out_channels = cnf.out_channels + + def forward(self, input: Tensor) -> Tensor: + result = self.block(input) + if self.use_res_connect: + result = self.stochastic_depth(result) + result += input + return result + + +class EfficientNet(nn.Module): + def __init__( + self, + inverted_residual_setting: List[MBConvConfig], + dropout: float, + stochastic_depth_prob: float = 0.2, + num_classes: int = 1000, + block: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + **kwargs: Any, + ) -> None: + """ + EfficientNet main class + + Args: + inverted_residual_setting (List[MBConvConfig]): Network structure + dropout (float): The droupout probability + stochastic_depth_prob (float): The stochastic depth probability + num_classes (int): Number of classes + block (Optional[Callable[..., nn.Module]]): Module specifying inverted residual building block for mobilenet + norm_layer (Optional[Callable[..., nn.Module]]): Module specifying the normalization layer to use + """ + super().__init__() + _log_api_usage_once(self) + + if not inverted_residual_setting: + raise ValueError("The inverted_residual_setting should not be empty") + elif not ( + isinstance(inverted_residual_setting, Sequence) + and all([isinstance(s, MBConvConfig) for s in inverted_residual_setting]) + ): + raise TypeError("The inverted_residual_setting should be List[MBConvConfig]") + + if block is None: + block = MBConv + + if norm_layer is None: + norm_layer = nn.BatchNorm2d + + layers: List[nn.Module] = [] + + # building first layer + firstconv_output_channels = inverted_residual_setting[0].input_channels + layers.append( + ConvNormActivation( + 3, firstconv_output_channels, kernel_size=3, stride=2, norm_layer=norm_layer, activation_layer=nn.SiLU + ) + ) + + # building inverted residual blocks + total_stage_blocks = sum(cnf.num_layers for cnf in inverted_residual_setting) + stage_block_id = 0 + for cnf in inverted_residual_setting: + stage: List[nn.Module] = [] + for _ in range(cnf.num_layers): + # copy to avoid modifications. 
shallow copy is enough + block_cnf = copy.copy(cnf) + + # overwrite info if not the first conv in the stage + if stage: + block_cnf.input_channels = block_cnf.out_channels + block_cnf.stride = 1 + + # adjust stochastic depth probability based on the depth of the stage block + sd_prob = stochastic_depth_prob * float(stage_block_id) / total_stage_blocks + + stage.append(block(block_cnf, sd_prob, norm_layer)) + stage_block_id += 1 + + layers.append(nn.Sequential(*stage)) + + # building last several layers + lastconv_input_channels = inverted_residual_setting[-1].out_channels + lastconv_output_channels = 4 * lastconv_input_channels + layers.append( + ConvNormActivation( + lastconv_input_channels, + lastconv_output_channels, + kernel_size=1, + norm_layer=norm_layer, + activation_layer=nn.SiLU, + ) + ) + + self.features = nn.Sequential(*layers) + self.avgpool = nn.AdaptiveAvgPool2d(1) + self.classifier = nn.Sequential( + nn.Dropout(p=dropout, inplace=True), + nn.Linear(lastconv_output_channels, num_classes), + ) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out") + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + init_range = 1.0 / math.sqrt(m.out_features) + nn.init.uniform_(m.weight, -init_range, init_range) + nn.init.zeros_(m.bias) + + def _forward_impl(self, x: Tensor) -> Tensor: + x = self.features(x) + + x = self.avgpool(x) + x = torch.flatten(x, 1) + + x = self.classifier(x) + + return x + + def forward(self, x: Tensor) -> Tensor: + return self._forward_impl(x) + + +def _efficientnet( + arch: str, + width_mult: float, + depth_mult: float, + dropout: float, + pretrained: bool, + progress: bool, + **kwargs: Any, +) -> EfficientNet: + bneck_conf = partial(MBConvConfig, width_mult=width_mult, depth_mult=depth_mult) + inverted_residual_setting = [ + bneck_conf(1, 3, 1, 32, 16, 1), + bneck_conf(6, 3, 2, 16, 24, 2), + bneck_conf(6, 5, 2, 24, 40, 2), + bneck_conf(6, 3, 2, 40, 80, 3), + bneck_conf(6, 5, 1, 80, 112, 3), + bneck_conf(6, 5, 2, 112, 192, 4), + bneck_conf(6, 3, 1, 192, 320, 1), + ] + model = EfficientNet(inverted_residual_setting, dropout, **kwargs) + if pretrained: + if model_urls.get(arch, None) is None: + raise ValueError(f"No checkpoint is available for model type {arch}") + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + +def efficientnet_b0(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNet: + """ + Constructs a EfficientNet B0 architecture from + `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _efficientnet("efficientnet_b0", 1.0, 1.0, 0.2, pretrained, progress, **kwargs) + + +def efficientnet_b1(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNet: + """ + Constructs a EfficientNet B1 architecture from + `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _efficientnet("efficientnet_b1", 1.0, 1.1, 0.2, pretrained, progress, **kwargs) + + +def efficientnet_b2(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNet: + """ + Constructs a EfficientNet B2 architecture from + `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _efficientnet("efficientnet_b2", 1.1, 1.2, 0.3, pretrained, progress, **kwargs) + + +def efficientnet_b3(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNet: + """ + Constructs a EfficientNet B3 architecture from + `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _efficientnet("efficientnet_b3", 1.2, 1.4, 0.3, pretrained, progress, **kwargs) + + +def efficientnet_b4(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNet: + """ + Constructs a EfficientNet B4 architecture from + `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _efficientnet("efficientnet_b4", 1.4, 1.8, 0.4, pretrained, progress, **kwargs) + + +def efficientnet_b5(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNet: + """ + Constructs a EfficientNet B5 architecture from + `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _efficientnet( + "efficientnet_b5", + 1.6, + 2.2, + 0.4, + pretrained, + progress, + norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.01), + **kwargs, + ) + + +def efficientnet_b6(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNet: + """ + Constructs a EfficientNet B6 architecture from + `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _efficientnet( + "efficientnet_b6", + 1.8, + 2.6, + 0.5, + pretrained, + progress, + norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.01), + **kwargs, + ) + + +def efficientnet_b7(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNet: + """ + Constructs a EfficientNet B7 architecture from + `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. 
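+
+    A minimal usage sketch (illustrative; B7 is normally evaluated at a much higher
+    resolution than the 224x224 used here, and the default head has 1000 classes)::
+
+        >>> model = efficientnet_b7(pretrained=False)
+        >>> model(torch.rand(1, 3, 224, 224)).shape
+        torch.Size([1, 1000])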
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _efficientnet( + "efficientnet_b7", + 2.0, + 3.1, + 0.5, + pretrained, + progress, + norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.01), + **kwargs, + ) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/feature_extraction.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/feature_extraction.py new file mode 100644 index 0000000000000000000000000000000000000000..0a2b597da2368959eb74596547e87901cc7f8ea3 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/feature_extraction.py @@ -0,0 +1,547 @@ +import inspect +import math +import re +import warnings +from collections import OrderedDict +from copy import deepcopy +from itertools import chain +from typing import Dict, Callable, List, Union, Optional, Tuple, Any + +import torch +import torchvision +from torch import fx +from torch import nn +from torch.fx.graph_module import _copy_attr + + +__all__ = ["create_feature_extractor", "get_graph_node_names"] + + +class LeafModuleAwareTracer(fx.Tracer): + """ + An fx.Tracer that allows the user to specify a set of leaf modules, ie. + modules that are not to be traced through. The resulting graph ends up + having single nodes referencing calls to the leaf modules' forward methods. + """ + + def __init__(self, *args, **kwargs): + self.leaf_modules = {} + if "leaf_modules" in kwargs: + leaf_modules = kwargs.pop("leaf_modules") + self.leaf_modules = leaf_modules + super().__init__(*args, **kwargs) + + def is_leaf_module(self, m: nn.Module, module_qualname: str) -> bool: + if isinstance(m, tuple(self.leaf_modules)): + return True + return super().is_leaf_module(m, module_qualname) + + +class NodePathTracer(LeafModuleAwareTracer): + """ + NodePathTracer is an FX tracer that, for each operation, also records the + name of the Node from which the operation originated. A node name here is + a `.` separated path walking the hierarchy from top level module down to + leaf operation or leaf module. The name of the top level module is not + included as part of the node name. For example, if we trace a module whose + forward method applies a ReLU module, the name for that node will simply + be 'relu'. + + Some notes on the specifics: + - Nodes are recorded to `self.node_to_qualname` which is a dictionary + mapping a given Node object to its node name. + - Nodes are recorded in the order which they are executed during + tracing. + - When a duplicate node name is encountered, a suffix of the form + _{int} is added. The counter starts from 1. 
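+
+    For example (an illustrative sketch), tracing a module whose ``forward`` applies
+    the same ``nn.ReLU`` submodule twice records the names ``relu`` and ``relu_1``,
+    after the ``x`` placeholder::
+
+        >>> class TwoRelu(nn.Module):
+        ...     def __init__(self):
+        ...         super().__init__()
+        ...         self.relu = nn.ReLU()
+        ...     def forward(self, x):
+        ...         return self.relu(self.relu(x))
+        >>> tracer = NodePathTracer()
+        >>> graph = tracer.trace(TwoRelu())
+        >>> list(tracer.node_to_qualname.values())
+        ['x', 'relu', 'relu_1']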
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Track the qualified name of the Node being traced + self.current_module_qualname = "" + # A map from FX Node to the qualified name\# + # NOTE: This is loosely like the "qualified name" mentioned in the + # torch.fx docs https://pytorch.org/docs/stable/fx.html but adapted + # for the purposes of the torchvision feature extractor + self.node_to_qualname = OrderedDict() + + def call_module(self, m: torch.nn.Module, forward: Callable, args, kwargs): + """ + Override of `fx.Tracer.call_module` + This override: + 1) Stores away the qualified name of the caller for restoration later + 2) Adds the qualified name of the caller to + `current_module_qualname` for retrieval by `create_proxy` + 3) Once a leaf module is reached, calls `create_proxy` + 4) Restores the caller's qualified name into current_module_qualname + """ + old_qualname = self.current_module_qualname + try: + module_qualname = self.path_of_module(m) + self.current_module_qualname = module_qualname + if not self.is_leaf_module(m, module_qualname): + out = forward(*args, **kwargs) + return out + return self.create_proxy("call_module", module_qualname, args, kwargs) + finally: + self.current_module_qualname = old_qualname + + def create_proxy( + self, kind: str, target: fx.node.Target, args, kwargs, name=None, type_expr=None, *_ + ) -> fx.proxy.Proxy: + """ + Override of `Tracer.create_proxy`. This override intercepts the recording + of every operation and stores away the current traced module's qualified + name in `node_to_qualname` + """ + proxy = super().create_proxy(kind, target, args, kwargs, name, type_expr) + self.node_to_qualname[proxy.node] = self._get_node_qualname(self.current_module_qualname, proxy.node) + return proxy + + def _get_node_qualname(self, module_qualname: str, node: fx.node.Node) -> str: + node_qualname = module_qualname + + if node.op != "call_module": + # In this case module_qualname from torch.fx doesn't go all the + # way to the leaf function/op so we need to append it + if len(node_qualname) > 0: + # Only append '.' if we are deeper than the top level module + node_qualname += "." + node_qualname += str(node) + + # Now we need to add an _{index} postfix on any repeated node names + # For modules we do this from scratch + # But for anything else, torch.fx already has a globally scoped + # _{index} postfix. But we want it locally (relative to direct parent) + # scoped. So first we need to undo the torch.fx postfix + if re.match(r".+_[0-9]+$", node_qualname) is not None: + node_qualname = node_qualname.rsplit("_", 1)[0] + + # ... 
and now we add on our own postfix + for existing_qualname in reversed(self.node_to_qualname.values()): + # Check to see if existing_qualname is of the form + # {node_qualname} or {node_qualname}_{int} + if re.match(rf"{node_qualname}(_[0-9]+)?$", existing_qualname) is not None: + postfix = existing_qualname.replace(node_qualname, "") + if len(postfix): + # existing_qualname is of the form {node_qualname}_{int} + next_index = int(postfix[1:]) + 1 + else: + # existing_qualname is of the form {node_qualname} + next_index = 1 + node_qualname += f"_{next_index}" + break + + return node_qualname + + +def _is_subseq(x, y): + """Check if y is a subseqence of x + https://stackoverflow.com/a/24017747/4391249 + """ + iter_x = iter(x) + return all(any(x_item == y_item for x_item in iter_x) for y_item in y) + + +def _warn_graph_differences(train_tracer: NodePathTracer, eval_tracer: NodePathTracer): + """ + Utility function for warning the user if there are differences between + the train graph nodes and the eval graph nodes. + """ + train_nodes = list(train_tracer.node_to_qualname.values()) + eval_nodes = list(eval_tracer.node_to_qualname.values()) + + if len(train_nodes) == len(eval_nodes) and all(t == e for t, e in zip(train_nodes, eval_nodes)): + return + + suggestion_msg = ( + "When choosing nodes for feature extraction, you may need to specify " + "output nodes for train and eval mode separately." + ) + + if _is_subseq(train_nodes, eval_nodes): + msg = ( + "NOTE: The nodes obtained by tracing the model in eval mode " + "are a subsequence of those obtained in train mode. " + ) + elif _is_subseq(eval_nodes, train_nodes): + msg = ( + "NOTE: The nodes obtained by tracing the model in train mode " + "are a subsequence of those obtained in eval mode. " + ) + else: + msg = "The nodes obtained by tracing the model in train mode are different to those obtained in eval mode. " + warnings.warn(msg + suggestion_msg) + + +def _get_leaf_modules_for_ops() -> List[type]: + members = inspect.getmembers(torchvision.ops) + result = [] + for _, obj in members: + if inspect.isclass(obj) and issubclass(obj, torch.nn.Module): + result.append(obj) + return result + + +def get_graph_node_names( + model: nn.Module, + tracer_kwargs: Optional[Dict[str, Any]] = None, + suppress_diff_warning: bool = False, +) -> Tuple[List[str], List[str]]: + """ + Dev utility to return node names in order of execution. See note on node + names under :func:`create_feature_extractor`. Useful for seeing which node + names are available for feature extraction. There are two reasons that + node names can't easily be read directly from the code for a model: + + 1. Not all submodules are traced through. Modules from ``torch.nn`` all + fall within this category. + 2. Nodes representing the repeated application of the same operation + or leaf module get a ``_{counter}`` postfix. + + The model is traced twice: once in train mode, and once in eval mode. Both + sets of node names are returned. + + For more details on the node naming conventions used here, please see the + :ref:`relevant subheading ` in the + `documentation `_. + + Args: + model (nn.Module): model for which we'd like to print node names + tracer_kwargs (dict, optional): a dictionary of keywork arguments for + ``NodePathTracer`` (they are eventually passed onto + `torch.fx.Tracer `_). + By default it will be set to wrap and make leaf nodes all torchvision ops. 
+ suppress_diff_warning (bool, optional): whether to suppress a warning + when there are discrepancies between the train and eval version of + the graph. Defaults to False. + + Returns: + tuple(list, list): a list of node names from tracing the model in + train mode, and another from tracing the model in eval mode. + + Examples:: + + >>> model = torchvision.models.resnet18() + >>> train_nodes, eval_nodes = get_graph_node_names(model) + """ + if tracer_kwargs is None: + tracer_kwargs = { + "autowrap_modules": ( + math, + torchvision.ops, + ), + "leaf_modules": _get_leaf_modules_for_ops(), + } + is_training = model.training + train_tracer = NodePathTracer(**tracer_kwargs) + train_tracer.trace(model.train()) + eval_tracer = NodePathTracer(**tracer_kwargs) + eval_tracer.trace(model.eval()) + train_nodes = list(train_tracer.node_to_qualname.values()) + eval_nodes = list(eval_tracer.node_to_qualname.values()) + if not suppress_diff_warning: + _warn_graph_differences(train_tracer, eval_tracer) + # Restore training state + model.train(is_training) + return train_nodes, eval_nodes + + +class DualGraphModule(fx.GraphModule): + """ + A derivative of `fx.GraphModule`. Differs in the following ways: + - Requires a train and eval version of the underlying graph + - Copies submodules according to the nodes of both train and eval graphs. + - Calling train(mode) switches between train graph and eval graph. + """ + + def __init__( + self, root: torch.nn.Module, train_graph: fx.Graph, eval_graph: fx.Graph, class_name: str = "GraphModule" + ): + """ + Args: + root (nn.Module): module from which the copied module hierarchy is + built + train_graph (fx.Graph): the graph that should be used in train mode + eval_graph (fx.Graph): the graph that should be used in eval mode + """ + super(fx.GraphModule, self).__init__() + + self.__class__.__name__ = class_name + + self.train_graph = train_graph + self.eval_graph = eval_graph + + # Copy all get_attr and call_module ops (indicated by BOTH train and + # eval graphs) + for node in chain(iter(train_graph.nodes), iter(eval_graph.nodes)): + if node.op in ["get_attr", "call_module"]: + assert isinstance(node.target, str) + _copy_attr(root, self, node.target) + + # train mode by default + self.train() + self.graph = train_graph + + # (borrowed from fx.GraphModule): + # Store the Tracer class responsible for creating a Graph separately as part of the + # GraphModule state, except when the Tracer is defined in a local namespace. + # Locally defined Tracers are not pickleable. This is needed because torch.package will + # serialize a GraphModule without retaining the Graph, and needs to use the correct Tracer + # to re-create the Graph during deserialization. + assert ( + self.eval_graph._tracer_cls == self.train_graph._tracer_cls + ), "Train mode and eval mode should use the same tracer class" + self._tracer_cls = None + if self.graph._tracer_cls and "" not in self.graph._tracer_cls.__qualname__: + self._tracer_cls = self.graph._tracer_cls + + def train(self, mode=True): + """ + Swap out the graph depending on the selected training mode. + NOTE this should be safe when calling model.eval() because that just + calls this with mode == False. + """ + # NOTE: Only set self.graph if the current graph is not the desired + # one. This saves us from recompiling the graph where not necessary. 
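+        # For example (illustrative): calling extractor.eval() on an extractor that is
+        # currently in train mode swaps self.graph to eval_graph, a later
+        # extractor.train() swaps it back, and repeated calls with the same mode
+        # leave the graph (and hence the recompiled code) untouched.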
+ if mode and not self.training: + self.graph = self.train_graph + elif not mode and self.training: + self.graph = self.eval_graph + return super().train(mode=mode) + + +def create_feature_extractor( + model: nn.Module, + return_nodes: Optional[Union[List[str], Dict[str, str]]] = None, + train_return_nodes: Optional[Union[List[str], Dict[str, str]]] = None, + eval_return_nodes: Optional[Union[List[str], Dict[str, str]]] = None, + tracer_kwargs: Optional[Dict[str, Any]] = None, + suppress_diff_warning: bool = False, +) -> fx.GraphModule: + """ + Creates a new graph module that returns intermediate nodes from a given + model as dictionary with user specified keys as strings, and the requested + outputs as values. This is achieved by re-writing the computation graph of + the model via FX to return the desired nodes as outputs. All unused nodes + are removed, together with their corresponding parameters. + + Desired output nodes must be specified as a ``.`` separated + path walking the module hierarchy from top level module down to leaf + operation or leaf module. For more details on the node naming conventions + used here, please see the :ref:`relevant subheading ` + in the `documentation `_. + + Not all models will be FX traceable, although with some massaging they can + be made to cooperate. Here's a (not exhaustive) list of tips: + + - If you don't need to trace through a particular, problematic + sub-module, turn it into a "leaf module" by passing a list of + ``leaf_modules`` as one of the ``tracer_kwargs`` (see example below). + It will not be traced through, but rather, the resulting graph will + hold a reference to that module's forward method. + - Likewise, you may turn functions into leaf functions by passing a + list of ``autowrap_functions`` as one of the ``tracer_kwargs`` (see + example below). + - Some inbuilt Python functions can be problematic. For instance, + ``int`` will raise an error during tracing. You may wrap them in your + own function and then pass that in ``autowrap_functions`` as one of + the ``tracer_kwargs``. + + For further information on FX see the + `torch.fx documentation `_. + + Args: + model (nn.Module): model on which we will extract the features + return_nodes (list or dict, optional): either a ``List`` or a ``Dict`` + containing the names (or partial names - see note above) + of the nodes for which the activations will be returned. If it is + a ``Dict``, the keys are the node names, and the values + are the user-specified keys for the graph module's returned + dictionary. If it is a ``List``, it is treated as a ``Dict`` mapping + node specification strings directly to output names. In the case + that ``train_return_nodes`` and ``eval_return_nodes`` are specified, + this should not be specified. + train_return_nodes (list or dict, optional): similar to + ``return_nodes``. This can be used if the return nodes + for train mode are different than those from eval mode. + If this is specified, ``eval_return_nodes`` must also be specified, + and ``return_nodes`` should not be specified. + eval_return_nodes (list or dict, optional): similar to + ``return_nodes``. This can be used if the return nodes + for train mode are different than those from eval mode. + If this is specified, ``train_return_nodes`` must also be specified, + and `return_nodes` should not be specified. + tracer_kwargs (dict, optional): a dictionary of keywork arguments for + ``NodePathTracer`` (which passes them onto it's parent class + `torch.fx.Tracer `_). 
+ By default it will be set to wrap and make leaf nodes all torchvision ops. + suppress_diff_warning (bool, optional): whether to suppress a warning + when there are discrepancies between the train and eval version of + the graph. Defaults to False. + + Examples:: + + >>> # Feature extraction with resnet + >>> model = torchvision.models.resnet18() + >>> # extract layer1 and layer3, giving as names `feat1` and feat2` + >>> model = create_feature_extractor( + >>> model, {'layer1': 'feat1', 'layer3': 'feat2'}) + >>> out = model(torch.rand(1, 3, 224, 224)) + >>> print([(k, v.shape) for k, v in out.items()]) + >>> [('feat1', torch.Size([1, 64, 56, 56])), + >>> ('feat2', torch.Size([1, 256, 14, 14]))] + + >>> # Specifying leaf modules and leaf functions + >>> def leaf_function(x): + >>> # This would raise a TypeError if traced through + >>> return int(x) + >>> + >>> class LeafModule(torch.nn.Module): + >>> def forward(self, x): + >>> # This would raise a TypeError if traced through + >>> int(x.shape[0]) + >>> return torch.nn.functional.relu(x + 4) + >>> + >>> class MyModule(torch.nn.Module): + >>> def __init__(self): + >>> super().__init__() + >>> self.conv = torch.nn.Conv2d(3, 1, 3) + >>> self.leaf_module = LeafModule() + >>> + >>> def forward(self, x): + >>> leaf_function(x.shape[0]) + >>> x = self.conv(x) + >>> return self.leaf_module(x) + >>> + >>> model = create_feature_extractor( + >>> MyModule(), return_nodes=['leaf_module'], + >>> tracer_kwargs={'leaf_modules': [LeafModule], + >>> 'autowrap_functions': [leaf_function]}) + + """ + if tracer_kwargs is None: + tracer_kwargs = { + "autowrap_modules": ( + math, + torchvision.ops, + ), + "leaf_modules": _get_leaf_modules_for_ops(), + } + is_training = model.training + + assert any( + arg is not None for arg in [return_nodes, train_return_nodes, eval_return_nodes] + ), "Either `return_nodes` or `train_return_nodes` and `eval_return_nodes` together, should be specified" + + assert not ( + (train_return_nodes is None) ^ (eval_return_nodes is None) + ), "If any of `train_return_nodes` and `eval_return_nodes` are specified, then both should be specified" + + assert (return_nodes is None) ^ ( + train_return_nodes is None + ), "If `train_return_nodes` and `eval_return_nodes` are specified, then both should be specified" + + # Put *_return_nodes into Dict[str, str] format + def to_strdict(n) -> Dict[str, str]: + if isinstance(n, list): + return {str(i): str(i) for i in n} + return {str(k): str(v) for k, v in n.items()} + + if train_return_nodes is None: + return_nodes = to_strdict(return_nodes) + train_return_nodes = deepcopy(return_nodes) + eval_return_nodes = deepcopy(return_nodes) + else: + train_return_nodes = to_strdict(train_return_nodes) + eval_return_nodes = to_strdict(eval_return_nodes) + + # Repeat the tracing and graph rewriting for train and eval mode + tracers = {} + graphs = {} + mode_return_nodes: Dict[str, Dict[str, str]] = {"train": train_return_nodes, "eval": eval_return_nodes} + for mode in ["train", "eval"]: + if mode == "train": + model.train() + elif mode == "eval": + model.eval() + + # Instantiate our NodePathTracer and use that to trace the model + tracer = NodePathTracer(**tracer_kwargs) + graph = tracer.trace(model) + + name = model.__class__.__name__ if isinstance(model, nn.Module) else model.__name__ + graph_module = fx.GraphModule(tracer.root, graph, name) + + available_nodes = list(tracer.node_to_qualname.values()) + # FIXME We don't know if we should expect this to happen + assert len(set(available_nodes)) == len( + 
available_nodes + ), "There are duplicate nodes! Please raise an issue https://github.com/pytorch/vision/issues" + # Check that all outputs in return_nodes are present in the model + for query in mode_return_nodes[mode].keys(): + # To check if a query is available we need to check that at least + # one of the available names starts with it up to a . + if not any([re.match(rf"^{query}(\.|$)", n) is not None for n in available_nodes]): + raise ValueError( + f"node: '{query}' is not present in model. Hint: use " + "`get_graph_node_names` to make sure the " + "`return_nodes` you specified are present. It may even " + "be that you need to specify `train_return_nodes` and " + "`eval_return_nodes` separately." + ) + + # Remove existing output nodes (train mode) + orig_output_nodes = [] + for n in reversed(graph_module.graph.nodes): + if n.op == "output": + orig_output_nodes.append(n) + assert len(orig_output_nodes) + for n in orig_output_nodes: + graph_module.graph.erase_node(n) + + # Find nodes corresponding to return_nodes and make them into output_nodes + nodes = [n for n in graph_module.graph.nodes] + output_nodes = OrderedDict() + for n in reversed(nodes): + module_qualname = tracer.node_to_qualname.get(n) + if module_qualname is None: + # NOTE - Know cases where this happens: + # - Node representing creation of a tensor constant - probably + # not interesting as a return node + # - When packing outputs into a named tuple like in InceptionV3 + continue + for query in mode_return_nodes[mode]: + depth = query.count(".") + if ".".join(module_qualname.split(".")[: depth + 1]) == query: + output_nodes[mode_return_nodes[mode][query]] = n + mode_return_nodes[mode].pop(query) + break + output_nodes = OrderedDict(reversed(list(output_nodes.items()))) + + # And add them in the end of the graph + with graph_module.graph.inserting_after(nodes[-1]): + graph_module.graph.output(output_nodes) + + # Remove unused modules / parameters + graph_module.graph.eliminate_dead_code() + graph_module.recompile() + + # Keep track of the tracer and graph so we can choose the main one + tracers[mode] = tracer + graphs[mode] = graph + + # Warn user if there are any discrepancies between the graphs of the + # train and eval modes + if not suppress_diff_warning: + _warn_graph_differences(tracers["train"], tracers["eval"]) + + # Build the final graph module + graph_module = DualGraphModule(model, graphs["train"], graphs["eval"], class_name=name) + + # Restore original training mode + model.train(is_training) + graph_module.train(is_training) + + return graph_module diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/googlenet.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/googlenet.py new file mode 100644 index 0000000000000000000000000000000000000000..9e4c3498aab83e97ec9b1b96c8d5be85bfc4b7a7 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/googlenet.py @@ -0,0 +1,310 @@ +import warnings +from collections import namedtuple +from typing import Optional, Tuple, List, Callable, Any + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from .._internally_replaced_utils import load_state_dict_from_url +from ..utils import _log_api_usage_once + +__all__ = ["GoogLeNet", "googlenet", "GoogLeNetOutputs", "_GoogLeNetOutputs"] + +model_urls = { + # GoogLeNet ported from TensorFlow + "googlenet": "https://download.pytorch.org/models/googlenet-1378be20.pth", +} + +GoogLeNetOutputs = namedtuple("GoogLeNetOutputs", 
["logits", "aux_logits2", "aux_logits1"]) +GoogLeNetOutputs.__annotations__ = {"logits": Tensor, "aux_logits2": Optional[Tensor], "aux_logits1": Optional[Tensor]} + +# Script annotations failed with _GoogleNetOutputs = namedtuple ... +# _GoogLeNetOutputs set here for backwards compat +_GoogLeNetOutputs = GoogLeNetOutputs + + +class GoogLeNet(nn.Module): + __constants__ = ["aux_logits", "transform_input"] + + def __init__( + self, + num_classes: int = 1000, + aux_logits: bool = True, + transform_input: bool = False, + init_weights: Optional[bool] = None, + blocks: Optional[List[Callable[..., nn.Module]]] = None, + dropout: float = 0.2, + dropout_aux: float = 0.7, + ) -> None: + super().__init__() + _log_api_usage_once(self) + if blocks is None: + blocks = [BasicConv2d, Inception, InceptionAux] + if init_weights is None: + warnings.warn( + "The default weight initialization of GoogleNet will be changed in future releases of " + "torchvision. If you wish to keep the old behavior (which leads to long initialization times" + " due to scipy/scipy#11299), please set init_weights=True.", + FutureWarning, + ) + init_weights = True + assert len(blocks) == 3 + conv_block = blocks[0] + inception_block = blocks[1] + inception_aux_block = blocks[2] + + self.aux_logits = aux_logits + self.transform_input = transform_input + + self.conv1 = conv_block(3, 64, kernel_size=7, stride=2, padding=3) + self.maxpool1 = nn.MaxPool2d(3, stride=2, ceil_mode=True) + self.conv2 = conv_block(64, 64, kernel_size=1) + self.conv3 = conv_block(64, 192, kernel_size=3, padding=1) + self.maxpool2 = nn.MaxPool2d(3, stride=2, ceil_mode=True) + + self.inception3a = inception_block(192, 64, 96, 128, 16, 32, 32) + self.inception3b = inception_block(256, 128, 128, 192, 32, 96, 64) + self.maxpool3 = nn.MaxPool2d(3, stride=2, ceil_mode=True) + + self.inception4a = inception_block(480, 192, 96, 208, 16, 48, 64) + self.inception4b = inception_block(512, 160, 112, 224, 24, 64, 64) + self.inception4c = inception_block(512, 128, 128, 256, 24, 64, 64) + self.inception4d = inception_block(512, 112, 144, 288, 32, 64, 64) + self.inception4e = inception_block(528, 256, 160, 320, 32, 128, 128) + self.maxpool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.inception5a = inception_block(832, 256, 160, 320, 32, 128, 128) + self.inception5b = inception_block(832, 384, 192, 384, 48, 128, 128) + + if aux_logits: + self.aux1 = inception_aux_block(512, num_classes, dropout=dropout_aux) + self.aux2 = inception_aux_block(528, num_classes, dropout=dropout_aux) + else: + self.aux1 = None # type: ignore[assignment] + self.aux2 = None # type: ignore[assignment] + + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.dropout = nn.Dropout(p=dropout) + self.fc = nn.Linear(1024, num_classes) + + if init_weights: + for m in self.modules(): + if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear): + torch.nn.init.trunc_normal_(m.weight, mean=0.0, std=0.01, a=-2, b=2) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def _transform_input(self, x: Tensor) -> Tensor: + if self.transform_input: + x_ch0 = torch.unsqueeze(x[:, 0], 1) * (0.229 / 0.5) + (0.485 - 0.5) / 0.5 + x_ch1 = torch.unsqueeze(x[:, 1], 1) * (0.224 / 0.5) + (0.456 - 0.5) / 0.5 + x_ch2 = torch.unsqueeze(x[:, 2], 1) * (0.225 / 0.5) + (0.406 - 0.5) / 0.5 + x = torch.cat((x_ch0, x_ch1, x_ch2), 1) + return x + + def _forward(self, x: Tensor) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]: + # N x 3 x 224 x 224 + x = self.conv1(x) + # N x 64 
x 112 x 112 + x = self.maxpool1(x) + # N x 64 x 56 x 56 + x = self.conv2(x) + # N x 64 x 56 x 56 + x = self.conv3(x) + # N x 192 x 56 x 56 + x = self.maxpool2(x) + + # N x 192 x 28 x 28 + x = self.inception3a(x) + # N x 256 x 28 x 28 + x = self.inception3b(x) + # N x 480 x 28 x 28 + x = self.maxpool3(x) + # N x 480 x 14 x 14 + x = self.inception4a(x) + # N x 512 x 14 x 14 + aux1: Optional[Tensor] = None + if self.aux1 is not None: + if self.training: + aux1 = self.aux1(x) + + x = self.inception4b(x) + # N x 512 x 14 x 14 + x = self.inception4c(x) + # N x 512 x 14 x 14 + x = self.inception4d(x) + # N x 528 x 14 x 14 + aux2: Optional[Tensor] = None + if self.aux2 is not None: + if self.training: + aux2 = self.aux2(x) + + x = self.inception4e(x) + # N x 832 x 14 x 14 + x = self.maxpool4(x) + # N x 832 x 7 x 7 + x = self.inception5a(x) + # N x 832 x 7 x 7 + x = self.inception5b(x) + # N x 1024 x 7 x 7 + + x = self.avgpool(x) + # N x 1024 x 1 x 1 + x = torch.flatten(x, 1) + # N x 1024 + x = self.dropout(x) + x = self.fc(x) + # N x 1000 (num_classes) + return x, aux2, aux1 + + @torch.jit.unused + def eager_outputs(self, x: Tensor, aux2: Tensor, aux1: Optional[Tensor]) -> GoogLeNetOutputs: + if self.training and self.aux_logits: + return _GoogLeNetOutputs(x, aux2, aux1) + else: + return x # type: ignore[return-value] + + def forward(self, x: Tensor) -> GoogLeNetOutputs: + x = self._transform_input(x) + x, aux1, aux2 = self._forward(x) + aux_defined = self.training and self.aux_logits + if torch.jit.is_scripting(): + if not aux_defined: + warnings.warn("Scripted GoogleNet always returns GoogleNetOutputs Tuple") + return GoogLeNetOutputs(x, aux2, aux1) + else: + return self.eager_outputs(x, aux2, aux1) + + +class Inception(nn.Module): + def __init__( + self, + in_channels: int, + ch1x1: int, + ch3x3red: int, + ch3x3: int, + ch5x5red: int, + ch5x5: int, + pool_proj: int, + conv_block: Optional[Callable[..., nn.Module]] = None, + ) -> None: + super().__init__() + if conv_block is None: + conv_block = BasicConv2d + self.branch1 = conv_block(in_channels, ch1x1, kernel_size=1) + + self.branch2 = nn.Sequential( + conv_block(in_channels, ch3x3red, kernel_size=1), conv_block(ch3x3red, ch3x3, kernel_size=3, padding=1) + ) + + self.branch3 = nn.Sequential( + conv_block(in_channels, ch5x5red, kernel_size=1), + # Here, kernel_size=3 instead of kernel_size=5 is a known bug. + # Please see https://github.com/pytorch/vision/issues/906 for details. 
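+            # The 3x3 kernel is kept for compatibility with the released
+            # pretrained weights: switching to 5x5 would change the weight
+            # shapes and break loading of the existing checkpoint.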
+ conv_block(ch5x5red, ch5x5, kernel_size=3, padding=1), + ) + + self.branch4 = nn.Sequential( + nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=True), + conv_block(in_channels, pool_proj, kernel_size=1), + ) + + def _forward(self, x: Tensor) -> List[Tensor]: + branch1 = self.branch1(x) + branch2 = self.branch2(x) + branch3 = self.branch3(x) + branch4 = self.branch4(x) + + outputs = [branch1, branch2, branch3, branch4] + return outputs + + def forward(self, x: Tensor) -> Tensor: + outputs = self._forward(x) + return torch.cat(outputs, 1) + + +class InceptionAux(nn.Module): + def __init__( + self, + in_channels: int, + num_classes: int, + conv_block: Optional[Callable[..., nn.Module]] = None, + dropout: float = 0.7, + ) -> None: + super().__init__() + if conv_block is None: + conv_block = BasicConv2d + self.conv = conv_block(in_channels, 128, kernel_size=1) + + self.fc1 = nn.Linear(2048, 1024) + self.fc2 = nn.Linear(1024, num_classes) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x: Tensor) -> Tensor: + # aux1: N x 512 x 14 x 14, aux2: N x 528 x 14 x 14 + x = F.adaptive_avg_pool2d(x, (4, 4)) + # aux1: N x 512 x 4 x 4, aux2: N x 528 x 4 x 4 + x = self.conv(x) + # N x 128 x 4 x 4 + x = torch.flatten(x, 1) + # N x 2048 + x = F.relu(self.fc1(x), inplace=True) + # N x 1024 + x = self.dropout(x) + # N x 1024 + x = self.fc2(x) + # N x 1000 (num_classes) + + return x + + +class BasicConv2d(nn.Module): + def __init__(self, in_channels: int, out_channels: int, **kwargs: Any) -> None: + super().__init__() + self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs) + self.bn = nn.BatchNorm2d(out_channels, eps=0.001) + + def forward(self, x: Tensor) -> Tensor: + x = self.conv(x) + x = self.bn(x) + return F.relu(x, inplace=True) + + +def googlenet(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> GoogLeNet: + r"""GoogLeNet (Inception v1) model architecture from + `"Going Deeper with Convolutions" `_. + The required minimum input size of the model is 15x15. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + aux_logits (bool): If True, adds two auxiliary branches that can improve training. + Default: *False* when pretrained is True otherwise *True* + transform_input (bool): If True, preprocesses the input according to the method with which it + was trained on ImageNet. Default: True if ``pretrained=True``, else False. 
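+
+    A minimal usage sketch with randomly initialized weights (in ``eval()``
+    mode the model returns plain logits rather than ``GoogLeNetOutputs``)::
+
+        >>> model = googlenet(aux_logits=True).eval()
+        >>> logits = model(torch.rand(1, 3, 224, 224))
+        >>> logits.shape
+        torch.Size([1, 1000])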
+ """ + if pretrained: + if "transform_input" not in kwargs: + kwargs["transform_input"] = True + if "aux_logits" not in kwargs: + kwargs["aux_logits"] = False + if kwargs["aux_logits"]: + warnings.warn( + "auxiliary heads in the pretrained googlenet model are NOT pretrained, so make sure to train them" + ) + original_aux_logits = kwargs["aux_logits"] + kwargs["aux_logits"] = True + kwargs["init_weights"] = False + model = GoogLeNet(**kwargs) + state_dict = load_state_dict_from_url(model_urls["googlenet"], progress=progress) + model.load_state_dict(state_dict) + if not original_aux_logits: + model.aux_logits = False + model.aux1 = None # type: ignore[assignment] + model.aux2 = None # type: ignore[assignment] + return model + + return GoogLeNet(**kwargs) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/inception.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/inception.py new file mode 100644 index 0000000000000000000000000000000000000000..c489925cb458d9700afbb377bc2c449d6e6d0066 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/inception.py @@ -0,0 +1,444 @@ +import warnings +from collections import namedtuple +from typing import Callable, Any, Optional, Tuple, List + +import torch +import torch.nn.functional as F +from torch import nn, Tensor + +from .._internally_replaced_utils import load_state_dict_from_url +from ..utils import _log_api_usage_once + + +__all__ = ["Inception3", "inception_v3", "InceptionOutputs", "_InceptionOutputs"] + + +model_urls = { + # Inception v3 ported from TensorFlow + "inception_v3_google": "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth", +} + +InceptionOutputs = namedtuple("InceptionOutputs", ["logits", "aux_logits"]) +InceptionOutputs.__annotations__ = {"logits": Tensor, "aux_logits": Optional[Tensor]} + +# Script annotations failed with _GoogleNetOutputs = namedtuple ... +# _InceptionOutputs set here for backwards compat +_InceptionOutputs = InceptionOutputs + + +class Inception3(nn.Module): + def __init__( + self, + num_classes: int = 1000, + aux_logits: bool = True, + transform_input: bool = False, + inception_blocks: Optional[List[Callable[..., nn.Module]]] = None, + init_weights: Optional[bool] = None, + dropout: float = 0.5, + ) -> None: + super().__init__() + _log_api_usage_once(self) + if inception_blocks is None: + inception_blocks = [BasicConv2d, InceptionA, InceptionB, InceptionC, InceptionD, InceptionE, InceptionAux] + if init_weights is None: + warnings.warn( + "The default weight initialization of inception_v3 will be changed in future releases of " + "torchvision. 
If you wish to keep the old behavior (which leads to long initialization times" + " due to scipy/scipy#11299), please set init_weights=True.", + FutureWarning, + ) + init_weights = True + assert len(inception_blocks) == 7 + conv_block = inception_blocks[0] + inception_a = inception_blocks[1] + inception_b = inception_blocks[2] + inception_c = inception_blocks[3] + inception_d = inception_blocks[4] + inception_e = inception_blocks[5] + inception_aux = inception_blocks[6] + + self.aux_logits = aux_logits + self.transform_input = transform_input + self.Conv2d_1a_3x3 = conv_block(3, 32, kernel_size=3, stride=2) + self.Conv2d_2a_3x3 = conv_block(32, 32, kernel_size=3) + self.Conv2d_2b_3x3 = conv_block(32, 64, kernel_size=3, padding=1) + self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2) + self.Conv2d_3b_1x1 = conv_block(64, 80, kernel_size=1) + self.Conv2d_4a_3x3 = conv_block(80, 192, kernel_size=3) + self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=2) + self.Mixed_5b = inception_a(192, pool_features=32) + self.Mixed_5c = inception_a(256, pool_features=64) + self.Mixed_5d = inception_a(288, pool_features=64) + self.Mixed_6a = inception_b(288) + self.Mixed_6b = inception_c(768, channels_7x7=128) + self.Mixed_6c = inception_c(768, channels_7x7=160) + self.Mixed_6d = inception_c(768, channels_7x7=160) + self.Mixed_6e = inception_c(768, channels_7x7=192) + self.AuxLogits: Optional[nn.Module] = None + if aux_logits: + self.AuxLogits = inception_aux(768, num_classes) + self.Mixed_7a = inception_d(768) + self.Mixed_7b = inception_e(1280) + self.Mixed_7c = inception_e(2048) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.dropout = nn.Dropout(p=dropout) + self.fc = nn.Linear(2048, num_classes) + if init_weights: + for m in self.modules(): + if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear): + stddev = float(m.stddev) if hasattr(m, "stddev") else 0.1 # type: ignore + torch.nn.init.trunc_normal_(m.weight, mean=0.0, std=stddev, a=-2, b=2) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def _transform_input(self, x: Tensor) -> Tensor: + if self.transform_input: + x_ch0 = torch.unsqueeze(x[:, 0], 1) * (0.229 / 0.5) + (0.485 - 0.5) / 0.5 + x_ch1 = torch.unsqueeze(x[:, 1], 1) * (0.224 / 0.5) + (0.456 - 0.5) / 0.5 + x_ch2 = torch.unsqueeze(x[:, 2], 1) * (0.225 / 0.5) + (0.406 - 0.5) / 0.5 + x = torch.cat((x_ch0, x_ch1, x_ch2), 1) + return x + + def _forward(self, x: Tensor) -> Tuple[Tensor, Optional[Tensor]]: + # N x 3 x 299 x 299 + x = self.Conv2d_1a_3x3(x) + # N x 32 x 149 x 149 + x = self.Conv2d_2a_3x3(x) + # N x 32 x 147 x 147 + x = self.Conv2d_2b_3x3(x) + # N x 64 x 147 x 147 + x = self.maxpool1(x) + # N x 64 x 73 x 73 + x = self.Conv2d_3b_1x1(x) + # N x 80 x 73 x 73 + x = self.Conv2d_4a_3x3(x) + # N x 192 x 71 x 71 + x = self.maxpool2(x) + # N x 192 x 35 x 35 + x = self.Mixed_5b(x) + # N x 256 x 35 x 35 + x = self.Mixed_5c(x) + # N x 288 x 35 x 35 + x = self.Mixed_5d(x) + # N x 288 x 35 x 35 + x = self.Mixed_6a(x) + # N x 768 x 17 x 17 + x = self.Mixed_6b(x) + # N x 768 x 17 x 17 + x = self.Mixed_6c(x) + # N x 768 x 17 x 17 + x = self.Mixed_6d(x) + # N x 768 x 17 x 17 + x = self.Mixed_6e(x) + # N x 768 x 17 x 17 + aux: Optional[Tensor] = None + if self.AuxLogits is not None: + if self.training: + aux = self.AuxLogits(x) + # N x 768 x 17 x 17 + x = self.Mixed_7a(x) + # N x 1280 x 8 x 8 + x = self.Mixed_7b(x) + # N x 2048 x 8 x 8 + x = self.Mixed_7c(x) + # N x 2048 x 8 x 8 + # Adaptive average pooling + x = self.avgpool(x) + # N x 
2048 x 1 x 1 + x = self.dropout(x) + # N x 2048 x 1 x 1 + x = torch.flatten(x, 1) + # N x 2048 + x = self.fc(x) + # N x 1000 (num_classes) + return x, aux + + @torch.jit.unused + def eager_outputs(self, x: Tensor, aux: Optional[Tensor]) -> InceptionOutputs: + if self.training and self.aux_logits: + return InceptionOutputs(x, aux) + else: + return x # type: ignore[return-value] + + def forward(self, x: Tensor) -> InceptionOutputs: + x = self._transform_input(x) + x, aux = self._forward(x) + aux_defined = self.training and self.aux_logits + if torch.jit.is_scripting(): + if not aux_defined: + warnings.warn("Scripted Inception3 always returns Inception3 Tuple") + return InceptionOutputs(x, aux) + else: + return self.eager_outputs(x, aux) + + +class InceptionA(nn.Module): + def __init__( + self, in_channels: int, pool_features: int, conv_block: Optional[Callable[..., nn.Module]] = None + ) -> None: + super().__init__() + if conv_block is None: + conv_block = BasicConv2d + self.branch1x1 = conv_block(in_channels, 64, kernel_size=1) + + self.branch5x5_1 = conv_block(in_channels, 48, kernel_size=1) + self.branch5x5_2 = conv_block(48, 64, kernel_size=5, padding=2) + + self.branch3x3dbl_1 = conv_block(in_channels, 64, kernel_size=1) + self.branch3x3dbl_2 = conv_block(64, 96, kernel_size=3, padding=1) + self.branch3x3dbl_3 = conv_block(96, 96, kernel_size=3, padding=1) + + self.branch_pool = conv_block(in_channels, pool_features, kernel_size=1) + + def _forward(self, x: Tensor) -> List[Tensor]: + branch1x1 = self.branch1x1(x) + + branch5x5 = self.branch5x5_1(x) + branch5x5 = self.branch5x5_2(branch5x5) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool] + return outputs + + def forward(self, x: Tensor) -> Tensor: + outputs = self._forward(x) + return torch.cat(outputs, 1) + + +class InceptionB(nn.Module): + def __init__(self, in_channels: int, conv_block: Optional[Callable[..., nn.Module]] = None) -> None: + super().__init__() + if conv_block is None: + conv_block = BasicConv2d + self.branch3x3 = conv_block(in_channels, 384, kernel_size=3, stride=2) + + self.branch3x3dbl_1 = conv_block(in_channels, 64, kernel_size=1) + self.branch3x3dbl_2 = conv_block(64, 96, kernel_size=3, padding=1) + self.branch3x3dbl_3 = conv_block(96, 96, kernel_size=3, stride=2) + + def _forward(self, x: Tensor) -> List[Tensor]: + branch3x3 = self.branch3x3(x) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = F.max_pool2d(x, kernel_size=3, stride=2) + + outputs = [branch3x3, branch3x3dbl, branch_pool] + return outputs + + def forward(self, x: Tensor) -> Tensor: + outputs = self._forward(x) + return torch.cat(outputs, 1) + + +class InceptionC(nn.Module): + def __init__( + self, in_channels: int, channels_7x7: int, conv_block: Optional[Callable[..., nn.Module]] = None + ) -> None: + super().__init__() + if conv_block is None: + conv_block = BasicConv2d + self.branch1x1 = conv_block(in_channels, 192, kernel_size=1) + + c7 = channels_7x7 + self.branch7x7_1 = conv_block(in_channels, c7, kernel_size=1) + self.branch7x7_2 = conv_block(c7, c7, kernel_size=(1, 7), padding=(0, 3)) + self.branch7x7_3 = conv_block(c7, 192, kernel_size=(7, 1), padding=(3, 0)) + + 
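+        # Each 7x7 convolution is factorized into a 1x7 followed by a 7x1
+        # convolution (asymmetric factorization from the Inception v3 paper),
+        # preserving the receptive field at a lower parameter cost.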
self.branch7x7dbl_1 = conv_block(in_channels, c7, kernel_size=1) + self.branch7x7dbl_2 = conv_block(c7, c7, kernel_size=(7, 1), padding=(3, 0)) + self.branch7x7dbl_3 = conv_block(c7, c7, kernel_size=(1, 7), padding=(0, 3)) + self.branch7x7dbl_4 = conv_block(c7, c7, kernel_size=(7, 1), padding=(3, 0)) + self.branch7x7dbl_5 = conv_block(c7, 192, kernel_size=(1, 7), padding=(0, 3)) + + self.branch_pool = conv_block(in_channels, 192, kernel_size=1) + + def _forward(self, x: Tensor) -> List[Tensor]: + branch1x1 = self.branch1x1(x) + + branch7x7 = self.branch7x7_1(x) + branch7x7 = self.branch7x7_2(branch7x7) + branch7x7 = self.branch7x7_3(branch7x7) + + branch7x7dbl = self.branch7x7dbl_1(x) + branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) + + branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool] + return outputs + + def forward(self, x: Tensor) -> Tensor: + outputs = self._forward(x) + return torch.cat(outputs, 1) + + +class InceptionD(nn.Module): + def __init__(self, in_channels: int, conv_block: Optional[Callable[..., nn.Module]] = None) -> None: + super().__init__() + if conv_block is None: + conv_block = BasicConv2d + self.branch3x3_1 = conv_block(in_channels, 192, kernel_size=1) + self.branch3x3_2 = conv_block(192, 320, kernel_size=3, stride=2) + + self.branch7x7x3_1 = conv_block(in_channels, 192, kernel_size=1) + self.branch7x7x3_2 = conv_block(192, 192, kernel_size=(1, 7), padding=(0, 3)) + self.branch7x7x3_3 = conv_block(192, 192, kernel_size=(7, 1), padding=(3, 0)) + self.branch7x7x3_4 = conv_block(192, 192, kernel_size=3, stride=2) + + def _forward(self, x: Tensor) -> List[Tensor]: + branch3x3 = self.branch3x3_1(x) + branch3x3 = self.branch3x3_2(branch3x3) + + branch7x7x3 = self.branch7x7x3_1(x) + branch7x7x3 = self.branch7x7x3_2(branch7x7x3) + branch7x7x3 = self.branch7x7x3_3(branch7x7x3) + branch7x7x3 = self.branch7x7x3_4(branch7x7x3) + + branch_pool = F.max_pool2d(x, kernel_size=3, stride=2) + outputs = [branch3x3, branch7x7x3, branch_pool] + return outputs + + def forward(self, x: Tensor) -> Tensor: + outputs = self._forward(x) + return torch.cat(outputs, 1) + + +class InceptionE(nn.Module): + def __init__(self, in_channels: int, conv_block: Optional[Callable[..., nn.Module]] = None) -> None: + super().__init__() + if conv_block is None: + conv_block = BasicConv2d + self.branch1x1 = conv_block(in_channels, 320, kernel_size=1) + + self.branch3x3_1 = conv_block(in_channels, 384, kernel_size=1) + self.branch3x3_2a = conv_block(384, 384, kernel_size=(1, 3), padding=(0, 1)) + self.branch3x3_2b = conv_block(384, 384, kernel_size=(3, 1), padding=(1, 0)) + + self.branch3x3dbl_1 = conv_block(in_channels, 448, kernel_size=1) + self.branch3x3dbl_2 = conv_block(448, 384, kernel_size=3, padding=1) + self.branch3x3dbl_3a = conv_block(384, 384, kernel_size=(1, 3), padding=(0, 1)) + self.branch3x3dbl_3b = conv_block(384, 384, kernel_size=(3, 1), padding=(1, 0)) + + self.branch_pool = conv_block(in_channels, 192, kernel_size=1) + + def _forward(self, x: Tensor) -> List[Tensor]: + branch1x1 = self.branch1x1(x) + + branch3x3 = self.branch3x3_1(x) + branch3x3 = [ + self.branch3x3_2a(branch3x3), + self.branch3x3_2b(branch3x3), + ] + branch3x3 = torch.cat(branch3x3, 1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = 
self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = [ + self.branch3x3dbl_3a(branch3x3dbl), + self.branch3x3dbl_3b(branch3x3dbl), + ] + branch3x3dbl = torch.cat(branch3x3dbl, 1) + + branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool] + return outputs + + def forward(self, x: Tensor) -> Tensor: + outputs = self._forward(x) + return torch.cat(outputs, 1) + + +class InceptionAux(nn.Module): + def __init__( + self, in_channels: int, num_classes: int, conv_block: Optional[Callable[..., nn.Module]] = None + ) -> None: + super().__init__() + if conv_block is None: + conv_block = BasicConv2d + self.conv0 = conv_block(in_channels, 128, kernel_size=1) + self.conv1 = conv_block(128, 768, kernel_size=5) + self.conv1.stddev = 0.01 # type: ignore[assignment] + self.fc = nn.Linear(768, num_classes) + self.fc.stddev = 0.001 # type: ignore[assignment] + + def forward(self, x: Tensor) -> Tensor: + # N x 768 x 17 x 17 + x = F.avg_pool2d(x, kernel_size=5, stride=3) + # N x 768 x 5 x 5 + x = self.conv0(x) + # N x 128 x 5 x 5 + x = self.conv1(x) + # N x 768 x 1 x 1 + # Adaptive average pooling + x = F.adaptive_avg_pool2d(x, (1, 1)) + # N x 768 x 1 x 1 + x = torch.flatten(x, 1) + # N x 768 + x = self.fc(x) + # N x 1000 + return x + + +class BasicConv2d(nn.Module): + def __init__(self, in_channels: int, out_channels: int, **kwargs: Any) -> None: + super().__init__() + self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs) + self.bn = nn.BatchNorm2d(out_channels, eps=0.001) + + def forward(self, x: Tensor) -> Tensor: + x = self.conv(x) + x = self.bn(x) + return F.relu(x, inplace=True) + + +def inception_v3(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> Inception3: + r"""Inception v3 model architecture from + `"Rethinking the Inception Architecture for Computer Vision" `_. + The required minimum input size of the model is 75x75. + + .. note:: + **Important**: In contrast to the other models the inception_v3 expects tensors with a size of + N x 3 x 299 x 299, so ensure your images are sized accordingly. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + aux_logits (bool): If True, add an auxiliary branch that can improve training. + Default: *True* + transform_input (bool): If True, preprocesses the input according to the method with which it + was trained on ImageNet. Default: True if ``pretrained=True``, else False. 
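+
+    A minimal usage sketch with randomly initialized weights (in ``eval()``
+    mode the model returns plain logits rather than ``InceptionOutputs``)::
+
+        >>> model = inception_v3(aux_logits=True).eval()
+        >>> logits = model(torch.rand(1, 3, 299, 299))
+        >>> logits.shape
+        torch.Size([1, 1000])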
+ """ + if pretrained: + if "transform_input" not in kwargs: + kwargs["transform_input"] = True + if "aux_logits" in kwargs: + original_aux_logits = kwargs["aux_logits"] + kwargs["aux_logits"] = True + else: + original_aux_logits = True + kwargs["init_weights"] = False # we are loading weights from a pretrained model + model = Inception3(**kwargs) + state_dict = load_state_dict_from_url(model_urls["inception_v3_google"], progress=progress) + model.load_state_dict(state_dict) + if not original_aux_logits: + model.aux_logits = False + model.AuxLogits = None + return model + + return Inception3(**kwargs) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mnasnet.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mnasnet.py new file mode 100644 index 0000000000000000000000000000000000000000..c3d4013f30c8ec7725665a09a7db9cdb41ea9988 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mnasnet.py @@ -0,0 +1,263 @@ +import warnings +from typing import Any, Dict, List + +import torch +import torch.nn as nn +from torch import Tensor + +from .._internally_replaced_utils import load_state_dict_from_url +from ..utils import _log_api_usage_once + +__all__ = ["MNASNet", "mnasnet0_5", "mnasnet0_75", "mnasnet1_0", "mnasnet1_3"] + +_MODEL_URLS = { + "mnasnet0_5": "https://download.pytorch.org/models/mnasnet0.5_top1_67.823-3ffadce67e.pth", + "mnasnet0_75": None, + "mnasnet1_0": "https://download.pytorch.org/models/mnasnet1.0_top1_73.512-f206786ef8.pth", + "mnasnet1_3": None, +} + +# Paper suggests 0.9997 momentum, for TensorFlow. Equivalent PyTorch momentum is +# 1.0 - tensorflow. +_BN_MOMENTUM = 1 - 0.9997 + + +class _InvertedResidual(nn.Module): + def __init__( + self, in_ch: int, out_ch: int, kernel_size: int, stride: int, expansion_factor: int, bn_momentum: float = 0.1 + ) -> None: + super().__init__() + assert stride in [1, 2] + assert kernel_size in [3, 5] + mid_ch = in_ch * expansion_factor + self.apply_residual = in_ch == out_ch and stride == 1 + self.layers = nn.Sequential( + # Pointwise + nn.Conv2d(in_ch, mid_ch, 1, bias=False), + nn.BatchNorm2d(mid_ch, momentum=bn_momentum), + nn.ReLU(inplace=True), + # Depthwise + nn.Conv2d(mid_ch, mid_ch, kernel_size, padding=kernel_size // 2, stride=stride, groups=mid_ch, bias=False), + nn.BatchNorm2d(mid_ch, momentum=bn_momentum), + nn.ReLU(inplace=True), + # Linear pointwise. Note that there's no activation. + nn.Conv2d(mid_ch, out_ch, 1, bias=False), + nn.BatchNorm2d(out_ch, momentum=bn_momentum), + ) + + def forward(self, input: Tensor) -> Tensor: + if self.apply_residual: + return self.layers(input) + input + else: + return self.layers(input) + + +def _stack( + in_ch: int, out_ch: int, kernel_size: int, stride: int, exp_factor: int, repeats: int, bn_momentum: float +) -> nn.Sequential: + """Creates a stack of inverted residuals.""" + assert repeats >= 1 + # First one has no skip, because feature map size changes. + first = _InvertedResidual(in_ch, out_ch, kernel_size, stride, exp_factor, bn_momentum=bn_momentum) + remaining = [] + for _ in range(1, repeats): + remaining.append(_InvertedResidual(out_ch, out_ch, kernel_size, 1, exp_factor, bn_momentum=bn_momentum)) + return nn.Sequential(first, *remaining) + + +def _round_to_multiple_of(val: float, divisor: int, round_up_bias: float = 0.9) -> int: + """Asymmetric rounding to make `val` divisible by `divisor`. With default + bias, will round up, unless the number is no more than 10% greater than the + smaller divisible value, i.e. 
(83, 8) -> 80, but (84, 8) -> 88.""" + assert 0.0 < round_up_bias < 1.0 + new_val = max(divisor, int(val + divisor / 2) // divisor * divisor) + return new_val if new_val >= round_up_bias * val else new_val + divisor + + +def _get_depths(alpha: float) -> List[int]: + """Scales tensor depths as in reference MobileNet code, prefers rouding up + rather than down.""" + depths = [32, 16, 24, 40, 80, 96, 192, 320] + return [_round_to_multiple_of(depth * alpha, 8) for depth in depths] + + +class MNASNet(torch.nn.Module): + """MNASNet, as described in https://arxiv.org/pdf/1807.11626.pdf. This + implements the B1 variant of the model. + >>> model = MNASNet(1.0, num_classes=1000) + >>> x = torch.rand(1, 3, 224, 224) + >>> y = model(x) + >>> y.dim() + 2 + >>> y.nelement() + 1000 + """ + + # Version 2 adds depth scaling in the initial stages of the network. + _version = 2 + + def __init__(self, alpha: float, num_classes: int = 1000, dropout: float = 0.2) -> None: + super().__init__() + _log_api_usage_once(self) + assert alpha > 0.0 + self.alpha = alpha + self.num_classes = num_classes + depths = _get_depths(alpha) + layers = [ + # First layer: regular conv. + nn.Conv2d(3, depths[0], 3, padding=1, stride=2, bias=False), + nn.BatchNorm2d(depths[0], momentum=_BN_MOMENTUM), + nn.ReLU(inplace=True), + # Depthwise separable, no skip. + nn.Conv2d(depths[0], depths[0], 3, padding=1, stride=1, groups=depths[0], bias=False), + nn.BatchNorm2d(depths[0], momentum=_BN_MOMENTUM), + nn.ReLU(inplace=True), + nn.Conv2d(depths[0], depths[1], 1, padding=0, stride=1, bias=False), + nn.BatchNorm2d(depths[1], momentum=_BN_MOMENTUM), + # MNASNet blocks: stacks of inverted residuals. + _stack(depths[1], depths[2], 3, 2, 3, 3, _BN_MOMENTUM), + _stack(depths[2], depths[3], 5, 2, 3, 3, _BN_MOMENTUM), + _stack(depths[3], depths[4], 5, 2, 6, 3, _BN_MOMENTUM), + _stack(depths[4], depths[5], 3, 1, 6, 2, _BN_MOMENTUM), + _stack(depths[5], depths[6], 5, 2, 6, 4, _BN_MOMENTUM), + _stack(depths[6], depths[7], 3, 1, 6, 1, _BN_MOMENTUM), + # Final mapping to classifier input. + nn.Conv2d(depths[7], 1280, 1, padding=0, stride=1, bias=False), + nn.BatchNorm2d(1280, momentum=_BN_MOMENTUM), + nn.ReLU(inplace=True), + ] + self.layers = nn.Sequential(*layers) + self.classifier = nn.Sequential(nn.Dropout(p=dropout, inplace=True), nn.Linear(1280, num_classes)) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + nn.init.kaiming_uniform_(m.weight, mode="fan_out", nonlinearity="sigmoid") + nn.init.zeros_(m.bias) + + def forward(self, x: Tensor) -> Tensor: + x = self.layers(x) + # Equivalent to global avgpool and removing H and W dimensions. + x = x.mean([2, 3]) + return self.classifier(x) + + def _load_from_state_dict( + self, + state_dict: Dict, + prefix: str, + local_metadata: Dict, + strict: bool, + missing_keys: List[str], + unexpected_keys: List[str], + error_msgs: List[str], + ) -> None: + version = local_metadata.get("version", None) + assert version in [1, 2] + + if version == 1 and not self.alpha == 1.0: + # In the initial version of the model (v1), stem was fixed-size. + # All other layer configurations were the same. This will patch + # the model so that it's identical to v1. Model with alpha 1.0 is + # unaffected. 
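+            # Rebuild layers[0:9] (the fixed-width v1 stem plus the first
+            # inverted-residual stack) so parameter shapes match a v1
+            # checkpoint before the parent class loads the state_dict below.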
+ depths = _get_depths(self.alpha) + v1_stem = [ + nn.Conv2d(3, 32, 3, padding=1, stride=2, bias=False), + nn.BatchNorm2d(32, momentum=_BN_MOMENTUM), + nn.ReLU(inplace=True), + nn.Conv2d(32, 32, 3, padding=1, stride=1, groups=32, bias=False), + nn.BatchNorm2d(32, momentum=_BN_MOMENTUM), + nn.ReLU(inplace=True), + nn.Conv2d(32, 16, 1, padding=0, stride=1, bias=False), + nn.BatchNorm2d(16, momentum=_BN_MOMENTUM), + _stack(16, depths[2], 3, 2, 3, 3, _BN_MOMENTUM), + ] + for idx, layer in enumerate(v1_stem): + self.layers[idx] = layer + + # The model is now identical to v1, and must be saved as such. + self._version = 1 + warnings.warn( + "A new version of MNASNet model has been implemented. " + "Your checkpoint was saved using the previous version. " + "This checkpoint will load and work as before, but " + "you may want to upgrade by training a newer model or " + "transfer learning from an updated ImageNet checkpoint.", + UserWarning, + ) + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + +def _load_pretrained(model_name: str, model: nn.Module, progress: bool) -> None: + if model_name not in _MODEL_URLS or _MODEL_URLS[model_name] is None: + raise ValueError(f"No checkpoint is available for model type {model_name}") + checkpoint_url = _MODEL_URLS[model_name] + model.load_state_dict(load_state_dict_from_url(checkpoint_url, progress=progress)) + + +def mnasnet0_5(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> MNASNet: + r"""MNASNet with depth multiplier of 0.5 from + `"MnasNet: Platform-Aware Neural Architecture Search for Mobile" + `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + model = MNASNet(0.5, **kwargs) + if pretrained: + _load_pretrained("mnasnet0_5", model, progress) + return model + + +def mnasnet0_75(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> MNASNet: + r"""MNASNet with depth multiplier of 0.75 from + `"MnasNet: Platform-Aware Neural Architecture Search for Mobile" + `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + model = MNASNet(0.75, **kwargs) + if pretrained: + _load_pretrained("mnasnet0_75", model, progress) + return model + + +def mnasnet1_0(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> MNASNet: + r"""MNASNet with depth multiplier of 1.0 from + `"MnasNet: Platform-Aware Neural Architecture Search for Mobile" + `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + model = MNASNet(1.0, **kwargs) + if pretrained: + _load_pretrained("mnasnet1_0", model, progress) + return model + + +def mnasnet1_3(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> MNASNet: + r"""MNASNet with depth multiplier of 1.3 from + `"MnasNet: Platform-Aware Neural Architecture Search for Mobile" + `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + model = MNASNet(1.3, **kwargs) + if pretrained: + _load_pretrained("mnasnet1_3", model, progress) + return model diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mobilenet.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mobilenet.py new file mode 100644 index 0000000000000000000000000000000000000000..4108305d3f5371ee10f02a7c5fb4858e94b35d5d --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mobilenet.py @@ -0,0 +1,4 @@ +from .mobilenetv2 import MobileNetV2, mobilenet_v2, __all__ as mv2_all +from .mobilenetv3 import MobileNetV3, mobilenet_v3_large, mobilenet_v3_small, __all__ as mv3_all + +__all__ = mv2_all + mv3_all diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mobilenetv2.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mobilenetv2.py new file mode 100644 index 0000000000000000000000000000000000000000..e24c5962d7e20a5e1f2bde83fa330c509b881415 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mobilenetv2.py @@ -0,0 +1,211 @@ +import warnings +from typing import Callable, Any, Optional, List + +import torch +from torch import Tensor +from torch import nn + +from .._internally_replaced_utils import load_state_dict_from_url +from ..ops.misc import ConvNormActivation +from ..utils import _log_api_usage_once +from ._utils import _make_divisible + + +__all__ = ["MobileNetV2", "mobilenet_v2"] + + +model_urls = { + "mobilenet_v2": "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth", +} + + +# necessary for backwards compatibility +class _DeprecatedConvBNAct(ConvNormActivation): + def __init__(self, *args, **kwargs): + warnings.warn( + "The ConvBNReLU/ConvBNActivation classes are deprecated since 0.12 and will be removed in 0.14. 
" + "Use torchvision.ops.misc.ConvNormActivation instead.", + FutureWarning, + ) + if kwargs.get("norm_layer", None) is None: + kwargs["norm_layer"] = nn.BatchNorm2d + if kwargs.get("activation_layer", None) is None: + kwargs["activation_layer"] = nn.ReLU6 + super().__init__(*args, **kwargs) + + +ConvBNReLU = _DeprecatedConvBNAct +ConvBNActivation = _DeprecatedConvBNAct + + +class InvertedResidual(nn.Module): + def __init__( + self, inp: int, oup: int, stride: int, expand_ratio: int, norm_layer: Optional[Callable[..., nn.Module]] = None + ) -> None: + super().__init__() + self.stride = stride + assert stride in [1, 2] + + if norm_layer is None: + norm_layer = nn.BatchNorm2d + + hidden_dim = int(round(inp * expand_ratio)) + self.use_res_connect = self.stride == 1 and inp == oup + + layers: List[nn.Module] = [] + if expand_ratio != 1: + # pw + layers.append( + ConvNormActivation(inp, hidden_dim, kernel_size=1, norm_layer=norm_layer, activation_layer=nn.ReLU6) + ) + layers.extend( + [ + # dw + ConvNormActivation( + hidden_dim, + hidden_dim, + stride=stride, + groups=hidden_dim, + norm_layer=norm_layer, + activation_layer=nn.ReLU6, + ), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + norm_layer(oup), + ] + ) + self.conv = nn.Sequential(*layers) + self.out_channels = oup + self._is_cn = stride > 1 + + def forward(self, x: Tensor) -> Tensor: + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) + + +class MobileNetV2(nn.Module): + def __init__( + self, + num_classes: int = 1000, + width_mult: float = 1.0, + inverted_residual_setting: Optional[List[List[int]]] = None, + round_nearest: int = 8, + block: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + dropout: float = 0.2, + ) -> None: + """ + MobileNet V2 main class + + Args: + num_classes (int): Number of classes + width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount + inverted_residual_setting: Network structure + round_nearest (int): Round the number of channels in each layer to be a multiple of this number + Set to 1 to turn off rounding + block: Module specifying inverted residual building block for mobilenet + norm_layer: Module specifying the normalization layer to use + dropout (float): The droupout probability + + """ + super().__init__() + _log_api_usage_once(self) + + if block is None: + block = InvertedResidual + + if norm_layer is None: + norm_layer = nn.BatchNorm2d + + input_channel = 32 + last_channel = 1280 + + if inverted_residual_setting is None: + inverted_residual_setting = [ + # t, c, n, s + [1, 16, 1, 1], + [6, 24, 2, 2], + [6, 32, 3, 2], + [6, 64, 4, 2], + [6, 96, 3, 1], + [6, 160, 3, 2], + [6, 320, 1, 1], + ] + + # only check the first element, assuming user knows t,c,n,s are required + if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4: + raise ValueError( + f"inverted_residual_setting should be non-empty or a 4-element list, got {inverted_residual_setting}" + ) + + # building first layer + input_channel = _make_divisible(input_channel * width_mult, round_nearest) + self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) + features: List[nn.Module] = [ + ConvNormActivation(3, input_channel, stride=2, norm_layer=norm_layer, activation_layer=nn.ReLU6) + ] + # building inverted residual blocks + for t, c, n, s in inverted_residual_setting: + output_channel = _make_divisible(c * width_mult, round_nearest) + for i in 
range(n): + stride = s if i == 0 else 1 + features.append(block(input_channel, output_channel, stride, expand_ratio=t, norm_layer=norm_layer)) + input_channel = output_channel + # building last several layers + features.append( + ConvNormActivation( + input_channel, self.last_channel, kernel_size=1, norm_layer=norm_layer, activation_layer=nn.ReLU6 + ) + ) + # make it nn.Sequential + self.features = nn.Sequential(*features) + + # building classifier + self.classifier = nn.Sequential( + nn.Dropout(p=dropout), + nn.Linear(self.last_channel, num_classes), + ) + + # weight initialization + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out") + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + nn.init.zeros_(m.bias) + + def _forward_impl(self, x: Tensor) -> Tensor: + # This exists since TorchScript doesn't support inheritance, so the superclass method + # (this one) needs to have a name other than `forward` that can be accessed in a subclass + x = self.features(x) + # Cannot use "squeeze" as batch-size can be 1 + x = nn.functional.adaptive_avg_pool2d(x, (1, 1)) + x = torch.flatten(x, 1) + x = self.classifier(x) + return x + + def forward(self, x: Tensor) -> Tensor: + return self._forward_impl(x) + + +def mobilenet_v2(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> MobileNetV2: + """ + Constructs a MobileNetV2 architecture from + `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + model = MobileNetV2(**kwargs) + if pretrained: + state_dict = load_state_dict_from_url(model_urls["mobilenet_v2"], progress=progress) + model.load_state_dict(state_dict) + return model diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mobilenetv3.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mobilenetv3.py new file mode 100644 index 0000000000000000000000000000000000000000..711888b7c8b236e3f370ec5833b8a875dce7883f --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/mobilenetv3.py @@ -0,0 +1,328 @@ +import warnings +from functools import partial +from typing import Any, Callable, List, Optional, Sequence + +import torch +from torch import nn, Tensor + +from .._internally_replaced_utils import load_state_dict_from_url +from ..ops.misc import ConvNormActivation, SqueezeExcitation as SElayer +from ..utils import _log_api_usage_once +from ._utils import _make_divisible + + +__all__ = ["MobileNetV3", "mobilenet_v3_large", "mobilenet_v3_small"] + + +model_urls = { + "mobilenet_v3_large": "https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth", + "mobilenet_v3_small": "https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth", +} + + +class SqueezeExcitation(SElayer): + """DEPRECATED""" + + def __init__(self, input_channels: int, squeeze_factor: int = 4): + squeeze_channels = _make_divisible(input_channels // squeeze_factor, 8) + super().__init__(input_channels, squeeze_channels, scale_activation=nn.Hardsigmoid) + self.relu = self.activation + delattr(self, "activation") + warnings.warn( + "This SqueezeExcitation class is deprecated since 0.12 and will be removed in 0.14. 
" + "Use torchvision.ops.SqueezeExcitation instead.", + FutureWarning, + ) + + +class InvertedResidualConfig: + # Stores information listed at Tables 1 and 2 of the MobileNetV3 paper + def __init__( + self, + input_channels: int, + kernel: int, + expanded_channels: int, + out_channels: int, + use_se: bool, + activation: str, + stride: int, + dilation: int, + width_mult: float, + ): + self.input_channels = self.adjust_channels(input_channels, width_mult) + self.kernel = kernel + self.expanded_channels = self.adjust_channels(expanded_channels, width_mult) + self.out_channels = self.adjust_channels(out_channels, width_mult) + self.use_se = use_se + self.use_hs = activation == "HS" + self.stride = stride + self.dilation = dilation + + @staticmethod + def adjust_channels(channels: int, width_mult: float): + return _make_divisible(channels * width_mult, 8) + + +class InvertedResidual(nn.Module): + # Implemented as described at section 5 of MobileNetV3 paper + def __init__( + self, + cnf: InvertedResidualConfig, + norm_layer: Callable[..., nn.Module], + se_layer: Callable[..., nn.Module] = partial(SElayer, scale_activation=nn.Hardsigmoid), + ): + super().__init__() + if not (1 <= cnf.stride <= 2): + raise ValueError("illegal stride value") + + self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels + + layers: List[nn.Module] = [] + activation_layer = nn.Hardswish if cnf.use_hs else nn.ReLU + + # expand + if cnf.expanded_channels != cnf.input_channels: + layers.append( + ConvNormActivation( + cnf.input_channels, + cnf.expanded_channels, + kernel_size=1, + norm_layer=norm_layer, + activation_layer=activation_layer, + ) + ) + + # depthwise + stride = 1 if cnf.dilation > 1 else cnf.stride + layers.append( + ConvNormActivation( + cnf.expanded_channels, + cnf.expanded_channels, + kernel_size=cnf.kernel, + stride=stride, + dilation=cnf.dilation, + groups=cnf.expanded_channels, + norm_layer=norm_layer, + activation_layer=activation_layer, + ) + ) + if cnf.use_se: + squeeze_channels = _make_divisible(cnf.expanded_channels // 4, 8) + layers.append(se_layer(cnf.expanded_channels, squeeze_channels)) + + # project + layers.append( + ConvNormActivation( + cnf.expanded_channels, cnf.out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=None + ) + ) + + self.block = nn.Sequential(*layers) + self.out_channels = cnf.out_channels + self._is_cn = cnf.stride > 1 + + def forward(self, input: Tensor) -> Tensor: + result = self.block(input) + if self.use_res_connect: + result += input + return result + + +class MobileNetV3(nn.Module): + def __init__( + self, + inverted_residual_setting: List[InvertedResidualConfig], + last_channel: int, + num_classes: int = 1000, + block: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + dropout: float = 0.2, + **kwargs: Any, + ) -> None: + """ + MobileNet V3 main class + + Args: + inverted_residual_setting (List[InvertedResidualConfig]): Network structure + last_channel (int): The number of channels on the penultimate layer + num_classes (int): Number of classes + block (Optional[Callable[..., nn.Module]]): Module specifying inverted residual building block for mobilenet + norm_layer (Optional[Callable[..., nn.Module]]): Module specifying the normalization layer to use + dropout (float): The droupout probability + """ + super().__init__() + _log_api_usage_once(self) + + if not inverted_residual_setting: + raise ValueError("The inverted_residual_setting should not be empty") + elif not ( + 
isinstance(inverted_residual_setting, Sequence) + and all([isinstance(s, InvertedResidualConfig) for s in inverted_residual_setting]) + ): + raise TypeError("The inverted_residual_setting should be List[InvertedResidualConfig]") + + if block is None: + block = InvertedResidual + + if norm_layer is None: + norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.01) + + layers: List[nn.Module] = [] + + # building first layer + firstconv_output_channels = inverted_residual_setting[0].input_channels + layers.append( + ConvNormActivation( + 3, + firstconv_output_channels, + kernel_size=3, + stride=2, + norm_layer=norm_layer, + activation_layer=nn.Hardswish, + ) + ) + + # building inverted residual blocks + for cnf in inverted_residual_setting: + layers.append(block(cnf, norm_layer)) + + # building last several layers + lastconv_input_channels = inverted_residual_setting[-1].out_channels + lastconv_output_channels = 6 * lastconv_input_channels + layers.append( + ConvNormActivation( + lastconv_input_channels, + lastconv_output_channels, + kernel_size=1, + norm_layer=norm_layer, + activation_layer=nn.Hardswish, + ) + ) + + self.features = nn.Sequential(*layers) + self.avgpool = nn.AdaptiveAvgPool2d(1) + self.classifier = nn.Sequential( + nn.Linear(lastconv_output_channels, last_channel), + nn.Hardswish(inplace=True), + nn.Dropout(p=dropout, inplace=True), + nn.Linear(last_channel, num_classes), + ) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out") + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + nn.init.zeros_(m.bias) + + def _forward_impl(self, x: Tensor) -> Tensor: + x = self.features(x) + + x = self.avgpool(x) + x = torch.flatten(x, 1) + + x = self.classifier(x) + + return x + + def forward(self, x: Tensor) -> Tensor: + return self._forward_impl(x) + + +def _mobilenet_v3_conf( + arch: str, width_mult: float = 1.0, reduced_tail: bool = False, dilated: bool = False, **kwargs: Any +): + reduce_divider = 2 if reduced_tail else 1 + dilation = 2 if dilated else 1 + + bneck_conf = partial(InvertedResidualConfig, width_mult=width_mult) + adjust_channels = partial(InvertedResidualConfig.adjust_channels, width_mult=width_mult) + + if arch == "mobilenet_v3_large": + inverted_residual_setting = [ + bneck_conf(16, 3, 16, 16, False, "RE", 1, 1), + bneck_conf(16, 3, 64, 24, False, "RE", 2, 1), # C1 + bneck_conf(24, 3, 72, 24, False, "RE", 1, 1), + bneck_conf(24, 5, 72, 40, True, "RE", 2, 1), # C2 + bneck_conf(40, 5, 120, 40, True, "RE", 1, 1), + bneck_conf(40, 5, 120, 40, True, "RE", 1, 1), + bneck_conf(40, 3, 240, 80, False, "HS", 2, 1), # C3 + bneck_conf(80, 3, 200, 80, False, "HS", 1, 1), + bneck_conf(80, 3, 184, 80, False, "HS", 1, 1), + bneck_conf(80, 3, 184, 80, False, "HS", 1, 1), + bneck_conf(80, 3, 480, 112, True, "HS", 1, 1), + bneck_conf(112, 3, 672, 112, True, "HS", 1, 1), + bneck_conf(112, 5, 672, 160 // reduce_divider, True, "HS", 2, dilation), # C4 + bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1, dilation), + bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1, dilation), + ] + last_channel = adjust_channels(1280 // reduce_divider) # C5 + elif arch == "mobilenet_v3_small": + inverted_residual_setting = [ + bneck_conf(16, 3, 16, 16, True, "RE", 2, 1), # C1 + 
bneck_conf(16, 3, 72, 24, False, "RE", 2, 1), # C2 + bneck_conf(24, 3, 88, 24, False, "RE", 1, 1), + bneck_conf(24, 5, 96, 40, True, "HS", 2, 1), # C3 + bneck_conf(40, 5, 240, 40, True, "HS", 1, 1), + bneck_conf(40, 5, 240, 40, True, "HS", 1, 1), + bneck_conf(40, 5, 120, 48, True, "HS", 1, 1), + bneck_conf(48, 5, 144, 48, True, "HS", 1, 1), + bneck_conf(48, 5, 288, 96 // reduce_divider, True, "HS", 2, dilation), # C4 + bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider, 96 // reduce_divider, True, "HS", 1, dilation), + bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider, 96 // reduce_divider, True, "HS", 1, dilation), + ] + last_channel = adjust_channels(1024 // reduce_divider) # C5 + else: + raise ValueError(f"Unsupported model type {arch}") + + return inverted_residual_setting, last_channel + + +def _mobilenet_v3( + arch: str, + inverted_residual_setting: List[InvertedResidualConfig], + last_channel: int, + pretrained: bool, + progress: bool, + **kwargs: Any, +): + model = MobileNetV3(inverted_residual_setting, last_channel, **kwargs) + if pretrained: + if model_urls.get(arch, None) is None: + raise ValueError(f"No checkpoint is available for model type {arch}") + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + +def mobilenet_v3_large(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> MobileNetV3: + """ + Constructs a large MobileNetV3 architecture from + `"Searching for MobileNetV3" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + arch = "mobilenet_v3_large" + inverted_residual_setting, last_channel = _mobilenet_v3_conf(arch, **kwargs) + return _mobilenet_v3(arch, inverted_residual_setting, last_channel, pretrained, progress, **kwargs) + + +def mobilenet_v3_small(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> MobileNetV3: + """ + Constructs a small MobileNetV3 architecture from + `"Searching for MobileNetV3" `_. 
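# Illustrative sketch (not from the patch itself) of how the helpers above fit
# together: _mobilenet_v3_conf produces the per-block settings that the
# MobileNetV3 constructor consumes. Assumes the imports already at the top of
# this module (torch, nn); shapes are the usual ImageNet ones.
settings, last_channel = _mobilenet_v3_conf("mobilenet_v3_large", width_mult=1.0)
model = MobileNetV3(settings, last_channel, num_classes=1000)
logits = model(torch.randn(1, 3, 224, 224))  # -> tensor of shape (1, 1000)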
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + arch = "mobilenet_v3_small" + inverted_residual_setting, last_channel = _mobilenet_v3_conf(arch, **kwargs) + return _mobilenet_v3(arch, inverted_residual_setting, last_channel, pretrained, progress, **kwargs) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/regnet.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/regnet.py new file mode 100644 index 0000000000000000000000000000000000000000..3f393c8e82d9bb13a5b7b571313fca4343b7e237 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/regnet.py @@ -0,0 +1,608 @@ +# Modified from +# https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/anynet.py +# https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/regnet.py + + +import math +from collections import OrderedDict +from functools import partial +from typing import Any, Callable, List, Optional, Tuple + +import torch +from torch import nn, Tensor + +from .._internally_replaced_utils import load_state_dict_from_url +from ..ops.misc import ConvNormActivation, SqueezeExcitation +from ..utils import _log_api_usage_once +from ._utils import _make_divisible + + +__all__ = [ + "RegNet", + "regnet_y_400mf", + "regnet_y_800mf", + "regnet_y_1_6gf", + "regnet_y_3_2gf", + "regnet_y_8gf", + "regnet_y_16gf", + "regnet_y_32gf", + "regnet_y_128gf", + "regnet_x_400mf", + "regnet_x_800mf", + "regnet_x_1_6gf", + "regnet_x_3_2gf", + "regnet_x_8gf", + "regnet_x_16gf", + "regnet_x_32gf", +] + + +model_urls = { + "regnet_y_400mf": "https://download.pytorch.org/models/regnet_y_400mf-c65dace8.pth", + "regnet_y_800mf": "https://download.pytorch.org/models/regnet_y_800mf-1b27b58c.pth", + "regnet_y_1_6gf": "https://download.pytorch.org/models/regnet_y_1_6gf-b11a554e.pth", + "regnet_y_3_2gf": "https://download.pytorch.org/models/regnet_y_3_2gf-b5a9779c.pth", + "regnet_y_8gf": "https://download.pytorch.org/models/regnet_y_8gf-d0d0e4a8.pth", + "regnet_y_16gf": "https://download.pytorch.org/models/regnet_y_16gf-9e6ed7dd.pth", + "regnet_y_32gf": "https://download.pytorch.org/models/regnet_y_32gf-4dee3f7a.pth", + "regnet_x_400mf": "https://download.pytorch.org/models/regnet_x_400mf-adf1edd5.pth", + "regnet_x_800mf": "https://download.pytorch.org/models/regnet_x_800mf-ad17e45c.pth", + "regnet_x_1_6gf": "https://download.pytorch.org/models/regnet_x_1_6gf-e3633e7f.pth", + "regnet_x_3_2gf": "https://download.pytorch.org/models/regnet_x_3_2gf-f342aeae.pth", + "regnet_x_8gf": "https://download.pytorch.org/models/regnet_x_8gf-03ceed89.pth", + "regnet_x_16gf": "https://download.pytorch.org/models/regnet_x_16gf-2007eb11.pth", + "regnet_x_32gf": "https://download.pytorch.org/models/regnet_x_32gf-9d47f8d0.pth", +} + + +class SimpleStemIN(ConvNormActivation): + """Simple stem for ImageNet: 3x3, BN, ReLU.""" + + def __init__( + self, + width_in: int, + width_out: int, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], + ) -> None: + super().__init__( + width_in, width_out, kernel_size=3, stride=2, norm_layer=norm_layer, activation_layer=activation_layer + ) + + +class BottleneckTransform(nn.Sequential): + """Bottleneck transformation: 1x1, 3x3 [+SE], 1x1.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], + 
group_width: int, + bottleneck_multiplier: float, + se_ratio: Optional[float], + ) -> None: + layers: OrderedDict[str, nn.Module] = OrderedDict() + w_b = int(round(width_out * bottleneck_multiplier)) + g = w_b // group_width + + layers["a"] = ConvNormActivation( + width_in, w_b, kernel_size=1, stride=1, norm_layer=norm_layer, activation_layer=activation_layer + ) + layers["b"] = ConvNormActivation( + w_b, w_b, kernel_size=3, stride=stride, groups=g, norm_layer=norm_layer, activation_layer=activation_layer + ) + + if se_ratio: + # The SE reduction ratio is defined with respect to the + # beginning of the block + width_se_out = int(round(se_ratio * width_in)) + layers["se"] = SqueezeExcitation( + input_channels=w_b, + squeeze_channels=width_se_out, + activation=activation_layer, + ) + + layers["c"] = ConvNormActivation( + w_b, width_out, kernel_size=1, stride=1, norm_layer=norm_layer, activation_layer=None + ) + super().__init__(layers) + + +class ResBottleneckBlock(nn.Module): + """Residual bottleneck block: x + F(x), F = bottleneck transform.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], + group_width: int = 1, + bottleneck_multiplier: float = 1.0, + se_ratio: Optional[float] = None, + ) -> None: + super().__init__() + + # Use skip connection with projection if shape changes + self.proj = None + should_proj = (width_in != width_out) or (stride != 1) + if should_proj: + self.proj = ConvNormActivation( + width_in, width_out, kernel_size=1, stride=stride, norm_layer=norm_layer, activation_layer=None + ) + self.f = BottleneckTransform( + width_in, + width_out, + stride, + norm_layer, + activation_layer, + group_width, + bottleneck_multiplier, + se_ratio, + ) + self.activation = activation_layer(inplace=True) + + def forward(self, x: Tensor) -> Tensor: + if self.proj is not None: + x = self.proj(x) + self.f(x) + else: + x = x + self.f(x) + return self.activation(x) + + +class AnyStage(nn.Sequential): + """AnyNet stage (sequence of blocks w/ the same output shape).""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + depth: int, + block_constructor: Callable[..., nn.Module], + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], + group_width: int, + bottleneck_multiplier: float, + se_ratio: Optional[float] = None, + stage_index: int = 0, + ) -> None: + super().__init__() + + for i in range(depth): + block = block_constructor( + width_in if i == 0 else width_out, + width_out, + stride if i == 0 else 1, + norm_layer, + activation_layer, + group_width, + bottleneck_multiplier, + se_ratio, + ) + + self.add_module(f"block{stage_index}-{i}", block) + + +class BlockParams: + def __init__( + self, + depths: List[int], + widths: List[int], + group_widths: List[int], + bottleneck_multipliers: List[float], + strides: List[int], + se_ratio: Optional[float] = None, + ) -> None: + self.depths = depths + self.widths = widths + self.group_widths = group_widths + self.bottleneck_multipliers = bottleneck_multipliers + self.strides = strides + self.se_ratio = se_ratio + + @classmethod + def from_init_params( + cls, + depth: int, + w_0: int, + w_a: float, + w_m: float, + group_width: int, + bottleneck_multiplier: float = 1.0, + se_ratio: Optional[float] = None, + **kwargs: Any, + ) -> "BlockParams": + """ + Programatically compute all the per-block settings, + given the RegNet parameters. 
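# Illustrative sketch of a single ResBottleneckBlock defined earlier in this
# file. The values are arbitrary and only meant to show the expected shapes:
# a stride-2 block changes both the channel count and the resolution, so the
# projection shortcut is used.
block = ResBottleneckBlock(
    width_in=64,
    width_out=128,
    stride=2,
    norm_layer=nn.BatchNorm2d,
    activation_layer=nn.ReLU,
    group_width=16,            # 128 bottleneck channels / 16 per group -> 8 groups
    bottleneck_multiplier=1.0,
    se_ratio=0.25,             # SE squeeze width = round(0.25 * width_in) = 16
)
y = block(torch.randn(1, 64, 56, 56))  # -> shape (1, 128, 28, 28)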
+ + The first step is to compute the quantized linear block parameters, + in log space. Key parameters are: + - `w_a` is the width progression slope + - `w_0` is the initial width + - `w_m` is the width stepping in the log space + + In other terms + `log(block_width) = log(w_0) + w_m * block_capacity`, + with `bock_capacity` ramping up following the w_0 and w_a params. + This block width is finally quantized to multiples of 8. + + The second step is to compute the parameters per stage, + taking into account the skip connection and the final 1x1 convolutions. + We use the fact that the output width is constant within a stage. + """ + + QUANT = 8 + STRIDE = 2 + + if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: + raise ValueError("Invalid RegNet settings") + # Compute the block widths. Each stage has one unique block width + widths_cont = torch.arange(depth) * w_a + w_0 + block_capacity = torch.round(torch.log(widths_cont / w_0) / math.log(w_m)) + block_widths = (torch.round(torch.divide(w_0 * torch.pow(w_m, block_capacity), QUANT)) * QUANT).int().tolist() + num_stages = len(set(block_widths)) + + # Convert to per stage parameters + split_helper = zip( + block_widths + [0], + [0] + block_widths, + block_widths + [0], + [0] + block_widths, + ) + splits = [w != wp or r != rp for w, wp, r, rp in split_helper] + + stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] + stage_depths = torch.diff(torch.tensor([d for d, t in enumerate(splits) if t])).int().tolist() + + strides = [STRIDE] * num_stages + bottleneck_multipliers = [bottleneck_multiplier] * num_stages + group_widths = [group_width] * num_stages + + # Adjust the compatibility of stage widths and group widths + stage_widths, group_widths = cls._adjust_widths_groups_compatibilty( + stage_widths, bottleneck_multipliers, group_widths + ) + + return cls( + depths=stage_depths, + widths=stage_widths, + group_widths=group_widths, + bottleneck_multipliers=bottleneck_multipliers, + strides=strides, + se_ratio=se_ratio, + ) + + def _get_expanded_params(self): + return zip(self.widths, self.strides, self.depths, self.group_widths, self.bottleneck_multipliers) + + @staticmethod + def _adjust_widths_groups_compatibilty( + stage_widths: List[int], bottleneck_ratios: List[float], group_widths: List[int] + ) -> Tuple[List[int], List[int]]: + """ + Adjusts the compatibility of widths and groups, + depending on the bottleneck ratio. 
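# Worked sketch of the width quantization that from_init_params (above)
# applies. The parameter values here are only illustrative; the formulas are
# the ones used in the method body.
import math
import torch
depth, w_0, w_a, w_m, QUANT = 16, 48, 27.89, 2.09, 8
widths_cont = torch.arange(depth) * w_a + w_0                      # linear ramp of widths
block_capacity = torch.round(torch.log(widths_cont / w_0) / math.log(w_m))
block_widths = (torch.round(w_0 * torch.pow(w_m, block_capacity) / QUANT) * QUANT).int().tolist()
# Each distinct value in block_widths becomes one stage width; the number of
# repeats of that value becomes the stage depth.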
+ """ + # Compute all widths for the current settings + widths = [int(w * b) for w, b in zip(stage_widths, bottleneck_ratios)] + group_widths_min = [min(g, w_bot) for g, w_bot in zip(group_widths, widths)] + + # Compute the adjusted widths so that stage and group widths fit + ws_bot = [_make_divisible(w_bot, g) for w_bot, g in zip(widths, group_widths_min)] + stage_widths = [int(w_bot / b) for w_bot, b in zip(ws_bot, bottleneck_ratios)] + return stage_widths, group_widths_min + + +class RegNet(nn.Module): + def __init__( + self, + block_params: BlockParams, + num_classes: int = 1000, + stem_width: int = 32, + stem_type: Optional[Callable[..., nn.Module]] = None, + block_type: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + activation: Optional[Callable[..., nn.Module]] = None, + ) -> None: + super().__init__() + _log_api_usage_once(self) + + if stem_type is None: + stem_type = SimpleStemIN + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if block_type is None: + block_type = ResBottleneckBlock + if activation is None: + activation = nn.ReLU + + # Ad hoc stem + self.stem = stem_type( + 3, # width_in + stem_width, + norm_layer, + activation, + ) + + current_width = stem_width + + blocks = [] + for i, ( + width_out, + stride, + depth, + group_width, + bottleneck_multiplier, + ) in enumerate(block_params._get_expanded_params()): + blocks.append( + ( + f"block{i+1}", + AnyStage( + current_width, + width_out, + stride, + depth, + block_type, + norm_layer, + activation, + group_width, + bottleneck_multiplier, + block_params.se_ratio, + stage_index=i + 1, + ), + ) + ) + + current_width = width_out + + self.trunk_output = nn.Sequential(OrderedDict(blocks)) + + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(in_features=current_width, out_features=num_classes) + + # Performs ResNet-style weight initialization + for m in self.modules(): + if isinstance(m, nn.Conv2d): + # Note that there is no bias due to BN + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + nn.init.normal_(m.weight, mean=0.0, std=math.sqrt(2.0 / fan_out)) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, mean=0.0, std=0.01) + nn.init.zeros_(m.bias) + + def forward(self, x: Tensor) -> Tensor: + x = self.stem(x) + x = self.trunk_output(x) + + x = self.avgpool(x) + x = x.flatten(start_dim=1) + x = self.fc(x) + + return x + + +def _regnet(arch: str, block_params: BlockParams, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: + norm_layer = kwargs.pop("norm_layer", partial(nn.BatchNorm2d, eps=1e-05, momentum=0.1)) + model = RegNet(block_params, norm_layer=norm_layer, **kwargs) + if pretrained: + if arch not in model_urls: + raise ValueError(f"No checkpoint is available for model type {arch}") + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + +def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_400MF architecture from + `"Designing Network Design Spaces" `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8, se_ratio=0.25, **kwargs) + return _regnet("regnet_y_400mf", params, pretrained, progress, **kwargs) + + +def regnet_y_800mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_800MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params(depth=14, w_0=56, w_a=38.84, w_m=2.4, group_width=16, se_ratio=0.25, **kwargs) + return _regnet("regnet_y_800mf", params, pretrained, progress, **kwargs) + + +def regnet_y_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_1.6GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params( + depth=27, w_0=48, w_a=20.71, w_m=2.65, group_width=24, se_ratio=0.25, **kwargs + ) + return _regnet("regnet_y_1_6gf", params, pretrained, progress, **kwargs) + + +def regnet_y_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_3.2GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params( + depth=21, w_0=80, w_a=42.63, w_m=2.66, group_width=24, se_ratio=0.25, **kwargs + ) + return _regnet("regnet_y_3_2gf", params, pretrained, progress, **kwargs) + + +def regnet_y_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_8GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params( + depth=17, w_0=192, w_a=76.82, w_m=2.19, group_width=56, se_ratio=0.25, **kwargs + ) + return _regnet("regnet_y_8gf", params, pretrained, progress, **kwargs) + + +def regnet_y_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_16GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params( + depth=18, w_0=200, w_a=106.23, w_m=2.48, group_width=112, se_ratio=0.25, **kwargs + ) + return _regnet("regnet_y_16gf", params, pretrained, progress, **kwargs) + + +def regnet_y_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_32GF architecture from + `"Designing Network Design Spaces" `_. 
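# Sketch of overriding the normalization layer: _regnet pops an optional
# "norm_layer" kwarg before building the RegNet, so the BatchNorm settings can
# be changed without touching the constructors. Values are illustrative;
# partial is already imported at the top of this module.
from functools import partial
model = regnet_y_800mf(pretrained=False, norm_layer=partial(nn.BatchNorm2d, eps=1e-3, momentum=0.03))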
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params( + depth=20, w_0=232, w_a=115.89, w_m=2.53, group_width=232, se_ratio=0.25, **kwargs + ) + return _regnet("regnet_y_32gf", params, pretrained, progress, **kwargs) + + +def regnet_y_128gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_128GF architecture from + `"Designing Network Design Spaces" `_. + NOTE: Pretrained weights are not available for this model. + """ + params = BlockParams.from_init_params( + depth=27, w_0=456, w_a=160.83, w_m=2.52, group_width=264, se_ratio=0.25, **kwargs + ) + return _regnet("regnet_y_128gf", params, pretrained, progress, **kwargs) + + +def regnet_x_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_400MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params(depth=22, w_0=24, w_a=24.48, w_m=2.54, group_width=16, **kwargs) + return _regnet("regnet_x_400mf", params, pretrained, progress, **kwargs) + + +def regnet_x_800mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_800MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params(depth=16, w_0=56, w_a=35.73, w_m=2.28, group_width=16, **kwargs) + return _regnet("regnet_x_800mf", params, pretrained, progress, **kwargs) + + +def regnet_x_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_1.6GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params(depth=18, w_0=80, w_a=34.01, w_m=2.25, group_width=24, **kwargs) + return _regnet("regnet_x_1_6gf", params, pretrained, progress, **kwargs) + + +def regnet_x_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_3.2GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params(depth=25, w_0=88, w_a=26.31, w_m=2.25, group_width=48, **kwargs) + return _regnet("regnet_x_3_2gf", params, pretrained, progress, **kwargs) + + +def regnet_x_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_8GF architecture from + `"Designing Network Design Spaces" `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params(depth=23, w_0=80, w_a=49.56, w_m=2.88, group_width=120, **kwargs) + return _regnet("regnet_x_8gf", params, pretrained, progress, **kwargs) + + +def regnet_x_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_16GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params(depth=22, w_0=216, w_a=55.59, w_m=2.1, group_width=128, **kwargs) + return _regnet("regnet_x_16gf", params, pretrained, progress, **kwargs) + + +def regnet_x_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_32GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = BlockParams.from_init_params(depth=23, w_0=320, w_a=69.86, w_m=2.0, group_width=168, **kwargs) + return _regnet("regnet_x_32gf", params, pretrained, progress, **kwargs) + + +# TODO(kazhang): Add RegNetZ_500MF and RegNetZ_4GF diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/resnet.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..b0bb8d13adec56fef2bd76752889d46379a0d539 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/resnet.py @@ -0,0 +1,413 @@ +from typing import Type, Any, Callable, Union, List, Optional + +import torch +import torch.nn as nn +from torch import Tensor + +from .._internally_replaced_utils import load_state_dict_from_url +from ..utils import _log_api_usage_once + + +__all__ = [ + "ResNet", + "resnet18", + "resnet34", + "resnet50", + "resnet101", + "resnet152", + "resnext50_32x4d", + "resnext101_32x8d", + "wide_resnet50_2", + "wide_resnet101_2", +] + + +model_urls = { + "resnet18": "https://download.pytorch.org/models/resnet18-f37072fd.pth", + "resnet34": "https://download.pytorch.org/models/resnet34-b627a593.pth", + "resnet50": "https://download.pytorch.org/models/resnet50-0676ba61.pth", + "resnet101": "https://download.pytorch.org/models/resnet101-63fe2227.pth", + "resnet152": "https://download.pytorch.org/models/resnet152-394f9c45.pth", + "resnext50_32x4d": "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth", + "resnext101_32x8d": "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth", + "wide_resnet50_2": "https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth", + "wide_resnet101_2": "https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth", +} + + +def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d: + """3x3 convolution with padding""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation, + ) + + +def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d: + """1x1 convolution""" + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) + + +class 
BasicBlock(nn.Module): + expansion: int = 1 + + def __init__( + self, + inplanes: int, + planes: int, + stride: int = 1, + downsample: Optional[nn.Module] = None, + groups: int = 1, + base_width: int = 64, + dilation: int = 1, + norm_layer: Optional[Callable[..., nn.Module]] = None, + ) -> None: + super().__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError("BasicBlock only supports groups=1 and base_width=64") + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x: Tensor) -> Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) + # while original implementation places the stride at the first 1x1 convolution(self.conv1) + # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. + # This variant is also known as ResNet V1.5 and improves accuracy according to + # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. + + expansion: int = 4 + + def __init__( + self, + inplanes: int, + planes: int, + stride: int = 1, + downsample: Optional[nn.Module] = None, + groups: int = 1, + base_width: int = 64, + dilation: int = 1, + norm_layer: Optional[Callable[..., nn.Module]] = None, + ) -> None: + super().__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.0)) * groups + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x: Tensor) -> Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + def __init__( + self, + block: Type[Union[BasicBlock, Bottleneck]], + layers: List[int], + num_classes: int = 1000, + zero_init_residual: bool = False, + groups: int = 1, + width_per_group: int = 64, + replace_stride_with_dilation: Optional[List[bool]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + ) -> None: + super().__init__() + _log_api_usage_once(self) + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should 
replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError( + "replace_stride_with_dilation should be None " + f"or a 3-element tuple, got {replace_stride_with_dilation}" + ) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2]) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(512 * block.expansion, num_classes) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. + # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + if zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + nn.init.constant_(m.bn3.weight, 0) # type: ignore[arg-type] + elif isinstance(m, BasicBlock): + nn.init.constant_(m.bn2.weight, 0) # type: ignore[arg-type] + + def _make_layer( + self, + block: Type[Union[BasicBlock, Bottleneck]], + planes: int, + blocks: int, + stride: int = 1, + dilate: bool = False, + ) -> nn.Sequential: + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append( + block( + self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer + ) + ) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block( + self.inplanes, + planes, + groups=self.groups, + base_width=self.base_width, + dilation=self.dilation, + norm_layer=norm_layer, + ) + ) + + return nn.Sequential(*layers) + + def _forward_impl(self, x: Tensor) -> Tensor: + # See note [TorchScript super()] + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.fc(x) + + return x + + def forward(self, x: Tensor) -> Tensor: + return self._forward_impl(x) + + +def _resnet( + arch: str, + block: Type[Union[BasicBlock, Bottleneck]], + layers: List[int], + pretrained: bool, + progress: bool, + **kwargs: Any, +) -> ResNet: + model = ResNet(block, layers, **kwargs) + if pretrained: + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + +def resnet18(pretrained: bool = False, progress: bool 
= True, **kwargs: Any) -> ResNet: + r"""ResNet-18 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet("resnet18", BasicBlock, [2, 2, 2, 2], pretrained, progress, **kwargs) + + +def resnet34(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNet-34 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet("resnet34", BasicBlock, [3, 4, 6, 3], pretrained, progress, **kwargs) + + +def resnet50(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNet-50 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet("resnet50", Bottleneck, [3, 4, 6, 3], pretrained, progress, **kwargs) + + +def resnet101(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNet-101 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet("resnet101", Bottleneck, [3, 4, 23, 3], pretrained, progress, **kwargs) + + +def resnet152(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNet-152 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet("resnet152", Bottleneck, [3, 8, 36, 3], pretrained, progress, **kwargs) + + +def resnext50_32x4d(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNeXt-50 32x4d model from + `"Aggregated Residual Transformation for Deep Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs["groups"] = 32 + kwargs["width_per_group"] = 4 + return _resnet("resnext50_32x4d", Bottleneck, [3, 4, 6, 3], pretrained, progress, **kwargs) + + +def resnext101_32x8d(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNeXt-101 32x8d model from + `"Aggregated Residual Transformation for Deep Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs["groups"] = 32 + kwargs["width_per_group"] = 8 + return _resnet("resnext101_32x8d", Bottleneck, [3, 4, 23, 3], pretrained, progress, **kwargs) + + +def wide_resnet50_2(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""Wide ResNet-50-2 model from + `"Wide Residual Networks" `_. + + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. 
last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs["width_per_group"] = 64 * 2 + return _resnet("wide_resnet50_2", Bottleneck, [3, 4, 6, 3], pretrained, progress, **kwargs) + + +def wide_resnet101_2(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""Wide ResNet-101-2 model from + `"Wide Residual Networks" `_. + + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs["width_per_group"] = 64 * 2 + return _resnet("wide_resnet101_2", Bottleneck, [3, 4, 23, 3], pretrained, progress, **kwargs) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/shufflenetv2.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/shufflenetv2.py new file mode 100644 index 0000000000000000000000000000000000000000..f3758c54aaf46cdd57c67fd217e68131ade903fb --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/shufflenetv2.py @@ -0,0 +1,222 @@ +from typing import Callable, Any, List + +import torch +import torch.nn as nn +from torch import Tensor + +from .._internally_replaced_utils import load_state_dict_from_url +from ..utils import _log_api_usage_once + + +__all__ = ["ShuffleNetV2", "shufflenet_v2_x0_5", "shufflenet_v2_x1_0", "shufflenet_v2_x1_5", "shufflenet_v2_x2_0"] + +model_urls = { + "shufflenetv2_x0.5": "https://download.pytorch.org/models/shufflenetv2_x0.5-f707e7126e.pth", + "shufflenetv2_x1.0": "https://download.pytorch.org/models/shufflenetv2_x1-5666bf0f80.pth", + "shufflenetv2_x1.5": None, + "shufflenetv2_x2.0": None, +} + + +def channel_shuffle(x: Tensor, groups: int) -> Tensor: + batchsize, num_channels, height, width = x.size() + channels_per_group = num_channels // groups + + # reshape + x = x.view(batchsize, groups, channels_per_group, height, width) + + x = torch.transpose(x, 1, 2).contiguous() + + # flatten + x = x.view(batchsize, -1, height, width) + + return x + + +class InvertedResidual(nn.Module): + def __init__(self, inp: int, oup: int, stride: int) -> None: + super().__init__() + + if not (1 <= stride <= 3): + raise ValueError("illegal stride value") + self.stride = stride + + branch_features = oup // 2 + assert (self.stride != 1) or (inp == branch_features << 1) + + if self.stride > 1: + self.branch1 = nn.Sequential( + self.depthwise_conv(inp, inp, kernel_size=3, stride=self.stride, padding=1), + nn.BatchNorm2d(inp), + nn.Conv2d(inp, branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + ) + else: + self.branch1 = nn.Sequential() + + self.branch2 = nn.Sequential( + nn.Conv2d( + inp if (self.stride > 1) else branch_features, + branch_features, + kernel_size=1, + stride=1, + padding=0, + bias=False, + ), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + self.depthwise_conv(branch_features, branch_features, kernel_size=3, stride=self.stride, padding=1), + nn.BatchNorm2d(branch_features), + 
nn.Conv2d(branch_features, branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + ) + + @staticmethod + def depthwise_conv( + i: int, o: int, kernel_size: int, stride: int = 1, padding: int = 0, bias: bool = False + ) -> nn.Conv2d: + return nn.Conv2d(i, o, kernel_size, stride, padding, bias=bias, groups=i) + + def forward(self, x: Tensor) -> Tensor: + if self.stride == 1: + x1, x2 = x.chunk(2, dim=1) + out = torch.cat((x1, self.branch2(x2)), dim=1) + else: + out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) + + out = channel_shuffle(out, 2) + + return out + + +class ShuffleNetV2(nn.Module): + def __init__( + self, + stages_repeats: List[int], + stages_out_channels: List[int], + num_classes: int = 1000, + inverted_residual: Callable[..., nn.Module] = InvertedResidual, + ) -> None: + super().__init__() + _log_api_usage_once(self) + + if len(stages_repeats) != 3: + raise ValueError("expected stages_repeats as list of 3 positive ints") + if len(stages_out_channels) != 5: + raise ValueError("expected stages_out_channels as list of 5 positive ints") + self._stage_out_channels = stages_out_channels + + input_channels = 3 + output_channels = self._stage_out_channels[0] + self.conv1 = nn.Sequential( + nn.Conv2d(input_channels, output_channels, 3, 2, 1, bias=False), + nn.BatchNorm2d(output_channels), + nn.ReLU(inplace=True), + ) + input_channels = output_channels + + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + # Static annotations for mypy + self.stage2: nn.Sequential + self.stage3: nn.Sequential + self.stage4: nn.Sequential + stage_names = [f"stage{i}" for i in [2, 3, 4]] + for name, repeats, output_channels in zip(stage_names, stages_repeats, self._stage_out_channels[1:]): + seq = [inverted_residual(input_channels, output_channels, 2)] + for i in range(repeats - 1): + seq.append(inverted_residual(output_channels, output_channels, 1)) + setattr(self, name, nn.Sequential(*seq)) + input_channels = output_channels + + output_channels = self._stage_out_channels[-1] + self.conv5 = nn.Sequential( + nn.Conv2d(input_channels, output_channels, 1, 1, 0, bias=False), + nn.BatchNorm2d(output_channels), + nn.ReLU(inplace=True), + ) + + self.fc = nn.Linear(output_channels, num_classes) + + def _forward_impl(self, x: Tensor) -> Tensor: + # See note [TorchScript super()] + x = self.conv1(x) + x = self.maxpool(x) + x = self.stage2(x) + x = self.stage3(x) + x = self.stage4(x) + x = self.conv5(x) + x = x.mean([2, 3]) # globalpool + x = self.fc(x) + return x + + def forward(self, x: Tensor) -> Tensor: + return self._forward_impl(x) + + +def _shufflenetv2(arch: str, pretrained: bool, progress: bool, *args: Any, **kwargs: Any) -> ShuffleNetV2: + model = ShuffleNetV2(*args, **kwargs) + + if pretrained: + model_url = model_urls[arch] + if model_url is None: + raise ValueError(f"No checkpoint is available for model type {arch}") + else: + state_dict = load_state_dict_from_url(model_url, progress=progress) + model.load_state_dict(state_dict) + + return model + + +def shufflenet_v2_x0_5(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ShuffleNetV2: + """ + Constructs a ShuffleNetV2 with 0.5x output channels, as described in + `"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" + `_. 
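# Illustrative sketch of channel_shuffle defined above: with groups=2 the two
# halves of the channel dimension are interleaved, which is what lets the two
# ShuffleNetV2 branches exchange information between blocks.
x = torch.arange(8.0).view(1, 8, 1, 1)   # channels ordered 0..7
y = channel_shuffle(x, groups=2)
# y now holds the channels in the order 0, 4, 1, 5, 2, 6, 3, 7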
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _shufflenetv2("shufflenetv2_x0.5", pretrained, progress, [4, 8, 4], [24, 48, 96, 192, 1024], **kwargs) + + +def shufflenet_v2_x1_0(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ShuffleNetV2: + """ + Constructs a ShuffleNetV2 with 1.0x output channels, as described in + `"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" + `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _shufflenetv2("shufflenetv2_x1.0", pretrained, progress, [4, 8, 4], [24, 116, 232, 464, 1024], **kwargs) + + +def shufflenet_v2_x1_5(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ShuffleNetV2: + """ + Constructs a ShuffleNetV2 with 1.5x output channels, as described in + `"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" + `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _shufflenetv2("shufflenetv2_x1.5", pretrained, progress, [4, 8, 4], [24, 176, 352, 704, 1024], **kwargs) + + +def shufflenet_v2_x2_0(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ShuffleNetV2: + """ + Constructs a ShuffleNetV2 with 2.0x output channels, as described in + `"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" + `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _shufflenetv2("shufflenetv2_x2.0", pretrained, progress, [4, 8, 4], [24, 244, 488, 976, 2048], **kwargs) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/squeezenet.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/squeezenet.py new file mode 100644 index 0000000000000000000000000000000000000000..2c1a30f225de90454228a961846238079f300cf1 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/squeezenet.py @@ -0,0 +1,133 @@ +from typing import Any + +import torch +import torch.nn as nn +import torch.nn.init as init + +from .._internally_replaced_utils import load_state_dict_from_url +from ..utils import _log_api_usage_once + +__all__ = ["SqueezeNet", "squeezenet1_0", "squeezenet1_1"] + +model_urls = { + "squeezenet1_0": "https://download.pytorch.org/models/squeezenet1_0-b66bff10.pth", + "squeezenet1_1": "https://download.pytorch.org/models/squeezenet1_1-b8a52dc0.pth", +} + + +class Fire(nn.Module): + def __init__(self, inplanes: int, squeeze_planes: int, expand1x1_planes: int, expand3x3_planes: int) -> None: + super().__init__() + self.inplanes = inplanes + self.squeeze = nn.Conv2d(inplanes, squeeze_planes, kernel_size=1) + self.squeeze_activation = nn.ReLU(inplace=True) + self.expand1x1 = nn.Conv2d(squeeze_planes, expand1x1_planes, kernel_size=1) + self.expand1x1_activation = nn.ReLU(inplace=True) + self.expand3x3 = nn.Conv2d(squeeze_planes, expand3x3_planes, kernel_size=3, padding=1) + self.expand3x3_activation = nn.ReLU(inplace=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.squeeze_activation(self.squeeze(x)) + return torch.cat( + [self.expand1x1_activation(self.expand1x1(x)), 
self.expand3x3_activation(self.expand3x3(x))], 1 + ) + + +class SqueezeNet(nn.Module): + def __init__(self, version: str = "1_0", num_classes: int = 1000, dropout: float = 0.5) -> None: + super().__init__() + _log_api_usage_once(self) + self.num_classes = num_classes + if version == "1_0": + self.features = nn.Sequential( + nn.Conv2d(3, 96, kernel_size=7, stride=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), + Fire(96, 16, 64, 64), + Fire(128, 16, 64, 64), + Fire(128, 32, 128, 128), + nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), + Fire(256, 32, 128, 128), + Fire(256, 48, 192, 192), + Fire(384, 48, 192, 192), + Fire(384, 64, 256, 256), + nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), + Fire(512, 64, 256, 256), + ) + elif version == "1_1": + self.features = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=3, stride=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), + Fire(64, 16, 64, 64), + Fire(128, 16, 64, 64), + nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), + Fire(128, 32, 128, 128), + Fire(256, 32, 128, 128), + nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), + Fire(256, 48, 192, 192), + Fire(384, 48, 192, 192), + Fire(384, 64, 256, 256), + Fire(512, 64, 256, 256), + ) + else: + # FIXME: Is this needed? SqueezeNet should only be called from the + # FIXME: squeezenet1_x() functions + # FIXME: This checking is not done for the other models + raise ValueError(f"Unsupported SqueezeNet version {version}: 1_0 or 1_1 expected") + + # Final convolution is initialized differently from the rest + final_conv = nn.Conv2d(512, self.num_classes, kernel_size=1) + self.classifier = nn.Sequential( + nn.Dropout(p=dropout), final_conv, nn.ReLU(inplace=True), nn.AdaptiveAvgPool2d((1, 1)) + ) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + if m is final_conv: + init.normal_(m.weight, mean=0.0, std=0.01) + else: + init.kaiming_uniform_(m.weight) + if m.bias is not None: + init.constant_(m.bias, 0) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.features(x) + x = self.classifier(x) + return torch.flatten(x, 1) + + +def _squeezenet(version: str, pretrained: bool, progress: bool, **kwargs: Any) -> SqueezeNet: + model = SqueezeNet(version, **kwargs) + if pretrained: + arch = "squeezenet" + version + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + +def squeezenet1_0(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> SqueezeNet: + r"""SqueezeNet model architecture from the `"SqueezeNet: AlexNet-level + accuracy with 50x fewer parameters and <0.5MB model size" + `_ paper. + The required minimum input size of the model is 21x21. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _squeezenet("1_0", pretrained, progress, **kwargs) + + +def squeezenet1_1(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> SqueezeNet: + r"""SqueezeNet 1.1 model from the `official SqueezeNet repo + `_. + SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters + than SqueezeNet 1.0, without sacrificing accuracy. + The required minimum input size of the model is 17x17. 
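# Illustrative sketch of a single Fire module defined above: the 1x1 squeeze
# reduces to 16 channels, then the 1x1 and 3x3 expand paths are concatenated
# to 64 + 64 = 128 output channels; the spatial size is preserved.
fire = Fire(inplanes=96, squeeze_planes=16, expand1x1_planes=64, expand3x3_planes=64)
y = fire(torch.randn(1, 96, 54, 54))  # -> shape (1, 128, 54, 54)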
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _squeezenet("1_1", pretrained, progress, **kwargs) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/vgg.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..07639017a317e60cbe75c6b1f47eb28eefd6d332 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/vgg.py @@ -0,0 +1,201 @@ +from typing import Union, List, Dict, Any, cast + +import torch +import torch.nn as nn + +from .._internally_replaced_utils import load_state_dict_from_url +from ..utils import _log_api_usage_once + + +__all__ = [ + "VGG", + "vgg11", + "vgg11_bn", + "vgg13", + "vgg13_bn", + "vgg16", + "vgg16_bn", + "vgg19_bn", + "vgg19", +] + + +model_urls = { + "vgg11": "https://download.pytorch.org/models/vgg11-8a719046.pth", + "vgg13": "https://download.pytorch.org/models/vgg13-19584684.pth", + "vgg16": "https://download.pytorch.org/models/vgg16-397923af.pth", + "vgg19": "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth", + "vgg11_bn": "https://download.pytorch.org/models/vgg11_bn-6002323d.pth", + "vgg13_bn": "https://download.pytorch.org/models/vgg13_bn-abd245e5.pth", + "vgg16_bn": "https://download.pytorch.org/models/vgg16_bn-6c64b313.pth", + "vgg19_bn": "https://download.pytorch.org/models/vgg19_bn-c79401a0.pth", +} + + +class VGG(nn.Module): + def __init__( + self, features: nn.Module, num_classes: int = 1000, init_weights: bool = True, dropout: float = 0.5 + ) -> None: + super().__init__() + _log_api_usage_once(self) + self.features = features + self.avgpool = nn.AdaptiveAvgPool2d((7, 7)) + self.classifier = nn.Sequential( + nn.Linear(512 * 7 * 7, 4096), + nn.ReLU(True), + nn.Dropout(p=dropout), + nn.Linear(4096, 4096), + nn.ReLU(True), + nn.Dropout(p=dropout), + nn.Linear(4096, num_classes), + ) + if init_weights: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + nn.init.constant_(m.bias, 0) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.features(x) + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.classifier(x) + return x + + +def make_layers(cfg: List[Union[str, int]], batch_norm: bool = False) -> nn.Sequential: + layers: List[nn.Module] = [] + in_channels = 3 + for v in cfg: + if v == "M": + layers += [nn.MaxPool2d(kernel_size=2, stride=2)] + else: + v = cast(int, v) + conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) + if batch_norm: + layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] + else: + layers += [conv2d, nn.ReLU(inplace=True)] + in_channels = v + return nn.Sequential(*layers) + + +cfgs: Dict[str, List[Union[str, int]]] = { + "A": [64, "M", 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"], + "B": [64, 64, "M", 128, 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"], + "D": [64, 64, "M", 128, 128, "M", 256, 256, 256, "M", 512, 512, 512, "M", 512, 512, 512, "M"], + "E": [64, 64, "M", 128, 128, "M", 256, 256, 256, 256, "M", 512, 512, 512, 512, "M", 512, 512, 512, 512, "M"], +} + + +def _vgg(arch: str, cfg: str, batch_norm: bool, 
pretrained: bool, progress: bool, **kwargs: Any) -> VGG: + if pretrained: + kwargs["init_weights"] = False + model = VGG(make_layers(cfgs[cfg], batch_norm=batch_norm), **kwargs) + if pretrained: + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + +def vgg11(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VGG: + r"""VGG 11-layer model (configuration "A") from + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. + The required minimum input size of the model is 32x32. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vgg("vgg11", "A", False, pretrained, progress, **kwargs) + + +def vgg11_bn(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VGG: + r"""VGG 11-layer model (configuration "A") with batch normalization + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. + The required minimum input size of the model is 32x32. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vgg("vgg11_bn", "A", True, pretrained, progress, **kwargs) + + +def vgg13(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VGG: + r"""VGG 13-layer model (configuration "B") + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. + The required minimum input size of the model is 32x32. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vgg("vgg13", "B", False, pretrained, progress, **kwargs) + + +def vgg13_bn(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VGG: + r"""VGG 13-layer model (configuration "B") with batch normalization + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. + The required minimum input size of the model is 32x32. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vgg("vgg13_bn", "B", True, pretrained, progress, **kwargs) + + +def vgg16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VGG: + r"""VGG 16-layer model (configuration "D") + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. + The required minimum input size of the model is 32x32. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vgg("vgg16", "D", False, pretrained, progress, **kwargs) + + +def vgg16_bn(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VGG: + r"""VGG 16-layer model (configuration "D") with batch normalization + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. + The required minimum input size of the model is 32x32. 
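# Illustrative sketch of how the configuration strings above map to layers:
# integers are 3x3 convolution widths and "M" inserts a 2x2 max pool, so cfg
# "A" is the VGG-11 layout (with batch_norm=True this matches what vgg11_bn
# builds). Values below are only meant to show the expected shapes.
features = make_layers(cfgs["A"], batch_norm=True)
model = VGG(features, num_classes=1000)
logits = model(torch.randn(1, 3, 224, 224))  # -> shape (1, 1000)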
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vgg("vgg16_bn", "D", True, pretrained, progress, **kwargs) + + +def vgg19(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VGG: + r"""VGG 19-layer model (configuration "E") + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. + The required minimum input size of the model is 32x32. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vgg("vgg19", "E", False, pretrained, progress, **kwargs) + + +def vgg19_bn(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VGG: + r"""VGG 19-layer model (configuration 'E') with batch normalization + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. + The required minimum input size of the model is 32x32. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vgg("vgg19_bn", "E", True, pretrained, progress, **kwargs) diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/models/vision_transformer.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/vision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..b36658e34d845abbe5475ed6b0d2ab34ac7e2047 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/models/vision_transformer.py @@ -0,0 +1,468 @@ +import math +from collections import OrderedDict +from functools import partial +from typing import Any, Callable, List, NamedTuple, Optional + +import torch +import torch.nn as nn + +from .._internally_replaced_utils import load_state_dict_from_url +from ..ops.misc import ConvNormActivation +from ..utils import _log_api_usage_once + +__all__ = [ + "VisionTransformer", + "vit_b_16", + "vit_b_32", + "vit_l_16", + "vit_l_32", +] + +model_urls = { + "vit_b_16": "https://download.pytorch.org/models/vit_b_16-c867db91.pth", + "vit_b_32": "https://download.pytorch.org/models/vit_b_32-d86f8d99.pth", + "vit_l_16": "https://download.pytorch.org/models/vit_l_16-852ce7e3.pth", + "vit_l_32": "https://download.pytorch.org/models/vit_l_32-c7638314.pth", +} + + +class ConvStemConfig(NamedTuple): + out_channels: int + kernel_size: int + stride: int + norm_layer: Callable[..., nn.Module] = nn.BatchNorm2d + activation_layer: Callable[..., nn.Module] = nn.ReLU + + +class MLPBlock(nn.Sequential): + """Transformer MLP block.""" + + def __init__(self, in_dim: int, mlp_dim: int, dropout: float): + super().__init__() + self.linear_1 = nn.Linear(in_dim, mlp_dim) + self.act = nn.GELU() + self.dropout_1 = nn.Dropout(dropout) + self.linear_2 = nn.Linear(mlp_dim, in_dim) + self.dropout_2 = nn.Dropout(dropout) + + nn.init.xavier_uniform_(self.linear_1.weight) + nn.init.xavier_uniform_(self.linear_2.weight) + nn.init.normal_(self.linear_1.bias, std=1e-6) + nn.init.normal_(self.linear_2.bias, std=1e-6) + + +class EncoderBlock(nn.Module): + """Transformer encoder block.""" + + def __init__( + self, + num_heads: int, + hidden_dim: int, + mlp_dim: int, + dropout: float, + attention_dropout: float, + norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6), + ): + super().__init__() + self.num_heads = num_heads + + # Attention block + self.ln_1 = 
norm_layer(hidden_dim) + self.self_attention = nn.MultiheadAttention(hidden_dim, num_heads, dropout=attention_dropout, batch_first=True) + self.dropout = nn.Dropout(dropout) + + # MLP block + self.ln_2 = norm_layer(hidden_dim) + self.mlp = MLPBlock(hidden_dim, mlp_dim, dropout) + + def forward(self, input: torch.Tensor): + torch._assert(input.dim() == 3, f"Expected (seq_length, batch_size, hidden_dim) got {input.shape}") + x = self.ln_1(input) + x, _ = self.self_attention(query=x, key=x, value=x, need_weights=False) + x = self.dropout(x) + x = x + input + + y = self.ln_2(x) + y = self.mlp(y) + return x + y + + +class Encoder(nn.Module): + """Transformer Model Encoder for sequence to sequence translation.""" + + def __init__( + self, + seq_length: int, + num_layers: int, + num_heads: int, + hidden_dim: int, + mlp_dim: int, + dropout: float, + attention_dropout: float, + norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6), + ): + super().__init__() + # Note that batch_size is on the first dim because + # we have batch_first=True in nn.MultiAttention() by default + self.pos_embedding = nn.Parameter(torch.empty(1, seq_length, hidden_dim).normal_(std=0.02)) # from BERT + self.dropout = nn.Dropout(dropout) + layers: OrderedDict[str, nn.Module] = OrderedDict() + for i in range(num_layers): + layers[f"encoder_layer_{i}"] = EncoderBlock( + num_heads, + hidden_dim, + mlp_dim, + dropout, + attention_dropout, + norm_layer, + ) + self.layers = nn.Sequential(layers) + self.ln = norm_layer(hidden_dim) + + def forward(self, input: torch.Tensor): + torch._assert(input.dim() == 3, f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}") + input = input + self.pos_embedding + return self.ln(self.layers(self.dropout(input))) + + +class VisionTransformer(nn.Module): + """Vision Transformer as per https://arxiv.org/abs/2010.11929.""" + + def __init__( + self, + image_size: int, + patch_size: int, + num_layers: int, + num_heads: int, + hidden_dim: int, + mlp_dim: int, + dropout: float = 0.0, + attention_dropout: float = 0.0, + num_classes: int = 1000, + representation_size: Optional[int] = None, + norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6), + conv_stem_configs: Optional[List[ConvStemConfig]] = None, + ): + super().__init__() + _log_api_usage_once(self) + torch._assert(image_size % patch_size == 0, "Input shape indivisible by patch size!") + self.image_size = image_size + self.patch_size = patch_size + self.hidden_dim = hidden_dim + self.mlp_dim = mlp_dim + self.attention_dropout = attention_dropout + self.dropout = dropout + self.num_classes = num_classes + self.representation_size = representation_size + self.norm_layer = norm_layer + + if conv_stem_configs is not None: + # As per https://arxiv.org/abs/2106.14881 + seq_proj = nn.Sequential() + prev_channels = 3 + for i, conv_stem_layer_config in enumerate(conv_stem_configs): + seq_proj.add_module( + f"conv_bn_relu_{i}", + ConvNormActivation( + in_channels=prev_channels, + out_channels=conv_stem_layer_config.out_channels, + kernel_size=conv_stem_layer_config.kernel_size, + stride=conv_stem_layer_config.stride, + norm_layer=conv_stem_layer_config.norm_layer, + activation_layer=conv_stem_layer_config.activation_layer, + ), + ) + prev_channels = conv_stem_layer_config.out_channels + seq_proj.add_module( + "conv_last", nn.Conv2d(in_channels=prev_channels, out_channels=hidden_dim, kernel_size=1) + ) + self.conv_proj: nn.Module = seq_proj + else: + self.conv_proj = nn.Conv2d( + in_channels=3, 
out_channels=hidden_dim, kernel_size=patch_size, stride=patch_size + ) + + seq_length = (image_size // patch_size) ** 2 + + # Add a class token + self.class_token = nn.Parameter(torch.zeros(1, 1, hidden_dim)) + seq_length += 1 + + self.encoder = Encoder( + seq_length, + num_layers, + num_heads, + hidden_dim, + mlp_dim, + dropout, + attention_dropout, + norm_layer, + ) + self.seq_length = seq_length + + heads_layers: OrderedDict[str, nn.Module] = OrderedDict() + if representation_size is None: + heads_layers["head"] = nn.Linear(hidden_dim, num_classes) + else: + heads_layers["pre_logits"] = nn.Linear(hidden_dim, representation_size) + heads_layers["act"] = nn.Tanh() + heads_layers["head"] = nn.Linear(representation_size, num_classes) + + self.heads = nn.Sequential(heads_layers) + + if isinstance(self.conv_proj, nn.Conv2d): + # Init the patchify stem + fan_in = self.conv_proj.in_channels * self.conv_proj.kernel_size[0] * self.conv_proj.kernel_size[1] + nn.init.trunc_normal_(self.conv_proj.weight, std=math.sqrt(1 / fan_in)) + if self.conv_proj.bias is not None: + nn.init.zeros_(self.conv_proj.bias) + elif self.conv_proj.conv_last is not None and isinstance(self.conv_proj.conv_last, nn.Conv2d): + # Init the last 1x1 conv of the conv stem + nn.init.normal_( + self.conv_proj.conv_last.weight, mean=0.0, std=math.sqrt(2.0 / self.conv_proj.conv_last.out_channels) + ) + if self.conv_proj.conv_last.bias is not None: + nn.init.zeros_(self.conv_proj.conv_last.bias) + + if hasattr(self.heads, "pre_logits") and isinstance(self.heads.pre_logits, nn.Linear): + fan_in = self.heads.pre_logits.in_features + nn.init.trunc_normal_(self.heads.pre_logits.weight, std=math.sqrt(1 / fan_in)) + nn.init.zeros_(self.heads.pre_logits.bias) + + if isinstance(self.heads.head, nn.Linear): + nn.init.zeros_(self.heads.head.weight) + nn.init.zeros_(self.heads.head.bias) + + def _process_input(self, x: torch.Tensor) -> torch.Tensor: + n, c, h, w = x.shape + p = self.patch_size + torch._assert(h == self.image_size, "Wrong image height!") + torch._assert(w == self.image_size, "Wrong image width!") + n_h = h // p + n_w = w // p + + # (n, c, h, w) -> (n, hidden_dim, n_h, n_w) + x = self.conv_proj(x) + # (n, hidden_dim, n_h, n_w) -> (n, hidden_dim, (n_h * n_w)) + x = x.reshape(n, self.hidden_dim, n_h * n_w) + + # (n, hidden_dim, (n_h * n_w)) -> (n, (n_h * n_w), hidden_dim) + # The self attention layer expects inputs in the format (N, S, E) + # where S is the source sequence length, N is the batch size, E is the + # embedding dimension + x = x.permute(0, 2, 1) + + return x + + def forward(self, x: torch.Tensor): + # Reshape and permute the input tensor + x = self._process_input(x) + n = x.shape[0] + + # Expand the class token to the full batch + batch_class_token = self.class_token.expand(n, -1, -1) + x = torch.cat([batch_class_token, x], dim=1) + + x = self.encoder(x) + + # Classifier "token" as used by standard language architectures + x = x[:, 0] + + x = self.heads(x) + + return x + + +def _vision_transformer( + arch: str, + patch_size: int, + num_layers: int, + num_heads: int, + hidden_dim: int, + mlp_dim: int, + pretrained: bool, + progress: bool, + **kwargs: Any, +) -> VisionTransformer: + image_size = kwargs.pop("image_size", 224) + + model = VisionTransformer( + image_size=image_size, + patch_size=patch_size, + num_layers=num_layers, + num_heads=num_heads, + hidden_dim=hidden_dim, + mlp_dim=mlp_dim, + **kwargs, + ) + + if pretrained: + if arch not in model_urls: + raise ValueError(f"No checkpoint is available for model type 
'{arch}'!") + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + + return model + + +def vit_b_16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: + """ + Constructs a vit_b_16 architecture from + `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vision_transformer( + arch="vit_b_16", + patch_size=16, + num_layers=12, + num_heads=12, + hidden_dim=768, + mlp_dim=3072, + pretrained=pretrained, + progress=progress, + **kwargs, + ) + + +def vit_b_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: + """ + Constructs a vit_b_32 architecture from + `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vision_transformer( + arch="vit_b_32", + patch_size=32, + num_layers=12, + num_heads=12, + hidden_dim=768, + mlp_dim=3072, + pretrained=pretrained, + progress=progress, + **kwargs, + ) + + +def vit_l_16(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: + """ + Constructs a vit_l_16 architecture from + `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vision_transformer( + arch="vit_l_16", + patch_size=16, + num_layers=24, + num_heads=16, + hidden_dim=1024, + mlp_dim=4096, + pretrained=pretrained, + progress=progress, + **kwargs, + ) + + +def vit_l_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer: + """ + Constructs a vit_l_32 architecture from + `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _vision_transformer( + arch="vit_l_32", + patch_size=32, + num_layers=24, + num_heads=16, + hidden_dim=1024, + mlp_dim=4096, + pretrained=pretrained, + progress=progress, + **kwargs, + ) + + +def interpolate_embeddings( + image_size: int, + patch_size: int, + model_state: "OrderedDict[str, torch.Tensor]", + interpolation_mode: str = "bicubic", + reset_heads: bool = False, +) -> "OrderedDict[str, torch.Tensor]": + """This function helps interpolating positional embeddings during checkpoint loading, + especially when you want to apply a pre-trained model on images with different resolution. + + Args: + image_size (int): Image size of the new model. + patch_size (int): Patch size of the new model. + model_state (OrderedDict[str, torch.Tensor]): State dict of the pre-trained model. + interpolation_mode (str): The algorithm used for upsampling. Default: bicubic. + reset_heads (bool): If true, not copying the state of heads. Default: False. + + Returns: + OrderedDict[str, torch.Tensor]: A state dict which can be loaded into the new model. 
+ """ + # Shape of pos_embedding is (1, seq_length, hidden_dim) + pos_embedding = model_state["encoder.pos_embedding"] + n, seq_length, hidden_dim = pos_embedding.shape + if n != 1: + raise ValueError(f"Unexpected position embedding shape: {pos_embedding.shape}") + + new_seq_length = (image_size // patch_size) ** 2 + 1 + + # Need to interpolate the weights for the position embedding. + # We do this by reshaping the positions embeddings to a 2d grid, performing + # an interpolation in the (h, w) space and then reshaping back to a 1d grid. + if new_seq_length != seq_length: + # The class token embedding shouldn't be interpolated so we split it up. + seq_length -= 1 + new_seq_length -= 1 + pos_embedding_token = pos_embedding[:, :1, :] + pos_embedding_img = pos_embedding[:, 1:, :] + + # (1, seq_length, hidden_dim) -> (1, hidden_dim, seq_length) + pos_embedding_img = pos_embedding_img.permute(0, 2, 1) + seq_length_1d = int(math.sqrt(seq_length)) + torch._assert(seq_length_1d * seq_length_1d == seq_length, "seq_length is not a perfect square!") + + # (1, hidden_dim, seq_length) -> (1, hidden_dim, seq_l_1d, seq_l_1d) + pos_embedding_img = pos_embedding_img.reshape(1, hidden_dim, seq_length_1d, seq_length_1d) + new_seq_length_1d = image_size // patch_size + + # Perform interpolation. + # (1, hidden_dim, seq_l_1d, seq_l_1d) -> (1, hidden_dim, new_seq_l_1d, new_seq_l_1d) + new_pos_embedding_img = nn.functional.interpolate( + pos_embedding_img, + size=new_seq_length_1d, + mode=interpolation_mode, + align_corners=True, + ) + + # (1, hidden_dim, new_seq_l_1d, new_seq_l_1d) -> (1, hidden_dim, new_seq_length) + new_pos_embedding_img = new_pos_embedding_img.reshape(1, hidden_dim, new_seq_length) + + # (1, hidden_dim, new_seq_length) -> (1, new_seq_length, hidden_dim) + new_pos_embedding_img = new_pos_embedding_img.permute(0, 2, 1) + new_pos_embedding = torch.cat([pos_embedding_token, new_pos_embedding_img], dim=1) + + model_state["encoder.pos_embedding"] = new_pos_embedding + + if reset_heads: + model_state_copy: "OrderedDict[str, torch.Tensor]" = OrderedDict() + for k, v in model_state.items(): + if not k.startswith("heads"): + model_state_copy[k] = v + model_state = model_state_copy + + return model_state diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/ops/__init__.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a85d1e739fd1a14000efa90e03e11d8563ceb2d0 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/ops/__init__.py @@ -0,0 +1,4 @@ +from .misc import FrozenBatchNorm2d, ConvNormActivation, SqueezeExcitation +from .stochastic_depth import stochastic_depth, StochasticDepth + +__all__ = [k for k in globals().keys() if not k.startswith("_")] diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/ops/misc.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/ops/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..268962e204c87193f9bedfef3b886d2bf96d91a0 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/ops/misc.py @@ -0,0 +1,163 @@ +from typing import Callable, List, Optional + +import torch +from torch import Tensor + +from ..utils import _log_api_usage_once + + +interpolate = torch.nn.functional.interpolate + + +# This is not in nn +class FrozenBatchNorm2d(torch.nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed + + Args: + num_features (int): Number of 
features ``C`` from an expected input of size ``(N, C, H, W)`` + eps (float): a value added to the denominator for numerical stability. Default: 1e-5 + """ + + def __init__( + self, + num_features: int, + eps: float = 1e-5, + ): + super().__init__() + _log_api_usage_once(self) + self.eps = eps + self.register_buffer("weight", torch.ones(num_features)) + self.register_buffer("bias", torch.zeros(num_features)) + self.register_buffer("running_mean", torch.zeros(num_features)) + self.register_buffer("running_var", torch.ones(num_features)) + + def _load_from_state_dict( + self, + state_dict: dict, + prefix: str, + local_metadata: dict, + strict: bool, + missing_keys: List[str], + unexpected_keys: List[str], + error_msgs: List[str], + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x: Tensor) -> Tensor: + # move reshapes to the beginning + # to make it fuser-friendly + w = self.weight.reshape(1, -1, 1, 1) + b = self.bias.reshape(1, -1, 1, 1) + rv = self.running_var.reshape(1, -1, 1, 1) + rm = self.running_mean.reshape(1, -1, 1, 1) + scale = w * (rv + self.eps).rsqrt() + bias = b - rm * scale + return x * scale + bias + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({self.weight.shape[0]}, eps={self.eps})" + + +class ConvNormActivation(torch.nn.Sequential): + """ + Configurable block used for Convolution-Normalzation-Activation blocks. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the Convolution-Normalzation-Activation block + kernel_size: (int, optional): Size of the convolving kernel. Default: 3 + stride (int, optional): Stride of the convolution. Default: 1 + padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in wich case it will calculated as ``padding = (kernel_size - 1) // 2 * dilation`` + groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 + norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolutiuon layer. If ``None`` this layer wont be used. Default: ``torch.nn.BatchNorm2d`` + activation_layer (Callable[..., torch.nn.Module], optinal): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``torch.nn.ReLU`` + dilation (int): Spacing between kernel elements. Default: 1 + inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True`` + bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``. 
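`FrozenBatchNorm2d.forward` above folds the four registered buffers into a single per-channel `scale` and `bias` before touching the input, which keeps the op fuser-friendly while reproducing eval-mode batch norm exactly. A quick numerical check, assuming the vendored `_torchvision.ops` package:

```python
import torch
from _torchvision.ops import FrozenBatchNorm2d

torch.manual_seed(0)
fbn = FrozenBatchNorm2d(4)
# Give the frozen statistics some non-trivial values.
fbn.running_mean.uniform_(-1.0, 1.0)
fbn.running_var.uniform_(0.5, 2.0)

x = torch.randn(2, 4, 8, 8)
out = fbn(x)

# Reference: eval-mode batch norm with the same (frozen) statistics.
ref = torch.nn.functional.batch_norm(
    x, fbn.running_mean, fbn.running_var, fbn.weight, fbn.bias,
    training=False, eps=fbn.eps,
)
print(torch.allclose(out, ref, atol=1e-6))  # True
```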
+ + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int = 3, + stride: int = 1, + padding: Optional[int] = None, + groups: int = 1, + norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm2d, + activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU, + dilation: int = 1, + inplace: Optional[bool] = True, + bias: Optional[bool] = None, + ) -> None: + if padding is None: + padding = (kernel_size - 1) // 2 * dilation + if bias is None: + bias = norm_layer is None + layers = [ + torch.nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=dilation, + groups=groups, + bias=bias, + ) + ] + if norm_layer is not None: + layers.append(norm_layer(out_channels)) + if activation_layer is not None: + params = {} if inplace is None else {"inplace": inplace} + layers.append(activation_layer(**params)) + super().__init__(*layers) + _log_api_usage_once(self) + self.out_channels = out_channels + + +class SqueezeExcitation(torch.nn.Module): + """ + This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1). + Parameters ``activation``, and ``scale_activation`` correspond to ``delta`` and ``sigma`` in in eq. 3. + + Args: + input_channels (int): Number of channels in the input image + squeeze_channels (int): Number of squeeze channels + activation (Callable[..., torch.nn.Module], optional): ``delta`` activation. Default: ``torch.nn.ReLU`` + scale_activation (Callable[..., torch.nn.Module]): ``sigma`` activation. Default: ``torch.nn.Sigmoid`` + """ + + def __init__( + self, + input_channels: int, + squeeze_channels: int, + activation: Callable[..., torch.nn.Module] = torch.nn.ReLU, + scale_activation: Callable[..., torch.nn.Module] = torch.nn.Sigmoid, + ) -> None: + super().__init__() + _log_api_usage_once(self) + self.avgpool = torch.nn.AdaptiveAvgPool2d(1) + self.fc1 = torch.nn.Conv2d(input_channels, squeeze_channels, 1) + self.fc2 = torch.nn.Conv2d(squeeze_channels, input_channels, 1) + self.activation = activation() + self.scale_activation = scale_activation() + + def _scale(self, input: Tensor) -> Tensor: + scale = self.avgpool(input) + scale = self.fc1(scale) + scale = self.activation(scale) + scale = self.fc2(scale) + return self.scale_activation(scale) + + def forward(self, input: Tensor) -> Tensor: + scale = self._scale(input) + return scale * input diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/ops/stochastic_depth.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/ops/stochastic_depth.py new file mode 100644 index 0000000000000000000000000000000000000000..ff8167b2315e941f7e31a0626eeec270d350a710 --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/ops/stochastic_depth.py @@ -0,0 +1,66 @@ +import torch +import torch.fx +from torch import nn, Tensor + +from ..utils import _log_api_usage_once + + +def stochastic_depth(input: Tensor, p: float, mode: str, training: bool = True) -> Tensor: + """ + Implements the Stochastic Depth from `"Deep Networks with Stochastic Depth" + `_ used for randomly dropping residual + branches of residual architectures. + + Args: + input (Tensor[N, ...]): The input tensor or arbitrary dimensions with the first one + being its batch i.e. a batch with ``N`` rows. + p (float): probability of the input to be zeroed. + mode (str): ``"batch"`` or ``"row"``. + ``"batch"`` randomly zeroes the entire input, ``"row"`` zeroes + randomly selected rows from the batch. 
+ training: apply stochastic depth if is ``True``. Default: ``True`` + + Returns: + Tensor[N, ...]: The randomly zeroed tensor. + """ + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): + _log_api_usage_once(stochastic_depth) + if p < 0.0 or p > 1.0: + raise ValueError(f"drop probability has to be between 0 and 1, but got {p}") + if mode not in ["batch", "row"]: + raise ValueError(f"mode has to be either 'batch' or 'row', but got {mode}") + if not training or p == 0.0: + return input + + survival_rate = 1.0 - p + if mode == "row": + size = [input.shape[0]] + [1] * (input.ndim - 1) + else: + size = [1] * input.ndim + noise = torch.empty(size, dtype=input.dtype, device=input.device) + noise = noise.bernoulli_(survival_rate) + if survival_rate > 0.0: + noise.div_(survival_rate) + return input * noise + + +torch.fx.wrap("stochastic_depth") + + +class StochasticDepth(nn.Module): + """ + See :func:`stochastic_depth`. + """ + + def __init__(self, p: float, mode: str) -> None: + super().__init__() + _log_api_usage_once(self) + self.p = p + self.mode = mode + + def forward(self, input: Tensor) -> Tensor: + return stochastic_depth(input, self.p, self.mode, self.training) + + def __repr__(self) -> str: + s = f"{self.__class__.__name__}(p={self.p}, mode={self.mode})" + return s diff --git a/cv/classification/efficientnet_b4/pytorch/_torchvision/utils.py b/cv/classification/efficientnet_b4/pytorch/_torchvision/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d03802d9a47ab2674e9f36add7be77197fc6ca2f --- /dev/null +++ b/cv/classification/efficientnet_b4/pytorch/_torchvision/utils.py @@ -0,0 +1,548 @@ +import math +import pathlib +import warnings +from types import FunctionType +from typing import Any, BinaryIO, List, Optional, Tuple, Union + +import numpy as np +import torch +from PIL import Image, ImageColor, ImageDraw, ImageFont + +__all__ = [ + "make_grid", + "save_image", + "draw_bounding_boxes", + "draw_segmentation_masks", + "draw_keypoints", + "flow_to_image", +] + + +@torch.no_grad() +def make_grid( + tensor: Union[torch.Tensor, List[torch.Tensor]], + nrow: int = 8, + padding: int = 2, + normalize: bool = False, + value_range: Optional[Tuple[int, int]] = None, + scale_each: bool = False, + pad_value: float = 0.0, + **kwargs, +) -> torch.Tensor: + """ + Make a grid of images. + + Args: + tensor (Tensor or list): 4D mini-batch Tensor of shape (B x C x H x W) + or a list of images all of the same size. + nrow (int, optional): Number of images displayed in each row of the grid. + The final grid size is ``(B / nrow, nrow)``. Default: ``8``. + padding (int, optional): amount of padding. Default: ``2``. + normalize (bool, optional): If True, shift the image to the range (0, 1), + by the min and max values specified by ``value_range``. Default: ``False``. + value_range (tuple, optional): tuple (min, max) where min and max are numbers, + then these numbers are used to normalize the image. By default, min and max + are computed from the tensor. + range (tuple. optional): + .. warning:: + This parameter was deprecated in ``0.12`` and will be removed in ``0.14``. Please use ``value_range`` + instead. + scale_each (bool, optional): If ``True``, scale each image in the batch of + images separately rather than the (min, max) over all images. Default: ``False``. + pad_value (float, optional): Value for the padded pixels. Default: ``0``. + + Returns: + grid (Tensor): the tensor containing grid of images. 
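`stochastic_depth` zeroes whole residual branches during training and rescales the survivors by `1 / (1 - p)`, so the output matches the input in expectation; in `"row"` mode the decision is made independently per sample, in `"batch"` mode once for the whole batch. A small sketch of both properties using the module wrapper:

```python
import torch
from _torchvision.ops import StochasticDepth

torch.manual_seed(0)
sd = StochasticDepth(p=0.2, mode="row")

x = torch.ones(10000, 8)
out = sd(x)  # a freshly built nn.Module is in training mode by default

# Roughly 20% of the rows are zeroed, the rest are scaled by 1 / 0.8 = 1.25.
dropped = (out.sum(dim=1) == 0).float().mean().item()
print(f"dropped rows ~ {dropped:.2f}")   # about 0.20
print(out.mean().item())                 # about 1.0, expectation preserved

sd.eval()
print(torch.equal(sd(x), x))             # True: identity at eval time
```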
+ """ + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): + _log_api_usage_once(make_grid) + if not (torch.is_tensor(tensor) or (isinstance(tensor, list) and all(torch.is_tensor(t) for t in tensor))): + raise TypeError(f"tensor or list of tensors expected, got {type(tensor)}") + + if "range" in kwargs.keys(): + warnings.warn( + "The parameter 'range' is deprecated since 0.12 and will be removed in 0.14. " + "Please use 'value_range' instead." + ) + value_range = kwargs["range"] + + # if list of tensors, convert to a 4D mini-batch Tensor + if isinstance(tensor, list): + tensor = torch.stack(tensor, dim=0) + + if tensor.dim() == 2: # single image H x W + tensor = tensor.unsqueeze(0) + if tensor.dim() == 3: # single image + if tensor.size(0) == 1: # if single-channel, convert to 3-channel + tensor = torch.cat((tensor, tensor, tensor), 0) + tensor = tensor.unsqueeze(0) + + if tensor.dim() == 4 and tensor.size(1) == 1: # single-channel images + tensor = torch.cat((tensor, tensor, tensor), 1) + + if normalize is True: + tensor = tensor.clone() # avoid modifying tensor in-place + if value_range is not None: + assert isinstance( + value_range, tuple + ), "value_range has to be a tuple (min, max) if specified. min and max are numbers" + + def norm_ip(img, low, high): + img.clamp_(min=low, max=high) + img.sub_(low).div_(max(high - low, 1e-5)) + + def norm_range(t, value_range): + if value_range is not None: + norm_ip(t, value_range[0], value_range[1]) + else: + norm_ip(t, float(t.min()), float(t.max())) + + if scale_each is True: + for t in tensor: # loop over mini-batch dimension + norm_range(t, value_range) + else: + norm_range(tensor, value_range) + + assert isinstance(tensor, torch.Tensor) + if tensor.size(0) == 1: + return tensor.squeeze(0) + + # make the mini-batch of images into a grid + nmaps = tensor.size(0) + xmaps = min(nrow, nmaps) + ymaps = int(math.ceil(float(nmaps) / xmaps)) + height, width = int(tensor.size(2) + padding), int(tensor.size(3) + padding) + num_channels = tensor.size(1) + grid = tensor.new_full((num_channels, height * ymaps + padding, width * xmaps + padding), pad_value) + k = 0 + for y in range(ymaps): + for x in range(xmaps): + if k >= nmaps: + break + # Tensor.copy_() is a valid method but seems to be missing from the stubs + # https://pytorch.org/docs/stable/tensors.html#torch.Tensor.copy_ + grid.narrow(1, y * height + padding, height - padding).narrow( # type: ignore[attr-defined] + 2, x * width + padding, width - padding + ).copy_(tensor[k]) + k = k + 1 + return grid + + +@torch.no_grad() +def save_image( + tensor: Union[torch.Tensor, List[torch.Tensor]], + fp: Union[str, pathlib.Path, BinaryIO], + format: Optional[str] = None, + **kwargs, +) -> None: + """ + Save a given Tensor into an image file. + + Args: + tensor (Tensor or list): Image to be saved. If given a mini-batch tensor, + saves the tensor as a grid of images by calling ``make_grid``. + fp (string or file object): A filename or a file object + format(Optional): If omitted, the format to use is determined from the filename extension. + If a file object was used instead of a filename, this parameter should always be used. + **kwargs: Other arguments are documented in ``make_grid``. 
+ """ + + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): + _log_api_usage_once(save_image) + grid = make_grid(tensor, **kwargs) + # Add 0.5 after unnormalizing to [0, 255] to round to nearest integer + ndarr = grid.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy() + im = Image.fromarray(ndarr) + im.save(fp, format=format) + + +@torch.no_grad() +def draw_bounding_boxes( + image: torch.Tensor, + boxes: torch.Tensor, + labels: Optional[List[str]] = None, + colors: Optional[Union[List[Union[str, Tuple[int, int, int]]], str, Tuple[int, int, int]]] = None, + fill: Optional[bool] = False, + width: int = 1, + font: Optional[str] = None, + font_size: int = 10, +) -> torch.Tensor: + + """ + Draws bounding boxes on given image. + The values of the input image should be uint8 between 0 and 255. + If fill is True, Resulting Tensor should be saved as PNG image. + + Args: + image (Tensor): Tensor of shape (C x H x W) and dtype uint8. + boxes (Tensor): Tensor of size (N, 4) containing bounding boxes in (xmin, ymin, xmax, ymax) format. Note that + the boxes are absolute coordinates with respect to the image. In other words: `0 <= xmin < xmax < W` and + `0 <= ymin < ymax < H`. + labels (List[str]): List containing the labels of bounding boxes. + colors (color or list of colors, optional): List containing the colors + of the boxes or single color for all boxes. The color can be represented as + PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``. + By default, random colors are generated for boxes. + fill (bool): If `True` fills the bounding box with specified color. + width (int): Width of bounding box. + font (str): A filename containing a TrueType font. If the file is not found in this filename, the loader may + also search in other directories, such as the `fonts/` directory on Windows or `/Library/Fonts/`, + `/System/Library/Fonts/` and `~/Library/Fonts/` on macOS. + font_size (int): The requested font size in points. + + Returns: + img (Tensor[C, H, W]): Image Tensor of dtype uint8 with bounding boxes plotted. + """ + + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): + _log_api_usage_once(draw_bounding_boxes) + if not isinstance(image, torch.Tensor): + raise TypeError(f"Tensor expected, got {type(image)}") + elif image.dtype != torch.uint8: + raise ValueError(f"Tensor uint8 expected, got {image.dtype}") + elif image.dim() != 3: + raise ValueError("Pass individual images, not batches") + elif image.size(0) not in {1, 3}: + raise ValueError("Only grayscale and RGB images are supported") + + num_boxes = boxes.shape[0] + + if labels is None: + labels: Union[List[str], List[None]] = [None] * num_boxes # type: ignore[no-redef] + elif len(labels) != num_boxes: + raise ValueError( + f"Number of boxes ({num_boxes}) and labels ({len(labels)}) mismatch. Please specify labels for each box." + ) + + if colors is None: + colors = _generate_color_palette(num_boxes) + elif isinstance(colors, list): + if len(colors) < num_boxes: + raise ValueError(f"Number of colors ({len(colors)}) is less than number of boxes ({num_boxes}). 
") + else: # colors specifies a single color for all boxes + colors = [colors] * num_boxes + + colors = [(ImageColor.getrgb(color) if isinstance(color, str) else color) for color in colors] + + # Handle Grayscale images + if image.size(0) == 1: + image = torch.tile(image, (3, 1, 1)) + + ndarr = image.permute(1, 2, 0).cpu().numpy() + img_to_draw = Image.fromarray(ndarr) + img_boxes = boxes.to(torch.int64).tolist() + + if fill: + draw = ImageDraw.Draw(img_to_draw, "RGBA") + else: + draw = ImageDraw.Draw(img_to_draw) + + txt_font = ImageFont.load_default() if font is None else ImageFont.truetype(font=font, size=font_size) + + for bbox, color, label in zip(img_boxes, colors, labels): # type: ignore[arg-type] + if fill: + fill_color = color + (100,) + draw.rectangle(bbox, width=width, outline=color, fill=fill_color) + else: + draw.rectangle(bbox, width=width, outline=color) + + if label is not None: + margin = width + 1 + draw.text((bbox[0] + margin, bbox[1] + margin), label, fill=color, font=txt_font) + + return torch.from_numpy(np.array(img_to_draw)).permute(2, 0, 1).to(dtype=torch.uint8) + + +@torch.no_grad() +def draw_segmentation_masks( + image: torch.Tensor, + masks: torch.Tensor, + alpha: float = 0.8, + colors: Optional[Union[List[Union[str, Tuple[int, int, int]]], str, Tuple[int, int, int]]] = None, +) -> torch.Tensor: + + """ + Draws segmentation masks on given RGB image. + The values of the input image should be uint8 between 0 and 255. + + Args: + image (Tensor): Tensor of shape (3, H, W) and dtype uint8. + masks (Tensor): Tensor of shape (num_masks, H, W) or (H, W) and dtype bool. + alpha (float): Float number between 0 and 1 denoting the transparency of the masks. + 0 means full transparency, 1 means no transparency. + colors (color or list of colors, optional): List containing the colors + of the masks or single color for all masks. The color can be represented as + PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``. + By default, random colors are generated for each mask. + + Returns: + img (Tensor[C, H, W]): Image Tensor, with segmentation masks drawn on top. + """ + + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): + _log_api_usage_once(draw_segmentation_masks) + if not isinstance(image, torch.Tensor): + raise TypeError(f"The image must be a tensor, got {type(image)}") + elif image.dtype != torch.uint8: + raise ValueError(f"The image dtype must be uint8, got {image.dtype}") + elif image.dim() != 3: + raise ValueError("Pass individual images, not batches") + elif image.size()[0] != 3: + raise ValueError("Pass an RGB image. Other Image formats are not supported") + if masks.ndim == 2: + masks = masks[None, :, :] + if masks.ndim != 3: + raise ValueError("masks must be of shape (H, W) or (batch_size, H, W)") + if masks.dtype != torch.bool: + raise ValueError(f"The masks must be of dtype bool. 
Got {masks.dtype}") + if masks.shape[-2:] != image.shape[-2:]: + raise ValueError("The image and the masks must have the same height and width") + + num_masks = masks.size()[0] + if colors is not None and num_masks > len(colors): + raise ValueError(f"There are more masks ({num_masks}) than colors ({len(colors)})") + + if colors is None: + colors = _generate_color_palette(num_masks) + + if not isinstance(colors, list): + colors = [colors] + if not isinstance(colors[0], (tuple, str)): + raise ValueError("colors must be a tuple or a string, or a list thereof") + if isinstance(colors[0], tuple) and len(colors[0]) != 3: + raise ValueError("It seems that you passed a tuple of colors instead of a list of colors") + + out_dtype = torch.uint8 + + colors_ = [] + for color in colors: + if isinstance(color, str): + color = ImageColor.getrgb(color) + colors_.append(torch.tensor(color, dtype=out_dtype)) + + img_to_draw = image.detach().clone() + # TODO: There might be a way to vectorize this + for mask, color in zip(masks, colors_): + img_to_draw[:, mask] = color[:, None] + + out = image * (1 - alpha) + img_to_draw * alpha + return out.to(out_dtype) + + +@torch.no_grad() +def draw_keypoints( + image: torch.Tensor, + keypoints: torch.Tensor, + connectivity: Optional[List[Tuple[int, int]]] = None, + colors: Optional[Union[str, Tuple[int, int, int]]] = None, + radius: int = 2, + width: int = 3, +) -> torch.Tensor: + + """ + Draws Keypoints on given RGB image. + The values of the input image should be uint8 between 0 and 255. + + Args: + image (Tensor): Tensor of shape (3, H, W) and dtype uint8. + keypoints (Tensor): Tensor of shape (num_instances, K, 2) the K keypoints location for each of the N instances, + in the format [x, y]. + connectivity (List[Tuple[int, int]]]): A List of tuple where, + each tuple contains pair of keypoints to be connected. + colors (str, Tuple): The color can be represented as + PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``. + radius (int): Integer denoting radius of keypoint. + width (int): Integer denoting width of line connecting keypoints. + + Returns: + img (Tensor[C, H, W]): Image Tensor of dtype uint8 with keypoints drawn. + """ + + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): + _log_api_usage_once(draw_keypoints) + if not isinstance(image, torch.Tensor): + raise TypeError(f"The image must be a tensor, got {type(image)}") + elif image.dtype != torch.uint8: + raise ValueError(f"The image dtype must be uint8, got {image.dtype}") + elif image.dim() != 3: + raise ValueError("Pass individual images, not batches") + elif image.size()[0] != 3: + raise ValueError("Pass an RGB image. 
Other Image formats are not supported") + + if keypoints.ndim != 3: + raise ValueError("keypoints must be of shape (num_instances, K, 2)") + + ndarr = image.permute(1, 2, 0).cpu().numpy() + img_to_draw = Image.fromarray(ndarr) + draw = ImageDraw.Draw(img_to_draw) + img_kpts = keypoints.to(torch.int64).tolist() + + for kpt_id, kpt_inst in enumerate(img_kpts): + for inst_id, kpt in enumerate(kpt_inst): + x1 = kpt[0] - radius + x2 = kpt[0] + radius + y1 = kpt[1] - radius + y2 = kpt[1] + radius + draw.ellipse([x1, y1, x2, y2], fill=colors, outline=None, width=0) + + if connectivity: + for connection in connectivity: + start_pt_x = kpt_inst[connection[0]][0] + start_pt_y = kpt_inst[connection[0]][1] + + end_pt_x = kpt_inst[connection[1]][0] + end_pt_y = kpt_inst[connection[1]][1] + + draw.line( + ((start_pt_x, start_pt_y), (end_pt_x, end_pt_y)), + width=width, + ) + + return torch.from_numpy(np.array(img_to_draw)).permute(2, 0, 1).to(dtype=torch.uint8) + + +# Flow visualization code adapted from https://github.com/tomrunia/OpticalFlow_Visualization +@torch.no_grad() +def flow_to_image(flow: torch.Tensor) -> torch.Tensor: + + """ + Converts a flow to an RGB image. + + Args: + flow (Tensor): Flow of shape (N, 2, H, W) or (2, H, W) and dtype torch.float. + + Returns: + img (Tensor): Image Tensor of dtype uint8 where each color corresponds + to a given flow direction. Shape is (N, 3, H, W) or (3, H, W) depending on the input. + """ + + if flow.dtype != torch.float: + raise ValueError(f"Flow should be of dtype torch.float, got {flow.dtype}.") + + orig_shape = flow.shape + if flow.ndim == 3: + flow = flow[None] # Add batch dim + + if flow.ndim != 4 or flow.shape[1] != 2: + raise ValueError(f"Input flow should have shape (2, H, W) or (N, 2, H, W), got {orig_shape}.") + + max_norm = torch.sum(flow ** 2, dim=1).sqrt().max() + epsilon = torch.finfo((flow).dtype).eps + normalized_flow = flow / (max_norm + epsilon) + img = _normalized_flow_to_image(normalized_flow) + + if len(orig_shape) == 3: + img = img[0] # Remove batch dim + return img + + +@torch.no_grad() +def _normalized_flow_to_image(normalized_flow: torch.Tensor) -> torch.Tensor: + + """ + Converts a batch of normalized flow to an RGB image. + + Args: + normalized_flow (torch.Tensor): Normalized flow tensor of shape (N, 2, H, W) + Returns: + img (Tensor(N, 3, H, W)): Flow visualization image of dtype uint8. + """ + + N, _, H, W = normalized_flow.shape + flow_image = torch.zeros((N, 3, H, W), dtype=torch.uint8) + colorwheel = _make_colorwheel() # shape [55x3] + num_cols = colorwheel.shape[0] + norm = torch.sum(normalized_flow ** 2, dim=1).sqrt() + a = torch.atan2(-normalized_flow[:, 1, :, :], -normalized_flow[:, 0, :, :]) / torch.pi + fk = (a + 1) / 2 * (num_cols - 1) + k0 = torch.floor(fk).to(torch.long) + k1 = k0 + 1 + k1[k1 == num_cols] = 0 + f = fk - k0 + + for c in range(colorwheel.shape[1]): + tmp = colorwheel[:, c] + col0 = tmp[k0] / 255.0 + col1 = tmp[k1] / 255.0 + col = (1 - f) * col0 + f * col1 + col = 1 - norm * (1 - col) + flow_image[:, c, :, :] = torch.floor(255 * col) + return flow_image + + +def _make_colorwheel() -> torch.Tensor: + """ + Generates a color wheel for optical flow visualization as presented in: + Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007) + URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf. + + Returns: + colorwheel (Tensor[55, 3]): Colorwheel Tensor. 
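`flow_to_image` normalizes the flow by its largest magnitude and maps direction to hue through the Middlebury-style color wheel built below, so color encodes direction and saturation encodes relative speed. A minimal sketch on a synthetic horizontal flow field:

```python
import torch
from _torchvision.utils import flow_to_image

# Synthetic flow: horizontal displacement grows left to right, vertical is zero.
h, w = 64, 64
dx = torch.linspace(-1.0, 1.0, w).expand(h, w)
flow = torch.stack([dx, torch.zeros(h, w)])  # shape (2, H, W), dtype float32

img = flow_to_image(flow)
print(img.shape, img.dtype)                  # torch.Size([3, 64, 64]) torch.uint8
```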
+ """ + + RY = 15 + YG = 6 + GC = 4 + CB = 11 + BM = 13 + MR = 6 + + ncols = RY + YG + GC + CB + BM + MR + colorwheel = torch.zeros((ncols, 3)) + col = 0 + + # RY + colorwheel[0:RY, 0] = 255 + colorwheel[0:RY, 1] = torch.floor(255 * torch.arange(0, RY) / RY) + col = col + RY + # YG + colorwheel[col : col + YG, 0] = 255 - torch.floor(255 * torch.arange(0, YG) / YG) + colorwheel[col : col + YG, 1] = 255 + col = col + YG + # GC + colorwheel[col : col + GC, 1] = 255 + colorwheel[col : col + GC, 2] = torch.floor(255 * torch.arange(0, GC) / GC) + col = col + GC + # CB + colorwheel[col : col + CB, 1] = 255 - torch.floor(255 * torch.arange(CB) / CB) + colorwheel[col : col + CB, 2] = 255 + col = col + CB + # BM + colorwheel[col : col + BM, 2] = 255 + colorwheel[col : col + BM, 0] = torch.floor(255 * torch.arange(0, BM) / BM) + col = col + BM + # MR + colorwheel[col : col + MR, 2] = 255 - torch.floor(255 * torch.arange(MR) / MR) + colorwheel[col : col + MR, 0] = 255 + return colorwheel + + +def _generate_color_palette(num_objects: int): + palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1]) + return [tuple((i * palette) % 255) for i in range(num_objects)] + + +def _log_api_usage_once(obj: Any) -> None: + + """ + Logs API usage(module and name) within an organization. + In a large ecosystem, it's often useful to track the PyTorch and + TorchVision APIs usage. This API provides the similar functionality to the + logging module in the Python stdlib. It can be used for debugging purpose + to log which methods are used and by default it is inactive, unless the user + manually subscribes a logger via the `SetAPIUsageLogger method `_. + Please note it is triggered only once for the same API call within a process. + It does not collect any data from open-source users since it is no-op by default. + For more information, please refer to + * PyTorch note: https://pytorch.org/docs/stable/notes/large_scale_deployments.html#api-usage-logging; + * Logging policy: https://github.com/pytorch/vision/issues/5052; + + Args: + obj (class instance or method): an object to extract info from. + """ + if not obj.__module__.startswith("torchvision"): + return + name = obj.__class__.__name__ + if isinstance(obj, FunctionType): + name = obj.__name__ + torch._C._log_api_usage_once(f"{obj.__module__}.{name}") diff --git a/cv/classification/efficientnet_b4/pytorch/common_utils/smooth_value.py b/cv/classification/efficientnet_b4/pytorch/common_utils/smooth_value.py index 7c5fa1179da79e495eae3d591c322865a76b6dfc..ce54196fe3088672f04b1b1edc432fc8023c061e 100755 --- a/cv/classification/efficientnet_b4/pytorch/common_utils/smooth_value.py +++ b/cv/classification/efficientnet_b4/pytorch/common_utils/smooth_value.py @@ -1,6 +1,6 @@ -# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. - +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
from collections import defaultdict, deque import datetime @@ -37,7 +37,7 @@ class SmoothedValue(object): """ if not is_dist_avail_and_initialized(): return - t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') + t = torch.tensor([self.count, self.total], dtype=torch.float32, device='cuda') dist.barrier() dist.all_reduce(t) t = t.tolist() @@ -72,4 +72,4 @@ class SmoothedValue(object): avg=self.avg, global_avg=self.global_avg, max=self.max, - value=self.value) \ No newline at end of file + value=self.value) diff --git a/cv/classification/efficientnet_b4/pytorch/train.py b/cv/classification/efficientnet_b4/pytorch/train.py index 449cf2544553612fbac89bd9e30a6880cca601fb..9ba7b468b380ea1735cd6be9823b85c34e58a3cb 100755 --- a/cv/classification/efficientnet_b4/pytorch/train.py +++ b/cv/classification/efficientnet_b4/pytorch/train.py @@ -1,4 +1,6 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. import datetime import os @@ -6,6 +8,7 @@ import sys sys.path.append(os.path.join(os.path.dirname(__file__), '..')) import time +import math import torch import torch.utils.data @@ -20,7 +23,7 @@ except: from torch import nn import torch.distributed as dist -import torchvision as torchvision +import _torchvision as torchvision import utils @@ -74,6 +77,10 @@ def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, pri acc1, acc5 = utils.accuracy(output, target, topk=(1, 5)) batch_size = image.shape[0] + loss_value = loss.item() + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) metric_logger.update(loss=loss.item(), lr=optimizer.param_groups[0]["lr"]) metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) @@ -194,6 +201,7 @@ def main(args): print("Start training") start_time = time.time() + best_acc = 0 for epoch in range(args.start_epoch, args.epochs): epoch_start_time = time.time() if args.distributed and not args.dali: @@ -204,23 +212,21 @@ def main(args): if acc_avg > args.acc_thresh: print("The accuracy has been exceeded {},and the training is terminated at epoch {}".format(args.acc_thresh, epoch)) return - - if args.output_dir: - checkpoint = { - 'model': model_without_ddp.state_dict(), - 'optimizer': optimizer.state_dict(), - 'lr_scheduler': lr_scheduler.state_dict(), - 'epoch': epoch, - 'args': args} - utils.save_on_master( - checkpoint, - os.path.join(args.output_dir, 'model_{}.pth'.format(epoch))) - utils.save_on_master( - checkpoint, - os.path.join(args.output_dir, 'checkpoint.pth')) - epoch_total_time = time.time() - epoch_start_time - epoch_total_time_str = str(datetime.timedelta(seconds=int(epoch_total_time))) - print('epoch time {}'.format(epoch_total_time_str)) + if acc_avg > best_acc: + if args.output_dir: + checkpoint = { + 'model': model_without_ddp.state_dict(), + 'optimizer': optimizer.state_dict(), + 'lr_scheduler': lr_scheduler.state_dict(), + 'epoch': epoch, + 'args': args} + utils.save_on_master( + checkpoint, + os.path.join(args.output_dir, 'model_best.pth')) + epoch_total_time = time.time() - epoch_start_time + epoch_total_time_str = str(datetime.timedelta(seconds=int(epoch_total_time))) + print('epoch time {}'.format(epoch_total_time_str)) + best_acc = acc_avg if args.dali: data_loader.reset() @@ -301,7 +307,7 @@ def get_args_parser(add_help=True): ) # distributed training parameters - 
parser.add_argument('--local_rank', default=-1, type=int, + parser.add_argument('--local_rank', '--local-rank', default=-1, type=int, help='Local rank') parser.add_argument('--world-size', default=1, type=int, help='number of distributed processes') @@ -328,4 +334,9 @@ def check_args(args): if __name__ == "__main__": args = get_args_parser().parse_args() args = check_args(args) + try: + from dltest import show_training_arguments + show_training_arguments(args) + except: + pass main(args)
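Beyond the import switch to the vendored `_torchvision`, the `train.py` hunks change two behaviours: training now aborts as soon as the loss turns non-finite, and a checkpoint is written only when the evaluated accuracy beats the best seen so far (a single `model_best.pth` instead of one file per epoch). A condensed sketch of that pattern; `evaluate`, `save_fn`, and the loop body are placeholders for the script's real objects, not its actual API:

```python
import math
import sys


def train_with_best_checkpoint(model, optimizer, lr_scheduler, data_loader,
                               evaluate, save_fn, epochs):
    """Illustrative skeleton of the modified loop in train.py (names are placeholders)."""
    best_acc = 0.0
    for epoch in range(epochs):
        for image, target, loss in data_loader:   # stand-in for the real inner loop
            loss_value = loss.item()
            if not math.isfinite(loss_value):     # same guard the diff adds
                print(f"Loss is {loss_value}, stopping training")
                sys.exit(1)

        acc_avg = evaluate(model)
        if acc_avg > best_acc:                    # only keep the best checkpoint
            save_fn({"model": model.state_dict(),
                     "optimizer": optimizer.state_dict(),
                     "lr_scheduler": lr_scheduler.state_dict(),
                     "epoch": epoch},
                    "model_best.pth")
            best_acc = acc_avg
```

Keeping only the best checkpoint trades per-epoch resumability for disk space, which matches the script's new single-file `model_best.pth` behaviour.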