Source code for segmentation_models_pytorch.decoders.unet.model

import warnings
from typing import Any, Dict, Optional, Union, Callable, Sequence

from segmentation_models_pytorch.base import (
    ClassificationHead,
    SegmentationHead,
    SegmentationModel,
)
from segmentation_models_pytorch.encoders import get_encoder
from segmentation_models_pytorch.base.hub_mixin import supports_config_loading

from .decoder import UnetDecoder


class Unet(SegmentationModel):
    """U-Net model for semantic image segmentation.

    U-Net is a fully convolutional neural network architecture designed for
    semantic image segmentation. It consists of two main parts:

    1. An encoder (downsampling path) that extracts increasingly abstract features
    2. A decoder (upsampling path) that gradually recovers spatial details

    The key is the use of skip connections between corresponding encoder and
    decoder layers. These connections allow the decoder to access fine-grained
    details from earlier encoder layers, which helps produce more precise
    segmentation masks. The skip connections work by concatenating feature maps
    from the encoder directly into the decoder at corresponding resolutions.
    This helps preserve important spatial information that would otherwise be
    lost during the encoding process.

    Args:
        encoder_name: Name of the classification model that will be used as an
            encoder (a.k.a. backbone) to extract features of different spatial
            resolution.
        encoder_depth: A number of stages used in encoder in range [3, 5]. Each
            stage generates features two times smaller in spatial dimensions
            than the previous one (e.g. for depth 0 we will have features with
            shapes [(N, C, H, W),], for depth 1 - [(N, C, H, W),
            (N, C, H // 2, W // 2)] and so on). Default is 5.
        encoder_weights: One of **None** (random initialization),
            **"imagenet"** (pre-training on ImageNet) and other pretrained
            weights (see table with available weights for each encoder_name).
        decoder_use_norm: Specifies normalization between Conv2D and
            activation. Accepts the following types:

            - **True**: Defaults to `"batchnorm"`.
            - **False**: No normalization (`nn.Identity`).
            - **str**: Specifies normalization type using default parameters.
              Available values: `"batchnorm"`, `"identity"`, `"layernorm"`,
              `"instancenorm"`, `"inplace"`.
            - **dict**: Fully customizable normalization settings. Structure:

              ```python
              {"type": <norm_type>, **kwargs}
              ```

              where `norm_type` corresponds to normalization type (see above),
              and `kwargs` are passed directly to the normalization layer as
              defined in PyTorch documentation.

              **Example**:

              ```python
              decoder_use_norm={"type": "layernorm", "eps": 1e-2}
              ```
        decoder_channels: List of integers which specify **in_channels**
            parameter for convolutions used in decoder. Length of the list
            should be the same as **encoder_depth**.
        decoder_attention_type: Attention module used in decoder of the model.
            Available options are **None** and **scse**
            (https://arxiv.org/abs/1808.08127).
        decoder_interpolation: Interpolation mode used in decoder of the model.
            Available options are **"nearest"**, **"bilinear"**, **"bicubic"**,
            **"area"**, **"nearest-exact"**. Default is **"nearest"**.
        in_channels: A number of input channels for the model, default is 3
            (RGB images).
        classes: A number of classes for output mask (or you can think as a
            number of channels of output mask).
        activation: An activation function to apply after the final convolution
            layer. Available options are **"sigmoid"**, **"softmax"**,
            **"logsoftmax"**, **"tanh"**, **"identity"**, **callable** and
            **None**. Default is **None**.
        aux_params: Dictionary with parameters of the auxiliary output
            (classification head). Auxiliary output is built on top of encoder
            if **aux_params** is not **None** (default). Supported params:

            - classes (int): A number of classes
            - pooling (str): One of "max", "avg". Default is "avg"
            - dropout (float): Dropout factor in [0, 1)
            - activation (str): An activation function to apply
              "sigmoid"/"softmax" (could be **None** to return logits)
        kwargs: Arguments passed to the encoder class ``__init__()`` function.
            Applies only to ``timm`` models. Keys with ``None`` values are
            pruned before passing. The deprecated keyword
            ``decoder_use_batchnorm`` is also accepted here: when it is given
            and not ``None`` it overrides ``decoder_use_norm`` and a
            ``DeprecationWarning`` is emitted.

    Returns:
        ``torch.nn.Module``: Unet

    Example:
        .. code-block:: python

            import torch
            import segmentation_models_pytorch as smp

            model = smp.Unet("resnet18", encoder_weights="imagenet", classes=5)
            model.eval()

            # generate random images
            images = torch.rand(2, 3, 256, 256)

            with torch.inference_mode():
                mask = model(images)

            print(mask.shape)
            # torch.Size([2, 5, 256, 256])

    .. _Unet:
        https://arxiv.org/abs/1505.04597

    """

    # NOTE(review): consumed outside this file (presumably by the
    # SegmentationModel base class when checking input sizes) - confirm there.
    requires_divisible_input_shape = False

    @supports_config_loading
    def __init__(
        self,
        encoder_name: str = "resnet34",
        encoder_depth: int = 5,
        encoder_weights: Optional[str] = "imagenet",
        decoder_use_norm: Union[bool, str, Dict[str, Any]] = "batchnorm",
        decoder_channels: Sequence[int] = (256, 128, 64, 32, 16),
        decoder_attention_type: Optional[str] = None,
        decoder_interpolation: str = "nearest",
        in_channels: int = 3,
        classes: int = 1,
        activation: Optional[Union[str, Callable]] = None,
        aux_params: Optional[dict] = None,
        # The annotation on ``**kwargs`` describes each individual value, so it
        # is ``Any`` (arbitrary encoder options), not ``dict[str, Any]``.
        **kwargs: Any,
    ):
        super().__init__()

        # Backward compatibility: the legacy flag must be removed from kwargs
        # before they are forwarded to the encoder, and it takes precedence
        # over ``decoder_use_norm`` whenever it was explicitly supplied.
        decoder_use_batchnorm = kwargs.pop("decoder_use_batchnorm", None)
        if decoder_use_batchnorm is not None:
            warnings.warn(
                "The usage of decoder_use_batchnorm is deprecated. "
                "Please modify your code for decoder_use_norm",
                DeprecationWarning,
                stacklevel=2,
            )
            decoder_use_norm = decoder_use_batchnorm

        self.encoder = get_encoder(
            encoder_name,
            in_channels=in_channels,
            depth=encoder_depth,
            weights=encoder_weights,
            **kwargs,
        )

        # The decoder's center block is enabled only for encoders whose name
        # starts with "vgg".
        add_center_block = encoder_name.startswith("vgg")

        self.decoder = UnetDecoder(
            encoder_channels=self.encoder.out_channels,
            decoder_channels=decoder_channels,
            n_blocks=encoder_depth,
            use_norm=decoder_use_norm,
            add_center_block=add_center_block,
            attention_type=decoder_attention_type,
            interpolation_mode=decoder_interpolation,
        )

        # The mask head consumes the output of the last decoder stage.
        self.segmentation_head = SegmentationHead(
            in_channels=decoder_channels[-1],
            out_channels=classes,
            activation=activation,
            kernel_size=3,
        )

        # The optional classification head is fed by the last encoder feature map.
        if aux_params is not None:
            self.classification_head = ClassificationHead(
                in_channels=self.encoder.out_channels[-1], **aux_params
            )
        else:
            self.classification_head = None

        self.name = "u-{}".format(encoder_name)
        self.initialize()