# Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License") and the MIT License (the "License2");

""" FDViT model configuration"""

from collections import OrderedDict
from typing import Mapping

from packaging import version

from transformers.configuration_utils import PretrainedConfig
from transformers.onnx import OnnxConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class FDViTConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`FDViTModel`]. It is used to instantiate an FDViT
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the FDViT
    [amd/fdvit_ti](https://huggingface.co/amd/fdvit_ti) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 16):
            The size of each input patch.
        stride (`int`, *optional*, defaults to 8):
            The stride used when extracting the input patches.
        base_dims (`list`, *optional*, defaults to `[32, 23, 21, 23, 26]`):
            The base dimension of each encoder block.
        depth (`list`, *optional*, defaults to `[2, 3, 3, 2, 2]`):
            The number of layers in each encoder block.
        heads (`list`, *optional*, defaults to `[2, 4, 6, 8, 10]`):
            The number of attention heads in each encoder block.
        channels (`list`, *optional*, defaults to `[64, 92, 126, 184, 260]`):
            The number of channels in each encoder block.
        out_size (`list`, *optional*, defaults to `[27, 19, 14, 10, 7]`):
            The output spatial size of each encoder block.
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            The ratio of the number of channels in the output of the MLP to the number of channels in the input.
        num_classes (`int`, *optional*, defaults to 1000):
            The number of classes of the dataset.
        in_chans (`int`, *optional*, defaults to 3):
            The number of channels in the input image.
        attn_drop_rate (`float`, *optional*, defaults to 0.0):
            The dropout rate for the attention probabilities.
        drop_rate (`float`, *optional*, defaults to 0.0):
            The dropout rate for the dropout layers.
        drop_path_rate (`float`, *optional*, defaults to 0.1):
            The drop path (stochastic depth) rate for the drop path layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The initializer range for the weights.

    Example:

    ```python
    >>> from transformers import FDViTConfig, FDViTModel

    >>> # Initializing a FDViT fdvit_ti style configuration
    >>> configuration = FDViTConfig()

    >>> # Initializing a model (with random weights) from the fdvit_ti style configuration
    >>> model = FDViTModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "fdvit"

    def __init__(
        self,
        image_size=224,
        patch_size=16,
        stride=8,
        base_dims=None,
        depth=None,
        heads=None,
        channels=None,
        out_size=None,
        mlp_ratio=4.0,
        num_classes=1000,
        in_chans=3,
        attn_drop_rate=0.0,
        drop_rate=0.0,
        drop_path_rate=0.1,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.image_size = image_size
        self.patch_size = patch_size
        self.stride = stride
        # Avoid mutable default arguments: fall back to the fdvit_ti defaults
        # when a list argument is not provided.
        self.base_dims = [32, 23, 21, 23, 26] if base_dims is None else base_dims
        self.depth = [2, 3, 3, 2, 2] if depth is None else depth
        self.heads = [2, 4, 6, 8, 10] if heads is None else heads
        self.channels = [64, 92, 126, 184, 260] if channels is None else channels
        self.out_size = [27, 19, 14, 10, 7] if out_size is None else out_size
        self.mlp_ratio = mlp_ratio
        self.num_classes = num_classes
        self.in_chans = in_chans
        self.attn_drop_rate = attn_drop_rate
        self.drop_rate = drop_rate
        self.drop_path_rate = drop_path_rate
        self.initializer_range = initializer_range


class FDViTOnnxConfig(OnnxConfig):
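    # Minimum torch version required to export this model to ONNX.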
    torch_onnx_minimum_version = version.parse("1.11")

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
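        # All four axes of `pixel_values` (batch, channels, height, width) are
        # declared as named dynamic axes, so the exported ONNX graph accepts
        # variable input shapes along each of them.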
        return OrderedDict(
            [
                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
            ]
        )

    @property
    def atol_for_validation(self) -> float:
        return 1e-4
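

# A minimal usage sketch: build the default fdvit_ti-style configuration and
# inspect the ONNX export metadata defined above. The guard keeps this from
# running when the module is imported.
if __name__ == "__main__":
    config = FDViTConfig()
    onnx_config = FDViTOnnxConfig(config)
    print(config.model_type)                # fdvit
    print(onnx_config.inputs)               # OrderedDict([('pixel_values', {0: 'batch', ...})])
    print(onnx_config.atol_for_validation)  # 0.0001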