
    hCQ                         d Z ddlmZ ddlmZ ddlmZmZ  ej                  e	      Z
 G d de      Z G d d	e      Z G d
 de      Z G d de      Z G d de      Zg dZy)zSAM2 model configuration   )PretrainedConfig)logging   )CONFIG_MAPPING
AutoConfigc                   R     e Zd ZdZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )Sam2HieraDetConfiga  
    This is the configuration class to store the configuration of a [`Sam2HieraDetModel`]. It is used to instantiate
    a HieraDet model as defined in the original sam2 repo according to the specified arguments, defining the model architecture.
    Instantiating a configuration defaults will yield a similar configuration to that of SAM 2.1 Hiera-tiny
    [facebook/sam2.1-hiera-tiny](https://huggingface.co/facebook/sam2.1-hiera-tiny) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 96):
            The hidden dimension of the image encoder.
        num_attention_heads (`int`, *optional*, defaults to 1):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of channels in the image.
        image_size (`list[int]`, *optional*, defaults to `[1024, 1024]`):
            The size of the image.
        patch_kernel_size (`list[int]`, *optional*, defaults to `[7, 7]`):
            The kernel size of the patch.
        patch_stride (`list[int]`, *optional*, defaults to `[4, 4]`):
            The stride of the patch.
        patch_padding (`list[int]`, *optional*, defaults to `[3, 3]`):
            The padding of the patch.
        query_stride (`list[int]`, *optional*, defaults to `[2, 2]`):
            The downsample stride between stages.
        window_positional_embedding_background_size (`list[int]`, *optional*, defaults to `[7, 7]`):
            The window size per stage when not using global attention.
        num_query_pool_stages (`int`, *optional*, defaults to 3):
            The number of query pool stages.
        blocks_per_stage (`list[int]`, *optional*, defaults to `[1, 2, 7, 2]`):
            The number of blocks per stage.
        embed_dim_per_stage (`list[int]`, *optional*, defaults to `[96, 192, 384, 768]`):
            The embedding dimension per stage.
        num_attention_heads_per_stage (`list[int]`, *optional*, defaults to `[1, 2, 4, 8]`):
            The number of attention heads per stage.
        window_size_per_stage (`list[int]`, *optional*, defaults to `[8, 4, 14, 7]`):
            The window size per stage.
        global_attention_blocks (`list[int]`, *optional*, defaults to `[5, 7, 9]`):
            The blocks where global attention is used.
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            The ratio of the MLP hidden dimension to the embedding dimension.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the neck.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon for the layer normalization.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

    backbone_configsam2_hiera_det_modelc                    t        |   di | ||nddg}||nddg}||nddg}||nddg}||nddg}|	|	nddg}	||ng d}||ng d}||ng d}||ng d	}||ng d
}|| _        || _        || _        || _        || _        || _        || _        || _	        |	| _
        |
| _        || _        || _        || _        || _        || _        || _        || _        || _        || _        y )N         r   r   )   r   r   r   )`           )r   r   r      )r   r      r   )   r   	    )super__init__hidden_sizenum_attention_headsnum_channels
image_sizepatch_kernel_sizepatch_stridepatch_paddingquery_stride+window_positional_embedding_background_sizenum_query_pool_stagesblocks_per_stageembed_dim_per_stagenum_attention_heads_per_stagewindow_size_per_stageglobal_attention_blocks	mlp_ratio
hidden_actlayer_norm_epsinitializer_range)selfr   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   kwargs	__class__s                        i/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/sam2/configuration_sam2.pyr   zSam2HieraDetConfig.__init__P   sk   . 	"6"#-#9Zd|
1B1N-UVXYTZ'3'?|aV)6)BA'3'?|aV ;F 8Q 	4
 0@/K+Q]5H5T1Zm-J-V)\h 	& :O9Z 5`m=T=`"9fo&#6 ($!2(*(;f8%:" 0#6 -J*%:"'>$"$,!2    )r   r   r   NNNNNNr   NNNNNg      @geluư>{Gz?)__name__
__module____qualname____doc__base_config_key
model_typer   __classcell__r1   s   @r2   r	   r	      sZ    1f (O'J 48 &*" $)=3 =3r3   r	   c                   L     e Zd ZdZdZdZdeiZ	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZ	S )Sam2VisionConfiga  
    This is the configuration class to store the configuration of a [`Sam2VisionModel`]. It is used to instantiate a SAM
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    defaults will yield a similar configuration to that of SAM 2.1 Hiera-tiny
    [facebook/sam2.1-hiera-tiny](https://huggingface.co/facebook/sam2.1-hiera-tiny) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        backbone_config (`Union[dict, "PretrainedConfig"]`, *optional*):
            Configuration for the vision backbone. This is used to instantiate the backbone using
            `AutoModel.from_config`.
        backbone_channel_list (`List[int]`, *optional*, defaults to `[768, 384, 192, 96]`):
            The list of channel dimensions for the backbone.
        backbone_feature_sizes (`List[List[int]]`, *optional*, defaults to `[[256, 256], [128, 128], [64, 64]]`):
            The spatial sizes of the feature maps from the backbone.
        fpn_hidden_size (`int`, *optional*, defaults to 256):
            The hidden dimension of the FPN.
        fpn_kernel_size (`int`, *optional*, defaults to 1):
            The kernel size for the convolutions in the neck.
        fpn_stride (`int`, *optional*, defaults to 1):
            The stride for the convolutions in the neck.
        fpn_padding (`int`, *optional*, defaults to 0):
            The padding for the convolutions in the neck.
        fpn_top_down_levels (`List[int]`, *optional*, defaults to `[2, 3]`):
            The levels for the top-down FPN connections.
        num_feature_levels (`int`, *optional*, defaults to 3):
            The number of feature levels from the FPN to use.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the neck.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon for the layer normalization.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

    vision_configsam2_vision_modelr
   c                    t        |   d	i | |g dn|}|ddgddgddggn|}|ddgn|}t        |t              r'|j	                  dd      |d<   t        |d      d	i |}nt        |t              r|}n|
t               }|| _        || _        || _	        || _
        || _        || _        || _        || _        |	| _        |
| _        || _        || _        y )
N)r   r   r   r         @   r   r   r<   r   r   )r   r   
isinstancedictgetr   r	   r
   backbone_channel_listbackbone_feature_sizesfpn_hidden_sizefpn_kernel_size
fpn_stridefpn_paddingfpn_top_down_levelsnum_feature_levelsr,   r-   r.   )r/   r
   rJ   rK   rL   rM   rN   rO   rP   rQ   r,   r-   r.   r0   r1   s                 r2   r   zSam2VisionConfig.__init__   s     	"6"7L7T 3Zo2H2Pc3Z#sb"X.Vl 	 )<(Cq!fI\ot,,;,?,?Nd,eOL),_\-JK^o^O);<-O$02O. &;"&<#..$&#6 "4$,!2r3   )NNNrD   r   r       Nr   r4   r5   r6   )
r7   r8   r9   r:   r;   r<   r   sub_configsr   r=   r>   s   @r2   r@   r@      sQ    $L &O$J:K "# .3 .3r3   r@   c                   8     e Zd ZdZdZ	 	 	 	 	 	 	 	 d fd	Z xZS )Sam2PromptEncoderConfiga<  
    This is the configuration class to store the configuration of a [`Sam2PromptEncoder`]. The [`Sam2PromptEncoder`]
    module is used to encode the input 2D points and bounding boxes.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        image_size (`int`, *optional*, defaults to 1024):
            The expected output resolution of the image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        mask_input_channels (`int`, *optional*, defaults to 16):
            The number of channels to be fed to the `MaskDecoder` module.
        num_point_embeddings (`int`, *optional*, defaults to 4):
            The number of point embeddings to be used.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the encoder and pooler.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        scale (`float`, *optional*, defaults to 1):
            The scale factor for the prompt encoder.
    prompt_encoder_configc	                     t        
|   di |	 || _        || _        || _        || _        || _        || _        || _        || _	        y Nr   )
r   r   r   r   
patch_sizemask_input_channelsnum_point_embeddingsr,   r-   scale)r/   r   r   rY   rZ   r[   r,   r-   r\   r0   r1   s             r2   r   z Sam2PromptEncoderConfig.__init__  sQ     	"6"&$$#6 $8!$,
r3   )rD   r      r]   r   r4   r5   r   r7   r8   r9   r:   r;   r   r=   r>   s   @r2   rU   rU      s3    4 .O  r3   rU   c                   @     e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )Sam2MaskDecoderConfiga  
    This is the configuration class to store the configuration of a [`Sam2MaskDecoder`]. It is used to instantiate a SAM2
    memory encoder according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the SAM2 mask decoder.
        mlp_dim (`int`, *optional*, defaults to 2048):
            The dimension of the MLP in the two-way transformer.
        num_hidden_layers (`int`, *optional*, defaults to 2):
            The number of hidden layers in the two-way transformer.
        num_attention_heads (`int`, *optional*, defaults to 8):
            The number of attention heads in the two-way transformer.
        attention_downsample_rate (`int`, *optional*, defaults to 2):
            The downsample rate for the attention layers.
        num_multimask_outputs (`int`, *optional*, defaults to 3):
            The number of multimask outputs.
        iou_head_depth (`int`, *optional*, defaults to 3):
            The depth of the IoU head.
        iou_head_hidden_dim (`int`, *optional*, defaults to 256):
            The hidden dimension of the IoU head.
        dynamic_multimask_via_stability (`bool`, *optional*, defaults to `True`):
            Whether to use dynamic multimask via stability.
        dynamic_multimask_stability_delta (`float`, *optional*, defaults to 0.05):
            The stability delta for the dynamic multimask.
        dynamic_multimask_stability_thresh (`float`, *optional*, defaults to 0.98):
            The stability threshold for the dynamic multimask.

    mask_decoder_configc                     t        |   di | || _        || _        || _        || _        |	| _        |
| _        || _        || _	        || _
        || _        || _        || _        || _        y rX   )r   r   r   num_multimask_outputsr,   iou_head_depthiou_head_hidden_dimdynamic_multimask_via_stability!dynamic_multimask_stability_delta"dynamic_multimask_stability_threshnum_hidden_layersr   mlp_dimattention_downsample_rate)r/   r   r,   rj   ri   r   rk   rc   rd   re   rf   rg   rh   r0   r1   s                 r2   r   zSam2MaskDecoderConfig.__init__H  s}      	"6"&%:"$,#6 /N,1R.2T/ "3&#6 )B&r3   )rD   r4   i   r   r   r   r   r   rD   Tg?g\(\?r^   r>   s   @r2   r`   r`   "  sB    !F ,O "#(,*.+/ C  Cr3   r`   c                   <     e Zd ZdZdZeeedZ	 	 	 	 d fd	Z	 xZ
S )
Sam2Configal	  
    [`Sam2Config`] is the configuration class to store the configuration of a [`Sam2Model`]. It is used to instantiate a
    SAM2 model according to the specified arguments, defining the memory attention, memory encoder, and image encoder
    configs. Instantiating a configuration defaults will yield a similar configuration to that of the SAM 2.1 Hiera-tiny
    [facebook/sam2.1-hiera-tiny](https://huggingface.co/facebook/sam2.1-hiera-tiny) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (Union[`dict`, `Sam2VisionConfig`], *optional*):
            Dictionary of configuration options used to initialize [`Sam2VisionConfig`].
        prompt_encoder_config (Union[`dict`, `Sam2PromptEncoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`Sam2PromptEncoderConfig`].
        mask_decoder_config (Union[`dict`, `Sam2MaskDecoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`Sam2MaskDecoderConfig`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation for parameter initialization.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import (
    ...     Sam2VisionConfig,
    ...     Sam2PromptEncoderConfig,
    ...     Sam2MaskDecoderConfig,
    ...     Sam2Model,
    ... )

    >>> # Initializing a Sam2Config with `"facebook/sam2.1_hiera_tiny"` style configuration
    >>> configuration = Sam2config()

    >>> # Initializing a Sam2Model (with random weights) from the `"facebook/sam2.1_hiera_tiny"` style configuration
    >>> model = Sam2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a Sam2Config from a Sam2VisionConfig, Sam2PromptEncoderConfig, and Sam2MaskDecoderConfig

    >>> # Initializing SAM2 vision encoder, memory attention, and memory encoder configurations
    >>> vision_config = Sam2VisionConfig()
    >>> prompt_encoder_config = Sam2PromptEncoderConfig()
    >>> mask_decoder_config = Sam2MaskDecoderConfig()

    >>> config = Sam2Config(vision_config, prompt_encoder_config, mask_decoder_config)
    ```sam2)rA   rV   ra   c                    t        |   di | ||ni }||ni }||ni }t        |t              r'|j	                  dd      |d<   t        |d      di |}nt        |t              r|}t        |t              r|j                         }t        |t              r|j                         }|| _
        t        di || _        t        di || _        || _        y )Nr<   rB   r   )r   r   rG   rH   rI   r   r   rU   to_dictr`   rA   rV   ra   r.   )r/   rA   rV   ra   r.   r0   r1   s         r2   r   zSam2Config.__init__  s     	"6")6)B9N9Z 5`b5H5T1Z\mT**7*;*;LJ]*^M,'*=+FGX-XM'78)M+-DE$9$A$A$C!)+@A"5"="="?*%<%U?T%U"#8#O;N#O !2r3   )NNNr6   )r7   r8   r9   r:   r<   r   rU   r`   rS   r   r=   r>   s   @r2   rm   rm   k  s8    0d J#!84K " 3 3r3   rm   )rm   r	   r@   rU   r`   N)r:   configuration_utilsr   utilsr   autor   r   
get_loggerr7   loggerr	   r@   rU   r`   rm   __all__r   r3   r2   <module>rw      s~     3  - 
		H	%t3) t3n[3' [3|1. 1hFC, FCRU3! U3pr3   