
    h                    b   d dl Z d dlmZ d dlmZmZmZ d dlZd dl	Z	d dl
mZ d dlmc mZ d dl	mZ d dlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z, e e!d       G d de                     Z-e e!d       G d de                     Z. G d dej^                        Z0 G d dej^                        Z1 G d dej^                        Z2	 dRd ej^                  d!e	j                  d"e	j                  d#e	j                  d$ee	j                     d%e3d&e3fd'Z4dSd(e	j                  d)ee5   d*e	j                  fd+Z6 G d, d-ej^                        Z7 G d. d/ej^                        Z8d0 Z9d1 Z: G d2 d3e      Z;e e!d4       G d5 d6e                     Z<e! G d7 d8e             Z= G d9 d:e=      Z> e!d;       G d< d=e=             Z? G d> d?ej^                        Z@ G d@ dAej^                        ZA G dB dCej^                        ZB G dD dEej^                        ZC G dF dGej^                        ZD G dH dIej^                        ZE G dJ dKej                        ZG G dL dMej^                        ZH e!dN       G dO dPe=             ZIg dQZJy)T    N)	dataclass)CallableOptionalUnion)Tensor)OutputRecorder   )ACT2FN)GradientCheckpointingLayer)BaseModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)#compile_compatible_method_lru_cache)ModelOutputauto_docstring)TransformersKwargscheck_model_inputs   )	AutoModel   )
Sam2ConfigSam2HieraDetConfigSam2MaskDecoderConfigSam2PromptEncoderConfigSam2VisionConfigz,Base class for the vision encoder's outputs.)custom_introc                       e Zd ZU dZdZej                  ed<   dZe	ej                     ed<   dZ
e	ej                     ed<   dZe	eej                  df      ed<   dZe	eej                  df      ed<   y)	Sam2VisionEncoderOutputa  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, height, width, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    fpn_hidden_states (`tuple(torch.FloatTensor)`):
        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
        `(batch_size, hidden_size, height, width)`. Feature maps from the Feature Pyramid Network neck.
    fpn_position_encoding (`tuple(torch.FloatTensor)`):
        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
        `(batch_size, hidden_size, height, width)`. Positional encodings corresponding to the `fpn_hidden_states`.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each stage) of shape `(batch_size, height, width, hidden_size)`. Hidden-states of the
        model at the output of each stage.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.
    Nlast_hidden_statefpn_hidden_statesfpn_position_encoding.hidden_states
attentions)__name__
__module____qualname____doc__r    torchFloatTensor__annotations__r!   r   r"   r#   tupler$        d/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/sam2/modeling_sam2.pyr   r   7   s    & ,0u((/59x 1 1299=8E$5$56==AM8E%"3"3S"89:A:>Ju00#567>r.   r   z'Base class for the Sam2 model's output.c                   :   e Zd ZU dZdZej                  ed<   dZej                  ed<   dZ	ej                  ed<   dZ
eej                  df   ed<   dZeeej                  df      ed<   dZeeej                  df      ed	<   dZeeej                  df      ed
<   y)Sam2ImageSegmentationOutputa  
    iou_scores (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_masks)`):
        The Intersection over Union (IoU) scores of the predicted masks.
    pred_masks (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_masks, height, width)`):
        The predicted low-resolution masks. This is an alias for `low_res_masks`. These masks need to be post-processed
        by the processor to be brought to the original image size.
    object_score_logits (`torch.FloatTensor` of shape `(batch_size, point_batch_size, 1)`):
        Logits for the object score, indicating if an object is present.
    image_embeddings (`tuple(torch.FloatTensor)`):
        The features from the FPN, which are used by the mask decoder. This is a tuple of `torch.FloatTensor` where each
        tensor has shape `(batch_size, channels, height, width)`.
    vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of each stage) of shape `(batch_size, height, width, hidden_size)`.
        Hidden-states of the vision model at the output of each stage.
    vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
        Attentions weights of the vision model.
    mask_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
        Attentions weights of the mask decoder.
    N
iou_scores
pred_masksobject_score_logits.image_embeddingsvision_hidden_statesvision_attentionsmask_decoder_attentions)r%   r&   r'   r(   r2   r)   r*   r+   r3   r4   r5   r,   r6   r   r7   r8   r-   r.   r/   r1   r1   T   s    , %)J!!($(J!!(-1**16:eE--s23:DH(5):):C)?#@AHAExe&7&7&< =>EGKXeE,=,=s,B&CDKr.   r1   c                   .     e Zd ZdZdef fdZd Z xZS )Sam2PatchEmbeddingsa  
    Turns pixel values into patch embeddings for transformer consumption.

    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`Sam2ImageProcessorFast.__call__`] for details.

    Returns:
        embeddings (`torch.FloatTensor`):
            Patch embeddings depend on image_size, patch_kernel_size, patch_stride and patch_padding
    configc                     t         |           |j                  }|j                  }t	        j
                  |||j                  |j                  |j                        | _	        y )N)kernel_sizestridepadding)
super__init__num_channelshidden_sizennConv2dpatch_kernel_sizepatch_stridepatch_padding
projection)selfr;   rB   rC   	__class__s       r/   rA   zSam2PatchEmbeddings.__init__   sU    **(())00&&((
r.   c                 n    |j                   \  }}}}| j                  |      j                  dddd      }|S )Nr   r   r	   r   )shaperI   permute)rJ   pixel_values_rB   heightwidth
embeddingss          r/   forwardzSam2PatchEmbeddings.forward   s;    )5););&<__\2::1aAF
r.   )r%   r&   r'   r(   r   rA   rT   __classcell__rK   s   @r/   r:   r:   v   s    
1 
r.   r:   c                        e Zd ZdZ	 ddedededee   f fdZ e	d      	 dd	e
j                  d
ee
j                  ef   de
j                  dee   def
d       Z xZS )Sam2SinePositionEmbeddingz
    This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
    need paper, generalized to work on images.
    num_pos_featstemperature	normalizescalec                     t         |           ||du rt        d      || _        || _        || _        |dt        j                  z  | _        y || _        y )NFz+normalize should be True if scale is passedr   )	r@   rA   
ValueErrorrY   rZ   r[   mathpir\   )rJ   rY   rZ   r[   r\   rK   s        r/   rA   z"Sam2SinePositionEmbedding.__init__   sY     	e!3JKK*&"$)MQ[
u
r.   r   )maxsizerM   devicedtypemaskreturnc           
         |2t        j                  |d   |d   |d   f|t         j                        }| j                  |      }|j	                  d      }|j	                  d      }| j
                  rDd}||d d dd d d f   |z   z  | j                  z  }||d d d d dd f   |z   z  | j                  z  }t        j                  | j                  t         j                  |      j                  |      }	| j                  dt        j                  |	dd	
      z  | j                  z  z  }	|d d d d d d d f   |	z  }
|d d d d d d d f   |	z  }t        j                  |
d d d d d d dd df   j                         |
d d d d d d dd df   j                         fd      j                  d      }
t        j                  |d d d d d d dd df   j                         |d d d d d d dd df   j                         fd      j                  d      }t        j                   ||
fd      j#                  dddd      }|S )Nr   r   r	   rb   rc   r   ư>rc   rb   floor)rounding_mode   dim)r)   zerosbooltocumsumr[   r\   arangerY   int64rZ   divstacksincosflattencatrN   )rJ   rM   rb   rc   rd   not_masky_embedx_embedepsdim_tpos_xpos_yposs                r/   rT   z!Sam2SinePositionEmbedding.forward   s    <;;a%(E!H=fTYT^T^_DE::e$//!$//!$>>CBC!3c!9:TZZGGArs!3c!9:TZZGGT//u{{6RUUV[\  Q5!7)S%SVZVhVh%hi1a&.1a&.U1aADqD=1557q!Q1}9M9Q9Q9STZ[\ddefgU1aADqD=1557q!Q1}9M9Q9Q9STZ[\ddefgiiA.66q!QB
r.   )@   i'  FNN)r%   r&   r'   r(   intrq   r   floatrA   r   r)   Sizer   rb   strrc   r   rT   rU   rV   s   @r/   rX   rX      s     tx	= 	=47	=LP	=aijoap	= )3 "&zz ellC'( {{	
 v 
 4r.   rX   c                        e Zd Zdef fdZdej                  deeej                  df   eej                  df   f   fdZ xZ	S )Sam2VisionNeckr;   c           
         t         |           || _        t        |j                  dz  d      | _        t        j                         | _        |j                  D ]]  }| j                  j                  t        j                  ||j                  |j                  |j                  |j                               _ |j                  | _        y )Nr   T)rY   r[   )in_channelsout_channelsr=   r>   r?   )r@   rA   r;   rX   fpn_hidden_sizeposition_encodingrD   
ModuleListconvsbackbone_channel_listappendrE   fpn_kernel_size
fpn_stridefpn_paddingfpn_top_down_levels)rJ   r;   r   rK   s      r/   rA   zSam2VisionNeck.__init__   s    !:I_I_cdIdpt!u]]_
!77 		KJJ		 +!'!7!7 & 6 6!,,"..		 $*#=#= r.   r#   re   .c                 H   d}d}t        | j                        dz
  }t        |dd      D ]  }||   j                  dddd      } | j                  ||z
     |      }|| j                  vs||k(  r|}nVt        j                  j                  t        j                        dd	d d
      j                  |j                        }||z   }| j                  |j                  |j                  |j                        j                  |j                        }	||fz  }||	fz  } ||fS )Nr-   r   ri   r   r	   r   )rc   g       @nearestF)scale_factormodealign_corners	antialias)lenr   rangerN   r   Finterpolaterr   r)   float32rc   r   rM   rb   )
rJ   r#   r!   r"   nilateral_featuresprev_featurestop_down_featuresprev_position_encodings
             r/   rT   zSam2VisionNeck.forward   sE    " 

Oaq"b! 	?A,Q/771aC0tzz!a%01AB000AF 0$%MM!$$5==$9!$""&#% "%++, " !13D D%)%;%;##]%9%9=;N;N&b$$% # -!11!&<%>>!)	?, !"777r.   )
r%   r&   r'   r   rA   r)   r   r,   rT   rU   rV   s   @r/   r   r      sS    >/ >$8U\\ 8eE%,,PSBS<TV[\a\h\hjm\mVn<n6o 8r.   r   modulequerykeyvalueattention_maskscalingdropoutc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr   r	   ri   )ro   rc   )ptrainingr   )r)   matmul	transposerD   
functionalsoftmaxr   rr   rc   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r/   eager_attention_forwardr      s     <<s}}Q':;gEL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r.   xquery_stridere   c                     || S | j                  dddd      } t        j                  j                  | ||d      } | j                  dddd      } | S )Nr   r	   r   r   F)r=   r>   	ceil_mode)rN   rD   r   
max_pool2d)r   r   s     r/   do_poolr     sX    			!Q1A
  \]b cA			!Q1AHr.   c                        e Zd Z	 d
dededededeeeef      f
 fdZdej                  dej                  fd	Z
 xZS )Sam2MultiScaleAttentionr;   ro   dim_outnum_attention_headsr   c                    t         |           || _        || _        || _        || _        || _        ||z  }|dz  | _        t        j                  ||dz        | _
        t        j                  ||      | _        d| _        y )N      r	   F)r@   rA   r;   ro   r   r   r   r\   rD   Linearqkvproj	is_causal)rJ   r;   ro   r   r   r   head_dimrK   s          r/   rA   z Sam2MultiScaleAttention.__init__  sz     	(#6 11t^
99S'A+.IIgw/	r.   r#   re   c                    |j                   \  }}}}| j                  |      j                  |||z  d| j                  d      }t	        j
                  |d      \  }}	}
|| j                  z  |	j                  dd      z  }t        j                  j                  j                  |t        j                  d      j                  |j                        }| j                  r[t        |j                  |||d      | j                        }|j                   dd \  }}|j                  |||z  | j                  d      }|j                  dd      }|	j                  dd      }	|
j                  dd      }
t         }| j"                  j$                  dk7  rt&        | j"                  j$                     } || ||	|
fd | j(                  | j                  d|\  }}|j                  |||d      }| j+                  |      }|S )	Nr	   ri   r   )rc   ro   r   eager)r   r   r   )rM   r   reshaper   r)   unbindr\   r   rD   r   r   r   rr   rc   r   r   r   r;   _attn_implementationr   r   r   )rJ   r#   r   
batch_sizerQ   rR   rP   r   r   r   r   r   attention_interfacer   s                 r/   rT   zSam2MultiScaleAttention.forward2  s   '4':':$
FE1hh}%--j&5.!TMeMegij!LLa0sE

*cmmB.CCxx**22<u}}Z\2]``afalalm EMM*feRH$J[J[\E!KK!,MFEMM*fund>V>VXZ[E 1%mmAq!1%(?;;++w6"9$++:Z:Z"[,		

  nnJJ	
 	
Q "))*feRHii,r.   r   )r%   r&   r'   r   r   r   r,   rA   r)   r   rT   rU   rV   s   @r/   r   r     se     37"  	
 ! uS#X/0&U\\ & &r.   r   c                   D     e Zd Z	 	 d	dedededededef fdZd Z xZS )
Sam2FeedForward	input_dim
hidden_dim
output_dim
num_layers
activationsigmoid_outputc           	      `   t         |           || _        t        |   | _        t        j                  ||      | _        t        j                  ||      | _        t        j                  t        |dz
        D cg c]  }t        j                  ||       c}      | _        || _        y c c}w )Nr   )r@   rA   r   r
   r   rD   r   proj_inproj_outr   r   layersr   )	rJ   r   r   r   r   r   r   rP   rK   s	           r/   rA   zSam2FeedForward.__init__\  s     	$ ,yyJ7		*j9mmPUV`cdVdPe$f1RYYz:%F$fg, %gs   :B+c                     | j                  |      }| j                  |      }| j                  D ]  }| j                   ||            } | j                  |      }| j                  rt        j                  |      }|S r   )r   r   r   r   r   r   sigmoid)rJ   r#   layers      r/   rT   zSam2FeedForward.forwardm  ss    ]36[[ 	BE OOE-,@AM	B m4IIm4Mr.   )reluF)	r%   r&   r'   r   r   rq   rA   rT   rU   rV   s   @r/   r   r   [  sO     !$-- - 	-
 - - -"	r.   r   c           	      `   | j                   \  }}}}|||z  z
  |z  }|||z  z
  |z  }t        j                  j                  | ddd|d|f      } ||z   ||z   }	}| j	                  |||z  ||	|z  ||      } | j                  dddddd      j                         j	                  d|||      }
|
||	ffS )a  
    Partition into non-overlapping windows with padding if needed.

    Args:
        hidden_state (`torch.Tensor`):
            Input tokens with [batch_size, height, width, num_channels].
        window_size (`int`):
            Window size.

    Returns:
        `tuple(torch.FloatTensor)` comprising various elements:
        - windows: windows after partition with [batch_size * num_windows, window_size, window_size, num_channels].
        - (padded_height, padded_width): padded height and width before partition
    r   r   r	   r   rm      ri   )rM   rD   r   padviewrN   r   )hidden_statewindow_sizer   rQ   rR   rB   
pad_height	pad_widthpadded_heightpadded_widthwindowss              r/   window_partitionr   y  s     /;.@.@+J| 44CJu{22kAI ==$$\Aq!Y:3VWL"(:"5uy7H<M$$M[0+|{?Z\giuL ""1aAq!4??AFFr;XceqrG]L111r.   c                 6   |\  }}|\  }}| j                   d   ||z  |z  |z  z  }| j                  |||z  ||z  ||d      }	|	j                  dddddd      j                         }	|	j                  |||d      }	|	ddd|d|ddf   j                         }	|	S )	aB  
    Window unpartition into original sequences and removing padding.

    Args:
        windows (`torch.Tensor`):
            Input tokens with [batch_size * num_windows, window_size, window_size, num_channels].
        window_size (`int`):
            Window size.
        pad_height_width (`tuple[int]`):
            Padded height and width (padded_height, padded_width).
        height_width (`tuple[int]`):
            Original height and width before padding.

    Returns:
        hidden_state: unpartitioned sequences with [batch_size, height, width, num_channels].
    r   ri   r   r	   r   rm   r   N)rM   r   rN   r   )
r   r   pad_height_widthheight_widthr   r   rQ   rR   r   r   s
             r/   window_unpartitionr     s    " #3M< MFEq!ml&Bk&QU`&`aJ<<M[0,+2M{\gikL  ''1aAq9DDFL$$ZbQL  7F7FUFA 56AACLr.   c                   r     e Zd Zdedededef fdZdej                  dee	   dej                  fd	Z xZS )
Sam2MultiScaleBlockr;   	stage_idx	block_idxtotal_block_idxc                 $   t         |           |dkD  r|dk(  r|j                  |dz
     n|j                  |   | _        |j                  |   | _        t        j                  | j                  |j                        | _        |dkD  r|dk(  r|j                  |dz
     n|j                  |   | _
        ||j                  v rdn| j                  | _
        d|cxk  r|j                  k  rn n|dk(  r|j                  nd | _        t        || j                  | j                  |j                  |   | j                        | _        t        j                  | j                  |j                        | _        t%        | j                  t'        | j                  |j(                  z        | j                  d|j*                        | _        | j                  | j                  k7  r0t        j.                  | j                  | j                        | _        y y )Nr   r   )r   )r   r   r   )r   r   )r@   rA   embed_dim_per_stagero   r   rD   	LayerNormlayer_norm_epslayer_norm1window_size_per_stager   global_attention_blocksnum_query_pool_stagesr   r   num_attention_heads_per_stageattnlayer_norm2r   r   	mlp_ratio
hidden_actmlpr   r   )rJ   r;   r   r   r   rK   s        r/   rA   zSam2MultiScaleBlock.__init__  s    	
 1}a &&y1}5++I6 	
 11)<<<f6K6KL 1}a ((Q7--i8 	
 !063Q3Q Q1W[WgWg $%y#PF4P4P#PU^bcUcFim 	 ,HHLL & D DY O**
	 <<&:O:OP"LLv///0LL((
 88t||#		$((DLL9DI $r.   r#   r   re   c                    |}| j                  |      }| j                  | j                  k7  r%t        | j	                  |      | j
                        }| j                  }| j                  dkD  r-|j                  d   |j                  d   }}t        ||      \  }} | j                  dd|i|}|}| j
                  rN| j                  | j
                  d   z  }|j                  dd \  }}|||z  z
  |z  }	|||z  z
  |z  }
||	z   ||
z   f}| j                  dkD  rt        ||f      }||z   }| j                  |      }|| j                  |      z   }|S )Nr   r   r   r#   r	   r-   )r   ro   r   r   r   r   r   rM   r   r  r   r  r  )rJ   r#   r   residualr   HWpad_hwr   pad_hpad_wlayernorm_outputs               r/   rT   zSam2MultiScaleBlock.forward  s   
 !((7 88t||#tyy79J9JKH &&a &&q)=+>+>q+AqA$4]K$P!M6  dii 
'

 $**d.?.?.BBK>>!A&DAq 1{?2kAE 1{?2kAE%iU+F a.}k6TUWXSYZM =0++M:%1A(BBr.   )r%   r&   r'   r   r   rA   r)   r   r   r   r*   rT   rU   rV   s   @r/   r   r     s`    -:"-: -: 	-:
 -:^*||* +,* 
			*r.   r   zW
    Hiera model's outputs that also contains a pooling of the last hidden states.
    c                   l    e Zd ZU dZdZeej                     ed<   dZ	ee
ej                  df      ed<   y)Sam2HieraDetModelOutputat  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, height, width, hidden_size)`):
        hidden-states at the output of the last layer of the model.
    intermediate_hidden_states (`tuple[torch.FloatTensor]` of shape `(batch_size, height, width, hidden_size)`):
        Sequence of hidden-states at the output of the intermediate layers of the model.
    Nr    .intermediate_hidden_states)r%   r&   r'   r(   r    r   r)   r*   r+   r  r,   r-   r.   r/   r  r    s?     6:x 1 129JNu/@/@#/E)F GNr.   r  c                   *    e Zd ZeZdZdZdZdZdZ	d Z
y)Sam2PreTrainedModelsam2rO   Tc                    | j                   j                  }t        |t        j                  t        j
                  t        j                  f      rY|j                  j                  j                  d|       |j                  |j                  j                  j                          nt        |t        j                        re|j                  j                  j                  d|       |j                  |j                  j                  |j                     j                          nit        |t        j                  t        f      rI|j                  j                  j!                  d       |j                  j                  j                          t        |t"              r`|j$                  $|j$                  j                  j                          |j&                  $|j&                  j                  j                          t        |t(              r2|j*                  %|j*                  j                  j                          y y y )N        )meanstd      ?)r;   initializer_range
isinstancerD   r   rE   ConvTranspose2dweightdatanormal_biaszero_	Embeddingpadding_idxr   Sam2LayerNormfill_Sam2HieraDetModel	pos_embedpos_embed_window	Sam2Modelno_memory_embedding)rJ   r   r  s      r/   _init_weightsz!Sam2PreTrainedModel._init_weights0  s   kk++fryy"))R5G5GHIMM&&CS&9{{&  &&(-MM&&CS&9!!-""6#5#56<<>} =>MM$$S)KK""$f/0+  %%++-&&2'',,224fi())5**//557 6 )r.   N)r%   r&   r'   r   config_classbase_model_prefixmain_input_name_supports_sdpa_supports_flash_attn_2_supports_attention_backendr-  r-   r.   r/   r  r  '  s(    L$ON!"&8r.   r  c            
            e Zd ZeZdZeedZdef fdZ	d Z
deeef   dej                  fdZe	 ddeej$                     d	ee   deeef   fd
       Z xZS )r(  rO   r#   r$   r;   c           	         t         |   |       t        |      | _        t	        j
                  t        j                  d|j                  g|j                         | _
        t	        j
                  t        j                  d|j                  |j                  d   |j                  d               | _        t        j                  |j                        dz
  j!                         | _        t	        j$                         | _        d}t)        |j                        D ]D  \  }}t+        |      D ]1  }t-        ||||      }| j&                  j/                  |       |dz  }3 F y )Nr   r   )r;   r   r   r   )r@   rA   r:   patch_embedrD   	Parameterr)   rp   rC   +window_positional_embedding_background_sizer)  r   r*  nprs   blocks_per_stagetolist
stage_endsr   blocks	enumerater   r   r   )rJ   r;   r   r   r;  r   blockrK   s          r/   rA   zSam2HieraDetModel.__init__O  s.    .v6KK6--c0b0bc
 !#KK6--v/K/KA/NPVPlPlmnPop!
 99V%<%<=AIIKmmo+4V5L5L+M 	%'I'"#34 %	+!Y)]l ""5)1$%	%r.   c                     | j                   S r   )r7  rJ   s    r/   get_input_embeddingsz&Sam2HieraDetModel.get_input_embeddingse  s    r.   hwre   c           	      4   |\  }}| j                   }t        j                  | j                  ||fd      }||j	                  t        |j                  |j                        D cg c]
  \  }}||z   c}}      z   }|j                  dddd      }|S c c}}w )Nbicubic)sizer   r   r   r	   r   )r*  r   r   r)  tileziprM   rN   )rJ   rD  hwwindow_embedr)  r   ys           r/   _get_pos_embedz Sam2HieraDetModel._get_pos_embedh  s    1,,MM$..1vIN	 1 1c)//[g[m[mFn2oda162o pp	%%aAq1	 3ps   %Br   c                    |t        d      | j                  |      }|| j                  |j                  dd       z   }d}t	        | j
                        D ]#  \  }} ||fi |}|| j                  v s||fz   }% t        ||      S )N You have to specify pixel_valuesr   r	   r-   )r    r  )r^   r7  rN  rM   r?  r>  r=  r  )rJ   rO   r   r#   r  r   block_modules          r/   rT   zSam2HieraDetModel.forwardp  s     ?@@((6%(;(;M<O<OPQRS<T(UU%'"(5 	[OA|(A&AMDOO#-G=JZ-Z*		[ '+'A
 	
r.   r   )r%   r&   r'   r   r.  r0  r   r   _can_record_outputsrA   rC  r,   r   r)   r   rN  r   r   r*   r   r   r   r  rT   rU   rV   s   @r/   r(  r(  G  s    %L$O,-
%1 %, sCx U\\   59
u001
 +,
 
u--	.	
 
r.   r(  zJ
    The vision model from Sam without any head or projection on top.
    c            
            e Zd ZeZdZeedZdef fdZ	d Z
e	 d	deej                     dee   deeef   fd       Z xZS )
Sam2VisionModelrO   r5  r;   c                     t         |   |       || _        t        j                  |j
                        | _        t        |      | _        |j                  | _	        | j                          y r   )r@   rA   r;   r   from_configbackbone_configbackboner   necknum_feature_levels	post_initrJ   r;   rK   s     r/   rA   zSam2VisionModel.__init__  sS     !--f.D.DE"6*	"(";";r.   c                 6    | j                   j                         S r   )rX  rC  rB  s    r/   rC  z$Sam2VisionModel.get_input_embeddings  s    }}1133r.   r   re   c                    |t        d       | j                  |fi |}|j                  }|j                  }| j	                  |      \  }}|| j
                   d  d d d   }|| j
                   d  d d d   }t        |||      S )NrP  ri   )r    r!   r"   )r^   rX  r    r  rY  rZ  r   )rJ   rO   r   backbone_outputr#   r  r!   r"   s           r/   rT   zSam2VisionModel.forward  s     ?@@ ($--??'99%4%O%O"3799=W3X00-t/F/F.F.HI$B$O 5t7N7N6N6P QRVTVRV W&+/"7
 	
r.   r   )r%   r&   r'   r   r.  r0  r   r   rR  rA   rC  r   r   r)   r*   r   r   r   r,   r   rT   rU   rV   s   @r/   rT  rT    s     $L$O,-
	/ 	4  59
u001
 +,
 
u--	.	
 
r.   rT  c                   ,     e Zd Zdef fdZddZ xZS )Sam2PositionalEmbeddingr;   c                     t         |           |j                  | _        | j                  t        j                  d|j
                  dz  f      z  }| j                  d|       y )Nr   positional_embedding)r@   rA   r\   r)   randnrC   register_buffer)rJ   r;   rc  rK   s      r/   rA   z Sam2PositionalEmbedding.__init__  sT    \\
#zzEKKF<N<NRS<S8T,UU35IJr.   c                    |j                         }|D|dddddddf   |d   z  |dddddddf<   |dddddddf   |d   z  |dddddddf<   |j                  t        j                         d|z  dz
  }|j                  | j                  j
                        }|| j                  z  }dt        j                  z  |z  }t        j                  t        j                  |      t        j                  |      gd      S )z8Positionally encode points that are normalized to [0,1].Nr   r   r   ri   rn   )clonerr   r)   r   rc  rc   r:  r`   r{   rx   ry   )rJ   input_coordsinput_shapecoordinatess       r/   rT   zSam2PositionalEmbedding.forward  s    "((*"&1!Q1*&=A&NK1a
#&1!Q1*&=A&NK1a
#u}}% +o)!nnT%>%>%D%DE!D$=$=="%%i+-yy%))K0%))K2HIrRRr.   r   r%   r&   r'   r   rA   rT   rU   rV   s   @r/   ra  ra    s    K6 KSr.   ra  c                   *     e Zd Zdef fdZd Z xZS )Sam2MaskEmbeddingr;   c                 2   t         |           |j                  dz  | _        t        |j                     | _        t        j                  d| j                  dd      | _        t        j                  | j                  |j                  dd      | _	        t        j                  |j                  |j                  d      | _        t        | j                  |j                  d      | _        t        | j                  dz  |j                  d      | _        y )Nrm   r   r   r=   r>   )r=   channels_firstr   data_format)r@   rA   mask_input_channelsr
   r  r   rD   rE   conv1conv2rC   conv3r&  r   r   r  r\  s     r/   rA   zSam2MaskEmbedding.__init__  s    #)#=#=#B  !2!23YYq$":":RST
YYt779S9Sabklm
YYv996;M;M[\]
($$&*?*?M]
 )$$q(f.C.CQa
r.   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }| j                  |      }| j                  |      }|S r   )rt  r   r   ru  r  rv  )rJ   masksr#   dense_embeddingss       r/   rT   zSam2MaskEmbedding.forward  so    

5)((76

=1((76::m4r.   rk  rV   s   @r/   rm  rm    s    
6 
	 r.   rm  c                       e Zd Zdef fdZdej                  dej                  dedej                  fdZdej                  dej                  fd	Z	d
e
eej                  ej                  f      de
ej                     de
ej                     de
ej                     deej                  ej                  f   f
dZ xZS )Sam2PromptEncoderr;   c                    t         |           t        |      | _        t	        |      | _        t        j                  d|j                        | _	        |j                  |j                  z  |j                  |j                  z  f| _        d|j                  z  |j                  z  d|j                  z  |j                  z  f| _        |j                  | _        t        j                  |j                  |j                        | _        |j                  | _        t        j                  d|j                        | _        y )Nr   rm   )r@   rA   ra  shared_embeddingrm  
mask_embedrD   r$  rC   no_mask_embed
image_size
patch_sizeimage_embedding_sizemask_input_sizeinput_image_sizenum_point_embeddingspoint_embednot_a_point_embedr\  s     r/   rA   zSam2PromptEncoder.__init__  s     7 ?+F3\\!V-?-?@%+%6%6&:K:K%KVM^M^bhbsbsMs$t! !F$5$5 59J9J JAPVPaPaLaekevevLvw & 1 1<<(C(CVEWEWX!--!#a1C1C!Dr.   pointslabelsr   re   c                 P   |dz   }|rZt         j                  j                  j                  |ddd      }t         j                  j                  j                  |ddd      }| j                  | j                  f}| j                  ||      }t        j                  |d   dk(  | j                  j                  |      }t        j                  |d   d	k7  |t        j                  |            }|| j                  |j                  d
            |dk\  j                  d      z  z   }|S )zEmbeds point prompts.      ?)r   r   r   r   constantr   )r   r   )r   r   ri   ).Ni)min)r)   rD   r   r   r  r}  wherer  r  
zeros_liker  clamp	unsqueeze)rJ   r  r  r   ri  point_embeddings         r/   _embed_pointszSam2PromptEncoder._embed_points  s   #XX((,,V\
Z[,\FXX((,,VV*TV,WF,,d.C.CD//D  ++fY&72&=t?U?U?\?\^mn  ++9$_-
 *D,<,<V\\a\=P,QU[_`U`TkTklnTo,oor.   boxesc                 h   |dz   }|j                   dd \  }}|j                  ||dd      }| j                  | j                  f}| j                  ||      }|dddddddfxx   | j                  j
                  d   z  cc<   |dddddddfxx   | j                  j
                  d   z  cc<   |S )zEmbeds box prompts.r  Nr   r   r   r	   )rM   r   r  r}  r  r  )rJ   r  r   nb_boxescoordsri  corner_embeddings          r/   _embed_boxeszSam2PromptEncoder._embed_boxes  s    ${{2A
Hz8Q:,,d.C.CD00EAq!$(8(8(?(?(BB$Aq!$(8(8(?(?(BB$r.   input_pointsinput_labelsinput_boxesinput_masksc                    d}d}|4|j                   d   }|t        d      | j                  |||du       }|}|>|j                   d   }| j                  |      }||}nt	        j
                  ||gd      }|| j                  |      }	||	fS | j                  j                  j                  dddd      j                  |d| j                  d   | j                  d         }	||	fS )	au  
        Embeds different types of prompts, returning both sparse and dense embeddings.

        Args:
            points (`torch.Tensor`, *optional*):
                point coordinates and labels to embed.
            boxes (`torch.Tensor`, *optional*):
                boxes to embed
            masks (`torch.Tensor`, *optional*):
                masks to embed
        Nr   r   z5If points are provided, labels must also be provided.)r   r   rn   ri   )rM   r^   r  r  r)   r{   r~  r  r  r   expandr  )
rJ   r  r  r  r  sparse_embeddingsr   point_embeddingsbox_embeddingsry  s
             r/   rT   zSam2PromptEncoder.forward%  s'   $ !
#%++A.J# !XYY#11,S^bfSf1h 0"$**1-J!..{;N ($2!$)II/@..QWX$Y!"#{; !"222	  $1188@@B1MTTB 9 9! <d>W>WXY>Z  !"222r.   )r%   r&   r'   r   rA   r)   r   rq   r  r  r   r,   rT   rU   rV   s   @r/   r{  r{    s    E6 EELL %,, T V[VbVb 2	 %,, 	 5<< 	 (3uU\\5<<%?@A(3 u||,(3 ell+	(3
 ell+(3 
u||U\\)	*(3r.   r{  c                        e Zd ZdZd
 fd	Z	 d
dej                  dej                  dej                  deej                     dee	   de
ej                  ej                  f   fd	Z xZS )Sam2Attentionz
    SAM2's attention layer that allows for downscaling the size of the embedding after projection to queries, keys, and
    values.
    c                    t         |           ||j                  n|}|| _        |j                  | _        |j                  |z  | _        |j                  | _        | j
                  |j                  z  | _        | j                  dz  | _        d| _	        t        j                  | j                  | j
                        | _        t        j                  | j                  | j
                        | _        t        j                  | j                  | j
                        | _        t        j                  | j
                  | j                        | _        y )Nr   F)r@   rA   attention_downsample_rater;   rC   internal_dimr   r   r   r   rD   r   q_projk_projv_projo_proj)rJ   r;   downsample_raterK   s      r/   rA   zSam2Attention.__init__V  s    >M>U&::[j!--"../A#)#=#= ))V-G-GG}}d*ii 0 0$2C2CDii 0 0$2C2CDii 0 0$2C2CDii 1 143C3CDr.   r   r   r   attention_similarityr   re   c                    |j                   d d \  }}||z  d| j                  | j                  f} | j                  |      j                  | j                  dd      } | j                  |      j                  | j                  dd      } | j                  |      j                  | j                  dd      }t        }	| j                  j                  dk7  rt        | j                  j                     }	 |	| |||f|d| j                  | j                  d|\  }
}|
j                  ||d| j                  | j                  z        j                         }
| j!                  |
      }
|
|fS )Nr   ri   r   r   r  )r   r   r   r   )rM   r   r   r  r   r   r  r  r   r;   r   r   r   r   r   r   r  )rJ   r   r   r   r  r   r   point_batch_size	new_shaper   r   r   s               r/   rT   zSam2Attention.forwardf  so    (-{{2A$
$"22B8P8PRVR_R_`	'E"''3==aC#dkk###Y/99!Q?'E"''3==aC(?;;++w6"9$++:Z:Z"[$7	
%

 0LLnn
%
 
%
!\ "))("d.F.F.V

*, 	 kk+.L((r.   r   )r%   r&   r'   r(   rA   r)   r   r   r   r   r,   rT   rU   rV   s   @r/   r  r  P  s    
E* 8<%)||%) \\%) ||	%)
 'u||4%) +,%) 
u||U\\)	*%)r.   r  c                   P     e Zd Zddedef fdZdededededed	ee   fd
Z	 xZ
S )Sam2TwoWayAttentionBlockr;   skip_first_layer_pec                 ,   t         |           t        |d      | _        t	        j
                  |j                        | _        t        |      | _        t	        j
                  |j                        | _	        t        |j                  |j                  |j                  |j                        | _        t	        j
                  |j                        | _        t	        j
                  |j                        | _        t        |      | _        || _        y)a  
        A transformer block with four layers:
            (1) self-attention of sparse inputs (2) cross attention of sparse inputs -> dense inputs (3) mlp block on
            sparse inputs (4) cross attention of dense inputs -> sparse inputs

        Arguments:
            config (`Sam2MaskDecoderConfig`):
                The configuration file used to instantiate the block
            attention_downsample_rate (*optionalk*, int, defaults to 2):
                The downsample ratio of the block used to reduce the inner dim of the attention.
            skip_first_layer_pe (*optional*, bool, defaults to `False`):
                Whether or not to skip the addition of the query_point_embedding on the first layer.
        r   )r  )r   N)r@   rA   r  	self_attnrD   r   rC   r   cross_attn_token_to_imager  r   mlp_dimnum_hidden_layersr  layer_norm3layer_norm4cross_attn_image_to_tokenr  )rJ   r;   r  rK   s      r/   rA   z!Sam2TwoWayAttentionBlock.__init__  s     	&vqA<<(:(:;)6v)>&<<(:(:;"0B0BvOgOg
 <<(:(:;<<(:(:;)6v)>&#6 r.   querieskeysquery_point_embeddingkey_point_embeddingr  r   c                    | j                   r| j                  |||      \  }}n!||z   }| j                  |||      \  }	}||	z   }| j                  |      }||z   }||z   }
| j                  ||
||      \  }	}||	z   }| j	                  |      }| j                  |      }||z   }| j                  |      }||z   }||z   }
| j                  |
||      \  }	}||	z   }| j                  |      }|||	fS )Nr   r   r   )r   r   r   r  )	r  r  r   r  r  r  r  r  r  )rJ   r  r  r  r  r  r   rP   r   attn_outr   mlp_outs               r/   rT   z Sam2TwoWayAttentionBlock.forward  s=    ##g7'RJGQ33E..u%w.OKHa(G""7+ //((44SCW 5 
! H$""7+ ((7#G#""7+ //((443EQX4Y!h%h&&r.   )F)r%   r&   r'   r   rq   rA   r   r   r   rT   rU   rV   s   @r/   r  r    s\    74 74 7>*'*' *'  &	*'
 $*' %*' +,*'r.   r  c                   X     e Zd Zdef fdZ	 d
dededededee   dee	e
f   fd	Z xZS )Sam2TwoWayTransformerr;   c                 r   t         |           || _        |j                  | _        t	        j
                         | _        t        | j                        D ]+  }| j                  j                  t        ||dk(               - t        |      | _        t	        j                  |j                        | _        y )Nr   )r  )r@   rA   r;   r  rD   r   r   r   r   r  r  final_attn_token_to_imager   rC   layer_norm_final_attn)rJ   r;   r   rK   s      r/   rA   zSam2TwoWayTransformer.__init__  s    !'!9!9mmot--. 	_AKK7UVZ[U[]^	_ *7v)>&%'\\&2D2D%E"r.   r  r5   image_positional_embeddingsr  r   re   c           
         |t        d      |j                  d      j                  ddd      j                  d      }|j                  d      j                  ddd      j                  d      }|}|}| j                  D ]  }	|||z  } |	d|||||d|\  }}}
 ||z   }||z   }| j                  |||      \  }}
||z   }| j                  |      }||fS )Nz&You have to specify an image_embeddingr   r   r   )r  r  r  r  r  r  r-   )r^   rz   rN   r  r   r  r  )rJ   r  r5   r  r  target_embeddingr   r  r  r   rP   r   r   r  s                 r/   rT   zSam2TwoWayTransformer.forward  s*    #EFF+33A6>>q!QGQQRST&A&I&I!&L&T&TUVXY[\&]&g&ghi&j# # [[ 	E+++$  &6$?%9   GT1		 **00445cQU4V!H$,,W5}r.   r   )r%   r&   r'   r   rA   r   r   r   r   r,   r   rT   rU   rV   s   @r/   r  r    sf    F4 F& ( ( !( &,	(
 %( +,( 
uo%	&(r.   r  c                   f     e Zd ZdZddd fd
Zdej                  dej                  f fdZ xZS )	r&  aA  LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    rh   channels_lastrq  c                \    t        |   |fd|i| |dvrt        d|       || _        y )Nr   )r  rp  zUnsupported data format: )r@   rA   NotImplementedErrorrr  )rJ   normalized_shaper   rr  r   rK   s        r/   rA   zSam2LayerNorm.__init__  s?    )=s=f=AA%(A+&OPP&r.   featuresre   c                     | j                   dk(  r9|j                  dddd      }t        |   |      }|j                  dddd      }|S t        |   |      }|S )z
        Args:
            features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
        rp  r   r   r	   r   )rr  rN   r@   rT   )rJ   r  rK   s     r/   rT   zSam2LayerNorm.forward   sj    
 //''1a3Hwx0H''1a3H  wx0Hr.   )	r%   r&   r'   r(   rA   r)   r   rT   rU   rV   s   @r/   r&  r&    s4    
 15/ '   r.   r&  c                   h    e Zd Zdef fdZ	 	 ddej                  dej                  dej                  dej                  dedeej                     d	e	ej                     d
e	ej                     de
e   deej                  ej                  ej                  ej                  f   fdZd Zd Z xZS )Sam2MaskDecoderr;   c                    t         |           || _        |j                  | _        |j                  | _        |j                  dz   | _        t        j                  d| j                        | _        t        j                  | j
                  | j                        | _	        t        |      | _        t        j                  | j                  | j                  dz  dd      | _        t        j                  | j                  dz  | j                  dz  dd      | _        t        | j                  dz  d      | _        t        j"                         | _        g }t'        | j
                        D ]5  }|t)        | j                  | j                  | j                  dz  d      gz  }7 t        j*                  |      | _        t)        | j                  |j.                  | j
                  |j0                  d	
      | _        t        j4                  |j                  |j                  dz  dd      | _        t        j4                  |j                  |j                  dz  dd      | _        t        j                  d| j                        | _        t)        | j                  | j                  dd      | _        |j>                  | _        |j@                  | _         |jB                  | _!        y )Nr   rm   r   ro     rp  )rr  r	   T)r   )"r@   rA   r;   rC   num_multimask_outputsnum_mask_tokensrD   r$  	iou_tokenmask_tokensr  transformerr  upscale_conv1upscale_conv2r&  upscale_layer_normGELUr   r   r   r   output_hypernetworks_mlpsiou_head_hidden_dimiou_head_depthiou_prediction_headrE   conv_s0conv_s1obj_score_tokenpred_obj_score_headdynamic_multimask_via_stability!dynamic_multimask_stability_delta"dynamic_multimask_stability_thresh)rJ   r;   	mlps_listrP   rK   s       r/   rA   zSam2MaskDecoder.__init__/  s`   !--%+%A%A"%;;a?a)9)9:<<(<(<d>N>NO08  //0@0@$BRBRVWBWefopq//0@0@A0EtGWGW[\G\jktuv"/0@0@A0ESc"d'')	t++, 	iA/$*:*:D<L<LdN^N^bcNcefghhI	i)+y)A&#2&&  !!$
  yy!3!3V5G5G15LZ[defyy!3!3V5G5G15LZ[def!||At/?/?@#243C3CTEUEUWXZ[#\ /5/U/U,171Y1Y.282[2[/r.   r5   r  sparse_prompt_embeddingsdense_prompt_embeddingsmultimask_outputhigh_resolution_featuresr  r  r   re   c	           
         |j                   \  }
}}}|j                   d   }t        j                  | j                  j                  | j
                  j                  | j                  j                  gd      }|j                  |
|dd      }|j                   d   dk7  rt        j                  ||fd      }n|}|j                  | j
                  j                  j                        }||z   }|j                  |d      }|j                  |d      } | j                  d	|||||d|	\  }}|dddddddf   }|dddddd| j                  z   ddf   }|j                  dd      j                  |
|z  |||      }|\  }}|j                  |d      }|j                  |d      }| j                  |      |z   }| j!                  | j#                  |            }| j!                  | j%                  |      |z         }g }t'        | j                        D ]*  }| j(                  |   }| ||dddd|ddf         gz  }, t        j*                  |d      }|j                   \  }}}}|j                  |
||||z        }||z  j                  |
|d||      }| j-                  |      }| j/                  |dddddddf         }|r+t1        dd      }|dddd|ddddf   }|dddd|f   }nd| j2                  r.| j4                  s"t1        dd      }| j7                  ||      \  }}n*t1        dd      }|dddd|ddddf   }|dddd|f   }|dddd|f   } ||| |fS )
a  
        Predict masks given image and prompt embeddings.

        Args:
            image_embeddings (`torch.Tensor`):
                The embeddings from the image encoder.
            image_positional_embeddings (`torch.Tensor`):
                Positional encoding with the shape of image_embeddings.
            sparse_prompt_embeddings (`torch.Tensor`):
                The embeddings of the points and boxes.
            dense_prompt_embeddings (`torch.Tensor`):
                The embeddings of the mask inputs.
            multimask_output (`bool`):
                Whether to return multiple masks or a single mask.
            high_resolution_features (`list[torch.Tensor]`, *optional*):
                The high-resolution features from the vision encoder.
            attention_similarity (`torch.Tensor`, *optional*):
                The attention similarity tensor.
            target_embedding (`torch.Tensor`, *optional*):
                The target embedding.
        r   r   rn   r   )r  r5   r  r  r  Nr	   ri   r-   )rM   r)   r{   r  r  r  r  repeatrr   rc   repeat_interleaver  r  r   r   r  r   r  r  r   r  rw   r  r  slicer  r    _dynamic_multimask_via_stability)!rJ   r5   r  r  r  r  r  r  r  r   r   rB   rQ   rR   r  output_tokenstokensr  iou_token_outmask_tokens_outfeat_s0feat_s1upscaled_embeddinghyper_in_listr   current_mlphyper_inrP   rx  iou_predr4   
mask_slicesam_tokens_outs!                                    r/   rT   zSam2MaskDecoder.forwardX  s   B 3C2H2H/
L&%399!<		$$++%%  ''
 
 &,,Z9I1aP#))!,1YY/GHaPF"F!99T^^%:%:%@%@A ,.EE+==>NTU=V&A&S&STdfg&h#-=T-=-= .
--(C!5-.
 .
** )Aq!4*1aa$:N:N6N1OQR+RS ,55a;@@))<
 4++,<!+D++,<!+D!//0@AGK!__T-D-DEW-XY!__T-?-?@R-SV]-]^,.t++, 	HA88;Kk/!Q1**EFGGM	H ;;}!4);)A)A&</44ZAQS_agjoaop..44ZAQSUW]_de ++M:"667G1aQR
7ST q$J!Q
Aq01E1j 01H11$--q!J"CCE8TOE8q!J!Q
Aq01E1j 01H(Az)9:h0CCCr.   c                    |j                  d      }| j                  }t        j                  ||kD  d      j	                         }t        j                  || kD  d      j	                         }t        j
                  |dkD  ||z  d      }|S )zz
        Compute stability scores of the mask logits based on the IoU between upper and
        lower thresholds.
        r   ri   rn   r   r  )rz   r  r)   sumr   r  )rJ   mask_logitsstability_deltaarea_iarea_ustability_scoress         r/   _get_stability_scoresz%Sam2MaskDecoder._get_stability_scores  s    
 "))"-@@;8bAGGI;/)99rBHHJ ;;vz6F?CHr.   c           	         |ddddddddddf   }|ddddddf   }t        j                  |d      }|j                  d      j                  d      j                  d      }|j                  ddd|j	                  d      |j	                  d            }t        j
                  |d|      }t        j
                  |d|j                  d            }|ddddddddddf   }	|ddddddf   }
| j                  |	      }|| j                  k\  }t        j                  |d   j                  |	      |	|      }t        j                  |j                  |
      |
|      }||fS )	as  
        When outputting a single mask, if the stability score from the current single-mask
        output (based on output token 0) falls below a threshold, we instead select from
        multi-mask outputs (based on output token 1~3) the mask with the highest predicted
        IoU score. This is intended to ensure a valid mask for both clicking and tracking.
        Nr   ri   rn   r   r   r   ).NN)
r)   argmaxr  r  rG  gatherr	  r  r  	expand_as)rJ   all_mask_logitsall_iou_scoresmultimask_logitsmultimask_iou_scoresbest_scores_indsbest_scores_inds_expandedbest_multimask_logitsbest_multimask_iou_scoressinglemask_logitssinglemask_iou_scoresr  	is_stablemask_logits_outiou_scores_outs                  r/   r  z0Sam2MaskDecoder._dynamic_multimask_via_stability  s    +1aQ>:-aABh7 <<(<"E$4$>$>r$B$L$LR$P$Z$Z[]$^!$=$D$DA',,R02B2G2G2K%
! !&-=qB[ \$)LL1EqJZJdJdegJh$i! ,Aq!A#q!O< .q!QqSy 9556GH$(O(OO	  ++o&001BC!

  56!%

 ..r.   )NN)r%   r&   r'   r   rA   r)   r   rq   listr   r   r   r,   rT   r	  r  rU   rV   s   @r/   r  r  .  s    '\4 '\b 8<37mD,,mD &+\\mD #(,,	mD
 "'mD mD #'u||"4mD 'u||4mD #5<<0mD +,mD 
u||U\\5<<E	FmD^
 #/r.   r  z
    Segment Anything Model 2 (SAM 2) for generating segmentation masks, given an input image and
    input points and labels, boxes, or masks.
    c                       e Zd ZdgZdgZd eed      iZg dZde	f fdZ
d Zd	 Zd
ej                  fdZ ej                          dej"                  dee   d
eej                     fd       Z ej                          	 	 	 	 ddeej"                     deej.                     deej"                     deej.                     fd       Zee	 	 	 	 	 	 	 	 	 ddeej"                     deej"                     deej.                     deej"                     deej.                     deej"                     dedeej"                     deej"                     dee   d
efd              Zdej"                  dee   d
eeej                     eej                     eeej"                  df      eeej"                  df      f   fdZ xZ S )r+  z4prompt_encoder.shared_embedding.positional_embeddingr8   r   )index)z
^memory_.*z^mask_downsample.*z^object_pointer_proj.*z0^temporal_positional_encoding_projection_layer.*no_memory_positional_encodingno_object_pointer%occlusion_spatial_embedding_parameterr;   c                    t         |   |       t        |j                        | _        t        j                  |j                        | _        t        |j                        | _
        |j                  |j                  _        t        |j                        | _        |j                  j                  | _        |j                  j                   | _        |j                  j"                  | _        t&        j(                  j+                  t'        j,                  dd| j$                              | _        | j1                          y )Nr   )r@   rA   ra  prompt_encoder_configshared_image_embeddingr   rV  vision_configvision_encoderr{  prompt_encoderr   mask_decoder_configr  mask_decoderrZ  backbone_feature_sizesr   r   r)   rD   r8  rp   r,  r[  r\  s     r/   rA   zSam2Model.__init__  s     &=f>Z>Z&[#'33F4H4HI/0L0LM:@:U:U""7+F,F,FG"("6"6"I"I&,&:&:&Q&Q# ..>>#(88#5#5ekk!Q6X#Y r.   c                     | j                   j                  j                  | j                  j                  j                  _        y r   )r#  rc  r   r&  r}  rB  s    r/   _tie_weightszSam2Model._tie_weights  s1    ''<<AA 	,,AAFr.   c                 6    | j                   j                         S r   )r%  rC  rB  s    r/   rC  zSam2Model.get_input_embeddings$  s    ""7799r.   re   c                    | j                   j                  }| j                  j                  j                  }| j                  j                  j
                  }t        j                  |||      }|j                  d      dz
  }|j                  d      dz
  }||d   z  }||d   z  }| j                  t        j                  ||gd            }|j                  ddd      j                  d      S )Nrg   r   rn   r  r   ri   r   )r&  r  r#  rc  rb   rc   r)   onesrs   rw   rN   r  )rJ   rG  target_devicetarget_dtypegridr}   r~   rc  s           r/   $get_image_wide_positional_embeddingsz.Sam2Model.get_image_wide_positional_embeddings'  s    ""7733HHOO22GGMMzz$}LI++!+$s*++!+$s*DG#DG##::5;;QXGY_a;bc#++Aq!4>>qAAr.   rO   r   c           
          |j                   d   } | j                  |fi |\  }}}}|d   | j                  z   |d<   t        || j                        D cg c]*  \  }} |j                  ddd      j                  |dg| , }}}|S c c}}w )z
        Returns the image embeddings by passing the pixel values through the vision encoder.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Input pixel values
        r   ri   r   r   )rM   get_image_featuresr,  rI  r)  rN   r   )	rJ   rO   r   r   feature_mapsrP   feat	feat_sizer5   s	            r/   get_image_embeddingszSam2Model.get_image_embeddings4  s     "''*
 7 7 7 O OaA (+d.F.FFR
 $'|T5P5P#Q
i 'DLLAq!&&z2B	B
 

  
s   /B
r  r  r  r  c                 0    | j                  ||||      }|S )a  
        Returns the prompt embeddings by passing the input points, labels, boxes and masks through the prompt encoder.

        Args:
            input_points (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_points_per_image, 2)`):
                Optional input points for the prompt encoder. The padding of the point is automatically done by the
                processor. `point_batch_size` refers to the number of masks that we want the model to predict per
                point. The model will output `point_batch_size` times 3 masks in total.
            input_labels (`torch.LongTensor` of shape `(batch_size, point_batch_size, num_points_per_image)`):
                Optional input labels for the prompt encoder. The padding of the labels is automatically done by the
                processor, or can be fed by the user.
            input_boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes_per_image, 4)`):
                Optional input boxes for the prompt encoder. The padding of the boxes is automatically done by the
                processor. users can also pass manually the input boxes.
            input_masks (`torch.LongTensor` of shape `(batch_size, image_size, image_size)`):
                Optional input masks for the prompt encoder.
        r  r  r  r  )r&  )rJ   r  r  r  r  prompt_outputs         r/   get_prompt_embeddingszSam2Model.get_prompt_embeddingsO  s-    2 ++%%##	 , 
 r.   r5   r  r  r  c
                 h   |du |du z  st        d      |V|T|j                  d   |j                  d   k7  r5t        dj                  |j                  d   |j                  d               | j                         }||j                  d   n|d   j                  d   }|j	                  |ddd      }d}d}|x | j
                  |fi |
\  }}}}|d   | j                  z   |d<   t        || j                        D cg c]*  \  }} |j                  ddd      j                  |dg| , }}}|?|=t        j                  |dddddddf   t        j                  |j                        }|m|kt        j                  |ddd|d   j                   |d   j                        }t        j"                  |ddt        j$                  |d   j                         }|{|j                  d	d | j&                  j(                  k7  rUt+        j,                  |j/                         | j&                  j(                  d
dd      j1                  |j                         }| j'                  ||||      \  }} | j2                  d|d   |||||dd ||	d|
\  }}}}t5        ||||||      S c c}}w )a  
        input_points (`torch.FloatTensor` of shape `(batch_size, num_points, 2)`):
            Input 2D spatial points, this is used by the prompt encoder to encode the prompt. Generally yields to much
            better results. The points can be obtained by passing a list of list of list to the processor that will
            create corresponding `torch` tensors of dimension 4. The first dimension is the image batch size, the
            second dimension is the point batch size (i.e. how many segmentation masks do we want the model to predict
            per input point), the third dimension is the number of points per segmentation mask (it is possible to pass
            multiple points for a single mask), and the last dimension is the x (vertical) and y (horizontal)
            coordinates of the point. If a different number of points is passed either for each image, or for each
            mask, the processor will create "PAD" points that will correspond to the (0, 0) coordinate, and the
            computation of the embedding will be skipped for these points using the labels.
        input_labels (`torch.LongTensor` of shape `(batch_size, point_batch_size, num_points)`):
            Input labels for the points, this is used by the prompt encoder to encode the prompt. According to the
            official implementation, there are 3 types of labels

            - `1`: the point is a point that contains the object of interest
            - `0`: the point is a point that does not contain the object of interest
            - `-1`: the point corresponds to the background

            We added the label:

            - `-10`: the point is a padding point, thus should be ignored by the prompt encoder

            The padding labels should be automatically done by the processor.
        input_boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes, 4)`):
            Input boxes for the points, this is used by the prompt encoder to encode the prompt. Generally yields to
            much better generated masks. The boxes can be obtained by passing a list of list of list to the processor,
            that will generate a `torch` tensor, with each dimension corresponding respectively to the image batch
            size, the number of boxes per image and the coordinates of the top left and bottom right point of the box.
            In the order (`x1`, `y1`, `x2`, `y2`):

            - `x1`: the x coordinate of the top left point of the input box
            - `y1`: the y coordinate of the top left point of the input box
            - `x2`: the x coordinate of the bottom right point of the input box
            - `y2`: the y coordinate of the bottom right point of the input box
        input_masks (`torch.FloatTensor` of shape `(batch_size, image_size, image_size)`):
            SAM model also accepts segmentation masks as input. The mask will be embedded by the prompt encoder to
            generate a corresponding embedding, that will be fed later on to the mask decoder. These masks needs to be
            manually fed by the user, and they need to be of shape (`batch_size`, `image_size`, `image_size`).
        image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_channels, window_size, window_size)`):
            Image embeddings, this is used by the mask decoder to generate masks and iou scores. For more memory
            efficient computation, users can first retrieve the image embeddings using the `get_image_embeddings`
            method, and then feed them to the `forward` method instead of feeding the `pixel_values`.
        multimask_output (`bool`, *optional*):
            In the original implementation and paper, the model always outputs 3 masks per image (or per point / per
            bounding box if relevant). However, it is possible to just output a single mask, that corresponds to the
            "best" mask, by specifying `multimask_output=False`.
        attention_similarity (`torch.FloatTensor`, *optional*):
            Attention similarity tensor, to be provided to the mask decoder for target-guided attention in case the
            model is used for personalization as introduced in [PerSAM](https://huggingface.co/papers/2305.03048).
        target_embedding (`torch.FloatTensor`, *optional*):
            Embedding of the target concept, to be provided to the mask decoder for target-semantic prompting in case
            the model is used for personalization as introduced in [PerSAM](https://huggingface.co/papers/2305.03048).

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoModel, AutoProcessor

        >>> model = AutoModel.from_pretrained("danelcsb/sam2.1_hiera_tiny")
        >>> processor = AutoProcessor.from_pretrained("danelcsb/sam2.1_hiera_tiny")

        >>> img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png"
        >>> raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
        >>> input_points = [[[400, 650]]]  # 2D location of a window on the car
        >>> inputs = processor(images=raw_image, input_points=input_points, return_tensors="pt")

        >>> # Get segmentation mask
        >>> outputs = model(**inputs)

        >>> # Postprocess masks
        >>> masks = processor.post_process_masks(
        ...     outputs.pred_masks, inputs["original_sizes"], inputs["reshaped_input_sizes"]
        ... )
        ```
        NzAExactly one of pixel_values or image_embeddings must be provided.r   zQYou should provide as many bounding boxes as input points per box. Got {} and {}.r   ri   r   rj   r   FbilinearT)rG  r   r   r   r:  )r5   r  r  r  r  r  r  r  )r2   r3   r4   r5   r6   r7   r-   )r^   rM   formatr2  r  r4  r,  rI  r)  rN   r   r)   	ones_liker   rb   rp   rc   r.  int32r&  r  r   r   r   rr   r(  r1   )rJ   rO   r  r  r  r  r5   r  r  r  r   r  r   r7   r6   r5  rP   r6  r7  r  ry  low_res_multimasksr2   r4   s                           r/   rT   zSam2Model.forwardp  s=   z %*:d*BC`aa#(?!!!$(9(9!(<< gnn$**1-{/@/@/C  '+&O&O&Q#.:.F\''*L\]_L`LfLfghLi
&A&H&HUVXY[\&]# ##G^tG^G^HHDL!13D  ,B/$2J2JJL
 (+<9T9T'U #D) +Q1%**:rFIF   
 #(< ??<1a
+C599]i]p]pqLK$7 ;;Aq!+;B+?+E+EN^_aNbNiNiL "JJz1au{{ScdfSgSnSnooL"   %)<)<)L)LLmm%%',,<<"'#" "[&&'  /3.A.A%%##	 /B /
++ BSARAR 
B
-b1(C%6$4-%5cr%:!5-
B
 
B
>J+> +!) 3-!5/
 	
[ s    /J..c                     | j                   |fi |}|j                  }|j                  }t        |      }| j                  j                  |d         |d<   | j                  j                  |d         |d<   |D cg c]$  }|j                  d      j                  ddd      & }}|D cg c]$  }|j                  d      j                  ddd      & }}|||j                  |j                  fS c c}w c c}w )a  
        Extract and preprocess image features using the vision encoder.

        Args:
            pixel_values (`torch.FloatTensor`):
                Input pixel values of shape `(batch_size, num_channels, height, width)`.

        Returns:
            `tuple`: A tuple containing:
                - feature_maps (`list[torch.Tensor]`): List of feature maps from different levels.
                - feature_maps_position_embeddings (`list[torch.Tensor]`): List of positional embeddings for each feature level.
                - vision_hidden_states (`tuple[torch.FloatTensor]`, *optional*): Hidden states from the vision encoder.
                - vision_attentions (`tuple[torch.FloatTensor]`, *optional*): Attention weights from the vision encoder.
        r   r   r   )r%  r!   r"   r  r(  r  r  rz   rN   r#   r$   )rJ   rO   r   vision_outputsr5   feature_maps_position_embeddingsfeature_mapfeature_map_position_embeddings           r/   r4  zSam2Model.get_image_features  s   0 3F$2E2E3
3

 &77+9+O+O( L)++33LODQ++33LODQ T``K++A.66q!Q?`` 3S,
. +2215==aAF,
( ,

 =~?[?[]k]v]vvv a,
s   =)C1,)C6)NNNN)	NNNNNNTNN)!r%   r&   r'   _tied_weights_keys_keys_to_ignore_on_load_missingr   r  rR  "_keys_to_ignore_on_load_unexpectedr   rA   r+  rC  r)   r   r2  no_gradr*   r   r   r  r8  r   
LongTensorr<  r   r   rq   r1   rT   r,   r4  rU   rV   s   @r/   r+  r+    s    QQ']&^#4nE]ef6gh*&z "

:Bell B U]]_ ''  +,  
ell		   4 U]]_ 59373726u001 u//0 e//0	
 e../ @  59483737268<!%<@8<k
u001k
 u001k
 u//0	k

 e//0k
 e../k
 #5#4#45k
 k
 'u'8'89k
 #5#4#45k
 +,k
 
%k
  k
Z-w''-w +,-w 
U\\U\\u((#-./u((#-./	1
	-wr.   r+  )r+  rT  r  r(  )r  r   )Kr_   dataclassesr   typingr   r   r   numpyr:  r)   torch.nnrD   torch.nn.functionalr   r   r   transformers.utils.genericr   activationsr
   modeling_layersr   modeling_outputsr   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   utils.genericr   r   autor   configuration_sam2r   r   r   r   r   r   r1   Moduler:   rX   r   r   r   r   r   r   r   r   r   r   r  r  r(  rT  ra  rm  r{  r  r  r  r   r&  r  r+  __all__r-   r.   r/   <module>r_     s  ,  ! , ,       5 ! 9 / F & @ D   KL?k ? M ?6 FGL+ L H L@")) D+		 +\/8RYY /8r %II%<<% 
% <<	%
 U\\*% % %,u|| 8C= ELL ?bii ?Dbii <2@>Z4 Zz 
	Ok 	O 	O 8/ 8 8>?
+ ?
D 
-
) -

-
`Sbii S2 		  6[3		 [3|;)BII ;)|J'ryy J'Z6BII 6rBLL 4H/bii H/V Mw# MwMw`
 Wr.   