
    h+p                     4   d Z ddlZddlmZmZmZ ddlZddlZddlm	Z	 ddl
mZ ddlmZ ddlmZmZmZmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZmZmZmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z&  ejN                  e(      Z) G d de	jT                        Z+ G d de	jT                        Z,	 d@de	jT                  dejZ                  dejZ                  dejZ                  deejZ                     de.de.fdZ/ G d de	jT                        Z0 G d d e	jT                        Z1 G d! d"e	jT                        Z2 G d# d$e	jT                        Z3dAd%ejZ                  d&e.d'e4d(ejZ                  fd)Z5 G d* d+e	jT                        Z6 G d, d-e	jT                        Z7 G d. d/e	jT                        Z8 G d0 d1e      Z9 G d2 d3e	jT                        Z:e G d4 d5e             Z;e G d6 d7e;             Z< ed89       G d: d;e;             Z= ed<9       G d= d>e;e!             Z>g d?Z?y)BzPyTorch DINOv2 model.    N)CallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BackboneOutputBaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack) find_pruneable_heads_and_indicesprune_linear_layer)TransformersKwargsauto_docstringlogging	torch_int)BackboneMixin)can_return_tuplecheck_model_inputs   )Dinov2Configc                        e Zd ZdZdeddf fdZdej                  dededej                  fd	Z	dd
ej                  de
ej                     dej                  fdZ xZS )Dinov2EmbeddingszM
    Construct the CLS token, mask token, position and patch embeddings.
    configreturnNc                 z   t         |           t        j                  t	        j
                  dd|j                              | _        |j                  r8t        j                  t	        j                  d|j                              | _
        t        |      | _        | j                  j                  }t        j                  t	        j
                  d|dz   |j                              | _        t        j                  |j                         | _        |j$                  | _        |j                  | _        || _        y )Nr   )super__init__r   	Parametertorchrandnhidden_size	cls_tokenuse_mask_tokenzeros
mask_tokenDinov2PatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout
patch_sizer   )selfr   r-   	__class__s      h/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/dinov2/modeling_dinov2.pyr"   zDinov2Embeddings.__init__,   s    ekk!Q8J8J&KL   ll5;;q&:L:L+MNDO 5f =++77#%<<A{QPVPbPb0c#d zz&"<"<= ++$33    
embeddingsheightwidthc                    |j                   d   dz
  }| j                  j                   d   dz
  }t        j                  j	                         s||k(  r||k(  r| j                  S | j                  ddddf   }| j                  ddddf   }|j                   d   }|| j
                  z  }	|| j
                  z  }
t        |dz        }|j                  d|||      }|j                  dddd      }|j                  }t        j                  j                  |j                  t        j                        |	|
fdd	
      j                  |      }|j                  dddd      j                  dd|      }t        j                   ||fd      S )a-  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and interpolation at torch.float32 precision.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Ng      ?r   r      bicubicF)sizemodealign_cornersdtypedim)shaper.   r$   jit
is_tracingr2   r   reshapepermuterB   r   
functionalinterpolatetofloat32viewcat)r3   r7   r8   r9   r-   num_positionsclass_pos_embedpatch_pos_embedrD   
new_height	new_widthsqrt_num_positionstarget_dtypes                r5   interpolate_pos_encodingz)Dinov2Embeddings.interpolate_pos_encoding:   s    !&&q)A-0066q9A= yy##%+*F6UZ?+++221bqb59221ab59r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=&,,--33u}}-i(	 4 

 "<"
  	 *11!Q1=BB1b#Nyy/?;CCr6   pixel_valuesbool_masked_posc                 D   |j                   \  }}}}| j                  j                  j                  j                  }| j                  |j                  |            }|d| j                  rXt        j                  |j                  d      | j                  j                  |j                        j                  d      |      }| j                  j                  |dd      }	t        j                  |	|fd      }|| j                  |||      z   }| j                  |      }|S )NrA   r;   r   r   rC   )rE   r,   
projectionweightrB   rL   r(   r$   where	unsqueezer*   r'   expandrO   rW   r1   )
r3   rX   rY   
batch_size_r8   r9   rV   r7   
cls_tokenss
             r5   forwardzDinov2Embeddings.forwardb   s    '3'9'9$
Avu,,77>>DD**<???+NO
&4+>+>))"-t/A/A*BRBR/S/]/]^_/`blJ
 ^^**:r2>
YY
J7Q?
  $"?"?
FTY"ZZ
\\*-
r6   N)__name__
__module____qualname____doc__r   r"   r$   TensorintrW   r   rc   __classcell__r4   s   @r5   r   r   '   s|    |  &D5<< &D &DUX &D]b]i]i &DPELL 8ELLCY ejeqeq r6   r   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )r+   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        || _
        t        j                  ||||      | _        y )Nr   r   )kernel_sizestride)r!   r"   
image_sizer2   num_channelsr&   
isinstancecollectionsabcIterabler-   r   Conv2dr[   )r3   r   rq   r2   rr   r&   r-   r4   s          r5   r"   zDinov2PatchEmbeddings.__init__   s    !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&))L+:^hir6   rX   r   c                     |j                   d   }|| j                  k7  rt        d| j                   d| d      | j                  |      j	                  d      j                  dd      }|S )Nr   zoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r<   )rE   rr   
ValueErrorr[   flatten	transpose)r3   rX   rr   r7   s       r5   rc   zDinov2PatchEmbeddings.forward   sz    #))!,4,,,!../yaI  __\2::1=GG1M
r6   )	re   rf   rg   rh   r"   r$   ri   rc   rk   rl   s   @r5   r+   r+   x   s)    jELL U\\ r6   r+   modulequerykeyvalueattention_maskscalingr1   c                    t        j                  ||j                  dd            |z  }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }|||z  }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr;   )rD   rB   )ptrainingr   r<   )r$   matmulr|   r   rJ   softmaxrM   rL   rB   r1   r   
contiguous)
r}   r~   r   r   r   r   r1   kwargsattn_weightsattn_outputs
             r5   eager_attention_forwardr      s     <<s}}R'<=GL ==((2U]](SVVW\WbWbcL ==((6??([L !#n4,,|U3K''1-88:K$$r6   c            	            e Zd Zdef fdZ	 ddej                  deej                     deej                  ej                  f   fdZ	 xZ
S )Dinov2SelfAttentionr   c                 2   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        |j                  | _        | j                  dz  | _        d| _        t        j                  |j                  | j                  |j                         | _        t        j                  |j                  | j                  |j                         | _        t        j                  |j                  | j                  |j                         | _        y )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads ry   g      Fbias)r!   r"   r&   num_attention_headshasattrrz   r   rj   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr   Linearqkv_biasr~   r   r   r3   r   r4   s     r5   r"   zDinov2SelfAttention.__init__   sF    : ::a?PVXhHi"6#5#5"6 7334A7 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PP"??//5YYv1143E3EFOO\
99V//1C1C&//ZYYv1143E3EFOO\
r6   hidden_states	head_maskr   c           
         |j                   d   }|d| j                  | j                  f} | j                  |      j                  | j                  dd      } | j                  |      j                  | j                  dd      } | j                  |      j                  | j                  dd      }t        }| j                  j                  dk7  rt        | j                  j                     } || ||||| j                  | j                  | j                  sdn| j                        \  }	}
|	j!                         d d | j"                  fz   }|	j%                  |      }	|	|
fS )	Nr   r;   r   r<   eager        )r   r   r1   r   )rE   r   r   r   rN   r|   r   r~   r   r   _attn_implementationr   r   r   r   r   r>   r   rH   )r3   r   r   r`   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapes               r5   rc   zDinov2SelfAttention.forward   sR    #((+
D$<$<d>V>VV	0DHH]+00)<FFq!L	4djj/44i@JJ1aP4djj/44i@JJ1aP(?;;++w6"9$++:Z:Z"[)<nnLL#}}C$2C2C	*
& #0"4"4"6s";t?Q?Q>S"S%--.EFo--r6   rd   )re   rf   rg   r   r"   r$   ri   r   tuplerc   rk   rl   s   @r5   r   r      sT    ]| ]* PT."\\.6>u||6L.	u||U\\)	*.r6   r   c                   x     e Zd ZdZdef fdZdej                  dej                  dej                  fdZ xZ	S )Dinov2SelfOutputz
    The residual connection is defined in Dinov2Layer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r   c                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y rd   )	r!   r"   r   r   r&   denser/   r0   r1   r   s     r5   r"   zDinov2SelfOutput.__init__   sB    YYv1163E3EF
zz&"<"<=r6   r   input_tensorr   c                 J    | j                  |      }| j                  |      }|S rd   )r   r1   )r3   r   r   s      r5   rc   zDinov2SelfOutput.forward   s$    

=1]3r6   )
re   rf   rg   rh   r   r"   r$   ri   rc   rk   rl   s   @r5   r   r      s=    
>| >
U\\  RWR^R^ r6   r   c                        e Zd Zdef fdZdee   fdZd	dej                  de
ej                     dej                  fdZ xZS )
Dinov2Attentionr   c                     t         |           t        |      | _        t	        |      | _        t               | _        y rd   )r!   r"   r   	attentionr   outputsetpruned_headsr   s     r5   r"   zDinov2Attention.__init__  s0    ,V4&v.Er6   headsc                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   rC   )lenr   r   r   r   r   r   r~   r   r   r   r   r   union)r3   r   indexs      r5   prune_headszDinov2Attention.prune_heads  s   u:?74>>55t~~7Y7Y[_[l[l
u
  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:r6   r   r   r   c                 T    | j                  ||      \  }}| j                  ||      }|S rd   )r   r   )r3   r   r   self_attn_outputra   r   s         r5   rc   zDinov2Attention.forward  s.    "nn]IF!-}=r6   rd   )re   rf   rg   r   r"   r   rj   r   r$   ri   r   rc   rk   rl   s   @r5   r   r      sM    "| ";S ;$U\\ hu||>T `e`l`l r6   r   c                   X     e Zd Zd fdZdej
                  dej
                  fdZ xZS )Dinov2LayerScaler   c                     t         |           t        j                  |j                  t        j                  |j                        z        | _        y rd   )	r!   r"   r   r#   layerscale_valuer$   onesr&   lambda1r   s     r5   r"   zDinov2LayerScale.__init__   s8    ||F$;$;ejjI[I[>\$\]r6   hidden_statec                      || j                   z  S rd   )r   r3   r   s     r5   rc   zDinov2LayerScale.forward$  s    dll**r6   r   Nre   rf   rg   r"   r$   ri   rc   rk   rl   s   @r5   r   r     s$    ^+ELL +U\\ +r6   r   input	drop_probr   r   c                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   )rB   device)rE   ndimr$   randrB   r   floor_div)r   r   r   	keep_probrE   random_tensorr   s          r5   	drop_pathr   )  s     CxII[[^

Q 77E

5ELL YYMYYy!M1FMr6   c                   x     e Zd ZdZd	dee   ddf fdZdej                  dej                  fdZ	de
fdZ xZS )
Dinov2DropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                 0    t         |           || _        y rd   )r!   r"   r   )r3   r   r4   s     r5   r"   zDinov2DropPath.__init__A  s    "r6   r   c                 D    t        || j                  | j                        S rd   )r   r   r   )r3   r   s     r5   rc   zDinov2DropPath.forwardE  s    FFr6   c                      d| j                    S )Nzp=)r   r3   s    r5   
extra_reprzDinov2DropPath.extra_reprH  s    DNN#$$r6   rd   )re   rf   rg   rh   r   floatr"   r$   ri   rc   strr   rk   rl   s   @r5   r   r   >  sG    b#(5/ #T #GU\\ Gell G%C %r6   r   c                   X     e Zd Zd fdZdej
                  dej
                  fdZ xZS )	Dinov2MLPr   c                 ~   t         |           |j                  x}}t        |j                  |j                  z        }t        j                  ||d      | _        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  ||d      | _        y )NTr   )r!   r"   r&   rj   	mlp_ratior   r   fc1rs   
hidden_actr   r   
activationfc2r3   r   in_featuresout_featureshidden_featuresr4   s        r5   r"   zDinov2MLP.__init__M  s    %+%7%77lf0063C3CCD99[/Ef''-$V%6%67DO$//DO99_lFr6   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rd   )r   r   r   r   s     r5   rc   zDinov2MLP.forwardX  s2    xx-|4xx-r6   r   r   rl   s   @r5   r   r   L  s$    	GELL U\\ r6   r   c                   X     e Zd Zd fdZdej
                  dej
                  fdZ xZS )Dinov2SwiGLUFFNr   c                 0   t         |           |j                  x}}t        |j                  |j                  z        }t        |dz  dz        dz   dz  dz  }t        j                  |d|z  d      | _        t        j                  ||d      | _        y )Nr<   r         Tr   )	r!   r"   r&   rj   r   r   r   
weights_inweights_outr   s        r5   r"   zDinov2SwiGLUFFN.__init__`  s    %+%7%77lf0063C3CCD2Q67!;AAE))K_1D4P99_lNr6   r   c                     | j                  |      }|j                  dd      \  }}t        j                  j	                  |      |z  }| j                  |      S )Nr<   r;   rC   )r   chunkr   rJ   silur   )r3   r   x1x2hiddens        r5   rc   zDinov2SwiGLUFFN.forwardi  sS    |4##A2#.B##B'",''r6   r   r   rl   s   @r5   r   r   _  s$    O(ELL (U\\ (r6   r   c                        e Zd ZdZdeddf fdZ	 d	dej                  deej                     dej                  fdZ	 xZ
S )
Dinov2LayerzCThis corresponds to the Block class in the original implementation.r   r   Nc                    t         |           t        j                  |j                  |j
                        | _        t        |      | _        t        |      | _
        |j                  dkD  rt        |j                        nt        j                         | _        t        j                  |j                  |j
                        | _        |j                   rt#        |      | _        nt'        |      | _        t        |      | _        y )Nepsr   )r!   r"   r   	LayerNormr&   layer_norm_epsnorm1r   r   r   layer_scale1drop_path_rater   Identityr   norm2use_swiglu_ffnr   mlpr   layer_scale2r   s     r5   r"   zDinov2Layer.__init__s  s    \\&"4"4&:O:OP
(0,V4BHBWBWZ]B](=(=>cecncncp\\&"4"4&:O:OP
  &v.DH (DH,V4r6   r   r   c                 $   | j                  |      }| j                  ||      }| j                  |      }| j                  |      |z   }| j	                  |      }| j                  |      }| j                  |      }| j                  |      |z   }|S rd   )r  r   r  r   r  r  r	  )r3   r   r   hidden_states_normself_attention_outputlayer_outputs         r5   rc   zDinov2Layer.forward  s    
 "ZZ6 $/A9 M $ 1 12G H '<=M zz-0xx-((6 ~~l3mCr6   rd   )re   rf   rg   rh   r   r"   r$   ri   r   rc   rk   rl   s   @r5   r   r   p  sP    M5| 5 5& -1|| ELL) 
	r6   r   c            	       n     e Zd Zdef fdZ	 ddej                  deej                     dede	fdZ
 xZS )	Dinov2Encoderr   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w NF)
r!   r"   r   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointingr3   r   ra   r4   s      r5   r"   zDinov2Encoder.__init__  sN    ]]vG_G_A`#aAK$7#ab
&+# $bs   A#r   r   output_hidden_statesr   c                     |r|gnd }t        | j                        D ]+  \  }}|||   nd } |||      }|s|j                  |       - t        ||rt	        |            S d       S )N)last_hidden_stater   )	enumerater  appendr   r   )r3   r   r   r  all_hidden_statesilayer_modulelayer_head_masks           r5   rc   zDinov2Encoder.forward  s     0D]O(4 	8OA|.7.CilO(HM !((7		8 +6G% 12
 	
MQ
 	
r6   r  )re   rf   rg   r   r"   r$   ri   r   boolr   rc   rk   rl   s   @r5   r  r    sH    ,| , sx
"\\
6>u||6L
ko
	
r6   r  c                       e Zd ZU eed<   dZdZdZdgZdZ	dZ
dZdZdeiZdeej"                  ej$                  ej&                  f   dd	fd
Zy	)Dinov2PreTrainedModelr   dinov2rX   Tr   
attentionsr}   r   Nc                 H   t        |t        j                  t        j                  f      rt        j                  j                  |j                  j                  j                  t        j                        d| j                  j                        j                  |j                  j                        |j                  _        |j                  %|j                  j                  j                          yyt        |t        j                         rJ|j                  j                  j                          |j                  j                  j#                  d       yt        |t$              rnt        j                  j                  |j&                  j                  j                  t        j                        d| j                  j                        j                  |j&                  j                        |j&                  _        t        j                  j                  |j(                  j                  j                  t        j                        d| j                  j                        j                  |j(                  j                        |j(                  _        | j                  j*                  r%|j,                  j                  j                          yyt        |t.              r:|j0                  j                  j#                  | j                  j2                         yy)zInitialize the weightsr   )meanstdNg      ?)rs   r   r   rw   inittrunc_normal_r\   datarL   r$   rM   r   initializer_rangerB   r   zero_r   fill_r   r.   r'   r(   r*   r   r   r   )r3   r}   s     r5   _init_weightsz#Dinov2PreTrainedModel._init_weights  s!   fryy"))45 "$!6!6""%%emm43DKKDaDa "7 "b$$% MM {{&  &&( '-KK""$MM$$S) 01.0gg.C.C**//225==AKK11 /D / b++112	 &&+ %'GG$9$9  %%((7KK11 %: % b!!''(	 ! {{))!!&&,,. * 01NN%%dkk&B&BC 2r6   )re   rf   rg   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   _can_record_outputsr   r   r   rw   r   r/   r6   r5   r#  r#    su     $O&*#&N"&)DE"))RYY*L$M DRV Dr6   r#  c                        e Zd Zdef fdZdefdZdeee	e   f   ddfdZ
ee	 	 	 	 ddeej                     d	eej                     d
eej                     dee   def
d              Z xZS )Dinov2Modelr   c                     t         |   |       || _        t        |      | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y )Nr   )r!   r"   r   r   r7   r  encoderr   r   r&   r  	layernorm	post_initr   s     r5   r"   zDinov2Model.__init__  sY     *62$V,f&8&8f>S>ST 	r6   r   c                 .    | j                   j                  S rd   r7   r,   r   s    r5   get_input_embeddingsz Dinov2Model.get_input_embeddings      ///r6   heads_to_pruneNc                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr>  r  r   r   )r3   rE  r  r   s       r5   _prune_headszDinov2Model._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Cr6   rX   rY   r   r  c                 h   || j                   j                  }|t        d      | j                  || j                   j                        }| j                  ||      }| j                  |||      }|j                  }| j                  |      }|dddddf   }	t        ||	|j                        S )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
            pre-training.
        Nz You have to specify pixel_values)rY   )r   r  r   )r  pooler_outputr   )r   r  rz   get_head_maskr  r7   r>  r  r?  r   r   )
r3   rX   rY   r   r  r   embedding_outputencoder_outputssequence_outputpooled_outputs
             r5   rc   zDinov2Model.forward  s      '#';;#C#C ?@@ &&y$++2O2OP	??<?Y+/<<	H\ ,8 ,
 *;;..9'1a0)-')77
 	
r6   )NNNN)re   rf   rg   r   r"   r+   rC  dictrj   listrH  r   r   r   r$   ri   r!  r   rc   rk   rl   s   @r5   r<  r<    s    
| 
0&; 0C4T#Y+? CD C  0426,0/3'
u||,'
 "%,,/'
 ELL)	'

 'tn'
 
$'
  '
r6   r<  z
    Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.
    )custom_introc                        e Zd Zdeddf fdZee	 	 	 d
deej                     deej                     deej                     de
e   def
d	              Z xZS )Dinov2ForImageClassificationr   r   Nc                 0   t         |   |       |j                  | _        t        |      | _        |j                  dkD  r-t        j                  |j                  dz  |j                        nt        j                         | _	        | j                          y )Nr   r<   )r!   r"   
num_labelsr<  r$  r   r   r&   r  
classifierr@  r   s     r5   r"   z%Dinov2ForImageClassification.__init__,  sy      ++!&) EKDUDUXYDYBIIf((1,f.?.?@_a_j_j_l 	
 	r6   rX   r   labelsr   c                 l    | j                   |fd|i|}|j                  }|dddf   }|ddddf   }t        j                  ||j	                  d      gd      }	| j                  |	      }
d}| | j                  ||
| j                  fi |}t        ||
|j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        r   Nr   r   rC   )losslogitsr   r%  )r$  r  r$   rO   r'  rW  loss_functionr   r   r   r%  )r3   rX   r   rX  r   outputsrN  r'   patch_tokenslinear_inputr[  rZ  s               r5   rc   z$Dinov2ForImageClassification.forward:  s     /:dkk,.fR[.f_e.f!33#AqD)	&q!"u-yy)\->->1->-E!FAN.%4%%ffdkkLVLD$!//))	
 	
r6   )NNN)re   rf   rg   r   r"   r   r   r   r$   ri   r   r   r   rc   rk   rl   s   @r5   rT  rT  %  s    |    04,0)-	
u||,
 ELL)
 &	

 +,
 

  
r6   rT  zO
    Dinov2 backbone, to be used with frameworks like DETR and MaskFormer.
    c            	       p     e Zd Z fdZdefdZee	 ddej                  de
e   defd              Z xZS )Dinov2Backbonec                 v   t         |   |       t         | 	  |       t        |j                  dz         D cg c]  }|j
                   c}| _        t        |      | _        t        |      | _
        t        j                  |j
                  |j                        | _        | j                          y c c}w )Nr   r   )r!   r"   _init_backboner  r  r&   num_featuresr   r7   r  r>  r   r   r  r?  r@  r  s      r5   r"   zDinov2Backbone.__init__d  s     v&9>v?W?WZ[?[9\]AV//]*62$V,f&8&8f>S>ST 	 ^s   B6r   c                 .    | j                   j                  S rd   rB  r   s    r5   rC  z#Dinov2Backbone.get_input_embeddingsq  rD  r6   rX   r  c                    || j                   j                  }| j                  |      }| j                  |d      }|j                  }g }t        | j                  |      D ]  \  }}	|| j                  v s| j                   j                  r| j                  |	      }	| j                   j                  rn|	ddddf   }	|j                  \  }
}}}| j                   j                  }|	j                  |
||z  ||z  d      }	|	j                  dddd      j                         }	|j!                  |	        t#        t%        |      |r|	      S d	      S )
a%  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
        >>> model = AutoBackbone.from_pretrained(
        ...     "facebook/dinov2-base", out_features=["stage2", "stage5", "stage8", "stage11"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 16, 16]
        ```NT)r  r   r;   r   r   r<   )feature_mapsr   )r   r  r7   r>  r   zipstage_namesr   apply_layernormr?  reshape_hidden_statesrE   r2   rH   rI   r   r  r
   r   )r3   rX   r  r   rL  r   r   rg  stager   r`   ra   r8   r9   r2   s                  r5   rc   zDinov2Backbone.forwardt  sS   :  '#';;#C#C ??<8"&,,/?VZ,"[,,#&t'7'7#G 	2E<)));;..#'>>,#?L;;44#/12#6L 4@3E3E0J65!%!7!7J#/#7#7
FjDXZ_cmZmoq#rL#/#7#71a#C#N#N#PL##L1	2 |,+?-
 	
EI
 	
r6   rd   )re   rf   rg   r"   r+   rC  r   r   r$   ri   r   r!  r
   rc   rk   rl   s   @r5   ra  ra  ^  sT    0&; 0 QU4
!LL4
@H4
	4
  4
r6   ra  )rT  r<  r#  ra  )r   )r   F)@rh   collections.abcrt   typingr   r   r   r$   torch.utils.checkpointr   activationsr   modeling_layersr	   modeling_outputsr
   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   utilsr   r   r   r   utils.backbone_utilsr   utils.genericr   r   configuration_dinov2r   
get_loggerre   loggerModuler   r+   ri   r   r   r   r   r   r   r!  r   r   r   r   r   r  r#  r<  rT  ra  __all__r:  r6   r5   <module>r~     s2     , ,    ! 9 r r F & Q K K 1 A . 
		H	%Nryy NbBII R %II%<<% 
% <<	%
 U\\*% % %>1.")) 1.jryy $bii >+ryy +U\\ e T V[VbVb *%RYY %		 &(bii ("', 'T
BII 
. +DO +D +D\ A
' A
 A
H 0
#8 0
0
f 
G
*M G

G
T er6   