
"""PyTorch GIT model."""

import math
from dataclasses import dataclass
from typing import Callable, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPast,
    BaseModelOutputWithPooling,
    CausalLMOutputWithPast,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, torch_int
from ...utils.deprecation import deprecate_kwarg
from .configuration_git import GitConfig, GitVisionConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    """
)
class GitVisionModelOutput(ModelOutput):
    r"""
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


class GitEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        if inputs_embeds is None:
            embeddings = self.word_embeddings(input_ids)
        else:
            embeddings = inputs_embeds

        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class GitSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.image_patch_tokens = int((config.vision_config.image_size / config.vision_config.patch_size) ** 2 + 1)
        if config.num_image_with_embedding is not None:
            self.image_patch_tokens *= config.num_image_with_embedding

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        pixel_values_present: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        batch_size, seq_length, _ = hidden_states.shape
        query_layer = (
            self.query(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )

        # the image patch tokens are prepended to the text tokens; only the text part is cached
        cutoff = self.image_patch_tokens if pixel_values_present else 0
        key_layer = (
            self.key(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        value_layer = (
            self.value(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )

        if past_key_values is not None:
            key_layer_past, value_layer_past = past_key_values.update(
                key_layer[:, :, cutoff:, :], value_layer[:, :, cutoff:, :], self.layer_idx
            )
            key_layer = torch.cat([key_layer[:, :, :cutoff, :], key_layer_past], dim=2)
            value_layer = torch.cat([value_layer[:, :, :cutoff, :], value_layer_past], dim=2)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            if past_key_values is not None:
                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
                    -1, 1
                )
            else:
                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r

            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the GitModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        return context_layer, attention_probs


class GitSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


GIT_SELF_ATTENTION_CLASSES = {
    "eager": GitSelfAttention,
}


class GitAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__()
        self.self = GIT_SELF_ATTENTION_CLASSES[config._attn_implementation](
            config, position_embedding_type=position_embedding_type, layer_idx=layer_idx
        )
        self.output = GitSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        pixel_values_present: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        attn_output, self_attn_weights = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            past_key_values,
            output_attentions,
            pixel_values_present,
        )
        attention_output = self.output(attn_output, hidden_states)
        return attention_output, self_attn_weights


class GitIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class GitOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class GitLayer(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = GitAttention(config, layer_idx=layer_idx)
        self.intermediate = GitIntermediate(config)
        self.output = GitOutput(config)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        pixel_values_present: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        attention_output, self_attention_weights = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_values=past_key_values,
            pixel_values_present=pixel_values_present,
        )

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )

        return layer_output, self_attention_weights

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class GitEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([GitLayer(config, i) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Union[Cache, tuple[tuple[torch.FloatTensor]]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        pixel_values_present: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPast]:
        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        if not isinstance(past_key_values, (type(None), Cache)):
            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states,
                attention_mask,
                layer_head_mask,
                past_key_values,
                output_attentions,
                pixel_values_present,
            )

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring
class GitPreTrainedModel(PreTrainedModel):
    config: GitConfig
    base_model_prefix = "git"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, GitVisionEmbeddings):
            nn.init.normal_(module.class_embedding, mean=0.0, std=self.config.initializer_range)
            nn.init.normal_(module.patch_embedding.weight, std=self.config.initializer_range)
            nn.init.normal_(module.position_embedding.weight, std=self.config.initializer_range)
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


class GitVisionEmbeddings(nn.Module):
    def __init__(self, config: GitVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """

        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).reshape(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, embed_dim, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


class GitVisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class GitVisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        # both `causal_attention_mask` and `attention_mask` may be provided; outside of
        # flash attention they are simply added together into a single additive mask
        if self.config._attn_implementation != "flash_attention_2":
            if attention_mask is not None and causal_attention_mask is not None:
                attention_mask = attention_mask + causal_attention_mask
            elif causal_attention_mask is not None:
                attention_mask = causal_attention_mask
        else:
            self.is_causal = causal_attention_mask is not None

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights


class GitVisionEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: GitVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = GitVisionAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = GitVisionMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class GitVisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`GitVisionEncoderLayer`].

    Args:
        config: GitVisionConfig
    """

    def __init__(self, config: GitVisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([GitVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                causal_attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class GitVisionTransformer(nn.Module):
    def __init__(self, config: GitVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = GitVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = GitVisionEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.post_layernorm(last_hidden_state)

        if not return_dict:
            return (last_hidden_state,) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The vision model from CLIP, used in GIT, without any head or projection on top.
    """
)
class GitVisionModel(GitPreTrainedModel):
    config: GitVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: GitVisionConfig):
        super().__init__(config)
        self.vision_model = GitVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GitVisionModel

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
        >>> model = GitVisionModel.from_pretrained("microsoft/git-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )


class GitProjection(nn.Module):
    def __init__(self, config: GitConfig):
        super().__init__()
        self.config = config
        self.visual_projection = nn.Sequential(
            nn.Linear(config.vision_config.hidden_size, config.hidden_size),
            nn.LayerNorm(config.hidden_size, eps=config.vision_config.layer_norm_eps),
        )

    def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
        return self.visual_projection(embeddings)


@auto_docstring(
    custom_intro="""
    The bare GIT Model transformer consisting of a CLIP image encoder and text decoder outputting raw hidden-states
    """
)
class GitModel(GitPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embeddings = GitEmbeddings(config)
        self.image_encoder = GitVisionModel(config.vision_config)
        self.encoder = GitEncoder(config)

        self.visual_projection = GitProjection(config)

        if config.num_image_with_embedding is not None:
            self.img_temperal_embedding = nn.ParameterList(
                nn.Parameter(torch.zeros(1, 1, config.vision_config.hidden_size))
                for _ in range(config.num_image_with_embedding)
            )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def _generate_future_mask(self, size: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
        # Default mask is for forward direction. Flip for backward direction.
        mask = torch.triu(torch.ones(size, size, device=device, dtype=dtype), diagonal=1)
        mask = mask.masked_fill(mask == 1, float("-inf"))
        return mask

    def create_attention_mask(self, tgt, memory, tgt_mask, past_key_values_length, memory_key_padding_mask=None):
        num_tgt = tgt.shape[1]
        num_memory = memory.shape[1]
        device = tgt.device
        dtype = tgt.dtype
        top_left = torch.zeros((num_memory, num_memory), device=device, dtype=dtype)
        top_right = torch.full(
            (num_memory, num_tgt + past_key_values_length),
            float("-inf"),
            device=tgt.device,
            dtype=dtype,
        )
        bottom_left = torch.zeros(
            (num_tgt, num_memory),
            dtype=dtype,
            device=tgt_mask.device,
        )

        if past_key_values_length > 0:
            tgt_mask = torch.zeros(
                (tgt_mask.shape[0], tgt_mask.shape[0] + past_key_values_length),
                dtype=dtype,
                device=tgt_mask.device,
            )

        left = torch.cat((top_left, bottom_left), dim=0)
        right = torch.cat((top_right, tgt_mask.to(dtype)), dim=0)

        full_attention_mask = torch.cat((left, right), dim=1)[None, :]

        if memory_key_padding_mask is None:
            memory_key_padding_mask = torch.full((memory.shape[0], memory.shape[1]), fill_value=False, device=device)
        # if it is False, it means valid. That is, it is not a padding
        if memory_key_padding_mask.dtype != torch.bool:
            raise ValueError("Memory key padding mask must be a boolean tensor.")
        zero_negative_infinity = torch.zeros_like(memory_key_padding_mask, dtype=tgt.dtype)
        zero_negative_infinity[memory_key_padding_mask] = float("-inf")
        full_attention_mask = full_attention_mask.expand(
            (memory_key_padding_mask.shape[0], num_memory + num_tgt, num_memory + past_key_values_length + num_tgt)
        )
        full_attention_mask = full_attention_mask.clone()
        origin_left = full_attention_mask[:, :, :num_memory]
        update = zero_negative_infinity[:, None, :]
        full_attention_mask[:, :, :num_memory] = origin_left + update

        # add axis for multi-head
        full_attention_mask = full_attention_mask[:, None, :, :]

        return full_attention_mask

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPast]:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModel
        >>> import requests
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
        >>> model = AutoModel.from_pretrained("microsoft/git-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = "this is an image of two cats"

        >>> inputs = processor(images=image, text=text, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        seq_length = input_shape[1]

        # past_key_values_length
        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        projected_visual_features = None
        if pixel_values is not None:
            if pixel_values.ndim == 4:
                # here we assume pixel_values is of shape (batch_size, num_channels, height, width)
                visual_features = self.image_encoder(
                    pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
                ).last_hidden_state

            elif pixel_values.ndim == 5:
                # here we assume pixel_values is of shape (batch_size, num_frames, num_channels, height, width)
                visual_features = []
                for frame_idx in range(pixel_values.shape[1]):
                    visual_features_frame = self.image_encoder(
                        pixel_values[:, frame_idx, :, :], interpolate_pos_encoding=interpolate_pos_encoding
                    ).last_hidden_state
                    visual_features_frame += self.img_temperal_embedding[frame_idx]
                    visual_features.append(visual_features_frame)

                # finally, concatenate all features along sequence dimension
                visual_features = torch.cat(visual_features, dim=1)

            else:
                raise ValueError("pixel_values must be of rank 4 or 5")

            projected_visual_features = self.visual_projection(visual_features)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )

        if projected_visual_features is None:
            projected_visual_features = torch.zeros(
                (embedding_output.shape[0], 0, embedding_output.shape[2]),
                dtype=embedding_output.dtype,
                device=embedding_output.device,
            )

        # Repeat visual features to match embedding batch size.
        projected_visual_features = projected_visual_features.repeat(
            embedding_output.size(0) // projected_visual_features.size(0), 1, 1
        )

        # concatenate patch token and text token embeddings
        hidden_states = torch.cat((projected_visual_features, embedding_output), dim=1)

        # By default, an additive causal mask is created
        # for masking the future (one direction).
        tgt_mask = self._generate_future_mask(seq_length, embedding_output.dtype, embedding_output.device)

        # Create an attention mask of shape (batch_size, 1, tgt_seq_len, src_seq_len)
        combined_attention_mask = self.create_attention_mask(
            tgt=embedding_output,
            memory=projected_visual_features,
            tgt_mask=tgt_mask,
            past_key_values_length=past_key_values_length,
        )

        if attention_mask is not None:
            # if the user provides an attention mask, we add it to the default one
            # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
            expanded_attn_mask = _prepare_4d_attention_mask(
                attention_mask, embedding_output.dtype, tgt_len=input_shape[-1]
            ).to(embedding_output.device)
            if past_key_values_length > 0:
                expanded_attn_mask = expanded_attn_mask[:, :, -past_key_values_length:, :]
            else:
                combined_attention_mask[:, :, -input_shape[1] :, -input_shape[1] :] += expanded_attn_mask

        outputs = self.encoder(
            hidden_states,
            attention_mask=combined_attention_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            pixel_values_present=pixel_values is not None,
        )
        sequence_output = outputs[0]

        if not return_dict:
            return (sequence_output,) + outputs[1:]

        return BaseModelOutputWithPast(
            last_hidden_state=sequence_output,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    GIT Model with a `language modeling` head on top for autoregressive language modeling.
    """
)
class GitForCausalLM(GitPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["output.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.git = GitModel(config)
        self.output = nn.Linear(config.hidden_size, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.output

    def set_output_embeddings(self, new_embeddings):
        self.output = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.Tensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Examples:

        Image captioning example:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForCausalLM
        >>> import requests
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

        >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
        >>> generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_caption)
        two cats sleeping on a pink blanket next to remotes.
        ```

        Visual question answering (VQA) example:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForCausalLM
        >>> from huggingface_hub import hf_hub_download
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa")

        >>> file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset")
        >>> image = Image.open(file_path).convert("RGB")

        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

        >>> question = "what does the front of the bus say at the top?"

        >>> input_ids = processor(text=question, add_special_tokens=False).input_ids
        >>> input_ids = [processor.tokenizer.cls_token_id] + input_ids
        >>> input_ids = torch.tensor(input_ids).unsqueeze(0)

        >>> generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)
        >>> print(processor.batch_decode(generated_ids, skip_special_tokens=True))
        ['what does the front of the bus say at the top? special']
        ```

        Video captioning example:

        ```python
        >>> import av
        >>> import numpy as np
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download
        >>> from transformers import AutoProcessor, AutoModelForCausalLM

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")

        >>> # set seed for reproducibility
        >>> np.random.seed(45)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`list[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # load video
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample frames
        >>> num_frames = model.config.num_image_with_embedding
        >>> indices = sample_frame_indices(
        ...     clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames
        ... )
        >>> frames = read_video_pyav(container, indices)

        >>> pixel_values = processor(images=list(frames), return_tensors="pt").pixel_values

        >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)

        >>> print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True))
        Generated caption: ['a woman is sitting at a table and she is talking about the food she is holding.']
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        outputs = self.git(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            pixel_values=pixel_values,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        logits = self.output(sequence_output)

        loss = None
        if labels is not None:
            # we are doing next-token prediction; shift prediction scores and input ids by one
            num_image_tokens = self.git.encoder.layer[0].attention.self.image_patch_tokens
            shifted_logits = logits[:, num_image_tokens:-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss = self.loss_function(
                shifted_logits.view(-1, self.config.vocab_size),
                labels.view(-1),
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
    ):
        # Overwritten -- GIT requires special input preparation (the image tokens are handled inside the model)

        # cut input_ids if past_key_values is used
        if past_key_values is not None:
            past_length = past_key_values.get_seq_length()

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only the final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        # if the attention mask is not provided, create it on the fly
        input_shape = input_ids.shape
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "pixel_values": kwargs.get("pixel_values"),
            "past_key_values": past_key_values,
            "use_cache": use_cache,
        }


__all__ = ["GitForCausalLM", "GitModel", "GitPreTrainedModel", "GitVisionModel"]